Implementation of VIS C API

To: port-sparc64%NetBSD.org@localhost
Subject: Implementation of VIS C API
From: nia <nia%NetBSD.org@localhost>
Date: Mon, 10 Nov 2025 12:10:50 +0000

Hi, recently got interested in SIMD instructions on sparc64.

While GCC provides some builtins, they're both incomplete and
incompatible with the C API described in "VIS Instruction
Set User's Manual", published by Sun in 2001.

These builtins are also unavailable in Clang, so writing my
own implementation (compatible with Sun's) seemed like the best
path forward. It is compatible with all (known) UltraSPARC
processors - I don't know of any that don't implement VIS; although
such processors might theoretically exist, NetBSD doesn't support them.

Since I'm a beginner to SIMD programming, the test suite
I wrote as a learning exercise mostly covers basic packed
arithmetic.

The complete patch is attached.

Index: include/Makefile
===================================================================
RCS file: /cvsroot/src/include/Makefile,v
retrieving revision 1.149
diff -u -r1.149 Makefile
--- include/Makefile	8 Oct 2024 22:53:20 -0000	1.149
+++ include/Makefile	10 Nov 2025 08:06:18 -0000
@@ -39,6 +39,10 @@
 
 INCS+=  ssp/ssp.h ssp/stdio.h ssp/string.h ssp/strings.h ssp/unistd.h
 
+.if (${MACHINE_ARCH} == "sparc64")
+INCS+=	vis_proto.h vis_types.h
+.endif
+
 .if (${MACHINE_ARCH} != "vax")
 INCS+=	ieeefp.h
 .endif
Index: include/vis_proto.h
===================================================================
RCS file: include/vis_proto.h
diff -N include/vis_proto.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ include/vis_proto.h	10 Nov 2025 08:06:18 -0000
@@ -0,0 +1,1305 @@
+/*-
+ * Copyright (c) 2025 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Nia Alarie.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This should be compatible with what was shipped with SunPro.
+ * 
+ * VIS Instruction Set User's Manual
+ * Sun Microsystems
+ * Part Number: 805-1394-03
+ * May 2001
+ *
+ * Version of available VIS instructions can be detected through
+ * the `machdep.vis` sysctl. A value of "0" means that such
+ * instructions are unavailable. All SPARCv9 hardware should support
+ * at least VIS 1, while VIS 2 requires UltraSPARC-III or newer.
+ *
+ * GCC needs -mvis for VIS, and -mvis2 for VIS 2. However, its
+ * builtins are incomplete and some cause problematic typing issues
+ * with Sun's API, so they're mostly avoided.
+ */
+
+#ifndef _VIS_PROTO_H
+#define _VIS_PROTO_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "vis_types.h"
+
+#define _VISATTR \
+	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+
+/* 4.6.1 Arithmetic - addition and subtraction */
+
+_VISATTR
+static __inline vis_d64
+vis_fpadd16(vis_d64 r1, vis_d64 r2)
+{
+	vis_d64 out;
+
+	__asm("fpadd16 %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(r1), "f"(r2));
+	return out;
+}
+
+_VISATTR
+static __inline vis_d64
+vis_fpsub16(vis_d64 r1, vis_d64 r2)
+{
+	vis_d64 out;
+
+	__asm("fpsub16 %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(r1), "f"(r2));
+	return out;
+}
+
+_VISATTR
+static __inline vis_d64
+vis_fpadd32(vis_d64 r1, vis_d64 r2)
+{
+	vis_d64 out;
+
+	__asm("fpadd32 %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(r1), "f"(r2));
+	return out;
+}
+
+_VISATTR
+static __inline vis_d64
+vis_fpsub32(vis_d64 r1, vis_d64 r2)
+{
+	vis_d64 out;
+
+	__asm("fpsub32 %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(r1), "f"(r2));
+	return out;
+}
+
+_VISATTR
+static __inline vis_f32
+vis_fpadd16s(vis_f32 r1, vis_f32 r2)
+{
+	vis_f32 out;
+
+	__asm("fpadd16s %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(r1), "f"(r2));
+	return out;
+}
+
+_VISATTR
+static __inline vis_f32
+vis_fpsub16s(vis_f32 r1, vis_f32 r2)
+{
+	vis_f32 out;
+
+	__asm("fpsub16s %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(r1), "f"(r2));
+	return out;
+}
+
+_VISATTR
+static __inline vis_f32
+vis_fpadd32s(vis_f32 r1, vis_f32 r2)
+{
+	vis_f32 out;
+
+	__asm("fpadd32s %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(r1), "f"(r2));
+	return out;
+}
+
+_VISATTR
+static __inline vis_f32
+vis_fpsub32s(vis_f32 r1, vis_f32 r2)
+{
+	vis_f32 out;
+
+	__asm("fpsub32s %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(r1), "f"(r2));
+	return out;
+}
+
+/* 4.7 Pixel formatting - packing */
+
+/* Run VIS `fpack16` on r1; returns the instruction's result. */
+_VISATTR
+static __inline vis_f32
+vis_fpack16(vis_d64 r1)
+{
+	vis_f32 out;
+	/* volatile: result also depends on the GSR scale factor (no operand). */
+	__asm volatile("fpack16 %1,%0"
+	    : "=f"(out) : "f"(r1));
+	return out;
+}
+
+/* Run VIS `fpack32` on r1 and r2; returns the instruction's result. */
+_VISATTR
+static __inline vis_d64
+vis_fpack32(vis_d64 r1, vis_d64 r2)
+{
+	vis_d64 out;
+	/* volatile: result also depends on the GSR scale factor (no operand). */
+	__asm volatile("fpack32 %1,%2,%0"
+	    : "=f"(out) : "f"(r1), "f"(r2));
+	return out;
+}
+
+/* Run VIS `fpackfix` on r1; returns the instruction's result. */
+_VISATTR
+static __inline vis_f32
+vis_fpackfix(vis_d64 r1)
+{
+	vis_f32 out;
+	/* volatile: result also depends on the GSR scale factor (no operand). */
+	__asm volatile("fpackfix %1,%0"
+	    : "=f"(out) : "f"(r1));
+	return out;
+}
+
+_VISATTR
+static __inline vis_d64
+vis_fexpand(vis_f32 r1)
+{
+	vis_d64 out;
+
+	__asm("fexpand %1,%0"
+	    : "=f"(out)
+	    : "f"(r1));
+	return out;
+}
+
+_VISATTR
+static __inline vis_d64
+vis_fpmerge(vis_f32 r1, vis_f32 r2)
+{
+	vis_d64 out;
+
+	__asm("fpmerge %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(r1), "f"(r2));
+	return out;
+}
+
+/* 4.7.6 Aligned address calculation */
+
+/* Run VIS `alignaddr` on addr and offset; returns the instruction's result. */
+_VISATTR
+static __inline void *
+vis_alignaddr(void *addr, int offset)
+{
+#if defined(__VIS__) && defined(__GNUC__)
+	return __builtin_vis_alignaddr(addr, offset);
+#else
+	void *out;
+	/* volatile: alignaddr also updates GSR.align, so never drop or merge it. */
+	__asm volatile("alignaddr %1,%2,%0"
+	    : "=r"(out) : "r"(addr), "r"((long)offset));
+	return out;
+#endif
+}
+
+/* Run VIS `faligndata` on hi and lo; returns the instruction's result. */
+_VISATTR
+static __inline vis_d64
+vis_faligndata(vis_d64 hi, vis_d64 lo)
+{
+	vis_d64 out;
+	/* volatile: result also depends on GSR.align, which is not an operand. */
+	__asm volatile("faligndata %1,%2,%0"
+	    : "=f"(out) : "f"(hi), "f"(lo));
+	return out;
+}
+
+/* 4.7.7 Edge handling */
+
+/* Run VIS `edge8` on addresses a1 and a2; returns the instruction's result. */
+_VISATTR
+static __inline vis_s32
+vis_edge8(void *a1, void *a2)
+{
+#if defined(__VIS__) && defined(__GNUC__)
+	return __builtin_vis_edge8(a1, a2);
+#else
+	vis_s32 out;
+	/* Both sources are integer registers; edge8 also sets condition codes. */
+	__asm("edge8 %1,%2,%0"
+	    : "=r"(out) : "r"(a1), "r"(a2) : "cc");
+	return out;
+#endif
+}
+
+/* Run VIS `edge16` on addresses a1 and a2; returns the instruction's result. */
+_VISATTR
+static __inline vis_s32
+vis_edge16(void *a1, void *a2)
+{
+#if defined(__VIS__) && defined(__GNUC__)
+	return __builtin_vis_edge16(a1, a2);
+#else
+	vis_s32 out;
+	/* Both sources are integer registers; edge16 also sets condition codes. */
+	__asm("edge16 %1,%2,%0"
+	    : "=r"(out) : "r"(a1), "r"(a2) : "cc");
+	return out;
+#endif
+}
+
+/* Run VIS `edge32` on addresses a1 and a2; returns the instruction's result. */
+_VISATTR
+static __inline vis_s32
+vis_edge32(void *a1, void *a2)
+{
+#if defined(__VIS__) && defined(__GNUC__)
+	return __builtin_vis_edge32(a1, a2);
+#else
+	vis_s32 out;
+	/* Both sources are integer registers; edge32 also sets condition codes. */
+	__asm("edge32 %1,%2,%0"
+	    : "=r"(out) : "r"(a1), "r"(a2) : "cc");
+	return out;
+#endif
+}
+
+/* Run VIS `edge8l` on addresses a1 and a2; returns the instruction's result. */
+_VISATTR
+static __inline vis_s32
+vis_edge8l(void *a1, void *a2)
+{
+#if defined(__VIS__) && defined(__GNUC__)
+	return __builtin_vis_edge8l(a1, a2);
+#else
+	vis_s32 out;
+	/* Both sources are integer registers; edge8l also sets condition codes. */
+	__asm("edge8l %1,%2,%0"
+	    : "=r"(out) : "r"(a1), "r"(a2) : "cc");
+	return out;
+#endif
+}
+
+/* Run VIS `edge16l` on addresses a1 and a2; returns the instruction's result. */
+_VISATTR
+static __inline vis_s32
+vis_edge16l(void *a1, void *a2)
+{
+#if defined(__VIS__) && defined(__GNUC__)
+	return __builtin_vis_edge16l(a1, a2);
+#else
+	vis_s32 out;
+	/* Both sources are integer registers; edge16l also sets condition codes. */
+	__asm("edge16l %1,%2,%0"
+	    : "=r"(out) : "r"(a1), "r"(a2) : "cc");
+	return out;
+#endif
+}
+
+/* Run VIS `edge32l` on addresses a1 and a2; returns the instruction's result. */
+_VISATTR
+static __inline vis_s32
+vis_edge32l(void *a1, void *a2)
+{
+#if defined(__VIS__) && defined(__GNUC__)
+	return __builtin_vis_edge32l(a1, a2);
+#else
+	vis_s32 out;
+	/* Both sources are integer registers; edge32l also sets condition codes. */
+	__asm("edge32l %1,%2,%0"
+	    : "=r"(out) : "r"(a1), "r"(a2) : "cc");
+	return out;
+#endif
+}
+
+/* 4.9 Array coordinate translation */
+
+/* Run VIS `array8` on d1 and d2; returns the instruction's result. */
+_VISATTR
+static __inline vis_addr
+vis_array8(vis_u64 d1, vis_s32 d2)
+{
+#if defined(__VIS__) && defined(__GNUC__)
+	return __builtin_vis_array8(d1, d2);
+#else
+	vis_addr out;
+	/* array8 takes both sources from integer registers, hence "r". */
+	__asm("array8 %1,%2,%0"
+	    : "=r"(out)
+	    : "r"(d1), "r"(d2));
+	return out;
+#endif
+}
+
+/* Run VIS `array16` on d1 and d2; returns the instruction's result. */
+_VISATTR
+static __inline vis_addr
+vis_array16(vis_u64 d1, vis_s32 d2)
+{
+#if defined(__VIS__) && defined(__GNUC__)
+	return __builtin_vis_array16(d1, d2);
+#else
+	vis_addr out;
+	/* array16 takes both sources from integer registers, hence "r". */
+	__asm("array16 %1,%2,%0"
+	    : "=r"(out) : "r"(d1), "r"(d2));
+	return out;
+#endif
+}
+
+_VISATTR
+static __inline vis_addr
+vis_array32(vis_u64 d1, vis_s32 d2)
+{
+#if defined(__VIS__) && defined(__GNUC__)
+	return __builtin_vis_array32(d1, d2);
+#else
+	vis_addr out;
+
+	__asm("array32 %1,%2,%0"
+	    : "=r"(out)
+	    : "r"(d1), "r"(d2));
+	return out;
+#endif
+}
+
+/* 4.3.1 Graphics Status Register manipulation */
+
+/* Return the current contents of the Graphics Status Register (%gsr). */
+_VISATTR
+static __inline vis_u64
+vis_read_gsr64(void)
+{
+#if defined(__VIS__) && defined(__GNUC__)
+	return __builtin_vis_read_gsr();
+#else
+	vis_u64 out;
+	/* volatile: %gsr changes between reads, so reads must not be merged. */
+	__asm volatile("rd %%gsr,%0" : "=r"(out));
+	return out;
+#endif
+}
+
+/* Store gsr into the Graphics Status Register (%gsr). */
+_VISATTR
+static __inline void
+vis_write_gsr64(vis_u64 gsr)
+{
+#if defined(__VIS__) && defined(__GNUC__)
+	__builtin_vis_write_gsr(gsr);
+#else
+	/* Plain `wr` form; avoids relying on a `mov` alias accepting %gsr. */
+	__asm volatile("wr %%g0,%0,%%gsr" : : "r"(gsr));
+#endif
+}
+
+_VISATTR
+static __inline vis_u32
+vis_read_gsr32(void)
+{
+	return vis_read_gsr64();
+}
+
+_VISATTR
+static __inline void
+vis_write_gsr32(vis_u32 gsr)
+{
+	vis_write_gsr64(gsr);
+}
+
+/* 4.3.2 Read and write to upper/lower components */
+
+_VISATTR
+static __inline vis_f32
+vis_read_hi(vis_d64 var)
+{
+	vis_u64 reg = *((vis_u64 *)&var);
+	vis_u32 hi = (reg >> 32) & 0xffffffff;
+	vis_f32 out = *((vis_f32 *)&hi);
+	return out;
+}
+
+_VISATTR
+static __inline vis_f32
+vis_read_lo(vis_d64 var)
+{
+	vis_u64 reg = *((vis_u64 *)&var);
+	vis_u32 lo = reg & 0xffffffff;
+	vis_f32 out = *((vis_f32 *)&lo);
+	return out;
+}
+
+_VISATTR
+static __inline vis_d64
+vis_write_lo(vis_d64 in, vis_f32 lower)
+{
+	vis_u64 out = *((vis_u64 *)&in);
+	vis_u32 hi = (out >> 32) & 0xffffffff;
+	vis_u32 lo = *((vis_u32 *)&lower);
+
+	out = ((vis_u64)hi << 32ULL) | lo;
+	return *((vis_d64 *)&out);
+}
+
+_VISATTR
+static __inline vis_d64
+vis_write_hi(vis_d64 in, vis_f32 upper)
+{
+	vis_u64 out = *((vis_u64 *)&in);
+	vis_u32 hi = *((vis_u32 *)&upper);
+	vis_u32 lo = out & 0xffffffff;
+
+	out = ((vis_u64)hi << 32ULL) | lo;
+	return *((vis_d64 *)&out);
+}
+
+/* 4.3.3 Join two variables into a single */
+
+_VISATTR
+static __inline vis_d64
+vis_freg_pair(vis_f32 f1, vis_f32 f2)
+{
+	vis_u64 out;
+	vis_u32 r1 = *((vis_u32 *)&f1);
+	vis_u32 r2 = *((vis_u32 *)&f2);
+
+	out = ((vis_u64)r1 << 32ULL) | r2;
+	return *((vis_d64 *)&out);
+}
+
+/* 4.3.4 Place ints into FP register */
+
+_VISATTR
+static __inline vis_f32
+vis_to_float(vis_u32 data)
+{
+	return *((vis_f32 *)&data);
+}
+
+_VISATTR
+static __inline vis_d64
+vis_to_double(vis_u32 d1, vis_u32 d2)
+{
+	vis_u64 out;
+
+	out = ((vis_u64)d1 << 32ULL) | d2;
+	return *((vis_d64 *)&out);
+}
+
+_VISATTR
+static __inline vis_d64
+vis_to_double_dup(vis_u32 data)
+{
+	return vis_to_double(data, data);
+}
+
+_VISATTR
+static __inline vis_d64
+vis_ll_to_double(vis_u64 data)
+{
+	return *((vis_d64 *)&data);
+}
+
+/* 4.6.2 Arithmetic - multiplication */
+
+_VISATTR
+static __inline vis_d64
+vis_fmul8x16(vis_f32 pixels, vis_d64 scale)
+{
+	vis_d64 out;
+
+	__asm("fmul8x16 %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(pixels), "f"(scale));
+	return out;
+}
+
+_VISATTR
+static __inline vis_d64
+vis_fmul8x16au(vis_f32 pixels, vis_f32 scale)
+{
+	vis_d64 out;
+
+	__asm("fmul8x16au %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(pixels), "f"(scale));
+	return out;
+}
+
+_VISATTR
+static __inline vis_d64
+vis_fmul8x16al(vis_f32 pixels, vis_f32 scale)
+{
+	vis_d64 out;
+
+	__asm("fmul8x16al %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(pixels), "f"(scale));
+	return out;
+}
+
+_VISATTR
+static __inline vis_d64
+vis_fmul8sux16(vis_d64 d1, vis_d64 d2)
+{
+	vis_d64 out;
+
+	__asm("fmul8sux16 %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(d1), "f"(d2));
+	return out;
+}
+
+_VISATTR
+static __inline vis_d64
+vis_fmul8ulx16(vis_d64 d1, vis_d64 d2)
+{
+	vis_d64 out;
+
+	__asm("fmul8ulx16 %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(d1), "f"(d2));
+	return out;
+}
+
+_VISATTR
+static __inline vis_d64
+vis_fmuld8sux16(vis_f32 d1, vis_f32 d2)
+{
+	vis_d64 out;
+
+	__asm("fmuld8sux16 %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(d1), "f"(d2));
+	return out;
+}
+
+_VISATTR
+static __inline vis_d64
+vis_fmuld8ulx16(vis_f32 d1, vis_f32 d2)
+{
+	vis_d64 out;
+
+	__asm("fmuld8ulx16 %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(d1), "f"(d2));
+	return out;
+}
+
+/* 4.5 Pixel compare */
+
+_VISATTR
+static __inline int
+vis_fcmpgt16(vis_d64 d1, vis_d64 d2)
+{
+	int out;
+
+	__asm("fcmpgt16 %1,%2,%0"
+	    : "=r"(out)
+	    : "f"(d1), "f"(d2));
+	return out;
+}
+
+_VISATTR
+static __inline int
+vis_fcmple16(vis_d64 d1, vis_d64 d2)
+{
+	int out;
+
+	__asm("fcmple16 %1,%2,%0"
+	    : "=r"(out)
+	    : "f"(d1), "f"(d2));
+	return out;
+}
+
+_VISATTR
+static __inline int
+vis_fcmpeq16(vis_d64 d1, vis_d64 d2)
+{
+	int out;
+
+	__asm("fcmpeq16 %1,%2,%0"
+	    : "=r"(out)
+	    : "f"(d1), "f"(d2));
+	return out;
+}
+
+_VISATTR
+static __inline int
+vis_fcmpne16(vis_d64 d1, vis_d64 d2)
+{
+	int out;
+
+	__asm("fcmpne16 %1,%2,%0"
+	    : "=r"(out)
+	    : "f"(d1), "f"(d2));
+	return out;
+}
+
+_VISATTR
+static __inline int
+vis_fcmpgt32(vis_d64 d1, vis_d64 d2)
+{
+	int out;
+
+	__asm("fcmpgt32 %1,%2,%0"
+	    : "=r"(out)
+	    : "f"(d1), "f"(d2));
+	return out;
+}
+
+_VISATTR
+static __inline int
+vis_fcmple32(vis_d64 d1, vis_d64 d2)
+{
+	int out;
+
+	__asm("fcmple32 %1,%2,%0"
+	    : "=r"(out)
+	    : "f"(d1), "f"(d2));
+	return out;
+}
+
+_VISATTR
+static __inline int
+vis_fcmpeq32(vis_d64 d1, vis_d64 d2)
+{
+	int out;
+
+	__asm("fcmpeq32 %1,%2,%0"
+	    : "=r"(out)
+	    : "f"(d1), "f"(d2));
+	return out;
+}
+
+_VISATTR
+static __inline int
+vis_fcmpne32(vis_d64 d1, vis_d64 d2)
+{
+	int out;
+
+	__asm("fcmpne32 %1,%2,%0"
+	    : "=r"(out)
+	    : "f"(d1), "f"(d2));
+	return out;
+}
+
+_VISATTR
+static __inline int
+vis_fcmplt16(vis_d64 d1, vis_d64 d2)
+{
+	return vis_fcmpgt16(d2, d1);
+}
+
+_VISATTR
+static __inline int
+vis_fcmpge16(vis_d64 d1, vis_d64 d2)
+{
+	return vis_fcmple16(d2, d1);
+}
+
+_VISATTR
+static __inline int
+vis_fcmplt32(vis_d64 d1, vis_d64 d2)
+{
+	return vis_fcmpgt32(d2, d1);
+}
+
+_VISATTR
+static __inline int
+vis_fcmpge32(vis_d64 d1, vis_d64 d2)
+{
+	return vis_fcmple32(d2, d1);
+}
+
+/* 4.10 Pixel distance */
+
+_VISATTR
+static __inline vis_d64
+vis_pdist(vis_d64 pixels1, vis_d64 pixels2, vis_d64 acc)
+{
+	__asm("pdist %1,%2,%0"
+	    : "+f"(acc)
+	    : "f"(pixels1), "f"(pixels2));
+
+	return acc;
+}
+
+/* 4.4.1 Logical instructions - fill variables */
+
+_VISATTR
+static __inline vis_d64
+vis_fzero(void)
+{
+	vis_d64 out;
+
+	__asm("fzero %0"
+	    : "=f"(out));
+	return out;
+}
+
+_VISATTR
+static __inline vis_d64
+vis_fone(void)
+{
+	vis_d64 out;
+
+	__asm("fone %0"
+	    : "=f"(out));
+	return out;
+}
+
+_VISATTR
+static __inline vis_f32
+vis_fzeros(void)
+{
+	vis_f32 out;
+
+	__asm("fzeros %0"
+	    : "=f"(out));
+	return out;
+}
+
+_VISATTR
+static __inline vis_f32
+vis_fones(void)
+{
+	vis_f32 out;
+
+	__asm("fones %0"
+	    : "=f"(out));
+	return out;
+}
+
+/* 4.4.2 Logical instructions - copies and complements */
+
+_VISATTR
+static __inline vis_d64
+vis_fsrc(vis_d64 r1)
+{
+	vis_d64 out;
+
+	__asm("fsrc1 %1,%0"
+	    : "=f"(out)
+	    : "f"(r1));
+	return out;
+}
+
+_VISATTR
+static __inline vis_d64
+vis_fnot(vis_d64 r1)
+{
+	vis_d64 out;
+
+	__asm("fnot1 %1,%0"
+	    : "=f"(out)
+	    : "f"(r1));
+	return out;
+}
+
+_VISATTR
+static __inline vis_f32
+vis_fsrcs(vis_f32 r1)
+{
+	vis_f32 out;
+
+	__asm("fsrc1s %1,%0"
+	    : "=f"(out)
+	    : "f"(r1));
+	return out;
+}
+
+_VISATTR
+static __inline vis_f32
+vis_fnots(vis_f32 r1)
+{
+	vis_f32 out;
+
+	__asm("fnot1s %1,%0"
+	    : "=f"(out)
+	    : "f"(r1));
+	return out;
+}
+
+/* 4.4.3 Logical instructions - bitwise */
+
+_VISATTR
+static __inline vis_d64
+vis_for(vis_d64 r1, vis_d64 r2)
+{
+	vis_d64 out;
+	__asm("for %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(r1), "f"(r2));
+	return out;
+}
+
+_VISATTR
+static __inline vis_d64
+vis_fand(vis_d64 r1, vis_d64 r2)
+{
+	vis_d64 out;
+	__asm("fand %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(r1), "f"(r2));
+	return out;
+}
+
+_VISATTR
+static __inline vis_d64
+vis_fxor(vis_d64 r1, vis_d64 r2)
+{
+	vis_d64 out;
+	__asm("fxor %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(r1), "f"(r2));
+	return out;
+}
+
+_VISATTR
+static __inline vis_d64
+vis_fnor(vis_d64 r1, vis_d64 r2)
+{
+	vis_d64 out;
+	__asm("fnor %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(r1), "f"(r2));
+	return out;
+}
+
+_VISATTR
+static __inline vis_d64
+vis_fnand(vis_d64 r1, vis_d64 r2)
+{
+	vis_d64 out;
+	__asm("fnand %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(r1), "f"(r2));
+	return out;
+}
+
+_VISATTR
+static __inline vis_d64
+vis_fxnor(vis_d64 r1, vis_d64 r2)
+{
+	vis_d64 out;
+	__asm("fxnor %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(r1), "f"(r2));
+	return out;
+}
+
+_VISATTR
+static __inline vis_d64
+vis_fornot(vis_d64 r1, vis_d64 r2)
+{
+	vis_d64 out;
+	__asm("fornot1 %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(r1), "f"(r2));
+	return out;
+}
+
+_VISATTR
+static __inline vis_d64
+vis_fandnot(vis_d64 r1, vis_d64 r2)
+{
+	vis_d64 out;
+	__asm("fandnot1 %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(r1), "f"(r2));
+	return out;
+}
+
+_VISATTR
+static __inline vis_f32
+vis_fors(vis_f32 r1, vis_f32 r2)
+{
+	vis_f32 out;
+	__asm("fors %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(r1), "f"(r2));
+	return out;
+}
+
+_VISATTR
+static __inline vis_f32
+vis_fands(vis_f32 r1, vis_f32 r2)
+{
+	vis_f32 out;
+	__asm("fands %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(r1), "f"(r2));
+	return out;
+}
+
+_VISATTR
+static __inline vis_f32
+vis_fxors(vis_f32 r1, vis_f32 r2)
+{
+	vis_f32 out;
+	__asm("fxors %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(r1), "f"(r2));
+	return out;
+}
+
+_VISATTR
+static __inline vis_f32
+vis_fnors(vis_f32 r1, vis_f32 r2)
+{
+	vis_f32 out;
+	__asm("fnors %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(r1), "f"(r2));
+	return out;
+}
+
+_VISATTR
+static __inline vis_f32
+vis_fnands(vis_f32 r1, vis_f32 r2)
+{
+	vis_f32 out;
+	__asm("fnands %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(r1), "f"(r2));
+	return out;
+}
+
+_VISATTR
+static __inline vis_f32
+vis_fxnors(vis_f32 r1, vis_f32 r2)
+{
+	vis_f32 out;
+	__asm("fxnors %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(r1), "f"(r2));
+	return out;
+}
+
+_VISATTR
+static __inline vis_f32
+vis_fornots(vis_f32 r1, vis_f32 r2)
+{
+	vis_f32 out;
+	__asm("fornot1s %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(r1), "f"(r2));
+	return out;
+}
+
+_VISATTR
+static __inline vis_f32
+vis_fandnots(vis_f32 r1, vis_f32 r2)
+{
+	vis_f32 out;
+	__asm("fandnot1s %1,%2,%0"
+	    : "=f"(out)
+	    : "f"(r1), "f"(r2));
+	return out;
+}
+
+/* 4.8.1 Partial Stores */
+
+/* Partial-store data to addr under mask (`stda` with ASI 0xc0). */
+_VISATTR
+static __inline void
+vis_pst_8(vis_d64 data, void *addr, vis_u8 mask)
+{	/* addr is read, not written; "memory" covers the store to *addr. */
+	__asm volatile("stda %0,[%1]%2,0xc0"
+	    : : "f"(data), "r"(addr), "r"(mask) : "memory");
+}
+
+/* Partial-store data to addr under mask (`stda` with ASI 0xc2). */
+_VISATTR
+static __inline void
+vis_pst_16(vis_d64 data, void *addr, vis_u8 mask)
+{	/* addr is read, not written; "memory" covers the store to *addr. */
+	__asm volatile("stda %0,[%1]%2,0xc2"
+	    : : "f"(data), "r"(addr), "r"(mask) : "memory");
+}
+
+/* Partial-store data to addr under mask (`stda` with ASI 0xc4). */
+_VISATTR
+static __inline void
+vis_pst_32(vis_d64 data, void *addr, vis_u8 mask)
+{	/* addr is read, not written; "memory" covers the store to *addr. */
+	__asm volatile("stda %0,[%1]%2,0xc4"
+	    : : "f"(data), "r"(addr), "r"(mask) : "memory");
+}
+
+/* 4.8.2 Byte/Short Loads and Stores */
+
+/* Store from FP value data to addr via `stda` with ASI 0xd0. */
+_VISATTR
+static __inline void
+vis_st_u8(vis_d64 data, void *addr)
+{	/* addr is read, not written; "memory" covers the store to *addr. */
+	__asm volatile("stda %0,[%1]0xd0"
+	    : : "f"(data), "r"(addr) : "memory");
+}
+
+/* Store from FP value data to addr via `stda` with ASI 0xd8. */
+_VISATTR
+static __inline void
+vis_st_u8_le(vis_d64 data, void *addr)
+{	/* addr is read, not written; "memory" covers the store to *addr. */
+	__asm volatile("stda %0,[%1]0xd8"
+	    : : "f"(data), "r"(addr) : "memory");
+}
+
+/* Store from FP value data to addr via `stda` with ASI 0xd2. */
+_VISATTR
+static __inline void
+vis_st_u16(vis_d64 data, void *addr)
+{	/* addr is read, not written; "memory" covers the store to *addr. */
+	__asm volatile("stda %0,[%1]0xd2"
+	    : : "f"(data), "r"(addr) : "memory");
+}
+
+/* Store from FP value data to addr via `stda` with ASI 0xda. */
+_VISATTR
+static __inline void
+vis_st_u16_le(vis_d64 data, void *addr)
+{	/* addr is read, not written; "memory" covers the store to *addr. */
+	__asm volatile("stda %0,[%1]0xda"
+	    : : "f"(data), "r"(addr) : "memory");
+}
+
+_VISATTR
+static __inline void
+vis_st_u8_i(vis_d64 data, void *addr, long idx)
+{
+	vis_u8 *ptr = addr;
+	vis_st_u8(data, ptr + idx);
+}
+
+_VISATTR
+static __inline void
+vis_st_u16_i(vis_d64 data, void *addr, long idx)
+{
+	vis_u8 *ptr = addr;
+	vis_st_u16(data, ptr + idx);
+}
+
+/*
+ * Load the byte at addr and return it zero-extended in the low 8 bits
+ * of a 64-bit value, reinterpreted as vis_d64.
+ */
+_VISATTR
+static __inline vis_d64
+vis_ld_u8(void *addr)
+{
+	/* Widen first so every bit of the returned value is defined. */
+	vis_u64 bits = *((vis_u8 *)addr);
+	return *((vis_d64 *)&bits);
+}
+
+/*
+ * Load the 16-bit value at addr and return it zero-extended in the low
+ * 16 bits of a 64-bit value, reinterpreted as vis_d64.
+ */
+_VISATTR
+static __inline vis_d64
+vis_ld_u16(void *addr)
+{
+	/* Widen first so every bit of the returned value is defined. */
+	vis_u64 bits = *((vis_u16 *)addr);
+	return *((vis_d64 *)&bits);
+}
+
+_VISATTR
+static __inline vis_d64
+vis_ld_u8_i(void *addr, long idx)
+{
+	vis_u8 *ptr = addr;
+	return vis_ld_u8(ptr + idx);
+}
+
+_VISATTR
+static __inline vis_d64
+vis_ld_u16_i(void *addr, long idx)
+{
+	vis_u8 *ptr = addr;
+	return vis_ld_u16(ptr + idx);
+}
+
+/*
+ * VIS 2.0 instructions
+ */
+
+/* Return %gsr shifted right by 32 bits, truncated to vis_u32. */
+_VISATTR
+static __inline vis_u32
+vis_read_bmask(void)
+{
+	vis_u64 out;
+	/* One instruction per line; %gsr is read into an integer register. */
+	__asm volatile("rd %%gsr,%0\n\t"
+	    "srlx %0,32,%0"
+	    : "=r"(out));
+	return (vis_u32)out;
+}
+
+/* Run VIS 2 `bmask` on mask1 and mask2 for its effect on GSR.mask. */
+_VISATTR
+static __inline void
+vis_write_bmask(vis_u32 mask1, vis_u32 mask2)
+{
+#if defined(__VIS__) && __VIS__ >= 0x200 && defined(__GNUC__)
+	(void)__builtin_vis_bmask(mask1, mask2);
+#else
+	vis_u32 out;
+	/* volatile: the register result is discarded, so keep the asm alive. */
+	__asm volatile("bmask %1,%2,%0"
+	    : "=r"(out)
+	    : "r"(mask1), "r"(mask2));
+	(void)out;
+#endif
+}
+
+/* Run VIS 2 `bshuffle` on pixels1 and pixels2; returns its result. */
+_VISATTR
+static __inline vis_d64
+vis_bshuffle(vis_d64 pixels1, vis_d64 pixels2)
+{
+	vis_d64 out;
+	/* volatile: result also depends on GSR.mask, which is not an operand. */
+	__asm volatile("bshuffle %1,%2,%0"
+	    : "=f"(out) : "f"(pixels1), "f"(pixels2));
+	return out;
+}
+
+_VISATTR
+static __inline vis_s32
+vis_edge8n(void *a1, void *a2)
+{
+#if defined(__VIS__) && __VIS__ >= 0x200 && defined(__GNUC__)
+	return __builtin_vis_edge8n(a1, a2);
+#else
+	vis_s32 out;
+
+	__asm("edge8n %1,%2,%0"
+	    : "=r"(out)
+	    : "r"(a1), "r"(a2));
+	return out;
+#endif
+}
+
+_VISATTR
+static __inline vis_s32
+vis_edge16n(void *a1, void *a2)
+{
+#if defined(__VIS__) && __VIS__ >= 0x200 && defined(__GNUC__)
+	return __builtin_vis_edge16n(a1, a2);
+#else
+	vis_s32 out;
+
+	__asm("edge16n %1,%2,%0"
+	    : "=r"(out)
+	    : "r"(a1), "r"(a2));
+	return out;
+#endif
+}
+
+_VISATTR
+static __inline vis_s32
+vis_edge32n(void *a1, void *a2)
+{
+#if defined(__VIS__) && __VIS__ >= 0x200 && defined(__GNUC__)
+	return __builtin_vis_edge32n(a1, a2);
+#else
+	vis_s32 out;
+
+	__asm("edge32n %1,%2,%0"
+	    : "=r"(out)
+	    : "r"(a1), "r"(a2));
+	return out;
+#endif
+}
+
+_VISATTR
+static __inline vis_s32
+vis_edge8ln(void *a1, void *a2)
+{
+#if defined(__VIS__) && __VIS__ >= 0x200 && defined(__GNUC__)
+	return __builtin_vis_edge8ln(a1, a2);
+#else
+	vis_s32 out;
+
+	__asm("edge8ln %1,%2,%0"
+	    : "=r"(out)
+	    : "r"(a1), "r"(a2));
+	return out;
+#endif
+}
+
+_VISATTR
+static __inline vis_s32
+vis_edge16ln(void *a1, void *a2)
+{
+#if defined(__VIS__) && __VIS__ >= 0x200 && defined(__GNUC__)
+	return __builtin_vis_edge16ln(a1, a2);
+#else
+	vis_s32 out;
+
+	__asm("edge16ln %1,%2,%0"
+	    : "=r"(out)
+	    : "r"(a1), "r"(a2));
+	return out;
+#endif
+}
+
+_VISATTR
+static __inline vis_s32
+vis_edge32ln(void *a1, void *a2)
+{
+#if defined(__VIS__) && __VIS__ >= 0x200 && defined(__GNUC__)
+	return __builtin_vis_edge32ln(a1, a2);
+#else
+	vis_s32 out;
+
+	__asm("edge32ln %1,%2,%0"
+	    : "=r"(out)
+	    : "r"(a1), "r"(a2));
+	return out;
+#endif
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
Index: include/vis_types.h
===================================================================
RCS file: include/vis_types.h
diff -N include/vis_types.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ include/vis_types.h	10 Nov 2025 08:06:18 -0000
@@ -0,0 +1,62 @@
+/*-
+ * Copyright (c) 2025 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Nia Alarie.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VIS_TYPES_H
+#define _VIS_TYPES_H
+
+/*
+ * This should be compatible with what was shipped with SunPro.
+ *
+ * VIS Instruction Set User's Manual
+ * Sun Microsystems
+ * Part Number: 805-1394-03
+ * May 2001
+ *
+ * Page 32 describes data types.
+ */
+
+typedef signed char		vis_s8 __attribute__ ((__may_alias__));
+typedef unsigned char		vis_u8 __attribute__ ((__may_alias__));
+typedef signed short		vis_s16 __attribute__ ((__may_alias__));
+typedef unsigned short		vis_u16 __attribute__ ((__may_alias__));
+typedef signed int		vis_s32 __attribute__ ((__may_alias__));
+typedef unsigned int		vis_u32 __attribute__ ((__may_alias__));
+typedef double			vis_d64 __attribute__ ((__may_alias__));
+typedef float			vis_f32 __attribute__ ((__may_alias__));
+typedef unsigned long		vis_addr __attribute__ ((__may_alias__));
+
+#ifdef _LP64
+typedef signed long		vis_s64 __attribute__ ((__may_alias__));
+typedef unsigned long		vis_u64 __attribute__ ((__may_alias__));
+#else
+typedef signed long long	vis_s64 __attribute__ ((__may_alias__));
+typedef unsigned long long	vis_u64 __attribute__ ((__may_alias__));
+#endif
+
+#endif
Index: distrib/sets/lists/comp/md.sparc64
===================================================================
RCS file: /cvsroot/src/distrib/sets/lists/comp/md.sparc64,v
retrieving revision 1.219
diff -u -r1.219 md.sparc64
--- distrib/sets/lists/comp/md.sparc64	19 Sep 2025 07:12:52 -0000	1.219
+++ distrib/sets/lists/comp/md.sparc64	10 Nov 2025 08:06:19 -0000
@@ -168,6 +168,8 @@
 ./usr/include/sparc64/vuid_event.h		comp-c-include
 ./usr/include/sparc64/wchar_limits.h		comp-c-include
 ./usr/include/sparc64/z8530var.h		comp-c-include
+./usr/include/vis_proto.h			comp-c-include
+./usr/include/vis_types.h			comp-c-include
 ./usr/lib/sparc_mcmedany.o			comp-c-lib
 ./usr/lib/sparc_mcmedlow.o			comp-c-lib
 ./usr/lib/sparc_mcmedmid.o			comp-c-lib
Index: distrib/sets/lists/tests/md.sparc64
===================================================================
RCS file: distrib/sets/lists/tests/md.sparc64
diff -N distrib/sets/lists/tests/md.sparc64
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ distrib/sets/lists/tests/md.sparc64	10 Nov 2025 08:06:19 -0000
@@ -0,0 +1,3 @@
+# $NetBSD$
+#
+./usr/tests/lib/libc/misc/t_vis		tests-lib-tests	compattestfile,atf
Index: tests/lib/libc/misc/Makefile
===================================================================
RCS file: /cvsroot/src/tests/lib/libc/misc/Makefile,v
retrieving revision 1.9
diff -u -r1.9 Makefile
--- tests/lib/libc/misc/Makefile	10 Aug 2023 20:44:37 -0000	1.9
+++ tests/lib/libc/misc/Makefile	10 Nov 2025 08:06:19 -0000
@@ -7,6 +7,11 @@
 TESTS_C+=	t_ubsan
 TESTS_CXX+=	t_ubsanxx
 
+.if ${MACHINE_ARCH} == "sparc64"
+TESTS_C+=	t_vis
+COPTS.t_vis.c+=	-mvis
+.endif
+
 .PATH:		${NETBSDSRCDIR}/common/lib/libc/misc
 SRCS.t_ubsan=	t_ubsan.c
 SRCS.t_ubsanxx=	t_ubsanxx.cpp
Index: tests/lib/libc/misc/t_vis.c
===================================================================
RCS file: tests/lib/libc/misc/t_vis.c
diff -N tests/lib/libc/misc/t_vis.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ tests/lib/libc/misc/t_vis.c	10 Nov 2025 08:06:19 -0000
@@ -0,0 +1,134 @@
+#include <atf-c.h>
+#include <string.h>
+#include <stdio.h>
+#include "vis_types.h"
+#include "vis_proto.h"
+
+ATF_TC(vis_test_addsub);
+
+ATF_TC_HEAD(vis_test_addsub, tc)
+{
+	atf_tc_set_md_var(tc, "descr", "Test 32-bit packed add/subtract");
+}
+
+ATF_TC_BODY(vis_test_addsub, tc)
+{
+	vis_d64 v1, v2, v3;
+	vis_f32 f1, f2;
+	vis_u32 u1, u2;
+
+	v1 = vis_to_double(8, 16);
+	v2 = vis_to_double(16, 8);
+
+	v3 = vis_fpadd32(v1, v2);
+
+	f1 = vis_read_lo(v3);
+	memcpy(&u1, &f1, sizeof(f1));
+	f2 = vis_read_hi(v3);
+	memcpy(&u2, &f2, sizeof(f2));
+
+	ATF_REQUIRE(u1 == 24 && u2 == 24);
+
+	v2 = vis_to_double(4, 4);
+	v3 = vis_fpsub32(v3, v2);
+
+	f1 = vis_read_lo(v3);
+	memcpy(&u1, &f1, sizeof(f1));
+	f2 = vis_read_hi(v3);
+	memcpy(&u2, &f2, sizeof(f2));
+
+	ATF_REQUIRE(u1 == 20 && u2 == 20);
+}
+
+ATF_TC(vis_test_bitwise);
+
+ATF_TC_HEAD(vis_test_bitwise, tc)
+{
+	atf_tc_set_md_var(tc, "descr", "Test 32-bit packed bitwise");
+}
+
+ATF_TC_BODY(vis_test_bitwise, tc)
+{
+	static vis_u8 testbytes1[8] = { 1, 0, 1, 1, 1, 0, 1, 1 };
+	static vis_u8 testbytes2[8] = { 1, 1, 0, 1, 1, 1, 0, 1 };
+	static vis_u8 test_and[8] = { 1, 0, 0, 1, 1, 0, 0, 1 };
+	static vis_u8 test_or[8] = { 1, 1, 1, 1, 1, 1, 1, 1 };
+	static vis_u8 test_zero[8] = { 0 };
+	static vis_u64 test_ones = 0xffffffffffffffff;
+	double v1, v2, v3;
+
+	memcpy(&v1, testbytes1, sizeof(v1));
+	memcpy(&v2, testbytes2, sizeof(v2));
+
+	v3 = vis_fand(v1, v2);
+
+	ATF_REQUIRE(memcmp(&v3, test_and, sizeof(v3)) == 0);
+
+	v3 = vis_fone();
+
+	ATF_REQUIRE(memcmp(&v3, &test_ones, sizeof(v3)) == 0);
+
+	v3 = vis_for(v1, v2);
+
+	ATF_REQUIRE(memcmp(&v3, test_or, sizeof(v3)) == 0);
+
+	v3 = vis_fzero();
+
+	ATF_REQUIRE(memcmp(&v3, test_zero, sizeof(v3)) == 0);
+}
+
+ATF_TC(vis_test_fcmpeq16);
+
+ATF_TC_HEAD(vis_test_fcmpeq16, tc)
+{
+	atf_tc_set_md_var(tc, "descr", "Test 16-bit packed compare");
+}
+
+ATF_TC_BODY(vis_test_fcmpeq16, tc)
+{
+	static vis_u16 testshort1[4] = { 16000, 16000, 16000, 16000 };
+	static vis_u16 testshort2[4] = { 32000, 16000, 32000, 16000 };
+	static vis_u16 testshort3[4] = { 48000, 48000, 48000, 48000 };
+	vis_d64 v1, v2, v3;
+
+	memcpy(&v1, testshort1, sizeof(v1));
+	memcpy(&v2, testshort2, sizeof(v2));
+	memcpy(&v3, testshort3, sizeof(v3));
+
+	ATF_REQUIRE((!!vis_fcmpeq16(v1, v2)) != 0);
+	ATF_REQUIRE((!!vis_fcmpeq16(v1, v3)) == 0);
+	ATF_REQUIRE((!!vis_fcmpne16(v1, v3)) != 0);
+}
+
+ATF_TC(vis_test_fcmpeq32);
+
+ATF_TC_HEAD(vis_test_fcmpeq32, tc)
+{
+	atf_tc_set_md_var(tc, "descr", "Test 32-bit packed compare");
+}
+
+ATF_TC_BODY(vis_test_fcmpeq32, tc)
+{
+	static vis_u32 testlong1[2] = { 16000, 16000 };
+	static vis_u32 testlong2[2] = { 32000, 16000 };
+	static vis_u32 testlong3[2] = { 48000, 48000 };
+	vis_d64 v1, v2, v3;
+
+	memcpy(&v1, testlong1, sizeof(v1));
+	memcpy(&v2, testlong2, sizeof(v2));
+	memcpy(&v3, testlong3, sizeof(v3));
+
+	ATF_REQUIRE((!!vis_fcmpeq32(v1, v2)) != 0);
+	ATF_REQUIRE((!!vis_fcmpeq32(v1, v3)) == 0);
+	ATF_REQUIRE((!!vis_fcmpne32(v1, v3)) != 0);
+}
+
+ATF_TP_ADD_TCS(tp) 
+{
+	ATF_TP_ADD_TC(tp, vis_test_addsub);
+	ATF_TP_ADD_TC(tp, vis_test_bitwise);
+	ATF_TP_ADD_TC(tp, vis_test_fcmpeq16);
+	ATF_TP_ADD_TC(tp, vis_test_fcmpeq32);
+
+	return atf_no_error();
+}

Follow-Ups:
- Re: Implementation of VIS C API
  - From: Sad Clouds
- Re: Implementation of VIS C API
  - From: Magnus Lindholm

Prev by Date: Re: Dillo + Netsurf broken fonts (sparc64)
Next by Date: Re: Implementation of VIS C API
Previous by Thread: Re: Dillo + Netsurf broken fonts (sparc64)
Next by Thread: Re: Implementation of VIS C API
Indexes:

Home | Main Index | Thread Index | Old Index