Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[xsrc/trunk]: xsrc/external/mit/xf86-video-suncg14/dist/src add another Copy8...
details: https://anonhg.NetBSD.org/xsrc/rev/c53a9b59fd2c
branches: trunk
changeset: 10828:c53a9b59fd2c
user: macallan <macallan%NetBSD.org@localhost>
date: Fri Dec 10 18:25:43 2021 +0000
description:
add another Copy8() variant:
- supports unaligned source and destination
- uses all 32bit accesses
- supports copies up to 124 pixels wide so an entire line fits into registers
and we can ignore x direction
... mostly an exercise in learning how to use the funnel shifter
TODO:
- skip the funnel shifter if source and destination are aligned
- skip fb reads where possible, like straight GXcopy
diffstat:
external/mit/xf86-video-suncg14/dist/src/cg14_accel.c | 144 +++++++++++++++++-
1 files changed, 136 insertions(+), 8 deletions(-)
diffs (179 lines):
diff -r de2ae8f2d5cc -r c53a9b59fd2c external/mit/xf86-video-suncg14/dist/src/cg14_accel.c
--- a/external/mit/xf86-video-suncg14/dist/src/cg14_accel.c Thu Dec 09 17:29:14 2021 +0000
+++ b/external/mit/xf86-video-suncg14/dist/src/cg14_accel.c Fri Dec 10 18:25:43 2021 +0000
@@ -1,4 +1,4 @@
-/* $NetBSD: cg14_accel.c,v 1.21 2021/12/09 17:29:14 christos Exp $ */
+/* $NetBSD: cg14_accel.c,v 1.22 2021/12/10 18:25:43 macallan Exp $ */
/*
* Copyright (c) 2013 Michael Lorenz
* All rights reserved.
@@ -405,6 +405,114 @@
}
}
+/* up to 124 pixels so direction doesn't matter, unaligned, ROP */
+static void
+CG14Copy8_short_rop(Cg14Ptr p, int srcstart, int dststart, int w, int h, int srcpitch, int dstpitch)
+{
+ int saddr, daddr, pre, dist, wrds, swrds, spre, sreg, restaddr, post;
+#ifdef DEBUG
+ int taddr = 4 + dstpitch * 50;
+#endif
+ uint32_t lmask, rmask;
+ ENTER;
+
+ pre = dststart & 3;
+ lmask = 0xffffffff >> pre;
+ spre = srcstart & 3;
+ /*
+ * make sure we count all the words needed to cover the destination
+ * line, covering potential partials on both ends
+ */
+ wrds = (w + pre + 3) >> 2;
+ swrds = (w + spre + 3) >> 2;
+
+ if (spre < pre) {
+ dist = 32 - (pre - spre) * 8;
+ sreg = 9;
+ } else {
+ dist = (spre - pre) * 8;
+ sreg = 8;
+ }
+
+ /*
+ * mask out trailing pixels to avoid partial writes
+ */
+ post = (dststart + w) & 3;
+ rmask = ~(0xffffffff >> (post * 8));
+ write_sx_reg(p, SX_QUEUED(7), rmask);
+ write_sx_reg(p, SX_QUEUED(6), ~rmask);
+
+ DPRINTF(X_ERROR, "%s %d %d, %d %d %08x %d %d %d %d %08x\n", __func__,
+ w, h, spre, pre, lmask, dist, sreg, wrds, post, rmask);
+
+ /* mask out the leading pixels in dst by using a mask and ROP */
+ write_sx_reg(p, SX_ROP_CONTROL, (p->last_rop & 0xf0) | 0xa);
+ write_sx_reg(p, SX_QUEUED(R_MASK), 0xffffffff);
+
+ saddr = srcstart & ~3;
+ daddr = dststart & ~3;
+
+ /* TODO:
+ * - special case dist == 0 where we can skip the funnel shifter
+ * and only need to deal with leading / trailing garbage
+ * - skip reading the fb where we can get away with it, for example
+ * GXcopy, where we only need to read the destination for partials,
+ * everything in between is straight copy
+ */
+ while (h > 0) {
+ write_sx_io(p, daddr & ~7, SX_LD(80, wrds - 1, daddr & 7));
+ write_sx_io(p, saddr & ~7, SX_LD(sreg, swrds - 1, saddr & 7));
+ if (wrds > 15) {
+ write_sx_reg(p, SX_INSTRUCTIONS, SX_FUNNEL_I(8, dist, 40, 15));
+ write_sx_reg(p, SX_INSTRUCTIONS, SX_FUNNEL_I(24, dist, 56, wrds - 16));
+ /* shifted source pixels are now at register 40+ */
+ if (pre != 0) {
+ /* mask out leading junk */
+ write_sx_reg(p, SX_QUEUED(R_MASK), lmask);
+ write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(40, 80, 8, 0));
+ write_sx_reg(p, SX_QUEUED(R_MASK), 0xffffffff);
+ write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(41, 81, 9, 14));
+ } else {
+ write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(40, 80, 8, 15));
+ }
+ write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(56, 96, 24, wrds - 16));
+ } else {
+ write_sx_reg(p, SX_INSTRUCTIONS, SX_FUNNEL_I(8, dist, 40, wrds));
+
+ if (pre != 0) {
+ /* mask out leading junk */
+ write_sx_reg(p, SX_QUEUED(R_MASK), lmask);
+ write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(40, 80, 8, 0));
+ write_sx_reg(p, SX_QUEUED(R_MASK), 0xffffffff);
+ write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(41, 81, 9, wrds));
+ } else {
+ write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(40, 80, 8, wrds));
+ }
+ }
+ if (post != 0) {
+ /*
+ * if the last word to be written out is a partial we
+ * mask out the leftovers and replace them with
+ * background pixels
+ * we could pull the same ROP * mask trick as we do on
+ * the left end but it's less annoying this way and
+ * the instruction count is the same
+ */
+ write_sx_reg(p, SX_INSTRUCTIONS, SX_ANDS(7 + wrds, 7, 5, 0));
+ write_sx_reg(p, SX_INSTRUCTIONS, SX_ANDS(79 + wrds, 6, 4, 0));
+ write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(5, 4, 7 + wrds, 0));
+ }
+#ifdef DEBUG
+ write_sx_io(p, taddr & ~7, SX_ST(40, wrds - 1, taddr & 7));
+ taddr += dstpitch;
+#endif
+ write_sx_io(p, daddr & ~7, SX_ST(8, wrds - 1, daddr & 7));
+ saddr += srcpitch;
+ daddr += dstpitch;
+ h--;
+ }
+}
+
static void
CG14Copy8(PixmapPtr pDstPixmap,
int srcX, int srcY, int dstX, int dstY, int w, int h)
@@ -427,13 +535,6 @@
srcstart = srcX + (srcpitch * srcY) + srcoff;
dststart = dstX + (dstpitch * dstY) + dstoff;
- if ((p->xdir < 0) && (srcoff == dstoff) && (srcY == dstY)) {
- srcstart += (w - 32);
- dststart += (w - 32);
- xinc = -32;
- } else
- xinc = 32;
-
if (p->ydir < 0) {
srcstart += (h - 1) * srcpitch;
dststart += (h - 1) * dstpitch;
@@ -443,6 +544,32 @@
srcinc = srcpitch;
dstinc = dstpitch;
}
+
+ /*
+ * this copies up to 124 pixels wide in one go, so horizontal
+ * direction / overlap don't matter
+ * uses all 32bit accesses and funnel shifter for unaligned copies
+ */
+ if ((w < 125) && (w > 8)) {
+ CG14Copy8_short_rop(p, srcstart, dststart, w, h, srcinc, dstinc);
+ return;
+ }
+
+ /*
+ * only invert x direction if absolutely necessary, it's a pain to
+ * go backwards on SX so avoid as much as possible
+ */
+ if ((p->xdir < 0) && (srcoff == dstoff) && (srcY == dstY)) {
+ srcstart += (w - 32);
+ dststart += (w - 32);
+ xinc = -32;
+ } else
+ xinc = 32;
+
+ /*
+ * for aligned copies we can go all 32bit and avoid VRAM reads in the
+ * most common case
+ */
if (((srcstart & 3) == (dststart & 3)) && (xinc > 0)) {
switch (p->last_rop) {
case 0xcc:
@@ -453,6 +580,7 @@
}
return;
}
+
if (p->last_rop == 0xcc) {
/* plain old copy */
if ( xinc > 0) {
Home |
Main Index |
Thread Index |
Old Index