Subject: port-alpha/12102: TGA console problems
To: None <gnats-bugs@gnats.netbsd.org>
From: None <cpg@aladdin.de>
List: netbsd-bugs
Date: 02/01/2001 14:34:24
>Number:         12102
>Category:       port-alpha
>Synopsis:       TGA console problems
>Confidential:   no
>Severity:       non-critical
>Priority:       medium
>Responsible:    port-alpha-maintainer
>State:          open
>Class:          sw-bug
>Submitter-Id:   net
>Arrival-Date:   Thu Feb 01 14:37:00 PST 2001
>Closed-Date:
>Last-Modified:
>Originator:     Christian Groessler
>Release:        current as of 01-Feb-2001
>Organization:
	
>Environment:
	
Architecture: alpha
Machine: alpha
>Description:
	The TGA console has problems when it has to shift display data
to the right or to the left (e.g. when inserting or deleting chars in
a command line).
The color depth is 8 bits on the problem system here. (I cannot check
other systems.)
>How-To-Repeat:
	E.g. with bash: type some chars in the command line then go
back to the middle of them and insert additional chars. Parts of the
line will turn into unreadable garbage.
>Fix:
	The problem is in the tga_rop_vtov() function
(sys/dev/pci/tga.c). It uses TGA acceleration functions which work on
multiples of 64 bytes. So there is a problem if the # of bytes to move
is not divisible by 64. The current code always uses 4 of those TGA
ops in a row (loop unrolled), giving a real granularity of 256 bytes.

The attached patch changes this and uses loop-unrolled ops until the
remaining bytes to move are < 256, then uses single ops until < 64 and
uses the CPU to move the last bytes.

here's the patch:

Index: tga.c
===================================================================
RCS file: /net/swamp/zeug/netbsd-rsync/main/syssrc/sys/dev/pci/tga.c,v
retrieving revision 1.30
diff -p -u -r1.30 tga.c
--- tga.c	2000/12/28 22:59:15	1.30
+++ tga.c	2001/02/01 22:21:09
@@ -1000,10 +1000,11 @@ tga_rop_vtov(dst, dx, dy, w, h, rop, src
 	int sx, sy;
 {
 	struct tga_devconfig *dc = (struct tga_devconfig *)dst->ri_hw;
-	int srcb, dstb;
-	int x, y;
-	int xstart, xend, xdir, xinc;
+	int srcb, dstb, tga_srcb, tga_dstb;
+	int x, y, wb;
+	int xstart, xend, xdir;
 	int ystart, yend, ydir, yinc;
+	int xleft, lastx, lastleft;
 	int offset = 1 * dc->dc_tgaconf->tgac_vvbr_units;
 
 	/*
@@ -1018,6 +1019,7 @@ tga_rop_vtov(dst, dx, dy, w, h, rop, src
 		return -1;
 	}
 
+        wb = w * (dst->ri_depth / 8);
 	if (sy >= dy) {
 		ystart = 0;
 		yend = h;
@@ -1027,45 +1029,135 @@ tga_rop_vtov(dst, dx, dy, w, h, rop, src
 		yend = 0;
 		ydir = -1;
 	}
-	if (sx >= dx) {
+	if (sx >= dx) {      /* moving to the left */
 		xstart = 0;
-		xend = w * (dst->ri_depth / 8);
+		xend = w * (dst->ri_depth / 8) - 4;
 		xdir = 1;
-	} else {
-		xstart = w * (dst->ri_depth / 8);
+	} else {             /* moving to the right */
+		xstart = wb - ( wb >= 4*64 ? 4*64 : wb >= 64 ? 64 : 4 );
 		xend = 0;
 		xdir = -1;
 	}
-	xinc = xdir * 4 * 64;
+#define XINC4   4
+#define XINC64  64
+#define XINC256 (64*4)
 	yinc = ydir * dst->ri_stride;
 	ystart *= dst->ri_stride;
 	yend *= dst->ri_stride;
-	srcb = offset + (sy + src->ri_yorigin) * src->ri_stride + 
-		            (sx + src->ri_xorigin) * (src->ri_depth/8);
-	dstb = offset + (dy + dst->ri_yorigin) * dst->ri_stride + 
-		            (dx + dst->ri_xorigin ) * (dst->ri_depth/8);
+
+	srcb = sy * src->ri_stride + sx * (src->ri_depth/8);
+	dstb = dy * dst->ri_stride + dx * (dst->ri_depth/8);
+	tga_srcb = offset + (sy + src->ri_yorigin) * src->ri_stride + 
+		(sx + src->ri_xorigin) * (src->ri_depth/8);
+	tga_dstb = offset + (dy + dst->ri_yorigin) * dst->ri_stride + 
+		(dx + dst->ri_xorigin) * (dst->ri_depth/8);
+
 	TGAWALREG(dc, TGA_REG_GMOR, 3, 0x0007); /* Copy mode */
-	TGAWALREG(dc, TGA_REG_GOPR, 3, map_rop[rop]);	/* Set up the op */
-	for (y = ystart; (ydir * y) < (ydir * yend); y += yinc) {
-		for (x = xstart; (xdir * x) < (xdir * xend); x += xinc) {
-		  /* XXX XXX Eight writes to different addresses should fill 
-		   * XXX XXX up the write buffers on 21064 and 21164 chips,
-		   * XXX XXX but later CPUs might have larger write buffers which
-		   * XXX XXX require further unrolling of this loop, or the
-		   * XXX XXX insertion of memory barriers.
-		   */
-			TGAWALREG(dc, TGA_REG_GCSR, 0, srcb + y + x + 3 * 64);
-			TGAWALREG(dc, TGA_REG_GCDR, 0, dstb + y + x + 3 * 64);
-			TGAWALREG(dc, TGA_REG_GCSR, 1, srcb + y + x + 2 * 64);
-			TGAWALREG(dc, TGA_REG_GCDR, 1, dstb + y + x + 2 * 64);
-			TGAWALREG(dc, TGA_REG_GCSR, 2, srcb + y + x + 1 * 64);
-			TGAWALREG(dc, TGA_REG_GCDR, 2, dstb + y + x + 1 * 64);
-			TGAWALREG(dc, TGA_REG_GCSR, 3, srcb + y + x + 0 * 64);
-			TGAWALREG(dc, TGA_REG_GCDR, 3, dstb + y + x + 0 * 64);
+	TGAWALREG(dc, TGA_REG_GOPR, 3, map_rop[rop]);   /* Set up the op */
+
+	/*
+	 * we have 3 sizes of pixels to move in X direction:
+	 * 4 * 64   (unrolled TGA ops)
+	 *     64   (single TGA op)
+	 *      4   (CPU, using long word)
+	 */
+
+	if (xdir == 1) {   /* move to the left */
+
+		for (y = ystart; (ydir * y) <= (ydir * yend); y += yinc) {
+
+			/* 4*64 byte chunks */
+			for (xleft = wb, x = xstart;
+			     x <= xend && xleft >= 4*64;
+			     x += XINC256, xleft -= XINC256) {
+
+				/* XXX XXX Eight writes to different addresses should fill 
+				 * XXX XXX up the write buffers on 21064 and 21164 chips,
+				 * XXX XXX but later CPUs might have larger write buffers which
+				 * XXX XXX require further unrolling of this loop, or the
+				 * XXX XXX insertion of memory barriers.
+				 */
+				TGAWALREG(dc, TGA_REG_GCSR, 0, tga_srcb + y + x + 0 * 64);
+				TGAWALREG(dc, TGA_REG_GCDR, 0, tga_dstb + y + x + 0 * 64);
+				TGAWALREG(dc, TGA_REG_GCSR, 1, tga_srcb + y + x + 1 * 64);
+				TGAWALREG(dc, TGA_REG_GCDR, 1, tga_dstb + y + x + 1 * 64);
+				TGAWALREG(dc, TGA_REG_GCSR, 2, tga_srcb + y + x + 2 * 64);
+				TGAWALREG(dc, TGA_REG_GCDR, 2, tga_dstb + y + x + 2 * 64);
+				TGAWALREG(dc, TGA_REG_GCSR, 3, tga_srcb + y + x + 3 * 64);
+				TGAWALREG(dc, TGA_REG_GCDR, 3, tga_dstb + y + x + 3 * 64);
+			}
+
+			/* 64 byte chunks */
+			for ( ; x <= xend && xleft >= 64;
+			      x += XINC64, xleft -= XINC64) {
+				TGAWALREG(dc, TGA_REG_GCSR, 0, tga_srcb + y + x + 0 * 64);
+				TGAWALREG(dc, TGA_REG_GCDR, 0, tga_dstb + y + x + 0 * 64);
+			}
+			lastx = x; lastleft = xleft;  /* remember for CPU loop */
+
+		}
+		TGAWALREG(dc, TGA_REG_GOPR, 0, 0x0003); /* op -> dst = src */
+		TGAWALREG(dc, TGA_REG_GMOR, 0, 0x0000); /* Simple mode */
+
+		for (y = ystart; (ydir * y) <= (ydir * yend); y += yinc) {
+			/* 4 byte granularity */
+			for (x = lastx, xleft = lastleft;
+			     x <= xend && xleft >= 4;
+			     x += XINC4, xleft -= XINC4) {
+				*(uint32_t *)(dst->ri_bits + dstb + y + x) =
+					*(uint32_t *)(dst->ri_bits + srcb + y + x);
+			}
+		}
+	}
+	else {    /* above move to the left, below move to the right */
+
+		for (y = ystart; (ydir * y) <= (ydir * yend); y += yinc) {
+
+			/* 4*64 byte chunks */
+			for (xleft = wb, x = xstart;
+			     x >= xend && xleft >= 4*64;
+			     x -= XINC256, xleft -= XINC256) {
+
+				/* XXX XXX Eight writes to different addresses should fill 
+				 * XXX XXX up the write buffers on 21064 and 21164 chips,
+				 * XXX XXX but later CPUs might have larger write buffers which
+				 * XXX XXX require further unrolling of this loop, or the
+				 * XXX XXX insertion of memory barriers.
+				 */
+				TGAWALREG(dc, TGA_REG_GCSR, 0, tga_srcb + y + x + 3 * 64);
+				TGAWALREG(dc, TGA_REG_GCDR, 0, tga_dstb + y + x + 3 * 64);
+				TGAWALREG(dc, TGA_REG_GCSR, 1, tga_srcb + y + x + 2 * 64);
+				TGAWALREG(dc, TGA_REG_GCDR, 1, tga_dstb + y + x + 2 * 64);
+				TGAWALREG(dc, TGA_REG_GCSR, 2, tga_srcb + y + x + 1 * 64);
+				TGAWALREG(dc, TGA_REG_GCDR, 2, tga_dstb + y + x + 1 * 64);
+				TGAWALREG(dc, TGA_REG_GCSR, 3, tga_srcb + y + x + 0 * 64);
+				TGAWALREG(dc, TGA_REG_GCDR, 3, tga_dstb + y + x + 0 * 64);
+			}
+
+			if (xleft) x += XINC256 - XINC64;
+
+			/* 64 byte chunks */
+			for ( ; x >= xend && xleft >= 64;
+			      x -= XINC64, xleft -= XINC64) {
+				TGAWALREG(dc, TGA_REG_GCSR, 0, tga_srcb + y + x + 0 * 64);
+				TGAWALREG(dc, TGA_REG_GCDR, 0, tga_dstb + y + x + 0 * 64);
+			}
+			if (xleft) x += XINC64 - XINC4;
+			lastx = x; lastleft = xleft;  /* remember for CPU loop */
+		}
+		TGAWALREG(dc, TGA_REG_GOPR, 0, 0x0003); /* op -> dst = src */
+		TGAWALREG(dc, TGA_REG_GMOR, 0, 0x0000); /* Simple mode */
+
+		for (y = ystart; (ydir * y) <= (ydir * yend); y += yinc) {
+			/* 4 byte granularity */
+			for (x = lastx, xleft = lastleft;
+			     x >= xend && xleft >= 4;
+			     x -= XINC4, xleft -= XINC4) {
+				*(uint32_t *)(dst->ri_bits + dstb + y + x) =
+					*(uint32_t *)(dst->ri_bits + srcb + y + x);
+			}
 		}
 	}
-	TGAWALREG(dc, TGA_REG_GOPR, 0, 0x0003); /* op -> dst = src */
-	TGAWALREG(dc, TGA_REG_GMOR, 0, 0x0000); /* Simple mode */
 	return 0;
 }
 
>Release-Note:
>Audit-Trail:
>Unformatted: