Subject: More on rcons performance woes: pmax profiling
To: None <tech-kern@netbsd.org>
From: Jonathan Stone <jonathan@DSG.Stanford.EDU>
List: tech-kern
Date: 03/02/1999 01:07:19
I've been kernel-profiling rcons on a pmax. The machine is a 5000/150
(50MHz r4000, 12.5MHz 32-bit TC bus) with PMAGB-B sfb" 1280x1024x8-bit board.

The "benchmark' I've been using is
    head -1000  /usr/share/misc/termcap

to the rcons device; that takes ~7 minutes, catting the whole thing
takes too long.

I built a kernel with DEBUG=-g to be sure and get static symbols. Flat
profiling data shows conclusively taht for this test, on this
hardware, raster_op_noclip() calls to do scrolling are the bottleneck.
i see approximately 1 call to raster_op_noclip() per line cat'ted, and
the call to raster_blit() from that function takes 90% of the system
time, which is over 99% of walltime.

I tried turning on BCOPY_FASTER to see if it helped. But for
"performance" reasons, pmax rcons only uses 80 columns, so the
operation cant be done as a single bcopy() anyway.

I reworked the FASTER_BCOPY() case for 8x8 to blit the leftmost subset
of a scanline in one (non-overlapping) bcopy operation per rasterline.
That only won about 2%.  The pmax kernel bcopy is fairly well tuned,
so I'm wondering if we're approaching bus-bandwidth limits for
rasterop-based designs.  

Using only 80 columns and an 8-pixel-wide cell, that's 640 bytes or
160 32-bit words per line, both read and written. With 1024 lines to
scroll, that's 320K word reads+writes.  IIRC the pmax hardware isn't
smart enough to merge contiguous CPU accesses to TC space into a
burst; only DMA. At 12 TC cycles [*] per read/operation thats about
80% utilization of the bus.

[*] a guess, we dont have docs on this hw and I dont have a scope at
home. But IIRC, uncached accesses to motherboard devices take 7 cpu
cycles on a 25MHz CPU.

That leaves me wondering what the impact of `wscons' will be on
workstation hardware whcih doesnt have a hardware `text mode'.  Maybe
we'd be better off keeping a char-cell image of the display and
updating it by repainting rather than blitting. But for displays with
blit hardware, supporting that hardware and using it in the
graphics-console code could be faster yet.


How will this impact wscons on a pmax? Anyone got ideas either way?



Here's some profiling data with the patch (below) applied, for one
complete run sending 1000 line of termcap to rcons.


Flat profile:

Each sample counts as 0.00390625 seconds.
  %   cumulative   self              self     total           
 time   seconds   seconds    calls  us/call  us/call  name    
 89.55    382.80   382.80   990390   386.52   386.52  memmove
  3.11    396.08    13.28    49097   270.52  8067.35  raster_op_noclip
  3.10    409.34    13.26                             kernel_text
  0.42    411.12     1.79                             __mcount
  0.32    412.48     1.36     2944   461.76   461.76  raster_op_nosrc_noclip
  0.10    412.92     0.44                             _mcount
  0.08    413.26     0.34   137661     2.47   119.87  interrupt
  0.06    413.53     0.27     1115   238.24   246.88  ast
  0.03    413.65     0.12   319873     0.36     0.36  _splx
  0.02    413.75     0.10     9384    10.82 40770.98  syscall
  0.02    413.82     0.07                             mips3_proc_trampoline


[... rest elided]


And last, the gross patch I've been using to get counters on the
problem. (and DONT turn on rop_noisy unless you have syslog running!)


--- raster_op.c	1999/01/11 12:18:55	1.1
+++ raster_op.c	1999/03/02 08:09:07
@@ -61,12 +61,27 @@
  */
 
 #include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/syslog.h>
 #ifdef _KERNEL
 #include <dev/rcons/raster.h>
 #else
 #include "raster.h"
 #endif
 
+
+/* debugging: patchable logging of ops */
+extern int rop_noisy;
+int rop_noisy = 0;
+
+void set_rop_noisy(void); void set_rop_quiet(void);
+void set_rop_noisy(void)	{  rop_noisy = 1; }
+void set_rop_quiet(void)	{ rop_noisy = 0; }
+
+extern int rop_tries, rop_mine, rop_bcopy, rop_slow;
+int rop_tries, rop_mine, rop_bcopy, rop_slow;
+
+
 /* CONFIGURE: To save on executable size, you can configure out the seldom-used
 ** logical operations.  With this variable set, the only operations implemented
 ** are: RAS_SRC, RAS_CLEAR, RAS_SET, RAS_INVERT, RAS_XOR, RAS_INVERTSRC.
@@ -82,7 +97,9 @@
 ** on overlapping copies.  And on a 4.1.1 SPARC, bcopy() is about 2/3rds
 ** as fast on backwards overlaps.  So, only define this if your bcopy is ok.
 */
+#if 0
 #undef BCOPY_FASTER
+#endif
 
 /* End of configurable definitions. */
 
@@ -639,7 +656,7 @@
 	    if ( op == RAS_SRC && src->width == w && dst->width == w &&
 		 src->linelongs == dst->linelongs && src->linelongs == w >> 5 )
 		{
-		bcopy(
+		ovbcopy(
 		    (char*) srclin1, (char*) dstlin1,
 		    h * src->linelongs * sizeof(u_int32_t) );
 		return 0;
@@ -842,16 +859,36 @@
 	dstlin1 = RAS_ADDR( dst, dx, dy );
 
 #ifdef BCOPY_FASTER
+#if 1
+	if (rop_noisy)
+	  addlog("8x8 rop_noclip: op %d w %d sw %d dw %d sl %d dl %d\n",
+	       op, w, src->width,dst->width, 
+	       src->linelongs, dst->linelongs);
+#endif	
 	/* Special-case full-width to full-width copies. */
-	if ( op == RAS_SRC && src->width == w && dst->width == w &&
-	     src->linelongs == dst->linelongs && src->linelongs == w >> 2 )
+	if ( op == RAS_SRC && src->width == dst->width && src->width >= w &&
+	     src->linelongs == dst->linelongs && src->linelongs >= w >> 2 )
 	    {
-	    bcopy( (char*) srclin1, (char*) dstlin1,
-		   h * src->linelongs * sizeof(u_int32_t) );
+	        rop_tries++;
+	    	if (src->width == w && src->linelongs == w >> 2) {
+			rop_bcopy++;
+			ovbcopy( (char*) srclin1, (char*) dstlin1,
+			     h * src->linelongs * sizeof(u_int32_t) );
+		} else {
+			int i;
+			int linelen = src->linelongs;
+		        rop_mine++;
+			for (i = 0; i < h; i++) {
+			  ovbcopy( (char*) srclin1, (char*) dstlin1, w);
+			  srclin1 += linelen;
+			  dstlin1 += linelen;
+			}
+		}
 	    return 0;
 	    }
 #endif /*BCOPY_FASTER*/
 
+	rop_slow++;
 	srcleftignore = ( sx & 3 ) * 8;
 	srclongs = ( srcleftignore + w * 8 + 31 ) >> 5;
 	srcrightignore = ( srclongs * 32 - w * 8 - srcleftignore ) & 31;
@@ -859,6 +896,12 @@
 	dstlongs = ( dstleftignore + w * 8 + 31 ) >> 5;
 	dstrightignore = ( dstlongs * 32 - w * 8 - dstleftignore ) & 31;
 
+#if 1
+	if (rop_noisy)
+	addlog("slow: src ig %d l %d ri %d ig dst %d l %d ri %d\n",
+	       srcleftignore, srclongs, srcrightignore,
+	       dstleftignore, dstlongs, dstrightignore);
+#endif	
 	return raster_blit(
 	    src, srclin1, srcleftignore, srcrightignore, srclongs,
 	    dst, dstlin1, dstleftignore, dstrightignore, dstlongs, h, op );