netbsd-bugs: bin/36092: algorithmic inefficiency in bin/test/test.c:t

Subject: bin/36092: algorithmic inefficiency in bin/test/test.c:t_lex
To: None <gnats-admin@netbsd.org, netbsd-bugs@netbsd.org>
From: None <mtbakerguy@yahoo.com>
List: netbsd-bugs
Date: 03/27/2007 05:10:00
>Number:         36092
>Category:       bin
>Synopsis:       algorithmic inefficiency in bin/test/test.c:t_lex
>Confidential:   no
>Severity:       serious
>Priority:       medium
>Responsible:    bin-bug-people
>State:          open
>Class:          change-request
>Submitter-Id:   net
>Arrival-Date:   Tue Mar 27 05:10:00 +0000 2007
>Originator:     Knotwell
>Release:        3.1
>Organization:
home
>Environment:
NetBSD bradwkstn 3.1 NetBSD 3.1 (GENERIC.PROF) #7: Thu Dec 14 21:54:41 PST 2006  root@bradwkstn:/usr/src/usr/src/sys/arch/i386/compile/GENERIC.PROF i386
>Description:
While profiling /bin/sh for fun, I noticed several things:

   o the test builtin is *extremely* expensive with massive calls to strcmp in the t_lex function.  Specifically, it does linear walks of the operations to do a match.
   o profiling /bin/sh is difficult because bin/sh/trap.c:exitshell uses _exit instead of exit (NB:  this isn't a bug but probably belongs in a README somewhere)
   o arithmetic operations are also more expensive than they need to be since there are two calls made to ioctl (via isatty) when arith_lex_reset is called.  In my limited testing, I commented out the YY_NEW_FILE stanza and cut the number of syscalls substantially in a shell doing a large amount of arithmetic.

At this cost of some codespace as well as readability*, I created a substantially improved parser for the test operations that dropped strcmp and t_lex off the profiler (almost) entirely.  On a shell doing a significant number of boolean comparisons, I cut the run time by approximately 15%.

I've attached a diff below as well as a small shell that I used to validate the change (I noticed TEST.csh as I was preparing the diff).

*unfortunately, my change loses the data-driven clarity of the original code and it wasn't obvious to me how you'd get a data-driven solution nearly as efficient as mine.  There's probably a simpler and more elegant solution but code review's difficult on a home project.
>How-To-Repeat:

>Fix:
Index: test.c
===================================================================
RCS file: /cvsroot/src/bin/test/test.c,v
retrieving revision 1.26
diff -d -u -r1.26 test.c
--- test.c	10 Feb 2005 06:56:55 -0000	1.26
+++ test.c	27 Mar 2007 04:44:08 -0000
@@ -95,51 +95,12 @@
 	PAREN
 };
 
+
+#define MAXOPSIZE 3 /* strlen("-ne") */
 static struct t_op {
-	const char *op_text;
+	char op_text[MAXOPSIZE+1];
 	short op_num, op_type;
-} const ops [] = {
-	{"-r",	FILRD,	UNOP},
-	{"-w",	FILWR,	UNOP},
-	{"-x",	FILEX,	UNOP},
-	{"-e",	FILEXIST,UNOP},
-	{"-f",	FILREG,	UNOP},
-	{"-d",	FILDIR,	UNOP},
-	{"-c",	FILCDEV,UNOP},
-	{"-b",	FILBDEV,UNOP},
-	{"-p",	FILFIFO,UNOP},
-	{"-u",	FILSUID,UNOP},
-	{"-g",	FILSGID,UNOP},
-	{"-k",	FILSTCK,UNOP},
-	{"-s",	FILGZ,	UNOP},
-	{"-t",	FILTT,	UNOP},
-	{"-z",	STREZ,	UNOP},
-	{"-n",	STRNZ,	UNOP},
-	{"-h",	FILSYM,	UNOP},		/* for backwards compat */
-	{"-O",	FILUID,	UNOP},
-	{"-G",	FILGID,	UNOP},
-	{"-L",	FILSYM,	UNOP},
-	{"-S",	FILSOCK,UNOP},
-	{"=",	STREQ,	BINOP},
-	{"!=",	STRNE,	BINOP},
-	{"<",	STRLT,	BINOP},
-	{">",	STRGT,	BINOP},
-	{"-eq",	INTEQ,	BINOP},
-	{"-ne",	INTNE,	BINOP},
-	{"-ge",	INTGE,	BINOP},
-	{"-gt",	INTGT,	BINOP},
-	{"-le",	INTLE,	BINOP},
-	{"-lt",	INTLT,	BINOP},
-	{"-nt",	FILNT,	BINOP},
-	{"-ot",	FILOT,	BINOP},
-	{"-ef",	FILEQ,	BINOP},
-	{"!",	UNOT,	BUNOP},
-	{"-a",	BAND,	BBINOP},
-	{"-o",	BOR,	BBINOP},
-	{"(",	LPAREN,	PAREN},
-	{")",	RPAREN,	PAREN},
-	{0,	0,	0}
-};
+} tmpop;
 
 static char **t_wp;
 static struct t_op const *t_wp_op;
@@ -157,6 +118,7 @@
 static int newerf(const char *, const char *);
 static int olderf(const char *, const char *);
 static int equalf(const char *, const char *);
+static int findoperand(const char *txt,short *operand, short *operand_type);
 
 #if defined(SHELL)
 extern void error(const char *, ...) __attribute__((__noreturn__));
@@ -387,45 +349,118 @@
 static enum token
 t_lex(char *s)
 {
-	struct t_op const *op;
-
-	op = ops;
-
 	if (s == 0) {
 		t_wp_op = NULL;
 		return EOI;
 	}
-	while (op->op_text) {
-		if (strcmp(s, op->op_text) == 0) {
-			if ((op->op_type == UNOP && isoperand()) ||
-			    (op->op_num == LPAREN && *(t_wp+1) == 0))
-				break;
-			t_wp_op = op;
-			return op->op_num;
-		}
-		op++;
-	}
+       
+        if(findoperand(s,&tmpop.op_num,&tmpop.op_type) == 1) {
+            if((tmpop.op_type == UNOP && isoperand()) ||
+               (tmpop.op_num == LPAREN && *(t_wp+1) == 0))
+              goto out;
+
+            strcpy(tmpop.op_text,s); 
+            t_wp_op = &tmpop;
+            return tmpop.op_num;
+        }
+out:
 	t_wp_op = NULL;
 	return OPERAND;
 }
+static int
+findoperand(const char *txt,short *operand, short *operand_type) {
+    switch(*txt) {
+    default:
+        return(0); 
+    case '-': {
+        if(*(txt + 1) == '\0')
+            return(0);
+        if(*(txt + 2) == '\0') {
+            switch(*(txt+1)) {
+            default:
+                return(0);
+#define DASH2(c,op,optype) case c: *operand=op;*operand_type=optype;return(1); 
+            DASH2('a',BAND,BBINOP);
+            DASH2('o',BOR,BBINOP);
+            DASH2('r',FILRD,UNOP);
+            DASH2('w',FILWR,UNOP);
+            DASH2('x',FILEX,UNOP);
+            DASH2('e',FILEXIST,UNOP);
+            DASH2('f',FILREG,UNOP);
+            DASH2('d',FILDIR,UNOP);
+            DASH2('c',FILCDEV,UNOP);
+            DASH2('b',FILBDEV,UNOP);
+            DASH2('p',FILFIFO,UNOP);
+            DASH2('u',FILSUID,UNOP);
+            DASH2('g',FILSGID,UNOP);
+            DASH2('k',FILSTCK,UNOP);
+            DASH2('s',FILGZ,UNOP);
+            DASH2('t',FILTT,UNOP);
+            DASH2('z',STREZ,UNOP);
+            DASH2('n',STRNZ,UNOP);
+            DASH2('h',FILSYM,UNOP);
+            DASH2('O',FILUID,UNOP);
+            DASH2('G',FILGID,UNOP);
+            DASH2('L',FILSYM,UNOP);
+            DASH2('S',FILSOCK,UNOP);
+            }
+        }
+        if(*(txt + 3) == '\0') {
+#define DASH3(c1,c2,op,optype) if(*(txt + 1) == c1 && *(txt + 2) == c2) { \
+                                 *operand = op;*operand_type = optype; \
+                                 return(1); }
+            DASH3('e','q',INTEQ,BINOP);
+            DASH3('n','e',INTNE,BINOP);
+            DASH3('g','e',INTGE,BINOP);
+            DASH3('g','t',INTGE,BINOP);
+            DASH3('l','e',INTLE,BINOP);
+            DASH3('l','t',INTLT,BINOP);
+            DASH3('n','t',FILNT,BINOP);
+            DASH3('o','t',FILOT,BINOP);
+            DASH3('e','f',FILEQ,BINOP);
+            return(0);
+        }
+        return(0); /* the -NNN... case */
+    }
+#define SINGLECHAR(c,op,optype) case c: *operand = op;*operand_type = optype;return(1);
+    SINGLECHAR('(',LPAREN,PAREN);
+    SINGLECHAR(')',RPAREN,PAREN);
+    SINGLECHAR('=',STREQ,BINOP);
+    SINGLECHAR('<',STRLT,BINOP);
+    SINGLECHAR('>',STRGT,BINOP);
+
+    case '!':  /* ! or != */
+      switch(*(txt + 1)) {
+      default:
+          return(0);
+      case '=':
+          if(*(txt + 2) == '\0') {
+              *operand = STRNE;
+              *operand_type = BINOP;
+              return(1);
+          }
+          return(0);
+       case '\0':
+           *operand = UNOT;
+           *operand_type = BUNOP;
+           return(1);
+      }
+   }
+}
 
 static int
 isoperand(void)
 {
-	struct t_op const *op;
 	char *s, *t;
+        short op_,optype_;
 
-	op = ops;
 	if ((s  = *(t_wp+1)) == 0)
 		return 1;
 	if ((t = *(t_wp+2)) == 0)
 		return 0;
-	while (op->op_text) {
-		if (strcmp(s, op->op_text) == 0)
-	    		return op->op_type == BINOP &&
-	    		    (t[0] != ')' || t[1] != '\0'); 
-		op++;
-	}
+
+        if(findoperand(s,&op_,&optype_) == 1)
+             return(optype_ == BINOP && (t[0] != ')' || t[1] != '\0')); 
 	return 0;
 }
 
!#/bin/sh
dotest () {
   ./test $*
   if [ $? -eq 0 ]
   then
       echo "($*) passed"
       return
   fi
   echo "($*) failed"
   exit 1
}

touch /tmp/$$.tst
chmod a+x /tmp/$$.tst
dotest -x /tmp/$$.tst
chmod a-x /tmp/$$.tst
dotest ! -x /tmp/$$.tst
chmod a+w /tmp/$$.tst
dotest -w /tmp/$$.tst
chmod a+r /tmp/$$.tst
dotest -r /tmp/$$.tst
dotest -e /tmp/$$.tst
dotest ! -e /tmp/$$.fdasfdastst
dotest -f /tmp/$$.tst
dotest ! -d /tmp/$$.tst
dotest ! -c /tmp/$$.tst
dotest ! -b /tmp/$$.tst
dotest ! -p /tmp/$$.tst
dotest ! -u /tmp/$$.tst
dotest ! -g /tmp/$$.tst
dotest ! -k /tmp/$$.tst
dotest ! -h /tmp/$$.tst
dotest -O /tmp/$$.tst
dotest -G /tmp/$$.tst
dotest ! -L /tmp/$$.tst
dotest ! -S /tmp/$$.tst
dotest ! "equal" = "notequal"
dotest "equal" != "notequal"
dotest "equal" = "equal"
dotest "equal" \< "fqual"
dotest "equal" \> "dqual"
dotest -n "equal"
dotest -z ""
dotest 0 -eq 0
dotest 0 -ne 1
dotest 1 -ge 0
dotest 1 -gt 0
dotest 0 -le 1
dotest 0 -lt 1
dotest 1 -a 1
dotest ! ! 0 -a 1
dotest 1 -o 0
dotest ! 0 -o 0
dotest \( 0 -eq 0 -a 1 -ne 0 \)
rm /tmp/$$.tst