Subject: Re: bin/36092: algorithmic inefficiency in bin/test/test.c:t_lex
To: None <gnats-bugs@NetBSD.org, gnats-admin@netbsd.org,>
From: Christos Zoulas <christos@zoulas.com>
List: netbsd-bugs
Date: 03/27/2007 12:54:24
On Mar 27,  5:10am, mtbakerguy@yahoo.com (mtbakerguy@yahoo.com) wrote:
-- Subject: bin/36092: algorithmic inefficiency in bin/test/test.c:t_lex

| >Number:         36092
| >Category:       bin
| >Synopsis:       algorithmic inefficiency in bin/test/test.c:t_lex
| >Confidential:   no
| >Severity:       serious
| >Priority:       medium
| >Responsible:    bin-bug-people
| >State:          open
| >Class:          change-request
| >Submitter-Id:   net
| >Arrival-Date:   Tue Mar 27 05:10:00 +0000 2007
| >Originator:     Knotwell
| >Release:        3.1
| >Organization:
| home
| >Environment:
| NetBSD bradwkstn 3.1 NetBSD 3.1 (GENERIC.PROF) #7: Thu Dec 14 21:54:41 PST 2006  root@bradwkstn:/usr/src/usr/src/sys/arch/i386/compile/GENERIC.PROF i386
| >Description:
| While profiling /bin/sh for fun, I noticed several things:
| 
|    o the test builtin is *extremely* expensive with massive calls to strcmp in the t_lex function.  Specifically, it does linear walks of the operations to do a match.
|    o profiling /bin/sh is difficult because bin/sh/trap.c:exitshell uses _exit instead of exit (NB:  this isn't a bug but probably belongs in a README somewhere)
|    o arithmetic operations are also more expensive than they need to be since there are two calls made to ioctl (via isatty) when arith_lex_reset is called.  In my limited testing, I commented out the YY_NEW_FILE stanza and cut the number of syscalls substantially in a shell doing a large amount of arithmetic.
| 
| At this cost of some codespace as well as readability*, I created a substantially improved parser for the test operations that dropped strcmp and t_lex off the profiler (almost) entirely.  On a shell doing a significant number of boolean comparisons, I cut the run time by approximately 15%.
| 
| I've attached a diff below as well as a small shell that I used to validate the change (I noticed TEST.csh as I was preparing the diff).
| 
| *unfortunately, my change loses the data-driven clarity of the original code and it wasn't obvious to me how you'd get a data-driven solution nearly as efficient as mine.  There's probably a simpler and more elegant solution but code review's difficult on a home project.

How about this?

christos

Index: test.c
===================================================================
RCS file: /cvsroot/src/bin/test/test.c,v
retrieving revision 1.30
diff -u -u -r1.30 test.c
--- test.c	24 Sep 2006 13:24:08 -0000	1.30
+++ test.c	27 Mar 2007 16:53:02 -0000
@@ -95,50 +95,60 @@
 	PAREN
 };
 
-static struct t_op {
+struct t_op {
 	const char *op_text;
 	short op_num, op_type;
-} const ops [] = {
-	{"-r",	FILRD,	UNOP},
-	{"-w",	FILWR,	UNOP},
-	{"-x",	FILEX,	UNOP},
-	{"-e",	FILEXIST,UNOP},
-	{"-f",	FILREG,	UNOP},
-	{"-d",	FILDIR,	UNOP},
-	{"-c",	FILCDEV,UNOP},
-	{"-b",	FILBDEV,UNOP},
-	{"-p",	FILFIFO,UNOP},
-	{"-u",	FILSUID,UNOP},
-	{"-g",	FILSGID,UNOP},
-	{"-k",	FILSTCK,UNOP},
-	{"-s",	FILGZ,	UNOP},
-	{"-t",	FILTT,	UNOP},
-	{"-z",	STREZ,	UNOP},
-	{"-n",	STRNZ,	UNOP},
-	{"-h",	FILSYM,	UNOP},		/* for backwards compat */
-	{"-O",	FILUID,	UNOP},
-	{"-G",	FILGID,	UNOP},
-	{"-L",	FILSYM,	UNOP},
-	{"-S",	FILSOCK,UNOP},
-	{"=",	STREQ,	BINOP},
-	{"!=",	STRNE,	BINOP},
-	{"<",	STRLT,	BINOP},
-	{">",	STRGT,	BINOP},
-	{"-eq",	INTEQ,	BINOP},
-	{"-ne",	INTNE,	BINOP},
-	{"-ge",	INTGE,	BINOP},
-	{"-gt",	INTGT,	BINOP},
-	{"-le",	INTLE,	BINOP},
-	{"-lt",	INTLT,	BINOP},
-	{"-nt",	FILNT,	BINOP},
-	{"-ot",	FILOT,	BINOP},
-	{"-ef",	FILEQ,	BINOP},
+};
+
+static const struct t_op cop[] = {
 	{"!",	UNOT,	BUNOP},
-	{"-a",	BAND,	BBINOP},
-	{"-o",	BOR,	BBINOP},
 	{"(",	LPAREN,	PAREN},
 	{")",	RPAREN,	PAREN},
-	{0,	0,	0}
+	{"<",	STRLT,	BINOP},
+	{"=",	STREQ,	BINOP},
+	{">",	STRGT,	BINOP},
+};
+
+static const struct t_op cop2[] = {
+	{"!=",	STRNE,	BINOP},
+};
+
+static const struct t_op mop3[] = {
+	{"ef",	FILEQ,	BINOP},
+	{"eq",	INTEQ,	BINOP},
+	{"ge",	INTGE,	BINOP},
+	{"gt",	INTGT,	BINOP},
+	{"le",	INTLE,	BINOP},
+	{"lt",	INTLT,	BINOP},
+	{"ne",	INTNE,	BINOP},
+	{"nt",	FILNT,	BINOP},
+	{"ot",	FILOT,	BINOP},
+};
+
+static const struct t_op mop2[] = {
+	{"G",	FILGID,	UNOP},
+	{"L",	FILSYM,	UNOP},
+	{"O",	FILUID,	UNOP},
+	{"S",	FILSOCK,UNOP},
+	{"a",	BAND,	BBINOP},
+	{"b",	FILBDEV,UNOP},
+	{"c",	FILCDEV,UNOP},
+	{"d",	FILDIR,	UNOP},
+	{"e",	FILEXIST,UNOP},
+	{"f",	FILREG,	UNOP},
+	{"g",	FILSGID,UNOP},
+	{"h",	FILSYM,	UNOP},		/* for backwards compat */
+	{"k",	FILSTCK,UNOP},
+	{"n",	STRNZ,	UNOP},
+	{"o",	BOR,	BBINOP},
+	{"p",	FILFIFO,UNOP},
+	{"r",	FILRD,	UNOP},
+	{"s",	FILGZ,	UNOP},
+	{"t",	FILTT,	UNOP},
+	{"u",	FILSUID,UNOP},
+	{"w",	FILWR,	UNOP},
+	{"x",	FILEX,	UNOP},
+	{"z",	STREZ,	UNOP},
 };
 
 static char **t_wp;
@@ -390,26 +400,68 @@
 	}
 }
 
+#define VTOC(x)	(const unsigned char *)((const struct t_op *)x)->op_text
+
+static int
+compare1(const void *va, const void *vb)
+{
+	const unsigned char *a = va;
+	const unsigned char *b = VTOC(vb);
+
+	return a[0] - b[0];
+}
+
+static int
+compare2(const void *va, const void *vb)
+{
+	const unsigned char *a = va;
+	const unsigned char *b = VTOC(vb);
+	int z = a[0] - b[0];
+
+	return z ? z : (a[1] - b[1]);
+}
+
+static struct t_op const *
+findop(const char *s)
+{
+	if (s[0] == '-') {
+		if (s[1] == '\0')
+			return NULL;
+		if (s[2] == '\0')
+			return bsearch(s + 1, mop2, __arraycount(mop2),
+			    sizeof(*mop2), compare1);
+		else if (s[3] != '\0')
+			return NULL;
+		else
+			return bsearch(s + 1, mop3, __arraycount(mop3),
+			    sizeof(*mop3), compare2);
+	} else {
+		if (s[1] == '\0')
+			return bsearch(s, cop, __arraycount(cop), sizeof(*cop),
+			    compare1);
+		else if (strcmp(s, cop2[0].op_text) == 0)
+			return cop2;
+		else
+			return NULL;
+	}
+}
+
 static enum token
 t_lex(char *s)
 {
 	struct t_op const *op;
 
-	op = ops;
-
-	if (s == 0) {
+	if (s == NULL) {
 		t_wp_op = NULL;
 		return EOI;
 	}
-	while (op->op_text) {
-		if (strcmp(s, op->op_text) == 0) {
-			if ((op->op_type == UNOP && isoperand()) ||
-			    (op->op_num == LPAREN && *(t_wp+1) == 0))
-				break;
+
+	if ((op = findop(s)) != NULL) {
+		if (!((op->op_type == UNOP && isoperand()) ||
+		    (op->op_num == LPAREN && *(t_wp+1) == 0))) {
 			t_wp_op = op;
 			return op->op_num;
 		}
-		op++;
 	}
 	t_wp_op = NULL;
 	return OPERAND;
@@ -421,17 +473,12 @@
 	struct t_op const *op;
 	char *s, *t;
 
-	op = ops;
 	if ((s  = *(t_wp+1)) == 0)
 		return 1;
 	if ((t = *(t_wp+2)) == 0)
 		return 0;
-	while (op->op_text) {
-		if (strcmp(s, op->op_text) == 0)
-	    		return op->op_type == BINOP &&
-	    		    (t[0] != ')' || t[1] != '\0'); 
-		op++;
-	}
+	if ((op = findop(s)) != NULL)
+		return op->op_type == BINOP && (t[0] != ')' || t[1] != '\0'); 
 	return 0;
 }