Subject: Re: Musings on ld.elf_so and symbol lookup caching
To: <>
From: David Laight <david@l8s.co.uk>
List: tech-userlevel
Date: 10/04/2002 15:44:25
> Another observation I've made is that, when a process is started, or a
> module is loaded, all of the objects loaded at the same time use
> exactly the same search order to look up symbols.
> 
> So, I was thinking that, rather than doing the `SymCache' per object,
> we could instead create one global cache to be used during the entire
> startup or module loading.  For simplicity, make it a simple,
> direct-mapped, non-chaining hash table, and fill it in lazily.  This
> should give a significant improvement over the `SymCache' code.
> 
> I'll probably get to this on Monday unless someone else does it.
> 

Yesterday I implemented code to set up a global symbol table for the
main program body.  I didn't do a 'lazy' fill, so the cost of
populating the table is too large to benefit small programs.
However, for large programs there is a big gain.

I've now merged the code with the latest sources (from anoncvs),
but I still want to tidy up the debug trace code a little further
(in particular, to add an option for either 'wall clock' timestamps
or 'cpu usage' ones).

In any case the current diff is below:

	David

Index: Makefile
===================================================================
RCS file: /cvsroot/basesrc/libexec/ld.elf_so/Makefile,v
retrieving revision 1.57
diff -u -r1.57 Makefile
--- Makefile	2002/09/27 21:37:50	1.57
+++ Makefile	2002/10/04 14:41:38
@@ -40,7 +40,6 @@
 CPPFLAGS+= -DRTLD_LOADER
 CPPFLAGS+= -D_RTLD_SOURCE
 #CPPFLAGS+= -DDEBUG
-#CPPFLAGS+= -DRTLD_DEBUG
 #CPPFLAGS+= -DRTLD_DEBUG_RELOC
 #DBG=	-g
 DBG=	-O3 -fomit-frame-pointer
Index: debug.c
===================================================================
RCS file: /cvsroot/basesrc/libexec/ld.elf_so/debug.c,v
retrieving revision 1.3
diff -u -r1.3 debug.c
--- debug.c	2002/05/26 00:02:07	1.3
+++ debug.c	2002/10/04 14:41:38
@@ -36,6 +36,8 @@
  */
 
 #include <sys/cdefs.h>
+#include <sys/resource.h>
+/* #include <sys/time.h> */
 #include <stdarg.h>
 
 #include "debug.h"
@@ -44,16 +46,51 @@
 #ifdef DEBUG
 int debug = 0;
 
+#define TIME_DIFF(new, old) (((new).tv_sec - (old).tv_sec) * 1000000 \
+			    + (new).tv_usec - (old).tv_usec)
+
 void
 debug_printf(const char *format, ...)
 {
-	if(debug) {
-		va_list ap;
+	int now;
+	struct rusage r;
+	static int last_time, epoch;
+	static int time_cost = ~0;
+	va_list ap;
+
+	if(!debug)
+		return;
+	if (debug & (DEBUG_TIME | DEBUG_TIME_REL)) {
+		getrusage( RUSAGE_SELF, &r );
+		now = (r.ru_utime.tv_sec + r.ru_stime.tv_sec) * 1000000
+			+ r.ru_utime.tv_usec + r.ru_stime.tv_usec;
+		if (time_cost == ~0) {
+			int i;
+			for (i = 16; --i;)
+			    getrusage( RUSAGE_SELF, &r );
+			last_time = now;
+			now = (r.ru_utime.tv_sec + r.ru_stime.tv_sec) * 1000000
+			    + r.ru_utime.tv_usec + r.ru_stime.tv_usec;
+			time_cost = (now - last_time)/16;
+		}
+		if (debug & DEBUG_TIME_REL)
+			xprintf(debug & DEBUG_TIME ? "+%d=" : "+%d:",
+				now - last_time - time_cost);
+		if (debug & DEBUG_TIME)
+			xprintf("%d: ", now - epoch - time_cost);
+	}
+
+	va_start(ap, format);
+	xvprintf(format, ap);
+	va_end(ap);
+	xprintf("\n");
 
-		va_start(ap, format);
-		xvprintf(format, ap);
-		va_end(ap);
-		xprintf("\n");
+	/* Try to exclude cost of printf from timestamps */
+	if (debug & (DEBUG_TIME | DEBUG_TIME_REL)) {
+		getrusage(RUSAGE_SELF, &r);
+		last_time = (r.ru_utime.tv_sec + r.ru_stime.tv_sec) * 1000000
+			+ r.ru_utime.tv_usec + r.ru_stime.tv_usec;
+		epoch += last_time - now + time_cost;
 	}
 }
 #endif
Index: debug.h
===================================================================
RCS file: /cvsroot/basesrc/libexec/ld.elf_so/debug.h,v
retrieving revision 1.5
diff -u -r1.5 debug.h
--- debug.h	2002/09/12 22:56:28	1.5
+++ debug.h	2002/10/04 14:41:39
@@ -41,18 +41,30 @@
 
 #ifdef DEBUG
 
+#define	DEBUG_DEFAULT	0x01
+#define	DEBUG_TIME	0x02		/* timestamp debug trace */
+#define	DEBUG_TIME_REL	0x04		/* relative times */
+
+#define	DEBUG_SYMBOL	0x10		/* symbol debug */
+#define	DEBUG_RELOC	0x20		/* relocation debug */
+
 extern void debug_printf __P((const char *, ...))
     __attribute__((__format__(__printf__, 1, 2)));
 extern int debug;
+
+#define dbg(a)		if (__predict_false(debug)) debug_printf a
+#define x_dbg(x,a)	if (__predict_false(debug & (x))) debug_printf a
+#define IF_DEBUG(x)	x
+
+#else 	/* DEBUG */
+
+#define dbg(a)		((void) 0)
+#define x_dbg(x,a)	((void) 0)
+#define IF_DEBUG(x)
+
+#endif	/* DEBUG */
 
-# define dbg(a)		debug_printf a
-#else 
-# define dbg(a)		((void) 0)
-#endif
-#ifdef RTLD_DEBUG_RELOC
-# define rdbg(a)	debug_printf a
-#else
-# define rdbg(a)	((void) 0)
-#endif
+#define sdbg(a)		x_dbg(DEBUG_SYMBOL, a)
+#define rdbg(a)		x_dbg(DEBUG_RELOC, a)
 
 #endif
Index: map_object.c
===================================================================
RCS file: /cvsroot/basesrc/libexec/ld.elf_so/map_object.c,v
retrieving revision 1.21
diff -u -r1.21 map_object.c
--- map_object.c	2002/09/27 19:48:24	1.21
+++ map_object.c	2002/10/04 14:41:42
@@ -45,8 +45,8 @@
 static int protflags __P((int));	/* Elf flags -> mmap protection */
 
 /*
- * Map a shared object into memory.  The argument is a file descriptor,
- * which must be open on the object and positioned at its beginning.
+ * Map a shared object into memory.
+ * The argument is a file descriptor, which must be open on the object.
  *
  * The return value is a pointer to a newly-allocated Obj_Entry structure
  * for the shared object.  Returns NULL on failure.
@@ -61,11 +61,12 @@
 	Elf_Ehdr	*ehdr;
 	Elf_Phdr	*phdr;
 	Elf_Phdr	*phlimit;
-	Elf_Phdr	*segs[2];
+	Elf_Phdr	 segs[2];
 	int		 nsegs;
-	Elf_Phdr	*phdyn;
-	Elf_Phdr	*phphdr;
-	Elf_Phdr	*phinterp;
+	Elf_Addr	 dyn_vaddr;
+	Elf_Addr	 ph_addr;
+	size_t		 ph_size;
+	Elf_Addr	 interp_addr;
 	caddr_t		 mapbase;
 	size_t		 mapsize;
 	Elf_Off		 base_offset;
@@ -79,11 +80,13 @@
 	caddr_t		 data_addr;
 	caddr_t		 gap_addr;
 	size_t		 gap_size;
+	Elf_Addr	 entry;
 #ifdef RTLD_LOADER
 	Elf_Addr	 clear_vaddr;
 	caddr_t		 clear_addr;
 	size_t		 nclear;
 #endif
+	int		 isdynamic;
 
 	ehdr = mmap(NULL, _rtld_pagesz, PROT_READ, MAP_FILE | MAP_SHARED, fd,
 	    (off_t)0);
@@ -133,31 +136,32 @@
 	phdr = (Elf_Phdr *) ((caddr_t)ehdr + ehdr->e_phoff);
 	phlimit = phdr + ehdr->e_phnum;
 	nsegs = 0;
-	phdyn = phphdr = phinterp = NULL;
+	interp_addr = ph_addr = dyn_vaddr = (Elf_Addr)~0;
 	while (phdr < phlimit) {
 		switch (phdr->p_type) {
 		case PT_INTERP:
-			phinterp = phdr;
+			interp_addr = phdr->p_vaddr;
 			break;
 
 		case PT_LOAD:
 			if (nsegs < 2)
-				segs[nsegs] = phdr;
+				segs[nsegs] = *phdr;
 			++nsegs;
 			break;
 
 		case PT_PHDR:
-			phphdr = phdr;
+			ph_addr = phdr->p_vaddr;
+			ph_size = phdr->p_memsz;
 			break;
 
 		case PT_DYNAMIC:
-			phdyn = phdr;
+			dyn_vaddr = phdr->p_vaddr;
 			break;
 		}
 
 		++phdr;
 	}
-	if (phdyn == NULL) {
+	if (dyn_vaddr == (Elf_Addr)~0) {
 		_rtld_error("%s: not dynamically linked", path);
 		goto bad;
 	}
@@ -179,10 +183,10 @@
 	 * and unmap the gaps left by padding to alignment.
 	 */
 
-	base_offset = round_down(segs[0]->p_offset);
-	base_vaddr = round_down(segs[0]->p_vaddr);
-	base_vlimit = round_up(segs[1]->p_vaddr + segs[1]->p_memsz);
-	text_vlimit = round_up(segs[0]->p_vaddr + segs[0]->p_memsz);
+	base_offset = round_down(segs[0].p_offset);
+	base_vaddr = round_down(segs[0].p_vaddr);
+	base_vlimit = round_up(segs[1].p_vaddr + segs[1].p_memsz);
+	text_vlimit = round_up(segs[0].p_vaddr + segs[0].p_memsz);
 	mapsize = base_vlimit - base_vaddr;
 
 #ifdef RTLD_LOADER
@@ -190,24 +194,30 @@
 #else
 	base_addr = NULL;
 #endif
+	entry = ehdr->e_entry;
+	isdynamic = ehdr->e_type == ET_DYN;
 
-	mapbase = mmap(base_addr, mapsize, protflags(segs[0]->p_flags),
+	/* unmap first page now so we don't have a double mapping */
+	munmap(ehdr, _rtld_pagesz);
+	ehdr = 0;
+
+	mapbase = mmap(base_addr, mapsize, protflags(segs[0].p_flags),
 		       MAP_FILE | MAP_PRIVATE, fd, base_offset);
 	if (mapbase == MAP_FAILED) {
 		_rtld_error("mmap of entire address space failed: %s",
 		    xstrerror(errno));
-		goto bad;
+		return NULL;
 	}
 
 	base_addr = mapbase;
 
 	/* Overlay the data segment onto the proper region. */
-	data_offset = round_down(segs[1]->p_offset);
-	data_vaddr = round_down(segs[1]->p_vaddr);
-	data_vlimit = round_up(segs[1]->p_vaddr + segs[1]->p_filesz);
+	data_offset = round_down(segs[1].p_offset);
+	data_vaddr = round_down(segs[1].p_vaddr);
+	data_vlimit = round_up(segs[1].p_vaddr + segs[1].p_filesz);
 	data_addr = mapbase + (data_vaddr - base_vaddr);
 	if (mmap(data_addr, data_vlimit - data_vaddr,
-		 protflags(segs[1]->p_flags),
+		 protflags(segs[1].p_flags),
 		 MAP_FILE | MAP_PRIVATE | MAP_FIXED, fd, data_offset)
 	    == MAP_FAILED) {
 		_rtld_error("mmap of data failed: %s", xstrerror(errno));
@@ -216,7 +226,7 @@
 
 	/* Overlay the bss segment onto the proper region. */
 	if (mmap(mapbase + data_vlimit - base_vaddr, base_vlimit - data_vlimit,
-		 protflags(segs[1]->p_flags),
+		 protflags(segs[1].p_flags),
 		 MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0)
 	    == MAP_FAILED) {
 		_rtld_error("mmap of bss failed: %s", xstrerror(errno));
@@ -234,7 +244,7 @@
 
 #ifdef RTLD_LOADER
 	/* Clear any BSS in the last page of the data segment. */
-	clear_vaddr = segs[1]->p_vaddr + segs[1]->p_filesz;
+	clear_vaddr = segs[1].p_vaddr + segs[1].p_filesz;
 	clear_addr = mapbase + (clear_vaddr - base_vaddr);
 	if ((nclear = data_vlimit - clear_vaddr) > 0)
 		memset(clear_addr, 0, nclear);
@@ -250,27 +260,26 @@
 	}
 	obj->mapbase = mapbase;
 	obj->mapsize = mapsize;
-	obj->textsize = round_up(segs[0]->p_vaddr + segs[0]->p_memsz) -
+	obj->textsize = round_up(segs[0].p_vaddr + segs[0].p_memsz) -
 	    base_vaddr;
 	obj->vaddrbase = base_vaddr;
 	obj->relocbase = mapbase - base_vaddr;
-	obj->dynamic = (Elf_Dyn *)(obj->relocbase + phdyn->p_vaddr);
-	if (ehdr->e_entry != 0)
-		obj->entry = (caddr_t)(obj->relocbase + ehdr->e_entry);
-	if (phphdr != NULL) {
-		obj->phdr = (const Elf_Phdr *)
-		    (obj->relocbase + phphdr->p_vaddr);
-		obj->phsize = phphdr->p_memsz;
-	}
-	if (phinterp != NULL)
-		obj->interp = (const char *) (obj->relocbase + phinterp->p_vaddr);
-	obj->isdynamic = ehdr->e_type == ET_DYN;
+	obj->dynamic = (Elf_Dyn *)(obj->relocbase + dyn_vaddr);
+	if (entry != 0)
+		obj->entry = (caddr_t)(obj->relocbase + entry);
+	if (ph_addr != (Elf_Addr)~0) {
+		obj->phdr = (const Elf_Phdr *)(obj->relocbase + ph_addr);
+		obj->phsize = ph_size;
+	}
+	if (interp_addr != (Elf_Addr)~0)
+		obj->interp = (const char *)(obj->relocbase + interp_addr);
+	obj->isdynamic = isdynamic;
 
-	munmap(ehdr, _rtld_pagesz);
 	return obj;
 
 bad2:
 	munmap(mapbase, mapsize);
+	return NULL;
 bad:
 	munmap(ehdr, _rtld_pagesz);
 	return NULL;
Index: paths.c
===================================================================
RCS file: /cvsroot/basesrc/libexec/ld.elf_so/paths.c,v
retrieving revision 1.18
diff -u -r1.18 paths.c
--- paths.c	2002/09/28 05:00:27	1.18
+++ paths.c	2002/10/04 14:41:45
@@ -78,18 +78,19 @@
 {
 	char *cp;
 	Search_Path *path;
+	int len = ep - bp;
 
-	if (bp == NULL || bp == ep || *bp == '\0')
+	if (bp == NULL || len == 0 || *bp == '\0')
 		return path_p;
 
-	if (_rtld_find_path(*head_p, bp, ep - bp) != NULL)
+	if (_rtld_find_path(*head_p, bp, len) != NULL)
 		return path_p;
 
 	path = NEW(Search_Path);
-	path->sp_pathlen = ep - bp;
-	cp = xmalloc(path->sp_pathlen + 1);
-	strncpy(cp, bp, path->sp_pathlen);
-	cp[path->sp_pathlen] = '\0';
+	path->sp_pathlen = len;
+	cp = xmalloc(len + 1);
+	memcpy(cp, bp, len);
+	cp[len] = '\0';
 	path->sp_path = cp;
 	path->sp_next = (*path_p);
 	(*path_p) = path;
@@ -198,17 +199,22 @@
 	if (bp == NULL || bp == ep || *bp == '\0')
 		return;
 
+	i = ep - bp;
+	hwptr = xmalloc(sizeof(*hwptr) + i + 1);
+	memset(hwptr, 0, sizeof(*hwptr));
+	ptr = (void *)(hwptr + 1);
+	hwptr->name = ptr;
+	memcpy(ptr, bp, i);
+	ptr[i] = 0;
+	bp = ptr;
+
 	dbg((" processing mapping \"%s\"", bp));
 
 	if ((ptr = strsep(&bp, WS)) == NULL)
-		return;
+		goto cleanup;
 
 	dbg((" library \"%s\"", ptr));
 
-	hwptr = xmalloc(sizeof(*hwptr));
-	memset(hwptr, 0, sizeof(*hwptr));
-	hwptr->name = xstrdup(ptr);
-
 	while ((ptr = strsep(&bp, WS)) != NULL)
 		if (*ptr != '\0')
 			break;
@@ -313,8 +319,6 @@
 	return;
 
 cleanup:
-	if (hwptr->name)
-		free(hwptr->name);
 	free(hwptr);
 }
 
@@ -329,12 +333,13 @@
 	struct stat st;
 	size_t sz;
 	Search_Path **head_p = path_p;
-	int doing_path = 0;
+	int c;
 
 	if ((fd = open(fname, O_RDONLY)) == -1) {
 		/* Don't complain */
 		return;
 	}
+	dbg(("processing hints file \"%s\"", fname));
 
 	if (fstat(fd, &st) == -1) {
 		/* Complain */
@@ -344,7 +349,7 @@
 
 	sz = (size_t) st.st_size;
 
-	buf = mmap(0, sz, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FILE, fd, 0);
+	buf = mmap(0, sz, PROT_READ, MAP_SHARED|MAP_FILE, fd, 0);
 	if (buf == MAP_FAILED) {
 		xwarn("fstat: %s", fname);
 		(void)close(fd);
@@ -355,60 +360,41 @@
 	while ((*path_p) != NULL)
 		path_p = &(*path_p)->sp_next;
 
-	for (b = NULL, p = buf, ebuf = buf + sz; p < ebuf; p++) {
+	ebuf = buf + sz;
+	b = buf;
+	while (b < ebuf) {
+		/* skip whitespace */
+		while ((c = *b) == ' ' || c == '\t' || !c)
+			if (++b >= ebuf)
+				goto eof;
+		/* ignore blank and comment lines */
+		if (c == '#' || c == '\n') {
+			while (*b != '\n')
+				if (++b >= ebuf)
+					goto eof;
+			continue;
+		}
+		/* find end of line or start of comment */
+		for (p = b; p < ebuf && (c = *p) != '#' && c != '\n';)
+			p++;
+		/* back up over white space before comment */
+		if (c == '#') {
+			while (*(p-1) == ' ' || *(p-1) == '\t')
+				p--;
+		}
 
-		if ((p == buf || p[-1] == '\0') && b == NULL)
+		if (*b == '/')
+			path_p = _rtld_append_path(head_p, path_p, b, p);
+		else
+			_rtld_process_mapping(lib_p, b, p);
+
+		/* ignore rest of line (reprocess EOL comments) */
+		if (c == '\n')
+			b = p + 1;
+		else
 			b = p;
-
-		switch (*p) {
-		case '/':
-			if (b == p)
-				doing_path = 1;
-			break;
-
-		case ' ': case '\t':
-			if (b == p)
-				b++;
-			break;
-
-		case '\n':
-			*p = '\0';
-			if (doing_path)
-				path_p = _rtld_append_path(head_p, path_p, b,
-				    p);
-			else
-				_rtld_process_mapping(lib_p, b, p);
-			b = NULL;
-			break;
-
-		case '#':
-			if (b != p) {
-				char *sp;
-				for  (sp = p - 1; *sp == ' ' ||
-				    *sp == '\t'; --sp)
-					continue;
-				*++sp = '\0';
-				if (doing_path)
-					path_p = _rtld_append_path(head_p,
-					    path_p, b, sp);
-				else
-					_rtld_process_mapping(lib_p, b, sp);
-				*sp = ' ';
-			}
-			b = NULL;
-			break;
-
-		default:
-			if (b == p)
-				doing_path = 0;
-			break;
-		}
 	}
-
-	if (doing_path)
-		path_p = _rtld_append_path(head_p, path_p, b, ebuf);
-	else
-		_rtld_process_mapping(lib_p, b, ebuf);
+    eof:
 
 	(void)munmap(buf, sz);
 }
Index: reloc.c
===================================================================
RCS file: /cvsroot/basesrc/libexec/ld.elf_so/reloc.c,v
retrieving revision 1.74
diff -u -r1.74 reloc.c
--- reloc.c	2002/09/26 20:42:10	1.74
+++ reloc.c	2002/10/04 14:41:45
@@ -72,14 +72,16 @@
 	void           *dstaddr = (void *)(dstobj->relocbase + rela->r_offset);
 	const Elf_Sym  *dstsym = dstobj->symtab + ELF_R_SYM(rela->r_info);
 	const char     *name = dstobj->strtab + dstsym->st_name;
-	unsigned long   hash = _rtld_elf_hash(name);
+	uint		len;
+	unsigned long   hash = _rtld_elf_hash(name, &len);
 	size_t          size = dstsym->st_size;
 	const void     *srcaddr;
 	const Elf_Sym  *srcsym = NULL;
 	Obj_Entry      *srcobj;
 
 	for (srcobj = dstobj->next; srcobj != NULL; srcobj = srcobj->next)
-		if ((srcsym = _rtld_symlook_obj(name, hash, srcobj, false)) != NULL)
+		if ((srcsym = _rtld_symlook_obj(name, len, hash, srcobj,
+		    false)) != NULL)
 			break;
 
 	if (srcobj == NULL) {
@@ -156,6 +158,7 @@
 	int ok = 1;
 
 	for (obj = first; obj != NULL; obj = obj->next) {
+		dbg(("relocating objects for %s", obj->path));
 		if (obj->nbuckets == 0 || obj->nchains == 0 ||
 		    obj->buckets == NULL || obj->symtab == NULL ||
 		    obj->strtab == NULL) {
@@ -175,6 +178,7 @@
 			 * There are relocations to the write-protected text
 			 * segment.
 			 */
+			dbg((" relocations to write-protected text"));
 			if (mprotect(obj->mapbase, obj->textsize,
 				PROT_READ | PROT_WRITE | PROT_EXEC) == -1) {
 				_rtld_error("%s: Cannot write-enable text "
@@ -197,9 +201,11 @@
 		if (_rtld_relocate_plt_lazy(obj) < 0)
 			ok = 0;
 #if defined(__i386__)
-		if (bind_now)
+		if (bind_now) {
+			dbg((" relocate plt objects"));
 			if (_rtld_relocate_plt_objects(obj) < 0)
 				ok = 0;
+		}
 #endif
 		if (!ok)
 			return -1;
@@ -218,8 +224,10 @@
 
 		dbg(("fixing up PLTGOT"));
 		/* Set the special PLTGOT entries. */
-		if (obj->pltgot != NULL)
+		if (obj->pltgot != NULL) {
+			dbg((" setup plgot"));
 			_rtld_setup_pltgot(obj);
+		}
 	}
 
 	return 0;
Index: rtld.c
===================================================================
RCS file: /cvsroot/basesrc/libexec/ld.elf_so/rtld.c,v
retrieving revision 1.85
diff -u -r1.85 rtld.c
--- rtld.c	2002/10/04 03:59:41	1.85
+++ rtld.c	2002/10/04 14:41:49
@@ -324,8 +324,13 @@
 #ifdef RTLD_DEBUG
 		debug = 0;
 #endif
-		if (ld_debug != NULL && *ld_debug != '\0')
-			debug = 1;
+		if (ld_debug != NULL && *ld_debug != '\0') {
+			char *ep;
+			debug = strtoul(ld_debug, &ep, 0);
+			if (ep == ld_debug)
+				debug = DEBUG_DEFAULT;
+		}
+		dbg(("debug enabled"));
 #endif
 		_rtld_add_paths(&_rtld_paths, getenv("LD_LIBRARY_PATH"));
 	}
@@ -338,7 +343,7 @@
          * already loaded.
          */
 	if (pAUX_execfd != NULL) {	/* Load the main program. */
-		int             fd = pAUX_execfd->a_v;
+		int fd = pAUX_execfd->a_v;
 		dbg(("loading main program"));
 		_rtld_objmain = _rtld_map_object(xstrdup(argv[0] ? argv[0] :
 		    "main program"), fd, NULL);
@@ -406,6 +411,8 @@
 	if (_rtld_load_needed_objects(_rtld_objmain, RTLD_MAIN) == -1)
 		_rtld_die();
 
+	_rtld_process_main_symbols();
+
 	dbg(("relocating objects"));
 	if (_rtld_relocate_objects(_rtld_objmain, bind_now) == -1)
 		_rtld_die();
@@ -612,6 +619,8 @@
 	Obj_Entry **old_obj_tail = _rtld_objtail;
 	Obj_Entry *obj = NULL;
 
+	dbg(("dlopen( %s, %x )", name, mode ));
+
 	_rtld_debug.r_state = RT_ADD;
 	_rtld_debug_state();
 
@@ -640,6 +649,8 @@
 	_rtld_debug.r_state = RT_CONSISTENT;
 	_rtld_debug_state();
 
+	dbg(("dlopen returning %p", obj));
+
 	return obj;
 }
 
@@ -653,11 +664,13 @@
 	unsigned long hash;
 	const Elf_Sym *def;
 	const Obj_Entry *obj;
+	uint len;
 
-	hash = _rtld_elf_hash(name);
+	hash = _rtld_elf_hash(name, &len);
 	obj = _rtld_objmain;
 
-	def = _rtld_symlook_list(name, hash, &_rtld_list_main, &obj, false);
+	def = _rtld_symlook_list(name, len, hash, &_rtld_list_main,
+								&obj, false);
 
 	if (def != NULL)
 		return obj->relocbase + def->st_value;
@@ -673,8 +686,11 @@
 	unsigned long hash;
 	const Elf_Sym *def;
 	const Obj_Entry *defobj;
+	uint len;
 	
-	hash = _rtld_elf_hash(name);
+	dbg(("dlsym(%p, %s)", handle, name));
+
+	hash = _rtld_elf_hash(name, &len);
 	def = NULL;
 	defobj = NULL;
 	
@@ -691,11 +707,12 @@
 			return NULL;
 		}
 		if (handle == NULL) { /* Just the caller's shared object. */
-			def = _rtld_symlook_obj(name, hash, obj, false);
+			def = _rtld_symlook_obj(name, len, hash, obj, false);
 			defobj = obj;
 		} else { /* All the shared objects after the caller's */
 			while ((obj = obj->next) != NULL) {
-				if ((def = _rtld_symlook_obj(name, hash, obj, false)) != NULL) {
+				if ((def = _rtld_symlook_obj(name, len, hash,
+							obj, false)) != NULL) {
 					defobj = obj;
 					break;
 				}
@@ -707,18 +724,20 @@
 		
 		if (obj->mainprog) {
 			/* Search main program and all libraries loaded by it. */
-			def = _rtld_symlook_list(name, hash, &_rtld_list_main, &defobj, false);
+			def = _rtld_symlook_list(name, len, hash,
+					&_rtld_list_main, &defobj, false);
 		} else {
 			/*
-			 * XXX - This isn't correct.  The search should include the whole
-			 * DAG rooted at the given object.
+			 * XXX - This isn't correct.  The search should include
+			 * the whole DAG rooted at the given object.
 			 */
-			def = _rtld_symlook_obj(name, hash, obj, false);
+			def = _rtld_symlook_obj(name, len, hash, obj, false);
 			defobj = obj;
 		}
 	}
 	
 	if (def != NULL) {
+		dbg(("dlsym: returning %p", defobj->relocbase + def->st_value));
 #ifdef __HAVE_FUNCTION_DESCRIPTORS
 		if (ELF_ST_TYPE(def->st_info) == STT_FUNC)
 			return (void *)_rtld_function_descriptor_alloc(defobj, 
@@ -813,6 +832,7 @@
 	xvsnprintf(buf, sizeof buf, fmt, ap);
 	error_message = buf;
 	va_end(ap);
+	dbg(("rtld_error: %s", buf));
 }
 
 void
Index: rtld.h
===================================================================
RCS file: /cvsroot/basesrc/libexec/ld.elf_so/rtld.h,v
retrieving revision 1.62
diff -u -r1.62 rtld.h
--- rtld.h	2002/10/03 20:35:20	1.62
+++ rtld.h	2002/10/04 14:41:50
@@ -254,13 +254,14 @@
 Obj_Entry *_rtld_load_library __P((const char *, const Obj_Entry *, int));
 
 /* symbol.c */
-unsigned long _rtld_elf_hash __P((const char *));
-const Elf_Sym *_rtld_symlook_obj __P((const char *, unsigned long,
+unsigned long _rtld_elf_hash(const char *, uint *);
+const Elf_Sym *_rtld_symlook_obj __P((const char *, uint, unsigned long,
     const Obj_Entry *, bool));
 const Elf_Sym *_rtld_find_symdef __P((unsigned long, const Obj_Entry *,
     const Obj_Entry **, bool));
-const Elf_Sym *_rtld_symlook_list(const char *, unsigned long,
+const Elf_Sym *_rtld_symlook_list(const char *, uint, unsigned long,
     const Objlist *, const Obj_Entry **, bool);
+void _rtld_process_main_symbols(void);
 
 /* map_object.c */
 Obj_Entry *_rtld_map_object __P((char *, int, const struct stat *));
Index: search.c
===================================================================
RCS file: /cvsroot/basesrc/libexec/ld.elf_so/search.c,v
retrieving revision 1.14
diff -u -r1.14 search.c
--- search.c	2002/10/01 14:16:53	1.14
+++ search.c	2002/10/04 14:41:51
@@ -55,7 +55,7 @@
 /*
  * Data declarations.
  */
-Search_Path    *_rtld_invalid_paths;
+static Search_Path    *_rtld_invalid_paths;
 
 static Obj_Entry *_rtld_search_library_path __P((const char *, size_t,
     const char *, size_t, int));
@@ -73,20 +73,20 @@
 	Obj_Entry *obj;
 	Search_Path *sp;
 
-	pathnamelen = dirlen + 1 + namelen;
+	pathnamelen = dirlen + 1 + namelen + 1;
 
 	for (sp = _rtld_invalid_paths; sp != NULL; sp = sp->sp_next) {
 		if (sp->sp_pathlen == pathnamelen &&
-		    !strncmp(name, sp->sp_path + dirlen + 1, namelen) &&
-		    !strncmp(dir, sp->sp_path, dirlen)) {
+		    !memcmp(name, sp->sp_path + dirlen + 1, namelen) &&
+		    !memcmp(dir, sp->sp_path, dirlen)) {
 			return NULL;
 		}
 	}
 
-	pathname = xmalloc(pathnamelen + 1);
-	(void)strncpy(pathname, dir, dirlen);
+	pathname = xmalloc(pathnamelen);
+	memcpy(pathname, dir, dirlen);
 	pathname[dirlen] = '/';
-	strcpy(pathname + dirlen + 1, name);
+	memcpy(pathname + dirlen + 1, name, namelen + 1);
 
 	dbg(("  Trying \"%s\"", pathname));
 	obj = _rtld_load_object(pathname, mode);
@@ -129,7 +129,10 @@
 			return NULL;
 		}
 		pathname = xstrdup(name);
-		goto found;
+		obj = _rtld_load_object(pathname, mode);
+		if (obj == NULL)
+			free(pathname);
+		return obj;
 	}
 	dbg((" Searching for \"%s\" (%p)", name, refobj));
 
@@ -153,10 +156,4 @@
 
 	_rtld_error("Shared object \"%s\" not found", name);
 	return NULL;
-
-found:
-	obj = _rtld_load_object(pathname, mode);
-	if (obj == NULL)
-		free(pathname);
-	return obj;
 }
Index: symbol.c
===================================================================
RCS file: /cvsroot/basesrc/libexec/ld.elf_so/symbol.c,v
retrieving revision 1.23
diff -u -r1.23 symbol.c
--- symbol.c	2002/10/03 20:35:20	1.23
+++ symbol.c	2002/10/04 14:41:53
@@ -52,31 +52,87 @@
 #include "debug.h"
 #include "rtld.h"
 
+typedef struct main_symtbl_t main_symtbl_t;
+struct main_symtbl_t {
+	main_symtbl_t   *chain;         /* hash chain */
+	const char      *name;          /* name in 'obj' symbol table */
+	const Obj_Entry	*obj;		/* containing object */
+	const Elf_Sym   *sym;           /* in 'obj' table */
+};
+
+#if defined(__i386__)
+/* An asm memcpy is better than the inline code gcc creates */
+extern int my_memcmp(const void *, const void *, int);
+#else
+#define my_memcpy(a,b,l) memcmp(a,b,l)
+#endif
+
+static main_symtbl_t *main_symtbl = 0;
+static uint main_sym_hash = 0;
+
 /*
  * Hash function for symbol table lookup.  Don't even think about changing
  * this.  It is specified by the System V ABI.
  */
 unsigned long
-_rtld_elf_hash(name)
-	const char *name;
+_rtld_elf_hash(const char *name, uint *lp)
 {
 	const unsigned char *p = (const unsigned char *) name;
 	unsigned long   h = 0;
-	unsigned long   g;
+	unsigned long	c;
+	uint l = 0;
 
-	while (*p != '\0') {
-		h = (h << 4) + *p++;
-		if ((g = h & 0xf0000000) != 0)
-			h ^= g >> 24;
-		h &= ~g;
+	do {
+		c = p[ l++ ];
+		if (!c)
+			goto done;
+		h = (h << 4) + c;
+	} while (!(h & 0xf0000000));
+	h ^= (h & 0xf0000000) >> 24;
+	while ((c = p[ l++ ])) {
+		h = (h << 4) + c;
+		h ^= (h >> 24) & 0xf0;
 	}
-	return h;
+    done:
+	*lp = l;	/* NB includes trailing null */
+	return h & 0xfffffff;
 }
 
-const Elf_Sym *
-_rtld_symlook_list(const char *name, unsigned long hash, const Objlist *objlist,
+static const Elf_Sym *
+_rtld_symlook_main(const char *name, uint len, unsigned long hash,
   const Obj_Entry **defobj_out, bool in_plt)
 {
+	main_symtbl_t *m;
+	const Elf_Sym *sym;
+
+	m = main_symtbl + hash % main_sym_hash;
+	if (!m->name)
+		return 0;
+	for (;;) {
+		sdbg(("   check %s vs %s from %p", name, m->name, m->obj));
+		if (m->name[1] == name[1] && !my_memcmp(name, m->name, len)) {
+			sym = m->sym;
+			if (sym->st_shndx != SHN_UNDEF)
+				break;
+#if !defined(__mips__)
+			if (!in_plt && sym->st_value != 0 &&
+			    ELF_ST_TYPE(sym->st_info) == STT_FUNC)
+				break;
+#endif
+		}
+		m = m->chain;
+		if (!m)
+			return 0;
+	}
+
+	*defobj_out = m->obj;
+	return sym;
+}
+
+const Elf_Sym *
+_rtld_symlook_list(const char *name, uint len, unsigned long hash,
+    const Objlist *objlist, const Obj_Entry **defobj_out, bool in_plt)
+{
 	const Elf_Sym *symp;
 	const Elf_Sym *def;
 	const Obj_Entry *defobj;
@@ -85,8 +141,8 @@
 	def = NULL;
 	defobj = NULL;
 	SIMPLEQ_FOREACH(elm, objlist, link) {
-		rdbg(("search object %p (%s)", elm->obj, elm->obj->path));
-		if ((symp = _rtld_symlook_obj(name, hash, elm->obj, in_plt))
+		sdbg(("  search object %p (%s)", elm->obj, elm->obj->path));
+		if ((symp = _rtld_symlook_obj(name, len, hash, elm->obj, in_plt))
 		    != NULL) {
 			if ((def == NULL) ||
 			    (ELF_ST_BIND(symp->st_info) != STB_WEAK)) {
@@ -111,8 +167,9 @@
  * eliminates many recomputations of the hash value.
  */
 const Elf_Sym *
-_rtld_symlook_obj(name, hash, obj, in_plt)
+_rtld_symlook_obj(name, len, hash, obj, in_plt)
 	const char *name;
+	uint len;
 	unsigned long hash;
 	const Obj_Entry *obj;
 	bool in_plt;
@@ -128,8 +185,11 @@
 		assert(symnum < obj->nchains);
 		symp = obj->symtab + symnum;
 		strp = obj->strtab + symp->st_name;
-		rdbg(("check %s vs %s in %p", name, strp, obj));
-		if (name[1] == strp[1] && !strcmp(name, strp)) {
+		sdbg(("   check %s vs %s in %p", name, strp, obj));
+		if (name[1] == strp[1] && my_memcmp(name, strp, len) == 0) {
+		/* if (name[1] == strp[1] && my_memcmp(name, strp, len) == 0) { */
+		/* if (memcmp(name, strp, len) == 0) { */
+		/* if (name[1] == strp[1] && !strcmp(name, strp)) { */
 			if (symp->st_shndx != SHN_UNDEF)
 				return symp;
 #ifndef __mips__
@@ -176,16 +236,20 @@
 	const Objlist_Entry *elm;
 	const char     *name;
 	unsigned long   hash;
+	uint len;
 
 	ref = refobj->symtab + symnum;
 	name = refobj->strtab + ref->st_name;
 
-	hash = _rtld_elf_hash(name);
+	sdbg((" looking for %s", name ));
+
+	hash = _rtld_elf_hash(name, &len);
 	def = NULL;
 	defobj = NULL;
 	
 	if (refobj->symbolic) {	/* Look first in the referencing object */
-		symp = _rtld_symlook_obj(name, hash, refobj, in_plt);
+		sdbg(("  checking referencing object"));
+		symp = _rtld_symlook_obj(name, len, hash, refobj, in_plt);
 		if (symp != NULL) {
 			def = symp;
 			defobj = refobj;
@@ -194,8 +258,12 @@
 	
 	/* Search all objects loaded at program start up. */
 	if (def == NULL || ELF_ST_BIND(def->st_info) == STB_WEAK) {
-		rdbg(("search _rtld_list_main"));
-		symp = _rtld_symlook_list(name, hash, &_rtld_list_main, &obj, in_plt);
+		sdbg(("  search _rtld_list_main"));
+		if (main_symtbl)
+			symp = _rtld_symlook_main(name, len, hash, &obj,in_plt);
+		else
+			symp = _rtld_symlook_list(name, len, hash,
+					    &_rtld_list_main, &obj, in_plt);
 		if (symp != NULL &&
 		    (def == NULL || ELF_ST_BIND(symp->st_info) != STB_WEAK)) {
 			def = symp;
@@ -204,24 +272,29 @@
 	}
 	
 	/* Search all dlopened DAGs containing the referencing object. */
-	SIMPLEQ_FOREACH(elm, &refobj->dldags, link) {
-		if (def != NULL && ELF_ST_BIND(def->st_info) != STB_WEAK)
-			break;
-		rdbg(("search DAG with root %p (%s)", elm->obj, elm->obj->path));
-		symp = _rtld_symlook_list(name, hash, &elm->obj->dagmembers, &obj, in_plt);
-		if (symp != NULL &&
-		    (def == NULL || ELF_ST_BIND(symp->st_info) != STB_WEAK)) {
-			def = symp;
-			defobj = obj;
+	if (def == NULL || ELF_ST_BIND(def->st_info) == STB_WEAK) {
+		SIMPLEQ_FOREACH(elm, &refobj->dldags, link) {
+			sdbg(("  search DAG with root %p (%s)", elm->obj, elm->obj->path));
+			symp = _rtld_symlook_list(name, len, hash,
+					&elm->obj->dagmembers, &obj, in_plt);
+			if (symp != NULL &&
+			    (def == NULL || ELF_ST_BIND(symp->st_info) != STB_WEAK)) {
+				def = symp;
+				defobj = obj;
+				if (ELF_ST_BIND(def->st_info) != STB_WEAK)
+					break;
+			}
 		}
 	}
 	
 	/* Search all RTLD_GLOBAL objects. */
 	if (def == NULL || ELF_ST_BIND(def->st_info) == STB_WEAK) {
-		rdbg(("search _rtld_list_global"));
-		symp = _rtld_symlook_list(name, hash, &_rtld_list_global, &obj, in_plt);
+		sdbg(("  search _rtld_list_global"));
+		symp = _rtld_symlook_list(name, len, hash, &_rtld_list_global,
+								&obj, in_plt);
 		if (symp != NULL &&
 		    (def == NULL || ELF_ST_BIND(symp->st_info) != STB_WEAK)) {
+			dbg(("found %s in global list!", name));
 			def = symp;
 			defobj = obj;
 		}
@@ -240,9 +313,158 @@
 	if (def != NULL)
 		*defobj_out = defobj;
 	else {
-		rdbg(("lookup failed"));
 		_rtld_error("%s: Undefined %ssymbol \"%s\" (symnum = %ld)",
 		    refobj->path, in_plt ? "PLT " : "", name, symnum);
 	}
 	return def;
+}
+
+/*
+ * Build main_symtbl: one combined hash table of the symbols of every
+ * object on _rtld_list_main.  With a lot of libraries (mozilla loads 20)
+ * the hashed search of each library explodes to a lot of string compares
+ * (about 1.5 items per hash chain per library) and the search time
+ * becomes significant.  Nothing is built for fewer than 8 objects or if
+ * the mmap() fails; main_symtbl is then left unset.
+ */
+void
+_rtld_process_main_symbols(void)
+{
+	Obj_Entry *obj;
+	const Objlist_Entry *elm;
+	uint obj_count = 0;
+	uint sym_count = 0;
+	main_symtbl_t *m, *m1, *m2;
+	const Elf_Sym *sym;
+	const char *name;
+	uint len;
+	uint hash, hash2;
+	IF_DEBUG( uint new_sym = 0; )
+	IF_DEBUG( uint rehash = 0; )
+	IF_DEBUG( uint hash_scan = 0; )
+	IF_DEBUG( uint rechain = 0; )
+
+	SIMPLEQ_FOREACH(elm, &_rtld_list_main, link) {
+		obj = elm->obj;
+		obj_count++;
+		sym_count += obj->nchains;
+		sdbg(("main: %s symbols %ld hash %ld",
+			obj->path, obj->nchains, obj->nbuckets));
+	}
+
+	/* a sum based on 'libraries * relocations > symbols' might be better */
+	if (obj_count < 8)
+		/* probably not worth all the work! */
+		return;
+
+	/* sym_count is somewhat generous since it includes undefined
+	   symbols and padding to 4k (maybe a page) boundary. */
+	len = sym_count * sizeof *main_symtbl;
+	len = (len + 4095) & ~4095;
+	sym_count = len / sizeof *main_symtbl;
+	/* Maybe we should find a prime number - but it probably makes
+	   no difference.  Just ensuring the hash isn't a power of 2 is
+	   almost certainly good enough. */
+	main_sym_hash = sym_count - 1;
+	dbg(("total %d symbols, hash %d", sym_count, main_sym_hash));
+	m = mmap(0, len, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0);
+	if (m == MAP_FAILED)	/* mmap() never returns 0 on failure */
+		return;		/* main_symtbl is left unset */
+	memset(m, 0, len);
+	main_symtbl = m;
+
+	SIMPLEQ_FOREACH(elm, &_rtld_list_main, link) {
+		obj = elm->obj;
+		dbg(("processing symbol table from %s", obj->path));
+		sym = obj->symtab + obj->nchains;	/* walk backwards */
+		do {
+			sym--;
+			name = obj->strtab + sym->st_name;
+			if (!*name)
+				continue;
+			sdbg(("processing symbol \"%s\"", name));
+			if (sym->st_shndx == SHN_UNDEF
+#if !defined(__mips__)
+			    /* see XXX DANGER WILL ROBINSON above */
+			    && (sym->st_value == 0
+				    || ELF_ST_TYPE(sym->st_info) != STT_FUNC)
+#endif
+			    ) {
+				continue;
+			}
+			hash = _rtld_elf_hash(name, &len) % main_sym_hash;
+			m = main_symtbl + hash;
+			if (m->name) {	/* home slot is taken */
+				for (;; m = m->chain) {
+					if (name[1] != m->name[1]
+					    || my_memcmp(m->name, name, len)) {
+						if (m->chain)
+							continue;
+						break;
+					}
+					/* duplicate symbol */
+					if (sym->st_shndx == SHN_UNDEF
+					    || m->sym->st_shndx == SHN_UNDEF) {
+						/* add both! */
+						if (m->chain)
+							continue;
+						break;
+					}
+					/* overwrite old weak symbol? */
+					if ((ELF_ST_BIND(sym->st_info))
+					    != STB_WEAK
+					    && (ELF_ST_BIND(m->sym->st_info))
+					    == STB_WEAK) {
+						/* replace old weak one */
+						m->name = name;
+						m->sym = sym;
+						m->obj = obj;
+					}
+					/* take first definition */
+					m = 0;	/* nothing to add */
+					break;
+				}
+				if (!m)
+					continue;
+
+				/* New symbol with hash clash,
+				   search for empty slot. */
+				IF_DEBUG( rehash++; )
+				IF_DEBUG( hash_scan--; )
+				m1 = m;
+				do {
+					IF_DEBUG( hash_scan++; )
+					if (++m1 >= main_symtbl + sym_count)
+						m1 = main_symtbl;
+				} while (m1->name);
+				/* Before we link into this hash chain, check
+				   that our original entry hadn't been
+				   'stolen' for another hash chain. */
+				hash2 = _rtld_elf_hash(main_symtbl[hash].name,
+							&len) % main_sym_hash;
+				if (hash2 != hash) {
+					/* our entry was stolen - find ptr */
+					m = main_symtbl + hash;
+					for (m2 = main_symtbl + hash2;
+					    m2->chain != m;)
+						m2 = m2->chain;
+					/* copy into free slot */
+					*m1 = *m;
+					m2->chain = m1;
+					m->chain = 0;
+					IF_DEBUG( rechain++; )
+				} else {
+					m->chain = m1;	/* append to chain */
+					m = m1;
+				}
+			}
+			/* fill new entry */
+			IF_DEBUG( new_sym++; )
+			m->name = name;
+			m->obj = obj;
+			m->sym = sym;
+		} while (sym != obj->symtab);
+	}
+	dbg(("rebuilt symbol table, added %d chained %d rechain %d scan %d",
+		new_sym, rehash - rechain, rechain, hash_scan));
 }
Index: arch/i386/mdreloc.c
===================================================================
RCS file: /cvsroot/basesrc/libexec/ld.elf_so/arch/i386/mdreloc.c,v
retrieving revision 1.18
diff -u -r1.18 mdreloc.c
--- arch/i386/mdreloc.c	2002/10/03 20:39:22	1.18
+++ arch/i386/mdreloc.c	2002/10/04 14:41:54
@@ -53,6 +53,11 @@
 	unsigned long lastsym = -1;
 #endif
 	Elf_Addr target;
+	IF_DEBUG( int reloc_none = 0; )
+	IF_DEBUG( int reloc_pc32 = 0; )
+	IF_DEBUG( int reloc_name = 0; )
+	IF_DEBUG( int reloc_relative = 0; )
+	IF_DEBUG( int reloc_copy = 0; )
 
 	for (rel = obj->rel; rel < obj->rellim; rel++) {
 		Elf_Addr        *where;
@@ -66,10 +71,12 @@
 
 		switch (ELF_R_TYPE(rel->r_info)) {
 		case R_TYPE(NONE):
+			IF_DEBUG( reloc_none++; )
 			break;
 
 #if 1 /* XXX should not occur */
 		case R_TYPE(PC32):
+			IF_DEBUG( reloc_pc32++; )
 #ifdef COMBRELOC
 			if (symnum != lastsym) {
 #endif
@@ -94,6 +101,7 @@
 #endif
 		case R_TYPE(32):
 		case R_TYPE(GLOB_DAT):
+			IF_DEBUG( reloc_name++; )
 #ifdef COMBRELOC
 			if (symnum != lastsym) {
 #endif
@@ -117,12 +125,14 @@
 			break;
 
 		case R_TYPE(RELATIVE):
+			IF_DEBUG( reloc_relative++; )
 			*where += (Elf_Addr)obj->relocbase;
 			rdbg(("RELATIVE in %s --> %p", obj->path,
 			    (void *)*where));
 			break;
 
 		case R_TYPE(COPY):
+			IF_DEBUG( reloc_copy++; )
 			/*
 			 * These are deferred until all other relocations have
 			 * been done.  All we do here is make sure that the
@@ -150,6 +160,7 @@
 			return -1;
 		}
 	}
+	dbg(("  reloc none %d, pc32 %d, name %d, relative %d, copy %d", reloc_none, reloc_pc32, reloc_name, reloc_relative, reloc_copy));
 	return 0;
 }
 
Index: arch/i386/rtld_start.S
===================================================================
RCS file: /cvsroot/basesrc/libexec/ld.elf_so/arch/i386/rtld_start.S,v
retrieving revision 1.8
diff -u -r1.8 rtld_start.S
--- arch/i386/rtld_start.S	2002/09/25 08:00:26	1.8
+++ arch/i386/rtld_start.S	2002/10/04 14:41:55
@@ -93,3 +93,28 @@
 
 	leal	4(%esp),%esp		# Discard reloff, do not change eflags
 	ret
+
+ENTRY(my_memcmp)	/* my_memcmp(a, b, len): 0 if equal, else 1 */
+	movl	%esi,%eax		# keep caller %esi in %eax
+	movl	%edi,%edx		# keep caller %edi in %edx
+	movl	4(%esp),%esi		# first argument
+	movl	8(%esp),%edi		# second argument
+	movl	12(%esp),%ecx		# third argument: byte count
+	shrl	$1,%ecx
+	shrl	$1,%ecx			# %ecx = len / 4 (ZF set if that is 0)
+	repe	cmpsl			# compare 4 bytes at a time
+	jne	1f			# mismatch
+	movl	12(%esp),%ecx
+	andl	$3,%ecx			# 0..3 tail bytes (ZF set if none)
+	repe	cmpsb			# NOTE(review): assumes DF is clear
+	jne	1f			# mismatch
+
+	movl	%eax,%esi		# equal: restore registers, return 0
+	movl	%edx,%edi
+	xor	%eax,%eax
+	ret
+
+1:	movl	%eax,%esi		# differ: restore registers, return 1
+	movl	%edx,%edi
+	movl	$1,%eax
+	ret
-- 
David Laight: david@l8s.co.uk