Subject: Re: RFC: memmem(3)
To: None <tech-userlevel@netbsd.org>
From: der Mouse <mouse@Rodents.Montreal.QC.CA>
List: tech-userlevel
Date: 03/02/2003 15:59:51
[quoting order normalized -dM]

>> I'm not sure which algorithm I'm using in terms of names like those.
>> The name Boyer/Moore comes to mind, but I'm not sure exactly what
>> that is so I could be misled there.
> There is even Boyer-Moore string search implementation in tree -
> bm(3).

I just read over bm.c.

The algorithm therein looks related to mine, but definitely isn't the
same.  In particular, I don't have the frequency table (such a thing
could improve efficiency of mine in cases where certain characters
occur _very_ much more frequently than others, but to do it right would
greatly complicate the computation of the table), and searching doesn't
need the fast-loop/slow-loop split in the search function.

My code is small enough that I'll just quote it here.  I've left off an
#include, but all it does is prototype searchstr_maketbl() and
searchstr().

I place this code in the public domain.  Feel free to pick it up for
anything you want.

static int eq(const unsigned char *s1, const unsigned char *s2, int n, const char *eqv)
{
 for (;n>0;n--) if (eqv[*s1++] != eqv[*s2++]) return(0);
 return(1);
}

void searchstr_maketbl(unsigned char (*tbl)[256], const unsigned char *str, unsigned int len, const unsigned char equiv[256])
{
 const unsigned char *str1;
 int i;
 int j;
 int k;
 int l;

 if (len < 1) return;
 str1 = str + 1;
 for (i=len-1;i>=0;i--)
  { l = len - 1 - i;
    for (j=255;j>=0;j--)
     { for (k=i;k>=0;k--)
	{ if ((equiv[str[k]] == equiv[j]) && ((l == 0) || (k == i) || eq(str1+k,str1+i,l,equiv)))
	   { tbl[i][j] = i - k;
	     break;
	   }
	}
       if (k < 0)
	{ for (k=0;k<l;k++)
	   { if (eq(str,str1+i+k,l-k,equiv))
	      { tbl[i][j] = 1 + i + k;
		break;
	      }
	   }
	  if (k >= l) tbl[i][j] = len;
	}
     }
  }
}

int searchstr(const unsigned char *str, unsigned int len, unsigned char (*tbl)[256], unsigned int tbllen)
{
 unsigned int strpos;
 unsigned int stridx;
 unsigned int stroff;
 unsigned int adv;

 strpos = 0;
 while (1)
  { stroff = tbllen - 1;
    stridx = strpos + stroff;
    if (stridx >= len) return(-1);
    while (1)
     { adv = tbl[stroff][str[stridx]];
       if (adv == 0)
	{ if (stroff == 0)
	   { return(stridx);
	   }
	  stroff --;
	  stridx --;
	}
       else
	{ strpos += adv;
	  break;
	}
     }
  }
}

/~\ The ASCII				der Mouse
\ / Ribbon Campaign
 X  Against HTML	       mouse@rodents.montreal.qc.ca
/ \ Email!	     7D C8 61 52 5D E7 2D 39  4E F1 31 3E E8 B3 27 4B