Subject: Re: [dM] make manpage mis-describes $(:S///g)
To: None <gnats-bugs@NetBSD.ORG>
From: der Mouse <mouse@Athena.McRCIM.McGill.EDU>
List: netbsd-bugs
Date: 09/13/1996 13:09:19
In PR 2748, I wrote:

	However, I would much prefer to completely replace the :S code,
	since it has other problems; most notably, it does not support
	regexps, [...]

	I hope to get this code written soon; if and when I do, I'll
	send it in as an addendum to this PR, [...]

Herewith said code.  I tweaked buf.[ch] because I write code that's
properly const-poisoned (and I use -Wcast-qual -Wwrite-strings to
ensure this), which made me notice that Buf_AddBytes() was missing that
const.  The rest of it is pretty straightforward.  I actually didn't
touch the S modifier because too many Makefiles probably depend on it
already.  Instead, I used C ("change") for the regex version.

With these changes, I see the following:

% cat /tmp/Makefile
VAR = x-foo-bar-foo-bar-x y-foo-two-bar-foo-two-bar-y
MANPAGES = cat1/dig.0 cat3/resolver.0

.PHONY: foo
foo:
	@echo VAR = $(VAR)
	@echo :S/foo/X/ = $(VAR:S/foo/X/)
	@echo :S/foo/X/g = $(VAR:S/foo/X/g)
	@echo :S/two/X/ = $(VAR:S/two/X/)
	@echo :S/two/X/g = $(VAR:S/two/X/g)
	@echo :C/foo/X/ = $(VAR:C/foo/X/)
	@echo :C/foo/X/g = $(VAR:C/foo/X/g)
	@echo :C/foo/X/1 = $(VAR:C/foo/X/1)
	@echo :C/foo/X/1g = $(VAR:C/foo/X/1g)
	@echo MANPAGES = $(MANPAGES)
	@echo ':C=^cat(.)/(.*)[.]0$$=man\1/\2.\1=' = $(MANPAGES:C=^cat(.)/(.*)[.]0$=man\1/\2.\1=)
% make -f /tmp/Makefile
VAR = x-foo-bar-foo-bar-x y-foo-two-bar-foo-two-bar-y
:S/foo/X/ = x-X-bar-foo-bar-x y-foo-two-bar-foo-two-bar-y
:S/foo/X/g = x-X-bar-X-bar-x y-X-two-bar-X-two-bar-y
:S/two/X/ = x-foo-bar-foo-bar-x y-foo-X-bar-foo-two-bar-y
:S/two/X/g = x-foo-bar-foo-bar-x y-foo-X-bar-foo-X-bar-y
:C/foo/X/ = x-X-bar-foo-bar-x y-X-two-bar-foo-two-bar-y
:C/foo/X/g = x-X-bar-X-bar-x y-X-two-bar-X-two-bar-y
:C/foo/X/1 = x-X-bar-foo-bar-x y-foo-two-bar-foo-two-bar-y
:C/foo/X/1g = x-X-bar-X-bar-x y-foo-two-bar-foo-two-bar-y
MANPAGES = cat1/dig.0 cat3/resolver.0
:C=^cat(.)/(.*)[.]0$=man\1/\2.\1= = man1/dig.1 man3/resolver.3
% 

Here are the changes themselves.

--- OLD/usr.bin/make/buf.c	Thu Jan  1 00:00:00 1970
+++ NEW/usr.bin/make/buf.c	Thu Jan  1 00:00:00 1970
@@ -130,7 +130,7 @@
 Buf_AddBytes (bp, numBytes, bytesPtr)
     register Buffer bp;
     int	    numBytes;
-    Byte    *bytesPtr;
+    const Byte *bytesPtr;
 {
 
     BufExpand (bp, numBytes);
--- OLD/usr.bin/make/buf.h	Thu Jan  1 00:00:00 1970
+++ NEW/usr.bin/make/buf.h	Thu Jan  1 00:00:00 1970
@@ -68,7 +68,7 @@
 #define BUF_ERROR 256
 
 void Buf_OvAddByte __P((Buffer, int));
-void Buf_AddBytes __P((Buffer, int, Byte *));
+void Buf_AddBytes __P((Buffer, int, const Byte *));
 void Buf_UngetByte __P((Buffer, int));
 void Buf_UngetBytes __P((Buffer, int, Byte *));
 int Buf_GetByte __P((Buffer));
--- OLD/usr.bin/make/make.1	Thu Jan  1 00:00:00 1970
+++ NEW/usr.bin/make/make.1	Thu Jan  1 00:00:00 1970
@@ -452,27 +452,27 @@
 .It Cm R
 Replaces each word in the variable with everything but its suffix.
 .Sm off
-.It Cm S No \&/ Ar old_pattern Xo
-.No \&/ Ar new_pattern
+.It Cm S No \&/ Ar old_string Xo
+.No \&/ Ar new_string
 .No \&/ Op Cm g
 .Xc
 .Sm on
 Modify the first occurrence of
-.Ar old_pattern
-in each word to be replaced with
-.Ar new_pattern .
+.Ar old_string
+in the variable's value, replacing it with
+.Ar new_string .
 If a
 .Ql g
 is appended to the last slash of the pattern, all occurrences
 in each word are replaced.
 If
-.Ar old_pattern
-begins with a carat
+.Ar old_string
+begins with a caret
 .Pq Ql ^ ,
-.Ar old_pattern
+.Ar old_string
 is anchored at the beginning of each word.
 If
-.Ar old_pattern
+.Ar old_string
 ends with a dollar sign
 .Pq Ql \&$ ,
 it is anchored at the end of each word.
@@ -481,7 +481,11 @@
 an ampersand
 .Pq Ql &
 is replaced by
-.Ar old_pattern .
+.Ar old_string
+(without any
+.Ql ^
+or
+.Ql \&$ ) .
 Any character may be used as a delimiter for the parts of the modifier
 string.
 The anchoring, ampersand and delimiter characters may be escaped with a
@@ -494,8 +498,36 @@
 .Ar new_string
 with the single exception that a backslash is used to prevent the expansion
 of a dollar sign
-.Pq Ql \&$
+.Pq Ql \&$ ,
 not a preceding dollar sign as is usual.
+.Sm off
+.It Cm C No \&/ Ar pattern Xo
+.No \&/ Ar replacement
+.No \&/ Op Cm 1g
+.Xc
+.Sm on
+The
+.Cm C
+modifier is just like the
+.Cm S
+modifier except that the old and new strings, instead of being
+simple strings, are a regular expression (see
+.Xr regex 3 )
+and an
+.Xr ed 1 Ns \-style
+replacement string.  Normally, the first occurrence of the pattern in
+each word of the value is changed.  The
+.Ql 1
+modifier causes the substitution to apply to at most one word; the
+.Ql g
+modifier causes the substitution to apply to as many instances of the
+search pattern as occur in the word or words it is found in.  Note that
+.Ql 1
+and
+.Ql g
+are orthogonal; the former specifies whether multiple words are
+potentially affected, the latter whether multiple substitutions can
+potentially occur within each affected word.
 .It Cm T
 Replaces each word in the variable with its last component.
 .It Ar old_string=new_string
--- OLD/usr.bin/make/var.c	Thu Jan  1 00:00:00 1970
+++ NEW/usr.bin/make/var.c	Thu Jan  1 00:00:00 1970
@@ -89,6 +89,7 @@
  */
 
 #include    <ctype.h>
+#include <regex.h>
 #include    "make.h"
 #include    "buf.h"
 
@@ -156,6 +157,16 @@
 #define VAR_NO_SUB	8   /* Substitution is non-global and already done */
 } VarPattern;
 
+typedef struct {			/* state for one :C (regex) modifier */
+	  regex_t re;			/* pattern compiled by regcomp() */
+	  int nsub;			/* number of entries in matches[] */
+	  regmatch_t *matches;		/* match offsets filled in by regexec() */
+	  char *replace;		/* replacement text; & and \N are expanded */
+	  unsigned int global : 1;	/* 'g' flag: replace every match in a word */
+	  unsigned int oneword : 1;	/* '1' flag: change at most one word */
+	  unsigned int matched : 1;	/* set once some word has matched */
+	  } VarREPattern;
+
 static int VarCmp __P((ClientData, ClientData));
 static Var *VarFind __P((char *, GNode *, int));
 static void VarAdd __P((char *, char *, GNode *));
@@ -1039,6 +1050,128 @@
     return(TRUE);
 }
 
+
+/*-
+ *-----------------------------------------------------------------------
+ * VarRegexpSub --
+ *	Perform a regex substitution on the given word, placing the
+ *	result in the passed buffer.  patternp points to a VarREPattern.
+ *
+ * Results:
+ *	TRUE if a space is needed before more characters are added.
+ *
+ * Side Effects:
+ *	Updates pat->matched and pat->matches; errors go through Error().
+ *
+ *-----------------------------------------------------------------------
+ */
+static Boolean VarRegexpSub(char *word, Boolean addSpace, Buffer buf, ClientData patternp)
+{
+ VarREPattern *pat;
+ int xrv, eflags, added;
+ char *wp, *rp;
+
+#define MAYBE_ADD_SPACE() do { if (addSpace && !added) { Buf_AddByte(buf,' '); } added = 1; } while (0)
+ added = eflags = 0;
+ wp = word;
+ pat = patternp;
+ if (pat->oneword && pat->matched)
+  { xrv = REG_NOMATCH;	/* '1' flag and pat->matched was set by an earlier call */
+  }
+ else
+  {
+tryagain:;
+    xrv = regexec(&pat->re,wp,pat->nsub,pat->matches,eflags);
+  }
+ switch (xrv)
+  { case 0:
+       pat->matched = 1;
+       if (pat->matches[0].rm_so > 0)
+	{ MAYBE_ADD_SPACE();	/* copy the text preceding the match */
+	  Buf_AddBytes(buf,pat->matches[0].rm_so,wp);
+	}
+       for (rp=pat->replace;*rp;rp++)
+	{ if ( (*rp == '\\') &&
+	       ( (rp[1] == '&') ||
+		 (rp[1] == '\\') ) )
+	   { MAYBE_ADD_SPACE();	/* \& and \\ stand for a literal & and \ */
+	     Buf_AddByte(buf,rp[1]);
+	     rp ++;
+	   }
+	  else if ( (*rp == '&') ||
+		    ((*rp == '\\') && isdigit((unsigned char)rp[1])) )
+	   { int n, sublen;
+	     char *subbuf;
+	     char errstr[3];
+	     if (*rp == '&')
+	      { n = 0;
+		errstr[0] = '&';
+		errstr[1] = '\0';
+	      }
+	     else
+	      { n = rp[1] - '0';
+		errstr[0] = '\\';
+		errstr[1] = rp[1];
+		errstr[2] = '\0';
+		rp ++;
+	      }
+	     if (n >= pat->nsub)	/* matches[] has only nsub entries */
+	      { Error("%s in replacement but no such subexpression in expression",&errstr[0]);
+		subbuf = "";
+		sublen = 0;
+	      }
+	     else if ((pat->matches[n].rm_so == -1) && (pat->matches[n].rm_eo == -1))
+	      { Error("%s in replacement but that subexpression wasn't matched",&errstr[0]);
+		subbuf = "";
+		sublen = 0;
+	      }
+	     else
+	      { subbuf = wp + pat->matches[n].rm_so;
+		sublen = pat->matches[n].rm_eo - pat->matches[n].rm_so;
+	      }
+	     if (sublen > 0)
+	      { MAYBE_ADD_SPACE();
+		Buf_AddBytes(buf,sublen,subbuf);
+	      }
+	   }
+	  else
+	   { MAYBE_ADD_SPACE();
+	     Buf_AddByte(buf,*rp);
+	   }
+	}
+       wp += pat->matches[0].rm_eo;
+       if (pat->global)
+	{ eflags = REG_NOTBOL;	/* wp is now past the start of the word */
+	  if ((pat->matches[0].rm_eo == 0) && *wp)
+	   { MAYBE_ADD_SPACE();	/* empty match: copy one char so we advance */
+	     Buf_AddByte(buf,*wp);
+	     wp ++;
+	   }
+	  if (*wp) goto tryagain;
+	}
+       if (*wp)
+	{ MAYBE_ADD_SPACE();
+	  Buf_AddBytes(buf,strlen(wp),wp);
+	}
+       break;
+    default:
+	{ char errbuf[256];	/* regerror() truncates to fit and NUL-terminates */
+	  regerror(xrv,&pat->re,errbuf,sizeof(errbuf));
+	  Error("unexpected regex error: %s",errbuf);
+	}
+       /* fall through */
+    case REG_NOMATCH:
+       if (*wp)
+	{ MAYBE_ADD_SPACE();
+	  Buf_AddBytes(buf,strlen(wp),wp);
+	}
+       break;
+  }
+ return(addSpace||added);
+#undef MAYBE_ADD_SPACE
+}
+
+
 /*-
  *-----------------------------------------------------------------------
  * VarModify --
@@ -1600,6 +1733,128 @@
 		    free(pattern.lhs);
 		    free(pattern.rhs);
 		    break;
+		}
+		case 'C':
+		{ VarREPattern pat;
+		  char *re;
+		  int junk;
+		  char delim;
+		  Buffer buf;
+		  int err;
+		  delim = tstr[1];
+		  tstr += 2;
+		  buf = Buf_Init(0);
+		  /* Skim through until the matching delimiter is found;
+		     pick up variable substitutions on the way.  Also
+		     allow backslashes to quote the delimiter, $, and \,
+		     but don't touch other backslashes. */
+		  for (cp=tstr;*cp&&(*cp!=delim);cp++)
+		   { if ( (*cp == '\\') &&
+			  ( (cp[1] == delim) ||
+			    (cp[1] == '$') ||
+			    (cp[1] == '\\') ) )
+		      { Buf_AddByte(buf,(Byte)cp[1]);
+			cp ++;
+		      }
+		     else if ((*cp == '$') && (cp[1] != delim))
+		      { char *cp2;
+			int len;
+			Boolean freeIt;
+			cp2 = Var_Parse(cp,ctxt,err,&len,&freeIt);
+			Buf_AddBytes(buf,strlen(cp2),(Byte *)cp2);
+			if (freeIt) free(cp2);
+			cp += len - 1;
+		      }
+		     else
+		      { Buf_AddByte(buf,(Byte)*cp);
+		      }
+		   }
+		  Buf_AddByte(buf,(Byte)'\0');
+		  if (*cp != delim)
+		   { *lengthPtr = cp - start + 1;
+		     if (*freePtr) free(str);
+		     Buf_Destroy(buf,TRUE);
+		     Error("Unclosed substitution for %s (%c missing)",v->name,delim);
+		     return(var_Error);
+		   }
+		  re = (char *) Buf_GetAll(buf,&junk);
+		  Buf_Destroy(buf,FALSE);
+		  /* Now we've got the match expression; pick up the replacement.
+		     Once again, do variable expansion and handle some backslashes. */
+		  buf = Buf_Init(0);
+		  tstr = cp + 1;
+		  for (cp=tstr;*cp&&(*cp!=delim);cp++)
+		   { if ( (*cp == '\\') &&
+			  ( (cp[1] == delim) ||
+			    (cp[1] == '\\') ||
+			    (cp[1] == '$') ) )
+		      { Buf_AddByte(buf,(Byte)cp[1]);
+			cp ++;
+		      }
+		     else if ((*cp == '$') && (cp[1] != delim))
+		      { char *cp2;
+			int len;
+			Boolean freeIt;
+			cp2 = Var_Parse(cp,ctxt,err,&len,&freeIt);
+			Buf_AddBytes(buf,strlen(cp2),(Byte *)cp2);
+			cp += len - 1;
+			if (freeIt) free(cp2);
+		      }
+		     else
+		      { Buf_AddByte(buf,(Byte)*cp);
+		      }
+		   }
+		  Buf_AddByte(buf,(Byte)'\0');
+		  if (*cp != delim)
+		   { *lengthPtr = cp - start + 1;
+		     free(re);
+		     if (*freePtr) free(str);
+		     Buf_Destroy(buf,TRUE);
+		     Error("Unclosed substitution for %s (%c missing)",v->name,delim);
+		     return(var_Error);
+		   }
+		  pat.replace = (char *) Buf_GetAll(buf,&junk);
+		  Buf_Destroy(buf,FALSE);
+		  cp ++;
+		  pat.global = 0;
+		  pat.oneword = 0;
+		  for (;;cp++)
+		   { switch (*cp)
+		      { case 'g':
+			   pat.global = 1;
+			   continue;
+			   break;
+			case '1':
+			   pat.oneword = 1;
+			   continue;
+			   break;
+		      }
+		     break;
+		   }
+		  termc = *cp;
+		  err = regcomp(&pat.re,re,REG_EXTENDED);
+		  if (err)
+		   { char *errbuf;
+		     int errlen;
+		     errlen = regerror(err,&pat.re,0,0);
+		     errbuf = malloc(errlen);
+		     regerror(err,&pat.re,errbuf,errlen);
+		     Error("RE substitution error: %s",errbuf);
+		     free(errbuf);
+		     free(re);
+		     free(pat.replace);
+		     return(var_Error);
+		   }
+		  free(re);
+		  pat.nsub = pat.re.re_nsub + 1;
+		  if (pat.nsub < 1) pat.nsub = 1;
+		  if (pat.nsub > 10) pat.nsub = 10;
+		  pat.matches = malloc(pat.nsub*sizeof(regmatch_t));
+		  pat.matched = 0;
+		  newStr = VarModify(str,VarRegexpSub,(ClientData)&pat);
+		  regfree(&pat.re);
+		  free(pat.replace);
+		  break;
 		}
 		case 'T':
 		    if (tstr[1] == endc || tstr[1] == ':') {

					der Mouse

			    mouse@collatz.mcrcim.mcgill.edu
		    01 EE 31 F6 BB 0C 34 36  00 F3 7C 5A C1 A0 67 1D