[src/trunk]: src/bin/sh Add support for $'...' quoting (based upon C "..." st...

To: source-changes-hg%NetBSD.org@localhost
Subject: [src/trunk]: src/bin/sh Add support for $'...' quoting (based upon C "..." st...
From: kre <kre%NetBSD.org@localhost>
Date: Tue, 07 Apr 2020 08:39:05 +0000
details:   https://anonhg.NetBSD.org/src/rev/d5662f4dbe4b
branches:  trunk
changeset: 826209:d5662f4dbe4b
user:      kre <kre%NetBSD.org@localhost>
date:      Mon Aug 21 13:20:49 2017 +0000

description:
Add support for $'...' quoting (based upon C "..." strings, with \ expansions.)

Implementation largely obtained from FreeBSD, with adaptations to meet the
needs and style of this sh, some updates to agree with the current POSIX spec,
and a few other minor changes.

The POSIX spec for this ( http://austingroupbugs.net/view.php?id=249 )
[see note 2809 for the current proposed text] is yet to be approved,
so might change.  It currently leaves several aspects as unspecified,
this implementation handles those as:

Where more than 2 hex digits follow \x this implementation processes the
first two as hex, the following characters are processed as if the \x
sequence was not present.  The value obtained from a \nnn octal sequence
is truncated to the low 8 bits (if a bigger value is written, eg: \456.)
Invalid escape sequences are errors.  Invalid \u (or \U) code points are
errors if known to be invalid, otherwise can generate a '?' character.
Where any escape sequence generates nul ('\0') that char, and the rest of
the $'...' string is discarded, but anything remaining in the word is
processed, ie: aaa$'bbb\0ccc'ddd produces the same as aaa'bbb'ddd.

Differences from FreeBSD:
  FreeBSD allows only exactly 4 or 8 hex digits for \u and \U (as does C,
  but the current sh proposal differs.)  FreeBSD also continues consuming
  as many hex digits as exist after \x (permitted by the spec, but insane),
  and rejects \u0000 as invalid.  Some of this is possibly because
  their implementation is based upon an earlier proposal, perhaps note 590 -
  though that has been updated several times.

Differences from the current POSIX proposal:
  We currently always generate UTF-8 for the \u & \U escapes.   We should
  generate the equivalent character from the current locale's character set
  (and UTF-8 only if that is what the current locale uses.)
  If anyone would like to correct that, go ahead.

  We (and FreeBSD) generate (X & 0x1F) for \cX escapes where we should generate
  the appropriate control character (SOH for \cA for example) with whatever
  value that has in the current character set.   Apart from EBCDIC, which
  we do not support, I've never seen a case where they differ, so ...

diffstat:

 bin/sh/expand.c |   17 ++++-
 bin/sh/parser.c |  187 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 bin/sh/parser.h |    7 +-
 bin/sh/sh.1     |  200 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 bin/sh/syntax.c |    5 +-
 bin/sh/syntax.h |   13 +-
 6 files changed, 404 insertions(+), 25 deletions(-)

diffs (truncated from 622 to 300 lines):

diff -r 19d551800b30 -r d5662f4dbe4b bin/sh/expand.c
--- a/bin/sh/expand.c   Mon Aug 21 10:38:19 2017 +0000
+++ b/bin/sh/expand.c   Mon Aug 21 13:20:49 2017 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: expand.c,v 1.119 2017/06/30 23:02:56 kre Exp $ */
+/*     $NetBSD: expand.c,v 1.120 2017/08/21 13:20:49 kre Exp $ */
 
 /*-
  * Copyright (c) 1991, 1993
@@ -37,7 +37,7 @@
 #if 0
 static char sccsid[] = "@(#)expand.c   8.5 (Berkeley) 5/15/95";
 #else
-__RCSID("$NetBSD: expand.c,v 1.119 2017/06/30 23:02:56 kre Exp $");
+__RCSID("$NetBSD: expand.c,v 1.120 2017/08/21 13:20:49 kre Exp $");
 #endif
 #endif /* not lint */
 
@@ -267,6 +267,9 @@
                                STPUTC(c, expdest);
                        line_number++;
                        break;
+               case CTLCNL:
+                       STPUTC('\n', expdest);  /* no line_number++ */
+                       break;
                case CTLQUOTEEND:
                        ifs_split = EXP_IFS_SPLIT;
                        break;
@@ -1842,6 +1845,11 @@
                        p++;
                        continue;
                }
+               if (*p == CTLCNL) {
+                       p++;
+                       *q++ = '\n';
+                       continue;
+               }
                if (*p == CTLESC)
                        p++;
                *q++ = *p++;
@@ -1883,6 +1891,11 @@
                        nls++;
                        continue;
                }
+               if (*p == CTLCNL) {
+                       p++;
+                       *q++ = '\n';
+                       continue;
+               }
                if (*p == CTLESC)
                        p++;
 
diff -r 19d551800b30 -r d5662f4dbe4b bin/sh/parser.c
--- a/bin/sh/parser.c   Mon Aug 21 10:38:19 2017 +0000
+++ b/bin/sh/parser.c   Mon Aug 21 13:20:49 2017 +0000
@@ -1,4 +1,4 @@
-/*     $NetBSD: parser.c,v 1.143 2017/08/05 11:33:05 kre Exp $ */
+/*     $NetBSD: parser.c,v 1.144 2017/08/21 13:20:49 kre Exp $ */
 
 /*-
  * Copyright (c) 1991, 1993
@@ -37,7 +37,7 @@
 #if 0
 static char sccsid[] = "@(#)parser.c   8.7 (Berkeley) 5/16/95";
 #else
-__RCSID("$NetBSD: parser.c,v 1.143 2017/08/05 11:33:05 kre Exp $");
+__RCSID("$NetBSD: parser.c,v 1.144 2017/08/21 13:20:49 kre Exp $");
 #endif
 #endif /* not lint */
 
@@ -1212,6 +1212,7 @@
 #define        NQ      0x00    /* Unquoted */
 #define        SQ      0x01    /* Single Quotes */
 #define        DQ      0x02    /* Double Quotes (or equivalent) */
+#define        CQ      0x03    /* C style Single Quotes */
 #define        QF      0x0F            /* Mask to extract previous values */
 #define        QS      0x10    /* Quoting started at this level in stack */
 
@@ -1562,6 +1563,165 @@
        redirnode = np;         /* this is the "value" of TRENODE */
 }
 
+/*
+ * Called to parse a backslash escape sequence inside $'...'.
+ * The backslash has already been read.
+ */
+static char *
+readcstyleesc(char *out)
+{
+       int c, vc, i, n;
+       unsigned int v;
+
+       c = pgetc();
+       switch (c) {
+       case '\0':
+       case PEOF:
+               synerror("Unterminated quoted string");
+       case '\n':
+               plinno++;
+               if (doprompt)
+                       setprompt(2);
+               else
+                       setprompt(0);
+               return out;
+
+       case '\\':
+       case '\'':
+       case '"':
+               v = c;
+               break;
+
+       case 'a': v = '\a'; break;
+       case 'b': v = '\b'; break;
+       case 'e': v = '\033'; break;
+       case 'f': v = '\f'; break;
+       case 'n': v = '\n'; break;
+       case 'r': v = '\r'; break;
+       case 't': v = '\t'; break;
+       case 'v': v = '\v'; break;
+
+       case '0': case '1': case '2': case '3':
+       case '4': case '5': case '6': case '7':
+               v = c - '0';
+               c = pgetc();
+               if (c >= '0' && c <= '7') {
+                       v <<= 3;
+                       v += c - '0';
+                       c = pgetc();
+                       if (c >= '0' && c <= '7') {
+                               v <<= 3;
+                               v += c - '0';
+                       } else
+                               pungetc();
+               } else
+                       pungetc();
+               break;
+
+       case 'c':
+               c = pgetc();
+               if (c < 0x3f || c > 0x7a || c == 0x60)
+                       synerror("Bad \\c escape sequence");
+               if (c == '\\' && pgetc() != '\\')
+                       synerror("Bad \\c\\ escape sequence");
+               if (c == '?')
+                       v = 127;
+               else
+                       v = c & 0x1f;
+               break;
+
+       case 'x':
+               n = 2;
+               goto hexval;
+       case 'u':
+               n = 4;
+               goto hexval;
+       case 'U':
+               n = 8;
+       hexval:
+               v = 0;
+               for (i = 0; i < n; i++) {
+                       c = pgetc();
+                       if (c >= '0' && c <= '9')
+                               v = (v << 4) + c - '0';
+                       else if (c >= 'A' && c <= 'F')
+                               v = (v << 4) + c - 'A' + 10;
+                       else if (c >= 'a' && c <= 'f')
+                               v = (v << 4) + c - 'a' + 10;
+                       else {
+                               pungetc();
+                               break;
+                       }
+               }
+               if (n > 2 && v > 127) {
+                       if (v >= 0xd800 && v <= 0xdfff)
+                               synerror("Invalid \\u escape sequence");
+
+                       /* XXX should we use iconv here. What locale? */
+                       CHECKSTRSPACE(4, out);
+
+                       if (v <= 0x7ff) {
+                               USTPUTC(0xc0 | v >> 6, out);
+                               USTPUTC(0x80 | (v & 0x3f), out);
+                               return out;
+                       } else if (v <= 0xffff) {
+                               USTPUTC(0xe0 | v >> 12, out);
+                               USTPUTC(0x80 | ((v >> 6) & 0x3f), out);
+                               USTPUTC(0x80 | (v & 0x3f), out);
+                               return out;
+                       } else if (v <= 0x10ffff) {
+                               USTPUTC(0xf0 | v >> 18, out);
+                               USTPUTC(0x80 | ((v >> 12) & 0x3f), out);
+                               USTPUTC(0x80 | ((v >> 6) & 0x3f), out);
+                               USTPUTC(0x80 | (v & 0x3f), out);
+                               return out;
+                       }
+                       if (v > 127)
+                               v = '?';
+               }
+               break;
+       default:
+               synerror("Unknown $'' escape sequence");
+       }
+       vc = (char)v;
+
+       /*
+        * If we managed to create a \n from a \ sequence (no matter how)
+        * then we replace it with the magic CRTCNL control char, which
+        * will turn into a \n again later, but in the meantime, never
+        * causes LINENO increments.
+        */
+       if (vc == '\n') {
+               USTPUTC(CTLCNL, out);
+               return out;
+       }
+
+       /*
+        * We can't handle NUL bytes.
+        * POSIX says we should skip till the closing quote.
+        */
+       if (vc == '\0') {
+               while ((c = pgetc()) != '\'') {
+                       if (c == '\\')
+                               c = pgetc();
+                       if (c == PEOF)
+                               synerror("Unterminated quoted string");
+                       if (c == '\n') {
+                               plinno++;
+                               if (doprompt)
+                                       setprompt(2);
+                               else
+                                       setprompt(0);
+                       }
+               }
+               pungetc();
+               return out;
+       }
+       if (SQSYNTAX[vc] == CCTL)
+               USTPUTC(CTLESC, out);
+       USTPUTC(vc, out);
+       return out;
+}
 
 /*
  * The lowest level basic tokenizer.
@@ -1623,9 +1783,16 @@
                                setprompt(0);
                        continue;
 
+               case CSBACK:    /* single quoted backslash */
+                       if ((quoted & QF) == CQ) {
+                               out = readcstyleesc(out);
+                               continue;
+                       }
+                       /* FALLTHROUGH */
                case CWORD:
                        USTPUTC(c, out);
                        continue;
+
                case CCTL:
                        if (!magicq || ISDBLQUOTE())
                                USTPUTC(CTLESC, out);
@@ -1826,10 +1993,7 @@
        static const char types[] = "}-+?=";
 
        c = pgetc_linecont();
-       if (c != '('/*)*/ && c != OPENBRACE && !is_name(c) && !is_special(c)) {
-               USTPUTC('$', out);
-               pungetc();
-       } else if (c == '('/*)*/) {     /* $(command) or $((arith)) */
+       if (c == '(' /*)*/) {   /* $(command) or $((arith)) */
                if (pgetc_linecont() == '(' /*')'*/ ) {
                        out = insert_elided_nl(out);
                        PARSEARITH();
@@ -1838,7 +2002,7 @@
                        pungetc();
                        out = parsebackq(stack, out, &bqlist, 0, magicq);
                }
-       } else {
+       } else if (c == OPENBRACE || is_name(c) || is_special(c)) {
                USTPUTC(CTLVAR, out);
                typeloc = out - stackblock();
                USTPUTC(VSNORMAL, out);
@@ -1974,6 +2138,15 @@
                                CLRDBLQUOTE();
                        }
                }
+       } else if (c == '\'' && syntax == BASESYNTAX) {
+               USTPUTC(CTLQUOTEMARK, out);
+               quotef = 1;
+               TS_PUSH();
+               syntax = SQSYNTAX;
+               quoted = CQ;
+       } else {
+               USTPUTC('$', out);
+               pungetc();
        }
        goto parsesub_return;
 }
diff -r 19d551800b30 -r d5662f4dbe4b bin/sh/parser.h
--- a/bin/sh/parser.h   Mon Aug 21 10:38:19 2017 +0000
Prev by Date: [src/trunk]: src/sys/arch/evbmips/stand/sbmips no ssp and pie
Next by Date: [src/trunk]: src/usr.sbin/rpcbind don't lock for RUMP
Previous by Thread: [src/trunk]: src/sys/arch/evbmips/stand/sbmips no ssp and pie
Next by Thread: [src/trunk]: src/usr.sbin/rpcbind don't lock for RUMP
Indexes:
Home | Main Index | Thread Index | Old Index