Subject: bin/36394: awk tolower/toupper functions don't support multibyte charsets
To: None <gnats-admin@netbsd.org, netbsd-bugs@netbsd.org>
From: None <cheusov@tut.by>
List: netbsd-bugs
Date: 05/28/2007 18:50:00
>Number:         36394
>Category:       bin
>Synopsis:       awk tolower/toupper functions don't support multibyte charsets
>Confidential:   no
>Severity:       serious
>Priority:       medium
>Responsible:    bin-bug-people
>State:          open
>Class:          sw-bug
>Submitter-Id:   net
>Arrival-Date:   Mon May 28 18:50:00 +0000 2007
>Originator:     cheusov@tut.by
>Release:        NetBSD 4.0_BETA2
>Organization:
Best regards, Aleksey Cheusov.
>Environment:
System: NetBSD chen.chizhovka.net 4.0_BETA2 NetBSD 4.0_BETA2 (GENERIC) #16: Mon May 21 00:01:33 EEST 2007 cheusov@chen.chizhovka.net:/srv/src/sys/arch/i386/compile/GENERIC i386
Architecture: i386
Machine: i386
>Description:
The 'tolower' and 'toupper' functions in NetBSD awk do not support
multibyte character sets such as UTF-8.
A patch follows. Please forward it upstream.
>Fix:

? nawk-caseconv.patch
Index: nawk/proto.h
===================================================================
RCS file: /cvsroot/src/dist/nawk/proto.h,v
retrieving revision 1.5
diff -u -u -r1.5 proto.h
--- nawk/proto.h	26 Oct 2003 11:34:23 -0000	1.5
+++ nawk/proto.h	28 May 2007 18:36:25 -0000
@@ -112,6 +112,7 @@
 extern	char	*getsval(Cell *);
 extern	char	*getpssval(Cell *);     /* for print */
 extern	char	*tostring(const char *);
+extern	char	*tostringN(const char *, size_t n);
 extern	char	*qstring(const char *, int);
 
 extern	void	recinit(unsigned int);
Index: nawk/run.c
===================================================================
RCS file: /cvsroot/src/dist/nawk/run.c,v
retrieving revision 1.14
diff -u -u -r1.14 run.c
--- nawk/run.c	26 Jul 2006 20:46:37 -0000	1.14
+++ nawk/run.c	28 May 2007 18:36:25 -0000
@@ -25,6 +25,8 @@
 #define DEBUG
 #include <stdio.h>
 #include <ctype.h>
+#include <wchar.h>
+#include <wctype.h>
 #include <setjmp.h>
 #include <limits.h>
 #include <math.h>
@@ -1461,12 +1463,70 @@
 
 void flush_all(void);
 
+static char *nawk_toXXX (	/* return a case-converted copy of s; caller frees it */
+	const char *s,			/* NUL-terminated input string */
+	int (*fun_c) (int),		/* per-byte converter, used when MB_CUR_MAX == 1 */
+	wint_t (*fun_wc) (wint_t))	/* per-wide-char converter, used otherwise */
+{
+	char *buf      = NULL;
+	char *pbuf     = NULL;
+	const char *ps = NULL;
+	size_t n       = 0;
+	mbstate_t mbs, mbs2;	/* separate states: mbs for decoding, mbs2 for encoding */
+	wchar_t wc;
+	size_t sz = MB_CUR_MAX;
+
+	if (sz == 1){
+		buf = tostring (s);
+
+		for (pbuf = buf; *pbuf; pbuf++)
+			*pbuf = fun_c ((uschar) *pbuf);
+
+		return buf;
+	}else{
+		/* each input byte yields at most sz output bytes, plus the NUL */
+		buf = tostringN (s, strlen (s) * sz + 1);
+
+		memset (&mbs,  0, sizeof (mbs));
+		memset (&mbs2, 0, sizeof (mbs2));
+
+		ps   = s;
+		pbuf = buf;
+		while (n = mbrtowc (&wc, ps, sz, &mbs), n != 0){
+			/* n is unsigned: (size_t)-1 / -2 are errors, not lengths */
+			if (n == (size_t) -1 || n == (size_t) -2)
+				FATAL("illegal byte sequence %s", s);
+			ps += n;
+
+			n = wcrtomb (pbuf, fun_wc (wc), &mbs2);
+			if (n == (size_t) -1)	/* assumes FATAL does not return */
+				FATAL("illegal wide character %s", s);
+
+			pbuf += n;
+		}
+
+		*pbuf = 0;
+
+		return buf;
+	}
+}
+
+static char *nawk_toupper (const char *s)	/* upper-case copy of s via nawk_toXXX; bltin frees the result */
+{
+	return nawk_toXXX (s, toupper, towupper);
+}
+
+static char *nawk_tolower (const char *s)	/* lower-case copy of s via nawk_toXXX; result is heap-allocated */
+{
+	return nawk_toXXX (s, tolower, towlower);
+}
+
 Cell *bltin(Node **a, int n)	/* builtin functions. a[0] is type, a[1] is arg list */
 {
 	Cell *x, *y;
 	Awkfloat u;
 	int t, sz;
-	char *p, *buf, *fmt;
+	char *buf, *fmt;
 	Node *nextarg;
 	FILE *fp;
 	time_t tv;
@@ -1521,17 +1581,14 @@
 		srand((unsigned int) u);
 		break;
 	case FTOUPPER:
+		buf = nawk_toupper (getsval (x));
+		tempfree(x);
+		x = gettemp();
+		setsval(x, buf);
+		free(buf);
+		return x;
 	case FTOLOWER:
-		buf = tostring(getsval(x));
-		if (t == FTOUPPER) {
-			for (p = buf; *p; p++)
-				if (islower((uschar) *p))
-					*p = toupper((uschar)*p);
-		} else {
-			for (p = buf; *p; p++)
-				if (isupper((uschar) *p))
-					*p = tolower((uschar)*p);
-		}
+		buf = nawk_tolower (getsval (x));
 		tempfree(x);
 		x = gettemp();
 		setsval(x, buf);
Index: nawk/tran.c
===================================================================
RCS file: /cvsroot/src/dist/nawk/tran.c,v
retrieving revision 1.9
diff -u -u -r1.9 tran.c
--- nawk/tran.c	26 Jul 2006 20:46:37 -0000	1.9
+++ nawk/tran.c	28 May 2007 18:36:25 -0000
@@ -410,6 +410,17 @@
 	return(p);
 }
 
+char *tostringN(const char *s, size_t n)	/* copy s into a new n-byte buffer; caller frees */
+{
+	char *p;
+	if (n <= strlen(s))	/* strcpy below writes strlen(s) + 1 bytes */
+		FATAL("buffer too small in tostringN on %s", s);
+	if ((p = malloc(n)) == NULL)
+		FATAL("out of space in tostringN on %s", s);
+	strcpy(p, s);
+	return(p);
+}
+
 char *qstring(const char *is, int delim)	/* collect string up to next delim */
 {
 	const char *os = is;