Subject: bin/36394: awk tolower/toupper functions don't support multibyte charsets
To: None <gnats-admin@netbsd.org, netbsd-bugs@netbsd.org>
From: None <cheusov@tut.by>
List: netbsd-bugs
Date: 05/28/2007 18:50:00
>Number: 36394
>Category: bin
>Synopsis: awk tolower/toupper functions don't support multibyte charsets
>Confidential: no
>Severity: serious
>Priority: medium
>Responsible: bin-bug-people
>State: open
>Class: sw-bug
>Submitter-Id: net
>Arrival-Date: Mon May 28 18:50:00 +0000 2007
>Originator: cheusov@tut.by
>Release: NetBSD 4.0_BETA2
>Organization:
Best regards, Aleksey Cheusov.
>Environment:
System: NetBSD chen.chizhovka.net 4.0_BETA2 NetBSD 4.0_BETA2 (GENERIC) #16: Mon May 21 00:01:33 EEST 2007 cheusov@chen.chizhovka.net:/srv/src/sys/arch/i386/compile/GENERIC i386
Architecture: i386
Machine: i386
>Description:
The 'tolower' and 'toupper' functions in NetBSD awk
do not support multibyte character encodings, e.g. UTF-8.
A patch follows. Please forward it to upstream.
>Fix:
? nawk-caseconv.patch
Index: nawk/proto.h
===================================================================
RCS file: /cvsroot/src/dist/nawk/proto.h,v
retrieving revision 1.5
diff -u -u -r1.5 proto.h
--- nawk/proto.h 26 Oct 2003 11:34:23 -0000 1.5
+++ nawk/proto.h 28 May 2007 18:36:25 -0000
@@ -112,6 +112,7 @@
extern char *getsval(Cell *);
extern char *getpssval(Cell *); /* for print */
extern char *tostring(const char *);
+extern char *tostringN(const char *, size_t n);
extern char *qstring(const char *, int);
extern void recinit(unsigned int);
Index: nawk/run.c
===================================================================
RCS file: /cvsroot/src/dist/nawk/run.c,v
retrieving revision 1.14
diff -u -u -r1.14 run.c
--- nawk/run.c 26 Jul 2006 20:46:37 -0000 1.14
+++ nawk/run.c 28 May 2007 18:36:25 -0000
@@ -25,6 +25,8 @@
#define DEBUG
#include <stdio.h>
#include <ctype.h>
+#include <wchar.h>
+#include <wctype.h>
#include <setjmp.h>
#include <limits.h>
#include <math.h>
@@ -1461,12 +1463,70 @@
void flush_all(void);
+static char *nawk_toXXX (
+ const char *s,
+ int (*fun_c) (int),
+ wint_t (*fun_wc) (wint_t))
+{
+ char *buf = NULL;
+ char *pbuf = NULL;
+ const char *ps = NULL;
+ size_t n = 0;
+ mbstate_t mbs, mbs2;
+ wchar_t wc;
+ size_t sz = MB_CUR_MAX;
+
+ if (sz == 1){
+ buf = tostring (s);
+
+ for (pbuf = buf; *pbuf; pbuf++)
+ *pbuf = fun_c ((uschar) *pbuf);
+
+ return buf;
+ }else{
+ /* upper/lower character may be shorter/longer */
+ buf = tostringN (s, strlen (s) * sz + 1);
+
+ memset (&mbs, 0, sizeof (mbs));
+ memset (&mbs2, 0, sizeof (mbs2));
+
+ ps = s;
+ pbuf = buf;
+ while (n = mbrtowc (&wc, ps, sz, &mbs), n > 0){
+ ps += n;
+
+ n = wcrtomb (pbuf, fun_wc (wc), &mbs2);
+ if (n == (size_t) -1)
+ FATAL("illegal wide character %s", s);
+
+ pbuf += n;
+ }
+
+ *pbuf = 0;
+
+ if (n)
+ FATAL("illegal byte sequence %s", s);
+
+ return buf;
+ }
+}
+
+static char *nawk_toupper (const char *s)
+{
+ return nawk_toXXX (s, toupper, towupper);
+}
+
+static char *nawk_tolower (const char *s)
+{
+ return nawk_toXXX (s, tolower, towlower);
+}
+
Cell *bltin(Node **a, int n) /* builtin functions. a[0] is type, a[1] is arg list */
{
Cell *x, *y;
Awkfloat u;
int t, sz;
- char *p, *buf, *fmt;
+ char *buf, *fmt;
Node *nextarg;
FILE *fp;
time_t tv;
@@ -1521,17 +1581,14 @@
srand((unsigned int) u);
break;
case FTOUPPER:
+ buf = nawk_toupper (getsval (x));
+ tempfree(x);
+ x = gettemp();
+ setsval(x, buf);
+ free(buf);
+ return x;
case FTOLOWER:
- buf = tostring(getsval(x));
- if (t == FTOUPPER) {
- for (p = buf; *p; p++)
- if (islower((uschar) *p))
- *p = toupper((uschar)*p);
- } else {
- for (p = buf; *p; p++)
- if (isupper((uschar) *p))
- *p = tolower((uschar)*p);
- }
+ buf = nawk_tolower (getsval (x));
tempfree(x);
x = gettemp();
setsval(x, buf);
Index: nawk/tran.c
===================================================================
RCS file: /cvsroot/src/dist/nawk/tran.c,v
retrieving revision 1.9
diff -u -u -r1.9 tran.c
--- nawk/tran.c 26 Jul 2006 20:46:37 -0000 1.9
+++ nawk/tran.c 28 May 2007 18:36:25 -0000
@@ -410,6 +410,17 @@
return(p);
}
+char *tostringN(const char *s, size_t n) /* make a copy of string s */
+{
+ char *p;
+
+ p = malloc(n);
+ if (p == NULL)
+ FATAL("out of space in tostring on %s", s);
+ strcpy(p, s);
+ return(p);
+}
+
char *qstring(const char *is, int delim) /* collect string up to next delim */
{
const char *os = is;