tech-userlevel: Re: UTF-8 file names?

Subject: Re: UTF-8 file names?
To: Christos Zoulas <christos@zoulas.com>
From: SODA Noriyuki <soda@sra.co.jp>
List: tech-userlevel
Date: 09/26/2004 05:59:24
>>>>> On Sat, 25 Sep 2004 15:23:37 GMT,
	christos@zoulas.com (Christos Zoulas) said:

>>> >> ls has a problem, because it is using isprint(3) instead of iswprint(3).
>>> >> You can work around the problem by using something like "ls | cat",
>>> >> though.

>>> > Should it be changed to use iswprint(3)?

>>> Yes.

>> We're using fts_name, which is a char *.
>> How can I convert it to wchar_t *?

> size_t
> mbsrtowcs(wchar_t * restrict pwcs, const char ** restrict s, size_t n,
>     mbstate_t * restrict ps);
> 
> ???

Yes, mbsrtowcs() or mbrtowcs() can be used, but with these functions
it is unclear how to handle the EILSEQ case.
So, I think the attached implementation is better.
Note that this patch doesn't handle -B and -b options, because I'm
not sure how strvis() should be i18n'ized.

BTW, it seems there is a bug in our wcrtomb() which makes
wcrtomb(buf, L'\0', pst) return 1 instead of 4 with LANG=ja_JP.ISO2022-JP.
--
soda

Index: util.c
===================================================================
RCS file: /cvsroot/src/bin/ls/util.c,v
retrieving revision 1.27
diff -u -r1.27 util.c
--- util.c	22 Sep 2003 02:43:20 -0000	1.27
+++ util.c	25 Sep 2004 20:42:20 -0000
@@ -44,7 +44,6 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
-#include <ctype.h>
 #include <err.h>
 #include <fts.h>
 #include <limits.h>
@@ -52,6 +51,8 @@
 #include <stdlib.h>
 #include <string.h>
 #include <vis.h>
+#include <wchar.h>
+#include <wctype.h>
 
 #include "ls.h"
 #include "extern.h"
@@ -84,17 +85,64 @@
 		/* NOTREACHED */
 }
 
+/*
+ * Write wc to stream as multibyte bytes, tracking shift state in *pst.
+ * A trailing NUL byte is dropped, so L'\0' emits only the sequence that
+ * restores the initial shift state; fputwc(L'\0', stream) is not
+ * guaranteed to do that.  Using fwrite(3) also avoids making the stream
+ * wide-oriented (fwide(3)).  Returns wc, or WEOF on failure.
+ */
+wint_t
+fputwcr(wchar_t wc, mbstate_t *pst, FILE *stream)
+{
+	char mb[MB_LEN_MAX];
+	size_t len;
+
+	len = wcrtomb(mb, wc, pst);
+	if (len == (size_t)-1)
+		return WEOF;		/* wc cannot be encoded */
+	if (len != 0 && mb[len - 1] == '\0')
+		len--;			/* keep shift bytes, drop the NUL */
+	if (len == 0)
+		return (wint_t)wc;	/* nothing left to write */
+	return fwrite(mb, 1, len, stream) == len ? (wint_t)wc : WEOF;
+}
+
 int
 printescaped(const char *src)
 {
-	unsigned char c;
+	wchar_t wc;
+	mbstate_t src_state, stdout_state;
 	int n;
+	size_t rv;
 
-	for (n = 0; (c = *src) != '\0'; ++src, ++n)
-		if (isprint(c))
-			(void)putchar(c);
-		else
-			(void)putchar('?');
+	memset(&src_state, 0, sizeof(src_state));
+	memset(&stdout_state, 0, sizeof(stdout_state));
+	for (n = 0; ; ++src, ++n) {
+		rv = mbrtowc(&wc, src, 1, &src_state);
+		if (rv == 0) { /* assert(wc == L'\0'); */
+			/*
+			 * To output shift sequence needed to restore
+			 * the initial shift state.
+			 */
+			fputwcr(wc, &stdout_state, stdout); 
+			break;
+		} else if (rv == (size_t)-1) { /* assert(errno == EILSEQ); */
+			fputwcr('?', &stdout_state, stdout);
+			/*
+			 * It's unsure how to restart from here...
+			 * Just skip one byte.
+			 */
+			memset(&src_state, 0, sizeof(src_state));
+		} if (rv == (size_t)-2) {
+			/* incomplete but potentially valid character */
+		} else { /* assert(rv == 1); */
+			if (iswprint(wc))
+				fputwcr(wc, &stdout_state, stdout);
+			else
+				fputwcr('?', &stdout_state, stdout);
+		}
+	}
 	return n;
 }