21736: improve tests for word and identifier characters with multibyte input

2025-09-02 22:11:54 +02:00 · 2005-09-20 15:10:26 +00:00 · 2005-09-20 15:10:26 +00:00 · 409296e22f
commit 409296e22f
parent ce43e4a22c
7 changed files with 104 additions and 35 deletions
--- a/7
+++ b/7
@ -1,3 +1,10 @@
+2005-09-20  Peter Stephenson  <pws@csr.com>
+
+	* 21736: Src/init.c, Src/params.c, Src/pattern.c, Src/utils.c,
+	Src/Zle/zle.h, Src/Zle/zle_main.c: Fix WORDCHARS to use multibyte
+	characters; rationalise the test for identifiers to accept only
+	ASCII characters; remove existing hack for 8-bit characters.
+
 2005-09-19  Peter Stephenson  <pws@csr.com>

 	* unposted, c.f. 21735: Doc/Zsh/grammar.yo: document
--- a/Src/Zle/zle.h
+++ b/Src/Zle/zle.h
@ -66,12 +66,7 @@ typedef wint_t   ZLE_INT_T;

 #define ZC_iblank iswspace
 #define ZC_icntrl iswcntrl
-/*
- * TODO: doesn't work on arguments with side effects.
- * Also YUK.  Not even sure this is guaranteed to work.
- * Should be easy to do along the lines of wcsiword.
- */
-#define ZC_iident(x)	(x < 256 && iident((int)x))
+#define ZC_iident wcsiident

 #define ZC_tolower towlower
 #define ZC_toupper towupper
--- a/Src/Zle/zle_main.c
+++ b/Src/Zle/zle_main.c
@ -106,11 +106,6 @@ mod_export ZLE_INT_T lastchar_wide;
 /**/
 mod_export int
 lastchar_wide_valid;
-
-/**/
-mod_export ZLE_STRING_T zle_wordchars;
-#else
-# define zle_wordchars wordchars;
 #endif

 /* the bindings for the previous and for this key */
@ -1558,17 +1553,6 @@ trashzle(void)
 	kungetct = 0;
 }

-/**/
-mod_export void
-wordcharstrigger(void)
-{
-#ifdef ZLE_UNICODE_SUPPORT
-    zrealloc(zle_wordchars, strlen(wordchars)*MB_CUR_MAX);
-    mbsrtowcs(zle_wordchars, (const char **)&wordchars,
-	      strlen(wordchars), NULL);
-    /* TODO: error handling here */
-#endif
-}

 /* Hook functions. Used to allow access to zle parameters if zle is
 * active. */
@ -1636,8 +1620,6 @@ setup_(UNUSED(Module m))
    kungetbuf = (char *) zalloc(kungetsz = 32);
    comprecursive = 0;
    rdstrs = NULL;
-    wordcharstriggerptr = wordcharstrigger;
-    wordcharstrigger();

    /* initialise the keymap system */
    init_keymaps();
@ -1712,7 +1694,6 @@ finish_(UNUSED(Module m))
    zlegetlineptr = NULL;
    zlereadptr = fallback_zleread;
    zlesetkeymapptr= noop_function_int;
-    wordcharstriggerptr = noop_function;

    getkeyptr = NULL;

--- a/Src/init.c
+++ b/Src/init.c
@ -1179,9 +1179,6 @@ mod_export ZleVoidIntFn zlesetkeymapptr = noop_function_int;

 #endif /* !LINKED_XMOD_zshQszle */

-/**/
-mod_export ZleVoidFn wordcharstriggerptr = noop_function;
-
 /**/
 unsigned char *
 autoload_zleread(char **lp, char **rp, int ha, int con)
--- a/Src/params.c
+++ b/Src/params.c
@ -3346,7 +3346,6 @@ wordcharssetfn(UNUSED(Param pm), char *x)
    zsfree(wordchars);
    wordchars = x;
    inittyptab();
-    wordcharstriggerptr();
 }

 /* Function to get value for special parameter `_' */
--- a/Src/pattern.c
+++ b/Src/pattern.c
@ -2749,6 +2749,10 @@ patmatchrange(char *range, int ch)
 		    return 1;
 		break;
 	    case PP_WORD:
+		/*
+		 * HERE: when we support multibyte characters,
+		 * this test needs to be wcsiword().
+		 */
 		if (iword(ch))
 		    return 1;
 		break;
--- a/Src/utils.c
+++ b/Src/utils.c
@ -35,6 +35,16 @@
 /**/
 char *scriptname;

+#ifdef ZLE_UNICODE_SUPPORT
+/*
+ * The wordchars variable as a null-terminated wide character string.
+ * Rebuilt by inittyptab(); searched with wcschr() in wcsiword().
+ */
+
+/**/
+mod_export wchar_t *wordchars_wide;
+#endif
+
 /* Print an error */
 
 /**/
@ -2456,8 +2466,18 @@ inittyptab(void)
 	typtab[t0] = IDIGIT | IALNUM | IWORD | IIDENT | IUSER;
    for (t0 = 'a'; t0 <= 'z'; t0++)
 	typtab[t0] = typtab[t0 - 'a' + 'A'] = IALPHA | IALNUM | IIDENT | IUSER | IWORD;
+#ifndef ZLE_UNICODE_SUPPORT
+    /*
+     * This really doesn't seem to me the right thing to do when
+     * we have multibyte character support...  it was a hack to assume
+     * eight bit characters `worked' for some values of work before
+     * we could test for them properly.  I'm not 100% convinced
+     * having IIDENT here is a good idea at all, but this code
+     * should disappear into history...
+     */
    for (t0 = 0240; t0 != 0400; t0++)
 	typtab[t0] = IALPHA | IALNUM | IIDENT | IUSER | IWORD;
+#endif
    typtab['_'] = IIDENT | IUSER;
    typtab['-'] = IUSER;
    typtab[' '] |= IBLANK | INBLANK;
@ -2477,8 +2497,44 @@ inittyptab(void)
 	}
 	typtab[STOUC(*s == Meta ? *++s ^ 32 : *s)] |= ISEP;
    }
-    for (s = wordchars ? wordchars : DEFAULT_WORDCHARS; *s; s++)
-	typtab[STOUC(*s == Meta ? *++s ^ 32 : *s)] |= IWORD;
+#ifdef ZLE_UNICODE_SUPPORT
+    if (wordchars) {
+	const char *wordchars_ptr = wordchars;
+	mbstate_t mbs;
+	int nchars;
+
+	memset(&mbs, 0, sizeof(mbs));
+	wordchars_wide = (wchar_t *)
+	    zrealloc(wordchars_wide, (strlen(wordchars)+1)*sizeof(wchar_t));
+	nchars = mbsrtowcs(wordchars_wide, &wordchars_ptr, strlen(wordchars),
+			   &mbs);
+	if (nchars == -1) {
+	    /* Conversion state is undefined: better just set to null */
+	    *wordchars_wide = L'\0';
+	} else {
+	    wordchars_wide[nchars] = L'\0';
+	}
+    } else {
+	wordchars_wide = zrealloc(wordchars_wide, sizeof(wchar_t));
+	*wordchars_wide = L'\0';
+    }
+#endif
+    for (s = wordchars ? wordchars : DEFAULT_WORDCHARS; *s; s++) {
+	int c = STOUC(*s == Meta ? *++s ^ 32 : *s);
+#ifdef ZLE_UNICODE_SUPPORT
+	if (!isascii(c)) {
+	    /*
+	     * If we have support for multibyte characters, we don't
+	     * handle non-ASCII characters here; instead, we turn
+	     * wordchars into a wide character array.
+	     * (We may actually have a single-byte 8-bit character set,
+	     * but it works the same way.)
+	     */
+	    continue;
+	}
+#endif
+	typtab[c] |= IWORD;
+    }
    for (s = SPECCHARS; *s; s++)
 	typtab[STOUC(*s)] |= ISPECIAL;
    if (isset(BANGHIST) && bangchar && interact && isset(SHINSTDIN))
@ -2503,9 +2559,6 @@ wcsiword(wchar_t c)
     * produces an ASCII character.  If it does, use iword on that.
     * If it doesn't, use iswalnum on the original character.  This
     * is pretty good most of the time.
-     *
-     * TODO: extend WORDCHARS to handle multibyte chars by some kind
-     * of hierarchical list or hash table.
     */
    len = wctomb(outstr, c);

@ -2515,7 +2568,40 @@ wcsiword(wchar_t c)
    } else if (len == 1 && isascii(*outstr)) {
 	return iword(*outstr);
    } else {
-	return iswalnum(c);
+	return iswalnum(c) || wcschr(wordchars_wide, c);
+    }
+}
+
+/*
+ * iident() macro extended to support wide characters.
+ *
+ * Returns non-zero if c is allowed in an internal zsh identifier.
+ * Until the main shell handles multibyte characters, only characters
+ * that convert to a single ASCII byte are passed on to iident();
+ * anything else is rejected, since otherwise zle would allow
+ * characters that the main shell would reject.  Eventually we
+ * should be able to allow all alphanumerics.
+ *
+ * Otherwise similar to wcsiword.
+ */
+
+/**/
+mod_export int
+wcsiident(wchar_t c)
+{
+    int len;
+    VARARR(char, outstr, MB_CUR_MAX);
+
+    len = wctomb(outstr, c);
+
+    if (len == 0) {
+	/* no bytes were output: never an identifier character */
+	return 0;
+    } else if (len == 1 && isascii(*outstr)) {
+	return iident(*outstr);
+    } else {
+	/* multibyte or failed conversion: not currently allowed, see above */
+	return 0;
    }
 }
 #endif