52169: a few more improvements of (#) flag

Fix (#X) in the C locale on FreeBSD, DragonFly, and NetBSD. Negative values such as ${(#X):--1} are now an error. UCS4 is limited to < 0x8000_0000 (on OSes without __STDC_ISO_10646__).
2025-06-11 07:08:07 +02:00 · 2023-09-27 01:56:47 +09:00 · 2023-09-27 01:56:47 +09:00 · 02e33c54d8
commit 02e33c54d8
parent e4e9afe373
3 changed files with 126 additions and 125 deletions
--- a/4
+++ b/4
@ -1,3 +1,7 @@
 2023-09-27  Jun-ichi Takimoto  <takimoto-j@kba.biglobe.ne.jp>
 	* 52169: Src/subst.c, Src/utils.c: a few more improvements of (#)
 2023-09-23  Bart Schaefer  <schaefer@BartMAC2014.local>
 	* 52180: Doc/Zsh/expn.yo: clarify array behavior of ${|var|...}
--- a/Src/subst.c
+++ b/Src/subst.c
@ -1501,16 +1501,15 @@ substevalchar(char *ptr)
 	return noerrs ? dupstring(""): NULL;
    }
    errflag |= saved_errflag;
-#ifdef MULTIBYTE_SUPPORT
+    if (ires < 0) {
-    if (isset(MULTIBYTE) && ires > 127) {
+	zerr("character not in range");
 	/* '\\' + 'U' + 8 bytes of character + '\0' */
 	char buf[11];
 	/* inefficient: should separate out \U handling from getkeystring */
 	sprintf(buf, "\\U%.8x", (unsigned int)ires & 0xFFFFFFFFu);
 	ptr = getkeystring(buf, &len, GETKEYS_BINDKEY, NULL);
    }
-    if (len == 0)
+#ifdef MULTIBYTE_SUPPORT
    else if (isset(MULTIBYTE) && ires > 127) {
 	ptr = zhalloc(MB_CUR_MAX);
 	len = ucs4tomb((unsigned int)ires & 0xffffffff, ptr);
    }
    if (len <= 0)
 #endif
    {
 	ptr = zhalloc(2);
--- a/Src/utils.c
+++ b/Src/utils.c
@ -6672,11 +6672,14 @@ dquotedzputs(char const *s, FILE *stream)
 # if defined(HAVE_NL_LANGINFO) && defined(CODESET) && !defined(__STDC_ISO_10646__)
 /* Convert a character from UCS4 encoding to UTF-8 */
-static size_t
+static int
 ucs4toutf8(char *dest, unsigned int wval)
 {
-    size_t len;
+    int len;
    /* UCS4 is now equivalent to UTF-32 and limited to 0 - 0x10_FFFF.
     * This function accepts 0 - 0x7FFF_FFFF (old range of UCS4) to be
     * compatible with wctomb(3) (in UTF-8 locale) on Linux. */
    if (wval < 0x80)
      len = 1;
    else if (wval < 0x800)
@ -6687,8 +6690,12 @@ ucs4toutf8(char *dest, unsigned int wval)
      len = 4;
    else if (wval < 0x4000000)
      len = 5;
-    else
+    else if (wval < 0x80000000)
      len = 6;
    else {
      zerr("character not in range");
      return -1;
    }
    switch (len) { /* falls through except to the last case */
    case 6: dest[5] = (wval & 0x3f) | 0x80; wval >>= 6;
@ -6705,30 +6712,89 @@ ucs4toutf8(char *dest, unsigned int wval)
 }
 #endif
 /* Convert UCS4 to a multibyte character in current locale.
 * Result is saved in buf (must be at least MB_CUR_MAX bytes long).
 * Returns the number of bytes saved in buf, or -1 if conversion fails. */
-/*
+/**/
- * The following only occurs once or twice in the code, but in different
+int
- * places depending how character set conversion is implemented.
+ucs4tomb(unsigned int wval, char *buf)
- */
+{
-#define CHARSET_FAILED()		      \
+#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) && defined(__STDC_ISO_10646__)
-    if (how & GETKEY_DOLLAR_QUOTE) {	      \
+    int count = wctomb(buf, (wchar_t)wval);
-	while ((*tdest++ = *++s)) {	      \
+    if (count == -1)
-	    if (how & GETKEY_UPDATE_OFFSET) { \
+	zerr("character not in range");
-		if (s - sstart > *misc)	      \
+    return count;
-		    (*misc)++;		      \
+#else	/* !(HAVE_WCHAR_H && HAVE_WCTOMB && __STDC_ISO_10646__) */
-	    }				      \
+# if defined(HAVE_NL_LANGINFO) && defined(CODESET)
-	    if (*s == Snull) {		      \
+    if (!strcmp(nl_langinfo(CODESET), "UTF-8")) {
-		*len = (s - sstart) + 1;      \
+	return ucs4toutf8(buf, wval);
-		*tdest = '\0';		      \
+    } else {
-		return buf;		      \
+#   ifdef HAVE_ICONV
-	    }				      \
+	iconv_t cd;
-	}				      \
+	char inbuf[4], *bsave = buf;
-	*len = tdest - buf;		      \
+	ICONV_CONST char *inptr = inbuf;
-	return buf;			      \
+	size_t inbytes = 4, outbytes = 6;
-    }					      \
+	const char *codesetstr = nl_langinfo(CODESET);
-    *t = '\0';				      \
+	size_t count;
-    *len = t - buf;			      \
+	int i;
-    return buf
+
 	/*
 	 * If the code set isn't handled, we'd better assume it's US-ASCII
 	 * rather than just failing hopelessly.  Solaris has a weird habit
 	 * of returning 646.  This is handled by the native iconv(), but
 	 * not by GNU iconv; what's more, some versions of the native iconv
 	 * don't handle standard names like ASCII.
 	 *
 	 * This should only be a problem if there's a mismatch between the
 	 * NLS and the iconv in use, which probably only means if libiconv
 	 * is in use.  We checked at configure time if our libraries pulled
 	 * in _libiconv_version, which should be a good test.
 	 *
 	 * It shouldn't ever be NULL, but while we're being paranoid...
 	 */
 #     ifdef ICONV_FROM_LIBICONV
 	if (!codesetstr || !*codesetstr)
 	    codesetstr = "US-ASCII";
 #     endif
 	cd = iconv_open(codesetstr, "UCS-4BE");
 #     ifdef ICONV_FROM_LIBICONV
 	if (cd == (iconv_t)-1 &&  !strcmp(codesetstr, "646")) {
 	    codesetstr = "US-ASCII";
 	    cd = iconv_open(codesetstr, "UCS-4BE");
 	}
 #     endif
 	if (cd == (iconv_t)-1) {
 	    zerr("cannot do charset conversion (iconv failed)");
 	    return -1;
 	}
 	/* store value in big endian form */
 	for (i=3; i>=0; i--) {
 	    inbuf[i] = wval & 0xff;
 	    wval >>= 8;
 	}
 	count = iconv(cd, &inptr, &inbytes, &buf, &outbytes);
 	iconv_close(cd);
 	if (count) {
 	    /* -1 indicates error. Positive value means number of "invalid"
 	     * (or "non-reversible") conversions, which we consider as
 	     * "out-of-range" characters. */
 	    zerr("character not in range");
 	    return -1;
 	}
 	return buf - bsave;
 #   else    /* !HAVE_ICONV */
 	zerr("cannot do charset conversion (iconv not available)");
 	return -1;
 #   endif   /* HAVE_ICONV */
    }
 # else	/* !(HAVE_NL_LANGINFO && CODESET) */
    zerr("cannot do charset conversion (NLS not supported)");
    return -1;
 # endif	/* HAVE_NL_LANGINFO && CODESET */
 #endif	/* HAVE_WCHAR_H && HAVE_WCTOMB && __STDC_ISO_10646__ */
 }
 /*
 * Decode a key string, turning it into the literal characters.
@ -6785,21 +6851,6 @@ getkeystring(char *s, int *len, int how, int *misc)
    char *t, *tdest = NULL, *u = NULL, *sstart = s, *tbuf = NULL;
    char svchar = '\0';
    int meta = 0, control = 0, ignoring = 0;
    int i;
 #if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) && defined(__STDC_ISO_10646__)
    wint_t wval;
    int count;
 #else
    unsigned int wval;
 # if defined(HAVE_NL_LANGINFO) && defined(CODESET)
 #  if defined(HAVE_ICONV)
    iconv_t cd;
    char inbuf[4];
    size_t inbytes, outbytes;
 #  endif
    size_t count;
 # endif
 #endif
    DPUTS((how & GETKEY_UPDATE_OFFSET) &&
 	  (how & ~(GETKEYS_DOLLARS_QUOTE|GETKEY_UPDATE_OFFSET)),
@ -6864,7 +6915,8 @@ getkeystring(char *s, int *len, int how, int *misc)
    }
    for (; *s; s++) {
 	if (*s == '\\' && s[1]) {
-	    int miscadded;
+	    int miscadded, count, i;
 	    unsigned int wval;
 	    if ((how & GETKEY_UPDATE_OFFSET) && s - sstart < *misc) {
 		(*misc)--;
 		miscadded = 1;
@ -6979,86 +7031,32 @@ getkeystring(char *s, int *len, int how, int *misc)
 		    *misc = wval;
 		    return s+1;
 		}
-#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) && defined(__STDC_ISO_10646__)
+		count = ucs4tomb(wval, t);
 		count = wctomb(t, (wchar_t)wval);
 		if (count == -1) {
-		    zerr("character not in range");
+		    if (how & GETKEY_DOLLAR_QUOTE) {
-		    CHARSET_FAILED();
+			while ((*tdest++ = *++s)) {
 			    if (how & GETKEY_UPDATE_OFFSET) {
 				if (s - sstart > *misc)
 				    (*misc)++;
 			    }
 			    if (*s == Snull) {
 				*len = (s - sstart) + 1;
 				*tdest = '\0';
 				return buf;
 			    }
 			}
 			*len = tdest - buf;
 		    }
 		    else {
 			*t = '\0';
 			*len = t - buf;
 		    }
 		    return buf;
 		}
 		if ((how & GETKEY_UPDATE_OFFSET) && s - sstart < *misc)
 		    (*misc) += count;
 		t += count;
 # else
 #  if defined(HAVE_NL_LANGINFO) && defined(CODESET)
 		if (!strcmp(nl_langinfo(CODESET), "UTF-8")) {
 		    count = ucs4toutf8(t, wval);
 		    t += count;
 		    if ((how & GETKEY_UPDATE_OFFSET) && s - sstart < *misc)
 			(*misc) += count;
 		} else {
 #   ifdef HAVE_ICONV
 		    ICONV_CONST char *inptr = inbuf;
 		    const char *codesetstr = nl_langinfo(CODESET);
    	    	    inbytes = 4;
 		    outbytes = 6;
 		    /* store value in big endian form */
 		    for (i=3;i>=0;i--) {
 			inbuf[i] = wval & 0xff;
 			wval >>= 8;
 		    }
 		    /*
 		     * If the code set isn't handled, we'd better
 		     * assume it's US-ASCII rather than just failing
 		     * hopelessly.  Solaris has a weird habit of
 		     * returning 646.  This is handled by the
 		     * native iconv(), but not by GNU iconv; what's
 		     * more, some versions of the native iconv don't
 		     * handle standard names like ASCII.
 		     *
 		     * This should only be a problem if there's a
 		     * mismatch between the NLS and the iconv in use,
 		     * which probably only means if libiconv is in use.
 		     * We checked at configure time if our libraries
 		     * pulled in _libiconv_version, which should be
 		     * a good test.
 		     *
 		     * It shouldn't ever be NULL, but while we're
 		     * being paranoid...
 		     */
 #ifdef ICONV_FROM_LIBICONV
 		    if (!codesetstr || !*codesetstr)
 			codesetstr = "US-ASCII";
 #endif
    	    	    cd = iconv_open(codesetstr, "UCS-4BE");
 #ifdef ICONV_FROM_LIBICONV
 		    if (cd == (iconv_t)-1 &&  !strcmp(codesetstr, "646")) {
 			codesetstr = "US-ASCII";
 			cd = iconv_open(codesetstr, "UCS-4BE");
 		    }
 #endif
 		    if (cd == (iconv_t)-1) {
 			zerr("cannot do charset conversion (iconv failed)");
 			CHARSET_FAILED();
 		    }
                    count = iconv(cd, &inptr, &inbytes, &t, &outbytes);
 		    iconv_close(cd);
 		    if (count == (size_t)-1) {
                        zerr("character not in range");
 			CHARSET_FAILED();
 		    }
 		    if ((how & GETKEY_UPDATE_OFFSET) && s - sstart < *misc)
 			(*misc) += count;
 #   else
                    zerr("cannot do charset conversion (iconv not available)");
 		    CHARSET_FAILED();
 #   endif
 		}
 #  else
                zerr("cannot do charset conversion (NLS not supported)");
 		CHARSET_FAILED();
 #  endif
 # endif
 		if (how & GETKEY_DOLLAR_QUOTE) {
 		    char *t2;
 		    for (t2 = tbuf; t2 < t; t2++) {