mirror of
git://git.code.sf.net/p/zsh/code
synced 2026-01-06 09:41:07 +01:00
22408: support for multibyte characters in patterns
This commit is contained in:
parent
82dc72e034
commit
ef330a5dfd
8 changed files with 587 additions and 417 deletions
|
|
@ -1,6 +1,11 @@
|
|||
2006-04-09 Peter Stephenson <p.w.stephenson@ntlworld.com>
|
||||
|
||||
* 11407: Functions/MIME/zsh-mime-handler: and it needs NULL_GLOB,
|
||||
* 22408: Doc/Zsh/expn.yo, Doc/Zsh/options.yo, Src/options.c,
|
||||
Src/pattern.c, Src/utils.c, Src/zsh.h, Src/Zle/zle.h: add
|
||||
MULTIBYTE option, (#u) and (#U) globbing flags, and support
|
||||
for multibyte characters in patterns.
|
||||
|
||||
* 22407: Functions/MIME/zsh-mime-handler: and it needs NULL_GLOB,
|
||||
too.
|
||||
|
||||
2006-04-06 Peter Stephenson <pws@csr.com>
|
||||
|
|
|
|||
|
|
@ -1461,20 +1461,20 @@ tt(LPAR()#)var(X)tt(RPAR()) where var(X) may have one of the following
|
|||
forms:
|
||||
|
||||
startitem()
|
||||
item(i)(
|
||||
item(tt(i))(
|
||||
Case insensitive: upper or lower case characters in the pattern match
|
||||
upper or lower case characters.
|
||||
)
|
||||
item(l)(
|
||||
item(tt(l))(
|
||||
Lower case characters in the pattern match upper or lower case
|
||||
characters; upper case characters in the pattern still only match
|
||||
upper case characters.
|
||||
)
|
||||
item(I)(
|
||||
item(tt(I))(
|
||||
Case sensitive: locally negates the effect of tt(i) or tt(l) from
|
||||
that point on.
|
||||
)
|
||||
item(b)(
|
||||
item(tt(b))(
|
||||
Activate backreferences for parenthesised groups in the pattern;
|
||||
this does not work in filename generation. When a pattern with a set of
|
||||
active parentheses is matched, the strings matched by the groups are
|
||||
|
|
@ -1525,11 +1525,11 @@ start and end indices are set to -1.
|
|||
|
||||
Pattern matching with backreferences is slightly slower than without.
|
||||
)
|
||||
item(B)(
|
||||
item(tt(B))(
|
||||
Deactivate backreferences, negating the effect of the tt(b) flag from that
|
||||
point on.
|
||||
)
|
||||
item(m)(
|
||||
item(tt(m))(
|
||||
Set references to the match data for the entire string matched; this is
|
||||
similar to backreferencing and does not work in filename generation. The
|
||||
flag must be in effect at the end of the pattern, i.e. not local to a
|
||||
|
|
@ -1550,7 +1550,7 @@ Unlike backreferences, there is no speed penalty for using match
|
|||
references, other than the extra substitutions required for the
|
||||
replacement strings in cases such as the example shown.
|
||||
)
|
||||
item(M)(
|
||||
item(tt(M))(
|
||||
Deactivate the tt(m) flag, hence no references to match data will be
|
||||
created.
|
||||
)
|
||||
|
|
@ -1596,6 +1596,19 @@ the latter case the `tt((#b))' is useful for backreferences and the
|
|||
`tt((#q.))' will be ignored. Note that colon modifiers in the glob
|
||||
qualifiers are also not applied in ordinary pattern matching.
|
||||
)
|
||||
item(tt(u))(
|
||||
Respect the current locale in determining the presence of multibyte
|
||||
characters in a pattern, provided the shell was compiled with
|
||||
tt(MULTIBYTE_SUPPORT). This overrides the tt(MULTIBYTE)
|
||||
option; the default behaviour is taken from the option. Compare tt(U).
|
||||
(Mnemonic: typically multibyte characters are from Unicode in the UTF-8
|
||||
encoding, although any extension of ASCII supported by the system
|
||||
library may be used.)
|
||||
)
|
||||
item(tt(U))(
|
||||
All characters are considered to be a single byte long. The opposite
|
||||
of tt(u). This overrides the tt(MULTIBYTE) option.
|
||||
)
|
||||
enditem()
|
||||
|
||||
For example, the test string tt(fooxx) can be matched by the pattern
|
||||
|
|
|
|||
|
|
@ -411,6 +411,20 @@ item(tt(MARK_DIRS) (tt(-8), ksh: tt(-X)))(
|
|||
Append a trailing `tt(/)' to all directory
|
||||
names resulting from filename generation (globbing).
|
||||
)
|
||||
pindex(MULTIBYTE)
|
||||
cindex(characters, multibyte, in expansion and globbing)
|
||||
cindex(multibyte characters, in expansion and globbing)
|
||||
item(tt(MULTIBYTE))(
|
||||
Respect multibyte characters when found during pattern matching.
|
||||
When this option is set, character strings are examined using the
|
||||
system library to determine how many bytes form a character, depending
|
||||
on the current locale. If the option is unset
|
||||
(or the shell was not compiled with the configuration option
|
||||
tt(MULTIBYTE_SUPPORT)) a single byte is always treated as a single
|
||||
character. The option will eventually be extended to cover expansion.
|
||||
Note, however, that it does not affect the shell's editor, which always
|
||||
uses the locale to determine multibyte characters.
|
||||
)
|
||||
pindex(NOMATCH)
|
||||
cindex(globbing, no matches)
|
||||
item(tt(NOMATCH) (tt(PLUS()3)) <C> <Z>)(
|
||||
|
|
|
|||
|
|
@ -33,20 +33,6 @@ typedef wchar_t *ZLE_STRING_T;
|
|||
typedef wint_t ZLE_INT_T;
|
||||
#define ZLE_CHAR_SIZE sizeof(wchar_t)
|
||||
|
||||
/*
|
||||
* MB_CUR_MAX is the maximum number of bytes that a single wide
|
||||
* character will convert into. We use it to keep strings
|
||||
* sufficiently long. It should always be defined, but if it isn't
|
||||
* just assume we are using Unicode which requires 6 characters.
|
||||
* (Note that it's not necessarily defined to a constant.)
|
||||
*/
|
||||
#ifndef MB_CUR_MAX
|
||||
#define MB_CUR_MAX 6
|
||||
#endif
|
||||
|
||||
/* Convert character or string to wide character or string */
|
||||
#define ZWC(c) L ## c
|
||||
#define ZWS(s) L ## s
|
||||
|
||||
#define ZLEEOF WEOF
|
||||
|
||||
|
|
@ -96,10 +82,6 @@ typedef char *ZLE_STRING_T;
|
|||
typedef int ZLE_INT_T;
|
||||
#define ZLE_CHAR_SIZE sizeof(ZLE_CHAR_T)
|
||||
|
||||
/* Leave character or string as is. */
|
||||
#define ZWC(c) c
|
||||
#define ZWS(s) s
|
||||
|
||||
#define ZLEEOF EOF
|
||||
|
||||
/* Functions that operate on a ZLE_STRING_T. */
|
||||
|
|
|
|||
|
|
@ -166,6 +166,7 @@ static struct optname optns[] = {
|
|||
{{NULL, "markdirs", 0}, MARKDIRS},
|
||||
{{NULL, "menucomplete", 0}, MENUCOMPLETE},
|
||||
{{NULL, "monitor", OPT_SPECIAL}, MONITOR},
|
||||
{{NULL, "multibyte", 0/*TBD*/}, MULTIBYTE},
|
||||
{{NULL, "multios", OPT_EMULATE|OPT_ZSH}, MULTIOS},
|
||||
{{NULL, "nomatch", OPT_EMULATE|OPT_NONBOURNE},NOMATCH},
|
||||
{{NULL, "notify", OPT_ZSH}, NOTIFY},
|
||||
|
|
|
|||
911
Src/pattern.c
911
Src/pattern.c
File diff suppressed because it is too large
Load diff
|
|
@ -2702,7 +2702,7 @@ wcsiident(wchar_t c)
|
|||
} else if (len == 1 && iascii(*outstr)) {
|
||||
return iident(*outstr);
|
||||
} else {
|
||||
/* not currently allowed, see above */
|
||||
/* TODO: not currently allowed, see above */
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
24
Src/zsh.h
24
Src/zsh.h
|
|
@ -1161,6 +1161,7 @@ struct patprog {
|
|||
#define GF_IGNCASE 0x0200
|
||||
#define GF_BACKREF 0x0400
|
||||
#define GF_MATCHREF 0x0800
|
||||
#define GF_MULTIBYTE 0x1000 /* Use multibyte if supported by build */
|
||||
|
||||
/* Dummy Patprog pointers. Used mainly in executable code, but the
|
||||
* pattern code needs to know about it, too. */
|
||||
|
|
@ -1596,6 +1597,7 @@ enum {
|
|||
MARKDIRS,
|
||||
MENUCOMPLETE,
|
||||
MONITOR,
|
||||
MULTIBYTE,
|
||||
MULTIOS,
|
||||
NOMATCH,
|
||||
NOTIFY,
|
||||
|
|
@ -1924,4 +1926,26 @@ typedef char *(*ZleGetLineFn) _((int *, int *));
|
|||
|
||||
#define MB_INCOMPLETE ((size_t)-2)
|
||||
#define MB_INVALID ((size_t)-1)
|
||||
|
||||
/*
|
||||
* MB_CUR_MAX is the maximum number of bytes that a single wide
|
||||
* character will convert into. We use it to keep strings
|
||||
* sufficiently long. It should always be defined, but if it isn't
|
||||
* just assume we are using Unicode which requires 6 characters.
|
||||
* (Note that it's not necessarily defined to a constant.)
|
||||
*/
|
||||
#ifndef MB_CUR_MAX
|
||||
#define MB_CUR_MAX 6
|
||||
#endif
|
||||
|
||||
/* Convert character or string to wide character or string */
|
||||
#define ZWC(c) L ## c
|
||||
#define ZWS(s) L ## s
|
||||
|
||||
#else
|
||||
|
||||
/* Leave character or string as is. */
|
||||
#define ZWC(c) c
|
||||
#define ZWS(s) s
|
||||
|
||||
#endif
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue