mirror of
git://git.code.sf.net/p/zsh/code
synced 2025-09-03 10:21:46 +02:00
Jon Strait: 26778, 26781: extra options for PCRE matching
This commit is contained in:
parent
aa3942d2d1
commit
418671fdb0
3 changed files with 87 additions and 12 deletions
|
@ -1,5 +1,8 @@
|
|||
2009-03-25 Peter Stephenson <pws@csr.com>
|
||||
|
||||
* Jon Strait: 26778, 26781: Doc/Zsh/mod_pcre.yo,
|
||||
Src/Modules/pcre.c: a couple of extra options for PCRE matching.
|
||||
|
||||
* Michael Hwang: 26776: Src/builtin.c: improved column alignment
|
||||
with print -c -P.
|
||||
|
||||
|
@ -11487,5 +11490,5 @@
|
|||
|
||||
*****************************************************
|
||||
* This is used by the shell to define $ZSH_PATCHLEVEL
|
||||
* $Revision: 1.4636 $
|
||||
* $Revision: 1.4637 $
|
||||
*****************************************************
|
||||
|
|
|
@ -6,7 +6,7 @@ The tt(zsh/pcre) module makes some commands available as builtins:
|
|||
|
||||
startitem()
|
||||
findex(pcre_compile)
|
||||
item(tt(pcre_compile) [ tt(-aimx) ] var(PCRE))(
|
||||
item(tt(pcre_compile) [ tt(-aimxs) ] var(PCRE))(
|
||||
Compiles a perl-compatible regular expression.
|
||||
|
||||
Option tt(-a) will force the pattern to be anchored.
|
||||
|
@ -15,6 +15,8 @@ Option tt(-m) will compile a multi-line pattern; that is,
|
|||
tt(^) and tt($) will also match at newlines within the subject string.
|
||||
Option tt(-x) will compile an extended pattern, wherein
|
||||
whitespace and tt(#) comments are ignored.
|
||||
Option tt(-s) makes the dot metacharacter match all characters,
|
||||
including those that indicate newline.
|
||||
)
|
||||
findex(pcre_study)
|
||||
item(tt(pcre_study))(
|
||||
|
@ -22,7 +24,8 @@ Studies the previously-compiled PCRE which may result in faster
|
|||
matching.
|
||||
)
|
||||
findex(pcre_match)
|
||||
item(tt(pcre_match) [ tt(-v) var(var) ] [ tt(-a) var(arr) ] var(string))(
|
||||
item(tt(pcre_match) [ tt(-v) var(var) ] [ tt(-a) var(arr) ] \
|
||||
[ tt(-n) var(offset) ] [ tt(-b) ] var(string))(
|
||||
Returns successfully if var(string) matches the previously-compiled
|
||||
PCRE.
|
||||
|
||||
|
@ -35,6 +38,36 @@ var(MATCH) will be set to the entire matched portion of the
|
|||
string, unless the tt(-v) option is given, in which case the variable
|
||||
var(var) will be set.
|
||||
No variables are altered if there is no successful match.
|
||||
A tt(-n) option starts searching for a match from the
|
||||
byte var(offset) position in var(string). If the tt(-b) option is given,
|
||||
the variable var(ZPCRE_OP) will be set to an offset pair string,
|
||||
representing the byte offset positions of the entire matched portion
|
||||
within the var(string). For example, a var(ZPCRE_OP) set to "32 45" indicates
|
||||
that the matched portion began on byte offset 32 and ended on byte offset 44.
|
||||
Here, byte offset position 45 is the position directly after the matched
|
||||
portion. Keep in mind that the byte position isn't necessarily the same
|
||||
as the character position when UTF-8 characters are involved.
|
||||
Consequently, the byte offset positions are only to be relied on in the
|
||||
context of using them for subsequent searches on var(string), using an offset
|
||||
position as an argument to the tt(-n) option. This is mostly
|
||||
used to implement the "find all non-overlapping matches" functionality.
|
||||
|
||||
A simple example of "find all non-overlapping matches":
|
||||
|
||||
example(
|
||||
string="The following zip codes: 78884 90210 99513"
|
||||
pcre_compile -m "\d{5}"
|
||||
accum=()
|
||||
pcre_match -b -- $string
|
||||
while [[ $? -eq 0 ]] do
|
||||
b=($=ZPCRE_OP)
|
||||
accum+=$MATCH
|
||||
pcre_match -b -n $b[2] -- $string
|
||||
done
|
||||
print -l $accum
|
||||
|
||||
|
||||
)
|
||||
)
|
||||
enditem()
|
||||
|
||||
|
|
|
@ -82,6 +82,7 @@ bin_pcre_compile(char *nam, char **args, Options ops, UNUSED(int func))
|
|||
if(OPT_ISSET(ops,'i')) pcre_opts |= PCRE_CASELESS;
|
||||
if(OPT_ISSET(ops,'m')) pcre_opts |= PCRE_MULTILINE;
|
||||
if(OPT_ISSET(ops,'x')) pcre_opts |= PCRE_EXTENDED;
|
||||
if(OPT_ISSET(ops,'s')) pcre_opts |= PCRE_DOTALL;
|
||||
|
||||
if (zpcre_utf8_enabled())
|
||||
pcre_opts |= PCRE_UTF8;
|
||||
|
@ -137,9 +138,11 @@ bin_pcre_study(char *nam, UNUSED(char **args), UNUSED(Options ops), UNUSED(int f
|
|||
|
||||
/**/
|
||||
static int
|
||||
zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substravar, int matchedinarr)
|
||||
zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substravar,
|
||||
int want_offset_pair, int matchedinarr)
|
||||
{
|
||||
char **captures, *match_all, **matches;
|
||||
char offset_all[50];
|
||||
int capture_start = 1;
|
||||
|
||||
if (matchedinarr)
|
||||
|
@ -148,9 +151,14 @@ zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substr
|
|||
matchvar = "MATCH";
|
||||
if (substravar == NULL)
|
||||
substravar = "match";
|
||||
|
||||
|
||||
/* captures[0] will be entire matched string, [1] first substring */
|
||||
if(!pcre_get_substring_list(arg, ovec, ret, (const char ***)&captures)) {
|
||||
if (!pcre_get_substring_list(arg, ovec, ret, (const char ***)&captures)) {
|
||||
/* Set to the offsets of the complete match */
|
||||
if (want_offset_pair) {
|
||||
sprintf(offset_all, "%d %d", ovec[0], ovec[1]);
|
||||
setsparam("ZPCRE_OP", ztrdup(offset_all));
|
||||
}
|
||||
match_all = ztrdup(captures[0]);
|
||||
setsparam(matchvar, match_all);
|
||||
matches = zarrdup(&captures[capture_start]);
|
||||
|
@ -161,6 +169,22 @@ zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substr
|
|||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Parse instr as a non-negative decimal integer that fits in an int.
 *
 * instr: the option argument text to convert.
 * nam:   the builtin name, passed to zwarnnam() when reporting an error.
 *
 * Returns the parsed value, or -1 after printing "integer expected"
 * when instr contains no digits, has trailing characters, is negative,
 * or does not survive conversion to int.
 */

/**/
static int
getposint(char *instr, char *nam)
{
    char *eptr;
    long val;

    /*
     * NOTE(review): zstrtol() is assumed to follow the strtol() contract
     * (eptr is left just past the last character consumed) -- confirm.
     * If it returns a type wider than long, the cast below can still
     * lose high bits on platforms with a 32-bit long.
     */
    val = (long)zstrtol(instr, &eptr, 10);

    /*
     * Without the "a digit was consumed" test an empty argument would
     * leave eptr pointing at the terminating NUL and be silently
     * accepted as offset 0.  The round trip through int rejects values
     * that would otherwise be truncated into a valid-looking offset.
     */
    if (eptr == instr || eptr[-1] < '0' || eptr[-1] > '9' || *eptr ||
        val < 0 || (long)(int)val != val) {
        zwarnnam(nam, "integer expected: %s", instr);
        return -1;
    }

    return (int)val;
}
|
||||
|
||||
/**/
|
||||
static int
|
||||
bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
|
||||
|
@ -169,6 +193,10 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
|
|||
char *matched_portion = NULL;
|
||||
char *receptacle = NULL;
|
||||
int return_value = 1;
|
||||
/* The subject length and offset start are both int values in pcre_exec */
|
||||
int subject_len;
|
||||
int offset_start = 0;
|
||||
int want_offset_pair = 0;
|
||||
|
||||
if (pcre_pattern == NULL) {
|
||||
zwarnnam(nam, "no pattern has been compiled");
|
||||
|
@ -181,6 +209,12 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
|
|||
if(OPT_HASARG(ops,c='v')) {
|
||||
matched_portion = OPT_ARG(ops,c);
|
||||
}
|
||||
if(OPT_HASARG(ops,c='n')) { /* The offset position to start the search, in bytes. */
|
||||
offset_start = getposint(OPT_ARG(ops,c), nam);
|
||||
}
|
||||
/* For the entire match, also report its byte offset pair in $ZPCRE_OP (the matched string is still set) */
|
||||
if(OPT_ISSET(ops,'b')) want_offset_pair = 1;
|
||||
|
||||
if(!*args) {
|
||||
zwarnnam(nam, "not enough arguments");
|
||||
}
|
||||
|
@ -194,12 +228,17 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
|
|||
ovecsize = (capcount+1)*3;
|
||||
ovec = zalloc(ovecsize*sizeof(int));
|
||||
|
||||
ret = pcre_exec(pcre_pattern, pcre_hints, *args, strlen(*args), 0, 0, ovec, ovecsize);
|
||||
|
||||
subject_len = (int)strlen(*args);
|
||||
|
||||
if (offset_start < 0 || offset_start >= subject_len)
|
||||
ret = PCRE_ERROR_NOMATCH;
|
||||
else
|
||||
ret = pcre_exec(pcre_pattern, pcre_hints, *args, subject_len, offset_start, 0, ovec, ovecsize);
|
||||
|
||||
if (ret==0) return_value = 0;
|
||||
else if (ret==PCRE_ERROR_NOMATCH) /* no match */;
|
||||
else if (ret>0) {
|
||||
zpcre_get_substrings(*args, ovec, ret, matched_portion, receptacle, 0);
|
||||
zpcre_get_substrings(*args, ovec, ret, matched_portion, receptacle, want_offset_pair, 0);
|
||||
return_value = 0;
|
||||
}
|
||||
else {
|
||||
|
@ -258,7 +297,7 @@ cond_pcre_match(char **a, int id)
|
|||
break;
|
||||
}
|
||||
else if (r>0) {
|
||||
zpcre_get_substrings(lhstr, ov, r, NULL, avar, isset(BASHREMATCH));
|
||||
zpcre_get_substrings(lhstr, ov, r, NULL, avar, 0, isset(BASHREMATCH));
|
||||
return_value = 1;
|
||||
break;
|
||||
}
|
||||
|
@ -289,8 +328,8 @@ static struct conddef cotab[] = {
|
|||
#endif /* !(HAVE_PCRE_COMPILE && HAVE_PCRE_EXEC) */
|
||||
|
||||
static struct builtin bintab[] = {
|
||||
BUILTIN("pcre_compile", 0, bin_pcre_compile, 1, 1, 0, "aimx", NULL),
|
||||
BUILTIN("pcre_match", 0, bin_pcre_match, 1, 1, 0, "a:v:", NULL),
|
||||
BUILTIN("pcre_compile", 0, bin_pcre_compile, 1, 1, 0, "aimxs", NULL),
|
||||
BUILTIN("pcre_match", 0, bin_pcre_match, 1, 1, 0, "a:v:n:b", NULL),
|
||||
BUILTIN("pcre_study", 0, bin_pcre_study, 0, 0, 0, NULL, NULL)
|
||||
};
|
||||
|
||||
|
|
Loading…
Reference in a new issue