mirror of
git://git.code.sf.net/p/zsh/code
synced 2025-06-24 11:28:16 +02:00
51738: support pcre's alternative DFA matching algorithm
This commit is contained in:
parent
f3f371deb3
commit
b4d1c756f5
4 changed files with 49 additions and 18 deletions
|
@ -1,5 +1,8 @@
|
||||||
2023-05-13 Oliver Kiddle <opk@zsh.org>
|
2023-05-13 Oliver Kiddle <opk@zsh.org>
|
||||||
|
|
||||||
|
* 51738: Doc/Zsh/mod_pcre.yo, Src/Modules/pcre.c,
|
||||||
|
Test/V07pcre.ztst: support pcre's DFA matching algorithm
|
||||||
|
|
||||||
* 51728: Doc/Zsh/mod_pcre.yo, Src/Modules/pcre.c,
|
* 51728: Doc/Zsh/mod_pcre.yo, Src/Modules/pcre.c,
|
||||||
Test/V07pcre.ztst: assign pcre named capture groups to a hash
|
Test/V07pcre.ztst: assign pcre named capture groups to a hash
|
||||||
|
|
||||||
|
|
|
@ -25,7 +25,7 @@ may result in faster matching.
|
||||||
)
|
)
|
||||||
findex(pcre_match)
|
findex(pcre_match)
|
||||||
item(tt(pcre_match) [ tt(-v) var(var) ] [ tt(-a) var(arr) ] \
|
item(tt(pcre_match) [ tt(-v) var(var) ] [ tt(-a) var(arr) ] \
|
||||||
[ tt(-A) var(assoc) ] [ tt(-n) var(offset) ] [ tt(-b) ] var(string))(
|
[ tt(-A) var(assoc) ] [ tt(-n) var(offset) ] [ tt(-bd) ] var(string))(
|
||||||
Returns successfully if tt(string) matches the previously-compiled
|
Returns successfully if tt(string) matches the previously-compiled
|
||||||
PCRE.
|
PCRE.
|
||||||
|
|
||||||
|
@ -69,6 +69,10 @@ print -l $accum)
|
||||||
)
|
)
|
||||||
enditem()
|
enditem()
|
||||||
|
|
||||||
|
The option tt(-d) uses the alternative breadth-first DFA search algorithm of
|
||||||
|
PCRE. This sets tt(match), or the array given with tt(-a), to all the matches
|
||||||
|
found from the same start point in the subject, longest match first.
|
||||||
|
|
||||||
The tt(zsh/pcre) module makes available the following test condition:
|
The tt(zsh/pcre) module makes available the following test condition:
|
||||||
|
|
||||||
startitem()
|
startitem()
|
||||||
|
|
|
@ -305,30 +305,29 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
|
||||||
pcre2_match_data *pcre_mdata = NULL;
|
pcre2_match_data *pcre_mdata = NULL;
|
||||||
char *matched_portion = NULL;
|
char *matched_portion = NULL;
|
||||||
char *plaintext = NULL;
|
char *plaintext = NULL;
|
||||||
char *receptacle = NULL;
|
char *receptacle;
|
||||||
char *named = ".pcre.match";
|
char *named = NULL;
|
||||||
int return_value = 1;
|
int return_value = 1;
|
||||||
/* The subject length and offset start are both int values in pcre_exec */
|
/* The subject length and offset start are both int values in pcre_exec */
|
||||||
int subject_len;
|
int subject_len;
|
||||||
int offset_start = 0;
|
int offset_start = 0;
|
||||||
int want_offset_pair = 0;
|
int want_offset_pair = 0;
|
||||||
|
int use_dfa = 0;
|
||||||
|
|
||||||
if (pcre_pattern == NULL) {
|
if (pcre_pattern == NULL) {
|
||||||
zwarnnam(nam, "no pattern has been compiled");
|
zwarnnam(nam, "no pattern has been compiled");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
matched_portion = "MATCH";
|
if (!(use_dfa = OPT_ISSET(ops, 'd'))) {
|
||||||
receptacle = "match";
|
matched_portion = OPT_HASARG(ops, c='v') ? OPT_ARG(ops, c) : "MATCH";
|
||||||
if(OPT_HASARG(ops,c='a')) {
|
named = OPT_HASARG(ops, c='A') ? OPT_ARG(ops, c) : ".pcre.match";
|
||||||
receptacle = OPT_ARG(ops,c);
|
} else if (OPT_HASARG(ops, c='v') || OPT_HASARG(ops, c='A')) {
|
||||||
}
|
zwarnnam(nam, "-d cannot be combined with -%c", c);
|
||||||
if(OPT_HASARG(ops,c='v')) {
|
return 1;
|
||||||
matched_portion = OPT_ARG(ops,c);
|
|
||||||
}
|
|
||||||
if (OPT_HASARG(ops, c='A')) {
|
|
||||||
named = OPT_ARG(ops, c);
|
|
||||||
}
|
}
|
||||||
|
receptacle = OPT_HASARG(ops, 'a') ? OPT_ARG(ops, 'a') : "match";
|
||||||
|
|
||||||
if(OPT_HASARG(ops,c='n')) { /* The offset position to start the search, in bytes. */
|
if(OPT_HASARG(ops,c='n')) { /* The offset position to start the search, in bytes. */
|
||||||
if ((offset_start = getposint(OPT_ARG(ops,c), nam)) < 0)
|
if ((offset_start = getposint(OPT_ARG(ops,c), nam)) < 0)
|
||||||
return 1;
|
return 1;
|
||||||
|
@ -341,7 +340,25 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
|
||||||
|
|
||||||
if (offset_start > 0 && offset_start >= subject_len)
|
if (offset_start > 0 && offset_start >= subject_len)
|
||||||
ret = PCRE2_ERROR_NOMATCH;
|
ret = PCRE2_ERROR_NOMATCH;
|
||||||
else {
|
else if (use_dfa) {
|
||||||
|
PCRE2_SIZE old, wscount = 128, capcount = 128;
|
||||||
|
void *workspace = zhalloc(sizeof(int) * wscount);
|
||||||
|
pcre_mdata = pcre2_match_data_create(capcount, NULL);
|
||||||
|
do {
|
||||||
|
ret = pcre2_dfa_match(pcre_pattern, (PCRE2_SPTR) plaintext, subject_len,
|
||||||
|
offset_start, 0, pcre_mdata, NULL, (int *) workspace, wscount);
|
||||||
|
if (ret == PCRE2_ERROR_DFA_WSSIZE) {
|
||||||
|
old = wscount;
|
||||||
|
wscount += wscount / 2;
|
||||||
|
workspace = hrealloc(workspace, sizeof(int) * old, sizeof(int) * wscount);
|
||||||
|
} else if (ret == 0) {
|
||||||
|
capcount += capcount / 2;
|
||||||
|
pcre2_match_data_free(pcre_mdata);
|
||||||
|
pcre_mdata = pcre2_match_data_create(capcount, NULL);
|
||||||
|
} else
|
||||||
|
break;
|
||||||
|
} while(1);
|
||||||
|
} else {
|
||||||
pcre_mdata = pcre2_match_data_create_from_pattern(pcre_pattern, NULL);
|
pcre_mdata = pcre2_match_data_create_from_pattern(pcre_pattern, NULL);
|
||||||
ret = pcre2_match(pcre_pattern, (PCRE2_SPTR) plaintext, subject_len,
|
ret = pcre2_match(pcre_pattern, (PCRE2_SPTR) plaintext, subject_len,
|
||||||
offset_start, 0, pcre_mdata, NULL);
|
offset_start, 0, pcre_mdata, NULL);
|
||||||
|
@ -350,12 +367,14 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
|
||||||
if (ret==0) return_value = 0;
|
if (ret==0) return_value = 0;
|
||||||
else if (ret == PCRE2_ERROR_NOMATCH) /* no match */;
|
else if (ret == PCRE2_ERROR_NOMATCH) /* no match */;
|
||||||
else if (ret>0) {
|
else if (ret>0) {
|
||||||
zpcre_get_substrings(pcre_pattern, plaintext, pcre_mdata, ret, matched_portion,
|
zpcre_get_substrings(pcre_pattern, plaintext, pcre_mdata, ret,
|
||||||
receptacle, named, want_offset_pair, 0, 0);
|
matched_portion, receptacle, named, want_offset_pair, use_dfa, 0);
|
||||||
return_value = 0;
|
return_value = 0;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
zwarnnam(nam, "error in pcre2_match [%d]", ret);
|
PCRE2_UCHAR buffer[256];
|
||||||
|
pcre2_get_error_message(ret, buffer, sizeof(buffer));
|
||||||
|
zwarnnam(nam, "error in pcre matching for /%s/: %s", plaintext, buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pcre_mdata)
|
if (pcre_mdata)
|
||||||
|
@ -466,7 +485,7 @@ static struct conddef cotab[] = {
|
||||||
|
|
||||||
static struct builtin bintab[] = {
|
static struct builtin bintab[] = {
|
||||||
BUILTIN("pcre_compile", 0, bin_pcre_compile, 1, 1, 0, "aimxs", NULL),
|
BUILTIN("pcre_compile", 0, bin_pcre_compile, 1, 1, 0, "aimxs", NULL),
|
||||||
BUILTIN("pcre_match", 0, bin_pcre_match, 1, 1, 0, "A:a:v:n:b", NULL),
|
BUILTIN("pcre_match", 0, bin_pcre_match, 1, 1, 0, "A:a:v:n:bd", NULL),
|
||||||
BUILTIN("pcre_study", 0, bin_pcre_study, 0, 0, 0, NULL, NULL)
|
BUILTIN("pcre_study", 0, bin_pcre_study, 0, 0, 0, NULL, NULL)
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -196,3 +196,8 @@
|
||||||
> [package]=name-12345
|
> [package]=name-12345
|
||||||
> [version]=12345
|
> [version]=12345
|
||||||
>)
|
>)
|
||||||
|
|
||||||
|
pcre_compile 'cat(er(pillar)?)?'
|
||||||
|
pcre_match -d 'the caterpillar catchment' && print $match
|
||||||
|
0:pcre_match -d
|
||||||
|
>caterpillar cater cat
|
||||||
|
|
Loading…
Reference in a new issue