614 lines
17 KiB
C
614 lines
17 KiB
C
/*
|
|
* static char *rcsid_player_c =
|
|
* "$Id: re-cmp.c 11578 2009-02-23 22:02:27Z lalo $";
|
|
*/
|
|
|
|
/**
|
|
* @file re-cmp.c
|
|
* Pattern match a string, parsing some of the common RE-metacharacters.
|
|
*
|
|
* This code is public domain, but I would appreciate to hear of
|
|
* improvements or even the fact that you use it in your program.
|
|
*
|
|
* Deliberate BUGS:
|
|
* - These tokens are not supported: | ( )
|
|
* - You will get the longest expansion of the _first_ string which
|
|
* matches the RE, not the longest string which would be the proper
|
|
* behaviour for a RE-matcher.
|
|
*
|
|
* Author: Kjetil T. Homme (kjetilho@ifi.uio.no) May 1993
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <memory.h>
|
|
#include <limits.h>
|
|
#include <re-cmp.h>
|
|
#include <ctype.h>
|
|
#include <global.h>
|
|
#include <define.h> /* Needed for OUT_OF_MEMORY. */
|
|
|
|
/* Get prototype functions to prevent warnings. */
|
|
#if defined(__sun__) && defined(StupidSunHeaders)
|
|
# include <sys/types.h>
|
|
# include <sys/time.h>
|
|
# include "sunos.h" /* Prototypes for standard libraries, sunos lack those */
|
|
#endif
|
|
|
|
/* P r o t o t y p e s
|
|
*/
|
|
/* Public entry point: find the first match of a regexp inside a string. */
const char *re_cmp(const char *, const char *);
|
|
/* Recursive matcher for token number 'slot' and everything after it. */
static Boolean re_cmp_step(const char *, const char *, unsigned, int);
|
|
/* One-time allocation/reset of the token cache. */
static void re_init(void);
|
|
/* Test a single character against a single parsed token. */
static Boolean re_match_token(uchar, selection *);
|
|
/* Parse the first token of a regexp into a selection. */
static const char *re_get_token(selection *, const char *);
|
|
#ifdef DEBUG2
|
|
/* Debug helper: print a parsed token to stdout. */
static void re_dump_sel(selection *);
|
|
#endif
|
|
|
|
/* G l o b a l v a r i a b l e s
|
|
*/
|
|
/* Set to True by re_init() once re_token[] has been prepared. */
static Boolean re_init_done = False;
|
|
/* Parsed tokens, one per slot. Slot 0 is allocated by re_init(); the
 * other slots start out NULL and are allocated on demand by re_cmp_step().
 */
static selection *re_token[RE_TOKEN_MAX];
|
|
/* For each tokenised slot, the regexp text just past that slot's token
 * (the value re_get_token() returned for it).
 */
static const char *re_substr[RE_TOKEN_MAX];
|
|
/* Highest slot tokenised so far for the regexp currently being matched;
 * reset to 0 by re_cmp().
 */
static unsigned int re_token_depth;
|
|
|
|
/* E x t e r n a l f u n c t i o n
|
|
*/
|
|
|
|
/**
|
|
* re-cmp - get regular expression match.
|
|
*
|
|
* @param str
|
|
* string that will be matched against the regexp.
|
|
* @param regexp
|
|
* regular expression.
|
|
* @return
|
|
* @li no match or error in regexp.
|
|
* @li pointer to beginning of matching string
|
|
*/
|
|
const char *re_cmp(const char *str, const char *regexp) {
|
|
const char *next_regexp;
|
|
Boolean once = False;
|
|
Boolean matched;
|
|
|
|
if (re_init_done == False)
|
|
re_init();
|
|
|
|
#ifdef SAFE_CHECKS
|
|
if (regexp == NULL || str == NULL)
|
|
return NULL;
|
|
#endif
|
|
if (*regexp == '^') {
|
|
once = True;
|
|
++regexp;
|
|
}
|
|
if (*regexp == 0) {
|
|
/* // or /^/ matches any string */
|
|
return str;
|
|
}
|
|
|
|
next_regexp = re_get_token(re_token[0], regexp);
|
|
re_token_depth = 0;
|
|
re_substr[0] = next_regexp;
|
|
|
|
matched = False;
|
|
while (*str != '\0' && !(matched = re_match_token(*str, re_token[0])))
|
|
str++;
|
|
|
|
if (matched && *next_regexp == 0)
|
|
return str;
|
|
|
|
/* Apologies for the nearly duplicated code below, hopefully it
|
|
* speeds things up.
|
|
*/
|
|
if (once) {
|
|
switch (re_token[0]->repeat) {
|
|
case rep_once:
|
|
if (matched == False)
|
|
return NULL;
|
|
break;
|
|
|
|
case rep_once_or_more:
|
|
if (matched == False)
|
|
return NULL;
|
|
|
|
if (re_cmp_step(str+1, regexp, 0, 1))
|
|
return str;
|
|
break;
|
|
|
|
case rep_null_or_once:
|
|
if (matched == False)
|
|
return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL;
|
|
break;
|
|
|
|
case rep_null_or_more:
|
|
if (matched) {
|
|
if (re_cmp_step(str+1, regexp, 0, 1))
|
|
return str;
|
|
} else {
|
|
return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL;
|
|
}
|
|
break;
|
|
}
|
|
|
|
return re_cmp_step(str+1, next_regexp, 1, 0) ? str : NULL;
|
|
}
|
|
|
|
if (matched) {
|
|
switch (re_token[0]->repeat) {
|
|
case rep_once:
|
|
case rep_null_or_once:
|
|
break;
|
|
|
|
case rep_once_or_more:
|
|
case rep_null_or_more:
|
|
if (re_cmp_step(str+1, regexp, 0, 1))
|
|
return str;
|
|
break;
|
|
}
|
|
|
|
/* The logic here is that re_match_token only sees
|
|
* if the one letter matches. Thus, if the
|
|
* regex is like '@match eureca', and the
|
|
* the user enters anything with an e, re_match_token
|
|
* returns true, but they really need to match the
|
|
* entire regexp, which re_cmp_step will do.
|
|
* However, what happens is that there can be a case
|
|
* where the string being match is something like
|
|
* 'where is eureca'. In this case, the re_match_token
|
|
* matches that first e, but the re_cmp_step below,
|
|
* fails because the next character (r) doesn't match
|
|
* the u. So we call re_cmp with the string
|
|
* after the first r, so that it should hopefully match
|
|
* up properly.
|
|
*/
|
|
if (re_cmp_step(str+1, next_regexp, 1, 0))
|
|
return str;
|
|
else if (*(str+1) != 0)
|
|
return re_cmp(str+1, regexp);
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
/* A u x i l i a r y   f u n c t i o n s
|
|
*/
|
|
|
|
/**
|
|
* Tries to match a string with a regexp.
|
|
*
|
|
* @param str
|
|
* string to match
|
|
* @param regexp
|
|
* pattern
|
|
* @param slot
|
|
* number of the token which under consideration
|
|
* @param matches
|
|
* how many times the token has matched
|
|
* @return
|
|
* True if match, False else.
|
|
*/
|
|
static Boolean re_cmp_step(const char *str, const char *regexp, unsigned slot, int matches) {
|
|
const char *next_regexp;
|
|
Boolean matched;
|
|
|
|
#ifdef DEBUG
|
|
/* fprintf(stderr, "['%s', '%s', %u, %d]\n", str, regexp, slot, matches);*/
|
|
#endif
|
|
|
|
if (*regexp == 0) {
|
|
/* When we reach the end of the regexp, the match is a success */
|
|
return True;
|
|
}
|
|
|
|
/* This chunk of code makes sure that the regexp-tokenising happens
|
|
* only once. We only tokenise as much as we need.
|
|
*/
|
|
if (slot > re_token_depth) {
|
|
re_token_depth = slot;
|
|
if (re_token[slot] == NULL)
|
|
re_token[slot] = (selection *)malloc(sizeof(selection));
|
|
next_regexp = re_get_token(re_token[slot], regexp);
|
|
if (next_regexp == NULL) {
|
|
/* Syntax error, what else can we do? */
|
|
return False;
|
|
}
|
|
re_substr[slot] = next_regexp;
|
|
} else {
|
|
next_regexp = re_substr[slot];
|
|
}
|
|
|
|
matched = re_match_token(*str, re_token[slot]);
|
|
if (matched)
|
|
++matches;
|
|
|
|
if (*str == 0)
|
|
return (*next_regexp == 0 || re_token[slot]->type == sel_end) && matched;
|
|
|
|
switch (re_token[slot]->repeat) {
|
|
case rep_once:
|
|
if (matches == 1) { /* (matches == 1) => (matched == True) */
|
|
return re_cmp_step(str+1, next_regexp, slot+1, 0);
|
|
}
|
|
return False;
|
|
|
|
case rep_once_or_more:
|
|
if (matched) { /* (matched == True) => (matches >= 1) */
|
|
/* First check if the current token repeats more */
|
|
if (re_cmp_step(str+1, regexp, slot, matches))
|
|
return True;
|
|
return re_cmp_step(str+1, next_regexp, slot+1, 0);
|
|
}
|
|
return False;
|
|
|
|
case rep_null_or_once:
|
|
/* We must go on to the next token, but should we advance str? */
|
|
if (matches == 0) {
|
|
return re_cmp_step(str, next_regexp, slot+1, 0);
|
|
} else if (matches == 1) {
|
|
return re_cmp_step(str+1, next_regexp, slot+1, 0);
|
|
}
|
|
return False; /* Not reached */
|
|
|
|
case rep_null_or_more:
|
|
if (matched) {
|
|
/* Look for further repeats, advance str */
|
|
if (re_cmp_step(str+1, regexp, slot, matches))
|
|
return True;
|
|
return re_cmp_step(str, next_regexp, slot+1, 0);
|
|
}
|
|
return re_cmp_step(str, next_regexp, slot+1, 0);
|
|
}
|
|
|
|
return False;
|
|
}
|
|
|
|
/**
|
|
* Init the regular expression structures.
|
|
*
|
|
* @note
|
|
* will fatal() in case of memory error.
|
|
*/
|
|
static void re_init(void) {
|
|
int i;
|
|
|
|
re_token[0] = (selection *)malloc(sizeof(selection));
|
|
if (re_token[0] == NULL)
|
|
fatal(OUT_OF_MEMORY);
|
|
for (i = 1; i < RE_TOKEN_MAX; i++)
|
|
re_token[i] = NULL;
|
|
|
|
re_init_done = True;
|
|
}
|
|
|
|
/**
|
|
* Tests if a char matches a token.
|
|
*
|
|
* @param c
|
|
* char to test.
|
|
* @param sel
|
|
* token to test.
|
|
* @return
|
|
* True if matches, False else.
|
|
*/
|
|
static Boolean re_match_token(uchar c, selection *sel) {
|
|
switch (sel->type) {
|
|
case sel_any:
|
|
return True;
|
|
|
|
case sel_end:
|
|
return (c == 0);
|
|
|
|
case sel_single:
|
|
return (tolower(c) == tolower(sel->u.single));
|
|
|
|
case sel_range:
|
|
return (c >= sel->u.range.low && c <= sel->u.range.high);
|
|
|
|
case sel_array:
|
|
return (sel->u.array[c]);
|
|
|
|
case sel_not_single:
|
|
return (tolower(c) != tolower(sel->u.single));
|
|
|
|
case sel_not_range:
|
|
return (c < sel->u.range.low && c > sel->u.range.high);
|
|
}
|
|
|
|
return False;
|
|
}
|
|
|
|
/**
|
|
* Get the first regular expression token found in regexp in sel.
|
|
*
|
|
* @param[out] sel
|
|
* where to store the token.
|
|
* @param regexp
|
|
* regular expression.
|
|
* @return
|
|
* @li NULL: syntax error
|
|
* @li pointer to first character past token.
|
|
*/
|
|
static const char *re_get_token(selection *sel, const char *regexp) {
|
|
#ifdef SAFE_CHECKS
|
|
# define exit_if_null if (*regexp == 0) return NULL
|
|
#else
|
|
# define exit_if_null
|
|
#endif
|
|
Boolean quoted = False;
|
|
uchar looking_at;
|
|
|
|
#ifdef SAFE_CHECKS
|
|
if (sel == NULL || regexp == NULL || *regexp == 0)
|
|
return NULL;
|
|
#endif
|
|
|
|
do {
|
|
looking_at = *regexp++;
|
|
switch (looking_at) {
|
|
case '$':
|
|
if (quoted) {
|
|
quoted = False;
|
|
sel->type = sel_single;
|
|
sel->u.single = looking_at;
|
|
} else {
|
|
sel->type = sel_end;
|
|
}
|
|
break;
|
|
|
|
case '.':
|
|
if (quoted) {
|
|
quoted = False;
|
|
sel->type = sel_single;
|
|
sel->u.single = looking_at;
|
|
} else {
|
|
sel->type = sel_any;
|
|
}
|
|
break;
|
|
|
|
case '[':
|
|
/* The fun stuff... perhaps a little obfuscated since I
|
|
* don't trust the compiler to analyse liveness.
|
|
*/
|
|
if (quoted) {
|
|
quoted = False;
|
|
sel->type = sel_single;
|
|
sel->u.single = looking_at;
|
|
} else {
|
|
Boolean neg = False;
|
|
uchar first, last = 0;
|
|
|
|
exit_if_null;
|
|
looking_at = *regexp++;
|
|
|
|
if (looking_at == '^') {
|
|
neg = True;
|
|
exit_if_null;
|
|
looking_at = *regexp++;
|
|
}
|
|
first = looking_at;
|
|
exit_if_null;
|
|
looking_at = *regexp++;
|
|
if (looking_at == ']') {
|
|
/* On the form [q] or [^q] */
|
|
sel->type = neg ? sel_not_single : sel_single;
|
|
sel->u.single = first;
|
|
break;
|
|
} else if (looking_at == '-') {
|
|
exit_if_null;
|
|
last = *regexp++;
|
|
if (last == ']') {
|
|
/* On the form [A-] or [^A-]. Checking for
|
|
* [,-] and making it a range is probably not
|
|
* worth it :-)
|
|
*/
|
|
sel->type = sel_array;
|
|
memset(sel->u.array, neg, sizeof(sel->u.array));
|
|
sel->u.array[first] = sel->u.array['-'] = !neg;
|
|
break;
|
|
} else {
|
|
exit_if_null;
|
|
looking_at = *regexp++;
|
|
if (looking_at == ']') {
|
|
/* On the form [A-G] or [^A-G]. Note that [G-A]
|
|
* is a syntax error. Fair enough, I think.
|
|
*/
|
|
#ifdef SAFE_CHECKS
|
|
if (first > last)
|
|
return NULL;
|
|
#endif
|
|
sel->type = neg ? sel_not_range : sel_range;
|
|
sel->u.range.low = first;
|
|
sel->u.range.high = last;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
{
|
|
/* The datastructure can only represent a RE this
|
|
* complex with an array.
|
|
*/
|
|
int i;
|
|
uchar previous;
|
|
|
|
sel->type = sel_array;
|
|
memset(sel->u.array, neg, sizeof(sel->u.array));
|
|
if (last) {
|
|
/* It starts with a range */
|
|
#ifdef SAFE_CHECKS
|
|
if (first > last)
|
|
return NULL;
|
|
#endif
|
|
for (i = first; i <= last; i++) {
|
|
sel->u.array[i] = !neg;
|
|
}
|
|
} else {
|
|
/* It begins with a "random" character */
|
|
sel->u.array[first] = !neg;
|
|
}
|
|
sel->u.array[looking_at] = !neg;
|
|
|
|
exit_if_null;
|
|
previous = looking_at;
|
|
looking_at = *regexp++;
|
|
|
|
/* Add more characters to the array until we reach
|
|
* ]. Quoting doesn't and shouldn't work in here.
|
|
* ("]" should be put first, and "-" last if they
|
|
* are needed inside this construct.)
|
|
* Look for ranges as we go along.
|
|
*/
|
|
while (looking_at != ']') {
|
|
if (looking_at == '-') {
|
|
exit_if_null;
|
|
looking_at = *regexp++;
|
|
if (looking_at != ']') {
|
|
#ifdef SAFE_CHECKS
|
|
if (previous > looking_at)
|
|
return NULL;
|
|
#endif
|
|
for (i = previous+1; i < looking_at; i++) {
|
|
/* previous has already been set and
|
|
* looking_at is set below.
|
|
*/
|
|
sel->u.array[i] = !neg;
|
|
}
|
|
exit_if_null;
|
|
} else {
|
|
sel->u.array['-'] = !neg;
|
|
break;
|
|
}
|
|
}
|
|
sel->u.array[looking_at] = !neg;
|
|
previous = looking_at;
|
|
exit_if_null;
|
|
looking_at = *regexp++;
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
|
|
case '\\':
|
|
if (quoted) {
|
|
quoted = False;
|
|
sel->type = sel_single;
|
|
sel->u.single = looking_at;
|
|
} else {
|
|
quoted = True;
|
|
}
|
|
break;
|
|
|
|
default:
|
|
quoted = False;
|
|
sel->type = sel_single;
|
|
sel->u.single = looking_at;
|
|
break;
|
|
}
|
|
} while (quoted);
|
|
|
|
if (*regexp == '*') {
|
|
sel->repeat = rep_null_or_more;
|
|
++regexp;
|
|
} else if (*regexp == '?') {
|
|
sel->repeat = rep_null_or_once;
|
|
++regexp;
|
|
} else if (*regexp == '+') {
|
|
sel->repeat = rep_once_or_more;
|
|
++regexp;
|
|
} else {
|
|
sel->repeat = rep_once;
|
|
}
|
|
|
|
return regexp;
|
|
}
|
|
|
|
/* D e b u g c o d e
|
|
*/
|
|
#ifdef DEBUG2 /* compile all with DEBUG also ? hevi@lut.fi */
|
|
/**
|
|
* Dumps specified selection to stdout.
|
|
*
|
|
* @param sel
|
|
* token to dump.
|
|
*/
|
|
static void re_dump_sel(selection *sel) {
|
|
switch (sel->type) {
|
|
case sel_any:
|
|
printf(".");
|
|
break;
|
|
|
|
case sel_end:
|
|
printf("$");
|
|
break;
|
|
|
|
case sel_single:
|
|
printf("<%c>", sel->u.single);
|
|
break;
|
|
|
|
case sel_range:
|
|
printf("[%c-%c]", sel->u.range.low, sel->u.range.high);
|
|
break;
|
|
|
|
case sel_array: {
|
|
int i;
|
|
printf("[");
|
|
for (i = 0; i < UCHAR_MAX; i++) {
|
|
if (sel->u.array[i]) {
|
|
printf("%c", i);
|
|
}
|
|
}
|
|
printf("]");
|
|
}
|
|
break;
|
|
|
|
case sel_not_single:
|
|
printf("[^%c]", sel->u.single);
|
|
break;
|
|
|
|
case sel_not_range:
|
|
printf("[^%c-%c]", sel->u.range.low, sel->u.range.high);
|
|
break;
|
|
|
|
default:
|
|
printf("<UNKNOWN TOKEN!>");
|
|
break;
|
|
}
|
|
|
|
switch (sel->repeat) {
|
|
case rep_once:
|
|
break;
|
|
|
|
case rep_null_or_once:
|
|
printf("?");
|
|
break;
|
|
|
|
case rep_null_or_more:
|
|
printf("*");
|
|
break;
|
|
|
|
case rep_once_or_more:
|
|
printf("+");
|
|
break;
|
|
|
|
default:
|
|
printf("<UNKNOWN REP-TOKEN!>");
|
|
break;
|
|
}
|
|
}
|
|
|
|
int main(int argc, char *argv[]) {
|
|
char *re, *m;
|
|
selection sel;
|
|
|
|
re = re_get_token(&sel, argv[1]);
|
|
|
|
printf("'%s' -> '%s'\n", argv[1], re);
|
|
re_dump_sel(&sel);
|
|
printf("\n");
|
|
m = re_cmp(argv[2], argv[1]);
|
|
if (m)
|
|
printf("MATCH! -> '%s'\n", m);
|
|
return 0;
|
|
}
|
|
#endif
|