server-1.12/common/re-cmp.c

/*
 * static char *rcsid_player_c =
 *   "$Id: re-cmp.c 11578 2009-02-23 22:02:27Z lalo $";
 */

/**
 * @file re-cmp.c
 * Pattern match a string, parsing some of the common RE-metacharacters.
 *
 * This code is public domain, but I would appreciate hearing about
 * improvements, or even just the fact that you use it in your program.
 *
 * Deliberate BUGS:
 *    - These tokens are not supported: | ( )
 *    - You will get the longest expansion of the _first_ string which
 *      matches the RE, not the longest string which would be the proper
 *      behaviour for a RE-matcher.
 *
 * Author: Kjetil T. Homme (kjetilho@ifi.uio.no) May 1993
 */

#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <limits.h>
#include <re-cmp.h>
#include <ctype.h>
#include <global.h>
#include <define.h> /* Needed for OUT_OF_MEMORY. */

/* Get prototype functions to prevent warnings. */
#if defined(__sun__) && defined(StupidSunHeaders)
#  include <sys/types.h>
#  include <sys/time.h>
#  include "sunos.h"   /* Prototypes for standard libraries, sunos lack those */
#endif

/*   P r o t o t y p e s
 */
const char *re_cmp(const char *, const char *);
static Boolean re_cmp_step(const char *, const char *, unsigned, int);
static void re_init(void);
static Boolean re_match_token(uchar, selection *);
static const char *re_get_token(selection *, const char *);
#ifdef DEBUG2
static void re_dump_sel(selection *);
#endif

/*   G l o b a l   v a r i a b l e s
 */
/* Set to True by re_init() once slot 0 of re_token[] has been allocated. */
static Boolean      re_init_done = False;
/* Tokenised pattern elements, one per slot.  Slot 0 is allocated by
 * re_init(); the other slots start as NULL and are allocated on first use
 * by re_cmp_step().
 */
static selection    *re_token[RE_TOKEN_MAX];
/* re_substr[i] is the position in the pattern just past token i, as
 * returned by re_get_token() when that slot was tokenised.
 */
static const char   *re_substr[RE_TOKEN_MAX];
/* Highest slot tokenised so far for the pattern being matched; reset to 0
 * by every call to re_cmp().
 */
static unsigned int re_token_depth;

/*   E x t e r n a l   f u n c t i o n
 */

/**
 * re-cmp - get regular expression match.
 *
 * Tokenises the first element of regexp into slot 0 of the file-level
 * token cache, finds a character of str accepted by that element and
 * lets re_cmp_step() verify the remainder of the pattern from there.
 *
 * Without a leading '^' the search slides forward through str until some
 * position matches.  With a leading '^' only the first character of str
 * is tried, so the match is anchored at the beginning of str.
 *
 * This function writes re_token[], re_substr[] and re_token_depth.
 *
 * @param str
 * string that will be matched against the regexp.
 * @param regexp
 * regular expression.
 * @return
 * @li NULL: no match or error in regexp.
 * @li pointer to beginning of matching string
 */
const char *re_cmp(const char *str, const char *regexp) {
    const char *next_regexp;
    Boolean once = False;
    Boolean matched;

    if (re_init_done == False)
        re_init();

#ifdef SAFE_CHECKS
    if (regexp == NULL || str == NULL)
        return NULL;
#endif
    if (*regexp == '^') {
        once = True;
        ++regexp;
    }
    if (*regexp == 0) {
        /* // or /^/ matches any string */
        return str;
    }

    next_regexp = re_get_token(re_token[0], regexp);
    if (next_regexp == NULL) {
        /* Syntax error in the first token.  Bail out before the NULL
         * is dereferenced below or cached for re_cmp_step().
         */
        return NULL;
    }
    re_token_depth = 0;
    re_substr[0] = next_regexp;

    matched = False;
    if (once) {
        /* Anchored pattern: the first token may only be tried against
         * the first character; sliding forward would defeat the '^'.
         */
        if (*str != '\0')
            matched = re_match_token(*str, re_token[0]);
    } else {
        /* Unanchored: skip ahead to the first character the token accepts. */
        while (*str != '\0' && !(matched = re_match_token(*str, re_token[0])))
            str++;
    }

    /* A one-token pattern is fully matched as soon as that token matches. */
    if (matched && *next_regexp == 0)
        return str;

    /* Apologies for the nearly duplicated code below, hopefully it
     * speeds things up.
     */
    if (once) {
        switch (re_token[0]->repeat) {
        case rep_once:
            if (matched == False)
                return NULL;
            break;

        case rep_once_or_more:
            if (matched == False)
                return NULL;

            /* Try to let the first token absorb further characters. */
            if (re_cmp_step(str+1, regexp, 0, 1))
                return str;
            break;

        case rep_null_or_once:
            /* Zero occurrences: the rest must match at the same place. */
            if (matched == False)
                return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL;
            break;

        case rep_null_or_more:
            if (matched) {
                if (re_cmp_step(str+1, regexp, 0, 1))
                    return str;
            } else {
                return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL;
            }
            break;
        }

        /* The first token consumed exactly one character. */
        return re_cmp_step(str+1, next_regexp, 1, 0) ? str : NULL;
    }

    if (matched) {
        switch (re_token[0]->repeat) {
        case rep_once:
        case rep_null_or_once:
            break;

        case rep_once_or_more:
        case rep_null_or_more:
            if (re_cmp_step(str+1, regexp, 0, 1))
                return str;
            break;
        }

        /* re_match_token only tells us that one character fits the
         * first token.  With a regexp such as 'eureca' and the input
         * 'where is eureca', the first 'e' (in 'where') satisfies the
         * token but re_cmp_step then fails because 'r' is not 'u'.
         * In that case restart the whole search one character
         * further on, so that a later occurrence can still match.
         */
        if (re_cmp_step(str+1, next_regexp, 1, 0))
            return str;
        else if (*(str+1) != 0)
            return re_cmp(str+1, regexp);
    }
    return NULL;
}

/*   A u x i l i a r y   f u n c t i o n s
 */

/**
 * Tries to match a string with a regexp.
 *
 * @param str
 * string to match
 * @param regexp
 * pattern
 * @param slot
 * number of the token which under consideration
 * @param matches
 * how many times the token has matched
 * @return
 * True if match, False else.
 */
static Boolean re_cmp_step(const char *str, const char *regexp, unsigned slot, int matches) {
    const char *next_regexp;
    Boolean matched;

#ifdef DEBUG
/*    fprintf(stderr, "['%s', '%s', %u, %d]\n", str, regexp, slot, matches);*/
#endif

    if (*regexp == 0) {
        /* When we reach the end of the regexp, the match is a success */
        return True;
    }

    /* This chunk of code makes sure that the regexp-tokenising happens
     * only once. We only tokenise as much as we need.
     */
    if (slot > re_token_depth) {
        re_token_depth = slot;
        if (re_token[slot] == NULL)
            re_token[slot] = (selection *)malloc(sizeof(selection));
        next_regexp = re_get_token(re_token[slot], regexp);
        if (next_regexp == NULL) {
            /* Syntax error, what else can we do? */
            return False;
        }
        re_substr[slot] = next_regexp;
    } else {
        next_regexp = re_substr[slot];
    }

    matched = re_match_token(*str, re_token[slot]);
    if (matched)
        ++matches;

    if (*str == 0)
        return (*next_regexp == 0 || re_token[slot]->type == sel_end) && matched;

    switch (re_token[slot]->repeat) {
    case rep_once:
        if (matches == 1) { /* (matches == 1) => (matched == True) */
            return re_cmp_step(str+1, next_regexp, slot+1, 0);
        }
        return False;

    case rep_once_or_more:
        if (matched) { /* (matched == True) => (matches >= 1) */
            /* First check if the current token repeats more */
            if (re_cmp_step(str+1, regexp, slot, matches))
                return True;
            return re_cmp_step(str+1, next_regexp, slot+1, 0);
        }
        return False;

    case rep_null_or_once:
        /* We must go on to the next token, but should we advance str? */
        if (matches == 0) {
            return re_cmp_step(str, next_regexp, slot+1, 0);
            } else if (matches == 1) {
            return re_cmp_step(str+1, next_regexp, slot+1, 0);
        }
        return False; /* Not reached */

    case rep_null_or_more:
        if (matched) {
            /* Look for further repeats, advance str */
            if (re_cmp_step(str+1, regexp, slot, matches))
                return True;
            return re_cmp_step(str, next_regexp, slot+1, 0);
        }
        return re_cmp_step(str, next_regexp, slot+1, 0);
    }

    return False;
}

/**
 * One-time setup of the token cache.
 *
 * Allocates the selection used for slot 0 and marks every other slot as
 * not yet allocated, then records that initialisation has happened.
 *
 * @note
 * calls fatal(OUT_OF_MEMORY) if the allocation fails.
 */
static void re_init(void) {
    int slot;

    re_token[0] = (selection *)malloc(sizeof(selection));
    if (!re_token[0])
        fatal(OUT_OF_MEMORY);

    /* Remaining slots are filled in on demand by re_cmp_step(). */
    for (slot = RE_TOKEN_MAX - 1; slot >= 1; slot--)
        re_token[slot] = NULL;

    re_init_done = True;
}

/**
 * Tests if a char matches a token.
 *
 * Single-character tokens (sel_single, sel_not_single) are compared
 * case-insensitively via tolower(); ranges and arrays are compared on the
 * raw character value.
 *
 * @param c
 * char to test.
 * @param sel
 * token to test.
 * @return
 * True if matches, False else.
 */
static Boolean re_match_token(uchar c, selection *sel) {
    switch (sel->type) {
    case sel_any:
        return True;

    case sel_end:
        return (c == 0);

    case sel_single:
        return (tolower(c) == tolower(sel->u.single));

    case sel_range:
        return (c >= sel->u.range.low && c <= sel->u.range.high);

    case sel_array:
        return (sel->u.array[c]);

    case sel_not_single:
        return (tolower(c) != tolower(sel->u.single));

    case sel_not_range:
        /* Outside the range means below low OR above high; a character
         * can never be both at once.
         */
        return (c < sel->u.range.low || c > sel->u.range.high);
    }

    return False;
}

/**
 * Get the first regular expression token found in regexp in sel.
 *
 * Recognised elements: '$' (end of string), '.' (any character),
 * '[...]' / '[^...]' (single character, range or set), a backslash
 * quoting the next character, and ordinary literal characters.  The
 * element may be followed by one of the repeat markers '*', '?' or '+'.
 *
 * @param[out] sel
 * where to store the token.
 * @param regexp
 * regular expression.
 * @return
 * @li NULL: syntax error (only detected when SAFE_CHECKS is defined)
 * @li pointer to first character past token.
 */
static const char *re_get_token(selection *sel, const char *regexp) {
/* Bail out with a syntax error if the pattern ends where more input is
 * required.  Compiled out when SAFE_CHECKS is not defined.
 */
#ifdef SAFE_CHECKS
#   define exit_if_null do { if (*regexp == 0) return NULL; } while (0)
#else
#   define exit_if_null do { } while (0)
#endif
    Boolean quoted = False;
    uchar looking_at;

#ifdef SAFE_CHECKS
    if (sel == NULL || regexp == NULL || *regexp == 0)
        return NULL;
#endif

    /* The loop runs a second time only after an unquoted backslash, so
     * that the quoted character is turned into a literal.
     */
    do {
        looking_at = *regexp++;
        switch (looking_at) {
        case '$':
            if (quoted) {
                quoted = False;
                sel->type = sel_single;
                sel->u.single = looking_at;
            } else {
                sel->type = sel_end;
            }
            break;

        case '.':
            if (quoted) {
                quoted = False;
                sel->type = sel_single;
                sel->u.single = looking_at;
            } else {
                sel->type = sel_any;
            }
            break;

        case '[':
            /* The fun stuff... perhaps a little obfuscated since I
             * don't trust the compiler to analyse liveness.
             */
            if (quoted) {
                quoted = False;
                sel->type = sel_single;
                sel->u.single = looking_at;
            } else {
                Boolean neg = False;
                uchar first, last = 0;

                exit_if_null;
                looking_at = *regexp++;

                if (looking_at == '^') {
                    neg = True;
                    exit_if_null;
                    looking_at = *regexp++;
                }
                first = looking_at;
                exit_if_null;
                looking_at = *regexp++;
                if (looking_at == ']') {
                    /* On the form [q] or [^q] */
                    sel->type = neg ? sel_not_single : sel_single;
                    sel->u.single = first;
                    break;
                } else if (looking_at == '-') {
                    exit_if_null;
                    last = *regexp++;
                    if (last == ']') {
                        /* On the form [A-] or [^A-]. Checking for
                         * [,-] and making it a range is probably not
                         * worth it :-)
                         */
                        sel->type = sel_array;
                        memset(sel->u.array, neg, sizeof(sel->u.array));
                        sel->u.array[first] = sel->u.array['-'] = !neg;
                        break;
                    } else {
                        exit_if_null;
                        looking_at = *regexp++;
                        if (looking_at == ']') {
                            /* On the form [A-G] or [^A-G]. Note that [G-A]
                             * is a syntax error. Fair enough, I think.
                             */
#ifdef SAFE_CHECKS
                            if (first > last)
                                return NULL;
#endif
                            sel->type = neg ? sel_not_range : sel_range;
                            sel->u.range.low = first;
                            sel->u.range.high = last;
                            break;
                        }
                    }
                }
                {
                    /* The datastructure can only represent a RE this
                     * complex with an array.
                     */
                    int i;
                    uchar previous;

                    sel->type = sel_array;
                    memset(sel->u.array, neg, sizeof(sel->u.array));
                    if (last) {
                        /* It starts with a range */
#ifdef SAFE_CHECKS
                        if (first > last)
                            return NULL;
#endif
                        for (i = first; i <= last; i++) {
                            sel->u.array[i] = !neg;
                        }
                    } else {
                        /* It begins with a "random" character */
                        sel->u.array[first] = !neg;
                    }
                    sel->u.array[looking_at] = !neg;

                    exit_if_null;
                    previous = looking_at;
                    looking_at = *regexp++;

                    /* Add more characters to the array until we reach
                     * ]. Quoting doesn't and shouldn't work in here.
                     * ("]" should be put first, and "-" last if they
                     * are needed inside this construct.)
                     * Look for ranges as we go along.
                     */
                    while (looking_at != ']') {
                        if (looking_at == '-') {
                            exit_if_null;
                            looking_at = *regexp++;
                            if (looking_at != ']') {
#ifdef SAFE_CHECKS
                                if (previous > looking_at)
                                    return NULL;
#endif
                                for (i = previous+1; i < looking_at; i++) {
                                    /* previous has already been set and
                                     * looking_at is set below.
                                     */
                                    sel->u.array[i] = !neg;
                                }
                                exit_if_null;
                            } else {
                                sel->u.array['-'] = !neg;
                                break;
                            }
                        }
                        sel->u.array[looking_at] = !neg;
                        previous = looking_at;
                        exit_if_null;
                        looking_at = *regexp++;
                    }
                }
            }
            break;

        case '\\':
            if (quoted) {
                quoted = False;
                sel->type = sel_single;
                sel->u.single = looking_at;
            } else {
                quoted = True;
            }
            break;

        default:
            if (looking_at == 0) {
                /* The string terminator was consumed, which happens
                 * when the pattern ends in an unpaired backslash.
                 * Step back onto the terminator so that neither the
                 * repeat check below nor the caller reads past the
                 * end of the pattern, and treat a dangling backslash
                 * as a literal backslash.
                 */
                --regexp;
                if (quoted)
                    looking_at = '\\';
            }
            quoted = False;
            sel->type = sel_single;
            sel->u.single = looking_at;
            break;
        }
    } while (quoted);

    /* An optional repeat marker directly follows the element. */
    if (*regexp == '*') {
        sel->repeat = rep_null_or_more;
        ++regexp;
    } else if (*regexp == '?') {
        sel->repeat = rep_null_or_once;
        ++regexp;
    } else if (*regexp == '+') {
        sel->repeat = rep_once_or_more;
        ++regexp;
    } else {
        sel->repeat = rep_once;
    }

    return regexp;
}
#undef exit_if_null

/*   D e b u g   c o d e
 */
#ifdef DEBUG2 /* compile all with DEBUG also ? hevi@lut.fi */
/**
 * Prints a human-readable form of a token to stdout: first the element
 * itself, then its repeat marker (nothing for a plain single match).
 *
 * @param sel
 * token to dump.
 */
static void re_dump_sel(selection *sel) {
    /* Element part. */
    if (sel->type == sel_any) {
        printf(".");
    } else if (sel->type == sel_end) {
        printf("$");
    } else if (sel->type == sel_single) {
        printf("<%c>", sel->u.single);
    } else if (sel->type == sel_range) {
        printf("[%c-%c]", sel->u.range.low, sel->u.range.high);
    } else if (sel->type == sel_array) {
        int ch = 0;

        printf("[");
        /* List every character flagged in the set. */
        while (ch < UCHAR_MAX) {
            if (sel->u.array[ch]) {
                printf("%c", ch);
            }
            ch++;
        }
        printf("]");
    } else if (sel->type == sel_not_single) {
        printf("[^%c]", sel->u.single);
    } else if (sel->type == sel_not_range) {
        printf("[^%c-%c]", sel->u.range.low, sel->u.range.high);
    } else {
        printf("<UNKNOWN TOKEN!>");
    }

    /* Repeat part. */
    if (sel->repeat == rep_once) {
        /* Nothing to print for exactly-once. */
    } else if (sel->repeat == rep_null_or_once) {
        printf("?");
    } else if (sel->repeat == rep_null_or_more) {
        printf("*");
    } else if (sel->repeat == rep_once_or_more) {
        printf("+");
    } else {
        printf("<UNKNOWN REP-TOKEN!>");
    }
}

/**
 * Debug driver (DEBUG2 builds only).
 *
 * Usage: <program> <regexp> <string>.  Dumps the first token of the
 * regexp, then reports whether the string matches the whole regexp.
 *
 * @return
 * 0 after a run (match or not), 1 on bad usage or a regexp syntax error.
 */
int main(int argc, char *argv[]) {
    /* Both helpers return pointers into const data. */
    const char *re, *m;
    selection sel;

    if (argc < 3) {
        /* argv[1] and argv[2] are both needed below. */
        fprintf(stderr, "usage: %s <regexp> <string>\n", argc > 0 ? argv[0] : "re-cmp");
        return 1;
    }

    re = re_get_token(&sel, argv[1]);
    if (re == NULL) {
        /* NULL must not be handed to %s, and sel may be incomplete. */
        fprintf(stderr, "syntax error in regexp '%s'\n", argv[1]);
        return 1;
    }

    printf("'%s' -> '%s'\n", argv[1], re);
    re_dump_sel(&sel);
    printf("\n");
    m = re_cmp(argv[2], argv[1]);
    if (m)
        printf("MATCH! -> '%s'\n", m);
    return 0;
}
#endif