/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ /* * Copyright (C) 1999-2008 Novell, Inc. (www.novell.com) * * This library is free software: you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this library. If not, see . * * Authors: Jeffrey Stedfast */ #ifdef HAVE_CONFIG_H #include #endif #include #include #include #include "camel-trie.h" #include "camel-url-scanner.h" #include "camel-utf8.h" struct _CamelUrlScanner { GPtrArray *patterns; CamelTrie *trie; }; CamelUrlScanner * camel_url_scanner_new (void) { CamelUrlScanner *scanner; scanner = g_new (CamelUrlScanner, 1); scanner->patterns = g_ptr_array_new (); scanner->trie = camel_trie_new (TRUE); return scanner; } void camel_url_scanner_free (CamelUrlScanner *scanner) { g_return_if_fail (scanner != NULL); g_ptr_array_free (scanner->patterns, TRUE); camel_trie_free (scanner->trie); g_free (scanner); } void camel_url_scanner_add (CamelUrlScanner *scanner, urlpattern_t *pattern) { g_return_if_fail (scanner != NULL); camel_trie_add (scanner->trie, pattern->pattern, scanner->patterns->len); g_ptr_array_add (scanner->patterns, pattern); } gboolean camel_url_scanner_scan (CamelUrlScanner *scanner, const gchar *in, gsize inlen, urlmatch_t *match) { const gchar *pos; const guchar *inptr, *inend; urlpattern_t *pat; gint pattern; g_return_val_if_fail (scanner != NULL, FALSE); g_return_val_if_fail (in != NULL, FALSE); inptr = (const guchar *) in; inend = inptr + inlen; /* check validity of a string first */ if (!g_utf8_validate (in, inlen, NULL)) return FALSE; do { if (!(pos = camel_trie_search (scanner->trie, (const gchar *) inptr, inlen, &pattern))) return FALSE; pat = g_ptr_array_index (scanner->patterns, pattern); match->pattern = pat->pattern; match->prefix = pat->prefix; if (pat->start (in, pos, (const gchar *) inend, match) && pat->end (in, pos, (const gchar *) inend, match)) return TRUE; inptr = (const guchar *) pos; if (camel_utf8_getc_limit (&inptr, inend) == 0xffff) break; inlen = inend - inptr; } while (inptr < inend); return FALSE; } static guchar url_scanner_table[256] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 1, 1, 9, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 24,128,160,128,128,128,128,128,160,160,128,128,160,192,160,160, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68,160,160, 32,128, 32,128, 160, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,160,160,160,128,128, 128, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,128,128,128,128, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; enum { IS_CTRL = (1 << 0), IS_ALPHA = (1 << 1), IS_DIGIT = (1 << 2), IS_LWSP = (1 << 3), IS_SPACE = (1 << 4), IS_SPECIAL = (1 << 5), IS_DOMAIN = (1 << 6), IS_URLSAFE = (1 << 7) }; #define is_ctrl(x) ((url_scanner_table[(guchar)(x)] & IS_CTRL) != 0) #define is_lwsp(x) ((url_scanner_table[(guchar)(x)] & IS_LWSP) != 0) #define is_atom(x) ((url_scanner_table[(guchar)(x)] & (IS_SPECIAL|IS_SPACE|IS_CTRL)) == 0) #define is_alpha(x) ((url_scanner_table[(guchar)(x)] & IS_ALPHA) != 0) #define is_digit(x) ((url_scanner_table[(guchar)(x)] & IS_DIGIT) != 0) #define is_domain(x) ((url_scanner_table[(guchar)(x)] & IS_DOMAIN) != 0) #define is_urlsafe(x) ((url_scanner_table[(guchar)(x)] & (IS_ALPHA|IS_DIGIT|IS_URLSAFE)) != 0) static const struct { const gchar open; const gchar close; } url_braces[] = { { '(', ')' }, { '{', '}' }, { '[', ']' }, { '<', '>' }, { '|', '|' }, { '\'', '\'' }, }; static gboolean is_open_brace (gchar c) { gint i; for (i = 0; i < G_N_ELEMENTS (url_braces); i++) { if (c == url_braces[i].open) return TRUE; } return FALSE; } static char url_stop_at_brace (const gchar *in, gsize so, gchar *open_brace) { gint i; if (open_brace != NULL) *open_brace = '\0'; if (so > 0) { for (i = 0; i < G_N_ELEMENTS (url_braces); i++) { if (in[so - 1] == url_braces[i].open) { if (open_brace != NULL) *open_brace = url_braces[i].open; return url_braces[i].close; } } } return '\0'; } gboolean camel_url_addrspec_start (const gchar *in, const gchar *pos, const gchar *inend, urlmatch_t *match) { register const gchar *inptr = pos; g_return_val_if_fail (*inptr == '@', FALSE); if (inptr > in) inptr--; while (inptr > in) { if (is_atom (*inptr)) inptr--; else break; while (inptr > in && is_atom (*inptr)) inptr--; if (inptr > in && *inptr == '.') inptr--; } while (!is_atom (*inptr) || is_open_brace (*inptr)) inptr++; if (inptr >= pos) return FALSE; match->um_so = (inptr - in); return TRUE; } gboolean camel_url_addrspec_end (const gchar *in, const gchar *pos, const gchar *inend, urlmatch_t *match) { const gchar *inptr = pos; gint parts = 0, digits; gboolean got_dot = FALSE; g_return_val_if_fail (*inptr == '@', FALSE); inptr++; if (*inptr == '[') { /* domain literal */ do { inptr++; digits = 0; while (inptr < inend && is_digit (*inptr) && digits < 3) { inptr++; digits++; } parts++; if (*inptr != '.' && parts != 4) return FALSE; } while (parts < 4); if (*inptr == ']') inptr++; else return FALSE; got_dot = TRUE; } else { while (inptr < inend) { if (is_domain (*inptr)) inptr++; else break; while (inptr < inend && is_domain (*inptr)) inptr++; if (inptr < inend && *inptr == '.' && is_domain (inptr[1])) { if (*inptr == '.') got_dot = TRUE; inptr++; } } } /* don't allow toplevel domains */ if (inptr == pos + 1 || !got_dot) return FALSE; match->um_eo = (inptr - in); return TRUE; } gboolean camel_url_file_start (const gchar *in, const gchar *pos, const gchar *inend, urlmatch_t *match) { match->um_so = (pos - in); return TRUE; } gboolean camel_url_file_end (const gchar *in, const gchar *pos, const gchar *inend, urlmatch_t *match) { register const gchar *inptr = pos; gchar close_brace; inptr += strlen (match->pattern); if (*inptr == '/') inptr++; close_brace = url_stop_at_brace (in, match->um_so, NULL); while (inptr < inend && is_urlsafe (*inptr) && *inptr != close_brace) inptr++; if (inptr == pos) return FALSE; match->um_eo = (inptr - in); return TRUE; } gboolean camel_url_web_start (const gchar *in, const gchar *pos, const gchar *inend, urlmatch_t *match) { if (pos > in && !strncmp (pos, "www", 3)) { /* make sure we aren't actually part of another word */ if (!is_open_brace (pos[-1]) && !isspace (pos[-1])) return FALSE; } match->um_so = (pos - in); return TRUE; } gboolean camel_url_web_end (const gchar *in, const gchar *pos, const gchar *inend, urlmatch_t *match) { register const gchar *inptr = pos; gboolean passwd = FALSE; const gchar *save; gchar close_brace, open_brace; gint brace_stack = 0; gint port; inptr += strlen (match->pattern); close_brace = url_stop_at_brace (in, match->um_so, &open_brace); /* find the end of the domain */ if (is_atom (*inptr)) { /* might be a domain or user@domain */ save = inptr; while (inptr < inend) { if (!is_atom (*inptr)) break; inptr++; while (inptr < inend && is_atom (*inptr)) inptr++; if ((inptr + 1) < inend && *inptr == '.' && (is_atom (inptr[1]) || inptr[1] == '/')) inptr++; } if (*inptr != '@') inptr = save; else inptr++; goto domain; } else if (is_domain (*inptr)) { domain: while (inptr < inend) { if (!is_domain (*inptr)) break; inptr++; while (inptr < inend && is_domain (*inptr)) inptr++; if ((inptr + 1) < inend && *inptr == '.' && (is_domain (inptr[1]) || inptr[1] == '/')) inptr++; } } else { return FALSE; } if (inptr < inend) { switch (*inptr) { case ':': /* we either have a port or a password */ inptr++; if (is_digit (*inptr) || passwd) { port = (*inptr++ - '0'); while (inptr < inend && is_digit (*inptr) && port < 65536) port = (port * 10) + (*inptr++ - '0'); if (!passwd && (port >= 65536 || *inptr == '@')) { if (inptr < inend) { /* this must be a password? */ goto passwd; } inptr--; } } else { passwd: passwd = TRUE; save = inptr; while (inptr < inend && is_atom (*inptr)) inptr++; if ((inptr + 2) < inend) { if (*inptr == '@') { inptr++; if (is_domain (*inptr)) goto domain; } return FALSE; } } if (inptr >= inend || *inptr != '/') break; /* we have a '/' so there could be a path - fall through */ case '/': /* we've detected a path component to our url */ inptr++; /* coverity[fallthrough] */ case '?': while (inptr < inend && is_urlsafe (*inptr)) { if (*inptr == open_brace) { brace_stack++; } else if (*inptr == close_brace) { brace_stack--; if (brace_stack == -1) break; } inptr++; } break; default: break; } } /* urls are extremely unlikely to end with any * punctuation, so strip any trailing * punctuation off. Also strip off any closing * double-quotes. */ while (inptr > pos && strchr (",.:;?!-|}])\"", inptr[-1])) inptr--; match->um_eo = (inptr - in); return TRUE; } #ifdef BUILD_TABLE /* got these from rfc1738 */ #define CHARS_LWSP " \t\n\r" /* linear whitespace chars */ #define CHARS_SPECIAL "()<>@,;:\\\".[]" /* got these from rfc1738 */ #define CHARS_URLSAFE "$-_.+!*'(),{}|\\^~[]`#%\";/?:@&=" static void table_init_bits (guint mask, const guchar *vals) { gint i; for (i = 0; vals[i] != '\0'; i++) url_scanner_table[vals[i]] |= mask; } static void url_scanner_table_init (void) { gint i; for (i = 0; i < 256; i++) { url_scanner_table[i] = 0; if (i < 32) url_scanner_table[i] |= IS_CTRL; if ((i >= '0' && i <= '9')) url_scanner_table[i] |= IS_DIGIT | IS_DOMAIN; if ((i >= 'a' && i <= 'z') || (i >= 'A' && i <= 'Z')) url_scanner_table[i] |= IS_ALPHA | IS_DOMAIN; if (i >= 127) url_scanner_table[i] |= IS_CTRL; } url_scanner_table[' '] |= IS_SPACE; url_scanner_table['-'] |= IS_DOMAIN; /* not defined to be special in rfc0822, but when scanning * backwards to find the beginning of the email address we do * not want to include this gchar if we come accross it - so * this is kind of a hack */ url_scanner_table['/'] |= IS_SPECIAL; table_init_bits (IS_LWSP, CHARS_LWSP); table_init_bits (IS_SPECIAL, CHARS_SPECIAL); table_init_bits (IS_URLSAFE, CHARS_URLSAFE); } gint main (gint argc, gchar **argv) { gint i; url_scanner_table_init (); printf ("static guchar url_scanner_table[256] = {"); for (i = 0; i < 256; i++) { printf ( "%s%3d%s", (i % 16) ? "" : "\n\t", url_scanner_table[i], i != 255 ? "," : "\n"); } printf ("};\n\n"); return 0; } #endif /* BUILD_TABLE */