/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/*
* Copyright (C) 1999-2008 Novell, Inc. (www.novell.com)
*
* This library is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Authors: Jeffrey Stedfast
* Michael Zucchi
*/
#ifdef HAVE_CONFIG_H
#include
#endif
/* POSIX requires <sys/types.h> be included before <regex.h> */
#include
#include
#include
#include
#include
#include
#include "camel-mime-message.h"
#include "camel-multipart.h"
#include "camel-search-private.h"
#include "camel-stream-mem.h"
/* Debug-trace hook: expands to nothing, so any statement wrapped in d (...)
 * is compiled out. */
#define d(x)
/* Builds a single POSIX extended regular expression from the string
 * arguments in argv and compiles it into pattern.
 * Taken from camel-folder-search, with added isregex & exception parameter.
 *
 * Basically, we build a new regex, either based on subset regexes or
 * substrings, that can be executed once over the whole body to match
 * anything suitable.  This is more efficient than multiple searches,
 * and probably most (naive) strstr implementations, over long content.
 *
 * type is a mask of CAMEL_SEARCH_MATCH_* flags:
 *   REGEX   - each word is used verbatim as a regex; otherwise regex
 *             metacharacters in the word are escaped.
 *   START   - prefix each (non-regex) word with '^'.
 *   END     - suffix each (non-regex) word with '$'.
 *   ICASE   - compile with REG_ICASE.
 *   NEWLINE - compile with REG_NEWLINE.
 *
 * Non-string entries in argv are skipped with a warning.
 *
 * Returns 0 on success; otherwise the non-zero regcomp() error code, with
 * error set to a message containing the pattern and the regerror() text.
 *
 * A small issue is that case-insensitivity won't work entirely correctly
 * for UTF-8 strings. */
gint
camel_search_build_match_regex (regex_t *pattern,
                                camel_search_flags_t type,
                                gint argc,
                                struct _CamelSExpResult **argv,
                                GError **error)
{
	GString *match = g_string_new ("");
	gint c, i, count = 0, err;
	gchar *word;
	gint flags;

	/* Build a regex pattern we can use to match the words,
	 * we OR them together. */
	if (argc > 1)
		g_string_append_c (match, '(');

	for (i = 0; i < argc; i++) {
		if (argv[i]->type != CAMEL_SEXP_RES_STRING) {
			g_warning ("Invalid type passed to body-contains match function");
			continue;
		}

		if (count > 0)
			g_string_append_c (match, '|');

		word = argv[i]->value.string;
		if (type & CAMEL_SEARCH_MATCH_REGEX) {
			/* No need to escape because this
			 * should already be a valid regex. */
			g_string_append (match, word);
		} else {
			if (type & CAMEL_SEARCH_MATCH_START)
				g_string_append_c (match, '^');

			/* Escape every character that is special in a POSIX
			 * extended regex so the word matches literally. */
			while ((c = *word++)) {
				if (strchr ("*\\.()[]^$+?{|", c) != NULL)
					g_string_append_c (match, '\\');
				g_string_append_c (match, c);
			}

			/* End-of-line anchor is '$' ('^' would anchor the start). */
			if (type & CAMEL_SEARCH_MATCH_END)
				g_string_append_c (match, '$');
		}
		count++;
	}

	if (argc > 1)
		g_string_append_c (match, ')');

	flags = REG_EXTENDED | REG_NOSUB;
	if (type & CAMEL_SEARCH_MATCH_ICASE)
		flags |= REG_ICASE;
	if (type & CAMEL_SEARCH_MATCH_NEWLINE)
		flags |= REG_NEWLINE;

	err = regcomp (pattern, match->str, flags);
	if (err != 0) {
		/* regerror gets called twice to get the full error
		 * string length to do proper posix error reporting. */
		gint len = regerror (err, pattern, NULL, 0);
		gchar *buffer = g_malloc0 (len + 1);

		regerror (err, pattern, buffer, len);
		g_set_error (
			error, CAMEL_ERROR, CAMEL_ERROR_GENERIC,
			_("Regular expression compilation failed: %s: %s"),
			match->str, buffer);

		/* g_set_error() copies the formatted text, so the scratch
		 * buffer must be released here. */
		g_free (buffer);
		regfree (pattern);
	}

	d (printf ("Built regex: '%s'\n", match->str));

	g_string_free (match, TRUE);

	return err;
}
/* Lookup table used by soundexify(), indexed by byte value.
 * Each ASCII letter that carries a Soundex code maps to that code as an
 * ASCII digit character (49..54 are '1'..'6'); upper- and lower-case rows
 * are identical.  Every other byte, including the vowels and H, W and Y,
 * maps to 0, meaning "contributes nothing". */
static guchar soundex_table[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 49, 50, 51, 0, 49, 50, 0, 0, 50, 50, 52, 53, 53, 0, /* 0x40: @ A-O */
49, 50, 54, 50, 51, 0, 49, 0, 50, 0, 50, 0, 0, 0, 0, 0, /* 0x50: P-Z ... */
0, 0, 49, 50, 51, 0, 49, 50, 0, 0, 50, 50, 52, 53, 53, 0, /* 0x60: ` a-o */
49, 50, 54, 50, 51, 0, 49, 0, 50, 0, 50, 0, 0, 0, 0, 0, /* 0x70: p-z ... */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
/* Computes a four-character, NUL-terminated Soundex-style code for sound
 * and stores it in code.
 *
 * code[0] is the upper-cased first alphabetic character of sound (or NUL
 * when there is none).  The following positions receive the digits from
 * soundex_table for the characters starting at that same first letter,
 * skipping characters without a digit and digits equal to the previously
 * stored one.  Note that, unlike textbook Soundex, the first letter's own
 * digit is emitted as well. */
static void
soundexify (const gchar *sound,
            gchar code[5])
{
	const guchar *cursor = (const guchar *) sound;
	guchar previous = '\0';
	gint filled = 1;

	/* Skip everything in front of the first letter. */
	while (*cursor != '\0' && !isalpha (*cursor))
		cursor++;

	code[0] = toupper (*cursor);
	code[1] = '\0';
	code[2] = '\0';
	code[3] = '\0';

	while (*cursor != '\0' && filled < 5) {
		guchar digit = soundex_table[*cursor];

		if (digit != 0 && digit != previous) {
			code[filled] = digit;
			filled++;
			previous = digit;
		}
		cursor++;
	}

	/* A fourth digit may have landed in code[4]; the code is cut to
	 * one letter plus at most three digits. */
	code[4] = '\0';
}
static gboolean
header_soundex (const gchar *header,
const gchar *match)
{
gchar mcode[5], hcode[5];
const gchar *p;
gchar c;
GString *word;
gint truth = FALSE;
soundexify (match, mcode);
/* Split the header into words and soundexify and compare each one. */
/* FIXME: Should this convert to utf8, and split based on that,
* and what not?
* soundex only makes sense for us-ascii though ... */
word = g_string_new ("");
p = header;
do {
c = *p++;
if (c == 0 || isspace (c)) {
if (word->len > 0) {
soundexify (word->str, hcode);
if (strcmp (hcode, mcode) == 0)
truth = TRUE;
}
g_string_truncate (word, 0);
} else if (isalpha (c))
g_string_append_c (word, c);
} while (c && !truth);
g_string_free (word, TRUE);
return truth;
}
/* Case-insensitive UTF-8 substring search.
 *
 * Both strings are decoded with camel_utf8_getc() and compared code point
 * by code point after g_unichar_tolower().
 *
 * Returns haystack when needle is empty; otherwise a pointer to the start
 * of the first occurrence of needle inside haystack, or NULL when there is
 * none, when haystack is empty, or when needle contains an illegal UTF-8
 * sequence.  NULL arguments trip g_return_val_if_fail() and yield NULL. */
const gchar *
camel_ustrstrcase (const gchar *haystack,
                   const gchar *needle)
{
	gunichar *nuni, *puni;
	gunichar u;
	const guchar *p;

	g_return_val_if_fail (haystack != NULL, NULL);
	g_return_val_if_fail (needle != NULL, NULL);

	if (*needle == '\0')
		return haystack;
	if (*haystack == '\0')
		return NULL;

	/* Lower-cased copy of the needle; it can never hold more code
	 * points than the needle has bytes. */
	puni = nuni = g_alloca (sizeof (gunichar) * (strlen (needle) + 1));
	nuni[0] = 0;

	p = (const guchar *) needle;
	while ((u = camel_utf8_getc (&p)))
		*puni++ = g_unichar_tolower (u);

	/* NULL means there was illegal utf-8 sequence. */
	if (!p)
		return NULL;

	p = (const guchar *) haystack;
	for (;;) {
		/* Remember where this candidate begins so the match start,
		 * not the position after its first character, is returned. */
		const guchar *start = p;
		gunichar c;

		u = camel_utf8_getc (&p);
		if (!u)
			break;

		c = g_unichar_tolower (u);
		if (c == nuni[0]) {
			const guchar *q = p;
			gint npos = 1;

			while (nuni + npos < puni) {
				u = camel_utf8_getc (&q);
				/* Haystack ended (or is invalid) inside a
				 * partial match: no later match is possible. */
				if (!q || !u)
					return NULL;

				c = g_unichar_tolower (u);
				if (c != nuni[npos])
					break;

				npos++;
			}

			if (nuni + npos == puni)
				return (const gchar *) start;
		}
	}

	return NULL;
}
/* Comparison helper that may RETURN FROM THE ENCLOSING FUNCTION.
 * z is the "missing/terminator" sentinel for x and y:
 *   x == z and y == z  -> return 0
 *   x == z only        -> return -1
 *   y == z only        -> return 1
 *   neither equals z   -> no return; execution continues after the macro.
 * Arguments are evaluated more than once, so pass side-effect-free
 * expressions only. */
#define CAMEL_SEARCH_COMPARE(x, y, z) G_STMT_START { \
if ((x) == (z)) { \
if ((y) == (z)) \
return 0; \
else \
return -1; \
} else if ((y) == (z)) \
return 1; \
} G_STMT_END
/* Case-insensitive comparison of two UTF-8 strings.
 *
 * Code points are read with camel_utf8_getc() and compared after
 * g_unichar_tolower().  Returns -1, 0 or 1.  A NULL string sorts before a
 * non-NULL one, and a string that ends first sorts before the longer one. */
static gint
camel_ustrcasecmp (const gchar *ps1,
                   const gchar *ps2)
{
	const guchar *left = (const guchar *) ps1;
	const guchar *right = (const guchar *) ps2;
	gunichar lc, rc = 0;

	/* Returns right here if either input pointer is NULL. */
	CAMEL_SEARCH_COMPARE (left, right, NULL);

	for (;;) {
		lc = camel_utf8_getc (&left);
		rc = camel_utf8_getc (&right);
		if (!lc || !rc)
			break;

		lc = g_unichar_tolower (lc);
		rc = g_unichar_tolower (rc);
		if (lc != rc)
			return lc < rc ? -1 : 1;
	}

	/* end of one of the strings ? */
	CAMEL_SEARCH_COMPARE (lc, rc, 0);

	/* if we have invalid utf8 sequence ? */
	/* coverity[dead_error_begin] */
	CAMEL_SEARCH_COMPARE (left, right, NULL);

	return 0;
}
/* Returns a newly allocated copy of str in which every ASCII punctuation
 * character is replaced by a space.  The caller frees the result with
 * g_free().  A NULL str trips g_return_val_if_fail() and yields NULL.
 *
 * g_ascii_ispunct() is used instead of ispunct(): it takes a gchar
 * directly (no undefined behaviour on negative bytes) and ignores the
 * locale, so bytes that are part of a multi-byte UTF-8 sequence are never
 * overwritten. */
static gchar *
depunct_string (const gchar *str)
{
	gchar *res, *p;

	g_return_val_if_fail (str != NULL, NULL);

	res = g_strdup (str);
	for (p = res; *p != '\0'; p++) {
		if (g_ascii_ispunct (*p))
			*p = ' ';
	}

	return res;
}
/* Whole-word, case-insensitive match.
 *
 * Both strings have punctuation blanked by depunct_string() and are split
 * by camel_search_words_split().  Returns TRUE when every word of needle
 * equals (per camel_ustrcasecmp()) some word of haystack.  An empty needle
 * always matches; an empty haystack never matches a non-empty needle. */
static gboolean
camel_uwordcase (const gchar *haystack,
                 const gchar *needle)
{
	struct _camel_search_words *hay_words, *needle_words;
	gchar *hay_clean, *needle_clean;
	gboolean all_present = TRUE;
	gint n, h;

	g_return_val_if_fail (haystack != NULL, FALSE);
	g_return_val_if_fail (needle != NULL, FALSE);

	if (*needle == '\0')
		return TRUE;
	if (*haystack == '\0')
		return FALSE;

	hay_clean = depunct_string (haystack);
	needle_clean = depunct_string (needle);

	hay_words = camel_search_words_split ((const guchar *) hay_clean);
	needle_words = camel_search_words_split ((const guchar *) needle_clean);

	g_free (hay_clean);
	g_free (needle_clean);

	/* Stop as soon as one needle word has no counterpart. */
	for (n = 0; all_present && n < needle_words->len; n++) {
		all_present = FALSE;

		for (h = 0; h < hay_words->len && !all_present; h++)
			all_present = camel_ustrcasecmp (hay_words->words[h]->word, needle_words->words[n]->word) == 0;
	}

	camel_search_words_free (hay_words);
	camel_search_words_free (needle_words);

	return all_present;
}
/* Case-insensitive comparison of at most len code points of two UTF-8
 * strings.
 *
 * len counts decoded characters, not bytes.  Returns -1, 0 or 1; 0 is
 * returned once len characters have compared equal.  A NULL string sorts
 * before a non-NULL one, and a string that ends before len characters
 * sorts before the longer one. */
static gint
camel_ustrncasecmp (const gchar *ps1,
                    const gchar *ps2,
                    gsize len)
{
	const guchar *left = (const guchar *) ps1;
	const guchar *right = (const guchar *) ps2;
	gunichar lc, rc = 0;

	/* Returns right here if either input pointer is NULL. */
	CAMEL_SEARCH_COMPARE (left, right, NULL);

	lc = camel_utf8_getc (&left);
	rc = camel_utf8_getc (&right);

	for (; len > 0 && lc && rc; len--) {
		gunichar folded_l = g_unichar_tolower (lc);
		gunichar folded_r = g_unichar_tolower (rc);

		if (folded_l != folded_r)
			return folded_l < folded_r ? -1 : 1;

		lc = camel_utf8_getc (&left);
		rc = camel_utf8_getc (&right);
	}

	/* The requested number of characters matched. */
	if (len == 0)
		return 0;

	/* end of one of the strings ? */
	CAMEL_SEARCH_COMPARE (lc, rc, 0);

	/* if we have invalid utf8 sequence ? */
	/* coverity[dead_error_begin] */
	CAMEL_SEARCH_COMPARE (left, right, NULL);

	return 0;
}
/* Tests value against match according to how.
 *
 * Value is the match value suitable for exact match if required.
 * SOUNDEX is delegated to header_soundex(); every other mode first
 * rejects a value that is shorter (in bytes) than match and then compares
 * case-insensitively:
 *   EXACT    - whole strings equal
 *   CONTAINS - match occurs somewhere in value
 *   WORD     - every word of match is a word of value
 *   STARTS   - value begins with match
 *   ENDS     - the last strlen (match) bytes of value equal match
 *
 * Returns non-zero on a match, FALSE otherwise (including unknown modes). */
static gint
header_match (const gchar *value,
              const gchar *match,
              camel_search_match_t how)
{
	gsize vlen, mlen;

	if (how == CAMEL_SEARCH_MATCH_SOUNDEX)
		return header_soundex (value, match);

	vlen = strlen (value);
	mlen = strlen (match);
	if (vlen < mlen)
		return FALSE;

	switch (how) {
	case CAMEL_SEARCH_MATCH_EXACT:
		return camel_ustrcasecmp (value, match) == 0;
	case CAMEL_SEARCH_MATCH_CONTAINS:
		return camel_ustrstrcase (value, match) != NULL;
	case CAMEL_SEARCH_MATCH_WORD:
		return camel_uwordcase (value, match);
	case CAMEL_SEARCH_MATCH_STARTS:
		/* camel_ustrncasecmp() counts characters, not bytes.  Passing
		 * the byte length made the comparison run past the end of a
		 * match containing multi-byte characters and report a
		 * mismatch, so the character count is used instead. */
		return camel_ustrncasecmp (value, match, (gsize) g_utf8_strlen (match, -1)) == 0;
	case CAMEL_SEARCH_MATCH_ENDS:
		return camel_ustrcasecmp (value + vlen - mlen, match) == 0;
	default:
		break;
	}

	return FALSE;
}
/* Searches for match inside a header value.
 *
 * Leading whitespace of value is skipped first.  What is then handed to
 * header_match() depends on type:
 *   ENCODED          - value decoded with camel_header_decode_string()
 *                      using default_charset.
 *   MLIST            - as ASIS, except that when match contains an '@' and
 *                      value does not (old-style list name), only the part
 *                      of match before the '@' is compared.
 *   ASIS             - value unchanged.
 *   ADDRESS,
 *   ADDRESS_ENCODED  - value unchanged first; failing that, value is parsed
 *                      into addresses and each name and address part is
 *                      tried in turn.
 *
 * Returns TRUE on a match, FALSE otherwise (including unhandled types). */
gboolean
camel_search_header_match (const gchar *value,
                           const gchar *match,
                           camel_search_match_t how,
                           camel_search_t type,
                           const gchar *default_charset)
{
	CamelInternetAddress *addresses;
	const gchar *display_name, *mailbox;
	const guchar *cursor = (const guchar *) value;
	gchar *buf, *value_at, *match_at;
	gunichar uc;
	gint matched = FALSE;
	gint idx;

	/* Advance value past any leading whitespace. */
	for (;;) {
		uc = camel_utf8_getc (&cursor);
		if (!uc || !g_unichar_isspace (uc))
			break;
		value = (const gchar *) cursor;
	}

	switch (type) {
	case CAMEL_SEARCH_TYPE_ENCODED:
		/* FIXME Find header charset. */
		buf = camel_header_decode_string (value, default_charset);
		matched = header_match (buf, match, how);
		g_free (buf);
		break;
	case CAMEL_SEARCH_TYPE_MLIST:
		/* Special mailing list old-version domain hack.
		 * If one of the mailing list names doesn't have an @ in it,
		 * it's old-style, so only match against the pre-domain part,
		 * which should be common. */
		value_at = strchr (value, '@');
		match_at = strchr (match, '@');
		if (value_at == NULL && match_at != NULL) {
			/* Stack copy of match truncated at the '@'. */
			buf = g_alloca (match_at - match + 1);
			memcpy (buf, match, match_at - match);
			buf[match_at - match] = 0;
			match = (gchar *) buf;
		}
		/* Falls through */
	case CAMEL_SEARCH_TYPE_ASIS:
		matched = header_match (value, match, how);
		break;
	case CAMEL_SEARCH_TYPE_ADDRESS_ENCODED:
	case CAMEL_SEARCH_TYPE_ADDRESS:
		/* Possible simple case to save some work if we can. */
		if (header_match (value, match, how))
			return TRUE;

		/* Now we decode any addresses, and try
		 * as-is matches on name and address parts. */
		addresses = camel_internet_address_new ();
		if (type == CAMEL_SEARCH_TYPE_ADDRESS_ENCODED)
			camel_address_decode ((CamelAddress *) addresses, value);
		else
			camel_address_unformat ((CamelAddress *) addresses, value);

		idx = 0;
		while (!matched && camel_internet_address_get (addresses, idx, &display_name, &mailbox)) {
			if (display_name && header_match (display_name, match, how))
				matched = TRUE;
			else if (mailbox && header_match (mailbox, match, how))
				matched = TRUE;
			idx++;
		}

		g_object_unref (addresses);
		break;
	}

	return matched;
}
/* Performs a 'slow' content-based match: returns TRUE when the compiled
 * regex pattern matches the decoded text of object's content.
 *
 * Multiparts are searched part by part (stopping at the first hit) and
 * embedded messages are searched recursively.  Only "text/*" parts and
 * "x-evolution/evolution-rss-feed" parts are inspected; any other content
 * yields FALSE.  When the part declares a charset it is converted to UTF-8
 * before matching.
 *
 * There is also an identical copy of this in camel-filter-search.c. */
gboolean
camel_search_message_body_contains (CamelDataWrapper *object,
                                    regex_t *pattern)
{
	CamelDataWrapper *content = camel_medium_get_content (CAMEL_MEDIUM (object));
	gint found = FALSE;

	if (content == NULL)
		return FALSE;

	/* Using the object types is more accurate than using mime/types. */
	if (CAMEL_IS_MULTIPART (content)) {
		gint total = camel_multipart_get_number (CAMEL_MULTIPART (content));
		gint idx;

		for (idx = 0; idx < total && found == FALSE; idx++) {
			CamelDataWrapper *child = (CamelDataWrapper *) camel_multipart_get_part (CAMEL_MULTIPART (content), idx);

			if (child)
				found = camel_search_message_body_contains (child, pattern);
		}

		return found;
	}

	/* For messages we only look at its contents. */
	if (CAMEL_IS_MIME_MESSAGE (content))
		return camel_search_message_body_contains ((CamelDataWrapper *) content, pattern);

	/* For all other text parts we look
	 * inside, otherwise we don't care. */
	if (camel_content_type_is (CAMEL_DATA_WRAPPER (content)->mime_type, "text", "*")
	    || camel_content_type_is (CAMEL_DATA_WRAPPER (content)->mime_type, "x-evolution", "evolution-rss-feed")) {
		GByteArray *buffer = g_byte_array_new ();
		CamelStream *sink = camel_stream_mem_new_with_byte_array (buffer);
		const gchar *charset = camel_content_type_param (CAMEL_DATA_WRAPPER (content)->mime_type, "charset");

		if (charset && *charset) {
			CamelMimeFilter *converter = camel_mime_filter_charset_new (charset, "UTF-8");

			if (converter) {
				/* Route the output through the charset converter;
				 * the wrapping stream replaces our reference to
				 * the memory stream. */
				CamelStream *wrapped = camel_stream_filter_new (sink);

				if (wrapped) {
					camel_stream_filter_add (CAMEL_STREAM_FILTER (wrapped), converter);
					g_object_unref (sink);
					sink = wrapped;
				}
				g_object_unref (converter);
			}
		}

		camel_data_wrapper_decode_to_stream_sync (
			content, sink, NULL, NULL);
		/* Append a NUL byte so the buffer can be given to regexec()
		 * as a C string. */
		camel_stream_write (sink, "", 1, NULL, NULL);
		found = regexec (pattern, (gchar *) buffer->data, 0, NULL, 0) == 0;
		g_object_unref (sink);
	}

	return found;
}
/* Appends the code point c to the word being built in w, encoded as UTF-8,
 * and updates *type to describe the word so far.
 *
 * Alphanumeric characters are lower-cased before being appended.  Any
 * non-alphanumeric character turns the word into
 * CAMEL_SEARCH_WORD_COMPLEX (an already set 8BIT flag is kept).  Any
 * appended code point outside the ASCII range sets
 * CAMEL_SEARCH_WORD_8BIT. */
static void
output_c (GString *w,
          guint32 c,
          gint *type)
{
	gint utf8len;
	gchar utf8[8];

	if (!g_unichar_isalnum (c))
		*type = CAMEL_SEARCH_WORD_COMPLEX | (*type & CAMEL_SEARCH_WORD_8BIT);
	else
		c = g_unichar_tolower (c);

	/* U+0080 is the first code point that needs more than one UTF-8
	 * byte, so the boundary is inclusive. */
	if (c >= 0x80)
		*type |= CAMEL_SEARCH_WORD_8BIT;

	/* FIXME: use camel_utf8_putc */
	utf8len = g_unichar_to_utf8 (c, utf8);
	utf8[utf8len] = 0;
	g_string_append (w, utf8);
}
/* Flushes the word accumulated in w: if it is non-empty, a new
 * _camel_search_word holding a copy of the text and the given type is
 * appended to list, and w is reset to empty.  Does nothing for an empty
 * w. */
static void
output_w (GString *w,
          GPtrArray *list,
          gint type)
{
	struct _camel_search_word *entry;

	if (w->len == 0)
		return;

	entry = g_malloc0 (sizeof (*entry));
	entry->type = type;
	entry->word = g_strdup (w->str);

	g_ptr_array_add (list, entry);
	g_string_truncate (w, 0);
}
struct _camel_search_words *
camel_search_words_split (const guchar *in)
{
gint type = CAMEL_SEARCH_WORD_SIMPLE, all = 0;
GString *w;
struct _camel_search_words *words;
GPtrArray *list = g_ptr_array_new ();
guint32 c;
gint inquote = 0;
words = g_malloc0 (sizeof (*words));
w = g_string_new ("");
do {
c = camel_utf8_getc (&in);
if (c == 0
|| (inquote && c == '"')
|| (!inquote && g_unichar_isspace (c))) {
output_w (w, list, type);
all |= type;
type = CAMEL_SEARCH_WORD_SIMPLE;
inquote = 0;
} else {
if (c == '\\') {
c = camel_utf8_getc (&in);
if (c)
output_c (w, c, &type);
else {
output_w (w, list, type);
all |= type;
}
} else if (c == '\"') {
inquote = 1;
} else {
output_c (w, c, &type);
}
}
} while (c);
g_string_free (w, TRUE);
words->len = list->len;
words->words = (struct _camel_search_word **) list->pdata;
words->type = all;
g_ptr_array_free (list, FALSE);
return words;
}
/* Takes an existing 'words' list, and converts it to another consisting
* of only simple words, with any punctuation, etc stripped. */
struct _camel_search_words *
camel_search_words_simple (struct _camel_search_words *wordin)
{
gint i;
const guchar *ptr, *start, *last;
gint type = CAMEL_SEARCH_WORD_SIMPLE, all = 0;
GPtrArray *list = g_ptr_array_new ();
struct _camel_search_word *word;
struct _camel_search_words *words;
guint32 c;
words = g_malloc0 (sizeof (*words));
for (i = 0; i < wordin->len; i++) {
if ((wordin->words[i]->type & CAMEL_SEARCH_WORD_COMPLEX) == 0) {
word = g_malloc0 (sizeof (*word));
word->type = wordin->words[i]->type;
word->word = g_strdup (wordin->words[i]->word);
g_ptr_array_add (list, word);
} else {
ptr = (const guchar *) wordin->words[i]->word;
start = last = ptr;
do {
c = camel_utf8_getc (&ptr);
if (c == 0 || !g_unichar_isalnum (c)) {
if (last > start) {
word = g_malloc0 (sizeof (*word));
word->word = g_strndup ((gchar *) start, last - start);
word->type = type;
g_ptr_array_add (list, word);
all |= type;
type = CAMEL_SEARCH_WORD_SIMPLE;
}
start = ptr;
}
if (c > 0x80)
type = CAMEL_SEARCH_WORD_8BIT;
last = ptr;
} while (c);
}
}
words->len = list->len;
words->words = (struct _camel_search_word **) list->pdata;
words->type = all;
g_ptr_array_free (list, FALSE);
return words;
}
/* Releases a word list: every word's text, every word record, the array
 * of word pointers and finally the list structure itself.  words must not
 * be NULL. */
void
camel_search_words_free (struct _camel_search_words *words)
{
	gint idx = 0;

	while (idx < words->len) {
		struct _camel_search_word *entry = words->words[idx];

		g_free (entry->word);
		g_free (entry);
		idx++;
	}

	g_free (words->words);
	g_free (words);
}