diff options
Diffstat (limited to 'gcc/spellcheck.c')
-rw-r--r-- | gcc/spellcheck.c | 136 |
1 files changed, 136 insertions, 0 deletions
diff --git a/gcc/spellcheck.c b/gcc/spellcheck.c new file mode 100644 index 00000000000..31ce3224546 --- /dev/null +++ b/gcc/spellcheck.c @@ -0,0 +1,136 @@ +/* Find near-matches for strings and identifiers. + Copyright (C) 2015 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "tree.h" +#include "spellcheck.h" + +/* The Levenshtein distance is an "edit-distance": the minimal + number of one-character insertions, removals or substitutions + that are needed to change one string into another. + + This implementation uses the Wagner-Fischer algorithm. */ + +static edit_distance_t +levenshtein_distance (const char *s, int len_s, + const char *t, int len_t) +{ + const bool debug = false; + + if (debug) + { + printf ("s: \"%s\" (len_s=%i)\n", s, len_s); + printf ("t: \"%s\" (len_t=%i)\n", t, len_t); + } + + if (len_s == 0) + return len_t; + if (len_t == 0) + return len_s; + + /* We effectively build a matrix where each (i, j) contains the + Levenshtein distance between the prefix strings s[0:j] + and t[0:i]. + Rather than actually build an (len_t + 1) * (len_s + 1) matrix, + we simply keep track of the last row, v0 and a new row, v1, + which avoids an (len_t + 1) * (len_s + 1) allocation and memory accesses + in favor of two (len_s + 1) allocations. These could potentially be + statically-allocated if we impose a maximum length on the + strings of interest. */ + edit_distance_t *v0 = new edit_distance_t[len_s + 1]; + edit_distance_t *v1 = new edit_distance_t[len_s + 1]; + + /* The first row is for the case of an empty target string, which + we can reach by deleting every character in the source string. */ + for (int i = 0; i < len_s + 1; i++) + v0[i] = i; + + /* Build successive rows. */ + for (int i = 0; i < len_t; i++) + { + if (debug) + { + printf ("i:%i v0 = ", i); + for (int j = 0; j < len_s + 1; j++) + printf ("%i ", v0[j]); + printf ("\n"); + } + + /* The initial column is for the case of an empty source string; we + can reach prefixes of the target string of length i + by inserting i characters. */ + v1[0] = i + 1; + + /* Build the rest of the row by considering neighbours to + the north, west and northwest. */ + for (int j = 0; j < len_s; j++) + { + edit_distance_t cost = (s[j] == t[i] ? 0 : 1); + edit_distance_t deletion = v1[j] + 1; + edit_distance_t insertion = v0[j + 1] + 1; + edit_distance_t substitution = v0[j] + cost; + edit_distance_t cheapest = MIN (deletion, insertion); + cheapest = MIN (cheapest, substitution); + v1[j + 1] = cheapest; + } + + /* Prepare to move on to next row. */ + for (int j = 0; j < len_s + 1; j++) + v0[j] = v1[j]; + } + + if (debug) + { + printf ("final v1 = "); + for (int j = 0; j < len_s + 1; j++) + printf ("%i ", v1[j]); + printf ("\n"); + } + + edit_distance_t result = v1[len_s]; + delete[] v0; + delete[] v1; + return result; +} + +/* Calculate Levenshtein distance between two nil-terminated strings. + This exists purely for the unit tests. */ + +edit_distance_t +levenshtein_distance (const char *s, const char *t) +{ + return levenshtein_distance (s, strlen (s), t, strlen (t)); +} + +/* Calculate Levenshtein distance between two identifiers. */ + +edit_distance_t +levenshtein_distance (tree ident_s, tree ident_t) +{ + gcc_assert (TREE_CODE (ident_s) == IDENTIFIER_NODE); + gcc_assert (TREE_CODE (ident_t) == IDENTIFIER_NODE); + + return levenshtein_distance (IDENTIFIER_POINTER (ident_s), + IDENTIFIER_LENGTH (ident_s), + IDENTIFIER_POINTER (ident_t), + IDENTIFIER_LENGTH (ident_t)); +} |