diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2007-10-15 21:36:50 +0000 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2007-10-15 21:36:50 +0000 |
commit | 5fcb079858bb392e87067b5526e9df950db38024 (patch) | |
tree | 4ffb764af092be94fbe0e033dce2f492e6c937f7 /contrib/dict_int | |
parent | fb631dba2a3c2c183bb99f2098491ecf96fb6664 (diff) | |
download | postgresql-5fcb079858bb392e87067b5526e9df950db38024.tar.gz |
Add sample text search dictionary templates and parsers, to replace the
hard-to-maintain textual examples currently in the SGML docs. From
Sergey Karpov.
Diffstat (limited to 'contrib/dict_int')
-rw-r--r-- | contrib/dict_int/Makefile | 19 | ||||
-rw-r--r-- | contrib/dict_int/README.dict_int | 41 | ||||
-rw-r--r-- | contrib/dict_int/dict_int.c | 99 | ||||
-rw-r--r-- | contrib/dict_int/dict_int.sql.in | 29 | ||||
-rw-r--r-- | contrib/dict_int/expected/dict_int.out | 308 | ||||
-rw-r--r-- | contrib/dict_int/sql/dict_int.sql | 61 | ||||
-rw-r--r-- | contrib/dict_int/uninstall_dict_int.sql | 9 |
7 files changed, 566 insertions, 0 deletions
diff --git a/contrib/dict_int/Makefile b/contrib/dict_int/Makefile new file mode 100644 index 0000000000..4e03a69a6e --- /dev/null +++ b/contrib/dict_int/Makefile @@ -0,0 +1,19 @@ +# $PostgreSQL: pgsql/contrib/dict_int/Makefile,v 1.1 2007/10/15 21:36:50 tgl Exp $ + +MODULE_big = dict_int +OBJS = dict_int.o +DATA_built = dict_int.sql +DATA = uninstall_dict_int.sql +DOCS = README.dict_int +REGRESS = dict_int + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/dict_int +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/dict_int/README.dict_int b/contrib/dict_int/README.dict_int new file mode 100644 index 0000000000..5883c1c2f5 --- /dev/null +++ b/contrib/dict_int/README.dict_int @@ -0,0 +1,41 @@ +Dictionary for integers +======================= + +The motivation for this example dictionary is to control the indexing of +integers (signed and unsigned), and, consequently, to minimize the number of +unique words which greatly affect the performance of searching. + +* Configuration + +The dictionary accepts two options: + + - The MAXLEN parameter specifies the maximum length (number of digits) + allowed in an integer word. The default value is 6. + + - The REJECTLONG parameter specifies if an overlength integer should be + truncated or ignored. If REJECTLONG=FALSE (default), the dictionary returns + the first MAXLEN digits of the integer. If REJECTLONG=TRUE, the + dictionary treats an overlength integer as a stop word, so that it will + not be indexed. + +* Usage + +1. Compile and install + +2. Load dictionary + + psql mydb < dict_int.sql + +3. Test it + + mydb# select ts_lexize('intdict', '12345678'); + ts_lexize + ----------- + {123456} + +4. Change its options as you wish + + mydb# ALTER TEXT SEARCH DICTIONARY intdict (MAXLEN = 4, REJECTLONG = true); + ALTER TEXT SEARCH DICTIONARY + +That's all. 
diff --git a/contrib/dict_int/dict_int.c b/contrib/dict_int/dict_int.c
new file mode 100644
index 0000000000..85d45491cc
--- /dev/null
+++ b/contrib/dict_int/dict_int.c
@@ -0,0 +1,122 @@
+/*-------------------------------------------------------------------------
+ *
+ * dict_int.c
+ *	  Text search dictionary for integers
+ *
+ * Copyright (c) 2007, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL: pgsql/contrib/dict_int/dict_int.c,v 1.1 2007/10/15 21:36:50 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "commands/defrem.h"
+#include "fmgr.h"
+#include "tsearch/ts_public.h"
+
+PG_MODULE_MAGIC;
+
+
+/* Settings built by dintdict_init and read by dintdict_lexize. */
+typedef struct {
+	int maxlen;
+	bool rejectlong;
+} DictInt;
+
+
+PG_FUNCTION_INFO_V1(dintdict_init);
+Datum dintdict_init(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(dintdict_lexize);
+Datum dintdict_lexize(PG_FUNCTION_ARGS);
+
+/*
+ * dintdict_init
+ *
+ * Build the DictInt settings from the option list passed as argument 0.
+ * Recognized options (matched case-insensitively) are MAXLEN and REJECTLONG;
+ * any other option name raises an error.  MAXLEN must be at least 1, because
+ * dintdict_lexize writes a terminator at offset maxlen of the copied word.
+ *
+ * Returns a pointer to the palloc'd DictInt.
+ */
+Datum
+dintdict_init(PG_FUNCTION_ARGS)
+{
+	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
+	DictInt    *d;
+	ListCell   *l;
+
+	d = (DictInt *) palloc0(sizeof(DictInt));
+	d->maxlen = 6;
+	d->rejectlong = false;
+
+	foreach(l, dictoptions)
+	{
+		DefElem    *defel = (DefElem *) lfirst(l);
+
+		if (pg_strcasecmp(defel->defname, "MAXLEN") == 0)
+		{
+			d->maxlen = atoi(defGetString(defel));
+
+			/* atoi yields 0 for non-numeric text, so this rejects that too */
+			if (d->maxlen < 1)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("maxlen value has to be >= 1")));
+		}
+		else if (pg_strcasecmp(defel->defname, "REJECTLONG") == 0)
+		{
+			d->rejectlong = defGetBoolean(defel);
+		}
+		else
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("unrecognized intdict parameter: \"%s\"",
+							defel->defname)));
+		}
+	}
+
+	PG_RETURN_POINTER(d);
+}
+
+/*
+ * dintdict_lexize
+ *
+ * Argument 0 is the DictInt from dintdict_init, argument 1 the input
+ * characters, and argument 2 their length in bytes.
+ *
+ * Returns a two-element TSLexeme array whose second entry has a NULL lexeme.
+ * The first entry holds a copy of the input, cut to its first maxlen bytes
+ * when it is longer, or a NULL lexeme when the input is longer than maxlen
+ * and rejectlong is set.
+ */
+Datum
+dintdict_lexize(PG_FUNCTION_ARGS)
+{
+	DictInt    *d = (DictInt *) PG_GETARG_POINTER(0);
+	char	   *in = (char *) PG_GETARG_POINTER(1);
+	int			len = PG_GETARG_INT32(2);
+	TSLexeme   *res;
+
+	/* zeroed, so TSLexeme fields we never assign are not left as garbage */
+	res = palloc0(sizeof(TSLexeme) * 2);
+
+	if (len > d->maxlen && d->rejectlong)
+	{
+		/* reject by returning an array with no lexemes; nothing to copy */
+		PG_RETURN_POINTER(res);
+	}
+
+	res[0].lexeme = pnstrdup(in, len);
+	if (len > d->maxlen)
+	{
+		/* trim integer to its first maxlen bytes */
+		res[0].lexeme[d->maxlen] = '\0';
+	}
+
+	PG_RETURN_POINTER(res);
+}
diff --git a/contrib/dict_int/dict_int.sql.in b/contrib/dict_int/dict_int.sql.in
new file mode 100644
index 0000000000..0bd97a83e2
--- /dev/null
+++ b/contrib/dict_int/dict_int.sql.in
@@ -0,0 +1,29 @@
+-- $PostgreSQL: pgsql/contrib/dict_int/dict_int.sql.in,v 1.1 2007/10/15 21:36:50 tgl Exp $
+
+-- Adjust this setting to control where the objects get created.
+SET search_path = public;
+
+BEGIN;
+
+CREATE FUNCTION dintdict_init(internal)
+	RETURNS internal
+	AS 'MODULE_PATHNAME'
+	LANGUAGE C STRICT;
+
+CREATE FUNCTION dintdict_lexize(internal, internal, internal, internal)
+	RETURNS internal
+	AS 'MODULE_PATHNAME'
+	LANGUAGE C STRICT;
+
+CREATE TEXT SEARCH TEMPLATE intdict_template (
+	LEXIZE = dintdict_lexize,
+	INIT = dintdict_init
+);
+
+CREATE TEXT SEARCH DICTIONARY intdict (
+	TEMPLATE = intdict_template
+);
+
+COMMENT ON TEXT SEARCH DICTIONARY intdict IS 'dictionary for integers';
+
+END;
diff --git a/contrib/dict_int/expected/dict_int.out b/contrib/dict_int/expected/dict_int.out
new file mode 100644
index 0000000000..7feb493e15
--- /dev/null
+++ b/contrib/dict_int/expected/dict_int.out
@@ -0,0 +1,308 @@
+--
+-- first, define the datatype. Turn off echoing so that expected file
+-- does not depend on contents of this file.
+-- +SET client_min_messages = warning; +\set ECHO none +RESET client_min_messages; +--lexize +select ts_lexize('intdict', '511673'); + ts_lexize +----------- + {511673} +(1 row) + +select ts_lexize('intdict', '129'); + ts_lexize +----------- + {129} +(1 row) + +select ts_lexize('intdict', '40865854'); + ts_lexize +----------- + {408658} +(1 row) + +select ts_lexize('intdict', '952'); + ts_lexize +----------- + {952} +(1 row) + +select ts_lexize('intdict', '654980341'); + ts_lexize +----------- + {654980} +(1 row) + +select ts_lexize('intdict', '09810106'); + ts_lexize +----------- + {098101} +(1 row) + +select ts_lexize('intdict', '14262713'); + ts_lexize +----------- + {142627} +(1 row) + +select ts_lexize('intdict', '6532082986'); + ts_lexize +----------- + {653208} +(1 row) + +select ts_lexize('intdict', '0150061'); + ts_lexize +----------- + {015006} +(1 row) + +select ts_lexize('intdict', '7778'); + ts_lexize +----------- + {7778} +(1 row) + +select ts_lexize('intdict', '9547'); + ts_lexize +----------- + {9547} +(1 row) + +select ts_lexize('intdict', '753395478'); + ts_lexize +----------- + {753395} +(1 row) + +select ts_lexize('intdict', '647652'); + ts_lexize +----------- + {647652} +(1 row) + +select ts_lexize('intdict', '6988655574'); + ts_lexize +----------- + {698865} +(1 row) + +select ts_lexize('intdict', '1279'); + ts_lexize +----------- + {1279} +(1 row) + +select ts_lexize('intdict', '1266645909'); + ts_lexize +----------- + {126664} +(1 row) + +select ts_lexize('intdict', '7594193969'); + ts_lexize +----------- + {759419} +(1 row) + +select ts_lexize('intdict', '16928207'); + ts_lexize +----------- + {169282} +(1 row) + +select ts_lexize('intdict', '196850350328'); + ts_lexize +----------- + {196850} +(1 row) + +select ts_lexize('intdict', '22026985592'); + ts_lexize +----------- + {220269} +(1 row) + +select ts_lexize('intdict', '2063765'); + ts_lexize +----------- + {206376} +(1 row) + +select ts_lexize('intdict', '242387310'); + ts_lexize 
+----------- + {242387} +(1 row) + +select ts_lexize('intdict', '93595'); + ts_lexize +----------- + {93595} +(1 row) + +select ts_lexize('intdict', '9374'); + ts_lexize +----------- + {9374} +(1 row) + +select ts_lexize('intdict', '996969'); + ts_lexize +----------- + {996969} +(1 row) + +select ts_lexize('intdict', '353595982'); + ts_lexize +----------- + {353595} +(1 row) + +select ts_lexize('intdict', '925860'); + ts_lexize +----------- + {925860} +(1 row) + +select ts_lexize('intdict', '11848378337'); + ts_lexize +----------- + {118483} +(1 row) + +select ts_lexize('intdict', '333'); + ts_lexize +----------- + {333} +(1 row) + +select ts_lexize('intdict', '799287416765'); + ts_lexize +----------- + {799287} +(1 row) + +select ts_lexize('intdict', '745939'); + ts_lexize +----------- + {745939} +(1 row) + +select ts_lexize('intdict', '67601305734'); + ts_lexize +----------- + {676013} +(1 row) + +select ts_lexize('intdict', '3361113'); + ts_lexize +----------- + {336111} +(1 row) + +select ts_lexize('intdict', '9033778607'); + ts_lexize +----------- + {903377} +(1 row) + +select ts_lexize('intdict', '7507648'); + ts_lexize +----------- + {750764} +(1 row) + +select ts_lexize('intdict', '1166'); + ts_lexize +----------- + {1166} +(1 row) + +select ts_lexize('intdict', '9360498'); + ts_lexize +----------- + {936049} +(1 row) + +select ts_lexize('intdict', '917795'); + ts_lexize +----------- + {917795} +(1 row) + +select ts_lexize('intdict', '9387894'); + ts_lexize +----------- + {938789} +(1 row) + +select ts_lexize('intdict', '42764329'); + ts_lexize +----------- + {427643} +(1 row) + +select ts_lexize('intdict', '564062'); + ts_lexize +----------- + {564062} +(1 row) + +select ts_lexize('intdict', '5413377'); + ts_lexize +----------- + {541337} +(1 row) + +select ts_lexize('intdict', '060965'); + ts_lexize +----------- + {060965} +(1 row) + +select ts_lexize('intdict', '08273593'); + ts_lexize +----------- + {082735} +(1 row) + +select ts_lexize('intdict', 
'593556010144'); + ts_lexize +----------- + {593556} +(1 row) + +select ts_lexize('intdict', '17988843352'); + ts_lexize +----------- + {179888} +(1 row) + +select ts_lexize('intdict', '252281774'); + ts_lexize +----------- + {252281} +(1 row) + +select ts_lexize('intdict', '313425'); + ts_lexize +----------- + {313425} +(1 row) + +select ts_lexize('intdict', '641439323669'); + ts_lexize +----------- + {641439} +(1 row) + +select ts_lexize('intdict', '314532610153'); + ts_lexize +----------- + {314532} +(1 row) + diff --git a/contrib/dict_int/sql/dict_int.sql b/contrib/dict_int/sql/dict_int.sql new file mode 100644 index 0000000000..3a335f8f3d --- /dev/null +++ b/contrib/dict_int/sql/dict_int.sql @@ -0,0 +1,61 @@ +-- +-- first, define the datatype. Turn off echoing so that expected file +-- does not depend on contents of this file. +-- +SET client_min_messages = warning; +\set ECHO none +\i dict_int.sql +\set ECHO all +RESET client_min_messages; + +--lexize +select ts_lexize('intdict', '511673'); +select ts_lexize('intdict', '129'); +select ts_lexize('intdict', '40865854'); +select ts_lexize('intdict', '952'); +select ts_lexize('intdict', '654980341'); +select ts_lexize('intdict', '09810106'); +select ts_lexize('intdict', '14262713'); +select ts_lexize('intdict', '6532082986'); +select ts_lexize('intdict', '0150061'); +select ts_lexize('intdict', '7778'); +select ts_lexize('intdict', '9547'); +select ts_lexize('intdict', '753395478'); +select ts_lexize('intdict', '647652'); +select ts_lexize('intdict', '6988655574'); +select ts_lexize('intdict', '1279'); +select ts_lexize('intdict', '1266645909'); +select ts_lexize('intdict', '7594193969'); +select ts_lexize('intdict', '16928207'); +select ts_lexize('intdict', '196850350328'); +select ts_lexize('intdict', '22026985592'); +select ts_lexize('intdict', '2063765'); +select ts_lexize('intdict', '242387310'); +select ts_lexize('intdict', '93595'); +select ts_lexize('intdict', '9374'); +select ts_lexize('intdict', 
'996969'); +select ts_lexize('intdict', '353595982'); +select ts_lexize('intdict', '925860'); +select ts_lexize('intdict', '11848378337'); +select ts_lexize('intdict', '333'); +select ts_lexize('intdict', '799287416765'); +select ts_lexize('intdict', '745939'); +select ts_lexize('intdict', '67601305734'); +select ts_lexize('intdict', '3361113'); +select ts_lexize('intdict', '9033778607'); +select ts_lexize('intdict', '7507648'); +select ts_lexize('intdict', '1166'); +select ts_lexize('intdict', '9360498'); +select ts_lexize('intdict', '917795'); +select ts_lexize('intdict', '9387894'); +select ts_lexize('intdict', '42764329'); +select ts_lexize('intdict', '564062'); +select ts_lexize('intdict', '5413377'); +select ts_lexize('intdict', '060965'); +select ts_lexize('intdict', '08273593'); +select ts_lexize('intdict', '593556010144'); +select ts_lexize('intdict', '17988843352'); +select ts_lexize('intdict', '252281774'); +select ts_lexize('intdict', '313425'); +select ts_lexize('intdict', '641439323669'); +select ts_lexize('intdict', '314532610153'); diff --git a/contrib/dict_int/uninstall_dict_int.sql b/contrib/dict_int/uninstall_dict_int.sql new file mode 100644 index 0000000000..0323ab298e --- /dev/null +++ b/contrib/dict_int/uninstall_dict_int.sql @@ -0,0 +1,9 @@ +SET search_path = public; + +DROP TEXT SEARCH DICTIONARY intdict; + +DROP TEXT SEARCH TEMPLATE intdict_template; + +DROP FUNCTION dintdict_init(internal); + +DROP FUNCTION dintdict_lexize(internal,internal,internal,internal); |