diff options
author | simonmar <unknown> | 2005-03-14 15:57:57 +0000 |
---|---|---|
committer | simonmar <unknown> | 2005-03-14 15:57:57 +0000 |
commit | 976352fe3808588b8317eb3c957ce5979548cc8d (patch) | |
tree | 2e1e640883dd6147c0a8f3b97dfc3f5c6a9342d6 | |
parent | 44c3027bdbcafdb81c2d6a180846d77135bcf1fe (diff) | |
download | haskell-976352fe3808588b8317eb3c957ce5979548cc8d.tar.gz |
[project @ 2005-03-14 15:57:57 by simonmar]
Add the script used to generate WCsubst.c
-rw-r--r-- | libraries/base/cbits/ubconfc | 339 |
1 files changed, 339 insertions, 0 deletions
diff --git a/libraries/base/cbits/ubconfc b/libraries/base/cbits/ubconfc new file mode 100644 index 0000000000..6344eccafe --- /dev/null +++ b/libraries/base/cbits/ubconfc @@ -0,0 +1,339 @@ +#!/bin/sh + +# -------------------------------------------------------------------------- +# This is the script to create the unicode chars property table +# Written by Dimitry Golubovsky (dimitry@golubovsky.org) as part +# of the Partial Unicode Support patch +# +# Adopted for use with GHC. +# License: see libraries/base/LICENSE +# +# ------------------------------------------------------------------------- + +# The script reads the file from the standard input, +# and outputs C code into the standard output. +# The C code contains the chars property table, and basic functions +# to access properties. + +# Output the file header + +echo "/*-------------------------------------------------------------------------" +echo "This is an automatically generated file: do not edit" +echo "Generated by `basename $0` at `date`" +echo "-------------------------------------------------------------------------*/" +echo +echo "#include \"WCsubst.h\"" + +# Define structures + +cat <<EOF + +/* Unicode general categories, listed in the same order as in the Unicode + * standard -- this must be the same order as in GHC.Unicode. + */ + +enum { + NUMCAT_LU, /* Letter, Uppercase */ + NUMCAT_LL, /* Letter, Lowercase */ + NUMCAT_LT, /* Letter, Titlecase */ + NUMCAT_LM, /* Letter, Modifier */ + NUMCAT_LO, /* Letter, Other */ + NUMCAT_MN, /* Mark, Non-Spacing */ + NUMCAT_MC, /* Mark, Spacing Combining */ + NUMCAT_ME, /* Mark, Enclosing */ + NUMCAT_ND, /* Number, Decimal */ + NUMCAT_NL, /* Number, Letter */ + NUMCAT_NO, /* Number, Other */ + NUMCAT_PC, /* Punctuation, Connector */ + NUMCAT_PD, /* Punctuation, Dash */ + NUMCAT_PS, /* Punctuation, Open */ + NUMCAT_PE, /* Punctuation, Close */ + NUMCAT_PI, /* Punctuation, Initial quote */ + NUMCAT_PF, /* Punctuation, Final quote */ + NUMCAT_PO, /* Punctuation, Other */ + NUMCAT_SM, /* Symbol, Math */ + NUMCAT_SC, /* Symbol, Currency */ + NUMCAT_SK, /* Symbol, Modifier */ + NUMCAT_SO, /* Symbol, Other */ + NUMCAT_ZS, /* Separator, Space */ + NUMCAT_ZL, /* Separator, Line */ + NUMCAT_ZP, /* Separator, Paragraph */ + NUMCAT_CC, /* Other, Control */ + NUMCAT_CF, /* Other, Format */ + NUMCAT_CS, /* Other, Surrogate */ + NUMCAT_CO, /* Other, Private Use */ + NUMCAT_CN /* Other, Not Assigned */ +}; + +struct _convrule_ +{ + unsigned int category; + unsigned int catnumber; + int possible; + int updist; + int lowdist; + int titledist; +}; + +struct _charblock_ +{ + int start; + int length; + const struct _convrule_ *rule; +}; + +EOF + +# Convert the stdin file to the C table + +awk ' +BEGIN { + FS=";" + catidx=0 + rulidx=0 + blockidx=0 + cblckidx=0 + sblckidx=0 + blockb=-1 + blockl=0 + digs="0123456789ABCDEF" + for(i=0;i<16;i++) + { + hex[substr(digs,i+1,1)]=i; + } +} +function em1(a) +{ + if(a=="") return "-1" + return "0x"a +} +function h2d(a) +{ + l=length(a) + acc=0 + for(i=1;i<=l;i++) + { + acc=acc*16+hex[substr(a,i,1)]; + } + return acc +} +function dumpblock() +{ + blkd=blockb ", " blockl ", &rule" rules[blockr] + blocks[blockidx]=blkd + blockidx++ + if(blockb<=256) lat1idx++ + split(blockr,rsp,",") + if(substr(rsp[3],2,1)=="1") + { + cblcks[cblckidx]=blkd + cblckidx++ + } + if(rsp[1]=="GENCAT_ZS") + { + sblcks[sblckidx]=blkd + sblckidx++ + } + blockb=self + blockl=1 + blockr=rule +} +{ + name=$2 + cat=toupper($3) + self=h2d($1) + up=h2d($13) + low=h2d($14) + title=h2d($15) + convpos=1 + if((up==0)&&(low==0)&&(title==0)) convpos=0 + if(up==0) up=self + if(low==0) low=self + if(title==0) title=self + updist=up-self + lowdist=low-self + titledist=title-self + rule="GENCAT_"cat", NUMCAT_"cat", "((convpos==1)? + ("1, " updist ", " lowdist ", " titledist): + ("0, 0, 0, 0")) + if(cats[cat]=="") + { + cats[cat]=(2^catidx); + catidx++; + } + if(rules[rule]=="") + { + rules[rule]=rulidx; + rulidx++; + } + if(blockb==-1) + { + blockb=self + blockl=1 + blockr=rule + } + else + { + if (index(name,"First>")!=0) + { + dumpblock() + } + else if (index(name,"Last>")!=0) + { + blockl+=(self-blockb) + } + else if((self==blockb+blockl)&&(rule==blockr)) blockl++ + else + { + dumpblock() + } + } +} +END { + dumpblock() + for(c in cats) print "#define GENCAT_"c" "cats[c] + print "#define MAX_UNI_CHAR " self + print "#define NUM_BLOCKS " blockidx + print "#define NUM_CONVBLOCKS " cblckidx + print "#define NUM_SPACEBLOCKS " sblckidx + print "#define NUM_LAT1BLOCKS " lat1idx + print "#define NUM_RULES " rulidx + for(r in rules) + { + printf "static const struct _convrule_ rule" rules[r] "={" r "};\n" + } + print "static const struct _charblock_ allchars[]={" + for(i=0;i<blockidx;i++) + { + printf "\t{" blocks[i] "}" + print (i<(blockidx-1))?",":"" + } + print "};" + print "static const struct _charblock_ convchars[]={" + for(i=0;i<cblckidx;i++) + { + printf "\t{" cblcks[i] "}" + print (i<(cblckidx-1))?",":"" + } + print "};" + print "static const struct _charblock_ spacechars[]={" + for(i=0;i<sblckidx;i++) + { + printf "\t{" sblcks[i] "}" + print (i<(sblckidx-1))?",":"" + } + print "};" +} +' +# Output the C procedures code + +cat <<EOF + +/* + Obtain the reference to character rule by doing + binary search over the specified array of blocks. + To make checkattr shorter, the address of + nullrule is returned if the search fails: + this rule defines no category and no conversion + distances. The compare function returns 0 when + key->start is within the block. Otherwise + result of comparison of key->start and start of the + current block is returned as usual. +*/ + +static const struct _convrule_ nullrule={0,NUMCAT_CN,0,0,0,0}; + +int blkcmp(const void *vk,const void *vb) +{ + const struct _charblock_ *key,*cur; + key=vk; + cur=vb; + if((key->start>=cur->start)&&(key->start<(cur->start+cur->length))) + { + return 0; + } + if(key->start>cur->start) return 1; + return -1; +} + +static const struct _convrule_ *getrule( + const struct _charblock_ *blocks, + int numblocks, + int unichar) +{ + struct _charblock_ key={unichar,1,(void *)0}; + struct _charblock_ *cb=bsearch(&key,blocks,numblocks,sizeof(key),blkcmp); + if(cb==(void *)0) return &nullrule; + return cb->rule; +} + + + +/* + Check whether a character (internal code) has certain attributes. + Attributes (category flags) may be ORed. The function ANDs + character category flags and the mask and returns the result. + If the character belongs to one of the categories requested, + the result will be nonzero. +*/ + +inline static int checkattr(int c,unsigned int catmask) +{ + return (catmask & (getrule(allchars,(c<256)?NUM_LAT1BLOCKS:NUM_BLOCKS,c)->category)); +} + +inline static int checkattr_s(int c,unsigned int catmask) +{ + return (catmask & (getrule(spacechars,NUM_SPACEBLOCKS,c)->category)); +} + +/* + Define predicate functions for some combinations of categories. +*/ + +#define unipred(p,m) \\ +int p(int c) \\ +{ \\ + return checkattr(c,m); \\ +} + +#define unipred_s(p,m) \\ +int p(int c) \\ +{ \\ + return checkattr_s(c,m); \\ +} + +/* + Make these rules as close to Hugs as possible. +*/ + +unipred(u_iswcntrl,GENCAT_CC) +unipred(u_iswprint,~(GENCAT_ZL|GENCAT_ZP|GENCAT_CC|GENCAT_CF|GENCAT_CS|GENCAT_CO)) +unipred_s(u_iswspace,GENCAT_ZS) +unipred(u_iswupper,(GENCAT_LU|GENCAT_LT)) +unipred(u_iswlower,GENCAT_LL) +unipred(u_iswalpha,(GENCAT_LL|GENCAT_LU|GENCAT_LT|GENCAT_LM|GENCAT_LO)) +unipred(u_iswdigit,GENCAT_ND) + +unipred(u_iswalnum,(GENCAT_LT|GENCAT_LU|GENCAT_LL|GENCAT_LM|GENCAT_LO| + GENCAT_MC|GENCAT_ME|GENCAT_MN| + GENCAT_NO|GENCAT_ND|GENCAT_NL)) + +#define caseconv(p,to) \\ +int p(int c) \\ +{ \\ + const struct _convrule_ *rule=getrule(convchars,NUM_CONVBLOCKS,c);\\ + if(rule==&nullrule) return c;\\ + return c+rule->##to;\\ +} + +caseconv(u_towupper,updist) +caseconv(u_towlower,lowdist) +caseconv(u_towtitle,titledist) + +int u_gencat(int c) +{ + return getrule(allchars,NUM_BLOCKS,c)->catnumber; +} + +EOF |