diff options
Diffstat (limited to 'lib/Locale/Script.pm')
-rw-r--r-- | lib/Locale/Script.pm | 528 |
1 files changed, 528 insertions, 0 deletions
diff --git a/lib/Locale/Script.pm b/lib/Locale/Script.pm new file mode 100644 index 0000000000..a7168fe929 --- /dev/null +++ b/lib/Locale/Script.pm @@ -0,0 +1,528 @@ +#----------------------------------------------------------------------- + +=head1 NAME + +Locale::Script - ISO codes for script identification (ISO 15924) + +=head1 SYNOPSIS + + use Locale::Script; + use Locale::Constants; + + $script = code2script('ph'); # 'Phoenician' + $code = script2code('Tibetan'); # 'bo' + $code3 = script2code('Tibetan', + LOCALE_CODE_ALPHA_3); # 'bod' + $codeN = script2code('Tibetan', + LOCALE_CODE_ALPHA_NUMERIC); # 330 + + @codes = all_script_codes(); + @scripts = all_script_names(); + +=cut + +#----------------------------------------------------------------------- + +package Locale::Script; +use strict; +require 5.002; + +#----------------------------------------------------------------------- + +=head1 DESCRIPTION + +The C<Locale::Script> module provides access to the ISO +codes for identifying scripts, as defined in ISO 15924. +For example, Egyptian hieroglyphs are denoted by the two-letter +code 'eg', the three-letter code 'egy', and the numeric code 050. + +You can either access the codes via the conversion routines +(described below), or with the two functions which return lists +of all script codes or all script names. + +There are three different code sets you can use for identifying +scripts: + +=over 4 + +=item B<alpha-2> + +Two letter codes, such as 'bo' for Tibetan. +This code set is identified with the symbol C<LOCALE_CODE_ALPHA_2>. + +=item B<alpha-3> + +Three letter codes, such as 'ell' for Greek. +This code set is identified with the symbol C<LOCALE_CODE_ALPHA_3>. + +=item B<numeric> + +Numeric codes, such as 410 for Hiragana. +This code set is identified with the symbol C<LOCALE_CODE_NUMERIC>. + +=back + +All of the routines take an optional additional argument +which specifies the code set to use. +If not specified, it defaults to the two-letter codes. +This is partly for backwards compatibility (previous versions +of Locale modules only supported the alpha-2 codes), and +partly because they are the most widely used codes. + +The alpha-2 and alpha-3 codes are not case-dependent, +so you can use 'BO', 'Bo', 'bO' or 'bo' for Tibetan. +When a code is returned by one of the functions in +this module, it will always be lower-case. + +=head2 SPECIAL CODES + +The standard defines various special codes. + +=over 4 + +=item * + +The standard reserves codes in the ranges B<qa> - B<qt>, +B<qaa> - B<qat>, and B<900> - B<919>, for private use. + +=item * + +B<zx>, B<zxx>, and B<997>, are the codes for unwritten languages. + +=item * + +B<zy>, B<zyy>, and B<998>, are the codes for an undetermined script. + +=item * + +B<zz>, B<zzz>, and B<999>, are the codes for an uncoded script. + +=back + +The private codes are not recognised by Locale::Script, +but the others are. + +=cut + +#----------------------------------------------------------------------- + +require Exporter; +use Carp; +use Locale::Constants; + + +#----------------------------------------------------------------------- +# Public Global Variables +#----------------------------------------------------------------------- +use vars qw($VERSION @ISA @EXPORT @EXPORT_OK); +$VERSION = sprintf("%d.%02d", q$Revision: 2.0 $ =~ /(\d+)\.(\d+)/); +@ISA = qw(Exporter); +@EXPORT = qw(code2script script2code + all_script_codes all_script_names + script_code2code + LOCALE_CODE_ALPHA_2 LOCALE_CODE_ALPHA_3 LOCALE_CODE_NUMERIC); + +#----------------------------------------------------------------------- +# Private Global Variables +#----------------------------------------------------------------------- +my $CODES = []; +my $COUNTRIES = []; + + +#======================================================================= + +=head1 CONVERSION ROUTINES + +There are three conversion routines: C<code2script()>, C<script2code()>, +and C<script_code2code()>. + +=over 8 + +=item code2script( CODE, [ CODESET ] ) + +This function takes a script code and returns a string +which contains the name of the script identified. +If the code is not a valid script code, as defined by ISO 15924, +then C<undef> will be returned: + + $script = code2script('cy'); # Cyrillic + +=item script2code( STRING, [ CODESET ] ) + +This function takes a script name and returns the corresponding +script code, if such exists. +If the argument could not be identified as a script name, +then C<undef> will be returned: + + $code = script2code('Gothic', LOCALE_CODE_ALPHA_3); + # $code will now be 'gth' + +The case of the script name is not important. +See the section L<KNOWN BUGS AND LIMITATIONS> below. + +=item script_code2code( CODE, CODESET, CODESET ) + +This function takes a script code from one code set, +and returns the corresponding code from another code set. + + $alpha2 = script_code2code('jwi', + LOCALE_CODE_ALPHA_3 => LOCALE_CODE_ALPHA_2); + # $alpha2 will now be 'jw' (Javanese) + +If the code passed is not a valid script code in +the first code set, or if there isn't a code for the +corresponding script in the second code set, +then C<undef> will be returned. + +=back + +=cut + +#======================================================================= +sub code2script +{ + my $code = shift; + my $codeset = @_ > 0 ? shift : LOCALE_CODE_DEFAULT; + + + return undef unless defined $code; + + #------------------------------------------------------------------- + # Make sure the code is in the right form before we use it + # to look up the corresponding script. + # We have to sprintf because the codes are given as 3-digits, + # with leading 0's. Eg 070 for Egyptian demotic. + #------------------------------------------------------------------- + if ($codeset == LOCALE_CODE_NUMERIC) + { + return undef if ($code =~ /\D/); + $code = sprintf("%.3d", $code); + } + else + { + $code = lc($code); + } + + if (exists $CODES->[$codeset]->{$code}) + { + return $CODES->[$codeset]->{$code}; + } + else + { + #--------------------------------------------------------------- + # no such script code! + #--------------------------------------------------------------- + return undef; + } +} + +sub script2code +{ + my $script = shift; + my $codeset = @_ > 0 ? shift : LOCALE_CODE_DEFAULT; + + + return undef unless defined $script; + $script = lc($script); + if (exists $COUNTRIES->[$codeset]->{$script}) + { + return $COUNTRIES->[$codeset]->{$script}; + } + else + { + #--------------------------------------------------------------- + # no such script! + #--------------------------------------------------------------- + return undef; + } +} + +sub script_code2code +{ + (@_ == 3) or croak "script_code2code() takes 3 arguments!"; + + my $code = shift; + my $inset = shift; + my $outset = shift; + my $outcode = shift; + my $script; + + + return undef if $inset == $outset; + $script = code2script($code, $inset); + return undef if not defined $script; + $outcode = script2code($script, $outset); + return $outcode; +} + +#======================================================================= + +=head1 QUERY ROUTINES + +There are two function which can be used to obtain a list of all codes, +or all script names: + +=over 8 + +=item C<all_script_codes ( [ CODESET ] )> + +Returns a list of all two-letter script codes. +The codes are guaranteed to be all lower-case, +and not in any particular order. + +=item C<all_script_names ( [ CODESET ] )> + +Returns a list of all script names for which there is a corresponding +script code in the specified code set. +The names are capitalised, and not returned in any particular order. + +=back + +=cut + +#======================================================================= +sub all_script_codes +{ + my $codeset = @_ > 0 ? shift : LOCALE_CODE_DEFAULT; + + return keys %{ $CODES->[$codeset] }; +} + +sub all_script_names +{ + my $codeset = @_ > 0 ? shift : LOCALE_CODE_DEFAULT; + + return values %{ $CODES->[$codeset] }; +} + + +#----------------------------------------------------------------------- + +=head1 EXAMPLES + +The following example illustrates use of the C<code2script()> function. +The user is prompted for a script code, and then told the corresponding +script name: + + $| = 1; # turn off buffering + + print "Enter script code: "; + chop($code = <STDIN>); + $script = code2script($code, LOCALE_CODE_ALPHA_2); + if (defined $script) + { + print "$code = $script\n"; + } + else + { + print "'$code' is not a valid script code!\n"; + } + + +=head1 KNOWN BUGS AND LIMITATIONS + +=over 4 + +=item * + +When using C<script2code()>, the script name must currently appear +exactly as it does in the source of the module. For example, + + script2code('Egyptian hieroglyphs') + +will return B<eg>, as expected. But the following will all return C<undef>: + + script2code('hieroglyphs') + script2code('Egyptian Hieroglypics') + +If there's need for it, a future version could have variants +for script names. + +=item * + +In the current implementation, all data is read in when the +module is loaded, and then held in memory. +A lazy implementation would be more memory friendly. + +=back + +=head1 SEE ALSO + +=over 4 + +=item Locale::Language + +ISO two letter codes for identification of language (ISO 639). + +=item Locale::Currency + +ISO three letter codes for identification of currencies +and funds (ISO 4217). + +=item Locale::Country + +ISO three letter codes for identification of countries (ISO 3166) + +=item ISO 15924 + +The ISO standard which defines these codes. + +=item http://www.evertype.com/standards/iso15924/ + +Home page for ISO 15924. + + +=back + + +=head1 AUTHOR + +Neil Bowers E<lt>neil@bowers.comE<gt> + +=head1 COPYRIGHT + +Copyright (c) 2002 Neil Bowers. + +This module is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +=cut + +#----------------------------------------------------------------------- + +#======================================================================= +# initialisation code - stuff the DATA into the ALPHA2 hash +#======================================================================= +{ + my ($alpha2, $alpha3, $numeric); + my $script; + + + while (<DATA>) + { + next unless /\S/; + chop; + ($alpha2, $alpha3, $numeric, $script) = split(/:/, $_, 4); + + $CODES->[LOCALE_CODE_ALPHA_2]->{$alpha2} = $script; + $COUNTRIES->[LOCALE_CODE_ALPHA_2]->{"\L$script"} = $alpha2; + + if ($alpha3) + { + $CODES->[LOCALE_CODE_ALPHA_3]->{$alpha3} = $script; + $COUNTRIES->[LOCALE_CODE_ALPHA_3]->{"\L$script"} = $alpha3; + } + + if ($numeric) + { + $CODES->[LOCALE_CODE_NUMERIC]->{$numeric} = $script; + $COUNTRIES->[LOCALE_CODE_NUMERIC]->{"\L$script"} = $numeric; + } + + } +} + +1; + +__DATA__ +am:ama:130:Aramaic +ar:ara:160:Arabic +av:ave:151:Avestan +bh:bhm:300:Brahmi (Ashoka) +bi:bid:372:Buhid +bn:ben:325:Bengali +bo:bod:330:Tibetan +bp:bpm:285:Bopomofo +br:brl:570:Braille +bt:btk:365:Batak +bu:bug:367:Buginese (Makassar) +by:bys:550:Blissymbols +ca:cam:358:Cham +ch:chu:221:Old Church Slavonic +ci:cir:291:Cirth +cm:cmn:402:Cypro-Minoan +co:cop:205:Coptic +cp:cpr:403:Cypriote syllabary +cy:cyr:220:Cyrillic +ds:dsr:250:Deserel (Mormon) +dv:dvn:315:Devanagari (Nagari) +ed:egd:070:Egyptian demotic +eg:egy:050:Egyptian hieroglyphs +eh:egh:060:Egyptian hieratic +el:ell:200:Greek +eo:eos:210:Etruscan and Oscan +et:eth:430:Ethiopic +gl:glg:225:Glagolitic +gm:gmu:310:Gurmukhi +gt:gth:206:Gothic +gu:guj:320:Gujarati +ha:han:500:Han ideographs +he:heb:125:Hebrew +hg:hgl:420:Hangul +hm:hmo:450:Pahawh Hmong +ho:hoo:371:Hanunoo +hr:hrg:410:Hiragana +hu:hun:176:Old Hungarian runic +hv:hvn:175:Kok Turki runic +hy:hye:230:Armenian +iv:ivl:610:Indus Valley +ja:jap:930:(alias for Han + Hiragana + Katakana) +jl:jlg:445:Cherokee syllabary +jw:jwi:360:Javanese +ka:kam:241:Georgian (Mxedruli) +kh:khn:931:(alias for Hangul + Han) +kk:kkn:411:Katakana +km:khm:354:Khmer +kn:kan:345:Kannada +kr:krn:357:Karenni (Kayah Li) +ks:kst:305:Kharoshthi +kx:kax:240:Georgian (Xucuri) +la:lat:217:Latin +lf:laf:215:Latin (Fraktur variant) +lg:lag:216:Latin (Gaelic variant) +lo:lao:356:Lao +lp:lpc:335:Lepcha (Rong) +md:mda:140:Mandaean +me:mer:100:Meroitic +mh:may:090:Mayan hieroglyphs +ml:mlm:347:Malayalam +mn:mon:145:Mongolian +my:mya:350:Burmese +na:naa:400:Linear A +nb:nbb:401:Linear B +og:ogm:212:Ogham +or:ory:327:Oriya +os:osm:260:Osmanya +ph:phx:115:Phoenician +ph:pah:150:Pahlavi +pl:pld:282:Pollard Phonetic +pq:pqd:295:Klingon plQaD +pr:prm:227:Old Permic +ps:pst:600:Phaistos Disk +rn:rnr:211:Runic (Germanic) +rr:rro:620:Rongo-rongo +sa:sar:110:South Arabian +si:sin:348:Sinhala +sj:syj:137:Syriac (Jacobite variant) +sl:slb:440:Unified Canadian Aboriginal Syllabics +sn:syn:136:Syriac (Nestorian variant) +sw:sww:281:Shavian (Shaw) +sy:syr:135:Syriac (Estrangelo) +ta:tam:346:Tamil +tb:tbw:373:Tagbanwa +te:tel:340:Telugu +tf:tfn:120:Tifnagh +tg:tag:370:Tagalog +th:tha:352:Thai +tn:tna:170:Thaana +tw:twr:290:Tengwar +va:vai:470:Vai +vs:vsp:280:Visible Speech +xa:xas:000:Cuneiform, Sumero-Akkadian +xf:xfa:105:Cuneiform, Old Persian +xk:xkn:412:(alias for Hiragana + Katakana) +xu:xug:106:Cuneiform, Ugaritic +yi:yii:460:Yi +zx:zxx:997:Unwritten language +zy:zyy:998:Undetermined script +zz:zzz:999:Uncoded script |