summaryrefslogtreecommitdiff
path: root/lib/Locale/Script.pm
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Locale/Script.pm')
-rw-r--r--lib/Locale/Script.pm528
1 files changed, 528 insertions, 0 deletions
diff --git a/lib/Locale/Script.pm b/lib/Locale/Script.pm
new file mode 100644
index 0000000000..a7168fe929
--- /dev/null
+++ b/lib/Locale/Script.pm
@@ -0,0 +1,528 @@
+#-----------------------------------------------------------------------
+
+=head1 NAME
+
+Locale::Script - ISO codes for script identification (ISO 15924)
+
+=head1 SYNOPSIS
+
+ use Locale::Script;
+ use Locale::Constants;
+
+ $script = code2script('ph'); # 'Phoenician'
+ $code = script2code('Tibetan'); # 'bo'
+ $code3 = script2code('Tibetan',
+ LOCALE_CODE_ALPHA_3); # 'bod'
+ $codeN = script2code('Tibetan',
+ LOCALE_CODE_ALPHA_NUMERIC); # 330
+
+ @codes = all_script_codes();
+ @scripts = all_script_names();
+
+=cut
+
+#-----------------------------------------------------------------------
+
+package Locale::Script;
+use strict;
+require 5.002;
+
+#-----------------------------------------------------------------------
+
+=head1 DESCRIPTION
+
+The C<Locale::Script> module provides access to the ISO
+codes for identifying scripts, as defined in ISO 15924.
+For example, Egyptian hieroglyphs are denoted by the two-letter
+code 'eg', the three-letter code 'egy', and the numeric code 050.
+
+You can either access the codes via the conversion routines
+(described below), or with the two functions which return lists
+of all script codes or all script names.
+
+There are three different code sets you can use for identifying
+scripts:
+
+=over 4
+
+=item B<alpha-2>
+
+Two letter codes, such as 'bo' for Tibetan.
+This code set is identified with the symbol C<LOCALE_CODE_ALPHA_2>.
+
+=item B<alpha-3>
+
+Three letter codes, such as 'ell' for Greek.
+This code set is identified with the symbol C<LOCALE_CODE_ALPHA_3>.
+
+=item B<numeric>
+
+Numeric codes, such as 410 for Hiragana.
+This code set is identified with the symbol C<LOCALE_CODE_NUMERIC>.
+
+=back
+
+All of the routines take an optional additional argument
+which specifies the code set to use.
+If not specified, it defaults to the two-letter codes.
+This is partly for backwards compatibility (previous versions
+of Locale modules only supported the alpha-2 codes), and
+partly because they are the most widely used codes.
+
+The alpha-2 and alpha-3 codes are not case-dependent,
+so you can use 'BO', 'Bo', 'bO' or 'bo' for Tibetan.
+When a code is returned by one of the functions in
+this module, it will always be lower-case.
+
+=head2 SPECIAL CODES
+
+The standard defines various special codes.
+
+=over 4
+
+=item *
+
+The standard reserves codes in the ranges B<qa> - B<qt>,
+B<qaa> - B<qat>, and B<900> - B<919>, for private use.
+
+=item *
+
+B<zx>, B<zxx>, and B<997>, are the codes for unwritten languages.
+
+=item *
+
+B<zy>, B<zyy>, and B<998>, are the codes for an undetermined script.
+
+=item *
+
+B<zz>, B<zzz>, and B<999>, are the codes for an uncoded script.
+
+=back
+
+The private codes are not recognised by Locale::Script,
+but the others are.
+
+=cut
+
+#-----------------------------------------------------------------------
+
+require Exporter;
+use Carp;
+use Locale::Constants;
+
+
+#-----------------------------------------------------------------------
+# Public Global Variables
+#-----------------------------------------------------------------------
+use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
+$VERSION = sprintf("%d.%02d", q$Revision: 2.0 $ =~ /(\d+)\.(\d+)/);
+@ISA = qw(Exporter);
+@EXPORT = qw(code2script script2code
+ all_script_codes all_script_names
+ script_code2code
+ LOCALE_CODE_ALPHA_2 LOCALE_CODE_ALPHA_3 LOCALE_CODE_NUMERIC);
+
+#-----------------------------------------------------------------------
+# Private Global Variables
+#-----------------------------------------------------------------------
+my $CODES = [];
+my $COUNTRIES = [];
+
+
+#=======================================================================
+
+=head1 CONVERSION ROUTINES
+
+There are three conversion routines: C<code2script()>, C<script2code()>,
+and C<script_code2code()>.
+
+=over 8
+
+=item code2script( CODE, [ CODESET ] )
+
+This function takes a script code and returns a string
+which contains the name of the script identified.
+If the code is not a valid script code, as defined by ISO 15924,
+then C<undef> will be returned:
+
+ $script = code2script('cy'); # Cyrillic
+
+=item script2code( STRING, [ CODESET ] )
+
+This function takes a script name and returns the corresponding
+script code, if such exists.
+If the argument could not be identified as a script name,
+then C<undef> will be returned:
+
+ $code = script2code('Gothic', LOCALE_CODE_ALPHA_3);
+ # $code will now be 'gth'
+
+The case of the script name is not important.
+See the section L<KNOWN BUGS AND LIMITATIONS> below.
+
+=item script_code2code( CODE, CODESET, CODESET )
+
+This function takes a script code from one code set,
+and returns the corresponding code from another code set.
+
+ $alpha2 = script_code2code('jwi',
+ LOCALE_CODE_ALPHA_3 => LOCALE_CODE_ALPHA_2);
+ # $alpha2 will now be 'jw' (Javanese)
+
+If the code passed is not a valid script code in
+the first code set, or if there isn't a code for the
+corresponding script in the second code set,
+then C<undef> will be returned.
+
+=back
+
+=cut
+
+#=======================================================================
+sub code2script
+{
+ my $code = shift;
+ my $codeset = @_ > 0 ? shift : LOCALE_CODE_DEFAULT;
+
+
+ return undef unless defined $code;
+
+ #-------------------------------------------------------------------
+ # Make sure the code is in the right form before we use it
+ # to look up the corresponding script.
+ # We have to sprintf because the codes are given as 3-digits,
+ # with leading 0's. Eg 070 for Egyptian demotic.
+ #-------------------------------------------------------------------
+ if ($codeset == LOCALE_CODE_NUMERIC)
+ {
+ return undef if ($code =~ /\D/);
+ $code = sprintf("%.3d", $code);
+ }
+ else
+ {
+ $code = lc($code);
+ }
+
+ if (exists $CODES->[$codeset]->{$code})
+ {
+ return $CODES->[$codeset]->{$code};
+ }
+ else
+ {
+ #---------------------------------------------------------------
+ # no such script code!
+ #---------------------------------------------------------------
+ return undef;
+ }
+}
+
+sub script2code
+{
+ my $script = shift;
+ my $codeset = @_ > 0 ? shift : LOCALE_CODE_DEFAULT;
+
+
+ return undef unless defined $script;
+ $script = lc($script);
+ if (exists $COUNTRIES->[$codeset]->{$script})
+ {
+ return $COUNTRIES->[$codeset]->{$script};
+ }
+ else
+ {
+ #---------------------------------------------------------------
+ # no such script!
+ #---------------------------------------------------------------
+ return undef;
+ }
+}
+
+sub script_code2code
+{
+ (@_ == 3) or croak "script_code2code() takes 3 arguments!";
+
+ my $code = shift;
+ my $inset = shift;
+ my $outset = shift;
+ my $outcode = shift;
+ my $script;
+
+
+ return undef if $inset == $outset;
+ $script = code2script($code, $inset);
+ return undef if not defined $script;
+ $outcode = script2code($script, $outset);
+ return $outcode;
+}
+
+#=======================================================================
+
+=head1 QUERY ROUTINES
+
+There are two function which can be used to obtain a list of all codes,
+or all script names:
+
+=over 8
+
+=item C<all_script_codes ( [ CODESET ] )>
+
+Returns a list of all two-letter script codes.
+The codes are guaranteed to be all lower-case,
+and not in any particular order.
+
+=item C<all_script_names ( [ CODESET ] )>
+
+Returns a list of all script names for which there is a corresponding
+script code in the specified code set.
+The names are capitalised, and not returned in any particular order.
+
+=back
+
+=cut
+
+#=======================================================================
+sub all_script_codes
+{
+ my $codeset = @_ > 0 ? shift : LOCALE_CODE_DEFAULT;
+
+ return keys %{ $CODES->[$codeset] };
+}
+
+sub all_script_names
+{
+ my $codeset = @_ > 0 ? shift : LOCALE_CODE_DEFAULT;
+
+ return values %{ $CODES->[$codeset] };
+}
+
+
+#-----------------------------------------------------------------------
+
+=head1 EXAMPLES
+
+The following example illustrates use of the C<code2script()> function.
+The user is prompted for a script code, and then told the corresponding
+script name:
+
+ $| = 1; # turn off buffering
+
+ print "Enter script code: ";
+ chop($code = <STDIN>);
+ $script = code2script($code, LOCALE_CODE_ALPHA_2);
+ if (defined $script)
+ {
+ print "$code = $script\n";
+ }
+ else
+ {
+ print "'$code' is not a valid script code!\n";
+ }
+
+
+=head1 KNOWN BUGS AND LIMITATIONS
+
+=over 4
+
+=item *
+
+When using C<script2code()>, the script name must currently appear
+exactly as it does in the source of the module. For example,
+
+ script2code('Egyptian hieroglyphs')
+
+will return B<eg>, as expected. But the following will all return C<undef>:
+
+ script2code('hieroglyphs')
+ script2code('Egyptian Hieroglypics')
+
+If there's need for it, a future version could have variants
+for script names.
+
+=item *
+
+In the current implementation, all data is read in when the
+module is loaded, and then held in memory.
+A lazy implementation would be more memory friendly.
+
+=back
+
+=head1 SEE ALSO
+
+=over 4
+
+=item Locale::Language
+
+ISO two letter codes for identification of language (ISO 639).
+
+=item Locale::Currency
+
+ISO three letter codes for identification of currencies
+and funds (ISO 4217).
+
+=item Locale::Country
+
+ISO three letter codes for identification of countries (ISO 3166)
+
+=item ISO 15924
+
+The ISO standard which defines these codes.
+
+=item http://www.evertype.com/standards/iso15924/
+
+Home page for ISO 15924.
+
+
+=back
+
+
+=head1 AUTHOR
+
+Neil Bowers E<lt>neil@bowers.comE<gt>
+
+=head1 COPYRIGHT
+
+Copyright (c) 2002 Neil Bowers.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=cut
+
+#-----------------------------------------------------------------------
+
+#=======================================================================
+# initialisation code - stuff the DATA into the ALPHA2 hash
+#=======================================================================
+{
+ my ($alpha2, $alpha3, $numeric);
+ my $script;
+
+
+ while (<DATA>)
+ {
+ next unless /\S/;
+ chop;
+ ($alpha2, $alpha3, $numeric, $script) = split(/:/, $_, 4);
+
+ $CODES->[LOCALE_CODE_ALPHA_2]->{$alpha2} = $script;
+ $COUNTRIES->[LOCALE_CODE_ALPHA_2]->{"\L$script"} = $alpha2;
+
+ if ($alpha3)
+ {
+ $CODES->[LOCALE_CODE_ALPHA_3]->{$alpha3} = $script;
+ $COUNTRIES->[LOCALE_CODE_ALPHA_3]->{"\L$script"} = $alpha3;
+ }
+
+ if ($numeric)
+ {
+ $CODES->[LOCALE_CODE_NUMERIC]->{$numeric} = $script;
+ $COUNTRIES->[LOCALE_CODE_NUMERIC]->{"\L$script"} = $numeric;
+ }
+
+ }
+}
+
+1;
+
+__DATA__
+am:ama:130:Aramaic
+ar:ara:160:Arabic
+av:ave:151:Avestan
+bh:bhm:300:Brahmi (Ashoka)
+bi:bid:372:Buhid
+bn:ben:325:Bengali
+bo:bod:330:Tibetan
+bp:bpm:285:Bopomofo
+br:brl:570:Braille
+bt:btk:365:Batak
+bu:bug:367:Buginese (Makassar)
+by:bys:550:Blissymbols
+ca:cam:358:Cham
+ch:chu:221:Old Church Slavonic
+ci:cir:291:Cirth
+cm:cmn:402:Cypro-Minoan
+co:cop:205:Coptic
+cp:cpr:403:Cypriote syllabary
+cy:cyr:220:Cyrillic
+ds:dsr:250:Deserel (Mormon)
+dv:dvn:315:Devanagari (Nagari)
+ed:egd:070:Egyptian demotic
+eg:egy:050:Egyptian hieroglyphs
+eh:egh:060:Egyptian hieratic
+el:ell:200:Greek
+eo:eos:210:Etruscan and Oscan
+et:eth:430:Ethiopic
+gl:glg:225:Glagolitic
+gm:gmu:310:Gurmukhi
+gt:gth:206:Gothic
+gu:guj:320:Gujarati
+ha:han:500:Han ideographs
+he:heb:125:Hebrew
+hg:hgl:420:Hangul
+hm:hmo:450:Pahawh Hmong
+ho:hoo:371:Hanunoo
+hr:hrg:410:Hiragana
+hu:hun:176:Old Hungarian runic
+hv:hvn:175:Kok Turki runic
+hy:hye:230:Armenian
+iv:ivl:610:Indus Valley
+ja:jap:930:(alias for Han + Hiragana + Katakana)
+jl:jlg:445:Cherokee syllabary
+jw:jwi:360:Javanese
+ka:kam:241:Georgian (Mxedruli)
+kh:khn:931:(alias for Hangul + Han)
+kk:kkn:411:Katakana
+km:khm:354:Khmer
+kn:kan:345:Kannada
+kr:krn:357:Karenni (Kayah Li)
+ks:kst:305:Kharoshthi
+kx:kax:240:Georgian (Xucuri)
+la:lat:217:Latin
+lf:laf:215:Latin (Fraktur variant)
+lg:lag:216:Latin (Gaelic variant)
+lo:lao:356:Lao
+lp:lpc:335:Lepcha (Rong)
+md:mda:140:Mandaean
+me:mer:100:Meroitic
+mh:may:090:Mayan hieroglyphs
+ml:mlm:347:Malayalam
+mn:mon:145:Mongolian
+my:mya:350:Burmese
+na:naa:400:Linear A
+nb:nbb:401:Linear B
+og:ogm:212:Ogham
+or:ory:327:Oriya
+os:osm:260:Osmanya
+ph:phx:115:Phoenician
+ph:pah:150:Pahlavi
+pl:pld:282:Pollard Phonetic
+pq:pqd:295:Klingon plQaD
+pr:prm:227:Old Permic
+ps:pst:600:Phaistos Disk
+rn:rnr:211:Runic (Germanic)
+rr:rro:620:Rongo-rongo
+sa:sar:110:South Arabian
+si:sin:348:Sinhala
+sj:syj:137:Syriac (Jacobite variant)
+sl:slb:440:Unified Canadian Aboriginal Syllabics
+sn:syn:136:Syriac (Nestorian variant)
+sw:sww:281:Shavian (Shaw)
+sy:syr:135:Syriac (Estrangelo)
+ta:tam:346:Tamil
+tb:tbw:373:Tagbanwa
+te:tel:340:Telugu
+tf:tfn:120:Tifnagh
+tg:tag:370:Tagalog
+th:tha:352:Thai
+tn:tna:170:Thaana
+tw:twr:290:Tengwar
+va:vai:470:Vai
+vs:vsp:280:Visible Speech
+xa:xas:000:Cuneiform, Sumero-Akkadian
+xf:xfa:105:Cuneiform, Old Persian
+xk:xkn:412:(alias for Hiragana + Katakana)
+xu:xug:106:Cuneiform, Ugaritic
+yi:yii:460:Yi
+zx:zxx:997:Unwritten language
+zy:zyy:998:Undetermined script
+zz:zzz:999:Uncoded script