#--------------------------------------------------------------------
# Copyright (c) 1999-2006, International Business Machines
# Corporation and others. All Rights Reserved.
#--------------------------------------------------------------------
# $Source: /xsrl/Nsvn/icu/icuhtml/icu.sf.net/docs/papers/translit/Transliterator_Latin_InterIndic.txt,v $
# $Date: 2006/03/10 22:18:36 $
# $Revision: 1.1 $
#--------------------------------------------------------------------

# Latin-InterIndic
#
# Rules mapping romanized (Latin + combining diacritics) text onto the
# private-use "InterIndic" code points \ue001-\ue07f defined below.
#
# NOTE(review): this file was restored from a copy whose line breaks had
# been lost.  Because '#' comments run to end of line, statement boundaries
# matter; where a '#' preceded several statements, only the first statement
# was taken to be commented out.  Confirm against the upstream ICU file.

#:: NFD;

#====================================================================
# Variable definitions: InterIndic code points
#====================================================================

#\ue000 reserved
#consonants
$chandrabindu=\ue001;
$anusvara=\ue002;
$visarga=\ue003;
#\ue004 reserved

# w-prefixed vowel variables ($wa, $waa, ...) represent the stand-alone form
$wa=\ue005;
$waa=\ue006;
$wi=\ue007;
$wii=\ue008;
$wu=\ue009;
$wuu=\ue00a;
$wr=\ue00b;
$wl=\ue00c;
$wce=\ue00d; # LETTER CANDRA E
$wse=\ue00e; # LETTER SHORT E
$we=\ue00f; # \u090f LETTER E
$wai=\ue010;
$wco=\ue011; # LETTER CANDRA O
$wso=\ue012; # LETTER SHORT O
$wo=\ue013; # \u0913 LETTER O
$wau=\ue014;

$ka=\ue015;
$kha=\ue016;
$ga=\ue017;
$gha=\ue018;
$nga=\ue019;
$ca=\ue01a;
$cha=\ue01b;
$ja=\ue01c;
$jha=\ue01d;
$nya=\ue01e;
$tta=\ue01f;
$ttha=\ue020;
$dda=\ue021;
$ddha=\ue022;
$nna=\ue023;
$ta=\ue024;
$tha=\ue025;
$da=\ue026;
$dha=\ue027;
$na=\ue028;
$ena=\ue029; #compatibility
$pa=\ue02a;
$pha=\ue02b;
$ba=\ue02c;
$bha=\ue02d;
$ma=\ue02e;
$ya=\ue02f;
$ra=\ue030;
$rra=\ue031;
$la=\ue032;
$lla=\ue033;
$ela=\ue034; #compatibility
$va=\ue035;
$sha=\ue036;
$ssa=\ue037;
$sa=\ue038;
$ha=\ue039;
#\u093a Reserved
#\u093b Reserved
$nukta=\ue03c;
$avagraha=\ue03d; # SIGN AVAGRAHA

# unprefixed vowel variables ($aa, $i, ...) represent the dependent form
$aa=\ue03e;
$i=\ue03f;
$ii=\ue040;
$u=\ue041;
$uu=\ue042;
$rh=\ue043;
$lh=\ue044;
$ce=\ue045; #VOWEL SIGN CANDRA E
$se=\ue046; #VOWEL SIGN SHORT E
$e=\ue047;
$ai=\ue048;
$co=\ue049; # VOWEL SIGN CANDRA O
$so=\ue04a; # VOWEL SIGN SHORT O
$o=\ue04b; # \u094b
$au=\ue04c;
$virama=\ue04d;
# \u094e Reserved
# \u094f Reserved
$om = \ue050; # OM
# \u0951>; # UNMAPPED STRESS SIGN UDATTA
# \u0952>; # UNMAPPED STRESS SIGN ANUDATTA
# \u0953>; # UNMAPPED GRAVE ACCENT
# \u0954>; # UNMAPPED ACUTE ACCENT
$lm = \ue055;# Telugu Length Mark
$ailm=\ue056;# AI Length Mark
$aulm=\ue057;# AU Length Mark

#urdu compatibility forms
$uka=\ue058;
$ukha=\ue059;
$ugha=\ue05a;
$ujha=\ue05b;
$uddha=\ue05c;
$udha=\ue05d;
$ufa=\ue05e;
$uya=\ue05f;

$wrr=\ue060;
$wll=\ue061;
$rrh=\ue062;
$llh=\ue063;
$danda=\ue064;
$doubleDanda=\ue065;

$zero=\ue066; # DIGIT ZERO
$one=\ue067; # DIGIT ONE
$two=\ue068; # DIGIT TWO
$three=\ue069; # DIGIT THREE
$four=\ue06a; # DIGIT FOUR
$five=\ue06b; # DIGIT FIVE
$six=\ue06c; # DIGIT SIX
$seven=\ue06d; # DIGIT SEVEN
$eight=\ue06e; # DIGIT EIGHT
$nine=\ue06f; # DIGIT NINE

# For all other scripts
$ecp0=\ue070;
$ecp1=\ue071;
$ecp2=\ue072;
$ecp3=\ue073;
$ecp4=\ue074;
$ecp5=\ue075;
$ecp6=\ue076;
$ecp7=\ue077;
$ecp8=\ue078;
$ecp9=\ue079;
$ecpA=\ue07a;
$ecpB=\ue07b;
$ecpC=\ue07c;
$ecpD=\ue07d;
$ecpE=\ue07e;
$ecpF=\ue07f;
# \u0970>; # UNMAPPED ABBREVIATION SIGN

#====================================================================
# Character classes used as rule context
#====================================================================

$depVowelAbove=[\ue03e-\ue040\ue045-\ue04c];
$depVowelBelow=[\ue041-\ue044];
$endThing=[$danda$doubleDanda];

# $x was originally called '&'; $z was '%'
$x=[$virama$aa$ai$au$ii$i$uu$u$rrh$rh$lh$e$o$se$ce$so$co];
$z=[bcdfghjklmnpqrstvwxyz];
# InterIndic consonants, Latin consonant letters, and the \u__15-\u__39 /
# \u__95-\u__b9 ranges of the Indic script blocks listed below.
$consonants=[[$ka-$ha]$z[\u0915-\u0939][\u0995-\u09b9][\u0a15-\u0a39][\u0a95-\u0ab9][\u0b15-\u0b39][\u0b95-\u0bb9][\u0c15-\u0c39][\u0c95-\u0cb9][\u0d15-\u0d39]];

#====================================================================
# Conversion rules (order is significant: earlier rules take precedence)
#====================================================================

\u0315 > $avagraha;
\u0303>$chandrabindu$anusvara;
m\u0310>$chandrabindu;
h\u0323>$visarga;
x>$ka$virama$sa;

# convert to independent forms at start of word or syllable:
# dependent forms for roundtrip
\u0314a\u0304>$aa;
\u0314ai>$ai;
\u0314au>$au;
\u0314ii>$ii;
\u0314i\u0304>$ii;
\u0314i>$i;
\u0314u\u0304>$uu;
\u0314u>$u;
\u0314r\u0325\u0304>$rrh;
\u0314r\u0325>$rh;
\u0314l\u0325\u0304>$llh;
\u0314lh>$lh;
\u0314l\u0325>$lh;
\u0314e\u0304>$e;
\u0314o\u0304>$o;
\u0314a>;
\u0314e\u0306>$ce;
\u0314o\u0306>$co;
\u0314e>$se;
\u0314o>$so;

# preceded by consonants
$consonants{ a\u0304>$aa;
$consonants{ ai>$ai;
$consonants{ au>$au;
$consonants{ ii>$ii;
$consonants{ i\u0304>$ii;
$consonants{ i>$i;
$consonants{ u\u0304>$uu;
$consonants{ u>$u;
$consonants{ r\u0325\u0304>$rrh;
$consonants{ r\u0325a>$rh;
$consonants{ r\u0325>$rh;
$consonants{ l\u0325\u0304>$llh;
$consonants{ lh>$lh;
$consonants{ l\u0325>$lh;
$consonants{ e\u0304>$e;
$consonants{ o\u0304>$o;
$consonants{ e\u0306>$ce;
$consonants{ o\u0306>$co;
$consonants{ e>$se;
$consonants{ o>$so;

# e.g. keai -> {ka}{e}{wai}; k'ai -> {ka}{wai}; (ai) -> ({wai})
a\u0304>$waa;
ai>$wai;
au>$wau;
i\u0304>$wii;
i>$wi;
u\u0304>$wuu;
u>$wu;
r\u0325\u0304>$wrr;
r\u0325>$wr;
l\u0325\u0304>$wll;
lh>$wl;
l\u0325>$wl;
e\u0304>$we;
o\u0304>$wo;
a>$wa;
e\u0306>$wce;
o\u0306>$wco;
e>$wse;
''om>$om;
o>$wso;

# rules for anusvara
n}r\u0325 > $na|$virama;
n}l\u0325 > $na|$virama;
n}na > $na|$virama;
n\u0307}[kg] > $anusvara;
n\u0307}n\u0307 > $anusvara;
n\u0304}[cj] > $anusvara;
n\u0304}n\u0303 > $anusvara;
n\u0323}[tdn]\u0323 > $anusvara;
n}[tdn] > $anusvara;
m}[pbm] > $anusvara;
n}[ylvshr] > $anusvara;
m\u0307 > $anusvara;

# Consonant rules: each emits <consonant>|$virama.  The '|' places the
# cursor before the virama so the "$virama ..." rules further down can
# consume it together with a following vowel.

#urdu compatibility
q>$uka|$virama;
k\u0331h\u0331>$ukha |$virama;
g\u0307> $ugha | $virama;
z > $ujha |$virama;
f > $ufa|$virama;

# dev
y\u0307>$uya|$virama;
l\u0331>$ela|$virama;
n\u0331>$ena|$virama;
n\u0307>$nga|$virama;
n\u0303>$nya|$virama;
n\u0323>$nna|$virama;
t\u0323h>$ttha|$virama;
t\u0323>$tta|$virama;
r\u0323h>$udha|$virama;
r\u0323>$uddha|$virama;
d\u0323h>$ddha|$virama;
d\u0323>$dda|$virama;
kh>$kha|$virama;
k>$ka|$virama;
gh>$gha|$virama;
g>$ga|$virama;
ch>$cha|$virama;
c>$ca|$virama;
jh>$jha|$virama;
j>$ja|$virama;
ny>$nya|$virama;
tth>$ttha|$virama;
ddh>$ddha|$virama;
th>$tha|$virama;
t>$ta|$virama;
dh>$dha|$virama;
d>$da|$virama;
n>$na|$virama;
ph>$pha|$virama;
p>$pa|$virama;
bh>$bha|$virama;
b>$ba|$virama;
m>$ma|$virama;
y>$ya|$virama;
r\u0331>$rra|$virama;
r>$ra|$virama;
l\u0323>$lla|$virama;
l>$la|$virama;
v>$va|$virama;
w>$va|$virama;
sh>$sha|$virama;
ss>$ssa|$virama;
s\u0323>$ssa|$virama;
s\u0301>$sha|$virama;
s>$sa|$virama;
h>$ha|$virama;

'.'>$danda;
$danda'.'>$doubleDanda;
$depVowelAbove{'~'>$anusvara;
$depVowelBelow{'~'>$chandrabindu;

# convert to dependent forms after consonant with no vowel:
# e.g. kai -> {ka}{virama}ai -> {ka}{ai}
#$virama aa>$aa;
$virama a\u0304>$aa;
$virama ai>$ai;
$virama au>$au;
$virama ii>$ii;
$virama i\u0304>$ii;
$virama i>$i;
#$virama uu>$uu;
$virama u\u0304>$uu;
$virama u>$u;
#$virama rrh>$rrh;
$virama r\u0325\u0304>$rrh;
#$virama rh>$rh;
$virama r\u0325a>$rh;
$virama r\u0325>$rh;
$virama l\u0325\u0304>$llh;
$virama lh>$lh;
$virama l\u0325>$lh;
$virama e\u0304>$e;
$virama o\u0304>$o;
$virama a>;
$virama e\u0306>$ce;
$virama o\u0306>$co;
$virama e>$se;
$virama o>$so;

# otherwise convert independent forms when separated by ': k'ai -> {ka}{virama}{wai}
#$virama''aa>$waa;
$virama''a\u0304>$waa;
$virama''ai>$wai;
$virama''au>$wau;
#$virama''ii>$wii;
$virama''i\u0304>$wii;
$virama''i>$wi;
#$virama''uu>$wuu;
$virama''u\u0304>$wuu;
$virama''u>$wu;
#$virama''rrh>$wrr;
$virama''r\u0325\u0304>$wrr;
#$virama''rh>$wr;
$virama''r\u0325>$wr;
$virama''l\u0325\u0304>$wll;
#$virama''lh>$wl;
$virama''l\u0325>$wl;
$virama''e\u0304>$we;
$virama''o\u0304>$wo;
$virama''a>$wa;
$virama''e\u0306>$wce;
$virama''o\u0306>$wco;
$virama''e>$wse;
$virama''o>$wso;

# no virama
''a\u0304>$waa;
''ai>$wai;
''au>$wau;
''i\u0304>$wii;
''i>$wi;
''u\u0304>$wuu;
''u>$wu;
''r\u0325\u0304>$wrr;
''r\u0325>$wr;
''l\u0325\u0304>$wll;
''l\u0325>$wl;
''e\u0304>$we;
''o\u0304>$wo;
''a>$wa;
''e\u0306>$wce;
''o\u0306>$wco;
''e>$wse;
''o>$wso;

# virama handling before a following consonant letter, a space, or a danda
$virama } [$z] > $virama;
$virama } ' ' > $virama ;
$virama}$endThing>;

# digits
0>$zero;
1>$one;
2>$two;
3>$three;
4>$four;
5>$five;
6>$six;
7>$seven;
8>$eight;
9>$nine;

# drop any remaining apostrophe separator
''>;

#:: NFC (NFD) ;