Normalizer.java

Go to the documentation of this file.
// (C) 2003 by Dominique Unruh. GPL

// Reference:
// http://www.unicode.org/unicode/reports/tr15/NormalizerBuilder.java
// http://www.unicode.org/unicode/reports/tr15/Normalizer.java

import java.io.*;

/**
 * Unicode normalization (forms D, C, KD and KC) for UTF-16 code units up to
 * U+FFFF, driven by lookup tables that are loaded lazily from the class-path
 * resource {@code normalizer_data.dat}.
 *
 * <p>All tables are indexed directly by {@code char} value. Surrogate pairs are
 * not combined into supplementary code points; each code unit is looked up on
 * its own.
 */
public class Normalizer {
    /** Normalization form D (canonical decomposition). */
    public static final int D = 2;
    /** Normalization form C (canonical decomposition, then composition). */
    public static final int C = 3;
    /** Normalization form KD (compatibility decomposition). */
    public static final int KD = 4;
    /** Normalization form KC (compatibility decomposition, then composition). */
    public static final int KC = 5;

    /** Decomposition type tag in the data file: canonical decomposition. */
    public static final int NORMAL = 0;
    /** Decomposition type tag in the data file: compatibility decomposition. */
    public static final int COMPATIBILITY = 1;
    /** Decomposition type tag; any tag other than NORMAL yields no composition entry. */
    public static final int EXCLUDE = 2;

    /** Largest code unit the tables can hold. */
    public static final int MAX_UNICODE = 0xFFFF;

    /** Name of the table resource, resolved relative to this class. */
    private static final String DATA_RESOURCE = "normalizer_data.dat";

    // volatile: normalize() and isCombining() read this flag without holding
    // the lock that init() takes, so the write must publish the tables too.
    private static volatile boolean initialized = false;
    /** decompositions[c] is the stored decomposition of c, or null if none. */
    private static String[] decompositions = null;
    /** compatibility[c] is true when c's decomposition is a compatibility one. */
    private static boolean[] compatibility = null;
    /** canonicalClass[c] is the canonical combining class of c (0 = starter). */
    private static int[] canonicalClass = null;
    /** compositions[first][second] is the composite of the pair, or 0 if none. */
    private static char[][] compositions = null;

    /**
     * Loads the normalization tables. Subsequent calls are no-ops once a load
     * has completed successfully.
     *
     * <p>The resource is read as a sequence of records, each consisting of an
     * unsigned 16-bit code unit, a {@code readUTF} string (the decomposition,
     * empty for none), an unsigned byte (canonical class) and an unsigned byte
     * (decomposition type). Reading stops at end of stream.
     *
     * @throws FileNotFoundException if the data resource is not on the class path
     * @throws IOException if reading the resource fails
     * @throws IllegalArgumentException if a canonical decomposition is longer
     *         than two code units
     */
    public static synchronized void init() throws IOException {
        if (initialized) {
            return;
        }
        InputStream raw = Normalizer.class.getResourceAsStream(DATA_RESOURCE);
        if (raw == null) {
            // Without this check the failure surfaces later as an unrelated
            // I/O error from the buffered stream wrapper.
            throw new FileNotFoundException(
                "Normalizer data resource not found: " + DATA_RESOURCE);
        }

        decompositions = new String[MAX_UNICODE + 1];
        compatibility = new boolean[MAX_UNICODE + 1];
        canonicalClass = new int[MAX_UNICODE + 1];
        compositions = new char[MAX_UNICODE + 1][];

        DataInputStream data = new DataInputStream(new BufferedInputStream(raw));
        try {
            while (true) {
                // readUnsignedShort() yields 0..65535, so num always fits the tables.
                int num = data.readUnsignedShort();
                String decomposition = data.readUTF();
                if (decomposition.length() == 0) {
                    decomposition = null;
                }
                decompositions[num] = decomposition;
                canonicalClass[num] = data.readUnsignedByte();
                int type = data.readUnsignedByte();
                compatibility[num] = (type == COMPATIBILITY);

                // Only canonical two-unit decompositions can be recomposed.
                if (type == NORMAL && decomposition != null && decomposition.length() > 1) {
                    if (decomposition.length() > 2) {
                        throw new IllegalArgumentException(
                            "decomposition for " + num + " has length " + decomposition.length());
                    }
                    char c1 = decomposition.charAt(0);
                    char c2 = decomposition.charAt(1);
                    if (compositions[c1] == null) {
                        compositions[c1] = new char[c2 + 0x100];
                    } else if (compositions[c1].length <= c2) {
                        // Grow the row so that index c2 becomes valid, keeping old entries.
                        char[] old = compositions[c1];
                        compositions[c1] = new char[c2 + 0x100];
                        System.arraycopy(old, 0, compositions[c1], 0, old.length);
                    }
                    compositions[c1][c2] = (char) num;
                }
            }
        } catch (EOFException endOfData) {
            // Expected: the file has no record count, so running off the end of
            // the stream is how the read loop terminates.
        } finally {
            data.close();
        }
        initialized = true;
    }

    /**
     * Normalizes a whole string.
     *
     * @param form one of {@link #D}, {@link #C}, {@link #KD}, {@link #KC}
     * @param string the text to normalize
     * @return the normalized text
     * @throws IOException if the tables cannot be loaded
     * @throws IllegalArgumentException if {@code form} is not a known form
     */
    public static String normalize(int form, String string) throws IOException {
        StringBuffer target = new StringBuffer();
        if (normalize(form, string, 0, string.length(), target) != 0) {
            throw new RuntimeException("Internal Error: normalize did not return 0");
        }
        return target.toString();
    }

    /**
     * Moves {@code start} backwards until it points at a code unit whose
     * canonical class is 0, or until it reaches index 0.
     */
    private static int findStarter(String string, int start) {
        while (start > 0) {
            if (canonicalClass[string.charAt(start)] == 0) {
                break;
            }
            start--;
        }
        return start;
    }

    /**
     * Normalizes the range {@code [start, end)} of {@code string} into
     * {@code target}.
     *
     * <p>If the range is empty, {@code target} is cleared and {@code start} is
     * returned unchanged. Otherwise {@code start} is first moved back to the
     * nearest preceding code unit with canonical class 0 (or to index 0), and
     * the text from there to {@code end} is normalized.
     *
     * <p>NOTE: for a non-empty range {@code target} is not cleared first; the
     * result is appended and, for forms C and KC, composition runs over the
     * whole buffer. Pass an empty buffer unless that is intended.
     *
     * @param form one of {@link #D}, {@link #C}, {@link #KD}, {@link #KC}
     * @param string the source text
     * @param start index of the first code unit to normalize
     * @param end index one past the last code unit to normalize
     * @param target buffer receiving the normalized text
     * @return the index in {@code string} at which normalization actually began
     * @throws IOException if the tables cannot be loaded
     * @throws IllegalArgumentException if {@code form} is not a known form
     */
    public static int normalize(int form, String string, int start, int end,
                                StringBuffer target) throws IOException {
        if (start == end) {
            target.setLength(0);
            return start;
        }
        if (!initialized) {
            init();
        }
        start = findStarter(string, start);
        string = string.substring(start, end);
        switch (form) {
        case D:
            decompose(true, string, target);
            break;
        case C:
            decompose(true, string, target);
            compose(target);
            break;
        case KD:
            decompose(false, string, target);
            break;
        case KC:
            decompose(false, string, target);
            compose(target);
            break;
        default:
            throw new IllegalArgumentException("Unknown normalization form " + form);
        }
        return start;
    }

    /**
     * Composes {@code target} in place: each code unit is combined with the
     * most recent starter when a composition exists for the pair and no code
     * unit of equal or higher class stands between them. The buffer is
     * truncated to the composed length.
     */
    private static void compose(StringBuffer target) {
        if (target.length() == 0) {
            return;
        }
        int starterPos = 0;
        int compPos = 1; // next write position for units that did not compose
        char starterChar = target.charAt(0);
        int lastClass = canonicalClass[starterChar];
        if (lastClass != 0) {
            // Leading non-starter: 256 exceeds every stored class (classes are
            // read as unsigned bytes), so nothing composes onto it.
            lastClass = 256;
        }
        int len = target.length();
        for (int decompPos = 1; decompPos < len; decompPos++) {
            char c = target.charAt(decompPos);
            int cClass = canonicalClass[c];
            char composite = 0;
            if (compositions[starterChar] != null && compositions[starterChar].length > c) {
                composite = compositions[starterChar][c];
            }
            if (composite != 0 && (lastClass < cClass || lastClass == 0)) {
                target.setCharAt(starterPos, composite);
                starterChar = composite;
            } else {
                if (cClass == 0) {
                    starterPos = compPos;
                    starterChar = c;
                }
                lastClass = cClass;
                target.setCharAt(compPos++, c);
            }
        }
        target.setLength(compPos);
    }

    /**
     * Recursively expands {@code source} into {@code target}. When
     * {@code canonical} is true, compatibility decompositions are not applied
     * and the code unit is appended as is.
     */
    private static void decomposeChar(boolean canonical,
                                      char source, StringBuffer target) {
        String decomposition = decompositions[source];
        if (decomposition != null && !(canonical && compatibility[source])) {
            int len = decomposition.length();
            for (int i = 0; i < len; i++) {
                decomposeChar(canonical, decomposition.charAt(i), target);
            }
        } else {
            target.append(source);
        }
    }

    /**
     * Decomposes {@code source} and appends the result to {@code target},
     * inserting each non-starter before any trailing code units of strictly
     * higher canonical class so that combining marks end up ordered by class.
     */
    private static void decompose(boolean canonical,
                                  String source, StringBuffer target) {
        int sourceLen = source.length();
        StringBuffer buff = new StringBuffer();
        for (int i = 0; i < sourceLen; i++) {
            buff.setLength(0);
            decomposeChar(canonical, source.charAt(i), buff);
            int buffLen = buff.length();
            for (int j = 0; j < buffLen; j++) {
                char c = buff.charAt(j);
                int cl = canonicalClass[c];
                int pos = target.length();
                if (cl != 0) {
                    // Walk back over units with a higher class; stop at the
                    // first one whose class is <= cl (starters have class 0).
                    for (; pos > 0; pos--) {
                        if (canonicalClass[target.charAt(pos - 1)] <= cl) {
                            break;
                        }
                    }
                }
                target.insert(pos, c);
            }
        }
    }

    /**
     * Joins the command-line arguments with single spaces and prints the
     * result together with its form-C normalization.
     */
    public static void main(String[] args) throws IOException {
        StringBuffer arg = new StringBuffer();
        for (int i = 0; i < args.length; i++) {
            if (i > 0) {
                arg.append(" ");
            }
            arg.append(args[i]);
        }
        System.out.println("String: " + arg);
        System.out.println("Normal: " + normalize(C, arg.toString()));
    }

    /**
     * Reports whether {@code c} has a non-zero canonical class.
     *
     * @throws IOException if the tables cannot be loaded
     */
    public static boolean isCombining(char c) throws IOException {
        if (!initialized) {
            init();
        }
        return canonicalClass[c] != 0;
    }

    /** Returns true if {@code c} lies in U+D800..U+DFFF. */
    public static boolean isSurrogate(int c) {
        return ((c >= 0xD800) && (c <= 0xDFFF));
    }

    /** Returns true if {@code c} lies in U+DC00..U+DFFF. */
    public static boolean isLowSurrogate(int c) {
        return ((c >= 0xDC00) && (c <= 0xDFFF));
    }

    /** Returns true if {@code c} lies in U+D800..U+DBFF. */
    public static boolean isHighSurrogate(int c) {
        return ((c >= 0xD800) && (c <= 0xDBFF));
    }
}

Generated on Sun Aug 15 11:56:53 2004 for International Input by doxygen 1.3.7