HangulReplacement.java

Go to the documentation of this file.
00001 import java.io.*; 00002 00003 public class HangulReplacement 00004 extends InputReplacement implements ReplacementFactory { 00005 final static int CASE_UPPER = 1; 00006 final static int CASE_LOWER = 2; 00007 final static int CASE_IGNORE = 3; 00008 String prefix = null; 00009 String suffix = null; 00010 String placeHolder = "XXXX"; 00011 String replinfo = null; 00012 int caseHandling = CASE_UPPER; 00013 private String cache = null; 00014 private int pfxLength; 00015 private String continuation1; 00016 private String continuation2; 00017 int choseong = 0; 00018 int jungseong = 0; 00019 int jongseong = 0; 00020 public synchronized String getReplacement(String input) { 00021 parseInput(input); 00022 //String res = "["+Integer.toString(choseong,16)+","+Integer.toString(jungseong,16); 00023 //if (jongseong!=0) res += ","+Integer.toString(jongseong,16); 00024 //return res+"]"; 00025 int c = ((choseong-0x1100)*21+jungseong-0x1161)*28+0xAC00; 00026 if (jongseong!=0) c += jongseong-0x11A7; 00027 return new Character((char)c).toString(); 00028 } 00029 public synchronized String replacesPrefixOf(String input) { 00030 parseInput(input); 00031 if (pfxLength==0) return null; 00032 return input.substring(0,pfxLength); 00033 } 00034 private void parseInput(String input) { 00035 if (caseHandling==CASE_IGNORE) input = input.toUpperCase(); 00036 if (cache==input || input.equals(cache)) return; 00037 if (caseHandling==CASE_LOWER) 00038 throw new UnsupportedOperationException 00039 ("caseHandling==CASE_LOWER not yet implemented"); 00040 cache = input; 00041 System.out.println("Parsing: "+input); 00042 pfxLength = 0; 00043 continuation1 = continuation2 = null; 00044 if (prefix!=null && !input.startsWith(prefix)) { 00045 if (prefix.startsWith(input)) 00046 continuation1 = getInputInfo(); 00047 return; 00048 } 00049 System.out.println("Prefix "+prefix+" present"); 00050 int pos = 0; 00051 int len = input.length(); 00052 if (prefix!=null) pos = prefix.length(); 00053 final int STATE_CHOSEONG = 1; 00054 final int STATE_CHOSEONG_JUNGSEONG = 2; 00055 final int STATE_JUNGSEONG = 3; 00056 final int STATE_JUNGSEONG_JONGSEONG = 4; 00057 final int STATE_JONGSEONG = 5; 00058 final int STATE_END = 6; 00059 final int STATE_FAILED = 7; 00060 final int STATE_NOCONT = 8; 00061 int state = STATE_CHOSEONG; 00062 choseong = 0; 00063 jongseong = 0; 00064 jungseong = 0; 00065 while (pos < len) { 00066 char c = input.charAt(pos); 00067 00068 boolean consumed = false; 00069 if (state==STATE_CHOSEONG) { 00070 consumed = true; 00071 switch (c) { 00072 // 1100; G # HANGUL CHOSEONG KIYEOK 00073 case 'G': choseong = 0x1100; break; 00074 // 1102; N # HANGUL CHOSEONG NIEUN 00075 case 'N': choseong = 0x1102; break; 00076 // 1103; D # HANGUL CHOSEONG TIKEUT 00077 case 'D': choseong = 0x1103; break; 00078 // 1105; R # HANGUL CHOSEONG RIEUL 00079 case 'R': choseong = 0x1105; break; 00080 // 1106; M # HANGUL CHOSEONG MIEUM 00081 case 'M': choseong = 0x1106; break; 00082 // 1107; B # HANGUL CHOSEONG PIEUP 00083 case 'B': choseong = 0x1107; break; 00084 // 1109; S # HANGUL CHOSEONG SIOS 00085 case 'S': choseong = 0x1109; break; 00086 // 110C; J # HANGUL CHOSEONG CIEUC 00087 case 'J': choseong = 0x110C; break; 00088 // 110E; C # HANGUL CHOSEONG CHIEUCH 00089 case 'C': choseong = 0x110E; break; 00090 // 110F; K # HANGUL CHOSEONG KHIEUKH 00091 case 'K': choseong = 0x110F; break; 00092 // 1110; T # HANGUL CHOSEONG THIEUTH 00093 case 'T': choseong = 0x1110; break; 00094 // 1111; P # HANGUL CHOSEONG PHIEUPH 00095 case 'P': choseong = 0x1111; break; 00096 // 1112; H # HANGUL CHOSEONG HIEUH 00097 case 'H': choseong = 0x1112; break; 00098 // 110B; # HANGUL CHOSEONG IEUNG 00099 default: choseong = 0x110B; consumed = false; 00100 } 00101 state = STATE_CHOSEONG_JUNGSEONG; 00102 } 00103 00104 if (!consumed && (state==STATE_JUNGSEONG || 00105 state==STATE_CHOSEONG_JUNGSEONG)) { 00106 int newstate = STATE_JUNGSEONG_JONGSEONG; 00107 consumed = true; 00108 00109 switch (c) { 00110 // 1101; GG # HANGUL CHOSEONG SSANGKIYEOK 00111 case 'G': 00112 if (state==STATE_CHOSEONG_JUNGSEONG && choseong==0x1100) { 00113 choseong = 0x1101; newstate=STATE_JUNGSEONG; 00114 } else { 00115 newstate=STATE_FAILED; 00116 } 00117 break; 00118 // 1104; DD # HANGUL CHOSEONG SSANGTIKEUT 00119 case 'D': 00120 if (state==STATE_CHOSEONG_JUNGSEONG && choseong==0x1103) { 00121 choseong = 0x1104; newstate=STATE_JUNGSEONG; 00122 } else { 00123 newstate=STATE_FAILED; 00124 } 00125 break; 00126 // 1108; BB # HANGUL CHOSEONG SSANGPIEUP 00127 case 'B': 00128 if (state==STATE_CHOSEONG_JUNGSEONG && choseong==0x1107) { 00129 choseong = 0x1108; newstate=STATE_JUNGSEONG; 00130 } else { 00131 newstate=STATE_FAILED; 00132 } 00133 break; 00134 // 110A; SS # HANGUL CHOSEONG SSANGSIOS 00135 case 'S': 00136 if (state==STATE_CHOSEONG_JUNGSEONG && choseong==0x1109) { 00137 choseong = 0x110A; newstate=STATE_JUNGSEONG; 00138 } else { 00139 newstate=STATE_FAILED; 00140 } 00141 break; 00142 // 110D; JJ # HANGUL CHOSEONG SSANGCIEUC 00143 case 'J': 00144 if (state==STATE_CHOSEONG_JUNGSEONG && choseong==0x110C) { 00145 choseong = 0x110D; newstate=STATE_JUNGSEONG; 00146 } else { 00147 newstate=STATE_FAILED; 00148 } 00149 break; 00150 00151 // 1161; A # HANGUL JUNGSEONG A 00152 case 'A': jungseong = 0x1161; break; 00153 // 1166; E # HANGUL JUNGSEONG E 00154 case 'E': jungseong = 0x1166; break; 00155 // 1169; O # HANGUL JUNGSEONG O 00156 case 'O': jungseong = 0x1169; break; 00157 // 116E; U # HANGUL JUNGSEONG U 00158 case 'U': jungseong = 0x116E; break; 00159 // 1175; I # HANGUL JUNGSEONG I 00160 case 'I': jungseong = 0x1175; break; 00161 case 'W': jungseong = 'W'; break; 00162 case 'Y': jungseong = 'Y'; break; 00163 default: 00164 newstate = STATE_FAILED; 00165 } 00166 state = newstate; 00167 } 00168 00169 if (!consumed && (state==STATE_JONGSEONG || 00170 state==STATE_JUNGSEONG_JONGSEONG)) { 00171 int newstate = STATE_JONGSEONG; 00172 00173 switch (c) { 00174 case 'A': 00175 // 1163; YA # HANGUL JUNGSEONG YA 00176 if (state==STATE_JUNGSEONG_JONGSEONG && jungseong == 'Y') { 00177 jungseong = 0x1163; newstate = STATE_JUNGSEONG_JONGSEONG; 00178 // 116A; WA # HANGUL JUNGSEONG WA 00179 } else if 00180 (state==STATE_JUNGSEONG_JONGSEONG && jungseong == 'W') { 00181 jungseong = 0x116A; newstate = STATE_JUNGSEONG_JONGSEONG; 00182 } else { 00183 newstate=STATE_END; 00184 } 00185 break; 00186 00187 case 'E': 00188 // 1168; YE # HANGUL JUNGSEONG YE 00189 if (state==STATE_JUNGSEONG_JONGSEONG && jungseong == 'Y') { 00190 jungseong = 0x1168; newstate = STATE_JUNGSEONG_JONGSEONG; 00191 } else if 00192 // 1170; WE # HANGUL JUNGSEONG WE 00193 (state==STATE_JUNGSEONG_JONGSEONG && jungseong == 'W') { 00194 jungseong = 0x1170; newstate = STATE_JUNGSEONG_JONGSEONG; 00195 } else if 00196 // 116C; OE # HANGUL JUNGSEONG OE 00197 (state==STATE_JUNGSEONG_JONGSEONG && jungseong==0x1169) { 00198 jungseong = 0x116C; 00199 } else if 00200 // 1164; YAE # HANGUL JUNGSEONG YAE 00201 (state==STATE_JUNGSEONG_JONGSEONG && jungseong==0x1163) { 00202 jungseong = 0x1164; 00203 } else if 00204 // 116B; WAE # HANGUL JUNGSEONG WAE 00205 (state==STATE_JUNGSEONG_JONGSEONG && jungseong==0x116A) { 00206 jungseong = 0x116B; 00207 } else if 00208 // 1162; AE # HANGUL JUNGSEONG AE 00209 (state==STATE_JUNGSEONG_JONGSEONG && jungseong==0x1161) { 00210 jungseong = 0x1162; 00211 } else { 00212 newstate = STATE_END; 00213 }; 00214 break; 00215 00216 case 'O': 00217 // 1165; EO # HANGUL JUNGSEONG EO 00218 if (state==STATE_JUNGSEONG_JONGSEONG && jungseong==0x1166) { 00219 jungseong = 0x1165; 00220 } else if 00221 // 1167; YEO # HANGUL JUNGSEONG YEO 00222 (state==STATE_JUNGSEONG_JONGSEONG && jungseong==0x1168) { 00223 jungseong = 0x1167; 00224 } else if 00225 // 116D; YO # HANGUL JUNGSEONG YO 00226 (state==STATE_JUNGSEONG_JONGSEONG && jungseong=='Y') { 00227 jungseong = 0x116D; 00228 } else if 00229 (state==STATE_JUNGSEONG_JONGSEONG && jungseong==0x1170) { 00230 // 116F; WEO # HANGUL JUNGSEONG WEO 00231 jungseong = 0x116F; 00232 } else { 00233 newstate = STATE_END; 00234 } 00235 break; 00236 00237 case 'I': 00238 // 1171; WI # HANGUL JUNGSEONG WI 00239 if (state==STATE_JUNGSEONG_JONGSEONG && jungseong=='W') { 00240 jungseong = 0x1171; 00241 } else if 00242 // 1174; YI # HANGUL JUNGSEONG YI 00243 (state==STATE_JUNGSEONG_JONGSEONG && jungseong=='Y') { 00244 jungseong = 0x1174; 00245 } else { 00246 newstate = STATE_END; 00247 } 00248 break; 00249 00250 case 'U': 00251 // 1172; YU # HANGUL JUNGSEONG YU 00252 if (state==STATE_JUNGSEONG_JONGSEONG && jungseong=='Y') { 00253 jungseong = 0x1172; 00254 } else if 00255 // 1173; EU # HANGUL JUNGSEONG EU 00256 (state==STATE_JUNGSEONG_JONGSEONG && jungseong==0x1166) { 00257 jungseong = 0x1173; 00258 } else { 00259 newstate = STATE_END; 00260 } 00261 break; 00262 00263 case 'G': 00264 switch (jongseong) { 00265 // 11A8; G # HANGUL JONGSEONG KIYEOK 00266 case 0: jongseong = 0x11A8; break; 00267 // 11A9; GG # HANGUL JONGSEONG SSANGKIYEOK 00268 case 0x11A8: jongseong = 0x11A9; newstate = STATE_NOCONT; break; 00269 // 11BC; NG # HANGUL JONGSEONG IEUNG 00270 case 0x11AB: jongseong = 0x11BC; newstate = STATE_NOCONT; break; 00271 // 11B0; LG # HANGUL JONGSEONG RIEUL-KIYEOK 00272 case 0x11AF: jongseong = 0x11B0; newstate = STATE_NOCONT; break; 00273 default: newstate = STATE_END; } 00274 break; 00275 // 11AB; N # HANGUL JONGSEONG NIEUN 00276 case 'N': 00277 if (jongseong==0) jongseong = 0x11AB; 00278 else newstate = STATE_END; 00279 break; 00280 00281 // 11AE; D # HANGUL JONGSEONG TIKEUT 00282 case 'D': 00283 if (jongseong==0) { jongseong = 0x11AE; newstate = STATE_NOCONT; } 00284 else newstate = STATE_END; 00285 break; 00286 00287 // 11AF; L # HANGUL JONGSEONG RIEUL 00288 case 'L': 00289 if (jongseong==0) jongseong = 0x11AF; 00290 else newstate = STATE_END; 00291 break; 00292 00293 case 'M': 00294 switch (jongseong) { 00295 // 11B7; M # HANGUL JONGSEONG MIEUM 00296 case 0: jongseong = 0x11B7; newstate = STATE_NOCONT; break; 00297 // 11B1; LM # HANGUL JONGSEONG RIEUL-MIEUM 00298 case 0x11AF: jongseong = 0x11B1; newstate = STATE_NOCONT; break; 00299 default: newstate = STATE_END; 00300 } 00301 break; 00302 00303 case 'B': 00304 switch (jongseong) { 00305 // 11B8; B # HANGUL JONGSEONG PIEUP 00306 case 0: jongseong = 0x11B8; break; 00307 // 11B2; LB # HANGUL JONGSEONG RIEUL-PIEUP 00308 case 0x11AF: jongseong = 0x11B2; newstate = STATE_NOCONT; break; 00309 default: newstate = STATE_END; 00310 } 00311 break; 00312 00313 case 'S': 00314 switch (jongseong) { 00315 // 11BA; S # HANGUL JONGSEONG SIOS 00316 case 0: jongseong = 0x11BA; break; 00317 // 11AA; GS # HANGUL JONGSEONG KIYEOK-SIOS 00318 case 0x11A8: jongseong = 0x11AA; newstate = STATE_NOCONT; break; 00319 // 11B3; LS # HANGUL JONGSEONG RIEUL-SIOS 00320 case 0x11AF: jongseong = 0x11B3; newstate = STATE_NOCONT; break; 00321 // 11BB; SS # HANGUL JONGSEONG SSANGSIOS 00322 case 0x11BA: jongseong = 0x11BB; newstate = STATE_NOCONT; break; 00323 // 11B9; BS # HANGUL JONGSEONG PIEUP-SIOS 00324 case 0x11B8: jongseong = 0x11B9; newstate = STATE_NOCONT; break; 00325 default: newstate = STATE_END; 00326 } 00327 break; 00328 00329 case 'J': 00330 switch (jongseong) { 00331 // 11BD; J # HANGUL JONGSEONG CIEUC 00332 case 0: jongseong = 0x11BD; newstate = STATE_NOCONT; break; 00333 // 11AC; NJ # HANGUL JONGSEONG NIEUN-CIEUC 00334 case 0x11AB: jongseong = 0x11AC; newstate = STATE_NOCONT; break; 00335 default: newstate = STATE_END; 00336 } 00337 break; 00338 00339 // 11BE; C # HANGUL JONGSEONG CHIEUCH 00340 case 'C': 00341 if (jongseong==0) { jongseong = 0x11BE; newstate = STATE_NOCONT; } 00342 else newstate = STATE_END; 00343 break; 00344 00345 // 11BF; K # HANGUL JONGSEONG KHIEUKH 00346 case 'K': 00347 if (jongseong==0) { jongseong = 0x11BF; newstate = STATE_NOCONT; } 00348 else newstate = STATE_END; 00349 break; 00350 00351 case 'T': 00352 switch (jongseong) { 00353 // 11C0; T # HANGUL JONGSEONG THIEUTH 00354 case 0: jongseong = 0x11C0; newstate = STATE_NOCONT; break; 00355 // 11B4; LT # HANGUL JONGSEONG RIEUL-THIEUTH 00356 case 0x11AF: jongseong = 0x11B4; newstate = STATE_NOCONT; break; 00357 default: newstate = STATE_END; 00358 } 00359 break; 00360 00361 case 'P': 00362 switch (jongseong) { 00363 // 11C1; P # HANGUL JONGSEONG PHIEUPH 00364 case 0: jongseong = 0x11C1; newstate = STATE_NOCONT; break; 00365 // 11B5; LP # HANGUL JONGSEONG RIEUL-PHIEUPH 00366 case 0x11AF: jongseong = 0x11B5; newstate = STATE_NOCONT; break; 00367 default: newstate = STATE_END; 00368 } 00369 break; 00370 00371 case 'H': 00372 switch (jongseong) { 00373 // 11C2; H # HANGUL JONGSEONG HIEUH 00374 case 0: jongseong = 0x11C2; newstate = STATE_NOCONT; break; 00375 // 11AD; NH # HANGUL JONGSEONG NIEUN-HIEUH 00376 case 0x11AB: jongseong = 0x11AD; newstate = STATE_NOCONT; break; 00377 // 11B6; LH # HANGUL JONGSEONG RIEUL-HIEUH 00378 case 0x11AF: jongseong = 0x11B6; newstate = STATE_NOCONT; break; 00379 } 00380 break; 00381 00382 default: 00383 newstate = STATE_END; 00384 } 00385 state = newstate; 00386 } 00387 00388 if (jungseong<0x1000) 00389 switch (state) { 00390 case STATE_JONGSEONG: 00391 case STATE_END: 00392 case STATE_NOCONT: 00393 state = STATE_FAILED; 00394 } 00395 00396 // STATE_CHOSEONG: incomplete 00397 // STATE_CHOSEONG_JUNGSEONG: incomplete 00398 // STATE_JUNGSEONG: incomplete 00399 // STATE_JUNGSEONG_JONGSEONG: complete, continuable (any jongseong) 00400 // STATE_JONGSEONG: complete, continuable 00401 // STATE_NOCONT: complete (w/ current char), not continuable 00402 // STATE_END: complete (w/o current char), not continuable 00403 // STATE_FAILED: incomplete, non continuable 00404 00405 if (state==STATE_FAILED) return; 00406 00407 if (state==STATE_END) { 00408 System.out.println("End: "+pos); 00409 state = STATE_NOCONT; 00410 pos--; 00411 } 00412 00413 // If the suffix follows: 00414 // Prefix match if complete 00415 // No match possible otherwise 00416 if (suffix!=null && input.startsWith(suffix,pos+1)) { 00417 switch (state) { 00418 case STATE_NOCONT: 00419 case STATE_JONGSEONG: 00420 case STATE_JUNGSEONG_JONGSEONG: 00421 pfxLength = pos+suffix.length()+1; 00422 } 00423 return; } 00424 00425 pos++; 00426 00427 System.out.print("Char("+pos+") "+c+": State="); 00428 switch (state) { 00429 case STATE_JONGSEONG: System.out.print("JONGSEONG"); break; 00430 case STATE_CHOSEONG: System.out.print("CHOSEONG"); break; 00431 case STATE_CHOSEONG_JUNGSEONG: System.out.print("CHOSEONG_JUNGSEONG"); break; 00432 case STATE_JUNGSEONG_JONGSEONG: System.out.print("JUNGSEONG_JONGSEONG"); break; 00433 case STATE_END: System.out.print("END"); break; 00434 case STATE_NOCONT: System.out.print("NOCONT"); break; 00435 case STATE_FAILED: System.out.print("FAILED"); break; 00436 default: System.out.print(state); 00437 } 00438 System.out.println(" , choseong="+Integer.toString(choseong,16)+ 00439 " , jungseong="+Integer.toString(jungseong,16)+ 00440 " , jongseong="+Integer.toString(jongseong,16)); 00441 00442 if (state==STATE_NOCONT) break; 00443 } 00444 00445 if (suffix!=null) { 00446 switch (state) { 00447 case STATE_CHOSEONG: 00448 case STATE_CHOSEONG_JUNGSEONG: 00449 case STATE_JUNGSEONG: 00450 continuation1 = input+placeHolder+suffix; 00451 break; 00452 case STATE_JONGSEONG: 00453 case STATE_JUNGSEONG_JONGSEONG: 00454 continuation1 = input+placeHolder+suffix; 00455 continuation2 = input+suffix; 00456 break; 00457 case STATE_NOCONT: 00458 if (suffix.length()!=input.length()-pos && 00459 suffix.startsWith(input.substring(pos))) 00460 continuation1 = input.substring(0,pos)+suffix; 00461 break; 00462 } 00463 } else { 00464 switch (state) { 00465 case STATE_CHOSEONG: 00466 case STATE_CHOSEONG_JUNGSEONG: 00467 case STATE_JUNGSEONG: 00468 continuation1 = input+placeHolder; 00469 break; 00470 case STATE_JONGSEONG: 00471 case STATE_JUNGSEONG_JONGSEONG: 00472 continuation1 = input+placeHolder; 00473 if (jungseong>=0x1000) 00474 pfxLength = pos; 00475 break; 00476 case STATE_NOCONT: 00477 pfxLength = pos; 00478 break; 00479 } 00480 } 00481 } 00482 public synchronized String[] isContinuationOf(String input) { 00483 parseInput(input); 00484 if (continuation1==null) return null; 00485 if (continuation2==null) 00486 return new String[] { continuation1 }; 00487 return new String[] { continuation1, continuation2 }; 00488 } 00489 public String getInputInfo() { 00490 return (prefix!=null?prefix:"")+placeHolder+ 00491 (suffix!=null?suffix:""); } 00492 public String getReplacementInfo() { 00493 return replinfo; }; 00494 00495 public HangulReplacement(BufferedReader input) 00496 throws IOException, FileFormatException { 00497 while (true) { 00498 String line=input.readLine(); 00499 if (line==null) { 00500 throw new FileFormatException 00501 ("EOF in CodepositionReplacement"); 00502 } else if (line.equals("###")) { 00503 break; 00504 } else if (line.equals("")) { 00505 } else if (line.equals(":PREFIX")) { 00506 prefix = input.readLine(); 00507 if (prefix==null) 00508 throw new FileFormatException 00509 ("in CodepositionReplacement: :PREFIX not followed by a line"); 00510 } else if (line.equals(":SUFFIX")) { 00511 suffix = input.readLine(); 00512 if (suffix==null) 00513 throw new FileFormatException 00514 ("in CodepositionReplacement: :SUFFIX not followed by a line"); 00515 } else if (line.equals(":PLACEHOLDER")) { 00516 placeHolder = input.readLine(); 00517 if (placeHolder==null) 00518 throw new FileFormatException 00519 ("in CodepositionReplacement: :PLACEHOLDER not followed by a line"); 00520 } else if (line.equals(":INFO")) { 00521 replinfo = input.readLine(); 00522 if (replinfo==null) 00523 throw new FileFormatException 00524 ("in CodepositionReplacement: :INFO not followed by a line"); 00525 00526 } else if (line.equals(":IGNORECASE")) { 00527 caseHandling = CASE_IGNORE; 00528 } else { 00529 throw new FileFormatException 00530 ("in CodepositionReplacement: Unknown command "+line); 00531 } 00532 } 00533 00534 if (replinfo==null) 00535 replinfo = "Hangul syllable "+placeHolder; 00536 00537 if (caseHandling == CASE_IGNORE) { 00538 if (prefix!=null) prefix = prefix.toUpperCase(); 00539 if (suffix!=null) suffix = suffix.toUpperCase(); 00540 } 00541 } 00542 }

Generated on Sun Aug 15 11:56:53 2004 for International Input by doxygen 1.3.7