import java.io.*;

00003 public class HangulReplacement
00004
extends InputReplacement implements
ReplacementFactory {
00005 final static int CASE_UPPER = 1;
00006 final static int CASE_LOWER = 2;
00007 final static int CASE_IGNORE = 3;
00008 String
prefix = null;
00009 String
suffix = null;
00010 String
placeHolder =
"XXXX";
00011 String
replinfo = null;
00012 int caseHandling =
CASE_UPPER;
00013 private String
cache = null;
00014 private int pfxLength;
00015 private String
continuation1;
00016 private String
continuation2;
00017 int choseong = 0;
00018 int jungseong = 0;
00019 int jongseong = 0;
00020 public synchronized String
getReplacement(String input) {
00021
parseInput(input);
00022
00023
00024
00025
int c = ((
choseong-0x1100)*21+
jungseong-0x1161)*28+0xAC00;
00026
if (
jongseong!=0) c +=
jongseong-0x11A7;
00027
return new Character((
char)c).toString();
00028 }
00029 public synchronized String
replacesPrefixOf(String input) {
00030
parseInput(input);
00031
if (
pfxLength==0)
return null;
00032
return input.substring(0,
pfxLength);
00033 }
00034 private void parseInput(String input) {
00035
if (
caseHandling==
CASE_IGNORE) input = input.toUpperCase();
00036
if (
cache==input || input.equals(
cache))
return;
00037
if (
caseHandling==
CASE_LOWER)
00038
throw new UnsupportedOperationException
00039 (
"caseHandling==CASE_LOWER not yet implemented");
00040
cache = input;
00041 System.out.println(
"Parsing: "+input);
00042
pfxLength = 0;
00043
continuation1 =
continuation2 = null;
00044
if (
prefix!=null && !input.startsWith(
prefix)) {
00045
if (
prefix.startsWith(input))
00046
continuation1 =
getInputInfo();
00047
return;
00048 }
00049 System.out.println(
"Prefix "+
prefix+
" present");
00050
int pos = 0;
00051
int len = input.length();
00052
if (
prefix!=null) pos =
prefix.length();
00053
final int STATE_CHOSEONG = 1;
00054
final int STATE_CHOSEONG_JUNGSEONG = 2;
00055
final int STATE_JUNGSEONG = 3;
00056
final int STATE_JUNGSEONG_JONGSEONG = 4;
00057
final int STATE_JONGSEONG = 5;
00058
final int STATE_END = 6;
00059
final int STATE_FAILED = 7;
00060
final int STATE_NOCONT = 8;
00061
int state = STATE_CHOSEONG;
00062
choseong = 0;
00063
jongseong = 0;
00064
jungseong = 0;
00065
while (pos < len) {
00066
char c = input.charAt(pos);
00067
00068
boolean consumed =
false;
00069
if (state==STATE_CHOSEONG) {
00070 consumed =
true;
00071
switch (c) {
00072
00073
case 'G':
choseong = 0x1100;
break;
00074
00075
case 'N':
choseong = 0x1102;
break;
00076
00077
case 'D':
choseong = 0x1103;
break;
00078
00079
case 'R':
choseong = 0x1105;
break;
00080
00081
case 'M':
choseong = 0x1106;
break;
00082
00083
case 'B':
choseong = 0x1107;
break;
00084
00085
case 'S':
choseong = 0x1109;
break;
00086
00087
case 'J':
choseong = 0x110C;
break;
00088
00089
case 'C':
choseong = 0x110E;
break;
00090
00091
case 'K':
choseong = 0x110F;
break;
00092
00093
case 'T':
choseong = 0x1110;
break;
00094
00095
case 'P':
choseong = 0x1111;
break;
00096
00097
case 'H':
choseong = 0x1112;
break;
00098
00099
default:
choseong = 0x110B; consumed =
false;
00100 }
00101 state = STATE_CHOSEONG_JUNGSEONG;
00102 }
00103
00104
if (!consumed && (state==STATE_JUNGSEONG ||
00105 state==STATE_CHOSEONG_JUNGSEONG)) {
00106
int newstate = STATE_JUNGSEONG_JONGSEONG;
00107 consumed =
true;
00108
00109
switch (c) {
00110
00111
case 'G':
00112
if (state==STATE_CHOSEONG_JUNGSEONG &&
choseong==0x1100) {
00113 choseong = 0x1101; newstate=STATE_JUNGSEONG;
00114 }
else {
00115 newstate=STATE_FAILED;
00116 }
00117
break;
00118
00119
case 'D':
00120
if (state==STATE_CHOSEONG_JUNGSEONG && choseong==0x1103) {
00121 choseong = 0x1104; newstate=STATE_JUNGSEONG;
00122 }
else {
00123 newstate=STATE_FAILED;
00124 }
00125
break;
00126
00127
case 'B':
00128
if (state==STATE_CHOSEONG_JUNGSEONG && choseong==0x1107) {
00129 choseong = 0x1108; newstate=STATE_JUNGSEONG;
00130 }
else {
00131 newstate=STATE_FAILED;
00132 }
00133
break;
00134
00135
case 'S':
00136
if (state==STATE_CHOSEONG_JUNGSEONG && choseong==0x1109) {
00137 choseong = 0x110A; newstate=STATE_JUNGSEONG;
00138 }
else {
00139 newstate=STATE_FAILED;
00140 }
00141
break;
00142
00143
case 'J':
00144
if (state==STATE_CHOSEONG_JUNGSEONG && choseong==0x110C) {
00145 choseong = 0x110D; newstate=STATE_JUNGSEONG;
00146 }
else {
00147 newstate=STATE_FAILED;
00148 }
00149
break;
00150
00151
00152
case 'A':
jungseong = 0x1161;
break;
00153
00154
case 'E':
jungseong = 0x1166;
break;
00155
00156
case 'O':
jungseong = 0x1169;
break;
00157
00158
case 'U':
jungseong = 0x116E;
break;
00159
00160
case 'I':
jungseong = 0x1175;
break;
00161
case 'W':
jungseong =
'W';
break;
00162
case 'Y':
jungseong =
'Y';
break;
00163
default:
00164 newstate = STATE_FAILED;
00165 }
00166 state = newstate;
00167 }
00168
00169
if (!consumed && (state==STATE_JONGSEONG ||
00170 state==STATE_JUNGSEONG_JONGSEONG)) {
00171
int newstate = STATE_JONGSEONG;
00172
00173
switch (c) {
00174
case 'A':
00175
00176
if (state==STATE_JUNGSEONG_JONGSEONG &&
jungseong ==
'Y') {
00177 jungseong = 0x1163; newstate = STATE_JUNGSEONG_JONGSEONG;
00178
00179 }
else if
00180 (state==STATE_JUNGSEONG_JONGSEONG && jungseong ==
'W') {
00181 jungseong = 0x116A; newstate = STATE_JUNGSEONG_JONGSEONG;
00182 }
else {
00183 newstate=STATE_END;
00184 }
00185
break;
00186
00187
case 'E':
00188
00189
if (state==STATE_JUNGSEONG_JONGSEONG && jungseong ==
'Y') {
00190 jungseong = 0x1168; newstate = STATE_JUNGSEONG_JONGSEONG;
00191 }
else if
00192
00193 (state==STATE_JUNGSEONG_JONGSEONG && jungseong ==
'W') {
00194 jungseong = 0x1170; newstate = STATE_JUNGSEONG_JONGSEONG;
00195 }
else if
00196
00197 (state==STATE_JUNGSEONG_JONGSEONG && jungseong==0x1169) {
00198 jungseong = 0x116C;
00199 }
else if
00200
00201 (state==STATE_JUNGSEONG_JONGSEONG && jungseong==0x1163) {
00202 jungseong = 0x1164;
00203 }
else if
00204
00205 (state==STATE_JUNGSEONG_JONGSEONG && jungseong==0x116A) {
00206 jungseong = 0x116B;
00207 }
else if
00208
00209 (state==STATE_JUNGSEONG_JONGSEONG && jungseong==0x1161) {
00210 jungseong = 0x1162;
00211 }
else {
00212 newstate = STATE_END;
00213 };
00214
break;
00215
00216
case 'O':
00217
00218
if (state==STATE_JUNGSEONG_JONGSEONG && jungseong==0x1166) {
00219 jungseong = 0x1165;
00220 }
else if
00221
00222 (state==STATE_JUNGSEONG_JONGSEONG && jungseong==0x1168) {
00223 jungseong = 0x1167;
00224 }
else if
00225
00226 (state==STATE_JUNGSEONG_JONGSEONG && jungseong==
'Y') {
00227 jungseong = 0x116D;
00228 }
else if
00229 (state==STATE_JUNGSEONG_JONGSEONG && jungseong==0x1170) {
00230
00231 jungseong = 0x116F;
00232 }
else {
00233 newstate = STATE_END;
00234 }
00235
break;
00236
00237
case 'I':
00238
00239
if (state==STATE_JUNGSEONG_JONGSEONG && jungseong==
'W') {
00240 jungseong = 0x1171;
00241 }
else if
00242
00243 (state==STATE_JUNGSEONG_JONGSEONG && jungseong==
'Y') {
00244 jungseong = 0x1174;
00245 }
else {
00246 newstate = STATE_END;
00247 }
00248
break;
00249
00250
case 'U':
00251
00252
if (state==STATE_JUNGSEONG_JONGSEONG && jungseong==
'Y') {
00253 jungseong = 0x1172;
00254 }
else if
00255
00256 (state==STATE_JUNGSEONG_JONGSEONG && jungseong==0x1166) {
00257 jungseong = 0x1173;
00258 }
else {
00259 newstate = STATE_END;
00260 }
00261
break;
00262
00263
case 'G':
00264
switch (
jongseong) {
00265
00266
case 0:
jongseong = 0x11A8;
break;
00267
00268
case 0x11A8:
jongseong = 0x11A9; newstate = STATE_NOCONT;
break;
00269
00270
case 0x11AB:
jongseong = 0x11BC; newstate = STATE_NOCONT;
break;
00271
00272
case 0x11AF:
jongseong = 0x11B0; newstate = STATE_NOCONT;
break;
00273
default: newstate = STATE_END; }
00274
break;
00275
00276
case 'N':
00277
if (
jongseong==0)
jongseong = 0x11AB;
00278
else newstate = STATE_END;
00279
break;
00280
00281
00282
case 'D':
00283
if (
jongseong==0) {
jongseong = 0x11AE; newstate = STATE_NOCONT; }
00284
else newstate = STATE_END;
00285
break;
00286
00287
00288
case 'L':
00289
if (
jongseong==0)
jongseong = 0x11AF;
00290
else newstate = STATE_END;
00291
break;
00292
00293
case 'M':
00294
switch (
jongseong) {
00295
00296
case 0:
jongseong = 0x11B7; newstate = STATE_NOCONT;
break;
00297
00298
case 0x11AF:
jongseong = 0x11B1; newstate = STATE_NOCONT;
break;
00299
default: newstate = STATE_END;
00300 }
00301
break;
00302
00303
case 'B':
00304
switch (
jongseong) {
00305
00306
case 0:
jongseong = 0x11B8;
break;
00307
00308
case 0x11AF:
jongseong = 0x11B2; newstate = STATE_NOCONT;
break;
00309
default: newstate = STATE_END;
00310 }
00311
break;
00312
00313
case 'S':
00314
switch (
jongseong) {
00315
00316
case 0:
jongseong = 0x11BA;
break;
00317
00318
case 0x11A8:
jongseong = 0x11AA; newstate = STATE_NOCONT;
break;
00319
00320
case 0x11AF:
jongseong = 0x11B3; newstate = STATE_NOCONT;
break;
00321
00322
case 0x11BA:
jongseong = 0x11BB; newstate = STATE_NOCONT;
break;
00323
00324
case 0x11B8:
jongseong = 0x11B9; newstate = STATE_NOCONT;
break;
00325
default: newstate = STATE_END;
00326 }
00327
break;
00328
00329
case 'J':
00330
switch (
jongseong) {
00331
00332
case 0:
jongseong = 0x11BD; newstate = STATE_NOCONT;
break;
00333
00334
case 0x11AB:
jongseong = 0x11AC; newstate = STATE_NOCONT;
break;
00335
default: newstate = STATE_END;
00336 }
00337
break;
00338
00339
00340
case 'C':
00341
if (
jongseong==0) {
jongseong = 0x11BE; newstate = STATE_NOCONT; }
00342
else newstate = STATE_END;
00343
break;
00344
00345
00346
case 'K':
00347
if (
jongseong==0) {
jongseong = 0x11BF; newstate = STATE_NOCONT; }
00348
else newstate = STATE_END;
00349
break;
00350
00351
case 'T':
00352
switch (
jongseong) {
00353
00354
case 0:
jongseong = 0x11C0; newstate = STATE_NOCONT;
break;
00355
00356
case 0x11AF:
jongseong = 0x11B4; newstate = STATE_NOCONT;
break;
00357
default: newstate = STATE_END;
00358 }
00359
break;
00360
00361
case 'P':
00362
switch (
jongseong) {
00363
00364
case 0:
jongseong = 0x11C1; newstate = STATE_NOCONT;
break;
00365
00366
case 0x11AF:
jongseong = 0x11B5; newstate = STATE_NOCONT;
break;
00367
default: newstate = STATE_END;
00368 }
00369
break;
00370
00371
case 'H':
00372
switch (
jongseong) {
00373
00374
case 0:
jongseong = 0x11C2; newstate = STATE_NOCONT;
break;
00375
00376
case 0x11AB:
jongseong = 0x11AD; newstate = STATE_NOCONT;
break;
00377
00378
case 0x11AF:
jongseong = 0x11B6; newstate = STATE_NOCONT;
break;
00379 }
00380
break;
00381
00382
default:
00383 newstate = STATE_END;
00384 }
00385 state = newstate;
00386 }
00387
00388
if (
jungseong<0x1000)
00389
switch (state) {
00390
case STATE_JONGSEONG:
00391
case STATE_END:
00392
case STATE_NOCONT:
00393 state = STATE_FAILED;
00394 }
00395
00396
00397
00398
00399
00400
00401
00402
00403
00404
00405
if (state==STATE_FAILED)
return;
00406
00407
if (state==STATE_END) {
00408 System.out.println(
"End: "+pos);
00409 state = STATE_NOCONT;
00410 pos--;
00411 }
00412
00413
00414
00415
00416
if (
suffix!=null && input.startsWith(
suffix,pos+1)) {
00417
switch (state) {
00418
case STATE_NOCONT:
00419
case STATE_JONGSEONG:
00420
case STATE_JUNGSEONG_JONGSEONG:
00421
pfxLength = pos+
suffix.length()+1;
00422 }
00423
return; }
00424
00425 pos++;
00426
00427 System.out.print(
"Char("+pos+
") "+c+
": State=");
00428
switch (state) {
00429
case STATE_JONGSEONG: System.out.print(
"JONGSEONG");
break;
00430
case STATE_CHOSEONG: System.out.print(
"CHOSEONG");
break;
00431
case STATE_CHOSEONG_JUNGSEONG: System.out.print(
"CHOSEONG_JUNGSEONG");
break;
00432
case STATE_JUNGSEONG_JONGSEONG: System.out.print(
"JUNGSEONG_JONGSEONG");
break;
00433
case STATE_END: System.out.print(
"END");
break;
00434
case STATE_NOCONT: System.out.print(
"NOCONT");
break;
00435
case STATE_FAILED: System.out.print(
"FAILED");
break;
00436
default: System.out.print(state);
00437 }
00438 System.out.println(
" , choseong="+Integer.toString(
choseong,16)+
00439
" , jungseong="+Integer.toString(
jungseong,16)+
00440
" , jongseong="+Integer.toString(
jongseong,16));
00441
00442
if (state==STATE_NOCONT)
break;
00443 }
00444
00445
if (
suffix!=null) {
00446
switch (state) {
00447
case STATE_CHOSEONG:
00448
case STATE_CHOSEONG_JUNGSEONG:
00449
case STATE_JUNGSEONG:
00450
continuation1 = input+
placeHolder+
suffix;
00451
break;
00452
case STATE_JONGSEONG:
00453
case STATE_JUNGSEONG_JONGSEONG:
00454
continuation1 = input+
placeHolder+suffix;
00455 continuation2 = input+suffix;
00456
break;
00457
case STATE_NOCONT:
00458
if (suffix.length()!=input.length()-pos &&
00459 suffix.startsWith(input.substring(pos)))
00460
continuation1 = input.substring(0,pos)+suffix;
00461
break;
00462 }
00463 }
else {
00464
switch (state) {
00465
case STATE_CHOSEONG:
00466
case STATE_CHOSEONG_JUNGSEONG:
00467
case STATE_JUNGSEONG:
00468
continuation1 = input+
placeHolder;
00469
break;
00470
case STATE_JONGSEONG:
00471
case STATE_JUNGSEONG_JONGSEONG:
00472
continuation1 = input+placeHolder;
00473
if (
jungseong>=0x1000)
00474
pfxLength = pos;
00475
break;
00476
case STATE_NOCONT:
00477
pfxLength = pos;
00478
break;
00479 }
00480 }
00481 }
00482 public synchronized String[]
isContinuationOf(String input) {
00483 parseInput(input);
00484
if (
continuation1==null)
return null;
00485
if (
continuation2==null)
00486
return new String[] {
continuation1 };
00487
return new String[] {
continuation1,
continuation2 };
00488 }
00489 public String
getInputInfo() {
00490
return (
prefix!=null?
prefix:
"")+
placeHolder+
00491 (
suffix!=null?
suffix:
""); }
00492 public String
getReplacementInfo() {
00493
return replinfo; };
00494
00495 public HangulReplacement(BufferedReader input)
00496
throws IOException,
FileFormatException {
00497
while (
true) {
00498 String line=input.readLine();
00499
if (line==null) {
00500
throw new FileFormatException
00501 (
"EOF in CodepositionReplacement");
00502 }
else if (line.equals(
"###")) {
00503
break;
00504 }
else if (line.equals(
"")) {
00505 }
else if (line.equals(
":PREFIX")) {
00506
prefix = input.readLine();
00507
if (
prefix==null)
00508
throw new FileFormatException
00509 (
"in CodepositionReplacement: :PREFIX not followed by a line");
00510 }
else if (line.equals(
":SUFFIX")) {
00511
suffix = input.readLine();
00512
if (
suffix==null)
00513
throw new FileFormatException
00514 (
"in CodepositionReplacement: :SUFFIX not followed by a line");
00515 }
else if (line.equals(
":PLACEHOLDER")) {
00516
placeHolder = input.readLine();
00517
if (
placeHolder==null)
00518
throw new FileFormatException
00519 (
"in CodepositionReplacement: :PLACEHOLDER not followed by a line");
00520 }
else if (line.equals(
":INFO")) {
00521
replinfo = input.readLine();
00522
if (
replinfo==null)
00523
throw new FileFormatException
00524 (
"in CodepositionReplacement: :INFO not followed by a line");
00525
00526 }
else if (line.equals(
":IGNORECASE")) {
00527
caseHandling =
CASE_IGNORE;
00528 }
else {
00529
throw new FileFormatException
00530 (
"in CodepositionReplacement: Unknown command "+line);
00531 }
00532 }
00533
00534
if (
replinfo==null)
00535
replinfo =
"Hangul syllable "+
placeHolder;
00536
00537
if (
caseHandling ==
CASE_IGNORE) {
00538
if (
prefix!=null)
prefix =
prefix.toUpperCase();
00539
if (
suffix!=null)
suffix =
suffix.toUpperCase();
00540 }
00541 }
00542 }