00001
00002
00003
00004
00005
00006
00007
import java.io.*;
00008
/**
 * Table-driven Unicode normalizer for BMP characters (U+0000..U+FFFF).
 *
 * <p>The character tables are loaded lazily, on first use, from the class-path
 * resource {@code normalizer_data.dat} located next to this class. Input is
 * processed one Java {@code char} at a time; surrogate pairs are not combined
 * into supplementary code points.
 */
public class Normalizer {
    /** Form selector: decompose without applying compatibility mappings. */
    public static final int D = 2;
    /** Form selector: decompose as for {@link #D}, then compose. */
    public static final int C = 3;
    /** Form selector: decompose, also applying compatibility mappings. */
    public static final int KD = 4;
    /** Form selector: decompose as for {@link #KD}, then compose. */
    public static final int KC = 5;

    /** Data-file type tag: the decomposition is used in all forms and may seed a composition pair. */
    public static final int NORMAL = 0;
    /** Data-file type tag: the decomposition is applied only by {@link #KD} and {@link #KC}. */
    public static final int COMPATIBILITY = 1;
    /**
     * Data-file type tag that is neither {@link #NORMAL} nor {@link #COMPATIBILITY}:
     * the decomposition is applied in all forms but never registered as a composition pair.
     * NOTE(review): presumably marks composition exclusions - confirm against the data generator.
     */
    public static final int EXCLUDE = 2;

    /** Highest code point the tables can hold. */
    public static final int MAX_UNICODE = 0xFFFF;

    /** Name of the data resource, resolved relative to this class. */
    private static final String DATA_RESOURCE = "normalizer_data.dat";

    /**
     * Set to true only after all tables below have been assigned. Volatile so that the
     * unsynchronized checks in normalize()/isCombining() that observe true also observe
     * the fully populated tables.
     */
    private static volatile boolean initialized = false;
    /** Decomposition string per char, or null when the char has none. */
    private static String[] decompositions = null;
    /** True where the char's decomposition carries the COMPATIBILITY tag. */
    private static boolean[] compatibility = null;
    /** Canonical combining class per char (0..255); 0 marks a starter. */
    private static int[] canonicalClass = null;
    /** compositions[first][second] is the composite char, or 0 when the pair does not compose. */
    private static char[][] compositions = null;

    /**
     * Loads the normalization tables from {@code normalizer_data.dat}. Does nothing when
     * the tables are already loaded.
     *
     * <p>The file is read as a sequence of records until end of file: an unsigned short
     * (the char), a {@code readUTF} string (its decomposition, empty for none), an unsigned
     * byte (canonical class) and an unsigned byte (type tag).
     *
     * @throws FileNotFoundException if the data resource cannot be found
     * @throws IOException if reading the resource fails
     * @throws IllegalArgumentException if a {@link #NORMAL} decomposition is longer than two chars
     */
    public static synchronized void init() throws IOException {
        if (initialized) {
            return;
        }

        // Build into locals and publish only on success, so a failed load never
        // leaves half-filled tables behind.
        String[] decomp = new String[MAX_UNICODE + 1];
        boolean[] compat = new boolean[MAX_UNICODE + 1];
        int[] classes = new int[MAX_UNICODE + 1];
        char[][] pairs = new char[MAX_UNICODE + 1][];

        InputStream raw = Normalizer.class.getResourceAsStream(DATA_RESOURCE);
        if (raw == null) {
            throw new FileNotFoundException(
                "Normalization data resource not found: " + DATA_RESOURCE);
        }

        DataInputStream data = new DataInputStream(new BufferedInputStream(raw));
        try {
            while (true) {
                // readUnsignedShort() yields 0..0xFFFF, so num is always a valid table index.
                int num = data.readUnsignedShort();
                String decomposition = data.readUTF();
                if (decomposition.length() == 0) {
                    decomposition = null;
                }
                decomp[num] = decomposition;
                classes[num] = data.readUnsignedByte();
                int type = data.readUnsignedByte();
                compat[num] = (type == COMPATIBILITY);

                // Only NORMAL-tagged two-char decompositions are reversible into a composite.
                if (type == NORMAL && decomposition != null && decomposition.length() > 1) {
                    if (decomposition.length() > 2) {
                        throw new IllegalArgumentException(
                            "decomposition for " + num + " has length " + decomposition.length());
                    }
                    char first = decomposition.charAt(0);
                    char second = decomposition.charAt(1);
                    char[] row = pairs[first];
                    if (row == null) {
                        row = new char[second + 0x100];
                        pairs[first] = row;
                    } else if (row.length <= second) {
                        // Grow the row with some slack so nearby second chars fit too.
                        char[] grown = new char[second + 0x100];
                        System.arraycopy(row, 0, grown, 0, row.length);
                        row = grown;
                        pairs[first] = row;
                    }
                    row[second] = (char) num;
                }
            }
        } catch (EOFException endOfData) {
            // Expected: the file has no record count, end of file terminates the loop.
        } finally {
            try {
                data.close();
            } catch (IOException ignored) {
                // Read-only stream: a failing close cannot invalidate what was read,
                // and must not mask an exception already propagating.
            }
        }

        decompositions = decomp;
        compatibility = compat;
        canonicalClass = classes;
        compositions = pairs;
        initialized = true; // volatile write: publishes the tables assigned above
    }

    /**
     * Normalizes a whole string.
     *
     * @param form one of {@link #D}, {@link #C}, {@link #KD}, {@link #KC}
     * @param string text to normalize
     * @return the normalized text
     * @throws IOException if the normalization tables cannot be loaded
     * @throws IllegalArgumentException if {@code form} is not a known form
     */
    public static String normalize(int form, String string) throws IOException {
        StringBuffer target = new StringBuffer();
        if (normalize(form, string, 0, string.length(), target) != 0) {
            throw new RuntimeException("Internal Error: normalize did not return 0");
        }
        return target.toString();
    }

    /**
     * Moves {@code start} backwards until it points at a char whose canonical class is 0,
     * or until it reaches index 0 (which is returned without being examined).
     */
    private static int findStarter(String string, int start) {
        while (start > 0 && canonicalClass[string.charAt(start)] != 0) {
            start--;
        }
        return start;
    }

    /**
     * Normalizes the range {@code [start, end)} of {@code string} into {@code target}.
     *
     * <p>Before processing, {@code start} is moved back to the nearest char with canonical
     * class 0 (or to index 0), so the text actually normalized may begin earlier than
     * requested; the adjusted index is returned.
     *
     * <p>When {@code start == end} the target is cleared and {@code start} is returned
     * unchanged. Otherwise the result is added to whatever {@code target} already holds,
     * and combining marks may be reordered into that existing content.
     * NOTE(review): clearing only in the empty case looks inconsistent - callers should
     * pass an empty buffer.
     *
     * @param form one of {@link #D}, {@link #C}, {@link #KD}, {@link #KC}
     * @param string source text
     * @param start index of the first char to normalize
     * @param end index one past the last char to normalize
     * @param target buffer receiving the normalized text
     * @return the index in {@code string} at which normalization actually began
     * @throws IOException if the normalization tables cannot be loaded
     * @throws IllegalArgumentException if {@code form} is not a known form
     */
    public static int normalize(int form, String string, int start, int end,
                                StringBuffer target) throws IOException {
        if (start == end) {
            target.setLength(0);
            return start;
        }
        if (!initialized) {
            init();
        }
        start = findStarter(string, start);
        string = string.substring(start, end);
        switch (form) {
            case D:
                decompose(true, string, target);
                break;
            case C:
                decompose(true, string, target);
                compose(target);
                break;
            case KD:
                decompose(false, string, target);
                break;
            case KC:
                decompose(false, string, target);
                compose(target);
                break;
            default:
                throw new IllegalArgumentException("Unknown normalization form " + form);
        }
        return start;
    }

    /**
     * Composes the already decomposed, canonically ordered {@code target} in place,
     * merging each combinable char into the most recent starter and compacting the buffer.
     */
    private static void compose(StringBuffer target) {
        if (target.length() == 0) {
            return;
        }
        int starterPos = 0;
        int compPos = 1; // next write position in the compacted output
        char starterChar = target.charAt(0);
        int lastClass = canonicalClass[starterChar];
        if (lastClass != 0) {
            // The text does not begin with a starter. Classes are 0..255, so 256 can
            // never satisfy the test below and nothing composes onto this first char.
            lastClass = 256;
        }
        int len = target.length();
        for (int decompPos = 1; decompPos < len; decompPos++) {
            char c = target.charAt(decompPos);
            int cClass = canonicalClass[c];
            char composite = 0;
            char[] row = compositions[starterChar];
            if (row != null && row.length > c) {
                composite = row[c];
            }
            // Compose only when no char kept since the starter blocks c: either nothing
            // was kept yet / the last kept char was a starter (lastClass == 0), or the
            // last kept mark has a strictly lower class than c.
            if (composite != 0 && (lastClass < cClass || lastClass == 0)) {
                target.setCharAt(starterPos, composite);
                starterChar = composite;
            } else {
                if (cClass == 0) {
                    starterPos = compPos;
                    starterChar = c;
                }
                lastClass = cClass;
                target.setCharAt(compPos++, c);
            }
        }
        target.setLength(compPos);
    }

    /**
     * Appends the recursive decomposition of {@code source} to {@code target}.
     *
     * @param canonical when true, COMPATIBILITY-tagged decompositions are not applied
     */
    private static void decomposeChar(boolean canonical, char source, StringBuffer target) {
        String decomposition = decompositions[source];
        boolean skipped = canonical && compatibility[source];
        if (decomposition == null || skipped) {
            target.append(source);
            return;
        }
        int len = decomposition.length();
        for (int i = 0; i < len; i++) {
            decomposeChar(canonical, decomposition.charAt(i), target);
        }
    }

    /**
     * Decomposes every char of {@code source} and inserts the results into {@code target}
     * so that runs of non-zero canonical classes end up in non-decreasing class order.
     *
     * @param canonical when true, COMPATIBILITY-tagged decompositions are not applied
     */
    private static void decompose(boolean canonical, String source, StringBuffer target) {
        int sourceLen = source.length();
        StringBuffer buff = new StringBuffer();
        for (int i = 0; i < sourceLen; i++) {
            buff.setLength(0);
            decomposeChar(canonical, source.charAt(i), buff);
            for (int j = 0; j < buff.length(); j++) {
                char c = buff.charAt(j);
                int cl = canonicalClass[c];
                int pos = target.length();
                if (cl != 0) {
                    // Walk back past chars with a higher class; stop at the first char
                    // whose class is <= cl (a starter, class 0, always stops the walk).
                    while (pos > 0 && canonicalClass[target.charAt(pos - 1)] > cl) {
                        pos--;
                    }
                }
                target.insert(pos, c);
            }
        }
    }

    /**
     * Command-line demo: joins the arguments with single spaces and prints the text
     * followed by its form-C normalization.
     *
     * @throws IOException if the normalization tables cannot be loaded
     */
    public static void main(String args[]) throws IOException {
        StringBuffer arg = new StringBuffer();
        for (int i = 0; i < args.length; i++) {
            if (i > 0) {
                arg.append(" ");
            }
            arg.append(args[i]);
        }
        System.out.println("String: " + arg);
        System.out.println("Normal: " + normalize(C, arg.toString()));
    }

    /**
     * Tells whether {@code c} has a non-zero canonical class in the loaded tables.
     *
     * @throws IOException if the normalization tables cannot be loaded
     */
    public static boolean isCombining(char c) throws IOException {
        if (!initialized) {
            init();
        }
        return canonicalClass[c] != 0;
    }

    /** Returns true when {@code c} lies in U+D800..U+DFFF. */
    public static boolean isSurrogate(int c) {
        return (c >= 0xD800) && (c <= 0xDFFF);
    }

    /** Returns true when {@code c} lies in U+DC00..U+DFFF. */
    public static boolean isLowSurrogate(int c) {
        return (c >= 0xDC00) && (c <= 0xDFFF);
    }

    /** Returns true when {@code c} lies in U+D800..U+DBFF. */
    public static boolean isHighSurrogate(int c) {
        return (c >= 0xD800) && (c <= 0xDBFF);
    }
}