1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 package net.sf.ediknight.codec.edifact.parser;
21
22 import java.io.File;
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.io.InputStreamReader;
26 import java.io.Reader;
27 import java.util.HashMap;
28 import java.util.Map;
29
30 import net.sf.ediknight.Inspection;
31 import net.sf.ediknight.Recognizer;
32 import net.sf.ediknight.edi.SyntaxHandler;
33
34
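/**
 * Recognizer for ISO 9735 input: guesses the character encoding, the
 * service characters and the directory version of a stream and reports
 * them as an {@link ISO9735Format} wrapped in an {@link Inspection}.
 */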
38 final class ISO9735Recognizer
39 implements Recognizer<ISO9735Format> {
40
/**
 * Service characters reported when the input does not start with
 * {@code UNA} followed by seven further characters.
 */
private static final char[] DEFAULT_SERVICE_CHARACTERS =
    new char[] {
        ':', '+', '.', '?', ' ', '\'', '\n'
    };

/** Recognizer hints, keyed by hint name. */
private final Map<String, Object> hints =
    new HashMap<String, Object>();

/**
 * Confidence of the most recent encoding guess. Written by
 * {@code analyzeEncoding} and passed on to the resulting
 * {@link Inspection}.
 */
private double likeliness;

/** The associated parser. */
private final ISO9735Parser parser;


/**
 * Creates a recognizer that configures and drives the given parser.
 *
 * @param parser the parser
 */
ISO9735Recognizer(ISO9735Parser parser) {
    this.parser = parser;
}
63
64
65 /***
66 * {@inheritDoc}
67 * @see net.sf.ediknight.Recognizer#inspect(java.io.File)
68 */
69 public Inspection<ISO9735Format> inspect(File file) {
70 try {
71 return null;
72 } catch (Exception ex) {
73 return new Inspection<ISO9735Format>();
74 }
75 }
76
77
78 /***
79 * {@inheritDoc}
80 * @see net.sf.ediknight.Recognizer#inspect(
81 * java.io.InputStream)
82 */
83 public Inspection<ISO9735Format> inspect(InputStream stream) {
84 try {
85 return analyze(stream, 1024);
86 } catch (Exception ex) {
87 return new Inspection<ISO9735Format>();
88 }
89 }
90
91
92 /***
93 * {@inheritDoc}
94 * @see net.sf.ediknight.Recognizer#addHint(
95 * java.lang.String, java.lang.Object)
96 */
97 public void addHint(String hint, Object value) {
98 hints.put(hint, value);
99 }
100
101
102 /***
103 * @param istream an input stream
104 * @param lookAhead the number of bytes to look ahead
105 * @return the inspection result
106 * @throws IOException if an I/O error occurs
107 */
108 private Inspection<ISO9735Format> analyze(
109 InputStream istream,
110 int lookAhead)
111 throws IOException {
112 istream.mark(lookAhead);
113 String encoding =
114 (String) hints.get(Recognizer.ENCODING);
115 if (encoding == null) {
116 encoding = analyzeEncoding(istream);
117 }
118 istream.reset();
119 char[] serviceCharacters =
120 analyzeServiceCharacters(istream, encoding);
121 String directoryVersion =
122 (String) hints.get(Recognizer.DIRECTORY);
123 if (directoryVersion == null) {
124 istream.reset();
125 directoryVersion =
126 analyzeDirectory(
127 istream, encoding, serviceCharacters);
128 }
129 ISO9735Format format =
130 new ISO9735Format(
131 encoding,
132 directoryVersion,
133 serviceCharacters);
134 parser.setFormat(format);
135 return new Inspection<ISO9735Format>(
136 parser, format, likeliness);
137 }
138
139
140 /***
141 * @param istream an input stream
142 * @return the guessed encoding
143 * @throws IOException if an I/O error occurs
144 */
145 private String analyzeEncoding(InputStream istream)
146 throws IOException {
147 int b = istream.read();
148 if (b == 0x00) {
149
150 b = istream.read();
151 if (b == 0x00) {
152 likeliness = 1d;
153 return "UTF-32BE";
154 } else if (b == 85) {
155 likeliness = 1d;
156 return "UTF-16BE";
157 }
158 } else if (b == 0xef) {
159
160 likeliness = 1d;
161 return "UTF-8";
162 } else if (b == 0xfe) {
163
164 likeliness = 1d;
165 return "UTF-16BE";
166 } else if (b == 0xff) {
167
168
169 istream.skip(1);
170 b = istream.read();
171 if (b == 0x00) {
172 likeliness = 1d;
173 return "UTF-32LE";
174 }
175 likeliness = 1d;
176 return "UTF-16LE";
177 } else if (b == 85) {
178 b = istream.read();
179 if (b == 0x00) {
180 likeliness = 1d;
181 return "UTF-16LE";
182 }
183
184 byte[] buf = new byte[26];
185 istream.read(buf);
186 String sample = new String(buf, "US-ASCII");
187 int p = sample.indexOf("UNO");
188 if (p < 0 || p > sample.length() - 4) {
189 likeliness = 0.999d;
190 return "ISO-8859-1";
191 }
192 switch (sample.charAt(p + 3)) {
193 case 'A':
194 likeliness = 1d;
195 return "US-ASCII";
196 case 'B':
197 likeliness = 1d;
198 return "US-ASCII";
199 case 'C':
200 likeliness = 1d;
201 return "ISO-8859-1";
202 case 'D':
203 likeliness = 1d;
204 return "ISO-8859-2";
205 case 'E':
206 likeliness = 1d;
207 return "ISO-8859-5";
208 case 'F':
209 likeliness = 1d;
210 return "ISO-8859-7";
211 case 'G':
212 likeliness = 1d;
213 return "ISO-8859-3";
214 case 'H':
215 likeliness = 1d;
216 return "ISO-8859-4";
217 case 'I':
218 likeliness = 1d;
219 return "ISO-8859-6";
220 case 'J':
221 likeliness = 1d;
222 return "ISO-8859-8";
223 case 'K':
224 likeliness = 1d;
225 return "ISO-8859-9";
226 case 'X':
227 likeliness = 1d;
228 return "ISO-2022-CN-GB";
229 case 'Y':
230 likeliness = 1d;
231 return "UTF-8";
232 default:
233 likeliness = 0.999d;
234 return "ISO-8859-1";
235 }
236 } else if (b == 132) {
237
238 likeliness = 1d;
239 return "GB18030";
240 } else if (b == 228) {
241
242 likeliness = 1d;
243 return "CP500";
244 }
245
246 likeliness = 0.1d;
247 return "ISO-8859-1";
248 }
249
250
251 /***
252 * @param istream an input stream
253 * @param encoding the character encoding
254 * @throws IOException if an I/O error occurs
255 */
256 private char[] analyzeServiceCharacters(
257 InputStream istream,
258 String encoding)
259 throws IOException {
260 Reader reader = new InputStreamReader(istream, encoding);
261 int ch = reader.read();
262 if (ch != 'U') {
263 return DEFAULT_SERVICE_CHARACTERS;
264 }
265 ch = reader.read();
266 if (ch != 'N') {
267 return DEFAULT_SERVICE_CHARACTERS;
268 }
269 ch = reader.read();
270 if (ch != 'A') {
271 return DEFAULT_SERVICE_CHARACTERS;
272 }
273 char[] serviceCharacters = new char[7];
274 if (reader.read(serviceCharacters, 0, 7) != 7) {
275 return DEFAULT_SERVICE_CHARACTERS;
276 }
277 if (serviceCharacters[6] == 'U') {
278 serviceCharacters[6] = 0;
279 }
280 return serviceCharacters;
281 }
282
283
284 /***
285 * @param stream an input stream
286 * @param encoding the previously determined encoding
287 * @return the guessed directory
288 * @throws IOException if an I/O error occurs
289 */
290 private String analyzeDirectory(
291 InputStream stream,
292 String encoding,
293 char[] serviceCharacters)
294 throws IOException {
295 Reader reader = new InputStreamReader(stream, encoding);
296 WhichVersion version = new WhichVersion();
297 ISO9735Format format =
298 new ISO9735Format(encoding, null, serviceCharacters);
299 SyntaxHandler save = parser.getSyntaxHandler();
300 try {
301 parser.setSyntaxHandler(version);
302 parser.setFormat(format);
303 parser.parse(reader);
304 return version.toString();
305 } catch (Exception ex) {
306 return version.toString();
307 } finally {
308 parser.setSyntaxHandler(save);
309 }
310 }
311
312
313 private static class WhichVersion
314 implements SyntaxHandler {
315
316 private int segmentCount;
317
318 private boolean inUnhSegment;
319
320 private boolean inS009;
321
322 private String version0;
323
324 private String version1;
325
326
327 WhichVersion() {
328 }
329
330 public void characters(String value) {
331 if (value.length() == 3
332 && Character.isDigit(value.charAt(0))
333 && Character.isDigit(value.charAt(1))
334 && (Character.isUpperCase(value.charAt(2))
335 || Character.isDigit(value.charAt(2)))) {
336 if (inUnhSegment && inS009) {
337 version0 = value.toLowerCase();
338 } else {
339 version1 = value.toLowerCase();
340 }
341 }
342 }
343
344 public void finish() {
345 }
346
347 public void nextCompositeElement() {
348 if (inUnhSegment) {
349 inS009 = true;
350 }
351 }
352
353 public void nextSegment(String segmentId) {
354 if ("UNH".equals(segmentId)) {
355 inUnhSegment = true;
356 } else if (inUnhSegment) {
357 throw new PrematureStop();
358 }
359 if (segmentCount++ > 5) {
360 throw new PrematureStop();
361 }
362 }
363
364 public void nextSimpleElement() {
365 }
366
367 @Override
368 public String toString() {
369 if (version0 != null) {
370 return version0;
371 }
372 if (version1 != null) {
373 return version1;
374 }
375 return "01b";
376 }
377
378 }
379
380 private static class PrematureStop
381 extends RuntimeException {
382
383 private static final long serialVersionUID = 3268184132387374766L;
384
385 }
386
387 }
388