View Javadoc

1   /*
2    * EDI-Knight Integration and Transformation Platform
3    * Copyright (C) 2006-2007 Holger Joest <hjoest@users.sourceforge.net>
4    *
5    * This program is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU General Public License as published by
7    * the Free Software Foundation; either version 2 of the License, or
8    * (at your option) any later version.
9    *
10   * This program is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU General Public License for more details.
14   *
15   * You should have received a copy of the GNU General Public License
16   * along with this program; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   */
19  
20  package net.sf.ediknight.codec.edifact.parser;
21  
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;

import net.sf.ediknight.Inspection;
import net.sf.ediknight.Recognizer;
import net.sf.ediknight.edi.SyntaxHandler;
33  
34  
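/**
 * Recognizer for ISO 9735 input. It looks at the beginning of a stream
 * to guess the character encoding, read the service characters from a
 * leading UNA segment and determine the directory version, and reports
 * the result as an {@link Inspection} of an {@link ISO9735Format}.
 */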
38  final class ISO9735Recognizer
39  implements Recognizer<ISO9735Format> {
40  
41      /*** */
42      private static final char[] DEFAULT_SERVICE_CHARACTERS =
43          new char[] {
44          ':', '+', '.', '?', ' ', '\'', '\n'
45      };
46  
47      /*** Recognizer hints. */
48      private Map<String, Object> hints =
49          new HashMap<String, Object>();
50  
51      private double likeliness;
52  
53      /*** The associated parser. */
54      private ISO9735Parser parser;
55  
56  
57      /***
58       * @param parser the parser
59       */
60      ISO9735Recognizer(ISO9735Parser parser) {
61          this.parser = parser;
62      }
63  
64  
65      /***
66       * {@inheritDoc}
67       * @see net.sf.ediknight.Recognizer#inspect(java.io.File)
68       */
69      public Inspection<ISO9735Format> inspect(File file) {
70          try {
71              return null;
72          } catch (Exception ex) {
73              return new Inspection<ISO9735Format>();
74          }
75      }
76  
77  
78      /***
79       * {@inheritDoc}
80       * @see net.sf.ediknight.Recognizer#inspect(
81       *      java.io.InputStream)
82       */
83      public Inspection<ISO9735Format> inspect(InputStream stream) {
84          try {
85              return analyze(stream, 1024);
86          } catch (Exception ex) {
87              return new Inspection<ISO9735Format>();
88          }
89      }
90  
91  
92      /***
93       * {@inheritDoc}
94       * @see net.sf.ediknight.Recognizer#addHint(
95       *      java.lang.String, java.lang.Object)
96       */
97      public void addHint(String hint, Object value) {
98          hints.put(hint, value);
99      }
100 
101 
102     /***
103      * @param istream an input stream
104      * @param lookAhead the number of bytes to look ahead
105      * @return the inspection result
106      * @throws IOException if an I/O error occurs
107      */
108     private Inspection<ISO9735Format> analyze(
109             InputStream istream,
110             int lookAhead)
111     throws IOException {
112         istream.mark(lookAhead);
113         String encoding =
114             (String) hints.get(Recognizer.ENCODING);
115         if (encoding == null) {
116             encoding = analyzeEncoding(istream);
117         }
118         istream.reset();
119         char[] serviceCharacters =
120             analyzeServiceCharacters(istream, encoding);
121         String directoryVersion =
122             (String) hints.get(Recognizer.DIRECTORY);
123         if (directoryVersion == null) {
124             istream.reset();
125             directoryVersion =
126                 analyzeDirectory(
127                         istream, encoding, serviceCharacters);
128         }
129         ISO9735Format format =
130             new ISO9735Format(
131                     encoding,
132                     directoryVersion,
133                     serviceCharacters);
134         parser.setFormat(format);
135         return new Inspection<ISO9735Format>(
136                 parser, format, likeliness);
137     }
138 
139 
140     /***
141      * @param istream an input stream
142      * @return the guessed encoding
143      * @throws IOException if an I/O error occurs
144      */
145     private String analyzeEncoding(InputStream istream)
146     throws IOException {
147         int b = istream.read();
148         if (b == 0x00) {
149             /* BOM:  00 00 FE FF   UTF-32BE */
150             b = istream.read();
151             if (b == 0x00) {
152                 likeliness = 1d;
153                 return "UTF-32BE";
154             } else if (b == 85) {
155                 likeliness = 1d;
156                 return "UTF-16BE";
157             }
158         } else if (b == 0xef) {
159             /* BOM:  EF BB BF      UTF-8 */
160             likeliness = 1d;
161             return "UTF-8";
162         } else if (b == 0xfe) {
163             /* BOM:  FE FF         UTF-16BE */
164             likeliness = 1d;
165             return "UTF-16BE";
166         } else if (b == 0xff) {
167             /* BOM:  FF FE 00 00   UTF-32, little-endian
168                      FF FE         UTF-16, little-endian */
169             istream.skip(1);
170             b = istream.read();
171             if (b == 0x00) {
172                 likeliness = 1d;
173                 return "UTF-32LE";
174             }
175             likeliness = 1d;
176             return "UTF-16LE";
177         } else if (b == 85) {
178             b = istream.read();
179             if (b == 0x00) {
180                 likeliness = 1d;
181                 return "UTF-16LE";
182             }
183             /* ASCII derivative */
184             byte[] buf = new byte[26];
185             istream.read(buf);
186             String sample = new String(buf, "US-ASCII");
187             int p = sample.indexOf("UNO");
188             if (p < 0 || p > sample.length() - 4) {
189                 likeliness = 0.999d;
190                 return "ISO-8859-1";
191             }
192             switch (sample.charAt(p + 3)) {
193             case 'A':
194                 likeliness = 1d;
195                 return "US-ASCII";
196             case 'B':
197                 likeliness = 1d;
198                 return "US-ASCII";
199             case 'C':
200                 likeliness = 1d;
201                 return "ISO-8859-1";
202             case 'D':
203                 likeliness = 1d;
204                 return "ISO-8859-2";
205             case 'E':
206                 likeliness = 1d;
207                 return "ISO-8859-5";
208             case 'F':
209                 likeliness = 1d;
210                 return "ISO-8859-7";
211             case 'G':
212                 likeliness = 1d;
213                 return "ISO-8859-3";
214             case 'H':
215                 likeliness = 1d;
216                 return "ISO-8859-4";
217             case 'I':
218                 likeliness = 1d;
219                 return "ISO-8859-6";
220             case 'J':
221                 likeliness = 1d;
222                 return "ISO-8859-8";
223             case 'K':
224                 likeliness = 1d;
225                 return "ISO-8859-9";
226             case 'X':
227                 likeliness = 1d;
228                 return "ISO-2022-CN-GB";
229             case 'Y':
230                 likeliness = 1d;
231                 return "UTF-8";
232             default:
233                 likeliness = 0.999d;
234                 return "ISO-8859-1";
235             }
236         } else if (b == 132) {
237             /* Simplified chinese */
238             likeliness = 1d;
239             return "GB18030";
240         } else if (b == 228) {
241             /* looks like EBCDIC */
242             likeliness = 1d;
243             return "CP500";
244         }
245         // try per default
246         likeliness = 0.1d;
247         return "ISO-8859-1";
248     }
249 
250 
251     /***
252      * @param istream an input stream
253      * @param encoding the character encoding
254      * @throws IOException if an I/O error occurs
255      */
256     private char[] analyzeServiceCharacters(
257             InputStream istream,
258             String encoding)
259     throws IOException {
260         Reader reader = new InputStreamReader(istream, encoding);
261         int ch = reader.read();
262         if (ch != 'U') {
263             return DEFAULT_SERVICE_CHARACTERS;
264         }
265         ch = reader.read();
266         if (ch != 'N') {
267             return DEFAULT_SERVICE_CHARACTERS;
268         }
269         ch = reader.read();
270         if (ch != 'A') {
271             return DEFAULT_SERVICE_CHARACTERS;
272         }
273         char[] serviceCharacters = new char[7];
274         if (reader.read(serviceCharacters, 0, 7) != 7) {
275             return DEFAULT_SERVICE_CHARACTERS;
276         }
277         if (serviceCharacters[6] == 'U') {
278             serviceCharacters[6] = 0;
279         }
280         return serviceCharacters;
281     }
282 
283 
284     /***
285      * @param stream an input stream
286      * @param encoding the previously determined encoding
287      * @return the guessed directory
288      * @throws IOException if an I/O error occurs
289      */
290     private String analyzeDirectory(
291             InputStream stream,
292             String encoding,
293             char[] serviceCharacters)
294     throws IOException {
295         Reader reader = new InputStreamReader(stream, encoding);
296         WhichVersion version = new WhichVersion();
297         ISO9735Format format =
298             new ISO9735Format(encoding, null, serviceCharacters);
299         SyntaxHandler save = parser.getSyntaxHandler();
300         try {
301             parser.setSyntaxHandler(version);
302             parser.setFormat(format);
303             parser.parse(reader);
304             return version.toString();
305         } catch (Exception ex) {
306             return version.toString(); 
307         } finally {
308             parser.setSyntaxHandler(save);
309         }
310     }
311 
312 
313     private static class WhichVersion
314     implements SyntaxHandler {
315 
316         private int segmentCount;
317 
318         private boolean inUnhSegment;
319 
320         private boolean inS009;
321 
322         private String version0;
323 
324         private String version1;
325 
326 
327         WhichVersion() {
328         }
329 
330         public void characters(String value) {
331             if (value.length() == 3
332                     && Character.isDigit(value.charAt(0))
333                     && Character.isDigit(value.charAt(1))
334                     && (Character.isUpperCase(value.charAt(2))
335                             || Character.isDigit(value.charAt(2)))) {
336                 if (inUnhSegment && inS009) {
337                     version0 = value.toLowerCase();
338                 } else {
339                     version1 = value.toLowerCase();
340                 }
341             }
342         }
343 
344         public void finish() {
345         }
346 
347         public void nextCompositeElement() {
348             if (inUnhSegment) {
349                 inS009 = true;
350             }
351         }
352 
353         public void nextSegment(String segmentId) {
354             if ("UNH".equals(segmentId)) {
355                 inUnhSegment = true;
356             } else if (inUnhSegment) {
357                 throw new PrematureStop();
358             }
359             if (segmentCount++ > 5) {
360                 throw new PrematureStop();
361             }
362         }
363 
364         public void nextSimpleElement() {
365         }
366 
367         @Override
368         public String toString() {
369             if (version0 != null) {
370                 return version0;
371             }
372             if (version1 != null) {
373                 return version1;
374             }
375             return "01b";
376         }
377 
378     }
379 
380     private static class PrematureStop
381     extends RuntimeException {
382 
383         private static final long serialVersionUID = 3268184132387374766L;
384 
385     }
386 
387 }
388