View Javadoc

1   /*
2    ****************************************************************************
3    * Copyright (C) 2005-2008, International Business Machines Corporation and *
4    * others. All Rights Reserved.                                             *
5    ****************************************************************************
6    *
7    */
8   package org.apache.tika.parser.txt;
9   
10  import java.util.Arrays;
11  
/**
 * CharsetRecognizer implementation for Asian - double or multi-byte - charsets.
 *                   Match is determined mostly by the input data adhering to the
 *                   encoding scheme for the charset, and, optionally,
 *                   frequency-of-occurrence of characters.
 * <p/>
 *                   Instances of this class are singletons, one per encoding
 *                   being recognized.  They are created in the main
 *                   CharsetDetector class and kept in the global list of available
 *                   encodings to be checked.  The specific encoding being recognized
 *                   is determined by subclass.
 * 
 * @internal                  
 */
26  abstract class CharsetRecog_mbcs extends CharsetRecognizer {
27  
    /**
     * Get the name of the charset handled by this recognizer.
     * Each concrete recognizer in this file returns a fixed string
     * (for example "Shift_JIS", "Big5", "EUC-JP", "EUC-KR" or "GB18030").
     *
     * @return the charset name.
     */
    abstract String      getName() ;
33      
34      
35      /**
36       * Test the match of this charset with the input text data
37       *      which is obtained via the CharsetDetector object.
38       * 
39       * @param det  The CharsetDetector, which contains the input text
40       *             to be checked for being in this charset.
41       * @return     Two values packed into one int  (Damn java, anyhow)
42       *             <br/>
43       *             bits 0-7:  the match confidence, ranging from 0-100
44       *             <br/>
45       *             bits 8-15: The match reason, an enum-like value.
46       */
47      int match(CharsetDetector det, int [] commonChars) {
48          int   singleByteCharCount = 0;
49          int   doubleByteCharCount = 0;
50          int   commonCharCount     = 0;
51          int   badCharCount        = 0;
52          int   totalCharCount      = 0;
53          int   confidence          = 0;
54          iteratedChar   iter       = new iteratedChar();
55          
56          detectBlock: {
57              for (iter.reset(); nextChar(iter, det);) {
58                  totalCharCount++;
59                  if (iter.error) {
60                      badCharCount++; 
61                  } else {
62                      long cv = iter.charValue & 0xFFFFFFFFL;
63                                          
64                      if (cv <= 0xff) {
65                          singleByteCharCount++;
66                      } else {
67                          doubleByteCharCount++;
68                          if (commonChars != null) {
69                              // NOTE: This assumes that there are no 4-byte common chars.
70                              if (Arrays.binarySearch(commonChars, (int) cv) >= 0) {
71                                  commonCharCount++;
72                              }
73                          }
74                      }
75                  }
76                  if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
77                      // Bail out early if the byte data is not matching the encoding scheme.
78                      break detectBlock;
79                  }
80              }
81              
82              if (doubleByteCharCount <= 10 && badCharCount== 0) {
83                  // Not many multi-byte chars.
84                  if (doubleByteCharCount == 0 && totalCharCount < 10) {
85                      // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
86                      // We don't have enough data to have any confidence.
87                      // Statistical analysis of single byte non-ASCII charcters would probably help here.
88                      confidence = 0;
89                  }
90                  else {
91                      //   ASCII or ISO file?  It's probably not our encoding,
92                      //   but is not incompatible with our encoding, so don't give it a zero.
93                      confidence = 10;
94                  }
95                  
96                  break detectBlock;
97              }
98              
99              //
100             //  No match if there are too many characters that don't fit the encoding scheme.
101             //    (should we have zero tolerance for these?)
102             //
103             if (doubleByteCharCount < 20*badCharCount) {
104                 confidence = 0;
105                 break detectBlock;
106             }
107             
108             if (commonChars == null) {
109                 // We have no statistics on frequently occuring characters.
110                 //  Assess confidence purely on having a reasonable number of
111                 //  multi-byte characters (the more the better
112                 confidence = 30 + doubleByteCharCount - 20*badCharCount;
113                 if (confidence > 100) {
114                     confidence = 100;
115                 }
116             }else {
117                 //
118                 // Frequency of occurence statistics exist.
119                 //
120                 double maxVal = Math.log((float)doubleByteCharCount / 4);
121                 double scaleFactor = 90.0 / maxVal;
122                 confidence = (int)(Math.log(commonCharCount+1) * scaleFactor + 10);
123                 confidence = Math.min(confidence, 100);
124             }
125         }   // end of detectBlock:
126         
127         return confidence;
128     }
129     
130      // "Character"  iterated character class.
131      //    Recognizers for specific mbcs encodings make their "characters" available
132      //    by providing a nextChar() function that fills in an instance of iteratedChar
133      //    with the next char from the input.
134      //    The returned characters are not converted to Unicode, but remain as the raw
135      //    bytes (concatenated into an int) from the codepage data.
136      //
137      //  For Asian charsets, use the raw input rather than the input that has been
138      //   stripped of markup.  Detection only considers multi-byte chars, effectively
139      //   stripping markup anyway, and double byte chars do occur in markup too.
140      //
141      static class iteratedChar {
142          int             charValue = 0;             // 1-4 bytes from the raw input data
143          int             index     = 0;
144          int             nextIndex = 0;
145          boolean         error     = false;
146          boolean         done      = false;
147          
148          void reset() {
149              charValue = 0;
150              index     = -1;
151              nextIndex = 0;
152              error     = false;
153              done      = false;
154          }
155          
156          int nextByte(CharsetDetector det) {
157              if (nextIndex >= det.fRawLength) {
158                  done = true;
159                  return -1;
160              }
161              int byteValue = (int)det.fRawInput[nextIndex++] & 0x00ff;
162              return byteValue;
163          }       
164      }
165      
    /**
     * Get the next character (however many bytes it is) from the input data.
     *    Subclasses for specific charset encodings must implement this function
     *    to get characters according to the rules of their encoding scheme.
     *    The implementations in this file store the raw bytes of the character
     *    in <code>it.charValue</code> and set <code>it.error</code> when the
     *    bytes do not form a legal character.
     * 
     *  This function is not a method of class iteratedChar only because
     *   that would require a lot of extra derived classes, which is awkward.
     * @param it  The iteratedChar "struct" into which the returned char is placed.
     * @param det The charset detector, which is needed to get at the input byte data
     *            being iterated over.
     * @return    True if a character was returned, false at end of input.
     */
    abstract boolean nextChar(iteratedChar it, CharsetDetector det);
179      
180 
181 
182      
183      
184      /**
185       *   Shift-JIS charset recognizer.   
186       *
187       */
188      static class CharsetRecog_sjis extends CharsetRecog_mbcs {
189          static int [] commonChars = 
190              // TODO:  This set of data comes from the character frequency-
191              //        of-occurence analysis tool.  The data needs to be moved
192              //        into a resource and loaded from there.
193             {0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0, 
194              0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5, 
195              0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc, 
196              0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341, 
197              0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389, 
198              0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
199          
200          boolean nextChar(iteratedChar it, CharsetDetector det) {
201              it.index = it.nextIndex;
202              it.error = false;
203              int firstByte;
204              firstByte = it.charValue = it.nextByte(det);
205              if (firstByte < 0) {
206                  return false;
207              }
208              
209              if (firstByte <= 0x7f || (firstByte>0xa0 && firstByte<=0xdf)) {
210                  return true;
211              }
212              
213              int secondByte = it.nextByte(det);
214              if (secondByte < 0)  {
215                  return false;          
216              }
217              it.charValue = (firstByte << 8) | secondByte;
218              if (! ((secondByte>=0x40 && secondByte<=0x7f) || (secondByte>=0x80 && secondByte<=0xff))) {
219                  // Illegal second byte value.
220                  it.error = true;
221              }
222              return true;
223          }
224          
225          int match(CharsetDetector det) {
226              return match(det, commonChars);
227          }
228          
229          String getName() {
230              return "Shift_JIS";
231          }
232          
233          public String getLanguage()
234          {
235              return "ja";
236          }
237 
238          
239      }
240      
241      
242      /**
243       *   Big5 charset recognizer.   
244       *
245       */
246      static class CharsetRecog_big5 extends CharsetRecog_mbcs {
247          static int [] commonChars = 
248              // TODO:  This set of data comes from the character frequency-
249              //        of-occurence analysis tool.  The data needs to be moved
250              //        into a resource and loaded from there.
251             {0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446, 
252              0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3, 
253              0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548, 
254              0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8, 
255              0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da, 
256              0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3, 
257              0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59, 
258              0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c, 
259              0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44, 
260              0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
261           
262          boolean nextChar(iteratedChar it, CharsetDetector det) {
263              it.index = it.nextIndex;
264              it.error = false;
265              int firstByte;
266              firstByte = it.charValue = it.nextByte(det);
267              if (firstByte < 0) {
268                  return false;
269              }
270              
271              if (firstByte <= 0x7f || firstByte==0xff) {
272                  // single byte character.
273                  return true;
274              }
275              
276              int secondByte = it.nextByte(det);
277              if (secondByte < 0)  {
278                  return false;          
279              }
280              it.charValue = (it.charValue << 8) | secondByte;
281 
282              if (secondByte < 0x40 ||
283                  secondByte ==0x7f ||
284                  secondByte == 0xff) {
285                      it.error = true;
286              }
287              return true;
288          }
289          
290          int match(CharsetDetector det) {
291              return match(det, commonChars);
292          }
293          
294          String getName() {
295              return "Big5";
296          }
297          
298          
299          public String getLanguage()
300          {
301              return "zh";
302          }
303      }
304      
305      
306      /**
307       *   EUC charset recognizers.  One abstract class that provides the common function
308       *             for getting the next character according to the EUC encoding scheme,
309       *             and nested derived classes for EUC_KR, EUC_JP, EUC_CN.   
310       *
311       */
312      abstract static class CharsetRecog_euc extends CharsetRecog_mbcs {
313          
314          /*
315           *  (non-Javadoc)
316           *  Get the next character value for EUC based encodings.
317           *  Character "value" is simply the raw bytes that make up the character
318           *     packed into an int.
319           */
320          boolean nextChar(iteratedChar it, CharsetDetector det) {
321              it.index = it.nextIndex;
322              it.error = false;
323              int firstByte  = 0;
324              int secondByte = 0;
325              int thirdByte  = 0;
326              //int fourthByte = 0;
327              
328              buildChar: {
329                  firstByte = it.charValue = it.nextByte(det);                 
330                  if (firstByte < 0) {
331                      // Ran off the end of the input data
332                      it.done = true;
333                      break buildChar;
334                  }
335                  if (firstByte <= 0x8d) {
336                      // single byte char
337                      break buildChar;
338                  }
339                  
340                  secondByte = it.nextByte(det);
341                  it.charValue = (it.charValue << 8) | secondByte;
342                  
343                  if (firstByte >= 0xA1 && firstByte <= 0xfe) {
344                      // Two byte Char
345                      if (secondByte < 0xa1) {
346                          it.error = true;
347                      }
348                      break buildChar;
349                  }
350                  if (firstByte == 0x8e) {
351                      // Code Set 2.
352                      //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
353                      //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
354                      // We don't know which we've got.
355                      // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
356                      //   bytes will look like a well formed 2 byte char.  
357                      if (secondByte < 0xa1) {
358                          it.error = true;
359                      }
360                      break buildChar;                     
361                  }
362                  
363                  if (firstByte == 0x8f) {
364                      // Code set 3.
365                      // Three byte total char size, two bytes of actual char value.
366                      thirdByte    = it.nextByte(det);
367                      it.charValue = (it.charValue << 8) | thirdByte;
368                      if (thirdByte < 0xa1) {
369                          it.error = true;
370                      }
371                  }
372               }
373              
374              return (it.done == false);
375          }
376          
377          /**
378           * The charset recognize for EUC-JP.  A singleton instance of this class
379           *    is created and kept by the public CharsetDetector class
380           */
381          static class CharsetRecog_euc_jp extends CharsetRecog_euc {
382              static int [] commonChars = 
383                  // TODO:  This set of data comes from the character frequency-
384                  //        of-occurence analysis tool.  The data needs to be moved
385                  //        into a resource and loaded from there.
386                 {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2, 
387                  0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3, 
388                  0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4, 
389                  0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de, 
390                  0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef, 
391                  0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af, 
392                  0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7, 
393                  0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1, 
394                  0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee, 
395                  0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};             
396              String getName() {
397                  return "EUC-JP";
398              }
399              
400              int match(CharsetDetector det) {
401                  return match(det, commonChars);
402              }
403              
404              public String getLanguage()
405              {
406                  return "ja";
407              }
408          }
409          
410          /**
411           * The charset recognize for EUC-KR.  A singleton instance of this class
412           *    is created and kept by the public CharsetDetector class
413           */
414          static class CharsetRecog_euc_kr extends CharsetRecog_euc {
415              static int [] commonChars = 
416                  // TODO:  This set of data comes from the character frequency-
417                  //        of-occurence analysis tool.  The data needs to be moved
418                  //        into a resource and loaded from there.
419                 {0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc, 
420                  0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9, 
421                  0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce, 
422                  0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce, 
423                  0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba, 
424                  0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee, 
425                  0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7, 
426                  0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6, 
427                  0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6, 
428                  0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
429              
430              String getName() {
431                  return "EUC-KR";
432              }
433              
434              int match(CharsetDetector det) {
435                  return match(det, commonChars);
436              }
437              
438              public String getLanguage()
439              {
440                  return "ko";
441              }
442          }
443      }
444      
445      /**
446       * 
447       *   GB-18030 recognizer. Uses simplified Chinese statistics.   
448       *
449       */
450      static class CharsetRecog_gb_18030 extends CharsetRecog_mbcs {
451          
452          /*
453           *  (non-Javadoc)
454           *  Get the next character value for EUC based encodings.
455           *  Character "value" is simply the raw bytes that make up the character
456           *     packed into an int.
457           */
458          boolean nextChar(iteratedChar it, CharsetDetector det) {
459              it.index = it.nextIndex;
460              it.error = false;
461              int firstByte  = 0;
462              int secondByte = 0;
463              int thirdByte  = 0;
464              int fourthByte = 0;
465              
466              buildChar: {
467                  firstByte = it.charValue = it.nextByte(det); 
468                  
469                  if (firstByte < 0) {
470                      // Ran off the end of the input data
471                      it.done = true;
472                      break buildChar;
473                  }
474                  
475                  if (firstByte <= 0x80) {
476                      // single byte char
477                      break buildChar;
478                  }
479                  
480                  secondByte = it.nextByte(det);
481                  it.charValue = (it.charValue << 8) | secondByte;
482                  
483                  if (firstByte >= 0x81 && firstByte <= 0xFE) {
484                      // Two byte Char
485                      if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <=0xFE)) {
486                          break buildChar;
487                      }
488                      
489                      // Four byte char
490                      if (secondByte >= 0x30 && secondByte <= 0x39) {
491                          thirdByte = it.nextByte(det);
492                          
493                          if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
494                              fourthByte = it.nextByte(det);
495                              
496                              if (fourthByte >= 0x30 && fourthByte <= 0x39) {
497                                  it.charValue = (it.charValue << 16) | (thirdByte << 8) | fourthByte;
498                                  break buildChar;
499                              }
500                          }
501                      }
502                      
503                      it.error = true;
504                      break buildChar;
505                  }
506              }
507                  
508              return (it.done == false);
509          }
510          
511          static int [] commonChars = 
512              // TODO:  This set of data comes from the character frequency-
513              //        of-occurence analysis tool.  The data needs to be moved
514              //        into a resource and loaded from there.
515             {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac, 
516              0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4, 
517              0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4, 
518              0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6, 
519              0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6, 
520              0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7, 
521              0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7, 
522              0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5, 
523              0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2, 
524              0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
525 
526          
527          String getName() {
528              return "GB18030";
529          }
530          
531          int match(CharsetDetector det) {
532              return match(det, commonChars);
533          }
534          
535          public String getLanguage()
536          {
537              return "zh";
538          }
539      }
540      
541      
542 }