/*
 *******************************************************************************
 * Copyright (C) 2005 - 2008, International Business Machines Corporation and  *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
7   package org.apache.tika.parser.txt;
8   
9   /**
10   *  class CharsetRecog_2022  part of the ICU charset detection imlementation.
11   *                           This is a superclass for the individual detectors for
12   *                           each of the detectable members of the ISO 2022 family
13   *                           of encodings.
14   * 
15   *                           The separate classes are nested within this class.
16   * 
17   * @internal
18   */
19  abstract class CharsetRecog_2022 extends CharsetRecognizer {
20  
21      
22      /**
23       * Matching function shared among the 2022 detectors JP, CN and KR
24       * Counts up the number of legal an unrecognized escape sequences in
25       * the sample of text, and computes a score based on the total number &
26       * the proportion that fit the encoding.
27       * 
28       * 
29       * @param text the byte buffer containing text to analyse
30       * @param textLen  the size of the text in the byte.
31       * @param escapeSequences the byte escape sequences to test for.
32       * @return match quality, in the range of 0-100.
33       */
34      int   match(byte [] text, int textLen, byte [][] escapeSequences) {
35          int     i, j;
36          int     escN;
37          int     hits   = 0;
38          int     misses = 0;
39          int     shifts = 0;
40          int     quality;
41          scanInput:
42              for (i=0; i<textLen; i++) {
43                  if (text[i] == 0x1b) {
44                      checkEscapes:
45                          for (escN=0; escN<escapeSequences.length; escN++) {
46                              byte [] seq = escapeSequences[escN];
47                              
48                              if ((textLen - i) < seq.length) {
49                                  continue checkEscapes;
50                              }
51                              
52                              for (j=1; j<seq.length; j++) {
53                                  if (seq[j] != text[i+j])  {
54                                      continue checkEscapes;
55                                  }                                   
56                              }
57                              
58                              hits++; 
59                              i += seq.length-1;
60                              continue scanInput;
61                          }
62                  
63                          misses++;                  
64                  }
65                  
66                  if (text[i] == 0x0e || text[i] == 0x0f) {
67                      // Shift in/out
68                      shifts++;
69                  }
70              }
71          
72          if (hits == 0) {
73              return 0;
74          }
75          
76          //
77          // Initial quality is based on relative proportion of recongized vs.
78          //   unrecognized escape sequences. 
79          //   All good:  quality = 100;
80          //   half or less good: quality = 0;
81          //   linear inbetween.
82          quality = (100*hits - 100*misses) / (hits + misses);
83          
84          // Back off quality if there were too few escape sequences seen.
85          //   Include shifts in this computation, so that KR does not get penalized
86          //   for having only a single Escape sequence, but many shifts.
87          if (hits+shifts < 5) {
88              quality -= (5-(hits+shifts))*10;
89          }
90          
91          if (quality < 0) {
92              quality = 0;
93          }        
94          return quality;
95      }
96  
97      
98   
99      
100     static class CharsetRecog_2022JP extends CharsetRecog_2022 {
101         private byte [] [] escapeSequences = {
102                 {0x1b, 0x24, 0x28, 0x43},   // KS X 1001:1992
103                 {0x1b, 0x24, 0x28, 0x44},   // JIS X 212-1990
104                 {0x1b, 0x24, 0x40},         // JIS C 6226-1978
105                 {0x1b, 0x24, 0x41},         // GB 2312-80
106                 {0x1b, 0x24, 0x42},         // JIS X 208-1983
107                 {0x1b, 0x26, 0x40},         // JIS X 208 1990, 1997
108                 {0x1b, 0x28, 0x42},         // ASCII
109                 {0x1b, 0x28, 0x48},         // JIS-Roman
110                 {0x1b, 0x28, 0x49},         // Half-width katakana
111                 {0x1b, 0x28, 0x4a},         // JIS-Roman
112                 {0x1b, 0x2e, 0x41},         // ISO 8859-1
113                 {0x1b, 0x2e, 0x46}          // ISO 8859-7
114                 };
115         
116         String getName() {
117             return "ISO-2022-JP";
118         }
119         
120         int   match(CharsetDetector det) {
121             return match(det.fInputBytes, det.fInputLen, escapeSequences);
122         }
123     }
124 
125     static class CharsetRecog_2022KR extends CharsetRecog_2022 {
126         private byte [] [] escapeSequences = {
127                 {0x1b, 0x24, 0x29, 0x43}   
128                  };
129         
130         String getName() {
131             return "ISO-2022-KR";
132         }
133         
134         int   match(CharsetDetector det) {
135             return match(det.fInputBytes, det.fInputLen, escapeSequences);
136         }
137         
138     }
139 
140     static class CharsetRecog_2022CN extends CharsetRecog_2022 {
141         private byte [] [] escapeSequences = {
142                 {0x1b, 0x24, 0x29, 0x41},   // GB 2312-80
143                 {0x1b, 0x24, 0x29, 0x47},   // CNS 11643-1992 Plane 1
144                 {0x1b, 0x24, 0x2A, 0x48},   // CNS 11643-1992 Plane 2
145                 {0x1b, 0x24, 0x29, 0x45},   // ISO-IR-165
146                 {0x1b, 0x24, 0x2B, 0x49},   // CNS 11643-1992 Plane 3
147                 {0x1b, 0x24, 0x2B, 0x4A},   // CNS 11643-1992 Plane 4
148                 {0x1b, 0x24, 0x2B, 0x4B},   // CNS 11643-1992 Plane 5
149                 {0x1b, 0x24, 0x2B, 0x4C},   // CNS 11643-1992 Plane 6
150                 {0x1b, 0x24, 0x2B, 0x4D},   // CNS 11643-1992 Plane 7
151                 {0x1b, 0x4e},               // SS2
152                 {0x1b, 0x4f},               // SS3
153         };
154         
155         String getName() {
156             return "ISO-2022-CN";
157         }
158         
159         
160         int   match(CharsetDetector det) {
161             return match(det.fInputBytes, det.fInputLen, escapeSequences);
162         }
163     }
164     
165     }
166