View Javadoc

1   /*
2    *******************************************************************************
3    * Copyright (C) 1996-2007, International Business Machines Corporation and    *
4    * others. All Rights Reserved.                                                *
5    *******************************************************************************
6    *
7    */
8   package org.apache.tika.parser.txt;
9   
10  /**
11   * This class matches UTF-16 and UTF-32, both big- and little-endian. The
12   * BOM will be used if it is present.
13   * 
14   * @internal
15   */
16  abstract class CharsetRecog_Unicode extends CharsetRecognizer {
17  
18      /* (non-Javadoc)
19       * Names the charset this recognizer reports (the subclasses below return e.g. "UTF-16BE"); see CharsetRecognizer#getName().
20       */
21      abstract String getName();
22  
23      /* (non-Javadoc)
24       * Scores det's raw input; the subclasses below return 0 (no match) up to 100. See CharsetRecognizer#match(CharsetDetector).
25       */
26      abstract int match(CharsetDetector det);
27      
28      static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode
29      {
30          String getName()
31          {
32              return "UTF-16BE";
33          }
34          
35          int match(CharsetDetector det)
36          {
37              byte[] input = det.fRawInput;
38              
39              if (input.length>=2 && ((input[0] & 0xFF) == 0xFE && (input[1] & 0xFF) == 0xFF)) {
40                  return 100;
41              }
42              
43              // TODO: Do some statistics to check for unsigned UTF-16BE
44              return 0;
45          }
46      }
47      
48      static class CharsetRecog_UTF_16_LE extends CharsetRecog_Unicode
49      {
50          String getName()
51          {
52              return "UTF-16LE";
53          }
54          
55          int match(CharsetDetector det)
56          {
57              byte[] input = det.fRawInput;
58              
59              if (input.length >= 2 && ((input[0] & 0xFF) == 0xFF && (input[1] & 0xFF) == 0xFE))
60              {
61                 // An LE BOM is present.
62                 if (input.length>=4 && input[2] == 0x00 && input[3] == 0x00) {
63                     // It is probably UTF-32 LE, not UTF-16
64                     return 0;
65                 }
66                 return 100;
67              }        
68              
69              // TODO: Do some statistics to check for unsigned UTF-16LE
70              return 0;
71          }
72      }
73      
74      static abstract class CharsetRecog_UTF_32 extends CharsetRecog_Unicode
75      {
76          abstract int getChar(byte[] input, int index);
77          
78          abstract String getName();
79          
80          int match(CharsetDetector det)
81          {
82              byte[] input   = det.fRawInput;
83              int limit      = (det.fRawLength / 4) * 4;
84              int numValid   = 0;
85              int numInvalid = 0;
86              boolean hasBOM = false;
87              int confidence = 0;
88              
89              if (limit==0) {
90                  return 0;
91              }
92              if (getChar(input, 0) == 0x0000FEFF) {
93                  hasBOM = true;
94              }
95              
96              for(int i = 0; i < limit; i += 4) {
97                  int ch = getChar(input, i);
98                  
99                  if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) {
100                     numInvalid += 1;
101                 } else {
102                     numValid += 1;
103                 }
104             }
105             
106             
107             // Cook up some sort of confidence score, based on presence of a BOM
108             //    and the existence of valid and/or invalid multi-byte sequences.
109             if (hasBOM && numInvalid==0) {
110                 confidence = 100;
111             } else if (hasBOM && numValid > numInvalid*10) {
112                 confidence = 80;
113             } else if (numValid > 3 && numInvalid == 0) {
114                 confidence = 100;            
115             } else if (numValid > 0 && numInvalid == 0) {
116                 confidence = 80;
117             } else if (numValid > numInvalid*10) {
118                 // Probably corrupt UTF-32BE data.  Valid sequences aren't likely by chance.
119                 confidence = 25;
120             }
121             
122             return confidence;
123         }
124     }
125     
126     static class CharsetRecog_UTF_32_BE extends CharsetRecog_UTF_32
127     {
128         int getChar(byte[] input, int index)
129         {
130             return (input[index + 0] & 0xFF) << 24 | (input[index + 1] & 0xFF) << 16 |
131                    (input[index + 2] & 0xFF) <<  8 | (input[index + 3] & 0xFF);
132         }
133         
134         String getName()
135         {
136             return "UTF-32BE";
137         }
138     }
139 
140     
141     static class CharsetRecog_UTF_32_LE extends CharsetRecog_UTF_32
142     {
143         int getChar(byte[] input, int index)
144         {
145             return (input[index + 3] & 0xFF) << 24 | (input[index + 2] & 0xFF) << 16 |
146                    (input[index + 1] & 0xFF) <<  8 | (input[index + 0] & 0xFF);
147         }
148         
149         String getName()
150         {
151             return "UTF-32LE";
152         }
153     }
154 }