View Javadoc

1   /**
2   *******************************************************************************
3   * Copyright (C) 2005 - 2007, International Business Machines Corporation and  *
4   * others. All Rights Reserved.                                                *
5   *******************************************************************************
6   */
7   package org.apache.tika.parser.txt;
8   
9   /**
10   * Charset recognizer for UTF-8
11   *
12   * @internal
13   */
14  class CharsetRecog_UTF8 extends CharsetRecognizer {
15  
16      String getName() {
17          return "UTF-8";
18      }
19  
20      /* (non-Javadoc)
21       * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
22       */
23      int match(CharsetDetector det) {
24          boolean     hasBOM = false;
25          int         numValid = 0;
26          int         numInvalid = 0;
27          byte        input[] = det.fRawInput;
28          int         i;
29          int         trailBytes = 0;
30          int         confidence;
31          
32          if (det.fRawLength >= 3 && 
33                  (input[0] & 0xFF) == 0xef && (input[1] & 0xFF) == 0xbb & (input[2] & 0xFF) == 0xbf) {
34              hasBOM = true;
35          }
36          
37          // Scan for multi-byte sequences
38          for (i=0; i<det.fRawLength; i++) {
39              int b = input[i];
40              if ((b & 0x80) == 0) {
41                  continue;   // ASCII
42              }
43              
44              // Hi bit on char found.  Figure out how long the sequence should be
45              if ((b & 0x0e0) == 0x0c0) {
46                  trailBytes = 1;                
47              } else if ((b & 0x0f0) == 0x0e0) {
48                  trailBytes = 2;
49              } else if ((b & 0x0f8) == 0xf0) {
50                  trailBytes = 3;
51              } else {
52                  numInvalid++;
53                  if (numInvalid > 5) {
54                      break;
55                  }
56                  trailBytes = 0;
57              }
58                  
59              // Verify that we've got the right number of trail bytes in the sequence
60              for (;;) {
61                  i++;
62                  if (i>=det.fRawLength) {
63                      break;
64                  }
65                  b = input[i];
66                  if ((b & 0xc0) != 0x080) {
67                      numInvalid++;
68                      break;
69                  }
70                  if (--trailBytes == 0) {
71                      numValid++;
72                      break;
73                  }
74              }
75                          
76          }
77          
78          // Cook up some sort of confidence score, based on presense of a BOM
79          //    and the existence of valid and/or invalid multi-byte sequences.
80          confidence = 0;
81          if (hasBOM && numInvalid==0) {
82              confidence = 100;
83          } else if (hasBOM && numValid > numInvalid*10) {
84              confidence = 80;
85          } else if (numValid > 3 && numInvalid == 0) {
86              confidence = 100;            
87          } else if (numValid > 0 && numInvalid == 0) {
88              confidence = 80;
89          } else if (numValid == 0 && numInvalid == 0) {
90              // Plain ASCII.  
91              confidence = 10;            
92          } else if (numValid > numInvalid*10) {
93              // Probably corruput utf-8 data.  Valid sequences aren't likely by chance.
94              confidence = 25;
95          }
96          return confidence;
97      }
98  
99  }