View Javadoc

1   /**
2   *******************************************************************************
3   * Copyright (C) 2005, International Business Machines Corporation and         *
4   * others. All Rights Reserved.                                                *
5   *******************************************************************************
6   */
7   package org.apache.tika.parser.txt;
8   
9   /**
10   * Abstract class for recognizing a single charset.
11   * Part of the implementation of ICU's CharsetDetector.
12   * 
13   * Each specific charset that can be recognized will have an instance
14   * of some subclass of this class.  All interaction between the overall
15   * CharsetDetector and the stuff specific to an individual charset happens
16   * via the interface provided here.
17   * 
18   * Instances of CharsetRecognizer DO NOT have or maintain 
19   * state pertaining to a specific match or detect operation.
20   * They WILL be shared by multiple instances of CharsetDetector.
21   * They encapsulate constant charset-specific information.
22   * 
23   * @internal
24   */
25  abstract class CharsetRecognizer {
26      /**
27       * Get the IANA name of this charset.
28       * @return the charset name.
29       */
30      abstract String      getName();
31      
32      /**
33       * Get the ISO language code for this charset. This base implementation always returns <code>null</code>.
34       * @return the language code, or <code>null</code> if the language cannot be determined.
35       */
36      public   String      getLanguage()
37      {
38          return null;
39      }
40      
41      /**
42       * Test the match of this charset with the input text data
43       *      which is obtained via the CharsetDetector object.
44       * 
45       * @param det  The CharsetDetector, which contains the input text
46       *             to be checked for being in this charset.
47       * @return     Two values packed into a single int:
48       *             <br>
49       *             bits 0-7:  the match confidence, ranging from 0-100
50       *             <br>
51       *             bits 8-15: The match reason, an enum-like value.
52       */
53      abstract int         match(CharsetDetector det);
54  
55  }