View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.language;
18  
19  import java.io.BufferedReader;
20  import java.io.InputStream;
21  import java.io.InputStreamReader;
22  import java.util.HashMap;
23  import java.util.Map;
24  
25  /**
26   * Identifier of the language that best matches a given content profile.
27   * The content profile is compared to generic language profiles based on
28   * material from various sources.
29   *
30   * @since Apache Tika 0.5
31   * @see <a href="http://www.iccs.inf.ed.ac.uk/~pkoehn/publications/europarl/">
32   *      Europarl: A Parallel Corpus for Statistical Machine Translation</a>
33   * @see <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">
34   *      ISO 639 Language Codes</a>
35   */
36  public class LanguageIdentifier {
37  
38      /**
39       * The available language profiles.
40       */
41      private static final Map<String, LanguageProfile> PROFILES =
42          new HashMap<String, LanguageProfile>();
43  
44      private static void addProfile(String language) {
45          try {
46              LanguageProfile profile = new LanguageProfile();
47  
48              InputStream stream =
49                  LanguageIdentifier.class.getResourceAsStream(language + ".ngp");
50              try {
51                  BufferedReader reader =
52                      new BufferedReader(new InputStreamReader(stream, "UTF-8"));
53                  String line = reader.readLine();
54                  while (line != null) {
55                      if (line.length() > 0 && !line.startsWith("#")) {
56                          int space = line.indexOf(' ');
57                          profile.add(
58                                  line.substring(0, space),
59                                  Long.parseLong(line.substring(space + 1)));
60                      }
61                      line = reader.readLine();
62                  }
63              } finally {
64                  stream.close();
65              }
66  
67              PROFILES.put(language, profile);
68          } catch (Throwable t) {
69              // Failed to load this language profile. Log the problem?
70          }
71      }
72  
73      static {
74          addProfile("da"); // Danish
75          addProfile("de"); // German
76          addProfile("ee");
77          addProfile("el"); // Greek
78          addProfile("en"); // English
79          addProfile("es"); // Spanish
80          addProfile("fi"); // Finnish
81          addProfile("fr"); // French
82          addProfile("hu"); // Hungarian
83          addProfile("is"); // Icelandic
84          addProfile("it"); // Italian
85          addProfile("nl"); // Dutch
86          addProfile("no"); // Norwegian
87          addProfile("pl"); // Polish
88          addProfile("pt"); // Portuguese
89          addProfile("ru"); // Russian
90          addProfile("sv"); // Swedish
91          addProfile("th"); // Thai
92      }
93  
94      private final String language;
95  
96      private final double distance;
97  
98      public LanguageIdentifier(LanguageProfile profile) {
99          String minLanguage = "unknown";
100         double minDistance = 1.0;
101         for (Map.Entry<String, LanguageProfile> entry : PROFILES.entrySet()) {
102             double distance = profile.distance(entry.getValue());
103             if (distance < minDistance) {
104                 minDistance = distance;
105                 minLanguage = entry.getKey();
106             }
107         }
108 
109         this.language = minLanguage;
110         this.distance = minDistance;
111     }
112 
113     public LanguageIdentifier(String content) {
114         this(new LanguageProfile(content));
115     }
116 
117     public String getLanguage() {
118         return language;
119     }
120 
121     public boolean isReasonablyCertain() {
122         return distance < 0.022;
123     }
124 
125     @Override
126     public String toString() {
127         return language + " (" + distance + ")";
128     }
129 
130 }