1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.language;
18
19 import java.io.BufferedReader;
20 import java.io.InputStream;
21 import java.io.InputStreamReader;
22 import java.util.HashMap;
23 import java.util.Map;
24
25
26
27
28
29
30
31
32
33
34
35
36 public class LanguageIdentifier {
37
38
39
40
41 private static final Map<String, LanguageProfile> PROFILES =
42 new HashMap<String, LanguageProfile>();
43
44 private static void addProfile(String language) {
45 try {
46 LanguageProfile profile = new LanguageProfile();
47
48 InputStream stream =
49 LanguageIdentifier.class.getResourceAsStream(language + ".ngp");
50 try {
51 BufferedReader reader =
52 new BufferedReader(new InputStreamReader(stream, "UTF-8"));
53 String line = reader.readLine();
54 while (line != null) {
55 if (line.length() > 0 && !line.startsWith("#")) {
56 int space = line.indexOf(' ');
57 profile.add(
58 line.substring(0, space),
59 Long.parseLong(line.substring(space + 1)));
60 }
61 line = reader.readLine();
62 }
63 } finally {
64 stream.close();
65 }
66
67 PROFILES.put(language, profile);
68 } catch (Throwable t) {
69
70 }
71 }
72
73 static {
74 addProfile("da");
75 addProfile("de");
76 addProfile("ee");
77 addProfile("el");
78 addProfile("en");
79 addProfile("es");
80 addProfile("fi");
81 addProfile("fr");
82 addProfile("hu");
83 addProfile("is");
84 addProfile("it");
85 addProfile("nl");
86 addProfile("no");
87 addProfile("pl");
88 addProfile("pt");
89 addProfile("ru");
90 addProfile("sv");
91 addProfile("th");
92 }
93
94 private final String language;
95
96 private final double distance;
97
98 public LanguageIdentifier(LanguageProfile profile) {
99 String minLanguage = "unknown";
100 double minDistance = 1.0;
101 for (Map.Entry<String, LanguageProfile> entry : PROFILES.entrySet()) {
102 double distance = profile.distance(entry.getValue());
103 if (distance < minDistance) {
104 minDistance = distance;
105 minLanguage = entry.getKey();
106 }
107 }
108
109 this.language = minLanguage;
110 this.distance = minDistance;
111 }
112
113 public LanguageIdentifier(String content) {
114 this(new LanguageProfile(content));
115 }
116
117 public String getLanguage() {
118 return language;
119 }
120
121 public boolean isReasonablyCertain() {
122 return distance < 0.022;
123 }
124
125 @Override
126 public String toString() {
127 return language + " (" + distance + ")";
128 }
129
130 }