1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.language;
18
19 import java.util.HashMap;
20 import java.util.HashSet;
21 import java.util.Map;
22 import java.util.Set;
23
24
25
26
27
28
29 public class LanguageProfile {
30
31 public static final int DEFAULT_NGRAM_LENGTH = 3;
32
33 private final int length;
34
35
36
37
38 private final Map<String, Counter> ngrams =
39 new HashMap<String, Counter>();
40
41
42
43
44
45 private long count = 0;
46
47 private class Counter {
48 private long count = 0;
49 public String toString() {
50 return Long.toString(count);
51 }
52 }
53
54 public LanguageProfile(int length) {
55 this.length = length;
56 }
57
58 public LanguageProfile() {
59 this(DEFAULT_NGRAM_LENGTH);
60 }
61
62 public LanguageProfile(String content, int length) {
63 this(length);
64
65 ProfilingWriter writer = new ProfilingWriter(this);
66 char[] ch = content.toCharArray();
67 writer.write(ch, 0, ch.length);
68 }
69
70 public LanguageProfile(String content) {
71 this(content, DEFAULT_NGRAM_LENGTH);
72 }
73
74 public long getCount() {
75 return count;
76 }
77
78 public long getCount(String ngram) {
79 Counter counter = ngrams.get(ngram);
80 if (counter != null) {
81 return counter.count;
82 } else {
83 return 0;
84 }
85 }
86
87
88
89
90
91
92 public void add(String ngram) {
93 add(ngram, 1);
94 }
95
96
97
98
99
100
101
102 public void add(String ngram, long count) {
103 if (length != ngram.length()) {
104 throw new IllegalArgumentException(
105 "Unable to add an ngram of incorrect length: "
106 + ngram.length() + " != " + length);
107 }
108
109 Counter counter = ngrams.get(ngram);
110 if (counter == null) {
111 counter = new Counter();
112 ngrams.put(ngram, counter);
113 }
114 counter.count += count;
115 this.count += count;
116 }
117
118
119
120
121
122
123
124
125 public double distance(LanguageProfile that) {
126 if (length != that.length) {
127 throw new IllegalArgumentException(
128 "Unable to calculage distance of language profiles"
129 + " with different ngram lengths: "
130 + that.length + " != " + length);
131 }
132
133 double sumOfSquares = 0.0;
134 double thisCount = Math.max(this.count, 1.0);
135 double thatCount = Math.max(that.count, 1.0);
136
137 Set<String> ngrams = new HashSet<String>();
138 ngrams.addAll(this.ngrams.keySet());
139 ngrams.addAll(that.ngrams.keySet());
140 for (String ngram : ngrams) {
141 double thisFrequency = this.getCount(ngram) / thisCount;
142 double thatFrequency = that.getCount(ngram) / thatCount;
143 double difference = thisFrequency - thatFrequency;
144 sumOfSquares += difference * difference;
145 }
146
147 return Math.sqrt(sumOfSquares);
148 }
149
150 @Override
151 public String toString() {
152 return ngrams.toString();
153 }
154
155 }