View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.language;
18  
19  import java.util.HashMap;
20  import java.util.HashSet;
21  import java.util.Map;
22  import java.util.Set;
23  
24  /**
25   * Language profile based on ngram counts.
26   *
27   * @since Apache Tika 0.5
28   */
29  public class LanguageProfile {
30  
31      public static final int DEFAULT_NGRAM_LENGTH = 3;
32  
33      private final int length;
34  
35      /**
36       * The ngrams that make up this profile.
37       */
38      private final Map<String, Counter> ngrams =
39          new HashMap<String, Counter>();
40  
41      /**
42       * The sum of all ngram counts in this profile.
43       * Used to calculate relative ngram frequency.
44       */
45      private long count = 0;
46  
47      private class Counter {
48          private long count = 0;
49          public String toString() {
50              return Long.toString(count);
51          }
52      }
53  
54      public LanguageProfile(int length) {
55          this.length = length;
56      }
57  
58      public LanguageProfile() {
59          this(DEFAULT_NGRAM_LENGTH);
60      }
61  
62      public LanguageProfile(String content, int length) {
63          this(length);
64  
65          ProfilingWriter writer = new ProfilingWriter(this);
66          char[] ch = content.toCharArray();
67          writer.write(ch, 0, ch.length);
68      }
69  
70      public LanguageProfile(String content) {
71          this(content, DEFAULT_NGRAM_LENGTH);
72      }
73  
74      public long getCount() {
75          return count;
76      }
77  
78      public long getCount(String ngram) {
79          Counter counter = ngrams.get(ngram);
80          if (counter != null) {
81              return counter.count;
82          } else {
83              return 0;
84          }
85      }
86  
87      /**
88       * Adds a single occurrence of the given ngram to this profile.
89       *
90       * @param ngram the ngram
91       */
92      public void add(String ngram) {
93          add(ngram, 1);
94      }
95  
96      /**
97       * Adds multiple occurrences of the given ngram to this profile.
98       *
99       * @param ngram the ngram
100      * @param count number of occurrences to add
101      */
102     public void add(String ngram, long count) {
103         if (length != ngram.length()) {
104             throw new IllegalArgumentException(
105                     "Unable to add an ngram of incorrect length: "
106                     + ngram.length() + " != " + length);
107         }
108 
109         Counter counter = ngrams.get(ngram);
110         if (counter == null) {
111             counter = new Counter();
112             ngrams.put(ngram, counter);
113         }
114         counter.count += count;
115         this.count += count;
116     }
117 
118     /**
119      * Calculates the geometric distance between this and the given
120      * other language profile.
121      *
122      * @param that the other language profile
123      * @return distance between the profiles
124      */
125     public double distance(LanguageProfile that) {
126         if (length != that.length) {
127             throw new IllegalArgumentException(
128                     "Unable to calculage distance of language profiles"
129                     + " with different ngram lengths: "
130                     + that.length + " != " + length);
131         }
132 
133         double sumOfSquares = 0.0;
134         double thisCount = Math.max(this.count, 1.0);
135         double thatCount = Math.max(that.count, 1.0);
136 
137         Set<String> ngrams = new HashSet<String>();
138         ngrams.addAll(this.ngrams.keySet());
139         ngrams.addAll(that.ngrams.keySet());
140         for (String ngram : ngrams) {
141             double thisFrequency = this.getCount(ngram) / thisCount;
142             double thatFrequency = that.getCount(ngram) / thatCount;
143             double difference = thisFrequency - thatFrequency;
144             sumOfSquares += difference * difference;
145         }
146 
147         return Math.sqrt(sumOfSquares);
148     }
149 
150     @Override
151     public String toString() {
152         return ngrams.toString();
153     }
154 
155 }