View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.language;
18  
19  import java.io.IOException;
20  import java.io.Writer;
21  
22  /**
23   * Writer that builds a language profile based on all the written content.
24   *
25   * @since Apache Tika 0.5
26   */
27  public class ProfilingWriter extends Writer {
28  
29      private final LanguageProfile profile;
30  
31      private char[] buffer = new char[] { 0, 0, '_' };
32  
33      private int n = 1;
34  
35      public ProfilingWriter(LanguageProfile profile) {
36          this.profile = profile;
37      }
38  
39      public ProfilingWriter() {
40          this(new LanguageProfile());
41      }
42  
43      /**
44       * Returns the language profile being built by this writer. Note that
45       * the returned profile gets updated whenever new characters are written.
46       * Use the {@link #getLanguage()} method to get the language that best
47       * matches the current state of the profile.
48       *
49       * @return language profile
50       */
51      public LanguageProfile getProfile() {
52          return profile;
53      }
54  
55      /**
56       * Returns the language that best matches the current state of the
57       * language profile.
58       *
59       * @return language that best matches the current profile
60       */
61      public LanguageIdentifier getLanguage() {
62          return new LanguageIdentifier(profile);
63      }
64  
65      @Override
66      public void write(char[] cbuf, int off, int len) {
67          for (int i = 0; i < len; i++) {
68              char c = Character.toLowerCase(cbuf[off + i]);
69              if (Character.isLetter(c)) {
70                  addLetter(c);
71              } else {
72                  addSeparator();
73              }
74          }
75      }
76  
77      private void addLetter(char c) {
78          System.arraycopy(buffer, 1, buffer, 0, buffer.length - 1);
79          buffer[buffer.length - 1] = c;
80          n++;
81          if (n >= buffer.length) {
82              profile.add(new String(buffer));
83          }
84      }
85  
86      private void addSeparator() {
87          addLetter('_');
88          n = 1;
89      }
90  
91      @Override
92      public void close() throws IOException {
93          addSeparator();
94      }
95  
96      /**
97       * Ignored.
98       */
99      @Override
100     public void flush() {
101     }
102 
103 }