1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.commons.math.random;
18
19 import java.io.Serializable;
20 import java.io.BufferedReader;
21 import java.io.FileReader;
22 import java.io.File;
23 import java.io.IOException;
24 import java.io.InputStreamReader;
25 import java.net.URL;
26 import java.util.ArrayList;
27 import java.util.List;
28
29 import org.apache.commons.math.stat.descriptive.SummaryStatistics;
30 import org.apache.commons.math.stat.descriptive.StatisticalSummary;
31
32 /**
33 * Implements <code>EmpiricalDistribution</code> interface. This implementation
34 * uses what amounts to the
35 * <a href="http://nedwww.ipac.caltech.edu/level5/March02/Silverman/Silver2_6.html">
36 * Variable Kernel Method</a> with Gaussian smoothing:<p>
37 * <strong>Digesting the input file</strong>
38 * <ol><li>Pass the file once to compute min and max.</li>
39 * <li>Divide the range from min-max into <code>binCount</code> "bins."</li>
40 * <li>Pass the data file again, computing bin counts and univariate
41 * statistics (mean, std dev.) for each of the bins </li>
42 * <li>Divide the interval (0,1) into subintervals associated with the bins,
43 * with the length of a bin's subinterval proportional to its count.</li></ol>
44 * <strong>Generating random values from the distribution</strong><ol>
45 * <li>Generate a uniformly distributed value in (0,1) </li>
46 * <li>Select the subinterval to which the value belongs.
47 * <li>Generate a random Gaussian value with mean = mean of the associated
48 * bin and std dev = std dev of associated bin.</li></ol></p><p>
49 *<strong>USAGE NOTES:</strong><ul>
50 *<li>The <code>binCount</code> is set by default to 1000. A good rule of thumb
51 * is to set the bin count to approximately the length of the input file divided
52 * by 10. </li>
53 *<li>The input file <i>must</i> be a plain text file containing one valid numeric
54 * entry per line.</li>
55 * </ul></p>
56 *
57 * @version $Revision: 348894 $ $Date: 2005-11-24 23:34:47 -0700 (Thu, 24 Nov 2005) $
58 */
59 public class EmpiricalDistributionImpl implements Serializable, EmpiricalDistribution {
60
61 /** Serializable version identifier */
62 private static final long serialVersionUID = -6773236347582113490L;
63
64 /** List of SummaryStatistics objects characterizing the bins */
65 private ArrayList binStats = null;
66
67 /** Sample statistics */
68 private SummaryStatistics sampleStats = null;
69
70 /** number of bins */
71 private int binCount = 1000;
72
73 /** is the distribution loaded? */
74 private boolean loaded = false;
75
76 /** upper bounds of subintervals in (0,1) "belonging" to the bins */
77 private double[] upperBounds = null;
78
79 /** RandomData instance to use in repeated calls to getNext() */
80 private RandomData randomData = new RandomDataImpl();
81
82 /**
83 * Creates a new EmpiricalDistribution with the default bin count.
84 */
85 public EmpiricalDistributionImpl() {
86 binStats = new ArrayList();
87 }
88
89 /**
90 * Creates a new EmpiricalDistribution with the specified bin count.
91 *
92 * @param binCount number of bins
93 */
94 public EmpiricalDistributionImpl(int binCount) {
95 this.binCount = binCount;
96 binStats = new ArrayList();
97 }
98
99 /**
100 * Computes the empirical distribution from the provided
101 * array of numbers.
102 *
103 * @param in the input data array
104 */
105 public void load(double[] in) {
106 DataAdapter da = new ArrayDataAdapter(in);
107 try {
108 da.computeStats();
109 fillBinStats(in);
110 } catch (Exception e) {
111 throw new RuntimeException(e.getMessage());
112 }
113 loaded = true;
114
115 }
116
117 /**
118 * Computes the empirical distribution using data read from a URL.
119 * @param url url of the input file
120 *
121 * @throws IOException if an IO error occurs
122 */
123 public void load(URL url) throws IOException {
124 BufferedReader in =
125 new BufferedReader(new InputStreamReader(url.openStream()));
126 try {
127 DataAdapter da = new StreamDataAdapter(in);
128 try {
129 da.computeStats();
130 } catch (Exception e) {
131 throw new IOException(e.getMessage());
132 }
133 in = new BufferedReader(new InputStreamReader(url.openStream()));
134 fillBinStats(in);
135 loaded = true;
136 } finally {
137 if (in != null) {
138 try {
139 in.close();
140 } catch (Exception ex) {
141
142 }
143 }
144 }
145 }
146
147 /**
148 * Computes the empirical distribution from the input file.
149 *
150 * @param file the input file
151 * @throws IOException if an IO error occurs
152 */
153 public void load(File file) throws IOException {
154 BufferedReader in = new BufferedReader(new FileReader(file));
155 try {
156 DataAdapter da = new StreamDataAdapter(in);
157 try {
158 da.computeStats();
159 } catch (Exception e) {
160 throw new IOException(e.getMessage());
161 }
162 in = new BufferedReader(new FileReader(file));
163 fillBinStats(in);
164 loaded = true;
165 } finally {
166 if (in != null) {
167 try {
168 in.close();
169 } catch (Exception ex) {
170
171 }
172 }
173 }
174 }
175
176 /**
177 * Provides methods for computing <code>sampleStats</code> and
178 * <code>beanStats</code> abstracting the source of data.
179 */
180 private abstract class DataAdapter{
181 /**
182 * Compute bin stats.
183 *
184 * @param min minimum value
185 * @param delta grid size
186 * @throws Exception if an error occurs computing bin stats
187 */
188 public abstract void computeBinStats(double min, double delta)
189 throws Exception;
190 /**
191 * Compute sample statistics.
192 *
193 * @throws Exception if an error occurs computing sample stats
194 */
195 public abstract void computeStats() throws Exception;
196 }
197 /**
198 * Factory of <code>DataAdapter</code> objects. For every supported source
199 * of data (array of doubles, file, etc.) an instance of the proper object
200 * is returned.
201 */
202 private class DataAdapterFactory{
203 /**
204 * Creates a DataAdapter from a data object
205 *
206 * @param in object providing access to the data
207 * @return DataAdapter instance
208 */
209 public DataAdapter getAdapter(Object in) {
210 if (in instanceof BufferedReader) {
211 BufferedReader inputStream = (BufferedReader) in;
212 return new StreamDataAdapter(inputStream);
213 } else if (in instanceof double[]) {
214 double[] inputArray = (double[]) in;
215 return new ArrayDataAdapter(inputArray);
216 } else {
217 throw new IllegalArgumentException(
218 "Input data comes from the" + " unsupported source");
219 }
220 }
221 }
222 /**
223 * <code>DataAdapter</code> for data provided through some input stream
224 */
225 private class StreamDataAdapter extends DataAdapter{
226
227 /** Input stream providng access to the data */
228 private BufferedReader inputStream;
229
230 /**
231 * Create a StreamDataAdapter from a BufferedReader
232 *
233 * @param in BufferedReader input stream
234 */
235 public StreamDataAdapter(BufferedReader in){
236 super();
237 inputStream = in;
238 }
239 /**
240 * Computes binStats
241 *
242 * @param min minimum value
243 * @param delta grid size
244 * @throws IOException if an IO error occurs
245 */
246 public void computeBinStats(double min, double delta)
247 throws IOException {
248 String str = null;
249 double val = 0.0d;
250 while ((str = inputStream.readLine()) != null) {
251 val = Double.parseDouble(str);
252 SummaryStatistics stats =
253 (SummaryStatistics) binStats.get(findBin(min, val, delta));
254 stats.addValue(val);
255 }
256
257 inputStream.close();
258 inputStream = null;
259 }
260 /**
261 * Computes sampleStats
262 *
263 * @throws IOException if an IOError occurs
264 */
265 public void computeStats() throws IOException {
266 String str = null;
267 double val = 0.0;
268 sampleStats = SummaryStatistics.newInstance();
269 while ((str = inputStream.readLine()) != null) {
270 val = new Double(str).doubleValue();
271 sampleStats.addValue(val);
272 }
273 inputStream.close();
274 inputStream = null;
275 }
276 }
277
278 /**
279 * <code>DataAdapter</code> for data provided as array of doubles.
280 */
281 private class ArrayDataAdapter extends DataAdapter{
282
283 /** Array of input data values */
284 private double[] inputArray;
285
286 /**
287 * Construct an ArrayDataAdapter from a double[] array
288 *
289 * @param in double[] array holding the data
290 */
291 public ArrayDataAdapter(double[] in){
292 super();
293 inputArray = in;
294 }
295 /**
296 * Computes sampleStats
297 *
298 * @throws IOException if an IO error occurs
299 */
300 public void computeStats() throws IOException {
301 sampleStats = SummaryStatistics.newInstance();
302 for (int i = 0; i < inputArray.length; i++) {
303 sampleStats.addValue(inputArray[i]);
304 }
305 }
306 /**
307 * Computes binStats
308 *
309 * @param min minimum value
310 * @param delta grid size
311 * @throws IOException if an IO error occurs
312 */
313 public void computeBinStats(double min, double delta)
314 throws IOException {
315 for (int i = 0; i < inputArray.length; i++) {
316 SummaryStatistics stats =
317 (SummaryStatistics) binStats.get(
318 findBin(min, inputArray[i], delta));
319 stats.addValue(inputArray[i]);
320 }
321 }
322 }
323
324 /**
325 * Fills binStats array (second pass through data file).
326 *
327 * @param in object providing access to the data
328 * @throws IOException if an IO error occurs
329 */
330 private void fillBinStats(Object in) throws IOException {
331
332 double min = sampleStats.getMin();
333 double max = sampleStats.getMax();
334 double delta = (max - min)/(new Double(binCount)).doubleValue();
335 double[] binUpperBounds = new double[binCount];
336 binUpperBounds[0] = min + delta;
337 for (int i = 1; i< binCount - 1; i++) {
338 binUpperBounds[i] = binUpperBounds[i-1] + delta;
339 }
340 binUpperBounds[binCount -1] = max;
341
342
343 if (!binStats.isEmpty()) {
344 binStats.clear();
345 }
346 for (int i = 0; i < binCount; i++) {
347 SummaryStatistics stats = SummaryStatistics.newInstance();
348 binStats.add(i,stats);
349 }
350
351
352 DataAdapterFactory aFactory = new DataAdapterFactory();
353 DataAdapter da = aFactory.getAdapter(in);
354 try {
355 da.computeBinStats(min, delta);
356 } catch (Exception e) {
357 if(e instanceof RuntimeException){
358 throw new RuntimeException(e.getMessage());
359 }else{
360 throw new IOException(e.getMessage());
361 }
362 }
363
364
365 upperBounds = new double[binCount];
366 upperBounds[0] =
367 ((double)((SummaryStatistics)binStats.get(0)).getN())/
368 (double)sampleStats.getN();
369 for (int i = 1; i < binCount-1; i++) {
370 upperBounds[i] = upperBounds[i-1] +
371 ((double)((SummaryStatistics)binStats.get(i)).getN())/
372 (double)sampleStats.getN();
373 }
374 upperBounds[binCount-1] = 1.0d;
375 }
376
377 /**
378 * Returns the index of the bin to which the given value belongs
379 *
380 * @param min the minimum value
381 * @param value the value whose bin we are trying to find
382 * @param delta the grid size
383 * @return the index of the bin containing the value
384 */
385 private int findBin(double min, double value, double delta) {
386 return Math.min(
387 Math.max((int) Math.ceil((value- min) / delta) - 1, 0),
388 binCount - 1);
389 }
390
391 /**
392 * Generates a random value from this distribution.
393 *
394 * @return the random value.
395 * @throws IllegalStateException if the distribution has not been loaded
396 */
397 public double getNextValue() throws IllegalStateException {
398
399 if (!loaded) {
400 throw new IllegalStateException("distribution not loaded");
401 }
402
403
404 double x = Math.random();
405
406
407 for (int i = 0; i < binCount; i++) {
408 if (x <= upperBounds[i]) {
409 SummaryStatistics stats = (SummaryStatistics)binStats.get(i);
410 if (stats.getN() > 0) {
411 if (stats.getStandardDeviation() > 0) {
412 return randomData.nextGaussian
413 (stats.getMean(),stats.getStandardDeviation());
414 } else {
415 return stats.getMean();
416 }
417 }
418 }
419 }
420 throw new RuntimeException("No bin selected");
421 }
422
423 /**
424 * Returns a {@link StatisticalSummary} describing this distribution.
425 * <strong>Preconditions:</strong><ul>
426 * <li>the distribution must be loaded before invoking this method</li></ul>
427 *
428 * @return the sample statistics
429 * @throws IllegalStateException if the distribution has not been loaded
430 */
431 public StatisticalSummary getSampleStats() {
432 return sampleStats;
433 }
434
435 /**
436 * Returns the number of bins.
437 *
438 * @return the number of bins.
439 */
440 public int getBinCount() {
441 return binCount;
442 }
443
444 /**
445 * Returns an ArrayList of {@link SummaryStatistics} instances containing
446 * statistics describing the values in each of the bins. The ArrayList is
447 * indexed on the bin number.
448 *
449 * @return List of bin statistics.
450 */
451 public List getBinStats() {
452 return binStats;
453 }
454
455 /**
456 * Returns (a fresh copy of) the array of upper bounds for the bins.
457 Bins are: <br/>
458 * [min,upperBounds[0]],(upperBounds[0],upperBounds[1]],...,
459 * (upperBounds[binCount-1],max]
460 *
461 * @return array of bin upper bounds
462 */
463 public double[] getUpperBounds() {
464 int len = upperBounds.length;
465 double[] out = new double[len];
466 System.arraycopy(upperBounds, 0, out, 0, len);
467 return out;
468 }
469
470 /**
471 * Property indicating whether or not the distribution has been loaded.
472 *
473 * @return true if the distribution has been loaded
474 */
475 public boolean isLoaded() {
476 return loaded;
477 }
478 }