001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018 package org.apache.commons.math.random; 019 020 import java.io.BufferedReader; 021 import java.io.File; 022 import java.io.FileReader; 023 import java.io.IOException; 024 import java.io.InputStreamReader; 025 import java.io.Serializable; 026 import java.net.URL; 027 import java.util.ArrayList; 028 import java.util.List; 029 030 import org.apache.commons.math.MathRuntimeException; 031 import org.apache.commons.math.stat.descriptive.StatisticalSummary; 032 import org.apache.commons.math.stat.descriptive.SummaryStatistics; 033 034 /** 035 * Implements <code>EmpiricalDistribution</code> interface. This implementation 036 * uses what amounts to the 037 * <a href="http://nedwww.ipac.caltech.edu/level5/March02/Silverman/Silver2_6.html"> 038 * Variable Kernel Method</a> with Gaussian smoothing:<p> 039 * <strong>Digesting the input file</strong> 040 * <ol><li>Pass the file once to compute min and max.</li> 041 * <li>Divide the range from min-max into <code>binCount</code> "bins."</li> 042 * <li>Pass the data file again, computing bin counts and univariate 043 * statistics (mean, std dev.) for each of the bins </li> 044 * <li>Divide the interval (0,1) into subintervals associated with the bins, 045 * with the length of a bin's subinterval proportional to its count.</li></ol> 046 * <strong>Generating random values from the distribution</strong><ol> 047 * <li>Generate a uniformly distributed value in (0,1) </li> 048 * <li>Select the subinterval to which the value belongs. 049 * <li>Generate a random Gaussian value with mean = mean of the associated 050 * bin and std dev = std dev of associated bin.</li></ol></p><p> 051 *<strong>USAGE NOTES:</strong><ul> 052 *<li>The <code>binCount</code> is set by default to 1000. A good rule of thumb 053 * is to set the bin count to approximately the length of the input file divided 054 * by 10. </li> 055 *<li>The input file <i>must</i> be a plain text file containing one valid numeric 056 * entry per line.</li> 057 * </ul></p> 058 * 059 * @version $Revision: 772119 $ $Date: 2009-05-06 05:43:28 -0400 (Wed, 06 May 2009) $ 060 */ 061 public class EmpiricalDistributionImpl implements Serializable, EmpiricalDistribution { 062 063 /** Serializable version identifier */ 064 private static final long serialVersionUID = 5729073523949762654L; 065 066 /** List of SummaryStatistics objects characterizing the bins */ 067 private List<SummaryStatistics> binStats = null; 068 069 /** Sample statistics */ 070 private SummaryStatistics sampleStats = null; 071 072 /** number of bins */ 073 private int binCount = 1000; 074 075 /** is the distribution loaded? */ 076 private boolean loaded = false; 077 078 /** upper bounds of subintervals in (0,1) "belonging" to the bins */ 079 private double[] upperBounds = null; 080 081 /** RandomData instance to use in repeated calls to getNext() */ 082 private RandomData randomData = new RandomDataImpl(); 083 084 /** 085 * Creates a new EmpiricalDistribution with the default bin count. 086 */ 087 public EmpiricalDistributionImpl() { 088 binStats = new ArrayList<SummaryStatistics>(); 089 } 090 091 /** 092 * Creates a new EmpiricalDistribution with the specified bin count. 093 * 094 * @param binCount number of bins 095 */ 096 public EmpiricalDistributionImpl(int binCount) { 097 this.binCount = binCount; 098 binStats = new ArrayList<SummaryStatistics>(); 099 } 100 101 /** 102 * Computes the empirical distribution from the provided 103 * array of numbers. 104 * 105 * @param in the input data array 106 */ 107 public void load(double[] in) { 108 DataAdapter da = new ArrayDataAdapter(in); 109 try { 110 da.computeStats(); 111 fillBinStats(in); 112 } catch (Exception e) { 113 throw new MathRuntimeException(e); 114 } 115 loaded = true; 116 117 } 118 119 /** 120 * Computes the empirical distribution using data read from a URL. 121 * @param url url of the input file 122 * 123 * @throws IOException if an IO error occurs 124 */ 125 public void load(URL url) throws IOException { 126 BufferedReader in = 127 new BufferedReader(new InputStreamReader(url.openStream())); 128 try { 129 DataAdapter da = new StreamDataAdapter(in); 130 try { 131 da.computeStats(); 132 } catch (IOException ioe) { 133 // don't wrap exceptions which are already IOException 134 throw ioe; 135 } catch (RuntimeException rte) { 136 // don't wrap RuntimeExceptions 137 throw rte; 138 } catch (Exception e) { 139 throw MathRuntimeException.createIOException(e); 140 } 141 if (sampleStats.getN() == 0) { 142 throw MathRuntimeException.createEOFException("URL {0} contains no data", 143 url); 144 } 145 in = new BufferedReader(new InputStreamReader(url.openStream())); 146 fillBinStats(in); 147 loaded = true; 148 } finally { 149 try { 150 in.close(); 151 } catch (IOException ex) { 152 // ignore 153 } 154 } 155 } 156 157 /** 158 * Computes the empirical distribution from the input file. 159 * 160 * @param file the input file 161 * @throws IOException if an IO error occurs 162 */ 163 public void load(File file) throws IOException { 164 BufferedReader in = new BufferedReader(new FileReader(file)); 165 try { 166 DataAdapter da = new StreamDataAdapter(in); 167 try { 168 da.computeStats(); 169 } catch (IOException ioe) { 170 // don't wrap exceptions which are already IOException 171 throw ioe; 172 } catch (RuntimeException rte) { 173 // don't wrap RuntimeExceptions 174 throw rte; 175 } catch (Exception e) { 176 throw MathRuntimeException.createIOException(e); 177 } 178 in = new BufferedReader(new FileReader(file)); 179 fillBinStats(in); 180 loaded = true; 181 } finally { 182 try { 183 in.close(); 184 } catch (IOException ex) { 185 // ignore 186 } 187 } 188 } 189 190 /** 191 * Provides methods for computing <code>sampleStats</code> and 192 * <code>beanStats</code> abstracting the source of data. 193 */ 194 private abstract class DataAdapter{ 195 /** 196 * Compute bin stats. 197 * 198 * @param min minimum value 199 * @param delta grid size 200 * @throws Exception if an error occurs computing bin stats 201 */ 202 public abstract void computeBinStats(double min, double delta) 203 throws Exception; 204 /** 205 * Compute sample statistics. 206 * 207 * @throws Exception if an error occurs computing sample stats 208 */ 209 public abstract void computeStats() throws Exception; 210 } 211 /** 212 * Factory of <code>DataAdapter</code> objects. For every supported source 213 * of data (array of doubles, file, etc.) an instance of the proper object 214 * is returned. 215 */ 216 private class DataAdapterFactory{ 217 /** 218 * Creates a DataAdapter from a data object 219 * 220 * @param in object providing access to the data 221 * @return DataAdapter instance 222 */ 223 public DataAdapter getAdapter(Object in) { 224 if (in instanceof BufferedReader) { 225 BufferedReader inputStream = (BufferedReader) in; 226 return new StreamDataAdapter(inputStream); 227 } else if (in instanceof double[]) { 228 double[] inputArray = (double[]) in; 229 return new ArrayDataAdapter(inputArray); 230 } else { 231 throw MathRuntimeException.createIllegalArgumentException( 232 "input data comes from unsupported datasource: {0}, " + 233 "supported sources: {1}, {2}", 234 in.getClass().getName(), 235 BufferedReader.class.getName(), double[].class.getName()); 236 } 237 } 238 } 239 /** 240 * <code>DataAdapter</code> for data provided through some input stream 241 */ 242 private class StreamDataAdapter extends DataAdapter{ 243 244 /** Input stream providng access to the data */ 245 private BufferedReader inputStream; 246 247 /** 248 * Create a StreamDataAdapter from a BufferedReader 249 * 250 * @param in BufferedReader input stream 251 */ 252 public StreamDataAdapter(BufferedReader in){ 253 super(); 254 inputStream = in; 255 } 256 /** 257 * Computes binStats 258 * 259 * @param min minimum value 260 * @param delta grid size 261 * @throws IOException if an IO error occurs 262 */ 263 @Override 264 public void computeBinStats(double min, double delta) 265 throws IOException { 266 String str = null; 267 double val = 0.0d; 268 while ((str = inputStream.readLine()) != null) { 269 val = Double.parseDouble(str); 270 SummaryStatistics stats = binStats.get(findBin(min, val, delta)); 271 stats.addValue(val); 272 } 273 274 inputStream.close(); 275 inputStream = null; 276 } 277 /** 278 * Computes sampleStats 279 * 280 * @throws IOException if an IOError occurs 281 */ 282 @Override 283 public void computeStats() throws IOException { 284 String str = null; 285 double val = 0.0; 286 sampleStats = new SummaryStatistics(); 287 while ((str = inputStream.readLine()) != null) { 288 val = Double.valueOf(str).doubleValue(); 289 sampleStats.addValue(val); 290 } 291 inputStream.close(); 292 inputStream = null; 293 } 294 } 295 296 /** 297 * <code>DataAdapter</code> for data provided as array of doubles. 298 */ 299 private class ArrayDataAdapter extends DataAdapter{ 300 301 /** Array of input data values */ 302 private double[] inputArray; 303 304 /** 305 * Construct an ArrayDataAdapter from a double[] array 306 * 307 * @param in double[] array holding the data 308 */ 309 public ArrayDataAdapter(double[] in){ 310 super(); 311 inputArray = in; 312 } 313 /** 314 * Computes sampleStats 315 * 316 * @throws IOException if an IO error occurs 317 */ 318 @Override 319 public void computeStats() throws IOException { 320 sampleStats = new SummaryStatistics(); 321 for (int i = 0; i < inputArray.length; i++) { 322 sampleStats.addValue(inputArray[i]); 323 } 324 } 325 /** 326 * Computes binStats 327 * 328 * @param min minimum value 329 * @param delta grid size 330 * @throws IOException if an IO error occurs 331 */ 332 @Override 333 public void computeBinStats(double min, double delta) 334 throws IOException { 335 for (int i = 0; i < inputArray.length; i++) { 336 SummaryStatistics stats = 337 binStats.get(findBin(min, inputArray[i], delta)); 338 stats.addValue(inputArray[i]); 339 } 340 } 341 } 342 343 /** 344 * Fills binStats array (second pass through data file). 345 * 346 * @param in object providing access to the data 347 * @throws IOException if an IO error occurs 348 */ 349 private void fillBinStats(Object in) throws IOException { 350 // Load array of bin upper bounds -- evenly spaced from min - max 351 double min = sampleStats.getMin(); 352 double max = sampleStats.getMax(); 353 double delta = (max - min)/(Double.valueOf(binCount)).doubleValue(); 354 double[] binUpperBounds = new double[binCount]; 355 binUpperBounds[0] = min + delta; 356 for (int i = 1; i< binCount - 1; i++) { 357 binUpperBounds[i] = binUpperBounds[i-1] + delta; 358 } 359 binUpperBounds[binCount -1] = max; 360 361 // Initialize binStats ArrayList 362 if (!binStats.isEmpty()) { 363 binStats.clear(); 364 } 365 for (int i = 0; i < binCount; i++) { 366 SummaryStatistics stats = new SummaryStatistics(); 367 binStats.add(i,stats); 368 } 369 370 // Filling data in binStats Array 371 DataAdapterFactory aFactory = new DataAdapterFactory(); 372 DataAdapter da = aFactory.getAdapter(in); 373 try { 374 da.computeBinStats(min, delta); 375 } catch (IOException ioe) { 376 // don't wrap exceptions which are already IOException 377 throw ioe; 378 } catch (RuntimeException rte) { 379 // don't wrap RuntimeExceptions 380 throw rte; 381 } catch (Exception e) { 382 throw MathRuntimeException.createIOException(e); 383 } 384 385 // Assign upperBounds based on bin counts 386 upperBounds = new double[binCount]; 387 upperBounds[0] = 388 ((double) binStats.get(0).getN()) / (double) sampleStats.getN(); 389 for (int i = 1; i < binCount-1; i++) { 390 upperBounds[i] = upperBounds[i-1] + 391 ((double) binStats.get(i).getN()) / (double) sampleStats.getN(); 392 } 393 upperBounds[binCount-1] = 1.0d; 394 } 395 396 /** 397 * Returns the index of the bin to which the given value belongs 398 * 399 * @param min the minimum value 400 * @param value the value whose bin we are trying to find 401 * @param delta the grid size 402 * @return the index of the bin containing the value 403 */ 404 private int findBin(double min, double value, double delta) { 405 return Math.min( 406 Math.max((int) Math.ceil((value- min) / delta) - 1, 0), 407 binCount - 1); 408 } 409 410 /** 411 * Generates a random value from this distribution. 412 * 413 * @return the random value. 414 * @throws IllegalStateException if the distribution has not been loaded 415 */ 416 public double getNextValue() throws IllegalStateException { 417 418 if (!loaded) { 419 throw MathRuntimeException.createIllegalStateException("distribution not loaded"); 420 } 421 422 // Start with a uniformly distributed random number in (0,1) 423 double x = Math.random(); 424 425 // Use this to select the bin and generate a Gaussian within the bin 426 for (int i = 0; i < binCount; i++) { 427 if (x <= upperBounds[i]) { 428 SummaryStatistics stats = binStats.get(i); 429 if (stats.getN() > 0) { 430 if (stats.getStandardDeviation() > 0) { // more than one obs 431 return randomData.nextGaussian 432 (stats.getMean(),stats.getStandardDeviation()); 433 } else { 434 return stats.getMean(); // only one obs in bin 435 } 436 } 437 } 438 } 439 throw new MathRuntimeException("no bin selected"); 440 } 441 442 /** 443 * Returns a {@link StatisticalSummary} describing this distribution. 444 * <strong>Preconditions:</strong><ul> 445 * <li>the distribution must be loaded before invoking this method</li></ul> 446 * 447 * @return the sample statistics 448 * @throws IllegalStateException if the distribution has not been loaded 449 */ 450 public StatisticalSummary getSampleStats() { 451 return sampleStats; 452 } 453 454 /** 455 * Returns the number of bins. 456 * 457 * @return the number of bins. 458 */ 459 public int getBinCount() { 460 return binCount; 461 } 462 463 /** 464 * Returns a List of {@link SummaryStatistics} instances containing 465 * statistics describing the values in each of the bins. The list is 466 * indexed on the bin number. 467 * 468 * @return List of bin statistics. 469 */ 470 public List<SummaryStatistics> getBinStats() { 471 return binStats; 472 } 473 474 /** 475 * Returns (a fresh copy of) the array of upper bounds for the bins. 476 Bins are: <br/> 477 * [min,upperBounds[0]],(upperBounds[0],upperBounds[1]],..., 478 * (upperBounds[binCount-1],max] 479 * 480 * @return array of bin upper bounds 481 */ 482 public double[] getUpperBounds() { 483 int len = upperBounds.length; 484 double[] out = new double[len]; 485 System.arraycopy(upperBounds, 0, out, 0, len); 486 return out; 487 } 488 489 /** 490 * Property indicating whether or not the distribution has been loaded. 491 * 492 * @return true if the distribution has been loaded 493 */ 494 public boolean isLoaded() { 495 return loaded; 496 } 497 }