001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.math.stat.correlation;
018    
019    import org.apache.commons.math.MathException;
020    import org.apache.commons.math.MathRuntimeException;
021    import org.apache.commons.math.distribution.TDistribution;
022    import org.apache.commons.math.distribution.TDistributionImpl;
023    import org.apache.commons.math.linear.RealMatrix;
024    import org.apache.commons.math.linear.BlockRealMatrix;
025    import org.apache.commons.math.stat.regression.SimpleRegression;
026    
027    /**
028     * Computes Pearson's product-moment correlation coefficients for pairs of arrays
029     * or columns of a matrix.
030     * 
031     * <p>The constructors that take <code>RealMatrix</code> or 
032     * <code>double[][]</code> arguments generate correlation matrices.  The
033     * columns of the input matrices are assumed to represent variable values.
034     * Correlations are given by the formula</p>
035     * <code>cor(X, Y) = &Sigma;[(x<sub>i</sub> - E(X))(y<sub>i</sub> - E(Y))] / [(n - 1)s(X)s(Y)]</code>
036     * where <code>E(X)</code> is the mean of <code>X</code>, <code>E(Y)</code>
037     * is the mean of the <code>Y</code> values and s(X), s(Y) are standard deviations.
038     * 
039     * @version $Revision: 799857 $ $Date: 2009-08-01 09:07:12 -0400 (Sat, 01 Aug 2009) $
040     * @since 2.0
041     */
042    public class PearsonsCorrelation {
043        
044        /** correlation matrix */
045        private final RealMatrix correlationMatrix;
046        
047        /** number of observations */
048        private final int nObs;
049        
050        /**
051         * Create a PearsonsCorrelation instance without data
052         */
053        public PearsonsCorrelation() {
054            super();
055            correlationMatrix = null;
056            nObs = 0;
057        }
058        
059        /**
060         * Create a PearsonsCorrelation from a rectangular array
061         * whose columns represent values of variables to be correlated.
062         * 
063         * @param data rectangular array with columns representing variables
064         * @throws IllegalArgumentException if the input data array is not
065         * rectangular with at least two rows and two columns.
066         */
067        public PearsonsCorrelation(double[][] data) {
068            this(new BlockRealMatrix(data));
069        }
070        
071        /**
072         * Create a PearsonsCorrelation from a RealMatrix whose columns
073         * represent variables to be correlated.
074         * 
075         * @param matrix matrix with columns representing variables to correlate
076         */
077        public PearsonsCorrelation(RealMatrix matrix) {
078            checkSufficientData(matrix);
079            nObs = matrix.getRowDimension();
080            correlationMatrix = computeCorrelationMatrix(matrix);
081        }
082        
083        /**
084         * Create a PearsonsCorrelation from a {@link Covariance}.  The correlation
085         * matrix is computed by scaling the Covariance's covariance matrix.
086         * The Covariance instance must have been created from a data matrix with
087         * columns representing variable values.
088         * 
089         * @param covariance Covariance instance
090         */
091        public PearsonsCorrelation(Covariance covariance) {
092            RealMatrix covarianceMatrix = covariance.getCovarianceMatrix();
093            if (covarianceMatrix == null) {
094                throw MathRuntimeException.createIllegalArgumentException("covariance matrix is null");
095            }
096            nObs = covariance.getN();
097            correlationMatrix = covarianceToCorrelation(covarianceMatrix);
098        }
099        
100        /**
101         * Create a PearsonsCorrelation from a covariance matrix.  The correlation
102         * matrix is computed by scaling the covariance matrix.
103         * 
104         * @param covarianceMatrix covariance matrix
105         * @param numberOfObservations the number of observations in the dataset used to compute
106         * the covariance matrix
107         */
108        public PearsonsCorrelation(RealMatrix covarianceMatrix, int numberOfObservations) {
109            nObs = numberOfObservations;
110            correlationMatrix = covarianceToCorrelation(covarianceMatrix);
111            
112        }
113        
114        /**
115         * Returns the correlation matrix
116         * 
117         * @return correlation matrix
118         */
119        public RealMatrix getCorrelationMatrix() {
120            return correlationMatrix;  
121        }
122        
123        /**
124         * Returns a matrix of standard errors associated with the estimates
125         * in the correlation matrix.<br/>
126         * <code>getCorrelationStandardErrors().getEntry(i,j)</code> is the standard
127         * error associated with <code>getCorrelationMatrix.getEntry(i,j)</code>
128         * <p>The formula used to compute the standard error is <br/>
129         * <code>SE<sub>r</sub> = ((1 - r<sup>2</sup>) / (n - 2))<sup>1/2</sup></code>
130         * where <code>r</code> is the estimated correlation coefficient and 
131         * <code>n</code> is the number of observations in the source dataset.</p>
132         * 
133         * @return matrix of correlation standard errors
134         */
135        public RealMatrix getCorrelationStandardErrors() {
136            int nVars = correlationMatrix.getColumnDimension();
137            double[][] out = new double[nVars][nVars];
138            for (int i = 0; i < nVars; i++) {
139                for (int j = 0; j < nVars; j++) {
140                    double r = correlationMatrix.getEntry(i, j);
141                    out[i][j] = Math.sqrt((1 - r * r) /(nObs - 2));
142                }
143            }
144            return new BlockRealMatrix(out);
145        }
146    
147        /**
148         * Returns a matrix of p-values associated with the (two-sided) null
149         * hypothesis that the corresponding correlation coefficient is zero.
150         * <p><code>getCorrelationPValues().getEntry(i,j)</code> is the probability
151         * that a random variable distributed as <code>t<sub>n-2</sub></code> takes
152         * a value with absolute value greater than or equal to <br>
153         * <code>|r|((n - 2) / (1 - r<sup>2</sup>))<sup>1/2</sup></code></p>
154         * <p>The values in the matrix are sometimes referred to as the 
155         * <i>significance</i> of the corresponding correlation coefficients.</p>
156         * 
157         * @return matrix of p-values
158         * @throws MathException if an error occurs estimating probabilities
159         */
160        public RealMatrix getCorrelationPValues() throws MathException {
161            TDistribution tDistribution = new TDistributionImpl(nObs - 2);
162            int nVars = correlationMatrix.getColumnDimension();
163            double[][] out = new double[nVars][nVars];
164            for (int i = 0; i < nVars; i++) {
165                for (int j = 0; j < nVars; j++) {
166                    if (i == j) {
167                        out[i][j] = 0d;
168                    } else {
169                        double r = correlationMatrix.getEntry(i, j);
170                        double t = Math.abs(r * Math.sqrt((nObs - 2)/(1 - r * r)));
171                        out[i][j] = 2 * (1 - tDistribution.cumulativeProbability(t));
172                    }
173                }
174            }
175            return new BlockRealMatrix(out);
176        }
177        
178        
179        /**
180         * Computes the correlation matrix for the columns of the
181         * input matrix.
182         * 
183         * @param matrix matrix with columns representing variables to correlate
184         * @return correlation matrix
185         */
186        public RealMatrix computeCorrelationMatrix(RealMatrix matrix) {
187            int nVars = matrix.getColumnDimension();
188            RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars);
189            for (int i = 0; i < nVars; i++) {
190                for (int j = 0; j < i; j++) {
191                  double corr = correlation(matrix.getColumn(i), matrix.getColumn(j));
192                  outMatrix.setEntry(i, j, corr);
193                  outMatrix.setEntry(j, i, corr);
194                }
195                outMatrix.setEntry(i, i, 1d);
196            }
197            return outMatrix;
198        }
199        
200        /**
201         * Computes the correlation matrix for the columns of the
202         * input rectangular array.  The colums of the array represent values
203         * of variables to be correlated.
204         * 
205         * @param data matrix with columns representing variables to correlate
206         * @return correlation matrix
207         */
208        public RealMatrix computeCorrelationMatrix(double[][] data) {
209           return computeCorrelationMatrix(new BlockRealMatrix(data));
210        }
211        
212        /**
213         * Computes the Pearson's product-moment correlation coefficient between the two arrays.
214         * 
215         * </p>Throws IllegalArgumentException if the arrays do not have the same length
216         * or their common length is less than 2</p>
217         *
218         * @param xArray first data array
219         * @param yArray second data array
220         * @return Returns Pearson's correlation coefficient for the two arrays 
221         * @throws  IllegalArgumentException if the arrays lengths do not match or
222         * there is insufficient data
223         */
224        public double correlation(final double[] xArray, final double[] yArray) throws IllegalArgumentException {
225            SimpleRegression regression = new SimpleRegression();
226            if(xArray.length == yArray.length && xArray.length > 1) {
227                for(int i=0; i<xArray.length; i++) {
228                    regression.addData(xArray[i], yArray[i]);
229                }
230                return regression.getR();
231            }
232            else {
233                throw MathRuntimeException.createIllegalArgumentException(
234                        "invalid array dimensions. xArray has size {0}; yArray has {1} elements",
235                        xArray.length, yArray.length);
236            }
237        }
238        
239        /**
240         * Derives a correlation matrix from a covariance matrix.
241         * 
242         * <p>Uses the formula <br/>
243         * <code>r(X,Y) = cov(X,Y)/s(X)s(Y)</code> where 
244         * <code>r(&middot,&middot;)</code> is the correlation coefficient and
245         * <code>s(&middot;)</code> means standard deviation.</p>
246         * 
247         * @param covarianceMatrix the covariance matrix
248         * @return correlation matrix
249         */
250        public RealMatrix covarianceToCorrelation(RealMatrix covarianceMatrix) {
251            int nVars = covarianceMatrix.getColumnDimension();
252            RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars);
253            for (int i = 0; i < nVars; i++) {
254                double sigma = Math.sqrt(covarianceMatrix.getEntry(i, i));
255                outMatrix.setEntry(i, i, 1d);
256                for (int j = 0; j < i; j++) {
257                    double entry = covarianceMatrix.getEntry(i, j) / 
258                           (sigma * Math.sqrt(covarianceMatrix.getEntry(j, j)));
259                    outMatrix.setEntry(i, j, entry);
260                    outMatrix.setEntry(j, i, entry);
261                }
262            }
263            return outMatrix;
264        }
265        
266        /**
267         * Throws IllegalArgumentException of the matrix does not have at least
268         * two columns and two rows
269         * 
270         * @param matrix matrix to check for sufficiency
271         */
272        private void checkSufficientData(final RealMatrix matrix) {
273            int nRows = matrix.getRowDimension();
274            int nCols = matrix.getColumnDimension();
275            if (nRows < 2 || nCols < 2) {
276                throw MathRuntimeException.createIllegalArgumentException(
277                        "insufficient data: only {0} rows and {1} columns.",
278                        nRows, nCols);
279            }
280        }
281    }