/// <summary>
/// Builds a multi-section text report of the correlation results held by this instance.
/// </summary>
/// <param name="includeInputs">When true, the input-input correlation matrix section is emitted first.</param>
/// <param name="includeOutputs">When true, the output-output correlation matrix section is emitted.</param>
/// <param name="numDecimalPlaces">
/// Forwarded to each matrix's ToString and used to format the multiple correlations.
/// When zero or negative, the multiple correlations are printed as rounded integers.
/// </param>
/// <returns>
/// The report text. The input-output matrix and the bracketed multiple correlations are always
/// included; sections are separated by blank lines using "\r\n" line breaks.
/// </returns>
public string ToString(bool includeInputs, bool includeOutputs, int numDecimalPlaces) {
    bool decimals = numDecimalPlaces > 0;
    string decimalsString = decimals ? "#." + new string('0', numDecimalPlaces) : "";

    // Wider numbers get more tabs between them, but there is always at least one.
    string separator = new string('\t', Math.Max(1, numDecimalPlaces / 2));

    Func<double, string> formatCorrelation = c => {
        if (decimals) {
            return c.ToString(decimalsString);
        }
        // Convert.ToInt32 throws OverflowException for NaN and infinities, and a
        // multiple correlation can be NaN (square root of a negative quadratic form,
        // or a NaN Pearson correlation). Print those as-is instead of throwing.
        if (double.IsNaN(c) || double.IsInfinity(c)) {
            return c.ToString();
        }
        return Convert.ToInt32(c).ToString();
    };

    string report = "";
    if (includeInputs) {
        report += " ~~ Input-Input Correlations:\r\n"
            + InputsCorrelationsMatrix.ToString(numDecimalPlaces)
            + "\r\n\r\n";
    }
    if (includeOutputs) {
        report += " ~~ Output-Output Correlations:\r\n"
            + OutputsCorrelationsMatrix.ToString(numDecimalPlaces)
            + "\r\n\r\n";
    }
    report += " ~~ Input-Output Correlations:\r\n"
        + InputsOutputsCorrelationsMatrix.ToString(numDecimalPlaces)
        + "\r\n\r\n";
    report += " ~~ Multiple Correlations:\r\n["
        + String.Join(separator, MultipleCorrelations.Select(formatCorrelation))
        + "]";

    return report;
}
/// <summary>
/// Computes, in a single pass over the rows, the Pearson correlation of every input/input,
/// output/output and input/output column pair, then derives one multiple correlation per
/// output column from the inverted input-input correlation matrix.
/// </summary>
/// <param name="inputsMatrix">
/// Matrix whose columns are the input variables (X columns). Only the first
/// <c>outputsMatrix.NumRows</c> rows are read.
/// </param>
/// <param name="outputsMatrix">
/// Matrix whose columns are the output variables (Y columns). Its row count defines the
/// number of rows (N) processed.
/// </param>
/// <exception cref="ArgumentNullException">Either matrix is null.</exception>
/// <exception cref="ArgumentException">
/// Either matrix has no columns, or <paramref name="inputsMatrix"/> has fewer rows than
/// <paramref name="outputsMatrix"/>.
/// </exception>
public MultiCorrelationHelper(Matrix inputsMatrix, Matrix outputsMatrix) {
    // ReferenceEquals avoids going through any == overload the Matrix type may define.
    if (object.ReferenceEquals(inputsMatrix, null)) {
        throw new ArgumentNullException("inputsMatrix");
    }
    if (object.ReferenceEquals(outputsMatrix, null)) {
        throw new ArgumentNullException("outputsMatrix");
    }
    if (inputsMatrix.NumCols < 1) {
        throw new ArgumentException("The inputs matrix must have at least one column.", "inputsMatrix");
    }
    if (outputsMatrix.NumCols < 1) {
        throw new ArgumentException("The outputs matrix must have at least one column.", "outputsMatrix");
    }
    if (inputsMatrix.NumRows < outputsMatrix.NumRows) {
        throw new ArgumentException("The inputs matrix must have at least as many rows as the outputs matrix.", "inputsMatrix");
    }

    NumInputs = inputsMatrix.NumCols;   // X
    NumOutputs = outputsMatrix.NumCols; // Y
    NumRows = outputsMatrix.NumRows;    // N

    // A column correlates perfectly with itself, so the identity matrix already holds
    // the correct diagonal; only the off-diagonal cells are filled in below.
    InputsCorrelationsMatrix = Matrix.Identity(NumInputs);
    OutputsCorrelationsMatrix = Matrix.Identity(NumOutputs);
    InputsOutputsCorrelationsMatrix = new Matrix(NumOutputs, NumInputs);

    // Per-column running sums plus the means / standard deviations derived from them.
    // NOTE(review): this is a local, so these values are discarded when the constructor
    // returns - confirm whether they were meant to be stored on the instance.
    var helper = new TempHelper {
        InputSums = new double[NumInputs],
        InputSquaredSums = new double[NumInputs],
        InputMeans = new double[NumInputs],
        InputStdDevs = new double[NumInputs],
        OutputSums = new double[NumOutputs],
        OutputSquaredSums = new double[NumOutputs],
        OutputMeans = new double[NumOutputs],
        OutputStdDevs = new double[NumOutputs]
    };

    // Sums of products. Same-kind pairs are stored as an upper triangle:
    // row j holds the pairs (j, j+1), (j, j+2), ... so the last row is empty.
    double[][] inputOutputSums = new double[NumInputs][];
    double[][] inputInputSums = new double[NumInputs][];
    for (int x = 0; x < NumInputs; ++x) {
        inputOutputSums[x] = new double[NumOutputs];
        inputInputSums[x] = new double[NumInputs - 1 - x];
    }
    double[][] outputOutputSums = new double[NumOutputs][];
    for (int y = 0; y < NumOutputs; ++y) {
        outputOutputSums[y] = new double[NumOutputs - 1 - y];
    }

    // Single pass over the rows: every sum needed later is accumulated here.
    for (int i = 0; i < NumRows; ++i) {
        double[] inputs = inputsMatrix[i];
        double[] outputs = outputsMatrix[i];

        for (int j = 0; j < NumInputs; ++j) {
            double input = inputs[j];
            helper.InputSums[j] += input;
            helper.InputSquaredSums[j] += input * input;

            double[] crossSums = inputOutputSums[j];
            for (int k = 0; k < NumOutputs; ++k) {
                crossSums[k] += input * outputs[k];
            }

            // Empty for the last input, so no special case is needed.
            double[] pairSums = inputInputSums[j];
            for (int k = j + 1; k < NumInputs; ++k) {
                pairSums[k - j - 1] += input * inputs[k];
            }
        }

        for (int j = 0; j < NumOutputs; ++j) {
            double output = outputs[j];
            helper.OutputSums[j] += output;
            helper.OutputSquaredSums[j] += output * output;

            double[] pairSums = outputOutputSums[j];
            for (int k = j + 1; k < NumOutputs; ++k) {
                pairSums[k - j - 1] += output * outputs[k];
            }
        }
    }

    // N - 1, passed to StdDev for both inputs and outputs.
    // NOTE(review): the outputs previously passed NumRows here while the inputs passed
    // N - 1; they are now consistent - confirm N - 1 is what StdDev expects.
    int sampleNumRows = NumRows - 1;

    // Input statistics, input-output correlations and input-input correlations.
    for (int j = 0; j < NumInputs; ++j) {
        double sum = helper.InputSums[j];
        double squaredSum = helper.InputSquaredSums[j];

        double mean = sum / NumRows;
        helper.InputMeans[j] = mean;
        helper.InputStdDevs[j] = StdDev(sampleNumRows, mean, squaredSum);

        // Stored as [output, input] so that row k lists output k against every input.
        double[] crossSums = inputOutputSums[j];
        for (int k = 0; k < NumOutputs; ++k) {
            InputsOutputsCorrelationsMatrix[k, j] = StatisticsExtensions.PearsonCorrelationFormula(
                NumRows,
                sum, helper.OutputSums[k],
                squaredSum, helper.OutputSquaredSums[k],
                crossSums[k]
            );
        }

        // Each pair is computed once and mirrored across the diagonal.
        double[] pairSums = inputInputSums[j];
        for (int k = j + 1; k < NumInputs; ++k) {
            double correlation = StatisticsExtensions.PearsonCorrelationFormula(
                NumRows,
                sum, helper.InputSums[k],
                squaredSum, helper.InputSquaredSums[k],
                pairSums[k - j - 1]
            );
            InputsCorrelationsMatrix[j, k] = correlation;
            InputsCorrelationsMatrix[k, j] = correlation;
        }
    }

    // The multiple correlations need the inverse of the complete input-input matrix.
    MultipleCorrelations = new double[NumOutputs];
    var invertedInputs = InputsCorrelationsMatrix.Invert();

    // Output statistics, output-output correlations and the multiple correlations.
    for (int j = 0; j < NumOutputs; ++j) {
        double sum = helper.OutputSums[j];
        double squaredSum = helper.OutputSquaredSums[j];

        double mean = sum / NumRows;
        helper.OutputMeans[j] = mean;
        helper.OutputStdDevs[j] = StdDev(sampleNumRows, mean, squaredSum);

        double[] pairSums = outputOutputSums[j];
        for (int k = j + 1; k < NumOutputs; ++k) {
            double correlation = StatisticsExtensions.PearsonCorrelationFormula(
                NumRows,
                sum, helper.OutputSums[k],
                squaredSum, helper.OutputSquaredSums[k],
                pairSums[k - j - 1]
            );
            OutputsCorrelationsMatrix[j, k] = correlation;
            OutputsCorrelationsMatrix[k, j] = correlation;
        }

        // Row j holds output j's correlation with every input; the multiple correlation
        // is the square root of (inverse . row) . row.
        var inputCorrelations = InputsOutputsCorrelationsMatrix[j];
        MultipleCorrelations[j] = Math.Sqrt(invertedInputs.Dot(inputCorrelations).Dot(inputCorrelations));
    }
}