/// <summary>
/// Builds a factorization result that describes an eigenvalue decomposition.
/// </summary>
/// <param name="factorizationType">Name of the factorization that produced this result</param>
/// <param name="rank">Rank reported for the factored matrix</param>
/// <param name="determinant">Determinant reported for the factored matrix</param>
/// <param name="eigenvalues">Vector of eigenvalues</param>
/// <param name="eigenvectors">Matrix of eigenvectors</param>
/// <param name="D">Diagonal matrix of eigenvalues</param>
public MatrixFactorization(
    string factorizationType,
    int rank,
    double determinant,
    InsightVector eigenvalues,
    InsightMatrix eigenvectors,
    InsightMatrix D)
{
    // General descriptors of the decomposition
    FactorizationType = factorizationType;
    Rank = rank;
    Determinant = determinant;

    // Eigen-specific outputs
    Eigenvalues = eigenvalues;
    Eigenvectors = eigenvectors;
    EigenvaluesDiagonal = D;
}
/// <summary>
/// Builds a factorization result that describes a singular value decomposition.
/// </summary>
/// <param name="factorizationType">Name of the factorization that produced this result</param>
/// <param name="rank">Rank reported for the factored matrix</param>
/// <param name="l2Norm">L2 norm reported for the factored matrix</param>
/// <param name="S">Vector of singular values</param>
/// <param name="U">Left singular vectors</param>
/// <param name="VT">Right singular vectors</param>
/// <param name="W">Diagonal matrix of singular values</param>
public MatrixFactorization(
    string factorizationType,
    int rank,
    double l2Norm,
    InsightVector S,
    InsightMatrix U,
    InsightMatrix VT,
    InsightMatrix W)
{
    // General descriptors of the decomposition
    FactorizationType = factorizationType;
    Rank = rank;
    L2Norm = l2Norm;

    // SVD-specific outputs
    SingularValues = S;
    LeftSingularVectors = U;
    RightSingularVectors = VT;
    SingularValuesDiagonal = W;
}
/// <summary>
/// Measures the distance between two vectors with the requested algorithm.
/// </summary>
/// <param name="u">1st vector</param>
/// <param name="v">2nd vector</param>
/// <param name="distanceMethod">Algorithm to use for the distance calculation; any value other
/// than Euclidean or Hamming is handled with Manhattan distance</param>
/// <returns>Distance between the two vectors</returns>
public static double DistanceFrom(this InsightVector u, InsightVector v, DistanceMethod distanceMethod)
{
    if (distanceMethod == DistanceMethod.EuclideanDistance)
    {
        var euclidean = new EuclideanDistance();
        return euclidean.CalculateDistance(u, v);
    }

    if (distanceMethod == DistanceMethod.HammingDistance)
    {
        var hamming = new HammingDistance();
        return hamming.CalculateDistance(u, v);
    }

    // Every remaining option falls through to Manhattan distance
    var manhattan = new ManhattanDistance();
    return manhattan.CalculateDistance(u, v);
}
/// <summary>
/// Measures the similarity between two vectors with the requested algorithm.
/// </summary>
/// <param name="u">1st vector</param>
/// <param name="v">2nd vector</param>
/// <param name="similarityMethod">Algorithm to use for the similarity calculation; any value other
/// than cosine similarity or the Jaccard coefficient is handled with Pearson correlation</param>
/// <returns>Similarity between the two vectors</returns>
public static double SimilarityTo(this InsightVector u, InsightVector v, SimilarityMethod similarityMethod)
{
    if (similarityMethod == SimilarityMethod.CosineSimilarity)
    {
        var cosine = new CosineSimilarity();
        return cosine.CalculateSimilarity(u, v);
    }

    if (similarityMethod == SimilarityMethod.JaccardCoefficient)
    {
        var jaccard = new JaccardCoefficient();
        return jaccard.CalculateSimilarity(u, v);
    }

    // Every remaining option falls through to Pearson correlation
    var pearson = new PearsonCorrelation();
    return pearson.CalculateSimilarity(u, v);
}
/// <summary>
/// Computes the Manhattan distance between two vectors: the sum of the absolute
/// element-wise differences.
/// </summary>
/// <remarks>Range is 0 to infinity</remarks>
/// <param name="u">1st vector</param>
/// <param name="v">2nd vector</param>
/// <returns>Distance between the two vectors</returns>
public double CalculateDistance(InsightVector u, InsightVector v)
{
    int size = u.Count;
    if (size != v.Count)
    {
        throw new Exception("Vector lengths must be equal.");
    }

    double total = 0;
    int position = 0;
    while (position < size)
    {
        total += Math.Abs(u[position] - v[position]);
        position++;
    }

    return total;
}
/// <summary>
/// Computes the Euclidean distance between two vectors: the square root of the sum of
/// squared element-wise differences.
/// </summary>
/// <remarks>Range is 0 to infinity</remarks>
/// <param name="u">1st vector</param>
/// <param name="v">2nd vector</param>
/// <returns>Distance between the two vectors</returns>
public double CalculateDistance(InsightVector u, InsightVector v)
{
    int size = u.Count;
    if (size != v.Count)
    {
        throw new Exception("Vector lengths must be equal.");
    }

    double squaredTotal = 0;
    int position = 0;
    while (position < size)
    {
        squaredTotal += (u[position] - v[position]) * (u[position] - v[position]);
        position++;
    }

    return Math.Sqrt(squaredTotal);
}
/// <summary>
/// Computes the extended Jaccard coefficient of two vectors:
/// (u . v) / (|u|^2 + |v|^2 - u . v).
/// </summary>
/// <remarks>Range is 0 to 1</remarks>
/// <param name="u">1st vector</param>
/// <param name="v">2nd vector</param>
/// <returns>Similarity between the two vectors; 0 when the denominator is exactly 0</returns>
public double CalculateSimilarity(InsightVector u, InsightVector v)
{
    int size = u.Count;
    if (size != v.Count)
    {
        throw new Exception("Vector lengths must be equal.");
    }

    double squaredNormU = 0;
    double squaredNormV = 0;
    double dotProduct = 0;

    int position = 0;
    while (position < size)
    {
        squaredNormU += u[position] * u[position];
        squaredNormV += v[position] * v[position];
        dotProduct += u[position] * v[position];
        position++;
    }

    double divisor = squaredNormU + squaredNormV - dotProduct;
    if (divisor == 0)
    {
        // Avoid dividing by zero (e.g. both vectors are all zeros)
        return 0;
    }

    return dotProduct / divisor;
}
/// <summary>
/// Produces a column-normalized matrix by first centering the columns with the
/// supplied means and then scaling the centered result.
/// </summary>
/// <param name="meanVector">Vector with pre-computed column means</param>
/// <returns>Column-normalized matrix</returns>
public InsightMatrix Normalize(InsightVector meanVector)
{
    // Step 1: subtract the supplied column means
    var centered = this.Center(meanVector);

    // Step 2: scale the centered columns
    return centered.Scale();
}
/// <summary>
/// Performs linear discriminant analysis on the input data set. Extra parameters
/// are used to specify the criteria or methodology used to limit the number of features
/// in the transformed data set. At most one extra parameter should be specified; when both
/// are given the feature limit takes precedence.
/// </summary>
/// <remarks>
/// The class label is read from the LAST column of the input matrix. The returned matrix
/// carries the class label in its FIRST column, followed by the projected features.
/// </remarks>
/// <param name="matrix">Input matrix (features followed by a class label column)</param>
/// <param name="featureLimit">Maximum number of features in the new data set</param>
/// <param name="percentThreshold">Specifies the percent of the concept variance to use
/// in limiting the number of features selected for the new data set (range 0-1)</param>
/// <returns>Transformed matrix with reduced number of dimensions</returns>
private InsightMatrix PerformLDA(InsightMatrix matrix, int? featureLimit, double? percentThreshold)
{
    // Calculate the mean vector for the entire data set (skipping the class column).
    // columnCount is both the number of features and the index of the class column.
    int columnCount = matrix.ColumnCount - 1;
    InsightVector totalMean = new InsightVector(columnCount);
    for (int i = 0; i < columnCount; i++)
    {
        totalMean[i] = matrix.Column(i).Mean();
    }

    // Derive a sub-matrix for each class in the data set
    // NOTE(review): assumes Decompose keeps the class column in each sub-matrix - confirm
    List<InsightMatrix> classes = matrix.Decompose(columnCount);

    // Calculate the mean and covariance matrix for each class
    var meanVectors = new List<KeyValuePair<int, InsightVector>>();
    var covariances = new List<InsightMatrix>();
    foreach (var classMatrix in classes)
    {
        InsightVector means = new InsightVector(columnCount);
        for (int i = 0; i < columnCount; i++)
        {
            means[i] = classMatrix.Column(i).Mean();
        }

        // Keep the number of samples in the class alongside the mean vector -
        // both are needed for the between-class scatter matrix below
        meanVectors.Add(new KeyValuePair<int, InsightVector>(classMatrix.RowCount, means));

        // Drop the class column then compute the covariance matrix for this class
        InsightMatrix covariance = classMatrix.SubMatrix(0, classMatrix.RowCount, 0, classMatrix.ColumnCount - 1);
        covariance = covariance.Center().CovarianceMatrix(true);
        covariances.Add(covariance);
    }

    // Calculate the within-class scatter matrix
    InsightMatrix withinClassScatter = covariances.Aggregate((x, y) => new InsightMatrix((x + y)));

    // Calculate the between-class scatter matrix
    InsightMatrix betweenClassScatter = meanVectors.Aggregate(
        new InsightMatrix(totalMean.Count),
        (x, y) => x + (y.Key *
            (y.Value - totalMean).ToColumnMatrix() *
            (y.Value - totalMean).ToColumnMatrix().Transpose()));

    // Compute the LDA projection and perform eigenvalue decomposition on the projected matrix
    InsightMatrix projection = new InsightMatrix(
        (withinClassScatter.Inverse() * betweenClassScatter));
    MatrixFactorization evd = projection.EigenvalueDecomposition();
    int rank = evd.Eigenvalues.Where(x => x > 0.001).Count();

    // Determine the number of features to keep for the final data set
    if (featureLimit != null)
    {
        // Enforce a raw numeric feature limit
        if (rank > featureLimit)
        {
            rank = featureLimit.Value;
        }
    }
    else if (percentThreshold != null)
    {
        // Limit to a percent of the variance in the data set (represented by the sum of the eigenvalues)
        double totalVariance = evd.Eigenvalues.Sum() * percentThreshold.Value;
        double accumulatedVariance = 0;
        rank = 0;

        // Accumulate from the largest eigenvalue down so the count matches the vectors
        // extracted below (which are also taken largest-first). Enumerating the sequence
        // also guarantees we can never index past the last eigenvalue.
        foreach (var eigenvalue in evd.Eigenvalues.OrderByDescending(x => x))
        {
            if (accumulatedVariance >= totalVariance)
            {
                break;
            }

            accumulatedVariance += eigenvalue;
            rank++;
        }
    }

    // Extract the most important vectors (in order by eigenvalue size)
    InsightMatrix projectionVectors = new InsightMatrix(evd.Eigenvalues.Count, rank);
    for (int i = 0; i < rank; i++)
    {
        // Find the largest remaining eigenvalue
        int index = evd.Eigenvalues.MaxIndex();
        projectionVectors.SetColumn(i, evd.Eigenvectors.Column(index));

        // Set this position to zero so the next iteration captures the next-largest eigenvalue
        evd.Eigenvalues[index] = 0;
    }

    // Multiply each class matrix by the projection vectors
    for (int i = 0; i < classes.Count; i++)
    {
        // Save the class vector. The class label lives in the last column (index columnCount),
        // matching how the means and covariances above were computed.
        InsightVector classVector = classes[i].Column(columnCount);

        // Project only the feature columns (0 .. columnCount - 1), never the class label
        InsightMatrix features = classes[i].SubMatrix(0, classes[i].RowCount, 0, columnCount);
        classes[i] = (projectionVectors.Transpose() * features.Transpose()).Transpose();

        // Insert the class vector back into the matrix (as the first column)
        classes[i] = classes[i].InsertColumn(0, classVector);
    }

    // Concatenate back into a single matrix
    InsightMatrix result = classes.Aggregate((x, y) => x.Stack(y));

    return result;
}
/// <summary>
/// Computes the total error of the solution with parameters theta, using the regularized
/// cross-entropy (log-loss) cost:
/// J = -(1/m) * sum(y * log(h) + (1 - y) * log(1 - h)) + (lambda / 2m) * sum(theta_j^2),
/// where h = sigmoid(X * theta) and m is the number of rows in X.
/// </summary>
/// <param name="X">Training data</param>
/// <param name="y">Target variable</param>
/// <param name="theta">Model parameters</param>
/// <param name="lambda">Regularization weight</param>
/// <returns>Solution error</returns>
private double ComputeError(InsightMatrix X, InsightVector y, InsightVector theta, double lambda)
{
    // Hypothesis h = sigmoid(X * theta); computed once and shared by both cost terms
    var hypothesis = Sigmoid((X * theta.ToColumnMatrix()).Column(0));

    // y * log(h): contribution of the positive examples
    var positiveTerm = y.Multiply(hypothesis.Log());

    // (1 - y) * log(1 - h): contribution of the negative examples.
    // The subtraction must happen BEFORE taking the log.
    var negativeTerm = (1 - y).Multiply((1 - hypothesis).Log());

    // Regularization skips the first parameter
    // NOTE(review): presumably theta[0] is the intercept term - confirm against the training routine
    var thetaSub = theta.SubVector(1, theta.Count - 1);
    var reg = (lambda / (2.0 * X.RowCount)) * thetaSub.Power(2).Sum();

    // Both log terms are non-positive, so negate their sum to obtain a non-negative cost
    return -(positiveTerm.Sum() + negativeTerm.Sum()) / X.RowCount + reg;
}
/// <summary>
/// Applies the sigmoid (logistic) function 1 / (1 + e^-x) to every element of a vector.
/// </summary>
/// <param name="values">Original vector</param>
/// <returns>New vector holding the sigmoid of each element</returns>
private InsightVector Sigmoid(InsightVector values)
{
    Func<double, double> logistic = z => 1 / (1 + Math.Exp(-z));

    var squashed = values.Select(logistic).ToList();

    return new InsightVector(squashed);
}
/// <summary>
/// Measures the similarity between two vectors using cosine similarity.
/// </summary>
/// <param name="u">1st vector</param>
/// <param name="v">2nd vector</param>
/// <returns>Similarity between the two vectors</returns>
public static double SimilarityTo(this InsightVector u, InsightVector v)
{
    // Cosine similarity is the measure used when none is specified
    var cosine = new CosineSimilarity();

    return cosine.CalculateSimilarity(u, v);
}
/// <summary>
/// Classifies a single new instance with the trained model by wrapping it in a
/// one-row matrix and delegating to the matrix overload.
/// </summary>
/// <param name="instance">New instance</param>
/// <returns>Classification</returns>
public int Classify(InsightVector instance)
{
    var singleRow = instance.ToRowMatrix();
    var classifications = Classify(singleRow);

    // Only one row was submitted, so the first entry is the answer
    return classifications[0];
}
/// <summary>
/// Measures the distance between two vectors using Euclidean distance.
/// </summary>
/// <param name="u">1st vector</param>
/// <param name="v">2nd vector</param>
/// <returns>Distance between the two vectors</returns>
public static double DistanceFrom(this InsightVector u, InsightVector v)
{
    // Euclidean distance is the measure used when none is specified
    var euclidean = new EuclideanDistance();

    return euclidean.CalculateDistance(u, v);
}
/// <summary>
/// Computes the total error of the solution with parameters theta: the sum of squared
/// residuals divided by twice the row count, plus lambda times the sum of the squared
/// parameters (excluding the first parameter).
/// </summary>
/// <param name="X">Training data</param>
/// <param name="y">Target variable</param>
/// <param name="theta">Model parameters</param>
/// <param name="lambda">Regularization weight</param>
/// <returns>Solution error</returns>
private double ComputeError(InsightMatrix X, InsightVector y, InsightVector theta, double lambda)
{
    // Squared residuals of the prediction (X * theta) against the target
    var residuals = (X * theta.ToColumnMatrix()) - y.ToColumnMatrix();
    var squaredResiduals = residuals.Power(2);

    // Penalty over every parameter except the first one
    var penalized = theta.SubVector(1, theta.Count - 1);
    var penalty = lambda * penalized.Multiply(penalized).Sum();

    var fitError = squaredResiduals.Column(0).Sum() / (2 * X.RowCount);

    return fitError + penalty;
}
/// <summary>
/// Console walkthrough of the library: similarity and distance measures, covariance and
/// correlation, feature extraction, hill-climbing optimization, CSV loading, K-Means
/// clustering, linear regression and logistic regression. Pauses for a key press
/// between sections.
/// </summary>
/// <param name="args">Command-line arguments (unused)</param>
static void Main(string[] args)
{
    // ---- Similarity & distance ----
    Console.WriteLine("Similarity & Distance Examples");
    Console.WriteLine("------------------------------");
    Console.WriteLine(Environment.NewLine);

    InsightVector u = new InsightVector(new double[] { 1, 2, 3, 4, 5 });
    Console.WriteLine("Vector u:");
    Console.WriteLine(u.ToString());
    Console.WriteLine(Environment.NewLine);

    InsightVector v = new InsightVector(new double[] { 5, 4, 3, 2, 1 });
    Console.WriteLine("Vector v:");
    Console.WriteLine(v.ToString());
    Console.WriteLine(Environment.NewLine);

    double distance = u.DistanceFrom(v);
    Console.WriteLine("Euclidean distance (u, v) = {0}", distance.ToString("F4"));
    Console.WriteLine(Environment.NewLine);

    distance = u.DistanceFrom(v, DistanceMethod.HammingDistance);
    Console.WriteLine("Hamming distance (u, v) = {0}", distance.ToString("F4"));
    Console.WriteLine(Environment.NewLine);

    distance = u.DistanceFrom(v, DistanceMethod.ManhattanDistance);
    Console.WriteLine("Manhattan distance (u, v) = {0}", distance.ToString("F4"));
    Console.WriteLine(Environment.NewLine);

    double similarity = u.SimilarityTo(v);
    Console.WriteLine("Cosine similarity (u, v) = {0}", similarity.ToString("F4"));
    Console.WriteLine(Environment.NewLine);

    similarity = u.SimilarityTo(v, SimilarityMethod.JaccardCoefficient);
    Console.WriteLine("Jaccard coefficient (u, v) = {0}", similarity.ToString("F4"));
    Console.WriteLine(Environment.NewLine);

    similarity = u.SimilarityTo(v, SimilarityMethod.PearsonCorrelation);
    Console.WriteLine("Pearson correlation (u, v) = {0}", similarity.ToString("F4"));
    Console.WriteLine(Environment.NewLine);

    Console.ReadKey();

    // ---- Covariance & correlation ----
    Console.WriteLine(Environment.NewLine);
    Console.WriteLine("Covariance & Correlation Examples");
    Console.WriteLine("------------------------------");
    Console.WriteLine(Environment.NewLine);

    InsightMatrix matrix = new InsightMatrix(new double[,] {
        { 2.1, 8 }, { 2.5, 12 }, { 4.0, 14 }, { 3.6, 10 }
    });
    Console.WriteLine("Example matrix:");
    Console.WriteLine(matrix.ToString());
    Console.WriteLine(Environment.NewLine);

    var cov = matrix.CovarianceMatrix();
    Console.WriteLine("Covariance matrix:");
    Console.WriteLine(cov.ToString());
    Console.WriteLine(Environment.NewLine);

    var cor = matrix.CorrelationMatrix();
    Console.WriteLine("Correlation matrix:");
    Console.WriteLine(cor.ToString());
    Console.WriteLine(Environment.NewLine);

    Console.ReadKey();

    // ---- Feature extraction ----
    Console.WriteLine(Environment.NewLine);
    Console.WriteLine("Feature Extraction Examples");
    Console.WriteLine("------------------------------");
    Console.WriteLine(Environment.NewLine);

    InsightMatrix matrix2 = new InsightMatrix(new double[,] {
        { 2.5, 2.4 }, { 0.5, 0.7 }, { 2.2, 2.9 }, { 1.9, 2.2 }, { 3.1, 3.0 },
        { 2.3, 2.7 }, { 2.0, 1.6 }, { 1.0, 1.1 }, { 1.5, 1.6 }, { 1.1, 0.9 }
    });
    Console.WriteLine("First test matrix:");
    Console.WriteLine(matrix2.ToString());
    Console.WriteLine(Environment.NewLine);

    var pca = matrix2.ExtractFeatures(ExtractionMethod.PrincipalComponentAnalysis, 1);
    Console.WriteLine("Result of principal components analysis:");
    Console.WriteLine(pca.ToString());
    Console.WriteLine(Environment.NewLine);

    var svd = matrix2.ExtractFeatures(ExtractionMethod.SingularValueDecomposition, 1);
    Console.WriteLine("Result of singular value decomposition:");
    Console.WriteLine(svd.ToString());
    Console.WriteLine(Environment.NewLine);

    // The last column of this matrix is the class label used by LDA
    InsightMatrix matrix3 = new InsightMatrix(new double[,] {
        { 4, 2, 1 }, { 2, 4, 1 }, { 2, 3, 1 }, { 3, 6, 1 }, { 4, 4, 1 },
        { 9, 10, 2 }, { 6, 8, 2 }, { 9, 5, 2 }, { 8, 7, 2 }, { 10, 8, 2 }
    });
    Console.WriteLine("Second test matrix:");
    Console.WriteLine(matrix3.ToString());
    Console.WriteLine(Environment.NewLine);

    var lda = matrix3.ExtractFeatures(ExtractionMethod.LinearDiscriminantAnalysis);
    Console.WriteLine("Result of linear discriminant analysis:");
    Console.WriteLine(lda.ToString());
    Console.WriteLine(Environment.NewLine);

    Console.ReadKey();

    // ---- Optimization ----
    Console.WriteLine(Environment.NewLine);
    Console.WriteLine("Optimization Examples");
    Console.WriteLine("------------------------------");
    Console.WriteLine(Environment.NewLine);

    // Candidate moves shared by all three hill climbers
    var simpleHillClimber = new SimpleHillClimbing<double>();
    var transforms = new List<Func<double, double>>();
    transforms.Add(x => x + 1);
    transforms.Add(x => x - 1);
    transforms.Add(x => x + 0.3);
    transforms.Add(x => x - 0.3);
    transforms.Add(x => x + 0.1);
    transforms.Add(x => x - 0.1);

    var solution = simpleHillClimber.FindMaxima(
        0, transforms, x => Math.Abs(x) - (x * x));
    Console.WriteLine("Simple hill climber solution = {0}", solution.Solution);
    Console.WriteLine("Simple hill climber score = {0}", solution.Score);
    Console.WriteLine(Environment.NewLine);

    var steepestAscentHillClimber = new SteepestAscentHillClimbing<double>();
    var solution2 = steepestAscentHillClimber.FindMaxima(
        0, transforms, x => Math.Abs(x) - (x * x));
    Console.WriteLine("Steepest ascent hill climber solution = {0}", solution2.Solution);
    Console.WriteLine("Steepest ascent hill climber score = {0}", solution2.Score);
    Console.WriteLine(Environment.NewLine);

    var stochasticHillClimber = new StochasticHillClimbing<double>();
    var solution3 = stochasticHillClimber.FindMaxima(
        0, transforms, x => Math.Abs(x) - (x * x));
    Console.WriteLine("Stocastic hill climber solution = {0}", solution3.Solution);
    Console.WriteLine("Stocastic hill climber score = {0}", solution3.Score);
    Console.WriteLine(Environment.NewLine);

    Console.ReadKey();

    // ---- Data loading ----
    Console.WriteLine(Environment.NewLine);
    Console.WriteLine("Data Loading Examples");
    Console.WriteLine("------------------------------");
    Console.WriteLine(Environment.NewLine);

    InsightMatrix iris = DataLoader.ImportFromCSV("../../../data/iris.data", ',', false, true);
    Console.WriteLine("Iris data set:");
    Console.WriteLine(iris.ToString());

    Console.ReadKey();

    // ---- Clustering ----
    Console.WriteLine(Environment.NewLine);
    Console.WriteLine("Clustering Examples");
    Console.WriteLine("------------------------------");
    Console.WriteLine(Environment.NewLine);

    var clusterResults = iris.Cluster(ClusteringMethod.KMeans);
    Console.WriteLine("K-Means");
    Console.WriteLine("Distortion = {0}", clusterResults.Distortion);
    Console.WriteLine("Centroids:");
    Console.WriteLine(clusterResults.Centroids.ToString());

    Console.ReadKey();

    // Report the results of THIS run (clusterResults2), not the single run above
    var clusterResults2 = iris.Cluster(ClusteringMethod.KMeans, DistanceMethod.EuclideanDistance, 3, 10);
    Console.WriteLine("K-Means (best of 10)");
    Console.WriteLine("Distortion = {0}", clusterResults2.Distortion);
    Console.WriteLine("Centroids:");
    Console.WriteLine(clusterResults2.Centroids.ToString());

    Console.ReadKey();

    // ---- Linear regression ----
    Console.WriteLine(Environment.NewLine);
    Console.WriteLine("Linear Regression");
    Console.WriteLine("------------------------------");
    Console.WriteLine(Environment.NewLine);

    Console.WriteLine("Loading regression sample data...");
    InsightMatrix data1 = DataLoader.ImportFromCSV("../../../data/ex1data1.txt", ',', false, false);

    Console.WriteLine("Training model...");
    var model1 = new LinearRegression();
    model1.Train(data1);

    Console.WriteLine("Model training complete. Parameters:");
    Console.WriteLine(model1.Theta.ToString());
    Console.WriteLine(model1.Error.ToString());

    Console.WriteLine("Predicting output for first data point...");
    // Strip the target column so only the features are passed to the model
    data1 = data1.RemoveColumn(data1.ColumnCount - 1);
    var prediction = model1.Predict(data1.Row(0));
    Console.WriteLine("Prediction = {0}", prediction);

    Console.ReadKey();

    // ---- Logistic regression ----
    Console.WriteLine(Environment.NewLine);
    Console.WriteLine("Logistic Regression");
    Console.WriteLine("------------------------------");
    Console.WriteLine(Environment.NewLine);

    Console.WriteLine("Loading classification sample data...");
    InsightMatrix data2 = DataLoader.ImportFromCSV("../../../data/ex2data1.txt", ',', false, false);

    Console.WriteLine("Training model...");
    var model2 = new LogisticRegression();
    model2.Train(data2);

    Console.WriteLine("Model training complete. Parameters:");
    Console.WriteLine(model2.Theta.ToString());
    Console.WriteLine(model2.Error.ToString());

    Console.WriteLine("Predicting output for first data point...");
    // Strip the target column so only the features are passed to the model
    data2 = data2.RemoveColumn(data2.ColumnCount - 1);
    var classification = model2.Classify(data2.Row(0));
    Console.WriteLine("Classification = {0}", classification);

    Console.ReadKey();
}
/// <summary>
/// Predicts the target for a single new instance with the trained model by wrapping it
/// in a one-row matrix and delegating to the matrix overload.
/// </summary>
/// <param name="instance">New instance</param>
/// <returns>Prediction</returns>
public double Predict(InsightVector instance)
{
    var singleRow = instance.ToRowMatrix();
    var predictions = Predict(singleRow);

    // Only one row was submitted, so the first entry is the answer
    return predictions[0];
}
/// <summary>
/// Returns a new matrix built from this matrix's data with the given row vector
/// inserted at the given index.
/// </summary>
/// <param name="index">Row index</param>
/// <param name="vector">Row vector</param>
/// <returns>Result matrix</returns>
public InsightMatrix InsertRow(int index, InsightVector vector)
{
    var expanded = this.Data.InsertRow(index, vector.Data);

    return new InsightMatrix(expanded);
}
/// <summary>
/// Returns a new matrix built from this matrix's data with a constant-valued row
/// inserted at the given index.
/// </summary>
/// <param name="index">Row index</param>
/// <param name="value">Value copied into every column of the new row</param>
/// <returns>Result matrix</returns>
public InsightMatrix InsertRow(int index, double value)
{
    // Build a row as wide as the matrix, filled with the constant
    var filler = new InsightVector(this.ColumnCount);
    int column = 0;
    while (column < filler.Count)
    {
        filler[column] = value;
        column++;
    }

    var expanded = this.Data.InsertRow(index, filler.Data);

    return new InsightMatrix(expanded);
}
/// <summary>
/// Centers every column by subtracting the matching entry of the supplied mean vector
/// from each value in that column.
/// </summary>
/// <param name="meanVector">Vector with pre-computed column means</param>
/// <returns>Column-centered matrix</returns>
public InsightMatrix Center(InsightVector meanVector)
{
    var centered = new InsightMatrix(this.Data);

    int columns = centered.Data.ColumnCount;
    for (int column = 0; column < columns; column++)
    {
        int rows = centered.Data.RowCount;
        for (int row = 0; row < rows; row++)
        {
            centered.Data[row, column] -= meanVector.Data[column];
        }
    }

    return centered;
}
/// <summary>
/// Performs the K-Means clustering algorithm on the data set using the provided parameters.
/// </summary>
/// <remarks>
/// Centroids are seeded from randomly selected rows, then the algorithm alternates between
/// assigning every row to its nearest centroid and recomputing each centroid as the mean of
/// its assigned rows. Centroid values are rounded to two decimal places, and iteration stops
/// once no rounded centroid value changes.
/// </remarks>
/// <param name="matrix">Input matrix</param>
/// <param name="distanceMethod">Distance measure used to compare instances
/// (Euclidean distance when null)</param>
/// <param name="clusters">Number of desired clusters (3 when null)</param>
/// <returns>Result set that includes cluster centroids, cluster assignments, and total distortion</returns>
private IClusteringResults PerformKMeansClustering(InsightMatrix matrix,
    DistanceMethod? distanceMethod, int? clusters)
{
    if (distanceMethod == null)
    {
        // Default to sum of squared error (equivalent to Euclidean distance)
        distanceMethod = DistanceMethod.EuclideanDistance;
    }

    if (clusters == null)
    {
        // Need to add some type of intelligent way to figure out a good number
        // of clusters to use based on an analysis of the data
        clusters = 3;
    }

    int clusterCount = clusters.Value;
    var assignments = new InsightVector(matrix.RowCount);
    var centroids = new InsightMatrix(clusterCount, matrix.ColumnCount);
    var random = new Random();
    double distortion = -1;

    // Initialize means via random selection. The set of used rows must live OUTSIDE the
    // loop, otherwise it is emptied on every iteration and duplicates are never detected.
    var usedSamples = new HashSet<int>();
    for (int i = 0; i < clusterCount; i++)
    {
        // Random.Next's upper bound is exclusive, so pass RowCount to make every row eligible
        int sample = random.Next(0, matrix.RowCount);

        // Make sure we don't use the same instance more than once. Once every row has been
        // used (more clusters than rows) reuse is allowed so the loop always terminates.
        while (usedSamples.Count < matrix.RowCount && usedSamples.Contains(sample))
        {
            sample = random.Next(0, matrix.RowCount);
        }

        usedSamples.Add(sample);
        centroids.SetRow(i, matrix.Row(sample));
    }

    // Keep going until convergence point is reached
    while (true)
    {
        // Re-initialize the distortion (total error)
        distortion = 0;

        // Assign each point to the nearest mean
        for (int i = 0; i < matrix.RowCount; i++)
        {
            var instance = matrix.Row(i);

            // Compute the proximity to each centroid to find the closest match
            double closestProximity = -1;
            for (int j = 0; j < clusterCount; j++)
            {
                double proximity = instance.DistanceFrom(centroids.Row(j), distanceMethod.Value);

                // The first centroid is always the initial best candidate
                if (j == 0 || proximity < closestProximity)
                {
                    closestProximity = proximity;
                    assignments[i] = j;
                }
            }

            // Add the proximity value to the total distortion for this solution
            distortion += closestProximity;
        }

        // Calculate the new means for each centroid
        var newCentroids = new InsightMatrix(clusterCount, matrix.ColumnCount);
        bool converged = true;

        for (int i = 0; i < clusterCount; i++)
        {
            int instanceCount = assignments.Where(x => x == i).Count();

            // Compute the means for each instance assigned to the current cluster
            for (int j = 0; j < newCentroids.ColumnCount; j++)
            {
                double sum = 0;
                for (int k = 0; k < matrix.RowCount; k++)
                {
                    if (assignments[k] == i)
                    {
                        sum += matrix[k, j];
                    }
                }

                if (instanceCount > 0)
                {
                    // Rounding makes the exact-equality convergence check below reliable
                    newCentroids[i, j] = Math.Round(sum / instanceCount, 2);
                }
                else
                {
                    // An empty cluster keeps its previous centroid
                    newCentroids[i, j] = centroids[i, j];
                }

                if (newCentroids[i, j] != centroids[i, j])
                {
                    converged = false;
                }
            }

            centroids.SetRow(i, newCentroids.Row(i));
        }

        // If the new centroid means did not change then we've reached the final result
        if (converged)
        {
            break;
        }
    }

    return new ClusteringResults(centroids, assignments, distortion);
}
/// <summary>
/// Fits the model to the supplied data using the model's current Alpha, Lambda and
/// Iterations settings, then stores the resulting parameters and error.
/// </summary>
/// <param name="data">Training data</param>
public void Train(InsightMatrix data)
{
    Tuple<InsightVector, InsightVector> fit = PerformLinearRegression(data, Alpha, Lambda, Iterations);

    // Item1 holds the parameters, Item2 holds the error
    Theta = fit.Item1;
    Error = fit.Item2;
}
/// <summary>
/// Overwrites the specified row so that every column holds the provided value.
/// </summary>
/// <param name="index">Row index</param>
/// <param name="value">Value copied into every column of the row</param>
public void SetRow(int index, double value)
{
    // Build a row as wide as the matrix, filled with the constant
    var filler = new InsightVector(this.ColumnCount);
    int column = 0;
    while (column < filler.Count)
    {
        filler[column] = value;
        column++;
    }

    this.Data.SetRow(index, filler.Data);
}
/// <summary>
/// Performs linear regression on the input data using batch gradient descent.
/// </summary>
/// <remarks>
/// The last column of <paramref name="data"/> is treated as the target variable. A column of
/// ones is prepended for the intercept term, and the intercept (theta[0]) is not regularized.
/// </remarks>
/// <param name="data">Training data</param>
/// <param name="alpha">The learning rate for the algorithm</param>
/// <param name="lambda">The regularization weight for the algorithm</param>
/// <param name="iters">The number of training iterations to run</param>
/// <returns>Tuple containing the parameter vector and the per-iteration error vector</returns>
private Tuple<InsightVector, InsightVector> PerformLinearRegression(InsightMatrix data,
    double alpha, double lambda, int iters)
{
    // First add a ones column for the intercept term
    data = data.InsertColumn(0, 1);

    // Split the data into training data and the target variable
    var X = data.RemoveColumn(data.ColumnCount - 1);
    var y = data.Column(data.ColumnCount - 1);

    // Initialize several variables needed for the computation
    var theta = new InsightVector(X.ColumnCount);
    var temp = new InsightVector(X.ColumnCount);
    var error = new InsightVector(iters);

    // Perform gradient descent on the parameters theta
    for (int i = 0; i < iters; i++)
    {
        // Residuals of the current prediction; shared by every parameter update this iteration
        var delta = (X * theta.ToColumnMatrix()) - y.ToColumnMatrix();

        for (int j = 0; j < theta.Count; j++)
        {
            // Gradient of the squared-error term with respect to theta[j], scaled by the learning rate
            var inner = delta.Multiply(X.SubMatrix(0, X.RowCount, j, 1));
            var step = (alpha / X.RowCount) * inner.Column(0).Sum();

            if (j == 0)
            {
                // The intercept term is not regularized
                temp[j] = theta[j] - step;
            }
            else
            {
                // Gradient of the penalty lambda * theta[j]^2. It is part of the gradient, so it
                // must be scaled by the learning rate and SUBTRACTED along with the error gradient;
                // adding it would push the weights away from zero.
                var reg = (2 * lambda) * theta[j];
                temp[j] = theta[j] - step - (alpha * reg);
            }
        }

        // Apply all parameter updates simultaneously
        theta = temp.Clone();
        error[i] = ComputeError(X, y, theta, lambda);
    }

    return new Tuple<InsightVector, InsightVector>(theta, error);
}
/// <summary>
/// Overwrites the specified row with the values of the given vector.
/// </summary>
/// <param name="index">Row index</param>
/// <param name="vector">Row vector</param>
public void SetRow(int index, InsightVector vector)
{
    var rowValues = vector.Data;

    this.Data.SetRow(index, rowValues);
}
/// <summary>
/// Returns a matrix whose rows are ordered ascending by the values in the designated column.
/// Rows with equal values keep their original relative order.
/// </summary>
/// <param name="columnIndex">Column to use for sorting</param>
/// <returns>Row-sorted matrix</returns>
public InsightMatrix Sort(int columnIndex)
{
    var sorted = new InsightMatrix(this.Data);
    int rowCount = sorted.Data.RowCount;

    // sourceRows[n] = current location of the row that belongs at position n.
    // Materialized immediately so the sort keys are read before any row is moved.
    var sourceRows = Enumerable.Range(0, rowCount)
        .OrderBy(row => sorted.Data[row, columnIndex])
        .ToList();

    for (int target = 0; target < rowCount; target++)
    {
        int source = sourceRows[target];

        // Swap the row currently at the target position with the row that belongs there
        InsightVector displaced = new InsightVector(sorted.Data.Row(target));
        sorted.Data.SetRow(target, sorted.Data.Row(source));
        sorted.Data.SetRow(source, displaced.Data);

        // The displaced row now lives at 'source'; repoint the pending entry that
        // still refers to its old location
        int pending = sourceRows.IndexOf(target, target);
        sourceRows[pending] = source;
    }

    return sorted;
}
/// <summary>
/// Creates a clustering result from its three components.
/// </summary>
/// <param name="centroids">Matrix of the centroids for each cluster</param>
/// <param name="assignments">Cluster assignments</param>
/// <param name="distortion">Distortion value reported for the solution</param>
public ClusteringResults(
    InsightMatrix centroids,
    InsightVector assignments,
    double distortion)
{
    this.Centroids = centroids;
    this.ClusterAssignments = assignments;
    this.Distortion = distortion;
}