/// <summary>
/// Returns the distance matrix for a set of peaks.
/// </summary>
/// <param name="core">Core; its <c>Options.ObjectSizeLimit</c> (in megabytes) caps the size of the matrix.</param>
/// <param name="valueMatrix">Matrix whose row vectors are compared with one another.</param>
/// <param name="metric">Metric used to calculate the distance between two row vectors.</param>
/// <param name="prog">Receives a progress update for every row processed.</param>
/// <returns>A distance matrix holding one value for every ordered pair of rows in <paramref name="valueMatrix"/>.</returns>
/// <exception cref="InvalidOperationException">The matrix would be larger than the configured object size limit.</exception>
public static DistanceMatrix Create(Core core, IntensityMatrix valueMatrix, ConfigurationMetric metric, ProgressReporter prog)
{
    const long BytesPerElement = sizeof(double);
    const long BytesPerMegabyte = 1024L * 1024L;

    int n = valueMatrix.NumRows;

    // The size check is done in 64-bit arithmetic. In 32 bits, n * n * 8 wraps around from
    // n = 16384 upwards, which would let an oversized matrix slip past the limit.
    long numElements = (long)n * n;
    long bytesRequired = numElements * BytesPerElement;
    long mbRequired = bytesRequired / BytesPerMegabyte;
    long limit = (long)core.Options.ObjectSizeLimit * BytesPerMegabyte;

    if (bytesRequired > limit)
    {
        // Refuse to create a distance matrix above the user-configured limit
        throw new InvalidOperationException("n = " + n + " input vectors. n * n = " + numElements + " elements in the distance matrix. 8 bytes per element gives " + bytesRequired + " bytes, or " + mbRequired + " megabytes. ObjectSizeLimit is " + core.Options.ObjectSizeLimit + " Mb. Reduce the number of input vectors by filtering the data, or change the limit from the preferences menu. Some algorithms also have the option to disable the distance matrix.");
    }

    double[,] s = new double[n, n];

    prog.Enter("Calculating distance matrix");

    for (int i = 0; i < n; i++)
    {
        prog.SetProgress(i, n);

        // Every ordered pair is calculated (no symmetry is assumed of the metric)
        for (int j = 0; j < n; j++)
        {
            s[i, j] = metric.Calculate(valueMatrix.Vectors[i], valueMatrix.Vectors[j]);
        }
    }

    prog.Leave();

    return new DistanceMatrix(s, valueMatrix);
}
/// <summary>
/// Rates the current assignments based on SUM( |x - c(x)|)^2 (the k-means function).
/// Scores are calculated with a Euclidean metric against the closest centre of the
/// assignment's own cluster, and squared before being tallied.
/// </summary>
/// <param name="core">Core whose clusters are rated.</param>
/// <param name="warning">Set to true if a cluster had no centres and they had to be set from its assignments.</param>
/// <param name="sumPeakClusterScore">Sum of the squared scores of all assignments.</param>
/// <param name="numPeaks">Number of assignments tallied.</param>
/// <param name="sumPeakClusterScoreSigsOnly">As <paramref name="sumPeakClusterScore"/>, but only for clusters whose state is not <c>EStates.None</c>.</param>
/// <param name="numSigPeaks">Number of assignments contributing to <paramref name="sumPeakClusterScoreSigsOnly"/>.</param>
/// <param name="sumWorstTen">Sum, across clusters, of the largest tenth of each cluster's tallied squared scores.</param>
/// <param name="sumHighestTen">Sum, across clusters, of the smallest tenth of each cluster's tallied squared scores.</param>
/// <param name="numTen">Number of scores contributing to each of the two "tenth" sums.</param>
/// <param name="allScores">Receives one (peak, squared score) entry per assignment.</param>
private static void QuantifyAssignments(Core core, out bool warning, out double sumPeakClusterScore, out int numPeaks, out double sumPeakClusterScoreSigsOnly, out int numSigPeaks, out double sumWorstTen, out double sumHighestTen, out int numTen, List<Tuple<Peak, double>> allScores)
{
    // Every tally starts from zero
    warning = false;
    numPeaks = 0;
    numSigPeaks = 0;
    numTen = 0;
    sumPeakClusterScore = 0;
    sumPeakClusterScoreSigsOnly = 0;
    sumWorstTen = 0;
    sumHighestTen = 0;

    // Scores are always Euclidean here
    ArgsMetric euclideanArgs = new ArgsMetric(Algo.ID_METRIC_EUCLIDEAN, null, null);
    ConfigurationMetric euclidean = new ConfigurationMetric();
    euclidean.Args = euclideanArgs;

    foreach (Cluster cluster in core.Clusters)
    {
        // Squared scores of this cluster that count towards the "tenth" sums
        List<double> rankedScores = new List<double>();

        foreach (Assignment assignment in cluster.Assignments.List)
        {
            // A score cannot be calculated without a centre
            if (cluster.Centres.Count == 0)
            {
                cluster.SetCentre(ECentreMode.Average, ECandidateMode.Assignments);
                warning = true;
            }

            double distance = cluster.CalculateScore(assignment.Vector.Values, EDistanceMode.ClosestCentre, euclidean);
            double squared = distance * distance;

            numPeaks++;
            sumPeakClusterScore += squared;
            allScores.Add(new Tuple<Peak, double>(assignment.Peak, squared));

            if (cluster.States == Cluster.EStates.None)
            {
                continue;
            }

            numSigPeaks++;
            sumPeakClusterScoreSigsOnly += squared;
            rankedScores.Add(squared);
        }

        // Ascending order: the smallest scores come first, the largest last
        rankedScores.Sort();

        int tenth = rankedScores.Count / 10;
        int lastIndex = rankedScores.Count - 1;

        for (int rank = 0; rank < tenth; rank++)
        {
            sumHighestTen += rankedScores[rank];
            sumWorstTen += rankedScores[lastIndex - rank];
            numTen++;
        }
    }
}
/// <summary>
/// Creates the arguments for a clustering algorithm.
/// </summary>
/// <param name="id">Passed to the base constructor.</param>
/// <param name="source">Passed to the base constructor.</param>
/// <param name="sigFilter">Stored as <c>PeakFilter</c>.</param>
/// <param name="distance">Stored as <c>Distance</c>.</param>
/// <param name="atypes">Stored as <c>ObsFilter</c>.</param>
/// <param name="splitGroups">Stored as <c>SplitGroups</c>.</param>
/// <param name="suppressMetric">Stored as <c>Statistics</c>.</param>
/// <param name="parameters">Passed to the base constructor.</param>
/// <param name="clusterNamePrefix">Stored as <c>OverrideShortName</c>.</param>
public ArgsClusterer(
    string id,
    IMatrixProvider source,
    PeakFilter sigFilter,
    ConfigurationMetric distance,
    ObsFilter atypes,
    bool splitGroups,
    EClustererStatistics suppressMetric,
    object[] parameters,
    string clusterNamePrefix)
    : base(id, source, parameters)
{
    PeakFilter = sigFilter;
    Distance = distance;
    ObsFilter = atypes;
    SplitGroups = splitGroups;
    Statistics = suppressMetric;
    OverrideShortName = clusterNamePrefix;
}
/// <summary>
/// Asks the user for a matrix, calculates its Euclidean distance matrix and shows the
/// pair of peaks with the smallest distance and the pair with the largest distance.
/// Does nothing if no matrix is chosen.
/// </summary>
void find_distance_range()
{
    ProgressReporter prog = new ProgressReporter(this);

    IMatrixProvider vmatrix = DataSet.ForMatrixProviders(this._core).ShowList(this, null);

    if (vmatrix == null)
    {
        // Nothing was selected
        return;
    }

    ConfigurationMetric metric = new ConfigurationMetric();
    metric.Args = new ArgsMetric(Algo.ID_METRIC_EUCLIDEAN, vmatrix, null);

    DistanceMatrix dmatrix = DistanceMatrix.Create(this._core, vmatrix.Provide, metric, prog);

    double smallest = double.MaxValue;
    double largest = double.MinValue;

    // Only the indices of the best pairs are remembered; -1 means "no pair found yet"
    int smallestIndex1 = -1;
    int smallestIndex2 = -1;
    int largestIndex1 = -1;
    int largestIndex2 = -1;

    // NOTE(review): the matrix is indexed by position in the core's peak list, which assumes
    // the chosen matrix has one row per core peak in the same order - confirm against callers.
    int peakCount = this._core.Peaks.Count;

    for (int peakIndex1 = 0; peakIndex1 < peakCount; peakIndex1++)
    {
        for (int peakIndex2 = 0; peakIndex2 < peakCount; peakIndex2++)
        {
            if (peakIndex1 == peakIndex2)
            {
                // The distance from a peak to itself is not of interest
                continue;
            }

            double result = dmatrix.Values[peakIndex1, peakIndex2];

            if (result > largest)
            {
                largest = result;
                largestIndex1 = peakIndex1;
                largestIndex2 = peakIndex2;
            }

            if (result < smallest)
            {
                smallest = result;
                smallestIndex1 = peakIndex1;
                smallestIndex2 = peakIndex2;
            }
        }
    }

    StringBuilder sb = new StringBuilder();

    // A pair is missing when there are fewer than two peaks, or when no distance was comparable
    // (e.g. every distance was NaN); previously this ended in a NullReferenceException.
    if (smallestIndex1 >= 0)
    {
        sb.AppendLine("| " + this._core.Peaks[smallestIndex1].DisplayName + " - " + this._core.Peaks[smallestIndex2].DisplayName + " | = " + smallest);
    }

    if (largestIndex1 >= 0)
    {
        sb.AppendLine("| " + this._core.Peaks[largestIndex1].DisplayName + " - " + this._core.Peaks[largestIndex2].DisplayName + " | = " + largest);
    }

    if (sb.Length == 0)
    {
        sb.AppendLine("No pair of peaks with a comparable distance was found.");
    }

    FrmInputMultiLine.ShowFixed(this, "Find distance range", "Maximum and minimum differences", "Showing the closest and furthest peaks", sb.ToString());
}
/// <summary>
/// Action completed - gathers the assignments of all real clusters and calculates the statistics.
/// Must only be called once: asserts that no assignments have been gathered yet.
/// </summary>
/// <param name="core">Forwarded to <c>RecalculateStatistics</c>.</param>
/// <param name="metric">Forwarded to <c>RecalculateStatistics</c>.</param>
/// <param name="vmatrix">Forwarded to <c>RecalculateStatistics</c>.</param>
/// <param name="dmatrix">Forwarded to <c>RecalculateStatistics</c>.</param>
/// <param name="statistics">Forwarded to <c>RecalculateStatistics</c>.</param>
/// <param name="prog">Forwarded to <c>RecalculateStatistics</c>.</param>
internal void FinalizeResults(
    Core core,
    ConfigurationMetric metric,
    IntensityMatrix vmatrix,
    DistanceMatrix dmatrix,
    EClustererStatistics statistics,
    ProgressReporter prog)
{
    UiControls.Assert(this.Assignments.IsEmpty(), "FinalizeResults on ClusterResults already called.");

    // Pool the assignments of every real cluster
    foreach (Cluster realCluster in this.RealClusters)
    {
        this.Assignments.AddRange(realCluster.Assignments.List);
    }

    this.RecalculateStatistics(core, metric, vmatrix, dmatrix, statistics, prog);
}
/// <summary>
/// Reads the form's controls into a set of clusterer arguments, passing each input through
/// the checker as it goes. Also updates the visibility of the replicates warning label.
/// </summary>
/// <returns>The arguments described by the form, or null if the checker reports any errors.</returns>
private ArgsClusterer GetSelection()
{
    _checker.Clear();

    // Clustering method
    ClustererBase sel = (ClustererBase)_ecbMethod.SelectedItem;

    // Title and short name - blank text is treated as "not specified"
    string title = string.IsNullOrWhiteSpace(_txtName.Text) ? null : _txtName.Text;
    string shortName = string.IsNullOrWhiteSpace(_txtShortName.Text) ? null : _txtShortName.Text;

    // Method parameters (can only be parsed once a method has been chosen)
    object[] parameters = null;

    if (sel == null)
    {
        _checker.Check(_ecbMethod.ComboBox, false, "A method is required.");
    }
    else
    {
        string paramError;
        parameters = sel.Parameters.TryStringToParams(_core, _txtParams.Text, out paramError);
        _checker.Check(_txtParams, parameters != null, paramError ?? "error");
    }

    // Peak filter
    PeakFilter peakFilter = _ecbPeakFilter.SelectedItem;
    _checker.Check(_ecbPeakFilter.ComboBox, _ecbPeakFilter.HasSelection, "Select a valid peak filter");

    // Statistics - the selected items are summed into a single enum value
    EClustererStatistics suppressMetric = default(EClustererStatistics);

    if (!_cbStatistics.SelectionValid)
    {
        _checker.Check(_cbStatistics.TextBox, false, "Select a valid set of statistics");
    }
    else
    {
        suppressMetric = (EClustererStatistics)_cbStatistics.SelectedItems.Cast<int>().Sum();
    }

    // Distance measure and its parameters
    MetricBase dMet = (MetricBase)_ecbMeasure.SelectedItem;
    object[] dMetParams = null;

    if (dMet == null)
    {
        _checker.Check(_ecbMeasure.ComboBox, false, "Specify a distance measure");
    }
    else
    {
        string measureError;
        dMetParams = dMet.Parameters.TryStringToParams(_core, _txtMeasureParams.Text, out measureError);
        _checker.Check(_txtMeasureParams, dMetParams != null, measureError ?? "error");
    }

    // Source matrix - only validated for methods that support observation filters
    IMatrixProvider src = _ecbSource.SelectedItem;

    if (sel != null && sel.SupportsObservationFilters)
    {
        _checker.Check(_ecbSource.ComboBox, src != null, "Select a valid source");
    }

    _lblRepWarn.Visible = HasReplicates(src);

    // Observation filter - only applicable to methods that support it
    ObsFilter obsFilter;

    if (sel != null && sel.SupportsObservationFilters)
    {
        if (_ecbObsFilter.HasSelection)
        {
            obsFilter = _ecbObsFilter.SelectedItem;
        }
        else
        {
            _checker.Check(_ecbObsFilter.ComboBox, false, "Select a valid observation filter");
            obsFilter = default(ObsFilter);
        }
    }
    else
    {
        obsFilter = null;
    }

    // Stop here if anything failed validation
    if (_checker.HasErrors)
    {
        return null;
    }

    // Distance configuration
    ConfigurationMetric df = null;

    if (dMet != null)
    {
        df = new ConfigurationMetric();
        df.Args = new ArgsMetric(dMet.Id, src, dMetParams) { OverrideDisplayName = dMet.DisplayName };
    }

    // Result
    ArgsClusterer args = new ArgsClusterer(sel.Id, src, peakFilter, df, obsFilter, _chkSepGroups.Checked, suppressMetric, parameters, shortName);
    args.OverrideDisplayName = title;
    args.Comment = _comment;

    return args;
}
/// <summary>
/// d-k-means++
/// Starting from the seed clusters, repeatedly takes the most distant assignment and
/// turns its vector into the exemplar of a new cluster, until a stopping condition is met.
/// Returns the seed clusters plus the new clusters (the new ones are not added to the core
/// here, so make sure to do so).
/// </summary>
/// <param name="vmatrix">Vectors to assign.</param>
/// <param name="seed">Initial clusters; copied into the result.</param>
/// <param name="stoppingDistance">Stop once the most distant assignment scores no more than this (optional).</param>
/// <param name="stoppingCount">Stop once this many clusters exist, the seed included (optional).</param>
/// <param name="metric">Metric used when assigning vectors.</param>
/// <param name="tag">Passed to the constructor of each new cluster.</param>
/// <param name="prog">Progress is reported here.</param>
/// <exception cref="InvalidOperationException">
/// Neither stopping condition is set, more than 1000 iterations are needed, or a new cluster
/// does not end up containing the vector it was created from.
/// </exception>
private static List<Cluster> AutogenerateClusters(IntensityMatrix vmatrix, List<Cluster> seed, double? stoppingDistance, int? stoppingCount, ConfigurationMetric metric, ConfigurationClusterer tag, ProgressReporter prog)
{
    // At least one limit is needed, or the loop has no reason to end
    if (!(stoppingCount.HasValue || stoppingDistance.HasValue))
    {
        throw new InvalidOperationException("No stopping condition set.");
    }

    // The result starts out as a copy of the seed
    List<Cluster> result = new List<Cluster>(seed);

    // An unset limit becomes a value that can never be reached
    int remaining = stoppingCount.HasValue ? stoppingCount.Value - seed.Count : Int32.MaxValue;
    double threshold = stoppingDistance.HasValue ? stoppingDistance.Value : Double.MinValue;

    // Initial assignment to the nearest exemplars
    prog.Enter("Initialising assignments");
    LegacyClustererHelper.Assign(vmatrix, result, ECandidateMode.Exemplars, metric, prog);
    Assignment furthest = GetMostDistantAssignment(result);
    prog.Leave();

    // One new cluster per iteration, for as long as both limits allow
    for (int iteration = 1; remaining > 0 && furthest.Score > threshold; iteration++)
    {
        prog.Enter("Centre generation (iteration " + iteration + ")");

        // Guard against unreasonable limits
        if (iteration > 1000)
        {
            throw new InvalidOperationException("Too many iterations - exiting.");
        }

        // The most distant vector becomes the exemplar of a new cluster
        var created = new Cluster((result.Count + 1).ToString(), tag);
        result.Add(created);
        created.Exemplars.Add(furthest.Vector.Values); // todo: check to prevent multiple assignment?

        // Reassign everything to the closest exemplars
        LegacyClustererHelper.Assign(vmatrix, result, ECandidateMode.Exemplars, metric, prog);

        // Sanity check: the vector should now belong to the cluster made from it
        bool ownsItsVector = created.Assignments.Vectors.Contains(furthest.Vector);

        if (!ownsItsVector)
        {
            throw new InvalidOperationException("Problem creating new cluster from vector - " + furthest.Vector.ToString() + " doesn't like being in its own cluster. Check this vector for discrepancies.");
        }

        // Move on to the next most distant vector
        remaining--;
        furthest = GetMostDistantAssignment(result);
        prog.Leave();
    }

    // Seed clusters followed by the generated ones
    return result;
}
/// <summary>
/// Thread operation to calculate statistics for [stat].
///
/// [stat] is guaranteed to be unique, however stat.Assignment is not, hence stat.Assignment must be locked.
///
/// Currently stat.Assignment.AssignmentStatistics is the only member to be R/W locked, since that is all
/// that is modified.
/// </summary>
/// <param name="statistics">Flags selecting which statistics to calculate.</param>
/// <param name="stat">The assignment (optionally under an observation filter) to calculate statistics for.</param>
/// <param name="realClusters">Clusters passed to the silhouette calculation.</param>
/// <param name="metric">Metric for the "distance from average" statistic; may be null, in which case that statistic is skipped.</param>
/// <param name="prog">Incremented once per call.</param>
private static void Thread_CalculateAssignmentStatistics([Const] EClustererStatistics statistics, [MutableUnsafe] ForStat stat, [Const] Cluster[] realClusters, [Const] ConfigurationMetric metric, [MutableSafe] ProgressParallelHandler prog)
{
    prog.SafeIncrement();

    // STATS: Distance from avg
    if (stat.ClusterVector != null)
    {
        bool wantEuclidean = statistics.HasFlag(EClustererStatistics.EuclideanFromAverage);

        // Euclidean
        if (wantEuclidean)
        {
            double ed = Maths.Euclidean(stat.AssignmentVector.Values, stat.ClusterVector);
            stat.Assignment.AssignmentStatistics.ThreadSafeIndex(CreatePartialKey(stat.ObsFilter, STAT_ASSIGNMENT_EUCLIDEAN_FROM_AVG), ed);
            stat.Assignment.AssignmentStatistics.ThreadSafeIndex(CreatePartialKey(stat.ObsFilter, STAT_ASSIGNMENT_EUCLIDEAN_FROM_AVG_SQUARED), ed * ed);
        }

        // Custom (if applicable) - skipped when the metric is Euclidean and the Euclidean
        // statistic above has already been recorded
        if (metric != null
            && statistics.HasFlag(EClustererStatistics.DistanceFromAverage)
            && !(metric.Args.Id == Algo.ID_METRIC_EUCLIDEAN && wantEuclidean))
        {
            string metricName = metric.ToString();
            string key1 = metricName + STAT_ASSIGNMENT_DISTANCE_FROM_AVG;
            string key2 = metricName + STAT_ASSIGNMENT_DISTANCE_FROM_AVG_SQUARED;
            double dd = metric.Calculate(stat.AssignmentVector.Values, stat.ClusterVector);

            stat.Assignment.AssignmentStatistics.ThreadSafeIndex(CreatePartialKey(stat.ObsFilter, key1), dd);
            stat.Assignment.AssignmentStatistics.ThreadSafeIndex(CreatePartialKey(stat.ObsFilter, key2), dd * dd);
        }
    }

    // STATS: Silhouette
    Cluster nextNearestCluster = null;

    if (statistics.HasFlag(EClustererStatistics.SilhouetteWidth))
    {
        double silhouetteWidth;
        double nextNearestClusterId;

        ClustererStatisticsHelper.CalculateSilhouette(stat, realClusters, out silhouetteWidth, out nextNearestCluster);

        // The cluster's short name is recorded as a number where possible, otherwise as NaN.
        // The null check is defensive: if no next-nearest cluster is returned, NaN is recorded
        // instead of failing on the dereference.
        if (nextNearestCluster == null || !double.TryParse(nextNearestCluster.ShortName, out nextNearestClusterId))
        {
            nextNearestClusterId = double.NaN;
        }

        // Silhouette
        stat.Assignment.AssignmentStatistics.ThreadSafeIndex(CreatePartialKey(stat.ObsFilter, STAT_ASSIGNMENT_SILHOUETTE_WIDTH), silhouetteWidth);
        stat.Assignment.AssignmentStatistics.ThreadSafeIndex(CreatePartialKey(stat.ObsFilter, STAT_ASSIGNMENT_NEXT_NEAREST_CLUSTER), nextNearestClusterId);
    }

    // STATS: Score
    if (stat.ObsFilter == null)
    {
        // Score
        stat.Assignment.AssignmentStatistics.ThreadSafeIndex(STAT_ASSIGNMENT_SCORE, stat.Assignment.Score);

        // Next nearest cluster (null when the silhouette was not calculated)
        stat.Assignment.NextNearestCluster = nextNearestCluster; // Only one ForStat per Assignment has ObsFilter == null so thread safe not required
    }
}
/// <summary>
/// Determines what needs calculating: creates one <c>ForStat</c> per assignment for the given
/// observation filter and appends them to <paramref name="needsCalculating"/>.
/// </summary>
/// <param name="core">Core, used if a distance matrix has to be created.</param>
/// <param name="metric">Metric, used if a distance matrix has to be created.</param>
/// <param name="vmatrix">Unfiltered value matrix.</param>
/// <param name="dmatrix">Distance matrix of <paramref name="vmatrix"/>; may be null. Only used when there is no filter.</param>
/// <param name="statistics">Statistics requested; a distance matrix is only created if the silhouette width is among them.</param>
/// <param name="realClusters">Clusters whose average centres are used as cluster vectors.</param>
/// <param name="obsFilter">Observation filter to apply, or null for the unfiltered data.</param>
/// <param name="needsCalculating">Shared output list; locked while it is appended to.</param>
/// <param name="progP">Incremented once per call.</param>
private void Thread_AddFilterToCalculationList([Const] Core core, [Const] ConfigurationMetric metric, [Const] IntensityMatrix vmatrix, [Const] DistanceMatrix dmatrix, [Const] EClustererStatistics statistics, [Const] Cluster[] realClusters, [Const] ObsFilter obsFilter, [MutableUnsafe] List<ForStat> needsCalculating, [MutableSafe] ProgressParallelHandler progP)
{
    progP.SafeIncrement();

    IntensityMatrix vmatFiltered;
    DistanceMatrix dmatFiltered;
    int[] filteredIndices;

    if (obsFilter == null)
    {
        // No filter: the matrices can be used as they are
        vmatFiltered = vmatrix;
        dmatFiltered = dmatrix;
        filteredIndices = null;
    }
    else
    {
        filteredIndices = vmatrix.Columns.Which(z => obsFilter.Test(z.Observation)).ToArray(); // TODO: Multiple iteration
        vmatFiltered = vmatrix.Subset(null, obsFilter, ESubsetFlags.None);
        dmatFiltered = null; // The unfiltered distances do not apply; created below if needed
    }

    Dictionary<Cluster, IReadOnlyList<double>> centreVectors = new Dictionary<Cluster, IReadOnlyList<double>>();

    foreach (Cluster cluster in realClusters)
    {
        /////////////////////
        // ASSIGNMENT STATS
        var centre = cluster.GetCentre(ECentreMode.Average, ECandidateMode.Assignments);
        IReadOnlyList<double> centreVector = centre.Count != 0 ? centre[0] : null;

        // A cluster without a centre keeps a null vector (there is nothing to extract from)
        if (centreVector != null && filteredIndices != null)
        {
            centreVector = centreVector.Extract(filteredIndices);
        }

        centreVectors.Add(cluster, centreVector);
    }

    bool needsDistanceMatrix = statistics.HasFlag(EClustererStatistics.SilhouetteWidth);

    // Collected locally so that the shared list only has to be locked once
    List<ForStat> pending = new List<ForStat>();

    foreach (Assignment ass in Assignments)
    {
        ForStat f = new ForStat();
        f.Assignment = ass;
        f.ObsFilter = obsFilter;

        if (filteredIndices != null)
        {
            f.AssignmentVector = vmatFiltered.Vectors[ass.Vector.Index];
        }
        else
        {
            f.AssignmentVector = ass.Vector;
        }

        f.ClusterVector = centreVectors[ass.Cluster];

        if (needsDistanceMatrix && dmatFiltered == null)
        {
            // Created at most once, and from the filtered matrix, so that the distances match
            // the vectors the statistics are calculated on.
            dmatFiltered = DistanceMatrix.Create(core, vmatFiltered, metric, ProgressReporter.GetEmpty());
        }

        f.DistanceMatrix = dmatFiltered;

        pending.Add(f);
    }

    lock (needsCalculating)
    {
        needsCalculating.AddRange(pending);
    }
}
/// <summary>
/// Recalculates the statistics: the basic counts first, then (unless no statistics are
/// requested) the assignment, cluster and summary statistics.
/// </summary>
/// <param name="core">Core</param>
/// <param name="metric">Metric for statistics</param>
/// <param name="vmatrix">Value matrix</param>
/// <param name="dmatrix">Distance matrix (optional - if not present will be calculated if necessary)</param>
/// <param name="statistics">What to calculate</param>
/// <param name="prog">Report progress to</param>
internal void RecalculateStatistics(Core core, ConfigurationMetric metric, IntensityMatrix vmatrix, DistanceMatrix dmatrix, EClustererStatistics statistics, ProgressReporter prog)
{
    // The basic counts are always recorded
    ClustererStatistics[STAT_NUM_VECTORS] = vmatrix.NumRows;
    ClustererStatistics[STAT_LENGTH_OF_VECTORS] = vmatrix.NumCols;

    // Nothing further was asked for
    if (statistics == EClustererStatistics.None)
    {
        return;
    }

    // Get the non-insig clusters
    Cluster[] realClusters = RealClusters.ToArray();

    // If we don't have a DMatrix we should calculate the sil. width manually
    // The DMatrix might be too big to pass to R so its better just to avoid it.
    prog.Enter("Calculating statistics");

    // The first entry (null) stands for "no filter"
    List<ObsFilter> groupFilters = new List<ObsFilter> { null };

    if (!vmatrix.HasSplitGroups)
    {
        bool includeDefinedFilters = statistics.HasFlag(EClustererStatistics.IncludePartialVectorsForFilters);
        bool includeGroupFilters = statistics.HasFlag(EClustererStatistics.IncludePartialVectorsForGroups);

        // Defined filters
        if (includeDefinedFilters)
        {
            groupFilters.AddRange(core.ObsFilters);
        }

        // Group filters (if not already)
        if (includeGroupFilters)
        {
            AllGroupsFilters(core, groupFilters);
        }
    }

    // INPUT VECTORS: work out what needs calculating, one filter per task
    List<ForStat> needsCalculating = new List<ForStat>();

    prog.Enter("Input vectors");
    ProgressParallelHandler inputProgress = prog.CreateParallelHandler(groupFilters.Count);
    Parallel.ForEach(
        groupFilters,
        obsFilter => Thread_AddFilterToCalculationList(core, metric, vmatrix, dmatrix, statistics, realClusters, obsFilter, needsCalculating, inputProgress));
    prog.Leave();

    // ASSIGNMENT STATS
    prog.Enter("Assignments");
    ProgressParallelHandler assignmentProgress = prog.CreateParallelHandler(needsCalculating.Count);
    Parallel.ForEach(
        needsCalculating,
        forStat => Thread_CalculateAssignmentStatistics(statistics, forStat, realClusters, metric, assignmentProgress));
    prog.Leave();

    // CLUSTER STATS
    prog.Enter("Clusters");
    ProgressParallelHandler clusterProgress = prog.CreateParallelHandler(this.Clusters.Length);
    Parallel.ForEach(
        this.Clusters,
        cluster => Thread_CalculateClusterStatistics(core, statistics, cluster, clusterProgress));
    prog.Leave();

    // SUMMARY STATS
    prog.Enter("Summary");
    CalculateSummaryStatistics(core, statistics, realClusters);
    prog.Leave();

    // Closes "Calculating statistics"
    prog.Leave();
}
/// <summary>
/// Assigns peaks to clusters
/// A single k-means iteration: sets the centre of every cluster, clears the existing
/// assignments, then assigns every vector to the cluster with the closest centre.
/// </summary>
/// <param name="vmatrix">Vectors to assign.</param>
/// <param name="toChoose">Clusters to assign to.</param>
/// <param name="source">What the cluster centres are calculated from.</param>
/// <param name="distanceMetric">Metric used to find the closest cluster.</param>
/// <param name="prog">Progress is reported here.</param>
/// <returns>
/// True if any vector ended up in a cluster it was not previously assigned to
/// (i.e. the algorithm has not yet converged), false otherwise.
/// </returns>
public static bool Assign(IntensityMatrix vmatrix, IReadOnlyList<Cluster> toChoose, ECandidateMode source, ConfigurationMetric distanceMetric, ProgressReporter prog)
{
    // Get the current cluster centres
    prog.SetProgress(0, vmatrix.NumRows);

    for (int index = 0; index < toChoose.Count; index++)
    {
        prog.SetProgress(index, toChoose.Count);
        toChoose[index].SetCentre(ECentreMode.Average, source);
    }

    // Remember, then clear, the previous assignments.
    // A set is used so that the membership test made for every vector below is a hash lookup
    // rather than a scan of the cluster's whole previous assignment list.
    Dictionary<Cluster, HashSet<Vector>> previousAssignments = new Dictionary<Cluster, HashSet<Vector>>();

    for (int index = 0; index < toChoose.Count; index++)
    {
        Cluster cluster = toChoose[index];
        previousAssignments.Add(cluster, new HashSet<Vector>(cluster.Assignments.Vectors));
        cluster.Assignments.ClearAll();
    }

    // Detect changes so we know when the algorithm has converged
    bool somethingChanged = false;

    // Assign peaks to centres
    for (int index = 0; index < vmatrix.NumRows; index++)
    {
        Vector vec = vmatrix.Vectors[index];
        prog.SetProgress(index, vmatrix.NumRows);

        ClusterScore best = FindClosestCluster(vec.Values, toChoose, distanceMetric);

        // Something changed?
        if (!previousAssignments[best.Cluster].Contains(vec))
        {
            somethingChanged = true;
        }

        // Create new assignment
        best.Cluster.Assignments.Add(new Assignment(vec, best.Cluster, best.Score));
    }

    return somethingChanged;
}
/// <summary>
/// K-means centering: repeats the assignment step until an iteration changes nothing.
/// Each iteration is reported to <paramref name="prog"/> as its own section.
/// </summary>
/// <param name="vmatrix">Vectors to assign.</param>
/// <param name="toChoose">Clusters to assign to.</param>
/// <param name="metric">Metric used when assigning.</param>
/// <param name="prog">Progress is reported here.</param>
public static void PerformKMeansCentering(IntensityMatrix vmatrix, IReadOnlyList<Cluster> toChoose, ConfigurationMetric metric, ProgressReporter prog)
{
    int iteration = 0;
    bool changed;

    do
    {
        iteration++;

        prog.Enter("k-means (iteration " + iteration + ")");
        changed = Assign(vmatrix, toChoose, ECandidateMode.Assignments, metric, prog);
        prog.Leave();
    }
    while (changed);
}
/// <summary>
/// Finds the cluster with the best (lowest) score for a vector, scoring each candidate
/// against its closest centre. Only the clusters in <paramref name="toChoose"/> are
/// considered, so any clusters that must not be assigned to should be left out by the caller.
/// </summary>
/// <param name="vector">Vector to score.</param>
/// <param name="toChoose">Candidate clusters.</param>
/// <param name="distanceMetric">Metric used to calculate the scores.</param>
/// <returns>
/// The best cluster together with its score. The cluster is null and the score NaN
/// if there are no candidates.
/// </returns>
private static ClusterScore FindClosestCluster(IReadOnlyList<double> vector, IEnumerable<Cluster> toChoose, ConfigurationMetric distanceMetric)
{
    double lowestScore = double.NaN;
    Cluster closest = null;

    foreach (Cluster candidate in toChoose)
    {
        double score = candidate.CalculateScore(vector, EDistanceMode.ClosestCentre, distanceMetric);

        // A NaN "lowest" means nothing usable has been found yet, so any candidate replaces it
        bool isBetter = double.IsNaN(lowestScore) || score < lowestScore;

        if (!isBetter)
        {
            continue;
        }

        closest = candidate;
        lowestScore = score;
    }

    return new ClusterScore(closest, lowestScore);
}
/// <summary>
/// Calculates the score for a vector against this cluster.
/// The centres must have been set first by calling [SetCentre].
/// </summary>
/// <param name="vector">Vector to score.</param>
/// <param name="distanceMode">
/// <c>ClosestCentre</c> gives the smallest distance to any centre (NaN if there are no centres);
/// <c>AverageToAllCentres</c> gives the total distance to all centres divided by their number.
/// </param>
/// <param name="distanceMetric">Metric used to calculate the distance to each centre.</param>
/// <returns>The score for the requested mode.</returns>
/// <exception cref="InvalidOperationException"><paramref name="distanceMode"/> is not one of the supported modes.</exception>
public double CalculateScore(IReadOnlyList<double> vector, EDistanceMode distanceMode, ConfigurationMetric distanceMetric)
{
    if (distanceMode == EDistanceMode.ClosestCentre)
    {
        // There is no minimum to take without any centres
        if (this.Centres.Count == 0)
        {
            return double.NaN;
        }

        return this.Centres
                   .Cast<double[]>()
                   .Min(centre => distanceMetric.Calculate(vector, centre));
    }

    if (distanceMode == EDistanceMode.AverageToAllCentres)
    {
        double distanceSum = this.Centres
                                 .Cast<double[]>()
                                 .Sum(centre => distanceMetric.Calculate(vector, centre));

        return distanceSum / this.Centres.Count;
    }

    throw new InvalidOperationException("Invalid switch: " + distanceMode.ToString());
}