/// <summary>
/// Clusters features based on their pairwise distances by finding the minimal spanning tree (MST) via Prim's algorithm.
/// </summary>
/// <param name="potentialDistances">Pairwise distances between all features in question.</param>
/// <param name="clusters">Singleton clusters from each feature.</param>
/// <returns>List of features clustered together.</returns>
public override List<U> LinkFeatures(List<PairwiseDistance<T>> potentialDistances, Dictionary<int, U> clusters)
{
    var newClusters = new List<U>();

    // Edge case: a feature can be part of this batch without having any edge that is
    // within tolerance of another feature. Start with every feature of every incoming
    // cluster, then strike out each feature that appears on a within-tolerance edge.
    // Whatever remains has no usable link and is emitted as its own cluster.
    var unlinkedFeatures = new HashSet<T>();
    foreach (var cluster in clusters.Values)
    {
        foreach (var feature in cluster.Features)
        {
            // HashSet.Add ignores duplicates, so no Contains pre-check is needed.
            unlinkedFeatures.Add(feature);
        }
    }

    foreach (var distance in potentialDistances)
    {
        if (!AreClustersWithinTolerance(distance.FeatureX, distance.FeatureY))
        {
            continue;
        }

        // HashSet.Remove is a no-op when the item is absent.
        unlinkedFeatures.Remove(distance.FeatureX);
        unlinkedFeatures.Remove(distance.FeatureY);
    }

    // Wrap every unlinked feature in a new cluster of its own.
    foreach (var feature in unlinkedFeatures)
    {
        var singleton = new U();
        feature.SetParentFeature(singleton);
        singleton.AddChildFeature(feature);
        newClusters.Add(singleton);
    }

    // Build the graph edges in ascending distance order; the edge ID is its position
    // in that ordering (0-based).
    // NOTE(review): the graph is built from ALL potential distances, not only those that
    // passed the tolerance check above, so a feature emitted as a singleton above can
    // also be a vertex in the graph. Whether CreateClusters reports it again cannot be
    // determined from this method -- confirm against CreateClusters.
    var edges = new List<Edge<T>>(potentialDistances.Count);
    foreach (var distance in potentialDistances.OrderBy(x => x.Distance))
    {
        edges.Add(new Edge<T>(edges.Count, distance.Distance, distance.FeatureX, distance.FeatureY));
    }

    var graph = new FeatureGraph<T>();
    graph.CreateGraph(edges);

    // Candidate start edges, shortest first.
    var queue = new Queue<Edge<T>>(edges);

    // IDs of edges that already belong to a constructed sub-tree, so that each
    // sub-tree is only built once.
    var seenEdge = new HashSet<int>();

    if (DumpLinearRelationship)
    {
        Console.WriteLine("GraphEdgeLength");
    }

    // Build one sub-tree per not-yet-seen start edge and split it into clusters.
    while (queue.Count > 0)
    {
        var startEdge = queue.Dequeue();

        // If we have already seen the edge, ignore it...
        if (seenEdge.Contains(startEdge.ID))
        {
            continue;
        }

        var mstGroup = ConstructSubTree(graph, seenEdge, startEdge);

        foreach (var edge in mstGroup.LinearRelationship)
        {
            seenEdge.Add(edge.ID);

            if (DumpLinearRelationship)
            {
                Console.WriteLine("{0}", edge.Length);
            }
        }

        // The cutoff handed to CreateClusters is NSigma as-is; it is not scaled by any
        // statistic of the edge lengths in this sub-tree.
        var cutoff = NSigma;
        var mstClusters = CreateClusters(mstGroup, cutoff);
        newClusters.AddRange(mstClusters);
    }

    return newClusters;
}