static void PrintKMeans(StreamWriter writer, ClusteringResult result, bool printMembers)
{
    writer.WriteLine("KMeans clusters");
    for (int i = 0; i < result.Clusters.Count; i++)
    {
        var cluster = result.Clusters[i];
        var sumOfId = cluster.Members.Sum(e => e.Member.Id);
        var firstId = "None";
        var lastId = "None";
        if (cluster.Members.Count > 0)
        {
            firstId = cluster.Members[0].Member.Id.ToString();
            lastId = cluster.Members.Last().Member.Id.ToString();
        }
        writer.WriteLine("Cluster {0}: member(s) {1,4} || #{2,5}, first {3,4}, last {4,4}",
            i + 1, cluster.Members.Count, sumOfId, firstId, lastId);
    }
    writer.Write("\r\n\r\n");

    if (printMembers)
    {
        for (int valCluster = 0; valCluster < result.Clusters.Count; valCluster++)
        {
            // use 1-based cluster numbers, consistent with the summary above
            writer.WriteLine("Cluster {0} members:", valCluster + 1);
            for (int valMember = 0; valMember < result.Clusters[valCluster].Members.Count; valMember++)
            {
                writer.WriteLine("Member {0,4}, id {1,3}",
                    valMember, result.Clusters[valCluster].Members[valMember].Member.Id);
            }
            writer.Write("\r\n\r\n");
        }
    }
}
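// Usage sketch for PrintKMeans (illustrative, not part of the original source):
// `result` is any ClusteringResult with the Clusters/Members shape used above;
// the output file name is an assumption.
static void DumpKMeans(ClusteringResult result)
{
    using (var writer = new StreamWriter("kmeans_clusters.txt"))
    {
        PrintKMeans(writer, result, printMembers: true);
    }
}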
private void RunClusteringAndGraph()
{
    if (chartDataSource != null)
    {
        Cluster cluster;
        distanceMetric = DistanceMetric(currentDistanceMatrix);
        try
        {
            clusterResult = ClusterCalculate();
        }
        catch (InvalidOperationException)
        {
            MessageBox.Show("Please try again.");
            return;
        }

        // assign each data point to its cluster and redraw the scatter plot
        foreach (var dataPoint in chartDataSource)
        {
            cluster = clusterResult.FindCluster(dataPoint.Origin);
            if (cluster != null)
            {
                dataPoint.Group = string.Format("Cluster {0}", cluster.Id);
            }
        }
        chartDataSource = chartDataSource.OrderBy(item => item.Group).ToList();
        scatterPlotControl1.BuildScatterPlot(chartDataSource);
    }
}
private static void GetBestPartition(
    ClusteringResult<DataPoint> clustering,
    IInternalEvaluationCriterion<DataPoint> criterion,
    string criterionName)
{
    // gets coefficients for all cluster-sets
    var evals = clustering.EvaluateClustering(criterion);

    // saves cluster-set indexes to a CSV file
    SaveToCsv(evals, Path.GetFullPath(Path.Combine(RESULTS_PATH, $"{criterionName}.csv")), criterionName);

    // gets the maximum coefficient
    var maxEval = new ClusterSetEvaluation<DataPoint>(null, double.MinValue);
    foreach (var eval in evals)
    {
        if (eval.EvaluationValue > maxEval.EvaluationValue)
        {
            maxEval = eval;
        }
    }

    // prints cluster-set info
    Console.WriteLine("======================================");
    Console.WriteLine($"Max {criterionName}: {maxEval.EvaluationValue:0.00}");
    if (maxEval.ClusterSet == null)
    {
        return;
    }
    Console.WriteLine(
        $"Clusters at distance: {maxEval.ClusterSet.Dissimilarity:0.00} ({maxEval.ClusterSet.Count})");
    foreach (var cluster in maxEval.ClusterSet)
    {
        Console.WriteLine($" - {cluster}");
    }
}
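// Usage sketch for GetBestPartition (illustrative, not part of the original
// source): assumes an Aglomera-style internal criterion such as
// SilhouetteCoefficient<DataPoint> built over the same dissimilarity metric
// used for clustering; `metric` and `dataPoints` are assumed to exist in scope.
private static void EvaluateBestPartition(IDissimilarityMetric<DataPoint> metric, ISet<DataPoint> dataPoints)
{
    var linkage = new AverageLinkage<DataPoint>(metric);
    var clustering = new AgglomerativeClusteringAlgorithm<DataPoint>(linkage).GetClustering(dataPoints);
    GetBestPartition(clustering, new SilhouetteCoefficient<DataPoint>(metric), "silhouette");
}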
/// <summary>
///     Saves the given <see cref="ClusteringResult{TInstance}" /> to a d3.js dendrogram file.
/// </summary>
/// <typeparam name="TInstance">The type of instance considered.</typeparam>
/// <param name="clustering">The clustering result to be saved to a dendrogram file.</param>
/// <param name="filePath">The path to the file in which to save the clustering dendrogram.</param>
/// <param name="printNames">Whether to include clusters' string representation in their nodes.</param>
/// <param name="formatting">The JSON file formatting.</param>
public static void SaveD3DendrogramFile<TInstance>(
    this ClusteringResult<TInstance> clustering, string filePath, bool printNames = true,
    Formatting formatting = Formatting.None)
    where TInstance : IComparable<TInstance>
{
    using (var fs = File.Create(filePath))
    using (var sw = new StreamWriter(fs, Encoding.UTF8))
    {
        var writer = new JsonTextWriter(sw) { Formatting = formatting };
        WriteJson(clustering.SingleCluster, writer, printNames);
    }
}
public static string GetDendrogramJson<TInstance>(
    this ClusteringResult<TInstance> clustering, bool printNames = true,
    Formatting formatting = Formatting.None)
    where TInstance : IComparable<TInstance>
{
    var sb = new StringBuilder();
    var sbw = new StringWriter(sb);
    var writer = new JsonTextWriter(sbw) { Formatting = formatting };
    WriteJson(clustering.SingleCluster, writer, printNames);
    return sb.ToString();
}
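// Usage sketch for the two dendrogram helpers above (illustrative): `clustering`
// is any ClusteringResult<DataPoint> produced elsewhere; the file name is an
// assumption.
private static void ExportDendrogram(ClusteringResult<DataPoint> clustering)
{
    clustering.SaveD3DendrogramFile("dendrogram.json", formatting: Formatting.Indented);
    Console.WriteLine(clustering.GetDendrogramJson(printNames: false));
}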
private void ClusteringWorkerDoWork(object sender, DoWorkEventArgs e)
{
    // checks data points
    if (this._dataPoints == null || this._dataPoints.Count == 0)
    {
        return;
    }

    // selects linkage criterion (e.Argument is boxed, so cast it to int first)
    ILinkageCriterion<DataPoint> linkage;
    var selectedIndex = (int)e.Argument;
    switch (selectedIndex)
    {
        case 1:
            linkage = new CompleteLinkage<DataPoint>(this._dissimilarityMetric);
            break;
        case 2:
            linkage = new SingleLinkage<DataPoint>(this._dissimilarityMetric);
            break;
        case 3:
            linkage = new MinimumEnergyLinkage<DataPoint>(this._dissimilarityMetric);
            break;
        case 4:
            linkage = new CentroidLinkage<DataPoint>(this._dissimilarityMetric, DataPoint.GetMedoid);
            break;
        case 5:
            linkage = new WardsMinimumVarianceLinkage<DataPoint>(
                this._dissimilarityMetric, DataPoint.GetMedoid);
            break;
        default:
            linkage = new AverageLinkage<DataPoint>(this._dissimilarityMetric);
            break;
    }

    // clusters data-points
    var clusteringAlg = new AgglomerativeClusteringAlgorithm<DataPoint>(linkage);
    this._clusteringResult = clusteringAlg.GetClustering(this._dataPoints);
}
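// Sketch of how the worker above might be started (illustrative): the linkage
// index is passed as the DoWork argument; `clusteringWorker` is assumed to be a
// System.ComponentModel.BackgroundWorker wired to ClusteringWorkerDoWork, and
// `linkageComboBox` is an assumed UI control listing the linkage criteria.
private void ClusterButtonClick(object sender, EventArgs e)
{
    this.clusteringWorker.RunWorkerAsync(this.linkageComboBox.SelectedIndex);
}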
private void LoadDataSet()
{
    // loads data-points
    var parser = new CsvParser();
    this._dataPoints = parser.Load(Path.GetFullPath(this.openFileDialog.FileName));

    // clears series
    this._chartDataPoints.Clear();
    this.ChartPoints.Clear();

    // adds points to series
    var maxX = double.MinValue;
    var minX = double.MaxValue;
    foreach (var dataPoint in this._dataPoints)
    {
        var chartDataPoint = new ChartDataPoint(dataPoint.Value[0], dataPoint.Value[1]) { Label = dataPoint.ID };
        this.ChartPoints.Add(chartDataPoint);
        this._chartDataPoints.Add(dataPoint, chartDataPoint);
        maxX = Math.Max(maxX, dataPoint.Value[0]);
        minX = Math.Min(minX, dataPoint.Value[0]);
    }

    // resets
    this._numClusters = int.MinValue;
    this._clusteringResult = null;
    this._dissimilarityMetric = new CachedDissimilarityMetric<DataPoint>(new DataPoint(), this._dataPoints);

    // adjusts track-bar according to the number of data points
    // (SmallChange is an int property, so cast the quotient to int)
    this.numClustersTrackBar.SmallChange =
        (int)(this.numClustersTrackBar.Maximum / this._dataPoints.Count);
    this.numClustersTrackBar.LargeChange = this.numClustersTrackBar.SmallChange * 5;
    this.datasetChart.ChartAreas[0].AxisX.Maximum = Math.Ceiling(maxX);
    this.datasetChart.ChartAreas[0].AxisX.Minimum = Math.Floor(minX);
}
public void Train(IUnlabeledExampleCollection<SparseVector<double>> dataset, ClusteringResult hierarchy)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
    Utils.ThrowException(hierarchy == null ? new ArgumentNullException("hierarchy") : null);
    Utils.ThrowException(hierarchy.Roots.Count == 0 ? new ArgumentValueException("hierarchy") : null);
    mModel = new Dictionary<Cluster, ClusterInfo>();
    mDataset = dataset;
    foreach (Cluster root in hierarchy.Roots)
    {
        ComputeCentroid(root);
    }
    mDataset = null;
}
void IHierarchicalModel.Train(IUnlabeledExampleCollection dataset, ClusteringResult hierarchy)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(!(dataset is IUnlabeledExampleCollection<SparseVector<double>>)
        ? new ArgumentTypeException("dataset") : null);
    Train((IUnlabeledExampleCollection<SparseVector<double>>)dataset, hierarchy); // throws ArgumentNullException, ArgumentValueException
}
static void Main(string[] args)
{
    // Get the stop words and stemmer for English.
    IStemmer stemmer;
    Set<string>.ReadOnly stopWords;
    TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer);

    // Create a tokenizer.
    UnicodeTokenizer tokenizer = new UnicodeTokenizer();
    tokenizer.MinTokenLen = 2;                      // each token must be at least 2 characters long
    tokenizer.Filter = TokenizerFilter.AlphaStrict; // tokens can consist of alphabetic characters only

    // Load a document corpus from a file. Each line in the file
    // represents one document.
    string[] docs = File.ReadAllLines("..\\..\\Data\\YahooFinance.txt");

    // Create a bag-of-words space.
    BowSpace bowSpc = new BowSpace();
    bowSpc.Tokenizer = tokenizer;   // assign the tokenizer
    bowSpc.StopWords = stopWords;   // assign the stop words
    bowSpc.Stemmer = stemmer;       // assign the stemmer
    bowSpc.MinWordFreq = 3;         // a term must appear at least 3 times in the
                                    // corpus for it to be part of the vocabulary
    bowSpc.MaxNGramLen = 3;         // terms consisting of at most 3 consecutive
                                    // words will be considered
    bowSpc.WordWeightType = WordWeightType.TfIdf; // set the weighting scheme for
                                    // the bag-of-words vectors to TF-IDF
    bowSpc.NormalizeVectors = true; // the TF-IDF vectors will be normalized
    bowSpc.CutLowWeightsPerc = 0.2; // the terms with the lowest weights, summing
                                    // up to 20% of the overall weight sum, will
                                    // be removed from each TF-IDF vector
    bowSpc.Initialize(docs);        // initialize the BOW space

    // Compute 100 clusters of documents.
    KMeansFast kMeans = new KMeansFast(100); // set k to 100
    kMeans.Trials = 3;   // perform 3 repetitions; take the best result
    kMeans.Eps = 0.001;  // stop iterating when the partition quality
                         // increases by less than 0.001
    ClusteringResult cr = kMeans.Cluster(bowSpc); // execute

    // Extract the top 5 terms with the highest TF-IDF weights
    // from each of the clusters' centroids and output the
    // number of documents (companies) in each cluster.
    foreach (Cluster cl in cr.Roots)
    {
        SparseVector<double>.ReadOnly centroid = cl.ComputeCentroid(bowSpc, CentroidType.NrmL2);
        Console.Write(bowSpc.GetKeywordsStr(centroid, 5));
        Console.WriteLine(" ({0} companies)", cl.Items.Count);
    }

    // Output the documents that are contained in the first cluster.
    foreach (int docIdx in cr.Roots[0].Items)
    {
        Console.WriteLine(docs[docIdx]);
    }
}
public Vector2D[] ComputeLayout(LayoutSettings settings)
{
    UnlabeledDataset<SparseVector<double>> dataset = new UnlabeledDataset<SparseVector<double>>(mDataset);

    // clustering
    mLogger.Info("ComputeLayout", "Clustering ...");
    KMeansFast kMeans = new KMeansFast(mKClust);
    kMeans.Eps = mKMeansEps;
    kMeans.Random = mRandom;
    kMeans.Trials = 1;
    ClusteringResult clustering = kMeans.Cluster(mDataset); // throws ArgumentValueException

    // determine reference instances
    UnlabeledDataset<SparseVector<double>> dsRefInst = new UnlabeledDataset<SparseVector<double>>();
    foreach (Cluster cluster in clustering.Roots)
    {
        SparseVector<double> centroid = cluster.Items.Count > 0
            ? cluster.ComputeCentroid(mDataset, CentroidType.NrmL2)
            : new SparseVector<double>();
        dsRefInst.Add(centroid); // dataset of reference instances
        dataset.Add(centroid);   // add centroids to the main dataset
    }

    // position reference instances
    mLogger.Info("ComputeLayout", "Positioning reference instances ...");
    SparseMatrix<double> simMtx = ModelUtils.GetDotProductSimilarity(dsRefInst, mSimThresh, /*fullMatrix=*/ false);
    StressMajorizationLayout sm = new StressMajorizationLayout(dsRefInst.Count, new DistFunc(simMtx));
    sm.Random = mRandom;
    Vector2D[] centrPos = sm.ComputeLayout();

    // k-NN
    mLogger.Info("ComputeLayout", "Computing similarities ...");
    simMtx = ModelUtils.GetDotProductSimilarity(dataset, mSimThresh, /*fullMatrix=*/ true);
    mLogger.Info("ComputeLayout", "Constructing system of linear equations ...");
    LabeledDataset<double, SparseVector<double>> lsqrDs = new LabeledDataset<double, SparseVector<double>>();
    foreach (IdxDat<SparseVector<double>> simMtxRow in simMtx)
    {
        if (simMtxRow.Dat.Count <= 1)
        {
            mLogger.Warn("ComputeLayout", "Instance #{0} has no neighborhood.", simMtxRow.Idx);
        }
        ArrayList<KeyDat<double, int>> knn = new ArrayList<KeyDat<double, int>>(simMtxRow.Dat.Count);
        foreach (IdxDat<double> item in simMtxRow.Dat)
        {
            if (item.Idx != simMtxRow.Idx)
            {
                knn.Add(new KeyDat<double, int>(item.Dat, item.Idx));
            }
        }
        knn.Sort(DescSort<KeyDat<double, int>>.Instance);
        int count = Math.Min(knn.Count, mKNN);
        SparseVector<double> eq = new SparseVector<double>();
        double wgt = 1.0 / (double)count;
        for (int i = 0; i < count; i++)
        {
            eq.InnerIdx.Add(knn[i].Dat);
            eq.InnerDat.Add(-wgt);
        }
        eq.InnerIdx.Sort(); // *** sort only indices
        eq[simMtxRow.Idx] = 1;
        lsqrDs.Add(0, eq);
    }

    Vector2D[] layout = new Vector2D[dataset.Count - mKClust];
    for (int i = dataset.Count - mKClust, j = 0; i < dataset.Count; i++, j++)
    {
        SparseVector<double> eq = new SparseVector<double>(new IdxDat<double>[] { new IdxDat<double>(i, 1) });
        lsqrDs.Add(centrPos[j].X, eq);
    }
    LSqrModel lsqr = new LSqrModel();
    lsqr.Train(lsqrDs);
    for (int i = 0; i < layout.Length; i++)
    {
        layout[i].X = lsqr.Solution[i];
    }
    for (int i = lsqrDs.Count - mKClust, j = 0; i < lsqrDs.Count; i++, j++)
    {
        lsqrDs[i].Label = centrPos[j].Y;
    }
    lsqr.Train(lsqrDs);
    for (int i = 0; i < layout.Length; i++)
    {
        layout[i].Y = lsqr.Solution[i];
    }
    return settings == null ? layout : settings.AdjustLayout(layout);
}
internal void AddMathSet(IMathSet set)
{
    this.m_mathSet = set;
    this.m_cResult = null;
}
public void AddMathSet(IMathSet set)
{
    this.m_mathSet = set;
    this.m_cResult = null;
}
//private double GetQual()
//{
//    double clustQual = 0;
//    foreach (Centroid centroid in mCentroids)
//    {
//        foreach (int itemIdx in centroid.CurrentItems)
//        {
//            clustQual += centroid.GetDotProduct(mDataset[itemIdx]);
//        }
//    }
//    clustQual /= (double)mDataset.Count;
//    return clustQual;
//}

// TODO: exceptions
public ClusteringResult Update(int dequeueN, IEnumerable<SparseVector<double>> addList, ref int iter)
{
    StopWatch stopWatch = new StopWatch();

    // update centroid data (1)
    foreach (CentroidData centroid in mCentroids)
    {
        foreach (int item in centroid.CurrentItems)
        {
            if (item >= dequeueN)
            {
                centroid.Items.Add(item);
            }
        }
        centroid.Update(mDataset);
        centroid.UpdateCentroidLen();
    }
    //Console.WriteLine(">>> {0} >>> update centroid data (1)", stopWatch.TotalMilliseconds);
    stopWatch.Reset();

    // update dataset
    mDataset.RemoveRange(0, dequeueN);
    int ofs = mDataset.Count;
    mDataset.AddRange(addList);
    //Console.WriteLine(">>> {0} >>> update dataset", stopWatch.TotalMilliseconds);
    stopWatch.Reset();

    // update centroid data (2)
    foreach (CentroidData centroid in mCentroids)
    {
        Set<int> itemsOfs = new Set<int>();
        foreach (int item in centroid.CurrentItems)
        {
            itemsOfs.Add(item - dequeueN);
        }
        centroid.CurrentItems.Inner.SetItems(itemsOfs);
        centroid.Items.SetItems(itemsOfs);
    }
    //Console.WriteLine(">>> {0} >>> update centroid data (2)", stopWatch.TotalMilliseconds);
    stopWatch.Reset();

    // assign new instances
    double bestClustQual = 0;
    {
        mLogger.Info("Update", "Initializing ...");
        int i = 0;
        foreach (SparseVector<double> example in addList)
        {
            double maxSim = double.MinValue;
            ArrayList<int> candidates = new ArrayList<int>();
            for (int j = 0; j < mK; j++)
            {
                double sim = mCentroids[j].GetDotProduct(example);
                if (sim > maxSim)
                {
                    maxSim = sim;
                    candidates.Clear();
                    candidates.Add(j);
                }
                else if (sim == maxSim)
                {
                    candidates.Add(j);
                }
            }
            if (candidates.Count > 1)
            {
                candidates.Shuffle(mRnd);
            }
            if (candidates.Count > 0) // *** is this always true?
            {
                mCentroids[candidates[0]].Items.Add(ofs + i);
            }
            i++;
        }
        // update centroids
        foreach (CentroidData centroid in mCentroids)
        {
            centroid.Update(mDataset);
            centroid.UpdateCentroidLen();
        }
        //Console.WriteLine(GetQual());
        foreach (CentroidData centroid in mCentroids)
        {
            foreach (int itemIdx in centroid.CurrentItems)
            {
                bestClustQual += centroid.GetDotProduct(mDataset[itemIdx]);
            }
        }
        bestClustQual /= (double)mDataset.Count;
        mLogger.Info("Update", "Quality: {0:0.0000}", bestClustQual);
    }
    //Console.WriteLine(">>> {0} >>> assign new instances", stopWatch.TotalMilliseconds);
    stopWatch.Reset();

    // main k-means loop
    iter = 0;
    while (true)
    {
        iter++;
        mLogger.Info("Update", "Iteration {0} ...", iter);
        // assign items to clusters
        for (int i = 0; i < mDataset.Count; i++)
        {
            SparseVector<double> example = mDataset[i];
            double maxSim = double.MinValue;
            ArrayList<int> candidates = new ArrayList<int>();
            for (int j = 0; j < mK; j++)
            {
                double sim = mCentroids[j].GetDotProduct(example);
                if (sim > maxSim)
                {
                    maxSim = sim;
                    candidates.Clear();
                    candidates.Add(j);
                }
                else if (sim == maxSim)
                {
                    candidates.Add(j);
                }
            }
            if (candidates.Count > 1)
            {
                candidates.Shuffle(mRnd);
            }
            if (candidates.Count > 0) // *** is this always true?
            {
                mCentroids[candidates[0]].Items.Add(i);
            }
        }
        // *** OPTIMIZE THIS with GetDotProductSimilarity (see this.Cluster) !!! ***
        //Console.WriteLine(">>> {0} >>> loop: assign items to clusters", stopWatch.TotalMilliseconds);
        stopWatch.Reset();
        double clustQual = 0;
        // update centroids
        foreach (CentroidData centroid in mCentroids)
        {
            centroid.Update(mDataset);
            centroid.UpdateCentroidLen();
        }
        //Console.WriteLine(GetQual());
        foreach (CentroidData centroid in mCentroids)
        {
            foreach (int itemIdx in centroid.CurrentItems)
            {
                clustQual += centroid.GetDotProduct(mDataset[itemIdx]);
            }
        }
        clustQual /= (double)mDataset.Count;
        //Console.WriteLine(">>> {0} >>> loop: update centroids", stopWatch.TotalMilliseconds);
        stopWatch.Reset();
        mLogger.Info("Update", "Quality: {0:0.0000} Diff: {1:0.0000}", clustQual, clustQual - bestClustQual);
        // check if done
        if (clustQual - bestClustQual <= mEps)
        {
            break;
        }
        bestClustQual = clustQual;
    }

    // save the result
    ClusteringResult clustering = new ClusteringResult();
    for (int i = 0; i < mK; i++)
    {
        clustering.AddRoot(new Cluster());
        clustering.Roots.Last.Items.AddRange(mCentroids[i].Items);
    }
    return clustering;
}
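// Usage sketch for Update (illustrative, not part of the original source): one
// sliding-window step that drops the 100 oldest documents and enqueues `newDocs`.
// Assumes the Update method above belongs to LATINO's IncrementalKMeans (the
// class instantiated in the layout snippet below); `newDocs` is assumed.
static ClusteringResult SlideWindow(IncrementalKMeans clusterer, IEnumerable<SparseVector<double>> newDocs)
{
    int numIter = 0;
    ClusteringResult result = clusterer.Update(100, newDocs, ref numIter);
    Console.WriteLine("converged after {0} iterations", numIter);
    return result;
}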
public void AddClusteringResult(ClusteringResult set)
{
    this.m_cResult = set;
}
public ClusteringResult Cluster(IUnlabeledExampleCollection<SparseVector<double>> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count < mK ? new ArgumentValueException("dataset") : null);
    mDataset = new UnlabeledDataset<SparseVector<double>>(dataset);
    ClusteringResult clustering = null;
    double globalBestClustQual = 0;
    for (int trial = 1; trial <= mTrials; trial++)
    {
        mLogger.Info("Cluster", "Clustering trial {0} of {1} ...", trial, mTrials);
        ArrayList<CentroidData> centroids = new ArrayList<CentroidData>(mK);
        ArrayList<int> bestSeeds = null;
        for (int i = 0; i < mK; i++)
        {
            centroids.Add(new CentroidData());
        }

        // select seed items
        double minSim = double.MaxValue;
        ArrayList<int> tmp = new ArrayList<int>(mDataset.Count);
        for (int i = 0; i < mDataset.Count; i++)
        {
            tmp.Add(i);
        }
        for (int k = 0; k < 3; k++)
        {
            ArrayList<SparseVector<double>> seeds = new ArrayList<SparseVector<double>>(mK);
            tmp.Shuffle(mRnd);
            for (int i = 0; i < mK; i++)
            {
                seeds.Add(mDataset[tmp[i]]);
            }
            // assess quality of seed items
            double simAvg = 0;
            foreach (SparseVector<double> seed1 in seeds)
            {
                foreach (SparseVector<double> seed2 in seeds)
                {
                    if (seed1 != seed2)
                    {
                        simAvg += DotProductSimilarity.Instance.GetSimilarity(seed1, seed2);
                    }
                }
            }
            simAvg /= (double)(mK * mK - mK);
            //Console.WriteLine(simAvg);
            if (simAvg < minSim)
            {
                minSim = simAvg;
                bestSeeds = new ArrayList<int>(mK);
                for (int i = 0; i < mK; i++)
                {
                    bestSeeds.Add(tmp[i]);
                }
            }
        }
        ArrayList<KeyDat<double, int>> medoids = new ArrayList<KeyDat<double, int>>(mK);
        for (int i = 0; i < mK; i++)
        {
            centroids[i].Items.Add(bestSeeds[i]);
            centroids[i].Update(mDataset);
            centroids[i].UpdateCentroidLen();
            medoids.Add(new KeyDat<double, int>(-1, bestSeeds[i]));
        }
        double[,] dotProd = new double[mDataset.Count, mK];
        SparseMatrix<double> dsMat = ModelUtils.GetTransposedMatrix(mDataset);

        // main loop
        int iter = 0;
        double bestClustQual = 0;
        double clustQual;
        while (true)
        {
            iter++;
            mLogger.Info("Cluster", "Iteration {0} ...", iter);
            clustQual = 0;
            // assign items to clusters
            //StopWatch stopWatch = new StopWatch();
            int j = 0;
            foreach (CentroidData cen in centroids)
            {
                SparseVector<double> cenVec = cen.GetSparseVector();
                double[] dotProdSimVec = ModelUtils.GetDotProductSimilarity(dsMat, mDataset.Count, cenVec);
                for (int i = 0; i < dotProdSimVec.Length; i++)
                {
                    if (dotProdSimVec[i] > 0)
                    {
                        dotProd[i, j] = dotProdSimVec[i];
                    }
                }
                j++;
            }
            for (int dsInstIdx = 0; dsInstIdx < mDataset.Count; dsInstIdx++)
            {
                double maxSim = double.MinValue;
                ArrayList<int> candidates = new ArrayList<int>();
                for (int cenIdx = 0; cenIdx < mK; cenIdx++)
                {
                    double sim = dotProd[dsInstIdx, cenIdx];
                    if (sim > maxSim)
                    {
                        maxSim = sim;
                        candidates.Clear();
                        candidates.Add(cenIdx);
                    }
                    else if (sim == maxSim)
                    {
                        candidates.Add(cenIdx);
                    }
                }
                if (candidates.Count > 1)
                {
                    candidates.Shuffle(mRnd);
                }
                if (candidates.Count > 0) // *** is this always true?
                {
                    centroids[candidates[0]].Items.Add(dsInstIdx);
                    clustQual += maxSim;
                    if (medoids[candidates[0]].Key < maxSim)
                    {
                        medoids[candidates[0]] = new KeyDat<double, int>(maxSim, dsInstIdx);
                    }
                }
            }
            //Console.WriteLine(stopWatch.TotalMilliseconds);
            clustQual /= (double)mDataset.Count;
            mLogger.Info("Cluster", "Quality: {0:0.0000}", clustQual);
            // compute new centroids
            for (int i = 0; i < mK; i++)
            {
                centroids[i].Update(mDataset);
                centroids[i].UpdateCentroidLen();
            }
            // check if done
            if (iter > 1 && clustQual - bestClustQual <= mEps)
            {
                break;
            }
            bestClustQual = clustQual;
            for (int i = 0; i < medoids.Count; i++)
            {
                medoids[i] = new KeyDat<double, int>(-1, medoids[i].Dat);
            }
        }
        if (trial == 1 || clustQual > globalBestClustQual)
        {
            globalBestClustQual = clustQual;
            mCentroids = centroids;
            mMedoids = medoids;
            // save the result
            clustering = new ClusteringResult();
            for (int i = 0; i < mK; i++)
            {
                clustering.AddRoot(new Cluster());
                clustering.Roots.Last.Items.AddRange(centroids[i].Items);
            }
        }
    }
    return clustering;
}
public Vector2D[] ComputeLayout(LayoutSettings settings)
{
    // clustering
    mLogger.Info("ComputeLayout", "Clustering ...");
    mKMeans = new IncrementalKMeans(mKClust);
    mKMeans.Eps = mKMeansEps;
    mKMeans.Random = mRandom;
    mKMeans.Trials = 3;
    ClusteringResult clustering = mKMeans.Cluster(mDataset); // throws ArgumentValueException

    // determine reference instances
    UnlabeledDataset<SparseVector<double>> dsRefInst = new UnlabeledDataset<SparseVector<double>>();
    foreach (SparseVector<double> centroid in mKMeans.GetCentroids())
    {
        dsRefInst.Add(centroid); // dataset of reference instances
        mDataset.Add(centroid);  // add centroids to the main dataset
    }

    // position reference instances
    mLogger.Info("ComputeLayout", "Positioning reference instances ...");
    SparseMatrix<double> simMtx = ModelUtils.GetDotProductSimilarity(dsRefInst, mSimThresh, /*fullMatrix=*/ false);
    StressMajorizationLayout sm = new StressMajorizationLayout(dsRefInst.Count, new DistFunc(simMtx));
    sm.Random = mRandom;
    sm.MaxSteps = int.MaxValue;
    sm.MinDiff = 0.00001;
    mRefPos = sm.ComputeLayout();

    // k-NN
    mLogger.Info("ComputeLayout", "Computing similarities ...");
    simMtx = ModelUtils.GetDotProductSimilarity(mDataset, mSimThresh, /*fullMatrix=*/ true);
    mLogger.Info("ComputeLayout", "Constructing system of linear equations ...");
    LabeledDataset<double, SparseVector<double>> lsqrDs = new LabeledDataset<double, SparseVector<double>>();
    mPatches = new ArrayList<Patch>(mDataset.Count);
    for (int i = 0; i < mDataset.Count; i++)
    {
        mPatches.Add(new Patch(i));
    }
    foreach (IdxDat<SparseVector<double>> simMtxRow in simMtx)
    {
        if (simMtxRow.Dat.Count <= 1)
        {
            mLogger.Warn("ComputeLayout", "Instance #{0} has no neighborhood.", simMtxRow.Idx);
        }
        ArrayList<KeyDat<double, int>> knn = new ArrayList<KeyDat<double, int>>(simMtxRow.Dat.Count);
        foreach (IdxDat<double> item in simMtxRow.Dat)
        {
            if (item.Idx != simMtxRow.Idx)
            {
                knn.Add(new KeyDat<double, int>(item.Dat, item.Idx));
            }
        }
        knn.Sort(DescSort<KeyDat<double, int>>.Instance);
        int count = Math.Min(knn.Count, mKNnExt);
        for (int i = 0; i < count; i++)
        {
            mPatches[simMtxRow.Idx].List.Add(new KeyDat<double, Patch>(knn[i].Key, mPatches[knn[i].Dat]));
        }
        mPatches[simMtxRow.Idx].ProcessList();
        count = Math.Min(knn.Count, mKNn);
        SparseVector<double> eq = new SparseVector<double>();
        double wgt = 1.0 / (double)count;
        for (int i = 0; i < count; i++)
        {
            eq.InnerIdx.Add(knn[i].Dat);
            eq.InnerDat.Add(-wgt);
        }
        eq.InnerIdx.Sort(); // *** sort only indices
        eq[simMtxRow.Idx] = 1;
        lsqrDs.Add(0, eq);
    }

    Vector2D[] layout = new Vector2D[mDataset.Count - mKClust];
    for (int i = mDataset.Count - mKClust, j = 0; i < mDataset.Count; i++, j++)
    {
        SparseVector<double> eq = new SparseVector<double>(new IdxDat<double>[] { new IdxDat<double>(i, 1) });
        lsqrDs.Add(mRefPos[j].X, eq);
    }
    LSqrModel lsqr = new LSqrModel();
    lsqr.Train(lsqrDs);
    mSolX = lsqr.Solution.GetWritableCopy();
    for (int i = 0; i < layout.Length; i++)
    {
        layout[i].X = lsqr.Solution[i];
    }
    for (int i = lsqrDs.Count - mKClust, j = 0; i < lsqrDs.Count; i++, j++)
    {
        lsqrDs[i].Label = mRefPos[j].Y;
    }
    lsqr.Train(lsqrDs);
    mSolY = lsqr.Solution.GetWritableCopy();
    for (int i = 0; i < layout.Length; i++)
    {
        layout[i].Y = lsqr.Solution[i];
    }
    return settings == null ? layout : settings.AdjustLayout(layout);
}
internal void AddClusteringResult(ClusteringResult set)
{
    this.m_cResult = set;
}
public static void Main(string[] args)
{
    Plot generatedDataPlot = new Plot();
    Spawner spawner = new Spawner(STD_DEV);
    List<PointF> allPoints = new List<PointF>();
    for (int i = 0; i < CLUSTER_COUNT; ++i)
    {
        spawner.ResetCenter(MIN_CENTER_DISTANCE, MAX_CENTER_DISTANCE);
        PointF[] points = spawner.Spawn(POINT_COUNT);
        allPoints.AddRange(points);
        Color color = generatedDataPlot.GetNextColor();
        generatedDataPlot.AddScatterPoints(points, color, label: $"Points {i + 1}");
        generatedDataPlot.AddPoint(spawner.Center.X, spawner.Center.Y, color, 25);
    }
    generatedDataPlot.Legend();
    PlotForm generatedDataPlotForm = new PlotForm(generatedDataPlot, "source_data");
    generatedDataPlotForm.ShowDialog();

    Plot grayDataPlot = new Plot();
    grayDataPlot.AddScatterPoints(allPoints.ToArray(), label: "Gray points");
    grayDataPlot.Legend();
    PlotForm grayDataPlotForm = new PlotForm(grayDataPlot, "gray_data");
    grayDataPlotForm.ShowDialog();

    // k-means clustering, keeping the per-iteration history
    KMeansClusterizer clusterizer = new KMeansClusterizer();
    List<Dictionary<PointF, List<PointF>>> clusterizingHistory = clusterizer.Clusterize(allPoints, CLUSTER_COUNT);
    PlotForm resultPlotForm = new PlotForm(CreateClusterizingPlot(clusterizingHistory.Last()), "clusterized");
    resultPlotForm.ShowDialog();
    PlotForm historyForm = new PlotForm(clusterizingHistory.Select(c => CreateClusterizingPlot(c)).ToList(), "history_");
    historyForm.ShowDialog();

    // agglomerative clustering over the same points
    CentroidLinkage<DataPoint> linkage = new CentroidLinkage<DataPoint>(
        new DissimilarityMetric(),
        cluster => new DataPoint(
            cluster.Average(p => p.X),
            cluster.Average(p => p.Y)));
    AgglomerativeClusteringAlgorithm<DataPoint> algorithm = new AgglomerativeClusteringAlgorithm<DataPoint>(linkage);
    HashSet<DataPoint> dataPoints = allPoints.Select(p => new DataPoint(p)).ToHashSet();
    ClusteringResult<DataPoint> clusteringResult = algorithm.GetClustering(dataPoints);
    ClusterSet<DataPoint> result = clusteringResult[clusteringResult.Count - 3];

    Plot aglomeraPlot = new Plot();
    foreach (Cluster<DataPoint> resultCluster in result)
    {
        Color color = aglomeraPlot.GetNextColor();
        aglomeraPlot.AddScatterPoints(
            resultCluster.Select(p => (double)p.X).ToArray(),
            resultCluster.Select(p => (double)p.Y).ToArray(),
            color);
        aglomeraPlot.AddPoint(
            resultCluster.Select(p => p.X).Average(),
            resultCluster.Select(p => p.Y).Average(),
            color,
            25);
    }
    PlotForm aglomeraForm = new PlotForm(aglomeraPlot, "aglomera");
    aglomeraForm.ShowDialog();

    clusteringResult.SaveD3DendrogramFile(Environment.CurrentDirectory + "/dendro.json");
    Console.ReadLine();
}