/// <summary>
/// Sets up the clustering pipeline: agglomerative clustering with average linkage
/// over the record dissimilarity metric.
/// </summary>
public ClusteringService()
{
    algorithm = new AgglomerativeClusteringAlgorithm<Record>(
        new AverageLinkage<Record>(new RecordDissimilarityMetric()));
}
/// <summary>
/// Checks, for every linkage criterion, that re-starting the clustering from the
/// partial cluster-set at step i reproduces the remaining steps of the full clustering.
/// Fix: MSTest's <c>Assert.AreEqual</c> takes (expected, actual); the original had the
/// arguments swapped, which produces misleading failure messages.
/// </summary>
public void InitializedClusteringTest()
{
    foreach (var linkage in Linkages)
    {
        Console.WriteLine("________________________");
        var completeClusteringResult =
            new AgglomerativeClusteringAlgorithm<DataPoint>(linkage).GetClustering(DataPoints);
        for (var i = 0; i < completeClusteringResult.Count; i++)
        {
            var clusterSet = completeClusteringResult[i];

            // re-start the clustering from the partial result at step i
            var clusteringResult =
                new AgglomerativeClusteringAlgorithm<DataPoint>(linkage).GetClustering(clusterSet);

            // expected value first: one cluster-set per remaining agglomeration step
            Assert.AreEqual(DataPoints.Count - i, clusteringResult.Count);
            for (var j = 0; j < clusteringResult.Count; j++)
            {
                var clusterSet1 = completeClusteringResult[i + j];
                var clusterSet2 = clusteringResult[j];
                Console.WriteLine(clusterSet1);
                Console.WriteLine(clusterSet2);

                // double.Epsilon is the smallest positive double, so this delta
                // effectively demands exact equality of the dissimilarities
                Assert.AreEqual(clusterSet1.Dissimilarity, clusterSet2.Dissimilarity, double.Epsilon);
                Assert.IsTrue(clusterSet1.SequenceEqual(clusterSet2),
                    $"{clusterSet1} should be equal to {clusterSet2}");
            }
        }
    }
}
/// <summary>
/// Checks, for every linkage criterion, that the i-th cluster-set has N - i clusters,
/// that dissimilarity becomes positive after the first merge, and that every data-point
/// is covered by some cluster at every step.
/// Fix: MSTest's <c>Assert.AreEqual</c> takes (expected, actual); the original had the
/// arguments swapped, which produces misleading failure messages.
/// </summary>
public void ClusterSetsSizeTest()
{
    foreach (var linkage in Linkages)
    {
        Console.WriteLine("________________________");
        var clusteringResult =
            new AgglomerativeClusteringAlgorithm<DataPoint>(linkage).GetClustering(DataPoints);
        for (var i = 0; i < clusteringResult.Count; i++)
        {
            var clusterSet = clusteringResult[i];
            Console.WriteLine(clusterSet);

            // each agglomeration step merges two clusters, so sizes shrink by one per step
            Assert.AreEqual(DataPoints.Count - i, clusterSet.Count,
                $"Cluster-set size at iteration {i} should be inverse to the number of items.");
            if (i > 0)
            {
                Assert.IsTrue(clusterSet.Dissimilarity > 0d,
                    "Dissimilarity of the single cluster-set should be > 0.");
            }

            // every data-point must belong to exactly one cluster at every step
            foreach (var dataPoint in DataPoints)
            {
                Assert.IsTrue(clusterSet.Any(cluster => cluster.Contains(dataPoint)),
                    $"There should be a cluster in {clusterSet} that contains data-point {dataPoint}");
            }
        }
    }
}
private const uint NUM_CLUSTERS = 3; // for seeds and iris data-sets

#endregion

#region Private & Protected Methods

/// <summary>
/// Clusters the given data-points with the supplied linkage, selects the cluster-set
/// containing <paramref name="numClusters"/> clusters, and prints several external
/// evaluation scores against the ground-truth classes.
/// </summary>
private static void EvaluateClustering(
    ISet<DataPoint> dataPoints, ILinkageCriterion<DataPoint> linkage, string linkageName,
    uint numClusters)
{
    var clustering =
        new AgglomerativeClusteringAlgorithm<DataPoint>(linkage).GetClustering(dataPoints);

    // gets cluster set according to predefined number of clusters
    var clusterSet = clustering.First(cs => cs.Count == numClusters);

    // gets classes for each data-point (first character of the ID in the dataset)
    var pointClasses = dataPoints.ToDictionary(dataPoint => dataPoint, dataPoint => dataPoint.ID[0]);

    Console.WriteLine("=============================================");
    Console.WriteLine($"Evaluating {linkageName} clustering using Euclidean distance...");

    // evaluates the clustering according to different criteria; insertion order is print order
    var evaluations = new Dictionary<string, double>
    {
        {"Purity", new Purity<DataPoint, char>().Evaluate(clusterSet, pointClasses)},
        {"NMI", new NormalizedMutualInformation<DataPoint, char>().Evaluate(clusterSet, pointClasses)},
        {"Accuracy", new RandIndex<DataPoint, char>().Evaluate(clusterSet, pointClasses)},
        {"Precision", new Precision<DataPoint, char>().Evaluate(clusterSet, pointClasses)},
        {"Recall", new Recall<DataPoint, char>().Evaluate(clusterSet, pointClasses)},
        {"F1Measure", new FMeasure<DataPoint, char>(1).Evaluate(clusterSet, pointClasses)},
        {"F2Measure", new FMeasure<DataPoint, char>(2).Evaluate(clusterSet, pointClasses)},
        {"F05Measure", new FMeasure<DataPoint, char>(0.5).Evaluate(clusterSet, pointClasses)}
    };

    foreach (var evaluation in evaluations)
    {
        Console.WriteLine($" - {evaluation.Key}: {evaluation.Value:0.000}");
    }
}
/// <summary>
/// Performs hierarchical clustering of the shared test data-points using
/// average linkage over the shared metric.
/// </summary>
private static ClusteringResult<DataPoint> GetClustering()
{
    var algorithm = new AgglomerativeClusteringAlgorithm<DataPoint>(
        new AverageLinkage<DataPoint>(Metric));
    return algorithm.GetClustering(ClusteringTests.DataPoints);
}
/// <summary>
/// Checks, for every linkage criterion, that clustering an empty set of
/// data-points yields an empty clustering result.
/// </summary>
public void EmptyClusteringTest()
{
    foreach (var linkage in Linkages)
    {
        Console.WriteLine("________________________");
        var result = new AgglomerativeClusteringAlgorithm<DataPoint>(linkage)
            .GetClustering(new HashSet<DataPoint>());
        Console.WriteLine(result.Count);
        Assert.AreEqual(0, result.Count, "Empty set should produce an empty clustering result.");
    }
}
/// <summary>
/// Entry point: extracts features for a set of addresses from the transaction-graph
/// database, clusters them by dissimilarity, and writes one report file per cluster-set.
/// </summary>
static void Main(string[] args)
{
    // connection settings for the local OrientDB transaction-graph database
    var options = new ConnectionOptions()
    {
        DatabaseName = "txgraph",
        DatabaseType = ODatabaseType.Graph,
        HostName = "localhost",
        Password = "******",
        Port = 2424,
        UserName = "******"
    };

    //var data = new DataSourceProvider("dbaf14e1c476e76ea05a8b71921a46d6b06f0a950f17c5f9f1a03b8fae467f10", LimitType.DATE, 1);
    //var addresses = data.GetAddresses(txgraphOptions);
    //if(addresses.Count == 0)
    //{
    //    return;
    //}

    // DEBUG
    var addresses = ReadGold();
    // DEBUG
    Console.WriteLine($"{addresses.Count} addresses of interest will be processed...");

    // feature-extraction pipeline applied to every address
    var pipeline = new Pipeline();
    pipeline.Add(new TotalAmounts());
    pipeline.Add(new Amounts());
    pipeline.Add(new SocialNetwork());
    pipeline.Add(new TimeSlots());
    pipeline.Add(new Core.Clustering.FeatureExtractors.DayOfWeek());
    pipeline.Add(new TransactionShape());
    pipeline.Add(new CommonTimes());
    pipeline.Add(new Heuristic1());
    pipeline.Add(new Heuristic2());
    pipeline.Process(options, addresses);

    // DEBUG
    Console.WriteLine("Processing of addresses done.");

    // agglomerative clustering of the addresses using the extracted features
    var algorithm = new AgglomerativeClusteringAlgorithm<string>(
        new AverageLinkage<string>(new AddressDissimilarityMetric(pipeline, addresses)));
    var clusteringResult = algorithm.GetClustering(new HashSet<string>(addresses));

    // one report file per cluster-set, ordered by increasing dissimilarity
    var index = 0;
    foreach (var clusterSet in clusteringResult.OrderBy(x => x.Dissimilarity))
    {
        var sb = new StringBuilder();
        sb.AppendLine($"{clusterSet.Dissimilarity}");
        foreach (var cluster in clusterSet)
        {
            sb.AppendLine(string.Join("\t", cluster));
        }

        Directory.CreateDirectory("report");
        File.WriteAllText(Path.Combine("report", $"{index++}.txt".Replace(",", "#")), sb.ToString());
    }
}
/// <summary>
/// Checks, for every linkage criterion, that the clustering result contains one
/// cluster-set per data-point (one per agglomeration step).
/// Fix: MSTest's <c>Assert.AreEqual</c> takes (expected, actual); the original had the
/// arguments swapped, which produces misleading failure messages.
/// </summary>
public void ClusteringSizeTest()
{
    foreach (var linkage in Linkages)
    {
        Console.WriteLine("________________________");
        var clusteringResult =
            new AgglomerativeClusteringAlgorithm<DataPoint>(linkage).GetClustering(DataPoints);
        Console.WriteLine(clusteringResult.Count);

        // expected value first: N items produce N cluster-sets
        Assert.AreEqual(DataPoints.Count, clusteringResult.Count,
            "Clustering result should have as many cluster-sets as there are clusters.");
    }
}
/// <summary>
/// Checks, for every linkage criterion, that the first cluster-set is the trivial
/// partition: one singleton cluster per data-point, with zero dissimilarity.
/// Fix: MSTest's <c>Assert.AreEqual</c> takes (expected, actual); the original had the
/// arguments swapped, which produces misleading failure messages.
/// </summary>
public void InitialClusterTest()
{
    foreach (var linkage in Linkages)
    {
        Console.WriteLine("________________________");
        var clusteringResult =
            new AgglomerativeClusteringAlgorithm<DataPoint>(linkage).GetClustering(DataPoints);
        var firstClusterSet = clusteringResult[0];
        Console.WriteLine(firstClusterSet);

        // expected value first in both asserts
        Assert.AreEqual(DataPoints.Count, firstClusterSet.Count,
            "First cluster-set size should be the same as the number of items.");
        Assert.AreEqual(0d, firstClusterSet.Dissimilarity, double.Epsilon,
            "First cluster-set dissimilarity should be 0.");
    }
}
/// <summary>
/// Runs single-linkage agglomerative clustering over the parsed input rows and returns
/// the dendrogram as an indented JSON string, wrapped in an already-completed task.
/// </summary>
public Task<string> GetClusterAnalysisAsync(IReadOnlyCollection<string[]> data)
{
    if (data.Count == 0)
    {
        // NOTE(review): ArgumentException would better describe an empty (non-null)
        // collection, but callers may catch this exact type, so it is kept as-is.
        return Task.FromException<string>(new ArgumentNullException());
    }

    var instances = Utils.ClusterParse(data);
    var algorithm = new AgglomerativeClusteringAlgorithm<DataPoint>(
        new SingleLinkage<DataPoint>(new DataPoint(null, null)));
    var clustersJson = algorithm.GetClustering(instances)
        .GetDendrogramJson(true, Formatting.Indented);
    return Task.FromResult(clustersJson);
}
/// <summary>
/// Background-worker callback: clusters the loaded data-points with the linkage
/// criterion selected in the UI (index passed via <c>e.Argument</c>) and stores
/// the result for later use.
/// </summary>
private void ClusteringWorkerDoWork(object sender, DoWorkEventArgs e)
{
    // nothing to cluster
    if (this._dataPoints == null || this._dataPoints.Count == 0)
    {
        return;
    }

    // maps the selected index onto a linkage criterion;
    // any unrecognized value falls back to average linkage
    ILinkageCriterion<DataPoint> linkage;
    switch (e.Argument)
    {
        case 1:
            linkage = new CompleteLinkage<DataPoint>(this._dissimilarityMetric);
            break;
        case 2:
            linkage = new SingleLinkage<DataPoint>(this._dissimilarityMetric);
            break;
        case 3:
            linkage = new MinimumEnergyLinkage<DataPoint>(this._dissimilarityMetric);
            break;
        case 4:
            linkage = new CentroidLinkage<DataPoint>(this._dissimilarityMetric, DataPoint.GetMedoid);
            break;
        case 5:
            linkage = new WardsMinimumVarianceLinkage<DataPoint>(
                this._dissimilarityMetric, DataPoint.GetMedoid);
            break;
        default:
            linkage = new AverageLinkage<DataPoint>(this._dissimilarityMetric);
            break;
    }

    // clusters data-points and keeps the result
    this._clusteringResult =
        new AgglomerativeClusteringAlgorithm<DataPoint>(linkage).GetClustering(this._dataPoints);
}
/// <summary>
/// Checks, for every linkage criterion, that the last cluster-set contains a single
/// cluster holding all data-points, and that it matches <c>SingleCluster</c>.
/// Fix: MSTest's <c>Assert.AreEqual</c> takes (expected, actual); the original had the
/// arguments swapped, which produces misleading failure messages.
/// </summary>
public void SingleClusterTest()
{
    foreach (var linkage in Linkages)
    {
        Console.WriteLine("________________________");
        var clusteringResult =
            new AgglomerativeClusteringAlgorithm<DataPoint>(linkage).GetClustering(DataPoints);

        // the final step merges everything into one cluster
        var singleClusterSet = clusteringResult[DataPoints.Count - 1];
        Console.WriteLine(singleClusterSet);
        Assert.AreEqual(1, singleClusterSet.Count, "Single cluster-set should only have a cluster.");

        var cluster = singleClusterSet.First();
        Assert.AreEqual(DataPoints.Count, cluster.Count,
            "Single cluster-set size should be the same as the number of items.");
        Assert.AreEqual(clusteringResult.SingleCluster, cluster,
            $"Cluster {cluster} should be the single cluster.");
    }
}
/// <summary>
/// Checks, for every linkage criterion, that the clustering result can be
/// exported to a non-empty CSV file.
/// </summary>
public void SaveFileTest()
{
    for (var i = 0; i < Linkages.Count; i++)
    {
        Console.WriteLine("________________________");
        var result = new AgglomerativeClusteringAlgorithm<DataPoint>(Linkages[i])
            .GetClustering(DataPoints);

        // export to CSV and verify a non-empty file was produced
        var filePath = Path.GetFullPath($"linkage-{i}.csv");
        Console.WriteLine(filePath);
        result.SaveToCsv(filePath);
        Assert.IsTrue(File.Exists(filePath), $"CSV file should exist in {filePath}");
        Assert.AreNotEqual(0, new FileInfo(filePath).Length, "CSV file size should be > 0 bytes.");

#if !DEBUG
        // keep the files around in debug builds for manual inspection
        File.Delete(filePath);
#endif
    }
}
/// <summary>
/// Checks that the clustering result can be exported to a non-empty
/// D3-compatible dendrogram JSON file.
/// </summary>
public void SaveFileTest()
{
    // cluster the test data-points (a default DataPoint doubles as the metric)
    var clustering = new AgglomerativeClusteringAlgorithm<DataPoint>(
        new AverageLinkage<DataPoint>(new DataPoint())).GetClustering(DataPoints);
    Console.WriteLine(clustering);

    // write the dendrogram json next to the executable, replacing any stale copy
    var fullPath = Path.Combine(Path.GetFullPath("."), FILE_NAME);
    File.Delete(fullPath);
    clustering.SaveD3DendrogramFile(fullPath, formatting: Formatting.Indented);
    Console.WriteLine(fullPath);

    Assert.IsTrue(File.Exists(fullPath), $"D3 json file should exist in {fullPath}.");
    Assert.IsTrue(new FileInfo(fullPath).Length > 0, "Json file size should be > 0 bytes.");

#if !DEBUG
    // keep the file around in debug builds for manual inspection
    File.Delete(fullPath);
#endif
}
/// <summary>
/// Clusters the given instances with the supplied linkage, prints each agglomeration
/// step (with timing), and saves a D3 dendrogram json named after the linkage.
/// </summary>
private static void PrintClusters(ISet<DataPoint> instances, ILinkageCriterion<DataPoint> linkage, string name)
{
    // time the clustering run
    var perfMeasure = new PerformanceMeasure();
    perfMeasure.Start();
    var clustering = new AgglomerativeClusteringAlgorithm<DataPoint>(linkage).GetClustering(instances);
    perfMeasure.Stop();

    Console.WriteLine("_____________________________________________");
    Console.WriteLine(name);
    Console.WriteLine(perfMeasure);

    // print every cluster-set with its merge distance
    foreach (var clusterSet in clustering)
    {
        Console.WriteLine($"Clusters at distance: {clusterSet.Dissimilarity:0.00} ({clusterSet.Count})");
        foreach (var cluster in clusterSet)
        {
            Console.WriteLine($" - {cluster}");
        }
    }

    // export the dendrogram for later visualization
    clustering.SaveD3DendrogramFile(Path.GetFullPath($"{name}.json"), formatting: Formatting.Indented);
}
/// <summary>
/// Entry point: loads data-points from a CSV file (path from the command line or a
/// default), clusters them with average linkage under Euclidean distance, saves the
/// dendrogram/clustering files, and reports the best partition according to several
/// internal evaluation criteria.
/// </summary>
private static void Main(string[] args)
{
    var globalPerf = new PerformanceMeasure();
    globalPerf.Start();

    // loads points from csv file
    var dataSetFile = args.Length > 0 ? args[0] : DATASET_FILE;
    var filePath = Path.GetFullPath(dataSetFile);
    Console.WriteLine($"Loading data-points from {filePath}...");
    var dataPoints = new CsvParser().Load(filePath);
    Console.WriteLine($"Clustering {dataPoints.Count} data-points...");

    // performs hierarchical clustering
    var clusterPerf = new PerformanceMeasure();
    clusterPerf.Start();
    var metric = new DataPoint(); // Euclidean distance
    var clustering = new AgglomerativeClusteringAlgorithm<DataPoint>(
        new AverageLinkage<DataPoint>(metric)).GetClustering(dataPoints);
    clusterPerf.Stop();

    // saves the dendrogram and the full clustering to the results directory
    Directory.CreateDirectory(Path.GetFullPath(RESULTS_PATH));
    clustering.SaveD3DendrogramFile(Path.GetFullPath(Path.Combine(RESULTS_PATH, "dendrogram.json")));
    clustering.SaveToCsv(Path.GetFullPath(Path.Combine(RESULTS_PATH, "clustering.csv")));
    Console.WriteLine($"Finished clustering: {clusterPerf}");

    // evaluates the clustering according to several criteria
    //CentroidFunction<DataPoint> centroidFunc = DataPoint.GetMedoid;
    CentroidFunction<DataPoint> centroidFunc = DataPoint.GetCentroid;
    var criteria = new Dictionary<string, IInternalEvaluationCriterion<DataPoint>>
    {
        {"Silhouette coefficient", new SilhouetteCoefficient<DataPoint>(metric)},
        {"Dunn index", new DunnIndex<DataPoint>(metric)},
        {"Davies-Bouldin index", new DaviesBouldinIndex<DataPoint>(metric, centroidFunc)},
        {"Calinski-Harabasz index", new CalinskiHarabaszIndex<DataPoint>(metric, centroidFunc)},
        {"Modified Gamma statistic", new ModifiedGammaStatistic<DataPoint>(metric, centroidFunc)},
        {"Xie-Beni index", new XieBeniIndex<DataPoint>(metric, centroidFunc)},
        {"Within-Between ratio", new WithinBetweenRatio<DataPoint>(metric, centroidFunc)},
        {"I-index", new IIndex<DataPoint>(metric, centroidFunc)},
        {"Xu index", new XuIndex<DataPoint>(metric, centroidFunc)}

        //{"RMSSD", new RootMeanSquareStdDev<DataPoint>(metric, centroidFunc)},
        //{"R-squared", new RSquared<DataPoint>(metric, centroidFunc)},
    };
    foreach (var criterion in criteria)
    {
        GetBestPartition(clustering, criterion.Value, criterion.Key);
    }

    globalPerf.Stop();
    Console.WriteLine($"\nFinished: {globalPerf}");
    Console.ReadKey();
}
/// <summary>
/// Clusters all result rows by set-based distance, flattens the resulting dendrogram
/// into groups below a dissimilarity/size threshold, and renders those groups to an
/// HTML report. Also writes the raw clustering to a CSV file.
/// Note: output paths are hard-coded to a local G:\ drive.
/// </summary>
public static void AgglomerativeClustering(AllResults loaded)
{
    // imgName/imgId are only used by the commented-out filter below; the lookup still
    // runs and would throw if the image name is missing from the encoding
    var imgName = "241666.jpg"; //"159161.jpg";
    var imgId = loaded.ImageEncoding[imgName];
    var relevantRows = loaded.Rows
        //.Where(r => r.Query.ImageId == imgId)
        .ToList();

    // cluster all of them together? Include query into dissimilarity function then
    // Or product by product, filter down to big elements, and offer to transitively load more and more?
    var metric = new ResultsRowSetBasedDistance();
    var linkage = new AverageLinkage<ResultsRow>(metric);
    var algorithm = new AgglomerativeClusteringAlgorithm<ResultsRow>(linkage);
    var clusters = algorithm.GetClustering(new HashSet<ResultsRow>(relevantRows));
    clusters.SaveToCsv(@"G:\siret\zoot\protobuf\clustertest.csv");
    //RenderData();

    // synthetic results object that will hold one row per accepted cluster
    var dummyResults = new AllResults
    {
        ImageEncoding = loaded.ImageEncoding,
        PatchEncoding = loaded.PatchEncoding
    };

    // walk the dendrogram top-down from the root: accept a cluster when it is tight
    // (dissimilarity <= 0.70) and small (< 50 rows), otherwise split into its parents
    var clusterQueue = new Queue<Cluster<ResultsRow>>(new[] { clusters.SingleCluster });
    while (clusterQueue.Count > 0)
    {
        var item = clusterQueue.Dequeue();
        if (item.Dissimilarity <= 0.70 && item.Count < 50)
        {
            // merge the cluster's rows into a single row: de-duplicate hits keeping the
            // minimum distance, and append each member's query as a hit with distance -1
            dummyResults.Rows.Add(new ResultsRow
            {
                Query = item.First().Query,
                Hits = item.SelectMany(x => x.Hits)
                    .GroupBy(x => x.Hit)
                    .Select(x => new SearchHit { Hit = x.Key, Distance = x.Min(y => y.Distance) })
                    .Concat(item.Select(i => new SearchHit { Hit = i.Query, Distance = -1 }))
                    .ToArray()
            });
        }
        else
        {
            clusterQueue.Enqueue(item.Parent1);
            clusterQueue.Enqueue(item.Parent2);
        }
    }

    loaded.RefreshReferenceMap();
    // mark each key as referencing itself (only inner maps are modified, so
    // enumerating the outer key collection here is safe)
    foreach (var k in AllResults.ReferenceMap.Keys)
    {
        AllResults.ReferenceMap[k][k] = 1;
    }

    // render the flattened clusters as an HTML report (file is overwritten)
    using (var sw = new StreamWriter(@"G:\siret\zoot\protobuf\clusteringTestMega.html", append: false))
    {
        dummyResults.Render(sw);
    }
}
/// <summary>
/// Repeatedly clusters the remaining deliveries and greedily extracts the "best"
/// cluster (the largest one with at most MAX_ITEMS deliveries, found by brute-forcing
/// a decreasing minimum size), removing its deliveries until none remain.
/// Interactive: waits for a key press between iterations.
/// </summary>
private static void MelhorCluster(HashSet<DataPoint<int>> dataPoints)
{
    // initializes the algorithm's components
    var metric = new DissimilarityMetric<int>();
    var linkage = new AverageLinkage<DataPoint<int>>(metric);
    var algorithm = new AgglomerativeClusteringAlgorithm<DataPoint<int>>(linkage);

    // while there are deliveries left to cluster
    while (dataPoints.Any())
    {
        // clusters the remaining deliveries
        var clusteringResult = algorithm.GetClustering(dataPoints);

        // prints the clustering (Select is used only for its WriteLine side effect)
        clusteringResult.Select(x =>
        {
            Console.WriteLine(x);
            return (0);
        }).ToArray();

        // maximum number of items per cluster
        var MAX_ITEMS = 7;

        // brute-force over the minimum item count, starting at MAX_ITEMS and
        // decrementing down to 1 (a single delivery)
        var minItems = MAX_ITEMS;

        // result of this search
        Cluster<DataPoint<int>> bestCluster = null;

        // brute force
        while (minItems > 0)
        {
            // walks the list of cluster-sets from last to first
            for (var i = clusteringResult.Count() - 1; i >= 0; i--)
            {
                // ordering by delivery count (descending), takes the first cluster
                // with at most MAX_ITEMS deliveries
                bestCluster = clusteringResult[i].
                              OrderByDescending(x => x.Count()).
                              FirstOrDefault(y => y.Count() <= MAX_ITEMS);

                // if this cluster's delivery count equals the current minimum, stop
                if (bestCluster?.Count() == minItems)
                {
                    break;
                }
            }

            // two nested loops, so re-check the condition and break again
            if (bestCluster?.Count() == minItems)
            {
                break;
            }

            // decrements minItems and tries again
            minItems--;
        }

        // result of this iteration
        Console.WriteLine(bestCluster);

        // removes the deliveries that were just clustered
        dataPoints.RemoveWhere(x => bestCluster.Any(y => y.id == x.id));
        Console.WriteLine();
        Console.WriteLine();

        // waits for a key press, then repeats until no deliveries remain
        Console.ReadLine();
    }
}
/// <summary>
/// Entry point: spawns gaussian point clusters, shows them (colored and gray),
/// runs k-means (with per-iteration history plots), then runs agglomerative
/// clustering with centroid linkage, plots its result, and saves the dendrogram.
/// </summary>
public static void Main(string[] args)
{
    // generate CLUSTER_COUNT point clouds around random centers and plot them per-cluster
    var sourcePlot = new Plot();
    var spawner = new Spawner(STD_DEV);
    var allPoints = new List<PointF>();
    for (var i = 0; i < CLUSTER_COUNT; ++i)
    {
        spawner.ResetCenter(MIN_CENTER_DISTANCE, MAX_CENTER_DISTANCE);
        var points = spawner.Spawn(POINT_COUNT);
        allPoints.AddRange(points);
        var color = sourcePlot.GetNextColor();
        sourcePlot.AddScatterPoints(points, color, label: $"Points {i + 1}");
        sourcePlot.AddPoint(spawner.Center.X, spawner.Center.Y, color, 25);
    }
    sourcePlot.Legend();
    var sourceForm = new PlotForm(sourcePlot, "source_data");
    sourceForm.ShowDialog();

    // the same data without cluster colors
    var grayPlot = new Plot();
    grayPlot.AddScatterPoints(allPoints.ToArray(), label: "Gray points");
    grayPlot.Legend();
    var grayForm = new PlotForm(grayPlot, "gray_data");
    grayForm.ShowDialog();

    // k-means baseline: final partition plus one plot per iteration
    var clusterizer = new KMeansClusterizer();
    var history = clusterizer.Clusterize(allPoints, CLUSTER_COUNT);
    var resultForm = new PlotForm(CreateClusterizingPlot(history.Last()), "crusterized");
    resultForm.ShowDialog();
    var historyForm = new PlotForm(history.Select(c => CreateClusterizingPlot(c)).ToList(), "history_");
    historyForm.ShowDialog();

    // agglomerative clustering with centroid linkage (centroid = mean point of the cluster)
    var linkage = new CentroidLinkage<DataPoint>(
        new DissimilarityMetric(),
        cluster => new DataPoint(
            cluster.Average(p => p.X),
            cluster.Average(p => p.Y)));
    var algorithm = new AgglomerativeClusteringAlgorithm<DataPoint>(linkage);
    var dataPoints = allPoints.Select(p => new DataPoint(p)).ToHashSet();
    var clusteringResult = algorithm.GetClustering(dataPoints);

    // pick the cluster-set three merge steps before the end and plot each cluster
    var result = clusteringResult[clusteringResult.Count - 3];
    var aglomeraPlot = new Plot();
    foreach (var resultCluster in result)
    {
        var color = aglomeraPlot.GetNextColor();
        aglomeraPlot.AddScatterPoints(
            resultCluster.Select(p => (double)p.X).ToArray(),
            resultCluster.Select(p => (double)p.Y).ToArray(),
            color);
        aglomeraPlot.AddPoint(
            resultCluster.Select(p => p.X).Average(),
            resultCluster.Select(p => p.Y).Average(),
            color,
            25);
    }
    var aglomeraForm = new PlotForm(aglomeraPlot, "aglomera");
    aglomeraForm.ShowDialog();

    clusteringResult.SaveD3DendrogramFile(Environment.CurrentDirectory + "/dendro.json");
    Console.ReadLine();
}