//public Dictionary<HtmlNode, DocumentCluster> GetClusterByDocumentDictionary()
//{
//    Dictionary<HtmlNode, DocumentCluster> output = new Dictionary<HtmlNode, DocumentCluster>();
//    foreach (DocumentCluster cluster in this.)
//    {
//        var nodes = cluster.items.Select(x => x);
//        foreach (HtmlNode node in nodes)
//        {
//            context.DeclarationConstruction_ClusterAnalysisContext.ClusterByDocuments.Add(node, cluster);
//        }
//        if (cluster.ClusterSeed != null)
//        {
//            context.DeclarationConstruction_ClusterAnalysisContext.ClusterByDocuments.Add(cluster.ClusterSeed, cluster);
//        }
//    }
//}

/// <summary>
/// Publishes every cluster of this collection and writes a summary <c>report.txt</c>.
/// </summary>
/// <remarks>
/// Readme files are generated on the folder that was passed in. When this collection has a
/// non-empty <c>name</c>, the clusters and the summary go into a sub-folder with that name;
/// otherwise they go directly into <paramref name="folderWithResults"/>.
/// The summary contains, per cluster, its name with the item count and its range.
/// </remarks>
/// <param name="documentNodeDictionary">Source information per document node; forwarded to each cluster.</param>
/// <param name="folderWithResults">Folder that receives the published output.</param>
/// <param name="result">Similarity result that supplies the document labels; also forwarded to each cluster.</param>
public void Publish(Dictionary<HtmlNode, HtmlSourceAndUrl> documentNodeDictionary, folderNode folderWithResults, DocumentSimilarityResult result)
{
    folderWithResults.generateReadmeFiles(null);

    var clusters = GetClusters<DocumentCluster>(true);
    Dictionary<HtmlNode, string> documentLabels = result.GetLabelsByDocument();

    // A named collection gets its own sub-folder; an unnamed one publishes in place.
    folderNode targetFolder = folderWithResults;
    if (!name.isNullOrEmpty())
    {
        targetFolder = folderWithResults.Add(name, name, "Reports for cluster collection " + name);
    }

    builderForText summary = new builderForText();
    foreach (DocumentCluster cluster in clusters)
    {
        cluster.Publish(documentLabels, documentNodeDictionary, targetFolder, result);

        summary.AppendPair(cluster.name, cluster.items.Count);
        summary.AppendPair("- range", cluster.range.Range);
    }

    String summaryPath = targetFolder.pathFor("report.txt", imbSCI.Data.enums.getWritableFileMode.overwrite);
    File.WriteAllText(summaryPath, summary.GetContent());
}
/// <summary>
/// Searches for a clusterization whose cluster count equals <c>settings.TargetClusterCount</c>.
/// </summary>
/// <remarks>
/// When <c>settings.TargetClusterCount</c> is greater than zero, the method calls
/// <c>GetClusters</c> once for every combination of a minimum-similarity step and a score
/// selector, and picks the outcome in this order:
/// <list type="number">
/// <item>the first collection with the target cluster count and an empty null cluster (returned immediately);</item>
/// <item>the first collection with the target cluster count but a non-empty null cluster;</item>
/// <item>the collection whose cluster count is closest to the target (first one wins on ties).</item>
/// </list>
/// When the target is not set, or the search produced no collection at all, a single
/// clusterization with <c>settings.MinScoreInRangeCriterion</c> and the first available score
/// selector is returned instead of <c>null</c>.
/// </remarks>
/// <param name="result">The similarity result to clusterize.</param>
/// <param name="collectionName">Name of the collection.</param>
/// <param name="output">Optional text output for progress messages; may be <c>null</c>.</param>
/// <returns>The selected cluster collection.</returns>
public DocumentClusterCollection GetClustersByTarget(DocumentSimilarityResult result, String collectionName = "Clusters", ITextRender output = null)
{
    List<Func<DocumentSimilarityResultPair, double>> scoreSelectors = settings.SimilarityScoreSource.GetSelectorList();

    if (settings.TargetClusterCount > 0)
    {
        // Collections that hit the target count but left some documents unassigned.
        List<DocumentClusterCollection> candidates = new List<DocumentClusterCollection>();
        // Collections that missed the target count.
        List<DocumentClusterCollection> others = new List<DocumentClusterCollection>();

        var minSimRange = settings.GetMinSimilarityRange();
        var minSimScores = minSimRange.GetValueRangeZigZagSteps(settings.TargetSearchSteps);

        Int32 si = 0;
        foreach (var minSimScore in minSimScores)
        {
            si++;
            foreach (var scoreSelector in scoreSelectors)
            {
                DocumentClusterCollection currentClusters = GetClusters(result, collectionName, scoreSelector, minSimScore);

                if (output != null)
                {
                    output.AppendLine($"Clusterization iteration {si}/{minSimScores.Count} with minSimScore: {minSimScore} : Clusters[{currentClusters.Count}] - NullCluster[{currentClusters.NullCluster.items.Count}]");
                }

                if (currentClusters.Count != settings.TargetClusterCount)
                {
                    others.Add(currentClusters);
                    continue;
                }

                if (currentClusters.NullCluster.items.Any())
                {
                    candidates.Add(currentClusters);
                    continue;
                }

                // Target count reached and every document is assigned: stop searching.
                if (output != null)
                {
                    output.AppendLine($"Match found _{si}/{minSimScores.Count}_ with minSimScore: {minSimScore} : Clusters[{currentClusters.Count}] - NullCluster[{currentClusters.NullCluster.items.Count}]");
                }
                return currentClusters;
            }
        }

        // OrderBy is a stable sort, so the earliest collection wins among equally close ones.
        DocumentClusterCollection best = candidates.FirstOrDefault()
            ?? others.OrderBy(x => Math.Abs(settings.TargetClusterCount - x.Count)).FirstOrDefault();

        if (best != null)
        {
            return best;
        }
        // The search evaluated nothing (no steps or no selectors): fall through to the
        // default clusterization rather than handing null back to the caller.
    }

    // FirstOrDefault instead of First: GetClusters substitutes the configured selector for null.
    return GetClusters(result, collectionName, scoreSelectors.FirstOrDefault(), settings.MinScoreInRangeCriterion);
}
/// <summary>
/// Publishes this cluster into its own sub-folder and writes a <c>report.txt</c> describing it.
/// </summary>
/// <remarks>
/// The report contains the cluster name, the item count, the label of the seed document (when a
/// seed is set), every entry of the cluster range, and - for each non-seed item that has an entry
/// in the score dictionary - its label, score, file path and URL.
/// </remarks>
/// <param name="labelsByDocument">Label per document node.</param>
/// <param name="documentNodeDictionary">Source information (file path and URL) per document node.</param>
/// <param name="folderWithResults">Parent folder; a sub-folder named after this cluster is added to it.</param>
/// <param name="result">Similarity result that publishes the cluster's items into the sub-folder.</param>
public void Publish(Dictionary<HtmlNode, String> labelsByDocument, Dictionary<HtmlNode, HtmlSourceAndUrl> documentNodeDictionary, folderNode folderWithResults, DocumentSimilarityResult result)
{
    folderNode clusterFolder = folderWithResults.Add(this.name, this.name, "Directory for cluster " + this.name);
    result.Publish(documentNodeDictionary, clusterFolder, this.items);

    builderForText report = new builderForText();

    // Header section.
    report.AppendHeading("Name: " + this.name);
    report.AppendPair("Items", this.items.Count);
    if (this.ClusterSeed != null)
    {
        report.AppendPair("Seed", labelsByDocument[this.ClusterSeed]);
    }

    // Range section.
    foreach (var rangeEntry in this.range.GetDictionary())
    {
        report.AppendPair(rangeEntry.Key, rangeEntry.Value.ToString("F3"));
    }

    // Member section: the seed and items without a recorded score are skipped.
    foreach (var member in this.items)
    {
        if (member == this.ClusterSeed || !this.scoreDictionary.ContainsKey(member))
        {
            continue;
        }

        String memberLabel = labelsByDocument[member];
        Double memberScore = this.scoreDictionary[member];
        HtmlSourceAndUrl memberSource = documentNodeDictionary[member];

        report.AppendLine("-----------------------------------");
        report.AppendLine(memberLabel + " => " + memberScore.ToString("F3"));
        report.AppendLine("Filepath: " + memberSource.filepath);
        report.AppendLine("Url: " + memberSource.url);
    }

    String reportFile = clusterFolder.pathFor("report.txt", imbSCI.Data.enums.getWritableFileMode.overwrite);
    File.WriteAllText(reportFile, report.GetContent());
}
/// <summary>
/// Builds a cluster collection from the pairwise similarity results.
/// </summary>
/// <remarks>
/// Steps, in order:
/// <list type="number">
/// <item>All scores are fed into a range finder, so that later comparisons use the position of a score within the observed range.</item>
/// <item>While unassigned documents remain, the first one becomes a cluster seed; every document whose position-in-range against the seed exceeds <paramref name="minSimScore"/> joins the cluster. A seed that attracts nothing goes to the null cluster.</item>
/// <item>Each null-cluster document is compared with the seed of every cluster and moved to the best-scoring cluster if that score's position-in-range exceeds <paramref name="minSimScore"/>.</item>
/// <item>When <c>settings.ExclusiveClusterMembership</c> is set, a document found in several clusters is kept only in the cluster whose seed it scores highest against.</item>
/// <item>Empty clusters are removed.</item>
/// </list>
/// </remarks>
/// <param name="result">The result.</param>
/// <param name="collectionName">Name for the collection.</param>
/// <param name="scoreSelector">The score selector; when <c>null</c>, <c>settings.SimilarityScoreSource.GetSelector()</c> is used.</param>
/// <param name="minSimScore">Threshold applied to the position-in-range of a score; <c>Double.MinValue</c> means "use <c>settings.MinScoreInRangeCriterion</c>".</param>
/// <returns>The populated cluster collection.</returns>
public DocumentClusterCollection GetClusters(DocumentSimilarityResult result, String collectionName = "Clusters", Func<DocumentSimilarityResultPair, Double> scoreSelector = null, Double minSimScore = Double.MinValue)
{
    // Double.MinValue is the "not specified" sentinel of the optional parameter.
    if (minSimScore == Double.MinValue)
    {
        minSimScore = settings.MinScoreInRangeCriterion;
    }

    if (scoreSelector == null)
    {
        scoreSelector = settings.SimilarityScoreSource.GetSelector();
    }

    DocumentClusterCollection output = new DocumentClusterCollection()
    {
        name = collectionName
    };

    // NOTE(review): documents are removed from this collection below - confirm that
    // GetDocuments() hands out a copy and not a list owned by the result.
    var documents = result.GetDocuments();

    // Sort by the selected score. The previous key selector returned the delegate itself,
    // so every element had the same key and no ordering took place.
    var sortedResults = result.GetAllResults().OrderByDescending(x => scoreSelector(x)).ToList();

    rangeFinder similarityRange = new rangeFinder();
    foreach (var pair in sortedResults)
    {
        similarityRange.Learn(scoreSelector(pair));
    }

    // Each pass removes at least the seed document; the counter is a safety stop only.
    Int32 limit = documents.Count;
    Int32 i = 0;

    while (documents.Any())
    {
        i++;

        var doc = documents.FirstOrDefault();
        if (doc == null)
        {
            break;
        }

        var results = result.GetResultsFor(doc);

        DocumentCluster currentCluster = output.NewCluster<DocumentCluster>();
        currentCluster.ClusterSeed = doc;

        foreach (KeyValuePair<HtmlNode, DocumentSimilarityResultPair> pair in results)
        {
            Double scoreAtRange = similarityRange.GetPositionInRange(scoreSelector(pair.Value));
            if (scoreAtRange > minSimScore)
            {
                currentCluster.Add(pair.Key, scoreAtRange);
                documents.Remove(pair.Key);
            }
        }

        documents.Remove(doc);

        if (currentCluster.items.Count == 0)
        {
            // Nothing is similar enough to this seed: park it in the null cluster for now.
            output.NullCluster.Add(doc, 0);
        }
        else
        {
            currentCluster.items.Add(doc);
            output.AddCluster(currentCluster);
        }

        if (i > limit)
        {
            break;
        }
    }

    // Second chance for null-cluster documents. Iterate over a snapshot, because documents
    // are removed from the null cluster inside the loop.
    foreach (var item in output.NullCluster.items.ToList())
    {
        var results = result.GetResultsFor(item);

        Double maxScore = Double.MinValue;
        DocumentCluster selectedCluster = null;

        foreach (var cluster in output.GetClusters<DocumentCluster>(false))
        {
            Double score = scoreSelector(results[cluster.ClusterSeed]);
            if (score > maxScore)
            {
                maxScore = score;
                selectedCluster = cluster;
            }
        }

        // selectedCluster stays null when there is no cluster to compare against.
        // NOTE(review): the raw score is stored here, while the seeding loop above stores the
        // position-in-range - confirm which scale the score dictionary is meant to hold.
        if (selectedCluster != null && similarityRange.GetPositionInRange(maxScore) > minSimScore)
        {
            selectedCluster.Add(item, maxScore);
            output.NullCluster.Remove(item);
        }
    }

    if (settings.ExclusiveClusterMembership)
    {
        var itemToCluster = output.GetItemToClusterAssociations<DocumentCluster>();

        foreach (var pair in itemToCluster)
        {
            if (pair.Value.Count <= 1)
            {
                continue;
            }

            Dictionary<HtmlNode, DocumentSimilarityResultPair> results = result.GetResultsFor(pair.Key);

            Double maxScore = Double.MinValue;
            DocumentCluster selectedCluster = null;

            foreach (var cluster in pair.Value)
            {
                Double score = scoreSelector(results[cluster.ClusterSeed]);
                if (score > maxScore)
                {
                    maxScore = score;
                    selectedCluster = cluster;
                }
            }

            // Keep the document only in the best-scoring cluster.
            foreach (var cluster in pair.Value)
            {
                if (cluster != selectedCluster)
                {
                    cluster.Remove(pair.Key);
                }
            }
        }
    }

    output.RemoveEmptyClusters();

    return output;
}