예제 #1
0
        //public Dictionary<HtmlNode, DocumentCluster> GetClusterByDocumentDictionary()
        //{
        //    Dictionary<HtmlNode, DocumentCluster> output = new Dictionary<HtmlNode, DocumentCluster>();

        //    foreach (DocumentCluster cluster in this.)
        //    {
        //        var nodes = cluster.items.Select(x => x);

        //        foreach (HtmlNode node in nodes)
        //        {
        //            context.DeclarationConstruction_ClusterAnalysisContext.ClusterByDocuments.Add(node, cluster);
        //        }

        //        if (cluster.ClusterSeed != null)
        //        {
        //            context.DeclarationConstruction_ClusterAnalysisContext.ClusterByDocuments.Add(cluster.ClusterSeed, cluster);
        //        }
        //    }
        //}

        /// <summary>
        /// Publishes every cluster returned by <c>GetClusters&lt;DocumentCluster&gt;(true)</c> and writes a summary
        /// <c>report.txt</c> listing each cluster's name, item count and range.
        /// </summary>
        /// <param name="documentNodeDictionary">Lookup from document node to its source and URL; forwarded to each cluster's own <c>Publish</c>.</param>
        /// <param name="folderWithResults">Folder receiving the reports. When this collection's <c>name</c> is not empty, a sub-folder with that name is created and used instead.</param>
        /// <param name="result">Similarity result that supplies the per-document labels; also forwarded to each cluster.</param>
        public void Publish(Dictionary <HtmlNode, HtmlSourceAndUrl> documentNodeDictionary, folderNode folderWithResults, DocumentSimilarityResult result)
        {
            // Readme generation is requested on the folder as it was passed in, before any sub-folder is added.
            folderWithResults.generateReadmeFiles(null);

            var clusters = GetClusters<DocumentCluster>(true);
            Dictionary<HtmlNode, string> labels = result.GetLabelsByDocument();

            // Named collections get their own sub-folder; unnamed ones write straight into the given folder.
            folderNode targetFolder = folderWithResults;
            if (!name.isNullOrEmpty())
            {
                targetFolder = folderWithResults.Add(name, name, "Reports for cluster collection " + name);
            }

            builderForText summary = new builderForText();
            foreach (DocumentCluster cluster in clusters)
            {
                cluster.Publish(labels, documentNodeDictionary, targetFolder, result);

                summary.AppendPair(cluster.name, cluster.items.Count);
                summary.AppendPair("- range", cluster.range.Range);
            }

            File.WriteAllText(
                targetFolder.pathFor("report.txt", imbSCI.Data.enums.getWritableFileMode.overwrite),
                summary.GetContent());
        }
예제 #2
0
        /// <summary>
        /// Searches for a clusterization whose cluster count equals <c>settings.TargetClusterCount</c>.
        /// </summary>
        /// <remarks>
        /// When the target count is positive, every minimum-similarity step is combined with every score selector.
        /// The first collection that has the target count and an empty null cluster is returned immediately.
        /// If none is found, the first collection with the target count (but a non-empty null cluster) is returned;
        /// failing that, the collection whose cluster count is closest to the target.
        /// When the target count is not positive, a single clusterization is made with the first score selector
        /// and <c>settings.MinScoreInRangeCriterion</c>.
        /// </remarks>
        /// <param name="result">The similarity result to clusterize.</param>
        /// <param name="collectionName">Name given to each produced collection.</param>
        /// <param name="output">Optional progress log; nothing is logged when null.</param>
        /// <returns>The selected cluster collection; may be null when the search loop produced no collection at all.</returns>
        public DocumentClusterCollection GetClustersByTarget(DocumentSimilarityResult result, String collectionName = "Clusters", ITextRender output = null)
        {
            List<Func<DocumentSimilarityResultPair, double>> scoreSelectors = settings.SimilarityScoreSource.GetSelectorList();

            // No target requested: one plain clusterization with the default criterion.
            if (settings.TargetClusterCount <= 0)
            {
                return GetClusters(result, collectionName, scoreSelectors.First(), settings.MinScoreInRangeCriterion);
            }

            var minSimScores = settings.GetMinSimilarityRange().GetValueRangeZigZagSteps(settings.TargetSearchSteps);

            // Collections with the right cluster count but leftover null-cluster items.
            var matchesWithLeftovers = new List<DocumentClusterCollection>();
            // Collections with a different cluster count than requested.
            var countMismatches = new List<DocumentClusterCollection>();

            Int32 iteration = 0;
            foreach (var minSimScore in minSimScores)
            {
                iteration++;

                foreach (var scoreSelector in scoreSelectors)
                {
                    DocumentClusterCollection attempt = GetClusters(result, collectionName, scoreSelector, minSimScore);

                    output?.AppendLine($"Clusterization iteration {iteration}/{minSimScores.Count} with minSimScore: {minSimScore} : Clusters[{attempt.Count}] - NullCluster[{attempt.NullCluster.items.Count}]");

                    if (attempt.Count != settings.TargetClusterCount)
                    {
                        countMismatches.Add(attempt);
                        continue;
                    }

                    if (attempt.NullCluster.items.Any())
                    {
                        matchesWithLeftovers.Add(attempt);
                        continue;
                    }

                    // Target count reached and nothing left unassigned: stop searching.
                    output?.AppendLine($"Match found _{iteration}/{minSimScores.Count}_ with minSimScore: {minSimScore} : Clusters[{attempt.Count}] - NullCluster[{attempt.NullCluster.items.Count}]");
                    return attempt;
                }
            }

            if (matchesWithLeftovers.Count > 0)
            {
                return matchesWithLeftovers[0];
            }

            // Fall back to the attempt whose cluster count is nearest to the target.
            return countMismatches
                .OrderBy(x => Math.Abs(settings.TargetClusterCount - x.Count))
                .FirstOrDefault();
        }
        /// <summary>
        /// Publishes this cluster into its own sub-folder of <paramref name="folderWithResults"/> and writes a
        /// <c>report.txt</c> there.
        /// </summary>
        /// <remarks>
        /// The report holds the cluster name, the item count, the seed label (when a seed is set), every entry of
        /// <c>range.GetDictionary()</c>, and one section per non-seed item that has an entry in <c>scoreDictionary</c>.
        /// </remarks>
        /// <param name="labelsByDocument">Label for each document node; indexed directly, so a missing key throws.</param>
        /// <param name="documentNodeDictionary">Source file path and URL for each document node; indexed directly, so a missing key throws.</param>
        /// <param name="folderWithResults">Parent folder in which the cluster's folder is created.</param>
        /// <param name="result">Similarity result whose <c>Publish</c> is invoked for this cluster's items.</param>
        public void Publish(Dictionary <HtmlNode, String> labelsByDocument, Dictionary <HtmlNode, HtmlSourceAndUrl> documentNodeDictionary, folderNode folderWithResults, DocumentSimilarityResult result)
        {
            folderNode clusterFolder = folderWithResults.Add(this.name, this.name, "Directory for cluster " + this.name);

            result.Publish(documentNodeDictionary, clusterFolder, this.items);

            builderForText report = new builderForText();

            report.AppendHeading("Name: " + this.name);
            report.AppendPair("Items", this.items.Count);

            if (this.ClusterSeed != null)
            {
                report.AppendPair("Seed", labelsByDocument[this.ClusterSeed]);
            }

            foreach (var rangeEntry in this.range.GetDictionary())
            {
                report.AppendPair(rangeEntry.Key, rangeEntry.Value.ToString("F3"));
            }

            foreach (var member in this.items)
            {
                // The seed is reported above; members without a recorded score are skipped.
                Boolean hasScoredEntry = member != this.ClusterSeed && this.scoreDictionary.ContainsKey(member);
                if (!hasScoredEntry)
                {
                    continue;
                }

                String memberLabel = labelsByDocument[member];
                Double memberScore = this.scoreDictionary[member];
                HtmlSourceAndUrl memberSource = documentNodeDictionary[member];

                report.AppendLine("-----------------------------------");
                report.AppendLine(memberLabel + " => " + memberScore.ToString("F3"));
                report.AppendLine("Filepath: " + memberSource.filepath);
                report.AppendLine("Url: " + memberSource.url);
            }

            File.WriteAllText(
                clusterFolder.pathFor("report.txt", imbSCI.Data.enums.getWritableFileMode.overwrite),
                report.GetContent());
        }
예제 #4
0
        /// <summary>
        /// Builds a cluster collection from the pairwise similarity results.
        /// </summary>
        /// <remarks>
        /// Steps:
        /// (1) the score range over all result pairs is learned;
        /// (2) the first remaining document is repeatedly taken as a seed, and every document whose score position
        ///     in that range exceeds <paramref name="minSimScore"/> joins the seed's cluster; a seed that attracts
        ///     nothing is put into the null cluster;
        /// (3) each null-cluster document is offered to the cluster whose seed it scores highest against;
        /// (4) when <c>settings.ExclusiveClusterMembership</c> is set, a document that ended up in several clusters
        ///     is kept only in the one whose seed it scores highest against;
        /// (5) empty clusters are removed.
        /// </remarks>
        /// <param name="result">The similarity result to clusterize.</param>
        /// <param name="collectionName">Name for the collection.</param>
        /// <param name="scoreSelector">Extracts the score from a result pair; when null, <c>settings.SimilarityScoreSource.GetSelector()</c> is used.</param>
        /// <param name="minSimScore">Threshold compared against the score's position in the learned range; <c>Double.MinValue</c> means "use <c>settings.MinScoreInRangeCriterion</c>".</param>
        /// <returns>The populated cluster collection.</returns>
        public DocumentClusterCollection GetClusters(DocumentSimilarityResult result, String collectionName = "Clusters", Func <DocumentSimilarityResultPair, Double> scoreSelector = null, Double minSimScore = Double.MinValue)
        {
            // Double.MinValue is the "not specified" sentinel of the optional parameter.
            if (minSimScore == Double.MinValue)
            {
                minSimScore = settings.MinScoreInRangeCriterion;
            }

            if (scoreSelector == null)
            {
                scoreSelector = settings.SimilarityScoreSource.GetSelector();
            }

            DocumentClusterCollection output = new DocumentClusterCollection()
            {
                name = collectionName
            };

            var documents = result.GetDocuments();

            // The results are only fed to the range finder here, so they are iterated as-is. The former
            // OrderByDescending(x => scoreSelector) used the delegate itself as the sort key and sorted nothing.
            rangeFinder similarityRange = new rangeFinder();
            foreach (var pair in result.GetAllResults())
            {
                similarityRange.Learn(scoreSelector(pair));
            }

            // Safety limit: each pass removes at least the seed, so more passes than documents should never be needed.
            Int32 limit = documents.Count;
            Int32 i     = 0;

            while (documents.Any())
            {
                i++;
                var doc = documents.FirstOrDefault();
                if (doc == null)
                {
                    break;
                }

                var results = result.GetResultsFor(doc);

                DocumentCluster currentCluster = output.NewCluster <DocumentCluster>();
                currentCluster.ClusterSeed = doc;

                foreach (KeyValuePair <HtmlNode, DocumentSimilarityResultPair> pair in results)
                {
                    Double scoreAtRange = similarityRange.GetPositionInRange(scoreSelector(pair.Value));
                    if (scoreAtRange > minSimScore)
                    {
                        currentCluster.Add(pair.Key, scoreAtRange);
                        documents.Remove(pair.Key);
                    }
                }

                // The seed leaves the pool whether or not it attracted any members.
                documents.Remove(doc);

                if (currentCluster.items.Count == 0)
                {
                    output.NullCluster.Add(doc, 0);
                }
                else
                {
                    currentCluster.items.Add(doc);
                    output.AddCluster(currentCluster);
                }

                if (i > limit)
                {
                    break;
                }
            }

            // Iterate a snapshot: a successful re-assignment removes the item from the null cluster, and
            // mutating the live collection during enumeration would invalidate the enumerator.
            foreach (var item in output.NullCluster.items.ToList())
            {
                var             results         = result.GetResultsFor(item);
                Double          maxScore        = Double.MinValue;
                DocumentCluster selectedCluster = null;

                foreach (var cluster in output.GetClusters <DocumentCluster>(false))
                {
                    Double score = scoreSelector(results[cluster.ClusterSeed]);
                    if (score > maxScore)
                    {
                        maxScore        = score;
                        selectedCluster = cluster;
                    }
                }

                // selectedCluster stays null when there are no clusters to choose from.
                // NOTE(review): the raw score is stored here, whereas the seed loop above stores the
                // position in range - confirm which one the cluster's score dictionary should hold.
                if (selectedCluster != null && similarityRange.GetPositionInRange(maxScore) > minSimScore)
                {
                    selectedCluster.Add(item, maxScore);
                    output.NullCluster.Remove(item);
                }
            }

            if (settings.ExclusiveClusterMembership)
            {
                var itemToCluster = output.GetItemToClusterAssociations <DocumentCluster>();

                foreach (var pair in itemToCluster)
                {
                    if (pair.Value.Count > 1)
                    {
                        Dictionary <HtmlNode, DocumentSimilarityResultPair> results = result.GetResultsFor(pair.Key);
                        Double          maxScore        = Double.MinValue;
                        DocumentCluster selectedCluster = null;

                        // Find the cluster whose seed this item scores highest against...
                        foreach (var cluster in pair.Value)
                        {
                            Double score = scoreSelector(results[cluster.ClusterSeed]);
                            if (score > maxScore)
                            {
                                maxScore        = score;
                                selectedCluster = cluster;
                            }
                        }

                        // ...and drop the item from every other cluster it belongs to.
                        foreach (var cluster in pair.Value)
                        {
                            if (cluster != selectedCluster)
                            {
                                cluster.Remove(pair.Key);
                            }
                        }
                    }
                }
            }

            output.RemoveEmptyClusters();

            return(output);
        }