Ejemplo n.º 1
0
        /// <summary>
        /// Builds one <see cref="HtmlSourceAndUrlCollection"/> per document cluster, filled with the
        /// source entries that <paramref name="documentNodeDictionary"/> associates with the cluster's documents.
        /// </summary>
        /// <param name="clusters">Cluster collection whose <c>DocumentCluster</c> entries are converted.</param>
        /// <param name="documentNodeDictionary">Lookup from a document node to its source entry. Every document of every cluster must be present as a key; the indexer throws otherwise.</param>
        /// <returns>A list with one source collection per cluster, each named after its cluster.</returns>
        public List <HtmlSourceAndUrlCollection> ConvertToSourceCollections(DocumentClusterCollection clusters, Dictionary <HtmlNode, HtmlSourceAndUrl> documentNodeDictionary)
        {
            var collections = new List<HtmlSourceAndUrlCollection>();

            foreach (var documentCluster in clusters.GetClusters<DocumentCluster>(false))
            {
                // The source collection carries the same name as the cluster it mirrors.
                var sources = new HtmlSourceAndUrlCollection
                {
                    name = documentCluster.name
                };

                foreach (var node in documentCluster.items)
                {
                    HtmlSourceAndUrl source = documentNodeDictionary[node];
                    sources.items.Add(source);
                }

                collections.Add(sources);
            }

            return collections;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Groups the documents of a similarity result into a cluster collection.
        /// </summary>
        /// <remarks>
        /// The method works in these steps:
        /// <list type="number">
        /// <item><description>The score range is learned from the scores of all result pairs.</description></item>
        /// <item><description>Each not-yet-assigned document is taken as a cluster seed; every document whose score, expressed as position in the learned range, exceeds <paramref name="minSimScore"/> joins the seed's cluster. A seed that attracts no document is placed in the null cluster.</description></item>
        /// <item><description>Each null-cluster document is compared with the seeds of the clusters in the output; if its best score passes the threshold it is moved to that cluster.</description></item>
        /// <item><description>If <c>settings.ExclusiveClusterMembership</c> is set, a document that belongs to several clusters is kept only in the cluster whose seed gives it the highest score.</description></item>
        /// <item><description>Empty clusters are removed from the output.</description></item>
        /// </list>
        /// </remarks>
        /// <param name="result">Similarity result providing the documents and their pairwise results.</param>
        /// <param name="collectionName">Name assigned to the returned collection.</param>
        /// <param name="scoreSelector">Extracts the score from a result pair. When <c>null</c>, the selector from <c>settings.SimilarityScoreSource</c> is used.</param>
        /// <param name="minSimScore">Threshold compared against the score's position in the learned range. When left at <see cref="Double.MinValue"/>, <c>settings.MinScoreInRangeCriterion</c> is used.</param>
        /// <returns>The populated cluster collection.</returns>
        public DocumentClusterCollection GetClusters(DocumentSimilarityResult result, String collectionName = "Clusters", Func <DocumentSimilarityResultPair, Double> scoreSelector = null, Double minSimScore = Double.MinValue)
        {
            // Double.MinValue is the "not specified" sentinel of the optional parameter.
            if (minSimScore == Double.MinValue)
            {
                minSimScore = settings.MinScoreInRangeCriterion;
            }

            if (scoreSelector == null)
            {
                scoreSelector = settings.SimilarityScoreSource.GetSelector();
            }

            DocumentClusterCollection output = new DocumentClusterCollection()
            {
                name = collectionName
            };

            var documents = result.GetDocuments();

            // Order by the selected score itself; the key must be the selector's value, not the delegate.
            var sortedResults = result.GetAllResults().OrderByDescending(x => scoreSelector(x)).ToList();

            rangeFinder similarityRange = new rangeFinder();

            foreach (var pair in sortedResults)
            {
                similarityRange.Learn(scoreSelector(pair));
            }

            // Safety valve: every pass removes the seed from the pool, so the loop
            // should never need more passes than there were documents at the start.
            Int32 limit = documents.Count;
            Int32 i     = 0;

            while (documents.Any())
            {
                i++;
                var doc = documents.FirstOrDefault();
                if (doc == null)
                {
                    break;
                }

                var results = result.GetResultsFor(doc);

                DocumentCluster currentCluster = output.NewCluster <DocumentCluster>();
                currentCluster.ClusterSeed = doc;

                foreach (KeyValuePair <HtmlNode, DocumentSimilarityResultPair> pair in results)
                {
                    Double scoreAtRange = similarityRange.GetPositionInRange(scoreSelector(pair.Value));
                    if (scoreAtRange > minSimScore)
                    {
                        currentCluster.Add(pair.Key, scoreAtRange);
                        documents.Remove(pair.Key);
                    }
                }

                // The seed leaves the pool whether or not it attracted other documents.
                documents.Remove(doc);

                if (currentCluster.items.Count == 0)
                {
                    output.NullCluster.Add(doc, 0);
                }
                else
                {
                    currentCluster.items.Add(doc);
                    output.AddCluster(currentCluster);
                }

                if (i > limit)
                {
                    break;
                }
            }

            // Iterate over a snapshot: NullCluster.Remove below changes the null cluster
            // while it is being walked, which a live enumerator does not tolerate.
            foreach (var item in output.NullCluster.items.ToList())
            {
                var             results         = result.GetResultsFor(item);
                Double          maxScore        = Double.MinValue;
                DocumentCluster selectedCluster = null;

                foreach (var cluster in output.GetClusters <DocumentCluster>(false))
                {
                    Double score = scoreSelector(results[cluster.ClusterSeed]);
                    if (score > maxScore)
                    {
                        maxScore        = score;
                        selectedCluster = cluster;
                    }
                }

                // selectedCluster stays null when there is no cluster to compare against
                // (or no score beat the initial value); the item then remains in the null cluster.
                if (selectedCluster != null && similarityRange.GetPositionInRange(maxScore) > minSimScore)
                {
                    selectedCluster.Add(item, maxScore);
                    output.NullCluster.Remove(item);
                }
            }

            if (settings.ExclusiveClusterMembership)
            {
                var itemToCluster = output.GetItemToClusterAssociations <DocumentCluster>();

                foreach (var pair in itemToCluster)
                {
                    if (pair.Value.Count > 1)
                    {
                        Dictionary <HtmlNode, DocumentSimilarityResultPair> results = result.GetResultsFor(pair.Key);
                        Double          maxScore        = Double.MinValue;
                        DocumentCluster selectedCluster = null;

                        // Find the cluster whose seed scores highest for this item...
                        foreach (var cluster in pair.Value)
                        {
                            Double score = scoreSelector(results[cluster.ClusterSeed]);
                            if (score > maxScore)
                            {
                                maxScore        = score;
                                selectedCluster = cluster;
                            }
                        }

                        // ...and drop the item from every other cluster it was assigned to.
                        foreach (var cluster in pair.Value)
                        {
                            if (cluster != selectedCluster)
                            {
                                cluster.Remove(pair.Key);
                            }
                        }
                    }
                }
            }

            output.RemoveEmptyClusters();

            return output;
        }