Exemplo n.º 1
0
        /// <summary>
        /// Builds a <see cref="LeafNodeDictionary"/> and a list of <see cref="LeafNodeDictionaryEntryNGram"/>s for each document, so that later comparisons can reuse them instead of rebuilding them.
        /// </summary>
        /// <param name="documents">The documents to prepare. The sequence is enumerated exactly once and buffered, so lazily evaluated sequences can be passed safely.</param>
        /// <param name="leafSelectXPath">The leaf select x path, leave blank to use from settings, <see cref="DocumentSimilaritySettings.XPathToSelectLeafs"/></param>
        /// <param name="tagsToIgnore">The tags to ignore, leave unspecified to use from settings, <see cref="DocumentSimilaritySettings.TagsToIgnore"/>.</param>
        /// <returns>
        /// A <see cref="DocumentSimilarityResult"/> in which every successfully processed document is registered together with its leaf dictionary and its n-gram list (in both lookup directions).
        /// Documents whose n-gram extraction or registration threw are recorded in <c>DocumentsWithExceptions</c> together with the exception.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="documents"/> is <c>null</c>.</exception>
        public DocumentSimilarityResult Prepare(IEnumerable <HtmlNode> documents, String leafSelectXPath = "", List <String> tagsToIgnore = null)
        {
            if (documents == null)
            {
                throw new ArgumentNullException("documents");
            }

            leafSelectXPath = leafSelectXPath.or(settings.XPathToSelectLeafs, LeafNodeDictionary.DefaultNodeSelectionXPath);
            tagsToIgnore    = tagsToIgnore.or(settings.TagsToIgnore, LeafNodeDictionary.DefaultTagsToIgnore);

            // Buffer the input once: the documents are visited in two separate passes, and a lazily
            // evaluated sequence could otherwise yield different node instances on the second pass,
            // which would make every dictionary lookup below fail.
            List<HtmlNode> documentList = new List<HtmlNode>(documents);

            DocumentSimilarityResult result = new DocumentSimilarityResult();
            frequencyCounter<String> xpathCounter = new frequencyCounter<string>();
            Dictionary<HtmlNode, LeafNodeDictionary> leafDictionary = new Dictionary<HtmlNode, LeafNodeDictionary>(documentList.Count);

            // Pass 1: build the leaf dictionary of every document and count how often each XPath occurs.
            foreach (HtmlNode document in documentList)
            {
                LeafNodeDictionary documentLeafs = new LeafNodeDictionary(document, leafSelectXPath, tagsToIgnore);

                foreach (var entry in documentLeafs.items)
                {
                    xpathCounter.Count(entry.XPath);
                }

                // Dictionary.Add throws if an equal document was already registered.
                leafDictionary.Add(document, documentLeafs);
            }

            // Remove the XPaths the counter reports as most frequent from every dictionary
            // (presumably boilerplate shared by the documents - confirm against frequencyCounter).
            var commonXPaths = xpathCounter.GetItemsWithTopFrequency();

            foreach (var pair in leafDictionary)
            {
                pair.Value.RemoveEntriesByXPath(commonXPaths);
            }

            // Pass 2: build the n-grams from the filtered dictionaries and register everything in the result.
            // A failure for one document is recorded and does not stop the remaining documents.
            foreach (HtmlNode document in documentList)
            {
                try
                {
                    LeafNodeDictionary documentLeafs = leafDictionary[document];

                    List<LeafNodeDictionaryEntryNGram> nGrams = setAnalysisTools<LeafNodeDictionaryEntry>.getNGramSet<LeafNodeDictionaryEntryNGram>(documentLeafs.items, settings.nGramWidth, settings.nGramMode);

                    result.DocumentsByLeafDictionary.Add(documentLeafs, document);
                    result.DocumentsByNGrams.Add(nGrams, document);
                    result.LeafDictionaryByDocuments.Add(document, documentLeafs);
                    result.NGramsByDocuments.Add(document, nGrams);
                }
                catch (Exception ex)
                {
                    result.DocumentsWithExceptions.Add(document, ex);
                }
            }

            return result;
        }
        /// <summary>
        /// Builds <see cref="CompleteGraph"/> from the entries of the given leaf dictionary, using each entry's XPath as its graph path, and runs <see cref="GraphMetrics"/> over the resulting graph.
        /// </summary>
        /// <param name="leafDictionary">The leaf dictionary whose items are turned into graph nodes.</param>
        public void Analyze(LeafNodeDictionary leafDictionary)
        {
            // XPath segments are separated by a slash, so the same separator is used for the graph paths.
            const string separator = "/";

            CompleteGraph = graphTools.BuildGraphFromItems<LeafNodeDictionaryEntry, graphWrapNode<LeafNodeDictionaryEntry>>(leafDictionary.items, item => item.XPath, separator);
            CompleteGraph.pathSeparator = separator;

            GraphMetrics.Process(CompleteGraph);

            foreach (var frequencyBin in GraphMetrics.JunctionCounter.GetFrequencyBins())
            {
                if (frequencyBin.Key >= JunctionSizeMin)
                {
                    // NOTE(review): bins at or above JunctionSizeMin are selected here, but nothing is
                    // done with them yet - this branch appears to be unfinished.
                }
            }
        }
Exemplo n.º 3
0
        /// <summary>
        /// Computes the similarity between two documents. To compare more than two documents, use <see cref="Prepare(IEnumerable{HtmlNode}, string, List{string})"/> together with <see cref="ComputeSimilarity(HtmlNode, HtmlNode, DocumentSimilarityResult)"/>, which reuse the per-document data for better performance.
        /// </summary>
        /// <param name="documentA">The first document of the pair.</param>
        /// <param name="documentB">The second document of the pair.</param>
        /// <returns>A <see cref="DocumentSimilarityResultPair"/> referencing both documents and carrying the computed similarity values.</returns>
        public DocumentSimilarityResultPair ComputeSimilarity(HtmlNode documentA, HtmlNode documentB)
        {
            // Leaf dictionaries are built with the single-argument constructor, i.e. without an explicit XPath or tag filter.
            var leafsOfA = new LeafNodeDictionary(documentA);
            var leafsOfB = new LeafNodeDictionary(documentB);

            // N-gram width and mode come from the current settings.
            List<LeafNodeDictionaryEntryNGram> nGramsOfA = setAnalysisTools<LeafNodeDictionaryEntry>.getNGramSet<LeafNodeDictionaryEntryNGram>(leafsOfA.items, settings.nGramWidth, settings.nGramMode);
            List<LeafNodeDictionaryEntryNGram> nGramsOfB = setAnalysisTools<LeafNodeDictionaryEntry>.getNGramSet<LeafNodeDictionaryEntryNGram>(leafsOfB.items, settings.nGramWidth, settings.nGramMode);

            var pair = new DocumentSimilarityResultPair();
            pair.itemA = documentA;
            pair.itemB = documentB;

            pair.ContentSimilarity = ContentSimilarity.GetSimilarity(nGramsOfA, nGramsOfB, settings.computationMethod);

            // NOTE(review): the structure similarity is computed by the very same call as the content
            // similarity, so both values are always equal. This looks like a copy-paste slip - confirm
            // which calculator was intended here.
            pair.StructureSimilarity = ContentSimilarity.GetSimilarity(nGramsOfA, nGramsOfB, settings.computationMethod);

            return pair;
        }