/// <summary>
/// Builds <see cref="LeafNodeDictionary"/> and <see cref="LeafNodeDictionaryEntryNGram"/>s for each document, to allow performance optimization
/// </summary>
/// <remarks>
/// The <paramref name="documents"/> sequence is copied into a list before any processing, so it is enumerated exactly once
/// and both passes work on the same node instances.
/// </remarks>
/// <param name="documents">The documents.</param>
/// <param name="leafSelectXPath">The leaf select x path, leave blank to use from settings, <see cref="DocumentSimilaritySettings.XPathToSelectLeafs"/></param>
/// <param name="tagsToIgnore">The tags to ignore, leave unspecified to use from settings, <see cref="DocumentSimilaritySettings.TagsToIgnore"/>.</param>
/// <returns>
/// Result holding, for each document, its leaf dictionary and its n-gram set (registered in both lookup directions).
/// Documents for which n-gram building or registration threw are listed in <c>DocumentsWithExceptions</c>.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="documents"/> is <c>null</c>.</exception>
public DocumentSimilarityResult Prepare(IEnumerable<HtmlNode> documents, String leafSelectXPath = "", List<String> tagsToIgnore = null)
{
    if (documents == null)
    {
        throw new ArgumentNullException("documents");
    }

    leafSelectXPath = leafSelectXPath.or(settings.XPathToSelectLeafs, LeafNodeDictionary.DefaultNodeSelectionXPath);
    tagsToIgnore = tagsToIgnore.or(settings.TagsToIgnore, LeafNodeDictionary.DefaultTagsToIgnore);

    // Snapshot the input: the method walks the documents twice and the second pass looks up
    // dictionaries by node instance, so a deferred or one-shot sequence must not be re-enumerated.
    List<HtmlNode> documentList = new List<HtmlNode>(documents);

    DocumentSimilarityResult result = new DocumentSimilarityResult();
    frequencyCounter<String> xpathCounter = new frequencyCounter<string>();
    Dictionary<HtmlNode, LeafNodeDictionary> leafDictionary = new Dictionary<HtmlNode, LeafNodeDictionary>();

    // Pass 1: build a leaf dictionary per document and count every leaf XPath across all documents.
    foreach (HtmlNode documentA in documentList)
    {
        LeafNodeDictionary leafNodeDictionaryA = new LeafNodeDictionary(documentA, leafSelectXPath, tagsToIgnore);

        foreach (var entry in leafNodeDictionaryA.items)
        {
            xpathCounter.Count(entry.XPath);
        }

        // Dictionary.Add throws ArgumentException if the same document key is supplied twice.
        leafDictionary.Add(documentA, leafNodeDictionaryA);
    }

    // Drop the top-frequency XPaths from every dictionary.
    // NOTE(review): presumably these are the XPaths shared by most documents (boilerplate) - confirm against frequencyCounter.
    var commonXPaths = xpathCounter.GetItemsWithTopFrequency();
    foreach (var pair in leafDictionary)
    {
        pair.Value.RemoveEntriesByXPath(commonXPaths);
    }

    // Pass 2: build the n-gram set of each filtered dictionary and register everything in the result.
    foreach (HtmlNode documentA in documentList)
    {
        try
        {
            LeafNodeDictionary leafNodeDictionaryA = leafDictionary[documentA];
            List<LeafNodeDictionaryEntryNGram> nGrams_A = setAnalysisTools<LeafNodeDictionaryEntry>.getNGramSet<LeafNodeDictionaryEntryNGram>(leafNodeDictionaryA.items, settings.nGramWidth, settings.nGramMode);

            result.DocumentsByLeafDictionary.Add(leafNodeDictionaryA, documentA);
            result.DocumentsByNGrams.Add(nGrams_A, documentA);
            result.LeafDictionaryByDocuments.Add(documentA, leafNodeDictionaryA);
            result.NGramsByDocuments.Add(documentA, nGrams_A);
        }
        catch (Exception ex)
        {
            // A failure for one document is recorded and does not abort the remaining documents.
            result.DocumentsWithExceptions.Add(documentA, ex);
        }
    }

    return result;
}
/// <summary>
/// Rebuilds <c>CompleteGraph</c> from the entries of the given leaf dictionary and runs <c>GraphMetrics</c> over it.
/// </summary>
/// <param name="leafDictionary">Leaf dictionary whose <c>items</c> are passed to the graph builder, using each entry's <c>XPath</c> as its path and "/" as the path separator.</param>
public void Analyze(LeafNodeDictionary leafDictionary)
{
    const string separator = "/";

    CompleteGraph = graphTools.BuildGraphFromItems<LeafNodeDictionaryEntry, graphWrapNode<LeafNodeDictionaryEntry>>(
        leafDictionary.items,
        entry => entry.XPath,
        separator);
    CompleteGraph.pathSeparator = separator;

    GraphMetrics.Process(CompleteGraph);

    var junctionBins = GraphMetrics.JunctionCounter.GetFrequencyBins();
    foreach (var junctionBin in junctionBins)
    {
        if (!(junctionBin.Key >= JunctionSizeMin))
        {
            continue;
        }

        // NOTE(review): bins at or above JunctionSizeMin are selected here but nothing is done with them yet.
    }
}
/// <summary>
/// Compares two documents directly: builds a leaf dictionary and an n-gram set for each of them and scores the two n-gram sets.
/// When more than two documents have to be compared, use <see cref="Prepare(IEnumerable{HtmlNode}, string, List{string})"/> together with
/// <see cref="ComputeSimilarity(HtmlNode, HtmlNode, DocumentSimilarityResult)"/> for better performance.
/// </summary>
/// <param name="documentA">First document of the pair.</param>
/// <param name="documentB">Second document of the pair.</param>
/// <returns>Pair result referencing both documents, with <c>ContentSimilarity</c> and <c>StructureSimilarity</c> set.</returns>
public DocumentSimilarityResultPair ComputeSimilarity(HtmlNode documentA, HtmlNode documentB)
{
    var dictionaryOfA = new LeafNodeDictionary(documentA);
    var dictionaryOfB = new LeafNodeDictionary(documentB);

    List<LeafNodeDictionaryEntryNGram> ngramsOfA = setAnalysisTools<LeafNodeDictionaryEntry>.getNGramSet<LeafNodeDictionaryEntryNGram>(
        dictionaryOfA.items, settings.nGramWidth, settings.nGramMode);
    List<LeafNodeDictionaryEntryNGram> ngramsOfB = setAnalysisTools<LeafNodeDictionaryEntry>.getNGramSet<LeafNodeDictionaryEntryNGram>(
        dictionaryOfB.items, settings.nGramWidth, settings.nGramMode);

    DocumentSimilarityResultPair pairResult = new DocumentSimilarityResultPair();
    pairResult.itemA = documentA;
    pairResult.itemB = documentB;

    pairResult.ContentSimilarity = ContentSimilarity.GetSimilarity(ngramsOfA, ngramsOfB, settings.computationMethod);

    // NOTE(review): the structure score is computed by the very same call as the content score,
    // so both values come from identical inputs - confirm whether a structure-specific computation was intended.
    pairResult.StructureSimilarity = ContentSimilarity.GetSimilarity(ngramsOfA, ngramsOfB, settings.computationMethod);

    return pairResult;
}