/// <summary>
/// Computes pairwise similarity for documents registered in the result object
/// </summary>
/// <remarks>
/// One <see cref="ComputeSimilarityTask"/> is created for every unordered pair of documents. The task list is
/// split with <c>SplitBySize</c>, using one fifth of the task count (but never less than 1) as the size argument.
/// Each chunk is run through <c>Parallel.ForEach</c>; after a chunk finishes, every task whose <c>output</c>
/// is not <c>null</c> is added to <paramref name="result"/>.
/// </remarks>
/// <param name="result">The result object, previously created with <see cref="Prepare(IEnumerable{HtmlNode}, string, List{string})" /></param>
/// <param name="output">Receives one progress line per chunk. When <c>null</c>, progress reporting is skipped.</param>
/// <param name="documents">Optional: subset of documents to be analysed. When null or empty, all keys of <c>result.LeafDictionaryByDocuments</c> are used. Each document is looked up in <c>result.NGramsByDocuments</c>.</param>
/// <returns>
/// The same result object specified in parameters
/// </returns>
public DocumentSimilarityResult ComputeSimilarity(DocumentSimilarityResult result, ITextRender output, List<HtmlNode> documents = null)
{
    if (documents.isNullOrEmpty())
    {
        documents = result.LeafDictionaryByDocuments.Keys.ToList();
    }

    // Build one task per unordered pair (i < y), so no pair is computed twice.
    List<ComputeSimilarityTask> tasks = new List<ComputeSimilarityTask>();

    for (int i = 0; i < documents.Count - 1; i++)
    {
        for (int y = i + 1; y < documents.Count; y++)
        {
            ComputeSimilarityTask task = new ComputeSimilarityTask()
            {
                documentA = documents[i],
                documentB = documents[y],
                nGrams_A = result.NGramsByDocuments[documents[i]],
                nGrams_B = result.NGramsByDocuments[documents[y]]
            };

            tasks.Add(task);
        }
    }

    // Fewer than two documents: there is nothing to compare.
    if (tasks.Count == 0)
    {
        return result;
    }

    // Integer division yields 0 for fewer than five tasks; never pass a zero size to SplitBySize.
    int chunkSize = tasks.Count / 5;
    if (chunkSize < 1)
    {
        chunkSize = 1;
    }

    var task_chunks = tasks.SplitBySize(chunkSize);

    // Running position of the current chunk (0-based, same number the message reported before).
    int chunkIndex = 0;

    foreach (var task_chunk in task_chunks)
    {
        if (output != null)
        {
            output.AppendLine("Executing similarity computation task chunk [size:" + task_chunk.Count + "] " + chunkIndex + " of " + task_chunks.Count);
        }
        chunkIndex++;

        Parallel.ForEach<ComputeSimilarityTask>(task_chunk, x =>
        {
            ComputeSimilarity(x);
        });

        // Results are collected only after the parallel loop has completed, on the calling thread.
        foreach (var task in task_chunk)
        {
            if (task.output != null)
            {
                result.AddResult(task.output);
            }
        }
    }

    return result;
}
/// <summary>
/// Builds <see cref="LeafNodeDictionary"/> and <see cref="LeafNodeDictionaryEntryNGram"/>s for each document, to allow performance optimization
/// </summary>
/// <remarks>
/// Works in three passes: (1) a <see cref="LeafNodeDictionary"/> is built per document and the XPath of every
/// entry is counted; (2) the XPaths returned by <c>GetItemsWithTopFrequency()</c> are removed from every
/// dictionary; (3) n-grams are built per document and registered in the result. An exception thrown for a
/// document during the third pass is stored in <c>DocumentsWithExceptions</c> instead of being propagated.
/// </remarks>
/// <param name="documents">The documents. The sequence is enumerated exactly once.</param>
/// <param name="leafSelectXPath">The leaf select x path, leave blank to use from settings, <see cref="DocumentSimilaritySettings.XPathToSelectLeafs"/></param>
/// <param name="tagsToIgnore">The tags to ignore, leave unspecified to use from settings, <see cref="DocumentSimilaritySettings.TagsToIgnore"/>.</param>
/// <returns>Result object populated with leaf dictionaries and n-grams for each document</returns>
public DocumentSimilarityResult Prepare(IEnumerable<HtmlNode> documents, String leafSelectXPath = "", List<String> tagsToIgnore = null)
{
    leafSelectXPath = leafSelectXPath.or(settings.XPathToSelectLeafs, LeafNodeDictionary.DefaultNodeSelectionXPath);
    tagsToIgnore = tagsToIgnore.or(settings.TagsToIgnore, LeafNodeDictionary.DefaultTagsToIgnore);

    // Materialize once: the documents are walked twice below and used as dictionary keys, so a deferred
    // sequence that re-creates its nodes on each enumeration would miss every lookup in the second pass.
    List<HtmlNode> documentList = new List<HtmlNode>(documents);

    DocumentSimilarityResult result = new DocumentSimilarityResult();

    frequencyCounter<String> xpathCounter = new frequencyCounter<string>();

    Dictionary<HtmlNode, LeafNodeDictionary> leafDictionary = new Dictionary<HtmlNode, LeafNodeDictionary>();

    // Pass 1: build the leaf dictionary of each document and count XPath occurrences across all documents.
    foreach (HtmlNode documentA in documentList)
    {
        LeafNodeDictionary leafNodeDictionaryA = new LeafNodeDictionary(documentA, leafSelectXPath, tagsToIgnore);

        foreach (var entry in leafNodeDictionaryA.items)
        {
            xpathCounter.Count(entry.XPath);
        }

        leafDictionary.Add(documentA, leafNodeDictionaryA);
    }

    // Pass 2: drop the entries whose XPath is among the top-frequency ones.
    var commonXPaths = xpathCounter.GetItemsWithTopFrequency();

    foreach (var pair in leafDictionary)
    {
        pair.Value.RemoveEntriesByXPath(commonXPaths);
    }

    // Pass 3: build n-grams and register both lookup directions in the result.
    foreach (HtmlNode documentA in documentList)
    {
        try
        {
            LeafNodeDictionary leafNodeDictionaryA = leafDictionary[documentA];

            List<LeafNodeDictionaryEntryNGram> nGrams_A = setAnalysisTools<LeafNodeDictionaryEntry>.getNGramSet<LeafNodeDictionaryEntryNGram>(leafNodeDictionaryA.items, settings.nGramWidth, settings.nGramMode);

            result.DocumentsByLeafDictionary.Add(leafNodeDictionaryA, documentA);
            result.DocumentsByNGrams.Add(nGrams_A, documentA);
            result.LeafDictionaryByDocuments.Add(documentA, leafNodeDictionaryA);
            result.NGramsByDocuments.Add(documentA, nGrams_A);
        }
        catch (Exception ex)
        {
            // Keep going with the remaining documents; the failure is recorded for the caller to inspect.
            result.DocumentsWithExceptions.Add(documentA, ex);
        }
    }

    return result;
}
/// <summary>
/// Scores one pair of documents using the n-grams already stored in the result object
/// </summary>
/// <param name="documentA">First document of the pair; used as key into <c>result.NGramsByDocuments</c>.</param>
/// <param name="documentB">Second document of the pair; used as key into <c>result.NGramsByDocuments</c>.</param>
/// <param name="result">The result object, previously created with <see cref="Prepare(IEnumerable{HtmlNode}, string, List{string})"/></param>
/// <returns>New pair result holding both documents with their structure and content similarity scores</returns>
public DocumentSimilarityResultPair ComputeSimilarity(HtmlNode documentA, HtmlNode documentB, DocumentSimilarityResult result)
{
    // The n-grams are looked up from the prepared result rather than rebuilt here.
    List<LeafNodeDictionaryEntryNGram> ngramsOfA = result.NGramsByDocuments[documentA];
    List<LeafNodeDictionaryEntryNGram> ngramsOfB = result.NGramsByDocuments[documentB];

    // Structure is scored first, content second; both use the configured computation method.
    var structureScore = StructureSimilarity.GetSimilarity(ngramsOfA, ngramsOfB, settings.computationMethod);
    var contentScore = ContentSimilarity.GetSimilarity(ngramsOfA, ngramsOfB, settings.computationMethod);

    DocumentSimilarityResultPair pairResult = new DocumentSimilarityResultPair();
    pairResult.itemA = documentA;
    pairResult.itemB = documentB;
    pairResult.StructureSimilarity = structureScore;
    pairResult.ContentSimilarity = contentScore;

    return pairResult;
}