/// <summary>
/// Collects the documents in <paramref name="cache"/> whose combined title is nearly
/// identical to the lower-cased combined title of <paramref name="pdf_document"/>.
/// </summary>
/// <param name="pdf_document">The document to find duplicates of.</param>
/// <param name="cache">Provides the candidate entries (a document plus its combined title) to compare against.</param>
/// <returns>
/// The matching documents. The list is empty when <paramref name="pdf_document"/> is itself
/// marked as a duplicate, when its title is 128 characters or longer, when its title equals
/// <c>PDFDocument.TITLE_UNKNOWN</c>, or when no candidate is similar enough.
/// </returns>
public static List<PDFDocument> FindDuplicates(PDFDocument pdf_document, TitleCombinedCache cache)
{
    List<PDFDocument> matches = new List<PDFDocument>();

    // A document that is already marked as a duplicate is never reported as having duplicates.
    if (pdf_document.ReadingStage == Choices.ReadingStages_DUPLICATE)
    {
        return matches;
    }

    string wanted_title = pdf_document.TitleCombined.ToLower();

    // Titles of 128+ characters and the unknown-title placeholder are not compared at all.
    if (wanted_title.Length >= 128 || wanted_title == PDFDocument.TITLE_UNKNOWN)
    {
        return matches;
    }

    foreach (var candidate in cache.Entries)
    {
        // Never match the document against itself.
        if (candidate.pdf_document == pdf_document)
        {
            continue;
        }

        // Never match against documents that are themselves marked as duplicates.
        if (candidate.pdf_document.ReadingStage == Choices.ReadingStages_DUPLICATE)
        {
            continue;
        }

        // NOTE(review): the cached title is used as-is while ours is lower-cased;
        // presumably the cache stores lower-cased titles -- confirm.
        string candidate_title = candidate.title_combined;
        if (candidate_title == PDFDocument.TITLE_UNKNOWN)
        {
            continue;
        }

        // Both similarity measures must clear their threshold; the second one is only
        // evaluated when the first has already passed.
        if (LetterPairSimilarity.CompareStrings(wanted_title, candidate_title) > 0.95)
        {
            if (StringTools.LewensteinSimilarity(wanted_title, candidate_title) > 0.9)
            {
                matches.Add(candidate.pdf_document);
            }
        }
    }

    return matches;
}
/// <summary>
/// Clears the currently rendered duplicates, then searches the library of
/// <paramref name="pdf_document_this"/> for documents with a near-identical title
/// and renders whatever was found. Both GUI updates go through <c>this.Dispatcher</c>.
/// </summary>
/// <param name="pdf_document_this">
/// The document to find duplicates of. When it is <c>null</c>, or its lower-cased combined
/// title equals <c>PDFDocument.TITLE_UNKNOWN</c>, the method logs a message and returns
/// after clearing the display.
/// </param>
private void FindDuplicates(PDFDocument pdf_document_this)
{
    // Reset the duplicates display via the dispatcher before anything else happens.
    Action clear_display = () => ClearDuplicates();
    this.Dispatcher.Invoke(clear_display);

    Stopwatch timer = Stopwatch.StartNew();

    if (pdf_document_this == null)
    {
        Logging.Info("Not doing duplicate detection if we don't have a PDFDocument to work with.");
        return;
    }

    string lowered_title = pdf_document_this.TitleCombined.ToLower();
    if (lowered_title == PDFDocument.TITLE_UNKNOWN)
    {
        Logging.Info("Not doing duplicate detection for an unknown title.");
        return;
    }

    // Build a title cache over every document in this document's library and search it.
    TitleCombinedCache title_cache = new TitleCombinedCache(pdf_document_this.Library.PDFDocuments);
    List<PDFDocument> found = FindDuplicates(pdf_document_this, title_cache);

    Logging.Info("It took {0}ms to run the duplicate detection.", timer.ElapsedMilliseconds);

    // Hand the results back to the GUI via the dispatcher.
    Action show_results = () => RenderDuplicates(found);
    this.Dispatcher.Invoke(show_results);
}
/// <summary>
/// Clears the currently rendered duplicates, then searches the library of
/// <paramref name="pdf_document_this"/> for documents with a near-identical title
/// and renders whatever was found. Both GUI updates are marshalled through
/// <c>WPFDoEvents.InvokeInUIThread</c>.
/// </summary>
/// <param name="pdf_document_this">
/// The document to find duplicates of. When it is <c>null</c>, or its lower-cased combined
/// title equals <c>Constants.TITLE_UNKNOWN</c>, the method logs a message and returns
/// after clearing the display.
/// </param>
private void FindDuplicates(PDFDocument pdf_document_this)
{
    // NOTE(review): judging by the helper's name this method is expected to be called
    // off the UI thread -- confirm against WPFDoEvents.
    WPFDoEvents.AssertThisCodeIs_NOT_RunningInTheUIThread();

    // Reset the duplicates display before anything else happens.
    WPFDoEvents.InvokeInUIThread(() =>
    {
        ClearDuplicates();
    });

    Stopwatch timer = Stopwatch.StartNew();

    if (pdf_document_this == null)
    {
        Logging.Info("Not doing duplicate detection if we don't have a PDFDocument to work with.");
        return;
    }

    string lowered_title = pdf_document_this.TitleCombined.ToLower();
    if (lowered_title == Constants.TITLE_UNKNOWN)
    {
        Logging.Info("Not doing duplicate detection for an unknown title.");
        return;
    }

    // Build a title cache over every document in this document's library and search it.
    TitleCombinedCache title_cache = new TitleCombinedCache(pdf_document_this.LibraryRef.Xlibrary.PDFDocuments);
    List<PDFDocument> found = FindDuplicates(pdf_document_this, title_cache);

    Logging.Info("It took {0}ms to run the duplicate detection.", timer.ElapsedMilliseconds);

    // Hand the results back to the GUI.
    WPFDoEvents.InvokeInUIThread(() =>
    {
        RenderDuplicates(found);
    });
}