/// <summary>
/// Collects the documents held in <paramref name="cache"/> whose cached combined title is
/// nearly identical to the lower-cased combined title of <paramref name="pdf_document"/>.
/// </summary>
/// <param name="pdf_document">The document to find duplicates of.</param>
/// <param name="cache">The title cache whose entries are compared against the document.</param>
/// <returns>
/// The matching documents. The list is empty when the document is itself marked as a duplicate,
/// when its title is 128 characters or longer, when its title is the unknown-title placeholder,
/// or when no entry passes both similarity checks.
/// </returns>
public static List <PDFDocument> FindDuplicates(PDFDocument pdf_document, TitleCombinedCache cache)
        {
            List<PDFDocument> matches = new List<PDFDocument>();

            // A document that is itself flagged as a duplicate is never reported as having duplicates
            if (Choices.ReadingStages_DUPLICATE == pdf_document.ReadingStage)
            {
                return matches;
            }

            string wanted_title = pdf_document.TitleCombined.ToLower();

            // Only titles shorter than 128 characters that are not the unknown-title placeholder are compared
            bool title_is_comparable = (wanted_title.Length < 128) && (PDFDocument.TITLE_UNKNOWN != wanted_title);
            if (!title_is_comparable)
            {
                return matches;
            }

            foreach (var candidate in cache.Entries)
            {
                // Skip the document itself, and skip anything already flagged as a duplicate
                if (candidate.pdf_document == pdf_document || Choices.ReadingStages_DUPLICATE == candidate.pdf_document.ReadingStage)
                {
                    continue;
                }

                // Entries carrying the unknown-title placeholder cannot be matched on title
                string candidate_title = candidate.title_combined;
                if (PDFDocument.TITLE_UNKNOWN == candidate_title)
                {
                    continue;
                }

                // Both checks must pass; the second one is only evaluated when the first succeeds
                if (LetterPairSimilarity.CompareStrings(wanted_title, candidate_title) > 0.95)
                {
                    if (StringTools.LewensteinSimilarity(wanted_title, candidate_title) > 0.9)
                    {
                        matches.Add(candidate.pdf_document);
                    }
                }
            }

            return matches;
        }
        /// <summary>
        /// Clears the displayed duplicates, then runs the duplicate search for
        /// <paramref name="pdf_document_this"/> against a title cache built from the documents of its
        /// library, and finally renders whatever was found. Both display updates go through the dispatcher.
        /// </summary>
        /// <param name="pdf_document_this">
        /// The document to search duplicates for. When it is null, or its lower-cased combined title
        /// equals the unknown-title placeholder, the search is skipped after the display was cleared.
        /// </param>
        private void FindDuplicates(PDFDocument pdf_document_this)
        {
            // Reset the duplicates display first, even if the search below ends up being skipped
            this.Dispatcher.Invoke(new Action(() => ClearDuplicates()));

            Stopwatch timer = Stopwatch.StartNew();

            if (null == pdf_document_this)
            {
                Logging.Info("Not doing duplicate detection if we don't have a PDFDocument to work with.");
                return;
            }

            string lowered_title = pdf_document_this.TitleCombined.ToLower();
            if (PDFDocument.TITLE_UNKNOWN == lowered_title)
            {
                Logging.Info("Not doing duplicate detection for an unknown title.");
                return;
            }

            // Build a title cache over the whole library and look for documents matching this one
            TitleCombinedCache library_titles = new TitleCombinedCache(pdf_document_this.Library.PDFDocuments);
            List<PDFDocument> found_duplicates = FindDuplicates(pdf_document_this, library_titles);

            Logging.Info("It took {0}ms to run the duplicate detection.", timer.ElapsedMilliseconds);

            // Hand the result over to the GUI
            this.Dispatcher.Invoke(new Action(() => RenderDuplicates(found_duplicates)));
        }
// Example #3
// 0
        /// <summary>
        /// Clears the displayed duplicates, then runs the duplicate search for
        /// <paramref name="pdf_document_this"/> against a title cache built from the documents of its
        /// library, and finally renders whatever was found. Both display updates are routed through
        /// <c>WPFDoEvents.InvokeInUIThread</c>; on entry the method calls
        /// <c>WPFDoEvents.AssertThisCodeIs_NOT_RunningInTheUIThread</c>.
        /// </summary>
        /// <param name="pdf_document_this">
        /// The document to search duplicates for. When it is null, or its lower-cased combined title
        /// equals the unknown-title placeholder, the search is skipped after the display was cleared.
        /// </param>
        private void FindDuplicates(PDFDocument pdf_document_this)
        {
            WPFDoEvents.AssertThisCodeIs_NOT_RunningInTheUIThread();

            // Reset the duplicates display first, even if the search below ends up being skipped
            WPFDoEvents.InvokeInUIThread(() => ClearDuplicates());

            Stopwatch timer = Stopwatch.StartNew();

            if (null == pdf_document_this)
            {
                Logging.Info("Not doing duplicate detection if we don't have a PDFDocument to work with.");
                return;
            }

            string lowered_title = pdf_document_this.TitleCombined.ToLower();
            if (Constants.TITLE_UNKNOWN == lowered_title)
            {
                Logging.Info("Not doing duplicate detection for an unknown title.");
                return;
            }

            // Build a title cache over the whole library and look for documents matching this one
            TitleCombinedCache library_titles = new TitleCombinedCache(pdf_document_this.LibraryRef.Xlibrary.PDFDocuments);
            List<PDFDocument> found_duplicates = FindDuplicates(pdf_document_this, library_titles);

            Logging.Info("It took {0}ms to run the duplicate detection.", timer.ElapsedMilliseconds);

            // Hand the result over to the GUI
            WPFDoEvents.InvokeInUIThread(() => RenderDuplicates(found_duplicates));
        }