/// <summary>
/// Sets up a renderer for a single PDF file: remembers the file name and both passwords, settles on the
/// document fingerprint and creates the file layer and the Sorax renderer that operate on that file.
/// </summary>
/// <param name="precomputed_document_fingerprint">Fingerprint to use for the document; when null the fingerprint is computed from the file.</param>
/// <param name="pdf_filename">Path of the PDF file.</param>
/// <param name="pdf_user_password">User password handed on to the Sorax renderer.</param>
/// <param name="pdf_owner_password">Owner password handed on to the Sorax renderer.</param>
public PDFRenderer(string precomputed_document_fingerprint, string pdf_filename, string pdf_user_password, string pdf_owner_password)
        {
            this.pdf_filename = pdf_filename;
            this.pdf_user_password = pdf_user_password;
            this.pdf_owner_password = pdf_owner_password;

            // Only hash the file when the caller did not hand us a fingerprint.
            if (precomputed_document_fingerprint != null)
            {
                this.document_fingerprint = precomputed_document_fingerprint;
            }
            else
            {
                this.document_fingerprint = StreamFingerprint.FromFile(this.pdf_filename);
            }

            // The file layer is keyed on the fingerprint, the Sorax renderer on the file and its passwords.
            pdf_render_file_layer = new PDFRendererFileLayer(this.document_fingerprint, pdf_filename);
            sorax_pdf_renderer = new SoraxPDFRenderer(pdf_filename, pdf_user_password, pdf_owner_password);
        }
Esempio n. 2
0
        /// <summary>
        /// Creates a <see cref="PDFDocument"/> for the given PDF file, copies the file into the document repository and
        /// either queues the fresh document for storage or, when a metadata record with the same fingerprint already
        /// exists in the library database, returns the document reloaded from that record.
        /// </summary>
        /// <param name="web_library_detail">Library the document is created in.</param>
        /// <param name="filename">Path of the PDF file to add.</param>
        /// <param name="precalculated_fingerprint__can_be_null">Fingerprint of the file; when null or empty it is computed from the file.</param>
        /// <returns>The newly created document, or the document loaded from the existing metadata record.</returns>
        public static PDFDocument CreateFromPDF(WebLibraryDetail web_library_detail, string filename, string precalculated_fingerprint__can_be_null)
        {
            // Hash the file only when the caller did not supply a usable fingerprint.
            string fingerprint = String.IsNullOrEmpty(precalculated_fingerprint__can_be_null)
                ? StreamFingerprint.FromFile(filename)
                : precalculated_fingerprint__can_be_null;

            PDFDocument pdf_document = new PDFDocument(web_library_detail);

            // Fill in the essential fields.
            //
            // Unsynchronized access is fine here: the document was created a moment ago and
            // nobody else can hold a reference to it yet.
            pdf_document.FileType = Path.GetExtension(filename).TrimStart('.');
            pdf_document.Fingerprint = fingerprint;
            pdf_document.DateAddedToDatabase = DateTime.UtcNow;
            pdf_document.DateLastModified = DateTime.UtcNow;

            Directory.CreateDirectory(pdf_document.DocumentBasePath);
            pdf_document.StoreAssociatedPDFInRepository(filename);

            // Look for a metadata record that already exists for this fingerprint.
            List<LibraryDB.LibraryItem> existing_records = web_library_detail.Xlibrary.LibraryDB.GetLibraryItems(
                PDFDocumentFileLocations.METADATA,
                new List<string>() { pdf_document.Fingerprint });

            ASSERT.Test(existing_records.Count < 2);

            if (existing_records.Count == 0)
            {
                // Brand new document: persist it and we are done.
                pdf_document.QueueToStorage();
                return pdf_document;
            }

            LibraryDB.LibraryItem existing_record = null;
            try
            {
                existing_record = existing_records[0];
                pdf_document = LoadFromMetaData(web_library_detail, pdf_document.Fingerprint, existing_record.data);
            }
            catch (Exception ex)
            {
                // Dump the unreadable record into the log so it can still be recovered later.
                Logging.Error(ex, "There was a problem reloading an existing PDF from existing metadata, so overwriting it! (document fingerprint: {0}, data: {1})", pdf_document.Fingerprint, existing_record?.MetadataAsString() ?? "???");

                // TODO: WARNING: overwriting an old (possibly corrupted) record like this can lose old/corrupted/unsupported metadata content!
                pdf_document.QueueToStorage();
                //pdf_document.SaveToMetaData();
            }

            return pdf_document;
        }
Esempio n. 3
0
        /// <summary>
        /// Creates a <see cref="PDFDocument"/> (with its own fresh lock object) for the given PDF file, copies the file
        /// into the document repository and either queues the new document for storage or, when the library database
        /// already holds a metadata record for the fingerprint, returns the document reloaded from that record.
        /// </summary>
        /// <param name="library">Library the document is created in.</param>
        /// <param name="filename">Path of the PDF file to add.</param>
        /// <param name="precalculated_fingerprint__can_be_null">Fingerprint of the file; when null or empty it is computed from the file.</param>
        /// <returns>The newly created document, or the document loaded from the existing metadata record.</returns>
        public static PDFDocument CreateFromPDF(Library library, string filename, string precalculated_fingerprint__can_be_null)
        {
            // Hash the file only when the caller did not supply a usable fingerprint.
            string fingerprint = String.IsNullOrEmpty(precalculated_fingerprint__can_be_null)
                ? StreamFingerprint.FromFile(filename)
                : precalculated_fingerprint__can_be_null;

            LockObject document_lock = new LockObject();
            PDFDocument pdf_document = new PDFDocument(document_lock, library);

            // Fill in the essential fields.
            //
            // Unsynchronized access is fine here: the document was created a moment ago and
            // nobody else can hold a reference to it yet.
            pdf_document.doc.FileType = Path.GetExtension(filename).TrimStart('.');
            pdf_document.doc.Fingerprint = fingerprint;
            pdf_document.doc.DateAddedToDatabase = DateTime.UtcNow;
            pdf_document.doc.DateLastModified = DateTime.UtcNow;

            Directory.CreateDirectory(pdf_document.DocumentBasePath);
            pdf_document.doc.StoreAssociatedPDFInRepository(filename);

            // Look for a metadata record that already exists for this fingerprint.
            List<LibraryDB.LibraryItem> existing_records = library.LibraryDB.GetLibraryItems(pdf_document.doc.Fingerprint, PDFDocumentFileLocations.METADATA);

            if (existing_records.Count == 0)
            {
                // Brand new document: persist it and we are done.
                pdf_document.QueueToStorage();
                return pdf_document;
            }

            try
            {
                LibraryDB.LibraryItem existing_record = existing_records[0];
                pdf_document = LoadFromMetaData(library, existing_record.data, null);
            }
            catch (Exception ex)
            {
                Logging.Error(ex, "There was a problem reloading an existing PDF from existing metadata, so overwriting it!");

                // TODO: WARNING: overwriting an old (possibly corrupted) record like this can lose old/corrupted/unsupported metadata content!
                pdf_document.QueueToStorage();
                //pdf_document.SaveToMetaData();
            }

            return pdf_document;
        }
 /// <summary>
 /// Builds a short hash for a piece of cluster text: one marker character for the format
 /// ("0" for note format, "1" otherwise) followed by the first 8 characters of the text's fingerprint.
 /// </summary>
 /// <param name="is_note_format">Selects the leading marker character.</param>
 /// <param name="text_for_cluster">Text to fingerprint.</param>
 /// <returns>The marker plus the 8-character fingerprint prefix, or null when the text is null or empty.</returns>
 private static string GenerateRTFHash(bool is_note_format, string text_for_cluster)
 {
     // No text, no hash.
     if (String.IsNullOrEmpty(text_for_cluster))
     {
         return null;
     }

     string format_marker = is_note_format ? "0" : "1";
     string fingerprint_prefix = StreamFingerprint.FromText(text_for_cluster).Substring(0, 8);

     return format_marker + fingerprint_prefix;
 }
Esempio n. 5
0
        /// <summary>
        /// Creates a <see cref="PDFDocument"/> for the given PDF file, copies the file into the document repository and
        /// either queues the new document with the <c>DocumentQueuedStorer</c> or, when the library database already
        /// holds a metadata record for the fingerprint, returns the document reloaded from that record.
        /// </summary>
        /// <param name="library">Library the document is created in.</param>
        /// <param name="filename">Path of the PDF file to add.</param>
        /// <param name="precalculated_fingerprint__can_be_null">Fingerprint of the file; when null or empty it is computed from the file.</param>
        /// <returns>The newly created document, or the document loaded from the existing metadata record.</returns>
        public static PDFDocument CreateFromPDF(Library library, string filename, string precalculated_fingerprint__can_be_null)
        {
            // Hash the file only when the caller did not supply a usable fingerprint.
            string fingerprint = String.IsNullOrEmpty(precalculated_fingerprint__can_be_null)
                ? StreamFingerprint.FromFile(filename)
                : precalculated_fingerprint__can_be_null;

            PDFDocument pdf_document = new PDFDocument(library);

            // Fill in the essential fields.
            pdf_document.FileType = Path.GetExtension(filename).TrimStart('.');
            pdf_document.Fingerprint = fingerprint;
            pdf_document.DateAddedToDatabase = DateTime.UtcNow;
            pdf_document.DateLastModified = DateTime.UtcNow;

            Directory.CreateDirectory(pdf_document.DocumentBasePath);
            pdf_document.StoreAssociatedPDFInRepository(filename);

            // Look for a metadata record that already exists for this fingerprint.
            List<LibraryDB.LibraryItem> existing_records = library.LibraryDB.GetLibraryItems(pdf_document.Fingerprint, PDFDocumentFileLocations.METADATA);

            if (existing_records.Count == 0)
            {
                // Brand new document: persist it and we are done.
                DocumentQueuedStorer.Instance.Queue(pdf_document);
                return pdf_document;
            }

            try
            {
                LibraryDB.LibraryItem existing_record = existing_records[0];
                pdf_document = LoadFromMetaData(library, existing_record.data, null);
            }
            catch (Exception ex)
            {
                // The stored record could not be read back: keep the fresh document and let it replace the record.
                Logging.Error(ex, "There was a problem reloading an existing PDF from existing metadata, so overwriting it!");
                DocumentQueuedStorer.Instance.Queue(pdf_document);
                //pdf_document.SaveToMetaData();
            }

            return pdf_document;
        }
        /// <summary>
        /// Asks a BibTeX search server for entries matching the given title and returns the raw response text.
        /// </summary>
        /// <remarks>
        /// The title is sent Base64-encoded in the <c>qe</c> query parameter, together with an <c>auth</c>
        /// value derived from the title. Latency and failures are reported to the server manager.
        /// </remarks>
        /// <param name="title">Title to search for.</param>
        /// <returns>The response body decoded as UTF-8 (handled as JSON by this method), or null when the request or its preparation failed.</returns>
        private static string DoSearch(string title)
        {
            try
            {
                string title_encoded = Convert.ToBase64String(Encoding.UTF8.GetBytes(title));

                string auth = title;
                if (0 < auth.Length)
                {
                    // construct key for bibtexsearch.com authentication hash:
                    auth = auth[0] + auth + auth[0];
                }
                auth = StreamFingerprint.FromText(auth);

                string url_server = bibtex_search_server_manager.GetServerUrl();

                // Base64 text may contain '+', '/' and '='. HTML-encoding leaves all three untouched, and a bare '+'
                // in a query string is decoded as a space by the receiver, so the value must be URL-encoded instead.
                string url = String.Format("{0}/search?auth={1}&qe={2}", url_server, auth, WebUtility.UrlEncode(title_encoded));
                try
                {
                    WebHeaderCollection header_collection;
                    Stopwatch           clk = Stopwatch.StartNew();

                    using (MemoryStream ms = UrlDownloader.DownloadWithBlocking(url, out header_collection))
                    {
                        bibtex_search_server_manager.ReportLatency(url_server, clk.ElapsedMilliseconds);
                        Logging.Debug特("bibtex_search_server_manager: Download {0} took {1} ms", url, clk.ElapsedMilliseconds);

                        string json = Encoding.UTF8.GetString(ms.ToArray());
                        return(json);
                    }
                }
                catch (Exception ex)
                {
                    // Tell the server manager about the failure, then fall through to `return null`.
                    bibtex_search_server_manager.ReportError(url_server);
                    Logging.Warn(ex, "There was a problem searching for BibTeX for title '{0}' at server '{1}'.", title, url_server);
                }
            }

            catch (Exception ex)
            {
                // Anything that went wrong before a server was contacted (encoding, hashing, picking a server).
                Logging.Warn(ex, "There was a problem searching for BibTeX for title '{0}'.", title);
            }

            return(null);
        }
Esempio n. 7
0
        /// <summary>
        /// Command-line entry point: given exactly one argument, prints the fingerprint of the file at that path;
        /// with any other number of arguments, prints the usage text.
        /// </summary>
        /// <param name="args">Command-line arguments; expected to hold a single file path.</param>
        static void Main(string[] args)
        {
            // Anything but exactly one argument gets the usage text.
            if (args.Length != 1)
            {
                Console.WriteLine(@"
QiqqaHasher <filepath>

Calculates the Qiqqa compatible fingerprint hash for the given file.
");
                return;
            }

            string path_to_hash = args[0];
            Console.WriteLine(StreamFingerprint.FromFile(path_to_hash));
        }
        /// <summary>
        /// Asks a BibTeX search server for entries matching the given title and returns the raw response text.
        /// </summary>
        /// <remarks>
        /// The title is sent Base64-encoded in the <c>qe</c> query parameter, together with an <c>auth</c>
        /// value derived from the title. The download goes through the configured proxy; latency and failures
        /// are reported to the server manager.
        /// </remarks>
        /// <param name="title">Title to search for.</param>
        /// <returns>The response body decoded as UTF-8 (handled as JSON by this method), or null when the request or its preparation failed.</returns>
        private static string DoSearch(string title)
        {
            try
            {
                string title_encoded = Convert.ToBase64String(Encoding.UTF8.GetBytes(title));

                // The auth value is the fingerprint of the title wrapped in its own first character.
                string auth = title;
                if (0 < auth.Length)
                {
                    auth = auth[0] + auth + auth[0];
                }
                auth = StreamFingerprint.FromText(auth);

                string url_server = bibtex_search_server_manager.GetServerUrl();

                // Base64 text may contain '+', '/' and '='. HTML-encoding leaves all three untouched, and a bare '+'
                // in a query string is decoded as a space by the receiver, so the value must be URL-encoded instead.
                string url = String.Format("{0}/search?auth={1}&qe={2}", url_server, auth, WebUtility.UrlEncode(title_encoded));
                try
                {
                    MemoryStream        ms;
                    WebHeaderCollection header_collection;
                    DateTime            START = DateTime.UtcNow;
                    UrlDownloader.DownloadWithBlocking(ConfigurationManager.Instance.Proxy, url, out ms, out header_collection);
                    DateTime STOP = DateTime.UtcNow;

                    // Release the downloaded buffer as soon as the text has been extracted.
                    using (ms)
                    {
                        bibtex_search_server_manager.ReportLatency(url_server, (STOP - START).TotalMilliseconds);

                        string json = Encoding.UTF8.GetString(ms.ToArray());
                        return(json);
                    }
                }

                catch (Exception ex)
                {
                    // Tell the server manager about the failure, then fall through to `return null`.
                    bibtex_search_server_manager.ReportError(url_server);
                    Logging.Warn(ex, "There was a problem searching for BibTeX for title '{0}' at server '{1}'.", title, url_server);
                }
            }

            catch (Exception ex)
            {
                // Anything that went wrong before a server was contacted (encoding, hashing, picking a server).
                Logging.Warn(ex, "There was a problem searching for BibTeX for title '{0}'.", title);
            }

            return(null);
        }
Esempio n. 9
0
        /// <summary>
        /// Adds a document file to this library, or merges the supplied metadata into the document that is already
        /// registered under the same fingerprint.
        /// </summary>
        /// <remarks>
        /// An empty file name, or one ending in ".vanilla_reference", is forwarded to
        /// <c>AddVanillaReferenceDocumentToLibrary</c>. Non-PDF files are converted to a temporary PDF first when
        /// <c>DocumentConversion</c> can handle them; otherwise the file is rejected.
        /// </remarks>
        /// <param name="filename">Path of the file to add.</param>
        /// <param name="original_filename">Not used by the current implementation (only referenced in a disabled line).</param>
        /// <param name="suggested_download_source">Where the file came from; stored as the document's download location.</param>
        /// <param name="bibtex">BibTeX record for the document; may be null or empty.</param>
        /// <param name="tags">Tags to add to the document; may be null.</param>
        /// <param name="comments">Comments for the document; may be null or empty.</param>
        /// <param name="suppressDialogs">When true, an unsupported file type is reported through the status manager instead of a message box.</param>
        /// <param name="suppress_signal_that_docs_have_changed">When true, <c>SignalThatDocumentsHaveChanged</c> is not called.</param>
        /// <returns>The added or updated document; null when the file type is unsupported or the file does not exist.</returns>
        private PDFDocument AddNewDocumentToLibrary(string filename, string original_filename, string suggested_download_source, string bibtex, HashSet <string> tags, string comments, bool suppressDialogs, bool suppress_signal_that_docs_have_changed)
        {
            // Flag that someone is trying to add to the library.  This is used by the background processes to hold off while the library is busy being added to...
            //Utilities.LockPerfTimer l1_clk = Utilities.LockPerfChecker.Start();
            lock (last_pdf_add_time_lock)
            {
                //l1_clk.LockPerfTimerStop();
                last_pdf_add_time = DateTime.UtcNow;
            }

            if (String.IsNullOrEmpty(filename) || filename.EndsWith(".vanilla_reference"))
            {
                return(AddVanillaReferenceDocumentToLibrary(bibtex, tags, comments, suppressDialogs, suppress_signal_that_docs_have_changed));
            }

            bool is_a_document_we_can_cope_with = false;

            // File extensions are identifiers, not linguistic text: compare them ordinally and case-insensitively.
            if (String.Equals(Path.GetExtension(filename), ".pdf", StringComparison.OrdinalIgnoreCase))
            {
                is_a_document_we_can_cope_with = true;
            }
            else
            {
                if (DocumentConversion.CanConvert(filename))
                {
                    string filename_before_conversion = filename;
                    string filename_after_conversion  = TempFile.GenerateTempFilename("pdf");
                    if (DocumentConversion.Convert(filename_before_conversion, filename_after_conversion))
                    {
                        // From here on we work with the converted PDF instead of the original file.
                        is_a_document_we_can_cope_with = true;
                        filename = filename_after_conversion;
                    }
                }
            }

            if (!is_a_document_we_can_cope_with)
            {
                string extension = Path.GetExtension(filename);

                if (!suppressDialogs)
                {
                    MessageBoxes.Info("This document library does not support {0} files.  Free and Premium libraries only support PDF files.  Premium+ libraries can automatically convert DOC and DOCX files to PDF.\n\nYou can convert your DOC files to PDFs using the Conversion Tool available on the Start Page Tools menu.\n\nSkipping {1}.", extension, filename);
                }
                else
                {
                    StatusManager.Instance.UpdateStatus("LibraryDocument", String.Format("This document library does not support {0} files.", extension));
                }
                return(null);
            }

            // If the PDF does not exist, can not clone
            if (!File.Exists(filename))
            {
                Logging.Info("Can not add non-existent file to library, so skipping: {0}", filename);
                return(null);
            }

            string fingerprint = StreamFingerprint.FromFile(filename);

            PDFDocument pdf_document = GetDocumentByFingerprint(fingerprint);

            // Useful in logging for diagnosing if we're adding the same document again
            Logging.Info("Fingerprint: {0} - add to library: {1}", fingerprint, (null == pdf_document));
            if (null != pdf_document)
            {
                // Pdf reportedly exists in database.

                // Store the pdf in our location
                pdf_document.StoreAssociatedPDFInRepository(filename);

                // If the document was previously deleted in metadata, reinstate it
                if (pdf_document.Deleted)
                {
                    Logging.Info("The document {0} was deleted, so reinstating it.", fingerprint);
                    pdf_document.Deleted = false;
                    pdf_document.Bindable.NotifyPropertyChanged(() => pdf_document.Deleted);
                }

                // Try to add some useful information from the download source if the metadata doesn't already have it
                if (!String.IsNullOrEmpty(suggested_download_source) &&
                    (String.IsNullOrEmpty(pdf_document.DownloadLocation)
                     // or when the new source is a URL we also
                     // *upgrade* our source info by taking up the new URL
                     // as we then assume that a new URL is 'better' i.e. more 'fresh'
                     // than any existing URL or local source file path:
                     || suggested_download_source.StartsWith("http://") ||
                     suggested_download_source.StartsWith("https://") ||
                     suggested_download_source.StartsWith("ftp://") ||
                     suggested_download_source.StartsWith("ftps://"))
                    // *and* the old and new source shouldn't be the same:
                    && suggested_download_source != pdf_document.DownloadLocation)
                {
                    Logging.Info("The document in the library had no download location or an older one, so inferring it from download: {0} --> {1}", pdf_document.DownloadLocation ?? "(NULL)", suggested_download_source);
                    pdf_document.DownloadLocation = suggested_download_source;
                    pdf_document.Bindable.NotifyPropertyChanged(() => pdf_document.DownloadLocation);
                }

                // TODO: *merge* the BibTeX!
                if (!String.IsNullOrEmpty(bibtex))
                {
                    pdf_document.BibTex = bibtex;
                    pdf_document.Bindable.NotifyPropertyChanged(() => pdf_document.BibTex);
                }

                // merge = add new tags to existing ones (if any)
                if (tags != null)
                {
                    foreach (string tag in tags)
                    {
                        pdf_document.AddTag(tag); // Notify changes called internally
                    }
                }

                // TODO: merge comments?
                //
                // Append the new comments to the existing ones (if they are not identical).
                if (!String.IsNullOrEmpty(comments))
                {
                    if (pdf_document.Comments != comments)
                    {
                        if (String.IsNullOrEmpty(pdf_document.Comments))
                        {
                            // Nothing to append to: store the new comments as-is rather than
                            // starting the text with a dangling separator.
                            pdf_document.Comments = comments;
                        }
                        else
                        {
                            pdf_document.Comments = pdf_document.Comments + "\n\n---\n\n\n" + comments;
                        }
                        pdf_document.Bindable.NotifyPropertyChanged(() => pdf_document.Comments);
                    }
                }
            }
            else
            {
                // Create a new document
                pdf_document = PDFDocument.CreateFromPDF(this, filename, fingerprint);
                //pdf_document.OriginalFileName = original_filename;
                pdf_document.DownloadLocation = suggested_download_source;
                pdf_document.Bindable.NotifyPropertyChanged(() => pdf_document.DownloadLocation);
                pdf_document.BibTex = bibtex;
                pdf_document.Bindable.NotifyPropertyChanged(() => pdf_document.BibTex);
                if (tags != null)
                {
                    foreach (string tag in tags)
                    {
                        pdf_document.AddTag(tag);
                    }
                }

                pdf_document.Comments = comments;
                pdf_document.Bindable.NotifyPropertyChanged(() => pdf_document.Comments);

                Utilities.LockPerfTimer l2_clk = Utilities.LockPerfChecker.Start();
                lock (pdf_documents_lock)
                {
                    l2_clk.LockPerfTimerStop();
                    // Store in our database - note that we have the lock already
                    pdf_documents[pdf_document.Fingerprint] = pdf_document;
                }

                // Get OCR queued
                pdf_document.PDFRenderer.CauseAllPDFPagesToBeOCRed();
            }

            if (!suppress_signal_that_docs_have_changed)
            {
                SignalThatDocumentsHaveChanged(pdf_document);
            }

            return(pdf_document);
        }
Esempio n. 10
0
        /// <summary>
        /// Removes invalid files, calculates fingerprints, determines if vanilla, and checks if fingerprint already in library.
        /// </summary>
        /// <remarks>
        /// An entry counts as non-vanilla only when it references an existing PDF file for which a non-empty
        /// fingerprint could be computed. Every other entry is kept as a vanilla reference with its
        /// <c>Filename</c> and <c>FileType</c> cleared. Entries whose processing throws are logged and dropped.
        /// </remarks>
        /// <returns>The surviving entries together with the number of vanilla entries among them.</returns>
        protected ParseFileResult CreateFinalResult()
        {
            List <BibTeXEntry> finalEntries = new List <BibTeXEntry>();

            foreach (var entry in Entries)
            {
                try
                {
                    // Assume vanilla to start
                    entry.IsVanilla   = true;
                    entry.Fingerprint = null;

                    if (entry.FileType != null && entry.FileType.Equals("pdf") && !String.IsNullOrEmpty(entry.Filename))
                    {
                        if (!File.Exists(entry.Filename))
                        {
                            // Perhaps it's a relative reference instead (like Qiqqa's export)
                            try
                            {
                                string speculativeRelativeFn = Path.Combine(_exportDirectory, entry.Filename);
                                if (File.Exists(speculativeRelativeFn))
                                {
                                    entry.Filename = speculativeRelativeFn;
                                }
                            }
                            catch (Exception ex)
                            {
                                // Best effort only: a file name that cannot be combined with the export directory
                                // simply leaves the entry without a usable file.
                                Logging.Warn(ex, "Could not resolve file '{0}' relative to the export directory '{1}'.", entry.Filename, _exportDirectory);
                            }
                        }

                        if (File.Exists(entry.Filename))
                        {
                            entry.Fingerprint = StreamFingerprint.FromFile(entry.Filename);

                            if (!String.IsNullOrEmpty(entry.Fingerprint))
                            {
                                // Definitely has a valid file...
                                entry.IsVanilla = false;
                            }
                        }
                    }

                    if (entry.IsVanilla)
                    {
                        // No usable file (not a PDF reference, file not found, or no fingerprint): ensure the file
                        // reference is blanked out so we don't try to import it. This is particularly important
                        // w.r.t. filenames with funny characters, where the import will choke.
                        entry.Filename = entry.FileType = null;
                    }

                    finalEntries.Add(entry);
                }
                catch (Exception ex)
                {
                    Logging.Error(ex);

                    // TODO: log /status manager.
                    // Ignore problems with individual files.
                }
            }

            Entries = finalEntries;

            // Vanilla entries are matched against the library by BibTeX id, file-backed ones by fingerprint.
            foreach (var entry in Entries)
            {
                if (entry.IsVanilla)
                {
                    entry.ExistsInLibrary = ImportLibrary.Xlibrary.DocumentExistsInLibraryWithBibTeX(entry.Id);
                }
                else
                {
                    entry.ExistsInLibrary = ImportLibrary.Xlibrary.DocumentExistsInLibraryWithFingerprint(entry.Fingerprint);
                }
            }

            int vanillaEntriesCount = Entries.Count(x => x.IsVanilla);

            return(new ParseFileResult(Entries, vanillaEntriesCount));
        }
Esempio n. 11
0
        /// <summary>
        /// Filter used while scanning the watched folder: decides whether the given file system entry should be
        /// included in the set of PDF files to import. Also updates the scan statistics and pauses the scan when
        /// the processing clock has run past its limit.
        /// </summary>
        /// <param name="obj">The file system entry to judge.</param>
        /// <returns>
        /// true when the entry is a regular *.pdf file that has not been processed before, is larger than 66 bytes,
        /// could be fingerprinted, is not known to the library and does not duplicate a file already accepted in
        /// this scan; false otherwise.
        /// </returns>
        /// <exception cref="OperationCanceledException">
        /// The application is shutting down, background tasks are disabled, or the library / watch manager is gone.
        /// </exception>
        internal bool DecideIfIncludeDuringDirScan(FileSystemEntryInfo obj)
        {
            bool isRegularFile = !obj.IsDevice && !obj.IsDirectory && !obj.IsMountPoint && /* !obj.IsReparsePoint (hardlink!) && */ !obj.IsOffline && !obj.IsSystem && !obj.IsTemporary;

            string node_kind;
            if (isRegularFile)
            {
                node_kind = "regular File";
            }
            else if (obj.IsDirectory)
            {
                node_kind = "directory";
            }
            else
            {
                node_kind = "node";
            }
            Logging.Debug("FolderWatcher: testing {1} '{0}' for inclusion in the Qiqqa library.", obj.FullPath, node_kind);

            // Abort the entire scan (by throwing) as soon as any of the stop conditions holds.
            if (ShutdownableManager.Instance.IsShuttingDown)
            {
                string reason = "FolderWatcher: Breaking out of inner processing loop due to daemon termination";
                Logging.Info(reason);
                throw new OperationCanceledException(reason);
            }

            if (Qiqqa.Common.Configuration.ConfigurationManager.Instance.ConfigurationRecord.DisableAllBackgroundTasks)
            {
                string reason = "FolderWatcher: Breaking out of inner processing loop due to DisableAllBackgroundTasks";
                Logging.Info(reason);
                throw new OperationCanceledException(reason);
            }

            if (LibraryRef == null || folder_watcher_manager?.TypedTarget == null)
            {
                string reason = "FolderWatcher: Breaking out of inner processing loop due to disposed library and/or watch manager";
                Logging.Info(reason);
                throw new OperationCanceledException(reason);
            }

            global_watch_stats.Inc(0.1);

            // NOTE(review): a millisecond reading is compared against a constant named in seconds, and the
            // sleep below takes a constant named in seconds as well - confirm the units of both constants.
            bool have_we_slept = watch_stats.index_processing_clock.ElapsedMilliseconds > MAX_SECONDS_PER_ITERATION;
            if (have_we_slept)
            {
                watch_stats.daemon.Sleep(SECONDS_TO_RELAX_PER_ITERATION);

                // start measuring the next stretch of work afresh
                watch_stats.index_processing_clock.Restart();
            }

            // Only regular *.pdf files are candidates.
            if (!isRegularFile || obj.Extension.ToLower() != ".pdf")
            {
                return false;
            }

            watch_stats.scanned_file_count++;

            // Already handled earlier in this session?
            if (folder_watcher_manager.TypedTarget.HaveProcessedFile(obj.FullPath))
            {
                Logging.Debug("FolderWatcher is skipping {0} as it has already been processed", obj.FullPath);
                watch_stats.skipped_file_count++;
                return false;
            }

            // After a nap the file may be gone; if so, flag the folder as changed so that a
            // follow-up scan picks up whatever else happened in the meantime.
            if (have_we_slept && !File.Exists(obj.FullPath))
            {
                Logging.Info("FolderWatcher is skipping {0} as it has disappeared while we were sleeping", obj.FullPath);
                FolderContentsHaveChanged = true;
                return false;
            }

            // Zero-length and tiny files cannot be valid PDFs:
            //
            // https://stackoverflow.com/questions/17279712/what-is-the-smallest-possible-valid-pdf
            if (obj.FileSize <= 66)
            {
                Logging.Warn("FolderWatcher is skipping {0} as it is too small to be a valid PDF file @ {1} bytes", obj.FullPath, obj.FileSize);
                return false;
            }

            // Fingerprinting doubles as the "is the file still locked?" probe: both need (costly) file I/O,
            // so a failure here is treated as a locked or otherwise inaccessible file, and the folder is
            // flagged as changed so we come back to it later.
            string fingerprint;
            try
            {
                fingerprint = StreamFingerprint.FromFile(obj.FullPath);
            }
            catch (Exception ex)
            {
                Logging.Error(ex, "Watched folder contains file '{0}' which is locked, so coming back later...", obj.FullPath);
                FolderContentsHaveChanged = true;
                return false;
            }

            // Is the PDF already part of the library?
            PDFDocument known_document = LibraryRef.Xlibrary.GetDocumentByFingerprint(fingerprint);
            if (known_document != null)
            {
                // Remember the path so the next scan can skip it without hashing.
                Logging.Info("FolderWatcher is skipping {0} as it already exists in the library as fingerprint {1}, title: {2}", obj.FullPath, fingerprint, known_document.TitleCombined);
                folder_watcher_manager.TypedTarget.RememberProcessedFile(obj.FullPath);
                watch_stats.skipped_file_count++;
                return false;
            }

            // Has a file with identical content been accepted earlier in this same scan?
            if (watch_stats.file_hashes_added.TryGetValue(fingerprint, out var earlier_path_with_same_hash))
            {
                Logging.Info("FolderWatcher is skipping {0} as it has already been included in the import set as file {1} which has the same fingerprint {2}", obj.FullPath, earlier_path_with_same_hash, fingerprint);
                watch_stats.skipped_file_count++;
                return false;
            }

            watch_stats.file_hashes_added.Add(fingerprint, obj.FullPath);
            watch_stats.files_added_since_last_sleep++;

            return true;
        }
Esempio n. 12
0
        //
        // Summary:
        //     Represents the method that defines a set of criteria and determines whether the
        //     specified object meets those criteria.
        //
        // Parameters:
        //   obj:
        //     The object to compare against the criteria defined within the method represented
        //     by this delegate.
        //
        // Type parameters:
        //   T:
        //     The type of the object to compare.
        //
        // Returns:
        //     true if obj meets the criteria defined within the method represented by this
        //     delegate; otherwise, false.
        /// <summary>
        /// Directory scan filter: decides whether the given file system entry is a PDF file
        /// which should be imported into the library.
        /// </summary>
        /// <remarks>
        /// Side effects: updates the counters and the fingerprint map in <c>watch_stats</c>, may
        /// sleep to throttle the scan, and may set <c>FolderContentsHaveChanged</c> to request a
        /// follow-up scan.
        /// </remarks>
        /// <param name="obj">The file system entry reported by the directory scan.</param>
        /// <returns>
        /// true when the entry is a regular *.pdf file which is large enough, readable, not yet in
        /// the library and not yet part of the current import set; otherwise false.
        /// </returns>
        /// <exception cref="OperationCanceledException">
        /// Thrown to abort the scan when the application is shutting down, when background tasks
        /// are disabled, or when the library and/or watch manager are no longer available.
        /// </exception>
        internal bool DecideIfIncludeDuringDirScan(FileSystemEntryInfo obj)
        {
            bool isRegularFile = !(obj.IsDevice || obj.IsDirectory || obj.IsMountPoint || /* obj.IsReparsePoint (hardlink!) || */ obj.IsOffline || obj.IsSystem || obj.IsTemporary);

            Logging.Debug("FolderWatcher: testing {1} '{0}' for inclusion in the Qiqqa library.", obj.FullPath, isRegularFile ? "regular File" : obj.IsDirectory ? "directory" : "node");

            if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
            {
                Logging.Info("FolderWatcher: Breaking out of inner processing loop due to daemon termination");
                throw new OperationCanceledException("FolderWatcher: Breaking out of inner processing loop due to daemon termination");
            }

            if (Qiqqa.Common.Configuration.ConfigurationManager.Instance.ConfigurationRecord.DisableAllBackgroundTasks)
            {
                Logging.Info("FolderWatcher: Breaking out of inner processing loop due to DisableAllBackgroundTasks");
                throw new OperationCanceledException("FolderWatcher: Breaking out of inner processing loop due to DisableAllBackgroundTasks");
            }

            // Fetch the watch manager exactly once: it is null-checked here, and re-reading
            // `TypedTarget` further down (possibly after a long nap) could yield null by then.
            var watcher_manager = folder_watcher_manager?.TypedTarget;

            if (library?.TypedTarget == null || watcher_manager == null)
            {
                Logging.Info("FolderWatcher: Breaking out of inner processing loop due to disposed library and/or watch manager");
                throw new OperationCanceledException("FolderWatcher: Breaking out of inner processing loop due to disposed library and/or watch manager");
            }

            bool have_we_slept = false;

            // NOTE: despite its name, MAX_SECONDS_PER_ITERATION is compared against milliseconds here.
            if (watch_stats.index_processing_clock.ElapsedMilliseconds > MAX_SECONDS_PER_ITERATION)
            {
                Logging.Info("FolderWatcher: Taking a nap due to MAX_SECONDS_PER_ITERATION: {0} seconds consumed, {1} threads pending", watch_stats.index_processing_clock.ElapsedMilliseconds / 1E3, SafeThreadPool.QueuedThreadCount);

                // Collect various 'pending' counts to help produce a stretched sleep/delay period
                // in order to allow the other background tasks to keep up with the PDF series being
                // fed into them by this task.
                int thr_cnt    = Math.Max(0, SafeThreadPool.QueuedThreadCount - 2);
                int queued_cnt = Qiqqa.Documents.Common.DocumentQueuedStorer.Instance.PendingQueueCount;
                Qiqqa.Documents.PDF.PDFRendering.PDFTextExtractor.Instance.GetJobCounts(out var textify_count, out var ocr_count);

                int duration = 1 * 1000 + thr_cnt * 250 + queued_cnt * 20 + textify_count * 50 + ocr_count * 500;

                // Never nap for more than a minute.
                watch_stats.daemon.Sleep(Math.Min(60 * 1000, duration));

                // Relinquish control to the UI thread to make sure responsiveness remains tolerable at 100% CPU load.
                WPFDoEvents.WaitForUIThreadActivityDone();

                // reset:
                watch_stats.index_processing_clock.Restart();

                have_we_slept = true;
            }

            // Only include regular *.pdf files. The extension is a file system token, not linguistic
            // text, hence the (null-safe) ordinal, case-insensitive comparison.
            if (!isRegularFile || !String.Equals(obj.Extension, ".pdf", StringComparison.OrdinalIgnoreCase))
            {
                return false;
            }

            // check if the given file isn't already present in the library:

            watch_stats.scanned_file_count++;

            // If we already have this file in the "cache since we started", skip it
            if (watcher_manager.HaveProcessedFile(obj.FullPath))
            {
                Logging.Debug("FolderWatcher is skipping {0} as it has already been processed", obj.FullPath);
                watch_stats.skipped_file_count++;
                return false;
            }

            // As we may have slept a while, it's quite unsure whether that file still exists.
            // Include it only when it still exists and otherwise be sure to retrigger a scan to follow up
            // any other directory changes.
            if (have_we_slept && !File.Exists(obj.FullPath))
            {
                Logging.Info("FolderWatcher is skipping {0} as it has disappeared while we were sleeping", obj.FullPath);
                FolderContentsHaveChanged = true;
                return false;
            }

            // ignore zero-length and tiny sized files as those sure are buggy/illegal PDFs:
            //
            // https://stackoverflow.com/questions/17279712/what-is-the-smallest-possible-valid-pdf
            if (obj.FileSize <= 66)
            {
                Logging.Warn("FolderWatcher is skipping {0} as it is too small to be a valid PDF file @ {1} bytes", obj.FullPath, obj.FileSize);
                return false;
            }

            // Check that the file is not still locked - if it is, mark that the folder is still "changed" and come back later.
            //
            // We do this at the same time as calculating the file fingerprint as both actions require (costly) File I/O
            // and can be folded together: if the fingerprint fails, that's 99.9% sure a failure in the File I/O, hence
            // a locked or otherwise inaccessible file.
            string fingerprint;
            try
            {
                fingerprint = StreamFingerprint.FromFile(obj.FullPath);
            }
            catch (Exception ex)
            {
                Logging.Error(ex, "Watched folder contains file '{0}' which is locked, so coming back later...", obj.FullPath);
                FolderContentsHaveChanged = true;
                return false;
            }

            // check if the PDF is already known:
            PDFDocument doc = Library.GetDocumentByFingerprint(fingerprint);

            if (doc != null)
            {
                // Add this file to the list of processed files so the next scan can skip it cheaply.
                Logging.Info("FolderWatcher is skipping {0} as it already exists in the library as fingerprint {1}, title: {2}", obj.FullPath, fingerprint, doc.TitleCombined);
                watcher_manager.RememberProcessedFile(obj.FullPath);
                watch_stats.skipped_file_count++;
                return false;
            }

            // The same content may live in the watched folder under several names: import it only once.
            if (watch_stats.file_hashes_added.TryGetValue(fingerprint, out var dupe_file_path))
            {
                Logging.Info("FolderWatcher is skipping {0} as it has already been included in the import set as file {1} which has the same fingerprint {2}", obj.FullPath, dupe_file_path, fingerprint);
                watch_stats.skipped_file_count++;
                return false;
            }

            watch_stats.file_hashes_added.Add(fingerprint, obj.FullPath);

            return true;
        }
        /// <summary>
        /// Adds the given file to the library, or merges the supplied metadata into the existing
        /// library entry when a document with the same fingerprint is already known.
        /// Only call this if you have LOCKed the pdf_documents object
        /// </summary>
        /// <param name="filename">
        /// Path of the file to add. When empty or ending in ".vanilla_reference", a vanilla reference
        /// is added instead. Non-PDF files are converted to a temporary PDF when
        /// <c>DocumentConversion</c> reports it can convert them.
        /// </param>
        /// <param name="suggested_download_source">Download location to record for the document; only applied to an existing document when it has none yet.</param>
        /// <param name="bibtex">BibTeX record to store; an empty value leaves the BibTeX of an existing document untouched.</param>
        /// <param name="tags">Tags to add to the document; may be null.</param>
        /// <param name="comments">Comments to store; for an existing document these are appended to its current comments when they differ.</param>
        /// <param name="suppressDialogs">When true, an unsupported file type is reported via the status manager instead of a message box.</param>
        /// <param name="suppress_signal_that_docs_have_changed">When true, <c>SignalThatDocumentsHaveChanged</c> is not invoked.</param>
        /// <returns>The added or updated document, or null when the file type is unsupported or the file does not exist.</returns>
        private PDFDocument AddNewDocumentToLibrary_LOCK(string filename, string suggested_download_source, string bibtex, List <string> tags, string comments, bool suppressDialogs, bool suppress_signal_that_docs_have_changed)
        {
            // Flag that someone is trying to add to the library.  This is used by the background processes to hold off while the library is busy being added to...
            last_pdf_add_time = DateTime.UtcNow;

            // File names are not linguistic text: compare them ordinally rather than culture-sensitively.
            if (String.IsNullOrEmpty(filename) || filename.EndsWith(".vanilla_reference", StringComparison.Ordinal))
            {
                return(AddVanillaReferenceDocumentToLibrary(bibtex, tags, comments, suppressDialogs, suppress_signal_that_docs_have_changed));
            }

            bool is_a_document_we_can_cope_with = false;

            if (String.Equals(Path.GetExtension(filename), ".pdf", StringComparison.OrdinalIgnoreCase))
            {
                is_a_document_we_can_cope_with = true;
            }
            else if (DocumentConversion.CanConvert(filename))
            {
                // NOTE(review): the temporary PDF produced here is not deleted anywhere in this method -
                // confirm whether StoreAssociatedPDFInRepository() or TempFile takes care of the cleanup.
                string filename_before_conversion = filename;
                string filename_after_conversion  = TempFile.GenerateTempFilename("pdf");
                if (DocumentConversion.Convert(filename_before_conversion, filename_after_conversion))
                {
                    is_a_document_we_can_cope_with = true;
                    filename = filename_after_conversion;
                }
            }

            if (!is_a_document_we_can_cope_with)
            {
                string extension = Path.GetExtension(filename);

                if (!suppressDialogs)
                {
                    MessageBoxes.Info("This document library does not support {0} files.  Free and Premium libraries only support PDF files.  Premium+ libraries can automatically convert DOC and DOCX files to PDF.\n\nYou can convert your DOC files to PDFs using the Conversion Tool available on the Start Page Tools menu.\n\nSkipping {1}.", extension, filename);
                }
                else
                {
                    StatusManager.Instance.UpdateStatus("LibraryDocument", String.Format("This document library does not support {0} files.", extension));
                }
                return(null);
            }

            // If the PDF does not exist, can not clone
            if (!File.Exists(filename))
            {
                Logging.Info("Can not add non-existent file to library, so skipping: " + filename);
                return(null);
            }

            string fingerprint = StreamFingerprint.FromFile(filename);

            // Useful in logging for diagnosing if we're adding the same document again
            Logging.Info("Fingerprint: " + fingerprint);

            PDFDocument pdf_document;

            if (pdf_documents.TryGetValue(fingerprint, out pdf_document))
            {
                // Pdf reportedly exists in database.

                // Store the pdf in our location
                pdf_document.StoreAssociatedPDFInRepository(filename);

                // If the document was previously deleted in metadata, reinstate it
                if (pdf_document.Deleted)
                {
                    Logging.Info("The document was deleted, so reinstating it.");
                    pdf_document.Deleted = false;
                    pdf_document.Bindable.NotifyPropertyChanged(() => pdf_document.Deleted);
                }

                // Try to add some useful information from the download source if the metadata doesn't already have it
                if (String.IsNullOrEmpty(pdf_document.DownloadLocation) && !String.IsNullOrEmpty(suggested_download_source))
                {
                    Logging.Info("The document in the library had no download location, so inferring it from download.");
                    pdf_document.DownloadLocation = suggested_download_source;
                    pdf_document.Bindable.NotifyPropertyChanged(() => pdf_document.DownloadLocation);
                }

                if (!String.IsNullOrEmpty(bibtex))
                {
                    pdf_document.BibTex = bibtex;
                    pdf_document.Bindable.NotifyPropertyChanged(() => pdf_document.BibTex);
                }

                if (tags != null)
                {
                    tags.ForEach(x => pdf_document.AddTag(x)); //Notify changes called internally
                }

                // If we already have comments, then append the new ones to our existing comments (if they
                // are not identical). When there are no existing comments yet, store the new comments
                // as-is instead of prefixing them with a dangling separator.
                if (!String.IsNullOrEmpty(comments) && pdf_document.Comments != comments)
                {
                    pdf_document.Comments = String.IsNullOrEmpty(pdf_document.Comments)
                        ? comments
                        : pdf_document.Comments + "\n\n---\n\n\n" + comments;
                    pdf_document.Bindable.NotifyPropertyChanged(() => pdf_document.Comments);
                }
            }
            else
            {
                // Create a new document
                pdf_document = PDFDocument.CreateFromPDF(this, filename, fingerprint);
                pdf_document.DownloadLocation = suggested_download_source;
                pdf_document.Bindable.NotifyPropertyChanged(() => pdf_document.DownloadLocation);
                pdf_document.BibTex = bibtex;
                pdf_document.Bindable.NotifyPropertyChanged(() => pdf_document.BibTex);
                if (tags != null)
                {
                    tags.ForEach(x => pdf_document.AddTag(x));
                }

                pdf_document.Comments = comments;
                pdf_document.Bindable.NotifyPropertyChanged(() => pdf_document.Comments);

                // Store in our database - note that we have the lock already
                pdf_documents[pdf_document.Fingerprint] = pdf_document;

                // Get OCR queued
                pdf_document.PDFRenderer.CauseAllPDFPagesToBeOCRed();
            }

            if (!suppress_signal_that_docs_have_changed)
            {
                SignalThatDocumentsHaveChanged(pdf_document);
            }

            return(pdf_document);
        }
Esempio n. 14
0
        /// <summary>
        /// The daemon code calls this occasionally to poke it into action to do work:
        /// scans the configured watch folder for *.pdf files and imports the ones which are
        /// not yet in the library, in small batches.
        /// </summary>
        /// <remarks>
        /// Does nothing while the library is not loaded, when no (existing) folder is configured,
        /// or when the folder contents have not been flagged as changed since the last run.
        /// Whenever a scan round is cut short, <c>FolderContentsHaveChanged</c> is set again so
        /// the remaining files are picked up by a later round.
        /// </remarks>
        /// <param name="daemon">The daemon driving this task; used to sleep between work batches.</param>
        public void TaskDaemonEntryPoint(Daemon daemon)
        {
            // We don't want to start watching files until the library is loaded...
            if (!library.LibraryIsLoaded)
            {
                Logging.Info("Library is not yet loaded, so waiting before watching...");

                // Make sure we come back here once the library has been loaded...
                FolderContentsHaveChanged = true;
                return;
            }

            // Update our folder system watcher if necessary
            CheckIfFolderNameHasChanged();

            // If the current folder is blank, do nothing
            if (String.IsNullOrEmpty(configured_folder_to_watch))
            {
                return;
            }

            // If the folder does not exist, do nothing
            if (!Directory.Exists(configured_folder_to_watch))
            {
                return;
            }

            // If the folder or its contents has not changed since the last time, do nothing
            if (!FolderContentsHaveChanged)
            {
                return;
            }

            Logging.Debug("FolderWatcher BEGIN");

            // To recover from a fatal library failure and re-indexing attempt for very large libraries,
            // we're better off processing a limited number of source files as we'll be able to see
            // *some* results more quickly and we'll have a working, though yet incomplete,
            // index in *reasonable time*.
            //
            // To reconstruct the entire index will take a *long* time. We grow the index and other meta
            // stores a bunch-of-files at a time and then repeat the entire maintenance process until
            // we'll be sure to have run out of files to process for sure...
            const int MAX_NUMBER_OF_PDF_FILES_TO_PROCESS = 5;
            // NOTE: despite its name this limit is expressed in milliseconds (5 seconds).
            const int MAX_SECONDS_PER_ITERATION          = 5 * 1000;
            Stopwatch index_processing_clock             = Stopwatch.StartNew();

            // Mark that we are now processing the folder
            while (TestAndReset_FolderContentsHaveChanged())
            {
                // If this library is busy, skip it for now
                if (Library.IsBusyAddingPDFs)
                {
                    Logging.Debug特("FolderWatcher: Not daemon processing any library that is busy with adds...");
                    FolderContentsHaveChanged = true;
                    break;
                }

                if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
                {
                    Logging.Debug特("FolderWatcher: Breaking out of outer processing loop due to daemon termination");
                    FolderContentsHaveChanged = true;
                    break;
                }

                if (Qiqqa.Common.Configuration.ConfigurationManager.Instance.ConfigurationRecord.DisableAllBackgroundTasks)
                {
                    Logging.Debug特("FolderWatcher: Breaking out of outer processing loop due to DisableAllBackgroundTasks");
                    FolderContentsHaveChanged = true;
                    break;
                }

                int processing_file_count = 0;
                int processed_file_count  = 0;
                int scanned_file_count    = 0;
                int skipped_file_count    = 0;

                // If we get this far then there might be some work to do in the folder...
                //
                // NOTE: Directory.EnumerateFiles() is lazily evaluated, so the timing logged below only
                // covers setting up the enumeration, not the actual directory walk.
                Stopwatch clk = Stopwatch.StartNew();
                IEnumerable<string> filenames_in_folder = Directory.EnumerateFiles(configured_folder_to_watch, "*.pdf", SearchOption.AllDirectories);
                Logging.Debug特("Directory.EnumerateFiles took {0} ms", clk.ElapsedMilliseconds);

                List<PDFDocument> pdf_documents_already_in_library = library.PDFDocuments;

                // Collect the lookup keys of the documents already in the library once per scan round,
                // so each candidate file costs a single hash lookup instead of a walk across the entire library.
                //
                // do NOT depend on the file staying the same; external activities may have replaced the PDF with another one!
                //
                // Hence we SHOULD check using file FINGERPRINT, even though that's a costly operation:
                HashSet<string> known_document_keys = new HashSet<string>();
                foreach (PDFDocument pdf_document in pdf_documents_already_in_library)
                {
#if OLD
                    known_document_keys.Add(pdf_document.DownloadLocation);
#else
                    known_document_keys.Add(pdf_document.Fingerprint);
#endif
                }

                List<string> filenames_that_are_new = new List<string>();
                foreach (string filename in filenames_in_folder)
                {
                    if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
                    {
                        Logging.Info("FolderWatcher: Breaking out of inner processing loop due to daemon termination");
                        // We have not inspected the remaining files yet: make sure a later round picks them up.
                        FolderContentsHaveChanged = true;
                        break;
                    }

                    if (Qiqqa.Common.Configuration.ConfigurationManager.Instance.ConfigurationRecord.DisableAllBackgroundTasks)
                    {
                        Logging.Info("FolderWatcher: Breaking out of inner processing loop due to DisableAllBackgroundTasks");
                        // We have not inspected the remaining files yet: make sure a later round picks them up.
                        FolderContentsHaveChanged = true;
                        break;
                    }

                    scanned_file_count++;

                    if (index_processing_clock.ElapsedMilliseconds > MAX_SECONDS_PER_ITERATION)
                    {
                        Logging.Info("FolderWatcher: Taking a nap due to MAX_SECONDS_PER_ITERATION: {0} seconds consumed, {1} threads pending", index_processing_clock.ElapsedMilliseconds / 1E3, SafeThreadPool.QueuedThreadCount);

                        // Collect various 'pending' counts to help produce a stretched sleep/delay period
                        // in order to allow the other background tasks to keep up with the PDF series being
                        // fed into them by this task.
                        int thr_cnt       = Math.Max(0, SafeThreadPool.QueuedThreadCount - 2);
                        int queued_cnt    = Qiqqa.Documents.Common.DocumentQueuedStorer.Instance.PendingQueueCount;
                        int textify_count = 0;
                        int ocr_count     = 0;
                        Qiqqa.Documents.PDF.PDFRendering.PDFTextExtractor.Instance.GetJobCounts(out textify_count, out ocr_count);

                        int duration = 1 * 1000 + thr_cnt * 250 + queued_cnt * 20 + textify_count * 50 + ocr_count * 500;

                        // Never nap for more than a minute.
                        daemon.Sleep(Math.Min(60 * 1000, duration));
                        // As we have slept a while, it's quite unsure whether that file still exists. Skip it and
                        // let the next round find it later on.
                        FolderContentsHaveChanged = true;
                        // reset:
                        index_processing_clock.Restart();
                        continue;
                    }

                    // If we already have this file in the "cache since we started", skip it
                    if (folder_watcher_manager.HaveProcessedFile(filename))
                    {
                        Logging.Debug特("FolderWatcher is skipping {0} as it has already been processed", filename);
                        skipped_file_count++;
                        continue;
                    }

                    // Check that the file is not still locked - if it is, mark that the folder is still "changed" and come back later.
                    //
                    // We do this at the same time as calculating the file fingerprint as both actions require (costly) File I/O
                    // and can be folded together: if the fingerprint fails, that's 99.9% sure a failure in the File I/O, hence
                    // a locked or otherwise inaccessible file.
                    string fingerprint;
                    try
                    {
                        fingerprint = StreamFingerprint.FromFile(filename);
                    }
                    catch (Exception ex)
                    {
                        Logging.Error(ex, "Watched folder contains file '{0}' which is locked, so coming back later...", filename);
                        FolderContentsHaveChanged = true;
                        continue;
                    }

                    // If this file is already known to the library, skip it
#if OLD
                    bool is_already_in_library = known_document_keys.Contains(filename);
#else
                    bool is_already_in_library = known_document_keys.Contains(fingerprint);
#endif

                    if (is_already_in_library)
                    {
                        // Add this file to the list of processed files...
                        folder_watcher_manager.RememberProcessedFile(filename);
                        skipped_file_count++;
                        continue;
                    }

                    // Do NOT count files which are already present in our library/DB,
                    // despite the fact that those also *do* take time and effort to check
                    // in the code above.
                    //
                    // The issue here is that when we would import files A,B,C,D,E,F,G,H,I,J,K,
                    // we would do so in tiny batches, resulting in a rescan after each batch
                    // where the already processed files will be included in the set, but must
                    // be filtered out as 'already in there' in the code above.
                    // Iff we had counted *all* files we inspect from the Watch Directory,
                    // we would never make it past the first batch as then our count limit
                    // would trigger already for every round through here!
                    //
                    processing_file_count++;

                    Logging.Info("FolderWatcher is importing {0}", filename);
                    filenames_that_are_new.Add(filename);

                    if (processing_file_count >= MAX_NUMBER_OF_PDF_FILES_TO_PROCESS + processed_file_count)
                    {
                        Logging.Info("FolderWatcher: {0} of {1} files have been processed/inspected (total {2} scanned, {3} skipped, {4} ignored)", processed_file_count, processing_file_count, scanned_file_count, skipped_file_count, scanned_file_count - skipped_file_count - processing_file_count);
                        // process one little batch, before we add any more:
                        ProcessTheNewDocuments(filenames_that_are_new);

                        // reset
                        filenames_that_are_new.Clear();

                        processed_file_count = processing_file_count;

                        // Relinquish control to the UI thread to make sure responsiveness remains tolerable at 100% CPU load.
                        Utilities.GUI.WPFDoEvents.WaitForUIThreadActivityDone();
                    }
                }

                Logging.Info("FolderWatcher: {0} of {1} files have been processed/inspected (total {2} scanned, {3} skipped, {4} ignored)", processed_file_count, processing_file_count, scanned_file_count, skipped_file_count, scanned_file_count - skipped_file_count - processing_file_count);
                // process the remainder: a last little batch:
                ProcessTheNewDocuments(filenames_that_are_new);

                Logging.Debug("FolderWatcher End-Of-Round");

                daemon.Sleep(3 * 1000);
            }

            Logging.Debug("FolderWatcher END");
        }