/// <summary>
        /// Periodic entry point invoked by the daemon: scans the watched folder (recursively) for
        /// PDF files that have not been handled before and hands them to the library importer.
        /// </summary>
        /// <param name="daemon">The daemon driving this task; not used by this method.</param>
        public void TaskDaemonEntryPoint(Daemon daemon)
        {
            // We don't want to start watching files until the library is loaded...
            if (!library.LibraryIsLoaded)
            {
                Logging.Info("Library is not yet loaded, so waiting before watching...");

                // Keep the folder flagged as "changed" so that it is scanned on a later call,
                // once the library has finished loading.
                folder_contents_has_changed = true;
                return;
            }

            // Update our file system watcher if necessary
            CheckIfFolderNameHasChanged();

            // If no folder is configured, or the configured folder does not exist, do nothing
            if (String.IsNullOrEmpty(folder_to_watch) || !Directory.Exists(folder_to_watch))
            {
                return;
            }

            // If the folder or its contents has not changed since the last time, do nothing
            if (!folder_contents_has_changed)
            {
                return;
            }

            // Mark that we are now processing the folder
            folder_contents_has_changed = false;

            // If we get this far then there might be some work to do in the folder...
            //
            // Scan the very same folder that was validated above, rather than a separately
            // tracked copy of its name which may not (yet) match.
            string[] filenames_in_folder = Directory.GetFiles(folder_to_watch, "*.pdf", SearchOption.AllDirectories);

            // Collect the download locations already known to the library once, so that each file
            // costs a single hash lookup instead of a linear scan over the entire library.
            HashSet<string> download_locations_in_library = new HashSet<string>();
            foreach (PDFDocument pdf_document in library.PDFDocuments)
            {
                download_locations_in_library.Add(pdf_document.DownloadLocation);
            }

            // The import records for the files which turn out to be new
            List<ImportingIntoLibrary.FilenameWithMetadataImport> filename_with_metadata_imports = new List<ImportingIntoLibrary.FilenameWithMetadataImport>();

            foreach (string filename in filenames_in_folder)
            {
                // If we already have this file in the "cache since we started", skip it
                if (folder_watcher_manager.HaveProcessedFile(filename))
                {
                    continue;
                }

                // If we already have this file in the "pdf file locations", remember it and skip it
                if (download_locations_in_library.Contains(filename))
                {
                    folder_watcher_manager.RememberProcessedFile(filename);
                    continue;
                }

                // Check that the file is not still locked - if it is, mark that the folder is still "changed" and come back later.
                // The file is deliberately NOT remembered as processed, so it is retried on the next scan.
                if (IsFileLocked(filename))
                {
                    Logging.Info("Watched folder contains file '{0}' which is locked, so coming back later...", filename);
                    folder_contents_has_changed = true;
                    continue;
                }

                Logging.Info("FolderWatcher is importing {0}", filename);

                // Every import record gets its own copy of the tag list
                filename_with_metadata_imports.Add(new ImportingIntoLibrary.FilenameWithMetadataImport
                {
                    filename = filename,
                    tags = new List<string>(this.tags)
                });

                // Add this file to the list of processed files...
                folder_watcher_manager.RememberProcessedFile(filename);
            }

            // Get the library to import all these new files
            ImportingIntoLibrary.AddNewPDFDocumentsToLibraryWithMetadata_ASYNCHRONOUS(library, true, true, filename_with_metadata_imports.ToArray());
        }
Example #2
0
        /// <summary>
        /// Periodic entry point invoked by the daemon: scans the configured watch folder (recursively)
        /// for PDF files whose fingerprint is not yet present in the library and imports them in
        /// small batches, napping in between to give other background tasks room to keep up.
        /// </summary>
        /// <param name="daemon">The daemon driving this task; used here to sleep between processing rounds.</param>
        public void TaskDaemonEntryPoint(Daemon daemon)
        {
            // We don't want to start watching files until the library is loaded...
            if (!library.LibraryIsLoaded)
            {
                Logging.Info("Library is not yet loaded, so waiting before watching...");

                // Keep the folder flagged as "changed" so that it is scanned on a later call,
                // once the library has finished loading.
                FolderContentsHaveChanged = true;
                return;
            }

            // Update our folder system watcher if necessary
            CheckIfFolderNameHasChanged();

            // If the current folder is blank, do nothing
            if (String.IsNullOrEmpty(configured_folder_to_watch))
            {
                return;
            }

            // If the folder does not exist, do nothing
            if (!Directory.Exists(configured_folder_to_watch))
            {
                return;
            }

            // If the folder or its contents has not changed since the last time, do nothing
            if (!FolderContentsHaveChanged)
            {
                return;
            }

            Logging.Debug("FolderWatcher BEGIN");

            // To recover from a fatal library failure and re-indexing attempt for very large libraries,
            // we're better off processing a limited number of source files as we'll be able to see
            // *some* results more quickly and we'll have a working, though yet incomplete,
            // index in *reasonable time*.
            //
            // To reconstruct the entire index will take a *long* time. We grow the index and other meta
            // stores a bunch-of-files at a time and then repeat the entire maintenance process until
            // we'll be sure to have run out of files to process for sure...
            const int MAX_NUMBER_OF_PDF_FILES_TO_PROCESS = 5;
            // NOTE: despite its name, this limit is expressed in MILLISECONDS (5 seconds); it is compared
            // against Stopwatch.ElapsedMilliseconds below.
            const int MAX_SECONDS_PER_ITERATION          = 5 * 1000;
            Stopwatch index_processing_clock             = Stopwatch.StartNew();

            // Mark that we are now processing the folder.
            // NOTE(review): judging by its name, TestAndReset_FolderContentsHaveChanged() reads and clears
            // the "changed" flag - confirm against its implementation. Every early exit below which leaves
            // work undone therefore raises the flag again so that a later call/round picks the work up.
            while (TestAndReset_FolderContentsHaveChanged())
            {
                // If this library is busy, skip it for now
                if (Library.IsBusyAddingPDFs)
                {
                    Logging.Debug特("FolderWatcher: Not daemon processing any library that is busy with adds...");
                    FolderContentsHaveChanged = true;
                    break;
                }

                if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
                {
                    Logging.Debug特("FolderWatcher: Breaking out of outer processing loop due to daemon termination");
                    FolderContentsHaveChanged = true;
                    break;
                }

                if (Qiqqa.Common.Configuration.ConfigurationManager.Instance.ConfigurationRecord.DisableAllBackgroundTasks)
                {
                    Logging.Debug特("FolderWatcher: Breaking out of outer processing loop due to DisableAllBackgroundTasks");
                    FolderContentsHaveChanged = true;
                    break;
                }

                int processing_file_count = 0;  // files found to be new so far in this round
                int processed_file_count  = 0;  // new files which have already been handed off in a batch
                int scanned_file_count    = 0;  // all files looked at in this round
                int skipped_file_count    = 0;  // files which had been handled before or are already in the library

                // If we get this far then there might be some work to do in the folder...
                //
                // Directory.EnumerateFiles() is lazy: the directory tree is walked while the foreach loop
                // below iterates, hence the time logged here only covers creating the enumerator.
                Stopwatch clk = new Stopwatch();
                clk.Start();
                IEnumerable<string> filenames_in_folder = Directory.EnumerateFiles(configured_folder_to_watch, "*.pdf", SearchOption.AllDirectories);
                Logging.Debug特("Directory.EnumerateFiles took {0} ms", clk.ElapsedMilliseconds);

                List<PDFDocument> pdf_documents_already_in_library = library.PDFDocuments;

                List<string> filenames_that_are_new = new List<string>();
                foreach (string filename in filenames_in_folder)
                {
                    if (Utilities.Shutdownable.ShutdownableManager.Instance.IsShuttingDown)
                    {
                        Logging.Info("FolderWatcher: Breaking out of inner processing loop due to daemon termination");
                        // The scan is incomplete: keep the folder flagged as "changed", just like the outer
                        // loop does, so the files we did not get to are not forgotten.
                        FolderContentsHaveChanged = true;
                        break;
                    }

                    if (Qiqqa.Common.Configuration.ConfigurationManager.Instance.ConfigurationRecord.DisableAllBackgroundTasks)
                    {
                        Logging.Info("FolderWatcher: Breaking out of inner processing loop due to DisableAllBackgroundTasks");
                        // The scan is incomplete: keep the folder flagged as "changed" so that the remaining
                        // files are picked up once background tasks are enabled again.
                        FolderContentsHaveChanged = true;
                        break;
                    }

                    scanned_file_count++;

                    if (index_processing_clock.ElapsedMilliseconds > MAX_SECONDS_PER_ITERATION)
                    {
                        Logging.Info("FolderWatcher: Taking a nap due to MAX_SECONDS_PER_ITERATION: {0} seconds consumed, {1} threads pending", index_processing_clock.ElapsedMilliseconds / 1E3, SafeThreadPool.QueuedThreadCount);

                        // Collect various 'pending' counts to help produce a stretched sleep/delay period
                        // in order to allow the other background tasks to keep up with the PDF series being
                        // fed into them by this task.
                        int thr_cnt       = Math.Max(0, SafeThreadPool.QueuedThreadCount - 2);
                        int queued_cnt    = Qiqqa.Documents.Common.DocumentQueuedStorer.Instance.PendingQueueCount;
                        int textify_count = 0;
                        int ocr_count     = 0;
                        Qiqqa.Documents.PDF.PDFRendering.PDFTextExtractor.Instance.GetJobCounts(out textify_count, out ocr_count);

                        // Base delay of 1 second plus a weighted penalty per pending item; capped at 60 seconds below.
                        int duration = 1 * 1000 + thr_cnt * 250 + queued_cnt * 20 + textify_count * 50 + ocr_count * 500;

                        daemon.Sleep(Math.Min(60 * 1000, duration));
                        // As we have slept a while, it's quite unsure whether that file still exists. Skip it and
                        // let the next round find it later on.
                        FolderContentsHaveChanged = true;
                        // reset:
                        index_processing_clock.Restart();
                        continue;
                    }

                    // If we already have this file in the "cache since we started", skip it
                    if (folder_watcher_manager.HaveProcessedFile(filename))
                    {
                        Logging.Debug特("FolderWatcher is skipping {0} as it has already been processed", filename);
                        skipped_file_count++;
                        continue;
                    }

                    // Check that the file is not still locked - if it is, mark that the folder is still "changed" and come back later.
                    //
                    // We do this at the same time as calculating the file fingerprint as both actions require (costly) File I/O
                    // and can be folded together: if the fingerprint fails, that's 99.9% sure a failure in the File I/O, hence
                    // a locked or otherwise inaccessible file.
                    string fingerprint;
                    try
                    {
                        fingerprint = StreamFingerprint.FromFile(filename);
                    }
                    catch (Exception ex)
                    {
                        Logging.Error(ex, "Watched folder contains file '{0}' which is locked, so coming back later...", filename);
                        FolderContentsHaveChanged = true;
                        continue;
                    }

                    // If we already have this file in the library, skip it
                    bool is_already_in_library = false;
                    foreach (PDFDocument pdf_document in pdf_documents_already_in_library)
                    {
                        // do NOT depend on the file staying the same; external activities may have replaced the PDF with another one!
                        //
                        // Hence we SHOULD check using file FINGERPRINT, even though that's a costly operation:
#if OLD
                        if (pdf_document.DownloadLocation == filename)
                        {
                            is_already_in_library = true;
                            break;
                        }
#else
                        if (pdf_document.Fingerprint == fingerprint)
                        {
                            is_already_in_library = true;
                            break;
                        }
#endif
                    }

                    if (is_already_in_library)
                    {
                        // Add this file to the list of processed files...
                        folder_watcher_manager.RememberProcessedFile(filename);
                        skipped_file_count++;
                        continue;
                    }

                    // Do NOT count files which are already present in our library/DB,
                    // despite the fact that those also *do* take time and effort to check
                    // in the code above.
                    //
                    // The issue here is that when we would import files A,B,C,D,E,F,G,H,I,J,K,
                    // we would do so in tiny batches, resulting in a rescan after each batch
                    // where the already processed files will be included in the set, but must
                    // be filtered out as 'already in there' in the code above.
                    // Iff we had counted *all* files we inspect from the Watch Directory,
                    // we would never make it past the first batch as then our count limit
                    // would trigger already for every round through here!
                    //
                    processing_file_count++;

                    Logging.Info("FolderWatcher is importing {0}", filename);
                    filenames_that_are_new.Add(filename);

                    // A full batch has been collected since the previous hand-off
                    if (processing_file_count >= MAX_NUMBER_OF_PDF_FILES_TO_PROCESS + processed_file_count)
                    {
                        Logging.Info("FolderWatcher: {0} of {1} files have been processed/inspected (total {2} scanned, {3} skipped, {4} ignored)", processed_file_count, processing_file_count, scanned_file_count, skipped_file_count, scanned_file_count - skipped_file_count - processing_file_count);
                        // process one little batch, before we add any more:
                        ProcessTheNewDocuments(filenames_that_are_new);

                        // reset
                        filenames_that_are_new.Clear();

                        processed_file_count = processing_file_count;

                        // Relinquish control to the UI thread to make sure responsiveness remains tolerable at 100% CPU load.
                        Utilities.GUI.WPFDoEvents.WaitForUIThreadActivityDone();
                    }
                }

                Logging.Info("FolderWatcher: {0} of {1} files have been processed/inspected (total {2} scanned, {3} skipped, {4} ignored)", processed_file_count, processing_file_count, scanned_file_count, skipped_file_count, scanned_file_count - skipped_file_count - processing_file_count);
                // process the remainder: a last little batch:
                ProcessTheNewDocuments(filenames_that_are_new);

                Logging.Debug("FolderWatcher End-Of-Round");

                daemon.Sleep(3 * 1000);
            }

            Logging.Debug("FolderWatcher END");
        }