Beispiel #1
0
        /// <summary>
        /// Builds an <see cref="ExpeditionDataSource"/> for a library: gathers the tag vocabulary
        /// and the documents, counts how often each tag occurs in every document's OCR text, and
        /// then runs the LDA sampler over those counts.
        /// </summary>
        /// <param name="library">Library supplying the tags and the PDF documents that have a local file.</param>
        /// <param name="num_topics">Number of LDA topics; alpha is derived from it as 2.0 / num_topics.</param>
        /// <param name="add_autotags">Forwarded to BuildLibraryTagList.</param>
        /// <param name="add_tags">Forwarded to BuildLibraryTagList.</param>
        /// <param name="progress_update_delegate">
        /// Progress callback. A <c>false</c> return value during scanning or theme building is treated
        /// as a user abort. When null, DefaultExpeditionBuilderProgressUpdate is used.
        /// </param>
        /// <returns>
        /// The data source. It is returned even after an abort, in which case the documents that were
        /// skipped carry an empty tag list and the sampler has not (fully) iterated.
        /// </returns>
        public static ExpeditionDataSource BuildExpeditionDataSource(Library library, int num_topics, bool add_autotags, bool add_tags, ExpeditionBuilderProgressUpdateDelegate progress_update_delegate)
        {
            // Check that we have a progress update delegate
            if (null == progress_update_delegate)
            {
                progress_update_delegate = DefaultExpeditionBuilderProgressUpdate;
            }

            // What are the sources of data?
            progress_update_delegate("Assembling tags", 0);
            HashSet<string> tags = BuildLibraryTagList(library, add_autotags, add_tags);
            List<PDFDocument> pdf_documents = library.PDFDocumentsWithLocalFilePresent;

            // Initialise the datasource
            progress_update_delegate("Initialising datasource", 0);
            ExpeditionDataSource data_source = new ExpeditionDataSource();
            data_source.date_created = DateTime.UtcNow;

            progress_update_delegate("Adding tags", 0);
            data_source.words = new List<string>(tags);

            progress_update_delegate("Adding docs", 0);
            data_source.docs = new List<string>();
            foreach (PDFDocument pdf_document in pdf_documents)
            {
                data_source.docs.Add(pdf_document.Fingerprint);
            }

            progress_update_delegate("Rebuilding indices", 0);
            data_source.RebuildIndices();

            // Now go through each doc and find the tags that match
            int doc_count = data_source.docs.Count;
            data_source.words_in_docs = new int[doc_count][];

            int total_processed = 0;

            // One-way abort flag shared by all workers (0 = keep going, 1 = user aborted).
            // It is only ever set, and only through Interlocked, so a worker can never overwrite
            // another worker's abort with a stale "not aborted" value.
            int abort_requested = 0;

            Parallel.For(0, doc_count, d =>
            {
                int total_processed_local = Interlocked.Increment(ref total_processed);
                if (0 == total_processed_local % 10 && 0 == Interlocked.CompareExchange(ref abort_requested, 0, 0))
                {
                    if (!progress_update_delegate("Scanning documents", total_processed_local / (double)doc_count))
                    {
                        Interlocked.Exchange(ref abort_requested, 1);
                    }
                }

                List<int> tags_in_document = new List<int>();

                if (0 == Interlocked.CompareExchange(ref abort_requested, 0, 0))
                {
                    PDFDocument pdf_document = pdf_documents[d];

                    // Pad with spaces so that a tag at the very start or end of the text still matches " tag ".
                    string full_text = " " + pdf_document.PDFRenderer.GetFullOCRText() + " ";
                    string full_text_lower = full_text.ToLower();

                    for (int t = 0; t < data_source.words.Count; ++t)
                    {
                        string tag = ' ' + data_source.words[t] + ' ';

                        // Tags flagged by HasSomeLowerCase are matched against the lower-cased text;
                        // all other tags are matched against the text as-is.
                        string full_text_to_search = full_text;
                        if (StringTools.HasSomeLowerCase(tag))
                        {
                            full_text_to_search = full_text_lower;
                            tag = tag.ToLower();
                        }

                        // The tag index is recorded once per occurrence.
                        int num_appearances = StringTools.CountStringOccurence(full_text_to_search, tag);
                        for (int i = 0; i < num_appearances; ++i)
                        {
                            tags_in_document.Add(t);
                        }
                    }
                }

                // Always assign (possibly empty) so that no row of words_in_docs is left null after an abort.
                data_source.words_in_docs[d] = tags_in_document.ToArray();
            });

            bool not_aborted_by_user = (0 == abort_requested);

            // Initialise the LDA
            not_aborted_by_user = not_aborted_by_user && progress_update_delegate("Building themes sampler", 0);
            int num_threads = Environment.ProcessorCount;
            double alpha = 2.0 / num_topics;
            double beta = 0.01;

            data_source.lda_sampler = new LDASampler(alpha, beta, num_topics, data_source.words.Count, data_source.docs.Count, data_source.words_in_docs);

            LDASamplerMCSerial lda_sampler_mc = new LDASamplerMCSerial(data_source.lda_sampler, num_threads);

            for (int i = 0; not_aborted_by_user && i < MAX_TOPIC_ITERATIONS; ++i)
            {
                not_aborted_by_user = progress_update_delegate("Building themes", i / (double)MAX_TOPIC_ITERATIONS);
                if (not_aborted_by_user)
                {
                    // Do not start another sampling batch once the user has asked to stop.
                    lda_sampler_mc.MC(10);
                }
            }

            // Results
            if (not_aborted_by_user)
            {
                progress_update_delegate("Built Expedition", 1);
            }
            else
            {
                progress_update_delegate("Cancelled Expedition", 1);
            }

            return data_source;
        }
        /// <summary>
        /// Builds an <see cref="ExpeditionDataSource"/> for a web library: gathers the tag vocabulary
        /// and the documents, counts how often each tag occurs in every document's OCR text, and
        /// then runs the LDA sampler over those counts. Must not be called on the UI thread.
        /// </summary>
        /// <param name="web_library_detail">Library supplying the tags and the PDF documents that have a local file.</param>
        /// <param name="num_topics">Number of LDA topics; alpha is derived from it as 2.0 / num_topics.</param>
        /// <param name="add_autotags">Forwarded to BuildLibraryTagList.</param>
        /// <param name="add_tags">Forwarded to BuildLibraryTagList.</param>
        /// <param name="progress_update_delegate">
        /// Progress callback. A <c>false</c> return value during scanning or theme building cancels
        /// the build. When null, DefaultExpeditionBuilderProgressUpdate is used.
        /// </param>
        /// <returns>The populated data source, or <c>null</c> when the user cancelled the build.</returns>
        public static ExpeditionDataSource BuildExpeditionDataSource(WebLibraryDetail web_library_detail, int num_topics, bool add_autotags, bool add_tags, ExpeditionBuilderProgressUpdateDelegate progress_update_delegate)
        {
            WPFDoEvents.AssertThisCodeIs_NOT_RunningInTheUIThread();

            // Initialise the datasource
            ExpeditionDataSource data_source = new ExpeditionDataSource();
            data_source.date_created = DateTime.UtcNow;

            // Check that we have a progress update delegate
            if (null == progress_update_delegate)
            {
                progress_update_delegate = DefaultExpeditionBuilderProgressUpdate;
            }

            try
            {
                // What are the sources of data?
                progress_update_delegate("Assembling tags");
                HashSet<string> tags = BuildLibraryTagList(web_library_detail, add_autotags, add_tags);
                List<PDFDocument> pdf_documents = web_library_detail.Xlibrary.PDFDocumentsWithLocalFilePresent;

                progress_update_delegate("Adding tags");
                data_source.words = new List<string>(tags);

                progress_update_delegate("Adding docs");
                data_source.docs = new List<string>();
                foreach (PDFDocument pdf_document in pdf_documents)
                {
                    data_source.docs.Add(pdf_document.Fingerprint);
                }

                progress_update_delegate("Rebuilding indices");
                data_source.RebuildIndices();

                // Now go through each doc and find the tags that match
                int DATA_SOURCE_DOCS_COUNT = data_source.docs.Count;
                data_source.words_in_docs = new int[DATA_SOURCE_DOCS_COUNT][];

                // Cancellation is signalled via ParallelLoopState.Stop() instead of throwing from a
                // worker: Parallel.For wraps worker exceptions in an AggregateException, which the
                // TaskCanceledException handler below would not catch.
                ParallelLoopResult scan_result = Parallel.For(0, DATA_SOURCE_DOCS_COUNT, (d, loop_state) =>
                {
                    if (0 == d % 50)
                    {
                        if (!progress_update_delegate("Scanning documents", d, DATA_SOURCE_DOCS_COUNT))
                        {
                            loop_state.Stop();
                            return;
                        }
                    }

                    // Another worker has already cancelled: the result is discarded anyway, so skip the scan.
                    if (loop_state.IsStopped)
                    {
                        return;
                    }

                    List<int> tags_in_document = new List<int>();

                    PDFDocument pdf_document = pdf_documents[d];

                    // Pad with spaces so that a tag at the very start or end of the text still matches " tag ".
                    string full_text = " " + pdf_document.GetFullOCRText() + " ";
                    string full_text_lower = full_text.ToLower();

                    for (int t = 0; t < data_source.words.Count; ++t)
                    {
                        string tag = ' ' + data_source.words[t] + ' ';

                        // Tags flagged by HasSomeLowerCase are matched against the lower-cased text;
                        // all other tags are matched against the text as-is.
                        string full_text_to_search = full_text;
                        if (StringTools.HasSomeLowerCase(tag))
                        {
                            full_text_to_search = full_text_lower;
                            tag = tag.ToLower();
                        }

                        // The tag index is recorded once per occurrence.
                        int num_appearances = StringTools.CountStringOccurence(full_text_to_search, tag);
                        for (int i = 0; i < num_appearances; ++i)
                        {
                            tags_in_document.Add(t);
                        }
                    }

                    data_source.words_in_docs[d] = tags_in_document.ToArray();
                });

                // Stop() is only ever called on user cancel, so an incomplete loop means "cancelled".
                // Throwing here, on the calling thread, reaches the catch block below unwrapped.
                if (!scan_result.IsCompleted)
                {
                    throw new TaskCanceledException("Operation canceled by user");
                }

                // Initialise the LDA
                if (!progress_update_delegate("Building themes sampler"))
                {
                    throw new TaskCanceledException("Operation canceled by user");
                }

                // Use roughly half of the remaining cores, but never fewer than one thread.
                int num_threads = Math.Max(1, (Environment.ProcessorCount - 1) / 2);
                double alpha = 2.0 / num_topics;
                double beta = 0.01;
                data_source.lda_sampler = new LDASampler(alpha, beta, num_topics, data_source.words.Count, data_source.docs.Count, data_source.words_in_docs);

                LDASamplerMCSerial lda_sampler_mc = new LDASamplerMCSerial(data_source.lda_sampler, num_threads);
                lda_sampler_mc.MC(MAX_TOPIC_ITERATIONS, (iteration, num_iterations) =>
                {
                    if (!progress_update_delegate("Building themes", iteration, num_iterations))
                    {
                        // Abort the sampler by throwing out of its progress callback.
                        // NOTE(review): this relies on MC() invoking the callback on the calling thread
                        // (or at least not wrapping the exception) - confirm against LDASamplerMCSerial.
                        throw new TaskCanceledException("Operation canceled by user");
                    }
                });
            }
            catch (TaskCanceledException)
            {
                // This exception only occurs when the user *canceled* the process and should therefore
                // *not* be propagated. Instead, we report an aborted result:
                progress_update_delegate("Canceled Expedition", 1, 1);
                return null;
            }

            progress_update_delegate("Built Expedition", 1, 1);

            return data_source;
        }