/// <summary>
/// Looks up <paramref name="pdf_document"/> in the Expedition data source of its library and, when the
/// document is indexed there, copies out its per-topic density values.
/// </summary>
/// <param name="pdf_document">Document to look up. May be null, in which case the "not found" defaults are returned.</param>
/// <param name="doc_id">Receives the document's entry from <c>docs_index</c>, or -1 when not found.</param>
/// <param name="num_topics">Receives <c>LDAAnalysis.NUM_TOPICS</c>, or -1 when not found.</param>
/// <param name="density_of_topics_in_document">
/// Receives a new array holding <c>DensityOfTopicsInDocuments[doc_id, 0 .. num_topics-1]</c>, or null when not found.
/// </param>
private static void GetDensityForDocument(PDFDocument pdf_document, out int doc_id, out int num_topics, out float[] density_of_topics_in_document)
        {
            // "Not found" defaults; they stay in place on every early exit below.
            doc_id     = -1;
            num_topics = -1;
            density_of_topics_in_document = null;

            if (null == pdf_document)
            {
                return;
            }

            // Every link of the chain is null-safe (LibraryRef included), so a document without a
            // library reference yields "not found" instead of a NullReferenceException.
            ExpeditionDataSource eds = pdf_document.LibraryRef?.Xlibrary?.ExpeditionManager?.ExpeditionDataSource;

            if (null == eds)
            {
                return;
            }

            LDAAnalysis lda_analysis = eds.LDAAnalysis;
            if (!eds.docs_index.ContainsKey(pdf_document.Fingerprint))
            {
                return;
            }

            // Result!
            doc_id     = eds.docs_index[pdf_document.Fingerprint];
            num_topics = lda_analysis.NUM_TOPICS;
            density_of_topics_in_document = new float[num_topics];
            for (int i = 0; i < num_topics; ++i)
            {
                density_of_topics_in_document[i] = lda_analysis.DensityOfTopicsInDocuments[doc_id, i];
            }
        }
        /// <summary>
        /// Rebuilds the Expedition data source for <c>LibraryRef</c>, persists and installs it when the
        /// builder returned a result, and finally invokes the optional completion callback.
        /// </summary>
        /// <param name="num_topics">Number of topics handed to the builder.</param>
        /// <param name="add_autotags">Passed through to the builder.</param>
        /// <param name="add_tags">Passed through to the builder.</param>
        /// <param name="rebuiltexpeditioncompletedelegate">Called once at the end (also when nothing was stored); may be null.</param>
        public void RebuildExpedition(int num_topics, bool add_autotags, bool add_tags, RebuiltExpeditionCompleteDelegate rebuiltexpeditioncompletedelegate)
        {
            WPFDoEvents.AssertThisCodeIs_NOT_RunningInTheUIThread();

            StatusManager.Instance.UpdateStatus("Expedition", "Rebuilding Expedition");

            try
            {
                Library.IsBusyRegeneratingTags = true;

                ExpeditionDataSource rebuilt_data_source = ExpeditionBuilder.BuildExpeditionDataSource(LibraryRef, num_topics, add_autotags, add_tags, ExpeditionBuilderProgressUpdate);

                // A null result is neither saved nor installed: the current data source stays as it is.
                if (rebuilt_data_source != null)
                {
                    SerializeFile.SaveSafely(Filename_Store, rebuilt_data_source);
                    expedition_data_source = rebuilt_data_source;
                }
            }
            finally
            {
                // Runs on success and on failure alike.
                Library.IsBusyRegeneratingTags = false;

                StatusManager.Instance.ClearCancelled("Expedition");
            }
            Logging.Info("-Rebuilding Expedition");

            if (null == rebuiltexpeditioncompletedelegate)
            {
                return;
            }

            Logging.Info("+Notifying of rebuilt Expedition");
            rebuiltexpeditioncompletedelegate();
            Logging.Info("-Notifying of rebuilt Expedition");
        }
        /// <summary>
        /// Builds a frozen <see cref="LinearGradientBrush"/> made of one solid band per topic, where the
        /// width of band <c>i</c> is <c>distribution[i]</c> and its colour is the data source's colour <c>i</c>.
        /// </summary>
        /// <param name="web_library_detail">Library whose Expedition data source supplies the colours.</param>
        /// <param name="num_topics">Number of bands to emit; indexes both the colours and <paramref name="distribution"/>.</param>
        /// <param name="distribution">Band widths, accumulated into gradient offsets.</param>
        /// <returns>The brush, or <c>UNKNOWN_BRUSH</c> when the library has no Expedition data source.</returns>
        public static Brush GetBrushForDistribution(WebLibraryDetail web_library_detail, int num_topics, float[] distribution)
        {
            ExpeditionDataSource eds = web_library_detail.Xlibrary?.ExpeditionManager?.ExpeditionDataSource;

            if (null == eds)
            {
                return UNKNOWN_BRUSH;
            }

            Color[] topic_colours = eds.Colours;

            // Two stops per topic: both carry the same colour, which yields a hard-edged band.
            GradientStopCollection stops = new GradientStopCollection(2 * num_topics);

            double band_start = 0.0;
            for (int topic = 0; topic < num_topics; ++topic)
            {
                stops.Add(new GradientStop(topic_colours[topic], band_start));

                double band_end = band_start + distribution[topic];
                stops.Add(new GradientStop(topic_colours[topic], band_end));

                band_start = band_end;
            }

            LinearGradientBrush banded_brush = new LinearGradientBrush(stops);
            banded_brush.Freeze();

            return banded_brush;
        }
Example #4
0
        /// <summary>
        /// Manual smoke test: builds an Expedition data source for the guest library, using
        /// ceil(sqrt(document count)) topics, with both auto-tags and tags enabled and no progress callback.
        /// The built data source is discarded.
        /// </summary>
        public static void Test()
        {
            Library guest_library = Library.GuestInstance;

            // NOTE(review): presumably gives the guest library time to finish loading - confirm.
            Thread.Sleep(1000);

            double document_count = guest_library.PDFDocuments.Count;
            int    topic_count    = (int)Math.Ceiling(Math.Sqrt(document_count));

            ExpeditionDataSource built_data_source = BuildExpeditionDataSource(guest_library, topic_count, true, true, null);
        }
Example #5
0
        /// <summary>
        /// DataContext-changed handler: empties the PDF renderer placeholder and, when the new context is a
        /// document known to its library's Expedition data source, shows a renderer for it (or collapses
        /// the placeholder when the document does not exist).
        /// </summary>
        private void TopicOverviewControl_DataContextChanged(object sender, DependencyPropertyChangedEventArgs e)
        {
            // Always start from an empty placeholder.
            ObjPDFRendererControlPlaceholder.Children.Clear();

            AugmentedBindable<PDFDocument> bindable = DataContext as AugmentedBindable<PDFDocument>;
            if (bindable == null)
            {
                return;
            }

            PDFDocument pdf_document = bindable.Underlying;

            // Nothing to show without Expedition data.
            ExpeditionDataSource eds = pdf_document.Library.ExpeditionManager.ExpeditionDataSource;
            if (eds == null)
            {
                return;
            }

            LDAAnalysis lda_analysis = eds.LDAAnalysis;

            try
            {
                bool is_indexed = eds.docs_index.ContainsKey(pdf_document.Fingerprint);
                if (!is_indexed)
                {
                    MessageBoxes.Warn("Expedition doesn't have any information about this paper.  Please Refresh your Expedition.");
                    return;
                }
            }
            catch (Exception ex)
            {
                // A failed lookup is only logged; the handler still goes on to the renderer below.
                Logging.Error(ex, "There was a problem with Expedition for document {0}", pdf_document.Fingerprint);
            }

            if (!pdf_document.DocumentExists)
            {
                // No document: hide the placeholder and give its row no height.
                ObjPDFRendererControlPlaceholderBorder.Visibility = Visibility.Collapsed;
                ObjPDFRendererControlPlaceholderRow.Height        = new GridLength(0, GridUnitType.Pixel);
                return;
            }

            ObjPDFRendererControlPlaceholderBorder.Visibility = Visibility.Visible;
            ObjPDFRendererControlPlaceholderRow.Height        = new GridLength(1, GridUnitType.Star);

            PDFRendererControl renderer = new PDFRendererControl(pdf_document, false, PDFRendererControl.ZoomType.Zoom1Up);
            ObjPDFRendererControlPlaceholder.Children.Add(renderer);
        }
        /// <summary>
        /// Rebuilds the Expedition data source for <c>library</c>, saves it to <c>Filename_Store</c>,
        /// installs it as the current data source and then invokes the optional completion callback.
        /// </summary>
        /// <param name="num_topics">Number of topics handed to the builder.</param>
        /// <param name="add_autotags">Passed through to the builder.</param>
        /// <param name="add_tags">Passed through to the builder.</param>
        /// <param name="rebuiltexpeditioncompletedelegate">Called once the new data source is installed; may be null.</param>
        public void RebuildExpedition(int num_topics, bool add_autotags, bool add_tags, RebuiltExpeditionCompleteDelegate rebuiltexpeditioncompletedelegate)
        {
            Logging.Info("+Rebuilding Expedition");

            // Clear the "Expedition" cancel state before the build starts.
            StatusManager.Instance.ClearCancelled("Expedition");

            ExpeditionDataSource rebuilt_data_source = ExpeditionBuilder.BuildExpeditionDataSource(library, num_topics, add_autotags, add_tags, ExpeditionBuilderProgressUpdate);
            SerializeFile.SaveSafely(Filename_Store, rebuilt_data_source);
            expedition_data_source = rebuilt_data_source;

            Logging.Info("-Rebuilding Expedition");

            if (null == rebuiltexpeditioncompletedelegate)
            {
                return;
            }

            Logging.Info("+Notifying of rebuilt Expedition");
            rebuiltexpeditioncompletedelegate();
            Logging.Info("-Notifying of rebuilt Expedition");
        }
        /// <summary>
        /// Shows the instruction regions that apply to <paramref name="web_library_detail"/>:
        /// "no library" when it is null, "no expedition" when the library has no Expedition data source,
        /// otherwise the "stale" and/or "too small" hints as applicable.
        /// </summary>
        /// <param name="web_library_detail">The library to reflect; may be null.</param>
        public void ReflectLibrary(WebLibraryDetail web_library_detail)
        {
            // Reset
            RegionNoLibrary.Visibility          = Visibility.Collapsed;
            RegionNoExpedition.Visibility       = Visibility.Collapsed;
            RegionStaleExpedition.Visibility    = Visibility.Collapsed;
            RegionExpeditionTooSmall.Visibility = Visibility.Collapsed;

            // Reflect
            if (null == web_library_detail)
            {
                RegionNoLibrary.Visibility = Visibility.Visible;
                return;
            }

            ExpeditionDataSource eds = web_library_detail.Xlibrary?.ExpeditionManager?.ExpeditionDataSource;

            // The "no expedition" hint belongs to the case where there is NO data source. The test used
            // to be inverted, which showed the hint for existing expeditions and dereferenced the null
            // data source in the other branch.
            if (null == eds)
            {
                RegionNoExpedition.Visibility = Visibility.Visible;
                return;
            }

            ASSERT.Test(eds.words != null);
            ASSERT.Test(eds.docs != null);

            // Is this expedition getting old?
            // (eds is non-null here, so Xlibrary and ExpeditionManager are non-null as well.)
            if (web_library_detail.Xlibrary.ExpeditionManager.IsStale)
            {
                RegionStaleExpedition.Visibility = Visibility.Visible;
            }

            // Is this expedition too small?
            if (eds.docs.Count < 20 || eds.words.Count < 5)
            {
                RegionExpeditionTooSmall.Visibility = Visibility.Visible;
            }
        }
Example #8
0
        /// <summary>
        /// Click handler: writes the description of every Expedition topic of the current library, one per
        /// line, to a temporary text file and passes that file to <c>Process.Start</c>. Shows an error
        /// message instead when the library has no Expedition data source.
        /// </summary>
        private void ButtonExportTopics_Click(object sender, RoutedEventArgs e)
        {
            ExpeditionDataSource eds = web_library_detail.Xlibrary?.ExpeditionManager?.ExpeditionDataSource;

            if (null == eds)
            {
                MessageBoxes.Error("You need to first run Expedition for this library.");
                return;
            }

            StringBuilder topic_lines = new StringBuilder();

            int topic = 0;
            while (topic < eds.LDAAnalysis.NUM_TOPICS)
            {
                // The format string only prints argument {1} (the description); the index is passed but unused.
                topic_lines.AppendFormat("{1}\r\n", topic, eds.GetDescriptionForTopic(topic));
                ++topic;
            }

            string export_filename = TempFile.GenerateTempFilename("txt");
            File.WriteAllText(export_filename, topic_lines.ToString());
            Process.Start(export_filename);
        }
Example #9
0
        /// <summary>
        /// Makes <paramref name="picked_web_library_detail"/> the current library of this control:
        /// resets the library caption and topic list, updates the instructions control and, when a
        /// library was given, fills in its title, suggested theme count and one overview control per topic.
        /// </summary>
        /// <param name="picked_web_library_detail">The chosen library; may be null to clear the selection.</param>
        public void ChooseNewLibrary(WebLibraryDetail picked_web_library_detail)
        {
            web_library_detail = picked_web_library_detail;

            // Back to the "nothing chosen" state first.
            TextLibraryForExpedition.Text = "Click to choose a library.";
            ObjTopicListPanel.Children.Clear();

            ObjExpeditionInstructionsControl.ReflectLibrary(web_library_detail);

            if (null == web_library_detail)
            {
                return;
            }

            TextLibraryForExpedition.Text = web_library_detail.Title;

            // Falls back to 0 when the library or its expedition manager is not available.
            int suggested_theme_count = web_library_detail.Xlibrary?.ExpeditionManager?.RecommendedThemeCount ?? 0;
            TextExpeditionNumThemes.Text    = $"{suggested_theme_count}";
            TextExpeditionNumThemes.ToolTip = $"How many themes do you want in this Expedition?\n({ suggested_theme_count } suggested)";

            ExpeditionDataSource eds = web_library_detail.Xlibrary?.ExpeditionManager?.ExpeditionDataSource;
            if (null == eds)
            {
                return;
            }

            // One overview control per topic.
            for (int topic = 0; topic < eds.LDAAnalysis.NUM_TOPICS; ++topic)
            {
                TopicOverviewControl.TopicOverviewData overview_data = new TopicOverviewControl.TopicOverviewData();
                overview_data.web_library_detail = web_library_detail;
                overview_data.topic              = topic;

                // Subscribe before assigning the DataContext, as the original code does.
                TopicOverviewControl overview_control = new TopicOverviewControl();
                overview_control.PDFDocumentSelected += toc_PDFDocumentSelected;
                overview_control.DataContext          = overview_data;

                ObjTopicListPanel.Children.Add(overview_control);
            }
        }
        /// <summary>
        /// Rebuilds this control from its <c>TopicOverviewData</c> DataContext: the header shows the topic
        /// description and colour, followed by the highest-ranked documents of the topic, an optional
        /// "Show me more" button and a "Show me in Brainstorm" button.
        /// </summary>
        /// <param name="detailed_mode">
        /// When true up to 50 documents are listed and the "Show me more" button is omitted;
        /// otherwise up to 10 documents are listed.
        /// </param>
        private void PopulateDetail(bool detailed_mode)
        {
            // Wipe whatever was shown before.
            ObjHeader.Header  = null;
            ObjHeader.ToolTip = null;
            ObjPapers.Children.Clear();

            TopicOverviewData topic_data = DataContext as TopicOverviewData;
            if (null == topic_data)
            {
                return;
            }

            ExpeditionDataSource eds = topic_data.web_library_detail.Xlibrary?.ExpeditionManager?.ExpeditionDataSource;
            if (null == eds)
            {
                return;
            }

            LDAAnalysis lda_analysis = eds.LDAAnalysis;

            // Header: the topic description doubles as the tooltip.
            string topic_description = eds.GetDescriptionForTopic(topic_data.topic);
            ObjHeader.Header           = topic_description;
            ObjHeader.ToolTip          = topic_description;
            ObjHeader.HeaderBackground = new SolidColorBrush(eds.Colours[topic_data.topic]);

            // Documents
            int display_limit = detailed_mode ? 50 : 10;
            int docs_to_show  = Math.Min(display_limit, lda_analysis.NUM_DOCS);

            ASSERT.Test(topic_data.topic >= 0);
            ASSERT.Test(topic_data.topic < lda_analysis.NUM_TOPICS);

            for (int rank = 0; rank < docs_to_show && rank < eds.docs.Count; ++rank)
            {
                DocProbability[] ranked_docs = lda_analysis.DensityOfDocsInTopicsSorted[topic_data.topic];
                ASSERT.Test(ranked_docs != null);
                ASSERT.Test(ranked_docs.Length == lda_analysis.NUM_DOCS);

                DocProbability ranked_doc = ranked_docs[rank];
                ASSERT.Test(ranked_doc != null);

                // ranked_doc.doc indexes eds.docs, which holds the fingerprint to look up.
                string      fingerprint  = eds.docs[ranked_doc.doc];
                PDFDocument pdf_document = topic_data.web_library_detail.Xlibrary.GetDocumentByFingerprint(fingerprint);

                string percentage_prefix = String.Format("{0:N0}%", 100 * ranked_doc.prob) + " - ";

                // The alternator starts out false for every row.
                bool      alternator = false;
                TextBlock doc_row    = ListFormattingTools.GetDocumentTextBlock(pdf_document, ref alternator, Features.Expedition_TopicDocument, TopicDocumentPressed_MouseButtonEventHandler, percentage_prefix);
                ObjPapers.Children.Add(doc_row);
            }

            // "Show me more" - only in the short view and only when documents were left out.
            bool has_hidden_docs = docs_to_show < eds.docs.Count;
            if (!detailed_mode && has_hidden_docs)
            {
                AugmentedButton more_button = new AugmentedButton();
                more_button.Caption = "Show me more";
                more_button.Click  += button_more_Click;
                ObjPapers.Children.Add(more_button);
            }

            // "Show me in Brainstorm" - always present; carries the topic data in its Tag.
            AugmentedButton brainstorm_button = new AugmentedButton();
            brainstorm_button.Caption = "Show me in Brainstorm";
            brainstorm_button.Click  += button_brainstorm_Click;
            brainstorm_button.Tag     = topic_data;
            ObjPapers.Children.Add(brainstorm_button);
        }
Example #11
0
        /// <summary>
        /// Builds an <see cref="ExpeditionDataSource"/> for <paramref name="library"/>: collects the tag
        /// list and the documents with a local file, counts the tag occurrences in each document's OCR
        /// text and runs the LDA sampler over those counts.
        /// </summary>
        /// <param name="library">Library to analyse.</param>
        /// <param name="num_topics">Number of LDA topics.</param>
        /// <param name="add_autotags">Passed through to <c>BuildLibraryTagList</c>.</param>
        /// <param name="add_tags">Passed through to <c>BuildLibraryTagList</c>.</param>
        /// <param name="progress_update_delegate">
        /// Progress callback; a false return value is treated as "cancelled by the user".
        /// When null, <c>DefaultExpeditionBuilderProgressUpdate</c> is used.
        /// </param>
        /// <returns>
        /// The data source. It is returned after a cancel as well; documents that were not scanned
        /// then have an empty word list and no sampler iterations are run.
        /// </returns>
        public static ExpeditionDataSource BuildExpeditionDataSource(Library library, int num_topics, bool add_autotags, bool add_tags, ExpeditionBuilderProgressUpdateDelegate progress_update_delegate)
        {
            bool not_aborted_by_user = true;

            // Check that we have a progress update delegate
            if (null == progress_update_delegate)
            {
                progress_update_delegate = DefaultExpeditionBuilderProgressUpdate;
            }

            // What are the sources of data?
            progress_update_delegate("Assembling tags", 0);
            HashSet<string>   tags          = BuildLibraryTagList(library, add_autotags, add_tags);
            List<PDFDocument> pdf_documents = library.PDFDocumentsWithLocalFilePresent;

            // Initialise the datasource
            progress_update_delegate("Initialising datasource", 0);
            ExpeditionDataSource data_source = new ExpeditionDataSource();

            data_source.date_created = DateTime.UtcNow;

            progress_update_delegate("Adding tags", 0);
            data_source.words = new List<string>();
            foreach (string tag in tags)
            {
                data_source.words.Add(tag);
            }

            progress_update_delegate("Adding docs", 0);
            data_source.docs = new List<string>();
            foreach (PDFDocument pdf_document in pdf_documents)
            {
                data_source.docs.Add(pdf_document.Fingerprint);
            }

            progress_update_delegate("Rebuilding indices", 0);
            data_source.RebuildIndices();

            // Now go through each doc and find the tags that match
            data_source.words_in_docs = new int[data_source.docs.Count][];

            int total_processed = 0;

            Parallel.For(0, data_source.docs.Count, d =>
            {
                int total_processed_local = Interlocked.Increment(ref total_processed);
                if (0 == total_processed_local % 10)
                {
                    // The flag is shared by all iterations. It is only ever WRITTEN as false here, so a
                    // cancel seen by one thread cannot be overwritten with 'true' by another thread that
                    // read the flag earlier (which 'flag = flag && call()' allowed).
                    if (not_aborted_by_user && !progress_update_delegate("Scanning documents", total_processed_local / (double)data_source.docs.Count))
                    {
                        not_aborted_by_user = false;
                    }
                }

                List<int> tags_in_document = new List<int>();

                if (not_aborted_by_user)
                {
                    // Pad with spaces so that tags at the very start/end of the text match as well.
                    PDFDocument pdf_document = pdf_documents[d];
                    string full_text         = " " + pdf_document.PDFRenderer.GetFullOCRText() + " ";
                    string full_text_lower   = full_text.ToLower();

                    for (int t = 0; t < data_source.words.Count; ++t)
                    {
                        string tag = ' ' + data_source.words[t] + ' ';

                        // Tags containing lower-case characters are matched against the lower-cased
                        // text; all other tags are matched against the text as-is.
                        string full_text_to_search = full_text;
                        if (StringTools.HasSomeLowerCase(tag))
                        {
                            full_text_to_search = full_text_lower;
                            tag = tag.ToLower();
                        }

                        // The word index is recorded once per occurrence.
                        int num_appearances = StringTools.CountStringOccurence(full_text_to_search, tag);
                        for (int i = 0; i < num_appearances; ++i)
                        {
                            tags_in_document.Add(t);
                        }
                    }
                }

                // Always assigned, so a cancelled run leaves empty arrays rather than null entries.
                data_source.words_in_docs[d] = tags_in_document.ToArray();
            }
                         );

            // Initialise the LDA
            not_aborted_by_user = not_aborted_by_user && progress_update_delegate("Building themes sampler", 0);
            int    num_threads = Environment.ProcessorCount;
            double alpha       = 2.0 / num_topics;
            double beta        = 0.01;

            data_source.lda_sampler = new LDASampler(alpha, beta, num_topics, data_source.words.Count, data_source.docs.Count, data_source.words_in_docs);

            LDASamplerMCSerial lda_sampler_mc = new LDASamplerMCSerial(data_source.lda_sampler, num_threads);

            for (int i = 0; i < MAX_TOPIC_ITERATIONS; ++i)
            {
                if (!not_aborted_by_user)
                {
                    break;
                }
                not_aborted_by_user = not_aborted_by_user && progress_update_delegate("Building themes", i / (double)MAX_TOPIC_ITERATIONS);
                lda_sampler_mc.MC(10);
            }

            // Results
            if (not_aborted_by_user)
            {
                progress_update_delegate("Built Expedition", 1);
            }
            else
            {
                progress_update_delegate("Cancelled Expedition", 1);
            }

            return data_source;
        }
Example #12
0
        /// <summary>
        /// Suggests documents related to <paramref name="pdf_document"/> using the Expedition data of
        /// its library: candidates are the top documents of the document's strongest topics, scored with
        /// sqrt(topic probability * document probability).
        /// </summary>
        /// <param name="pdf_document">The document to find related documents for.</param>
        /// <param name="NUM_OTHERS">Maximum number of sorted candidates to consider.</param>
        /// <returns>
        /// The suggestions. Empty when there is no Expedition data, the document is not indexed, or
        /// an error occurred (errors are logged, not thrown). Candidates whose document can no longer
        /// be found in the library are skipped, so the list may hold fewer than <paramref name="NUM_OTHERS"/> entries.
        /// </returns>
        public static List<Result> GetRelevantOthers(PDFDocument pdf_document, int NUM_OTHERS)
        {
            List<Result> results = new List<Result>();

            try
            {
                ExpeditionDataSource eds = pdf_document.Library.ExpeditionManager.ExpeditionDataSource;
                if (null == eds)
                {
                    return results;
                }

                LDAAnalysis lda_analysis = eds.LDAAnalysis;

                if (!eds.docs_index.ContainsKey(pdf_document.Fingerprint))
                {
                    return results;
                }

                // Fill the similar papers
                int doc_id = eds.docs_index[pdf_document.Fingerprint];
                TopicProbability[] topics = lda_analysis.DensityOfTopicsInDocsSorted[doc_id];

                List<DocProbability> similar_docs = new List<DocProbability>();

                // Only look at the first 3 topics
                for (int t = 0; t < topics.Length && t < 3; ++t)
                {
                    int    topic      = topics[t].topic;
                    double topic_prob = topics[t].prob;

                    // Look at the first 50 docs in each topic (if there are that many)
                    DocProbability[] docs = lda_analysis.DensityOfDocsInTopicsSorted[topic];
                    for (int d = 0; d < docs.Length && d < 50; ++d)
                    {
                        int    doc      = docs[d].doc;
                        double doc_prob = docs[d].prob;

                        similar_docs.Add(new DocProbability(Math.Sqrt(topic_prob * doc_prob), doc));
                    }
                }

                // Now take the top N docs
                similar_docs.Sort();
                for (int i = 0; i < similar_docs.Count && i < NUM_OTHERS; ++i)
                {
                    string      fingerprint_to_look_for = eds.docs[similar_docs[i].doc];
                    PDFDocument pdf_document_similar    = pdf_document.Library.GetDocumentByFingerprint(fingerprint_to_look_for);

                    // The Expedition data may reference documents the library no longer has; do not
                    // hand a null document to the caller.
                    if (null == pdf_document_similar)
                    {
                        Logging.Warn("ExpeditionPaperSuggestions: Cannot find similar document anymore for fingerprint {0}", fingerprint_to_look_for);
                        continue;
                    }

                    results.Add(new Result {
                        pdf_document = pdf_document_similar, relevance = similar_docs[i].prob
                    });
                }
            }
            catch (Exception ex)
            {
                Logging.Error(ex, "There was a problem getting the relevant others for document {0}", pdf_document.Fingerprint);
            }

            return results;
        }
Example #13
0
        /// <summary>
        /// DataContext-changed handler: resets the control to the "please run Expedition" state and, when
        /// the new context is a document indexed by its library's Expedition data source, fills the chart
        /// with the document's top (at most 3) topics plus an "Others" remainder and shows it.
        /// </summary>
        private void ExpeditionPaperThemesControl_DataContextChanged(object sender, DependencyPropertyChangedEventArgs e)
        {
            // Default state: no data, hint visible, chart hidden.
            ObjSeriesTopics.DataSource        = null;
            TxtPleaseRunExpedition.Visibility = Visibility.Visible;
            ChartTopics.Visibility            = Visibility.Collapsed;

            AugmentedBindable<PDFDocument> bindable = DataContext as AugmentedBindable<PDFDocument>;
            if (bindable == null)
            {
                return;
            }

            PDFDocument pdf_document = bindable.Underlying;

            ExpeditionDataSource eds = pdf_document.Library.ExpeditionManager.ExpeditionDataSource;
            if (eds == null)
            {
                return;
            }

            LDAAnalysis lda_analysis = eds.LDAAnalysis;

            try
            {
                // Documents unknown to the Expedition keep the default state.
                if (!eds.docs_index.ContainsKey(pdf_document.Fingerprint))
                {
                    return;
                }

                int doc_id = eds.docs_index[pdf_document.Fingerprint];
                TopicProbability[] sorted_topics = lda_analysis.DensityOfTopicsInDocsSorted[doc_id];

                int shown_topics = Math.Min(sorted_topics.Length, 3);

                // One palette entry per shown topic plus one for the "Others" slice.
                Brush[]         palette = new Brush[shown_topics + 1];
                List<ChartItem> slices  = new List<ChartItem>();

                double others_share = 1.0;
                for (int t = 0; t < shown_topics; ++t)
                {
                    int    topic_index = sorted_topics[t].topic;
                    double topic_share = sorted_topics[t].prob;

                    ChartItem slice = new ChartItem();
                    slice.Topic      = eds.GetDescriptionForTopic(topic_index);
                    slice.Percentage = topic_share;
                    slices.Add(slice);

                    palette[t] = new SolidColorBrush(eds.Colours[topic_index]);

                    others_share -= topic_share;
                }

                // Whatever is left of 1.0 goes into a white "Others" slice.
                ChartItem others_slice = new ChartItem();
                others_slice.Topic      = "Others";
                others_slice.Percentage = others_share;
                slices.Add(others_slice);
                palette[shown_topics] = new SolidColorBrush(Colors.White);

                ObjChartTopicsArea.ColorModel.CustomPalette = palette;
                ObjChartTopicsArea.ColorModel.Palette       = ChartColorPalette.Custom;
                ObjSeriesTopics.DataSource = slices;

                // Animation settings; EnableAnimation is switched off and back on around them,
                // in the same order as before.
                ObjSeriesTopics.AnimationDuration = TimeSpan.FromMilliseconds(1000);
                ObjSeriesTopics.EnableAnimation   = false;
                ObjSeriesTopics.AnimateOneByOne   = true;
                ObjSeriesTopics.AnimateOption     = AnimationOptions.Fade;
                ObjSeriesTopics.EnableAnimation   = true;
            }
            catch (Exception ex)
            {
                // Only logged: the chart is still made visible below.
                Logging.Error(ex, "There was a problem while generating the topics chart for document {0}", pdf_document.Fingerprint);
            }

            TxtPleaseRunExpedition.Visibility = Visibility.Collapsed;
            ChartTopics.Visibility            = Visibility.Visible;
        }
        /// <summary>
        /// Suggests documents related to <paramref name="pdf_document"/> using the Expedition data of
        /// its library: candidates are the top documents of the document's strongest topics, scored with
        /// sqrt(topic probability * document probability).
        /// </summary>
        /// <param name="pdf_document">The document to find related documents for.</param>
        /// <param name="NUM_OTHERS">Maximum number of sorted candidates to consider.</param>
        /// <returns>
        /// The suggestions. Empty when there is no Expedition data, the document is not indexed, or
        /// an error occurred (errors are logged, not thrown). Candidates that can no longer be found
        /// in the library are logged and skipped.
        /// </returns>
        public static List<Result> GetRelevantOthers(PDFDocument pdf_document, int NUM_OTHERS)
        {
            List<Result> suggestions = new List<Result>();

            try
            {
                ExpeditionDataSource eds = pdf_document.LibraryRef?.Xlibrary?.ExpeditionManager?.ExpeditionDataSource;
                if (null == eds)
                {
                    return suggestions;
                }

                LDAAnalysis lda_analysis = eds.LDAAnalysis;

                if (!eds.docs_index.ContainsKey(pdf_document.Fingerprint))
                {
                    return suggestions;
                }

                int doc_id = eds.docs_index[pdf_document.Fingerprint];
                TopicProbability[] sorted_topics = lda_analysis.DensityOfTopicsInDocsSorted[doc_id];

                List<DocProbability> candidates = new List<DocProbability>();

                // Only the first 3 topics of the document are considered.
                for (int t = 0; t < sorted_topics.Length && t < 3; ++t)
                {
                    int    topic      = sorted_topics[t].topic;
                    double topic_prob = sorted_topics[t].prob;

                    ASSERT.Test(topic >= 0);
                    ASSERT.Test(topic < lda_analysis.NUM_TOPICS);

                    DocProbability[] topic_docs = lda_analysis.DensityOfDocsInTopicsSorted[topic];
                    ASSERT.Test(topic_docs != null);
                    ASSERT.Test(topic_docs.Length == lda_analysis.NUM_DOCS);

                    // At most the first 50 documents of each topic.
                    int docs_to_scan = Math.Min(topic_docs.Length, 50);
                    for (int d = 0; d < docs_to_scan; ++d)
                    {
                        DocProbability topic_doc = topic_docs[d];
                        double score = Math.Sqrt(topic_prob * topic_doc.prob);
                        candidates.Add(new DocProbability(score, topic_doc.doc));
                    }
                }

                // Take the first NUM_OTHERS candidates in sorted order.
                candidates.Sort();
                for (int i = 0; i < candidates.Count && i < NUM_OTHERS; ++i)
                {
                    string      fingerprint_to_look_for = eds.docs[candidates[i].doc];
                    PDFDocument pdf_document_similar    = pdf_document.LibraryRef.Xlibrary.GetDocumentByFingerprint(fingerprint_to_look_for);

                    if (null != pdf_document_similar)
                    {
                        Result suggestion = new Result();
                        suggestion.pdf_document = pdf_document_similar;
                        suggestion.relevance    = candidates[i].prob;
                        suggestions.Add(suggestion);
                    }
                    else
                    {
                        Logging.Warn("ExpeditionPaperSuggestions: Cannot find similar document anymore for fingerprint {0}", fingerprint_to_look_for);
                    }
                }
            }
            catch (Exception ex)
            {
                Logging.Error(ex, "There was a problem getting the relevant others for document {0}", pdf_document.Fingerprint);
            }

            return suggestions;
        }
        /// <summary>
        /// Builds an <see cref="ExpeditionDataSource"/> for <paramref name="web_library_detail"/>:
        /// collects the tag list and the documents with a local file, counts the tag occurrences in each
        /// document's OCR text and runs the LDA sampler over those counts.
        /// Must not be called on the UI thread.
        /// </summary>
        /// <param name="web_library_detail">Library to analyse.</param>
        /// <param name="num_topics">Number of LDA topics.</param>
        /// <param name="add_autotags">Passed through to <c>BuildLibraryTagList</c>.</param>
        /// <param name="add_tags">Passed through to <c>BuildLibraryTagList</c>.</param>
        /// <param name="progress_update_delegate">
        /// Progress callback; a false return value cancels the build.
        /// When null, <c>DefaultExpeditionBuilderProgressUpdate</c> is used.
        /// </param>
        /// <returns>The finished data source, or null when the build was cancelled through the progress callback.</returns>
        public static ExpeditionDataSource BuildExpeditionDataSource(WebLibraryDetail web_library_detail, int num_topics, bool add_autotags, bool add_tags, ExpeditionBuilderProgressUpdateDelegate progress_update_delegate)
        {
            WPFDoEvents.AssertThisCodeIs_NOT_RunningInTheUIThread();

            // Initialise the datasource
            ExpeditionDataSource data_source = new ExpeditionDataSource();

            data_source.date_created = DateTime.UtcNow;

            try
            {
                // Check that we have a progress update delegate
                if (null == progress_update_delegate)
                {
                    progress_update_delegate = DefaultExpeditionBuilderProgressUpdate;
                }

                // What are the sources of data?
                progress_update_delegate("Assembling tags");
                HashSet<string>   tags          = BuildLibraryTagList(web_library_detail, add_autotags, add_tags);
                List<PDFDocument> pdf_documents = web_library_detail.Xlibrary.PDFDocumentsWithLocalFilePresent;

                progress_update_delegate("Adding tags");
                data_source.words = new List<string>();
                foreach (string tag in tags)
                {
                    data_source.words.Add(tag);
                }

                progress_update_delegate("Adding docs");
                data_source.docs = new List<string>();
                foreach (PDFDocument pdf_document in pdf_documents)
                {
                    data_source.docs.Add(pdf_document.Fingerprint);
                }

                progress_update_delegate("Rebuilding indices");
                data_source.RebuildIndices();

                // Now go through each doc and find the tags that match
                int DATA_SOURCE_DOCS_COUNT = data_source.docs.Count;
                data_source.words_in_docs = new int[DATA_SOURCE_DOCS_COUNT][];

                Parallel.For(0, DATA_SOURCE_DOCS_COUNT, d =>
                {
                    if (0 == d % 50)
                    {
                        if (!progress_update_delegate("Scanning documents", d, DATA_SOURCE_DOCS_COUNT))
                        {
                            // An exception is used to cancel the remaining iterations. Note that
                            // Parallel.For() does NOT rethrow it as-is: exceptions thrown by the loop
                            // body reach the caller wrapped in an AggregateException, which is handled
                            // in the catch clause at the end of this method.
                            throw new TaskCanceledException("Operation canceled by user");
                        }
                    }

                    List<int> tags_in_document = new List<int>();

                    {
                        // Pad with spaces so that tags at the very start/end of the text match as well.
                        PDFDocument pdf_document = pdf_documents[d];
                        string full_text         = " " + pdf_document.GetFullOCRText() + " ";
                        string full_text_lower   = full_text.ToLower();

                        for (int t = 0; t < data_source.words.Count; ++t)
                        {
                            string tag = ' ' + data_source.words[t] + ' ';

                            // Tags containing lower-case characters are matched against the lower-cased
                            // text; all other tags are matched against the text as-is.
                            string full_text_to_search = full_text;
                            if (StringTools.HasSomeLowerCase(tag))
                            {
                                full_text_to_search = full_text_lower;
                                tag = tag.ToLower();
                            }

                            // The word index is recorded once per occurrence.
                            int num_appearances = StringTools.CountStringOccurence(full_text_to_search, tag);
                            for (int i = 0; i < num_appearances; ++i)
                            {
                                tags_in_document.Add(t);
                            }
                        }
                    }

                    data_source.words_in_docs[d] = tags_in_document.ToArray();
                }
                             );

                // Initialise the LDA
                if (!progress_update_delegate("Building themes sampler"))
                {
                    throw new TaskCanceledException("Operation canceled by user");
                }

                // Use about half of the remaining cores, but never fewer than one thread.
                // (Math.Min(1, ...) capped this at 1 and produced 0 threads on 1- and 2-core machines.)
                int    num_threads = Math.Max(1, (Environment.ProcessorCount - 1) / 2);
                double alpha       = 2.0 / num_topics;
                double beta        = 0.01;
                data_source.lda_sampler = new LDASampler(alpha, beta, num_topics, data_source.words.Count, data_source.docs.Count, data_source.words_in_docs);

                LDASamplerMCSerial lda_sampler_mc = new LDASamplerMCSerial(data_source.lda_sampler, num_threads);
                lda_sampler_mc.MC(MAX_TOPIC_ITERATIONS, (iteration, num_iterations) =>
                {
                    if (!progress_update_delegate("Building themes", iteration, num_iterations))
                    {
                        throw new TaskCanceledException("Operation canceled by user");
                    }
                });
            }
            catch (TaskCanceledException)
            {
                // This exception should only occur when the user *canceled* the process and should therefore
                // *not* be propagated. Instead, we have to report an aborted result:
                progress_update_delegate("Canceled Expedition", 1, 1);
                return null;
            }
            catch (AggregateException ex)
            {
                // Cancellation raised inside Parallel.For() arrives here. Handle() rethrows (as a new
                // AggregateException) every inner exception that is not a cancellation, so real failures
                // still propagate.
                ex.Flatten().Handle(inner => inner is TaskCanceledException);

                progress_update_delegate("Canceled Expedition", 1, 1);
                return null;
            }

            progress_update_delegate("Built Expedition", 1, 1);

            return data_source;
        }