/// <summary>
/// Fetches the per-topic density row for one document from the Expedition data of its library.
/// </summary>
/// <param name="pdf_document">Document to look up. May be null, in which case the outputs keep their "not found" values.</param>
/// <param name="doc_id">Receives the document's index taken from <c>docs_index</c>, or -1 when nothing was found.</param>
/// <param name="num_topics">Receives the analysis' <c>NUM_TOPICS</c>, or -1 when nothing was found.</param>
/// <param name="density_of_topics_in_document">Receives a new array holding one density per topic, or null when nothing was found.</param>
private static void GetDensityForDocument(PDFDocument pdf_document, out int doc_id, out int num_topics, out float[] density_of_topics_in_document)
{
    // Start with the "not found" answer; it is replaced only once the document has been located.
    doc_id = -1;
    num_topics = -1;
    density_of_topics_in_document = null;

    if (null == pdf_document)
    {
        return;
    }

    ExpeditionDataSource data_source = pdf_document.LibraryRef.Xlibrary?.ExpeditionManager?.ExpeditionDataSource;
    if (null == data_source)
    {
        // No Expedition data is available for this library.
        return;
    }

    LDAAnalysis analysis = data_source.LDAAnalysis;
    if (!data_source.docs_index.ContainsKey(pdf_document.Fingerprint))
    {
        // The Expedition data does not know this document.
        return;
    }

    // Result!
    doc_id = data_source.docs_index[pdf_document.Fingerprint];
    num_topics = analysis.NUM_TOPICS;
    density_of_topics_in_document = new float[num_topics];

    // Copy the document's row out of the [doc, topic] density matrix.
    int topic = 0;
    while (topic < analysis.NUM_TOPICS)
    {
        density_of_topics_in_document[topic] = analysis.DensityOfTopicsInDocuments[doc_id, topic];
        ++topic;
    }
}
/// <summary>
/// Builds a plain-text report of how many documents place each topic at each rank of their top topics.
/// </summary>
/// <returns>
/// For every topic a line with its keyword description followed by a line of per-rank counts,
/// then a blank line, then a tab-separated table (topic index followed by the same counts).
/// </returns>
public string DumpTopicsPopularity()
{
    StringBuilder report = new StringBuilder();
    LDAAnalysis analysis = LDAAnalysis;

    // At most the first five ranks are tracked (fewer when there are fewer topics).
    int ranks_tracked = Math.Min(5, analysis.NUM_TOPICS);
    int[,] rank_counts = new int[analysis.NUM_TOPICS, ranks_tracked];   // [topic, rank]

    // Tally, per topic and rank, how many documents have that topic at that rank.
    TopicProbability[][] top_topics_per_doc = analysis.DensityOfTop5TopicsInDocsSorted;   // [doc][rank]
    for (int doc = 0; doc < analysis.NUM_DOCS; ++doc)
    {
        for (int rank = 0; rank < ranks_tracked; ++rank)
        {
            rank_counts[top_topics_per_doc[doc][rank].topic, rank] += 1;
        }
    }

    // Both sections walk the topics in the same order, so build them side by side:
    // the descriptive section goes straight into the report, the raw table is appended afterwards.
    StringBuilder table = new StringBuilder();
    for (int topic = 0; topic < analysis.NUM_TOPICS; ++topic)
    {
        report.AppendFormat("{0}", GetDescriptionForTopic(topic, true, ";", false));
        report.AppendLine();

        table.AppendFormat("{0}\t", topic);

        for (int rank = 0; rank < ranks_tracked; ++rank)
        {
            int count = rank_counts[topic, rank];
            report.AppendFormat("{0}de:{1}\t", rank + 1, count);
            table.AppendFormat("{0}\t", count);
        }

        report.AppendLine();
        table.AppendLine();
    }

    report.AppendLine();
    report.Append(table.ToString());

    return report.ToString();
}
/// <summary>
/// Rebuilds the PDF preview area whenever the control's DataContext changes.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event details (not used).</param>
private void TopicOverviewControl_DataContextChanged(object sender, DependencyPropertyChangedEventArgs e)
{
    // Always start from an empty preview area.
    ObjPDFRendererControlPlaceholder.Children.Clear();

    AugmentedBindable<PDFDocument> bindable = DataContext as AugmentedBindable<PDFDocument>;
    if (null == bindable)
    {
        return;
    }

    PDFDocument pdf_document = bindable.Underlying;

    ExpeditionDataSource eds = pdf_document.Library.ExpeditionManager.ExpeditionDataSource;
    if (null == eds)
    {
        return;
    }

    // The value is not used below; the property read is retained in case the getter performs work.
    LDAAnalysis lda_analysis = eds.LDAAnalysis;

    try
    {
        bool known_to_expedition = eds.docs_index.ContainsKey(pdf_document.Fingerprint);
        if (!known_to_expedition)
        {
            MessageBoxes.Warn("Expedition doesn't have any information about this paper. Please Refresh your Expedition.");
            return;
        }
    }
    catch (Exception ex)
    {
        // A failed lookup is logged only; the preview below is still attempted.
        Logging.Error(ex, "There was a problem with Expedition for document {0}", pdf_document.Fingerprint);
    }

    // Show the preview area only when the document's file exists, otherwise collapse it.
    bool has_file = pdf_document.DocumentExists;
    ObjPDFRendererControlPlaceholderBorder.Visibility = has_file ? Visibility.Visible : Visibility.Collapsed;
    ObjPDFRendererControlPlaceholderRow.Height = has_file
        ? new GridLength(1, GridUnitType.Star)
        : new GridLength(0, GridUnitType.Pixel);

    if (has_file)
    {
        ObjPDFRendererControlPlaceholder.Children.Add(
            new PDFRendererControl(pdf_document, false, PDFRendererControl.ZoomType.Zoom1Up));
    }
}
/// <summary>
/// Writes to the console, for every topic, its description followed by up to ten of the topic's
/// leading words as "rank &amp; word &amp; probability" rows.
/// </summary>
public void PrintStats_TOPICS()
{
    const int MAX_WORDS_PER_TOPIC = 10;

    LDAAnalysis lda = LDAAnalysis;

    for (int topic = 0; topic < lda.NUM_TOPICS; ++topic)
    {
        Console.WriteLine("Topic: {0}", GetDescriptionForTopic(topic));

        // Never read past the end of the topic's word list: a vocabulary with fewer than
        // ten words would otherwise raise IndexOutOfRangeException.
        WordProbability[] sorted_words = lda.DensityOfWordsInTopicsSorted[topic];
        int words_to_print = (null == sorted_words) ? 0 : Math.Min(MAX_WORDS_PER_TOPIC, sorted_words.Length);

        for (int word = 0; word < words_to_print; ++word)
        {
            WordProbability entry = sorted_words[word];
            Console.WriteLine("{0} & {1} & {2}", word + 1, words[entry.word], entry.prob);
        }

        Console.WriteLine();
    }
}
/// <summary>
/// Writes one console line per document: "Doc N:" followed by a tab-separated list of the
/// document's topic densities scaled by 100 and formatted without decimals.
/// </summary>
public void PrintStats_DOCS()
{
    LDAAnalysis analysis = LDAAnalysis;

    int doc = 0;
    while (doc < analysis.NUM_DOCS)
    {
        Console.Write("Doc {0}:", doc);

        int topic = 0;
        while (topic < analysis.NUM_TOPICS)
        {
            Console.Write("\t{0:0}", 100 * analysis.DensityOfTopicsInDocuments[doc, topic]);
            ++topic;
        }

        Console.WriteLine();
        ++doc;
    }
}
/// <summary>
/// Describes a topic by joining up to five of its leading words.
/// </summary>
/// <param name="topic">Zero-based topic index.</param>
/// <param name="include_topic_number">When true the description is prefixed with the one-based topic number and ". ".</param>
/// <param name="separator">Text placed after each word; a trailing separator is removed from the result.</param>
/// <param name="stop_at_word_probability_jump">
/// Applies when the previous word's probability is more than ten times the current word's:
/// true stops the list there, false inserts " // " before the current word and continues.
/// </param>
/// <returns>The description text.</returns>
public string GetDescriptionForTopic(int topic, bool include_topic_number = true, string separator = "; ", bool stop_at_word_probability_jump = true)
{
    StringBuilder text = new StringBuilder();

    if (include_topic_number)
    {
        text.AppendFormat("{0}. ", topic + 1);
    }

    LDAAnalysis analysis = LDAAnalysis;
    WordProbability[] ranked_words = analysis.DensityOfWordsInTopicsSorted[topic];
    ASSERT.Test(ranked_words != null);

    double previous_prob = 0;
    int index = 0;
    while (index < 5 && index < analysis.NUM_WORDS)
    {
        WordProbability entry = ranked_words[index];
        ASSERT.Test(entry != null);

        bool probability_jump = previous_prob / entry.prob > 10;
        if (probability_jump && stop_at_word_probability_jump)
        {
            break;
        }
        if (probability_jump)
        {
            // Mark the drop-off but keep listing words.
            text.Append(" // ");
        }

        previous_prob = entry.prob;

        text.AppendFormat("{0}", words[entry.word]);
        text.Append(separator);

        ++index;
    }

    // Drop the separator left behind by the last word.
    string description = text.ToString();
    if (!description.EndsWith(separator))
    {
        return description;
    }

    return description.Substring(0, description.Length - separator.Length);
}
/// <summary>
/// Writes one line per topic (its description) to a temporary text file and opens that file.
/// Shows an error message instead when the library has no Expedition data.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event details (not used).</param>
void ButtonExportTopics_Click(object sender, RoutedEventArgs e)
{
    ExpeditionDataSource eds = library.ExpeditionManager.ExpeditionDataSource;
    if (null == eds)
    {
        MessageBoxes.Error("You need to first run Expedition for this library.");
        return;
    }

    LDAAnalysis lda_analysis = eds.LDAAnalysis;

    // One description per line.
    StringBuilder export = new StringBuilder();
    int topic = 0;
    while (topic < lda_analysis.NUM_TOPICS)
    {
        string topic_description = eds.GetDescriptionForTopic(topic);
        export.AppendFormat("{1}\r\n", topic, topic_description);
        ++topic;
    }

    // Save to a temp file and hand it to whatever is registered to open it.
    string filename = TempFile.GenerateTempFilename("txt");
    File.WriteAllText(filename, export.ToString());
    Process.Start(filename);
}
/// <summary>
/// Adds documents similar to <paramref name="doc"/> to a node, using the document's topic
/// distribution from the Expedition data of its library.
/// </summary>
/// <param name="doc">Document whose topic distribution drives the search.</param>
/// <param name="node_control">Node that receives the similar documents.</param>
private static void ExpandSimilars(PDFDocument doc, NodeControl node_control)
{
    WPFDoEvents.AssertThisCodeIs_NOT_RunningInTheUIThread();
    ASSERT.Test(doc != null);

    FeatureTrackingManager.Instance.UseFeature(Features.Brainstorm_ExploreLibrary_Document_Similars);

    if (doc == null)
    {
        return;
    }

    ASSERT.Test(doc.LibraryRef.Xlibrary != null);

    ExpeditionDataSource eds = doc.LibraryRef.Xlibrary?.ExpeditionManager?.ExpeditionDataSource;
    if (null == eds)
    {
        Logging.Warn("Expedition has not been run for library '{0}'.", doc.LibraryRef.Title);
        return;
    }

    if (!eds.docs_index.ContainsKey(doc.Fingerprint))
    {
        // The Expedition data does not cover this document; nothing to add.
        return;
    }

    int doc_id = eds.docs_index[doc.Fingerprint];

    // Copy the document's row of the [doc, topic] density matrix into its own vector.
    LDAAnalysis analysis = eds.LDAAnalysis;
    float[,] densities = analysis.DensityOfTopicsInDocuments;
    float[] distribution = new float[analysis.NUM_TOPICS];
    int topic = 0;
    while (topic < analysis.NUM_TOPICS)
    {
        distribution[topic] = densities[doc_id, topic];
        ++topic;
    }

    ThemeNodeContentControl.AddDocumentsSimilarToDistribution(node_control, doc.LibraryRef, eds, distribution);
}
/// <summary>
/// Fills the control for the topic in the DataContext: the header with the topic's description
/// and colour, a list of the topic's leading documents with their percentages, and action buttons.
/// </summary>
/// <param name="detailed_mode">
/// True lists up to 50 documents; false lists up to 10 and offers a "Show me more" button
/// when the data source holds more documents than were listed.
/// </param>
private void PopulateDetail(bool detailed_mode)
{
    // Wipe whatever was shown before.
    ObjHeader.Header = null;
    ObjHeader.ToolTip = null;
    ObjPapers.Children.Clear();

    TopicOverviewData tod = DataContext as TopicOverviewData;
    if (null == tod)
    {
        return;
    }

    ExpeditionDataSource eds = tod.web_library_detail.Xlibrary?.ExpeditionManager?.ExpeditionDataSource;
    if (null == eds)
    {
        return;
    }

    LDAAnalysis analysis = eds.LDAAnalysis;

    // Header: the topic's terms, also used as tooltip, on the topic's colour.
    string header_text = eds.GetDescriptionForTopic(tod.topic);
    ObjHeader.Header = header_text;
    ObjHeader.ToolTip = header_text;
    ObjHeader.HeaderBackground = new SolidColorBrush(eds.Colours[tod.topic]);

    // Documents
    int docs_wanted = detailed_mode ? 50 : 10;
    int NUM_DOCS = Math.Min(docs_wanted, analysis.NUM_DOCS);

    ASSERT.Test(tod.topic >= 0);
    ASSERT.Test(tod.topic < analysis.NUM_TOPICS);

    int d = 0;
    while (d < NUM_DOCS && d < eds.docs.Count)
    {
        DocProbability[] ranked_docs = analysis.DensityOfDocsInTopicsSorted[tod.topic];
        ASSERT.Test(ranked_docs != null);
        ASSERT.Test(ranked_docs.Length == analysis.NUM_DOCS);

        DocProbability entry = ranked_docs[d];
        ASSERT.Test(entry != null);

        PDFDocument pdf_document = tod.web_library_detail.Xlibrary.GetDocumentByFingerprint(eds.docs[entry.doc]);
        string doc_percentage = String.Format("{0:N0}%", 100 * entry.prob);

        // A fresh flag is handed to the formatter for every row.
        bool alternator = false;
        TextBlock row = ListFormattingTools.GetDocumentTextBlock(
            pdf_document,
            ref alternator,
            Features.Expedition_TopicDocument,
            TopicDocumentPressed_MouseButtonEventHandler,
            doc_percentage + " - ");
        ObjPapers.Children.Add(row);

        ++d;
    }

    // The MORE button
    bool offer_more = !detailed_mode && NUM_DOCS < eds.docs.Count;
    if (offer_more)
    {
        AugmentedButton button_more = new AugmentedButton { Caption = "Show me more" };
        button_more.Click += button_more_Click;
        ObjPapers.Children.Add(button_more);
    }

    // The BRAINSTORM button; it carries the topic data in its Tag.
    AugmentedButton button_brainstorm = new AugmentedButton { Caption = "Show me in Brainstorm" };
    button_brainstorm.Click += button_brainstorm_Click;
    button_brainstorm.Tag = tod;
    ObjPapers.Children.Add(button_brainstorm);
}
/// <summary>
/// Suggests documents related to <paramref name="pdf_document"/> using the Expedition topic analysis
/// of its library.
/// </summary>
/// <remarks>
/// Candidates are the first 50 documents of each of the document's 3 leading topics. Each candidate is
/// scored with sqrt(topic probability * document probability), the candidates are sorted with
/// <c>DocProbability</c>'s own ordering, and the first <paramref name="NUM_OTHERS"/> are looked up.
/// Candidates whose document can no longer be found in the library are skipped with a warning,
/// so fewer than <paramref name="NUM_OTHERS"/> results may be returned.
/// Any exception is logged and whatever was collected so far is returned.
/// </remarks>
/// <param name="pdf_document">Document to find relatives for; null yields an empty list.</param>
/// <param name="NUM_OTHERS">Maximum number of candidates to turn into results.</param>
/// <returns>A list (never null) of related documents with their relevance score.</returns>
public static List<Result> GetRelevantOthers(PDFDocument pdf_document, int NUM_OTHERS)
{
    List<Result> results = new List<Result>();

    // Without a document there is nothing to look up; bailing out here also keeps the
    // error handler below (which reads pdf_document.Fingerprint) from throwing itself.
    if (null == pdf_document)
    {
        return results;
    }

    try
    {
        ExpeditionDataSource eds = pdf_document.Library.ExpeditionManager.ExpeditionDataSource;
        if (null == eds)
        {
            return results;
        }

        LDAAnalysis lda_analysis = eds.LDAAnalysis;

        if (!eds.docs_index.ContainsKey(pdf_document.Fingerprint))
        {
            return results;
        }

        // Fill the similar papers
        int doc_id = eds.docs_index[pdf_document.Fingerprint];
        TopicProbability[] topics = lda_analysis.DensityOfTopicsInDocsSorted[doc_id];

        List<DocProbability> similar_docs = new List<DocProbability>();

        // Only look at the first 3 topics
        for (int t = 0; t < topics.Length && t < 3; ++t)
        {
            int topic = topics[t].topic;
            double topic_prob = topics[t].prob;

            // Look at the first 50 docs in each topic (if there are that many)
            DocProbability[] docs = lda_analysis.DensityOfDocsInTopicsSorted[topic];
            for (int d = 0; d < docs.Length && d < 50; ++d)
            {
                int doc = docs[d].doc;
                double doc_prob = docs[d].prob;

                similar_docs.Add(new DocProbability(Math.Sqrt(topic_prob * doc_prob), doc));
            }
        }

        // Now take the top N docs
        similar_docs.Sort();
        for (int i = 0; i < similar_docs.Count && i < NUM_OTHERS; ++i)
        {
            string fingerprint_to_look_for = eds.docs[similar_docs[i].doc];
            PDFDocument pdf_document_similar = pdf_document.Library.GetDocumentByFingerprint(fingerprint_to_look_for);

            // The lookup may come back empty (e.g. the analysis still lists a document the library
            // no longer has); never hand callers a result without a document.
            if (null == pdf_document_similar)
            {
                Logging.Warn("GetRelevantOthers: cannot find similar document for fingerprint {0}", fingerprint_to_look_for);
                continue;
            }

            results.Add(new Result { pdf_document = pdf_document_similar, relevance = similar_docs[i].prob });
        }
    }
    catch (Exception ex)
    {
        Logging.Error(ex, "There was a problem getting the relevant others for document {0}", pdf_document.Fingerprint);
    }

    return results;
}
/// <summary>
/// Redraws the topics pie chart for the document in the DataContext: one slice for each of the
/// document's (up to three) leading topics plus a white "Others" slice for the remainder.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event details (not used).</param>
private void ExpeditionPaperThemesControl_DataContextChanged(object sender, DependencyPropertyChangedEventArgs e)
{
    // Fall back to the "please run Expedition" presentation until a chart has been built.
    ObjSeriesTopics.DataSource = null;
    TxtPleaseRunExpedition.Visibility = Visibility.Visible;
    ChartTopics.Visibility = Visibility.Collapsed;

    AugmentedBindable<PDFDocument> bindable = DataContext as AugmentedBindable<PDFDocument>;
    if (null == bindable)
    {
        return;
    }

    PDFDocument pdf_document = bindable.Underlying;

    ExpeditionDataSource eds = pdf_document.Library.ExpeditionManager.ExpeditionDataSource;
    if (null == eds)
    {
        return;
    }

    LDAAnalysis analysis = eds.LDAAnalysis;

    // Draw the pie chart
    try
    {
        if (!eds.docs_index.ContainsKey(pdf_document.Fingerprint))
        {
            return;
        }

        int doc_id = eds.docs_index[pdf_document.Fingerprint];
        TopicProbability[] ranked_topics = analysis.DensityOfTopicsInDocsSorted[doc_id];

        int slice_count = Math.Min(ranked_topics.Length, 3);
        List<Brush> palette = new List<Brush>(slice_count + 1);
        List<ChartItem> slices = new List<ChartItem>();

        // Whatever the named slices do not account for ends up in "Others".
        double others_share = 1.0;
        for (int t = 0; t < slice_count; ++t)
        {
            TopicProbability ranked = ranked_topics[t];

            string topic_name = eds.GetDescriptionForTopic(ranked.topic);
            double percentage = ranked.prob;

            slices.Add(new ChartItem { Topic = topic_name, Percentage = percentage });
            palette.Add(new SolidColorBrush(eds.Colours[ranked.topic]));

            others_share -= percentage;
        }

        slices.Add(new ChartItem { Topic = "Others", Percentage = others_share });
        palette.Add(new SolidColorBrush(Colors.White));

        ObjChartTopicsArea.ColorModel.CustomPalette = palette.ToArray();
        ObjChartTopicsArea.ColorModel.Palette = ChartColorPalette.Custom;

        ObjSeriesTopics.DataSource = slices;

        // Animation is switched off, configured, then switched back on, in exactly this order.
        ObjSeriesTopics.AnimationDuration = TimeSpan.FromMilliseconds(1000);
        ObjSeriesTopics.EnableAnimation = false;
        ObjSeriesTopics.AnimateOneByOne = true;
        ObjSeriesTopics.AnimateOption = AnimationOptions.Fade;
        ObjSeriesTopics.EnableAnimation = true;
    }
    catch (Exception ex)
    {
        Logging.Error(ex, "There was a problem while generating the topics chart for document {0}", pdf_document.Fingerprint);
    }

    // Reached after a successful draw and also after a logged failure.
    TxtPleaseRunExpedition.Visibility = Visibility.Collapsed;
    ChartTopics.Visibility = Visibility.Visible;
}
/// <summary>
/// Suggests documents related to <paramref name="pdf_document"/> using the Expedition topic analysis
/// of its library.
/// </summary>
/// <remarks>
/// Candidates are the first 50 documents of each of the document's 3 leading topics, scored with
/// sqrt(topic probability * document probability) and sorted with <c>DocProbability</c>'s own ordering.
/// Candidates whose document cannot be found in the library are logged and skipped.
/// Any exception is logged and whatever was collected so far is returned.
/// </remarks>
/// <param name="pdf_document">Document to find relatives for.</param>
/// <param name="NUM_OTHERS">Maximum number of candidates to turn into results.</param>
/// <returns>A list (never null) of related documents with their relevance score.</returns>
public static List<Result> GetRelevantOthers(PDFDocument pdf_document, int NUM_OTHERS)
{
    List<Result> results = new List<Result>();

    try
    {
        ExpeditionDataSource eds = pdf_document.LibraryRef?.Xlibrary?.ExpeditionManager?.ExpeditionDataSource;
        if (null == eds)
        {
            return results;
        }

        LDAAnalysis analysis = eds.LDAAnalysis;

        if (!eds.docs_index.ContainsKey(pdf_document.Fingerprint))
        {
            return results;
        }

        // Collect the candidate papers
        int doc_id = eds.docs_index[pdf_document.Fingerprint];
        TopicProbability[] ranked_topics = analysis.DensityOfTopicsInDocsSorted[doc_id];

        List<DocProbability> candidates = new List<DocProbability>();

        // Only the first 3 topics are considered
        int topics_to_scan = Math.Min(ranked_topics.Length, 3);
        for (int t = 0; t < topics_to_scan; ++t)
        {
            int topic = ranked_topics[t].topic;
            double topic_prob = ranked_topics[t].prob;

            ASSERT.Test(topic >= 0);
            ASSERT.Test(topic < analysis.NUM_TOPICS);

            // Only the first 50 docs of each topic are considered (if there are that many)
            DocProbability[] ranked_docs = analysis.DensityOfDocsInTopicsSorted[topic];
            ASSERT.Test(ranked_docs != null);
            ASSERT.Test(ranked_docs.Length == analysis.NUM_DOCS);

            int docs_to_scan = Math.Min(ranked_docs.Length, 50);
            for (int d = 0; d < docs_to_scan; ++d)
            {
                double doc_prob = ranked_docs[d].prob;
                double score = Math.Sqrt(topic_prob * doc_prob);
                candidates.Add(new DocProbability(score, ranked_docs[d].doc));
            }
        }

        // Turn the first N candidates into results
        candidates.Sort();

        int candidates_to_take = Math.Min(candidates.Count, NUM_OTHERS);
        for (int i = 0; i < candidates_to_take; ++i)
        {
            DocProbability candidate = candidates[i];
            string fingerprint_to_look_for = eds.docs[candidate.doc];

            PDFDocument pdf_document_similar = pdf_document.LibraryRef.Xlibrary.GetDocumentByFingerprint(fingerprint_to_look_for);
            if (null != pdf_document_similar)
            {
                results.Add(new Result { pdf_document = pdf_document_similar, relevance = candidate.prob });
            }
            else
            {
                Logging.Warn("ExpeditionPaperSuggestions: Cannot find similar document anymore for fingerprint {0}", fingerprint_to_look_for);
            }
        }
    }
    catch (Exception ex)
    {
        Logging.Error(ex, "There was a problem getting the relevant others for document {0}", pdf_document.Fingerprint);
    }

    return results;
}
/// <summary>
/// Ranks the documents of the Expedition data with a theme-biased PageRank over their citation links
/// and adds the ten highest-ranked documents as child nodes of <paramref name="node_control_"/>.
/// </summary>
/// <remarks>
/// Steps:
/// 1. Each document gets a bias: the cosine similarity between its topic densities and
///    <paramref name="tags_distribution"/> (0 when either vector is all zeros). The biases are then
///    scaled to sum to 1 so they are on the same scale as the ranks they are mixed with.
/// 2. Outbound citation links are collected for every document still present in the library.
/// 3. For a fixed number of iterations every document hands its current rank in equal shares to the
///    documents it cites; the result is mixed 50/50 with the biases and re-normalised to sum to 1.
/// 4. The documents are sorted by rank, descending, and the first ten are added on the UI thread.
/// Must be called off the UI thread.
/// </remarks>
/// <param name="node_control_">Node that receives the child nodes.</param>
/// <param name="web_library_detail">Library used to resolve fingerprints to documents.</param>
/// <param name="eds">Expedition data (documents, index and LDA analysis); must not be null.</param>
/// <param name="tags_distribution">Theme weights, indexed by topic; read for every topic index below <c>NUM_TOPICS</c>.</param>
internal static void AddDocumentsInfluentialInDistribution(NodeControl node_control_, WebLibraryDetail web_library_detail, ExpeditionDataSource eds, float[] tags_distribution)
{
    WPFDoEvents.AssertThisCodeIs_NOT_RunningInTheUIThread();
    ASSERT.Test(eds != null);

    Logging.Info("+Performing ThemedPageRank on {0} documents", eds.LDAAnalysis.NUM_DOCS);

    LDAAnalysis lda = eds.LDAAnalysis;
    float[,] density_of_topics_in_docs = lda.DensityOfTopicsInDocuments;

    // --- 1. Document biases: cosine similarity of each document's topics with the theme ---------

    // The theme's squared length does not depend on the document, so compute it once.
    double tags_norm_squared = 0;
    for (int topic = 0; topic < lda.NUM_TOPICS; ++topic)
    {
        tags_norm_squared += tags_distribution[topic] * tags_distribution[topic];
    }

    double[] biases = new double[lda.NUM_DOCS];
    double biases_total = 0;
    for (int doc = 0; doc < lda.NUM_DOCS; ++doc)
    {
        double dot_product = 0;
        double doc_norm_squared = 0;
        for (int topic = 0; topic < lda.NUM_TOPICS; ++topic)
        {
            dot_product += density_of_topics_in_docs[doc, topic] * tags_distribution[topic];
            doc_norm_squared += density_of_topics_in_docs[doc, topic] * density_of_topics_in_docs[doc, topic];
        }

        // An all-zero document row or theme vector would make this 0/0 = NaN, which then poisons
        // every sum and the final sort. Treat "no direction" as "no similarity".
        double denominator = Math.Sqrt(doc_norm_squared) * Math.Sqrt(tags_norm_squared);
        biases[doc] = (0 < denominator) ? dot_product / denominator : 0;
        biases_total += biases[doc];
    }

    // Scale the biases to sum to 1. The ranks are normalised to sum to 1 after every iteration, so
    // without this the bias term would outweigh the propagated term by roughly the sum of the biases.
    // NOTE(review): assumes the densities and the theme weights are non-negative - confirm.
    if (0 < biases_total)
    {
        for (int doc = 0; doc < lda.NUM_DOCS; ++doc)
        {
            biases[doc] /= biases_total;
        }
    }

    // --- 2. Outbound citation links, as indices into the Expedition documents --------------------

    List<int>[] references_outbound = new List<int>[lda.NUM_DOCS];
    for (int doc = 0; doc < lda.NUM_DOCS; ++doc)
    {
        references_outbound[doc] = new List<int>();

        string fingerprint = eds.docs[doc];
        PDFDocument pdf_document = web_library_detail.Xlibrary.GetDocumentByFingerprint(fingerprint);
        if (null == pdf_document)
        {
            Logging.Warn("ThemeExplorer::AddInInfluential: Cannot find document anymore for fingerprint {0}", fingerprint);
            continue;
        }

        List<Citation> citations_outbound = pdf_document.PDFDocumentCitationManager.GetOutboundCitations();
        foreach (Citation citation in citations_outbound)
        {
            // Only links that land on a document known to the Expedition data can carry rank.
            string fingerprint_inbound = citation.fingerprint_inbound;
            if (eds.docs_index.ContainsKey(fingerprint_inbound))
            {
                references_outbound[doc].Add(eds.docs_index[fingerprint_inbound]);
            }
        }
    }

    // --- 3. Power iteration -----------------------------------------------------------------------

    const int NUM_ITERATIONS = 20;
    const double ALPHA = 0.5;   // weight of the bias in the mix; (1 - ALPHA) goes to the propagated rank

    double[] pageranks_current = new double[lda.NUM_DOCS];
    double[] pageranks_next = new double[lda.NUM_DOCS];

    // Initialise with the biases
    for (int doc = 0; doc < lda.NUM_DOCS; ++doc)
    {
        pageranks_current[doc] = biases[doc];
    }

    for (int iteration = 0; iteration < NUM_ITERATIONS; ++iteration)
    {
        Logging.Info("Performing ThemedPageRank iteration {0}", iteration);

        // The two buffers are swapped at the end of every pass, so "next" still holds ranks from an
        // earlier pass. It must be emptied before accumulating into it.
        Array.Clear(pageranks_next, 0, pageranks_next.Length);

        // Spread each document's CURRENT rank evenly over the documents it cites.
        for (int doc = 0; doc < lda.NUM_DOCS; ++doc)
        {
            List<int> cited_docs = references_outbound[doc];
            if (0 == cited_docs.Count)
            {
                continue;
            }

            double share = pageranks_current[doc] / cited_docs.Count;
            foreach (int doc_inbound in cited_docs)
            {
                pageranks_next[doc_inbound] += share;
            }
        }

        // Mix the propagated ranks with the bias
        for (int doc = 0; doc < lda.NUM_DOCS; ++doc)
        {
            pageranks_next[doc] = (1 - ALPHA) * pageranks_next[doc] + ALPHA * biases[doc];
        }

        // Normalise to sum to 1 (documents that cite nothing pass no rank on, so the total shrinks)
        double total = 0;
        for (int doc = 0; doc < lda.NUM_DOCS; ++doc)
        {
            total += pageranks_next[doc];
        }
        if (0 < total)
        {
            for (int doc = 0; doc < lda.NUM_DOCS; ++doc)
            {
                pageranks_next[doc] /= total;
            }
        }

        // Swap the buffers: "next" becomes "current"; the old "current" is recycled (and cleared above)
        double[] pageranks_temp = pageranks_current;
        pageranks_current = pageranks_next;
        pageranks_next = pageranks_temp;
    }

    // --- 4. Sort by rank, descending, and add the leaders as nodes --------------------------------

    int[] docs = new int[lda.NUM_DOCS];
    for (int doc = 0; doc < lda.NUM_DOCS; ++doc)
    {
        docs[doc] = doc;
    }
    Array.Sort(pageranks_current, docs);   // ascending by rank, document indices reordered alongside
    Array.Reverse(pageranks_current);
    Array.Reverse(docs);

    WPFDoEvents.InvokeInUIThread(() =>
    {
        // Make the nodes
        for (int doc = 0; doc < 10 && doc < docs.Length; ++doc)
        {
            int doc_id = docs[doc];
            string fingerprint = eds.docs[doc_id];

            PDFDocument pdf_document = web_library_detail.Xlibrary.GetDocumentByFingerprint(fingerprint);
            if (null == pdf_document)
            {
                Logging.Warn("Couldn't find similar document with fingerprint {0}", fingerprint);
            }
            else
            {
                PDFDocumentNodeContent content = new PDFDocumentNodeContent(pdf_document.Fingerprint, pdf_document.LibraryRef.Id);
                NodeControlAddingByKeyboard.AddChildToNodeControl(node_control_, content, false);
            }
        }
    });
}
// ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

/// <summary>
/// Turns the theme node's tags into a topic distribution and hands it to <paramref name="distribution_use"/>
/// on a background thread.
/// </summary>
/// <remarks>
/// The distribution is the mean, over the tags found in the data source's word index, of their rows in
/// <c>PseudoDensityOfTopicsInWords</c>. Unknown tags are logged and ignored; when no tag is known the
/// distribution stays all zeros and is still passed on.
/// Must be called on the UI thread.
/// </remarks>
/// <param name="distribution_use">Callback that receives the node control, library, data source and distribution.</param>
private void ApplyTagsDistribution(DistributionUseDelegate distribution_use)
{
    WPFDoEvents.AssertThisCodeIsRunningInTheUIThread();

    // Read what the worker needs from the node content while still on the UI thread.
    string tags = theme_node_content.Underlying.Tags;
    string[] tags_array = tags.Split('\n');   // tags are stored one per line
    string library_fingerprint = theme_node_content.Underlying.library_fingerprint;

    SafeThreadPool.QueueUserWorkItem(o =>
    {
        WPFDoEvents.AssertThisCodeIs_NOT_RunningInTheUIThread();

        WebLibraryDetail web_library_detail = WebLibraryManager.Instance.GetLibrary(library_fingerprint);
        if (null == web_library_detail)
        {
            Logging.Warn("Unable to locate library " + library_fingerprint);
            return;
        }

        ExpeditionDataSource eds = web_library_detail.Xlibrary?.ExpeditionManager?.ExpeditionDataSource;
        if (null == eds)
        {
            Logging.Warn("Expedition has not been run for library '{0}'.", web_library_detail.Title);
            return;
        }

        LDAAnalysis analysis = eds.LDAAnalysis;

        // Sum the topic rows of every recognised tag...
        float[] tags_distribution = new float[analysis.NUM_TOPICS];
        int recognised_tags = 0;
        foreach (string tag in tags_array)
        {
            if (!eds.words_index.ContainsKey(tag))
            {
                Logging.Warn("Ignoring tag {0} which we don't recognise.", tag);
                continue;
            }

            ++recognised_tags;

            int tag_id = eds.words_index[tag];
            for (int topic = 0; topic < analysis.NUM_TOPICS; ++topic)
            {
                tags_distribution[topic] += analysis.PseudoDensityOfTopicsInWords[tag_id, topic];
            }
        }

        // ...and divide by their number to get the mean.
        if (0 < recognised_tags)
        {
            for (int topic = 0; topic < analysis.NUM_TOPICS; ++topic)
            {
                tags_distribution[topic] /= recognised_tags;
            }
        }

        distribution_use(node_control, web_library_detail, eds, tags_distribution);
    });
}