/// <summary>
/// Builds a plain-text report of how often each topic occupies each of the
/// leading rank positions in the per-document sorted topic lists.
/// </summary>
/// <param name="words">Vocabulary handed through to <c>GetDescriptionForTopic</c> when describing a topic.</param>
/// <param name="lda_analysis">Analysis supplying the topic/document counts and the per-document sorted topic lists.</param>
/// <returns>
/// The report text: one description line plus one labelled count line per topic,
/// then a blank line, then a tab-separated table with one row per topic.
/// </returns>
public static string DumpTopicsPopularity(IList<string> words, LDAAnalysis lda_analysis)
{
    StringBuilder report = new StringBuilder();

    // Only the first few rank positions are tallied; never more than 5, and
    // never more than the number of topics available.
    int topic_count = lda_analysis.NUM_TOPICS;
    int rank_count = Math.Min(5, topic_count);

    // rank_hits[topic, rank] = number of documents whose rank-th sorted topic is 'topic'.
    int[,] rank_hits = new int[topic_count, rank_count];

    // Indexed as [doc][rank].
    TopicProbability[][] sorted_top_topics = lda_analysis.DensityOfTop5TopicsInDocsSorted;
    int doc_count = lda_analysis.NUM_DOCS;
    for (int doc_index = 0; doc_index < doc_count; ++doc_index)
    {
        TopicProbability[] doc_top_topics = sorted_top_topics[doc_index];
        for (int rank = 0; rank < rank_count; ++rank)
        {
            rank_hits[doc_top_topics[rank].topic, rank] += 1;
        }
    }

    // Section 1: the descriptive keywords of each topic, followed by its labelled counts.
    for (int topic_index = 0; topic_index < topic_count; ++topic_index)
    {
        string topic_description = lda_analysis.GetDescriptionForTopic(words, topic_index, true, ";", false);
        report.AppendFormat("{0}", topic_description);
        report.AppendLine();

        for (int rank = 0; rank < rank_count; ++rank)
        {
            // Rank positions are shown 1-based.
            report.AppendFormat("{0}de:{1}\t", rank + 1, rank_hits[topic_index, rank]);
        }
        report.AppendLine();
    }

    report.AppendLine();

    // Section 2: the same counts as a bare tab-separated table keyed by topic index.
    for (int topic_index = 0; topic_index < topic_count; ++topic_index)
    {
        report.AppendFormat("{0}\t", topic_index);
        for (int rank = 0; rank < rank_count; ++rank)
        {
            report.AppendFormat("{0}\t", rank_hits[topic_index, rank]);
        }
        report.AppendLine();
    }

    return report.ToString();
}
/// <summary>
/// Scores how similar a topic distribution is to the topic distribution of one
/// document, as one minus the base-2 Jensen-Shannon divergence between them.
/// </summary>
/// <param name="lda_analysis">Analysis supplying <c>NUM_TOPICS</c> and the <c>DensityOfTopicsInDocuments[doc, topic]</c> table.</param>
/// <param name="distribution">Per-topic weights to compare against; indexed by topic, so it needs at least <c>NUM_TOPICS</c> entries.</param>
/// <param name="doc_i">Index of the document to compare with.</param>
/// <returns>
/// <c>1.0 - (KL(P||M) + KL(Q||M)) / 2</c>, where M is the element-wise mean of P and Q.
/// Larger values mean more similar. NOTE(review): the result lies in [0, 1] only if both
/// inputs are proper probability distributions - confirm against callers.
/// </returns>
private static double CalculateSimilarity_JS(LDAAnalysis lda_analysis, float[] distribution, int doc_i)
{
    double divergence_p_from_m = 0.0;
    double divergence_q_from_m = 0.0;

    for (int topic = 0; topic < lda_analysis.NUM_TOPICS; ++topic)
    {
        double P = distribution[topic];
        double Q = lda_analysis.DensityOfTopicsInDocuments[doc_i, topic];
        double M = (P + Q) / 2.0;

        if (0 != M)
        {
            // A term with zero weight contributes nothing (0 * log 0 is taken as 0).
            // Evaluating it literally gives 0 * -Infinity = NaN, which would poison
            // the whole sum whenever only one of the two distributions has mass on
            // this topic, so such terms are skipped instead.
            if (P > 0.0)
            {
                divergence_p_from_m += P * Math.Log(P / M, 2);
            }

            if (Q > 0.0)
            {
                divergence_q_from_m += Q * Math.Log(Q / M, 2);
            }
        }
    }

    return 1.0 - (divergence_p_from_m + divergence_q_from_m) / 2.0;
}
/// <summary>
/// Ranks every document by its similarity to the given topic distribution.
/// </summary>
/// <param name="lda_analysis">Analysis supplying <c>NUM_DOCS</c> and the data used by <c>CalculateSimilarity_JS</c>.</param>
/// <param name="distribution">Topic distribution that each document is compared against.</param>
/// <returns>All document indices, ordered from the highest similarity score to the lowest.</returns>
public static int[] GetDocumentsSimilarToDistribution(LDAAnalysis lda_analysis, float[] distribution)
{
    int doc_count = lda_analysis.NUM_DOCS;

    int[] ranked_docs = new int[doc_count];
    double[] scores = new double[doc_count];

    // Pair every document index with its similarity score.
    for (int doc = 0; doc < doc_count; ++doc)
    {
        ranked_docs[doc] = doc;
        scores[doc] = CalculateSimilarity_JS(lda_analysis, distribution, doc);
    }

    // Array.Sort orders the indices by ascending score; flipping the result
    // puts the most similar documents first.
    Array.Sort(scores, ranked_docs);
    Array.Reverse(ranked_docs);

    return ranked_docs;
}