// Deserializes an LDASampler from the compact binary format produced by FastSave.
// The per-document word lists are not stored in the file, so the caller must supply WORDS_IN_DOCS.
public static LDASampler FastLoad(string lda_sampler_filename, int[][] WORDS_IN_DOCS)
{
    using (FileStream fs = new FileStream(lda_sampler_filename, FileMode.Open))
    {
        using (BinaryReader br = new BinaryReader(fs))
        {
            int NUM_TOPICS = br.ReadInt32();
            int NUM_WORDS = br.ReadInt32();
            int NUM_DOCS = br.ReadInt32();
            double ALPHA = br.ReadDouble();
            double BETA = br.ReadDouble();

            LDASampler lda_sampler = new LDASampler(ALPHA, BETA, NUM_TOPICS, NUM_WORDS, NUM_DOCS, WORDS_IN_DOCS, false);

            lda_sampler.total_iterations = br.ReadInt32();

            // internal int[][] topic_of_word_in_doc; // [doc][i]
            for (int doc = 0; doc < NUM_DOCS; ++doc)
            {
                for (int i = 0; i < lda_sampler.topic_of_word_in_doc[doc].Length; ++i)
                {
                    lda_sampler.topic_of_word_in_doc[doc][i] = br.ReadInt32();
                }
            }

            // internal float[,] number_of_times_doc_has_a_specific_topic; // [doc,topic]
            for (int doc = 0; doc < NUM_DOCS; ++doc)
            {
                for (int topic = 0; topic < NUM_TOPICS; ++topic)
                {
                    lda_sampler.number_of_times_doc_has_a_specific_topic[doc, topic] = br.ReadSingle();
                }
            }

            // internal float[] number_of_times_a_doc_has_any_topic; // [doc]
            for (int doc = 0; doc < NUM_DOCS; ++doc)
            {
                lda_sampler.number_of_times_a_doc_has_any_topic[doc] = br.ReadSingle();
            }

            // internal float[,] number_of_times_a_topic_has_a_specific_word; // [topic,word]
            for (int topic = 0; topic < NUM_TOPICS; ++topic)
            {
                for (int word = 0; word < NUM_WORDS; ++word)
                {
                    lda_sampler.number_of_times_a_topic_has_a_specific_word[topic, word] = br.ReadSingle();
                }
            }

            // internal float[] number_of_times_a_topic_has_any_word; // [topic]
            for (int topic = 0; topic < NUM_TOPICS; ++topic)
            {
                lda_sampler.number_of_times_a_topic_has_any_word[topic] = br.ReadSingle();
            }

            return lda_sampler;
        }
    }
}
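// NOTE: FastSave itself is not shown in this section.  The sketch below is an assumption of what it
// writes, inferred by mirroring the read order of FastLoad above (header ints and doubles, iteration
// count, then each count array).  WORDS_IN_DOCS is deliberately not written because FastLoad takes it
// as a parameter.  Member names such as ALPHA, BETA, NUM_TOPICS, NUM_WORDS and NUM_DOCS are assumed to
// match the constructor parameters used elsewhere in this file.
public void FastSave(string lda_sampler_filename)
{
    using (FileStream fs = new FileStream(lda_sampler_filename, FileMode.Create))
    {
        using (BinaryWriter bw = new BinaryWriter(fs))
        {
            // Header: model dimensions and hyperparameters.
            bw.Write(NUM_TOPICS);
            bw.Write(NUM_WORDS);
            bw.Write(NUM_DOCS);
            bw.Write(ALPHA);
            bw.Write(BETA);

            bw.Write(total_iterations);

            // Count arrays, in the same order FastLoad reads them back.
            for (int doc = 0; doc < NUM_DOCS; ++doc)
            {
                for (int i = 0; i < topic_of_word_in_doc[doc].Length; ++i)
                {
                    bw.Write(topic_of_word_in_doc[doc][i]);
                }
            }

            for (int doc = 0; doc < NUM_DOCS; ++doc)
            {
                for (int topic = 0; topic < NUM_TOPICS; ++topic)
                {
                    bw.Write(number_of_times_doc_has_a_specific_topic[doc, topic]);
                }
            }

            for (int doc = 0; doc < NUM_DOCS; ++doc)
            {
                bw.Write(number_of_times_a_doc_has_any_topic[doc]);
            }

            for (int topic = 0; topic < NUM_TOPICS; ++topic)
            {
                for (int word = 0; word < NUM_WORDS; ++word)
                {
                    bw.Write(number_of_times_a_topic_has_a_specific_word[topic, word]);
                }
            }

            for (int topic = 0; topic < NUM_TOPICS; ++topic)
            {
                bw.Write(number_of_times_a_topic_has_any_word[topic]);
            }
        }
    }
}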
// Prepares per-thread state for Monte-Carlo sampling passes over a shared LDASampler.
public LDASamplerMCSerial(LDASampler lda_sampler, int NUM_THREADS)
{
    this.lda_sampler = lda_sampler;
    this.NUM_THREADS = NUM_THREADS;

    // Work out the ratio of each doc for our sampling
    {
        int MAX_REPRESENTATION = 100;

        int max_doc_length = 0;
        for (int doc = 0; doc < lda_sampler.NUM_DOCS; ++doc)
        {
            max_doc_length = Math.Max(max_doc_length, lda_sampler.WORDS_IN_DOCS[doc].Length);
        }

        total_words_in_corpus = 0;
        for (int doc = 0; doc < lda_sampler.NUM_DOCS; ++doc)
        {
            total_words_in_corpus += lda_sampler.WORDS_IN_DOCS[doc].Length;
        }

        // Each doc appears in random_mc_orderings a number of times proportional to its length,
        // so longer docs get sampled more often.  Non-empty docs always get at least one slot.
        random_mc_orderings = new List<int>();
        for (int doc = 0; doc < lda_sampler.NUM_DOCS; ++doc)
        {
            int doc_representation = (0 < max_doc_length)
                ? MAX_REPRESENTATION * lda_sampler.WORDS_IN_DOCS[doc].Length / max_doc_length
                : 1;

            if (0 == doc_representation && lda_sampler.WORDS_IN_DOCS[doc].Length > 0)
            {
                //Logging.Info("We have had to bump up the representation for doc {0} because it is too small", doc);
                doc_representation = 1;
            }

            for (int i = 0; i < doc_representation; ++i)
            {
                random_mc_orderings.Add(doc);
            }
        }
    }

    // Per-thread RNGs and probability scratch buffers, so worker threads never contend on shared state.
    random_mt = new RandomAugmented[NUM_THREADS];
    probability_working_buffer = new double[NUM_THREADS][];
    for (int thread = 0; thread < NUM_THREADS; ++thread)
    {
        random_mt[thread] = new RandomAugmented(DateTime.UtcNow.Millisecond * (1 + thread));
        probability_working_buffer[thread] = new double[lda_sampler.NUM_TOPICS];
    }
}
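// Illustration only: the actual sampling loop of LDASamplerMCSerial is not part of this section.
// This hypothetical helper shows how random_mc_orderings is intended to be used - drawing an index
// uniformly from the list selects a document with probability roughly proportional to its length.
// System.Random is used purely for the example; the real class holds a per-thread RandomAugmented.
private int PickLengthWeightedDoc(Random rng)
{
    return random_mc_orderings[rng.Next(random_mc_orderings.Count)];
}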
public static void TestFastSaveLoad()
{
    // Random data
    double alpha = 0.1;
    double beta = 0.2;
    int num_topics = 200;
    int num_words = 10000;
    int num_docs = 20000;

    int[][] WORDS_IN_DOCS = new int[num_docs][];
    for (int i = 0; i < num_docs; ++i)
    {
        WORDS_IN_DOCS[i] = new int[i % 100 + 1];
    }

    LDASampler lda_sampler = new LDASampler(alpha, beta, num_topics, num_words, num_docs, WORDS_IN_DOCS);

    // Round trip through the existing SerializeFile mechanism.
    {
        string lda_sampler_filename = @"C:\temp\ldatest_old.dat";

        Logging.Info("+OldSave");
        SerializeFile.Save(lda_sampler_filename, lda_sampler);
        Logging.Info("-OldSave");

        Logging.Info("+OldLoad");
        lda_sampler = (LDASampler)SerializeFile.Load(lda_sampler_filename);
        Logging.Info("-OldLoad");
    }

    // Round trip through FastSave/FastLoad for comparison.
    {
        string lda_sampler_filename = @"C:\temp\ldatest_new.dat";

        Logging.Info("+NewSave");
        lda_sampler.FastSave(lda_sampler_filename);
        Logging.Info("-NewSave");

        Logging.Info("+NewLoad");
        lda_sampler = FastLoad(lda_sampler_filename, WORDS_IN_DOCS);
        Logging.Info("-NewLoad");
    }
}
public LDAAnalysis(LDASampler lda)
{
    this.lda = lda;
}
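// Illustration only: the rest of LDAAnalysis is not shown in this section.  A typical derived quantity
// is the smoothed per-document topic distribution, estimated from the sampler's count arrays as
// (n_dt + alpha) / (n_d + NUM_TOPICS * alpha).  The method below is a hypothetical sketch of that
// calculation; it assumes the LDASampler exposes ALPHA alongside the count fields read in FastLoad.
public float[] EstimateTopicDistributionForDoc(int doc)
{
    float[] distribution = new float[lda.NUM_TOPICS];
    float denominator = lda.number_of_times_a_doc_has_any_topic[doc] + (float)(lda.NUM_TOPICS * lda.ALPHA);
    for (int topic = 0; topic < lda.NUM_TOPICS; ++topic)
    {
        distribution[topic] = (lda.number_of_times_doc_has_a_specific_topic[doc, topic] + (float)lda.ALPHA) / denominator;
    }
    return distribution;
}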