Exemple #1
0
        /// <summary>
        /// Rebuilds an LDASampler from the binary file at <paramref name="lda_sampler_filename"/>.
        /// </summary>
        /// <remarks>
        /// The file is consumed sequentially in this order:
        /// NUM_TOPICS, NUM_WORDS, NUM_DOCS (Int32 each); ALPHA, BETA (Double each);
        /// total_iterations (Int32); topic_of_word_in_doc (Int32 per entry);
        /// number_of_times_doc_has_a_specific_topic, number_of_times_a_doc_has_any_topic,
        /// number_of_times_a_topic_has_a_specific_word and number_of_times_a_topic_has_any_word
        /// (Single per entry).
        /// NOTE(review): presumably this mirrors the layout written by FastSave - confirm.
        /// </remarks>
        /// <param name="lda_sampler_filename">Path of the file to read.</param>
        /// <param name="WORDS_IN_DOCS">
        /// Corpus handed to the LDASampler constructor; it is not stored in the file.
        /// </param>
        /// <returns>The sampler populated with the values read from the file.</returns>
        public static LDASampler FastLoad(string lda_sampler_filename, int[][] WORDS_IN_DOCS)
        {
            // Open strictly for reading. The two-argument FileStream constructor asks for
            // read/write access, which fails on read-only files and blocks other readers
            // even though nothing is ever written here.
            using (FileStream fs = new FileStream(lda_sampler_filename, FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                using (BinaryReader br = new BinaryReader(fs))
                {
                    // Header: dimensions and hyper-parameters.
                    int    NUM_TOPICS = br.ReadInt32();
                    int    NUM_WORDS  = br.ReadInt32();
                    int    NUM_DOCS   = br.ReadInt32();
                    double ALPHA      = br.ReadDouble();
                    double BETA       = br.ReadDouble();

                    // NOTE(review): the meaning of the trailing 'false' flag is not visible from here.
                    LDASampler lda_sampler = new LDASampler(ALPHA, BETA, NUM_TOPICS, NUM_WORDS, NUM_DOCS, WORDS_IN_DOCS, false);

                    lda_sampler.total_iterations = br.ReadInt32();

                    //internal int[][] topic_of_word_in_doc; // [doc][i]
                    // Row lengths come from the arrays the sampler already holds, not from the file.
                    for (int doc = 0; doc < NUM_DOCS; ++doc)
                    {
                        int[] topics_for_doc = lda_sampler.topic_of_word_in_doc[doc];
                        for (int i = 0; i < topics_for_doc.Length; ++i)
                        {
                            topics_for_doc[i] = br.ReadInt32();
                        }
                    }

                    //internal float[,] number_of_times_doc_has_a_specific_topic; // [doc,topic]
                    for (int doc = 0; doc < NUM_DOCS; ++doc)
                    {
                        for (int topic = 0; topic < NUM_TOPICS; ++topic)
                        {
                            lda_sampler.number_of_times_doc_has_a_specific_topic[doc, topic] = br.ReadSingle();
                        }
                    }

                    //internal float[] number_of_times_a_doc_has_any_topic; // [doc]
                    for (int doc = 0; doc < NUM_DOCS; ++doc)
                    {
                        lda_sampler.number_of_times_a_doc_has_any_topic[doc] = br.ReadSingle();
                    }

                    //internal float[,] number_of_times_a_topic_has_a_specific_word; // [topic,word]
                    for (int topic = 0; topic < NUM_TOPICS; ++topic)
                    {
                        for (int word = 0; word < NUM_WORDS; ++word)
                        {
                            lda_sampler.number_of_times_a_topic_has_a_specific_word[topic, word] = br.ReadSingle();
                        }
                    }

                    //internal float[] number_of_times_a_topic_has_any_word; // [topic]
                    for (int topic = 0; topic < NUM_TOPICS; ++topic)
                    {
                        lda_sampler.number_of_times_a_topic_has_any_word[topic] = br.ReadSingle();
                    }

                    return lda_sampler;
                }
            }
        }
        /// <summary>
        /// Wraps <paramref name="lda_sampler"/> and prepares the document ordering list plus
        /// one random generator and one probability buffer per thread.
        /// </summary>
        /// <remarks>
        /// Each document index is added to random_mc_orderings a number of times proportional
        /// to its length, scaled so that the longest document appears 100 times. A non-empty
        /// document always appears at least once. An empty document is not added, unless every
        /// document is empty, in which case each document is added once.
        /// </remarks>
        /// <param name="lda_sampler">Sampler whose corpus (WORDS_IN_DOCS, NUM_DOCS, NUM_TOPICS) is read here.</param>
        /// <param name="NUM_THREADS">Number of per-thread generators and working buffers to allocate.</param>
        public LDASamplerMCSerial(LDASampler lda_sampler, int NUM_THREADS)
        {
            this.lda_sampler = lda_sampler;
            this.NUM_THREADS = NUM_THREADS;

            // Work out the ratio of each doc for our sampling
            {
                int MAX_REPRESENTATION = 100;

                // One pass gathers both the longest document and the corpus size.
                int max_doc_length = 0;
                total_words_in_corpus = 0;
                for (int doc = 0; doc < lda_sampler.NUM_DOCS; ++doc)
                {
                    int doc_length = lda_sampler.WORDS_IN_DOCS[doc].Length;
                    max_doc_length = Math.Max(max_doc_length, doc_length);
                    total_words_in_corpus += doc_length;
                }

                random_mc_orderings = new List<int>();
                for (int doc = 0; doc < lda_sampler.NUM_DOCS; ++doc)
                {
                    int doc_length = lda_sampler.WORDS_IN_DOCS[doc].Length;

                    // Multiply in 64-bit so very long documents cannot overflow the product;
                    // the quotient never exceeds MAX_REPRESENTATION, so the cast back is safe.
                    int doc_representation = (0 < max_doc_length)
                        ? (int)((long)MAX_REPRESENTATION * doc_length / max_doc_length)
                        : 1;

                    if (0 == doc_representation && doc_length > 0)
                    {
                        //Logging.Info("We have had to bump up the representation for doc {0} because it is too small", doc);
                        doc_representation = 1;
                    }

                    for (int i = 0; i < doc_representation; ++i)
                    {
                        random_mc_orderings.Add(doc);
                    }
                }
            }

            // Draw every thread's seed from a single seed source. The previous
            // Millisecond * (1 + thread) formula gave all threads seed 0 whenever the
            // millisecond component was 0, and offered at most 1000 distinct base values.
            Random seed_source = new Random();

            random_mt = new RandomAugmented[NUM_THREADS];
            probability_working_buffer = new double[NUM_THREADS][];
            for (int thread = 0; thread < NUM_THREADS; ++thread)
            {
                random_mt[thread] = new RandomAugmented(seed_source.Next());
                probability_working_buffer[thread] = new double[lda_sampler.NUM_TOPICS];
            }
        }
Exemple #3
0
        /// <summary>
        /// Round-trips a freshly constructed sampler through SerializeFile.Save/Load and then
        /// through FastSave/FastLoad, logging a marker before and after each step.
        /// </summary>
        public static void TestFastSaveLoad()
        {
            // Corpus dimensions and hyper-parameters for the throwaway sampler.
            int    topic_count = 200;
            int    word_count  = 10000;
            int    doc_count   = 20000;
            double alpha_value = 0.1;
            double beta_value  = 0.2;

            // Document lengths cycle through 1..100; the word ids are left at their default of 0.
            int[][] corpus = new int[doc_count][];
            for (int doc = 0; doc < corpus.Length; ++doc)
            {
                corpus[doc] = new int[1 + doc % 100];
            }

            LDASampler sampler = new LDASampler(alpha_value, beta_value, topic_count, word_count, doc_count, corpus);

            // Round trip through SerializeFile.
            string old_format_path = @"C:\temp\ldatest_old.dat";

            Logging.Info("+OldSave");
            SerializeFile.Save(old_format_path, sampler);
            Logging.Info("-OldSave");

            Logging.Info("+OldLoad");
            sampler = (LDASampler)SerializeFile.Load(old_format_path);
            Logging.Info("-OldLoad");

            // Round trip through FastSave / FastLoad.
            string new_format_path = @"C:\temp\ldatest_new.dat";

            Logging.Info("+NewSave");
            sampler.FastSave(new_format_path);
            Logging.Info("-NewSave");

            Logging.Info("+NewLoad");
            sampler = FastLoad(new_format_path, corpus);
            Logging.Info("-NewLoad");
        }
Exemple #4
0
 /// <summary>
 /// Creates an analysis object over the supplied sampler.
 /// </summary>
 /// <param name="lda">Sampler stored in the <c>lda</c> field; no copy is made.</param>
 public LDAAnalysis(LDASampler lda)
 {
     // Keep only the reference; nothing is computed at construction time.
     this.lda = lda;
 }