예제 #1
0
        /*
         *      Read conversations from file
         */
        /// <summary>
        /// Loads every conversation stored in <paramref name="target_file_path"/> into
        /// <c>convsList</c> and stores the number of loaded conversations in <c>numConvs</c>.
        /// </summary>
        /// <remarks>
        /// Layout consumed by this parser, one record per conversation:
        ///   header line : split on '_'; the first two parts are parsed as integer user ids
        ///   count line  : number of tweet lines that follow
        ///   tweet lines : split on ' '; field 0 is the tweet id, fields 1 and 2 are the two
        ///                 values handed to set_max_ent_prob, field 3 is not used, and every
        ///                 later field is a "wordIndex:count" token
        /// All numbers are parsed with the invariant culture so the result does not depend on
        /// the regional settings of the machine. Any failure is written to the console and the
        /// process is terminated with exit code 1.
        /// </remarks>
        /// <param name="target_file_path">Path of the conversation file to read.</param>
        private void Read_conversations(string target_file_path)
        {
            // Data files use '.' as decimal separator; never parse them with the current culture.
            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

            this.convsList = new ArrayList();
            int conv_idx = 0;

            try
            {
                using (StreamReader sr = new StreamReader(target_file_path))
                {
                    string line = null;

                    while ((line = sr.ReadLine()) != null)
                    {
                        // Conversation name, userid1_userid2_convid
                        SDTM_v1_Conversation one_conv = new SDTM_v1_Conversation(conv_idx);
                        conv_idx++;
                        string[] line_arr = line.Split('_');
                        one_conv.set_users(Convert.ToInt32(line_arr[0], invariant),
                                           Convert.ToInt32(line_arr[1], invariant));

                        // Number of tweets in the conversation
                        line = sr.ReadLine();
                        int numberofTweets = Convert.ToInt32(line, invariant);

                        // Each tweet in a conversation
                        for (int tweet_idx = 0; tweet_idx < numberofTweets; tweet_idx++)
                        {
                            line = sr.ReadLine();
                            if (line == null)
                            {
                                // Truncated file: report it clearly instead of failing on a null reference.
                                throw new System.IO.InvalidDataException(
                                    "Unexpected end of file: conversation " + (conv_idx - 1)
                                    + " declares " + numberofTweets + " tweets but only "
                                    + tweet_idx + " were found.");
                            }

                            line_arr = line.Split(' ');

                            SDTM_v1_Tweet one_tweet = new SDTM_v1_Tweet();
                            one_tweet.set_tweet_id(Convert.ToInt32(line_arr[0], invariant));
                            one_tweet.set_max_ent_prob(Convert.ToDouble(line_arr[1], invariant),
                                                       Convert.ToDouble(line_arr[2], invariant));

                            // Field 3 is skipped; the bag of words simply runs to the end of the line.
                            Dictionary<SDTM_v1_Word, int> word_count = new Dictionary<SDTM_v1_Word, int>();

                            for (int word_idx = 4; word_idx < line_arr.Length; word_idx++)
                            {
                                string one_word_count = line_arr[word_idx];

                                // A trailing or doubled space produces an empty token; ignore it.
                                if (one_word_count.Length == 0)
                                {
                                    continue;
                                }

                                string[] one_word_count_arr = one_word_count.Split(':');
                                SDTM_v1_Word one_word = new SDTM_v1_Word(Convert.ToInt32(one_word_count_arr[0], invariant));
                                word_count.Add(one_word, Convert.ToInt32(one_word_count_arr[1], invariant));
                            }

                            one_tweet.set_word_count(word_count);

                            // insert tweet to conversation
                            one_conv.insert_tweet(one_tweet);
                        }

                        // insert conversation to list
                        this.convsList.Add(one_conv);
                    }
                }
            }
            catch (Exception e)
            {
                // Loading is all-or-nothing: report the problem and stop the program.
                Console.WriteLine(e);
                Environment.Exit(1);
            }

            this.numConvs = this.convsList.Count;
        }
예제 #2
0
파일: SDTM_v1.cs 프로젝트: NoSyu/SDTM
        /*
            Read conversations from file
         */
        /// <summary>
        /// Reads the conversation file at <paramref name="target_file_path"/>, fills
        /// <c>convsList</c> with one entry per conversation and sets <c>numConvs</c> to the
        /// number of entries.
        /// </summary>
        /// <remarks>
        /// Each conversation record is parsed as follows:
        ///   1. a header line split on '_' whose first two parts are integer user ids;
        ///   2. a line holding the number of tweet lines that follow;
        ///   3. that many tweet lines split on ' ': field 0 is the tweet id, fields 1 and 2 are
        ///      passed to set_max_ent_prob, field 3 is not used, the remaining fields are
        ///      "wordIndex:count" tokens.
        /// Parsing uses the invariant culture, so "0.5" means one half on every machine.
        /// On any error the exception is printed and the process exits with code 1.
        /// </remarks>
        /// <param name="target_file_path">Path of the conversation file to read.</param>
        private void Read_conversations(string target_file_path)
        {
            // Culture-independent number parsing for the machine-written data file.
            IFormatProvider fmt = System.Globalization.CultureInfo.InvariantCulture;

            this.convsList = new ArrayList();
            int conv_idx = 0;

            try
            {
                using (StreamReader sr = new StreamReader(target_file_path))
                {
                    string line = null;

                    while ((line = sr.ReadLine()) != null)
                    {
                        // Conversation name, userid1_userid2_convid
                        SDTM_v1_Conversation one_conv = new SDTM_v1_Conversation(conv_idx);
                        conv_idx++;
                        string[] line_arr = line.Split('_');
                        one_conv.set_users(Convert.ToInt32(line_arr[0], fmt), Convert.ToInt32(line_arr[1], fmt));

                        // number of tweets in the conversation
                        line = sr.ReadLine();
                        int numberofTweets = Convert.ToInt32(line, fmt);

                        // Each tweet in a conversation
                        for (int tweet_idx = 0; tweet_idx < numberofTweets; tweet_idx++)
                        {
                            line = sr.ReadLine();
                            if (line == null)
                            {
                                // The file ended before all announced tweets were read.
                                throw new System.IO.InvalidDataException(
                                    "Unexpected end of file: conversation " + (conv_idx - 1)
                                    + " declares " + numberofTweets + " tweets but only "
                                    + tweet_idx + " were found.");
                            }

                            line_arr = line.Split(' ');

                            SDTM_v1_Tweet one_tweet = new SDTM_v1_Tweet();
                            one_tweet.set_tweet_id(Convert.ToInt32(line_arr[0], fmt));
                            one_tweet.set_max_ent_prob(Convert.ToDouble(line_arr[1], fmt), Convert.ToDouble(line_arr[2], fmt));

                            // Field 3 is not consulted; every field from index 4 on is a word token.
                            Dictionary<SDTM_v1_Word, int> word_count = new Dictionary<SDTM_v1_Word, int>();

                            for (int word_idx = 4; word_idx < line_arr.Length; word_idx++)
                            {
                                string one_word_count = line_arr[word_idx];

                                // Skip empty tokens caused by trailing or repeated spaces.
                                if (one_word_count.Length == 0)
                                {
                                    continue;
                                }

                                string[] one_word_count_arr = one_word_count.Split(':');
                                SDTM_v1_Word one_word = new SDTM_v1_Word(Convert.ToInt32(one_word_count_arr[0], fmt));
                                word_count.Add(one_word, Convert.ToInt32(one_word_count_arr[1], fmt));
                            }

                            one_tweet.set_word_count(word_count);

                            // insert tweet to conversation
                            one_conv.insert_tweet(one_tweet);
                        }

                        // insert conversation to list
                        this.convsList.Add(one_conv);
                    }
                }
            }
            catch (Exception e)
            {
                // Loading is all-or-nothing: report the problem and stop the program.
                Console.WriteLine(e);
                Environment.Exit(1);
            }

            this.numConvs = this.convsList.Count;
        }
예제 #3
0
        /// <summary>
        /// Performs one sampling pass over the tweets of <paramref name="one_conv"/>: for each
        /// tweet the current (level, topic) assignment is removed from the counters, a weight is
        /// written to <c>probTable[topic, level]</c> for every candidate pair, a new pair is
        /// obtained from <c>Multinomial_sampling</c>, and the counters are updated with it.
        /// </summary>
        /// <param name="one_conv">Conversation whose tweets are re-assigned in place.</param>
        private void GibbsSampling_Each_conv(SDTM_v1_Conversation one_conv)
        {
            foreach (SDTM_v1_Tweet tweet in one_conv.tweet_list)
            {
                // --- Remove the tweet's present assignment from all counters ---
                int prevLevel = tweet.sd_level;
                int prevTopic = tweet.topic;

                // NOTE(review): counters move by one per distinct word (and by the number of
                // distinct words in sumLTW), not by the per-word counts - confirm this matches
                // how the counters are initialised elsewhere.
                foreach (KeyValuePair<SDTM_v1_Word, int> entry in tweet.word_count_table)
                {
                    this.matrixLTW[prevLevel, prevTopic, entry.Key.wordidx]--;
                }
                this.sumLTW[prevLevel, prevTopic] -= tweet.word_count_table.Count;

                one_conv.CLT[prevLevel, prevTopic]--;
                one_conv.sumCLT[prevLevel]--;

                // --- Fill the weight table, level by level ---
                double totalWeight = 0.0;
                int    level       = 0;

                // do/while: level 0 is always evaluated, higher levels while below numLevels.
                do
                {
                    bool isLevelZero = (level == 0);

                    int    topicsAtLevel = numTopics_arr[level];
                    // Level 0 uses max_ent_prob[0]; every other level uses max_ent_prob[1].
                    double levelFactor   = tweet.max_ent_prob[isLevelZero ? 0 : 1]
                                           / (one_conv.sumCLT[level] + this.sumAlpha[level]);
                    double levelBetaSum  = this.sumBeta[level];

                    for (int topic = 0; topic < topicsAtLevel; topic++)
                    {
                        double denomBase  = this.sumLTW[level, topic] + levelBetaSum;
                        double placed     = 0;      // tokens already folded into the product
                        double likelihood = 1.0;

                        foreach (KeyValuePair<SDTM_v1_Word, int> entry in tweet.word_count_table)
                        {
                            // Choose the prior for this word:
                            //   betas[0] - word has no seed level (-1)
                            //   betas[1] - seed level equals the level examined (never at level 0)
                            //   betas[2] - any other seed word
                            double prior;
                            if (-1 == entry.Key.seed_word_level)
                            {
                                prior = this.betas[0];
                            }
                            else if (!isLevelZero && level == entry.Key.seed_word_level)
                            {
                                prior = this.betas[1];
                            }
                            else
                            {
                                prior = this.betas[2];
                            }

                            double numerBase = this.matrixLTW[level, topic, entry.Key.wordidx] + prior;

                            // One factor per occurrence of the word in the tweet.
                            for (int occ = 0; occ < entry.Value; occ++)
                            {
                                likelihood *= (numerBase + occ) / (denomBase + placed);
                                placed++;
                            }
                        }

                        double weight = (one_conv.CLT[level, topic] + this.alpha)
                                        * levelFactor
                                        * likelihood;

                        totalWeight += weight;
                        this.probTable[topic, level] = weight;
                    }

                    level++;
                }
                while (level < SDTM_v1.numLevels);

                // --- Draw the new assignment ---
                int drawnLevel;
                int drawnTopic;
                Multinomial_sampling(totalWeight, out drawnLevel, out drawnTopic);

                // --- Store it and add the tweet back to the counters ---
                tweet.set_sd_level(drawnLevel);
                tweet.set_topic(drawnTopic);

                foreach (KeyValuePair<SDTM_v1_Word, int> entry in tweet.word_count_table)
                {
                    this.matrixLTW[drawnLevel, drawnTopic, entry.Key.wordidx]++;
                }
                this.sumLTW[drawnLevel, drawnTopic] += tweet.word_count_table.Count;

                one_conv.CLT[drawnLevel, drawnTopic]++;
                one_conv.sumCLT[drawnLevel]++;
            }
        }
예제 #4
0
파일: SDTM_v1.cs 프로젝트: NoSyu/SDTM
        /// <summary>
        /// Re-samples the level and topic of every tweet in <paramref name="one_conv"/>.
        /// </summary>
        /// <remarks>
        /// Per tweet: (1) subtract its current assignment from <c>matrixLTW</c>, <c>sumLTW</c>,
        /// <c>CLT</c> and <c>sumCLT</c>; (2) store an unnormalised weight for each
        /// (topic, level) pair in <c>probTable</c> while accumulating their sum; (3) call
        /// <c>Multinomial_sampling</c> with that sum to obtain the new pair; (4) add the tweet
        /// back to the counters under the new pair.
        /// </remarks>
        /// <param name="one_conv">Conversation to process; its tweets and counters are modified.</param>
        private void GibbsSampling_Each_conv(SDTM_v1_Conversation one_conv)
        {
            foreach (SDTM_v1_Tweet current in one_conv.tweet_list)
            {
                double weightSum = 0.0;

                // (1) Withdraw the current assignment.
                int fromLevel = current.sd_level;
                int fromTopic = current.topic;

                foreach (KeyValuePair<SDTM_v1_Word, int> wc in current.word_count_table)
                {
                    this.matrixLTW[fromLevel, fromTopic, wc.Key.wordidx]--;
                }
                this.sumLTW[fromLevel, fromTopic] -= current.word_count_table.Count;
                one_conv.CLT[fromLevel, fromTopic]--;
                one_conv.sumCLT[fromLevel]--;

                // (2a) Weights for level 0, which is scaled by max_ent_prob[0].
                int topicLimit = numTopics_arr[0];
                double scale = current.max_ent_prob[0] / (one_conv.sumCLT[0] + this.sumAlpha[0]);
                double betaTotal = this.sumBeta[0];

                for (int t = 0; t < topicLimit; t++)
                {
                    double denom = this.sumLTW[0, t] + betaTotal;
                    double seen = 0;
                    double product = 1.0;

                    foreach (KeyValuePair<SDTM_v1_Word, int> wc in current.word_count_table)
                    {
                        // At level 0 only two priors apply: betas[0] for words without a seed
                        // level (-1), betas[2] for every seed word.
                        double prior = (-1 == wc.Key.seed_word_level) ? this.betas[0] : this.betas[2];
                        double numer = this.matrixLTW[0, t, wc.Key.wordidx] + prior;

                        for (int k = 0; k < wc.Value; k++)
                        {
                            product *= (numer + k) / (denom + seen);
                            seen++;
                        }
                    }

                    double w = (one_conv.CLT[0, t] + this.alpha)
                            * scale
                            * product;

                    weightSum += w;
                    this.probTable[t, 0] = w;
                }

                // (2b) Weights for the remaining levels, all scaled by max_ent_prob[1].
                for (int lv = 1; lv < SDTM_v1.numLevels; lv++)
                {
                    topicLimit = numTopics_arr[lv];
                    scale = current.max_ent_prob[1] / (one_conv.sumCLT[lv] + this.sumAlpha[lv]);
                    betaTotal = this.sumBeta[lv];

                    for (int t = 0; t < topicLimit; t++)
                    {
                        double denom = this.sumLTW[lv, t] + betaTotal;
                        double seen = 0;
                        double product = 1.0;

                        foreach (KeyValuePair<SDTM_v1_Word, int> wc in current.word_count_table)
                        {
                            // betas[0]: no seed level; betas[1]: seed level matches this level;
                            // betas[2]: seed word belonging to a different level.
                            double prior = (-1 == wc.Key.seed_word_level)
                                    ? this.betas[0]
                                    : (lv == wc.Key.seed_word_level ? this.betas[1] : this.betas[2]);
                            double numer = this.matrixLTW[lv, t, wc.Key.wordidx] + prior;

                            for (int k = 0; k < wc.Value; k++)
                            {
                                product *= (numer + k) / (denom + seen);
                                seen++;
                            }
                        }

                        double w = (one_conv.CLT[lv, t] + this.alpha)
                                * scale
                                * product;

                        weightSum += w;
                        this.probTable[t, lv] = w;
                    }
                }

                // (3) Obtain the new assignment from the filled table.
                int toLevel;
                int toTopic;
                Multinomial_sampling(weightSum, out toLevel, out toTopic);

                // (4) Record it on the tweet and restore the counters.
                current.set_sd_level(toLevel);
                current.set_topic(toTopic);

                foreach (KeyValuePair<SDTM_v1_Word, int> wc in current.word_count_table)
                {
                    this.matrixLTW[toLevel, toTopic, wc.Key.wordidx]++;
                }
                this.sumLTW[toLevel, toTopic] += current.word_count_table.Count;
                one_conv.CLT[toLevel, toTopic]++;
                one_conv.sumCLT[toLevel]++;
            }
        }