コード例 #1
0
        /// <summary>
        /// Tokenizes the "text" entry of every tweet, stores the tweet's distinct words
        /// back on the tweet under the "words" key, and hands those words to
        /// <paramref name="wc"/>.
        /// </summary>
        /// <param name="tweets">
        /// Tweets to process. Each tweet is read through its "text" entry and receives a
        /// "words" entry (a <c>string[]</c>) when at least one word is extracted.
        /// </param>
        /// <param name="wc">Receives each tweet's distinct words through <c>AddWords</c>.</param>
        void ExtractWords(List <Hashtable> tweets, WordCount wc)
        {
            foreach (Hashtable tweet in tweets)
            {
                // The indexer returns null for a missing key and "as" returns null for a
                // non-string value, so both cases are skipped together with empty text.
                string text = tweet["text"] as string;
                if (string.IsNullOrEmpty(text))
                {
                    continue;
                }

                // NOTE(review): presumably turns escaped non-ASCII sequences back into
                // characters before tokenizing -- confirm against Helpers.
                text = Helpers.DecodeEncodedNonAsciiCharacters(text);
                string[] wordsInTweet = WordCount.GetWordsInString(text, useStemming: true);
                if (wordsInTweet.Length == 0)
                {
                    continue;
                }

                // Each word is passed on only once per tweet, however often it repeats.
                string[] uniqueWordsArr = wordsInTweet.Distinct().ToArray();

                // Indexer assignment rather than Add(): Hashtable.Add throws
                // ArgumentException when the key already exists, e.g. when the same
                // tweet list is processed a second time.
                tweet["words"] = uniqueWordsArr;

                wc.AddWords(uniqueWordsArr);
            }

            // TODO: word co-occurrence
            //  - Remove stopwords (maybe do on insert?)
            //  - Record all co-occurrences
        }
コード例 #2
0
ファイル: TweetParser.cs プロジェクト: VacantFanatic/Test-CT
        /// <summary>
        /// Tokenizes the "text" entry of every tweet, stores the tweet's distinct words
        /// back on the tweet under the "words" key, and hands those words to
        /// <paramref name="wc"/> together with a flag telling whether the tweet looks
        /// like a retweet. Tweets that yield no words are flagged under the
        /// <c>IGNORE</c> key instead.
        /// </summary>
        /// <param name="tweets">
        /// Tweets to process. Each tweet is read through its "text" and
        /// "retweeted_status" entries and receives either a "words" entry
        /// (a <c>string[]</c>) or an <c>IGNORE</c> entry set to <c>true</c>.
        /// </param>
        /// <param name="wc">
        /// Receives each tweet's distinct words and its retweet flag through <c>AddWords</c>.
        /// </param>
        void ExtractWords(List <Hashtable> tweets, WordCount wc)
        {
            foreach (Hashtable tweet in tweets)
            {
                // The indexer returns null for a missing key and "as" returns null for a
                // non-string value, so both cases are flagged together with empty text.
                string text = tweet["text"] as string;
                if (string.IsNullOrEmpty(text))
                {
                    tweet[IGNORE] = true;
                    continue;
                }

                // NOTE(review): presumably turns escaped non-ASCII sequences back into
                // characters before tokenizing -- confirm against Helpers.
                text = Helpers.DecodeEncodedNonAsciiCharacters(text);

                // A bigram-aware alternative was previously tried here:
                // WordCount.GetWordsInStringWithBigrams(text, stopwords, useStemming: true)
                string[] wordsInTweet = WordCount.GetWordsInString(text, useStemming: true);
                if (wordsInTweet.Length == 0)
                {
                    tweet[IGNORE] = true;
                    continue;
                }

                // Each word is passed on only once per tweet, however often it repeats.
                string[] uniqueWordsArr = wordsInTweet.Distinct().ToArray();

                // Indexer assignment rather than Add(): Hashtable.Add throws
                // ArgumentException when the key already exists, e.g. when the same
                // tweet list is processed a second time.
                tweet["words"] = uniqueWordsArr;

                // A tweet counts as a retweet when its text starts with the literal
                // "RT @" prefix or when it carries a "retweeted_status" table that has
                // an "id_str" entry. The "as" cast keeps a null or non-Hashtable
                // "retweeted_status" value from throwing; such a value simply does not
                // mark the tweet as a retweet. The prefix is a fixed marker, so it is
                // compared ordinally instead of with culture-sensitive rules.
                Hashtable retweetedStatus = tweet["retweeted_status"] as Hashtable;
                bool isRetweet = text.StartsWith("RT @", System.StringComparison.Ordinal)
                    || (retweetedStatus != null && retweetedStatus.ContainsKey("id_str"));

                wc.AddWords(uniqueWordsArr, isRetweet);
            }
        }