/// <summary>
/// Tokenizes the "text" entry of every tweet, stores the distinct words back on the
/// tweet under the "words" key, and hands them to the supplied word counter.
/// </summary>
/// <param name="tweets">Tweets to process; a tweet is mutated in place when it yields words.</param>
/// <param name="wc">Counter that receives each tweet's distinct words through <c>AddWords</c>.</param>
/// <remarks>
/// A tweet is skipped untouched when its "text" entry is missing, is not a string, is empty,
/// or produces no words. <c>Hashtable.Add</c> throws <see cref="System.ArgumentException"/>
/// if the tweet already carries a "words" entry.
/// </remarks>
void ExtractWords(List<Hashtable> tweets, WordCount wc)
{
    foreach (Hashtable tweet in tweets)
    {
        string rawText = tweet["text"] as string;
        if (string.IsNullOrEmpty(rawText))
        {
            continue;
        }

        string decodedText = Helpers.DecodeEncodedNonAsciiCharacters(rawText);
        string[] tokens = WordCount.GetWordsInString(decodedText, useStemming: true);

        if (tokens.Length > 0)
        {
            // Each word is recorded at most once per tweet.
            string[] distinctTokens = tokens.Distinct().ToArray();
            tweet.Add("words", distinctTokens);
            wc.AddWords(distinctTokens);
        }
    }

    // TODO: word co-occurrence
    //   - Remove stopwords (maybe do on insert?)
    //   - Record all co-occurrences
}
/// <summary>
/// Tokenizes the "text" entry of every tweet, stores the distinct words back on the
/// tweet under the "words" key, and hands them to the supplied word counter together
/// with a flag saying whether the tweet looks like a retweet.
/// </summary>
/// <param name="tweets">Tweets to process; every tweet is mutated in place.</param>
/// <param name="wc">Counter that receives each tweet's distinct words through <c>AddWords</c>.</param>
/// <remarks>
/// A tweet whose "text" entry is missing, is not a string, is empty, or produces no words
/// is flagged by setting its <c>IGNORE</c> entry to <c>true</c> and is otherwise skipped.
/// <c>Hashtable.Add</c> throws <see cref="System.ArgumentException"/> if the tweet already
/// carries a "words" entry.
/// </remarks>
void ExtractWords(List<Hashtable> tweets, WordCount wc)
{
    foreach (Hashtable tweet in tweets)
    {
        string text = tweet["text"] as string;
        if (string.IsNullOrEmpty(text))
        {
            tweet[IGNORE] = true;
            continue;
        }

        text = Helpers.DecodeEncodedNonAsciiCharacters(text);

        // Disabled alternative tokenizer:
        // WordCount.GetWordsInStringWithBigrams(text, stopwords, useStemming: true)
        string[] wordsInTweet = WordCount.GetWordsInString(text, useStemming: true);
        if (wordsInTweet.Length == 0)
        {
            tweet[IGNORE] = true;
            continue;
        }

        // Each word is recorded at most once per tweet.
        string[] uniqueWordsArr = wordsInTweet.Distinct().ToArray();
        tweet.Add("words", uniqueWordsArr);

        // The Hashtable indexer yields null for a missing key, and "as" yields null for a
        // null or non-Hashtable value, so a malformed "retweeted_status" entry no longer
        // throws; it simply does not count as retweet evidence.
        Hashtable retweetedStatus = tweet["retweeted_status"] as Hashtable;

        // "RT @" is a fixed marker, not linguistic text, so compare it ordinally rather
        // than with the current culture's rules.
        bool isRetweet = text.StartsWith("RT @", System.StringComparison.Ordinal)
            || (retweetedStatus != null && retweetedStatus.ContainsKey("id_str"));

        wc.AddWords(uniqueWordsArr, isRetweet);
    }
}