Example #1
0
        /// <summary>
        /// Extracts the words of each tweet's text, stores the distinct words on the tweet
        /// under the "words" key and passes them to <paramref name="wc"/> together with a
        /// retweet flag. Tweets without text, or whose text yields no words, are marked with
        /// the IGNORE key and skipped.
        /// </summary>
        /// <param name="tweets">Tweets as hashtables; the "text" entry is read as a string.</param>
        /// <param name="wc">Word counter that receives each tweet's distinct words and its retweet flag.</param>
        void ExtractWords(List <Hashtable> tweets, WordCount wc)
        {
            foreach (Hashtable tweet in tweets)
            {
                string text = tweet["text"] as string;
                if (string.IsNullOrEmpty(text))
                {
                    // Missing, non-string or empty text: nothing to count for this tweet.
                    tweet[IGNORE] = true;
                    continue;
                }

                text = Helpers.DecodeEncodedNonAsciiCharacters(text);
                string[] wordsInTweet = WordCount.GetWordsInString(text, useStemming: true);
                //string[] wordsInTweet = WordCount.GetWordsInStringWithBigrams(text, stopwords, useStemming: true);
                if (wordsInTweet.Length == 0)
                {
                    tweet[IGNORE] = true;
                    continue;
                }

                string[] uniqueWordsArr = wordsInTweet.Distinct().ToArray();
                tweet.Add("words", uniqueWordsArr);

                // `as` instead of a hard cast: a "retweeted_status" entry that is null or not a
                // Hashtable must simply count as "no retweet info" rather than throw.
                Hashtable retweetedStatus = tweet["retweeted_status"] as Hashtable;

                // "RT @" is a fixed marker, not linguistic text, so compare ordinally instead of
                // with the current culture's rules.
                bool isRetweet = text.StartsWith("RT @", System.StringComparison.Ordinal)
                    || (retweetedStatus != null && retweetedStatus.ContainsKey("id_str"));
                wc.AddWords(uniqueWordsArr, isRetweet);
            }
        }
Example #2
0
        /// <summary>
        /// For every tweet that has non-empty text, extracts its words, stores the distinct
        /// words on the tweet under the "words" key and adds them to <paramref name="wc"/>.
        /// Tweets without text, or whose text yields no words, are left untouched.
        /// </summary>
        /// <param name="tweets">Tweets as hashtables; the "text" entry is read as a string.</param>
        /// <param name="wc">Word counter that receives each tweet's distinct words.</param>
        void ExtractWords(List <Hashtable> tweets, WordCount wc)
        {
            foreach (Hashtable current in tweets)
            {
                string rawText = current["text"] as string;
                if (!string.IsNullOrEmpty(rawText))
                {
                    string decoded = Helpers.DecodeEncodedNonAsciiCharacters(rawText);
                    string[] tokens = WordCount.GetWordsInString(decoded, useStemming: true);
                    if (tokens.Length > 0)
                    {
                        string[] distinctTokens = tokens.Distinct().ToArray();
                        current.Add("words", distinctTokens);

                        wc.AddWords(distinctTokens);
                    }
                }
            }

            // Word co-occurrence (not implemented here):
            //   - remove stopwords (maybe do on insert?)
            //   - record all co-occurrences
        }
Example #3
0
        /// <summary>
        /// Queries the SyriaTweetBackup table for rows whose TweetID is greater than
        /// <paramref name="lastProcessedID"/>, ordered by TweetID, and builds a
        /// <c>Tweet</c> (ID, creation time, extracted words) for each row read.
        /// </summary>
        /// <param name="lastProcessedID">Exclusive lower bound for the TweetID column.</param>
        /// <param name="batchSize">Value used for the query's <c>limit</c> clause.</param>
        /// <returns>The dictionary handed to <c>Helpers.RunSelect</c>, keyed by TweetID.</returns>
        static Dictionary<long, Tweet> GetTweetBatch(long lastProcessedID, int batchSize)
        {
            Dictionary<long, Tweet> tweets = new Dictionary<long, Tweet>();

            // Format the numbers with the invariant culture: implicit string concatenation uses
            // the current culture, whose negative sign is not guaranteed to be ASCII '-', which
            // would produce invalid SQL for a negative ID.
            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;
            string sql = "select TweetID, CreatedAt, Text from SyriaTweetBackup where TweetID > "
                + lastProcessedID.ToString(invariant)
                + " order by TweetID limit "
                + batchSize.ToString(invariant)
                + ";";

            // NOTE(review): presumably RunSelect invokes the callback once per result row with
            // the dictionary passed in as `values` - confirm against Helpers.RunSelect.
            Helpers.RunSelect(_name, sql, tweets, (values, reader) =>
                {
                    long id = reader.GetInt64("TweetID");
                    DateTime createdAt = reader.GetDateTime("CreatedAt");
                    string text = reader.GetString("Text");
                    string[] words = WordCount.GetWordsInString(text, useStemming: true);
                    values.Add(id, new Tweet()
                    {
                        TweetID = id,
                        CreatedAt = createdAt,
                        Words = words
                    });
                });

            return tweets;
        }