Пример #1
0
        /// <summary>
        /// Runs the parser loop. Each cycle loads the active track filters and stopwords, fetches a batch
        /// of raw JSON tweets, parses them, stores words and tweets, deletes the processed JSON rows and
        /// calls <c>PerformMaintenance</c>.
        /// </summary>
        /// <remarks>
        /// The loop only ends when an exception escapes a cycle; the exception is reported through
        /// <c>Output.Print</c> and the method returns. Progress markers are written to the console
        /// after each step.
        /// </remarks>
        public void Run()
        {
            //Length of one full cycle (work + idle wait) in milliseconds
            const int cycleMilliseconds = 30000;

            try
            {
                while (true)
                {
                    //Monotonic timer: unlike wall-clock time it is unaffected by clock or DST adjustments
                    System.Diagnostics.Stopwatch cycleTimer = System.Diagnostics.Stopwatch.StartNew();

                    //Get current tracking filters from database
                    Console.Write("|");
                    List<TrackFilter> filters = GetActiveTrackFilters();
                    Console.Write(".");

                    HashSet<string> stopwords = GetStopwords();

                    //Get a batch of tweets
                    Dictionary<long, string> randomTweetsJson = new Dictionary<long, string>();
                    Dictionary<long, string> filteredTweetsJson = new Dictionary<long, string>();
                    GetTweets(filteredTweetsJson, randomTweetsJson);
                    Console.Write(".");

                    //Parse the Json
                    List<Hashtable> filteredTweets = ParseJsonTweets(filteredTweetsJson, onlyExtractWords: false); //Tweets are appended with CREATED_AT_DATETIME
                    List<Hashtable> randomTweets = ParseJsonTweets(randomTweetsJson, onlyExtractWords: true); //Tweets only contain a "text" field
                    Console.Write(".");

                    //Extract (stemmed) words from tweets
                    WordCount wordCounts = new WordCount();
                    ExtractWords(randomTweets, wordCounts);
                    ExtractWords(filteredTweets, wordCounts); //Tweets are appended with a "words" string array
                    Console.Write(".");

                    //Insert words into Word and WordScores
                    Dictionary<string, long> wordIDs = null;
                    if (wordCounts.HasWords)
                    {
                        wordIDs = InsertWords(wordCounts, stopwords);
                    }
                    Console.Write(".");

                    if (filteredTweets.Count > 0)
                    {
                        //Make note of the time of the last tweet
                        _lastTweetTime = (DateTime)filteredTweets.Last()[CREATED_AT_DATETIME];

                        //Snapshot of the list before second-pass filtering, used for the per-hour counts
                        List<Hashtable> filteredTweetsBefore = new List<Hashtable>(filteredTweets);
                        if (Settings.TweetParser_UseSecondPassFiltering)
                        {
                            //Remove the off-topic tweets from further processing
                            var filterPerf = RemoveOffTopicTweets(filteredTweets, filters); //Tweets are appended with longitude and latitude
                            Console.Write(".");

                            //Insert filter performance
                            InsertFilterPerformance(filterPerf);
                            Console.Write(".");
                        }
                        //Count number of tweets per hour
                        StoreTweetCountPerHour(filteredTweets, filteredTweetsBefore);
                        Console.Write(".");

                        //Extract URLs
                        ExtractUrls(filteredTweets); //Tweets are appended with a "urls" string array
                        Console.Write(".");

                        //Insert into Tweet, TwitterUser, WordTweet, TweetUrl
                        MySqlCommand bigInsertCommand = BuildInsertSql(filteredTweets, wordIDs, stopwords);
                        Console.Write(".");

                        Helpers.RunSqlStatement(Name, bigInsertCommand);
                        Console.Write(".");

                        ParseAidrMetatags(filteredTweets);
                    }

                    //Delete every fetched JSON row. The decision is based on the fetched dictionaries, not on
                    //the parsed list: the parsed list can be empty (e.g. all tweets judged off-topic) even
                    //though rows were fetched, and those rows would otherwise never be deleted.
                    if (filteredTweetsJson.Count + randomTweetsJson.Count > 0)
                    {
                        DeleteParsedJsonTweets(randomTweetsJson.Keys.Union(filteredTweetsJson.Keys));
                    }
                    Console.WriteLine(".");

                    //Possibly perform maintenance
                    PerformMaintenance();

                    //Wait for up to 30 seconds, unless there are more tweets in the database to process
                    //(a full batch is taken as a sign that more tweets are waiting)
                    int runtime = (int)cycleTimer.ElapsedMilliseconds;
                    bool batchWasFull = filteredTweetsJson.Count + randomTweetsJson.Count >= Settings.TweetParser_BatchSize;
                    if (runtime < cycleMilliseconds && !batchWasFull)
                    {
                        Thread.Sleep(cycleMilliseconds - runtime);
                    }
                }
            }
            catch (Exception e)
            {
                //Any failure ends the loop; the exception is only reported, not rethrown
                Output.Print(Name, e);
            }
        }
Пример #2
0
        /// <summary>
        /// Inserts the words of <paramref name="wc"/> into the Word table (existing words are skipped via
        /// INSERT IGNORE), reads back the WordID of each word and adds the counts of this batch to
        /// Score1h and Score4d in the WordScore table.
        /// </summary>
        /// <param name="wc">Word counts collected for the current batch.</param>
        /// <param name="stopwords">Not read by this method; kept so that existing callers keep working.</param>
        /// <returns>
        /// Map from word to WordID for every word returned by the lookup query; an empty map when
        /// <paramref name="wc"/> holds no words.
        /// </returns>
        Dictionary<string, long> InsertWords(WordCount wc, HashSet<string> stopwords)
        {
            //Get unique words. They are materialized once because the list is walked several times
            //below and the source may be a lazily evaluated sequence.
            IEnumerable<string> wordSource = wc.GetWords();
            if (wordSource == null)
                return new Dictionary<string, long>();

            string[] uniqueWords = wordSource.ToArray();
            if (uniqueWords.Length == 0)
                return new Dictionary<string, long>();

            //Insert new words into Word (insert all, skip if exists)
            MySqlCommand insertCommand = new MySqlCommand();

            StringBuilder insertSql = new StringBuilder();
            insertSql.Append("INSERT IGNORE INTO Word (Word) VALUES ");
            for (int i = 0; i < uniqueWords.Length; i++)
            {
                if (i > 0)
                    insertSql.Append(",");

                //One parameter per word, so word text never becomes part of the SQL itself
                string varName = "@w" + i;
                insertSql.Append("(");
                insertSql.Append(varName);
                insertSql.Append(")");
                insertCommand.Parameters.AddWithValue(varName, uniqueWords[i]);
            }
            insertSql.AppendLine(";");

            Console.Write("'");

            insertCommand.CommandText = insertSql.ToString();
            Helpers.RunSqlStatement(Name, insertCommand, false);

            Console.Write("'");

            //Get WordID for each word
            //Track keywords may be counted as stopwords if they have been stemmed before insert into WordTweet
            MySqlCommand getAllWordsCommand = new MySqlCommand();
            StringBuilder selectSql = new StringBuilder();
            selectSql.Append(
                @"SELECT
                    Word,
                    WordID
                FROM Word WHERE Word IN (");
            Helpers.AppendList(selectSql, getAllWordsCommand, uniqueWords.Cast<object>().ToArray());
            selectSql.Append(");");
            getAllWordsCommand.CommandText = selectSql.ToString();

            Dictionary<string, long> wordIDs = new Dictionary<string, long>();
            Helpers.RunSelect(Name, getAllWordsCommand, wordIDs,
                (values, reader) => values.Add(reader.GetString("Word"), reader.GetInt64("WordID") ));

            Console.Write("'");

            if (wordIDs.Count == 0)
                return wordIDs; //Empty collection

            //Do insert-update into WordScore
            StringBuilder sbWordScore = new StringBuilder();
            sbWordScore.Append("INSERT INTO WordScore (WordID, Score1h, Score4d) VALUES ");

            bool hasRows = false;
            //Some UTF16 characters don't end up in wordIDs
            foreach (var wordCount in wc.GetWordCounts())
            {
                long wordID;
                if (!wordIDs.TryGetValue(wordCount.Key, out wordID))
                    continue;

                if (hasRows)
                    sbWordScore.Append(',');
                hasRows = true;

                //The batch count is added to both the 1h and the 4d score
                sbWordScore.Append('(');
                sbWordScore.Append(wordID);
                sbWordScore.Append(',');
                sbWordScore.Append(wordCount.Value);
                sbWordScore.Append(',');
                sbWordScore.Append(wordCount.Value);
                sbWordScore.Append(')');
            }

            //Without at least one value row the statement would be syntactically invalid, so skip it
            if (hasRows)
            {
                sbWordScore.Append(" ON DUPLICATE KEY UPDATE Score1h = Score1h + VALUES(Score1h), Score4d = Score4d + VALUES(Score4d);");
                Helpers.RunSqlStatement(Name, sbWordScore.ToString(), false);
            }

            Console.Write("'");

            return wordIDs;
        }
Пример #3
0
        /// <summary>
        /// Splits the text of each tweet into stemmed words, stores the distinct words on the tweet under
        /// the "words" key and adds them to <paramref name="wc"/> together with a retweet flag.
        /// </summary>
        /// <remarks>
        /// Tweets without text, or whose text yields no words, are marked with the IGNORE key and skipped.
        /// A tweet counts as a retweet when its decoded text starts with "RT @" or when it carries a
        /// "retweeted_status" table that contains an "id_str" entry.
        /// </remarks>
        /// <param name="tweets">Parsed tweets; each one is modified in place.</param>
        /// <param name="wc">Accumulator that receives the distinct words of every processed tweet.</param>
        void ExtractWords(List<Hashtable> tweets, WordCount wc)
        {
            foreach (Hashtable tweet in tweets)
            {
                string text = tweet["text"] as string;
                if (string.IsNullOrEmpty(text))
                {
                    tweet[IGNORE] = true;
                    continue;
                }

                text = Helpers.DecodeEncodedNonAsciiCharacters(text);
                string[] wordsInTweet = WordCount.GetWordsInString(text, useStemming: true);
                //string[] wordsInTweet = WordCount.GetWordsInStringWithBigrams(text, stopwords, useStemming: true);
                if (wordsInTweet.Length == 0)
                {
                    tweet[IGNORE] = true;
                    continue;
                }

                string[] uniqueWordsArr = wordsInTweet.Distinct().ToArray();
                tweet.Add("words", uniqueWordsArr);

                //"as" yields null for a missing, null or non-Hashtable value instead of throwing
                Hashtable retweetedStatus = tweet["retweeted_status"] as Hashtable;

                //Ordinal comparison: the "RT @" marker is a literal prefix, not culture-dependent text
                bool isRetweet = text.StartsWith("RT @", System.StringComparison.Ordinal)
                    || (retweetedStatus != null && retweetedStatus.ContainsKey("id_str"));

                wc.AddWords(uniqueWordsArr, isRetweet);
            }
        }
Пример #4
0
 /// <summary>
 /// Stores the words of <paramref name="words"/> in <c>_words</c> and the result of
 /// <c>WordCount.NaiveStemming</c> for each of them in <c>_stemmedWords</c>.
 /// </summary>
 /// <param name="words">
 /// Text that is split on every single space character; consecutive spaces therefore
 /// produce empty entries.
 /// </param>
 public void SetWords(string words)
 {
     var separators = new[] { ' ' };
     _words = words.Split(separators);

     //One stemmed entry per word, in the same order
     _stemmedWords = _words
         .Select(word => WordCount.NaiveStemming(word))
         .ToArray();
 }
Пример #5
0
        /// <summary>
        /// Inserts the words of <paramref name="wc"/> into the Word table (existing words are skipped via
        /// INSERT IGNORE), adds the counts of this batch to Score1h and Score4d in the WordScore table and
        /// loads the Idf and stopword flag of each word.
        /// </summary>
        /// <param name="wc">Word counts collected for the current batch.</param>
        /// <returns>
        /// Map from word to a <c>Word</c> object whose ID comes from the lookup query and whose Idf and
        /// IsStopWord are set for every row returned by the score query; an empty map when
        /// <paramref name="wc"/> holds no words.
        /// </returns>
        Dictionary<string, Word> InsertWords(WordCount wc)
        {
            //Get unique words. They are materialized once because the list is walked several times
            //below and the source may be a lazily evaluated sequence.
            IEnumerable<string> wordSource = wc.GetWords();
            if (wordSource == null)
                return new Dictionary<string, Word>();

            string[] uniqueWords = wordSource.ToArray();
            if (uniqueWords.Length == 0)
                return new Dictionary<string, Word>();

            //Insert new words into Word (insert all, skip if exists)
            MySqlCommand insertCommand = new MySqlCommand();

            StringBuilder insertSql = new StringBuilder();
            insertSql.Append("INSERT IGNORE INTO Word (Word) VALUES ");
            for (int i = 0; i < uniqueWords.Length; i++)
            {
                if (i > 0)
                    insertSql.Append(",");

                //One parameter per word, so word text never becomes part of the SQL itself
                string varName = "@w" + i;
                insertSql.Append("(");
                insertSql.Append(varName);
                insertSql.Append(")");
                insertCommand.Parameters.AddWithValue(varName, uniqueWords[i]);
            }
            insertSql.AppendLine(";");

            Console.Write("'");

            insertCommand.CommandText = insertSql.ToString();
            Helpers.RunSqlStatement(Name, insertCommand, false);

            Console.Write("'");

            //Get WordID for each word
            //Track keywords may be counted as stopwords if they have been stemmed before insert into WordTweet
            MySqlCommand selectCommand = new MySqlCommand();
            StringBuilder selectSql = new StringBuilder();
            selectSql.Append(
                @"SELECT
                    Word,
                    Word.WordID
                FROM Word left join WordScore on WordScore.WordID = Word.WordID WHERE Word IN (");
            AppendList(selectSql, selectCommand, uniqueWords.Cast<object>().ToArray());
            selectSql.Append(");");
            selectCommand.CommandText = selectSql.ToString();

            Dictionary<string, Word> wordIDs = new Dictionary<string, Word>();
            Helpers.RunSelect(Name, selectCommand, wordIDs,
                (values, reader) => values.Add(reader.GetString("Word"), new Word()
                {
                    ID = reader.GetInt64("WordID")
                }));

            Console.Write("'");

            if (wordIDs.Count == 0)
                return wordIDs; //Empty collection

            //Do insert-update into WordScore
            StringBuilder sbWordScore = new StringBuilder();
            sbWordScore.Append("INSERT INTO WordScore (WordID, Score1h, Score4d) VALUES ");

            bool hasRows = false;
            //Some UTF16 characters don't end up in wordIDs
            foreach (var wordCount in wc.GetWordCounts())
            {
                Word knownWord;
                if (!wordIDs.TryGetValue(wordCount.Key, out knownWord))
                    continue;

                if (hasRows)
                    sbWordScore.Append(',');
                hasRows = true;

                //The batch count is added to both the 1h and the 4d score
                sbWordScore.Append('(');
                sbWordScore.Append(knownWord.ID);
                sbWordScore.Append(',');
                sbWordScore.Append(wordCount.Value);
                sbWordScore.Append(',');
                sbWordScore.Append(wordCount.Value);
                sbWordScore.Append(')');
            }

            //Without at least one value row the statement would be syntactically invalid, so skip it
            if (hasRows)
            {
                sbWordScore.Append(" ON DUPLICATE KEY UPDATE Score1h = Score1h + VALUES(Score1h), Score4d = Score4d + VALUES(Score4d);");
                Helpers.RunSqlStatement(Name, sbWordScore.ToString(), false);
            }

            Console.Write("'");

            //Get wordscores and stopword status for words. Stopwords are never inserted into WordTweet.
            Dictionary<long, Word> wordIDsByID = new Dictionary<long, Word>();
            foreach (Word word in wordIDs.Values)
                wordIDsByID.Add(word.ID, word);

            //The IN list is built from numeric IDs only, so it is safe to concatenate.
            //IsStopWord: hashtags and active track keywords (FilterType = 0) are forced to 0;
            //every other word is compared against the 'WordScore4dHigh' constant.
            string getScoresSql =
                @"SELECT
                    WordID,
                    ScoreToIdf(Score4d) as Idf,
                    coalesce(
                        if(Word like '#%', 0, null),
                        (select 0 from TwitterTrackFilter where FilterType = 0 and Word = Word.Word and IsActive limit 1),
                        Score4d > (select value from Constants where name = 'WordScore4dHigh')
                    ) as IsStopWord
                FROM WordScore ws natural join Word WHERE WordID IN (" + string.Join(",", wordIDsByID.Keys.Select(n => n.ToString()).ToArray()) + ");";
            Helpers.RunSelect(Name, getScoresSql, wordIDsByID,
                (values, reader) =>
                {
                    Word word = values[reader.GetInt64("WordID")];
                    word.Idf = reader.GetDouble("Idf");
                    //NOTE(review): the alias above is spelled IsStopWord; this lookup presumably relies on
                    //case-insensitive column name matching in the reader - confirm
                    word.IsStopWord = reader.GetBoolean("IsStopword");
                });

            return wordIDs;
        }
Пример #6
0
        /// <summary>
        /// Splits the text of each tweet into stemmed words, stores the distinct words on the tweet under
        /// the "words" key and adds them to <paramref name="wc"/>.
        /// </summary>
        /// <remarks>Tweets without text, or whose text yields no words, are left untouched.</remarks>
        /// <param name="tweets">Parsed tweets; each processed one is modified in place.</param>
        /// <param name="wc">Accumulator that receives the distinct words of every processed tweet.</param>
        void ExtractWords(List<Hashtable> tweets, WordCount wc)
        {
            foreach (Hashtable tweet in tweets)
            {
                string rawText = tweet["text"] as string;
                if (!string.IsNullOrEmpty(rawText))
                {
                    string decodedText = Helpers.DecodeEncodedNonAsciiCharacters(rawText);
                    string[] tokens = WordCount.GetWordsInString(decodedText, useStemming: true);
                    if (tokens.Length > 0)
                    {
                        //Each word is stored and counted once per tweet
                        string[] distinctTokens = tokens.Distinct().ToArray();
                        tweet.Add("words", distinctTokens);

                        wc.AddWords(distinctTokens);
                    }
                }
            }

            //Word co-occurrence (not implemented yet)
            /*
             * Remove stopwords (maybe do on insert?)
             * Record all co-occurrences
             */
        }