/// <summary>
/// Runs the parser loop. Each pass loads the active track filters and stopwords, fetches a batch
/// of JSON tweets, parses them, updates the word tables, stores the filtered tweets and finally
/// deletes the processed JSON rows.
/// </summary>
/// <remarks>
/// The loop itself never exits. The try/catch surrounds the whole loop, so the first exception
/// is passed to <c>Output.Print</c> and the method returns.
/// </remarks>
public void Run()
{
    //Target duration of one pass when the fetched batch was not full
    const int cycleMilliseconds = 30000;

    try
    {
        while (true)
        {
            //Monotonic timer. A difference of two DateTime.Now values jumps on DST switches and
            //clock adjustments, which could turn the sleep at the end of the pass into a very long stall.
            System.Diagnostics.Stopwatch cycleTimer = System.Diagnostics.Stopwatch.StartNew();

            //Get current tracking filters from database
            Console.Write("|");
            List<TrackFilter> filters = GetActiveTrackFilters();
            Console.Write(".");
            HashSet<string> stopwords = GetStopwords();

            //Get a batch of tweets
            Dictionary<long, string> randomTweetsJson = new Dictionary<long, string>();
            Dictionary<long, string> filteredTweetsJson = new Dictionary<long, string>();
            GetTweets(filteredTweetsJson, randomTweetsJson);
            Console.Write(".");

            //Parse the Json
            List<Hashtable> filteredTweets = ParseJsonTweets(filteredTweetsJson, onlyExtractWords: false); //Tweets are appended with CREATED_AT_DATETIME
            List<Hashtable> randomTweets = ParseJsonTweets(randomTweetsJson, onlyExtractWords: true); //Tweets only contain a "text" field
            Console.Write(".");

            //Extract (stemmed) words from tweets
            WordCount wordCounts = new WordCount();
            ExtractWords(randomTweets, wordCounts);
            ExtractWords(filteredTweets, wordCounts); //Tweets are appended with a "words" string array
            Console.Write(".");

            //Insert words into Word and WordScores
            //wordIDs stays null when the batch produced no words
            Dictionary<string, long> wordIDs = null;
            if (wordCounts.HasWords)
                wordIDs = InsertWords(wordCounts, stopwords);
            Console.Write(".");

            if (filteredTweets.Count > 0)
            {
                //Make note of the time of the last tweet
                _lastTweetTime = (DateTime)filteredTweets.Last()[CREATED_AT_DATETIME];

                //Snapshot of the list taken before the optional second-pass filtering below
                List<Hashtable> filteredTweetsBefore = new List<Hashtable>(filteredTweets);
                if (Settings.TweetParser_UseSecondPassFiltering)
                {
                    //Remove the off-topic tweets from further processing
                    var filterPerf = RemoveOffTopicTweets(filteredTweets, filters); //Tweets are appended with longitude and latitude
                    Console.Write(".");

                    //Insert filter performance
                    InsertFilterPerformance(filterPerf);
                    Console.Write(".");
                }

                //Count number of tweets per hour
                StoreTweetCountPerHour(filteredTweets, filteredTweetsBefore);
                Console.Write(".");

                //Extract URLs
                ExtractUrls(filteredTweets); //Tweets are appended with a "urls" string array
                Console.Write(".");

                //Insert into Tweet, TwitterUser, WordTweet, TweetUrl
                MySqlCommand bigInsertCommand = BuildInsertSql(filteredTweets, wordIDs, stopwords);
                Console.Write(".");
                Helpers.RunSqlStatement(Name, bigInsertCommand);
                Console.Write(".");

                ParseAidrMetatags(filteredTweets);
            }

            //Delete every fetched JSON row. The check uses the fetched (JSON) counts rather than the
            //parsed list: filteredTweets can be empty here even though rows were fetched, and those
            //rows would otherwise never be deleted.
            if (filteredTweetsJson.Count + randomTweetsJson.Count > 0)
            {
                DeleteParsedJsonTweets(randomTweetsJson.Keys.Union(filteredTweetsJson.Keys));
            }
            Console.WriteLine(".");

            //Possibly perform maintenance
            PerformMaintenance();

            //Wait for up to 30 seconds, unless there are more tweets in the database to process
            long runtime = cycleTimer.ElapsedMilliseconds;
            if (runtime < cycleMilliseconds
                && filteredTweetsJson.Count + randomTweetsJson.Count < Settings.TweetParser_BatchSize)
            {
                Thread.Sleep((int)(cycleMilliseconds - runtime));
            }
        }
    }
    catch (Exception e)
    {
        Output.Print(Name, e);
    }
}
/// <summary>
/// Inserts the words of <paramref name="wc"/> into the Word table (existing rows are skipped),
/// reads back their WordIDs and adds each word's count to Score1h and Score4d in WordScore.
/// </summary>
/// <param name="wc">Word counts to store.</param>
/// <param name="stopwords">Not used by this method.</param>
/// <returns>
/// Word text as returned by the database, mapped to its WordID. Empty when <paramref name="wc"/>
/// has no words or the lookup returns no rows.
/// </returns>
Dictionary<string, long> InsertWords(WordCount wc, HashSet<string> stopwords)
{
    //Get unique words. Materialized once because the sequence is consumed several times below.
    IEnumerable<string> wordSource = wc.GetWords();
    if (wordSource == null)
        return new Dictionary<string, long>();
    List<string> uniqueWords = wordSource.ToList();
    if (uniqueWords.Count == 0)
        return new Dictionary<string, long>();

    //Insert new words into Word (insert all, skip if exists)
    MySqlCommand insertCommand = new MySqlCommand();
    StringBuilder insertSql = new StringBuilder();
    insertSql.Append("INSERT IGNORE INTO Word (Word) VALUES ");
    for (int i = 0; i < uniqueWords.Count; i++)
    {
        if (i > 0)
            insertSql.Append(",");
        string varName = "@w" + i;
        insertSql.Append("(");
        insertSql.Append(varName);
        insertSql.Append(")");
        insertCommand.Parameters.AddWithValue(varName, uniqueWords[i]);
    }
    insertSql.AppendLine(";");
    Console.Write("'");

    insertCommand.CommandText = insertSql.ToString();
    Helpers.RunSqlStatement(Name, insertCommand, false);
    Console.Write("'");

    //Get WordID for each word
    //Track keywords may be counted as stopwords if they have been stemmed before insert into WordTweet
    MySqlCommand getAllWordsCommand = new MySqlCommand();
    StringBuilder selectSql = new StringBuilder();
    selectSql.Append(@"SELECT Word, WordID FROM Word WHERE Word IN (");
    Helpers.AppendList(selectSql, getAllWordsCommand, uniqueWords.Cast<object>().ToArray());
    selectSql.Append(");");
    getAllWordsCommand.CommandText = selectSql.ToString();

    Dictionary<string, long> wordIDs = new Dictionary<string, long>();
    Helpers.RunSelect(Name, getAllWordsCommand, wordIDs,
        (values, reader) => values.Add(reader.GetString("Word"), reader.GetInt64("WordID")));
    Console.Write("'");

    if (wordIDs.Count == 0)
        return wordIDs; //Empty collection

    //Do insert-update into WordScore
    StringBuilder sbWordScore = new StringBuilder();
    sbWordScore.Append("INSERT INTO WordScore (WordID, Score1h, Score4d) VALUES ");
    bool hasRows = false;
    foreach (var wordCount in wc.GetWordCounts())
    {
        //Some UTF16 characters don't end up in wordIDs
        long wordID;
        if (!wordIDs.TryGetValue(wordCount.Key, out wordID))
            continue;

        if (hasRows)
            sbWordScore.Append(',');
        hasRows = true;

        //The same count is added to both the 1h and the 4d score
        sbWordScore.Append('(');
        sbWordScore.Append(wordID);
        sbWordScore.Append(',');
        sbWordScore.Append(wordCount.Value);
        sbWordScore.Append(',');
        sbWordScore.Append(wordCount.Value);
        sbWordScore.Append(')');
    }

    //An INSERT with an empty VALUES list is a syntax error, so skip the statement when no
    //counted word matched a returned WordID
    if (hasRows)
    {
        sbWordScore.Append(" ON DUPLICATE KEY UPDATE Score1h = Score1h + VALUES(Score1h), Score4d = Score4d + VALUES(Score4d);");
        Helpers.RunSqlStatement(Name, sbWordScore.ToString(), false);
    }
    Console.Write("'");

    return wordIDs;
}
/// <summary>
/// Extracts the distinct stemmed words of every tweet, stores them on the tweet under "words"
/// and adds them to <paramref name="wc"/>. Tweets without text or without words are flagged
/// with <c>IGNORE</c> instead.
/// </summary>
/// <param name="tweets">Parsed tweets; each entry is modified in place.</param>
/// <param name="wc">Accumulator that receives the words together with a retweet flag.</param>
void ExtractWords(List<Hashtable> tweets, WordCount wc)
{
    foreach (Hashtable tweet in tweets)
    {
        string text = tweet["text"] as string;
        if (string.IsNullOrEmpty(text))
        {
            tweet[IGNORE] = true;
            continue;
        }

        text = Helpers.DecodeEncodedNonAsciiCharacters(text);
        string[] wordsInTweet = WordCount.GetWordsInString(text, useStemming: true);
        //Bigram alternative: WordCount.GetWordsInStringWithBigrams(text, stopwords, useStemming: true)
        if (wordsInTweet.Length == 0)
        {
            tweet[IGNORE] = true;
            continue;
        }

        string[] uniqueWordsArr = wordsInTweet.Distinct().ToArray();
        tweet.Add("words", uniqueWordsArr);

        //A tweet counts as a retweet when its text starts with the literal "RT @" prefix or it
        //carries a retweeted_status object with an id_str. The prefix is compared ordinally
        //(it is a fixed marker, not linguistic text), and a missing, null or non-Hashtable
        //retweeted_status simply means "no retweet status" rather than an exception.
        Hashtable retweetedStatus = tweet["retweeted_status"] as Hashtable;
        bool isRetweet = text.StartsWith("RT @", StringComparison.Ordinal)
            || (retweetedStatus != null && retweetedStatus.ContainsKey("id_str"));

        wc.AddWords(uniqueWordsArr, isRetweet);
    }
}
/// <summary>
/// Splits <paramref name="words"/> on the space character and stores the resulting tokens
/// together with the <c>WordCount.NaiveStemming</c> result of each token.
/// </summary>
/// <param name="words">Space-separated words.</param>
public void SetWords(string words)
{
    _words = words.Split(' ');
    _stemmedWords = (from token in _words
                     select WordCount.NaiveStemming(token)).ToArray();
}
/// <summary>
/// Inserts the words of <paramref name="wc"/> into the Word table (existing rows are skipped),
/// adds each word's count to Score1h and Score4d in WordScore, and then loads the Idf and
/// stopword flag of every word.
/// </summary>
/// <param name="wc">Word counts to store.</param>
/// <returns>
/// Word text as returned by the database, mapped to a <c>Word</c> carrying its ID and, for rows
/// returned by the score query, Idf and IsStopWord. Empty when <paramref name="wc"/> has no
/// words or the lookup returns no rows.
/// </returns>
Dictionary<string, Word> InsertWords(WordCount wc)
{
    //Get unique words. Materialized once because the sequence is consumed several times below.
    IEnumerable<string> wordSource = wc.GetWords();
    if (wordSource == null)
        return new Dictionary<string, Word>();
    List<string> uniqueWords = wordSource.ToList();
    if (uniqueWords.Count == 0)
        return new Dictionary<string, Word>();

    //Insert new words into Word (insert all, skip if exists)
    MySqlCommand insertCommand = new MySqlCommand();
    StringBuilder insertSql = new StringBuilder();
    insertSql.Append("INSERT IGNORE INTO Word (Word) VALUES ");
    for (int i = 0; i < uniqueWords.Count; i++)
    {
        if (i > 0)
            insertSql.Append(",");
        string varName = "@w" + i;
        insertSql.Append("(");
        insertSql.Append(varName);
        insertSql.Append(")");
        insertCommand.Parameters.AddWithValue(varName, uniqueWords[i]);
    }
    insertSql.AppendLine(";");
    Console.Write("'");

    insertCommand.CommandText = insertSql.ToString();
    Helpers.RunSqlStatement(Name, insertCommand, false);
    Console.Write("'");

    //Get WordID for each word
    //Track keywords may be counted as stopwords if they have been stemmed before insert into WordTweet
    MySqlCommand selectCommand = new MySqlCommand();
    StringBuilder selectSql = new StringBuilder();
    selectSql.Append(@"SELECT Word, Word.WordID FROM Word left join WordScore on WordScore.WordID = Word.WordID WHERE Word IN (");
    AppendList(selectSql, selectCommand, uniqueWords.Cast<object>().ToArray());
    selectSql.Append(");");
    selectCommand.CommandText = selectSql.ToString();

    Dictionary<string, Word> wordIDs = new Dictionary<string, Word>();
    Helpers.RunSelect(Name, selectCommand, wordIDs,
        (values, reader) => values.Add(reader.GetString("Word"), new Word() { ID = reader.GetInt64("WordID") }));
    Console.Write("'");

    if (wordIDs.Count == 0)
        return wordIDs; //Empty collection

    //Do insert-update into WordScore
    StringBuilder sbWordScore = new StringBuilder();
    sbWordScore.Append("INSERT INTO WordScore (WordID, Score1h, Score4d) VALUES ");
    bool hasRows = false;
    foreach (var wordCount in wc.GetWordCounts())
    {
        //Some UTF16 characters don't end up in wordIDs
        Word knownWord;
        if (!wordIDs.TryGetValue(wordCount.Key, out knownWord))
            continue;

        if (hasRows)
            sbWordScore.Append(',');
        hasRows = true;

        //The same count is added to both the 1h and the 4d score
        sbWordScore.Append('(');
        sbWordScore.Append(knownWord.ID);
        sbWordScore.Append(',');
        sbWordScore.Append(wordCount.Value);
        sbWordScore.Append(',');
        sbWordScore.Append(wordCount.Value);
        sbWordScore.Append(')');
    }

    //An INSERT with an empty VALUES list is a syntax error, so skip the statement when no
    //counted word matched a returned WordID
    if (hasRows)
    {
        sbWordScore.Append(" ON DUPLICATE KEY UPDATE Score1h = Score1h + VALUES(Score1h), Score4d = Score4d + VALUES(Score4d);");
        Helpers.RunSqlStatement(Name, sbWordScore.ToString(), false);
    }
    Console.Write("'");

    //Get wordscores and stopword status for words. Stopwords are never inserted into WordTweet.
    Dictionary<long, Word> wordIDsByID = new Dictionary<long, Word>();
    foreach (Word word in wordIDs.Values)
        wordIDsByID.Add(word.ID, word);

    //The IDs are numeric values read from the database, so they are joined into the statement directly
    string getScoresSql =
        "SELECT WordID, ScoreToIdf(Score4d) as Idf, " +
        "coalesce( " +
        "if(Word like '#%', 0, null), " +
        "(select 0 from TwitterTrackFilter where FilterType = 0 and Word = Word.Word and IsActive limit 1), " +
        "Score4d > (select value from Constants where name = 'WordScore4dHigh') " +
        ") as IsStopWord " +
        "FROM WordScore ws natural join Word " +
        "WHERE WordID IN ("
        + string.Join(",", wordIDsByID.Keys.Select(n => n.ToString()).ToArray())
        + ");";

    //The Word instances are shared between wordIDs and wordIDsByID, so filling them in here
    //also updates the values of the returned dictionary
    Helpers.RunSelect(Name, getScoresSql, wordIDsByID, (values, reader) =>
    {
        Word word = values[reader.GetInt64("WordID")];
        word.Idf = reader.GetDouble("Idf");
        word.IsStopWord = reader.GetBoolean("IsStopword");
    });

    return wordIDs;
}
/// <summary>
/// Extracts the distinct stemmed words of every tweet that has text, stores them on the tweet
/// under "words" and adds them to <paramref name="wc"/>. Tweets without text or without words
/// are left untouched.
/// </summary>
/// <param name="tweets">Parsed tweets; each entry with words is modified in place.</param>
/// <param name="wc">Accumulator that receives the words of each tweet.</param>
void ExtractWords(List<Hashtable> tweets, WordCount wc)
{
    foreach (Hashtable tweet in tweets)
    {
        string rawText = tweet["text"] as string;
        if (string.IsNullOrEmpty(rawText))
            continue;

        string decodedText = Helpers.DecodeEncodedNonAsciiCharacters(rawText);
        string[] tokens = WordCount.GetWordsInString(decodedText, useStemming: true);
        if (tokens.Length > 0)
        {
            string[] distinctTokens = tokens.Distinct().ToArray();
            tweet.Add("words", distinctTokens);
            wc.AddWords(distinctTokens);
        }
    }

    //TODO: Word co-occurrence
    //  - Remove stopwords (maybe do on insert?)
    //  - Record all co-occurrences
}