/// <summary>
/// Inserts every word of <paramref name="wc"/> into the Word table (existing words are
/// skipped via INSERT IGNORE), reads back the WordID of each word, and adds each word's
/// count to both Score1h and Score4d in WordScore.
/// </summary>
/// <param name="wc">Supplies the words (<c>GetWords</c>) and their counts (<c>GetWordCounts</c>).</param>
/// <param name="stopwords">Not referenced by this overload; retained so existing callers keep compiling.</param>
/// <returns>
/// Map from word text, as read back from the Word table, to its WordID. Empty when
/// <paramref name="wc"/> yields no words or when no word could be read back.
/// </returns>
Dictionary<string, long> InsertWords(WordCount wc, HashSet<string> stopwords)
{
    //Get unique words. Materialized once because the list is walked several times below.
    IEnumerable<string> wordSource = wc.GetWords();
    if (wordSource == null)
        return new Dictionary<string, long>();

    string[] uniqueWords = wordSource.ToArray();
    if (uniqueWords.Length == 0)
        return new Dictionary<string, long>();

    //Insert new words into Word (insert all, skip if exists)
    MySqlCommand insertCommand = new MySqlCommand();
    StringBuilder insertSql = new StringBuilder();
    insertSql.Append("INSERT IGNORE INTO Word (Word) VALUES ");
    for (int i = 0; i < uniqueWords.Length; i++)
    {
        if (i > 0)
            insertSql.Append(",");

        //One named parameter per word keeps the word text out of the SQL string itself
        string varName = "@w" + i;
        insertSql.Append("(");
        insertSql.Append(varName);
        insertSql.Append(")");
        insertCommand.Parameters.AddWithValue(varName, uniqueWords[i]);
    }
    insertSql.AppendLine(";");

    Console.Write("'");
    insertCommand.CommandText = insertSql.ToString();
    Helpers.RunSqlStatement(Name, insertCommand, false);
    Console.Write("'");

    //Get WordID for each word
    //Track keywords may be counted as stopwords if they have been stemmed before insert into WordTweet
    MySqlCommand getAllWordsCommand = new MySqlCommand();
    StringBuilder selectSql = new StringBuilder();
    selectSql.Append(@"SELECT Word, WordID FROM Word WHERE Word IN (");
    Helpers.AppendList(selectSql, getAllWordsCommand, uniqueWords.Cast<object>().ToArray());
    selectSql.Append(");");
    getAllWordsCommand.CommandText = selectSql.ToString();

    Dictionary<string, long> wordIDs = new Dictionary<string, long>();
    Helpers.RunSelect(Name, getAllWordsCommand, wordIDs,
        (values, reader) => values.Add(reader.GetString("Word"), reader.GetInt64("WordID")));
    Console.Write("'");

    if (wordIDs.Count == 0)
        return wordIDs; //Empty collection

    //Do insert-update into WordScore
    StringBuilder sbWordScore = new StringBuilder();
    sbWordScore.Append("INSERT INTO WordScore (WordID, Score1h, Score4d) VALUES ");
    int scoreRows = 0;
    foreach (var wordCount in wc.GetWordCounts())
    {
        //Some UTF16 characters don't end up in wordIDs
        long wordID;
        if (!wordIDs.TryGetValue(wordCount.Key, out wordID))
            continue;

        if (scoreRows > 0)
            sbWordScore.Append(',');

        //The same count seeds/increments both the 1h and the 4d score.
        //NOTE(review): Append formats numbers with the current culture - confirm the count type is integral.
        sbWordScore.Append('(');
        sbWordScore.Append(wordID);
        sbWordScore.Append(',');
        sbWordScore.Append(wordCount.Value);
        sbWordScore.Append(',');
        sbWordScore.Append(wordCount.Value);
        sbWordScore.Append(')');
        scoreRows++;
    }

    //An INSERT with an empty VALUES list is invalid SQL, so only run it when at least one row was built
    if (scoreRows > 0)
    {
        sbWordScore.Append(" ON DUPLICATE KEY UPDATE Score1h = Score1h + VALUES(Score1h), Score4d = Score4d + VALUES(Score4d);");
        Helpers.RunSqlStatement(Name, sbWordScore.ToString(), false);
    }
    Console.Write("'");

    return wordIDs;
}
/// <summary>
/// Inserts every word of <paramref name="wc"/> into the Word table (existing words are
/// skipped via INSERT IGNORE), reads back the WordID of each word, adds each word's count
/// to both Score1h and Score4d in WordScore, and finally loads Idf and stop-word status
/// for the words from the database.
/// </summary>
/// <param name="wc">Supplies the words (<c>GetWords</c>) and their counts (<c>GetWordCounts</c>).</param>
/// <returns>
/// Map from word text, as read back from the Word table, to a <c>Word</c> whose <c>ID</c> is set.
/// <c>Idf</c> and <c>IsStopWord</c> are assigned for every word the score query returns a row for.
/// Empty when <paramref name="wc"/> yields no words or when no word could be read back.
/// </returns>
Dictionary<string, Word> InsertWords(WordCount wc)
{
    //Get unique words. Materialized once because the list is walked several times below.
    IEnumerable<string> wordSource = wc.GetWords();
    if (wordSource == null)
        return new Dictionary<string, Word>();

    string[] uniqueWords = wordSource.ToArray();
    if (uniqueWords.Length == 0)
        return new Dictionary<string, Word>();

    //Insert new words into Word (insert all, skip if exists)
    MySqlCommand insertCommand = new MySqlCommand();
    StringBuilder insertSql = new StringBuilder();
    insertSql.Append("INSERT IGNORE INTO Word (Word) VALUES ");
    for (int i = 0; i < uniqueWords.Length; i++)
    {
        if (i > 0)
            insertSql.Append(",");

        //One named parameter per word keeps the word text out of the SQL string itself
        string varName = "@w" + i;
        insertSql.Append("(");
        insertSql.Append(varName);
        insertSql.Append(")");
        insertCommand.Parameters.AddWithValue(varName, uniqueWords[i]);
    }
    insertSql.AppendLine(";");

    Console.Write("'");
    insertCommand.CommandText = insertSql.ToString();
    Helpers.RunSqlStatement(Name, insertCommand, false);
    Console.Write("'");

    //Get WordID for each word
    //Track keywords may be counted as stopwords if they have been stemmed before insert into WordTweet
    MySqlCommand selectCommand = new MySqlCommand();
    StringBuilder selectSql = new StringBuilder();
    selectSql.Append(@"SELECT Word, Word.WordID FROM Word left join WordScore on WordScore.WordID = Word.WordID WHERE Word IN (");
    AppendList(selectSql, selectCommand, uniqueWords.Cast<object>().ToArray());
    selectSql.Append(");");
    selectCommand.CommandText = selectSql.ToString();

    Dictionary<string, Word> wordIDs = new Dictionary<string, Word>();
    Helpers.RunSelect(Name, selectCommand, wordIDs,
        (values, reader) => values.Add(reader.GetString("Word"), new Word() { ID = reader.GetInt64("WordID") }));
    Console.Write("'");

    if (wordIDs.Count == 0)
        return wordIDs; //Empty collection

    //Do insert-update into WordScore
    StringBuilder sbWordScore = new StringBuilder();
    sbWordScore.Append("INSERT INTO WordScore (WordID, Score1h, Score4d) VALUES ");
    int scoreRows = 0;
    foreach (var wordCount in wc.GetWordCounts())
    {
        //Some UTF16 characters don't end up in wordIDs
        Word knownWord;
        if (!wordIDs.TryGetValue(wordCount.Key, out knownWord))
            continue;

        if (scoreRows > 0)
            sbWordScore.Append(',');

        //The same count seeds/increments both the 1h and the 4d score.
        //NOTE(review): Append formats numbers with the current culture - confirm the count type is integral.
        sbWordScore.Append('(');
        sbWordScore.Append(knownWord.ID);
        sbWordScore.Append(',');
        sbWordScore.Append(wordCount.Value);
        sbWordScore.Append(',');
        sbWordScore.Append(wordCount.Value);
        sbWordScore.Append(')');
        scoreRows++;
    }

    //An INSERT with an empty VALUES list is invalid SQL, so only run it when at least one row was built
    if (scoreRows > 0)
    {
        sbWordScore.Append(" ON DUPLICATE KEY UPDATE Score1h = Score1h + VALUES(Score1h), Score4d = Score4d + VALUES(Score4d);");
        Helpers.RunSqlStatement(Name, sbWordScore.ToString(), false);
    }
    Console.Write("'");

    //Get wordscores and stopword status for words. Stopwords are never inserted into WordTweet.
    //Re-key the same Word instances by ID so the reader callback can update them in place.
    Dictionary<long, Word> wordIDsByID = new Dictionary<long, Word>();
    foreach (Word word in wordIDs.Values)
        wordIDsByID.Add(word.ID, word);

    //IsStopWord takes the first non-null of: 0 for words starting with '#', 0 for words with an
    //active FilterType = 0 row in TwitterTrackFilter, otherwise Score4d > the 'WordScore4dHigh' constant.
    string getScoresSql =
        "SELECT WordID, ScoreToIdf(Score4d) as Idf, " +
        "coalesce( if(Word like '#%', 0, null), " +
        "(select 0 from TwitterTrackFilter where FilterType = 0 and Word = Word.Word and IsActive limit 1), " +
        "Score4d > (select value from Constants where name = 'WordScore4dHigh') ) as IsStopWord " +
        "FROM WordScore ws natural join Word " +
        "WHERE WordID IN (" +
        string.Join(",", wordIDsByID.Keys.Select(n => n.ToString()).ToArray()) +
        ");";

    Helpers.RunSelect(Name, getScoresSql, wordIDsByID, (values, reader) =>
    {
        Word word = values[reader.GetInt64("WordID")];
        word.Idf = reader.GetDouble("Idf");
        word.IsStopWord = reader.GetBoolean("IsStopword");
    });

    return wordIDs;
}