/// <summary>
/// Registers every word of <paramref name="wc"/> in the Word table, adds the
/// word counts to WordScore (both Score1h and Score4d), and then loads the
/// Idf value and stopword flag for each word that was found.
/// </summary>
/// <param name="wc">Source of the unique words and of their counts.</param>
/// <returns>
/// The words found in the Word table, keyed by the word text returned by the
/// database. Empty when <paramref name="wc"/> yields no words or when none of
/// them could be read back.
/// </returns>
/// <remarks>Writes "'" progress ticks to the console between database steps.</remarks>
Dictionary<string, Word> InsertWords(WordCount wc)
{
    // Materialise once: the sequence is needed for the emptiness check, for
    // the INSERT parameters and for the SELECT list, and may be lazily evaluated.
    IEnumerable<string> source = wc.GetWords();
    if (source == null)
    {
        return new Dictionary<string, Word>();
    }

    List<string> uniqueWords = source.ToList();
    if (uniqueWords.Count == 0)
    {
        return new Dictionary<string, Word>();
    }

    // Insert new words into Word (insert all, skip if exists).
    MySqlCommand insertCommand = new MySqlCommand();
    StringBuilder insertSql = new StringBuilder();
    insertSql.Append("INSERT IGNORE INTO Word (Word) VALUES ");
    for (int i = 0; i < uniqueWords.Count; i++)
    {
        if (i > 0)
        {
            insertSql.Append(",");
        }

        string varName = "@w" + i;
        insertSql.Append("(");
        insertSql.Append(varName);
        insertSql.Append(")");
        insertCommand.Parameters.AddWithValue(varName, uniqueWords[i]);
    }
    insertSql.AppendLine(";");

    Console.Write("'");
    insertCommand.CommandText = insertSql.ToString();
    Helpers.RunSqlStatement(Name, insertCommand, false);
    Console.Write("'");

    // Get WordID for each word.
    // Track keywords may be counted as stopwords if they have been stemmed
    // before insert into WordTweet.
    MySqlCommand selectCommand = new MySqlCommand();
    StringBuilder selectSql = new StringBuilder();
    selectSql.Append(
        @"SELECT Word, Word.WordID FROM Word left join WordScore on WordScore.WordID = Word.WordID WHERE Word IN (");
    AppendList(selectSql, selectCommand, uniqueWords.Cast<object>().ToArray());
    selectSql.Append(");");
    selectCommand.CommandText = selectSql.ToString();

    Dictionary<string, Word> wordIDs = new Dictionary<string, Word>();
    Helpers.RunSelect(Name, selectCommand, wordIDs,
        (values, reader) => values.Add(reader.GetString("Word"), new Word() { ID = reader.GetInt64("WordID") }));
    Console.Write("'");

    if (wordIDs.Count == 0)
    {
        return wordIDs; // Empty collection
    }

    // Do insert-update into WordScore.
    StringBuilder sbWordScore = new StringBuilder();
    sbWordScore.Append("INSERT INTO WordScore (WordID, Score1h, Score4d) VALUES ");
    bool first = true;
    foreach (var wordCount in wc.GetWordCounts())
    {
        // Some UTF16 characters don't end up in wordIDs; those counts are skipped.
        Word known;
        if (!wordIDs.TryGetValue(wordCount.Key, out known))
        {
            continue;
        }

        if (first)
        {
            first = false;
        }
        else
        {
            sbWordScore.Append(',');
        }

        // The same count is added to both the 1h and the 4d score column.
        sbWordScore.Append('(');
        sbWordScore.Append(known.ID);
        sbWordScore.Append(',');
        sbWordScore.Append(wordCount.Value);
        sbWordScore.Append(',');
        sbWordScore.Append(wordCount.Value);
        sbWordScore.Append(')');
    }

    // An empty VALUES list is not valid SQL, so only run the statement when
    // at least one count matched a key in wordIDs.
    if (!first)
    {
        sbWordScore.Append(" ON DUPLICATE KEY UPDATE Score1h = Score1h + VALUES(Score1h), Score4d = Score4d + VALUES(Score4d);");
        Helpers.RunSqlStatement(Name, sbWordScore.ToString(), false);
    }
    Console.Write("'");

    // Get wordscores and stopword status for words.
    // Stopwords are never inserted into WordTweet.
    Dictionary<long, Word> wordIDsByID = new Dictionary<long, Word>();
    foreach (Word word in wordIDs.Values)
    {
        wordIDsByID.Add(word.ID, word);
    }

    string idList = string.Join(",", wordIDsByID.Keys.Select(n => n.ToString()).ToArray());
    string getScoresSql =
        "SELECT WordID, ScoreToIdf(Score4d) as Idf, "
        + "coalesce( "
        + "if(Word like '#%', 0, null), "
        + "(select 0 from TwitterTrackFilter where FilterType = 0 and Word = Word.Word and IsActive limit 1), "
        + "Score4d > (select value from Constants where name = 'WordScore4dHigh') "
        + ") as IsStopWord "
        + "FROM WordScore ws natural join Word "
        + "WHERE WordID IN (" + idList + ");";

    // The Word instances are shared between wordIDs and wordIDsByID, so
    // filling them in here also updates the dictionary that is returned.
    Helpers.RunSelect(Name, getScoresSql, wordIDsByID, (values, reader) =>
    {
        Word word = values[reader.GetInt64("WordID")];
        word.Idf = reader.GetDouble("Idf");
        word.IsStopWord = reader.GetBoolean("IsStopWord");
    });

    return wordIDs;
}
/// <summary>
/// Registers every word of <paramref name="wc"/> in the Word table and adds
/// the word counts to WordScore (both Score1h and Score4d).
/// </summary>
/// <param name="wc">Source of the unique words and of their counts.</param>
/// <param name="stopwords">Not read by this method; kept so existing callers still compile.</param>
/// <returns>
/// WordID of each word found in the Word table, keyed by the word text
/// returned by the database. Empty when <paramref name="wc"/> yields no words
/// or when none of them could be read back.
/// </returns>
/// <remarks>Writes "'" progress ticks to the console between database steps.</remarks>
Dictionary<string, long> InsertWords(WordCount wc, HashSet<string> stopwords)
{
    // Materialise once: the sequence is needed for the emptiness check, for
    // the INSERT parameters and for the SELECT list, and may be lazily evaluated.
    IEnumerable<string> source = wc.GetWords();
    if (source == null)
    {
        return new Dictionary<string, long>();
    }

    List<string> uniqueWords = source.ToList();
    if (uniqueWords.Count == 0)
    {
        return new Dictionary<string, long>();
    }

    // Insert new words into Word (insert all, skip if exists).
    MySqlCommand insertCommand = new MySqlCommand();
    StringBuilder insertSql = new StringBuilder();
    insertSql.Append("INSERT IGNORE INTO Word (Word) VALUES ");
    for (int i = 0; i < uniqueWords.Count; i++)
    {
        if (i > 0)
        {
            insertSql.Append(",");
        }

        string varName = "@w" + i;
        insertSql.Append("(");
        insertSql.Append(varName);
        insertSql.Append(")");
        insertCommand.Parameters.AddWithValue(varName, uniqueWords[i]);
    }
    insertSql.AppendLine(";");

    Console.Write("'");
    insertCommand.CommandText = insertSql.ToString();
    Helpers.RunSqlStatement(Name, insertCommand, false);
    Console.Write("'");

    // Get WordID for each word.
    // Track keywords may be counted as stopwords if they have been stemmed
    // before insert into WordTweet.
    MySqlCommand getAllWordsCommand = new MySqlCommand();
    StringBuilder selectSql = new StringBuilder();
    selectSql.Append(
        @"SELECT Word, WordID FROM Word WHERE Word IN (");
    Helpers.AppendList(selectSql, getAllWordsCommand, uniqueWords.Cast<object>().ToArray());
    selectSql.Append(");");
    getAllWordsCommand.CommandText = selectSql.ToString();

    Dictionary<string, long> wordIDs = new Dictionary<string, long>();
    Helpers.RunSelect(Name, getAllWordsCommand, wordIDs,
        (values, reader) => values.Add(reader.GetString("Word"), reader.GetInt64("WordID")));
    Console.Write("'");

    if (wordIDs.Count == 0)
    {
        return wordIDs; // Empty collection
    }

    // Do insert-update into WordScore.
    StringBuilder sbWordScore = new StringBuilder();
    sbWordScore.Append("INSERT INTO WordScore (WordID, Score1h, Score4d) VALUES ");
    bool first = true;
    foreach (var wordCount in wc.GetWordCounts())
    {
        // Some UTF16 characters don't end up in wordIDs; those counts are skipped.
        long wordID;
        if (!wordIDs.TryGetValue(wordCount.Key, out wordID))
        {
            continue;
        }

        if (first)
        {
            first = false;
        }
        else
        {
            sbWordScore.Append(',');
        }

        // The same count is added to both the 1h and the 4d score column.
        sbWordScore.Append('(');
        sbWordScore.Append(wordID);
        sbWordScore.Append(',');
        sbWordScore.Append(wordCount.Value);
        sbWordScore.Append(',');
        sbWordScore.Append(wordCount.Value);
        sbWordScore.Append(')');
    }

    // An empty VALUES list is not valid SQL, so only run the statement when
    // at least one count matched a key in wordIDs.
    if (!first)
    {
        sbWordScore.Append(" ON DUPLICATE KEY UPDATE Score1h = Score1h + VALUES(Score1h), Score4d = Score4d + VALUES(Score4d);");
        Helpers.RunSqlStatement(Name, sbWordScore.ToString(), false);
    }
    Console.Write("'");

    return wordIDs;
}