Ejemplo n.º 1
0
        /// <summary>
        /// Inserts the words of <paramref name="wc"/> into the Word table (existing rows are skipped via
        /// INSERT IGNORE), adds each word's count to Score1h and Score4d in WordScore, and then loads the
        /// Idf and stopword flag for every word that was found.
        /// </summary>
        /// <param name="wc">Source of the unique words (GetWords) and their counts (GetWordCounts).</param>
        /// <returns>
        /// A dictionary keyed by the word text as returned by the database, holding a <c>Word</c> with
        /// ID, Idf and IsStopWord populated. Empty when there are no words or none could be read back.
        /// </returns>
        Dictionary <string, Word> InsertWords(WordCount wc)
        {
            //Get unique words. Materialize once: the sequence is used for the emptiness check,
            //the INSERT and the SELECT, and must be identical (and not recomputed) for all three.
            IEnumerable <string> wordSource  = wc.GetWords();
            List <string>        uniqueWords = wordSource == null ? new List <string>() : wordSource.ToList();

            if (uniqueWords.Count == 0)
            {
                return(new Dictionary <string, Word>());
            }

            //Insert new words into Word (insert all, skip if exists)
            MySqlCommand  insertCommand = new MySqlCommand();
            StringBuilder insertSql     = new StringBuilder();

            insertSql.Append("INSERT IGNORE INTO Word (Word) VALUES ");

            for (int i = 0; i < uniqueWords.Count; i++)
            {
                if (i > 0)
                {
                    insertSql.Append(",");
                }

                //One parameter per word (@w0, @w1, ...) so word text never ends up inside the SQL string
                string varName = "@w" + i;
                insertSql.Append("(");
                insertSql.Append(varName);
                insertSql.Append(")");
                insertCommand.Parameters.AddWithValue(varName, uniqueWords[i]);
            }
            insertSql.AppendLine(";");

            Console.Write("'");

            insertCommand.CommandText = insertSql.ToString();
            Helpers.RunSqlStatement(Name, insertCommand, false);

            Console.Write("'");

            //Get WordID for each word
            //Track keywords may be counted as stopwords if they have been stemmed before insert into WordTweet
            //NOTE(review): the left join on WordScore contributes no selected column here - confirm it is still needed.
            MySqlCommand  selectCommand = new MySqlCommand();
            StringBuilder selectSql     = new StringBuilder();

            selectSql.Append(
                @"SELECT
                    Word,
                    Word.WordID
                FROM Word left join WordScore on WordScore.WordID = Word.WordID WHERE Word IN (");
            AppendList(selectSql, selectCommand, uniqueWords.Cast <object>().ToArray());
            selectSql.Append(");");
            selectCommand.CommandText = selectSql.ToString();

            Dictionary <string, Word> wordIDs = new Dictionary <string, Word>();

            Helpers.RunSelect(Name, selectCommand, wordIDs,
                              (values, reader) => values.Add(reader.GetString("Word"), new Word()
            {
                ID = reader.GetInt64("WordID")
            }));

            Console.Write("'");

            if (wordIDs.Count == 0)
            {
                return(wordIDs); //Empty collection
            }

            //Do insert-update into WordScore
            StringBuilder sbWordScore = new StringBuilder();

            sbWordScore.Append("INSERT INTO WordScore (WordID, Score1h, Score4d) VALUES ");

            int scoreRowCount = 0;

            //Some UTF16 characters don't end up in wordIDs, so only words that were read back get a row
            foreach (var wordCount in wc.GetWordCounts())
            {
                Word knownWord;
                if (!wordIDs.TryGetValue(wordCount.Key, out knownWord))
                {
                    continue;
                }

                if (scoreRowCount > 0)
                {
                    sbWordScore.Append(',');
                }

                //The same count is added to both the 1h and the 4d score
                sbWordScore.Append('(');
                sbWordScore.Append(knownWord.ID);
                sbWordScore.Append(',');
                sbWordScore.Append(wordCount.Value);
                sbWordScore.Append(',');
                sbWordScore.Append(wordCount.Value);
                sbWordScore.Append(')');
                scoreRowCount++;
            }

            //Without at least one tuple the statement would be "VALUES  ON DUPLICATE KEY ..." which is
            //not valid SQL, so it is only executed when there is something to insert.
            if (scoreRowCount > 0)
            {
                sbWordScore.Append(" ON DUPLICATE KEY UPDATE Score1h = Score1h + VALUES(Score1h), Score4d = Score4d + VALUES(Score4d);");

                Helpers.RunSqlStatement(Name, sbWordScore.ToString(), false);
            }

            Console.Write("'");

            //Get wordscores and stopword status for words. Stopwords are never inserted into WordTweet.
            //Index the same Word instances by ID so the reader callback below updates the objects being returned.
            Dictionary <long, Word> wordIDsByID = new Dictionary <long, Word>();

            foreach (Word word in wordIDs.Values)
            {
                wordIDsByID.Add(word.ID, word);
            }

            //Only numeric IDs read from the database are concatenated into this statement.
            string getScoresSql =
                @"SELECT 
                    WordID,
                    ScoreToIdf(Score4d) as Idf,
                    coalesce(
                        if(Word like '#%', 0, null),
                        (select 0 from TwitterTrackFilter where FilterType = 0 and Word = Word.Word and IsActive limit 1),
                        Score4d > (select value from Constants where name = 'WordScore4dHigh')
                    ) as IsStopWord
                FROM WordScore ws natural join Word WHERE WordID IN (" + string.Join(",", wordIDsByID.Keys.Select(n => n.ToString()).ToArray()) + ");";

            //NOTE(review): the column is aliased "IsStopWord" but read as "IsStopword"; this relies on the
            //reader resolving column names case-insensitively - confirm.
            Helpers.RunSelect(Name, getScoresSql, wordIDsByID,
                              (values, reader) =>
            {
                Word word       = values[reader.GetInt64("WordID")];
                word.Idf        = reader.GetDouble("Idf");
                word.IsStopWord = reader.GetBoolean("IsStopword");
            });

            return(wordIDs);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Inserts the words of <paramref name="wc"/> into the Word table (existing rows are skipped via
        /// INSERT IGNORE), reads back their WordIDs, and adds each word's count to Score1h and Score4d
        /// in WordScore.
        /// </summary>
        /// <param name="wc">Source of the unique words (GetWords) and their counts (GetWordCounts).</param>
        /// <param name="stopwords">Not read by this method; kept so existing callers keep compiling.</param>
        /// <returns>
        /// A dictionary mapping the word text as returned by the database to its WordID.
        /// Empty when there are no words or none could be read back.
        /// </returns>
        Dictionary <string, long> InsertWords(WordCount wc, HashSet <string> stopwords)
        {
            //Get unique words. Materialize once: the sequence is used for the emptiness check,
            //the INSERT and the SELECT, and must be identical (and not recomputed) for all three.
            IEnumerable <string> wordSource  = wc.GetWords();
            List <string>        uniqueWords = wordSource == null ? new List <string>() : wordSource.ToList();

            if (uniqueWords.Count == 0)
            {
                return(new Dictionary <string, long>());
            }

            //Insert new words into Word (insert all, skip if exists)
            MySqlCommand  insertCommand = new MySqlCommand();
            StringBuilder insertSql     = new StringBuilder();

            insertSql.Append("INSERT IGNORE INTO Word (Word) VALUES ");

            for (int i = 0; i < uniqueWords.Count; i++)
            {
                if (i > 0)
                {
                    insertSql.Append(",");
                }

                //One parameter per word (@w0, @w1, ...) so word text never ends up inside the SQL string
                string varName = "@w" + i;
                insertSql.Append("(");
                insertSql.Append(varName);
                insertSql.Append(")");
                insertCommand.Parameters.AddWithValue(varName, uniqueWords[i]);
            }
            insertSql.AppendLine(";");

            Console.Write("'");

            insertCommand.CommandText = insertSql.ToString();
            Helpers.RunSqlStatement(Name, insertCommand, false);

            Console.Write("'");

            //Get WordID for each word
            //Track keywords may be counted as stopwords if they have been stemmed before insert into WordTweet
            MySqlCommand  getAllWordsCommand = new MySqlCommand();
            StringBuilder selectSql          = new StringBuilder();

            selectSql.Append(
                @"SELECT
                    Word,
                    WordID
                FROM Word WHERE Word IN (");
            Helpers.AppendList(selectSql, getAllWordsCommand, uniqueWords.Cast <object>().ToArray());
            selectSql.Append(");");
            getAllWordsCommand.CommandText = selectSql.ToString();

            Dictionary <string, long> wordIDs = new Dictionary <string, long>();

            Helpers.RunSelect(Name, getAllWordsCommand, wordIDs,
                              (values, reader) => values.Add(reader.GetString("Word"), reader.GetInt64("WordID")));

            Console.Write("'");

            if (wordIDs.Count == 0)
            {
                return(wordIDs); //Empty collection
            }

            //Do insert-update into WordScore
            StringBuilder sbWordScore = new StringBuilder();

            sbWordScore.Append("INSERT INTO WordScore (WordID, Score1h, Score4d) VALUES ");

            int scoreRowCount = 0;

            //Some UTF16 characters don't end up in wordIDs, so only words that were read back get a row
            foreach (var wordCount in wc.GetWordCounts())
            {
                long wordID;
                if (!wordIDs.TryGetValue(wordCount.Key, out wordID))
                {
                    continue;
                }

                if (scoreRowCount > 0)
                {
                    sbWordScore.Append(',');
                }

                //The same count is added to both the 1h and the 4d score
                sbWordScore.Append('(');
                sbWordScore.Append(wordID);
                sbWordScore.Append(',');
                sbWordScore.Append(wordCount.Value);
                sbWordScore.Append(',');
                sbWordScore.Append(wordCount.Value);
                sbWordScore.Append(')');
                scoreRowCount++;
            }

            //Without at least one tuple the statement would be "VALUES  ON DUPLICATE KEY ..." which is
            //not valid SQL, so it is only executed when there is something to insert.
            if (scoreRowCount > 0)
            {
                sbWordScore.Append(" ON DUPLICATE KEY UPDATE Score1h = Score1h + VALUES(Score1h), Score4d = Score4d + VALUES(Score4d);");

                Helpers.RunSqlStatement(Name, sbWordScore.ToString(), false);
            }

            Console.Write("'");

            return(wordIDs);
        }