Ejemplo n.º 1
0
        /// <summary>
        /// Makes sure every distinct word in <paramref name="wc"/> has a row in the Word table,
        /// reads back the WordID of each word, and adds this batch's counts to WordScore.
        /// A single quote is written to the console after each stage as a progress tick.
        /// </summary>
        /// <param name="wc">Supplies the distinct words (GetWords) and their counts (GetWordCounts).</param>
        /// <param name="stopwords">Not read by this method; kept so existing callers keep compiling.</param>
        /// <returns>
        /// Map from word text (as returned by the database) to its WordID. Empty when
        /// <paramref name="wc"/> yields no words or when none of them could be read back.
        /// </returns>
        Dictionary<string, long> InsertWords(WordCount wc, HashSet<string> stopwords)
        {
            //Get unique words
            IEnumerable<string> wordSource = wc.GetWords();
            if (wordSource == null)
                return new Dictionary<string, long>();

            //Materialize once: the sequence is consumed by both the INSERT and the SELECT below,
            //and a lazily evaluated source would otherwise be recomputed for each pass.
            List<string> uniqueWords = wordSource.ToList();
            if (uniqueWords.Count == 0)
                return new Dictionary<string, long>();

            //Insert new words into Word (insert all, skip if exists)
            MySqlCommand insertCommand = new MySqlCommand();

            StringBuilder insertSql = new StringBuilder();
            insertSql.Append("INSERT IGNORE INTO Word (Word) VALUES ");
            for (int i = 0; i < uniqueWords.Count; i++)
            {
                if (i > 0)
                    insertSql.Append(",");

                //One parameter per word so the word text never becomes part of the SQL itself
                string varName = "@w" + i;
                insertSql.Append("(");
                insertSql.Append(varName);
                insertSql.Append(")");
                insertCommand.Parameters.AddWithValue(varName, uniqueWords[i]);
            }
            insertSql.AppendLine(";");

            Console.Write("'");

            insertCommand.CommandText = insertSql.ToString();
            Helpers.RunSqlStatement(Name, insertCommand, false);

            Console.Write("'");

            //Get WordID for each word
            //Track keywords may be counted as stopwords if they have been stemmed before insert into WordTweet
            MySqlCommand getAllWordsCommand = new MySqlCommand();
            StringBuilder selectSql = new StringBuilder();
            selectSql.Append(
                @"SELECT
                    Word,
                    WordID
                FROM Word WHERE Word IN (");
            Helpers.AppendList(selectSql, getAllWordsCommand, uniqueWords.Cast<object>().ToArray());
            selectSql.Append(");");
            getAllWordsCommand.CommandText = selectSql.ToString();

            Dictionary<string, long> wordIDs = new Dictionary<string, long>();
            Helpers.RunSelect(Name, getAllWordsCommand, wordIDs,
                (values, reader) => values.Add(reader.GetString("Word"), reader.GetInt64("WordID") ));

            Console.Write("'");

            if (wordIDs.Count == 0)
                return wordIDs; //Empty collection

            //Do insert-update into WordScore
            StringBuilder sbWordScore = new StringBuilder();
            sbWordScore.Append("INSERT INTO WordScore (WordID, Score1h, Score4d) VALUES ");

            bool first = true;
            //Some UTF16 characters don't end up in wordIDs
            foreach (var wordCount in wc.GetWordCounts())
            {
                long wordID;
                if (!wordIDs.TryGetValue(wordCount.Key, out wordID))
                    continue;

                if (first)
                    first = false;
                else
                    sbWordScore.Append(',');

                //The same count seeds both the 1h and the 4d score columns
                sbWordScore.Append('(');
                sbWordScore.Append(wordID);
                sbWordScore.Append(',');
                sbWordScore.Append(wordCount.Value);
                sbWordScore.Append(',');
                sbWordScore.Append(wordCount.Value);
                sbWordScore.Append(')');
            }

            //If no counted word matched a resolved ID, the VALUES list is empty and the
            //statement would be invalid SQL, so only run it when at least one tuple was written.
            if (!first)
            {
                sbWordScore.Append(" ON DUPLICATE KEY UPDATE Score1h = Score1h + VALUES(Score1h), Score4d = Score4d + VALUES(Score4d);");

                Helpers.RunSqlStatement(Name, sbWordScore.ToString(), false);
            }

            Console.Write("'");

            return wordIDs;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Makes sure every distinct word in <paramref name="wc"/> has a row in the Word table,
        /// reads back the WordID of each word, adds this batch's counts to WordScore, and then
        /// loads the Idf and stop-word flag for each word that has a WordScore row.
        /// A single quote is written to the console after each of the first stages as a progress tick.
        /// </summary>
        /// <param name="wc">Supplies the distinct words (GetWords) and their counts (GetWordCounts).</param>
        /// <returns>
        /// Map from word text (as returned by the database) to a <c>Word</c> carrying its ID and,
        /// for words returned by the score query, Idf and IsStopWord. Empty when
        /// <paramref name="wc"/> yields no words or when none of them could be read back.
        /// </returns>
        Dictionary<string, Word> InsertWords(WordCount wc)
        {
            //Get unique words
            IEnumerable<string> wordSource = wc.GetWords();
            if (wordSource == null)
                return new Dictionary<string, Word>();

            //Materialize once: the sequence is consumed by both the INSERT and the SELECT below,
            //and a lazily evaluated source would otherwise be recomputed for each pass.
            List<string> uniqueWords = wordSource.ToList();
            if (uniqueWords.Count == 0)
                return new Dictionary<string, Word>();

            //Insert new words into Word (insert all, skip if exists)
            MySqlCommand insertCommand = new MySqlCommand();

            StringBuilder insertSql = new StringBuilder();
            insertSql.Append("INSERT IGNORE INTO Word (Word) VALUES ");
            for (int i = 0; i < uniqueWords.Count; i++)
            {
                if (i > 0)
                    insertSql.Append(",");

                //One parameter per word so the word text never becomes part of the SQL itself
                string varName = "@w" + i;
                insertSql.Append("(");
                insertSql.Append(varName);
                insertSql.Append(")");
                insertCommand.Parameters.AddWithValue(varName, uniqueWords[i]);
            }
            insertSql.AppendLine(";");

            Console.Write("'");

            insertCommand.CommandText = insertSql.ToString();
            Helpers.RunSqlStatement(Name, insertCommand, false);

            Console.Write("'");

            //Get WordID for each word
            //Track keywords may be counted as stopwords if they have been stemmed before insert into WordTweet
            MySqlCommand selectCommand = new MySqlCommand();
            StringBuilder selectSql = new StringBuilder();
            selectSql.Append(
                @"SELECT
                    Word,
                    Word.WordID
                FROM Word left join WordScore on WordScore.WordID = Word.WordID WHERE Word IN (");
            AppendList(selectSql, selectCommand, uniqueWords.Cast<object>().ToArray());
            selectSql.Append(");");
            selectCommand.CommandText = selectSql.ToString();

            Dictionary<string, Word> wordIDs = new Dictionary<string, Word>();
            Helpers.RunSelect(Name, selectCommand, wordIDs,
                (values, reader) => values.Add(reader.GetString("Word"), new Word()
                {
                    ID = reader.GetInt64("WordID")
                }));

            Console.Write("'");

            if (wordIDs.Count == 0)
                return wordIDs; //Empty collection

            //Do insert-update into WordScore
            StringBuilder sbWordScore = new StringBuilder();
            sbWordScore.Append("INSERT INTO WordScore (WordID, Score1h, Score4d) VALUES ");

            bool first = true;
            //Some UTF16 characters don't end up in wordIDs
            foreach (var wordCount in wc.GetWordCounts())
            {
                Word knownWord;
                if (!wordIDs.TryGetValue(wordCount.Key, out knownWord))
                    continue;

                if (first)
                    first = false;
                else
                    sbWordScore.Append(',');

                //The same count seeds both the 1h and the 4d score columns
                sbWordScore.Append('(');
                sbWordScore.Append(knownWord.ID);
                sbWordScore.Append(',');
                sbWordScore.Append(wordCount.Value);
                sbWordScore.Append(',');
                sbWordScore.Append(wordCount.Value);
                sbWordScore.Append(')');
            }

            //If no counted word matched a resolved ID, the VALUES list is empty and the
            //statement would be invalid SQL, so only run it when at least one tuple was written.
            if (!first)
            {
                sbWordScore.Append(" ON DUPLICATE KEY UPDATE Score1h = Score1h + VALUES(Score1h), Score4d = Score4d + VALUES(Score4d);");

                Helpers.RunSqlStatement(Name, sbWordScore.ToString(), false);
            }

            Console.Write("'");

            //Get wordscores and stopword status for words. Stopwords are never inserted into WordTweet.
            //Re-index the same Word instances by ID so the score query can update them in place.
            Dictionary<long, Word> wordIDsByID = new Dictionary<long, Word>();
            foreach (Word word in wordIDs.Values)
                wordIDsByID.Add(word.ID, word);

            //wordIDs is non-empty at this point, so the IN (...) list always has at least one ID.
            string getScoresSql =
                @"SELECT
                    WordID,
                    ScoreToIdf(Score4d) as Idf,
                    coalesce(
                        if(Word like '#%', 0, null),
                        (select 0 from TwitterTrackFilter where FilterType = 0 and Word = Word.Word and IsActive limit 1),
                        Score4d > (select value from Constants where name = 'WordScore4dHigh')
                    ) as IsStopWord
                FROM WordScore ws natural join Word WHERE WordID IN (" + string.Join(",", wordIDsByID.Keys.Select(n => n.ToString()).ToArray()) + ");";
            Helpers.RunSelect(Name, getScoresSql, wordIDsByID,
                (values, reader) =>
                {
                    Word word = values[reader.GetInt64("WordID")];
                    word.Idf = reader.GetDouble("Idf");
                    word.IsStopWord = reader.GetBoolean("IsStopword");
                });

            return wordIDs;
        }