/// <summary>
        ///     Records the sentiment of a tweet against every hashtag found in its text, creating the
        ///     hashtag entry first when it is not stored yet.
        /// </summary>
        /// <param name="tweetText">Raw tweet text the hashtags are extracted from.</param>
        /// <param name="isRetweet">
        ///     Forwarded unchanged to <c>UpdateNgramUsageCount</c> for each hashtag.
        /// </param>
        /// <param name="sentiment">The sentiment forwarded to <c>UpdateNgramUsageCount</c>.</param>
        /// <param name="timestamp">Stored as <c>FirstSeen</c> on newly created hashtag entries.</param>
        /// <param name="context">Data context whose <c>Hashtags</c> set is queried and extended.</param>
        public static void UpdateHashtagSentiment(string tweetText, bool isRetweet, Sentiment sentiment,
                                                  DateTime timestamp, IOclumenContext context)
        {
            // Only whitespace and casing are normalised here: the cleaner used for tweet processing
            // strips punctuation, which has to be kept for hashtag extraction.
            string normalisedText = new TextCleaner(tweetText).RemoveExcessSpaces().ToLower().ToString();
            IList<string> foundTags = TwitterTextUtility.GetHashtags(normalisedText);

            foreach (string tag in foundTags)
            {
                HashtagNgram entry = context.Hashtags.FirstOrDefault(x => x.Text == tag);

                if (entry != null)
                {
                    // already known: just bump its usage counters
                    UpdateNgramUsageCount(entry, isRetweet, sentiment);
                    continue;
                }

                // first sighting of this hashtag: persist it before counting the usage
                entry = new HashtagNgram
                {
                    Cardinality = 1,
                    FirstSeen   = timestamp,
                    Text        = tag
                };

                context.Hashtags.Add(entry);
                context.SaveChanges();

                UpdateNgramUsageCount(entry, isRetweet, sentiment);
            }
        }
Exemple #2
0
        /// <summary>
        ///     Classifies a piece of text: cleans it, splits it into ngrams (optionally stemmed), gathers
        ///     per-ngram and per-class counts and returns the sentiment selected from the combined
        ///     probabilities.
        /// </summary>
        /// <typeparam name="TEntity">The ngram entity type stored in <paramref name="ngramDbSet" />.</typeparam>
        /// <param name="text">The text to classify.</param>
        /// <param name="isRetweet">Forwarded to the count helpers to select which counts are read.</param>
        /// <param name="ngramCardinality">Number of words per generated ngram.</param>
        /// <param name="smoothingFactor">Smoothing value forwarded to the count helpers.</param>
        /// <param name="isStemmed">When <c>true</c> the ngrams are passed through <c>Processor.StemNgram</c>.</param>
        /// <param name="dictionary">Dictionary handed to the stemmer.</param>
        /// <param name="ngramDbSet">The ngram db set the counts are read from.</param>
        /// <param name="oclumenContext">The data context.</param>
        /// <param name="ngramDictionary">Optional lookup forwarded to <c>GetNgramCount</c>.</param>
        /// <returns>
        ///     The key of the last entry produced by <c>GetNgramSentimentProbabilities</c>
        ///     (presumably the most probable class; the ordering is defined by that helper).
        /// </returns>
        public Sentiment GetTextSentiment <TEntity>(string text, bool isRetweet, int ngramCardinality,
                                                    decimal smoothingFactor, bool isStemmed, IDictionary <String, String> dictionary, DbSet <TEntity> ngramDbSet,
                                                    IOclumenContext oclumenContext, Dictionary <string, List <KeyValuePair <Sentiment, decimal> > > ngramDictionary = null) where TEntity : NgramBase
        {
            // same normalisation as used elsewhere: no punctuation, single spaces, lower case
            string cleanedText = new TextCleaner(text).StripPunctuation().RemoveExcessSpaces().ToLower().ToString();

            IList<string> tokens = NgramGenerator.GenerateNgrams(cleanedText, ngramCardinality);
            if (isStemmed)
            {
                tokens = Processor.StemNgram(tokens, dictionary);
            }

            // counts per sentiment class
            IList<KeyValuePair<Sentiment, decimal>> perClassCounts =
                GetClassCount(isRetweet, ngramCardinality, smoothingFactor, ngramDbSet, oclumenContext);

            // raw counts for each ngram, materialised immediately and in ngram order
            List<IList<KeyValuePair<Sentiment, decimal>>> perNgramCounts = tokens
                .Select(token => GetNgramCount(token, isRetweet, ngramCardinality, smoothingFactor,
                                               ngramDbSet, oclumenContext, ngramDictionary))
                .ToList();

            int vocabularySize = GetVocabularySize(isRetweet, ngramCardinality, ngramDbSet, oclumenContext);

            // combine the individual ngram probabilities with the probability of each sentiment class
            var classProbabilities = GetNgramSentimentProbabilities(vocabularySize, perNgramCounts, perClassCounts);

            return classProbabilities.Last().Key;
        }
        /// <summary>
        ///     Feeds one piece of tweet text into the corpus: updates hashtag sentiment, then the plain
        ///     and stemmed unigram/bigram sentiment counts.
        /// </summary>
        /// <param name="partialTweetText">The tweet text (or part of it) to process.</param>
        /// <param name="isRetweet">
        ///     Forwarded unchanged to the hashtag and ngram update helpers.
        /// </param>
        /// <param name="sentiment">The sentiment to record for every hashtag and ngram.</param>
        /// <param name="timestamp">Forwarded to <c>UpdateHashtagSentiment</c>.</param>
        /// <param name="dictionary">The dictionary handed to <c>StemNgram</c>.</param>
        /// <param name="context">The data context providing the ngram sets.</param>
        public static void ProcessTweetTextForCorpus(string partialTweetText,
                                                     bool isRetweet,
                                                     Sentiment sentiment,
                                                     DateTime timestamp,
                                                     IDictionary <String, String> dictionary,
                                                     IOclumenContext context)
        {
            // hashtags are handled on the uncleaned text, before punctuation is stripped
            UpdateHashtagSentiment(partialTweetText, isRetweet, sentiment, timestamp, context);

            // normalise: no punctuation, single spaces, lower case
            TextCleaner cleaner = new TextCleaner(partialTweetText);
            string normalisedText = cleaner.StripPunctuation().RemoveExcessSpaces().ToLower().ToString();

            IList<string> plainUnigrams = NgramGenerator.GenerateNgrams(normalisedText, 1);
            IList<string> plainBigrams  = NgramGenerator.GenerateNgrams(normalisedText, 2);

            IList<string> stemmedUnigrams = StemNgram(plainUnigrams, dictionary);
            IList<string> stemmedBigrams  = StemNgram(plainBigrams, dictionary);

            // unstemmed ngrams go to BasicNgrams ...
            UpdateNgramsSentiment(plainUnigrams, isRetweet, sentiment, false, context.BasicNgrams, context);
            UpdateNgramsSentiment(plainBigrams, isRetweet, sentiment, false, context.BasicNgrams, context);

            // ... stemmed ones to StemmedNgrams
            UpdateNgramsSentiment(stemmedUnigrams, isRetweet, sentiment, true, context.StemmedNgrams, context);
            UpdateNgramsSentiment(stemmedBigrams, isRetweet, sentiment, true, context.StemmedNgrams, context);
        }
Exemple #4
0
        /// <summary>
        /// Gets the text sentiment, choosing between the tweet and retweet corpus counts based on
        /// what <c>TwitterTextUtility.GetRetweets</c> reports for the text.
        /// </summary>
        /// <typeparam name="TEntity">The type of the ngram entity.</typeparam>
        /// <param name="text">The text to classify.</param>
        /// <param name="ngramCardinality">The ngram cardinality.</param>
        /// <param name="smoothingFactor">The smoothing factor.</param>
        /// <param name="isStemmed">if set to <c>true</c> the ngrams are stemmed.</param>
        /// <param name="dictionary">The dictionary used by the stemmer.</param>
        /// <param name="ngramDbSet">The ngram db set.</param>
        /// <param name="oclumenContext">The oclumen context.</param>
        /// <param name="ngramDictionary">Optional lookup forwarded to the full overload.</param>
        /// <returns>The sentiment returned by the overload that takes an explicit retweet flag.</returns>
        public Sentiment GetTextSentiment <TEntity>(string text, int ngramCardinality, decimal smoothingFactor, bool isStemmed, IDictionary <String, String> dictionary,
                                                    DbSet <TEntity> ngramDbSet, IOclumenContext oclumenContext, Dictionary <string, List <KeyValuePair <Sentiment, decimal> > > ngramDictionary = null) where TEntity : NgramBase
        {
            String nonRetweetText;
            TwitterTextUtility.GetRetweets(text, out nonRetweetText);

            // An empty "original text" output selects the retweet counts; anything else
            // (including null) selects the regular tweet counts.
            bool useRetweetCounts = nonRetweetText == String.Empty;

            return GetTextSentiment(text, useRetweetCounts, ngramCardinality, smoothingFactor, isStemmed,
                                    dictionary, ngramDbSet, oclumenContext, ngramDictionary);
        }
Exemple #5
0
        /// <summary>
        ///     Counts the raw tweets recorded for each sentiment class and returns the three totals
        ///     in the order Positive, Neutral, Negative.
        /// </summary>
        /// <remarks>
        ///     Only <paramref name="oclumenContext" /> is read by this method; the remaining parameters
        ///     are not used in the body.
        /// </remarks>
        /// <typeparam name="TEntity">The type of the ngram entity.</typeparam>
        /// <param name="isRetweet">Not used by this method.</param>
        /// <param name="ngramCardinality">Not used by this method.</param>
        /// <param name="smoothingFactor">Not used by this method.</param>
        /// <param name="ngramDbSet">Not used by this method.</param>
        /// <param name="oclumenContext">The context whose <c>RawTweets</c> are counted.</param>
        /// <returns>One key/value pair per sentiment class holding the number of matching tweets.</returns>
        protected IList <KeyValuePair <Sentiment, decimal> > GetClassCount <TEntity>(bool isRetweet,
                                                                                     int ngramCardinality,
                                                                                     decimal smoothingFactor,
                                                                                     DbSet <TEntity> ngramDbSet,
                                                                                     IOclumenContext oclumenContext)
            where TEntity : NgramBase
        {
            // A tweet only counts when its corpus sentiment timestamp differs from DateTime.MinValue.
            decimal positiveTotal = oclumenContext.RawTweets.Count(
                x => x.CorpusSentiment == (int)Sentiment.Positive && x.CorpusSentimentTimestamp != DateTime.MinValue);
            decimal neutralTotal = oclumenContext.RawTweets.Count(
                x => x.CorpusSentiment == (int)Sentiment.Neutral && x.CorpusSentimentTimestamp != DateTime.MinValue);
            decimal negativeTotal = oclumenContext.RawTweets.Count(
                x => x.CorpusSentiment == (int)Sentiment.Negative && x.CorpusSentimentTimestamp != DateTime.MinValue);

            return new List<KeyValuePair<Sentiment, decimal>>(3)
            {
                new KeyValuePair<Sentiment, decimal>(Sentiment.Positive, positiveTotal),
                new KeyValuePair<Sentiment, decimal>(Sentiment.Neutral, neutralTotal),
                new KeyValuePair<Sentiment, decimal>(Sentiment.Negative, negativeTotal)
            };
        }
        /// <summary>
        ///     Updates the usage counters of every ngram in the list, first creating and saving an
        ///     entity for any ngram that is not in the given set yet.
        /// </summary>
        /// <typeparam name="TEntity">The ngram entity type stored in <paramref name="ngramDbSet" />.</typeparam>
        /// <param name="ngrams">The ngrams to record.</param>
        /// <param name="isRetweet">
        ///     Forwarded unchanged to <c>UpdateNgramUsageCount</c>.
        /// </param>
        /// <param name="sentiment">The sentiment forwarded to <c>UpdateNgramUsageCount</c>.</param>
        /// <param name="isStemmed">
        ///     Selects the entity created for unknown ngrams: <c>StemmedNgram</c> when <c>true</c>,
        ///     <c>BasicNgram</c> otherwise. The new entity is cast to <typeparamref name="TEntity" />.
        /// </param>
        /// <param name="ngramDbSet">The set that is searched and extended.</param>
        /// <param name="context">The context used to save newly added ngrams.</param>
        public static void UpdateNgramsSentiment <TEntity>(IList <String> ngrams, bool isRetweet, Sentiment sentiment,
                                                           bool isStemmed, DbSet <TEntity> ngramDbSet,
                                                           IOclumenContext context) where TEntity : NgramBase
        {
            foreach (String ngramText in ngrams)
            {
                NgramBase stored = ngramDbSet.FirstOrDefault(x => x.Text == ngramText);

                if (stored != null)
                {
                    // known ngram: only the usage count changes
                    UpdateNgramUsageCount(stored, isRetweet, sentiment);
                    continue;
                }

                // unknown ngram: build the matching entity type, persist it, then count the usage
                NgramBase created = isStemmed
                    ? (NgramBase)new StemmedNgram { Text = ngramText, Cardinality = GetNgramCardinality(ngramText) }
                    : new BasicNgram { Text = ngramText, Cardinality = GetNgramCardinality(ngramText) };

                ngramDbSet.Add((TEntity)created);
                context.SaveChanges();

                UpdateNgramUsageCount(created, isRetweet, sentiment);
            }
        }
Exemple #7
0
 /// <summary>
 /// Counts the ngrams of the requested cardinality that have at least one non-zero sentiment
 /// counter, looking at the retweet counters or the regular counters depending on the flag.
 /// </summary>
 /// <typeparam name="TEntity">The type of the ngram entity.</typeparam>
 /// <param name="isRetweet">if set to <c>true</c> the <c>Rt*</c> counters are inspected.</param>
 /// <param name="ngramCardinality">The ngram cardinality to match.</param>
 /// <param name="ngramDbSet">The ngram db set that is counted.</param>
 /// <param name="oclumenContext">The oclumen context (not used by this method).</param>
 /// <returns>The number of matching ngram records.</returns>
 protected int GetVocabularySize <TEntity>(bool isRetweet, int ngramCardinality, DbSet <TEntity> ngramDbSet, IOclumenContext oclumenContext)
     where TEntity : NgramBase
 {
     return isRetweet
         ? ngramDbSet.Count(x => x.Cardinality == ngramCardinality &&
                                 (x.RtPositiveCount != 0 || x.RtNeutralCount != 0 || x.RtNegativeCount != 0))
         : ngramDbSet.Count(x => x.Cardinality == ngramCardinality &&
                                 (x.PositiveCount != 0 || x.NeutralCount != 0 || x.NegativeCount != 0));
 }
Exemple #8
0
        /// <summary>
        ///     Returns how often an ngram was seen in a positive, neutral and negative context, in that
        ///     order, either from the supplied in-memory dictionary or from the database set.
        /// </summary>
        /// <typeparam name="TEntity">The type of the ngram entity.</typeparam>
        /// <param name="text">The ngram text.</param>
        /// <param name="isRetweet">
        ///     if set to <c>true</c> the retweet counters (or the <c>_rt</c>-suffixed dictionary key) are used.
        /// </param>
        /// <param name="ngramCardinality">The ngram cardinality matched in the database lookup.</param>
        /// <param name="smoothingFactor">
        ///     Added to every database count; also the value returned for each class when the
        ///     dictionary has no entry for the ngram.
        /// </param>
        /// <param name="ngramDbSet">The ngram db set, queried only when no dictionary is given.</param>
        /// <param name="oclumenContext">The oclumen context (not used by this method).</param>
        /// <param name="ngramDictionary">
        ///     Optional lookup keyed by lower-cased ngram text, with <c>_rt</c> appended for retweets.
        ///     When supplied, the database is not queried and a found entry is returned as stored.
        /// </param>
        /// <returns>Three key/value pairs: Positive, Neutral and Negative counts.</returns>
        protected IList <KeyValuePair <Sentiment, decimal> > GetNgramCount <TEntity>(string text, bool isRetweet,
                                                                                     int ngramCardinality,
                                                                                     decimal smoothingFactor,
                                                                                     DbSet <TEntity> ngramDbSet,
                                                                                     IOclumenContext oclumenContext, Dictionary <string, List <KeyValuePair <Sentiment, decimal> > > ngramDictionary = null)
            where TEntity : NgramBase
        {
            // dictionary mode: answer purely from the in-memory lookup
            if (ngramDictionary != null)
            {
                string lookupKey = text.ToLower();
                if (isRetweet)
                {
                    lookupKey = lookupKey + "_rt";
                }

                List<KeyValuePair<Sentiment, decimal>> cachedCounts;
                if (ngramDictionary.TryGetValue(lookupKey, out cachedCounts))
                {
                    return cachedCounts;
                }

                // unseen ngram: every class gets just the smoothing factor
                return new List<KeyValuePair<Sentiment, decimal>>(3)
                {
                    new KeyValuePair<Sentiment, decimal>(Sentiment.Positive, smoothingFactor),
                    new KeyValuePair<Sentiment, decimal>(Sentiment.Neutral, smoothingFactor),
                    new KeyValuePair<Sentiment, decimal>(Sentiment.Negative, smoothingFactor)
                };
            }

            // database mode: counts stay at zero when the ngram has no record
            decimal rawPositive = 0, rawNeutral = 0, rawNegative = 0;

            TEntity storedNgram = ngramDbSet.FirstOrDefault(x => x.Text == text && x.Cardinality == ngramCardinality);
            if (storedNgram != null)
            {
                rawPositive = isRetweet ? storedNgram.RtPositiveCount : storedNgram.PositiveCount;
                rawNeutral  = isRetweet ? storedNgram.RtNeutralCount : storedNgram.NeutralCount;
                rawNegative = isRetweet ? storedNgram.RtNegativeCount : storedNgram.NegativeCount;
            }

            return new List<KeyValuePair<Sentiment, decimal>>(3)
            {
                new KeyValuePair<Sentiment, decimal>(Sentiment.Positive, rawPositive + smoothingFactor),
                new KeyValuePair<Sentiment, decimal>(Sentiment.Neutral, rawNeutral + smoothingFactor),
                new KeyValuePair<Sentiment, decimal>(Sentiment.Negative, rawNegative + smoothingFactor)
            };
        }