/// <summary>
/// Records the given sentiment against every hashtag found in the tweet text,
/// creating and saving the hashtag entity first when it is not stored yet.
/// </summary>
/// <param name="tweetText">The tweet text the hashtags are extracted from.</param>
/// <param name="isRetweet">Forwarded to the usage-count update of each hashtag.</param>
/// <param name="sentiment">The sentiment forwarded to the usage-count update.</param>
/// <param name="timestamp">Stored as <c>FirstSeen</c> on newly created hashtags.</param>
/// <param name="context">The context used to look up and persist hashtags.</param>
public static void UpdateHashtagSentiment(string tweetText, bool isRetweet, Sentiment sentiment, DateTime timestamp, IOclumenContext context)
{
    // The cleaning chain used for regular tweet processing strips punctuation, which we
    // want to keep here, so only excess spaces are removed and the text is lower-cased.
    string normalizedText = new TextCleaner(tweetText).RemoveExcessSpaces().ToLower().ToString();
    IList<string> foundTags = TwitterTextUtility.GetHashtags(normalizedText);

    foreach (string tag in foundTags)
    {
        HashtagNgram tagEntity = context.Hashtags.FirstOrDefault(h => h.Text == tag);

        if (tagEntity == null)
        {
            // First time this hashtag is seen: store it before counting its usage.
            tagEntity = new HashtagNgram();
            tagEntity.Cardinality = 1;
            tagEntity.FirstSeen = timestamp;
            tagEntity.Text = tag;

            context.Hashtags.Add(tagEntity);
            context.SaveChanges();
        }

        UpdateNgramUsageCount(tagEntity, isRetweet, sentiment);
    }
}
/// <summary>
/// Classifies the text using the tweet or retweet corpus counts, as selected by
/// <paramref name="isRetweet"/>.
/// </summary>
/// <typeparam name="TEntity">The type of the ngram entity.</typeparam>
/// <param name="text">The text to classify.</param>
/// <param name="isRetweet">Forwarded to the class, ngram and vocabulary count lookups.</param>
/// <param name="ngramCardinality">The size of the ngrams generated from the text.</param>
/// <param name="smoothingFactor">The smoothing factor forwarded to the count lookups.</param>
/// <param name="isStemmed">if set to <c>true</c> the generated ngrams are stemmed first.</param>
/// <param name="dictionary">The dictionary passed to the stemmer.</param>
/// <param name="ngramDbSet">The ngram db set.</param>
/// <param name="oclumenContext">The oclumen context.</param>
/// <param name="ngramDictionary">Optional dictionary forwarded to the per-ngram count lookup.</param>
/// <returns>
/// The key of the last entry returned by <c>GetNgramSentimentProbabilities</c>.
/// NOTE(review): presumably that list is ordered so the most probable sentiment is last - confirm.
/// </returns>
public Sentiment GetTextSentiment<TEntity>(string text, bool isRetweet, int ngramCardinality, decimal smoothingFactor, bool isStemmed, IDictionary<String, String> dictionary, DbSet<TEntity> ngramDbSet, IOclumenContext oclumenContext, Dictionary<string, List<KeyValuePair<Sentiment, decimal>>> ngramDictionary = null) where TEntity : NgramBase
{
    string cleanedText = new TextCleaner(text).StripPunctuation().RemoveExcessSpaces().ToLower().ToString();

    IList<string> terms = NgramGenerator.GenerateNgrams(cleanedText, ngramCardinality);
    if (isStemmed)
    {
        terms = Processor.StemNgram(terms, dictionary);
    }

    var perTermCounts = new List<IList<KeyValuePair<Sentiment, decimal>>>(terms.Count);
    IList<KeyValuePair<Sentiment, decimal>> perClassCounts =
        GetClassCount(isRetweet, ngramCardinality, smoothingFactor, ngramDbSet, oclumenContext);

    // Collect the raw per-sentiment counts of every ngram.
    foreach (string term in terms)
    {
        IList<KeyValuePair<Sentiment, decimal>> termCounts =
            GetNgramCount(term, isRetweet, ngramCardinality, smoothingFactor, ngramDbSet, oclumenContext, ngramDictionary);
        perTermCounts.Add(termCounts);
    }

    int vocabularySize = GetVocabularySize(isRetweet, ngramCardinality, ngramDbSet, oclumenContext);

    // Combine the individual ngram probabilities with the probability of each sentiment class.
    var probabilities = GetNgramSentimentProbabilities(vocabularySize, perTermCounts, perClassCounts);
    return probabilities.Last().Key;
}
/// <summary>
/// Feeds one piece of tweet text into the corpus: updates the hashtag sentiment, then
/// cleans the text and updates the sentiment of its unigrams and bigrams, both plain
/// and stemmed.
/// </summary>
/// <param name="partialTweetText">The tweet text to process.</param>
/// <param name="isRetweet">Forwarded to every sentiment update.</param>
/// <param name="sentiment">The sentiment recorded for the hashtags and ngrams.</param>
/// <param name="timestamp">The timestamp forwarded to the hashtag update.</param>
/// <param name="dictionary">The dictionary passed to the stemmer.</param>
/// <param name="context">The context providing the ngram sets.</param>
public static void ProcessTweetTextForCorpus(string partialTweetText, bool isRetweet, Sentiment sentiment, DateTime timestamp, IDictionary<String, String> dictionary, IOclumenContext context)
{
    // Hashtags are handled first, on the text as received.
    UpdateHashtagSentiment(partialTweetText, isRetweet, sentiment, timestamp, context);

    // Strip punctuation, collapse whitespace and lower-case before generating ngrams.
    TextCleaner cleaner = new TextCleaner(partialTweetText);
    string normalized = cleaner.StripPunctuation().RemoveExcessSpaces().ToLower().ToString();

    IList<string> plainUnigrams = NgramGenerator.GenerateNgrams(normalized, 1);
    IList<string> plainBigrams = NgramGenerator.GenerateNgrams(normalized, 2);
    IList<string> stemmedUnigrams = StemNgram(plainUnigrams, dictionary);
    IList<string> stemmedBigrams = StemNgram(plainBigrams, dictionary);

    // Plain ngrams are recorded in BasicNgrams, stemmed ones in StemmedNgrams.
    UpdateNgramsSentiment(plainUnigrams, isRetweet, sentiment, false, context.BasicNgrams, context);
    UpdateNgramsSentiment(plainBigrams, isRetweet, sentiment, false, context.BasicNgrams, context);
    UpdateNgramsSentiment(stemmedUnigrams, isRetweet, sentiment, true, context.StemmedNgrams, context);
    UpdateNgramsSentiment(stemmedBigrams, isRetweet, sentiment, true, context.StemmedNgrams, context);
}
/// <summary>
/// Gets the text sentiment, deciding automatically whether the tweet or the retweet
/// corpus counts are used.
/// </summary>
/// <typeparam name="TEntity">The type of the ngram entity.</typeparam>
/// <param name="text">The text to classify.</param>
/// <param name="ngramCardinality">The ngram cardinality.</param>
/// <param name="smoothingFactor">The smoothing factor.</param>
/// <param name="isStemmed">if set to <c>true</c> the ngrams are stemmed.</param>
/// <param name="dictionary">The dictionary used by the stemmer.</param>
/// <param name="ngramDbSet">The ngram db set.</param>
/// <param name="oclumenContext">The oclumen context.</param>
/// <param name="ngramDictionary">Optional dictionary forwarded to the classifying overload.</param>
/// <returns>The sentiment returned by the overload that takes an explicit retweet flag.</returns>
public Sentiment GetTextSentiment<TEntity>(string text, int ngramCardinality, decimal smoothingFactor, bool isStemmed, IDictionary<String, String> dictionary, DbSet<TEntity> ngramDbSet, IOclumenContext oclumenContext, Dictionary<string, List<KeyValuePair<Sentiment, decimal>>> ngramDictionary = null) where TEntity : NgramBase
{
    String originalText;
    TwitterTextUtility.GetRetweets(text, out originalText);

    // The retweet counts are used exactly when no original text came back.
    // NOTE(review): this relies on GetRetweets' contract for originalText - confirm the
    // empty-string case really means "retweet" and is not inverted.
    bool isRetweet = originalText == String.Empty;

    return GetTextSentiment(text, isRetweet, ngramCardinality, smoothingFactor, isStemmed, dictionary, ngramDbSet, oclumenContext, ngramDictionary);
}
/// <summary>
/// Returns, for each sentiment class, the number of raw tweets carrying that corpus
/// sentiment whose corpus sentiment timestamp is not <c>DateTime.MinValue</c>.
/// </summary>
/// <typeparam name="TEntity">The type of the ngram entity.</typeparam>
/// <param name="isRetweet">Not used by the current implementation.</param>
/// <param name="ngramCardinality">Not used by the current implementation.</param>
/// <param name="smoothingFactor">Not used by the current implementation.</param>
/// <param name="ngramDbSet">Not used by the current implementation.</param>
/// <param name="oclumenContext">The context whose raw tweets are counted.</param>
/// <returns>Three pairs, in the order Positive, Neutral, Negative, each holding a raw count.</returns>
protected IList<KeyValuePair<Sentiment, decimal>> GetClassCount<TEntity>(bool isRetweet, int ngramCardinality, decimal smoothingFactor, DbSet<TEntity> ngramDbSet, IOclumenContext oclumenContext) where TEntity : NgramBase
{
    // One count query per sentiment class, issued in the order Positive, Neutral, Negative.
    decimal positiveTotal = oclumenContext.RawTweets.Count(
        t => t.CorpusSentiment == (int)Sentiment.Positive && t.CorpusSentimentTimestamp != DateTime.MinValue);
    decimal neutralTotal = oclumenContext.RawTweets.Count(
        t => t.CorpusSentiment == (int)Sentiment.Neutral && t.CorpusSentimentTimestamp != DateTime.MinValue);
    decimal negativeTotal = oclumenContext.RawTweets.Count(
        t => t.CorpusSentiment == (int)Sentiment.Negative && t.CorpusSentimentTimestamp != DateTime.MinValue);

    return new List<KeyValuePair<Sentiment, decimal>>(3)
    {
        new KeyValuePair<Sentiment, decimal>(Sentiment.Positive, positiveTotal),
        new KeyValuePair<Sentiment, decimal>(Sentiment.Neutral, neutralTotal),
        new KeyValuePair<Sentiment, decimal>(Sentiment.Negative, negativeTotal)
    };
}
/// <summary>
/// Updates the usage count of every given ngram for the given sentiment, creating and
/// saving the ngram entity first when it is not in the db set yet.
/// </summary>
/// <typeparam name="TEntity">The type of the ngram entity stored in the db set.</typeparam>
/// <param name="ngrams">The ngrams to record.</param>
/// <param name="isRetweet">Forwarded to the usage-count update.</param>
/// <param name="sentiment">The sentiment forwarded to the usage-count update.</param>
/// <param name="isStemmed">
/// if set to <c>true</c> new entities are created as <c>StemmedNgram</c>, otherwise as
/// <c>BasicNgram</c>; the created entity is cast to <typeparamref name="TEntity"/>.
/// </param>
/// <param name="ngramDbSet">The db set searched for, and extended with, ngram entities.</param>
/// <param name="context">The context used to save newly created entities.</param>
public static void UpdateNgramsSentiment<TEntity>(IList<String> ngrams, bool isRetweet, Sentiment sentiment, bool isStemmed, DbSet<TEntity> ngramDbSet, IOclumenContext context) where TEntity : NgramBase
{
    foreach (String ngramText in ngrams)
    {
        NgramBase entity = ngramDbSet.FirstOrDefault(n => n.Text == ngramText);

        if (entity == null)
        {
            // Not stored yet: build the matching entity kind, add it and save right away.
            NgramBase created = isStemmed ? (NgramBase)new StemmedNgram() : new BasicNgram();
            created.Text = ngramText;
            created.Cardinality = GetNgramCardinality(ngramText);

            ngramDbSet.Add((TEntity)created);
            entity = created;
            context.SaveChanges();
        }

        // Record this occurrence on the (possibly just created) entity.
        UpdateNgramUsageCount(entity, isRetweet, sentiment);
    }
}
/// <summary>
/// Counts the ngrams of the given cardinality that have at least one non-zero sentiment
/// count, looking at the retweet counts or the regular counts depending on
/// <paramref name="isRetweet"/>.
/// </summary>
/// <typeparam name="TEntity">The type of the ngram entity.</typeparam>
/// <param name="isRetweet">if set to <c>true</c> the retweet counts are inspected.</param>
/// <param name="ngramCardinality">The ngram cardinality to restrict the count to.</param>
/// <param name="ngramDbSet">The ngram db set that is counted.</param>
/// <param name="oclumenContext">Not used by the current implementation.</param>
/// <returns>The number of matching ngrams.</returns>
protected int GetVocabularySize<TEntity>(bool isRetweet, int ngramCardinality, DbSet<TEntity> ngramDbSet, IOclumenContext oclumenContext) where TEntity : NgramBase
{
    IQueryable<TEntity> sameCardinality = ngramDbSet.Where(n => n.Cardinality == ngramCardinality);

    if (isRetweet)
    {
        return sameCardinality.Count(n => n.RtPositiveCount != 0 || n.RtNeutralCount != 0 || n.RtNegativeCount != 0);
    }

    return sameCardinality.Count(n => n.PositiveCount != 0 || n.NeutralCount != 0 || n.NegativeCount != 0);
}
/// <summary>
/// Returns the number of times that an ngram was seen in a positive, neutral and negative
/// context, each increased by the smoothing factor.
/// </summary>
/// <typeparam name="TEntity">The type of the ngram entity.</typeparam>
/// <param name="text">The ngram text.</param>
/// <param name="isRetweet">
/// if set to <c>true</c> the retweet counts are used; with a dictionary, the key gets
/// the suffix <c>_rt</c>.
/// </param>
/// <param name="ngramCardinality">The ngram cardinality used for the database lookup.</param>
/// <param name="smoothingFactor">
/// The smoothing factor added to every database count, and returned as every count
/// when the dictionary has no entry for the ngram.
/// </param>
/// <param name="ngramDbSet">The ngram db set queried when no dictionary is supplied.</param>
/// <param name="oclumenContext">Not used by the current implementation.</param>
/// <param name="ngramDictionary">
/// Optional lookup table. When supplied, the database is not queried; a stored entry is
/// returned as-is (the same list instance, without adding the smoothing factor here).
/// </param>
/// <returns>Three pairs, in the order Positive, Neutral, Negative.</returns>
protected IList<KeyValuePair<Sentiment, decimal>> GetNgramCount<TEntity>(string text, bool isRetweet, int ngramCardinality, decimal smoothingFactor, DbSet<TEntity> ngramDbSet, IOclumenContext oclumenContext, Dictionary<string, List<KeyValuePair<Sentiment, decimal>>> ngramDictionary = null) where TEntity : NgramBase
{
    // When a dictionary is supplied it fully replaces the database lookup.
    if (ngramDictionary != null)
    {
        string key = text.ToLower();
        if (isRetweet)
        {
            key = key + "_rt";
        }

        // Single lookup instead of ContainsKey followed by the indexer.
        List<KeyValuePair<Sentiment, decimal>> cachedCounts;
        if (ngramDictionary.TryGetValue(key, out cachedCounts))
        {
            return cachedCounts;
        }

        // Unknown ngram: every class gets just the smoothing factor.
        return new List<KeyValuePair<Sentiment, decimal>>(3)
        {
            new KeyValuePair<Sentiment, decimal>(Sentiment.Positive, smoothingFactor),
            new KeyValuePair<Sentiment, decimal>(Sentiment.Neutral, smoothingFactor),
            new KeyValuePair<Sentiment, decimal>(Sentiment.Negative, smoothingFactor)
        };
    }

    decimal positiveCount = 0, neutralCount = 0, negativeCount = 0;

    TEntity ngramRecord = ngramDbSet.FirstOrDefault(x => x.Text == text && x.Cardinality == ngramCardinality);
    if (ngramRecord != null)
    {
        if (isRetweet)
        {
            positiveCount = ngramRecord.RtPositiveCount;
            neutralCount = ngramRecord.RtNeutralCount;
            negativeCount = ngramRecord.RtNegativeCount;
        }
        else
        {
            positiveCount = ngramRecord.PositiveCount;
            neutralCount = ngramRecord.NeutralCount;
            negativeCount = ngramRecord.NegativeCount;
        }
    }

    // Smoothing is applied whether or not the ngram was found, so no count stays at zero
    // unless the smoothing factor itself is zero.
    positiveCount += smoothingFactor;
    neutralCount += smoothingFactor;
    negativeCount += smoothingFactor;

    return new List<KeyValuePair<Sentiment, decimal>>(3)
    {
        new KeyValuePair<Sentiment, decimal>(Sentiment.Positive, positiveCount),
        new KeyValuePair<Sentiment, decimal>(Sentiment.Neutral, neutralCount),
        new KeyValuePair<Sentiment, decimal>(Sentiment.Negative, negativeCount)
    };
}