/// <summary>
/// Feeds one tweet into the corpus: records its hashtag sentiment, normalises
/// the text, derives unigrams and bigrams (raw and stemmed) and records the
/// sentiment for each resulting ngram list.
/// </summary>
/// <param name="partialTweetText">The tweet text to process.</param>
/// <param name="isRetweet">
/// if set to <c>true</c> the tweet is treated as a retweet; forwarded unchanged
/// to the hashtag and ngram updates.
/// </param>
/// <param name="sentiment">The sentiment to record for the hashtags and ngrams.</param>
/// <param name="timestamp">
/// Only forwarded to the hashtag update. The original documentation describes
/// it as the current time in UTC.
/// </param>
/// <param name="dictionary">The dictionary handed to <c>StemNgram</c>.</param>
/// <param name="context">
/// The context whose <c>BasicNgrams</c> and <c>StemmedNgrams</c> sets receive the updates.
/// </param>
public static void ProcessTweetTextForCorpus(string partialTweetText, bool isRetweet, Sentiment sentiment, DateTime timestamp, IDictionary<String, String> dictionary, IOclumenContext context)
{
    // Hashtags are taken from the original text, before any cleaning happens.
    UpdateHashtagSentiment(partialTweetText, isRetweet, sentiment, timestamp, context);

    // Normalise: strip punctuation, collapse extra spaces, lower-case.
    var cleaner = new TextCleaner(partialTweetText);
    string normalizedText = cleaner.StripPunctuation().RemoveExcessSpaces().ToLower().ToString();

    // Raw ngrams first, then their stemmed counterparts.
    IList<string> rawUnigrams = NgramGenerator.GenerateNgrams(normalizedText, 1);
    IList<string> rawBigrams = NgramGenerator.GenerateNgrams(normalizedText, 2);
    IList<string> stemmedUnigrams = StemNgram(rawUnigrams, dictionary);
    IList<string> stemmedBigrams = StemNgram(rawBigrams, dictionary);

    // Unstemmed ngrams go to BasicNgrams.
    // NOTE(review): the fourth argument looks like an "is stemmed" flag - confirm
    // against UpdateNgramsSentiment.
    UpdateNgramsSentiment(rawUnigrams, isRetweet, sentiment, false, context.BasicNgrams, context);
    UpdateNgramsSentiment(rawBigrams, isRetweet, sentiment, false, context.BasicNgrams, context);

    // Stemmed ngrams go to StemmedNgrams.
    UpdateNgramsSentiment(stemmedUnigrams, isRetweet, sentiment, true, context.StemmedNgrams, context);
    UpdateNgramsSentiment(stemmedBigrams, isRetweet, sentiment, true, context.StemmedNgrams, context);
}
/// <summary>
/// Classifies a piece of text: normalises it, splits it into ngrams of the
/// requested cardinality (optionally stemmed), gathers class counts, per-ngram
/// counts and the vocabulary size, and returns the sentiment key of the last
/// entry produced by <c>GetNgramSentimentProbabilities</c>.
/// </summary>
/// <typeparam name="TEntity">The ngram entity type stored in <paramref name="ngramDbSet"/>.</typeparam>
/// <param name="text">The text to classify.</param>
/// <param name="isRetweet">Forwarded to the count and vocabulary lookups.</param>
/// <param name="ngramCardinality">The ngram size passed to the generator and the lookups.</param>
/// <param name="smoothingFactor">Forwarded to the class-count and ngram-count lookups.</param>
/// <param name="isStemmed">When <c>true</c>, the ngrams are stemmed with <c>Processor.StemNgram</c> before counting.</param>
/// <param name="dictionary">The dictionary handed to the stemmer; only used when <paramref name="isStemmed"/> is <c>true</c>.</param>
/// <param name="ngramDbSet">The set the counts are read from.</param>
/// <param name="oclumenContext">The context forwarded to the lookups.</param>
/// <param name="ngramDictionary">
/// Optional; forwarded to <c>GetNgramCount</c>. Presumably an in-memory table of
/// precomputed counts - confirm against <c>GetNgramCount</c>.
/// </param>
/// <returns>The <c>Key</c> of the last element returned by <c>GetNgramSentimentProbabilities</c>.</returns>
public Sentiment GetTextSentiment<TEntity>(string text, bool isRetweet, int ngramCardinality, decimal smoothingFactor, bool isStemmed, IDictionary<String, String> dictionary, DbSet<TEntity> ngramDbSet, IOclumenContext oclumenContext, Dictionary<string, List<KeyValuePair<Sentiment, decimal>>> ngramDictionary = null) where TEntity : NgramBase
{
    // Normalise: strip punctuation, collapse extra spaces, lower-case.
    string normalizedText = new TextCleaner(text)
        .StripPunctuation()
        .RemoveExcessSpaces()
        .ToLower()
        .ToString();

    IList<string> tokens = NgramGenerator.GenerateNgrams(normalizedText, ngramCardinality);
    if (isStemmed)
    {
        tokens = Processor.StemNgram(tokens, dictionary);
    }

    // Per-class totals are fetched before the per-ngram counts.
    IList<KeyValuePair<Sentiment, decimal>> countsPerClass =
        GetClassCount(isRetweet, ngramCardinality, smoothingFactor, ngramDbSet, oclumenContext);

    // One list of raw per-sentiment counts for every ngram, in ngram order.
    var countsPerNgram = new List<IList<KeyValuePair<Sentiment, decimal>>>(tokens.Count);
    foreach (string token in tokens)
    {
        IList<KeyValuePair<Sentiment, decimal>> tokenCounts =
            GetNgramCount(token, isRetweet, ngramCardinality, smoothingFactor, ngramDbSet, oclumenContext, ngramDictionary);
        countsPerNgram.Add(tokenCounts);
    }

    int vocabularySize = GetVocabularySize(isRetweet, ngramCardinality, ngramDbSet, oclumenContext);

    // Combine the individual ngram probabilities with the probability of each
    // sentiment class.
    var probabilities = GetNgramSentimentProbabilities(vocabularySize, countsPerNgram, countsPerClass);

    // NOTE(review): taking the last entry presumably relies on the result being
    // ordered by ascending probability - confirm against
    // GetNgramSentimentProbabilities.
    var lastEntry = probabilities.Last();
    return lastEntry.Key;
}
/// <summary>
/// Bigrams that repeat in the input must appear only once in the result.
/// </summary>
public void NgramGeneratorTextDuplicateIgnore()
{
    var bigrams = NgramGenerator.GenerateNgrams("hello world hello world", 2);

    // "hello world" occurs twice in the input but is expected only once.
    foreach (string expected in new[] { "hello world", "world hello" })
    {
        Assert.IsTrue(bigrams.Contains(expected));
    }

    Assert.IsTrue(bigrams.Count == 2);
}
/// <summary>
/// Inputs that cannot yield any ngram must produce an empty result.
/// </summary>
public void NgramGeneratorTestEmpty()
{
    // Empty string: nothing to split.
    var fromEmptyText = NgramGenerator.GenerateNgrams("", 1);
    Assert.IsTrue(fromEmptyText.Count == 0);

    // Fewer words (2) than the requested ngram size (3).
    var fromTooFewWords = NgramGenerator.GenerateNgrams("hello world", 3);
    Assert.IsTrue(fromTooFewWords.Count == 0);
}
/// <summary>
/// Bigram generation for a two-word and a four-word input.
/// </summary>
public void NgramGeneratorTestBigrams()
{
    // Two words give exactly one bigram.
    var twoWordBigrams = NgramGenerator.GenerateNgrams("hello world", 2);
    Assert.IsTrue(twoWordBigrams.Contains("hello world"));
    Assert.IsTrue(twoWordBigrams.Count == 1);

    // Four words give three overlapping bigrams.
    var fourWordBigrams = NgramGenerator.GenerateNgrams("hello world test one", 2);
    foreach (string expected in new[] { "hello world", "world test", "test one" })
    {
        Assert.IsTrue(fourWordBigrams.Contains(expected));
    }

    Assert.IsTrue(fourWordBigrams.Count == 3);
}