/// <summary>
        ///     Feeds one tweet into the sentiment corpus: updates hashtag sentiment from the
        ///     raw text, then cleans the text and updates sentiment for its unigrams and
        ///     bigrams, both as generated and after stemming.
        /// </summary>
        /// <param name="partialTweetText">The tweet text to process.</param>
        /// <param name="isRetweet">
        ///     <c>true</c> if the text is a retweet; forwarded unchanged to every update call.
        /// </param>
        /// <param name="sentiment">The sentiment attributed to the hashtags and n-grams.</param>
        /// <param name="timestamp">The current time in UTC; only the hashtag update receives it.</param>
        /// <param name="dictionary">The dictionary handed to <c>StemNgram</c> when stemming.</param>
        /// <param name="context">
        ///     The context forwarded to every update call. Its <c>BasicNgrams</c> member is passed
        ///     along with the unstemmed n-grams and its <c>StemmedNgrams</c> member with the stemmed ones.
        /// </param>
        public static void ProcessTweetTextForCorpus(string partialTweetText,
                                                     bool isRetweet,
                                                     Sentiment sentiment,
                                                     DateTime timestamp,
                                                     IDictionary <String, String> dictionary,
                                                     IOclumenContext context)
        {
            // Hashtags are taken from the untouched text, before any cleaning happens.
            UpdateHashtagSentiment(partialTweetText, isRetweet, sentiment, timestamp, context);

            // Normalize the text: strip punctuation, remove excess spaces, lower-case.
            var cleaner = new TextCleaner(partialTweetText);
            string normalizedText = cleaner.StripPunctuation()
                                           .RemoveExcessSpaces()
                                           .ToLower()
                                           .ToString();

            // Unstemmed n-grams of size 1 and 2.
            IList <string> singleWords = NgramGenerator.GenerateNgrams(normalizedText, 1);
            IList <string> wordPairs   = NgramGenerator.GenerateNgrams(normalizedText, 2);

            // Stemmed counterparts of the two lists above.
            IList <string> stemmedSingleWords = StemNgram(singleWords, dictionary);
            IList <string> stemmedWordPairs   = StemNgram(wordPairs, dictionary);

            // Unstemmed n-grams go with context.BasicNgrams and the flag set to false ...
            UpdateNgramsSentiment(singleWords, isRetweet, sentiment, false, context.BasicNgrams, context);
            UpdateNgramsSentiment(wordPairs, isRetweet, sentiment, false, context.BasicNgrams, context);

            // ... stemmed n-grams go with context.StemmedNgrams and the flag set to true.
            UpdateNgramsSentiment(stemmedSingleWords, isRetweet, sentiment, true, context.StemmedNgrams, context);
            UpdateNgramsSentiment(stemmedWordPairs, isRetweet, sentiment, true, context.StemmedNgrams, context);
        }
Example #2
0
        /// <summary>
        ///     Classifies a piece of text: cleans it, splits it into n-grams of the requested
        ///     size (stemming them when asked), collects the per-class and per-n-gram counts,
        ///     and returns the sentiment taken from the computed probabilities.
        /// </summary>
        /// <typeparam name="TEntity">The n-gram entity type held by <paramref name="ngramDbSet"/>.</typeparam>
        /// <param name="text">The text to classify.</param>
        /// <param name="isRetweet">Forwarded to the count and vocabulary-size lookups.</param>
        /// <param name="ngramCardinality">The n-gram size used for generation and for the lookups.</param>
        /// <param name="smoothingFactor">Forwarded to the class-count and n-gram-count lookups.</param>
        /// <param name="isStemmed">When <c>true</c>, the n-grams are stemmed before being counted.</param>
        /// <param name="dictionary">The dictionary handed to <c>Processor.StemNgram</c>; only used when stemming.</param>
        /// <param name="ngramDbSet">The n-gram set forwarded to the lookups.</param>
        /// <param name="oclumenContext">The context forwarded to the lookups.</param>
        /// <param name="ngramDictionary">
        ///     Optional; forwarded to <c>GetNgramCount</c>.
        ///     NOTE(review): looks like a pre-loaded table of n-gram counts — confirm.
        /// </param>
        /// <returns>
        ///     The <c>Key</c> of the last element returned by <c>GetNgramSentimentProbabilities</c>
        ///     (presumably the most probable class, assuming that result is ordered ascending — confirm).
        /// </returns>
        public Sentiment GetTextSentiment <TEntity>(string text, bool isRetweet, int ngramCardinality,
                                                    decimal smoothingFactor, bool isStemmed, IDictionary <String, String> dictionary, DbSet <TEntity> ngramDbSet,
                                                    IOclumenContext oclumenContext, Dictionary <string, List <KeyValuePair <Sentiment, decimal> > > ngramDictionary = null) where TEntity : NgramBase
        {
            // Apply the cleaning steps: strip punctuation, remove excess spaces, lower-case.
            string normalizedText = new TextCleaner(text).StripPunctuation()
                                                         .RemoveExcessSpaces()
                                                         .ToLower()
                                                         .ToString();

            IList <string> generatedNgrams = NgramGenerator.GenerateNgrams(normalizedText, ngramCardinality);
            IList <string> ngrams = isStemmed
                                        ? Processor.StemNgram(generatedNgrams, dictionary)
                                        : generatedNgrams;

            var perNgramCounts = new List <IList <KeyValuePair <Sentiment, decimal> > >(ngrams.Count);
            IList <KeyValuePair <Sentiment, decimal> > perClassCounts =
                GetClassCount(isRetweet, ngramCardinality, smoothingFactor, ngramDbSet, oclumenContext);

            // Look up the counts for every n-gram, in the order the n-grams were produced.
            for (int index = 0; index < ngrams.Count; index++)
            {
                perNgramCounts.Add(GetNgramCount(ngrams[index], isRetweet, ngramCardinality, smoothingFactor,
                                                 ngramDbSet, oclumenContext, ngramDictionary));
            }

            int vocabularySize = GetVocabularySize(isRetweet, ngramCardinality, ngramDbSet, oclumenContext);

            // Combine the individual n-gram counts with the counts of each sentiment class
            // to obtain the probabilities.
            var probabilities = GetNgramSentimentProbabilities(vocabularySize, perNgramCounts, perClassCounts);

            return probabilities.Last().Key;
        }
Example #3
0
        public void NgramGeneratorTextDuplicateIgnore()
        {
            // The input repeats "hello world"; the generator is expected to report
            // each distinct bigram only once.
            var ngrams = NgramGenerator.GenerateNgrams("hello world hello world", 2);

            Assert.IsTrue(ngrams.Contains("hello world"));
            Assert.IsTrue(ngrams.Contains("world hello"));

            // AreEqual reports expected vs. actual on failure, unlike IsTrue(a == b).
            Assert.AreEqual(2, ngrams.Count);
        }
Example #4
0
        public void NgramGeneratorTestEmpty()
        {
            // AreEqual is used instead of IsTrue(a == b) so a failure shows the actual count.

            // An empty string yields no n-grams.
            Assert.AreEqual(0, NgramGenerator.GenerateNgrams("", 1).Count);

            // No n-gram can be generated when the number of words is smaller than the n-gram size.
            Assert.AreEqual(0, NgramGenerator.GenerateNgrams("hello world", 3).Count);
        }
Example #5
0
        public void NgramGeneratorTestBigrams()
        {
            // AreEqual is used for the counts so a failure shows expected vs. actual,
            // which IsTrue(a == b) does not.

            // Two words form exactly one bigram.
            var ngrams = NgramGenerator.GenerateNgrams("hello world", 2);

            Assert.IsTrue(ngrams.Contains("hello world"));
            Assert.AreEqual(1, ngrams.Count);

            // Four words form three bigrams, one per adjacent word pair.
            ngrams = NgramGenerator.GenerateNgrams("hello world test one", 2);

            Assert.IsTrue(ngrams.Contains("hello world"));
            Assert.IsTrue(ngrams.Contains("world test"));
            Assert.IsTrue(ngrams.Contains("test one"));
            Assert.AreEqual(3, ngrams.Count);
        }