示例#1
0
        /// <summary>
        /// Picks the sentiment class (1 through 5) with the highest accumulated score
        /// for the given review text.
        /// </summary>
        /// <param name="review">Raw review text; it is tokenized and passed through
        /// <c>AddNegationAugments</c> before scoring.</param>
        /// <returns>The sentiment key (1..5) whose score is largest. When two scores
        /// compare equal, the entry enumerated later wins.</returns>
        public int Classify(string review)
        {
            var tokenizer = new HappyFunTokenizer(_preserveCase);
            var tokens    = this.AddNegationAugments(tokenizer.Tokenize(review));
            var scores    = new Dictionary <int, double>();

            // Seed every candidate sentiment with its starting score.
            for (var sentiment = 1; sentiment < 6; sentiment++)
            {
                scores[sentiment] = this.GetEmptyScoreForSentiment(sentiment);
            }

            foreach (var token in tokens)
            {
                if (!Utilities.CheckUtf8(token))
                {
                    continue;
                }

                // Single lookup instead of ContainsKey followed by the indexer.
                int index;
                if (!this._wordList.TryGetValue(token, out index))
                {
                    // Tokens that are not in the word list do not affect the scores.
                    continue;
                }

                for (var sentiment = 1; sentiment < 6; sentiment++)
                {
                    var probability = this.GetProbabilityOfWordGivenSentimentFast(index, sentiment);

                    // Scale the running score by p / (1 - p).
                    // NOTE(review): if the helper can return exactly 1.0 this becomes
                    // infinity, and long inputs may drive the product to 0 or infinity;
                    // confirm the helper's output range.
                    scores[sentiment] *= probability / (1.0 - probability);
                }
            }

            // Keep the right-hand entry on ties so the result matches the existing behaviour.
            return(scores.Aggregate((l, r) => l.Value > r.Value ? l : r).Key);
        }
示例#2
0
        /// <summary>
        /// Tokenizes one training review and records, for the given sentiment, one
        /// occurrence of every distinct token it contains. Tokens that are not yet in
        /// the word list are assigned the next free index and get a zeroed counter for
        /// each of the sentiment keys 0 through 5.
        /// </summary>
        /// <param name="tokenizer">Tokenizer used to split the review.</param>
        /// <param name="score">Sentiment key whose per-word counters are incremented.</param>
        /// <param name="review">Raw review text.</param>
        /// <returns>The number of distinct tokens counted for this review (tokens
        /// rejected by <c>Utilities.CheckUtf8</c> are not counted).</returns>
        protected int AddTrainingEntry(HappyFunTokenizer tokenizer, int score, string review)
        {
            var addedEntries = new HashSet <int>();
            var tokens       = this.AddNegationAugments(tokenizer.Tokenize(review));
            var count        = 0;

            foreach (var token in tokens)
            {
                if (!Utilities.CheckUtf8(token))
                {
                    continue;
                }

                // Single lookup instead of ContainsKey followed by the indexer.
                int index;
                if (!this._wordList.TryGetValue(token, out index))
                {
                    // First time this word is seen anywhere: register it under the next
                    // index and create a zero counter for every sentiment key.
                    index = this._wordList.Count;
                    this._wordList[token] = index;

                    for (var i = 0; i < 6; i++)
                    {
                        this._sentimentWordCounts[i][index] = 0;
                    }
                }

                // HashSet.Add returns false when the index was already recorded for this
                // review, so each word is counted at most once per review.
                if (!addedEntries.Add(index))
                {
                    continue;
                }

                count++;
                this._sentimentWordCounts[score][index]++;
            }

            return(count);
        }
示例#3
0
        /// <summary>
        /// Reads training data from <paramref name="sr"/>, rebuilding the word list and
        /// the per-sentiment counters from scratch. Lines matching the "score" regex set
        /// the pending score, lines matching the "review" regex set the pending review
        /// text, and a blank line (or the end of the stream) commits the pending entry
        /// when it has both a non-empty review and a score greater than zero.
        /// </summary>
        /// <param name="sr">Reader positioned at the start of the training data.</param>
        /// <returns>Always <c>true</c>.</returns>
        protected bool ProcessTrainingFile(StreamReader sr)
        {
            var    tokenizer  = new HappyFunTokenizer(_preserveCase);
            var    score      = 0;
            var    count      = 0;
            var    maxCount   = 0;
            var    totalCount = 0;
            string review     = null;

            /*
             * Read training data, tokenize and store feature vectors in a sparse index to
             * limit the memory usage.
             */
            this._wordList            = new Dictionary <string, int>();
            this._sentimentCounts     = new Dictionary <int, int>();
            this._sentimentWordCounts = new Dictionary <int, Dictionary <int, int> >();

            for (var i = 0; i < 6; i++)
            {
                this._sentimentCounts[i]     = 0;
                this._sentimentWordCounts[i] = new Dictionary <int, int>();
            }

            while (true)
            {
                var rawLine = sr.ReadLine();
                var atEnd   = rawLine == null;

                // End of stream is handled as one final blank line so a trailing entry
                // without a terminating blank line is committed by the same code path.
                var line = atEnd ? string.Empty : rawLine.Trim();

                if (line.Length == 0)
                {
                    if (!string.IsNullOrEmpty(review) && score > 0)
                    {
                        var tokenCount = this.AddTrainingEntry(tokenizer, score, review);
                        this._sentimentCounts[score]++;

                        count++;
                        totalCount += tokenCount;

                        if (tokenCount > maxCount)
                        {
                            maxCount = tokenCount;
                        }

                        // Reset the pending entry only after it has been committed.
                        score  = 0;
                        review = null;
                    }

                    if (atEnd)
                    {
                        break;
                    }

                    continue;
                }

                var sTest = this._regexes["score"].Match(line);
                if (sTest.Success)
                {
                    // The training file is machine-readable, so parse culture-independently.
                    score = int.Parse(sTest.Groups["score"].Value, System.Globalization.CultureInfo.InvariantCulture);
                    continue;
                }

                var rTest = this._regexes["review"].Match(line);
                if (!rTest.Success)
                {
                    // Lines that are neither a score nor a review are ignored.
                    continue;
                }
                review = rTest.Groups["review"].Value;
            }

            this._entryCount = count;

            // Avoid 0/0 (NaN) in the report when the input contained no usable entries.
            var averageTokens = count > 0 ? (double)totalCount / count : 0.0;

            Console.WriteLine("Bytes: {0} (Average: {1:F2})", maxCount * sizeof(int), averageTokens * sizeof(int));
            Console.WriteLine("Found {0} words and {1} entries.", this._wordList.Count, count);

            return(true);
        }