/// <summary>
/// Classifies a review string, returning the most likely sentiment score (1-5).
/// Multiplies per-class odds ratios for every known, valid token and picks the
/// class with the highest accumulated score (naive-Bayes style scoring).
/// </summary>
/// <param name="review">Raw review text to classify.</param>
/// <returns>The winning sentiment class in the range 1-5.</returns>
public int Classify(string review)
{
    var tokenizer = new HappyFunTokenizer(_preserveCase);
    var tokens = this.AddNegationAugments(tokenizer.Tokenize(review));

    // Seed the running score for each sentiment class 1..5 (prior / empty score).
    var scores = new Dictionary<int, double>();
    for (var i = 1; i < 6; i++)
    {
        scores[i] = this.GetEmptyScoreForSentiment(i);
    }

    foreach (var token in tokens)
    {
        if (!Utilities.CheckUtf8(token))
        {
            continue;
        }

        // Single dictionary lookup instead of ContainsKey + indexer (double lookup).
        if (!this._wordList.TryGetValue(token, out var index))
        {
            // Unknown word: contributes nothing to any class.
            continue;
        }

        for (var i = 1; i < 6; i++)
        {
            var score = this.GetProbabilityOfWordGivenSentimentFast(index, i);
            // Fold the word in as an odds ratio p / (1 - p).
            scores[i] *= score / (1.0 - score);
        }
    }

    // Return the class whose accumulated score is largest.
    return scores.Aggregate((l, r) => l.Value > r.Value ? l : r).Key;
}
/// <summary>
/// Tokenizes one training review and folds its tokens into the sparse
/// per-sentiment word-count tables. Each distinct word is counted at most
/// once per review (binary / presence features).
/// </summary>
/// <param name="tokenizer">Tokenizer to split the review into tokens.</param>
/// <param name="score">Sentiment label of this review (expected 1-5).</param>
/// <param name="review">Raw review text.</param>
/// <returns>The number of distinct counted tokens in this review.</returns>
protected int AddTrainingEntry(HappyFunTokenizer tokenizer, int score, string review)
{
    var addedEntries = new HashSet<int>();
    var tokens = this.AddNegationAugments(tokenizer.Tokenize(review));
    var count = 0;

    foreach (var token in tokens)
    {
        if (!Utilities.CheckUtf8(token))
        {
            continue;
        }

        // Single lookup via TryGetValue instead of ContainsKey + indexer.
        if (this._wordList.TryGetValue(token, out var index))
        {
            if (addedEntries.Contains(index))
            {
                // Already counted once for this review.
                continue;
            }
        }
        else
        {
            // First sighting of this word anywhere: assign the next sparse index
            // and create a zeroed slot in every sentiment bucket (0..5).
            index = this._wordList.Count;
            this._wordList[token] = index;
            for (var i = 0; i < 6; i++)
            {
                this._sentimentWordCounts[i][index] = 0;
            }
        }

        // Shared tail — previously duplicated in both branches.
        count++;
        this._sentimentWordCounts[score][index]++;
        addedEntries.Add(index);
    }

    return count;
}
/// <summary>
/// Reads a training file of blank-line-separated (score, review) records,
/// tokenizes each review and stores feature counts in a sparse index to
/// limit memory usage. Populates _wordList, _sentimentCounts,
/// _sentimentWordCounts and _entryCount.
/// </summary>
/// <param name="sr">Open reader positioned at the start of the training data.</param>
/// <returns>Always true.</returns>
protected bool ProcessTrainingFile(StreamReader sr)
{
    var tokenizer = new HappyFunTokenizer(_preserveCase);
    var score = 0;
    var count = 0;
    var maxCount = 0;
    var totalCount = 0;
    string line;
    string review = null;

    this._wordList = new Dictionary<string, int>();
    this._sentimentCounts = new Dictionary<int, int>();
    this._sentimentWordCounts = new Dictionary<int, Dictionary<int, int>>();
    for (var i = 0; i < 6; i++)
    {
        this._sentimentCounts[i] = 0;
        this._sentimentWordCounts[i] = new Dictionary<int, int>();
    }

    // Commits the pending (score, review) pair, if complete, then resets it.
    // Extracted as a local function: this logic was duplicated verbatim for
    // the blank-line separator and for end-of-stream.
    void FlushEntry()
    {
        if (string.IsNullOrEmpty(review) || score <= 0)
        {
            return;
        }
        var tokenCount = this.AddTrainingEntry(tokenizer, score, review);
        this._sentimentCounts[score]++;
        count++;
        totalCount += tokenCount;
        if (tokenCount > maxCount)
        {
            maxCount = tokenCount;
        }
        score = 0;
        review = null;
    }

    while ((line = sr.ReadLine()) != null)
    {
        line = line.Trim();
        if (line.Length == 0)
        {
            // Blank line terminates a record.
            FlushEntry();
            continue;
        }

        var sTest = this._regexes["score"].Match(line);
        if (sTest.Success)
        {
            score = int.Parse(sTest.Groups["score"].Value);
            continue;
        }

        var rTest = this._regexes["review"].Match(line);
        if (!rTest.Success)
        {
            continue;
        }
        review = rTest.Groups["review"].Value;
    }

    // Flush a trailing record not followed by a blank line.
    FlushEntry();

    this._entryCount = count;
    // Guard against division by zero (original printed NaN for an empty file).
    var average = count > 0 ? ((double)totalCount / count) * sizeof(int) : 0.0;
    Console.WriteLine("Bytes: {0} (Average: {1:F2})", maxCount * sizeof(int), average);
    Console.WriteLine("Found {0} words and {1} entries.", this._wordList.Count, count);
    return true;
}