/// <summary>
/// Computes a lexicon-based sentiment score for <paramref name="sentence"/>.
/// Loads positive/negative word lists from <c>Lexicon\*.txt</c>, then averages
/// per-word polarity (+1 positive, -1 negative), flipping sign while inside a
/// negation scope ("not", "nor", "no").
/// </summary>
/// <param name="sentence">The text to score.</param>
/// <returns>
/// Average polarity in [-1, 1] over the lexicon words found; 0 when no
/// sentence word appears in the lexicon.
/// </returns>
public double Compute(string sentence)
{
    // Lexicon: word -> polarity (true = positive, false = negative).
    var words = new Dictionary<string, bool>();
    var negationWords = new HashSet<string>() { "not", "nor", "no" };
    var textFiles = new Dictionary<string, bool>
    {
        { "Lexicon\\NegativeWords.txt", false },
        { "Lexicon\\PositiveWords.txt", true }
    };

    foreach (var item in textFiles)
    {
        using (StreamReader sr = new StreamReader(item.Key))
        {
            while (!sr.EndOfStream)
            {
                var wrd = sr.ReadLine();
                words[wrd] = item.Value; // last file wins on duplicate entries
            }
        }
    }

    var sentenceWords = TextParser.SplitToWords(sentence);
    double positiveRatio = 0.0;
    double totalSum = 0.0;
    // BUGFIX: start at +1 so that, with no preceding negation, a positive word
    // contributes +1 and a negative word -1. The original initialized this to
    // -1, which inverted the polarity of every un-negated sentence.
    double negationFactor = 1;

    foreach (var word in sentenceWords)
    {
        var lowerWord = word.ToLower();
        if (negationWords.Contains(lowerWord))
        {
            negationFactor = -negationFactor; // each negation word toggles scope
        }

        // BUGFIX: look up the lower-cased word. The original looked up the raw
        // word, so capitalized lexicon words (e.g. sentence-initial) never matched.
        bool isPositive;
        if (!words.TryGetValue(lowerWord, out isPositive))
        {
            continue; // word not in lexicon: contributes nothing
        }

        positiveRatio += negationFactor * (isPositive ? 1 : -1);
        totalSum += 1;
    }

    // Guard against division by zero when no lexicon word was found.
    if (totalSum > 0)
    {
        return positiveRatio / totalSum;
    }

    return 0;
}
/// <summary>
/// Classifies <paramref name="sentence"/> with the trained naive Bayes model
/// and returns the predicted class id (ids are assigned during training).
/// Words are lower-cased; an explicit negation word toggles a negation flag and
/// is itself skipped, an "n't" suffix toggles the flag but keeps the word
/// unprefixed, and any other word inside a negation scope is looked up under a
/// "not_" prefix. Words unseen during training are ignored.
/// </summary>
/// <param name="sentence">The text to classify.</param>
/// <returns>The class id produced by the underlying classifier.</returns>
public int Compute(string sentence)
{
    var sample = new DataSample();
    var sampleDataPoints = new List<DataPoint>();
    var sentenceWords = TextParser.SplitToWords(sentence).ToArray();
    var isNegated = false;

    for (int index = 0; index < sentenceWords.Length; index++)
    {
        var currentWord = sentenceWords[index].ToLower();

        if (_negationWords.Contains(currentWord))
        {
            // Explicit negation word: toggle scope; the word itself is not a feature.
            isNegated = !isNegated;
            continue;
        }

        if (currentWord.EndsWith("n't"))
        {
            // Contracted negation ("don't", "isn't"): toggle the flag but keep
            // the word unprefixed, mirroring Train's feature construction.
            isNegated = !isNegated;
        }
        else if (isNegated)
        {
            currentWord = "not_" + currentWord;
        }

        // IMPROVED: single TryGetValue instead of ContainsKey + indexer (double hash lookup).
        int columnId;
        if (_wordDictionary.TryGetValue(currentWord, out columnId))
        {
            sampleDataPoints.Add(new DataPoint { ColumnId = columnId, Value = 1 });
        }
    }

    sample.DataPoints = sampleDataPoints.ToArray();
    return _naiveBayesClassifier.Compute(sample);
}
/// <summary>
/// Trains the naive Bayes classifier on up to <paramref name="count"/> items of
/// <paramref name="trainingSet"/>, where each tuple is (sentence, class label).
/// Builds the word-to-column dictionary and the label-to-class-id map as a side
/// effect, applying the same negation/"not_"-prefix feature scheme as Compute.
/// </summary>
/// <param name="trainingSet">Pairs of (sentence text, class label).</param>
/// <param name="count">Number of training items to consume; also sizes the sample array.</param>
public void Train(IEnumerable<Tuple<string, string>> trainingSet, int count)
{
    _wordDictionary = new Dictionary<string, int>();
    _classes = new Dictionary<string, int>();
    DataSample[] samples = new DataSample[count];
    int wordId = 0;
    int classId = 0;
    var trainingItemIndex = 0;

    // Only the first <count> items are used; samples[] is sized to match.
    trainingSet = trainingSet.Take(count);

    foreach (var trainingItem in trainingSet)
    {
        var sentence = trainingItem.Item1;
        var classValue = trainingItem.Item2;

        // IMPROVED: assign class ids on first sight with a single TryGetValue
        // instead of ContainsKey + Add + indexer (three hash lookups).
        int sampleClassId;
        if (!_classes.TryGetValue(classValue, out sampleClassId))
        {
            sampleClassId = classId++;
            _classes.Add(classValue, sampleClassId);
        }

        var dataSample = new DataSample { ClassId = sampleClassId };
        var sampleDataPoints = new List<DataPoint>();

        var sentenceWords = TextParser.SplitToWords(sentence);
        var isNegated = false;
        for (int index = 0; index < sentenceWords.Count; index++)
        {
            var currentWord = sentenceWords[index].ToLower();

            // Skip @mentions (training data is presumably tweets — TODO confirm).
            if (currentWord.StartsWith("@"))
            {
                continue;
            }

            if (_negationWords.Contains(currentWord))
            {
                // Negation word toggles scope and is not itself a feature.
                isNegated = !isNegated;
                continue;
            }

            if (currentWord.EndsWith("n't"))
            {
                // "n't" contraction toggles scope; the word stays unprefixed.
                isNegated = !isNegated;
            }
            else if (isNegated)
            {
                currentWord = "not_" + currentWord;
            }

            // IMPROVED: single TryGetValue instead of ContainsKey + Add + indexer.
            int columnId;
            if (!_wordDictionary.TryGetValue(currentWord, out columnId))
            {
                columnId = wordId++;
                _wordDictionary.Add(currentWord, columnId);
            }

            sampleDataPoints.Add(new DataPoint { ColumnId = columnId, Value = 1 });
        }

        dataSample.DataPoints = sampleDataPoints.ToArray();
        samples[trainingItemIndex] = dataSample;
        trainingItemIndex++;
    }

    // Every word feature is a binary (present/absent) discrete column.
    _columnsDataTypes = new ColumnDataType[wordId];
    for (var index = 0; index < wordId; index++)
    {
        _columnsDataTypes[index] = new ColumnDataType { IsDiscrete = true, NumberOfCategories = 2 };
    }

    // NOTE(review): the class count is hard-coded to 2 (binary sentiment) even
    // though classId tracks the actual number of distinct labels seen — confirm
    // this is intentional before training on more than two classes.
    _naiveBayesClassifier = new NaiveBayesClassifier(samples, 2, _columnsDataTypes);
}