Пример #1
0
        /// <summary>
        /// Scores <paramref name="contents"/> against the two configured evidence classes and returns,
        /// for each class name, the average percentage score over all chunks of the text.
        /// </summary>
        /// <remarks>
        /// The text is tokenized and split into chunks of <c>WORDS_IN_CHUNK</c> words. For every chunk the
        /// log of (word count / total words of the class) is summed per class, with a word that is missing
        /// from a class's evidence counted as 1, and the log of the class's share of all words is added.
        /// The two log scores of a chunk are normalized to percentages that sum to 100, and the
        /// percentages are averaged over the chunks.
        /// </remarks>
        /// <param name="contents">Text to classify.</param>
        /// <param name="wordsToIgnore">
        /// Words that are skipped while scoring; <c>null</c> is treated as an empty set.
        /// </param>
        /// <returns>
        /// A dictionary keyed by the two class names holding each class's average percentage score.
        /// Both values are 0 when the text produces no chunks.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// One of the evidences is missing or has a blank name.
        /// </exception>
        /// <exception cref="OverflowException">
        /// A chunk's log score is not a finite number (for example a zero word count or zero total).
        /// </exception>
        public IDictionary<string, double> Classify(string contents, HashSet<string> wordsToIgnore)
        {
            string setupError = null;
            if (_classA == null || _classB == null)
            {
                setupError = "One of the evidences were not properly defined";
            }
            else if (string.IsNullOrWhiteSpace(_classA.Name()))
            {
                setupError = "Evidence name not defined on the first one";
            }
            else if (string.IsNullOrWhiteSpace(_classB.Name()))
            {
                setupError = "Evidence name not defined on the second one";
            }

            if (setupError != null)
            {
                // Log and throw the same instance so the log entry matches what the caller receives.
                var failure = new InvalidOperationException(setupError);
                _logger.WriteLog(failure, "");
                throw failure;
            }

            // Read once up front instead of once per word; assumes these accessors return the same
            // values for the duration of a single call.
            var nameA = _classA.Name();
            var nameB = _classB.Name();
            var evidenceA = _classA.GetEvidence();
            var evidenceB = _classB.GetEvidence();
            double totalWordsA = _classA.TotalWords();
            double totalWordsB = _classB.TotalWords();
            var totalWordsAllCategories = totalWordsA + totalWordsB;
            var logPriorA = Math.Log(totalWordsA / totalWordsAllCategories);
            var logPriorB = Math.Log(totalWordsB / totalWordsAllCategories);

            var ignoredWords = wordsToIgnore ?? new HashSet<string>();
            var words = Tokenizer.TokenizeNow(contents).ToList();

            var percentSumA = 0.0;
            var percentSumB = 0.0;
            var chunkCount = 0;

            foreach (var wordsChunk in words.Chunk(WORDS_IN_CHUNK))
            {
                var logScoreA = logPriorA;
                var logScoreB = logPriorB;

                foreach (var word in wordsChunk)
                {
                    if (string.IsNullOrWhiteSpace(word) || ignoredWords.Contains(word))
                    {
                        continue;
                    }

                    //First Class
                    double wordCountInClassA;
                    if (!evidenceA.TryGetValue(word, out wordCountInClassA))
                    {
                        wordCountInClassA = 1;    //ToDo - Make this 1 or 0.01
                    }
                    logScoreA += Math.Log(wordCountInClassA / totalWordsA);

                    //Second Class
                    double wordCountInClassB;
                    if (!evidenceB.TryGetValue(word, out wordCountInClassB))
                    {
                        wordCountInClassB = 1;    //ToDo - Make this 1 or 0.01
                    }
                    logScoreB += Math.Log(wordCountInClassB / totalWordsB);
                }

                // A non-finite log score cannot be turned into a percentage; fail loudly rather
                // than letting NaN flow into the averages.
                if (double.IsNaN(logScoreA) || double.IsInfinity(logScoreA) ||
                    double.IsNaN(logScoreB) || double.IsInfinity(logScoreB))
                {
                    var overflow = new OverflowException(
                        $"Overflow exception for scoreA: {logScoreA} scoreB: {logScoreB} in chunk {chunkCount}");
                    _logger.WriteLog(overflow, "");
                    throw overflow;
                }

                // Subtract the larger log score before exponentiating. The larger term becomes
                // exp(0) = 1, so the sum is never 0 even when both raw exponentials would underflow,
                // while the resulting ratio is mathematically unchanged.
                var pivot = Math.Max(logScoreA, logScoreB);
                var weightA = Math.Exp(logScoreA - pivot);
                var weightB = Math.Exp(logScoreB - pivot);
                var weightSum = weightA + weightB;

                percentSumA += 100 * weightA / weightSum;
                percentSumB += 100 * weightB / weightSum;
                chunkCount++;
            }

            // Compute the average; with no chunks there is nothing to average, so report 0 instead of NaN.
            var results = new Dictionary<string, double>
            {
                { nameA, chunkCount == 0 ? 0.0 : percentSumA / chunkCount },
                { nameB, chunkCount == 0 ? 0.0 : percentSumB / chunkCount }
            };

            return results;
        }
        /// <summary>
        /// Classifies the sentiment of the input text corpus
        /// </summary>
        /// <remarks>
        /// The text is tokenized and split into chunks of <c>WORDS_IN_CHUNK</c> words. For every chunk the
        /// log of (word count / total words of the class) is summed per class, with a word that is missing
        /// from a class's evidence counted as 1, and the log of the class's share of all words is added.
        /// The two log scores of a chunk are normalized to percentages that sum to 100, and the
        /// percentages are averaged over the chunks.
        /// </remarks>
        /// <param name="contents">Text contents that needs to be classified. Works best for 1000+ words</param>
        /// <param name="wordsToIgnore">
        /// Words that are skipped while scoring; <c>null</c> is treated as an empty set.
        /// </param>
        /// <returns>
        /// A dictionary keyed by the two class names holding each class's average percentage score.
        /// Both values are 0 when the text produces no chunks.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// One of the evidences is missing or has a blank name.
        /// </exception>
        /// <exception cref="OverflowException">
        /// A chunk's log score is not a finite number (for example a zero word count or zero total).
        /// </exception>
        private IDictionary<string, double> Classify(string contents, HashSet<string> wordsToIgnore)
        {
            if (_classA == null || _classB == null)
            {
                throw new InvalidOperationException("One of the evidences were not properly defined");
            }

            // Read the names once; they are reused as the result keys below.
            var firstName = _classA.Name();
            if (string.IsNullOrWhiteSpace(firstName))
            {
                throw new InvalidOperationException("Evidence name not defined on the first one");
            }

            var secondName = _classB.Name();
            if (string.IsNullOrWhiteSpace(secondName))
            {
                throw new InvalidOperationException("Evidence name not defined on the second one");
            }

            // Read once up front instead of once per word; assumes these accessors return the same
            // values for the duration of a single call.
            var firstEvidence = _classA.GetEvidence();
            var secondEvidence = _classB.GetEvidence();
            double firstTotalWords = _classA.TotalWords();
            double secondTotalWords = _classB.TotalWords();
            var totalWordsAllCategories = firstTotalWords + secondTotalWords;
            var firstLogPrior = Math.Log(firstTotalWords / totalWordsAllCategories);
            var secondLogPrior = Math.Log(secondTotalWords / totalWordsAllCategories);

            var skippedWords = wordsToIgnore ?? new HashSet<string>();
            var words = Tokenizer.TokenizeNow(contents).ToList();

            var firstPercentTotal = 0.0;
            var secondPercentTotal = 0.0;
            var scoredChunks = 0;

            foreach (var wordsChunk in words.Chunk(WORDS_IN_CHUNK))
            {
                var firstLogScore = firstLogPrior;
                var secondLogScore = secondLogPrior;

                foreach (var word in wordsChunk)
                {
                    if (string.IsNullOrWhiteSpace(word) || skippedWords.Contains(word))
                    {
                        continue;
                    }

                    //First Class
                    double wordCountInClassA;
                    if (!firstEvidence.TryGetValue(word, out wordCountInClassA))
                    {
                        wordCountInClassA = 1;    //ToDo - Make this 1 or 0.01
                    }
                    firstLogScore += Math.Log(wordCountInClassA / firstTotalWords);

                    //Second Class
                    double wordCountInClassB;
                    if (!secondEvidence.TryGetValue(word, out wordCountInClassB))
                    {
                        wordCountInClassB = 1;    //ToDo - Make this 1 or 0.01
                    }
                    secondLogScore += Math.Log(wordCountInClassB / secondTotalWords);
                }

                // A non-finite log score cannot be turned into a percentage; fail loudly rather
                // than letting NaN flow into the averages.
                if (double.IsNaN(firstLogScore) || double.IsInfinity(firstLogScore) ||
                    double.IsNaN(secondLogScore) || double.IsInfinity(secondLogScore))
                {
                    throw new OverflowException(
                        "Non-finite log score in chunk " + scoredChunks +
                        ": first = " + firstLogScore + ", second = " + secondLogScore);
                }

                // Subtract the larger log score before exponentiating. The larger term becomes
                // exp(0) = 1, so the sum is never 0 even when both raw exponentials would underflow,
                // while the resulting ratio is mathematically unchanged.
                var largerLogScore = Math.Max(firstLogScore, secondLogScore);
                var firstWeight = Math.Exp(firstLogScore - largerLogScore);
                var secondWeight = Math.Exp(secondLogScore - largerLogScore);
                var combinedWeight = firstWeight + secondWeight;

                firstPercentTotal += 100 * firstWeight / combinedWeight;
                secondPercentTotal += 100 * secondWeight / combinedWeight;
                scoredChunks++;
            }

            // Compute the average; with no chunks there is nothing to average, so report 0 instead of NaN.
            var results = new Dictionary<string, double>
            {
                { firstName, scoredChunks == 0 ? 0.0 : firstPercentTotal / scoredChunks },
                { secondName, scoredChunks == 0 ? 0.0 : secondPercentTotal / scoredChunks }
            };

            return results;
        }