コード例 #1
0
        /// <summary>
        /// Runs a short two-sentence passage through a TextPreprocessor composed of the
        /// English simple tokenizer, stopword list, simple normalizer and Porter stemmer,
        /// and checks the resulting tokens position by position.
        /// </summary>
        public void EnglishTextPreprocessor_Preprocess()
        {
            // arrange
            var preprocessor = new TextPreprocessor(
                new EnglishSimpleTokenizer(),
                new EnglishStopwords(),
                new EnglishSimpleNormalizer(),
                new EnglishPorterStemmer());
            var text =
                @"Jack London was born on January 12, 1876.  By age 30 London was internationally famous for his books
        Call of the Wild (1903), The Sea Wolf (1904) and other literary and journalistic accomplishments.";

            // Tokens expected from the passage above, in output order.
            var expected = new[]
            {
                "jack",
                "london",
                "born",
                "januari",
                "ag",
                "london",
                "internation",
                "famou",
                "book",
                "wild",
                "sea",
                "wolf",
                "literari",
                "journalist",
                "accomplish"
            };

            // act
            var dict = preprocessor.Preprocess(text);

            // assert
            // The expected value is passed first so that a failing assertion reports
            // "expected"/"actual" the right way round.
            Assert.AreEqual(expected.Length, dict.Count);
            for (var i = 0; i < expected.Length; i++)
            {
                Assert.AreEqual(expected[i], dict[i], "Unexpected token at index " + i);
            }
        }
コード例 #2
0
        /// <summary>
        /// Checks that a TextPreprocessor built from <paramref name="inputText"/> with no
        /// extra options renders, via ToString(), as <paramref name="expected"/>.
        /// </summary>
        /// <param name="inputText">Raw text handed to the preprocessor.</param>
        /// <param name="expected">Text the preprocessor is expected to produce.</param>
        public void ConvertNewLines(string inputText, string expected)
        {
            var actual = new TextPreprocessor(inputText).ToString();

            Assert.Equal(expected, actual);
        }
コード例 #3
0
        /// <summary>
        /// Checks that a TextPreprocessor built from <paramref name="inputText"/> with
        /// <c>trimTrailingNewLine</c> enabled renders, via ToString(), as
        /// <paramref name="expected"/>.
        /// </summary>
        /// <param name="inputText">Raw text handed to the preprocessor.</param>
        /// <param name="expected">Text the preprocessor is expected to produce.</param>
        public void TrimTrailingNewLine(string inputText, string expected)
        {
            var actual = new TextPreprocessor(inputText, trimTrailingNewLine: true).ToString();

            Assert.Equal(expected, actual);
        }
コード例 #4
0
        /// <summary>
        /// Checks that a TextPreprocessor built from <paramref name="inputText"/> with
        /// <c>forceAscii</c> enabled renders, via ToString(), as <paramref name="expected"/>.
        /// </summary>
        /// <param name="inputText">Raw text handed to the preprocessor.</param>
        /// <param name="expected">Text the preprocessor is expected to produce.</param>
        public void ForceAscii(string inputText, string expected)
        {
            var actual = new TextPreprocessor(inputText, forceAscii: true).ToString();

            Assert.Equal(expected, actual);
        }
コード例 #5
0
        /// <summary>
        /// Checks that a TextPreprocessor built from <paramref name="inputText"/> with
        /// <c>resolveTrigraphs</c> enabled renders, via ToString(), as
        /// <paramref name="expected"/>.
        /// </summary>
        /// <param name="inputText">Raw text handed to the preprocessor.</param>
        /// <param name="expected">Text the preprocessor is expected to produce.</param>
        public void ResolveTrigraphs(string inputText, string expected)
        {
            var actual = new TextPreprocessor(inputText, resolveTrigraphs: true).ToString();

            Assert.Equal(expected, actual);
        }
コード例 #6
0
        /// <summary>
        /// Tokenizes <paramref name="text"/> with a TextPreprocessor backed by a
        /// PorterStemmer and compares the space-joined tokens with
        /// <paramref name="textTokensExpected"/>.
        /// </summary>
        /// <param name="text">Input text to tokenize.</param>
        /// <param name="textTokensExpected">Expected tokens joined by single spaces.</param>
        public void Tokenize(string text, string textTokensExpected)
        {
            // Arrange
            var sut = new TextPreprocessor(new PorterStemmer());

            // Act
            var joinedTokens = string.Join(" ", sut.Tokenize(text));

            // Assert
            Assert.AreEqual(textTokensExpected, joinedTokens);
        }
コード例 #7
0
        /// <summary>
        /// Constructs the main form: initializes its components, creates the text
        /// preprocessor and the sentiment analyzer that uses it, and binds the log grid
        /// to a new sortable list of sentiment logs.
        /// </summary>
        public MainForm()
        {
            // Presumably the designer-generated control setup; it runs before dgvLogs is used below.
            InitializeComponent();

            // The analyzer receives the same preprocessor instance that is kept in _preprocessor.
            _preprocessor = new TextPreprocessor(new EnglishStemmer());
            _analyzer     = new SentimentAnalyzer(_preprocessor);
            _logs         = new SortableBindingList <SentimentLog>();

            // Style the grid (passed by ref), then use _logs as its data source.
            Helpers.ApplyStyle(ref dgvLogs);
            dgvLogs.DataSource = _logs;
        }
コード例 #8
0
        /// <summary>
        /// Creates a TFIDFNaiveBayesianAlgorithm whose Preprocessor is a TextPreprocessor
        /// built from the English simple tokenizer, stopword list, simple normalizer and
        /// Porter stemmer.
        /// </summary>
        /// <returns>The configured algorithm.</returns>
        public static TextAlgorithmBase Create_TFIDFAlgorithm()
        {
            // The preprocessor is built before the algorithm instance is created.
            var preprocessor = new TextPreprocessor(
                new EnglishSimpleTokenizer(),
                new EnglishStopwords(),
                new EnglishSimpleNormalizer(),
                new EnglishPorterStemmer());

            return new TFIDFNaiveBayesianAlgorithm { Preprocessor = preprocessor };
        }
コード例 #9
0
        /// <summary>
        /// Creates a MultinomialNaiveBayesianAlgorithm that uses the English
        /// TextPreprocessor pipeline and a FourierFeatureExtractor whose <c>T</c> is set
        /// to <paramref name="t"/>.
        /// </summary>
        /// <param name="t">Value assigned to the feature extractor's <c>T</c> property.</param>
        /// <returns>The configured algorithm.</returns>
        public static TextAlgorithmBase Create_FourierMultinomialAlgorithm(double t)
        {
            var preprocessor = new TextPreprocessor(
                new EnglishSimpleTokenizer(),
                new EnglishStopwords(),
                new EnglishSimpleNormalizer(),
                new EnglishPorterStemmer());

            // Preprocessor is assigned first, then the feature extractor is created and assigned.
            return new MultinomialNaiveBayesianAlgorithm
            {
                Preprocessor = preprocessor,
                FeatureExtractor = new FourierFeatureExtractor { T = t }
            };
        }
コード例 #10
0
        /// <summary>
        /// Tokenizes <paramref name="text"/> into a Document, asks the service returned by
        /// BuildService for similarity scores against that document, and asserts that at
        /// least one score is above 0.5.
        /// </summary>
        /// <param name="text">Text of the document to score.</param>
        public async Task FindDuplicates(string text)
        {
            // Arrange
            var preprocessor = new TextPreprocessor(new PorterStemmer());
            var candidate = new Document
            {
                Id     = 3,
                Tokens = preprocessor.Tokenize(text)
            };
            var scoringService = await BuildService(preprocessor);

            // Act
            var similarityScores = await scoringService.GetSimilarityScoresAsync(candidate);

            // Assert
            var hasHighScore = similarityScores.Any(score => score.Score > 0.5d);
            Assert.IsTrue(hasHighScore);
        }
コード例 #11
0
        /// <summary>
        /// Creates a GeneralTextAlgorithm wrapping a NaiveBayesianKernelAlgorithm
        /// (TriangularKernel, second constructor argument 0.5, kernel minimum value
        /// enabled at 0.000001) and gives it the English TextPreprocessor pipeline.
        /// </summary>
        /// <returns>The configured algorithm.</returns>
        public static TextAlgorithmBase Create_GeneralTextAlgorithm()
        {
            var preprocessor = new TextPreprocessor(
                new EnglishSimpleTokenizer(),
                new EnglishStopwords(),
                new EnglishSimpleNormalizer(),
                new EnglishPorterStemmer());

            // Inner algorithm: the kernel is created first, then the algorithm, then its properties are set.
            var kernelAlgorithm = new NaiveBayesianKernelAlgorithm(new TriangularKernel(), 0.5D)
            {
                UseKernelMinValue = true,
                KernelMinValue = 0.000001D
            };

            return new GeneralTextAlgorithm(kernelAlgorithm) { Preprocessor = preprocessor };
        }
コード例 #12
0
        /// <summary>
        /// Creates the spam, Reuters R8 and 20-Newsgroups algorithms, gives each its own
        /// English TextPreprocessor, and deserializes each one from its embedded
        /// <c>.mld</c> resource in the executing assembly.
        /// </summary>
        /// <exception cref="System.Resources.MissingManifestResourceException">
        /// One of the embedded resources could not be found.
        /// </exception>
        private void init()
        {
            var assembly = Assembly.GetExecutingAssembly();

            using (var spam = openResource(assembly, "ML.TextDemo.data.SPAM_p2.42.mld"))
            {
                var proc = createEnglishPreprocessor();
                m_SpamAlgorithm = new ComplementNaiveBayesianAlgorithm()
                {
                    Preprocessor = proc
                };
                m_SpamAlgorithm.Deserialize(spam);
            }

            using (var r8 = openResource(assembly, "ML.TextDemo.data.RR8_p4.37.mld"))
            {
                var proc = createEnglishPreprocessor();
                m_ReutersR8Algorithm = new MultinomialNaiveBayesianAlgorithm()
                {
                    Preprocessor = proc
                };
                m_ReutersR8Algorithm.Deserialize(r8);
            }

            using (var n20 = openResource(assembly, "ML.TextDemo.data.N20_p17.35.mld"))
            {
                var proc = createEnglishPreprocessor();
                m_Newsgroups20Algorithm = new TFIDFNaiveBayesianAlgorithm()
                {
                    Preprocessor = proc
                };
                m_Newsgroups20Algorithm.Deserialize(n20);
            }
        }

        /// <summary>
        /// Builds a fresh English preprocessing pipeline (simple tokenizer, stopword list,
        /// simple normalizer, Porter stemmer).
        /// </summary>
        private static TextPreprocessor createEnglishPreprocessor()
        {
            return new TextPreprocessor(new EnglishSimpleTokenizer(),
                                        new EnglishStopwords(),
                                        new EnglishSimpleNormalizer(),
                                        new EnglishPorterStemmer());
        }

        /// <summary>
        /// Opens an embedded resource stream, failing with a descriptive exception instead
        /// of returning null when the resource is missing.
        /// </summary>
        private static System.IO.Stream openResource(Assembly assembly, string name)
        {
            // GetManifestResourceStream returns null (it does not throw) for an unknown name.
            var stream = assembly.GetManifestResourceStream(name);
            if (stream == null)
            {
                throw new System.Resources.MissingManifestResourceException(
                    "Embedded resource not found: " + name);
            }

            return stream;
        }
コード例 #13
0
 /// <summary>
 /// Creates a TextPreprocessor with its parameterless constructor and stores it in
 /// <c>textPreprocessor</c> (declared outside this snippet; presumably shared by the
 /// tests in this class).
 /// </summary>
 public SanitizeTests()
 {
     textPreprocessor = new TextPreprocessor();
 }
コード例 #14
0
        /// <summary>
        /// Checks that a TextPreprocessor built from the empty string renders, via
        /// ToString(), as the empty string.
        /// </summary>
        public void EmptyStringRemainsEmpty()
        {
            var actual = new TextPreprocessor(String.Empty).ToString();

            Assert.Equal(String.Empty, actual);
        }