Beispiel #1
0
        /// <summary>
        /// Feeds a two-sentence English sample through a <c>TextPreprocessor</c> built from the
        /// English tokenizer, stopword list, normalizer and Porter stemmer, and checks that the
        /// result is exactly the expected sequence of 15 tokens.
        /// </summary>
        public void EnglishTextPreprocessor_Preprocess()
        {
            // arrange
            var preprocessor = new TextPreprocessor(
                new EnglishSimpleTokenizer(),
                new EnglishStopwords(),
                new EnglishSimpleNormalizer(),
                new EnglishPorterStemmer());
            var text =
                @"Jack London was born on January 12, 1876.  By age 30 London was internationally famous for his books
        Call of the Wild (1903), The Sea Wolf (1904) and other literary and journalistic accomplishments.";

            // Tokens the pipeline is expected to emit, in output order.
            var expected = new[]
            {
                "jack", "london", "born", "januari", "ag",
                "london", "internation", "famou", "book", "wild",
                "sea", "wolf", "literari", "journalist", "accomplish"
            };

            // act
            var dict = preprocessor.Preprocess(text);

            // assert
            // Expected value goes first so that a failure reports expected/actual the right way round.
            Assert.AreEqual(expected.Length, dict.Count);
            for (var i = 0; i < expected.Length; i++)
            {
                Assert.AreEqual(expected[i], dict[i], "Token mismatch at index " + i);
            }
        }
Beispiel #2
0
        /// <summary>
        /// Checks that a <c>TextPreprocessor</c> constructed from <paramref name="inputText"/>
        /// with default options renders as <paramref name="expected"/>.
        /// Presumably the supplied cases cover newline conversion (per the test name).
        /// </summary>
        /// <param name="inputText">Raw text handed to the preprocessor.</param>
        /// <param name="expected">Text that <c>ToString()</c> must return.</param>
        public void ConvertNewLines(string inputText, string expected)
        {
            // Arrange
            var preprocessor = new TextPreprocessor(inputText);

            // Act
            var actual = preprocessor.ToString();

            // Assert
            Assert.Equal(expected, actual);
        }
Beispiel #3
0
        /// <summary>
        /// Checks that a <c>TextPreprocessor</c> constructed with <c>trimTrailingNewLine: true</c>
        /// renders <paramref name="inputText"/> as <paramref name="expected"/>.
        /// </summary>
        /// <param name="inputText">Raw text handed to the preprocessor.</param>
        /// <param name="expected">Text that <c>ToString()</c> must return.</param>
        public void TrimTrailingNewLine(string inputText, string expected)
        {
            // Arrange
            var preprocessor = new TextPreprocessor(inputText, trimTrailingNewLine: true);

            // Act
            var actual = preprocessor.ToString();

            // Assert
            Assert.Equal(expected, actual);
        }
Beispiel #4
0
        /// <summary>
        /// Checks that a <c>TextPreprocessor</c> constructed with <c>forceAscii: true</c>
        /// renders <paramref name="inputText"/> as <paramref name="expected"/>.
        /// </summary>
        /// <param name="inputText">Raw text handed to the preprocessor.</param>
        /// <param name="expected">Text that <c>ToString()</c> must return.</param>
        public void ForceAscii(string inputText, string expected)
        {
            // Arrange
            var preprocessor = new TextPreprocessor(inputText, forceAscii: true);

            // Act
            var actual = preprocessor.ToString();

            // Assert
            Assert.Equal(expected, actual);
        }
Beispiel #5
0
        /// <summary>
        /// Checks that a <c>TextPreprocessor</c> constructed with <c>resolveTrigraphs: true</c>
        /// renders <paramref name="inputText"/> as <paramref name="expected"/>.
        /// </summary>
        /// <param name="inputText">Raw text handed to the preprocessor.</param>
        /// <param name="expected">Text that <c>ToString()</c> must return.</param>
        public void ResolveTrigraphs(string inputText, string expected)
        {
            // Arrange
            var preprocessor = new TextPreprocessor(inputText, resolveTrigraphs: true);

            // Act
            var actual = preprocessor.ToString();

            // Assert
            Assert.Equal(expected, actual);
        }
Beispiel #6
0
        /// <summary>
        /// Tokenizes <paramref name="text"/> with a Porter-stemmer-backed <c>TextPreprocessor</c>
        /// and compares the space-joined tokens against <paramref name="textTokensExpected"/>.
        /// </summary>
        /// <param name="text">Input text to tokenize.</param>
        /// <param name="textTokensExpected">Expected tokens, separated by single spaces.</param>
        public void Tokenize(string text, string textTokensExpected)
        {
            // Arrange
            var preprocessor = new TextPreprocessor(new PorterStemmer());

            // Act
            var joinedTokens = string.Join(" ", preprocessor.Tokenize(text));

            // Assert
            Assert.AreEqual(textTokensExpected, joinedTokens);
        }
Beispiel #7
0
        /// <summary>
        /// Initializes the form's components, creates the preprocessor, the sentiment analyzer
        /// that uses it and the log list, then styles the log grid and binds it to that list.
        /// </summary>
        public MainForm()
        {
            InitializeComponent();

            // The analyzer is handed the same preprocessor instance that the form keeps.
            _preprocessor = new TextPreprocessor(new EnglishStemmer());
            _analyzer = new SentimentAnalyzer(_preprocessor);
            _logs = new SortableBindingList<SentimentLog>();

            // Style the grid first, then attach the (initially empty) log list as its data source.
            Helpers.ApplyStyle(ref dgvLogs);
            dgvLogs.DataSource = _logs;
        }
Beispiel #8
0
        /// <summary>
        /// Creates a <c>TFIDFNaiveBayesianAlgorithm</c> whose preprocessor is assembled from the
        /// English tokenizer, stopword list, normalizer and Porter stemmer.
        /// </summary>
        /// <returns>The configured algorithm.</returns>
        public static TextAlgorithmBase Create_TFIDFAlgorithm()
        {
            // Pipeline components, created in the order they are passed to the preprocessor.
            var tokenizer = new EnglishSimpleTokenizer();
            var stopwords = new EnglishStopwords();
            var normalizer = new EnglishSimpleNormalizer();
            var stemmer = new EnglishPorterStemmer();
            var preprocessor = new TextPreprocessor(tokenizer, stopwords, normalizer, stemmer);

            return new TFIDFNaiveBayesianAlgorithm
            {
                Preprocessor = preprocessor
            };
        }
Beispiel #9
0
        /// <summary>
        /// Creates a <c>MultinomialNaiveBayesianAlgorithm</c> that uses the English preprocessing
        /// pipeline and a <c>FourierFeatureExtractor</c> whose <c>T</c> is set to <paramref name="t"/>.
        /// </summary>
        /// <param name="t">Value assigned to the feature extractor's <c>T</c> property.</param>
        /// <returns>The configured algorithm.</returns>
        public static TextAlgorithmBase Create_FourierMultinomialAlgorithm(double t)
        {
            var preprocessor = new TextPreprocessor(
                new EnglishSimpleTokenizer(),
                new EnglishStopwords(),
                new EnglishSimpleNormalizer(),
                new EnglishPorterStemmer());

            var algorithm = new MultinomialNaiveBayesianAlgorithm
            {
                Preprocessor = preprocessor,
                FeatureExtractor = new FourierFeatureExtractor { T = t }
            };

            return algorithm;
        }
        /// <summary>
        /// Tokenizes <paramref name="text"/> into a document with Id 3, scores it against the
        /// service returned by <c>BuildService</c>, and requires at least one similarity score
        /// above 0.5.
        /// </summary>
        /// <param name="text">Text of the document to score.</param>
        public async Task FindDuplicates(string text)
        {
            // Arrange
            var textPreprocessor = new TextPreprocessor(new PorterStemmer());
            var document = new Document
            {
                Id = 3,
                Tokens = textPreprocessor.Tokenize(text)
            };
            var scoringService = await BuildService(textPreprocessor);

            // Act
            var scores = await scoringService.GetSimilarityScoresAsync(document);

            // Assert
            var hasCloseMatch = scores.Any(candidate => candidate.Score > 0.5d);
            Assert.IsTrue(hasCloseMatch);
        }
Beispiel #11
0
        /// <summary>
        /// Creates a <c>GeneralTextAlgorithm</c> that wraps a <c>NaiveBayesianKernelAlgorithm</c>
        /// (triangular kernel, second constructor argument 0.5, kernel minimum value enabled at
        /// 0.000001) and uses the English preprocessing pipeline.
        /// </summary>
        /// <returns>The configured algorithm.</returns>
        public static TextAlgorithmBase Create_GeneralTextAlgorithm()
        {
            var preprocessor = new TextPreprocessor(
                new EnglishSimpleTokenizer(),
                new EnglishStopwords(),
                new EnglishSimpleNormalizer(),
                new EnglishPorterStemmer());

            // Inner algorithm that the general text algorithm wraps.
            var inner = new NaiveBayesianKernelAlgorithm(new TriangularKernel(), 0.5D)
            {
                UseKernelMinValue = true,
                KernelMinValue = 0.000001D
            };

            return new GeneralTextAlgorithm(inner)
            {
                Preprocessor = preprocessor
            };
        }
Beispiel #12
0
        /// <summary>
        /// Creates the spam, Reuters R8 and 20-Newsgroups algorithms, gives each its own English
        /// preprocessor, and deserializes each one from its embedded <c>.mld</c> resource.
        /// </summary>
        /// <exception cref="System.InvalidOperationException">
        /// An expected embedded resource is missing from the executing assembly.
        /// </exception>
        private void init()
        {
            using (var spam = openResource("ML.TextDemo.data.SPAM_p2.42.mld"))
            {
                var proc = createEnglishPreprocessor();
                m_SpamAlgorithm = new ComplementNaiveBayesianAlgorithm()
                {
                    Preprocessor = proc
                };
                m_SpamAlgorithm.Deserialize(spam);
            }

            using (var r8 = openResource("ML.TextDemo.data.RR8_p4.37.mld"))
            {
                var proc = createEnglishPreprocessor();
                m_ReutersR8Algorithm = new MultinomialNaiveBayesianAlgorithm()
                {
                    Preprocessor = proc
                };
                m_ReutersR8Algorithm.Deserialize(r8);
            }

            using (var n20 = openResource("ML.TextDemo.data.N20_p17.35.mld"))
            {
                var proc = createEnglishPreprocessor();
                m_Newsgroups20Algorithm = new TFIDFNaiveBayesianAlgorithm()
                {
                    Preprocessor = proc
                };
                m_Newsgroups20Algorithm.Deserialize(n20);
            }
        }

        // Builds a fresh English pipeline: simple tokenizer, stopwords, simple normalizer, Porter stemmer.
        private static TextPreprocessor createEnglishPreprocessor()
        {
            return new TextPreprocessor(new EnglishSimpleTokenizer(),
                                        new EnglishStopwords(),
                                        new EnglishSimpleNormalizer(),
                                        new EnglishPorterStemmer());
        }

        // Opens an embedded resource of the executing assembly and fails with a message naming the
        // resource when it is absent (GetManifestResourceStream returns null in that case).
        private static System.IO.Stream openResource(string name)
        {
            var stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(name);
            if (stream == null)
            {
                throw new System.InvalidOperationException("Embedded resource not found: " + name);
            }

            return stream;
        }
Beispiel #13
0
 // Constructor: assigns a TextPreprocessor created with its parameterless constructor
 // to the textPreprocessor member.
 public SanitizeTests()
 {
     textPreprocessor = new TextPreprocessor();
 }
Beispiel #14
0
        /// <summary>
        /// Checks that a <c>TextPreprocessor</c> constructed from the empty string
        /// renders as the empty string.
        /// </summary>
        public void EmptyStringRemainsEmpty()
        {
            // Arrange
            var preprocessor = new TextPreprocessor(String.Empty);

            // Act
            var actual = preprocessor.ToString();

            // Assert
            Assert.Equal(String.Empty, actual);
        }