/// <summary>
/// Runs the full English pipeline (tokenizer, stopwords, normalizer, Porter stemmer)
/// over a sample sentence and checks the resulting token sequence, in order.
/// </summary>
public void EnglishTextPreprocessor_Preprocess()
{
    // arrange
    var preprocessor = new TextPreprocessor(
        new EnglishSimpleTokenizer(),
        new EnglishStopwords(),
        new EnglishSimpleNormalizer(),
        new EnglishPorterStemmer());
    var text = @"Jack London was born on January 12, 1876. By age 30 London was internationally famous for his books Call of the Wild (1903), The Sea Wolf (1904) and other literary and journalistic accomplishments.";

    // Expected tokens, in the exact order the preprocessor must produce them.
    string[] expected =
    {
        "jack", "london", "born", "januari", "ag",
        "london", "internation", "famou", "book", "wild",
        "sea", "wolf", "literari", "journalist", "accomplish"
    };

    // act
    var dict = preprocessor.Preprocess(text);

    // assert
    // Expected value goes first so failure messages report expected/actual correctly.
    Assert.AreEqual(expected.Length, dict.Count);
    for (var i = 0; i < expected.Length; i++)
    {
        Assert.AreEqual(expected[i], dict[i], "Unexpected token at index " + i);
    }
}
/// <summary>
/// Builds a <c>TextPreprocessor</c> from <paramref name="inputText"/> using its default
/// options and asserts that its string form equals <paramref name="expected"/>.
/// </summary>
/// <param name="inputText">Raw text handed to the preprocessor.</param>
/// <param name="expected">Text the preprocessor is expected to render.</param>
public void ConvertNewLines(string inputText, string expected)
{
    var preprocessor = new TextPreprocessor(inputText);

    var actual = preprocessor.ToString();

    Assert.Equal(expected, actual);
}
/// <summary>
/// Builds a <c>TextPreprocessor</c> with <c>trimTrailingNewLine</c> enabled and asserts
/// that its string form equals <paramref name="expected"/>.
/// </summary>
/// <param name="inputText">Raw text handed to the preprocessor.</param>
/// <param name="expected">Text the preprocessor is expected to render.</param>
public void TrimTrailingNewLine(string inputText, string expected)
{
    var preprocessor = new TextPreprocessor(inputText, trimTrailingNewLine: true);

    var actual = preprocessor.ToString();

    Assert.Equal(expected, actual);
}
/// <summary>
/// Builds a <c>TextPreprocessor</c> with <c>forceAscii</c> enabled and asserts
/// that its string form equals <paramref name="expected"/>.
/// </summary>
/// <param name="inputText">Raw text handed to the preprocessor.</param>
/// <param name="expected">Text the preprocessor is expected to render.</param>
public void ForceAscii(string inputText, string expected)
{
    var preprocessor = new TextPreprocessor(inputText, forceAscii: true);

    var actual = preprocessor.ToString();

    Assert.Equal(expected, actual);
}
/// <summary>
/// Builds a <c>TextPreprocessor</c> with <c>resolveTrigraphs</c> enabled and asserts
/// that its string form equals <paramref name="expected"/>.
/// </summary>
/// <param name="inputText">Raw text handed to the preprocessor.</param>
/// <param name="expected">Text the preprocessor is expected to render.</param>
public void ResolveTrigraphs(string inputText, string expected)
{
    var preprocessor = new TextPreprocessor(inputText, resolveTrigraphs: true);

    var actual = preprocessor.ToString();

    Assert.Equal(expected, actual);
}
/// <summary>
/// Tokenizes <paramref name="text"/> with a Porter-stemmer-backed <c>TextPreprocessor</c>
/// and compares the space-joined tokens against <paramref name="textTokensExpected"/>.
/// </summary>
/// <param name="text">Input text to tokenize.</param>
/// <param name="textTokensExpected">Expected tokens, joined by single spaces.</param>
public void Tokenize(string text, string textTokensExpected)
{
    // Arrange
    var preprocessor = new TextPreprocessor(new PorterStemmer());

    // Act
    var tokens = preprocessor.Tokenize(text);
    var joinedTokens = string.Join(" ", tokens);

    // Assert
    Assert.AreEqual(textTokensExpected, joinedTokens);
}
/// <summary>
/// Initializes the form: builds the designer components, creates the text
/// preprocessor and the sentiment analyzer that uses it, and binds a fresh
/// log list to the <c>dgvLogs</c> grid.
/// </summary>
public MainForm()
{
    InitializeComponent();

    // Analysis pipeline: the analyzer works on top of the stemming preprocessor.
    var stemmer = new EnglishStemmer();
    _preprocessor = new TextPreprocessor(stemmer);
    _analyzer = new SentimentAnalyzer(_preprocessor);

    // Log grid: style it first, then attach the backing list.
    _logs = new SortableBindingList<SentimentLog>();
    Helpers.ApplyStyle(ref dgvLogs);
    dgvLogs.DataSource = _logs;
}
/// <summary>
/// Creates a <c>TFIDFNaiveBayesianAlgorithm</c> wired to the English preprocessing
/// pipeline (simple tokenizer, stopwords, simple normalizer, Porter stemmer).
/// </summary>
/// <returns>The configured algorithm instance.</returns>
public static TextAlgorithmBase Create_TFIDFAlgorithm()
{
    var tokenizer = new EnglishSimpleTokenizer();
    var stopwords = new EnglishStopwords();
    var normalizer = new EnglishSimpleNormalizer();
    var stemmer = new EnglishPorterStemmer();
    var preprocessor = new TextPreprocessor(tokenizer, stopwords, normalizer, stemmer);

    return new TFIDFNaiveBayesianAlgorithm
    {
        Preprocessor = preprocessor
    };
}
/// <summary>
/// Creates a <c>MultinomialNaiveBayesianAlgorithm</c> that uses the English
/// preprocessing pipeline and a <c>FourierFeatureExtractor</c>.
/// </summary>
/// <param name="t">Value assigned to the extractor's <c>T</c> property.</param>
/// <returns>The configured algorithm instance.</returns>
public static TextAlgorithmBase Create_FourierMultinomialAlgorithm(double t)
{
    var tokenizer = new EnglishSimpleTokenizer();
    var stopwords = new EnglishStopwords();
    var normalizer = new EnglishSimpleNormalizer();
    var stemmer = new EnglishPorterStemmer();
    var preprocessor = new TextPreprocessor(tokenizer, stopwords, normalizer, stemmer);

    return new MultinomialNaiveBayesianAlgorithm
    {
        Preprocessor = preprocessor,
        FeatureExtractor = new FourierFeatureExtractor { T = t }
    };
}
/// <summary>
/// Tokenizes <paramref name="text"/> into a document, scores it with the TF-IDF
/// similarity service, and asserts that at least one score exceeds 0.5.
/// </summary>
/// <param name="text">Text of the document to score.</param>
public async Task FindDuplicates(string text)
{
    // Arrange
    var preprocessor = new TextPreprocessor(new PorterStemmer());
    var candidate = new Document
    {
        Id = 3,
        Tokens = preprocessor.Tokenize(text)
    };
    var scoringService = await BuildService(preprocessor);

    // Act
    var similarityScores = await scoringService.GetSimilarityScoresAsync(candidate);

    // Assert
    var hasScoreAboveThreshold = similarityScores.Any(entry => entry.Score > 0.5d);
    Assert.IsTrue(hasScoreAboveThreshold);
}
/// <summary>
/// Creates a <c>GeneralTextAlgorithm</c> that wraps a
/// <c>NaiveBayesianKernelAlgorithm</c> (triangular kernel, second constructor
/// argument 0.5, kernel minimum value enabled at 0.000001) and uses the English
/// preprocessing pipeline.
/// </summary>
/// <returns>The configured algorithm instance.</returns>
public static TextAlgorithmBase Create_GeneralTextAlgorithm()
{
    var tokenizer = new EnglishSimpleTokenizer();
    var stopwords = new EnglishStopwords();
    var normalizer = new EnglishSimpleNormalizer();
    var stemmer = new EnglishPorterStemmer();
    var preprocessor = new TextPreprocessor(tokenizer, stopwords, normalizer, stemmer);

    var triangularKernel = new TriangularKernel();
    var kernelAlgorithm = new NaiveBayesianKernelAlgorithm(triangularKernel, 0.5D)
    {
        UseKernelMinValue = true,
        KernelMinValue = 0.000001D
    };

    return new GeneralTextAlgorithm(kernelAlgorithm)
    {
        Preprocessor = preprocessor
    };
}
/// <summary>
/// Creates the three demo classifiers (spam, Reuters R8, 20 Newsgroups) and loads
/// each one from its embedded resource in the executing assembly.
/// </summary>
/// <exception cref="System.Resources.MissingManifestResourceException">
/// An expected embedded resource is not present in the executing assembly.
/// </exception>
private void init()
{
    using (var spam = openEmbeddedResource("ML.TextDemo.data.SPAM_p2.42.mld"))
    {
        var proc = createEnglishPreprocessor();
        m_SpamAlgorithm = new ComplementNaiveBayesianAlgorithm() { Preprocessor = proc };
        m_SpamAlgorithm.Deserialize(spam);
    }

    using (var r8 = openEmbeddedResource("ML.TextDemo.data.RR8_p4.37.mld"))
    {
        var proc = createEnglishPreprocessor();
        m_ReutersR8Algorithm = new MultinomialNaiveBayesianAlgorithm() { Preprocessor = proc };
        m_ReutersR8Algorithm.Deserialize(r8);
    }

    using (var n20 = openEmbeddedResource("ML.TextDemo.data.N20_p17.35.mld"))
    {
        var proc = createEnglishPreprocessor();
        m_Newsgroups20Algorithm = new TFIDFNaiveBayesianAlgorithm() { Preprocessor = proc };
        m_Newsgroups20Algorithm.Deserialize(n20);
    }
}

/// <summary>
/// Builds a new English preprocessing pipeline; each algorithm gets its own instance.
/// </summary>
private static TextPreprocessor createEnglishPreprocessor()
{
    return new TextPreprocessor(
        new EnglishSimpleTokenizer(),
        new EnglishStopwords(),
        new EnglishSimpleNormalizer(),
        new EnglishPorterStemmer());
}

/// <summary>
/// Opens an embedded resource of the executing assembly, failing with a message
/// that names the resource instead of handing a null stream to the caller.
/// </summary>
private static System.IO.Stream openEmbeddedResource(string resourceName)
{
    var stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resourceName);

    // GetManifestResourceStream returns null (it does not throw) when the resource is absent.
    if (stream == null)
    {
        throw new System.Resources.MissingManifestResourceException(
            "Embedded resource '" + resourceName + "' was not found in the executing assembly.");
    }

    return stream;
}
/// <summary>
/// Sets up the fixture by storing a default-constructed <c>TextPreprocessor</c>
/// in <c>textPreprocessor</c>.
/// </summary>
public SanitizeTests()
{
    textPreprocessor = new TextPreprocessor();
}
/// <summary>
/// Asserts that a <c>TextPreprocessor</c> built from the empty string renders
/// as the empty string.
/// </summary>
public void EmptyStringRemainsEmpty()
{
    var preprocessor = new TextPreprocessor(string.Empty);

    var actual = preprocessor.ToString();

    Assert.Equal(string.Empty, actual);
}