/// <summary>
/// Checks that <c>NGramProcessor.MakeNgrams</c>, called with an n-gram size of 2
/// on <c>Sentence</c>, returns a sequence containing each expected two-word n-gram.
/// </summary>
public void Handle_bigrams()
{
    const int bigramSize = 2;
    var ngrams = NGramProcessor.MakeNgrams(Sentence, bigramSize);

    // Reuse the result when it is already an array; otherwise materialize it once
    // so the assertions below do not re-enumerate the sequence.
    var bigrams = ngrams as string[];
    if (bigrams == null)
    {
        bigrams = ngrams.ToArray();
    }

    // Asserted one at a time, in this order, so the first missing bigram fails the test.
    var expectedBigrams = new[] { "hello this", "an example", "I hope", "ain't that" };
    foreach (var expected in expectedBigrams)
    {
        bigrams.Should().Contain(expected);
    }
}
/// <summary>
/// Rebuilds <c>Words</c>, <c>WordsCleaned</c> and <c>Ngrams</c> from <c>Sentence</c>.
/// </summary>
/// <remarks>
/// <c>Words</c> holds the sentence's words after <c>ToLowerInvariantWithOutSpaces</c>,
/// excluding any for which <c>IsGibberish</c> returns true. <c>WordsCleaned</c> holds those
/// words passed through <c>ToAlphaNumericOnly</c>, dropping results that are null, empty
/// or whitespace. <c>Ngrams</c> maps an n-gram size to its n-grams: sizes 2 to 4 are added
/// only when there are at least that many cleaned words, and size 1 always maps to
/// <c>WordsCleaned</c> itself.
/// </remarks>
/// <param name="arg">
/// Optional replacement sentence. When it is null or empty the current
/// <c>Sentence</c> is kept and processed as-is.
/// </param>
public void CreateNgrams(string arg = null)
{
    if (!string.IsNullOrEmpty(arg))
    {
        Sentence = arg;
    }

    // The template used to declare two placeholders ({ngram} and {text}) while only one
    // value was supplied, so the sentence was captured under the wrong property name and
    // the second placeholder was never filled. One placeholder for one argument.
    Log.Verbose("text match {text}", Sentence);

    Words = Sentence.SplitSentenceIntoWords()
        .Select(word => word.ToLowerInvariantWithOutSpaces())
        .Where(word => !word.IsGibberish())
        .ToList();

    WordsCleaned = Words
        .Select(word => word.ToAlphaNumericOnly())
        .Where(word => !string.IsNullOrWhiteSpace(word))
        .ToList();

    var text = string.Join(" ", WordsCleaned);

    Ngrams = new ConcurrentDictionary<int, IEnumerable<string>>();

    // Largest size first, matching the original insertion order (4, 3, 2).
    // A size-n entry is only produced when at least n cleaned words exist.
    const int largestNgramSize = 4;
    for (var size = largestNgramSize; size >= 2; size--)
    {
        if (WordsCleaned.Count >= size)
        {
            Ngrams.Add(new KeyValuePair<int, IEnumerable<string>>(size, NGramProcessor.MakeNgrams(text, size)));
        }
    }

    // Unigrams are the cleaned words themselves, added even when the list is empty.
    Ngrams.Add(new KeyValuePair<int, IEnumerable<string>>(1, WordsCleaned));
}