// Runs StateMachineTokenizer.Tokenize over the characters of `input` with
// lowerCase disabled and asserts that the produced characters, joined into a
// string, equal `output`.
public void WikipediaCases(string input, string output)
{
    char[] characters = input.ToCharArray();
    var tokens = StateMachineTokenizer.Tokenize(characters, lowerCase: false);

    Assert.Equal(output, new string(tokens.ToArray()));
}
// Runs StateMachineTokenizer.Tokenize over the characters of `input`, forwarding
// the caller-supplied `lowerCase` flag, and asserts that the produced characters,
// joined into a string, equal `output`.
public void SpecialCase(string input, string output, bool lowerCase)
{
    var tokens = StateMachineTokenizer.Tokenize(input.ToCharArray(), lowerCase);
    string produced = new string(tokens.ToArray());

    Assert.Equal(output, produced);
}
/// <summary>
/// Passes <paramref name="text"/> through <c>WikitextCleaner.Clean</c>, tokenizes the
/// result with <c>StateMachineTokenizer.Tokenize</c> (<c>lowerCase: true</c>), and yields
/// one hash code per span reported by <c>NaiveTokenizer.Tokenize</c>.
/// </summary>
/// <param name="text">The characters to process.</param>
/// <returns>
/// The values of <c>TextHasher.CalculateHashCode</c> computed over the tokenized text
/// for each span's <c>From</c>/<c>To</c> pair.
/// </returns>
private static IEnumerable<int> Process(IList<char> text)
{
    var withoutMarkup = WikitextCleaner.Clean(text);
    var normalized = StateMachineTokenizer.Tokenize(withoutMarkup, lowerCase: true);
    var spans = NaiveTokenizer.Tokenize(normalized);

    // The query expression is translated by the compiler into the same Select call.
    return from span in spans
           select TextHasher.CalculateHashCode(normalized, span.From, span.To);
}
/// <summary>
/// Reads the corpus at <c>cleanedPath</c>, applies
/// <c>StateMachineTokenizer.Tokenize</c> with <c>lowerCase: true</c> to every item, and
/// writes the results to <c>tokenizedPath</c>.
/// </summary>
static void TokenizeWikipedia()
{
    var destination = tokenizedPath;

    // The destination path is handed to PrepareOutputDirectory before the writer is created.
    PrepareOutputDirectory(destination);

    var source = new CorpusZipReader<IList<char>>(cleanedPath, charDataSerializer);
    var sink = new CorpusZipWriter<IList<char>>(destination, charDataSerializer);

    var transformer = new CorpusTransformer<IList<char>, IList<char>>(
        chars => StateMachineTokenizer.Tokenize(chars, lowerCase: true));

    transformer.Transform(source, sink);
}