/// <summary>
/// Runs <c>StateMachineTokenizer.Tokenize</c> on <paramref name="input"/> with
/// lower-casing disabled and asserts that the produced characters, joined into a
/// string, equal <paramref name="output"/>.
/// </summary>
/// <param name="input">Raw text handed to the tokenizer as a character array.</param>
/// <param name="output">Expected text after tokenization.</param>
public void WikipediaCases(string input, string output)
        {
            char[] source = input.ToCharArray();

            // Case is preserved here: this test always passes lowerCase: false.
            var tokens = StateMachineTokenizer.Tokenize(source, lowerCase: false);

            Assert.Equal(output, new string(tokens.ToArray()));
        }
        /// <summary>
        /// Runs <c>StateMachineTokenizer.Tokenize</c> on <paramref name="input"/> using the
        /// supplied <paramref name="lowerCase"/> flag and asserts that the produced
        /// characters, joined into a string, equal <paramref name="output"/>.
        /// </summary>
        /// <param name="input">Raw text handed to the tokenizer as a character array.</param>
        /// <param name="output">Expected text after tokenization.</param>
        /// <param name="lowerCase">Forwarded unchanged as the tokenizer's second argument.</param>
        public void SpecialCase(string input, string output, bool lowerCase)
        {
            char[] source = input.ToCharArray();
            var tokens = StateMachineTokenizer.Tokenize(source, lowerCase);

            string rendered = new string(tokens.ToArray());
            Assert.Equal(output, rendered);
        }
예제 #3
0
        /// <summary>
        /// Cleans <paramref name="text"/> with <c>WikitextCleaner.Clean</c>, tokenizes the
        /// result with lower-casing enabled, and maps every span reported by
        /// <c>NaiveTokenizer.Tokenize</c> to a hash code computed over the tokenized buffer.
        /// </summary>
        /// <param name="text">Characters of the document to process.</param>
        /// <returns>
        /// One hash code per span, produced by <c>TextHasher.CalculateHashCode</c> from the
        /// span's <c>From</c> and <c>To</c> values.
        /// </returns>
        private static IEnumerable<int> Process(IList<char> text)
        {
            // Cleaning and tokenizing run immediately; only the projection below goes through Select.
            var buffer = StateMachineTokenizer.Tokenize(WikitextCleaner.Clean(text), lowerCase: true);

            return from span in NaiveTokenizer.Tokenize(buffer)
                   select TextHasher.CalculateHashCode(buffer, span.From, span.To);
        }
예제 #4
0
        /// <summary>
        /// Reads the corpus at <c>cleanedPath</c>, passes every document through
        /// <c>StateMachineTokenizer.Tokenize</c> with lower-casing enabled, and writes the
        /// results to <c>tokenizedPath</c>.
        /// </summary>
        static void TokenizeWikipedia()
        {
            var destination = tokenizedPath;

            // The output location is prepared before the writer is constructed on it.
            PrepareOutputDirectory(destination);

            CorpusZipReader<IList<char>> source = new CorpusZipReader<IList<char>>(cleanedPath, charDataSerializer);
            CorpusZipWriter<IList<char>> sink = new CorpusZipWriter<IList<char>>(destination, charDataSerializer);

            var transformer = new CorpusTransformer<IList<char>, IList<char>>(
                document => StateMachineTokenizer.Tokenize(document, lowerCase: true));

            transformer.Transform(source, sink);
        }