/// <summary>
/// Tokenizes the given text into a list of DocumentToken objects.
/// </summary>
/// <param name="text">The text to tokenize</param>
/// <param name="sourceLocation">A string describing the source of the text.
/// This could be a text file path or some other identifier.</param>
/// <returns>A list of tokens</returns>
public static List<DocumentToken> Tokenize(string text, string sourceLocation)
{
    var tokenizer = new DocumentTokenizer();
    tokenizer.SourceFileLocation = sourceLocation;
    tokenizer.Delimiters.AddRange(DocumentToken.Delimiters);
    tokenizer.WhitespaceBehavior = WhitespaceBehavior.DelimitAndInclude;

    // Produce the raw token stream, then classify each token in the context
    // of the full list.
    List<DocumentToken> tokens = tokenizer.Tokenize(text);
    foreach (var token in tokens)
    {
        AssignTokenTypes(token, tokens);
    }

    // Drop lines that consist solely of replacement tokens before returning.
    List<DocumentToken> filtered = RemoveLinesThatOnlyContainReplacements(tokens);
    return filtered;
}
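// A minimal usage sketch for the method above, assuming it is exposed as a static
// member of DocumentToken; the input text, the "example.txt" source label, and the
// token properties printed below (TokenType, Value) are illustrative assumptions,
// not confirmed parts of the API.
var tokens = DocumentToken.Tokenize("Hello {{Name}}, welcome!", "example.txt");
foreach (var token in tokens)
{
    // Each token carries the type assigned during tokenization plus its raw text.
    Console.WriteLine($"{token.TokenType}: '{token.Value}'");
}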
[Fact]
public void TokenizeDocumentsTest()
{
    // Arrange: a corpus of one block containing two small documents.
    var docs = new List<Document<string>>
    {
        new Document<string>(
            new DocumentMetadata(new DocumentId(0), "Title 1"),
            "Title 1. This is, the first document"),
        new Document<string>(
            new DocumentMetadata(new DocumentId(1), "Title 2"),
            "Title 2; This is the second. document"),
    };
    var corpus = new List<Block<string>>
    {
        Block<string>.Make(0, docs),
    };

    // Mock the reader to serve the corpus and the writer to capture the output blocks.
    var tokenized = new List<IEnumerable<TokenizedBlock>>();
    var reader = new Mock<ICorpusReader<string>>();
    reader.Setup(r => r.Read()).Returns(corpus);
    var writer = new Mock<ICorpusWriter<IEnumerable<string>>>();
    writer.Setup(w => w.Write(It.IsAny<IEnumerable<TokenizedBlock>>()))
          .Callback((IEnumerable<TokenizedBlock> d) => tokenized.Add(d));

    // Act: run the tokenizer over the mocked corpus.
    var tokenizer = new DocumentTokenizer(new WordRegexTokenizer(lowerCase: false));
    tokenizer.Transform(reader.Object, writer.Object);

    // Assert: exactly one block was written, punctuation was stripped,
    // and word order was preserved in both documents.
    Assert.Single(tokenized);
    var tokenizedBlock = tokenized[0].First();
    Assert.Equal(2, tokenizedBlock.Documents.Count);
    Assert.Equal("Title 1 This is the first document".Split(), tokenizedBlock.Documents[0].Data);
    Assert.Equal("Title 2 This is the second document".Split(), tokenizedBlock.Documents[1].Data);
}