Example #1
0
        /// <summary>
        /// Tokenizes the given text into a list of DocumentToken objects.
        /// </summary>
        /// <param name="text">The text to tokenize</param>
        /// <param name="sourceLocation">A string describing the source of the text.  This could be a text file path or some other identifier.</param>
        /// <returns>A list of tokens</returns>
        public static List<DocumentToken> Tokenize(string text, string sourceLocation)
        {
            // Whitespace is kept as its own tokens (DelimitAndInclude) so the
            // line-level filtering below can see line boundaries.
            var tokenizer = new DocumentTokenizer
            {
                SourceFileLocation = sourceLocation,
                WhitespaceBehavior = WhitespaceBehavior.DelimitAndInclude,
            };
            tokenizer.Delimiters.AddRange(DocumentToken.Delimiters);

            List<DocumentToken> rawTokens = tokenizer.Tokenize(text);

            // Classify each token in the context of the full token stream.
            foreach (var rawToken in rawTokens)
            {
                AssignTokenTypes(rawToken, rawTokens);
            }

            // Drop lines made up solely of replacement tokens before returning.
            return RemoveLinesThatOnlyContainReplacements(rawTokens);
        }
        // NOTE(review): no [Fact]/[Theory] attribute is visible on this method
        // in this chunk — confirm the test is actually discovered by the runner.
        public void TokenizeDocumentsTest()
        {
            // Arrange: one block of two documents whose text contains
            // punctuation that tokenization is expected to strip.
            var documents = new List<Document<string>>
            {
                new Document<string>(
                    new DocumentMetadata(new DocumentId(0), "Title 1"),
                    "Title 1. This is, the first document"),
                new Document<string>(
                    new DocumentMetadata(new DocumentId(1), "Title 2"),
                    "Title 2; This is the second. document"),
            };

            var corpus = new List<Block<string>>
            {
                Block<string>.Make(0, documents),
            };

            // Capture every block the tokenizer writes out.
            var captured = new List<IEnumerable<TokenizedBlock>>();

            var readerMock = new Mock<ICorpusReader<string>>();
            readerMock.Setup(r => r.Read()).Returns(corpus);

            var writerMock = new Mock<ICorpusWriter<IEnumerable<string>>>();
            writerMock
                .Setup(w => w.Write(It.IsAny<IEnumerable<TokenizedBlock>>()))
                .Callback((IEnumerable<TokenizedBlock> block) => captured.Add(block));

            // Act: transform the mocked corpus with a case-preserving word tokenizer.
            var tokenizer = new DocumentTokenizer(new WordRegexTokenizer(lowerCase: false));
            tokenizer.Transform(readerMock.Object, writerMock.Object);

            // Assert: exactly one write, punctuation removed, casing preserved.
            Assert.Single(captured);
            var firstBlock = captured[0].First();

            Assert.Equal(2, firstBlock.Documents.Count);
            Assert.Equal("Title 1 This is the first document".Split(), firstBlock.Documents[0].Data);
            Assert.Equal("Title 2 This is the second document".Split(), firstBlock.Documents[1].Data);
        }