/// <summary>
/// Indexes every block of the "shakespeare.txt" resource as a separate document,
/// creates a vector extractor from the resulting index and writes the column vector
/// extracted for the query "love time fortune" to the console.
/// </summary>
/// <remarks>
/// Nothing is asserted here; the extracted vector is only printed.
/// </remarks>
public void CreateVectorExtractor_LargeCorpus()
{
    var index = new DocumentIndex();

    using (var corpusStream = GetResource("shakespeare.txt"))
    {
        var corpus = new Corpus(corpusStream.Tokenise());

        // Use the index-aware Select overload instead of mutating a captured counter:
        // the query is deferred, so a captured "id++" would hand out different ids
        // if the sequence were ever enumerated more than once. Ids still start at 0.
        var documents = corpus.Blocks.Select(
            (block, position) => new TokenisedTextDocument(position.ToString(), block));

        // Indexing is kept inside the using block, as in the original ordering,
        // so the call is made while the resource stream is still open.
        index.IndexDocuments(documents);
    }

    var ve = index.CreateVectorExtractor(1024);
    var vect = ve.ExtractColumnVector(index.Tokeniser.Tokenise("love time fortune"));

    Console.WriteLine(vect);
}
/// <summary>
/// Adds a document to the index and then passes the text of each of its blocks
/// to the Markov chain for analysis.
/// </summary>
/// <param name="document">The tokenised document to index and analyse.</param>
private void Append(TokenisedTextDocument document)
{
    _index.IndexDocument(document);

    // A Corpus built from the document's tokens supplies the blocks to analyse.
    foreach (var tokenBlock in new Corpus(document.Tokens).Blocks)
    {
        // Only the Text of each element in the block is handed to the chain.
        var textSequence = tokenBlock.Select(token => token.Text);
        _markovChain.AnalyseSequence(textSequence);
    }
}