/// <summary>
/// Smoke test: indexes the blocks of the small XML test corpus, builds a
/// vector extractor from the index and prints the column vector extracted
/// for the query "love time fortune".
/// </summary>
/// <remarks>
/// The method makes no assertions; the extracted vector is only written to
/// the console for manual inspection.
/// </remarks>
public void CreateVectorExtractor()
{
    var index = new DocumentIndex();

    // Parse every corpus entry as XML up front.
    var parsedDocuments = TestData.TestCorpus()
        .Select(xml => XDocument.Parse(xml))
        .ToList();

    // Tokenise the text content of each document root and flatten the
    // resulting corpus blocks into a single list.
    var corpusBlocks = parsedDocuments
        .SelectMany(doc => new Corpus(index.Tokeniser.Tokenise(doc.Root.Value)).Blocks)
        .ToList();

    // Each block becomes one document whose id is its zero-based position.
    var tokenisedDocuments = corpusBlocks
        .Select((block, position) => new TokenisedTextDocument(position.ToString(), block))
        .ToList();

    index.IndexDocuments(tokenisedDocuments);

    var extractor = index.CreateVectorExtractor();
    var queryTokens = index.Tokeniser.Tokenise("love time fortune");
    var columnVector = extractor.ExtractColumnVector(queryTokens);

    Console.WriteLine(columnVector);
}
/// <summary>
/// Smoke test: indexes the blocks of the "shakespeare.txt" resource, builds a
/// vector extractor from the index (passing 1024 to
/// <c>CreateVectorExtractor</c>) and prints the column vector extracted for
/// the query "love time fortune".
/// </summary>
/// <remarks>
/// The method makes no assertions; the extracted vector is only written to
/// the console for manual inspection.
/// </remarks>
public void CreateVectorExtractor_LargeCorpus()
{
    var index = new DocumentIndex();

    using (var corpusStream = GetResource("shakespeare.txt"))
    {
        var corpus = new Corpus(corpusStream.Tokenise());

        // The indexed Select overload derives each id from the element's
        // position instead of mutating a captured counter. The sequence is
        // still lazy, but every enumeration now yields the same ids
        // ("0", "1", ...), even if IndexDocuments walks it more than once.
        var documents = corpus.Blocks
            .Select((block, position) => new TokenisedTextDocument(position.ToString(), block));

        // Indexing must complete inside the using block, while the
        // underlying resource stream is still open.
        index.IndexDocuments(documents);
    }

    // NOTE(review): the meaning of the 1024 argument is not visible from
    // here — confirm against DocumentIndex.CreateVectorExtractor.
    var ve = index.CreateVectorExtractor(1024);
    var vect = ve.ExtractColumnVector(index.Tokeniser.Tokenise("love time fortune"));

    Console.WriteLine(vect);
}