public void Index_Then_Search() { var search = new DocumentIndex(); var docs = TestData.TestCorpus().Select(t => XDocument.Parse(t)).ToList().AsQueryable(); search.IndexDocuments(docs, d => d.Root.Attribute("id").Value); var matches = search.SearchInternal("love time"); Assert.That(matches.All(m => m.Value > 0)); Assert.That(matches.First().Key, Is.EqualTo("3")); }
public void Index_Then_Export_ReturnsExpectedXml() { var search = new DocumentIndex(); var docs = TestData.TestCorpus().Select(t => XDocument.Parse(t)).ToList().AsQueryable(); search.IndexDocuments(docs, d => d.Root.Attribute("id").Value); var xml = search.ExportAsXml(); Assert.That(xml.Root.Name.LocalName, Is.EqualTo("index")); Assert.That(xml.Root.Attribute("doc-count").Value, Is.EqualTo(docs.Count().ToString())); Assert.That(xml.Root.Elements().All(e => e.Name.LocalName == "term")); xml.WriteTo(XmlWriter.Create(Console.Out, new XmlWriterSettings() { Indent = true })); }
public void Export_Then_Import_RestoresState() { var index1 = new DocumentIndex(); var docs = TestData.TestCorpus().Select(t => XDocument.Parse(t)).ToList().AsQueryable(); index1.IndexDocuments(docs, d => d.Root.Attribute("id").Value); var xml = index1.ExportAsXml(); var index2 = new DocumentIndex(index1.Tokeniser); index2.ImportXml(xml); var xml2 = index2.ExportAsXml(); Assert.That(xml.ToString(), Is.EqualTo(xml2.ToString())); }
public void CreateVectorExtractor() { var index = new DocumentIndex(); var docs = TestData.TestCorpus().Select(t => XDocument.Parse(t)).ToList().AsQueryable(); int id = 0; var blocks = docs.SelectMany(d => new Corpus(index.Tokeniser.Tokenise(d.Root.Value)).Blocks.ToList()).ToList(); var tdocs = blocks .Select(b => new TokenisedTextDocument((id++).ToString(), b)) .ToList(); index.IndexDocuments(tdocs); var ve = index.CreateVectorExtractor(); var vect = ve.ExtractColumnVector(index.Tokeniser.Tokenise("love time fortune")); Console.WriteLine(vect); }
public void CreateVectorExtractor_LargeCorpus() { var index = new DocumentIndex(); using (var corpusStream = GetResource("shakespeare.txt")) { var corpus = new Corpus(corpusStream.Tokenise()); int id = 0; index.IndexDocuments(corpus.Blocks.Select(b => new TokenisedTextDocument((id++).ToString(), b))); } var ve = index.CreateVectorExtractor(1024); var vect = ve.ExtractColumnVector(index.Tokeniser.Tokenise("love time fortune")); Console.WriteLine(vect); }