public void Can_traverse_index_in_memory()
{
    // Build an in-memory vector tree from the test corpus and dump it for inspection.
    var model = new BagOfCharsModel();
    var tree = model.CreateTree(model, _data);

    Debug.WriteLine(PathFinder.Visualize(tree));

    Assert.DoesNotThrow(() =>
    {
        // Every indexed word must be findable again, scoring at or above
        // the model's identity threshold.
        foreach (var word in _data)
        {
            foreach (var vector in model.Tokenize(word))
            {
                var match = PathFinder.ClosestMatch(tree, vector, model);

                if (match == null)
                {
                    throw new Exception($"unable to find {word} in tree.");
                }

                if (match.Score < model.IdenticalAngle)
                {
                    throw new Exception($"unable to score {word}.");
                }

                Debug.WriteLine($"{word} matched with {match.Node.Vector.Label} with {match.Score * 100}% certainty.");
            }
        }
    });
}
// Reads a single document from a collection and writes each selected field's
// name, its token vector tree and its raw token list to the console.
public void Run(IDictionary<string, string> args, ILogger logger)
{
    // Required arguments.
    var dataDirectory = args["dataDirectory"];
    var collection = args["collection"];
    var documentId = long.Parse(args["documentId"]);

    // Field names to read, comma- or space-separated.
    var select = new HashSet<string>(
        args["select"].Split(new char[] { ',', ' ' }, StringSplitOptions.RemoveEmptyEntries));

    var collectionId = collection.ToHash();
    var model = new BagOfCharsModel();

    using (var sessionFactory = new SessionFactory(dataDirectory, logger))
    using (var documents = new DocumentStreamSession(sessionFactory))
    using (var documentReader = new DocumentReader(collectionId, sessionFactory))
    {
        var document = documents.ReadDocument((collectionId, documentId), select, documentReader);

        foreach (var field in document.Fields)
        {
            // Tokenize the field value and fold the tokens into a vector tree.
            var fieldTokens = model.Tokenize(field.Value.ToString());
            var fieldTree = new VectorNode();

            foreach (var token in fieldTokens)
            {
                fieldTree.MergeOrAdd(new VectorNode(token), model);
            }

            Console.WriteLine(field.Name);
            Console.WriteLine(PathFinder.Visualize(fieldTree));
            Console.WriteLine(string.Join('\n', fieldTokens));
        }
    }
}
// Imports a slice of a Wikipedia dump into a document collection, tokenizing
// every indexable field along the way and logging tokenization throughput
// via BatchDebugger.
public void Run(IDictionary <string, string> args, ILogger logger)
{
    // Required arguments.
    var dataDirectory = args["dataDirectory"];
    var fileName = args["fileName"];
    var collection = args["collection"];

    // Optional paging/sampling arguments with defaults.
    var skip = args.ContainsKey("skip") ? int.Parse(args["skip"]) : 0;
    var take = args.ContainsKey("take") ? int.Parse(args["take"]) : int.MaxValue;
    var sampleSize = args.ContainsKey("sampleSize") ? int.Parse(args["sampleSize"]) : 1000;
    var pageSize = args.ContainsKey("pageSize") ? int.Parse(args["pageSize"]) : 100000;

    var collectionId = collection.ToHash();

    // Fields persisted to the store vs. fields considered for indexing.
    var fieldsToStore = new HashSet <string> { "language", "wikibase_item", "title", "text" };
    var fieldsToIndex = new HashSet <string> { "language", "title", "text" };

    // An explicit take of 0 means "take everything".
    if (take == 0)
    {
        take = int.MaxValue;
    }

    var model = new BagOfCharsModel();
    var payload = WikipediaHelper.ReadWP(fileName, skip, take, fieldsToStore, fieldsToIndex);
    var debugger = new BatchDebugger(logger, sampleSize);

    using (var sessionFactory = new SessionFactory(dataDirectory, logger))
    {
        using (var writeSession = new WriteSession(new DocumentWriter(collectionId, sessionFactory)))
        {
            foreach (var page in payload.Batch(pageSize))
            {
                // NOTE(review): indexSession is created and disposed once per page
                // but never written to (no Put call) -- tokens are only counted
                // below. Looks like either dead code or a missing indexing step;
                // confirm intent before removing.
                using (var indexSession = new IndexSession <string>(model, model))
                {
                    foreach (var document in page)
                    {
                        writeSession.Put(document);

                        foreach (var field in document.IndexableFields)
                        {
                            // Enumerating forces tokenization; each token is
                            // counted so BatchDebugger can report throughput.
                            foreach (var token in model.Tokenize((string)field.Value))
                            {
                                debugger.Step();
                            }
                        }
                    }
                }
            }

            logger.LogInformation($"tokenized {debugger.StepCount} in {debugger.Time}.");
        }
    }
}
public void Can_tokenize()
{
    // The input deliberately contains an "En dash" (U+2013), not an ASCII hyphen:
    // https://unicode-table.com/en/#2013
    const string data = "Ferriman–Gallwey score";

    var model = new BagOfCharsModel();
    var labels = model.Tokenize(data)
        .Select(x => x.Label.ToString())
        .ToList();

    // Expected tokens, carved out of the input by position:
    // "Ferriman" (0..7), "Gallwey" (9..15), "score" (17..21).
    var expected = new[]
    {
        data.Substring(0, 8),
        data.Substring(9, 7),
        data.Substring(17, 5)
    };

    foreach (var token in expected)
    {
        Assert.IsTrue(labels.Contains(token));
    }
}
// Serializes an in-memory index to (index/vector/page) streams, then verifies
// that every indexed word can be resolved again by reading the streams back
// through a ColumnReader.
public void Can_traverse_streamed()
{
    var model = new BagOfCharsModel();
    var tree = model.CreateTree(model, _data);

    using (var indexStream = new MemoryStream())
    using (var vectorStream = new MemoryStream())
    using (var pageStream = new MemoryStream())
    {
        // keepStreamOpen: true keeps the backing MemoryStreams readable after
        // the writers are disposed.
        using (var writer = new ColumnWriter(indexStream, keepStreamOpen: true))
        {
            writer.CreatePage(tree, vectorStream, new PageIndexWriter(pageStream, keepStreamOpen: true));
        }

        // Rewind the page index before reading.
        // NOTE(review): indexStream and vectorStream are NOT rewound here --
        // presumably ColumnReader seeks on them itself; confirm.
        pageStream.Position = 0;

        Assert.DoesNotThrow(() =>
        {
            using (var reader = new ColumnReader(new PageIndexReader(pageStream), indexStream, vectorStream, _sessionFactory, _loggerFactory.CreateLogger <ColumnReader>()))
            {
                // Every indexed word must be findable on "disk" at or above
                // the model's identity threshold.
                foreach (var word in _data)
                {
                    foreach (var queryVector in model.Tokenize(word))
                    {
                        var hit = reader.ClosestMatch(queryVector, model);

                        if (hit == null)
                        {
                            throw new Exception($"unable to find {word} in tree.");
                        }

                        if (hit.Score < model.IdenticalAngle)
                        {
                            throw new Exception($"unable to score {word}.");
                        }

                        Debug.WriteLine($"{word} matched vector in disk with {hit.Score * 100}% certainty.");
                    }
                }
            }
        });
    }
}
public void Can_produce_traversable_in_memory_index()
{
    // Feed every word of the corpus through an IndexSession, then verify the
    // resulting in-memory tree resolves each word at the identity threshold.
    var model = new BagOfCharsModel();
    VectorNode tree;

    using (var indexSession = new IndexSession<string>(model, model))
    {
        for (long docId = 0; docId < _data.Length; docId++)
        {
            indexSession.Put(docId, 0, _data[docId]);
        }

        tree = indexSession.GetInMemoryIndex()[0];
    }

    Debug.WriteLine(PathFinder.Visualize(tree));

    Assert.DoesNotThrow(() =>
    {
        foreach (var word in _data)
        {
            foreach (var vector in model.Tokenize(word))
            {
                var match = PathFinder.ClosestMatch(tree, vector, model);

                if (match == null)
                {
                    throw new Exception($"unable to find {word} in tree.");
                }

                if (match.Score < model.IdenticalAngle)
                {
                    throw new Exception($"unable to score {word}.");
                }

                Debug.WriteLine($"{word} matched with {match.Node.Vector.Label} with {match.Score * 100}% certainty.");
            }
        }
    });
}