public void Run(IDictionary<string, string> args, ILogger logger)
{
    // Index a Common Crawl WET file into the "cc_wet" collection:
    // only "description" is indexed, only "url" is stored.
    var dataDirectory = args["dataDirectory"];
    var fileName = args["fileName"];
    var collectionId = "cc_wet".ToHash();
    var model = new BagOfCharsModel();
    var storedFields = new HashSet<string> { "url" };
    var indexedFields = new HashSet<string> { "description" };

    using (var sessionFactory = new SessionFactory(dataDirectory, logger))
    {
        // Start from a clean slate so repeated runs do not accumulate data.
        sessionFactory.Truncate(collectionId);

        var documents = ReadWetFile(fileName)
            .Select(fields => new Document(
                fields.Select(field => new Field(
                    field.Key,
                    field.Value,
                    index: indexedFields.Contains(field.Key),
                    store: storedFields.Contains(field.Key))).ToList()));

        sessionFactory.Write(collectionId, documents, model, reportSize: 1000);
    }
}
public static IServiceProvider Configure(IServiceCollection services)
{
    // Wire up the search engine's services and return the built provider.
    var assemblyPath = Directory.GetCurrentDirectory();
    var config = new KeyValueConfiguration(Path.Combine(assemblyPath, "sir.ini"));

    services.Add(new ServiceDescriptor(typeof(IConfigurationProvider), config));

    // NOTE(review): an intermediate provider is built here solely to obtain a
    // logger factory; the final provider is built again at the end. Confirm
    // this double build is intended.
    var loggerFactory = services.BuildServiceProvider().GetService<ILoggerFactory>();
    var logger = loggerFactory.CreateLogger("Sir");

    var model = new BagOfCharsModel();
    var sessionFactory = new SessionFactory(@"c:\data\resin", logger);
    var queryParser = new QueryParser<string>(sessionFactory, model, logger);
    var httpParser = new HttpQueryParser(queryParser);

    services.AddSingleton(typeof(IModel<string>), model);
    services.AddSingleton(typeof(ISessionFactory), sessionFactory);
    services.AddSingleton(typeof(SessionFactory), sessionFactory);
    services.AddSingleton(typeof(QueryParser<string>), queryParser);
    services.AddSingleton(typeof(HttpQueryParser), httpParser);
    services.AddSingleton(typeof(IHttpWriter), new HttpWriter(sessionFactory));
    services.AddSingleton(typeof(IHttpReader), new HttpReader(
        sessionFactory,
        httpParser,
        loggerFactory.CreateLogger<HttpReader>()));

    return services.BuildServiceProvider();
}
public void Can_traverse_index_in_memory()
{
    // Build an in-memory tree from the sample data, then verify every
    // tokenized word can be found again with an identical-angle score.
    var model = new BagOfCharsModel();
    var tree = model.CreateTree(model, _data);

    Debug.WriteLine(PathFinder.Visualize(tree));

    Assert.DoesNotThrow(() =>
    {
        foreach (var word in _data)
        {
            foreach (var vector in model.Tokenize(word))
            {
                var hit = PathFinder.ClosestMatch(tree, vector, model);

                if (hit == null)
                    throw new Exception($"unable to find {word} in tree.");

                if (hit.Score < model.IdenticalAngle)
                    throw new Exception($"unable to score {word}.");

                Debug.WriteLine($"{word} matched with {hit.Node.Vector.Label} with {hit.Score * 100}% certainty.");
            }
        }
    });
}
public void Run(IDictionary<string, string> args, ILogger logger)
{
    // Store and index a Wikipedia dump, one batch ("page") at a time.
    var dataDirectory = args["dataDirectory"];
    var fileName = args["fileName"];
    var collection = args["collection"];
    var skip = args.ContainsKey("skip") ? int.Parse(args["skip"]) : 0;
    var take = args.ContainsKey("take") ? int.Parse(args["take"]) : int.MaxValue;
    var sampleSize = args.ContainsKey("sampleSize") ? int.Parse(args["sampleSize"]) : 1000;
    var pageSize = args.ContainsKey("pageSize") ? int.Parse(args["pageSize"]) : 100000;
    var collectionId = collection.ToHash();
    var fieldsToStore = new HashSet<string> { "language", "wikibase_item", "title", "text", "url" };
    var fieldsToIndex = new HashSet<string> { "title", "text" };

    // "take=0" is shorthand for "take everything".
    if (take == 0)
        take = int.MaxValue;

    var model = new BagOfCharsModel();
    var payload = WikipediaHelper.ReadWP(fileName, skip, take, fieldsToStore, fieldsToIndex);

    using (var sessionFactory = new SessionFactory(dataDirectory, logger))
    {
        var debugger = new IndexDebugger(logger, sampleSize);

        using (var writeSession = new WriteSession(new DocumentWriter(collectionId, sessionFactory)))
        {
            foreach (var page in payload.Batch(pageSize))
            {
                // One index stream and one in-memory index session per batch.
                using (var indexStream = new WritableIndexStream(collectionId, sessionFactory, logger: logger))
                using (var indexSession = new IndexSession<string>(model, model))
                {
                    foreach (var document in page)
                    {
                        writeSession.Put(document);

                        foreach (var field in document.IndexableFields)
                        {
                            indexSession.Put(document.Id, field.KeyId, (string)field.Value);
                        }

                        debugger.Step(indexSession);
                    }

                    indexStream.Write(indexSession.GetInMemoryIndex());
                }
            }
        }
    }
}
public void Run(IDictionary<string, string> args, ILogger logger)
{
    // Read one stored document and print a visualized vector tree per field.
    var dataDirectory = args["dataDirectory"];
    var collection = args["collection"];
    var documentId = long.Parse(args["documentId"]);
    var select = new HashSet<string>(
        args["select"].Split(new char[] { ',', ' ' }, StringSplitOptions.RemoveEmptyEntries));
    var collectionId = collection.ToHash();
    var model = new BagOfCharsModel();

    using (var sessionFactory = new SessionFactory(dataDirectory, logger))
    using (var documents = new DocumentStreamSession(sessionFactory))
    using (var documentReader = new DocumentReader(collectionId, sessionFactory))
    {
        var document = documents.ReadDocument((collectionId, documentId), select, documentReader);

        foreach (var field in document.Fields)
        {
            var tokens = model.Tokenize(field.Value.ToString());

            // Merge every token into a fresh tree so the field's
            // vector space can be visualized.
            var tree = new VectorNode();

            foreach (var token in tokens)
            {
                tree.MergeOrAdd(new VectorNode(token), model);
            }

            Console.WriteLine(field.Name);
            Console.WriteLine(PathFinder.Visualize(tree));
            Console.WriteLine(string.Join('\n', tokens));
        }
    }
}
static void Main(string[] args)
{
    var loggerFactory = LoggerFactory.Create(builder =>
    {
        builder
            .AddFilter("Microsoft", LogLevel.Warning)
            .AddFilter("System", LogLevel.Warning)
            .AddFilter("Sir", LogLevel.Information)
            .AddConsole();
    });

    var logger = loggerFactory.CreateLogger("Sir");

    logger.LogInformation($"processing command: {string.Join(" ", args)}");

    var model = new BagOfCharsModel();
    var command = args[0].ToLower();
    var flags = ParseArgs(args);
    var plugin = ResolvePlugin(command);
    var time = Stopwatch.StartNew();

    if (plugin != null)
    {
        // Plugins get first crack at the command; failures are logged,
        // not rethrown, so the elapsed time is still reported.
        try
        {
            plugin.Run(flags, logger);
        }
        catch (Exception ex)
        {
            logger.LogError(ex, ex.Message);
        }
    }
    else
    {
        // Built-in commands.
        switch (command)
        {
            case "slice":
                Slice(flags);
                break;
            case "truncate":
                Truncate(flags["dataDirectory"], flags["collection"], logger);
                break;
            case "truncate-index":
                TruncateIndex(flags["dataDirectory"], flags["collection"], logger);
                break;
            case "optimize":
                Optimize(flags, model, logger);
                break;
            default:
                logger.LogInformation("unknown command: {0}", command);
                return;
        }
    }

    logger.LogInformation($"executed {command} in {time.Elapsed}");
}
public void Can_search_filestreamed_with_multiple_pages()
{
    const string collection = "Can_search_streamed_with_one_page_per_document";
    const string fieldName = "description";

    var model = new BagOfCharsModel();
    var collectionId = collection.ToHash();

    _sessionFactory.Truncate(collectionId);

    // Write each document into its own index page.
    using (var stream = new WritableIndexStream(collectionId, _sessionFactory))
    using (var writeSession = new WriteSession(new DocumentWriter(collectionId, _sessionFactory)))
    {
        var keyId = writeSession.EnsureKeyExists(fieldName);

        for (long i = 0; i < _data.Length; i++)
        {
            var data = _data[i];

            using (var indexSession = new IndexSession<string>(model, model))
            {
                var document = new Document(new Field[] { new Field(fieldName, data, index: true, store: true) });

                writeSession.Put(document);
                indexSession.Put(document.Id, keyId, data);
                stream.Write(indexSession.GetInMemoryIndex());
            }
        }
    }

    // Every word must be found across the multi-page index with an
    // identical-angle score.
    var queryParser = new QueryParser<string>(_sessionFactory, model);

    using (var searchSession = new SearchSession(_sessionFactory, model, new PostingsReader(_sessionFactory)))
    {
        Assert.DoesNotThrow(() =>
        {
            foreach (var word in _data)
            {
                var query = queryParser.Parse(collection, word, fieldName, fieldName, and: true, or: false);
                var result = searchSession.Search(query, 0, 1);
                var document = result.Documents.FirstOrDefault();

                if (document == null)
                    throw new Exception($"unable to find {word}.");

                if (document.Score < model.IdenticalAngle)
                    throw new Exception($"unable to score {word}.");

                Debug.WriteLine($"{word} matched with {document.Score * 100}% certainty.");
            }
        });
    }
}
public void Run(IDictionary<string, string> args, ILogger logger)
{
    // Measure tokenization throughput over a Wikipedia dump while
    // writing the documents to the store.
    var dataDirectory = args["dataDirectory"];
    var fileName = args["fileName"];
    var collection = args["collection"];
    var skip = args.ContainsKey("skip") ? int.Parse(args["skip"]) : 0;
    var take = args.ContainsKey("take") ? int.Parse(args["take"]) : int.MaxValue;
    var sampleSize = args.ContainsKey("sampleSize") ? int.Parse(args["sampleSize"]) : 1000;
    var pageSize = args.ContainsKey("pageSize") ? int.Parse(args["pageSize"]) : 100000;
    var collectionId = collection.ToHash();
    var fieldsToStore = new HashSet<string> { "language", "wikibase_item", "title", "text" };
    var fieldsToIndex = new HashSet<string> { "language", "title", "text" };

    // "take=0" is shorthand for "take everything".
    if (take == 0)
        take = int.MaxValue;

    var model = new BagOfCharsModel();
    var payload = WikipediaHelper.ReadWP(fileName, skip, take, fieldsToStore, fieldsToIndex);
    var debugger = new BatchDebugger(logger, sampleSize);

    using (var sessionFactory = new SessionFactory(dataDirectory, logger))
    {
        using (var writeSession = new WriteSession(new DocumentWriter(collectionId, sessionFactory)))
        {
            foreach (var page in payload.Batch(pageSize))
            {
                // NOTE(review): this index session is created but never
                // written to — only tokenization is stepped below.
                // Confirm that is intended.
                using (var indexSession = new IndexSession<string>(model, model))
                {
                    foreach (var document in page)
                    {
                        writeSession.Put(document);

                        foreach (var field in document.IndexableFields)
                        {
                            foreach (var token in model.Tokenize((string)field.Value))
                            {
                                debugger.Step();
                            }
                        }
                    }
                }
            }

            logger.LogInformation($"tokenized {debugger.StepCount} in {debugger.Time}.");
        }
    }
}
static void Main(string[] args)
{
    var model = new BagOfCharsModel();

    // "--build-graph" starts the interactive tool; otherwise the two
    // positional arguments are compared against each other.
    if (args[0] == "--build-graph")
    {
        RunInteractiveGraphBuilder(model);
        return;
    }

    Similarity(args[0], args[1], model);
    CompareToBaseVector(args[0], args[1], model);
}
public void Can_tokenize()
{
    // NOTE: the string contains an "En dash" character: https://unicode-table.com/en/#2013
    const string data = "Ferriman–Gallwey score";

    var model = new BagOfCharsModel();
    var tokens = model.Tokenize(data);
    var labels = tokens.Select(x => x.Label.ToString()).ToList();

    // The dash and the space are expected to act as token boundaries.
    var expected = new[]
    {
        data.Substring(0, 8),   // "Ferriman"
        data.Substring(9, 7),   // "Gallwey"
        data.Substring(17, 5)   // "score"
    };

    foreach (var token in expected)
    {
        Assert.IsTrue(labels.Contains(token));
    }
}
public void Can_traverse_streamed()
{
    var model = new BagOfCharsModel();
    var tree = model.CreateTree(model, _data);

    using (var indexStream = new MemoryStream())
    using (var vectorStream = new MemoryStream())
    using (var pageStream = new MemoryStream())
    {
        // Serialize the tree into the three backing streams.
        using (var writer = new ColumnWriter(indexStream, keepStreamOpen: true))
        {
            writer.CreatePage(tree, vectorStream, new PageIndexWriter(pageStream, keepStreamOpen: true));
        }

        // Rewind so the page index can be read back from the start.
        pageStream.Position = 0;

        Assert.DoesNotThrow(() =>
        {
            using (var reader = new ColumnReader(
                new PageIndexReader(pageStream),
                indexStream,
                vectorStream,
                _sessionFactory,
                _loggerFactory.CreateLogger<ColumnReader>()))
            {
                foreach (var word in _data)
                {
                    foreach (var vector in model.Tokenize(word))
                    {
                        var hit = reader.ClosestMatch(vector, model);

                        if (hit == null)
                            throw new Exception($"unable to find {word} in tree.");

                        if (hit.Score < model.IdenticalAngle)
                            throw new Exception($"unable to score {word}.");

                        Debug.WriteLine($"{word} matched vector in disk with {hit.Score * 100}% certainty.");
                    }
                }
            }
        });
    }
}
public void Can_produce_traversable_in_memory_index()
{
    var model = new BagOfCharsModel();
    VectorNode tree;

    // Feed every sample string into a single column (keyId 0) and keep
    // the resulting in-memory tree for traversal.
    using (var indexSession = new IndexSession<string>(model, model))
    {
        for (long docId = 0; docId < _data.Length; docId++)
        {
            indexSession.Put(docId, 0, _data[docId]);
        }

        tree = indexSession.GetInMemoryIndex()[0];
    }

    Debug.WriteLine(PathFinder.Visualize(tree));

    Assert.DoesNotThrow(() =>
    {
        foreach (var word in _data)
        {
            foreach (var vector in model.Tokenize(word))
            {
                var hit = PathFinder.ClosestMatch(tree, vector, model);

                if (hit == null)
                    throw new Exception($"unable to find {word} in tree.");

                if (hit.Score < model.IdenticalAngle)
                    throw new Exception($"unable to score {word}.");

                Debug.WriteLine($"{word} matched with {hit.Node.Vector.Label} with {hit.Score * 100}% certainty.");
            }
        }
    });
}