/// <summary>
/// Reads a single document and, for every field in it, prints the field name,
/// a visualization of the vector tree built from the field's tokens, and the
/// tokens themselves (one per line).
/// </summary>
/// <param name="args">
/// Must contain the keys "dataDirectory", "collection", "documentId" (parsed as
/// a long) and "select" (a list separated by commas and/or spaces; presumably
/// the field names to read — confirm against ReadDocument).
/// </param>
/// <param name="logger">Logger passed on to the session factory.</param>
public void Run(IDictionary<string, string> args, ILogger logger)
{
    string dataDirectory = args["dataDirectory"];
    string collection = args["collection"];
    long documentId = long.Parse(args["documentId"]);

    char[] separators = { ',', ' ' };
    string[] selectEntries = args["select"].Split(separators, StringSplitOptions.RemoveEmptyEntries);
    var select = new HashSet<string>(selectEntries);

    var collectionId = collection.ToHash();
    var model = new BagOfCharsModel();

    using (var sessionFactory = new SessionFactory(dataDirectory, logger))
    using (var documents = new DocumentStreamSession(sessionFactory))
    using (var documentReader = new DocumentReader(collectionId, sessionFactory))
    {
        var document = documents.ReadDocument((collectionId, documentId), select, documentReader);

        foreach (var field in document.Fields)
        {
            var tokens = model.Tokenize(field.Value.ToString());

            // Fold every token of this field into one tree.
            var root = new VectorNode();

            foreach (var token in tokens)
            {
                var node = new VectorNode(token);
                root.MergeOrAdd(node, model);
            }

            Console.WriteLine(field.Name);
            Console.WriteLine(PathFinder.Visualize(root));
            Console.WriteLine(string.Join('\n', tokens));
        }
    }
}
/// <summary>
/// Looks in <paramref name="dir"/> for a "*.docs" file whose name starts with the
/// hash of <paramref name="collection"/> and, if one is found, reads that
/// collection's documents and writes them to the index in batches. The duration of
/// each batch and of the whole run is written to the log.
/// </summary>
/// <param name="dir">Directory that is searched for "*.docs" files and handed to the session factory.</param>
/// <param name="collection">Name of the collection to index.</param>
/// <param name="skip">Number of documents to skip; ignored unless greater than zero.</param>
/// <param name="take">Maximum number of documents to index; ignored unless greater than zero.</param>
/// <param name="batchSize">Number of documents handed to each indexing job.</param>
private static async Task Index(string dir, string collection, int skip, int take, int batchSize)
{
    var timer = Stopwatch.StartNew();
    var files = Directory.GetFiles(dir, "*.docs");

    // Loop-invariant: compute the expected file-name prefix once.
    var collectionHash = collection.ToHash().ToString();

    // The factory is disposable; release it even if indexing throws.
    using (var sessionFactory = new SessionFactory(
        dir,
        new LatinTokenizer(),
        new IniConfiguration(Path.Combine(Directory.GetCurrentDirectory(), "sir.ini"))))
    {
        var batchNo = 0;

        foreach (var docFileName in files)
        {
            var name = Path.GetFileNameWithoutExtension(docFileName)
                .Split(".", StringSplitOptions.RemoveEmptyEntries);

            // A file named exactly ".docs" yields no name parts; skip it instead of
            // failing on name[0].
            if (name.Length == 0 || name[0] != collectionHash)
            {
                continue;
            }

            using (var readSession = new DocumentStreamSession(collection, sessionFactory))
            {
                var docs = readSession.ReadDocs();

                if (skip > 0)
                {
                    docs = docs.Skip(skip);
                }

                if (take > 0)
                {
                    docs = docs.Take(take);
                }

                var writeTimer = new Stopwatch();

                foreach (var batch in docs.Batch(batchSize))
                {
                    writeTimer.Restart();

                    var job = new IndexingJob(collection, batch);

                    // One index session per batch; disposed before the batch is logged.
                    using (var indexSession = sessionFactory.CreateIndexSession(collection))
                    {
                        await indexSession.Write(job);
                    }

                    _log.Log($"batch {batchNo} done in {writeTimer.Elapsed}");
                    batchNo++;
                }
            }

            // Only the first matching file is processed.
            break;
        }
    }

    _log.Log($"indexing took {timer.Elapsed}");
}