/// <summary>
/// Reads documents from a Wikipedia dump file, maps each one to a submission payload and
/// sends the payloads to an HTTP endpoint in batches, printing per-batch timing to the console.
/// </summary>
/// <param name="args">Must contain the keys "fileName", "uri", "count" and "batchSize".</param>
/// <param name="logger">Not used by this method.</param>
public void Run(IDictionary<string, string> args, ILogger logger)
{
    var sourceFile = args["fileName"];
    var endpoint = new Uri(args["uri"]);
    var documentCount = int.Parse(args["count"]);
    var docsPerBatch = int.Parse(args["batchSize"]);
    var batchIndex = 0;

    using (var client = new HttpClient())
    {
        // Lazily projects each source record into the dictionary shape that gets submitted.
        var documents = WikipediaHelper.ReadWP(sourceFile, 0, documentCount)
            .Select(source => new Dictionary<string, object>
            {
                { "_language", source["language"].ToString() },
                { "_url", string.Format("www.wikipedia.org/search-redirect.php?family=wikipedia&language={0}&search={1}", source["language"], source["title"]) },
                { "title", source["title"] },
                { "body", source["text"] }
            });

        foreach (var batch in documents.Batch(docsPerBatch))
        {
            var timer = Stopwatch.StartNew();
            Submit(batch, endpoint, client);
            timer.Stop();

            var elapsed = timer.Elapsed;

            // The rate is derived from the configured batch size, not from the number of
            // documents actually contained in this batch.
            var docsPerSecond = (int)(docsPerBatch / elapsed.TotalSeconds);

            Console.WriteLine($"batch {batchIndex++} took {elapsed} {docsPerSecond} docs/s");
        }
    }
}
/// <summary>
/// Reads documents from a Wikipedia dump file, stores them in a collection and indexes
/// their indexable fields one page at a time, writing each page's in-memory index to an
/// index stream.
/// </summary>
/// <param name="args">
/// Required keys: "dataDirectory", "fileName", "collection".
/// Optional keys: "skip" (default 0), "take" (default <see cref="int.MaxValue"/>; a value
/// of 0 is also treated as <see cref="int.MaxValue"/>), "sampleSize" (default 1000) and
/// "pageSize" (default 100000).
/// </param>
/// <param name="logger">Handed to the session factory, the index debugger and the index stream.</param>
public void Run(IDictionary<string, string> args, ILogger logger)
{
    var dataDirectory = args["dataDirectory"];
    var sourceFile = args["fileName"];
    var collectionName = args["collection"];

    var skip = 0;
    if (args.ContainsKey("skip"))
    {
        skip = int.Parse(args["skip"]);
    }

    var take = int.MaxValue;
    if (args.ContainsKey("take"))
    {
        take = int.Parse(args["take"]);
    }

    var sampleSize = 1000;
    if (args.ContainsKey("sampleSize"))
    {
        sampleSize = int.Parse(args["sampleSize"]);
    }

    var pageSize = 100000;
    if (args.ContainsKey("pageSize"))
    {
        pageSize = int.Parse(args["pageSize"]);
    }

    var collectionId = collectionName.ToHash();
    var storedFields = new HashSet<string> { "language", "wikibase_item", "title", "text", "url" };
    var indexedFields = new HashSet<string> { "title", "text" };

    // An explicit "take" of zero means "no limit".
    if (take == 0)
    {
        take = int.MaxValue;
    }

    var model = new BagOfCharsModel();
    var documents = WikipediaHelper.ReadWP(sourceFile, skip, take, storedFields, indexedFields);

    using (var sessionFactory = new SessionFactory(dataDirectory, logger))
    {
        var debugger = new IndexDebugger(logger, sampleSize);

        using (var writeSession = new WriteSession(new DocumentWriter(collectionId, sessionFactory)))
        {
            foreach (var page in documents.Batch(pageSize))
            {
                // A fresh index stream and index session are created for every page.
                using (var indexStream = new WritableIndexStream(collectionId, sessionFactory, logger: logger))
                {
                    using (var indexSession = new IndexSession<string>(model, model))
                    {
                        foreach (var document in page)
                        {
                            writeSession.Put(document);

                            foreach (var field in document.IndexableFields)
                            {
                                indexSession.Put(document.Id, field.KeyId, (string)field.Value);
                            }

                            debugger.Step(indexSession);
                        }

                        // Persist what was indexed for this page before the session is disposed.
                        indexStream.Write(indexSession.GetInMemoryIndex());
                    }
                }
            }
        }
    }
}
/// <summary>
/// Reads documents from a Wikipedia dump file, stores them in a collection and runs the
/// tokenizer of a <c>BagOfCharsModel</c> over every indexable field, stepping a
/// <c>BatchDebugger</c> once per token and logging the debugger's step count and time
/// when all pages have been processed.
/// </summary>
/// <param name="args">
/// Required keys: "dataDirectory", "fileName", "collection".
/// Optional keys: "skip" (default 0), "take" (default <see cref="int.MaxValue"/>; a value
/// of 0 is also treated as <see cref="int.MaxValue"/>), "sampleSize" (default 1000) and
/// "pageSize" (default 100000).
/// </param>
/// <param name="logger">Handed to the debugger and the session factory; also receives the summary message.</param>
public void Run(IDictionary<string, string> args, ILogger logger)
{
    var dataDirectory = args["dataDirectory"];
    var sourceFile = args["fileName"];
    var collectionName = args["collection"];

    var skip = 0;
    if (args.ContainsKey("skip"))
    {
        skip = int.Parse(args["skip"]);
    }

    var take = int.MaxValue;
    if (args.ContainsKey("take"))
    {
        take = int.Parse(args["take"]);
    }

    var sampleSize = 1000;
    if (args.ContainsKey("sampleSize"))
    {
        sampleSize = int.Parse(args["sampleSize"]);
    }

    var pageSize = 100000;
    if (args.ContainsKey("pageSize"))
    {
        pageSize = int.Parse(args["pageSize"]);
    }

    var collectionId = collectionName.ToHash();
    var storedFields = new HashSet<string> { "language", "wikibase_item", "title", "text" };
    var indexedFields = new HashSet<string> { "language", "title", "text" };

    // An explicit "take" of zero means "no limit".
    if (take == 0)
    {
        take = int.MaxValue;
    }

    var model = new BagOfCharsModel();
    var documents = WikipediaHelper.ReadWP(sourceFile, skip, take, storedFields, indexedFields);
    var debugger = new BatchDebugger(logger, sampleSize);

    using (var sessionFactory = new SessionFactory(dataDirectory, logger))
    {
        using (var writeSession = new WriteSession(new DocumentWriter(collectionId, sessionFactory)))
        {
            foreach (var page in documents.Batch(pageSize))
            {
                // The index session is created and disposed per page but nothing is put into it;
                // only the tokenizer is exercised here.
                using (var indexSession = new IndexSession<string>(model, model))
                {
                    foreach (var document in page)
                    {
                        writeSession.Put(document);

                        foreach (var field in document.IndexableFields)
                        {
                            var text = (string)field.Value;

                            // Enumerate every token so the debugger is stepped once per token.
                            foreach (var unusedToken in model.Tokenize(text))
                            {
                                debugger.Step();
                            }
                        }
                    }
                }
            }

            logger.LogInformation($"tokenized {debugger.StepCount} in {debugger.Time}.");
        }
    }
}
/// <summary>
/// Truncates a collection and then stores in it the documents read from a Wikipedia dump
/// file, without indexing any field.
/// </summary>
/// <param name="args">
/// Required keys: "dataDirectory", "fileName", "collection".
/// Optional keys: "skip" (default 0), "take" (default <see cref="int.MaxValue"/>; a value
/// of 0 is also treated as <see cref="int.MaxValue"/>) and "sampleSize" (default 1000).
/// </param>
/// <param name="logger">Handed to the session factory and the batch debugger.</param>
public void Run(IDictionary<string, string> args, ILogger logger)
{
    var dataDirectory = args["dataDirectory"];
    var sourceFile = args["fileName"];
    var collectionName = args["collection"];

    var skip = 0;
    if (args.ContainsKey("skip"))
    {
        skip = int.Parse(args["skip"]);
    }

    var take = int.MaxValue;
    if (args.ContainsKey("take"))
    {
        take = int.Parse(args["take"]);
    }

    var sampleSize = 1000;
    if (args.ContainsKey("sampleSize"))
    {
        sampleSize = int.Parse(args["sampleSize"]);
    }

    var collectionId = collectionName.ToHash();
    var storedFields = new HashSet<string> { "language", "wikibase_item", "title", "text", "url" };

    // Deliberately empty: this command only stores documents.
    var indexedFields = new HashSet<string>();

    // An explicit "take" of zero means "no limit".
    if (take == 0)
    {
        take = int.MaxValue;
    }

    var documents = WikipediaHelper.ReadWP(sourceFile, skip, take, storedFields, indexedFields);

    using (var sessionFactory = new SessionFactory(dataDirectory, logger))
    {
        // Start from an empty collection.
        sessionFactory.Truncate(collectionId);

        var debugger = new BatchDebugger(logger, sampleSize);

        using (var writeSession = new WriteSession(new DocumentWriter(collectionId, sessionFactory)))
        {
            foreach (var document in documents)
            {
                writeSession.Put(document);
                debugger.Step();
            }
        }
    }
}
/// <summary>
/// Reads documents from a Wikipedia dump file, then stores and indexes them page by page.
/// Within a page the documents are processed in report-sized batches, and debug
/// information obtained from the index debugger is logged after each batch.
/// </summary>
/// <param name="args">
/// Required keys: "fileName", "directory", "collection", "skip", "take", "pageSize".
/// Optional key: "reportSize" (default 1000).
/// </param>
/// <param name="logger">Handed to the session factory; also receives the debug information.</param>
public void Run(IDictionary<string, string> args, ILogger logger)
{
    var fileName = args["fileName"];

    // Read but not used below; kept so that a missing "directory" argument still fails
    // exactly as it did before.
    var dir = args["directory"];

    var collection = args["collection"];
    var skip = int.Parse(args["skip"]);
    var take = int.Parse(args["take"]);
    var pageSize = int.Parse(args["pageSize"]);
    var reportSize = args.ContainsKey("reportSize") ? int.Parse(args["reportSize"]) : 1000;
    var collectionId = collection.ToHash();
    var fieldsToStore = new HashSet<string> { "language", "url", "title", "description" };
    var fieldsToIndex = new HashSet<string> { "language", "url", "title", "description" };
    var debugger = new IndexDebugger();

    // Lazily projects each source record into the dictionary shape that gets stored.
    var payload = WikipediaHelper.ReadWP(fileName, skip, take)
        .Select(x => new Dictionary<string, object>
        {
            { "language", x["language"].ToString() },
            { "url", string.Format("www.wikipedia.org/search-redirect.php?family=wikipedia&language={0}&search={1}", x["language"], x["title"]) },
            { "title", x["title"] },
            { "description", x["text"] }
        });

    using (var sessionFactory = new SessionFactory(new KeyValueConfiguration("sir.ini"), logger))
    {
        foreach (var page in payload.Batch(pageSize))
        {
            // One write session and one index session per page.
            using (var writeSession = sessionFactory.CreateWriteSession(collectionId))
            using (var indexSession = sessionFactory.CreateIndexSession(collectionId, new TextModel()))
            {
                foreach (var batch in page.Batch(reportSize))
                {
                    // Iterate the current report batch. The previous code iterated the whole
                    // page here, so the page was re-enumerated once per report batch and
                    // the batch itself was never consumed.
                    foreach (var document in batch)
                    {
                        var documentId = writeSession.Put(document, fieldsToStore);

                        foreach (var kv in document)
                        {
                            // Only index configured fields that actually carry a value.
                            if (fieldsToIndex.Contains(kv.Key) && kv.Value != null)
                            {
                                var keyId = writeSession.EnsureKeyExists(kv.Key);

                                indexSession.Put(documentId, keyId, kv.Value.ToString());
                            }
                        }
                    }

                    var debugInfo = debugger.GetDebugInfo(indexSession);

                    if (debugInfo != null)
                    {
                        logger.LogInformation(debugInfo);
                    }
                }
            }
        }
    }
}