/// <summary>
/// Validates a trained MNIST image index: queries every image in the test set
/// against the collection and compares the top hit's "label" field with the
/// image's known label, logging a running error rate and a final summary.
/// </summary>
/// <param name="args">Required keys: "dataDirectory", "imageFileName", "labelFileName", "collection".</param>
/// <param name="logger">Receives per-image progress and the final timing summary.</param>
public void Run(IDictionary<string, string> args, ILogger logger)
{
    var time = Stopwatch.StartNew();
    var dataDirectory = args["dataDirectory"];
    var images = new MnistReader(args["imageFileName"], args["labelFileName"]).Read();
    var collection = args["collection"];
    var count = 0;
    var errors = 0;
    var model = new LinearClassifierImageModel();

    using (var sessionFactory = new SessionFactory(directory: dataDirectory, logger: logger))
    using (var querySession = sessionFactory.CreateSearchSession(model))
    {
        var queryParser = new QueryParser<IImage>(sessionFactory, model, logger);

        foreach (var image in images)
        {
            var query = queryParser.Parse(collection, image, field: "image", select: "label", and: true, or: false);
            var result = querySession.Search(query, 0, 1);

            count++;

            if (result.Total == 0)
            {
                // No document matched at all.
                errors++;
            }
            else
            {
                var documentLabel = (string)result.Documents.First().Get("label").Value;

                if (!documentLabel.Equals(image.Label))
                {
                    errors++;

                    // FIX: the image was previously rendered twice in this log entry
                    // (duplicated Print() call); render it once.
                    logger.LogDebug($"error. label: {image.Label} document label: {documentLabel}\n{((MnistImage)image).Print()}");
                }
            }

            // Running error rate after every test image.
            logger.LogInformation($"errors: {errors}. total tests {count}. error rate: {(float)errors / count * 100}%");
        }
    }

    logger.LogInformation($"tested {count} mnist images in {time.Elapsed}");
}
/// <summary>
/// Runs the job: executes the configured query against the source collections,
/// optionally truncates the target collection, ensures the index keys exist on
/// the target, then saves the fetched documents into the target collection.
/// Any failure is logged; the job never throws.
/// </summary>
public override void Execute()
{
    try
    {
        var query = _queryParser.Parse(
            collections: Collections,
            q: Q,
            fields: Fields,
            select: _select,
            and: And,
            or: Or);

        // FIX: Parse may yield null for an empty/unresolvable query (sibling code
        // paths treat null as an expected outcome); bail out instead of letting
        // Search fault and be swallowed by the catch below.
        if (query == null)
        {
            _logger.LogWarning($"query could not be parsed, nothing to process in {this}");
            return;
        }

        var targetCollectionId = _target.ToHash();
        IEnumerable<Document> documents;

        using (var readSession = _sessionFactory.CreateSearchSession(_model))
        {
            documents = readSession.Search(query, _skip, _take).Documents;
        }

        if (_truncate)
        {
            _sessionFactory.Truncate(targetCollectionId);
        }

        // Make sure every index field has a key in the target collection before writing.
        using (var documentWriter = new DocumentWriter(targetCollectionId, _sessionFactory))
        {
            foreach (var field in _indexFieldNames)
            {
                documentWriter.EnsureKeyExists(field);
            }
        }

        _sessionFactory.SaveAs(targetCollectionId, documents, _model);
    }
    catch (Exception ex)
    {
        // FIX: pass the exception to the logger overload so the provider records
        // the full exception and stack trace instead of a flattened string.
        _logger.LogError(ex, $"error processing {this}");
    }
}
/// <summary>
/// Parses an incoming HTTP search request and executes it, returning at most
/// <c>take</c> documents starting at offset <c>skip</c> (defaults 100 and 0).
/// Returns an empty result when the request does not parse into a query.
/// </summary>
/// <param name="request">Incoming HTTP request; "take" and "skip" are read from the query string.</param>
/// <param name="model">The model used to open the search session.</param>
public async Task<SearchResult> Read(HttpRequest request, IModel<string> model)
{
    var take = 100;
    var skip = 0;

    // FIX: untrusted query-string values were fed to int.Parse, which throws
    // FormatException on malformed input; TryParse keeps the defaults instead.
    // TryGetValue also avoids the ContainsKey + indexer double lookup.
    if (request.Query.TryGetValue("take", out var takeValue) && int.TryParse(takeValue, out var parsedTake))
    {
        take = parsedTake;
    }

    if (request.Query.TryGetValue("skip", out var skipValue) && int.TryParse(skipValue, out var parsedSkip))
    {
        skip = parsedSkip;
    }

    var query = await _httpQueryParser.ParseRequest(request);

    if (query == null)
    {
        // Nothing to search for; empty result instead of null.
        return new SearchResult(null, 0, 0, Array.Empty<Document>());
    }

#if DEBUG
    // Serialize the parsed query tree for debug logging only.
    var debug = new Dictionary<string, object>();

    _httpQueryParser.ParseQuery(query, debug);

    var queryLog = JsonConvert.SerializeObject(debug);

    _logger.LogDebug($"incoming query: {queryLog}");
#endif

    using (var readSession = _sessionFactory.CreateSearchSession(model))
    {
        return readSession.Search(query, skip, take);
    }
}
/// <summary>
/// Resolves the WET (extracted-text) counterparts of previously indexed WARC
/// search results, downloads any WET files not already indexed or cached
/// locally, merges their documents with title/filename fields from the
/// original results, and writes the merged payload into the "cc_wet"
/// collection. Progress is reported through the Status dictionary
/// ("download" and "index" percentages).
/// </summary>
private void DownloadAndIndexWetFile()
{
    var writePayload = new List<Document>();

    // Re-run the original query to recover url/title/filename for each hit.
    var originalQuery = _queryParser.Parse(
        Collections,
        Q,
        Fields,
        select: new string[] { "url", "title", "filename" },
        and: And,
        or: Or);

    using (var readSession = _sessionFactory.CreateSearchSession(_model))
    {
        // Keyed by url so WET documents can be joined back to their originals below.
        var originalResult = readSession.Search(originalQuery, _skip, _take)
            .Documents
            .ToDictionary(x => (string)x.Get("url").Value);

        var wetFileIds = new SortedList<string, object>();
        SearchResult wetResult = null;
        var wetCollectionId = "cc_wet".ToHash();

        // Derive WET file names from the WARC file names
        // (".../warc/x.gz" -> ".../wet/x.wet.gz").
        foreach (var doc in originalResult.Values)
        {
            var fileName = (string)doc.Get("filename").Value;
            var wetFileId = fileName.Replace("/warc", "/wet").Replace(".gz", ".wet.gz");

            wetFileIds.TryAdd(wetFileId, null);

            // NOTE(review): this break means only the FIRST result's WET file is
            // ever collected — presumably deliberate throttling to one file per
            // job run, but confirm; TryAdd suggests multiple ids were expected.
            break;
        }

        foreach (var fileName in wetFileIds.Keys)
        {
            // Check whether this WET file has already been indexed.
            var wetQuery = _queryParser.Parse(
                collections: new string[] { "cc_wet" },
                q: fileName,
                fields: new string[] { "filename" },
                select: new string[] { "filename" },
                and: true,
                or: false);

            if (wetQuery != null)
            {
                // NOTE(review): when wetQuery is null, wetResult keeps its value
                // from the previous iteration — TODO confirm that is intended.
                wetResult = readSession.Search(wetQuery, 0, 1);
            }

            if (wetResult == null || wetResult.Total == 0)
            {
                // Not indexed yet: ensure we have the file locally, downloading
                // into a tmp path first and moving it into place when complete.
                var localFileName = Path.Combine(_sessionFactory.Directory, "wet", fileName);
                var tmpFileName = Path.Combine(_sessionFactory.Directory, "tmp", Id, fileName);

                if (!File.Exists(localFileName))
                {
                    if (!Directory.Exists(Path.GetDirectoryName(tmpFileName)))
                    {
                        Directory.CreateDirectory(Path.GetDirectoryName(tmpFileName));
                    }

                    var remoteFileName = $"https://commoncrawl.s3.amazonaws.com/{fileName}";

                    // Assumed size of one WET payload, used only to estimate
                    // download progress — TODO confirm 150 MB is still typical.
                    const double payloadSize = 150000000;

                    using (var client = new WebClient())
                    {
                        // State.Completed is presumably flipped by
                        // Client_DownloadFileCompleted (defined elsewhere) — the
                        // polling loop below waits on it. Confirm the handler
                        // receives this state instance via AsyncCompletedEventArgs.
                        var state = new State { Completed = false };

                        client.DownloadFileCompleted += Client_DownloadFileCompleted;
                        client.DownloadFileAsync(new Uri(remoteFileName), tmpFileName, state);

                        // Poll once per second, publishing percent-done based on
                        // the partial file's size on disk.
                        while (!state.Completed)
                        {
                            try
                            {
                                if (File.Exists(tmpFileName))
                                {
                                    var fi = new FileInfo(tmpFileName);

                                    if (fi.Length > 0)
                                    {
                                        var status = (fi.Length / (payloadSize * wetFileIds.Count)) * 100;

                                        Status["download"] = status;
                                    }
                                }
                            }
                            catch { } // best-effort progress only; ignore transient file-system races
                            finally
                            {
                                Thread.Sleep(1000);
                            }
                        }
                    }
                }

                if (!File.Exists(localFileName))
                {
                    // Move the finished download from tmp into the wet cache and
                    // clean up the tmp directory; failure here is logged, not fatal.
                    try
                    {
                        var localDir = Path.GetDirectoryName(localFileName);

                        if (!Directory.Exists(localDir))
                        {
                            Directory.CreateDirectory(localDir);
                        }

                        File.Move(tmpFileName, localFileName, true);
                        Thread.Sleep(100);
                        Directory.Delete(Path.GetDirectoryName(tmpFileName));
                    }
                    catch (Exception ex)
                    {
                        _logger.LogError(ex, ex.Message);
                    }
                }

                // Join each WET document back to its original by url, copying
                // title/filename over; documents with no original are skipped.
                foreach (var document in ReadWetFile(localFileName, fileName))
                {
                    Document originalDoc;
                    var key = (string)document.Get("url").Value;

                    if (originalResult.TryGetValue(key, out originalDoc))
                    {
                        document.Get("title").Value = originalDoc.Get("title").Value;
                        document.Get("filename").Value = originalDoc.Get("filename").Value;

                        writePayload.Add(document);
                    }
                }
            }
        }

        Status["download"] = 100;

        if (writePayload.Count > 0)
        {
            var time = Stopwatch.StartNew();

            _sessionFactory.Write(wetCollectionId, writePayload, _model, reportSize: 1000);

            Status["index"] = 100;

            _logger.LogInformation($"wet file write job took {time.Elapsed}");
        }
    }
}