Esempio n. 1
0
        /// <summary>
        /// Reads labelled images with <c>MnistReader</c>, searches the given collection once per
        /// image and compares the "label" field of the first returned document with the image's
        /// own label, logging the running error rate after every image.
        /// </summary>
        /// <param name="args">
        /// Settings read by key: "dataDirectory", "imageFileName", "labelFileName" and "collection".
        /// </param>
        /// <param name="logger">Receives the per-image statistics, mismatch details and the final timing.</param>
        public void Run(IDictionary<string, string> args, ILogger logger)
        {
            var time = Stopwatch.StartNew();
            var indexDirectory = args["dataDirectory"];
            var testImages = new MnistReader(args["imageFileName"], args["labelFileName"]).Read();
            var collectionName = args["collection"];
            var count = 0;
            var errors = 0;
            var classifier = new LinearClassifierImageModel();

            using (var factory = new SessionFactory(directory: indexDirectory, logger: logger))
            using (var searcher = factory.CreateSearchSession(classifier))
            {
                var parser = new QueryParser<IImage>(factory, classifier, logger);

                foreach (var image in testImages)
                {
                    // One search per image, passing 0 and 1 as the paging arguments.
                    var hit = searcher.Search(
                        parser.Parse(collectionName, image, field: "image", select: "label", and: true, or: false),
                        0,
                        1);

                    count++;

                    if (hit.Total != 0)
                    {
                        var documentLabel = (string)hit.Documents.First().Get("label").Value;

                        // A label that differs from the image's own label counts as an error.
                        if (!documentLabel.Equals(image.Label))
                        {
                            errors++;

                            logger.LogDebug($"error. label: {image.Label} document label: {documentLabel}\n{((MnistImage)image).Print()}\n{((MnistImage)image).Print()}");
                        }
                    }
                    else
                    {
                        // No hit at all is counted as an error too.
                        errors++;
                    }

                    logger.LogInformation($"errors: {errors}. total tests {count}. error rate: {(float)errors / count * 100}%");
                }
            }

            logger.LogInformation($"tested {count} mnist images in {time.Elapsed}");
        }
Esempio n. 2
0
        /// <summary>
        /// Runs the configured query and saves the matching documents into the target collection,
        /// optionally truncating that collection first and making sure every index field name
        /// has a key in it. Any exception is logged and not rethrown.
        /// </summary>
        public override void Execute()
        {
            try
            {
                var query = _queryParser.Parse(
                    collections: Collections,
                    q: Q,
                    fields: Fields,
                    select: _select,
                    and: And,
                    or: Or);

                var targetCollectionId = _target.ToHash();
                IEnumerable<Document> documents;

                using (var readSession = _sessionFactory.CreateSearchSession(_model))
                {
                    var hits = readSession.Search(query, _skip, _take).Documents;

                    // Copy the hits while the session is still open. The result is only known
                    // here as an IEnumerable, so it may be lazily evaluated; enumerating it
                    // after the session is disposed (or after the truncate below) could fail
                    // or observe already-removed data. A null result is passed on unchanged.
                    documents = hits == null ? null : new List<Document>(hits);
                }

                if (_truncate)
                {
                    _sessionFactory.Truncate(targetCollectionId);
                }

                // The writer is opened only to register the index field keys; the documents
                // themselves are written by SaveAs below.
                using (var documentWriter = new DocumentWriter(targetCollectionId, _sessionFactory))
                {
                    foreach (var field in _indexFieldNames)
                    {
                        documentWriter.EnsureKeyExists(field);
                    }
                }

                _sessionFactory.SaveAs(
                    targetCollectionId,
                    documents,
                    _model);
            }
            catch (Exception ex)
            {
                // Failures are reported through the logger; the exception is not propagated.
                _logger.LogError($"error processing {this} {ex}");
            }
        }
Esempio n. 3
0
        /// <summary>
        /// Parses the HTTP request into a query and executes it in a search session created
        /// for <paramref name="model"/>.
        /// </summary>
        /// <param name="request">
        /// Incoming request. The optional query-string values "take" (default 100) and
        /// "skip" (default 0) are passed on to the search.
        /// </param>
        /// <param name="model">Model handed to the session factory when creating the search session.</param>
        /// <returns>
        /// The search result, or a result with zero counts and no documents when the request
        /// parser returns no query.
        /// </returns>
        /// <remarks>
        /// "take" and "skip" are converted with <c>int.Parse</c>; a malformed or out-of-range
        /// value throws and the exception propagates to the caller.
        /// </remarks>
        public async Task<SearchResult> Read(HttpRequest request, IModel<string> model)
        {
            var take = 100;
            var skip = 0;

            // TryGetValue does the presence check and the read in one lookup.
            if (request.Query.TryGetValue("take", out var takeValue))
            {
                take = int.Parse(takeValue);
            }

            if (request.Query.TryGetValue("skip", out var skipValue))
            {
                skip = int.Parse(skipValue);
            }

            var query = await _httpQueryParser.ParseRequest(request);

            if (query == null)
            {
                return new SearchResult(null, 0, 0, new Document[0]);
            }

#if DEBUG
            // Debug builds only: serialize the parsed query and write it to the debug log.
            var debug = new Dictionary<string, object>();

            _httpQueryParser.ParseQuery(query, debug);

            var queryLog = JsonConvert.SerializeObject(debug);

            _logger.LogDebug($"incoming query: {queryLog}");
#endif

            using (var readSession = _sessionFactory.CreateSearchSession(model))
            {
                return readSession.Search(query, skip, take);
            }
        }
Esempio n. 4
0
        /// <summary>
        /// Runs the configured query, derives a WET file name from the "filename" field of the
        /// result, and - unless the "cc_wet" collection already returns a hit for that file
        /// name - downloads the file (when it is not already on disk), reads its documents and
        /// writes those whose "url" matches one of the original results to the "cc_wet"
        /// collection. Progress is reported through <c>Status["download"]</c> and
        /// <c>Status["index"]</c>.
        /// </summary>
        private void DownloadAndIndexWetFile()
        {
            var writePayload = new List <Document>();

            var originalQuery = _queryParser.Parse(
                Collections,
                Q,
                Fields,
                select: new string[] { "url", "title", "filename" },
                and: And,
                or: Or);

            using (var readSession = _sessionFactory.CreateSearchSession(_model))
            {
                // Index the original hits by their "url" value so WET documents can be matched
                // against them below.
                // NOTE(review): ToDictionary throws on duplicate keys - confirm urls are unique
                // within a result page.
                var originalResult = readSession.Search(originalQuery, _skip, _take)
                                     .Documents
                                     .ToDictionary(x => (string)x.Get("url").Value);

                var          wetFileIds      = new SortedList <string, object>();
                // NOTE(review): wetResult is declared outside the loop below, so when wetQuery is
                // null it keeps the value assigned in a previous iteration.
                SearchResult wetResult       = null;
                var          wetCollectionId = "cc_wet".ToHash();

                foreach (var doc in originalResult.Values)
                {
                    // Derive the WET file name from the document's "filename" value.
                    var fileName  = (string)doc.Get("filename").Value;
                    var wetFileId = fileName.Replace("/warc", "/wet").Replace(".gz", ".wet.gz");

                    wetFileIds.TryAdd(wetFileId, null);

                    // NOTE(review): this unconditional break means only the first document
                    // contributes a WET file id - confirm this limit is intentional.
                    break;
                }

                foreach (var fileName in wetFileIds.Keys)
                {
                    // Look the file name up in the "cc_wet" collection.
                    var wetQuery = _queryParser.Parse(
                        collections: new string[] { "cc_wet" },
                        q: fileName,
                        fields: new string[] { "filename" },
                        select: new string[] { "filename" },
                        and: true,
                        or: false);

                    if (wetQuery != null)
                    {
                        wetResult = readSession.Search(wetQuery, 0, 1);
                    }

                    // Only proceed when the lookup produced no hit.
                    if (wetResult == null || wetResult.Total == 0)
                    {
                        // Final location of the file and a temporary download location that
                        // includes this instance's Id.
                        var localFileName = Path.Combine(_sessionFactory.Directory, "wet", fileName);
                        var tmpFileName   = Path.Combine(_sessionFactory.Directory, "tmp", Id, fileName);

                        if (!File.Exists(localFileName))
                        {
                            if (!Directory.Exists(Path.GetDirectoryName(tmpFileName)))
                            {
                                Directory.CreateDirectory(Path.GetDirectoryName(tmpFileName));
                            }

                            var          remoteFileName = $"https://commoncrawl.s3.amazonaws.com/{fileName}";
                            // Denominator for the progress percentage below.
                            // NOTE(review): presumably an estimated file size in bytes - confirm.
                            const double payloadSize    = 150000000;

                            using (var client = new WebClient())
                            {
                                var state = new State {
                                    Completed = false
                                };
                                client.DownloadFileCompleted += Client_DownloadFileCompleted;
                                client.DownloadFileAsync(new Uri(remoteFileName), tmpFileName, state);

                                // Block until state.Completed becomes true, polling once per second.
                                // NOTE(review): presumably Client_DownloadFileCompleted (not shown
                                // here) sets the flag - confirm it also does so on failure.
                                while (!state.Completed)
                                {
                                    try
                                    {
                                        if (File.Exists(tmpFileName))
                                        {
                                            var fi = new FileInfo(tmpFileName);

                                            if (fi.Length > 0)
                                            {
                                                // Bytes on disk relative to payloadSize times the
                                                // number of WET files, as a percentage.
                                                var status = (fi.Length / (payloadSize * wetFileIds.Count)) * 100;

                                                Status["download"] = status;
                                            }
                                        }
                                    }
                                    // Any error while reading the progress is ignored; the loop
                                    // just tries again after the sleep.
                                    catch { }
                                    finally
                                    {
                                        Thread.Sleep(1000);
                                    }
                                }
                            }
                        }

                        if (!File.Exists(localFileName))
                        {
                            try
                            {
                                var localDir = Path.GetDirectoryName(localFileName);

                                if (!Directory.Exists(localDir))
                                {
                                    Directory.CreateDirectory(localDir);
                                }

                                // Move the download into place (overwrite allowed), then remove
                                // the temporary directory. Directory.Delete(string) only removes
                                // an empty directory; a failure ends up in the catch below.
                                File.Move(tmpFileName, localFileName, true);
                                Thread.Sleep(100);
                                Directory.Delete(Path.GetDirectoryName(tmpFileName));
                            }
                            catch (Exception ex)
                            {
                                // Logged only; execution continues with ReadWetFile below.
                                _logger.LogError(ex, ex.Message);
                            }
                        }

                        foreach (var document in ReadWetFile(localFileName, fileName))
                        {
                            Document originalDoc;
                            var      key = (string)document.Get("url").Value;

                            // Keep only WET documents whose url was among the original hits, and
                            // copy the original "title" and "filename" values onto them.
                            if (originalResult.TryGetValue(key, out originalDoc))
                            {
                                document.Get("title").Value    = originalDoc.Get("title").Value;
                                document.Get("filename").Value = originalDoc.Get("filename").Value;

                                writePayload.Add(document);
                            }
                        }
                    }
                }

                Status["download"] = 100;

                if (writePayload.Count > 0)
                {
                    var time = Stopwatch.StartNew();

                    // Write all collected documents to the "cc_wet" collection in one call.
                    _sessionFactory.Write(wetCollectionId, writePayload, _model, reportSize: 1000);

                    Status["index"] = 100;

                    _logger.LogInformation($"wet file write job took {time.Elapsed}");
                }
            }
        }