Exemple #1
0
        /// <summary>
        /// Runs the base evaluation, then queries the "www" collection (with Take set to 10)
        /// and appends every returned document to the list stored under its "___score" value.
        /// </summary>
        /// <param name="formattedQuery">Formatted query string passed to the base evaluation and to the HTTP query parser.</param>
        /// <returns>The sorted list produced by the base evaluation, extended with the "www" documents.</returns>
        public override async Task<SortedList<float, IList<IDictionary>>> Evaluate(string formattedQuery)
        {
            var scored = await base.Evaluate(formattedQuery);

            const string modelName = "www";
            var collectionId = modelName.ToHash();

            var parser = new HttpQueryParser(new TermQueryParser(), new UnicodeTokenizer());
            var query = parser.FromFormattedString(collectionId, formattedQuery);
            query.Take = 10;

            using (var session = SessionFactory.CreateReadSession(modelName, collectionId))
            {
                var hits = (await session.Read(query)).Docs;

                foreach (var hit in hits)
                {
                    // Read the score once; it is both the lookup key and the insertion key.
                    var score = (float)hit["___score"];

                    if (!scored.TryGetValue(score, out IList<IDictionary> bucket))
                    {
                        bucket = new List<IDictionary>();
                        scored.Add(score, bucket);
                    }

                    bucket.Add(hit);
                }
            }

            return scored;
        }
Exemple #2
0
        /// <summary>
        /// Reads the documents matching this job's query, stamps each one with the "cc_wat"
        /// collection id when it has none, optionally truncates the target collection, makes sure
        /// every indexed field has a key, and saves the documents into the target collection.
        /// Any exception is logged and not rethrown.
        /// </summary>
        public override void Execute()
        {
            try
            {
                var parsedQuery = _queryParser.Parse(
                    collections: Collections,
                    q: Q,
                    fields: Fields,
                    select: _select,
                    and: And,
                    or: Or);

                var destinationId = _target.ToHash();
                IEnumerable<IDictionary<string, object>> sourceDocs;

                using (var reader = _sessionFactory.CreateReadSession())
                {
                    sourceDocs = reader.Read(parsedQuery, _skip, _take).Docs;
                }

                //TODO: Remove this when cc_wat is rebuilt.
                var watCollectionId = "cc_wat".ToHash();

                foreach (var sourceDoc in sourceDocs)
                {
                    // TryAdd leaves an already-present collection id untouched.
                    sourceDoc.TryAdd(SystemFields.CollectionId, watCollectionId);
                }

                if (_truncate)
                {
                    _sessionFactory.Truncate(destinationId);
                }

                // The writer is opened only to register the indexed field keys.
                using (var keyWriter = new DocumentWriter(destinationId, _sessionFactory))
                {
                    foreach (var fieldName in _indexFieldNames)
                    {
                        keyWriter.EnsureKeyExists(fieldName);
                    }
                }

                _sessionFactory.SaveAs(
                    destinationId,
                    sourceDocs,
                    _indexFieldNames,
                    new HashSet<string>(),
                    _model);
            }
            catch (Exception ex)
            {
                _logger.LogError($"error processing {this} {ex}");
            }
        }
Exemple #3
0
        /// <summary>
        /// Interactive console loop: prompts with "query>", parses each line as a term query
        /// against <paramref name="collectionName"/>, and prints index, score and title of the
        /// first ten hits. A blank line, "q" or "quit" ends the loop.
        /// </summary>
        /// <param name="dir">Directory handed to the <c>SessionFactory</c>.</param>
        /// <param name="collectionName">Name of the collection to query.</param>
        private static async Task Query(string dir, string collectionName)
        {
            var unicodeTokenizer = new UnicodeTokenizer();
            var parser = new TermQueryParser();
            var factory = new SessionFactory(
                dir,
                unicodeTokenizer,
                new IniConfiguration(Path.Combine(Directory.GetCurrentDirectory(), "sir.ini")));

            for (;;)
            {
                Console.Write("query>");

                var line = Console.ReadLine();
                var wantsExit = string.IsNullOrWhiteSpace(line) || line == "q" || line == "quit";

                if (wantsExit)
                {
                    break;
                }

                var collectionId = collectionName.ToHash();

                var query = parser.Parse(collectionId, line, unicodeTokenizer);
                query.Skip = 0;
                query.Take = 100;

                using (var session = factory.CreateReadSession(collectionName, collectionId))
                {
                    var hits = (await session.Read(query)).Docs;

                    if (hits.Count <= 0)
                    {
                        continue;
                    }

                    var rank = 0;

                    // Up to 100 hits are requested, but only the first ten are printed.
                    foreach (var hit in hits.Take(10))
                    {
                        Console.WriteLine("{0} {1} {2}", rank, hit["___score"], hit["title"]);
                        rank++;
                    }
                }
            }
        }
Exemple #4
0
        /// <summary>
        /// Reads the documents matching this job's query, derives a WET file name from the first
        /// document whose "filename" is not an array, and, unless that file is already found in
        /// the "cc_wet" collection, downloads it (when missing locally) and writes the WET
        /// documents whose "url" matches one of the original documents.
        /// Progress is reported through <c>Status["download"]</c> and <c>Status["index"]</c>.
        /// </summary>
        private void DownloadAndIndexWetFile()
        {
            var writePayload = new List<IDictionary<string, object>>();

            var originalQuery = _queryParser.Parse(
                Collections,
                Q,
                Fields,
                select: new string[] { "url", "title", "filename" },
                and: And,
                or: Or);

            using (var readSession = _sessionFactory.CreateReadSession())
            {
                // Keyed by URL so WET documents can be matched back to their originals below.
                var originalResult = readSession.Read(originalQuery, _skip, _take)
                                     .Docs
                                     .ToDictionary(x => (string)x["url"]);

                var wetFileIds      = new SortedList<string, object>();
                var wetCollectionId = "cc_wet".ToHash();

                foreach (var doc in originalResult.Values)
                {
                    // Documents carrying several file names are skipped.
                    if (doc["filename"] is object[])
                    {
                        continue;
                    }

                    var wetFileId = ((string)doc["filename"]).Replace("/warc", "/wet").Replace(".gz", ".wet.gz");

                    wetFileIds.TryAdd(wetFileId, null);

                    // Only the first usable file name is processed.
                    break;
                }

                foreach (var fileName in wetFileIds.Keys)
                {
                    // Scoped to the iteration so that a null query for this file can never
                    // fall back on the result that was read for a previous file.
                    ReadResult wetResult = null;

                    var wetQuery = _queryParser.Parse(
                        collections: new string[] { "cc_wet" },
                        q: fileName,
                        fields: new string[] { "filename" },
                        select: new string[] { "filename" },
                        and: true,
                        or: false);

                    if (wetQuery != null)
                    {
                        wetResult = readSession.Read(wetQuery, 0, 1);
                    }

                    // Nothing found in "cc_wet" for this file name: fetch and index it.
                    if (wetResult == null || wetResult.Total == 0)
                    {
                        var localFileName = Path.Combine(_sessionFactory.Dir, "wet", fileName);
                        var tmpFileName   = Path.Combine(_sessionFactory.Dir, "tmp", Id, fileName);

                        if (!File.Exists(localFileName))
                        {
                            if (!Directory.Exists(Path.GetDirectoryName(tmpFileName)))
                            {
                                Directory.CreateDirectory(Path.GetDirectoryName(tmpFileName));
                            }

                            var          remoteFileName = $"https://commoncrawl.s3.amazonaws.com/{fileName}";
                            const double payloadSize    = 150000000;

                            using (var client = new WebClient())
                            {
                                var state = new State {
                                    Completed = false
                                };
                                // NOTE(review): Client_DownloadFileCompleted is presumably what sets
                                // state.Completed; it is not visible here, so confirm.
                                client.DownloadFileCompleted += Client_DownloadFileCompleted;
                                client.DownloadFileAsync(new Uri(remoteFileName), tmpFileName, state);

                                // Poll once per second until completion, publishing the size of the
                                // temp file as a percentage of a fixed 150,000,000-byte payload per file.
                                while (!state.Completed)
                                {
                                    try
                                    {
                                        if (File.Exists(tmpFileName))
                                        {
                                            var fi = new FileInfo(tmpFileName);

                                            if (fi.Length > 0)
                                            {
                                                var status = (fi.Length / (payloadSize * wetFileIds.Count)) * 100;

                                                Status["download"] = status;
                                            }
                                        }
                                    }
                                    catch
                                    {
                                        // Best-effort progress reporting: a failure while inspecting the
                                        // temp file must not abort the wait for the download.
                                    }
                                    finally
                                    {
                                        Thread.Sleep(1000);
                                    }
                                }
                            }
                        }

                        if (!File.Exists(localFileName))
                        {
                            try
                            {
                                var localDir = Path.GetDirectoryName(localFileName);

                                if (!Directory.Exists(localDir))
                                {
                                    Directory.CreateDirectory(localDir);
                                }

                                // Move the download into place, then remove its temp directory.
                                File.Move(tmpFileName, localFileName, true);
                                Thread.Sleep(100);
                                Directory.Delete(Path.GetDirectoryName(tmpFileName));
                            }
                            catch (Exception ex)
                            {
                                _logger.LogError(ex, ex.Message);
                            }
                        }

                        foreach (var document in ReadWetFile(localFileName, fileName))
                        {
                            IDictionary<string, object> originalDoc;
                            var key = (string)document["url"];

                            // Keep only WET documents that correspond to an original document,
                            // copying over the original's title and file name.
                            if (originalResult.TryGetValue(key, out originalDoc))
                            {
                                document["title"]    = originalDoc["title"];
                                document["filename"] = originalDoc["filename"];

                                writePayload.Add(document);
                            }
                        }
                    }
                }

                Status["download"] = 100;

                if (writePayload.Count > 0)
                {
                    var time = Stopwatch.StartNew();

                    var writeJob = new WriteJob(
                        wetCollectionId,
                        writePayload,
                        new BocModel(),
                        _wetStoredFieldNames,
                        _wetIndexedFieldNames);

                    _sessionFactory.Write(writeJob, reportSize: 1000);

                    Status["index"] = 100;

                    _logger.LogInformation($"wet file write job took {time.Elapsed}");
                }
            }
        }