Пример #1
0
        /// <summary>
        /// Pushes every document yielded by <c>ReadWatFile</c> through a producer/consumer queue
        /// whose callback hands the document to <c>SessionFactory.Write</c>, then logs the
        /// total elapsed time.
        /// </summary>
        /// <param name="fileName">WAT file passed to <c>ReadWatFile</c>; also echoed in the log line.</param>
        /// <param name="collection">Collection name; its hash is used as the collection id.</param>
        /// <param name="model">Model used when creating the index session.</param>
        /// <param name="logger">Logger given to the session factory and used for the summary line.</param>
        /// <param name="refFileName">Second argument forwarded to <c>ReadWatFile</c>.</param>
        public static void WriteWatSegment(
            string fileName,
            string collection,
            ITextModel model,
            ILogger logger,
            string refFileName)
        {
            var stopwatch = Stopwatch.StartNew();
            var collectionId = collection.ToHash();

            // Fields persisted with the document vs. fields that are also indexed.
            var fieldsToStore = new HashSet<string> { "title", "description", "url", "filename" };
            var fieldsToIndex = new HashSet<string> { "title", "description", "url" };

            using (var sessionFactory = new SessionFactory(new KeyValueConfiguration("sir.ini"), logger))
            {
                using (var writeSession = sessionFactory.CreateWriteSession(collectionId))
                {
                    using (var indexSession = sessionFactory.CreateIndexSession(collectionId, model))
                    {
                        // NOTE(review): the leading 1 is presumably the consumer count - confirm
                        // against ProducerConsumerQueue.
                        using (var workQueue = new ProducerConsumerQueue<IDictionary<string, object>>(
                            1,
                            item => sessionFactory.Write(item, writeSession, indexSession, fieldsToStore, fieldsToIndex)))
                        {
                            foreach (var watDocument in ReadWatFile(fileName, refFileName))
                            {
                                workQueue.Enqueue(watDocument);
                            }
                        }
                    }
                }
            }

            // Logged only after the queue, both sessions and the factory have been disposed.
            logger.LogInformation($"indexed {fileName} in {stopwatch.Elapsed}");
        }
Пример #2
0
        /// <summary>
        /// Locates the "*.docs" file in <paramref name="dir"/> whose first dot-separated name
        /// segment equals the hash of <paramref name="collection"/>, then reads that
        /// collection's documents and indexes them batch by batch.
        /// </summary>
        /// <param name="dir">Directory scanned for "*.docs" files and passed to the session factory.</param>
        /// <param name="collection">Collection name; its hash (as a string) identifies the file.</param>
        /// <param name="skip">Number of documents to skip; ignored unless greater than zero.</param>
        /// <param name="take">Maximum number of documents to index; ignored unless greater than zero.</param>
        /// <param name="batchSize">Number of documents per indexing job.</param>
        private static async Task Index(string dir, string collection, int skip, int take, int batchSize)
        {
            var timer = new Stopwatch();

            timer.Start();

            var files          = Directory.GetFiles(dir, "*.docs");
            // NOTE(review): the factory is never disposed here - confirm whether this
            // SessionFactory version is IDisposable and wrap it in a using if so.
            var sessionFactory = new SessionFactory(
                dir,
                new LatinTokenizer(),
                new IniConfiguration(Path.Combine(Directory.GetCurrentDirectory(), "sir.ini")));
            var batchNo = 0;

            // Loop-invariant: compute the target id once instead of once per file.
            var targetCollectionId = collection.ToHash().ToString();

            foreach (var docFileName in files)
            {
                var name = Path.GetFileNameWithoutExtension(docFileName)
                           .Split(".", StringSplitOptions.RemoveEmptyEntries);

                // A file named just ".docs" yields no segments; it cannot match, so skip it
                // rather than fail on name[0].
                if (name.Length == 0 || name[0] != targetCollectionId)
                {
                    continue;
                }

                using (var readSession = new DocumentStreamSession(collection, sessionFactory))
                {
                    var docs = readSession.ReadDocs();

                    if (skip > 0)
                    {
                        docs = docs.Skip(skip);
                    }

                    if (take > 0)
                    {
                        docs = docs.Take(take);
                    }

                    var writeTimer = new Stopwatch();

                    foreach (var batch in docs.Batch(batchSize))
                    {
                        writeTimer.Restart();

                        var job = new IndexingJob(collection, batch);

                        // A fresh index session is created and disposed for every batch.
                        using (var indexSession = sessionFactory.CreateIndexSession(collection))
                        {
                            await indexSession.Write(job);
                        }

                        _log.Log(string.Format("batch {0} done in {1}", batchNo++, writeTimer.Elapsed));
                    }
                }

                // Only the first matching file is processed.
                break;
            }

            _log.Log(string.Format("indexing took {0}", timer.Elapsed));
        }
Пример #3
0
        /// <summary>
        /// Reads MNIST images, stores each image's label as a document, puts the image into
        /// the index under the "image" key, then prints the resulting in-memory index tree
        /// and logs how many images were processed.
        /// </summary>
        /// <param name="args">
        /// Must contain "collection", "imageFileName" and "labelFileName".
        /// </param>
        /// <param name="logger">Receives per-image debug info (when available) and the summary line.</param>
        public void Run(IDictionary <string, string> args, ILogger logger)
        {
            var stopwatch    = Stopwatch.StartNew();
            var collectionId = args["collection"].ToHash();
            var mnistImages  = new MnistReader(args["imageFileName"], args["labelFileName"]).Read();
            var indexed      = 0;
            VectorNode indexTree;

            using (var sessionFactory = new SessionFactory(new KeyValueConfiguration("sir.ini"), logger))
            {
                using (var writeSession = sessionFactory.CreateWriteSession(collectionId))
                {
                    var indexDebugger = new IndexDebugger();
                    var imageKeyId    = writeSession.EnsureKeyExists("image");

                    using (var indexSession = sessionFactory.CreateIndexSession(collectionId, new ImageModel()))
                    {
                        foreach (var mnistImage in mnistImages)
                        {
                            // Only the label is stored; the image itself goes to the index.
                            var labelDocument = new Dictionary<string, object>();
                            labelDocument.Add("label", mnistImage.Label);

                            var fieldsToStore = new HashSet<string>();
                            fieldsToStore.Add("label");

                            var documentId = writeSession.Put(labelDocument, fieldsToStore);

                            indexSession.Put(documentId, imageKeyId, mnistImage);

                            indexed += 1;

                            var debugInfo = indexDebugger.GetDebugInfo(indexSession);

                            if (debugInfo == null)
                            {
                                continue;
                            }

                            logger.LogInformation(debugInfo);
                        }

                        // Grab the tree before the index session is disposed.
                        indexTree = indexSession.GetInMemoryIndex(imageKeyId);
                    }
                }
            }

            Print(indexTree);

            logger.LogInformation($"indexed {indexed} mnist images in {stopwatch.Elapsed}");
        }
Пример #4
0
        /// <summary>
        /// Locates the "*.docs" file in <paramref name="dir"/> whose first dot-separated name
        /// segment parses to the hash of <paramref name="collectionName"/>, then embeds the
        /// terms of that collection's documents batch by batch.
        /// </summary>
        /// <param name="dir">Directory scanned for "*.docs" files and passed to the session factory.</param>
        /// <param name="collectionName">Collection name; its hash identifies the file to index.</param>
        /// <param name="skip">Forwarded to <c>ReadDocs</c>.</param>
        /// <param name="take">Forwarded to <c>ReadDocs</c>.</param>
        /// <param name="batchSize">Number of documents handled per index session.</param>
        private static void Index(string dir, string collectionName, int skip, int take, int batchSize)
        {
            var files      = Directory.GetFiles(dir, "*.docs");
            var fullTime   = Stopwatch.StartNew();
            var batchCount = 0;

            using (var sessionFactory = new SessionFactory(dir, new LatinTokenizer(), new IniConfiguration("sir.ini")))
            {
                // Loop-invariant: hash the collection name once instead of once per file.
                var targetCollectionId = collectionName.ToHash();

                foreach (var docFileName in files)
                {
                    var name = Path.GetFileNameWithoutExtension(docFileName)
                               .Split(".", StringSplitOptions.RemoveEmptyEntries);

                    // Files whose first segment is missing or not a valid number cannot belong
                    // to the requested collection; skip them instead of letting a parse error
                    // abort the search before the matching file is reached.
                    if (name.Length == 0 || !ulong.TryParse(name[0], out var collectionId))
                    {
                        continue;
                    }

                    if (collectionId != targetCollectionId)
                    {
                        continue;
                    }

                    using (var readSession = sessionFactory.CreateDocumentStreamSession(name[0], collectionId))
                    {
                        var docs = readSession.ReadDocs(skip, take);

                        foreach (var batch in docs.Batch(batchSize))
                        {
                            var timer = Stopwatch.StartNew();

                            // A fresh index session is created and disposed for every batch.
                            using (var indexSession = sessionFactory.CreateIndexSession(collectionName, collectionId))
                            {
                                foreach (var doc in batch)
                                {
                                    indexSession.EmbedTerms(doc);
                                }
                            }

                            Logging.Log(null, string.Format("indexed batch #{0} in {1}", batchCount++, timer.Elapsed));
                        }
                    }

                    // Only the first matching file is processed.
                    break;
                }
            }

            Logging.Log(null, string.Format("indexed {0} batches in {1}", batchCount, fullTime.Elapsed));
        }
Пример #5
0
        /// <summary>
        /// Reads Wikipedia documents from a file, maps each to a
        /// language/url/title/description document, and stores and indexes them. Each page of
        /// <c>pageSize</c> documents gets its own write/index session pair; within a page,
        /// debug info is logged after every <c>reportSize</c> documents.
        /// </summary>
        /// <param name="args">
        /// Required keys: "fileName", "collection", "skip", "take", "pageSize" (the last three
        /// must parse as integers). Optional key: "reportSize" (integer, defaults to 1000).
        /// The "directory" key was previously read but never used and is no longer required.
        /// </param>
        /// <param name="logger">Logger given to the session factory and used for debug info.</param>
        public void Run(IDictionary <string, string> args, ILogger logger)
        {
            var fileName      = args["fileName"];
            var collection    = args["collection"];
            var skip          = int.Parse(args["skip"]);
            var take          = int.Parse(args["take"]);
            var pageSize      = int.Parse(args["pageSize"]);
            var reportSize    = args.TryGetValue("reportSize", out var reportSizeText) ? int.Parse(reportSizeText) : 1000;
            var collectionId  = collection.ToHash();
            var fieldsToStore = new HashSet <string> {
                "language", "url", "title", "description"
            };
            var fieldsToIndex = new HashSet <string> {
                "language", "url", "title", "description"
            };
            var debugger = new IndexDebugger();
            var payload  = WikipediaHelper.ReadWP(fileName, skip, take)
                           .Select(x => new Dictionary <string, object>
            {
                { "language", x["language"].ToString() },
                { "url", string.Format("www.wikipedia.org/search-redirect.php?family=wikipedia&language={0}&search={1}", x["language"], x["title"]) },
                { "title", x["title"] },
                { "description", x["text"] }
            });

            using (var sessionFactory = new SessionFactory(new KeyValueConfiguration("sir.ini"), logger))
            {
                foreach (var page in payload.Batch(pageSize))
                {
                    using (var writeSession = sessionFactory.CreateWriteSession(collectionId))
                        using (var indexSession = sessionFactory.CreateIndexSession(collectionId, new TextModel()))
                        {
                            foreach (var batch in page.Batch(reportSize))
                            {
                                // Iterate the current batch only. Iterating the whole page here
                                // would write and index every document once per batch.
                                foreach (var document in batch)
                                {
                                    var documentId = writeSession.Put(document, fieldsToStore);

                                    foreach (var kv in document)
                                    {
                                        // Only non-null values of the indexable fields are indexed.
                                        if (fieldsToIndex.Contains(kv.Key) && kv.Value != null)
                                        {
                                            var keyId = writeSession.EnsureKeyExists(kv.Key);

                                            indexSession.Put(documentId, keyId, kv.Value.ToString());
                                        }
                                    }
                                }

                                // Report progress once per batch of reportSize documents.
                                var debugInfo = debugger.GetDebugInfo(indexSession);

                                if (debugInfo != null)
                                {
                                    logger.LogInformation(debugInfo);
                                }
                            }
                        }
                }
            }
        }