/// <summary>
/// Reads documents through <c>WikipediaHelper.ReadWP</c>, puts every document into a
/// write session for the target collection, and tokenizes each indexable field with a
/// <c>BagOfCharsModel</c>, stepping a <c>BatchDebugger</c> once per token. When all pages
/// have been processed, the step count and time reported by the debugger are logged.
/// </summary>
/// <param name="args">
/// Command arguments. <c>dataDirectory</c>, <c>fileName</c> and <c>collection</c> are read
/// through the indexer and must therefore be present. Optional integer arguments and their
/// defaults: <c>skip</c> (0), <c>take</c> (<c>int.MaxValue</c>; an explicit 0 is also
/// replaced by <c>int.MaxValue</c>), <c>sampleSize</c> (1000) and <c>pageSize</c> (100000).
/// Optional values are parsed with <c>int.Parse</c>, so a non-numeric value throws.
/// </param>
/// <param name="logger">
/// Logger handed to the session factory and the batch debugger, and used for the final summary line.
/// </param>
public void Run(IDictionary<string, string> args, ILogger logger)
        {
            // Mandatory arguments: the indexer fails right here when one of them is missing.
            string dataDirectory = args["dataDirectory"];
            string fileName = args["fileName"];
            string collection = args["collection"];

            // Optional arguments: start from the default and overwrite it when the key is supplied.
            int skip = 0;
            if (args.ContainsKey("skip"))
            {
                skip = int.Parse(args["skip"]);
            }

            int take = int.MaxValue;
            if (args.ContainsKey("take"))
            {
                take = int.Parse(args["take"]);
            }

            // A take of zero is treated the same as "no take supplied".
            if (take == 0)
            {
                take = int.MaxValue;
            }

            int sampleSize = 1000;
            if (args.ContainsKey("sampleSize"))
            {
                sampleSize = int.Parse(args["sampleSize"]);
            }

            int pageSize = 100000;
            if (args.ContainsKey("pageSize"))
            {
                pageSize = int.Parse(args["pageSize"]);
            }

            var collectionId = collection.ToHash();
            var storedFields = new HashSet<string> { "language", "wikibase_item", "title", "text" };
            var indexedFields = new HashSet<string> { "language", "title", "text" };

            var model = new BagOfCharsModel();
            var payload = WikipediaHelper.ReadWP(fileName, skip, take, storedFields, indexedFields);
            var debugger = new BatchDebugger(logger, sampleSize);

            using (var sessionFactory = new SessionFactory(dataDirectory, logger))
            using (var writeSession = new WriteSession(new DocumentWriter(collectionId, sessionFactory)))
            {
                foreach (var batch in payload.Batch(pageSize))
                {
                    // One index session is opened per page and disposed when the page is done;
                    // nothing in this method writes to it.
                    using (new IndexSession<string>(model, model))
                    {
                        foreach (var doc in batch)
                        {
                            writeSession.Put(doc);

                            foreach (var indexable in doc.IndexableFields)
                            {
                                var text = (string)indexable.Value;

                                // Tokens are only counted; the token itself is not inspected.
                                foreach (var token in model.Tokenize(text))
                                {
                                    debugger.Step();
                                }
                            }
                        }
                    }
                }

                logger.LogInformation($"tokenized {debugger.StepCount} in {debugger.Time}.");
            }
        }
// Example #2
// 0
        /// <summary>
        /// Truncates the target collection and then puts every document read through
        /// <c>WikipediaHelper.ReadWP</c> into a write session for that collection, stepping a
        /// <c>BatchDebugger</c> once per document. An empty set of fields-to-index is passed
        /// to the reader.
        /// </summary>
        /// <param name="args">
        /// Command arguments. <c>dataDirectory</c>, <c>fileName</c> and <c>collection</c> are read
        /// through the indexer and must therefore be present. Optional integer arguments and their
        /// defaults: <c>skip</c> (0), <c>take</c> (<c>int.MaxValue</c>; an explicit 0 is also
        /// replaced by <c>int.MaxValue</c>) and <c>sampleSize</c> (1000). Optional values are
        /// parsed with <c>int.Parse</c>, so a non-numeric value throws.
        /// </param>
        /// <param name="logger">Logger handed to the session factory and the batch debugger.</param>
        public void Run(IDictionary<string, string> args, ILogger logger)
        {
            // Mandatory arguments: the indexer fails right here when one of them is missing.
            string dataDirectory = args["dataDirectory"];
            string fileName = args["fileName"];
            string collection = args["collection"];

            // Optional arguments: start from the default and overwrite it when the key is supplied.
            int skip = 0;
            if (args.ContainsKey("skip"))
            {
                skip = int.Parse(args["skip"]);
            }

            int take = int.MaxValue;
            if (args.ContainsKey("take"))
            {
                take = int.Parse(args["take"]);
            }

            // A take of zero is treated the same as "no take supplied".
            if (take == 0)
            {
                take = int.MaxValue;
            }

            int sampleSize = 1000;
            if (args.ContainsKey("sampleSize"))
            {
                sampleSize = int.Parse(args["sampleSize"]);
            }

            var collectionId = collection.ToHash();
            var storedFields = new HashSet<string> { "language", "wikibase_item", "title", "text", "url" };
            var indexedFields = new HashSet<string>();

            var documents = WikipediaHelper.ReadWP(fileName, skip, take, storedFields, indexedFields);

            using (var sessionFactory = new SessionFactory(dataDirectory, logger))
            {
                // The collection is truncated before the write session is opened.
                sessionFactory.Truncate(collectionId);

                var debugger = new BatchDebugger(logger, sampleSize);

                using (var writeSession = new WriteSession(new DocumentWriter(collectionId, sessionFactory)))
                {
                    foreach (var doc in documents)
                    {
                        writeSession.Put(doc);
                        debugger.Step();
                    }
                }
            }
        }