コード例 #1
0
        /// <summary>
        /// Reads documents through <c>WikipediaHelper.ReadWP</c>, reshapes each one into a dictionary
        /// with the keys "_language", "_url", "title" and "body", and passes them in batches to
        /// <c>Submit</c> together with the target URI and a shared <see cref="HttpClient"/>.
        /// A timing line is written to the console for every batch.
        /// </summary>
        /// <param name="args">
        /// Must contain "fileName", "uri", "count" and "batchSize". "count" and "batchSize" are parsed
        /// as integers; a missing key or a malformed number throws.
        /// </param>
        /// <param name="logger">Not used by this method; progress goes to <see cref="Console"/>.</param>
        public void Run(IDictionary <string, string> args, ILogger logger)
        {
            var fileName  = args["fileName"];
            var uri       = new Uri(args["uri"]);
            var count     = int.Parse(args["count"]);
            var batchSize = int.Parse(args["batchSize"]);
            var batchNo   = 0;

            // One client is shared by all batches and disposed when the run ends.
            using (var httpClient = new HttpClient())
            {
                var payload = WikipediaHelper.ReadWP(fileName, 0, count)
                              .Select(x => new Dictionary <string, object>
                {
                    { "_language", x["language"].ToString() },
                    { "_url", string.Format("www.wikipedia.org/search-redirect.php?family=wikipedia&language={0}&search={1}", x["language"], x["title"]) },
                    { "title", x["title"] },
                    { "body", x["text"] }
                });

                foreach (var batch in payload.Batch(batchSize))
                {
                    var time = Stopwatch.StartNew();

                    // Materialize inside the timed region so the real number of documents is known.
                    // The final batch can hold fewer than batchSize documents, and reporting
                    // batchSize would overstate the throughput for it.
                    var documents = batch.ToList();

                    Submit(documents, uri, httpClient);
                    time.Stop();

                    // Guard against a zero elapsed time: casting Infinity to int is not well defined.
                    var seconds       = time.Elapsed.TotalSeconds;
                    var docsPerSecond = seconds > 0 ? (int)(documents.Count / seconds) : 0;
                    Console.WriteLine($"batch {batchNo++} took {time.Elapsed} {docsPerSecond} docs/s");
                }
            }
        }
コード例 #2
0
        /// <summary>
        /// Reads documents via <c>WikipediaHelper.ReadWP</c> and processes them page by page: every
        /// document is put into a write session, every indexable field of the document is put into an
        /// index session, and the index session's in-memory index is written to a
        /// <c>WritableIndexStream</c> once per page.
        /// </summary>
        /// <param name="args">
        /// Required keys: "dataDirectory", "fileName", "collection".
        /// Optional keys: "skip" (default 0), "take" (default <c>int.MaxValue</c>; a value of 0 is also
        /// treated as <c>int.MaxValue</c>), "sampleSize" (default 1000), "pageSize" (default 100000).
        /// </param>
        /// <param name="logger">Handed to the session factory, the index debugger and the index stream.</param>
        public void Run(IDictionary <string, string> args, ILogger logger)
        {
            var directory      = args["dataDirectory"];
            var sourceFile     = args["fileName"];
            var collectionName = args["collection"];

            // Optional arguments: start from the default and overwrite only when the key is present.
            var skipCount = 0;
            if (args.ContainsKey("skip"))
            {
                skipCount = int.Parse(args["skip"]);
            }

            var takeCount = int.MaxValue;
            if (args.ContainsKey("take"))
            {
                takeCount = int.Parse(args["take"]);
            }

            var debugSampleSize = 1000;
            if (args.ContainsKey("sampleSize"))
            {
                debugSampleSize = int.Parse(args["sampleSize"]);
            }

            var docsPerPage = 100000;
            if (args.ContainsKey("pageSize"))
            {
                docsPerPage = int.Parse(args["pageSize"]);
            }

            // A take of zero means "no limit".
            takeCount = takeCount == 0 ? int.MaxValue : takeCount;

            var collectionId = collectionName.ToHash();
            var storedFields = new HashSet<string> { "language", "wikibase_item", "title", "text", "url" };
            var indexedFields = new HashSet<string> { "title", "text" };

            var model     = new BagOfCharsModel();
            var documents = WikipediaHelper.ReadWP(sourceFile, skipCount, takeCount, storedFields, indexedFields);

            using (var sessionFactory = new SessionFactory(directory, logger))
            {
                var indexDebugger = new IndexDebugger(logger, debugSampleSize);

                using (var writer = new WriteSession(new DocumentWriter(collectionId, sessionFactory)))
                {
                    foreach (var chunk in documents.Batch(docsPerPage))
                    {
                        // A fresh index stream and index session are used for every page.
                        using (var stream = new WritableIndexStream(collectionId, sessionFactory, logger: logger))
                        {
                            using (var indexer = new IndexSession<string>(model, model))
                            {
                                foreach (var doc in chunk)
                                {
                                    writer.Put(doc);

                                    foreach (var indexable in doc.IndexableFields)
                                    {
                                        indexer.Put(doc.Id, indexable.KeyId, (string)indexable.Value);
                                    }

                                    indexDebugger.Step(indexer);
                                }

                                stream.Write(indexer.GetInMemoryIndex());
                            }
                        }
                    }
                }
            }
        }
コード例 #3
0
        /// <summary>
        /// Reads documents via <c>WikipediaHelper.ReadWP</c>, puts each one into a write session and
        /// runs <c>BagOfCharsModel.Tokenize</c> over every indexable field, stepping a
        /// <c>BatchDebugger</c> once per token. When all pages are done, the debugger's step count
        /// and time are logged.
        /// </summary>
        /// <param name="args">
        /// Required keys: "dataDirectory", "fileName", "collection".
        /// Optional keys: "skip" (default 0), "take" (default <c>int.MaxValue</c>; a value of 0 is also
        /// treated as <c>int.MaxValue</c>), "sampleSize" (default 1000), "pageSize" (default 100000).
        /// </param>
        /// <param name="logger">Handed to the debugger and the session factory; also receives the summary line.</param>
        public void Run(IDictionary <string, string> args, ILogger logger)
        {
            var directory      = args["dataDirectory"];
            var sourceFile     = args["fileName"];
            var collectionName = args["collection"];

            // Optional arguments: start from the default and overwrite only when the key is present.
            var skipCount = 0;
            if (args.ContainsKey("skip"))
            {
                skipCount = int.Parse(args["skip"]);
            }

            var takeCount = int.MaxValue;
            if (args.ContainsKey("take"))
            {
                takeCount = int.Parse(args["take"]);
            }

            var debugSampleSize = 1000;
            if (args.ContainsKey("sampleSize"))
            {
                debugSampleSize = int.Parse(args["sampleSize"]);
            }

            var docsPerPage = 100000;
            if (args.ContainsKey("pageSize"))
            {
                docsPerPage = int.Parse(args["pageSize"]);
            }

            // A take of zero means "no limit".
            takeCount = takeCount == 0 ? int.MaxValue : takeCount;

            var collectionId  = collectionName.ToHash();
            var storedFields  = new HashSet<string> { "language", "wikibase_item", "title", "text" };
            var indexedFields = new HashSet<string> { "language", "title", "text" };

            var model        = new BagOfCharsModel();
            var documents    = WikipediaHelper.ReadWP(sourceFile, skipCount, takeCount, storedFields, indexedFields);
            var stepDebugger = new BatchDebugger(logger, debugSampleSize);

            using (var sessionFactory = new SessionFactory(directory, logger))
            using (var writer = new WriteSession(new DocumentWriter(collectionId, sessionFactory)))
            {
                foreach (var chunk in documents.Batch(docsPerPage))
                {
                    // The index session is opened and disposed per page but never written to here;
                    // only the tokenizer is exercised.
                    using (var indexer = new IndexSession<string>(model, model))
                    {
                        foreach (var doc in chunk)
                        {
                            writer.Put(doc);

                            foreach (var indexable in doc.IndexableFields)
                            {
                                var text = (string)indexable.Value;

                                // The tokens themselves are discarded; each one only advances the debugger.
                                foreach (var ignored in model.Tokenize(text))
                                {
                                    stepDebugger.Step();
                                }
                            }
                        }
                    }
                }

                logger.LogInformation($"tokenized {stepDebugger.StepCount} in {stepDebugger.Time}.");
            }
        }
コード例 #4
0
        /// <summary>
        /// Calls <c>Truncate</c> on the session factory for the target collection, then reads documents
        /// via <c>WikipediaHelper.ReadWP</c> and puts each one into a write session, stepping a
        /// <c>BatchDebugger</c> once per document. No fields are requested for indexing.
        /// </summary>
        /// <param name="args">
        /// Required keys: "dataDirectory", "fileName", "collection".
        /// Optional keys: "skip" (default 0), "take" (default <c>int.MaxValue</c>; a value of 0 is also
        /// treated as <c>int.MaxValue</c>), "sampleSize" (default 1000).
        /// </param>
        /// <param name="logger">Handed to the session factory and the debugger.</param>
        public void Run(IDictionary <string, string> args, ILogger logger)
        {
            var directory      = args["dataDirectory"];
            var sourceFile     = args["fileName"];
            var collectionName = args["collection"];

            // Optional arguments: start from the default and overwrite only when the key is present.
            var skipCount = 0;
            if (args.ContainsKey("skip"))
            {
                skipCount = int.Parse(args["skip"]);
            }

            var takeCount = int.MaxValue;
            if (args.ContainsKey("take"))
            {
                takeCount = int.Parse(args["take"]);
            }

            var debugSampleSize = 1000;
            if (args.ContainsKey("sampleSize"))
            {
                debugSampleSize = int.Parse(args["sampleSize"]);
            }

            // A take of zero means "no limit".
            takeCount = takeCount == 0 ? int.MaxValue : takeCount;

            var collectionId  = collectionName.ToHash();
            var storedFields  = new HashSet<string> { "language", "wikibase_item", "title", "text", "url" };
            var indexedFields = new HashSet<string>();

            var documents = WikipediaHelper.ReadWP(sourceFile, skipCount, takeCount, storedFields, indexedFields);

            using (var sessionFactory = new SessionFactory(directory, logger))
            {
                // Truncate is called before any document is written.
                sessionFactory.Truncate(collectionId);

                var stepDebugger = new BatchDebugger(logger, debugSampleSize);

                using (var writer = new WriteSession(new DocumentWriter(collectionId, sessionFactory)))
                {
                    foreach (var doc in documents)
                    {
                        writer.Put(doc);
                        stepDebugger.Step();
                    }
                }
            }
        }
コード例 #5
0
        /// <summary>
        /// Reads documents via <c>WikipediaHelper.ReadWP</c>, reshapes each one into a dictionary with
        /// the keys "language", "url", "title" and "description", and processes them page by page.
        /// For every page a write session and an index session are created; each document is put into
        /// the write session and each non-null field listed in <c>fieldsToIndex</c> is put into the index
        /// session. After every report batch, debug info from the <c>IndexDebugger</c> is logged when
        /// it is not null.
        /// </summary>
        /// <param name="args">
        /// Required keys: "fileName", "directory", "collection", "skip", "take", "pageSize"
        /// (the last three are parsed as integers). Optional key: "reportSize" (default 1000).
        /// </param>
        /// <param name="logger">Handed to the session factory; also receives the debug info lines.</param>
        public void Run(IDictionary <string, string> args, ILogger logger)
        {
            var fileName      = args["fileName"];
            var dir           = args["directory"];
            var collection    = args["collection"];
            var skip          = int.Parse(args["skip"]);
            var take          = int.Parse(args["take"]);
            var pageSize      = int.Parse(args["pageSize"]);
            var reportSize    = args.ContainsKey("reportSize") ? int.Parse(args["reportSize"]) : 1000;
            var collectionId  = collection.ToHash();
            var fieldsToStore = new HashSet <string> {
                "language", "url", "title", "description"
            };
            var fieldsToIndex = new HashSet <string> {
                "language", "url", "title", "description"
            };
            var debugger = new IndexDebugger();
            var payload  = WikipediaHelper.ReadWP(fileName, skip, take)
                           .Select(x => new Dictionary <string, object>
            {
                { "language", x["language"].ToString() },
                { "url", string.Format("www.wikipedia.org/search-redirect.php?family=wikipedia&language={0}&search={1}", x["language"], x["title"]) },
                { "title", x["title"] },
                { "description", x["text"] }
            });

            using (var sessionFactory = new SessionFactory(new KeyValueConfiguration("sir.ini"), logger))
            {
                foreach (var page in payload.Batch(pageSize))
                {
                    using (var writeSession = sessionFactory.CreateWriteSession(collectionId))
                        using (var indexSession = sessionFactory.CreateIndexSession(collectionId, new TextModel()))
                        {
                            foreach (var batch in page.Batch(reportSize))
                            {
                                // Iterate the report batch, not the whole page. Looping over `page`
                                // here would write and index every document of the page once per
                                // report batch, producing duplicates.
                                foreach (var document in batch)
                                {
                                    var documentId = writeSession.Put(document, fieldsToStore);

                                    foreach (var kv in document)
                                    {
                                        if (fieldsToIndex.Contains(kv.Key) && kv.Value != null)
                                        {
                                            var keyId = writeSession.EnsureKeyExists(kv.Key);

                                            indexSession.Put(documentId, keyId, kv.Value.ToString());
                                        }
                                    }
                                }

                                // Report progress once per report batch.
                                var debugInfo = debugger.GetDebugInfo(indexSession);

                                if (debugInfo != null)
                                {
                                    logger.LogInformation(debugInfo);
                                }
                            }
                        }
                }
            }
        }