예제 #1
0
        /// <summary>
        /// Reads documents from a Wikipedia file, stores them in the given collection and
        /// writes an index for the "title" and "text" fields, one index write per page of documents.
        /// </summary>
        /// <param name="args">
        /// Required keys: "dataDirectory", "fileName", "collection".
        /// Optional keys: "skip" (default 0), "take" (default int.MaxValue; 0 is treated as int.MaxValue),
        /// "sampleSize" (default 1000), "pageSize" (default 100000).
        /// </param>
        /// <param name="logger">Logger handed to the session factory, the index debugger and the index stream.</param>
        public void Run(IDictionary<string, string> args, ILogger logger)
        {
            // Required arguments; the indexer throws if any of them is missing.
            string dataDirectory = args["dataDirectory"];
            string fileName = args["fileName"];
            string collection = args["collection"];

            // Optional arguments start at their defaults and are overridden when supplied.
            int skip = 0;
            if (args.ContainsKey("skip"))
            {
                skip = int.Parse(args["skip"]);
            }

            int take = int.MaxValue;
            if (args.ContainsKey("take"))
            {
                take = int.Parse(args["take"]);
            }

            int sampleSize = 1000;
            if (args.ContainsKey("sampleSize"))
            {
                sampleSize = int.Parse(args["sampleSize"]);
            }

            int pageSize = 100000;
            if (args.ContainsKey("pageSize"))
            {
                pageSize = int.Parse(args["pageSize"]);
            }

            // An explicit take of zero means "no limit".
            if (take == 0)
            {
                take = int.MaxValue;
            }

            var collectionId = collection.ToHash();
            var storedFieldNames = new HashSet<string> { "language", "wikibase_item", "title", "text", "url" };
            var indexedFieldNames = new HashSet<string> { "title", "text" };

            // The same model instance is passed for both IndexSession constructor arguments.
            var model = new BagOfCharsModel();
            var documents = WikipediaHelper.ReadWP(fileName, skip, take, storedFieldNames, indexedFieldNames);

            using (var sessionFactory = new SessionFactory(dataDirectory, logger))
            {
                var progress = new IndexDebugger(logger, sampleSize);

                // One write session spans every page; index stream and index session are per page.
                using (var documentWriter = new WriteSession(new DocumentWriter(collectionId, sessionFactory)))
                {
                    foreach (var page in documents.Batch(pageSize))
                    {
                        using (var indexStream = new WritableIndexStream(collectionId, sessionFactory, logger: logger))
                        {
                            using (var pageIndex = new IndexSession<string>(model, model))
                            {
                                foreach (var document in page)
                                {
                                    documentWriter.Put(document);

                                    foreach (var field in document.IndexableFields)
                                    {
                                        var text = (string)field.Value;

                                        pageIndex.Put(document.Id, field.KeyId, text);
                                    }

                                    progress.Step(pageIndex);
                                }

                                // Write the in-memory index built for this page to the index stream.
                                indexStream.Write(pageIndex.GetInMemoryIndex());
                            }
                        }
                    }
                }
            }
        }
예제 #2
0
        /// <summary>
        /// Reads MNIST images, stores each image's label as a document, indexes the image itself
        /// under the "image" key, then prints the resulting in-memory index tree and logs a summary.
        /// </summary>
        /// <param name="args">
        /// Required keys: "collection", "imageFileName", "labelFileName".
        /// </param>
        /// <param name="logger">Logger used for debug output and the final summary line.</param>
        public void Run(IDictionary<string, string> args, ILogger logger)
        {
            var time = Stopwatch.StartNew();
            var collectionId = args["collection"].ToHash();
            var images = new MnistReader(args["imageFileName"], args["labelFileName"]).Read();
            var count = 0;

            // The set of stored field names is the same for every image, so it is built once
            // here instead of being re-allocated on every loop iteration.
            var storeFields = new HashSet<string> { "label" };

            VectorNode tree;

            using (var sessionFactory = new SessionFactory(new KeyValueConfiguration("sir.ini"), logger))
            using (var writeSession = sessionFactory.CreateWriteSession(collectionId))
            {
                var debugger = new IndexDebugger();
                var keyId = writeSession.EnsureKeyExists("image");

                using (var indexSession = sessionFactory.CreateIndexSession(collectionId, new ImageModel()))
                {
                    foreach (var image in images)
                    {
                        // Only the label is stored as document content; the image is indexed separately.
                        var document = new Dictionary<string, object>
                        {
                            { "label", image.Label }
                        };
                        var documentId = writeSession.Put(document, storeFields);

                        indexSession.Put(documentId, keyId, image);

                        count++;

                        // The debugger may return null, in which case nothing is logged for this step.
                        var debugInfo = debugger.GetDebugInfo(indexSession);

                        if (debugInfo != null)
                        {
                            logger.LogInformation(debugInfo);
                        }
                    }

                    // Capture the tree before the index session is disposed.
                    tree = indexSession.GetInMemoryIndex(keyId);
                }
            }

            Print(tree);

            logger.LogInformation($"indexed {count} mnist images in {time.Elapsed}");
        }
예제 #3
0
        /// <summary>
        /// Truncates the target collection, then stores and indexes MNIST images using a
        /// <c>LinearClassifierImageModel</c>, writes the resulting index to the index stream,
        /// logs a summary and prints the tree built for the "image" key.
        /// </summary>
        /// <param name="args">
        /// Required keys: "dataDirectory", "collection", "imageFileName", "labelFileName".
        /// </param>
        /// <param name="logger">Logger handed to the debugger, session factory and index stream.</param>
        public void Run(IDictionary<string, string> args, ILogger logger)
        {
            var stopwatch = Stopwatch.StartNew();
            var dataDirectory = args["dataDirectory"];
            var collectionId = args["collection"].ToHash();
            var reader = new MnistReader(args["imageFileName"], args["labelFileName"]);
            var images = reader.Read();
            VectorNode imageTree;
            var progress = new IndexDebugger(logger);
            var classifier = new LinearClassifierImageModel();

            using (var sessionFactory = new SessionFactory(dataDirectory, logger))
            {
                // Truncate the collection before writing to it.
                sessionFactory.Truncate(collectionId);

                using (var documentSession = new WriteSession(new DocumentWriter(collectionId, sessionFactory)))
                {
                    using (var imageIndex = new IndexSession<IImage>(classifier, classifier))
                    {
                        var imageKeyId = documentSession.EnsureKeyExists("image");

                        foreach (var image in images)
                        {
                            var pixelsField = new Field("image", image.Pixels, index: true, store: true);
                            var labelField = new Field("label", image.Label, index: false, store: true);
                            Field[] fields = { pixelsField, labelField };
                            var document = new Document(fields);

                            documentSession.Put(document);
                            imageIndex.Put(document.Id, pixelsField.KeyId, image);

                            progress.Step(imageIndex);
                        }

                        var inMemoryIndex = imageIndex.GetInMemoryIndex();

                        // Keep a reference to the image tree so it can be printed after the sessions close.
                        imageTree = inMemoryIndex[imageKeyId];

                        using (var indexStream = new WritableIndexStream(collectionId, sessionFactory, logger: logger))
                        {
                            indexStream.Write(inMemoryIndex);
                        }
                    }
                }
            }

            logger.LogInformation($"indexed {progress.Steps} mnist images in {stopwatch.Elapsed}");

            Print(imageTree);
        }
예제 #4
0
        /// <summary>
        /// Reads Wikipedia documents, maps each to a language/url/title/description document,
        /// then stores and indexes them page by page, logging debug info after every report batch.
        /// </summary>
        /// <param name="args">
        /// Required keys: "fileName", "directory", "collection", "skip", "take", "pageSize".
        /// Optional key: "reportSize" (default 1000).
        /// </param>
        /// <param name="logger">Logger handed to the session factory and used for debug output.</param>
        public void Run(IDictionary<string, string> args, ILogger logger)
        {
            var fileName = args["fileName"];
            // Not used below; the lookup is retained so a missing "directory" argument still throws as before.
            var dir = args["directory"];
            var collection = args["collection"];
            var skip = int.Parse(args["skip"]);
            var take = int.Parse(args["take"]);
            var pageSize = int.Parse(args["pageSize"]);
            var reportSize = args.ContainsKey("reportSize") ? int.Parse(args["reportSize"]) : 1000;
            var collectionId = collection.ToHash();
            var fieldsToStore = new HashSet<string> {
                "language", "url", "title", "description"
            };
            var fieldsToIndex = new HashSet<string> {
                "language", "url", "title", "description"
            };
            var debugger = new IndexDebugger();

            // Project each source record onto the four fields that get stored and indexed.
            var payload = WikipediaHelper.ReadWP(fileName, skip, take)
                .Select(x => new Dictionary<string, object>
                {
                    { "language", x["language"].ToString() },
                    { "url", string.Format("www.wikipedia.org/search-redirect.php?family=wikipedia&language={0}&search={1}", x["language"], x["title"]) },
                    { "title", x["title"] },
                    { "description", x["text"] }
                });

            using (var sessionFactory = new SessionFactory(new KeyValueConfiguration("sir.ini"), logger))
            {
                foreach (var page in payload.Batch(pageSize))
                {
                    // A fresh write session and index session are opened for every page.
                    using (var writeSession = sessionFactory.CreateWriteSession(collectionId))
                    using (var indexSession = sessionFactory.CreateIndexSession(collectionId, new TextModel()))
                    {
                        foreach (var batch in page.Batch(reportSize))
                        {
                            // Iterate the report-sized batch, not the whole page: looping over
                            // the page here would write and index every document in the page
                            // once per batch.
                            foreach (var document in batch)
                            {
                                var documentId = writeSession.Put(document, fieldsToStore);

                                foreach (var kv in document)
                                {
                                    // Null values are skipped rather than indexed.
                                    if (fieldsToIndex.Contains(kv.Key) && kv.Value != null)
                                    {
                                        var keyId = writeSession.EnsureKeyExists(kv.Key);

                                        indexSession.Put(documentId, keyId, kv.Value.ToString());
                                    }
                                }
                            }

                            // The debugger may return null, in which case nothing is logged for this batch.
                            var debugInfo = debugger.GetDebugInfo(indexSession);

                            if (debugInfo != null)
                            {
                                logger.LogInformation(debugInfo);
                            }
                        }
                    }
                }
            }
        }