Exemple #1
0
        /// <summary>
        /// Reads one document from a collection and, for every field of that document,
        /// prints the field name, a visualization of a vector tree built from the
        /// field's tokens, and finally the tokens themselves (one per line).
        /// </summary>
        /// <param name="args">
        /// Must contain the keys "dataDirectory", "collection", "documentId" (parsed as a
        /// <see cref="long"/>) and "select" (field names separated by commas and/or spaces).
        /// </param>
        /// <param name="logger">Logger handed to the session factory.</param>
        public void Run(IDictionary <string, string> args, ILogger logger)
        {
            // Arguments are read in this fixed order so that a missing key fails the same way as before.
            string directory      = args["dataDirectory"];
            string collectionName = args["collection"];
            long   docId          = long.Parse(args["documentId"]);

            string[] requested  = args["select"].Split(new[] { ',', ' ' }, StringSplitOptions.RemoveEmptyEntries);
            var selectedFields  = new HashSet<string>(requested);

            var collectionId = collectionName.ToHash();
            var model        = new BagOfCharsModel();

            using (var factory = new SessionFactory(directory, logger))
            {
                using (var streamSession = new DocumentStreamSession(factory))
                {
                    using (var reader = new DocumentReader(collectionId, factory))
                    {
                        var document = streamSession.ReadDocument((collectionId, docId), selectedFields, reader);

                        foreach (var field in document.Fields)
                        {
                            var text   = field.Value.ToString();
                            var tokens = model.Tokenize(text);

                            // Build one tree per field by merging every token vector into it.
                            var root = new VectorNode();

                            foreach (var token in tokens)
                            {
                                var node = new VectorNode(token);
                                root.MergeOrAdd(node, model);
                            }

                            Console.WriteLine(field.Name);
                            Console.WriteLine(PathFinder.Visualize(root));
                            Console.WriteLine(string.Join('\n', tokens));
                        }
                    }
                }
            }
        }
Exemple #2
0
        /// <summary>
        /// Indexes the documents of <paramref name="collection"/> stored in <paramref name="dir"/>.
        /// Scans the "*.docs" files in the directory for the first one whose file name starts
        /// with the collection's hash, reads its documents, and writes them to the index in
        /// batches. Each batch and the total run are timed and logged.
        /// </summary>
        /// <param name="dir">Directory that holds the "*.docs" files; also passed to the session factory.</param>
        /// <param name="collection">Name of the collection to index.</param>
        /// <param name="skip">Number of documents to skip before indexing; ignored unless greater than zero.</param>
        /// <param name="take">Maximum number of documents to index; ignored unless greater than zero.</param>
        /// <param name="batchSize">Number of documents handed to each indexing job.</param>
        /// <returns>A task that completes when all batches have been written.</returns>
        private static async Task Index(string dir, string collection, int skip, int take, int batchSize)
        {
            var timer = Stopwatch.StartNew();

            var files = Directory.GetFiles(dir, "*.docs");

            // The session factory is disposable; make sure it is released even if indexing throws.
            using (var sessionFactory = new SessionFactory(
                dir,
                new LatinTokenizer(),
                new IniConfiguration(Path.Combine(Directory.GetCurrentDirectory(), "sir.ini"))))
            {
                // Loop-invariant: the textual hash that the file name is compared against.
                var collectionHash = collection.ToHash().ToString();
                var batchNo = 0;

                foreach (var docFileName in files)
                {
                    var name = Path.GetFileNameWithoutExtension(docFileName)
                               .Split(".", StringSplitOptions.RemoveEmptyEntries);

                    // A file such as ".docs" yields no name segments; it cannot match any collection.
                    if (name.Length == 0)
                    {
                        continue;
                    }

                    var collectionId = name[0];

                    if (collectionId != collectionHash)
                    {
                        continue;
                    }

                    using (var readSession = new DocumentStreamSession(collection, sessionFactory))
                    {
                        var docs = readSession.ReadDocs();

                        if (skip > 0)
                        {
                            docs = docs.Skip(skip);
                        }

                        if (take > 0)
                        {
                            docs = docs.Take(take);
                        }

                        var writeTimer = new Stopwatch();

                        foreach (var batch in docs.Batch(batchSize))
                        {
                            writeTimer.Restart();

                            var job = new IndexingJob(collection, batch);

                            // A fresh index session per batch, disposed as soon as the batch is written.
                            using (var indexSession = sessionFactory.CreateIndexSession(collection))
                            {
                                await indexSession.Write(job);
                            }

                            _log.Log(string.Format("batch {0} done in {1}", batchNo++, writeTimer.Elapsed));
                        }
                    }

                    // Only the first matching file is indexed.
                    break;
                }
            }

            _log.Log(string.Format("indexing took {0}", timer.Elapsed));
        }