Пример #1
0
        /// <summary>
        /// Truncates the "cc_wet" collection and writes the records of a WET file into it.
        /// </summary>
        /// <param name="args">Must contain the keys "dataDirectory" and "fileName".</param>
        /// <param name="logger">Logger passed on to the session factory.</param>
        public void Run(IDictionary <string, string> args, ILogger logger)
        {
            var directory = args["dataDirectory"];
            var wetFile   = args["fileName"];
            var bocModel  = new BagOfCharsModel();
            var targetId  = "cc_wet".ToHash();

            // Only "url" is stored and only "description" is indexed; any other key gets both flags false.
            var storedKeys  = new HashSet<string> { "url" };
            var indexedKeys = new HashSet<string> { "description" };

            using (var sessions = new SessionFactory(directory, logger))
            {
                sessions.Truncate(targetId);

                // Lazily projects each record's key/value pairs into a Document.
                var documents = ReadWetFile(wetFile).Select(record =>
                {
                    var fields = record
                        .Select(pair => new Field(
                            pair.Key,
                            pair.Value,
                            index: indexedKeys.Contains(pair.Key),
                            store: storedKeys.Contains(pair.Key)))
                        .ToList();

                    return new Document(fields);
                });

                sessions.Write(targetId, documents, bocModel, reportSize: 1000);
            }
        }
Пример #2
0
        /// <summary>
        /// Registers the configuration, model, session factory, query parsers and HTTP reader/writer
        /// as singletons and returns the service provider built from <paramref name="services"/>.
        /// </summary>
        /// <param name="services">
        /// The collection to add registrations to. It must already contain an
        /// <see cref="ILoggerFactory"/> registration, because one is resolved here.
        /// </param>
        /// <returns>A provider built from <paramref name="services"/> after all registrations.</returns>
        /// <exception cref="System.InvalidOperationException">
        /// Thrown when no <see cref="ILoggerFactory"/> has been registered.
        /// </exception>
        public static IServiceProvider Configure(IServiceCollection services)
        {
            var assemblyPath = Directory.GetCurrentDirectory();
            var config       = new KeyValueConfiguration(Path.Combine(assemblyPath, "sir.ini"));

            services.Add(new ServiceDescriptor(typeof(IConfigurationProvider), config));

            // A temporary provider is built only to resolve the logger factory. GetRequiredService
            // fails with a descriptive error when logging has not been registered, instead of
            // returning null and causing a NullReferenceException on the next line.
            // NOTE(review): this temporary provider is never disposed; disposing it could also
            // dispose the logger factory that the singletons below capture - confirm before changing.
            var loggerFactory  = services.BuildServiceProvider().GetRequiredService <ILoggerFactory>();
            var logger         = loggerFactory.CreateLogger("Sir");
            var model          = new BagOfCharsModel();

            // NOTE(review): the data directory is hard-coded even though "sir.ini" is loaded above;
            // presumably it should come from configuration - confirm the intended key.
            var sessionFactory = new SessionFactory(@"c:\data\resin", logger);
            var qp             = new QueryParser <string>(sessionFactory, model, logger);
            var httpParser     = new HttpQueryParser(qp);

            services.AddSingleton(typeof(IModel <string>), model);
            services.AddSingleton(typeof(ISessionFactory), sessionFactory);
            services.AddSingleton(typeof(SessionFactory), sessionFactory);
            services.AddSingleton(typeof(QueryParser <string>), qp);
            services.AddSingleton(typeof(HttpQueryParser), httpParser);
            services.AddSingleton(typeof(IHttpWriter), new HttpWriter(sessionFactory));
            services.AddSingleton(typeof(IHttpReader), new HttpReader(
                                      sessionFactory,
                                      httpParser,
                                      loggerFactory.CreateLogger <HttpReader>()));

            return(services.BuildServiceProvider());
        }
Пример #3
0
        /// <summary>
        /// Builds an in-memory tree from the test data and verifies that every token of every word
        /// has a closest match whose score is not below <c>model.IdenticalAngle</c>.
        /// </summary>
        public void Can_traverse_index_in_memory()
        {
            var bocModel = new BagOfCharsModel();
            var index    = bocModel.CreateTree(bocModel, _data);

            Debug.WriteLine(PathFinder.Visualize(index));

            // Throws on the first token that cannot be found or does not score as identical.
            void VerifyEveryWord()
            {
                foreach (var word in _data)
                {
                    foreach (var queryVector in bocModel.Tokenize(word))
                    {
                        var match = PathFinder.ClosestMatch(index, queryVector, bocModel)
                            ?? throw new Exception($"unable to find {word} in tree.");

                        if (match.Score < bocModel.IdenticalAngle)
                        {
                            throw new Exception($"unable to score {word}.");
                        }

                        Debug.WriteLine($"{word} matched with {match.Node.Vector.Label} with {match.Score * 100}% certainty.");
                    }
                }
            }

            Assert.DoesNotThrow(() => VerifyEveryWord());
        }
Пример #4
0
        /// <summary>
        /// Reads documents through <c>WikipediaHelper.ReadWP</c>, stores each one through a write
        /// session and writes one in-memory index to the index stream per batch of documents.
        /// </summary>
        /// <param name="args">
        /// Required keys: "dataDirectory", "fileName", "collection". Optional keys and their
        /// defaults: "skip" (0), "take" (int.MaxValue; 0 is treated as int.MaxValue),
        /// "sampleSize" (1000), "pageSize" (100000).
        /// </param>
        /// <param name="logger">Logger passed to the session factory, index stream and debugger.</param>
        public void Run(IDictionary <string, string> args, ILogger logger)
        {
            var directory      = args["dataDirectory"];
            var sourceFile     = args["fileName"];
            var collectionName = args["collection"];

            var skip       = args.TryGetValue("skip", out var skipText) ? int.Parse(skipText) : 0;
            var take       = args.TryGetValue("take", out var takeText) ? int.Parse(takeText) : int.MaxValue;
            var sampleSize = args.TryGetValue("sampleSize", out var sampleText) ? int.Parse(sampleText) : 1000;
            var pageSize   = args.TryGetValue("pageSize", out var pageText) ? int.Parse(pageText) : 100000;

            // A take of zero means "no limit".
            if (take == 0)
            {
                take = int.MaxValue;
            }

            var collectionId = collectionName.ToHash();
            var storedFields = new HashSet<string> { "language", "wikibase_item", "title", "text", "url" };
            var indexedFields = new HashSet<string> { "title", "text" };

            var bocModel  = new BagOfCharsModel();
            var documents = WikipediaHelper.ReadWP(sourceFile, skip, take, storedFields, indexedFields);

            using (var sessions = new SessionFactory(directory, logger))
            {
                var indexDebugger = new IndexDebugger(logger, sampleSize);

                using (var writer = new WriteSession(new DocumentWriter(collectionId, sessions)))
                {
                    foreach (var batch in documents.Batch(pageSize))
                    {
                        using (var indexStream = new WritableIndexStream(collectionId, sessions, logger: logger))
                        {
                            using (var indexer = new IndexSession <string>(bocModel, bocModel))
                            {
                                foreach (var document in batch)
                                {
                                    writer.Put(document);

                                    foreach (var field in document.IndexableFields)
                                    {
                                        indexer.Put(document.Id, field.KeyId, (string)field.Value);
                                    }

                                    indexDebugger.Step(indexer);
                                }

                                // One index write per batch.
                                indexStream.Write(indexer.GetInMemoryIndex());
                            }
                        }
                    }
                }
            }
        }
Пример #5
0
        /// <summary>
        /// Reads one document from a collection and, for each of its fields, prints the field name,
        /// a visualization of the tree built from the field's tokens, and the tokens themselves.
        /// </summary>
        /// <param name="args">
        /// Required keys: "dataDirectory", "collection", "documentId" (parsed as a long) and
        /// "select" (field names separated by commas and/or spaces).
        /// </param>
        /// <param name="logger">Logger passed to the session factory.</param>
        public void Run(IDictionary <string, string> args, ILogger logger)
        {
            var directory      = args["dataDirectory"];
            var collectionName = args["collection"];
            var docId          = long.Parse(args["documentId"]);
            var separators     = new[] { ',', ' ' };
            var selectedFields = new HashSet<string>(
                args["select"].Split(separators, StringSplitOptions.RemoveEmptyEntries));
            var collectionId   = collectionName.ToHash();
            var bocModel       = new BagOfCharsModel();

            using (var sessions = new SessionFactory(directory, logger))
            {
                using (var documentSession = new DocumentStreamSession(sessions))
                {
                    using (var reader = new DocumentReader(collectionId, sessions))
                    {
                        var document = documentSession.ReadDocument((collectionId, docId), selectedFields, reader);

                        foreach (var field in document.Fields)
                        {
                            var tokens = bocModel.Tokenize(field.Value.ToString());
                            var root   = new VectorNode();

                            // Build a per-field tree from the field's tokens.
                            foreach (var token in tokens)
                            {
                                root.MergeOrAdd(new VectorNode(token), bocModel);
                            }

                            Console.WriteLine(field.Name);
                            Console.WriteLine(PathFinder.Visualize(root));
                            Console.WriteLine(string.Join('\n', tokens));
                        }
                    }
                }
            }
        }
Пример #6
0
        /// <summary>
        /// Command-line entry point: runs the plugin resolved for the command in <c>args[0]</c>, or
        /// one of the built-in commands ("slice", "truncate", "truncate-index", "optimize"), and
        /// logs the elapsed time.
        /// </summary>
        /// <param name="args">
        /// The command name followed by its flags. When no command is given an error is logged and
        /// the process exit code is set to 1.
        /// </param>
        static void Main(string[] args)
        {
            // Dispose the factory on every exit path; the .NET logging documentation recommends
            // this so that providers can flush pending output before the process ends.
            using (var loggerFactory = LoggerFactory.Create(builder =>
            {
                builder
                .AddFilter("Microsoft", LogLevel.Warning)
                .AddFilter("System", LogLevel.Warning)
                .AddFilter("Sir", LogLevel.Information)
                .AddConsole();
            }))
            {
                var logger = loggerFactory.CreateLogger("Sir");

                logger.LogInformation($"processing command: {string.Join(" ", args)}");

                // Without this guard args[0] throws IndexOutOfRangeException on an empty command line.
                if (args.Length == 0)
                {
                    logger.LogError("missing command argument.");
                    Environment.ExitCode = 1;

                    return;
                }

                var model   = new BagOfCharsModel();

                // Invariant lower-casing: the culture-sensitive ToLower() maps 'I' to a dotless 'ı'
                // under e.g. tr-TR, which would stop "TRUNCATE-INDEX" from matching below.
                var command = args[0].ToLowerInvariant();
                var flags   = ParseArgs(args);
                var plugin  = ResolvePlugin(command);
                var time    = Stopwatch.StartNew();

                if (plugin != null)
                {
                    try
                    {
                        plugin.Run(flags, logger);
                    }
                    catch (Exception ex)
                    {
                        // Plugin failures are logged rather than rethrown, so the timing line still runs.
                        logger.LogError(ex, ex.Message);
                    }
                }
                else if (command == "slice")
                {
                    Slice(flags);
                }
                else if (command == "truncate")
                {
                    Truncate(flags["dataDirectory"], flags["collection"], logger);
                }
                else if (command == "truncate-index")
                {
                    TruncateIndex(flags["dataDirectory"], flags["collection"], logger);
                }
                else if (command == "optimize")
                {
                    Optimize(flags, model, logger);
                }
                else
                {
                    logger.LogInformation("unknown command: {0}", command);

                    return;
                }

                logger.LogInformation($"executed {command} in {time.Elapsed}");
            }
        }
Пример #7
0
        /// <summary>
        /// Writes every test word as its own document, writing a separate in-memory index to the
        /// index stream per document, then verifies that searching for each word returns a top
        /// document whose score is not below <c>model.IdenticalAngle</c>.
        /// </summary>
        public void Can_search_filestreamed_with_multiple_pages()
        {
            const string collection = "Can_search_streamed_with_one_page_per_document";
            const string fieldName  = "description";

            var bocModel     = new BagOfCharsModel();
            var collectionId = collection.ToHash();

            _sessionFactory.Truncate(collectionId);

            using (var indexStream = new WritableIndexStream(collectionId, _sessionFactory))
            {
                using (var writer = new WriteSession(new DocumentWriter(collectionId, _sessionFactory)))
                {
                    var keyId = writer.EnsureKeyExists(fieldName);

                    foreach (var text in _data)
                    {
                        // A fresh index session per document, so each index is written on its own.
                        using (var indexer = new IndexSession <string>(bocModel, bocModel))
                        {
                            var fields   = new Field[] { new Field(fieldName, text, index: true, store: true) };
                            var document = new Document(fields);

                            writer.Put(document);
                            indexer.Put(document.Id, keyId, text);
                            indexStream.Write(indexer.GetInMemoryIndex());
                        }
                    }
                }
            }

            var parser = new QueryParser <string>(_sessionFactory, bocModel);

            using (var search = new SearchSession(_sessionFactory, bocModel, new PostingsReader(_sessionFactory)))
            {
                Assert.DoesNotThrow(() =>
                {
                    foreach (var word in _data)
                    {
                        var query   = parser.Parse(collection, word, fieldName, fieldName, and: true, or: false);
                        var topHit  = search.Search(query, 0, 1).Documents.FirstOrDefault()
                            ?? throw new Exception($"unable to find {word}.");

                        if (topHit.Score < bocModel.IdenticalAngle)
                        {
                            throw new Exception($"unable to score {word}.");
                        }

                        Debug.WriteLine($"{word} matched with {topHit.Score * 100}% certainty.");
                    }
                });
            }
        }
Пример #8
0
        /// <summary>
        /// Reads documents through <c>WikipediaHelper.ReadWP</c>, stores each one through a write
        /// session, tokenizes every indexable field while counting tokens with a
        /// <c>BatchDebugger</c>, and finally logs the step count and time.
        /// </summary>
        /// <param name="args">
        /// Required keys: "dataDirectory", "fileName", "collection". Optional keys and their
        /// defaults: "skip" (0), "take" (int.MaxValue; 0 is treated as int.MaxValue),
        /// "sampleSize" (1000), "pageSize" (100000).
        /// </param>
        /// <param name="logger">Logger passed to the session factory and debugger; also receives the summary.</param>
        public void Run(IDictionary <string, string> args, ILogger logger)
        {
            var directory      = args["dataDirectory"];
            var sourceFile     = args["fileName"];
            var collectionName = args["collection"];

            var skip       = args.TryGetValue("skip", out var skipText) ? int.Parse(skipText) : 0;
            var take       = args.TryGetValue("take", out var takeText) ? int.Parse(takeText) : int.MaxValue;
            var sampleSize = args.TryGetValue("sampleSize", out var sampleText) ? int.Parse(sampleText) : 1000;
            var pageSize   = args.TryGetValue("pageSize", out var pageText) ? int.Parse(pageText) : 100000;

            // A take of zero means "no limit".
            if (take == 0)
            {
                take = int.MaxValue;
            }

            var collectionId  = collectionName.ToHash();
            var storedFields  = new HashSet<string> { "language", "wikibase_item", "title", "text" };
            var indexedFields = new HashSet<string> { "language", "title", "text" };

            var bocModel     = new BagOfCharsModel();
            var documents    = WikipediaHelper.ReadWP(sourceFile, skip, take, storedFields, indexedFields);
            var stepDebugger = new BatchDebugger(logger, sampleSize);

            using (var sessions = new SessionFactory(directory, logger))
            using (var writer = new WriteSession(new DocumentWriter(collectionId, sessions)))
            {
                foreach (var batch in documents.Batch(pageSize))
                {
                    // NOTE(review): the index session is created and disposed per batch but nothing is put into it.
                    using (var indexer = new IndexSession <string>(bocModel, bocModel))
                    {
                        foreach (var document in batch)
                        {
                            writer.Put(document);

                            foreach (var field in document.IndexableFields)
                            {
                                // Only the number of tokens matters; the token itself is discarded.
                                foreach (var token in bocModel.Tokenize((string)field.Value))
                                {
                                    stepDebugger.Step();
                                }
                            }
                        }
                    }
                }

                logger.LogInformation($"tokenized {stepDebugger.StepCount} in {stepDebugger.Time}.");
            }
        }
Пример #9
0
        /// <summary>
        /// Entry point: with "--build-graph" as the first argument it starts the interactive graph
        /// builder; otherwise it passes the first two arguments to <c>Similarity</c> and
        /// <c>CompareToBaseVector</c>.
        /// </summary>
        /// <param name="args">
        /// Either "--build-graph", or two values to compare. When the arguments are missing a usage
        /// message is written to standard error and the process exit code is set to 1.
        /// </param>
        static void Main(string[] args)
        {
            var buildGraph = args.Length > 0 && args[0] == "--build-graph";

            // Comparison mode reads args[0] and args[1]; validate up front rather than failing
            // with an IndexOutOfRangeException.
            if (!buildGraph && args.Length < 2)
            {
                System.Console.Error.WriteLine("usage: --build-graph | <first> <second>");
                System.Environment.ExitCode = 1;

                return;
            }

            var model = new BagOfCharsModel();

            if (buildGraph)
            {
                RunInteractiveGraphBuilder(model);
            }
            else
            {
                Similarity(args[0], args[1], model);
                CompareToBaseVector(args[0], args[1], model);
            }
        }
Пример #10
0
        /// <summary>
        /// Verifies that tokenizing a phrase containing an en dash and a space yields labels for
        /// each of the three substrings around those characters.
        /// </summary>
        public void Can_tokenize()
        {
            const string data = "Ferriman–Gallwey score"; // NOTE: string contains "En dash" character: https://unicode-table.com/en/#2013

            var bocModel    = new BagOfCharsModel();
            var tokenLabels = bocModel.Tokenize(data)
                .Select(token => token.Label.ToString())
                .ToList();

            // The three substrings on either side of the en dash (index 8) and the space (index 16).
            var expectedLabels = new[]
            {
                data.Substring(0, 8),
                data.Substring(9, 7),
                data.Substring(17, 5)
            };

            foreach (var expected in expectedLabels)
            {
                Assert.IsTrue(tokenLabels.Contains(expected));
            }
        }
Пример #11
0
        /// <summary>
        /// Writes a tree built from the test data into in-memory streams with a
        /// <c>ColumnWriter</c>, then verifies through a <c>ColumnReader</c> that every token of
        /// every word has a closest match whose score is not below <c>model.IdenticalAngle</c>.
        /// </summary>
        public void Can_traverse_streamed()
        {
            var bocModel = new BagOfCharsModel();
            var index    = bocModel.CreateTree(bocModel, _data);

            using (var indexStream = new MemoryStream())
            {
                using (var vectorStream = new MemoryStream())
                {
                    using (var pageStream = new MemoryStream())
                    {
                        // Both writers keep their streams open so the reader below can use them.
                        using (var columnWriter = new ColumnWriter(indexStream, keepStreamOpen: true))
                        {
                            var pageWriter = new PageIndexWriter(pageStream, keepStreamOpen: true);

                            columnWriter.CreatePage(index, vectorStream, pageWriter);
                        }

                        // Rewind the page stream before it is handed to the page index reader.
                        pageStream.Position = 0;

                        Assert.DoesNotThrow(() =>
                        {
                            var pageReader = new PageIndexReader(pageStream);
                            var readerLog  = _loggerFactory.CreateLogger <ColumnReader>();

                            using (var columnReader = new ColumnReader(pageReader, indexStream, vectorStream, _sessionFactory, readerLog))
                            {
                                foreach (var word in _data)
                                {
                                    foreach (var queryVector in bocModel.Tokenize(word))
                                    {
                                        var match = columnReader.ClosestMatch(queryVector, bocModel)
                                            ?? throw new Exception($"unable to find {word} in tree.");

                                        if (match.Score < bocModel.IdenticalAngle)
                                        {
                                            throw new Exception($"unable to score {word}.");
                                        }

                                        Debug.WriteLine($"{word} matched vector in disk with {match.Score * 100}% certainty.");
                                    }
                                }
                            }
                        });
                    }
                }
            }
        }
Пример #12
0
        /// <summary>
        /// Puts every test word into an index session under key 0, takes the resulting in-memory
        /// tree for that key and verifies that every token of every word has a closest match whose
        /// score is not below <c>model.IdenticalAngle</c>.
        /// </summary>
        public void Can_produce_traversable_in_memory_index()
        {
            var        bocModel = new BagOfCharsModel();
            VectorNode index;

            using (var indexer = new IndexSession <string>(bocModel, bocModel))
            {
                // The position of each word in the test data doubles as its document id.
                long docId = 0;

                foreach (var text in _data)
                {
                    indexer.Put(docId, 0, text);
                    docId++;
                }

                index = indexer.GetInMemoryIndex()[0];
            }

            Debug.WriteLine(PathFinder.Visualize(index));

            Assert.DoesNotThrow(() =>
            {
                foreach (var word in _data)
                {
                    foreach (var queryVector in bocModel.Tokenize(word))
                    {
                        var match = PathFinder.ClosestMatch(index, queryVector, bocModel)
                            ?? throw new Exception($"unable to find {word} in tree.");

                        if (match.Score < bocModel.IdenticalAngle)
                        {
                            throw new Exception($"unable to score {word}.");
                        }

                        Debug.WriteLine($"{word} matched with {match.Node.Vector.Label} with {match.Score * 100}% certainty.");
                    }
                }
            });
        }