Ejemplo n.º 1
0
        /// <summary>
        /// Reads the corpus at <c>wikiPath</c>, runs it through a
        /// <c>WikitextProcessor</c>, indexes the result into a
        /// <c>DictionaryIndex</c> (created with <c>rareWordThreshold: 5</c>),
        /// and serializes that index to a file created at <c>indexPath</c>.
        /// </summary>
        static void ProcessAndIndexWikipedia()
        {
            var corpusReader = new CorpusZipReader<IList<char>>(wikiPath, charDataSerializer);
            var wordIndex = new DictionaryIndex<int>(rareWordThreshold: 5);
            var builder = new IndexBuilder<int, IEnumerable<int>>(wordIndex);
            var wikitext = new WikitextProcessor();

            // Read -> transform -> index, spelled out one stage per line.
            var rawCorpus = corpusReader.Read();
            var processedCorpus = wikitext.Transform(rawCorpus);
            builder.IndexCorpus(processedCorpus);

            Console.WriteLine("Serializing index...");

            // The stream is disposed as soon as serialization finishes.
            using (var output = File.Create(indexPath))
            {
                wordIndex.Serialize(output);
            }
        }
        /// <summary>
        /// Runs the processor over a two-document corpus (one with template,
        /// header and HTML link markup; one with repeated punctuation) and
        /// asserts that the data of the documents in the first transformed
        /// block equals the expected results built from
        /// "header link" and "some long text with punctuations and".
        /// </summary>
        public void ProcessorTest()
        {
            // Verbatim literal: the line breaks and leading spaces below are part of the input.
            string markupText = @"{{template and {{subtemplate}}}}
                            == Header ==
                            <a href=""http://www.link.to/"">link</a>";
            string punctuatedText = "Some long text;!!! with  punctuations; And!!!!?";

            var expected = new[]
            {
                CreateExpectedResult("header link"),
                CreateExpectedResult("some long text with punctuations and"),
            };

            var inputCorpus = CreateCorpus(markupText, punctuatedText);

            // Only the first block produced by the transform is inspected.
            var firstBlock = processor.Transform(inputCorpus).First();
            var actual = firstBlock.Documents
                .Select(document => document.Data)
                .ToArray();

            Assert.Equal(expected, actual);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Prepares the output directory at <c>externalIndexPath</c>, indexes the
        /// processed corpus from <c>wikiPath</c> into a
        /// <c>BlockedExternalBuildableIndex</c>, builds the final index from it,
        /// and serializes that index to <c>externalIndexPath</c> with an
        /// <c>ExternalIndexSerializer</c>.
        /// </summary>
        static void BuildExternalIndex()
        {
            PrepareOutputDirectory(externalIndexPath);

            var corpusReader = new CorpusZipReader<IList<char>>(wikiPath, charDataSerializer);

            // Nested scopes: the built index is disposed first, then the buildable index.
            using (var blockedIndex = new BlockedExternalBuildableIndex<int>(
                       DictonaryBasedExternalBuildableIndex<int>.GetCreateMethodWithVarintPostingsLists(),
                       externalIndexPath,
                       BlockSize))
            {
                var builder = new IndexBuilder<int, IEnumerable<int>>(blockedIndex);
                var wikitext = new WikitextProcessor();

                // Read -> transform -> index, spelled out one stage per line.
                var rawCorpus = corpusReader.Read();
                var processedCorpus = wikitext.Transform(rawCorpus);
                builder.IndexCorpus(processedCorpus);

                using (var builtIndex = blockedIndex.Build())
                {
                    var indexSerializer = new ExternalIndexSerializer<int>();
                    indexSerializer.Serialize(externalIndexPath, builtIndex);
                }
            }
        }