/// <summary>
/// Reads the corpus archive at <c>wikiPath</c>, runs it through a
/// <c>WikitextProcessor</c>, indexes the result into a <c>DictionaryIndex</c>
/// and writes the serialized index to <c>indexPath</c>.
/// </summary>
static void ProcessAndIndexWikipedia()
{
    var corpusReader = new CorpusZipReader<IList<char>>(wikiPath, charDataSerializer);
    var dictionaryIndex = new DictionaryIndex<int>(rareWordThreshold: 5);
    var builder = new IndexBuilder<int, IEnumerable<int>>(dictionaryIndex);
    var wikitextProcessor = new WikitextProcessor();

    // Pipeline: raw corpus -> processed corpus -> index.
    var rawCorpus = corpusReader.Read();
    var processedCorpus = wikitextProcessor.Transform(rawCorpus);
    builder.IndexCorpus(processedCorpus);

    Console.WriteLine("Serializing index...");

    // File.Create is given indexPath; the stream is disposed once serialization returns.
    using (var output = File.Create(indexPath))
    {
        dictionaryIndex.Serialize(output);
    }
}
public void ProcessorTest()
{
    // Input 1: wikitext containing a nested template, a header and an HTML link.
    string markupText = @"{{template and {{subtemplate}}}} == Header == <a href=""http://www.link.to/"">link</a>";

    // Input 2: plain text with mixed case and runs of punctuation.
    string punctuatedText = "Some long text;!!! with punctuations; And!!!!?";

    // Expected document data, one entry per input document, in order.
    var expectedData = new[]
    {
        CreateExpectedResult("header link"),
        CreateExpectedResult("some long text with punctuations and"),
    };

    var inputCorpus = CreateCorpus(markupText, punctuatedText);

    // Only the first element of the transformed corpus is inspected.
    var firstBlock = processor.Transform(inputCorpus).First();
    var actualData = firstBlock.Documents
        .Select(document => document.Data)
        .ToArray();

    Assert.Equal(expectedData, actualData);
}
/// <summary>
/// Builds an external index from the corpus archive at <c>wikiPath</c> and
/// serializes it under <c>externalIndexPath</c>.
/// </summary>
/// <remarks>
/// The corpus is passed through a <c>WikitextProcessor</c> and fed into a
/// <c>BlockedExternalBuildableIndex</c> configured with <c>BlockSize</c>; the
/// built index is then handed to an <c>ExternalIndexSerializer</c>.
/// </remarks>
static void BuildExternalIndex()
{
    PrepareOutputDirectory(externalIndexPath);

    var corpusReader = new CorpusZipReader<IList<char>>(wikiPath, charDataSerializer);

    // Factory passed to the blocked index (obtained from the varint-postings-list helper).
    var createMethod = DictonaryBasedExternalBuildableIndex<int>.GetCreateMethodWithVarintPostingsLists();

    using (var blockedIndex = new BlockedExternalBuildableIndex<int>(createMethod, externalIndexPath, BlockSize))
    {
        var builder = new IndexBuilder<int, IEnumerable<int>>(blockedIndex);
        var wikitextProcessor = new WikitextProcessor();

        // Pipeline: raw corpus -> processed corpus -> buildable index.
        var rawCorpus = corpusReader.Read();
        var processedCorpus = wikitextProcessor.Transform(rawCorpus);
        builder.IndexCorpus(processedCorpus);

        // The built index is disposed before the buildable index it came from.
        using (var builtIndex = blockedIndex.Build())
        {
            var indexSerializer = new ExternalIndexSerializer<int>();
            indexSerializer.Serialize(externalIndexPath, builtIndex);
        }
    }
}