/// <summary>
/// Reads the corpus at <c>wikiPath</c>, runs it through a <c>WikitextProcessor</c>,
/// indexes the result into a <c>DictionaryIndex</c> and writes that index to <c>indexPath</c>.
/// </summary>
static void ProcessAndIndexWikipedia()
{
    var corpusReader = new CorpusZipReader<IList<char>>(wikiPath, charDataSerializer);
    var dictionaryIndex = new DictionaryIndex<int>(rareWordThreshold: 5);
    var builder = new IndexBuilder<int, IEnumerable<int>>(dictionaryIndex);
    var wikitext = new WikitextProcessor();

    // Feed the transformed corpus into the builder, which populates dictionaryIndex.
    var transformedCorpus = wikitext.Transform(corpusReader.Read());
    builder.IndexCorpus(transformedCorpus);

    Console.WriteLine("Serializing index...");

    // The output stream is closed as soon as serialization finishes (or throws).
    using (var output = File.Create(indexPath))
    {
        dictionaryIndex.Serialize(output);
    }
}
/// <summary>
/// Prepares the output directory at <c>externalIndexPath</c>, indexes the corpus at
/// <c>wikiPath</c> into a <c>BlockedExternalBuildableIndex</c>, builds the final index
/// from it and serializes that index to <c>externalIndexPath</c>.
/// </summary>
static void BuildExternalIndex()
{
    PrepareOutputDirectory(externalIndexPath);

    var corpusReader = new CorpusZipReader<IList<char>>(wikiPath, charDataSerializer);
    var createBlockIndex =
        DictonaryBasedExternalBuildableIndex<int>.GetCreateMethodWithVarintPostingsLists();

    using (var blockedIndex = new BlockedExternalBuildableIndex<int>(
        createBlockIndex,
        externalIndexPath,
        BlockSize))
    {
        var builder = new IndexBuilder<int, IEnumerable<int>>(blockedIndex);
        var wikitext = new WikitextProcessor();

        // Feed the transformed corpus into the builder, which populates blockedIndex.
        var transformedCorpus = wikitext.Transform(corpusReader.Read());
        builder.IndexCorpus(transformedCorpus);

        // The built index is disposed before the buildable index it came from.
        using (var builtIndex = blockedIndex.Build())
        {
            var indexSerializer = new ExternalIndexSerializer<int>();
            indexSerializer.Serialize(externalIndexPath, builtIndex);
        }
    }
}