/// <summary>
/// Reads the serialized <c>YngdiengIndex</c> from the file named by the
/// "IndexFile" configuration key and stores it in the <c>index</c> field.
/// </summary>
/// <param name="config">Configuration that supplies the "IndexFile" path.</param>
/// <param name="logger">Logger used to report which file is loaded and how many documents it held.</param>
/// <exception cref="System.ArgumentNullException">
/// The "IndexFile" configuration key has no value.
/// </exception>
public IndexHolder(IConfiguration config, ILogger<IndexHolder> logger)
{
    _logger = logger;

    var indexFilePath = config["IndexFile"];
    if (indexFilePath == null)
    {
        // Without this guard Path.GetFullPath(null) raises the same exception type,
        // but its message talks about "path" and gives no hint that the
        // configuration key is what is missing.
        throw new System.ArgumentNullException(
            "IndexFile", "The \"IndexFile\" configuration value is not set.");
    }

    _logger.LogInformation($"Loading index from {Path.GetFullPath(indexFilePath)}");

    // Keep the stream scoped to the parse so the file handle is released
    // before the document count is logged.
    using (var input = File.OpenRead(indexFilePath))
    {
        index = YngdiengIndex.Parser.ParseFrom(input);
    }

    _logger.LogInformation($"{index.Documents.Count} documents loaded.");
}
/// <summary>
/// Writes one Lucene document per entry of <c>index.YngdiengDocuments</c> into a
/// Lucene index located in the "lucene" sub-folder of <c>outputFolder</c>.
/// </summary>
/// <param name="index">Index whose Yngdieng documents are written to Lucene.</param>
private void CreateLuceneIndex(YngdiengIndex index)
{
    var lucenePath = Path.GetFullPath(Path.Join(outputFolder, "lucene"));
    Console.WriteLine($"Writing to {lucenePath}");

    using (var luceneDirectory = FSDirectory.Open(new DirectoryInfo(lucenePath)))
    {
        var writerConfig = new IndexWriterConfig(
            LuceneUtils.AppLuceneVersion, LuceneUtils.GetAnalyzer());

        using (var indexWriter = new IndexWriter(luceneDirectory, writerConfig))
        {
            // Drop whatever the index currently contains before adding documents.
            indexWriter.DeleteAll();

            foreach (var yDoc in index.YngdiengDocuments)
            {
                var luceneDoc = new Lucene.Net.Documents.Document();

                // Stored fields: a 1/0 flag for documents without sources, and the doc id.
                var isSourceless = yDoc.Sources.Count == 0;
                luceneDoc.Add(new Int32Field(
                    LuceneUtils.Fields.IsSourceless, isSourceless ? 1 : 0, Field.Store.YES));
                luceneDoc.Add(new StringField(
                    LuceneUtils.Fields.DocId, yDoc.DocId, Field.Store.YES));

                // Unstored fields built from the sandhi pronunciation and canonical Hanzi.
                luceneDoc.Add(new TextField(
                    LuceneUtils.Fields.Yngping, yDoc.YngpingSandhi, Field.Store.NO));
                luceneDoc.Add(new TextField(
                    LuceneUtils.Fields.Hanzi, yDoc.HanziCanonical.Regular, Field.Store.NO));
                luceneDoc.Add(new StringField(
                    LuceneUtils.Fields.YngpingSandhiTonePattern,
                    GetTonePattern(yDoc.YngpingSandhi),
                    Field.Store.NO));

                // Each Mandarin word is indexed as given and in its simplified form.
                foreach (var mandarinWord in yDoc.IndexingExtension.MandarinWords)
                {
                    luceneDoc.Add(new TextField(
                        LuceneUtils.Fields.Mandarin, mandarinWord, Field.Store.NO));
                    luceneDoc.Add(new TextField(
                        LuceneUtils.Fields.Mandarin,
                        openCcClient.SimplifyMandarinText(mandarinWord),
                        Field.Store.NO));
                }

                foreach (var alternative in yDoc.HanziAlternatives)
                {
                    luceneDoc.Add(new TextField(
                        LuceneUtils.Fields.HanziAlternative, alternative.Regular, Field.Store.NO));
                }

                // Simplify Hanzi for search
                var simplifiedHanzi = openCcClient.SimplifyHukziuText(yDoc.HanziCanonical.Regular);
                luceneDoc.Add(new TextField(
                    LuceneUtils.Fields.Hanzi, simplifiedHanzi, Field.Store.NO));

                foreach (var explanation in yDoc.IndexingExtension.ExplanationText)
                {
                    luceneDoc.Add(new TextField(
                        LuceneUtils.Fields.Explanation, explanation, Field.Store.NO));
                }

                // TODO: encapsulate this in a YngpingAnalyzer
                foreach (var permutation in yDoc.IndexingExtension.YngpingPermutations)
                {
                    luceneDoc.Add(new TextField(
                        LuceneUtils.Fields.Yngping, permutation, Field.Store.NO));
                }

                indexWriter.AddDocument(luceneDoc);
            }

            // Flush without triggering a merge and without applying deletes.
            indexWriter.Flush(triggerMerge: false, applyAllDeletes: false);
        }
    }
}
/// <summary>
/// Entry point. Loads the CikLin, DFD and Feng sources from the input folder,
/// aggregates the CikLin and DFD documents, and writes the resulting index to
/// "yngdieng_index.bin" in the output folder.
/// </summary>
/// <param name="args">Exactly two values: the input folder, then the output folder.</param>
/// <returns>0 on success; -1 (after calling <c>PrintHelp</c>) when the argument count is not 2.</returns>
static int Main(string[] args)
{
    if (args.Length != 2)
    {
        PrintHelp();
        return -1;
    }

    var inputDir = args[0];
    var outputDir = args[1];
    Console.WriteLine($"Input: {Path.GetFullPath(inputDir)}");
    Console.WriteLine($"Output: {Path.GetFullPath(outputDir)}");

    var index = new YngdiengIndex();
    var variantsUtil = new HanziVariantsUtil(inputDir);
    var docAggregator = new DocumentAggregator();

    // Run the loaders in a fixed order: CikLin, DFD, then Feng.
    var ciklinCsv = Path.Combine(inputDir, "ciklin.csv");
    var ciklinDocs = new CreateCikLinDocumentsAction(ciklinCsv, outputDir, variantsUtil).Run();

    var dfdCsv = Path.Combine(inputDir, "DFDCharacters.csv");
    var dfdDocs = new CreateDFDDocumentsAction(dfdCsv, outputDir, variantsUtil).Run();

    var fengTxt = Path.Combine(inputDir, "feng.txt");
    var fengDocs = new CreateFengDocumentsAction(fengTxt, outputDir).Run();

    index.Documents.Add(ciklinDocs);
    index.Documents.Add(dfdDocs);
    index.FengDocuments.Add(fengDocs);

    // CikLin documents are fed to the aggregator before DFD documents.
    foreach (var ciklinDoc in ciklinDocs)
    {
        docAggregator.Add(ciklinDoc);
    }
    foreach (var dfdDoc in dfdDocs)
    {
        docAggregator.Add(dfdDoc);
    }
    index.AggregatedDocument.AddRange(docAggregator.GetAggregatedDocuments());

    var indexPath = Path.Combine(outputDir, "yngdieng_index.bin");
    using (var indexStream = File.Create(indexPath))
    {
        index.WriteTo(indexStream);
    }

    return 0;
}
/// <summary>
/// Replaces the index held by this instance with <paramref name="index"/>.
/// </summary>
/// <param name="index">The index to hold from now on.</param>
public void StoreIndex(YngdiengIndex index) => this.index = index;
/// <summary>
/// Builds the full index from the files in <c>inputFolder</c> and writes three
/// outputs to <c>outputFolder</c>: "index_debug.json", "yngdieng_index.bin" and
/// the Lucene index produced by <c>CreateLuceneIndex</c>.
/// </summary>
/// <returns>Always 0.</returns>
public int Run()
{
    Console.WriteLine($"Input: {Path.GetFullPath(inputFolder)}");
    Console.WriteLine($"Output: {Path.GetFullPath(outputFolder)}");

    var index = new YngdiengIndex();
    var variantsUtil = new HanziVariantsUtil(inputFolder);
    var historicalAggregator = new HistoricalDocAggregator();

    // The loaders below run in a fixed order, each announced on the console first.
    Console.WriteLine("Loading zingzeu_words...");
    var zingzeuWordsPath = Path.Combine(inputFolder, "zingzeu_words.txt");
    var zingzeuWords = new ZingzeuWordsLoader(zingzeuWordsPath).Run();

    Console.WriteLine("Loading CikLinBekIn...");
    var ciklingLoader = new CikLingLoader(
        Path.Combine(inputFolder, "CikLinBekIn.csv"),
        Path.Combine(inputFolder, "cikling.csv"),
        Path.Combine(inputFolder, "cikling_mapping.csv"),
        outputFolder,
        variantsUtil);
    var ciklinDocs = ciklingLoader.Run();

    Console.WriteLine("Loading DFD...");
    var dfdLoader = new DFDLoader(
        Path.Combine(inputFolder, "DFDCharacters.csv"),
        outputFolder,
        variantsUtil);
    var dfdDocs = dfdLoader.Run();

    Console.WriteLine("Loading Feng...");
    var fengLoader = new FengLoader(
        Path.Combine(inputFolder, "feng.txt"),
        Path.Combine(inputFolder, "feng_zeu_mapping.txt"),
        outputFolder,
        openCcClient);
    var fengDocs = fengLoader.Run();

    Console.WriteLine("Loading Contrib...");
    var contribPath = Path.Combine(inputFolder, "contrib.tsv");
    var contribDocs = new ContribLoader(contribPath).Run();

    Console.WriteLine("Loading Redirects...");
    var redirectsPath = Path.Combine(inputFolder, "redirects.txt");
    var docIdRedirects = new RedirectsLoader(redirectsPath).Run();

    index.Version = versionTag;
    index.FengDocuments.Add(fengDocs);

    // CikLin documents are fed to the aggregator before DFD documents.
    foreach (var ciklinDoc in ciklinDocs)
    {
        historicalAggregator.Add(ciklinDoc);
    }
    foreach (var dfdDoc in dfdDocs)
    {
        historicalAggregator.Add(dfdDoc);
    }
    index.HistoricalDocuments.AddRange(historicalAggregator.GetHistoricalDocuments());

    // Combine reads index.HistoricalDocuments, so it has to run after the AddRange above.
    var combinedDocs = YngdiengDocumentUtil.Combine(
        zingzeuWords, index.HistoricalDocuments, fengDocs, contribDocs);
    index.YngdiengDocuments.AddRange(combinedDocs);
    index.DocIdRedirections.Add(docIdRedirects);

    // Text dump of the index (its ToString() output) for debugging.
    var debugJsonPath = Path.Combine(outputFolder, "index_debug.json");
    File.WriteAllText(debugJsonPath, index.ToString());

    var binaryIndexPath = Path.Combine(outputFolder, "yngdieng_index.bin");
    using (var binaryIndexStream = File.Create(binaryIndexPath))
    {
        index.WriteTo(binaryIndexStream);
    }

    CreateLuceneIndex(index);
    return 0;
}