/// <summary>
/// Runs the full read → transform → write pipeline: pulls the source corpus
/// from <paramref name="reader"/>, converts it via the Transform overload,
/// and emits the result through <paramref name="writer"/>.
/// </summary>
/// <param name="reader">Source corpus to read from.</param>
/// <param name="writer">Destination the transformed corpus is written to.</param>
public void Transform(ICorpusReader<TIn> reader, ICorpusWriter<TOut> writer)
{
    var transformed = Transform(reader.Read());
    writer.Write(transformed);
}
/// <summary>
/// Counts word occurrences across every document in the corpus, applying
/// <paramref name="wordTransformer"/> (e.g. normalization or stemming) to each
/// raw word before it is counted.
/// </summary>
/// <param name="reader">Corpus source; each block it reads exposes documents whose Data is a word sequence.</param>
/// <param name="wordTransformer">Projection applied to every raw word before counting.</param>
/// <returns>(Word, Count) tuples ordered by descending count.</returns>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="reader"/> or <paramref name="wordTransformer"/> is null.
/// </exception>
public static IList<(string Word, int Count)> CountWords(
    ICorpusReader<IEnumerable<string>> reader,
    Func<string, string> wordTransformer)
{
    // Fail fast at the boundary instead of surfacing a NullReferenceException
    // from deep inside the enumeration loops.
    if (reader == null) throw new ArgumentNullException(nameof(reader));
    if (wordTransformer == null) throw new ArgumentNullException(nameof(wordTransformer));

    var counter = new Dictionary<string, int>();
    foreach (var block in reader.Read())
    {
        foreach (var doc in block.Documents)
        {
            foreach (var word in doc.Data.Select(wordTransformer))
            {
                // TryGetValue leaves count == 0 for unseen words, so this single
                // lookup handles both first sightings and repeats (no ContainsKey
                // + indexer double lookup).
                counter.TryGetValue(word, out int count);
                counter[word] = count + 1;
            }
        }
    }

    return counter
        .OrderByDescending(p => p.Value)
        .Select(p => (Word: p.Key, Count: p.Value))
        .ToArray();
}
/// <summary>
/// Indexes an entire corpus into a new Lucene index named <paramref name="corpusName"/>,
/// analyzing content both morphologically (HebMorph) and with a standard analyzer
/// (in the *Default fields) so the two can be compared at search time.
/// Progress is published through <see cref="IndexingStatus"/>; only one indexing
/// run may be active at a time.
/// </summary>
/// <param name="corpusReader">Reader that raises OnDocument/OnProgress events while Read() walks the corpus.</param>
/// <param name="corpusName">Name of the index; also used as the on-disk folder name under IndexesStoragePath.</param>
public void BeginIndexing(ICorpusReader corpusReader, string corpusName)
{
    if (IndexingStatus != null)
        return; // there's already an indexing process running

    // Init
    IndexingStatus = new IndexingProgressInfo
    {
        IndexName = corpusName,
        Percentage = 0,
        Status = "Launching",
        IsRunning = true,
    };

    // Create a morphologic analyzer to be used for indexing by default
    var morphIndexingAnalyzer = new HtmlMorphAnalyzer(MorphAnalyzer);
    morphIndexingAnalyzer.alwaysSaveMarkedOriginal = true; // to allow for non-morphologic searches too
    var indexingAnalyzer = new PerFieldAnalyzerWrapper(morphIndexingAnalyzer);

    // Allow for one field to be indexed using StandardAnalyzer, for the purpose of comparison
    indexingAnalyzer.AddAnalyzer("TitleDefault", new HtmlStandardAnalyzer());
    indexingAnalyzer.AddAnalyzer("ContentDefault", new HtmlStandardAnalyzer());

    // Create the indexer (true = create/overwrite the index at this path)
    var indexPath = Path.Combine(IndexesStoragePath, corpusName);
    var writer = new IndexWriter(FSDirectory.Open(new DirectoryInfo(indexPath)), indexingAnalyzer,
        true, IndexWriter.MaxFieldLength.UNLIMITED);
    writer.SetUseCompoundFile(false); // faster indexing; switched back on before optimize below

    // BUGFIX: everything past writer creation is wrapped in try/finally. Previously,
    // an exception thrown while reading/indexing leaked the IndexWriter (open files +
    // write.lock) and left IndexingStatus non-null, which made the guard at the top
    // of this method block every future indexing attempt.
    try
    {
        // This will be called whenever a document is read by the provided ICorpusReader
        corpusReader.OnDocument += corpusDoc =>
        {
            var content = corpusDoc.AsHtml();

            // skip blank documents, they are worthless to us (even though they have a title we could index)
            if (string.IsNullOrEmpty(content))
                return;

            // Create a new index document
            var doc = new Document();
            doc.Add(new Field("Id", corpusDoc.Id, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));

            // Add title field twice: once morph-analyzed, once standard-analyzed, both boosted
            var titleField = new Field("Title", corpusDoc.Title, Field.Store.COMPRESS,
                Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
            titleField.SetBoost(3.0f);
            doc.Add(titleField);

            titleField = new Field("TitleDefault", corpusDoc.Title, Field.Store.NO,
                Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
            titleField.SetBoost(3.0f);
            doc.Add(titleField);

            // Add two versions of content - one will be analyzed by HebMorph and
            // the other by Lucene's StandardAnalyzer
            doc.Add(new Field("Content", content, Field.Store.COMPRESS,
                Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
            doc.Add(new Field("ContentDefault", content, Field.Store.NO,
                Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));

            writer.AddDocument(doc);
        };

        // Progress reporting
        corpusReader.OnProgress += (percentage, status, isRunning) =>
        {
            IndexingStatus = new IndexingProgressInfo
            {
                IndexName = corpusName,
                Percentage = percentage,
                Status = status,
                IsRunning = isRunning,
            };
        };

        // Execute corpus reading, which will trigger indexing for each document found
        corpusReader.Read();

        // Wrap up, optimize and cleanup
        IndexingStatus = new IndexingProgressInfo
        {
            IndexName = corpusName,
            Percentage = 100,
            Status = "Optimizing index",
            IsRunning = true,
        };

        writer.SetUseCompoundFile(true);
        writer.Optimize();
    }
    finally
    {
        // Always release the index (flushes buffered docs, removes write.lock)
        // and clear the running flag so a subsequent indexing run can start.
        writer.Close();
        UpdateIndexesList();
        IndexingStatus = null;
    }
}