/// <summary>
/// Runs a full read → transform → write pass: pulls the source from
/// <paramref name="reader"/>, feeds it to the single-argument <c>Transform</c>
/// overload and hands the result to <paramref name="writer"/>.
/// </summary>
/// <param name="reader">Supplies the input via <c>Read()</c>.</param>
/// <param name="writer">Receives the transformed output via <c>Write(...)</c>.</param>
public void Transform(ICorpusReader <TIn> reader, ICorpusWriter <TOut> writer)
        {
            // Evaluated inside-out: Read() first, then Transform(...), then Write(...).
            writer.Write(Transform(reader.Read()));
        }
Esempio n. 2
0
        /// <summary>
        /// Tallies how often each word occurs across every document of every block
        /// produced by <paramref name="reader"/>, after passing each word through
        /// <paramref name="wordTransformer"/>.
        /// </summary>
        /// <param name="reader">Corpus reader whose <c>Read()</c> yields blocks exposing <c>Documents</c>.</param>
        /// <param name="wordTransformer">Mapping applied to every word before it is counted.</param>
        /// <returns>
        /// An array of (word, count) pairs sorted by count, highest first.
        /// </returns>
        public static IList <(string Word, int Count)> CountWords(
            ICorpusReader <IEnumerable <string> > reader,
            Func <string, string> wordTransformer)
        {
            var tally = new Dictionary <string, int>();

            foreach (var corpusBlock in reader.Read())
            {
                foreach (var document in corpusBlock.Documents)
                {
                    var transformedWords = document.Data.Select(wordTransformer);

                    foreach (var token in transformedWords)
                    {
                        // First sighting counts as 1; otherwise bump the stored total.
                        tally[token] = tally.TryGetValue(token, out int seen) ? seen + 1 : 1;
                    }
                }
            }

            var ranked = tally
                .OrderByDescending(entry => entry.Value)
                .Select(entry => (Word: entry.Key, Count: entry.Value));

            return ranked.ToArray();
        }
		/// <summary>
		/// Builds a Lucene index named <paramref name="corpusName"/> under <c>IndexesStoragePath</c>
		/// from the documents raised by <paramref name="corpusReader"/>.
		/// </summary>
		/// <remarks>
		/// Returns immediately if <c>IndexingStatus</c> is already set (an indexing run is in progress).
		/// The call blocks until <c>corpusReader.Read()</c> returns and the index has been optimized.
		/// Progress is published through <c>IndexingStatus</c>, which is always reset to <c>null</c>
		/// when the method exits - including when an exception is thrown - so that a failed run
		/// does not prevent later runs. The index writer is likewise always closed.
		/// Exceptions from the reader or from Lucene propagate to the caller.
		/// </remarks>
		/// <param name="corpusReader">Reader that raises <c>OnDocument</c> / <c>OnProgress</c> while <c>Read()</c> runs.</param>
		/// <param name="corpusName">Name of the index; used as the sub-directory name and reported in the status.</param>
		public void BeginIndexing(ICorpusReader corpusReader, string corpusName)
		{
			if (IndexingStatus != null)
				return; // there's already an indexing process running

			// Init
			IndexingStatus = new IndexingProgressInfo
			{
				IndexName = corpusName,
				Percentage = 0,
				Status = "Launching",
				IsRunning = true,
			};

			try
			{
				// Create a morphologic analyzer to be used for indexing by default
				var morphIndexingAnalyzer = new HtmlMorphAnalyzer(MorphAnalyzer);
				morphIndexingAnalyzer.alwaysSaveMarkedOriginal = true; // to allow for non-morphologic searches too
				var indexingAnalyzer = new PerFieldAnalyzerWrapper(morphIndexingAnalyzer);

				// Allow for one field to be indexed using StandardAnalyzer, for the purpose of comparison
				indexingAnalyzer.AddAnalyzer("TitleDefault", new HtmlStandardAnalyzer());
				indexingAnalyzer.AddAnalyzer("ContentDefault", new HtmlStandardAnalyzer());

				// Create the indexer (third argument 'true' = create a new index at this path)
				var indexPath = Path.Combine(IndexesStoragePath, corpusName);
				var writer = new IndexWriter(
					FSDirectory.Open(new DirectoryInfo(indexPath)),
					indexingAnalyzer,
					true,
					IndexWriter.MaxFieldLength.UNLIMITED);

				try
				{
					writer.SetUseCompoundFile(false);

					// This will be called whenever a document is read by the provided ICorpusReader
					// NOTE(review): the handlers below are never unsubscribed; if the same reader instance
					// is reused after this method returns they would fire against a closed writer - confirm
					// readers are single-use.
					corpusReader.OnDocument += corpusDoc =>
					{
						var content = corpusDoc.AsHtml();

						// skip blank documents, they are worthless to us (even though they have a title we could index)
						if (string.IsNullOrEmpty(content))
							return;

						// Create a new index document
						var doc = new Document();
						doc.Add(new Field("Id", corpusDoc.Id, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));

						// Add title field
						var titleField = new Field("Title", corpusDoc.Title, Field.Store.COMPRESS, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
						titleField.SetBoost(3.0f);
						doc.Add(titleField);

						titleField = new Field("TitleDefault", corpusDoc.Title, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
						titleField.SetBoost(3.0f);
						doc.Add(titleField);

						// Add two versions of content - one will be analyzed by HebMorph and the other by Lucene's StandardAnalyzer
						doc.Add(new Field("Content", content, Field.Store.COMPRESS, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
						doc.Add(new Field("ContentDefault", content, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));

						writer.AddDocument(doc);
					};

					// Progress reporting
					corpusReader.OnProgress += (percentage, status, isRunning) =>
					{
						IndexingStatus = new IndexingProgressInfo
						{
							IndexName = corpusName,
							Percentage = percentage,
							Status = status,
							IsRunning = isRunning,
						};
					};

					// Execute corpus reading, which will trigger indexing for each document found
					corpusReader.Read();

					// Wrap up, optimize and cleanup
					IndexingStatus = new IndexingProgressInfo
					{
						IndexName = corpusName,
						Percentage = 100,
						Status = "Optimizing index",
						IsRunning = true,
					};

					writer.SetUseCompoundFile(true);
					writer.Optimize();
				}
				finally
				{
					// Always close the writer, even when reading or indexing failed, so it is not
					// left open after an error.
					writer.Close();
				}

				UpdateIndexesList();
			}
			finally
			{
				// Always clear the status; otherwise a failed run would leave it non-null and the
				// guard at the top would reject every subsequent indexing request.
				IndexingStatus = null;
			}
		}