/// <summary>
/// Runs morphological analysis over every sentence of the file(s) selected by
/// <paramref name="path"/> and stores the resulting tokens on each sentence context.
/// </summary>
/// <param name="path">File path or search pattern passed to the <c>FileProcessor</c>.</param>
/// <param name="sequential">
/// When <c>true</c> a single task is used; otherwise <see cref="Environment.ProcessorCount"/> tasks.
/// </param>
public void Morphology(string path, [DefaultValue(false)] bool sequential)
{
    var enginePool = new GrammarEnginePool(ConfigurationManager.AppSettings["GrammarPath"]);
    const int batchSize = 16;
    _log.Info($"Got {Environment.ProcessorCount} threads. Batch size for each thread: {batchSize}.");

    var processor = new FileProcessor<MorphologyFileContext, Sentence>(
        path,
        enginePool,
        batchSize,
        fileContextFactory: file => new MorphologyFileContext(file),
        sentenceContextFactory: fileContext => fileContext.CreateSentence(),
        action: (job, context) =>
        {
            // An empty sentence has nothing to analyze: mark it ready right away and flush.
            if (string.IsNullOrEmpty(job.Sentence))
            {
                job.Context.IsReady = true;
                context.Flush();
                return;
            }

            var engine = enginePool.GetInstance();
            try
            {
                using (var lemmatized = engine.AnalyzeMorphology(
                    job.Sentence,
                    Languages.RUSSIAN_LANGUAGE,
                    MorphologyFlags.SOL_GREN_MODEL | MorphologyFlags.SOL_GREN_MODEL_ONLY))
                {
                    if (lemmatized.Nodes.Length == 0)
                    {
                        // NOTE(review): as before, this path neither sets IsReady nor flushes;
                        // confirm that leaving the sentence "not ready" is intended.
                        return;
                    }

                    var tokens = new Token[lemmatized.Nodes.Length];
                    for (int i = 0; i < lemmatized.Nodes.Length; i++)
                    {
                        var node = lemmatized.Nodes[i];
                        tokens[i] = new Token(node.SourceWord, node.Entry.Id);
                    }

                    job.Context.Tokens = tokens;
                    job.Context.IsReady = true;
                }
            }
            finally
            {
                // Always hand the engine back to the pool: the early return above and any
                // exception thrown by the analysis previously skipped this call.
                enginePool.ReturnInstance(engine);
            }

            context.Flush();
        },
        fileFinalizer: (file, context) =>
        {
            context.FinalizeFile();
        },
        numTasks: sequential ? 1 : Environment.ProcessorCount);

    processor.Process();
}
/// <summary>
/// Convenience overload that forwards every argument to the full constructor and
/// uses <see cref="Environment.ProcessorCount"/> as the number of tasks.
/// </summary>
/// <param name="path">File path or search pattern to process.</param>
/// <param name="enginePool">Pool of grammar engines stored by the processor.</param>
/// <param name="batchSize">Batch size stored by the processor.</param>
/// <param name="fileContextFactory">Creates a file context from a file path.</param>
/// <param name="sentenceContextFactory">Creates a sentence context from a file context.</param>
/// <param name="action">Callback receiving a sentence job and the file context.</param>
/// <param name="fileFinalizer">Callback receiving a file path and its file context.</param>
public FileProcessor(
    string path,
    GrammarEnginePool enginePool,
    int batchSize,
    Func<string, TFileContext> fileContextFactory,
    Func<TFileContext, TSentenceContext> sentenceContextFactory,
    Action<SentenceJob<TSentenceContext>, TFileContext> action,
    Action<string, TFileContext> fileFinalizer)
    : this(
        path: path,
        enginePool: enginePool,
        batchSize: batchSize,
        fileContextFactory: fileContextFactory,
        sentenceContextFactory: sentenceContextFactory,
        action: action,
        fileFinalizer: fileFinalizer,
        numTasks: Environment.ProcessorCount)
{
}
/// <summary>
/// Stores the processing callbacks and resolves <paramref name="path"/> into the list of
/// files to process.
/// </summary>
/// <remarks>
/// <paramref name="path"/> is resolved in this order: an existing file; an existing directory
/// (all of its files); otherwise a directory part plus a search pattern such as
/// <c>C:\data\*.txt</c>. A bare pattern with no directory part is resolved against the
/// current working directory.
/// </remarks>
/// <param name="path">File, directory, or directory plus search pattern.</param>
/// <param name="enginePool">Pool of grammar engines stored by the processor.</param>
/// <param name="batchSize">Batch size stored by the processor.</param>
/// <param name="fileContextFactory">Creates a file context from a file path.</param>
/// <param name="sentenceContextFactory">Creates a sentence context from a file context.</param>
/// <param name="action">Callback receiving a sentence job and the file context.</param>
/// <param name="fileFinalizer">Callback receiving a file path and its file context.</param>
/// <param name="numTasks">Number of tasks stored by the processor.</param>
/// <exception cref="InvalidOperationException">
/// <paramref name="path"/> is neither an existing file nor resolvable to an existing directory.
/// </exception>
public FileProcessor(
    string path,
    GrammarEnginePool enginePool,
    int batchSize,
    Func<string, TFileContext> fileContextFactory,
    Func<TFileContext, TSentenceContext> sentenceContextFactory,
    Action<SentenceJob<TSentenceContext>, TFileContext> action,
    Action<string, TFileContext> fileFinalizer,
    int numTasks)
{
    _enginePool = enginePool;
    _batchSize = batchSize;
    _action = action;
    _numTasks = numTasks;
    _fileFinalizer = fileFinalizer;
    _fileContextFactory = fileContextFactory;
    _sentenceContextFactory = sentenceContextFactory;

    _log.Info($"Path to process is: {path}");

    if (File.Exists(path))
    {
        _log.Info("This is a single file.");
        _files = new[] { path };
        return;
    }

    string dir;
    string pattern;
    if (Directory.Exists(path))
    {
        // A directory given without a trailing separator used to be split into
        // "parent directory" + "directory name as pattern", which matched no files.
        dir = path;
        pattern = "*";
    }
    else
    {
        dir = Path.GetDirectoryName(path);
        pattern = Path.GetFileName(path);

        // A bare pattern such as "*.txt" has an empty directory component;
        // interpret it relative to the current working directory instead of rejecting it.
        if (dir == string.Empty)
        {
            dir = Directory.GetCurrentDirectory();
        }
    }

    if (!Directory.Exists(dir))
    {
        throw new InvalidOperationException("Specified path is not a file or directory!");
    }

    _files = Directory.GetFiles(dir, string.IsNullOrEmpty(pattern) ? "*" : pattern);
    _log.Info($"Discovered {_files.Length} files.");
}
/// <summary>
/// Counts word occurrences in the file(s) selected by <paramref name="path"/> and, for each
/// file, writes two reports into a <c>frequency</c> sub-directory next to it:
/// <c>&lt;name&gt;_known&lt;ext&gt;</c> and <c>&lt;name&gt;_unknown&lt;ext&gt;</c>.
/// </summary>
/// <remarks>
/// A token is "unknown" when the engine finds no word forms for it. Known tokens whose first
/// entry is punctuation, a number or a numeral word are not counted. Each report line has the
/// form <c>word;count</c>, ordered by descending count.
/// </remarks>
/// <param name="path">File path or search pattern passed to the <c>FileProcessor</c>.</param>
public void WordFrequency(string path)
{
    var enginePool = new GrammarEnginePool(ConfigurationManager.AppSettings["GrammarPath"]);

    var processor = new FileProcessor<Frequency, object>(
        path,
        enginePool,
        32,
        fileContextFactory: file => new Frequency(),
        sentenceContextFactory: fileContext => null,
        action: (job, frequency) =>
        {
            if (string.IsNullOrEmpty(job.Sentence))
            {
                return;
            }

            var engine = enginePool.GetInstance();
            try
            {
                var tokens = engine.TokenizeSentence(job.Sentence, Languages.RUSSIAN_LANGUAGE);
                for (int i = 0; i < tokens.Length; i++)
                {
                    string token = tokens[i];
                    using (var forms = engine.FindWordForms(token))
                    {
                        if (forms.Count == 0)
                        {
                            frequency.UnknownWords.AddOrUpdate(token.ToLower(), 1, (s, cur) => cur + 1);
                            continue;
                        }

                        // Only the first matching entry decides the word class.
                        var entry = engine.GetEntry(forms.GetEntryKey(0));
                        if (entry.WordClass == WordClassesRu.PUNCTUATION_class ||
                            entry.WordClass == WordClassesRu.NUMBER_CLASS_ru ||
                            entry.WordClass == WordClassesRu.NUM_WORD_CLASS)
                        {
                            continue;
                        }

                        frequency.KnownWords.AddOrUpdate(token.ToLower(), 1, (s, cur) => cur + 1);
                    }
                }
            }
            finally
            {
                // Return the engine even when tokenization or lookup throws, so a failing
                // sentence does not permanently remove an engine from the pool.
                enginePool.ReturnInstance(engine);
            }
        },
        fileFinalizer: (file, frequency) =>
        {
            string dir = Path.Combine(Path.GetDirectoryName(file), "frequency");

            // CreateDirectory does nothing when the directory already exists.
            Directory.CreateDirectory(dir);

            string fileName = Path.GetFileNameWithoutExtension(file);
            string ext = Path.GetExtension(file);
            string knownPath = Path.Combine(dir, fileName + "_known" + ext);
            string unknownPath = Path.Combine(dir, fileName + "_unknown" + ext);

            using (var writer = new StreamWriter(knownPath))
            {
                foreach (var pair in frequency.KnownWords.OrderByDescending(x => x.Value))
                {
                    writer.WriteLine($"{pair.Key};{pair.Value}");
                }
            }

            using (var writer = new StreamWriter(unknownPath))
            {
                foreach (var pair in frequency.UnknownWords.OrderByDescending(x => x.Value))
                {
                    writer.WriteLine($"{pair.Key};{pair.Value}");
                }
            }
        });

    processor.Process();
}