/// <summary>
/// Runs morphological analysis over every sentence of the file(s) matched by
/// <paramref name="path"/> and stores the resulting tokens on each sentence context.
/// </summary>
/// <param name="path">File path or directory/pattern forwarded to <c>FileProcessor</c>.</param>
/// <param name="sequential">
/// When <c>true</c> a single task is used; otherwise one task per logical processor.
/// </param>
public void Morphology(string path, [DefaultValue(false)] bool sequential)
        {
            var       enginePool = new GrammarEnginePool(ConfigurationManager.AppSettings["GrammarPath"]);
            const int batchSize  = 16;

            _log.Info($"Got {Environment.ProcessorCount} threads. Batch size for each thread: {batchSize}.");

            var processor = new FileProcessor <MorphologyFileContext, Sentence>(
                path,
                enginePool,
                batchSize,
                fileContextFactory: file => new MorphologyFileContext(file),
                sentenceContextFactory: fileContext => fileContext.CreateSentence(),
                action: (job, context) =>
            {
                // Empty sentences carry no tokens: mark them ready and flush without touching the engine.
                if (string.IsNullOrEmpty(job.Sentence))
                {
                    job.Context.IsReady = true;
                    context.Flush();
                    return;
                }

                var engine = enginePool.GetInstance();
                try
                {
                    using (var lemmatized = engine.AnalyzeMorphology(job.Sentence, Languages.RUSSIAN_LANGUAGE, MorphologyFlags.SOL_GREN_MODEL | MorphologyFlags.SOL_GREN_MODEL_ONLY))
                    {
                        if (lemmatized.Nodes.Length == 0)
                        {
                            // NOTE(review): this path neither sets IsReady nor flushes, unlike the
                            // empty-sentence path above - confirm that this is intentional.
                            return;
                        }

                        var tokens = new Token[lemmatized.Nodes.Length];
                        for (int i = 0; i < lemmatized.Nodes.Length; i++)
                        {
                            var node  = lemmatized.Nodes[i];
                            tokens[i] = new Token(node.SourceWord, node.Entry.Id);
                        }

                        job.Context.Tokens  = tokens;
                        job.Context.IsReady = true;
                    }
                }
                finally
                {
                    // Always hand the engine back, including on the early return above and on
                    // exceptions; previously those paths never returned the instance to the pool.
                    enginePool.ReturnInstance(engine);
                }

                context.Flush();
            },
                fileFinalizer: (file, context) =>
            {
                context.FinalizeFile();
            },
                numTasks: sequential ? 1 : Environment.ProcessorCount);

            processor.Process();
        }
Example #2
0
 /// <summary>
 /// Convenience overload that forwards every argument to the full constructor and
 /// uses <see cref="Environment.ProcessorCount"/> as the number of tasks.
 /// </summary>
 public FileProcessor(
     string path,
     GrammarEnginePool enginePool,
     int batchSize,
     Func<string, TFileContext> fileContextFactory,
     Func<TFileContext, TSentenceContext> sentenceContextFactory,
     Action<SentenceJob<TSentenceContext>, TFileContext> action,
     Action<string, TFileContext> fileFinalizer)
     : this(
         path: path,
         enginePool: enginePool,
         batchSize: batchSize,
         fileContextFactory: fileContextFactory,
         sentenceContextFactory: sentenceContextFactory,
         action: action,
         fileFinalizer: fileFinalizer,
         numTasks: Environment.ProcessorCount)
 {
 }
Example #3
0
        /// <summary>
        /// Stores the processing callbacks and settings and resolves <paramref name="path"/>
        /// to the list of files to process.
        /// </summary>
        /// <param name="path">
        /// An existing file, an existing directory (all of its files are taken), or a
        /// directory plus search pattern such as <c>corpus\*.txt</c>. A bare pattern with no
        /// directory part is resolved against the current directory.
        /// </param>
        /// <param name="enginePool">Engine pool stored for later use by the processor.</param>
        /// <param name="batchSize">Batch size stored for later use by the processor.</param>
        /// <param name="fileContextFactory">Creates the per-file context from a file path.</param>
        /// <param name="sentenceContextFactory">Creates a per-sentence context from the file context.</param>
        /// <param name="action">Callback invoked for a sentence job together with its file context.</param>
        /// <param name="fileFinalizer">Callback invoked with a file path and its file context.</param>
        /// <param name="numTasks">Number of tasks stored for later use by the processor.</param>
        /// <exception cref="InvalidOperationException">
        /// <paramref name="path"/> is neither a file nor a directory, and its directory part does not exist.
        /// </exception>
        public FileProcessor(string path,
                             GrammarEnginePool enginePool,
                             int batchSize,
                             Func <string, TFileContext> fileContextFactory,
                             Func <TFileContext, TSentenceContext> sentenceContextFactory,
                             Action <SentenceJob <TSentenceContext>, TFileContext> action,
                             Action <string, TFileContext> fileFinalizer,
                             int numTasks)
        {
            _enginePool             = enginePool;
            _batchSize              = batchSize;
            _action                 = action;
            _numTasks               = numTasks;
            _fileFinalizer          = fileFinalizer;
            _fileContextFactory     = fileContextFactory;
            _sentenceContextFactory = sentenceContextFactory;
            _log.Info($"Path to process is: {path}");
            if (File.Exists(path))
            {
                _log.Info("This is a single file.");
                _files = new[] { path };
                return;
            }

            string dir;
            string pattern;

            if (Directory.Exists(path))
            {
                // An existing directory given without a trailing separator would otherwise be
                // split into "parent" + "leaf name as pattern" and match nothing.
                dir     = path;
                pattern = "*";
            }
            else
            {
                dir = Path.GetDirectoryName(path);

                // An empty (but non-null) directory part means a bare pattern such as "*.txt";
                // resolve it against the current directory. A null result (null or root path)
                // is left alone so it still fails the check below.
                if (dir != null && dir.Length == 0)
                {
                    dir = Directory.GetCurrentDirectory();
                }

                if (!Directory.Exists(dir))
                {
                    throw new InvalidOperationException("Specified path is not a file or directory!");
                }

                pattern = Path.GetFileName(path);
                if (string.IsNullOrEmpty(pattern))
                {
                    pattern = "*";
                }
            }

            _files = Directory.GetFiles(dir, pattern);

            _log.Info($"Discovered {_files.Length} files.");
        }
        /// <summary>
        /// Counts lower-cased token frequencies for the file(s) matched by <paramref name="path"/>
        /// and writes, per input file, two semicolon-separated lists (known and unknown words)
        /// into a <c>frequency</c> subdirectory next to that file.
        /// </summary>
        /// <param name="path">File path or directory/pattern forwarded to <c>FileProcessor</c>.</param>
        public void WordFrequency(string path)
        {
            var enginePool = new GrammarEnginePool(ConfigurationManager.AppSettings["GrammarPath"]);
            var processor  = new FileProcessor <Frequency, object>(
                path,
                enginePool,
                32,
                fileContextFactory: file => new Frequency(),
                sentenceContextFactory: fileContext => null,
                action: (job, frequency) =>
            {
                if (string.IsNullOrEmpty(job.Sentence))
                {
                    return;
                }

                var engine = enginePool.GetInstance();
                try
                {
                    var tokens = engine.TokenizeSentence(job.Sentence, Languages.RUSSIAN_LANGUAGE);
                    for (int i = 0; i < tokens.Length; i++)
                    {
                        string token = tokens[i];
                        using (var forms = engine.FindWordForms(token))
                        {
                            // Culture-invariant lower-casing keeps the dictionary keys identical
                            // regardless of the locale of the machine running the tool.
                            string key = token.ToLowerInvariant();

                            // No word forms found: count the token as an unknown word.
                            if (forms.Count == 0)
                            {
                                frequency.UnknownWords.AddOrUpdate(key, 1, (s, cur) => cur + 1);
                                continue;
                            }

                            // Punctuation and numeric classes are excluded from the known-word counts.
                            var entry = engine.GetEntry(forms.GetEntryKey(0));
                            if (entry.WordClass == WordClassesRu.PUNCTUATION_class ||
                                entry.WordClass == WordClassesRu.NUMBER_CLASS_ru ||
                                entry.WordClass == WordClassesRu.NUM_WORD_CLASS)
                            {
                                continue;
                            }

                            frequency.KnownWords.AddOrUpdate(key, 1, (s, cur) => cur + 1);
                        }
                    }
                }
                finally
                {
                    // Return the engine even when tokenizing or lookup throws, so a failing
                    // sentence does not permanently remove an instance from the pool.
                    enginePool.ReturnInstance(engine);
                }
            },
                fileFinalizer: (file, frequency) =>
            {
                string dir = Path.Combine(Path.GetDirectoryName(file), "frequency");

                // CreateDirectory does nothing when the directory already exists.
                Directory.CreateDirectory(dir);

                string fileName    = Path.GetFileNameWithoutExtension(file);
                string ext         = Path.GetExtension(file);
                string knownPath   = Path.Combine(dir, fileName + "_known" + ext);
                string unknownPath = Path.Combine(dir, fileName + "_unknown" + ext);

                // Each output line is "word;count", most frequent first.
                using (var writer = new StreamWriter(knownPath))
                {
                    foreach (var pair in frequency.KnownWords.OrderByDescending(x => x.Value))
                    {
                        writer.WriteLine($"{pair.Key};{pair.Value}");
                    }
                }

                using (var writer = new StreamWriter(unknownPath))
                {
                    foreach (var pair in frequency.UnknownWords.OrderByDescending(x => x.Value))
                    {
                        writer.WriteLine($"{pair.Key};{pair.Value}");
                    }
                }
            });

            processor.Process();
        }