/// <summary>
/// Runs the processing pipeline over <paramref name="pages"/>: the pages are queued into a
/// <c>StandardCleanup</c>, its output is tagged by a <c>RawTextTagger</c>, and the first corpus
/// the tagger produced is handed back together with the distinct values of its "Wort" layer
/// and the result of <c>ContextToVec</c>.
/// </summary>
/// <param name="corpus">
/// The first corpus produced by the tagger, or <c>null</c> when no usable corpus was produced
/// (no corpus, zero documents, zero tokens, or no "Wort" layer).
/// </param>
/// <param name="list">The distinct values of the "Wort" layer, or <c>null</c> on failure.</param>
/// <param name="vecs">The result of <c>ContextToVec</c> for the corpus, or <c>null</c> on failure.</param>
/// <param name="pages">The documents to process; <c>null</c> is treated like an empty sequence.</param>
/// <param name="cmeta">Corpus metadata applied to the corpus; <c>null</c> is treated as "no metadata".</param>
private static void ExecuteProcessingWorkflow(
    out AbstractCorpusAdapter corpus,
    out HashSet<string> list,
    out Dictionary<string, double> vecs,
    IEnumerable<Dictionary<string, object>> pages,
    Dictionary<string, object> cmeta)
{
    // CLEAN TEXT
    var cleanup = new StandardCleanup();
    if (pages != null)
    {
        foreach (var page in pages)
        {
            cleanup.Input.Enqueue(page);
        }
    }

    cleanup.Execute();

    // PARSE TEXT
    var tagger = new RawTextTagger
    {
        Input = cleanup.Output,
        CorpusBuilder = new CorpusBuilderWriteDirect()
    };
    tagger.Execute();

    // GET CORPUS-MODEL
    corpus = tagger.Output.FirstOrDefault();
    var isUsable = corpus != null && corpus.CountDocuments != 0 && corpus.CountToken != 0;

    // A corpus without a "Wort" layer is reported through the same all-null failure
    // convention as an empty corpus, instead of letting First() throw.
    var wordLayer = isUsable ? corpus.GetLayers("Wort")?.FirstOrDefault() : null;
    if (wordLayer == null)
    {
        corpus = null;
        list = null;
        vecs = null;
        return;
    }

    // POST-PRODUCTION
    if (cmeta != null)
    {
        foreach (var m in cmeta)
        {
            corpus.SetCorpusMetadata(m.Key, m.Value);
        }
    }

    // SAVE MODEL
    list = new HashSet<string>(wordLayer.Values);
    vecs = ContextToVec(corpus);
}
/// <summary>
/// Builds a weighted vector from the first document of the corpus' "Wort" layer.
/// Each layer value is counted, values whose count does not exceed a size-dependent
/// threshold are dropped, and the remaining relative frequencies (count / total token
/// count) are multiplied by the value that the language vector model holds for that key.
/// </summary>
/// <param name="corpus">The corpus to read; may be <c>null</c>.</param>
/// <returns>
/// <c>null</c> when the corpus, its "Wort" layer or the layer's first document is unavailable;
/// an empty dictionary when the document has no tokens; otherwise one entry per retained
/// key that is also present in the model returned by <c>GetVectors</c>.
/// </returns>
private static Dictionary<string, double> ContextToVec(AbstractCorpusAdapter corpus)
{
    // FirstOrDefault/Any instead of First: an absent layer or a layer without documents
    // yields null rather than an InvalidOperationException.
    var layer = corpus?.GetLayers("Wort")?.FirstOrDefault();
    if (layer == null || layer.DocumentGuids == null || !layer.DocumentGuids.Any())
    {
        return null;
    }

    var doc = layer[layer.DocumentGuids.First()];
    if (doc == null)
    {
        return null;
    }

    var count = 0.0;
    var dic = new Dictionary<string, double>();
    foreach (var s in doc)
    {
        count += s.Length;
        foreach (var w in s)
        {
            var key = layer[w];

            // Single lookup: 'current' stays 0 when the key is not present yet.
            double current;
            dic.TryGetValue(key, out current);
            dic[key] = current + 1;
        }
    }

    // Without tokens there is nothing to weight. Returning here also avoids
    // Math.Log(0) == -Infinity, whose conversion to int is not well defined.
    if (count <= 0)
    {
        return dic;
    }

    // Minimum-count threshold grows logarithmically with the token count (relative to 500).
    var min = (int)(1 + Math.Log(count / 500));
    dic = dic.Where(x => x.Value > min).ToDictionary(x => x.Key, x => x.Value);

    var languageVectors = LanguageVectorModelRepository.GetModel((string)corpus.GetCorpusMetadata("LANGUAGE"));
    var model = GetVectors(languageVectors, dic.Keys.ToArray());

    // Only keys known to the model are kept; relative frequency is scaled by the model value.
    return dic.Where(x => model.ContainsKey(x.Key))
              .ToDictionary(x => x.Key, x => x.Value / count * model[x.Key]);
}