/// <summary>
/// Builds a corpus from a single inline text. Delegates to <see cref="Inline"/> and
/// keeps only the corpus; the word set and the vector dictionary are discarded.
/// </summary>
/// <param name="inlineText">The raw text to process (forwarded by reference).</param>
/// <param name="corpus">Receives whatever corpus <see cref="Inline"/> produces.</param>
public static void InlineCorpus(ref string inlineText, out AbstractCorpusAdapter corpus)
{
    // Both values are required by the Inline signature but are of no interest here.
    HashSet<string> unusedWords;
    Dictionary<string, double> unusedVectors;

    Inline(ref inlineText, out corpus, out unusedWords, out unusedVectors);
}
/// <summary>
/// Runs the pages through cleanup and tagging, then derives the word set and the
/// vector dictionary from the first resulting corpus.
/// </summary>
/// <param name="corpus">
/// Receives the first corpus emitted by the tagger, or <c>null</c> when there is none
/// or it reports zero documents or zero tokens.
/// </param>
/// <param name="list">
/// Receives the values of the first "Wort" layer as a set, or <c>null</c> when no usable
/// corpus was produced.
/// </param>
/// <param name="vecs">
/// Receives the result of <c>ContextToVec</c> for the corpus, or <c>null</c> when no
/// usable corpus was produced.
/// </param>
/// <param name="pages">The page dictionaries fed into the cleanup step.</param>
/// <param name="cmeta">Key/value pairs copied onto the corpus as corpus metadata.</param>
private static void ExecuteProcessingWorkflow(out AbstractCorpusAdapter corpus, out HashSet<string> list, out Dictionary<string, double> vecs, IEnumerable<Dictionary<string, object>> pages, Dictionary<string, object> cmeta)
{
    // CLEAN TEXT
    var cleaner = new StandardCleanup();
    foreach (var page in pages)
    {
        cleaner.Input.Enqueue(page);
    }

    cleaner.Execute();

    // PARSE TEXT
    var textTagger = new RawTextTagger();
    textTagger.Input = cleaner.Output;
    textTagger.CorpusBuilder = new CorpusBuilderWriteDirect();
    textTagger.Execute();

    // GET CORPUS-MODEL
    corpus = textTagger.Output.FirstOrDefault();
    var isUsable = corpus != null && corpus.CountDocuments != 0 && corpus.CountToken != 0;
    if (!isUsable)
    {
        // Nothing to work with: hand back nulls for all three outputs.
        corpus = null;
        list = null;
        vecs = null;
        return;
    }

    // POST-PRODUCTION
    foreach (var entry in cmeta)
    {
        corpus.SetCorpusMetadata(entry.Key, entry.Value);
    }

    // SAVE MODEL
    var wordLayer = corpus.GetLayers("Wort").First();
    list = new HashSet<string>(wordLayer.Values);
    vecs = ContextToVec(corpus);
}
/// <summary>
/// Wraps the text in a single page (keys "Text" and "PAGE"), detects its language and
/// runs the processing workflow with that language stored under the "LANGUAGE" key.
/// </summary>
/// <param name="inlineText">The raw text to process.</param>
/// <param name="corpus">Receives the corpus produced by the workflow (may be <c>null</c>).</param>
/// <param name="list">Receives the word set produced by the workflow (may be <c>null</c>).</param>
/// <param name="vecs">Receives the vector dictionary produced by the workflow (may be <c>null</c>).</param>
public static void Inline(ref string inlineText, out AbstractCorpusAdapter corpus, out HashSet<string> list, out Dictionary<string, double> vecs)
{
    var singlePage = new Dictionary<string, object>();
    singlePage.Add("Text", inlineText);
    singlePage.Add("PAGE", 1);

    // Declared with 'var' on purpose: the helper below takes this local by reference.
    var pages = new List<Dictionary<string, object>>();
    pages.Add(singlePage);

    // DETECT LANGUAGE
    var detectedLanguage = LanguageDetectorHelper.DetectLanguage(ref pages);
    var cmeta = new Dictionary<string, object>();
    cmeta.Add("LANGUAGE", detectedLanguage);

    ExecuteProcessingWorkflow(out corpus, out list, out vecs, pages, cmeta);
}
/// <summary>
/// Builds a weighted dictionary for the first document of the corpus' first "Wort" layer:
/// each sufficiently frequent layer value is mapped to its relative frequency multiplied
/// by the value the language vector model holds for it.
/// </summary>
/// <param name="corpus">The corpus to read; may be <c>null</c>.</param>
/// <returns>
/// <c>null</c> when the corpus, the "Wort" layer, or the first document is missing;
/// an empty dictionary when the document contains no tokens; otherwise one entry per
/// retained value that is also present in the vector model.
/// </returns>
private static Dictionary<string, double> ContextToVec(AbstractCorpusAdapter corpus)
{
    // FirstOrDefault/Any instead of First: an empty sequence is treated like a missing
    // layer or document (null result) rather than raising InvalidOperationException.
    var layer = corpus?.GetLayers("Wort")?.FirstOrDefault();
    if (layer == null || !layer.DocumentGuids.Any())
    {
        return null;
    }

    var doc = layer[layer.DocumentGuids.First()];
    if (doc == null)
    {
        return null;
    }

    // count = total number of tokens; dic = occurrences per resolved layer value.
    var count = 0.0;
    var dic = new Dictionary<string, double>();
    foreach (var s in doc)
    {
        count += s.Length;
        foreach (var w in s)
        {
            var key = layer[w];

            // Single lookup: 'seen' stays 0 when the key is not present yet.
            double seen;
            dic.TryGetValue(key, out seen);
            dic[key] = seen + 1;
        }
    }

    if (count <= 0)
    {
        // No tokens at all. Math.Log(0) is -Infinity, and converting that to int is
        // unspecified, so stop here; nothing could survive the filter below anyway.
        return new Dictionary<string, double>();
    }

    // Frequency threshold grows logarithmically with the document length.
    var min = (int)(1 + Math.Log(count / 500));
    dic = dic.Where(x => x.Value > min).ToDictionary(x => x.Key, x => x.Value);

    var languageVectors = LanguageVectorModelRepository.GetModel((string)corpus.GetCorpusMetadata("LANGUAGE"));
    var model = GetVectors(languageVectors, dic.Keys.ToArray());

    // Keep only values known to the model; weight = relative frequency * model value.
    return dic.Where(x => model.ContainsKey(x.Key))
              .ToDictionary(x => x.Key, x => x.Value / count * model[x.Key]);
}
/// <summary>
/// Creates the web service, forwarding writer, address, port and timeout to the base
/// class, and loads the corpus from <paramref name="file"/> while reporting progress
/// on the console.
/// </summary>
/// <param name="writer">Table writer passed through to the base constructor.</param>
/// <param name="ip">IP address passed through to the base constructor.</param>
/// <param name="port">Port passed through to the base constructor.</param>
/// <param name="file">Corpus file to load; also inserted into the start-up message.</param>
/// <param name="timeout">Timeout passed through to the base constructor (default 0).</param>
public WebService(AbstractTableWriter writer, string ip, int port, string file, int timeout = 0)
    : base(writer, ip, port, timeout)
{
    // Print the start message without a line break so the confirmation lands on the same line.
    System.Console.Write(Resources.WebInit, file);

    _corpus = CorpusLoadHelper.LoadCorpus(file);

    System.Console.WriteLine(Resources.Ok);
}