/// <summary>
/// Menu handler: lets the user pick file(s), runs the scraper that belongs to the
/// chosen dialog filter, cleans the scraper results, imports them through the
/// raw-text parser and adds the resulting corpus to the project.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event arguments (not used).</param>
private void menu_file_open_file_Click(object sender, EventArgs e)
{
    int filterIndex;
    string[] fileNames;

    // OpenFileDialog is IDisposable; the using statement releases it on every
    // exit path, including the early return when the dialog is cancelled.
    using (var ofd = new OpenFileDialog
    {
        // The scraper keys are joined with '|' to form the dialog filter string.
        Filter = string.Join("|", Configuration.AddonScrapers.Select(x => x.Key)),
        CheckFileExists = true
    })
    {
        if (ofd.ShowDialog() != DialogResult.OK)
        {
            return;
        }

        // Copy the selection out so nothing touches the dialog after disposal.
        filterIndex = ofd.FilterIndex;
        fileNames = ofd.FileNames;
    }

    // The dialog's filter index is converted to a zero-based position in the scraper list.
    var filter = Configuration.AddonScrapers.ToArray()[filterIndex - 1].Value;
    filter.Files = fileNames;

    var cleanup = new StandardCleanup { ScraperResults = filter.Execute() };

    // The import relies on the parser registered under this exact display name.
    var tagger = Configuration.AddonParsers.FirstOrDefault(x => x.DisplayName == "Keine Annotation - Nur Textimport");
    if (tagger == null)
    {
        MessageBox.Show("RawText-Tagger not available. Please install CorpusExplorer:\nhttp://corpusexplorer.de");
        return;
    }

    tagger.ScraperResults = cleanup.Execute();
    Project.Add(tagger.Execute());
    RefreshDataBinding();
}
/// <summary>
/// Builds a corpus from a descriptor of the form
/// <c>annotate#[SCRAPER]#[TAGGER]#[LANGUAGE]#[DIRECTORY]</c> by running the named
/// scraper over the files of the directory, cleaning the result and tagging it.
/// </summary>
/// <param name="path">The '#'-separated descriptor; empty segments are ignored.</param>
/// <returns>
/// The first corpus produced by the tagger (or the default value if it produced none);
/// <c>null</c> when the descriptor does not have exactly five segments or when the
/// scraper or tagger lookup yields <c>null</c>.
/// </returns>
private static AbstractCorpusAdapter LoadCorpusAnnotate(string path)
{
    // Example: annotate#BundestagPlenarprotokolleScraper#[TAGGER]#[LANGUAGE]#[DIRECTORY]
    var segments = path.Split(new[] { "#" }, StringSplitOptions.RemoveEmptyEntries);
    if (segments.Length != 5)
    {
        return null;
    }

    // segments[0] is the "annotate" prefix and carries no further information.
    var scraperName = segments[1];
    var taggerName = segments[2];
    var language = segments[3];
    var directory = segments[4];

    var scraper = Configuration.AddonScrapers.GetReflectedType(scraperName, "Scraper");
    if (scraper == null)
    {
        return null;
    }

    // The cleaner tidies up meta and text data.
    var cleaner = new StandardCleanup();

    // The tagger annotates the text data.
    var tagger = Configuration.AddonTaggers.GetReflectedType(taggerName, "Tagger");
    if (tagger == null)
    {
        return null;
    }

    tagger.LanguageSelected = language;

    // Quotation marks are stripped from the directory segment before listing its files.
    var files = Directory.GetFiles(directory.Replace("\"", ""), "*.*", SearchOption.TopDirectoryOnly);

    // All information is available now: run the pipeline scraper -> cleaner -> tagger.
    scraper.Input.Enqueue(files);
    scraper.Execute();

    cleaner.Input.Enqueue(scraper.Output);
    cleaner.Execute();

    tagger.Input.Enqueue(cleaner.Output);
    tagger.Execute();

    return tagger.Output.FirstOrDefault();
}
/// <summary>
/// Cleans the given pages, tags them with the raw-text tagger and derives the
/// corpus, the value set of its first "Wort" layer and the context vectors.
/// </summary>
/// <param name="corpus">Receives the first corpus produced by the tagger, or <c>null</c> if it is missing or empty.</param>
/// <param name="list">Receives the values of the first "Wort" layer, or <c>null</c> if no usable corpus exists.</param>
/// <param name="vecs">Receives the result of <c>ContextToVec</c>, or <c>null</c> if no usable corpus exists.</param>
/// <param name="pages">The documents to process.</param>
/// <param name="cmeta">Key/value pairs that are written to the corpus as corpus metadata.</param>
private static void ExecuteProcessingWorkflow(
    out AbstractCorpusAdapter corpus,
    out HashSet<string> list,
    out Dictionary<string, double> vecs,
    IEnumerable<Dictionary<string, object>> pages,
    Dictionary<string, object> cmeta)
{
    // Step 1: clean the text.
    var cleaner = new StandardCleanup();
    foreach (var document in pages)
    {
        cleaner.Input.Enqueue(document);
    }

    cleaner.Execute();

    // Step 2: parse the text; the tagger reads directly from the cleaner's output.
    var rawTagger = new RawTextTagger();
    rawTagger.Input = cleaner.Output;
    rawTagger.CorpusBuilder = new CorpusBuilderWriteDirect();
    rawTagger.Execute();

    // Step 3: fetch the corpus model and bail out if it is missing or empty.
    corpus = rawTagger.Output.FirstOrDefault();
    var hasContent = corpus != null && corpus.CountDocuments != 0 && corpus.CountToken != 0;
    if (!hasContent)
    {
        corpus = null;
        list = null;
        vecs = null;
        return;
    }

    // Step 4: attach the supplied metadata to the corpus.
    foreach (var entry in cmeta)
    {
        corpus.SetCorpusMetadata(entry.Key, entry.Value);
    }

    // Step 5: collect the layer values and compute the vectors.
    var wordLayer = corpus.GetLayers("Wort").First();
    list = new HashSet<string>(wordLayer.Values);
    vecs = ContextToVec(corpus);
}
/// <summary>
/// Tags the given documents in the requested language, saves the resulting corpus
/// under <c>corpora/</c> and answers the request with the corpus id as JSON.
/// An error is written to the request instead when the language is not offered by
/// the tagger or when tagging yields no usable corpus.
/// </summary>
/// <param name="req">The HTTP context that receives the response or the error.</param>
/// <param name="language">The tagger language; must be one of the tagger's available languages.</param>
/// <param name="docs">The documents to tag.</param>
/// <param name="enableCleanup">
/// When <c>true</c>, the documents pass through <c>StandardCleanup</c> and
/// <c>RegexXmlMarkupCleanup</c> before tagging; otherwise they are tagged as given.
/// </param>
private void UseTagger(HttpContext req, string language, Dictionary<string, object>[] docs, bool enableCleanup)
{
    var tagger = new SimpleTreeTagger();

    var available = new HashSet<string>(tagger.LanguagesAvailabel);
    if (!available.Contains(language))
    {
        WriteError(req, string.Format(Resources.WebErrorWrongLanguage, string.Join(", ", available)));
        return;
    }

    if (enableCleanup)
    {
        var cleaner1 = new StandardCleanup();
        cleaner1.Input.Enqueue(docs);
        cleaner1.Execute();

        // The second cleaner consumes the output of the first one.
        var cleaner2 = new RegexXmlMarkupCleanup { Input = cleaner1.Output };
        cleaner2.Execute();

        tagger.Input = cleaner2.Output;
    }
    else
    {
        tagger.Input.Enqueue(docs);
    }

    tagger.LanguageSelected = language;
    tagger.Execute();

    // FirstOrDefault instead of First: an empty tagger output must end up in the
    // error branch below rather than throw before the null check is reached.
    var corpus = tagger.Output.FirstOrDefault();
    if (corpus == null || corpus.CountDocuments == 0 || corpus.CountToken == 0)
    {
        WriteError(req, Resources.WebErrorTaggingProcessError);
        return;
    }

    // The same id names the saved file and is reported to the client.
    var corpusId = corpus.CorporaGuids.First();
    corpus.Save($"corpora/{corpusId}.cec6", false);
    req.Response.Send($"{{ \"corpusId\": \"{corpusId}\" }}", "application/json");
}