예제 #1
0
    /// <summary>
    /// Handles the "open file" menu click: lets the user pick files, runs the scraper that
    /// belongs to the selected filter entry, cleans the scraper results, imports them with the
    /// text-only parser and adds the resulting corpus to the project.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event arguments (unused).</param>
    private void menu_file_open_file_Click(object sender, EventArgs e)
    {
      // Take one snapshot so the filter string and the index lookup below
      // are guaranteed to use the same ordering.
      var scrapers = Configuration.AddonScrapers.ToArray();

      int selectedIndex;
      string[] selectedFiles;

      // OpenFileDialog is IDisposable - release it as soon as the user's choice is read.
      using (var ofd = new OpenFileDialog
      {
        Filter = string.Join("|", scrapers.Select(x => x.Key)),
        CheckFileExists = true
      })
      {
        if (ofd.ShowDialog() != DialogResult.OK)
        {
          return;
        }

        // FilterIndex is 1-based.
        selectedIndex = ofd.FilterIndex - 1;
        selectedFiles = ofd.FileNames;
      }

      // Guards against an empty scraper list (or any unexpected filter index).
      if (selectedIndex < 0 || selectedIndex >= scrapers.Length)
      {
        return;
      }

      var filter = scrapers[selectedIndex].Value;
      filter.Files = selectedFiles;

      var cleanup = new StandardCleanup
      {
        ScraperResults = filter.Execute()
      };

      // The parser is looked up by its (German) display name.
      var tagger = Configuration.AddonParsers.FirstOrDefault(x => x.DisplayName == "Keine Annotation - Nur Textimport");
      if (tagger == null)
      {
        MessageBox.Show("RawText-Tagger not available. Please install CorpusExplorer:\nhttp://corpusexplorer.de");
        return;
      }
      tagger.ScraperResults = cleanup.Execute();

      Project.Add(tagger.Execute());

      RefreshDataBinding();
    }
        /// <summary>
        /// Builds a corpus from a '#'-separated specification by running a scraper, the
        /// standard cleanup and a tagger over the files of a directory.
        /// </summary>
        /// <param name="path">
        /// Specification, e.g.
        /// <c>annotate#BundestagPlenarprotokolleScraper#[TAGGER]#[LANGUAGE]#[DIRECTORY]</c>.
        /// </param>
        /// <returns>
        /// The first element of the tagger output (or the default value if the output is empty);
        /// <c>null</c> if the specification does not consist of exactly five parts or if the
        /// scraper or tagger cannot be resolved.
        /// </returns>
        private static AbstractCorpusAdapter LoadCorpusAnnotate(string path)
        {
            var parts = path.Split(new[] { "#" }, StringSplitOptions.RemoveEmptyEntries);
            if (parts.Length != 5)
            {
                return null;
            }

            // parts[0] is the leading marker (e.g. "annotate") and is not evaluated further.
            var scraperName   = parts[1];
            var taggerName    = parts[2];
            var languageName  = parts[3];
            var directorySpec = parts[4];

            var scraper = Configuration.AddonScrapers.GetReflectedType(scraperName, "Scraper");
            if (scraper == null)
            {
                return null;
            }

            // The cleaner tidies up meta and text data.
            var cleaner = new StandardCleanup();

            // The tagger annotates the text data.
            var tagger = Configuration.AddonTaggers.GetReflectedType(taggerName, "Tagger");
            if (tagger == null)
            {
                return null;
            }

            tagger.LanguageSelected = languageName;

            // Strip any double quotes from the directory and list its direct children only.
            var directory = directorySpec.Replace("\"", "");
            var files     = Directory.GetFiles(directory, "*.*", SearchOption.TopDirectoryOnly);

            // All information is available now - run the pipeline: scraper -> cleaner -> tagger.
            scraper.Input.Enqueue(files);
            scraper.Execute();

            cleaner.Input.Enqueue(scraper.Output);
            cleaner.Execute();

            tagger.Input.Enqueue(cleaner.Output);
            tagger.Execute();

            return tagger.Output.FirstOrDefault();
        }
        /// <summary>
        /// Cleans the given pages, tags them as raw text and derives the word list and the
        /// context vectors from the resulting corpus.
        /// </summary>
        /// <param name="corpus">
        /// Receives the first corpus produced by the tagger, or <c>null</c> if there is none
        /// or it reports zero documents or zero tokens.
        /// </param>
        /// <param name="list">
        /// Receives the distinct values of the first "Wort" layer, or <c>null</c> if no usable
        /// corpus was produced.
        /// </param>
        /// <param name="vecs">
        /// Receives the result of <c>ContextToVec</c> for the corpus, or <c>null</c> if no
        /// usable corpus was produced.
        /// </param>
        /// <param name="pages">Documents to process; each one is enqueued into the cleanup step.</param>
        /// <param name="cmeta">Key/value pairs that are stored as corpus metadata.</param>
        private static void ExecuteProcessingWorkflow(
            out AbstractCorpusAdapter corpus,
            out HashSet<string> list,
            out Dictionary<string, double> vecs,
            IEnumerable<Dictionary<string, object>> pages,
            Dictionary<string, object> cmeta)
        {
            // Step 1: clean the text.
            var cleaner = new StandardCleanup();
            foreach (var page in pages)
            {
                cleaner.Input.Enqueue(page);
            }

            cleaner.Execute();

            // Step 2: parse the cleaned text.
            var parser = new RawTextTagger();
            parser.Input         = cleaner.Output;
            parser.CorpusBuilder = new CorpusBuilderWriteDirect();
            parser.Execute();

            // Step 3: fetch the corpus model and reject empty results.
            corpus = parser.Output.FirstOrDefault();
            var hasContent = corpus != null && corpus.CountDocuments != 0 && corpus.CountToken != 0;
            if (!hasContent)
            {
                corpus = null;
                list   = null;
                vecs   = null;
                return;
            }

            // Step 4: attach the supplied metadata to the corpus.
            foreach (var entry in cmeta)
            {
                corpus.SetCorpusMetadata(entry.Key, entry.Value);
            }

            // Step 5: derive the word list and the vectors from the corpus.
            var wordLayer = corpus.GetLayers("Wort").First();
            list = new HashSet<string>(wordLayer.Values);
            vecs = ContextToVec(corpus);
        }
        /// <summary>
        /// Tags the given documents with the <c>SimpleTreeTagger</c>, saves the resulting corpus
        /// below <c>corpora/</c> and answers the request with the corpus id as JSON.
        /// </summary>
        /// <param name="req">Request context the error or the JSON result is written to.</param>
        /// <param name="language">
        /// Language to tag with; must be one of the tagger's available languages, otherwise an
        /// error listing the available languages is written.
        /// </param>
        /// <param name="docs">Documents to tag.</param>
        /// <param name="enableCleanup">
        /// If <c>true</c>, the documents are passed through <c>StandardCleanup</c> and
        /// <c>RegexXmlMarkupCleanup</c> before tagging; otherwise they are tagged as-is.
        /// </param>
        private void UseTagger(HttpContext req, string language, Dictionary<string, object>[] docs,
                               bool enableCleanup)
        {
            var tagger    = new SimpleTreeTagger();
            var available = new HashSet<string>(tagger.LanguagesAvailabel);

            if (!available.Contains(language))
            {
                WriteError(req, string.Format(Resources.WebErrorWrongLanguage, string.Join(", ", available)));
                return;
            }

            if (enableCleanup)
            {
                // Two-stage cleanup: the output of the first cleaner feeds the second one,
                // whose output in turn becomes the tagger input.
                var cleaner1 = new StandardCleanup();
                cleaner1.Input.Enqueue(docs);
                cleaner1.Execute();
                var cleaner2 = new RegexXmlMarkupCleanup {
                    Input = cleaner1.Output
                };
                cleaner2.Execute();
                tagger.Input = cleaner2.Output;
            }
            else
            {
                tagger.Input.Enqueue(docs);
            }

            tagger.LanguageSelected = language;
            tagger.Execute();

            // FirstOrDefault instead of First: an empty tagger output must end up in the error
            // branch below rather than throwing an InvalidOperationException.
            var corpus = tagger.Output.FirstOrDefault();

            if (corpus == null || corpus.CountDocuments == 0 || corpus.CountToken == 0)
            {
                WriteError(req, Resources.WebErrorTaggingProcessError);
                return;
            }

            // Read the id once so the file name and the id sent to the client always match.
            var corpusId = corpus.CorporaGuids.First();

            corpus.Save($"corpora/{corpusId}.cec6", false);
            req.Response.Send($"{{ \"corpusId\": \"{corpusId}\" }}", "application/json");
        }