/// <summary>
/// Tokenizes and tags the given text with the shared tagger held in <c>Global.mPosTagger</c>
/// and returns the tagged corpus serialized as a string.
/// </summary>
/// <param name="text">Raw text to tokenize and tag.</param>
/// <param name="xmlOutput">
/// When true the corpus is serialized in the "XML-MI" format; otherwise in the "TBL" format.
/// </param>
/// <returns>The serialized tagged corpus.</returns>
public string Tag(string text, bool xmlOutput)
{
    // Wait for the global ready flag, polling every 100 ms.
    while (!Global.mReady)
    {
        Thread.Sleep(100);
    }

    Corpus taggedCorpus = new Corpus();
    taggedCorpus.LoadFromTextSsjTokenizer(text);

    // The tagger reports lemma statistics through out parameters; they are not used here.
    int correctLemmas;
    int correctLemmasLowercase;
    int lemmatizedWords;
    Global.mPosTagger.Tag(taggedCorpus, out correctLemmas, out correctLemmasLowercase, out lemmatizedWords, /*xmlMode=*/ false);

    string formatName = xmlOutput ? "XML-MI" : "TBL";
    return taggedCorpus.ToString(formatName);
}
/// <summary>
/// Entry point of the meta-tagger command-line tool.
/// Loads the outputs of two taggers (XML), verifies that they describe the same token
/// sequence, and then, for every token on which the two taggers disagree, asks a decision
/// tree which tagger to trust. Whenever the tree's answer is not 1, the lemma and tag of the
/// second tagger replace those of the first. The merged corpus is written in "XML-MI" format.
/// </summary>
/// <param name="args">
/// Command-line arguments; fewer than five arguments prints the help text. The arguments are
/// interpreted by <c>ParseParams</c>.
/// </param>
static void Main(string[] args)
{
    try
    {
        if (args.Length < 5)
        {
            OutputHelp();
            return;
        }

        string tags_1_file_name = null, tags_2_file_name = null, tbl_file_name = null, tree_file_name = null, output_file_name = null;
        if (!ParseParams(args, ref m_verbose, ref m_consistency_check, ref tags_1_file_name, ref tags_2_file_name, ref tbl_file_name, ref tree_file_name, ref output_file_name))
        {
            return;
        }

        Verbose("Nalagam izhod prvega označevalnika ...\r\n");
        Corpus corpus_1 = new Corpus();
        corpus_1.LoadFromXml(tags_1_file_name, /*tag_len=*/ -1);

        Verbose("Nalagam izhod drugega označevalnika ...\r\n");
        Corpus corpus_2 = new Corpus();
        corpus_2.LoadFromXml(tags_2_file_name, /*tag_len=*/ -1);

        // The lengths must match in every mode: the word-by-word check below and the merge
        // loop both index the two corpora in parallel, so a length mismatch would otherwise
        // surface as an out-of-range exception (or go unnoticed if the second corpus is longer).
        if (corpus_1.TaggedWords.Count != corpus_2.TaggedWords.Count)
        {
            Console.WriteLine("*** Napaka! Dolžini vhodnih korpusov se ne ujemata.");
            return;
        }

        if (m_consistency_check)
        {
            Verbose("Preverjam ujemanje besed v vhodnih korpusih ...\r\n");
            for (int i = 0; i < corpus_1.TaggedWords.Count; i++)
            {
                // Words are compared case-insensitively (lower-cased on both sides).
                if (corpus_1.TaggedWords[i].Word.ToLower() != corpus_2.TaggedWords[i].Word.ToLower())
                {
                    Console.WriteLine("*** Napaka! Besede v vhodnih korpusih se ne ujemajo.");
                    return;
                }
            }
        }

        Verbose("Nalagam tabelo oznak ...\r\n");
        MetaTaggerData.LoadAttributes(tbl_file_name);

        Verbose("Nalagam odločitveno drevo ...\r\n");
        Tree tree = new Tree(tree_file_name);

        Verbose("Označujem besedilo ...\r\n");
        MetaTaggerData.LoadTestData(corpus_1, corpus_2);
        for (int i = 0; i < MetaTaggerData.Items.Count; i++)
        {
            // Only tokens on which the two taggers disagree need a decision.
            if (MetaTaggerData.Items[i].Tag1 == MetaTaggerData.Items[i].Tag2)
            {
                continue;
            }

            ArrayList <KeyDat <string, string> > test_example = MetaTaggerData.CreateExample(i);
            // Any classification other than 1 selects the second tagger's lemma and tag.
            if (tree.Classify(test_example) != 1)
            {
                corpus_1.TaggedWords[i].Lemma = MetaTaggerData.Items[i].Lemma2;
                corpus_1.TaggedWords[i].Tag = MetaTaggerData.Items[i].Tag2;
            }
        }

        Verbose("Pišem izhodno datoteko ...\r\n");
        // 'using' guarantees the file handle is flushed and released even if Write throws.
        using (StreamWriter writer = new StreamWriter(output_file_name))
        {
            writer.Write(corpus_1.ToString("XML-MI"));
        }

        Verbose("Končano.\r\n");
    }
    catch (Exception exception)
    {
        // Top-level safety net: report the failure on the console instead of crashing.
        Console.WriteLine("*** Nepričakovana napaka. Podrobnosti: {0}\r\n{1}", exception, exception.StackTrace);
    }
}
/// <summary>
/// Entry point of the blog-corpus processing tool.
/// Input files from <c>Config.DataFolder</c> are processed in batches of up to
/// <c>Config.BatchSize</c> files: each file is tagged, the tagged texts of a batch are merged
/// into one XML document, the document is run through <c>Parser.Parse</c>, and the parsed
/// result is split back into one output file per input, combined with the original input XML
/// and the blog meta-data.
/// </summary>
/// <param name="args">
/// Optional; the first argument overrides the default file search pattern "*.xml".
/// </param>
static void Main(string[] args)
{
    Console.WriteLine("Nalagam meta-podatke o blogih...");
    LoadBlogMetaData();

    Console.WriteLine("Nalagam oznacevalnik...");
    PartOfSpeechTagger tagger = new PartOfSpeechTagger(Config.PosTaggerModel, Config.LemmatizerModel);

    string searchPattern = args.Length > 0 ? args[0] : "*.xml";
    Queue<string> pending = new Queue<string>(Directory.GetFiles(Config.DataFolder, searchPattern));

    while (pending.Count > 0)
    {
        // Phase 1: tag up to BatchSize files and collect their <text> elements in one document.
        XmlDocument batchDoc = null;
        int batchCount = 0;
        while (pending.Count > 0 && batchCount < Config.BatchSize)
        {
            string inputFile = pending.Dequeue();
            if (File.Exists(MakeOutputFileName(inputFile)))
            {
                Console.WriteLine("Ze obdelano: {0}.", inputFile);
                continue;
            }
            if (File.Exists(MakeOutputFileName(inputFile) + ".locked"))
            {
                Console.WriteLine("Zaklenjeno: {0}.", inputFile);
                continue;
            }

            XmlDocument taggedDoc = TagInputFile(inputFile, tagger);
            if (taggedDoc == null)
            {
                // Empty documents are ignored and do not count towards the batch size.
                continue;
            }

            if (batchDoc != null)
            {
                // Copy the <text> element of this file into the batch document.
                XmlDocumentFragment fragment = batchDoc.CreateDocumentFragment();
                fragment.InnerXml = taggedDoc.SelectSingleNode("//text").OuterXml;
                batchDoc.DocumentElement.AppendChild(fragment);
            }
            else
            {
                // The first tagged file becomes the batch document itself.
                batchDoc = taggedDoc;
            }
            batchCount++;
        }

        // Nothing in this batch needed processing.
        if (batchDoc == null)
        {
            continue;
        }

        // Phase 2: save the batch to a uniquely named temp file and run the parser on it.
        Console.WriteLine("Pripravljam datoteke za razclenjevanje...");
        string tmpBase = Config.TmpFolder + "\\" + Guid.NewGuid().ToString("N");
        string parserInput = new FileInfo(tmpBase + ".tmp").FullName;
        string parserOutput = new FileInfo(tmpBase + ".out.tmp").FullName;
        XmlWriterSettings writerSettings = new XmlWriterSettings { Encoding = Encoding.UTF8, Indent = true };
        using (XmlWriter batchWriter = XmlWriter.Create(parserInput, writerSettings))
        {
            batchDoc.Save(batchWriter);
        }

        Console.WriteLine("Zaganjam razclenjevalnik...");
        Parser.Parse(parserInput, parserOutput);

        if (!File.Exists(parserOutput))
        {
            // No parser output: lock every file of this batch and move on to the next batch.
            Console.WriteLine("*** Prislo je do napake pri razclenjevanju. Nadaljujem z obdelavo.");
            List<XmlElement> failedTexts = batchDoc.SelectNodes("//text").Cast<XmlElement>().ToList();
            foreach (XmlElement failedText in failedTexts)
            {
                LockFile(failedText.Attributes["fileName"].Value);
            }
            continue;
        }

        // Phase 3: split the parsed batch back into one output file per input file.
        XmlDocument parsedDoc = new XmlDocument();
        parsedDoc.Load(parserOutput);

        Console.WriteLine("Pisem izhodne datoteke...");
        foreach (XmlNode textNode in parsedDoc.SelectNodes("//text"))
        {
            // The fileName attribute was only a routing marker; strip it before output.
            string sourceFile = textNode.Attributes["fileName"].Value;
            ((XmlElement)textNode).RemoveAttribute("fileName");

            Console.WriteLine("Datoteka: {0}...", sourceFile);
            XmlDocument sourceDoc = LoadSanitizedXml(sourceFile);

            // Wrap the parsed text in a TEI root and put the original input XML in front of it.
            XmlDocument outputDoc = new XmlDocument();
            outputDoc.LoadXml("<TEI>" + textNode.OuterXml + "</TEI>");
            XmlDocumentFragment sourcePart = outputDoc.CreateDocumentFragment();
            sourcePart.InnerXml = sourceDoc.OuterXml;
            outputDoc.DocumentElement.PrependChild(sourcePart);

            // Look up the blog meta-data; files of unknown blogs produce no output file.
            string blogKey = outputDoc.SelectSingleNode("//header/blog").InnerText;
            if (!mBlogMetaData.ContainsKey(blogKey))
            {
                Console.WriteLine("*** Ne najdem podatkov o blogu \"{0}\".", blogKey);
                continue;
            }

            Console.WriteLine("Vstavljam meta-podatke o blogu...");
            BlogMetaData blogInfo = mBlogMetaData[blogKey];
            XmlNode header = outputDoc.SelectSingleNode("//header");
            // Note: the author's e-mail (mAuthorEMail) is not written to the header.
            header.AppendChild(outputDoc.CreateElement("blogSpletniNaslov")).InnerText = blogInfo.mBlogUrl;
            header.AppendChild(outputDoc.CreateElement("blogNaslov")).InnerText = blogInfo.mBlogTitle;
            header.AppendChild(outputDoc.CreateElement("blogNaslovKratek")).InnerText = blogInfo.mBlogTitleShort;
            header.AppendChild(outputDoc.CreateElement("avtorSpol")).InnerText = blogInfo.mAuthorGender;
            header.AppendChild(outputDoc.CreateElement("avtorStarost")).InnerText = blogInfo.mAuthorAge;
            header.AppendChild(outputDoc.CreateElement("avtorRegija")).InnerText = blogInfo.mAuthorLocation;
            header.AppendChild(outputDoc.CreateElement("avtorIzobrazba")).InnerText = blogInfo.mAuthorEducation;

            Console.WriteLine("Zapisujem rezultate...");
            using (XmlWriter resultWriter = XmlWriter.Create(MakeOutputFileName(sourceFile), writerSettings))
            {
                outputDoc.Save(resultWriter);
            }
        }
    }

    // Remove all parser input/output temp files.
    foreach (string tmpFile in Directory.GetFiles(Config.TmpFolder, "*.tmp"))
    {
        File.Delete(tmpFile);
    }

    Console.WriteLine("Koncano.");
}

/// <summary>
/// Reads an input file, removes the stray "// ]]>" and "//--><!]]>" sequences from its
/// content, and parses the remainder as XML.
/// </summary>
/// <param name="fileName">Path of the input XML file.</param>
/// <returns>The parsed document.</returns>
static XmlDocument LoadSanitizedXml(string fileName)
{
    string content = File.ReadAllText(fileName).Replace("// ]]>", "").Replace("//--><!]]>", "");
    XmlDocument document = new XmlDocument();
    document.LoadXml(content);
    return document;
}

/// <summary>
/// Tags the text found in the "besedilo" element of one input file and returns the tagged
/// corpus as an XML document whose "text" element carries the source path in a
/// "fileName" attribute.
/// </summary>
/// <param name="fileName">Path of the input XML file.</param>
/// <param name="posTagger">Tagger used to annotate the tokenized text.</param>
/// <returns>The tagged document, or null when the file contains only whitespace text.</returns>
static XmlDocument TagInputFile(string fileName, PartOfSpeechTagger posTagger)
{
    Console.WriteLine("Datoteka: {0}...", fileName);
    XmlDocument sourceDoc = LoadSanitizedXml(fileName);
    string text = sourceDoc.SelectSingleNode("//besedilo").InnerText;
    if (text.Trim().Length == 0)
    {
        Console.WriteLine("*** Datoteka ne vsebuje besedila.");
        return null;
    }

    Corpus corpus = new Corpus();
    corpus.LoadFromTextSsjTokenizer(text);

    Console.WriteLine("Oznacujem besedilo...");
    posTagger.Tag(corpus);

    // Strip the TEI default namespace declaration so that the un-prefixed XPath
    // queries ("//text") match the elements.
    string taggedXml = corpus.ToString("XML-MI").Replace("xmlns=\"http://www.tei-c.org/ns/1.0\"", "");
    XmlDocument taggedDoc = new XmlDocument();
    taggedDoc.LoadXml(taggedXml);

    // Remember which input file this text came from; the marker survives the parser run.
    XmlElement textElement = (XmlElement)taggedDoc.SelectSingleNode("//text");
    textElement.SetAttribute("fileName", fileName);
    return taggedDoc;
}