Beispiel #1
0
    /// <summary>
    /// Tokenizes <paramref name="text"/> into a fresh corpus, runs the global tagger over it
    /// and returns the corpus serialized in the requested format.
    /// </summary>
    /// <param name="text">Raw text passed to <c>Corpus.LoadFromTextSsjTokenizer</c>.</param>
    /// <param name="xmlOutput">
    /// <c>true</c> serializes the corpus with the "XML-MI" format, <c>false</c> with "TBL".
    /// </param>
    /// <returns>The string produced by <c>Corpus.ToString</c> for the selected format.</returns>
    public string Tag(string text, bool xmlOutput)
    {
        // Block the caller, polling every 100 ms, until the global readiness flag is set.
        for (; !Global.mReady;)
        {
            Thread.Sleep(100);
        }

        Corpus taggedCorpus = new Corpus();
        taggedCorpus.LoadFromTextSsjTokenizer(text);

        // The tagger reports these counters through out parameters; they are not used here.
        int correctLemmas;
        int correctLemmasLowercase;
        int totalLemmaWords;
        Global.mPosTagger.Tag(taggedCorpus, out correctLemmas, out correctLemmasLowercase, out totalLemmaWords, /*xmlMode=*/ false);

        string format = xmlOutput ? "XML-MI" : "TBL";
        return taggedCorpus.ToString(format);
    }
Beispiel #2
0
 /// <summary>
 /// Entry point of the meta-tagger. Loads the outputs of two taggers, and for every item on
 /// which their tags differ asks a decision tree which one to keep: unless the tree answers 1,
 /// the lemma and tag from the second tagger replace those of the first. The merged corpus is
 /// then written to the output file in the "XML-MI" format.
 /// </summary>
 /// <param name="args">
 /// Command-line arguments. With fewer than five arguments only the help text is printed;
 /// otherwise they are interpreted by <c>ParseParams</c>.
 /// </param>
 static void Main(string[] args)
 {
     try
     {
         if (args.Length < 5)
         {
             OutputHelp();
             return;
         }
         string tags_1_file_name = null, tags_2_file_name = null, tbl_file_name = null, tree_file_name = null,
                output_file_name = null;
         if (!ParseParams(args, ref m_verbose, ref m_consistency_check, ref tags_1_file_name, ref tags_2_file_name,
                          ref tbl_file_name, ref tree_file_name, ref output_file_name))
         {
             // Parsing was rejected; nothing further is done here.
             return;
         }
         Verbose("Nalagam izhod prvega označevalnika ...\r\n");
         Corpus corpus_1 = new Corpus();
         corpus_1.LoadFromXml(tags_1_file_name, /*tag_len=*/ -1);
         Verbose("Nalagam izhod drugega označevalnika ...\r\n");
         Corpus corpus_2 = new Corpus();
         corpus_2.LoadFromXml(tags_2_file_name, /*tag_len=*/ -1);
         // The two corpora are indexed in parallel below, so their lengths must agree in every
         // mode. Previously this was skipped when the consistency check was on, which made the
         // word comparison index past the end of a shorter second corpus.
         if (corpus_1.TaggedWords.Count != corpus_2.TaggedWords.Count)
         {
             Console.WriteLine("*** Napaka! Dolžini vhodnih korpusov se ne ujemata.");
             return;
         }
         if (m_consistency_check)
         {
             Verbose("Preverjam ujemanje besed v vhodnih korpusih ...\r\n");
             for (int i = 0; i < corpus_1.TaggedWords.Count; i++)
             {
                 // Words are compared case-insensitively via their lower-cased forms.
                 if (corpus_1.TaggedWords[i].Word.ToLower() != corpus_2.TaggedWords[i].Word.ToLower())
                 {
                     Console.WriteLine("*** Napaka! Besede v vhodnih korpusih se ne ujemajo.");
                     return;
                 }
             }
         }
         Verbose("Nalagam tabelo oznak ...\r\n");
         MetaTaggerData.LoadAttributes(tbl_file_name);
         Verbose("Nalagam odločitveno drevo ...\r\n");
         Tree tree = new Tree(tree_file_name);
         Verbose("Označujem besedilo ...\r\n");
         MetaTaggerData.LoadTestData(corpus_1, corpus_2);
         for (int i = 0; i < MetaTaggerData.Items.Count; i++)
         {
             // Only items on which the two taggers disagree need a decision.
             if (MetaTaggerData.Items[i].Tag1 == MetaTaggerData.Items[i].Tag2)
             {
                 continue;
             }
             ArrayList <KeyDat <string, string> > test_example = MetaTaggerData.CreateExample(i);
             // Any class other than 1 means: take the second tagger's lemma and tag.
             if (tree.Classify(test_example) != 1)
             {
                 corpus_1.TaggedWords[i].Lemma = MetaTaggerData.Items[i].Lemma2;
                 corpus_1.TaggedWords[i].Tag   = MetaTaggerData.Items[i].Tag2;
             }
         }
         Verbose("Pišem izhodno datoteko ...\r\n");
         // using guarantees the file handle is released even if serialization or writing throws.
         using (StreamWriter writer = new StreamWriter(output_file_name))
         {
             writer.Write(corpus_1.ToString("XML-MI"));
         }
         Verbose("Končano.\r\n");
     }
     catch (Exception exception)
     {
         Console.WriteLine("*** Nepričakovana napaka. Podrobnosti: {0}\r\n{1}", exception, exception.StackTrace);
     }
 }
Beispiel #3
0
        /// <summary>
        /// Entry point of the blog processing pipeline. Input files from <c>Config.DataFolder</c>
        /// are processed in batches of up to <c>Config.BatchSize</c> documents: each document is
        /// tokenized and tagged, the batch is merged into one XML document and handed to the
        /// parser, and the parser output is split back into one output file per input file,
        /// enriched with the original XML and the blog meta-data.
        /// </summary>
        /// <param name="args">
        /// Optional; when present, the first argument replaces the default "*.xml" file pattern.
        /// </param>
        static void Main(string[] args)
        {
            Console.WriteLine("Nalagam meta-podatke o blogih...");
            LoadBlogMetaData();
            Console.WriteLine("Nalagam oznacevalnik...");
            PartOfSpeechTagger posTagger = new PartOfSpeechTagger(Config.PosTaggerModel, Config.LemmatizerModel);
            string             pattern   = "*.xml";

            if (args.Length > 0)
            {
                pattern = args[0];
            }
            Queue <string> fileNames = new Queue <string>(Directory.GetFiles(Config.DataFolder, pattern));

            // The same writer settings are used for the parser input and for every output file.
            XmlWriterSettings xmlSettings = new XmlWriterSettings();
            xmlSettings.Encoding = Encoding.UTF8;
            xmlSettings.Indent   = true;

            while (fileNames.Count > 0)
            {
                int         n       = 0;
                XmlDocument fullDoc = null;
                // Collect up to BatchSize tagged documents into fullDoc.
                while (fileNames.Count > 0 && n < Config.BatchSize)
                {
                    string fileName = fileNames.Dequeue();
                    if (File.Exists(MakeOutputFileName(fileName)))
                    {
                        Console.WriteLine("Ze obdelano: {0}.", fileName);
                        continue;
                    }
                    // A ".locked" marker is created below when the parser fails on a batch.
                    if (File.Exists(MakeOutputFileName(fileName) + ".locked"))
                    {
                        Console.WriteLine("Zaklenjeno: {0}.", fileName);
                        continue;
                    }
                    // load text
                    Console.WriteLine("Datoteka: {0}...", fileName);
                    XmlDocument tmpDoc   = LoadCleanedXml(fileName);
                    XmlNode     textNode = tmpDoc.SelectSingleNode("//besedilo");
                    // A missing <besedilo> element is handled like an empty one instead of
                    // aborting the whole run with a NullReferenceException.
                    string text = textNode == null ? "" : textNode.InnerText;
                    if (text.Trim() == "") // *** empty documents are ignored
                    {
                        Console.WriteLine("*** Datoteka ne vsebuje besedila.");
                        continue;
                    }
                    Corpus corpus = new Corpus();
                    corpus.LoadFromTextSsjTokenizer(text);
                    // tag text
                    Console.WriteLine("Oznacujem besedilo...");
                    posTagger.Tag(corpus);
                    XmlDocument doc = new XmlDocument();
                    // The default TEI namespace is stripped so that the prefix-less XPath
                    // queries below ("//text") match the elements.
                    doc.LoadXml(corpus.ToString("XML-MI").Replace("xmlns=\"http://www.tei-c.org/ns/1.0\"", ""));
                    // Remember which input file this <text> came from; read back after parsing.
                    ((XmlElement)doc.SelectSingleNode("//text")).SetAttribute("fileName", fileName);
                    // append text to fullDoc
                    if (fullDoc == null)
                    {
                        fullDoc = doc;
                    }
                    else
                    {
                        XmlDocumentFragment xmlFrag = fullDoc.CreateDocumentFragment();
                        xmlFrag.InnerXml = doc.SelectSingleNode("//text").OuterXml;
                        fullDoc.DocumentElement.AppendChild(xmlFrag);
                    }
                    n++;
                }
                // nothing to do?
                if (fullDoc == null)
                {
                    continue;
                }
                // save tagged text for parsing
                Console.WriteLine("Pripravljam datoteke za razclenjevanje...");
                string tmpBaseName    = Guid.NewGuid().ToString("N");
                string tmpFileNameIn  = Path.GetFullPath(Path.Combine(Config.TmpFolder, tmpBaseName + ".tmp"));
                string tmpFileNameOut = Path.GetFullPath(Path.Combine(Config.TmpFolder, tmpBaseName + ".out.tmp"));
                using (XmlWriter w = XmlWriter.Create(tmpFileNameIn, xmlSettings))
                {
                    fullDoc.Save(w);
                }
                // parse text
                Console.WriteLine("Zaganjam razclenjevalnik...");
                Parser.Parse(tmpFileNameIn, tmpFileNameOut);
                // load results
                if (!File.Exists(tmpFileNameOut))
                {
                    // The parser produced no output: lock every file of this batch and move on.
                    Console.WriteLine("*** Prislo je do napake pri razclenjevanju. Nadaljujem z obdelavo.");
                    fullDoc.SelectNodes("//text").Cast <XmlElement>().ToList().ForEach(x => LockFile(x.Attributes["fileName"].Value));
                    continue;
                }
                fullDoc = new XmlDocument();
                fullDoc.Load(tmpFileNameOut);
                // create output files
                Console.WriteLine("Pisem izhodne datoteke...");
                foreach (XmlNode txtNode in fullDoc.SelectNodes("//text"))
                {
                    string fileName = txtNode.Attributes["fileName"].Value;
                    ((XmlElement)txtNode).RemoveAttribute("fileName");
                    Console.WriteLine("Datoteka: {0}...", fileName);
                    XmlDocument tmpDoc = LoadCleanedXml(fileName);
                    // insert input XML into TEI-XML
                    XmlDocument doc = new XmlDocument();
                    doc.LoadXml("<TEI>" + txtNode.OuterXml + "</TEI>");
                    XmlDocumentFragment docPart = doc.CreateDocumentFragment();
                    docPart.InnerXml = tmpDoc.OuterXml;
                    doc.DocumentElement.PrependChild(docPart);
                    // insert blog meta-data
                    XmlNode blogNode = doc.SelectSingleNode("//header/blog");
                    // A document without a //header/blog element is reported the same way as
                    // one whose blog key is unknown, rather than crashing on a null node.
                    string key = blogNode == null ? null : blogNode.InnerText;
                    if (key == null || !mBlogMetaData.ContainsKey(key))
                    {
                        Console.WriteLine("*** Ne najdem podatkov o blogu \"{0}\".", key);
                        continue;
                    }
                    Console.WriteLine("Vstavljam meta-podatke o blogu...");
                    BlogMetaData metaData = mBlogMetaData[key];
                    XmlNode      node     = doc.SelectSingleNode("//header");
                    node.AppendChild(doc.CreateElement("blogSpletniNaslov")).InnerText = metaData.mBlogUrl;
                    node.AppendChild(doc.CreateElement("blogNaslov")).InnerText        = metaData.mBlogTitle;
                    node.AppendChild(doc.CreateElement("blogNaslovKratek")).InnerText  = metaData.mBlogTitleShort;
                    node.AppendChild(doc.CreateElement("avtorSpol")).InnerText         = metaData.mAuthorGender;
                    node.AppendChild(doc.CreateElement("avtorStarost")).InnerText      = metaData.mAuthorAge;
                    node.AppendChild(doc.CreateElement("avtorRegija")).InnerText       = metaData.mAuthorLocation;
                    node.AppendChild(doc.CreateElement("avtorIzobrazba")).InnerText    = metaData.mAuthorEducation;
                    // write results
                    Console.WriteLine("Zapisujem rezultate...");
                    using (XmlWriter w = XmlWriter.Create(MakeOutputFileName(fileName), xmlSettings))
                    {
                        doc.Save(w);
                    }
                }
            }
            // purge temp folder
            foreach (string tmpFile in Directory.GetFiles(Config.TmpFolder, "*.tmp"))
            {
                File.Delete(tmpFile);
            }
            // all done
            Console.WriteLine("Koncano.");
        }

        /// <summary>
        /// Reads an input file, removes the literal marker strings "// ]]&gt;" and
        /// "//--&gt;&lt;!]]&gt;" from its text and parses the result as XML.
        /// </summary>
        /// <param name="fileName">Path of the input XML file.</param>
        /// <returns>The parsed document.</returns>
        private static XmlDocument LoadCleanedXml(string fileName)
        {
            XmlDocument cleanedDoc = new XmlDocument();
            string      xml        = File.ReadAllText(fileName);
            // NOTE(review): presumably these markers are script/CDATA leftovers that would
            // otherwise make the file ill-formed - confirm against the input data.
            xml = xml.Replace("// ]]>", "").Replace("//--><!]]>", "");
            cleanedDoc.LoadXml(xml);
            return cleanedDoc;
        }