示例#1
0
        /// <summary>
        /// Replaces the current tagged words with the tokenized text of every
        /// <c>&lt;p&gt;</c> element found in the given XML file. Each paragraph's text is
        /// tokenized through a temporary <c>Corpus</c> (<c>LoadFromTextSsjTokenizer</c>);
        /// the end-of-paragraph flag is cleared on every appended word and then set on the
        /// last word of the paragraph. If a <c>teiHeader</c> element was seen,
        /// <c>ReadTeiHeader</c> is invoked on the same file once the reader is closed.
        /// </summary>
        /// <param name="fileName">Path of the XML file to read. A null value or a name rejected by
        /// <c>Utils.VerifyFileNameOpen</c> is reported through <c>Utils.ThrowException</c>.</param>
        public void LoadFromGigaFidaFile(string fileName)
        {
            Utils.ThrowException(fileName == null ? new ArgumentNullException("fileName") : null);
            Utils.ThrowException(!Utils.VerifyFileNameOpen(fileName) ? new ArgumentValueException("fileName") : null);
            XmlTextReader xmlReader = null;

            try
            {
                bool hasHeader = false;
                mTaggedWords.Clear();
                mTeiHeader = null;
                xmlReader  = new XmlTextReader(new FileStream(fileName, FileMode.Open));
                while (xmlReader.Read())
                {
                    if (xmlReader.NodeType != XmlNodeType.Element)
                    {
                        continue;
                    }
                    if (xmlReader.Name == "teiHeader") // header
                    {
                        hasHeader = true;
                        // presumably advances the reader past the header subtree -- confirm against Utils.XmlSkip
                        Utils.XmlSkip(xmlReader, "teiHeader");
                    }
                    else if (xmlReader.Name == "p") // paragraph
                    {
                        ThreadHandler.AbortCheckpoint();                                         // TODO: do this at various appropriate places
                        if (xmlReader.IsEmptyElement)
                        {
                            // A self-closing <p/> has no content node. Advancing the reader here would
                            // consume the node that follows it (possibly the next <p> start tag), and
                            // that paragraph's text would then be skipped by the outer loop.
                            continue;
                        }
                        xmlReader.Read(); // move to the first node inside the paragraph; its Value is tokenized below
                        Corpus aux = new Corpus();
                        aux.LoadFromTextSsjTokenizer(xmlReader.Value);
                        if (aux.TaggedWords.Count > 0)
                        {
                            foreach (TaggedWord word in aux.TaggedWords)
                            {
                                // the tokenizer's own paragraph marks are dropped; only the last word of this <p> keeps one
                                word.MoreInfo.RemoveEndOfParagraphFlag();
                                mTaggedWords.Add(word);
                            }
                            aux.TaggedWords.Last.MoreInfo.SetEndOfParagraphFlag();
                        }
                    }
                }
                // Close before ReadTeiHeader: it receives the file name, so it presumably opens the file again.
                xmlReader.Close();
                if (hasHeader)
                {
                    ReadTeiHeader(fileName);
                }
            }
            catch
            {
                // Best-effort cleanup; the original exception is what the caller must see.
                if (xmlReader != null)
                {
                    try { xmlReader.Close(); } catch { }
                }
                throw;
            }
        }
示例#2
0
        /// <summary>
        /// Command-line entry point. Parses the arguments, loads the tagger (and lemmatizer) models,
        /// then tags every file in the input folder that matches the search pattern and writes the
        /// tagged corpus to the output file or folder. When a file was loaded through
        /// <c>LoadFromXmlFile</c> with at least one tagged word, the tags present before tagging
        /// are used as a gold standard and accuracy figures are logged.
        /// Any exception escaping the processing is logged rather than propagated.
        /// </summary>
        /// <param name="args">Command-line arguments; with fewer than four, only the help text is printed.</param>
        private static void Main(string[] args)
        {
            // initialize logger
            mLogger.LocalLevel      = Logger.Level.Debug;
            mLogger.LocalOutputType = Logger.OutputType.Custom;
            Logger.CustomOutput     = new Logger.CustomOutputDelegate(delegate(string loggerName, Logger.Level level, string funcName, Exception e,
                                                                               string message, object[] msgArgs) { Console.WriteLine(message, msgArgs); });
            // NOTE(review): the #if matching the directive below is not visible in this part of the file.
#endif
            try
            {
                if (args.Length < 4)
                {
                    OutputHelp();
                    return;
                }
                string inputFolder = null, searchPattern = null, taggerModelFile = null, lemmatizerModelFile = null, outputFileOrFolder = null;
                bool   ssjTokenizer = false, searchSubfolders = false, verbose = false, overwrite = false;
                if (!ParseParams(args, ref verbose, ref inputFolder, ref searchPattern, ref taggerModelFile, ref lemmatizerModelFile,
                                 ref outputFileOrFolder, ref ssjTokenizer, ref searchSubfolders, ref overwrite))
                {
                    return;
                }
                if (!verbose)
                {
                    mLogger.LocalLevel = Logger.Level.Info;
                    mLogger.LocalProgressOutputType = Logger.ProgressOutputType.Off;
                }
                PartOfSpeechTagger tagger = new PartOfSpeechTagger();
                tagger.LoadModels(taggerModelFile, lemmatizerModelFile);
                mLogger.Debug(null, "Mapa z vhodnimi datotekami: {0}", inputFolder);
                mLogger.Debug(null, "Iskalni vzorec: {0}", searchPattern);
                foreach (FileInfo file in new DirectoryInfo(inputFolder).GetFiles(searchPattern,
                                                                                  searchSubfolders ? SearchOption.AllDirectories : SearchOption.TopDirectoryOnly))
                {
                    string outputFileName = outputFileOrFolder;
                    if (searchPattern.Contains("*") || searchPattern.Contains("?")) // search pattern contains wildcards thus output is a folder
                    {
                        outputFileName = outputFileOrFolder.TrimEnd('\\') + "\\" +
                                         file.Name.Substring(0, file.Name.Length - file.Extension.Length) + ".out" + file.Extension;
                        DirectoryInfo dirInfo = new FileInfo(outputFileName).Directory;
                        if (!dirInfo.Exists)
                        {
                            dirInfo.Create();
                        }
                    }
                    if (File.Exists(outputFileName) && !overwrite)
                    {
                        mLogger.Debug(null, "Datoteka {0} že obstaja. Pripadajoča vhodna datoteka ni bila ponovno označena.", outputFileName);
                        continue;
                    }
                    mLogger.Debug(null, "Nalagam {0} ...", file.FullName);
                    Corpus corpus;
                    bool   xmlMode = false;
                    try
                    {
                        // First try the tagged-XML loader; if it yields no words, try the paragraph-based loader.
                        corpus = new Corpus();
                        corpus.LoadFromXmlFile(file.FullName, /*tagLen=*/ -1);
                        if (corpus.TaggedWords.Count > 0)
                        {
                            xmlMode = true;
                        }
                        else
                        {
                            corpus.LoadFromGigaFidaFile(file.FullName);
                        }
                    }
                    catch (ThreadHandler.AbortedByUserException)
                    {
                        // a user abort must not be mistaken for a malformed input file
                        throw;
                    }
                    catch (Exception e)
                    {
                        // Neither XML loader succeeded: fall back to treating the file as plain text.
                        corpus = new Corpus();
                        string content = File.ReadAllText(file.FullName);
                        if (IsXmlTei(content))
                        {
                            mLogger.Debug(null, "*** Opozorilo: Datoteka z besedilom vsebuje značke XML-TEI, vendar nima pravilne oblike. Podrobnosti: {0}", e.Message);
                        }
                        if (ssjTokenizer)
                        {
                            corpus.LoadFromTextSsjTokenizer(content);
                        }
                        else
                        {
                            corpus.LoadFromText(content);
                        }
                    }
                    // Snapshot the tags present before tagging; they serve as the gold standard in XML mode.
                    string[] goldTags = new string[corpus.TaggedWords.Count];
                    for (int i = 0; i < corpus.TaggedWords.Count; i++)
                    {
                        goldTags[i] = corpus.TaggedWords[i].Tag;
                    }
                    int lemmaCorrect, lemmaCorrectLowercase, lemmaWords;
                    tagger.Tag(corpus, out lemmaCorrect, out lemmaCorrectLowercase, out lemmaWords, xmlMode);
                    mLogger.Debug(null, "Zapisujem označeno besedilo v datoteko {0} ...", outputFileName);
                    // 'using' guarantees the output file is closed even if serialization or writing throws
                    using (StreamWriter writer = new StreamWriter(outputFileName))
                    {
                        writer.Write(corpus.ToString(xmlMode || ssjTokenizer ? "XML-MI" : "XML"));
                    }
                    mLogger.Debug(null, "Končano.");
                    if (xmlMode)
                    {
                        ReportTaggingAccuracy(corpus, goldTags, tagger, lemmatizerModelFile != null,
                                              lemmaCorrect, lemmaCorrectLowercase, lemmaWords);
                    }
                }
            }
            catch (Exception e)
            {
                mLogger.Info(null, "");
                mLogger.Info(null, "*** Nepričakovana napaka. Podrobnosti: {0}", e);
            }
        }

        /// <summary>
        /// Compares the tags currently stored in <paramref name="corpus"/> with <paramref name="goldTags"/>
        /// and logs accuracy figures: exact-tag and first-character (POS) matches, split by
        /// known/unknown words, with and without tokens matched by
        /// <c>PartOfSpeechTagger.mNonWordRegex</c>, plus lemmatization and end-of-sentence figures.
        /// </summary>
        /// <param name="corpus">The corpus after tagging.</param>
        /// <param name="goldTags">Tags recorded before tagging, one per word in corpus order.</param>
        /// <param name="tagger">Tagger used to decide whether a word is known.</param>
        /// <param name="reportLemmas">Whether the two lemmatization lines are logged.</param>
        /// <param name="lemmaCorrect">Lemma counter returned by the tagger.</param>
        /// <param name="lemmaCorrectLowercase">Lowercase lemma counter returned by the tagger.</param>
        /// <param name="lemmaWords">Lemma denominator returned by the tagger.</param>
        private static void ReportTaggingAccuracy(Corpus corpus, string[] goldTags, PartOfSpeechTagger tagger, bool reportLemmas,
                                                  int lemmaCorrect, int lemmaCorrectLowercase, int lemmaWords)
        {
            int knownWords = 0,          knownWordsCorrect = 0,          knownWordsPosCorrect = 0;
            int unknownWords = 0,        unknownWordsCorrect = 0,        unknownWordsPosCorrect = 0;
            int knownWordsNoPunct = 0,   knownWordsCorrectNoPunct = 0,   knownWordsPosCorrectNoPunct = 0;
            int unknownWordsNoPunct = 0, unknownWordsCorrectNoPunct = 0, unknownWordsPosCorrectNoPunct = 0;
            int eosCount = 0,            eosCorrect = 0;
            for (int i = 0; i < corpus.TaggedWords.Count; i++)
            {
                string wordLower  = corpus.TaggedWords[i].WordLower;
                string tag        = corpus.TaggedWords[i].Tag;
                string goldTag    = goldTags[i];
                bool   isKnown    = tagger.IsKnownWord(wordLower);
                bool   isNonWord  = PartOfSpeechTagger.mNonWordRegex.Match(corpus.TaggedWords[i].Word).Success;
                bool   tagCorrect = tag == goldTag;
                // POS match = same first tag character; empty or missing tags can never match
                // (and must not be indexed, which would abort the whole run).
                bool   posCorrect = !string.IsNullOrEmpty(goldTag) && !string.IsNullOrEmpty(tag) && tag[0] == goldTag[0];
                if (isKnown)
                {
                    knownWords++;
                    if (tagCorrect) { knownWordsCorrect++; }
                    if (posCorrect) { knownWordsPosCorrect++; }
                    if (!isNonWord)
                    {
                        knownWordsNoPunct++;
                        if (tagCorrect) { knownWordsCorrectNoPunct++; }
                        if (posCorrect) { knownWordsPosCorrectNoPunct++; }
                    }
                }
                else
                {
                    unknownWords++;
                    if (tagCorrect) { unknownWordsCorrect++; }
                    if (posCorrect) { unknownWordsPosCorrect++; }
                    if (!isNonWord)
                    {
                        unknownWordsNoPunct++;
                        if (tagCorrect) { unknownWordsCorrectNoPunct++; }
                        if (posCorrect) { unknownWordsPosCorrectNoPunct++; }
                    }
                }
                if (corpus.TaggedWords[i].MoreInfo.EndOfSentence)
                {
                    eosCount++;
                    if (tag != null && tag.EndsWith("<eos>"))
                    {
                        eosCorrect++;
                    }
                }
            }
            int allWords                  = knownWords + unknownWords;
            int allWordsCorrect           = knownWordsCorrect + unknownWordsCorrect;
            int allWordsPosCorrect        = knownWordsPosCorrect + unknownWordsPosCorrect;
            int allWordsNoPunct           = knownWordsNoPunct + unknownWordsNoPunct;
            int allWordsCorrectNoPunct    = knownWordsCorrectNoPunct + unknownWordsCorrectNoPunct;
            int allWordsPosCorrectNoPunct = knownWordsPosCorrectNoPunct + unknownWordsPosCorrectNoPunct;
            LogAccuracyLine("Točnost na znanih besedah: ................... {2:0.00}% ({0} / {1})", knownWordsCorrect, knownWords);
            LogAccuracyLine("Točnost na neznanih besedah: ................. {2:0.00}% ({0} / {1})", unknownWordsCorrect, unknownWords);
            LogAccuracyLine("Skupna točnost: .............................. {2:0.00}% ({0} / {1})", allWordsCorrect, allWords);
            LogAccuracyLine("Točnost na znanih besedah (POS): ............. {2:0.00}% ({0} / {1})", knownWordsPosCorrect, knownWords);
            LogAccuracyLine("Točnost na neznanih besedah (POS): ........... {2:0.00}% ({0} / {1})", unknownWordsPosCorrect, unknownWords);
            LogAccuracyLine("Skupna točnost (POS): ........................ {2:0.00}% ({0} / {1})", allWordsPosCorrect, allWords);
            LogAccuracyLine("Točnost na znanih besedah (brez ločil): ...... {2:0.00}% ({0} / {1})", knownWordsCorrectNoPunct, knownWordsNoPunct);
            LogAccuracyLine("Točnost na neznanih besedah (brez ločil): .... {2:0.00}% ({0} / {1})", unknownWordsCorrectNoPunct, unknownWordsNoPunct);
            LogAccuracyLine("Skupna točnost (brez ločil): ................. {2:0.00}% ({0} / {1})", allWordsCorrectNoPunct, allWordsNoPunct);
            LogAccuracyLine("Točnost na znanih besedah (POS, brez ločil):   {2:0.00}% ({0} / {1})", knownWordsPosCorrectNoPunct, knownWordsNoPunct);
            LogAccuracyLine("Točnost na neznanih besedah (POS, brez ločil): {2:0.00}% ({0} / {1})", unknownWordsPosCorrectNoPunct, unknownWordsNoPunct);
            LogAccuracyLine("Skupna točnost (POS, brez ločil): ............ {2:0.00}% ({0} / {1})", allWordsPosCorrectNoPunct, allWordsNoPunct);
            if (reportLemmas)
            {
                LogAccuracyLine("Točnost lematizacije (brez ločil): ........... {2:0.00}% ({0} / {1})", lemmaCorrect, lemmaWords);
                LogAccuracyLine("Točnost lematizacije (male črke, brez ločil):  {2:0.00}% ({0} / {1})", lemmaCorrectLowercase, lemmaWords);
            }
            LogAccuracyLine("Točnost detekcije konca stavka: .............. {2:0.00}% ({0} / {1})", eosCorrect, eosCount);
        }

        /// <summary>
        /// Logs one accuracy line at Info level. The format receives the correct count as {0},
        /// the total as {1} and the percentage as {2}. The percentage is computed in floating
        /// point, so a zero total yields NaN rather than throwing.
        /// </summary>
        private static void LogAccuracyLine(string format, int correct, int total)
        {
            mLogger.Info(null, format, correct, total, (double)correct / (double)total * 100.0);
        }