/// <summary>
/// Entry point of the tagging tool. Parses the command line, loads the tagger and
/// lemmatizer models, tags every file in the input folder that matches the search
/// pattern and writes the tagged text to the output file or folder. When an input
/// file loads as XML with tagged words, the tags it carried before tagging are used
/// as the reference and accuracy statistics are logged.
/// </summary>
/// <param name="args">Command-line arguments; with fewer than four the help text is printed.</param>
private static void Main(string[] args)
{
    // initialize logger
    mLogger.LocalLevel = Logger.Level.Debug;
    mLogger.LocalOutputType = Logger.OutputType.Custom;
    Logger.CustomOutput = new Logger.CustomOutputDelegate(delegate(string loggerName, Logger.Level level, string funcName, Exception e, string message, object[] msgArgs)
    {
        Console.WriteLine(message, msgArgs);
    });
    // NOTE(review): the matching #if for this directive is not visible in this part of the file;
    // keep the logger initialization above it and everything else below it.
#endif
    try
    {
        if (args.Length < 4)
        {
            OutputHelp();
            return;
        }
        string inputFolder = null, searchPattern = null, taggerModelFile = null, lemmatizerModelFile = null, outputFileOrFolder = null;
        bool ssjTokenizer = false, searchSubfolders = false, verbose = false, overwrite = false;
        if (!ParseParams(args, ref verbose, ref inputFolder, ref searchPattern, ref taggerModelFile, ref lemmatizerModelFile, ref outputFileOrFolder, ref ssjTokenizer, ref searchSubfolders, ref overwrite))
        {
            return;
        }
        if (!verbose)
        {
            mLogger.LocalLevel = Logger.Level.Info;
            mLogger.LocalProgressOutputType = Logger.ProgressOutputType.Off;
        }
        PartOfSpeechTagger tagger = new PartOfSpeechTagger();
        tagger.LoadModels(taggerModelFile, lemmatizerModelFile);
        mLogger.Debug(null, "Mapa z vhodnimi datotekami: {0}", inputFolder);
        mLogger.Debug(null, "Iskalni vzorec: {0}", searchPattern);
        SearchOption searchOption = searchSubfolders ? SearchOption.AllDirectories : SearchOption.TopDirectoryOnly;
        foreach (FileInfo file in new DirectoryInfo(inputFolder).GetFiles(searchPattern, searchOption))
        {
            string outputFileName = outputFileOrFolder;
            if (searchPattern.Contains("*") || searchPattern.Contains("?")) // search pattern contains wildcards thus output is a folder
            {
                outputFileName = outputFileOrFolder.TrimEnd('\\') + "\\" + file.Name.Substring(0, file.Name.Length - file.Extension.Length) + ".out" + file.Extension;
                DirectoryInfo dirInfo = new FileInfo(outputFileName).Directory;
                if (!dirInfo.Exists)
                {
                    dirInfo.Create();
                }
            }
            if (File.Exists(outputFileName) && !overwrite)
            {
                mLogger.Debug(null, "Datoteka {0} že obstaja. Pripadajoča vhodna datoteka ni bila ponovno označena.", outputFileName);
                continue;
            }
            mLogger.Debug(null, "Nalagam {0} ...", file.FullName);
            Corpus corpus;
            bool xmlMode = false;
            try
            {
                corpus = new Corpus();
                corpus.LoadFromXmlFile(file.FullName, /*tagLen=*/ -1);
                if (corpus.TaggedWords.Count > 0)
                {
                    xmlMode = true;
                }
                else
                {
                    corpus.LoadFromGigaFidaFile(file.FullName);
                }
            }
            catch (ThreadHandler.AbortedByUserException)
            {
                throw;
            }
            catch (Exception loadError)
            {
                // The file could not be loaded as XML: fall back to treating it as plain text.
                corpus = new Corpus();
                string content = File.ReadAllText(file.FullName);
                if (IsXmlTei(content))
                {
                    mLogger.Debug(null, "*** Opozorilo: Datoteka z besedilom vsebuje značke XML-TEI, vendar nima pravilne oblike. Podrobnosti: {0}", loadError.Message);
                }
                if (ssjTokenizer)
                {
                    corpus.LoadFromTextSsjTokenizer(content);
                }
                else
                {
                    corpus.LoadFromText(content);
                }
            }
            // Remember the tags the corpus carried before tagging; they are the reference for evaluation.
            string[] goldTags = new string[corpus.TaggedWords.Count];
            for (int i = 0; i < corpus.TaggedWords.Count; i++)
            {
                goldTags[i] = corpus.TaggedWords[i].Tag;
            }
            int lemmaCorrect = 0;
            int lemmaCorrectLowercase = 0;
            int lemmaWords = 0;
            tagger.Tag(corpus, out lemmaCorrect, out lemmaCorrectLowercase, out lemmaWords, xmlMode);
            mLogger.Debug(null, "Zapisujem označeno besedilo v datoteko {0} ...", outputFileName);
            // 'using' releases the output file handle even when serialization or writing throws.
            using (StreamWriter writer = new StreamWriter(outputFileName))
            {
                writer.Write(corpus.ToString(xmlMode || ssjTokenizer ? "XML-MI" : "XML"));
            }
            mLogger.Debug(null, "Končano.");
            if (!xmlMode)
            {
                continue; // no reference tags, nothing to evaluate
            }
            int knownWordsCorrect = 0;
            int knownWordsPosCorrect = 0;
            int knownWords = 0;
            int unknownWordsCorrect = 0;
            int unknownWordsPosCorrect = 0;
            int unknownWords = 0;
            int eosCount = 0;
            int eosCorrect = 0;
            int knownWordsCorrectNoPunct = 0;
            int knownWordsPosCorrectNoPunct = 0;
            int knownWordsNoPunct = 0;
            int unknownWordsCorrectNoPunct = 0;
            int unknownWordsPosCorrectNoPunct = 0;
            int unknownWordsNoPunct = 0;
            for (int i = 0; i < corpus.TaggedWords.Count; i++)
            {
                string tag = corpus.TaggedWords[i].Tag;
                string goldTag = goldTags[i];
                bool isKnown = tagger.IsKnownWord(corpus.TaggedWords[i].WordLower);
                bool isWord = !PartOfSpeechTagger.mNonWordRegex.Match(corpus.TaggedWords[i].Word).Success;
                bool tagCorrect = tag == goldTag;
                // Compare only the first character of both tags; null or empty tags never match
                // (indexing them would throw).
                bool posCorrect = !string.IsNullOrEmpty(tag) && !string.IsNullOrEmpty(goldTag) && tag[0] == goldTag[0];
                if (isKnown)
                {
                    knownWords++;
                    if (tagCorrect) { knownWordsCorrect++; }
                    if (posCorrect) { knownWordsPosCorrect++; }
                    if (isWord)
                    {
                        knownWordsNoPunct++;
                        if (tagCorrect) { knownWordsCorrectNoPunct++; }
                        if (posCorrect) { knownWordsPosCorrectNoPunct++; }
                    }
                }
                else
                {
                    unknownWords++;
                    if (tagCorrect) { unknownWordsCorrect++; }
                    if (posCorrect) { unknownWordsPosCorrect++; }
                    if (isWord)
                    {
                        unknownWordsNoPunct++;
                        if (tagCorrect) { unknownWordsCorrectNoPunct++; }
                        if (posCorrect) { unknownWordsPosCorrectNoPunct++; }
                    }
                }
                if (corpus.TaggedWords[i].MoreInfo.EndOfSentence)
                {
                    eosCount++;
                    if (tag != null && tag.EndsWith("<eos>"))
                    {
                        eosCorrect++;
                    }
                }
            }
            int allWords = knownWords + unknownWords;
            int allWordsCorrect = knownWordsCorrect + unknownWordsCorrect;
            int allWordsPosCorrect = knownWordsPosCorrect + unknownWordsPosCorrect;
            int allWordsNoPunct = knownWordsNoPunct + unknownWordsNoPunct;
            int allWordsCorrectNoPunct = knownWordsCorrectNoPunct + unknownWordsCorrectNoPunct;
            int allWordsPosCorrectNoPunct = knownWordsPosCorrectNoPunct + unknownWordsPosCorrectNoPunct;
            LogAccuracy("Točnost na znanih besedah: ................... {2:0.00}% ({0} / {1})", knownWordsCorrect, knownWords);
            LogAccuracy("Točnost na neznanih besedah: ................. {2:0.00}% ({0} / {1})", unknownWordsCorrect, unknownWords);
            LogAccuracy("Skupna točnost: .............................. {2:0.00}% ({0} / {1})", allWordsCorrect, allWords);
            LogAccuracy("Točnost na znanih besedah (POS): ............. {2:0.00}% ({0} / {1})", knownWordsPosCorrect, knownWords);
            LogAccuracy("Točnost na neznanih besedah (POS): ........... {2:0.00}% ({0} / {1})", unknownWordsPosCorrect, unknownWords);
            LogAccuracy("Skupna točnost (POS): ........................ {2:0.00}% ({0} / {1})", allWordsPosCorrect, allWords);
            LogAccuracy("Točnost na znanih besedah (brez ločil): ...... {2:0.00}% ({0} / {1})", knownWordsCorrectNoPunct, knownWordsNoPunct);
            LogAccuracy("Točnost na neznanih besedah (brez ločil): .... {2:0.00}% ({0} / {1})", unknownWordsCorrectNoPunct, unknownWordsNoPunct);
            LogAccuracy("Skupna točnost (brez ločil): ................. {2:0.00}% ({0} / {1})", allWordsCorrectNoPunct, allWordsNoPunct);
            LogAccuracy("Točnost na znanih besedah (POS, brez ločil): {2:0.00}% ({0} / {1})", knownWordsPosCorrectNoPunct, knownWordsNoPunct);
            LogAccuracy("Točnost na neznanih besedah (POS, brez ločil): {2:0.00}% ({0} / {1})", unknownWordsPosCorrectNoPunct, unknownWordsNoPunct);
            LogAccuracy("Skupna točnost (POS, brez ločil): ............ {2:0.00}% ({0} / {1})", allWordsPosCorrectNoPunct, allWordsNoPunct);
            if (lemmatizerModelFile != null)
            {
                LogAccuracy("Točnost lematizacije (brez ločil): ........... {2:0.00}% ({0} / {1})", lemmaCorrect, lemmaWords);
                LogAccuracy("Točnost lematizacije (male črke, brez ločil): {2:0.00}% ({0} / {1})", lemmaCorrectLowercase, lemmaWords);
            }
            LogAccuracy("Točnost detekcije konca stavka: .............. {2:0.00}% ({0} / {1})", eosCorrect, eosCount);
        }
    }
    catch (Exception e)
    {
        mLogger.Info(null, "");
        mLogger.Info(null, "*** Nepričakovana napaka. Podrobnosti: {0}", e);
    }
}

/// <summary>
/// Logs one accuracy line at Info level. The format string receives the number of correct
/// items as {0}, the total as {1} and the percentage as {2}.
/// </summary>
/// <remarks>The percentage is computed in floating point, so a zero total does not throw.</remarks>
private static void LogAccuracy(string format, int correct, int total)
{
    mLogger.Info(null, format, correct, total, (double)correct / (double)total * 100.0);
}
/// <summary>
/// Entry point of the lemmatizer training tool. Parses the command line, loads the training
/// corpus (and optionally a lexicon), feeds the examples to a <c>Lemmatizer</c>, builds and
/// optimizes the model and writes it to the model file.
/// </summary>
/// <param name="args">Command-line arguments; with fewer than two the help text is printed.</param>
static void Main(string[] args)
{
    try
    {
        if (args.Length < 2)
        {
            OutputHelp();
            return;
        }
        string corpusFileName = null, modelFileName = null, lexiconFileName = null;
        bool considerTag = false;
        bool treeOpt = false;
        bool verbose = false;
        if (!ParseParams(args, ref verbose, ref considerTag, ref treeOpt, ref corpusFileName, ref modelFileName, ref lexiconFileName))
        {
            return;
        }
        Logger logger = Logger.GetRootLogger();
        if (!verbose)
        {
            logger.LocalLevel = Logger.Level.Off;
            logger.LocalProgressOutputType = Logger.ProgressOutputType.Off;
        }
        else
        {
            logger.LocalOutputType = Logger.OutputType.Custom;
            Logger.CustomOutput = new Logger.CustomOutputDelegate(delegate(string loggerName, Logger.Level level, string funcName, Exception e, string message, object[] msgArgs)
            {
                Console.WriteLine(message, msgArgs);
            });
        }
        Corpus corpus = new Corpus();
        logger.Info(/*funcName=*/ null, "Nalagam učni korpus ...");
        corpus.LoadFromXmlFile(corpusFileName, /*tagLen=*/ int.MaxValue);
        LemmatizerSettings lemmatizerSettings = new LemmatizerSettings();
        lemmatizerSettings.eMsdConsider = considerTag ? LemmatizerSettings.MsdConsideration.Distinct : LemmatizerSettings.MsdConsideration.Ignore;
        lemmatizerSettings.bUseFromInRules = true;
        lemmatizerSettings.iMaxRulesPerNode = 0;
        lemmatizerSettings.bBuildFrontLemmatizer = false;
        lemmatizerSettings.bStoreAllFullKnownWords = false;
        lemmatizerSettings.bUseMsdSplitTreeOptimization = treeOpt;
        Lemmatizer lemmatizer = new Lemmatizer(lemmatizerSettings);
        // Every non-punctuation corpus word becomes a training example with weight 1.
        for (int i = 0; i < corpus.TaggedWords.Count; i++)
        {
            logger.ProgressFast(Logger.Level.Info, /*funcName=*/ null, "{0} / {1}", i + 1, corpus.TaggedWords.Count);
            TaggedWord word = corpus.TaggedWords[i];
            if (!word.MoreInfo.Punctuation)
            {
                lemmatizer.AddExample(word.WordLower, word.Lemma.ToLower(), 1, word.Tag);
            }
        }
        if (lexiconFileName != null)
        {
            logger.Info(/*funcName=*/ null, "Nalagam leksikon ...");
            // 'using' closes the lexicon file even when a malformed line makes parsing throw.
            using (StreamReader lexReader = new StreamReader(lexiconFileName))
            {
                string lexLine;
                int i = 0;
                while ((lexLine = lexReader.ReadLine()) != null)
                {
                    // lexicon format: word \t lemma \t tag \t freq
                    logger.ProgressFast(Logger.Level.Info, /*funcName=*/ null, "{0}", ++i, /*numSteps=*/ 0);
                    string[] lexData = lexLine.Split('\t');
                    string word = lexData[0];
                    string lemma = lexData[1];
                    string tag = lexData[2];
                    // NOTE(review): Convert.ToDouble(string) parses with the current culture; confirm the
                    // lexicon's decimal separator matches the culture this tool runs under.
                    double freq = Math.Max(0.1, Convert.ToDouble(lexData[3])); // weights below 0.1 are raised to 0.1
                    lemmatizer.AddExample(word.ToLower(), lemma.ToLower(), freq, tag);
                }
                logger.ProgressFast(Logger.Level.Info, /*funcName=*/ null, "{0}", i, i);
            }
        }
        logger.Info(/*funcName=*/ null, "Gradim model za lematizacijo ...");
        if (treeOpt)
        {
            string msdSpec = Utils.GetManifestResourceString(typeof(Program), "MsdSpecsSloSloCodes.txt");
            MsdSplitTree.BeamSearchParams beamSearchParams = new MsdSplitTree.BeamSearchParams();
            beamSearchParams.beamsPerLevel[0] = 2;
            lemmatizer.BuildModel(msdSpec, beamSearchParams);
        }
        else
        {
            lemmatizer.BuildModel();
        }
        logger.Info(/*funcName=*/ null, "Optimiram lematizacijsko drevo ...");
        lemmatizer.OptimizeMemorySize();
        logger.Info(/*funcName=*/ null, "Zapisujem model ...");
        BinarySerializer writer = new BinarySerializer(modelFileName, FileMode.Create);
        try
        {
            // The considerTag flag is written first, followed by the lemmatizer itself.
            writer.WriteBool(considerTag);
            lemmatizer.Save(writer);
        }
        finally
        {
            // Close the model file even when saving fails part-way.
            writer.Close();
        }
        logger.Info(/*funcName=*/ null, "Končano.");
    }
    catch (Exception exception)
    {
        Console.WriteLine();
        Console.WriteLine("*** Nepričakovana napaka. Podrobnosti: {0}\r\n{1}", exception, exception.StackTrace);
    }
}
/// <summary>
/// Entry point of the tagger training tool. Parses the command line, loads the training
/// corpus (and optionally a lexicon) into a suffix tree, builds feature vectors for every
/// corpus word, trains a maximum entropy classifier and writes the suffix tree, feature
/// space and classifier to the model file.
/// </summary>
/// <param name="args">Command-line arguments; with fewer than two the help text is printed.</param>
static void Main(string[] args)
{
    try
    {
        if (args.Length < 2)
        {
            OutputHelp();
            return;
        }
        int cutOff = 2;
        int numIter = 50;
        int numThreads = 1;
        string corpusFileName = null, modelFileName = null, lexiconFileName = null;
        bool verbose = false;
        if (!ParseParams(args, ref verbose, ref cutOff, ref numIter, ref numThreads, ref corpusFileName, ref modelFileName, ref lexiconFileName))
        {
            return;
        }
        Logger logger = Logger.GetRootLogger();
        if (!verbose)
        {
            logger.LocalLevel = Logger.Level.Off;
            logger.LocalProgressOutputType = Logger.ProgressOutputType.Off;
        }
        else
        {
            logger.LocalOutputType = Logger.OutputType.Custom;
            Logger.CustomOutput = new Logger.CustomOutputDelegate(delegate(string loggerName, Logger.Level level, string funcName, Exception e, string message, object[] msgArgs)
            {
                Console.WriteLine(message, msgArgs);
            });
        }
        Corpus corpus = new Corpus();
        logger.Info(/*funcName=*/ null, "Nalagam učni korpus ...");
        corpus.LoadFromXmlFile(corpusFileName, /*tagLen=*/ -1);
        // A collection is forced before each memory sample so the reported deltas are taken right after a GC.
        GC.Collect();
        long oldMemUse = Process.GetCurrentProcess().PrivateMemorySize64;
        PatriciaTree suffixTree = new PatriciaTree();
        foreach (TaggedWord word in corpus.TaggedWords)
        {
            suffixTree.AddWordTagPair(word.WordLower, word.Tag);
        }
        if (lexiconFileName != null)
        {
            logger.Info(/*funcName=*/ null, "Nalagam leksikon ...");
            // 'using' closes the lexicon file even when a malformed line makes parsing throw.
            using (StreamReader lexReader = new StreamReader(lexiconFileName))
            {
                string lexLine;
                while ((lexLine = lexReader.ReadLine()) != null)
                {
                    // Tab-separated line: column 0 is used as the word and column 2 as the tag.
                    string[] lexData = lexLine.Split('\t');
                    suffixTree.AddWordTagPair(lexData[0].ToLower(), lexData[2]);
                }
            }
        }
        GC.Collect();
        long memUse = Process.GetCurrentProcess().PrivateMemorySize64;
        Console.WriteLine("Poraba pomnilnika (drevo končnic): {0:0.00} MB", (double)(memUse - oldMemUse) / 1048576.0);
        oldMemUse = memUse;
        suffixTree.PropagateTags();
        GC.Collect();
        memUse = Process.GetCurrentProcess().PrivateMemorySize64;
        Console.WriteLine("Poraba pomnilnika (propagirane oznake): {0:0.00} MB", (double)(memUse - oldMemUse) / 1048576.0);
        MaximumEntropyClassifierFast<string> model = new MaximumEntropyClassifierFast<string>();
        LabeledDataset<string, BinaryVector> dataset = new LabeledDataset<string, BinaryVector>();
        Dictionary<string, int> featureSpace = new Dictionary<string, int>();
        logger.Info(/*funcName=*/ null, "Pripravljam vektorje značilk ...");
        for (int i = 0; i < corpus.TaggedWords.Count; i++)
        {
            logger.ProgressFast(Logger.Level.Info, /*funcName=*/ null, "{0} / {1}", i + 1, corpus.TaggedWords.Count);
            BinaryVector featureVector = corpus.GenerateFeatureVector(i, featureSpace, /*extendFeatureSpace=*/ true, suffixTree);
            dataset.Add(corpus.TaggedWords[i].Tag, featureVector);
        }
        logger.Info(/*funcName=*/ null, "Gradim model ...");
        // Stopwatch measures elapsed time independently of wall-clock adjustments.
        Stopwatch stopwatch = Stopwatch.StartNew();
        model.CutOff = cutOff;
        model.NumThreads = numThreads;
        model.NumIter = numIter;
        model.Train(dataset);
        TimeSpan span = stopwatch.Elapsed;
        logger.Info(/*funcName=*/ null, "Trajanje gradnje modela: {0:00}:{1:00}:{2:00}.{3:000}.", span.Hours, span.Minutes, span.Seconds, span.Milliseconds);
        logger.Info(/*funcName=*/ null, "Zapisujem model ...");
        BinarySerializer writer = new BinarySerializer(modelFileName, FileMode.Create);
        try
        {
            // Written in this order: suffix tree, feature space, classifier.
            suffixTree.Save(writer);
            Utils.SaveDictionary(featureSpace, writer);
            model.Save(writer);
        }
        finally
        {
            // Close the model file even when saving fails part-way.
            writer.Close();
        }
        logger.Info(/*funcName=*/ null, "Končano.");
    }
    catch (Exception exception)
    {
        Console.WriteLine();
        Console.WriteLine("*** Nepričakovana napaka. Podrobnosti: {0}\r\n{1}", exception, exception.StackTrace);
    }
}