/// <summary>
/// Replaces the contents of this corpus with the text found in the &lt;p&gt; elements of the given XML file.
/// The text node following each &lt;p&gt; start tag is tokenized with <c>LoadFromTextSsjTokenizer</c>;
/// the resulting words are appended to <c>mTaggedWords</c>, with the end-of-paragraph flag cleared on
/// every word and then set on the last word of the paragraph. If a &lt;teiHeader&gt; element is
/// encountered, it is skipped during this pass and read afterwards via <c>ReadTeiHeader</c>.
/// </summary>
/// <param name="fileName">Path of the file to load; must not be null and must pass <c>Utils.VerifyFileNameOpen</c>.</param>
/// <remarks>
/// Any exception raised while reading propagates to the caller; the reader and the underlying file
/// stream are closed in every case. Words added before the failure remain in <c>mTaggedWords</c>.
/// </remarks>
public void LoadFromGigaFidaFile(string fileName)
{
    Utils.ThrowException(fileName == null ? new ArgumentNullException("fileName") : null);
    Utils.ThrowException(!Utils.VerifyFileNameOpen(fileName) ? new ArgumentValueException("fileName") : null);
    bool hasHeader = false;
    mTaggedWords.Clear();
    mTeiHeader = null;
    // Open for reading only: the two-argument FileStream constructor asks for read/write access,
    // which fails on read-only files even though nothing is ever written here.
    using (FileStream stream = new FileStream(fileName, FileMode.Open, FileAccess.Read))
    using (XmlTextReader xmlReader = new XmlTextReader(stream))
    {
        while (xmlReader.Read())
        {
            if (xmlReader.NodeType != XmlNodeType.Element)
            {
                continue;
            }
            if (xmlReader.Name == "teiHeader") // header
            {
                hasHeader = true;
                Utils.XmlSkip(xmlReader, "teiHeader");
            }
            else if (xmlReader.Name == "p") // paragraph
            {
                ThreadHandler.AbortCheckpoint(); // TODO: do this at various appropriate places
                // Advance to the node that follows the <p> start tag and tokenize its value.
                xmlReader.Read();
                Corpus aux = new Corpus();
                aux.LoadFromTextSsjTokenizer(xmlReader.Value);
                if (aux.TaggedWords.Count > 0)
                {
                    foreach (TaggedWord word in aux.TaggedWords)
                    {
                        word.MoreInfo.RemoveEndOfParagraphFlag();
                        mTaggedWords.Add(word);
                    }
                    // Only the final word of the paragraph carries the end-of-paragraph flag.
                    aux.TaggedWords.Last.MoreInfo.SetEndOfParagraphFlag();
                }
            }
        }
    }
    // The header is read in a separate pass, after the reader above has released the file.
    if (hasHeader)
    {
        ReadTeiHeader(fileName);
    }
}
/// <summary>
/// Entry point of the lemmatizer training tool. Parses the command line, loads the training corpus
/// (and optionally a tab-separated lexicon), feeds all non-punctuation words to a <c>Lemmatizer</c>,
/// builds and optimizes the model, and writes it to the model file.
/// </summary>
/// <param name="args">Command-line arguments; fewer than two arguments prints the help text.</param>
static void Main(string[] args)
{
    try
    {
        if (args.Length < 2)
        {
            OutputHelp();
        }
        else
        {
            string corpusFileName = null, modelFileName = null, lexiconFileName = null;
            bool considerTag = false;
            bool treeOpt = false;
            bool verbose = false;
            if (ParseParams(args, ref verbose, ref considerTag, ref treeOpt, ref corpusFileName, ref modelFileName, ref lexiconFileName))
            {
                Logger logger = Logger.GetRootLogger();
                if (!verbose)
                {
                    logger.LocalLevel = Logger.Level.Off;
                    logger.LocalProgressOutputType = Logger.ProgressOutputType.Off;
                }
                else
                {
                    // Route log messages straight to the console.
                    logger.LocalOutputType = Logger.OutputType.Custom;
                    Logger.CustomOutput = new Logger.CustomOutputDelegate(delegate(string loggerName, Logger.Level level, string funcName, Exception e, string message, object[] msgArgs) { Console.WriteLine(message, msgArgs); });
                }
                Corpus corpus = new Corpus();
                logger.Info(/*funcName=*/ null, "Nalagam učni korpus ...");
                corpus.LoadFromXmlFile(corpusFileName, /*tagLen=*/ int.MaxValue);
                LemmatizerSettings lemmatizerSettings = new LemmatizerSettings();
                lemmatizerSettings.eMsdConsider = considerTag ? LemmatizerSettings.MsdConsideration.Distinct : LemmatizerSettings.MsdConsideration.Ignore;
                lemmatizerSettings.bUseFromInRules = true;
                lemmatizerSettings.iMaxRulesPerNode = 0;
                lemmatizerSettings.bBuildFrontLemmatizer = false;
                lemmatizerSettings.bStoreAllFullKnownWords = false;
                lemmatizerSettings.bUseMsdSplitTreeOptimization = treeOpt;
                Lemmatizer lemmatizer = new Lemmatizer(lemmatizerSettings);
                // Training examples from the corpus; punctuation tokens are skipped.
                for (int i = 0; i < corpus.TaggedWords.Count; i++)
                {
                    logger.ProgressFast(Logger.Level.Info, /*funcName=*/ null, "{0} / {1}", i + 1, corpus.TaggedWords.Count);
                    TaggedWord word = corpus.TaggedWords[i];
                    if (!word.MoreInfo.Punctuation)
                    {
                        lemmatizer.AddExample(word.WordLower, word.Lemma.ToLower(), 1, word.Tag);
                    }
                }
                if (lexiconFileName != null)
                {
                    logger.Info(/*funcName=*/ null, "Nalagam leksikon ...");
                    // 'using' guarantees the lexicon file is released even if a line fails to parse.
                    using (StreamReader lexReader = new StreamReader(lexiconFileName))
                    {
                        string lexLine;
                        int i = 0;
                        while ((lexLine = lexReader.ReadLine()) != null)
                        {
                            // lexicon format: word \t lemma \t tag \t freq
                            logger.ProgressFast(Logger.Level.Info, /*funcName=*/ null, "{0}", ++i, /*numSteps=*/ 0);
                            string[] lexData = lexLine.Split('\t');
                            string word = lexData[0];
                            string lemma = lexData[1];
                            string tag = lexData[2];
                            // Frequencies below 0.1 are raised to 0.1.
                            double freq = Math.Max(0.1, Convert.ToDouble(lexData[3]));
                            lemmatizer.AddExample(word.ToLower(), lemma.ToLower(), freq, tag);
                        }
                        logger.ProgressFast(Logger.Level.Info, /*funcName=*/ null, "{0}", i, i);
                    }
                }
                logger.Info(/*funcName=*/ null, "Gradim model za lematizacijo ...");
                if (treeOpt)
                {
                    string msdSpec = Utils.GetManifestResourceString(typeof(Program), "MsdSpecsSloSloCodes.txt");
                    MsdSplitTree.BeamSearchParams beamSearchParams = new MsdSplitTree.BeamSearchParams();
                    beamSearchParams.beamsPerLevel[0] = 2;
                    lemmatizer.BuildModel(msdSpec, beamSearchParams);
                }
                else
                {
                    lemmatizer.BuildModel();
                }
                logger.Info(/*funcName=*/ null, "Optimiram lematizacijsko drevo ...");
                lemmatizer.OptimizeMemorySize();
                logger.Info(/*funcName=*/ null, "Zapisujem model ...");
                BinarySerializer writer = new BinarySerializer(modelFileName, FileMode.Create);
                try
                {
                    // The flag is stored ahead of the lemmatizer data in the model file.
                    writer.WriteBool(considerTag);
                    lemmatizer.Save(writer);
                }
                finally
                {
                    // Close the model file even when saving fails part-way.
                    writer.Close();
                }
                logger.Info(/*funcName=*/ null, "Končano.");
            }
        }
    }
    catch (Exception exception)
    {
        Console.WriteLine();
        Console.WriteLine("*** Nepričakovana napaka. Podrobnosti: {0}\r\n{1}", exception, exception.StackTrace);
    }
}
/// <summary>
/// Convenience overload: processes <paramref name="corpus"/> with XML mode switched off and
/// discards the lemma statistics reported by the full overload.
/// </summary>
/// <param name="corpus">The corpus to process.</param>
public void Tag(Corpus corpus)
{
    // The full overload requires out arguments; their values are not needed here.
    int ignoredCorrect;
    int ignoredCorrectLowercase;
    int ignoredWordCount;
    // NOTE(review): the original comment states that this call throws
    // InvalidOperationException, ArgumentNullException - confirm against the full overload.
    Tag(corpus, out ignoredCorrect, out ignoredCorrectLowercase, out ignoredWordCount, /*xmlMode=*/ false);
}
// *** End of Dec-2011 ***
/// <summary>
/// Assigns a lemma to every word token of <paramref name="corpus"/> using <c>mLemmatizer</c>.
/// Tag prediction (feature vectors, model prediction, rule-based filtering) is disabled in this
/// version, so the <c>Tag</c> of each token is left untouched and lemmatization does not take the
/// tag into account. Tokens recognized as non-words are skipped entirely.
/// </summary>
/// <param name="corpus">The corpus whose tokens receive lemmas; must not be null.</param>
/// <param name="lemmaCorrect">In XML mode, the number of produced lemmas equal to the lemma already stored on the token; otherwise 0.</param>
/// <param name="lemmaCorrectLowercase">In XML mode, the number of produced lemmas equal to the stored lemma when both are lower-cased; otherwise 0.</param>
/// <param name="lemmaWords">In XML mode, the number of lemmatized word tokens; otherwise 0.</param>
/// <param name="xmlMode">When true, the statistics above are collected before the stored lemma is overwritten.</param>
/// <exception cref="ArgumentNullException"><paramref name="corpus"/> is null.</exception>
public void Tag(Corpus corpus, out int lemmaCorrect, out int lemmaCorrectLowercase, out int lemmaWords, bool xmlMode)
{
    if (corpus == null)
    {
        throw new ArgumentNullException("corpus");
    }
    DateTime startTime = DateTime.Now;
    mLogger.Debug("Tag", "Označujem besedilo ...");
    lemmaCorrect = 0;
    lemmaCorrectLowercase = 0;
    lemmaWords = 0;
    for (int i = 0; i < corpus.TaggedWords.Count; i++)
    {
        TaggedWord taggedWord = corpus.TaggedWords[i];
        // A token is a non-word if its MoreInfo marks it as punctuation or, when no MoreInfo is
        // attached, if its lower-cased form matches the non-word regex.
        bool isNonWord = taggedWord.MoreInfo != null
            ? taggedWord.MoreInfo.Punctuation
            : mNonWordRegex.Match(taggedWord.WordLower).Success;
        if (isNonWord || mLemmatizer == null)
        {
            continue;
        }
        string wordLower = taggedWord.WordLower;
        string lemma = mLemmatizer.Lemmatize(wordLower);
        if (string.IsNullOrEmpty(lemma))
        {
            // Fall back to the lower-cased word when the lemmatizer returns nothing.
            lemma = wordLower;
        }
        if (xmlMode)
        {
            // Compare against the stored lemma before it is overwritten below.
            lemmaWords++;
            if (lemma == taggedWord.Lemma)
            {
                lemmaCorrect++;
            }
            if (taggedWord.Lemma != null && lemma.ToLower() == taggedWord.Lemma.ToLower())
            {
                lemmaCorrectLowercase++;
            }
        }
        taggedWord.Lemma = lemma;
    }
    TimeSpan span = DateTime.Now - startTime;
    mLogger.Debug("Tag", "Trajanje označevanja: {0:00}:{1:00}:{2:00}.{3:000}.", span.Hours, span.Minutes, span.Seconds, span.Milliseconds);
}
private static void Main(string[] args)
{
    // Entry point of the tagging tool: loads the models, processes every input file matching the
    // search pattern, writes the annotated corpus to the output file/folder and, for input that
    // loaded as annotated XML, logs accuracy statistics.
    // initialize logger
    mLogger.LocalLevel = Logger.Level.Debug;
    mLogger.LocalOutputType = Logger.OutputType.Custom;
    Logger.CustomOutput = new Logger.CustomOutputDelegate(delegate(string loggerName, Logger.Level level, string funcName, Exception e, string message, object[] msgArgs) { Console.WriteLine(message, msgArgs); });
    // NOTE(review): the #if/#else matching the directive below is not visible in this part of the file.
#endif
    try
    {
        if (args.Length < 4)
        {
            OutputHelp();
        }
        else
        {
            string inputFolder = null, searchPattern = null, taggerModelFile = null, lemmatizerModelFile = null, outputFileOrFolder = null;
            bool ssjTokenizer = false, searchSubfolders = false, verbose = false, overwrite = false;
            if (ParseParams(args, ref verbose, ref inputFolder, ref searchPattern, ref taggerModelFile, ref lemmatizerModelFile, ref outputFileOrFolder, ref ssjTokenizer, ref searchSubfolders, ref overwrite))
            {
                if (!verbose)
                {
                    mLogger.LocalLevel = Logger.Level.Info;
                    mLogger.LocalProgressOutputType = Logger.ProgressOutputType.Off;
                }
                PartOfSpeechTagger tagger = new PartOfSpeechTagger();
                tagger.LoadModels(taggerModelFile, lemmatizerModelFile);
                mLogger.Debug(null, "Mapa z vhodnimi datotekami: {0}", inputFolder);
                mLogger.Debug(null, "Iskalni vzorec: {0}", searchPattern);
                foreach (FileInfo file in new DirectoryInfo(inputFolder).GetFiles(searchPattern, searchSubfolders ? SearchOption.AllDirectories : SearchOption.TopDirectoryOnly))
                {
                    string outputFileName = outputFileOrFolder;
                    if (searchPattern.Contains("*") || searchPattern.Contains("?")) // search pattern contains wildcards thus output is a folder
                    {
                        outputFileName = outputFileOrFolder.TrimEnd('\\') + "\\" + file.Name.Substring(0, file.Name.Length - file.Extension.Length) + ".out" + file.Extension;
                        DirectoryInfo dirInfo = new FileInfo(outputFileName).Directory;
                        if (!dirInfo.Exists)
                        {
                            dirInfo.Create();
                        }
                    }
                    if (File.Exists(outputFileName) && !overwrite)
                    {
                        mLogger.Debug(null, "Datoteka {0} že obstaja. Pripadajoča vhodna datoteka ni bila ponovno označena.", outputFileName);
                        continue;
                    }
                    mLogger.Debug(null, "Nalagam {0} ...", file.FullName);
                    Corpus corpus;
                    bool xmlMode = false;
                    try
                    {
                        // First try the annotated XML format; if that yields no words, try the GigaFida format.
                        corpus = new Corpus();
                        corpus.LoadFromXmlFile(file.FullName, /*tagLen=*/ -1);
                        if (corpus.TaggedWords.Count > 0)
                        {
                            xmlMode = true;
                        }
                        else
                        {
                            corpus.LoadFromGigaFidaFile(file.FullName);
                        }
                    }
                    catch (ThreadHandler.AbortedByUserException)
                    {
                        throw;
                    }
                    catch (Exception e)
                    {
                        // Neither XML loader succeeded: treat the file as plain text.
                        corpus = new Corpus();
                        string content = File.ReadAllText(file.FullName);
                        if (IsXmlTei(content))
                        {
                            mLogger.Debug(null, "*** Opozorilo: Datoteka z besedilom vsebuje značke XML-TEI, vendar nima pravilne oblike. Podrobnosti: {0}", e.Message);
                        }
                        if (ssjTokenizer)
                        {
                            corpus.LoadFromTextSsjTokenizer(content);
                        }
                        else
                        {
                            corpus.LoadFromText(content);
                        }
                    }
                    int knownWordsCorrect = 0;
                    int knownWordsPosCorrect = 0;
                    int knownWords = 0;
                    int unknownWordsCorrect = 0;
                    int unknownWordsPosCorrect = 0;
                    int unknownWords = 0;
                    int eosCount = 0;
                    int eosCorrect = 0;
                    int lemmaCorrect = 0;
                    int lemmaCorrectLowercase = 0;
                    int lemmaWords = 0;
                    int knownWordsCorrectNoPunct = 0;
                    int knownWordsPosCorrectNoPunct = 0;
                    int knownWordsNoPunct = 0;
                    int unknownWordsCorrectNoPunct = 0;
                    int unknownWordsPosCorrectNoPunct = 0;
                    int unknownWordsNoPunct = 0;
                    // Remember the tags present before tagging so they can be compared afterwards.
                    string[] goldTags = new string[corpus.TaggedWords.Count];
                    for (int i = 0; i < corpus.TaggedWords.Count; i++)
                    {
                        goldTags[i] = corpus.TaggedWords[i].Tag;
                    }
                    tagger.Tag(corpus, out lemmaCorrect, out lemmaCorrectLowercase, out lemmaWords, xmlMode);
                    mLogger.Debug(null, "Zapisujem označeno besedilo v datoteko {0} ...", outputFileName);
                    // 'using' releases the output file even if serialization or writing fails.
                    using (StreamWriter writer = new StreamWriter(outputFileName))
                    {
                        writer.Write(corpus.ToString(xmlMode || ssjTokenizer ? "XML-MI" : "XML"));
                    }
                    mLogger.Debug(null, "Končano.");
                    if (xmlMode)
                    {
                        for (int i = 0; i < corpus.TaggedWords.Count; i++)
                        {
                            string wordLower = corpus.TaggedWords[i].WordLower;
                            string tag = corpus.TaggedWords[i].Tag;
                            string goldTag = goldTags[i];
                            bool isKnown = tagger.IsKnownWord(wordLower);
                            bool tagCorrect = tag == goldTag;
                            // POS accuracy compares the first character only; null or empty tags never
                            // match (previously an empty tag caused an index-out-of-range failure).
                            bool posCorrect = !string.IsNullOrEmpty(tag) && !string.IsNullOrEmpty(goldTag) && tag[0] == goldTag[0];
                            if (!PartOfSpeechTagger.mNonWordRegex.Match(corpus.TaggedWords[i].Word).Success)
                            {
                                if (tagCorrect)
                                {
                                    if (isKnown) { knownWordsCorrectNoPunct++; } else { unknownWordsCorrectNoPunct++; }
                                }
                                if (posCorrect)
                                {
                                    if (isKnown) { knownWordsPosCorrectNoPunct++; } else { unknownWordsPosCorrectNoPunct++; }
                                }
                                if (isKnown) { knownWordsNoPunct++; } else { unknownWordsNoPunct++; }
                            }
                            if (tagCorrect)
                            {
                                if (isKnown) { knownWordsCorrect++; } else { unknownWordsCorrect++; }
                            }
                            if (posCorrect)
                            {
                                if (isKnown) { knownWordsPosCorrect++; } else { unknownWordsPosCorrect++; }
                            }
                            if (isKnown) { knownWords++; } else { unknownWords++; }
                            // MoreInfo and Tag may be absent; such tokens do not count as sentence ends / hits.
                            if (corpus.TaggedWords[i].MoreInfo != null && corpus.TaggedWords[i].MoreInfo.EndOfSentence)
                            {
                                eosCount++;
                                if (tag != null && tag.EndsWith("<eos>"))
                                {
                                    eosCorrect++;
                                }
                            }
                        }
                        int allWords = knownWords + unknownWords;
                        int allWordsCorrect = knownWordsCorrect + unknownWordsCorrect;
                        int allWordsPosCorrect = knownWordsPosCorrect + unknownWordsPosCorrect;
                        int allWordsNoPunct = knownWordsNoPunct + unknownWordsNoPunct;
                        int allWordsCorrectNoPunct = knownWordsCorrectNoPunct + unknownWordsCorrectNoPunct;
                        int allWordsPosCorrectNoPunct = knownWordsPosCorrectNoPunct + unknownWordsPosCorrectNoPunct;
                        // Percentages use floating-point division, so an empty category yields NaN rather than throwing.
                        mLogger.Info(null, "Točnost na znanih besedah: ................... {2:0.00}% ({0} / {1})", knownWordsCorrect, knownWords, (double)knownWordsCorrect / (double)knownWords * 100.0);
                        mLogger.Info(null, "Točnost na neznanih besedah: ................. {2:0.00}% ({0} / {1})", unknownWordsCorrect, unknownWords, (double)unknownWordsCorrect / (double)unknownWords * 100.0);
                        mLogger.Info(null, "Skupna točnost: .............................. {2:0.00}% ({0} / {1})", allWordsCorrect, allWords, (double)allWordsCorrect / (double)allWords * 100.0);
                        mLogger.Info(null, "Točnost na znanih besedah (POS): ............. {2:0.00}% ({0} / {1})", knownWordsPosCorrect, knownWords, (double)knownWordsPosCorrect / (double)knownWords * 100.0);
                        mLogger.Info(null, "Točnost na neznanih besedah (POS): ........... {2:0.00}% ({0} / {1})", unknownWordsPosCorrect, unknownWords, (double)unknownWordsPosCorrect / (double)unknownWords * 100.0);
                        mLogger.Info(null, "Skupna točnost (POS): ........................ {2:0.00}% ({0} / {1})", allWordsPosCorrect, allWords, (double)allWordsPosCorrect / (double)allWords * 100.0);
                        mLogger.Info(null, "Točnost na znanih besedah (brez ločil): ...... {2:0.00}% ({0} / {1})", knownWordsCorrectNoPunct, knownWordsNoPunct, (double)knownWordsCorrectNoPunct / (double)knownWordsNoPunct * 100.0);
                        mLogger.Info(null, "Točnost na neznanih besedah (brez ločil): .... {2:0.00}% ({0} / {1})", unknownWordsCorrectNoPunct, unknownWordsNoPunct, (double)unknownWordsCorrectNoPunct / (double)unknownWordsNoPunct * 100.0);
                        mLogger.Info(null, "Skupna točnost (brez ločil): ................. {2:0.00}% ({0} / {1})", allWordsCorrectNoPunct, allWordsNoPunct, (double)allWordsCorrectNoPunct / (double)allWordsNoPunct * 100.0);
                        mLogger.Info(null, "Točnost na znanih besedah (POS, brez ločil): {2:0.00}% ({0} / {1})", knownWordsPosCorrectNoPunct, knownWordsNoPunct, (double)knownWordsPosCorrectNoPunct / (double)knownWordsNoPunct * 100.0);
                        mLogger.Info(null, "Točnost na neznanih besedah (POS, brez ločil): {2:0.00}% ({0} / {1})", unknownWordsPosCorrectNoPunct, unknownWordsNoPunct, (double)unknownWordsPosCorrectNoPunct / (double)unknownWordsNoPunct * 100.0);
                        mLogger.Info(null, "Skupna točnost (POS, brez ločil): ............ {2:0.00}% ({0} / {1})", allWordsPosCorrectNoPunct, allWordsNoPunct, (double)allWordsPosCorrectNoPunct / (double)allWordsNoPunct * 100.0);
                        if (lemmatizerModelFile != null)
                        {
                            mLogger.Info(null, "Točnost lematizacije (brez ločil): ........... {2:0.00}% ({0} / {1})", lemmaCorrect, lemmaWords, (double)lemmaCorrect / (double)lemmaWords * 100.0);
                            mLogger.Info(null, "Točnost lematizacije (male črke, brez ločil): {2:0.00}% ({0} / {1})", lemmaCorrectLowercase, lemmaWords, (double)lemmaCorrectLowercase / (double)lemmaWords * 100.0);
                        }
                        mLogger.Info(null, "Točnost detekcije konca stavka: .............. {2:0.00}% ({0} / {1})", eosCorrect, eosCount, (double)eosCorrect / (double)eosCount * 100.0);
                    }
                }
            }
        }
    }
    catch (Exception e)
    {
        mLogger.Info(null, "");
        mLogger.Info(null, "*** Nepričakovana napaka. Podrobnosti: {0}", e);
    }
}
/// <summary>
/// Entry point of the tagger training tool. Parses the command line, loads the training corpus,
/// builds a suffix tree of word/tag pairs (optionally extended from a tab-separated lexicon),
/// generates feature vectors, trains a maximum entropy classifier and writes the suffix tree,
/// feature space and model to the model file.
/// </summary>
/// <param name="args">Command-line arguments; fewer than two arguments prints the help text.</param>
static void Main(string[] args)
{
    try
    {
        if (args.Length < 2)
        {
            OutputHelp();
        }
        else
        {
            int cutOff = 2;
            int numIter = 50;
            int numThreads = 1;
            string corpusFileName = null, modelFileName = null, lexiconFileName = null;
            bool verbose = false;
            if (ParseParams(args, ref verbose, ref cutOff, ref numIter, ref numThreads, ref corpusFileName, ref modelFileName, ref lexiconFileName))
            {
                Logger logger = Logger.GetRootLogger();
                if (!verbose)
                {
                    logger.LocalLevel = Logger.Level.Off;
                    logger.LocalProgressOutputType = Logger.ProgressOutputType.Off;
                }
                else
                {
                    // Route log messages straight to the console.
                    logger.LocalOutputType = Logger.OutputType.Custom;
                    Logger.CustomOutput = new Logger.CustomOutputDelegate(delegate(string loggerName, Logger.Level level, string funcName, Exception e, string message, object[] msgArgs) { Console.WriteLine(message, msgArgs); });
                }
                Corpus corpus = new Corpus();
                logger.Info(/*funcName=*/ null, "Nalagam učni korpus ...");
                corpus.LoadFromXmlFile(corpusFileName, /*tagLen=*/ -1);
                // Collect garbage before each measurement so the reported deltas reflect the new structures.
                GC.Collect();
                long oldMemUse = Process.GetCurrentProcess().PrivateMemorySize64;
                PatriciaTree suffixTree = new PatriciaTree();
                foreach (TaggedWord word in corpus.TaggedWords)
                {
                    suffixTree.AddWordTagPair(word.WordLower, word.Tag);
                }
                if (lexiconFileName != null)
                {
                    logger.Info(/*funcName=*/ null, "Nalagam leksikon ...");
                    // 'using' guarantees the lexicon file is released even if a line is malformed.
                    using (StreamReader lexReader = new StreamReader(lexiconFileName))
                    {
                        string lexLine;
                        while ((lexLine = lexReader.ReadLine()) != null)
                        {
                            // Tab-separated line: field 0 is the word, field 2 is the tag.
                            string[] lexData = lexLine.Split('\t');
                            suffixTree.AddWordTagPair(lexData[0].ToLower(), lexData[2]);
                        }
                    }
                }
                GC.Collect();
                long memUse = Process.GetCurrentProcess().PrivateMemorySize64;
                Console.WriteLine("Poraba pomnilnika (drevo končnic): {0:0.00} MB", (double)(memUse - oldMemUse) / 1048576.0);
                oldMemUse = memUse;
                suffixTree.PropagateTags();
                GC.Collect();
                memUse = Process.GetCurrentProcess().PrivateMemorySize64;
                Console.WriteLine("Poraba pomnilnika (propagirane oznake): {0:0.00} MB", (double)(memUse - oldMemUse) / 1048576.0);
                MaximumEntropyClassifierFast<string> model = new MaximumEntropyClassifierFast<string>();
                LabeledDataset<string, BinaryVector> dataset = new LabeledDataset<string, BinaryVector>();
                Dictionary<string, int> featureSpace = new Dictionary<string, int>();
                logger.Info(/*funcName=*/ null, "Pripravljam vektorje značilk ...");
                for (int i = 0; i < corpus.TaggedWords.Count; i++)
                {
                    logger.ProgressFast(Logger.Level.Info, /*funcName=*/ null, "{0} / {1}", i + 1, corpus.TaggedWords.Count);
                    BinaryVector featureVector = corpus.GenerateFeatureVector(i, featureSpace, /*extendFeatureSpace=*/ true, suffixTree);
                    dataset.Add(corpus.TaggedWords[i].Tag, featureVector);
                }
                logger.Info(/*funcName=*/ null, "Gradim model ...");
                DateTime startTime = DateTime.Now;
                model.CutOff = cutOff;
                model.NumThreads = numThreads;
                model.NumIter = numIter;
                model.Train(dataset);
                TimeSpan span = DateTime.Now - startTime;
                logger.Info(/*funcName=*/ null, "Trajanje gradnje modela: {0:00}:{1:00}:{2:00}.{3:000}.", span.Hours, span.Minutes, span.Seconds, span.Milliseconds);
                logger.Info(/*funcName=*/ null, "Zapisujem model ...");
                BinarySerializer writer = new BinarySerializer(modelFileName, FileMode.Create);
                try
                {
                    // Write order: suffix tree, feature space, classifier.
                    suffixTree.Save(writer);
                    Utils.SaveDictionary(featureSpace, writer);
                    model.Save(writer);
                }
                finally
                {
                    // Close the model file even when saving fails part-way.
                    writer.Close();
                }
                logger.Info(/*funcName=*/ null, "Končano.");
            }
        }
    }
    catch (Exception exception)
    {
        Console.WriteLine();
        Console.WriteLine("*** Nepričakovana napaka. Podrobnosti: {0}\r\n{1}", exception, exception.StackTrace);
    }
}