/// <summary>
/// Scores a word/tag pair as the sum of the log word-given-tag probability and
/// the log morph-given-tag probability. The lemma factor is currently disabled
/// and contributes nothing to the score.
/// </summary>
/// <param name="iTW">The word id / tag id pair to score.</param>
/// <param name="loc">Position of the word; passed through to ProbWordTag.</param>
/// <param name="word">Surface form of the word.</param>
/// <param name="featureSpec">Morphological analysis string, split with SplitMorphString.</param>
/// <returns>
/// 0 when the pair is the boundary word with the boundary tag; otherwise the
/// combined log score, or negative infinity when that score is not greater than -100.
/// </returns>
public override float Score(IntTaggedWord iTW, int loc, string word, string featureSpec)
{
    int wordId = iTW.Word();
    int tagId = iTW.Tag();

    // Force 1-best path to go through the boundary symbol
    // (deterministic tagging)
    int boundaryId = wordIndex.IndexOf(LexiconConstants.Boundary);
    int boundaryTagId = tagIndex.IndexOf(LexiconConstants.BoundaryTag);
    if (wordId == boundaryId && tagId == boundaryTagId)
    {
        return 0.0f;
    }

    // Morphological features: reduce the rich tag to the active feature set.
    Pair<string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(word, featureSpec);
    string richMorphTag = lemmaMorph.Second();
    string reducedMorphTag = morphoSpec.StrToFeatures(richMorphTag).ToString().Trim();
    if (reducedMorphTag.Length == 0)
    {
        reducedMorphTag = NoMorphAnalysis;
    }
    // Note: this registers the reduced tag in morphIndex if it is not there yet.
    int morphId = morphIndex.AddToIndex(reducedMorphTag);

    // Score the factors and create the rule score p_W_T.
    // The lemma/tag factor is switched off, so only two factors are summed.
    double p_W_Tf = Math.Log(ProbWordTag(word, loc, wordId, tagId));
    double p_M_T = Math.Log(ProbMorphTag(tagId, morphId));
    double p_W_T = p_W_Tf + p_M_T;

    // Filter low probability taggings
    return p_W_T > -100.0 ? (float)p_W_T : float.NegativeInfinity;
}
/// <summary>
/// Applies the base-class tree transformation, then, for preterminals when a
/// tag specification is configured, rewrites the category using the
/// morphological analysis stored in the leaf's original-text annotation.
/// </summary>
/// <param name="t">The node to transform.</param>
/// <param name="root">The root of the tree containing <paramref name="t"/>.</param>
/// <returns>The transformed node, with its value (and tag, if supported) updated.</returns>
/// <exception cref="InvalidOperationException">
/// A preterminal's leaf label is not a CoreLabel or carries no original text.
/// </exception>
public override Tree TransformTree(Tree t, Tree root)
{
    // Perform tregex-powered annotations
    t = base.TransformTree(t, root);
    string cat = t.Value();

    // Add morphosyntactic features if this is a POS tag
    if (t.IsPreTerminal() && tagSpec != null)
    {
        CoreLabel leafLabel = t.FirstChild().Label() as CoreLabel;
        string morphoStr = leafLabel == null ? null : leafLabel.OriginalText();
        if (morphoStr == null)
        {
            // .NET composite formatting uses {0}/{1}; Java-style %s would be printed literally.
            throw new InvalidOperationException(string.Format("{0}: Term lacks morpho analysis: {1}", this.GetType().FullName, t.ToString()));
        }
        Pair<string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(string.Empty, morphoStr);
        MorphoFeatures feats = tagSpec.StrToFeatures(lemmaMorph.Second());
        cat = feats.GetTag(cat);
    }

    // Update the label(s)
    t.SetValue(cat);
    if (t.IsPreTerminal() && t.Label() is IHasTag)
    {
        ((IHasTag)t.Label()).SetTag(cat);
    }
    return t;
}
/// <summary>
/// Creates a mapper with an empty universal tag map and an Arabic
/// morphological feature specification.
/// </summary>
public UniversalPOSMapper()
    : base(false) // false: don't add the determiner split
{
    this.universalMap = Generics.NewHashMap();
    this.morphoSpec = new ArabicMorphoFeatureSpecification();
}
/// <summary>
/// Activates the requested morpho-syntactic features on a fresh feature
/// specification obtained from the language pack.
/// </summary>
/// <param name="activeFeats">
/// Comma-separated feature names; each one (after trimming) must be a
/// MorphoFeatureType value name.
/// </param>
/// <returns>The string form of the configured feature specification.</returns>
private string SetupMorphoFeatures(string activeFeats)
{
    string[] requested = activeFeats.Split(",");
    morphoSpec = tlp.MorphFeatureSpec();
    for (int k = 0; k < requested.Length; k++)
    {
        string name = requested[k].Trim();
        morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(name));
    }
    return morphoSpec.ToString();
}
/// <summary>
/// Reads trees from a file and prints one "word lemma morph" line per token,
/// with a blank line after each tree. When a token has no usable morphological
/// analysis (null, empty or "XXX"), the preterminal value is printed instead.
/// </summary>
/// <param name="args">Exactly one argument: the path of the tree file.</param>
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        System.Console.Error.Printf("Usage: java %s tree_file%n", typeof(TreeToMorfette).FullName);
        System.Environment.Exit(-1);
    }
    string treeFile = args[0];
    ITreeReaderFactory trf = new FrenchTreeReaderFactory();
    try
    {
        ITreeReader tr = trf.NewTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")));
        try
        {
            for (Tree tree1; (tree1 = tr.ReadTree()) != null;)
            {
                IList<ILabel> pretermYield = tree1.PreTerminalYield();
                IList<ILabel> yield = tree1.Yield();
                int yieldLen = yield.Count;
                for (int i = 0; i < yieldLen; ++i)
                {
                    CoreLabel rawToken = (CoreLabel)yield[i];
                    string word = rawToken.Value();
                    string morphStr = rawToken.OriginalText();
                    Pair<string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(word, morphStr);
                    string lemma = lemmaMorph.First();
                    string morph = lemmaMorph.Second();
                    if (string.IsNullOrEmpty(morph) || morph.Equals("XXX"))
                    {
                        // No usable analysis: fall back to the POS tag.
                        morph = ((CoreLabel)pretermYield[i]).Value();
                    }
                    System.Console.Out.Printf("%s %s %s%n", word, lemma, morph);
                }
                System.Console.Out.WriteLine();
            }
        }
        finally
        {
            // Close the reader even when reading or printing fails part-way through.
            tr.Close();
        }
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Replaces the value and tag of a preterminal's label with the alternate tag
/// derived from the leaf's morphological analysis, when that alternate tag is
/// non-empty. If the leaf carries no analysis, one is synthesized from the
/// preterminal value and the leaf's category.
/// </summary>
/// <param name="t">A preterminal whose label and child label are CoreLabels.</param>
/// <param name="morpho">Feature specification used to parse the analysis string.</param>
/// <exception cref="ArgumentException">
/// <paramref name="t"/> is not a preterminal, or one of the labels is not a CoreLabel.
/// </exception>
private static void ReplacePOSTag(Tree t, MorphoFeatureSpecification morpho)
{
    if (!t.IsPreTerminal())
    {
        throw new ArgumentException("Can only operate on preterminals");
    }
    if (!(t.Label() is CoreLabel))
    {
        throw new ArgumentException("Only operates on CoreLabels");
    }
    CoreLabel tagLabel = (CoreLabel)t.Label();

    Tree leaf = t.Children()[0];
    if (!(leaf.Label() is CoreLabel))
    {
        throw new ArgumentException("Only operates on CoreLabels");
    }
    CoreLabel leafLabel = (CoreLabel)leaf.Label();

    // Morphological analysis, or a synthesized stand-in of the form
    // "<tag>-<subcategory>--" / "<tag>---".
    string analysis = leafLabel.OriginalText();
    if (string.IsNullOrEmpty(analysis))
    {
        string baseTag = tagLabel.Value();
        string subCat = leafLabel.Category();
        string suffix = string.IsNullOrEmpty(subCat) ? "---" : "-" + subCat + "--";
        analysis = baseTag + suffix;
    }

    MorphoFeatures feats = morpho.StrToFeatures(analysis);
    string altTag = feats.GetAltTag();
    if (string.IsNullOrEmpty(altTag))
    {
        return;
    }
    tagLabel.SetValue(altTag);
    tagLabel.SetTag(altTag);
}
/// <summary>
/// Convert a treebank to factored lexicon events for fast iteration in the
/// optimizer.
/// </summary>
/// <param name="treebank">Trees whose leaves are CoreLabels carrying the morph analysis as original text.</param>
/// <param name="lexicon">Lexicon whose word, tag and morph indices are used to look up ids.</param>
/// <returns>
/// One event per token whose tag is present in the lexicon's tag index; tokens
/// with an unknown tag are logged and skipped.
/// </returns>
private static IList<FactoredLexiconEvent> TreebankToLexiconEvents(IList<Tree> treebank, Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon lexicon)
{
    IList<FactoredLexiconEvent> events = new List<FactoredLexiconEvent>(70000);
    foreach (Tree tree in treebank)
    {
        IList<ILabel> yield = tree.Yield();
        IList<ILabel> preterm = tree.PreTerminalYield();
        System.Diagnostics.Debug.Assert(yield.Count == preterm.Count);
        int yieldLen = yield.Count;
        for (int i = 0; i < yieldLen; ++i)
        {
            string tag = preterm[i].Value();
            int tagId = lexicon.tagIndex.IndexOf(tag);
            string word = yield[i].Value();
            int wordId = lexicon.wordIndex.IndexOf(word);

            // Keep the example only if its tag is known to the lexicon.
            if (tagId < 0)
            {
                log.Info("Discarding training example: " + word + " " + tag);
                continue;
            }

            string featureStr = ((CoreLabel)yield[i]).OriginalText();
            Pair<string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(word, featureStr);
            string lemma = lemmaMorph.First();
            string richTag = lemmaMorph.Second();

            // Trim before the lookup so the key has the same form as the
            // trimmed reduced tags that training adds to morphIndex.
            string reducedTag = lexicon.morphoSpec.StrToFeatures(richTag).ToString().Trim();
            if (reducedTag.Length == 0)
            {
                reducedTag = NoMorphAnalysis;
            }

            int lemmaId = lexicon.wordIndex.IndexOf(lemma);
            int morphId = lexicon.morphIndex.IndexOf(reducedTag);
            FactoredLexiconEvent @event = new FactoredLexiconEvent(wordId, tagId, lemmaId, morphId, i, word, featureStr);
            events.Add(@event);
        }
    }
    return events;
}
/// <summary>
/// Loads a treebank, counts words, tags, lemmas and rich/reduced morphological
/// tags, and prints summary statistics followed by per-word and per-tag
/// breakdowns of the reduced tags.
/// </summary>
/// <param name="args">Three arguments: language, treebank path, comma-separated feature names.</param>
public static void Main(string[] args)
{
    if (args.Length != 3)
    {
        System.Console.Error.Printf("Usage: java %s language filename features%n", typeof(TreebankFactoredLexiconStats).FullName);
        System.Environment.Exit(-1);
    }
    Language language = Language.ValueOf(args[0]);
    ITreebankLangParserParams tlpp = language.@params;
    if (language.Equals(Language.Arabic))
    {
        string[] options = new string[] { "-arabicFactored" };
        tlpp.SetOptionFlag(options, 0);
    }
    else
    {
        string[] options = new string[] { "-frenchFactored" };
        tlpp.SetOptionFlag(options, 0);
    }
    Treebank tb = tlpp.DiskTreebank();
    tb.LoadPath(args[1]);
    MorphoFeatureSpecification morphoSpec = language.Equals(Language.Arabic) ? new ArabicMorphoFeatureSpecification() : new FrenchMorphoFeatureSpecification();
    string[] features = args[2].Trim().Split(",");
    foreach (string feature in features)
    {
        morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(feature));
    }

    // Counters
    ICounter<string> wordTagCounter = new ClassicCounter<string>(30000);
    ICounter<string> morphTagCounter = new ClassicCounter<string>(500);
    ICounter<string> morphCounter = new ClassicCounter<string>(500);
    ICounter<string> wordCounter = new ClassicCounter<string>(30000);
    ICounter<string> tagCounter = new ClassicCounter<string>(300);
    ICounter<string> lemmaCounter = new ClassicCounter<string>(25000);
    ICounter<string> lemmaTagCounter = new ClassicCounter<string>(25000);
    ICounter<string> richTagCounter = new ClassicCounter<string>(1000);
    ICounter<string> reducedTagCounter = new ClassicCounter<string>(500);
    ICounter<string> reducedTagLemmaCounter = new ClassicCounter<string>(500);
    IDictionary<string, ICollection<string>> wordLemmaMap = Generics.NewHashMap();
    TwoDimensionalIntCounter<string, string> lemmaReducedTagCounter = new TwoDimensionalIntCounter<string, string>(30000);
    TwoDimensionalIntCounter<string, string> reducedTagTagCounter = new TwoDimensionalIntCounter<string, string>(500);
    TwoDimensionalIntCounter<string, string> tagReducedTagCounter = new TwoDimensionalIntCounter<string, string>(300);

    int numTrees = 0;
    foreach (Tree tree in tb)
    {
        foreach (Tree subTree in tree)
        {
            if (!subTree.IsLeaf())
            {
                tlpp.TransformTree(subTree, tree);
            }
        }
        IList<ILabel> pretermList = tree.PreTerminalYield();
        IList<ILabel> yield = tree.Yield();
        System.Diagnostics.Debug.Assert(yield.Count == pretermList.Count);
        int yieldLen = yield.Count;
        for (int i = 0; i < yieldLen; ++i)
        {
            string tag = pretermList[i].Value();
            string word = yield[i].Value();
            string morph = ((CoreLabel)yield[i]).OriginalText();

            // Note: if there is no lemma, then we use the surface form.
            Pair<string, string> lemmaTag = MorphoFeatureSpecification.SplitMorphString(word, morph);
            string lemma = lemmaTag.First();
            string richTag = lemmaTag.Second();

            // WSGDEBUG
            if (tag.Contains("MW"))
            {
                lemma += "-MWE";
            }
            lemmaCounter.IncrementCount(lemma);
            lemmaTagCounter.IncrementCount(lemma + tag);
            richTagCounter.IncrementCount(richTag);
            string reducedTag = morphoSpec.StrToFeatures(richTag).ToString();
            reducedTagCounter.IncrementCount(reducedTag);
            reducedTagLemmaCounter.IncrementCount(reducedTag + lemma);
            wordTagCounter.IncrementCount(word + tag);
            morphTagCounter.IncrementCount(morph + tag);
            morphCounter.IncrementCount(morph);
            wordCounter.IncrementCount(word);
            tagCounter.IncrementCount(tag);
            reducedTag = reducedTag.Equals(string.Empty) ? "NONE" : reducedTag;

            // Record the lemma for this word. The lemma must be added on the
            // first sighting too, otherwise every word seen once ends up with
            // an empty lemma set.
            ICollection<string> lemmasForWord;
            if (!wordLemmaMap.TryGetValue(word, out lemmasForWord))
            {
                lemmasForWord = Generics.NewHashSet(1);
                wordLemmaMap[word] = lemmasForWord;
            }
            lemmasForWord.Add(lemma);

            lemmaReducedTagCounter.IncrementCount(lemma, reducedTag);
            reducedTagTagCounter.IncrementCount(lemma + reducedTag, tag);
            tagReducedTagCounter.IncrementCount(tag, reducedTag);
        }
        ++numTrees;
    }

    // Barf...
    System.Console.Out.WriteLine("Language: " + language.ToString());
    System.Console.Out.Printf("#trees:\t%d%n", numTrees);
    System.Console.Out.Printf("#tokens:\t%d%n", (int)wordCounter.TotalCount());
    System.Console.Out.Printf("#words:\t%d%n", wordCounter.KeySet().Count);
    System.Console.Out.Printf("#tags:\t%d%n", tagCounter.KeySet().Count);
    System.Console.Out.Printf("#wordTagPairs:\t%d%n", wordTagCounter.KeySet().Count);
    System.Console.Out.Printf("#lemmas:\t%d%n", lemmaCounter.KeySet().Count);
    System.Console.Out.Printf("#lemmaTagPairs:\t%d%n", lemmaTagCounter.KeySet().Count);
    System.Console.Out.Printf("#feattags:\t%d%n", reducedTagCounter.KeySet().Count);
    System.Console.Out.Printf("#feattag+lemmas:\t%d%n", reducedTagLemmaCounter.KeySet().Count);
    System.Console.Out.Printf("#richtags:\t%d%n", richTagCounter.KeySet().Count);
    System.Console.Out.Printf("#richtag+lemma:\t%d%n", morphCounter.KeySet().Count);
    System.Console.Out.Printf("#richtag+lemmaTagPairs:\t%d%n", morphTagCounter.KeySet().Count);

    // Extra
    System.Console.Out.WriteLine("==================");
    StringBuilder sbNoLemma = new StringBuilder();
    StringBuilder sbMultLemmas = new StringBuilder();
    foreach (KeyValuePair<string, ICollection<string>> wordLemmas in wordLemmaMap)
    {
        string word = wordLemmas.Key;
        ICollection<string> lemmas = wordLemmas.Value;
        if (lemmas.Count == 0)
        {
            sbNoLemma.Append("NO LEMMAS FOR WORD: " + word + "\n");
            continue;
        }
        if (lemmas.Count > 1)
        {
            sbMultLemmas.Append("MULTIPLE LEMMAS: " + word + " " + SetToString(lemmas) + "\n");
            continue;
        }

        // Exactly one lemma: an enumerator must be advanced once before
        // Current refers to the first element.
        IEnumerator<string> lemmaItr = lemmas.GetEnumerator();
        lemmaItr.MoveNext();
        string lemma = lemmaItr.Current;

        ICollection<string> reducedTags = lemmaReducedTagCounter.GetCounter(lemma).KeySet();
        if (reducedTags.Count > 1)
        {
            System.Console.Out.Printf("%s --> %s%n", word, lemma);
            foreach (string reducedTag in reducedTags)
            {
                int count = lemmaReducedTagCounter.GetCount(lemma, reducedTag);
                string posTags = SetToString(reducedTagTagCounter.GetCounter(lemma + reducedTag).KeySet());
                System.Console.Out.Printf("\t%s\t%d\t%s%n", reducedTag, count, posTags);
            }
            System.Console.Out.WriteLine();
        }
    }
    System.Console.Out.WriteLine("==================");
    System.Console.Out.WriteLine(sbNoLemma.ToString());
    System.Console.Out.WriteLine(sbMultLemmas.ToString());
    System.Console.Out.WriteLine("==================");

    IList<string> tags = new List<string>(tagReducedTagCounter.FirstKeySet());
    tags.Sort();
    foreach (string tag_1 in tags)
    {
        System.Console.Out.WriteLine(tag_1);
        ICollection<string> reducedTags = tagReducedTagCounter.GetCounter(tag_1).KeySet();
        foreach (string reducedTag in reducedTags)
        {
            int count = tagReducedTagCounter.GetCount(tag_1, reducedTag);
            System.Console.Out.Printf("\t%s\t%d%n", reducedTag, count);
        }
        System.Console.Out.WriteLine();
    }
    System.Console.Out.WriteLine("==================");
}
/// <summary>
/// Consumes the option at <c>args[i]</c> (and its argument, for options that
/// take one) and applies it to this parameter set.
/// </summary>
/// <param name="args">The full option array.</param>
/// <param name="i">Index of the option to process.</param>
/// <returns>
/// The index of the next unprocessed element; equal to <paramref name="i"/>
/// when the option was not recognized or a required argument is missing.
/// </returns>
public override int SetOptionFlag(string[] args, int i)
{
    if (annotations.Contains(args[i]))
    {
        AddFeature(args[i]);
        i++;
    }
    else if (args[i].Equals("-collinizerRetainsPunctuation"))
    {
        optionsString.Append("Collinizer retains punctuation.\n");
        collinizerRetainsPunctuation = true;
        i++;
    }
    else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-headFinder") && (i + 1 < args.Length))
    {
        try
        {
            IHeadFinder hf = (IHeadFinder)System.Activator.CreateInstance(Sharpen.Runtime.GetType(args[i + 1]));
            SetHeadFinder(hf);
            optionsString.Append("HeadFinder: " + args[i + 1] + "\n");
        }
        catch (Exception e)
        {
            // Best effort: a head finder that cannot be loaded is logged and
            // the option is still consumed.
            log.Info(e);
            log.Info(this.GetType().FullName + ": Could not load head finder " + args[i + 1]);
        }
        i += 2;
    }
    else if (args[i].Equals("-xmlFormat"))
    {
        optionsString.Append("Reading trees in XML format.\n");
        readPennFormat = false;
        SetInputEncoding(tlp.GetEncoding());
        i++;
    }
    else if (args[i].Equals("-frenchFactored"))
    {
        foreach (string feature in factoredFeatures)
        {
            AddFeature(feature);
        }
        i++;
    }
    else if (args[i].Equals("-frenchMWMap") && (i + 1 < args.Length))
    {
        // Bounds-checked like -headFinder and -factlex: without its file
        // argument the option is left unconsumed instead of indexing past
        // the end of args.
        LoadMWMap(args[i + 1]);
        i += 2;
    }
    else if (args[i].Equals("-tsg"))
    {
        //wsg2011: These features should be removed for TSG extraction.
        //If they are retained, the resulting grammar seems to be too brittle....
        optionsString.Append("Removing baseline features: -markVN, -coord1");
        RemoveFeature("-markVN");
        optionsString.Append(" (removed -markVN)");
        RemoveFeature("-coord1");
        optionsString.Append(" (removed -coord1)\n");
        i++;
    }
    else if (args[i].Equals("-factlex") && (i + 1 < args.Length))
    {
        string activeFeats = SetupMorphoFeatures(args[i + 1]);
        optionsString.Append("Factored Lexicon: active features: ").Append(activeFeats);
        // WSGDEBUG Maybe add -mweTag in place of -tagPAFr?
        RemoveFeature("-tagPAFr");
        optionsString.Append(" (removed -tagPAFr)\n");
        // Add -mweTag
        string[] option = new string[] { "-mweTag" };
        SetOptionFlag(option, 0);
        i += 2;
    }
    else if (args[i].Equals("-noFeatures"))
    {
        foreach (string feature in annotations.Keys)
        {
            RemoveFeature(feature);
        }
        optionsString.Append("Removed all manual features.\n");
        i++;
    }
    else if (args[i].Equals("-ccTagsetAnnotations"))
    {
        tagSpec = new FrenchMorphoFeatureSpecification();
        tagSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Other);
        optionsString.Append("Adding CC tagset as POS state splits.\n");
        ++i;
    }
    return i;
}
/// <summary>
/// Creates a factored lexicon over the given word and tag indices.
/// </summary>
/// <param name="op">Parser options, forwarded to the base lexicon.</param>
/// <param name="morphoSpec">Morphological feature specification stored on this lexicon.</param>
/// <param name="wordIndex">Word index, forwarded to the base lexicon.</param>
/// <param name="tagIndex">Tag index, forwarded to the base lexicon.</param>
public FactoredLexicon(
    Options op,
    MorphoFeatureSpecification morphoSpec,
    IIndex<string> wordIndex,
    IIndex<string> tagIndex)
    : base(op, wordIndex, tagIndex)
{
    this.morphoSpec = morphoSpec;
}
/// <summary>
/// Trains a factored lexicon on a training treebank, tags the events of a
/// development treebank with it, and reports tagging accuracy plus the share
/// of errors per gold tag.
/// </summary>
/// <param name="args">Four arguments: language, comma-separated features, training file, dev file.</param>
public static void Main(string[] args)
{
    if (args.Length != 4)
    {
        System.Console.Error.Printf("Usage: java %s language features train_file dev_file%n", typeof(Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon).FullName);
        System.Environment.Exit(-1);
    }

    // Command line options
    Language language = Language.ValueOf(args[0]);
    ITreebankLangParserParams tlpp = language.@params;
    Treebank trainTreebank = tlpp.DiskTreebank();
    trainTreebank.LoadPath(args[2]);
    Treebank devTreebank = tlpp.DiskTreebank();
    devTreebank.LoadPath(args[3]);

    MorphoFeatureSpecification morphoSpec;
    Options options = GetOptions(language);
    if (language.Equals(Language.Arabic))
    {
        morphoSpec = new ArabicMorphoFeatureSpecification();
        tlpp.SetOptionFlag(new string[] { "-arabicFactored" }, 0);
    }
    else if (language.Equals(Language.French))
    {
        morphoSpec = new FrenchMorphoFeatureSpecification();
        tlpp.SetOptionFlag(new string[] { "-frenchFactored" }, 0);
    }
    else
    {
        throw new NotSupportedException();
    }

    string featureList = args[1];
    string[] featureNames = featureList.Trim().Split(",");
    for (int f = 0; f < featureNames.Length; f++)
    {
        morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(featureNames[f]));
    }
    System.Console.Out.WriteLine("Language: " + language.ToString());
    System.Console.Out.WriteLine("Features: " + args[1]);

    // Create word and tag indices.
    // The trees are collected into a list because training takes a collection.
    System.Console.Out.Write("Loading training trees...");
    IList<Tree> trainTrees = new List<Tree>(19000);
    IIndex<string> wordIndex = new HashIndex<string>();
    IIndex<string> tagIndex = new HashIndex<string>();
    foreach (Tree tree in trainTreebank)
    {
        foreach (Tree subTree in tree)
        {
            if (subTree.IsLeaf())
            {
                continue;
            }
            tlpp.TransformTree(subTree, tree);
        }
        trainTrees.Add(tree);
    }
    System.Console.Out.Printf("Done! (%d trees)%n", trainTrees.Count);

    // Setup and train the lexicon.
    System.Console.Out.Write("Collecting sufficient statistics for lexicon...");
    Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon lexicon = new Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon(options, morphoSpec, wordIndex, tagIndex);
    lexicon.InitializeTraining(trainTrees.Count);
    lexicon.Train(trainTrees, null);
    lexicon.FinishTraining();
    System.Console.Out.WriteLine("Done!");
    trainTrees = null;

    // Load the tuning set
    System.Console.Out.Write("Loading tuning set...");
    IList<FactoredLexiconEvent> tuningSet = GetTuningSet(devTreebank, lexicon, tlpp);
    System.Console.Out.Printf("...Done! (%d events)%n", tuningSet.Count);

    // Print the probabilities that we obtain
    // TODO(spenceg): Implement tagging accuracy with FactLex
    int nCorrect = 0;
    ICounter<string> errors = new ClassicCounter<string>();
    foreach (FactoredLexiconEvent @event in tuningSet)
    {
        IEnumerator<IntTaggedWord> candidates = lexicon.RuleIteratorByWord(@event.Word(), @event.GetLoc(), @event.FeatureStr());
        ICounter<int> logScores = new ClassicCounter<int>();
        bool sawCandidate = false;
        int goldTagId = -1;
        while (candidates.MoveNext())
        {
            sawCandidate = true;
            IntTaggedWord candidate = candidates.Current;
            if (candidate.Tag() == @event.TagId())
            {
                log.Info("GOLD-");
                goldTagId = candidate.Tag();
            }
            float tagScore = lexicon.Score(candidate, @event.GetLoc(), @event.Word(), @event.FeatureStr());
            logScores.IncrementCount(candidate.Tag(), tagScore);
        }

        if (sawCandidate)
        {
            // Score the tagging: the hypothesis is the best-scoring tag.
            int hypTagId = Counters.Argmax(logScores);
            if (hypTagId == goldTagId)
            {
                ++nCorrect;
            }
            else
            {
                string goldTag;
                if (goldTagId < 0)
                {
                    goldTag = "UNSEEN";
                }
                else
                {
                    goldTag = lexicon.tagIndex.Get(goldTagId);
                }
                errors.IncrementCount(goldTag);
            }
        }
        else
        {
            System.Console.Error.Printf("NO TAGGINGS: %s %s%n", @event.Word(), @event.FeatureStr());
        }
        log.Info();
    }

    // Output accuracy
    double acc = nCorrect / (double)tuningSet.Count;
    System.Console.Error.Printf("%n%nACCURACY: %.2f%n%n", acc * 100.0);
    log.Info("% of errors by type:");
    IList<string> biggestKeys = new List<string>(errors.KeySet());
    biggestKeys.Sort(Counters.ToComparator(errors, false, true));
    Counters.Normalize(errors);
    foreach (string key in biggestKeys)
    {
        System.Console.Error.Printf("%s\t%.2f%n", key, errors.GetCount(key) * 100.0);
    }
}
/// <summary>This method should populate wordIndex, tagIndex, and morphIndex.</summary>
/// <param name="trees">Trees supplying the tags (and the words, when <paramref name="rawTrees"/> is null).</param>
/// <param name="rawTrees">
/// Optional trees, parallel to <paramref name="trees"/>, whose leaves supply
/// the words and morph analyses; may be null.
/// </param>
/// <exception cref="InvalidOperationException">
/// <paramref name="rawTrees"/> is non-null but contains fewer trees than <paramref name="trees"/>.
/// </exception>
public override void Train(ICollection<Tree> trees, ICollection<Tree> rawTrees)
{
    double weight = 1.0;
    // Train uw model on words
    uwModelTrainer.Train(trees, weight);

    double numTrees = trees.Count;
    IEnumerator<Tree> rawTreesItr = rawTrees == null ? null : rawTrees.GetEnumerator();
    IEnumerator<Tree> treeItr = trees.GetEnumerator();

    // Train factored lexicon on lemmas and morph tags
    int treeId = 0;
    while (treeItr.MoveNext())
    {
        Tree tree = treeItr.Current;

        // CoreLabels, with morph analysis in the originalText annotation
        IList<ILabel> wordLabels;
        if (rawTreesItr == null)
        {
            wordLabels = tree.Yield();
        }
        else
        {
            // The raw-tree enumerator must be advanced in step with the tree
            // enumerator; Current is not valid until MoveNext has been called.
            if (!rawTreesItr.MoveNext())
            {
                throw new InvalidOperationException("rawTrees contains fewer trees than trees");
            }
            wordLabels = rawTreesItr.Current.Yield();
        }

        // Annotated, binarized tree for the tags (labels are usually CategoryWordTag)
        IList<ILabel> pretermYield = tree.PreTerminalYield();
        int yieldLen = wordLabels.Count;
        for (int i = 0; i < yieldLen; ++i)
        {
            string word = wordLabels[i].Value();
            int wordId = wordIndex.AddToIndex(word);

            string tag = pretermYield[i].Value();
            int tagId = tagIndex.AddToIndex(tag);

            // Use the word as backup if there is no lemma
            string featureStr = ((CoreLabel)wordLabels[i]).OriginalText();
            Pair<string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(word, featureStr);
            string lemma = lemmaMorph.First();
            int lemmaId = wordIndex.AddToIndex(lemma);
            string richMorphTag = lemmaMorph.Second();
            string reducedMorphTag = morphoSpec.StrToFeatures(richMorphTag).ToString().Trim();
            if (reducedMorphTag.Length == 0)
            {
                reducedMorphTag = NoMorphAnalysis;
            }
            int morphId = morphIndex.AddToIndex(reducedMorphTag);

            // Seen event counts
            wordTag.IncrementCount(wordId, tagId);
            lemmaTag.IncrementCount(lemmaId, tagId);
            morphTag.IncrementCount(morphId, tagId);
            tagCounter.IncrementCount(tagId);

            // Unseen event counts: only collected once the configured
            // fraction of the trees has been processed.
            if (treeId > op.trainOptions.fractionBeforeUnseenCounting * numTrees)
            {
                if (!wordTag.FirstKeySet().Contains(wordId) || wordTag.GetCounter(wordId).TotalCount() < 2)
                {
                    wordTagUnseen.IncrementCount(tagId);
                }
                if (!lemmaTag.FirstKeySet().Contains(lemmaId) || lemmaTag.GetCounter(lemmaId).TotalCount() < 2)
                {
                    lemmaTagUnseen.IncrementCount(tagId);
                }
                if (!morphTag.FirstKeySet().Contains(morphId) || morphTag.GetCounter(morphId).TotalCount() < 2)
                {
                    morphTagUnseen.IncrementCount(tagId);
                }
            }
        }
        ++treeId;

        // Progress output in debug mode
        if (Debug && (treeId % 100) == 0)
        {
            System.Console.Error.Printf("[%d]", treeId);
        }
        if (Debug && (treeId % 10000) == 0)
        {
            log.Info();
        }
    }
}
/// <summary>
/// Normalizes a whole tree: prunes and splices nodes with the configured
/// filters, moves morphological analyses from leaf values into the
/// original-text annotation, sets tags on preterminals, inserts a dummy
/// preterminal above leaves that hang directly off phrasal nodes, applies the
/// optional PRD-verb and NP-subject annotations, and finally wraps the result
/// under the root label.
/// </summary>
/// <param name="tree">The tree to normalize.</param>
/// <param name="tf">Factory used to create new nodes.</param>
/// <returns>
/// The normalized tree, or null when stripping empty-labelled wrappers leaves nothing.
/// </returns>
public override Tree NormalizeWholeTree(Tree tree, ITreeFactory tf)
{
    tree = tree.Prune(emptyFilter, tf).SpliceOut(aOverAFilter, tf);
    foreach (Tree t in tree)
    {
        if (t.IsLeaf())
        {
            //Strip off morphological analyses and place them in the OriginalTextAnnotation, which is
            //specified by HasContext.
            if (t.Value().Contains(MorphoFeatureSpecification.MorphoMark))
            {
                string[] toks = t.Value().Split(MorphoFeatureSpecification.MorphoMark);
                if (toks.Length != 2)
                {
                    log.Err(string.Format("{0}: Word contains malformed morph annotation: {1}", this.GetType().FullName, t.Value()));
                }
                else if (t.Label() is CoreLabel)
                {
                    CoreLabel cl = (CoreLabel)t.Label();
                    cl.SetValue(string.Intern(toks[0].Trim()));
                    cl.SetWord(string.Intern(toks[0].Trim()));
                    Pair<string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(toks[0], toks[1]);
                    string lemma = lemmaMorph.First();
                    string morphAnalysis = lemmaMorph.Second();
                    if (lemma.Equals(toks[0]))
                    {
                        // The lemma is the surface form: keep the analysis as given.
                        cl.SetOriginalText(string.Intern(toks[1].Trim()));
                    }
                    else
                    {
                        // TODO(spenceg): Does this help?
                        string newLemma = lexMapper.Map(null, lemma);
                        if (newLemma == null || newLemma.Trim().IsEmpty())
                        {
                            newLemma = lemma;
                        }
                        string newMorphAnalysis = newLemma + MorphoFeatureSpecification.LemmaMark + morphAnalysis;
                        cl.SetOriginalText(string.Intern(newMorphAnalysis));
                    }
                }
                else
                {
                    log.Error(string.Format("{0}: Cannot store morph analysis in non-CoreLabel: {1}", this.GetType().FullName, t.Label().GetType().FullName));
                }
            }
        }
        else if (t.IsPreTerminal())
        {
            if (t.Value() == null || t.Value().IsEmpty())
            {
                log.Warn(string.Format("{0}: missing tag for {1}", this.GetType().FullName, t.PennString()));
            }
            else if (t.Label() is IHasTag)
            {
                ((IHasTag)t.Label()).SetTag(t.Value());
            }
        }
        else
        {
            //Phrasal nodes
            // there are some nodes "/" missing preterminals. We'll splice in a tag for these.
            int nk = t.NumChildren();
            IList<Tree> newKids = new List<Tree>(nk);
            for (int j = 0; j < nk; j++)
            {
                Tree child = t.GetChild(j);
                if (child.IsLeaf())
                {
                    log.Warn(string.Format("{0}: Splicing in DUMMYTAG for {1}", this.GetType().FullName, t.ToString()));
                    newKids.Add(tf.NewTreeNode("DUMMYTAG", Java.Util.Collections.SingletonList(child)));
                }
                else
                {
                    newKids.Add(child);
                }
            }
            t.SetChildren(newKids);
        }
    }
    //Every node in the tree has now been processed
    //
    // Additional processing for specific phrasal annotations
    //
    // special global coding for moving PRD annotation from constituent to verb tag.
    if (markPRDverb)
    {
        TregexMatcher m = prdVerbPattern.Matcher(tree);
        Tree match = null;
        while (m.Find())
        {
            // Only annotate when the matched node differs from the previous match.
            if (m.GetMatch() != match)
            {
                match = m.GetMatch();
                match.Label().SetValue(match.Label().Value() + "-PRDverb");
                Tree prd = m.GetNode("prd");
                prd.Label().SetValue(base.NormalizeNonterminal(prd.Label().Value()));
            }
        }
    }
    //Mark *only* subjects in verb-initial clauses
    if (retainNPSbj)
    {
        TregexMatcher m = npSbjPattern.Matcher(tree);
        while (m.Find())
        {
            Tree match = m.GetMatch();
            match.Label().SetValue("NP");
        }
    }
    if (tree.IsPreTerminal())
    {
        // The whole tree is a bare tag: bad!
        string val = tree.Label().Value();
        if (val.Equals("CC") || val.StartsWith("PUNC") || val.Equals("CONJ"))
        {
            log.Warn(string.Format("{0}: Bare tagged word being wrapped in FRAG {1}", this.GetType().FullName, tree.PennString()));
            tree = tf.NewTreeNode("FRAG", Java.Util.Collections.SingletonList(tree));
        }
        else
        {
            log.Warn(string.Format("{0}: Bare tagged word {1}", this.GetType().FullName, tree.PennString()));
        }
    }
    //Add start symbol so that the root has only one sub-state. Escape any enclosing brackets.
    //If the "tree" consists entirely of enclosing brackets e.g. ((())) then this method
    //will return null. In this case, readers e.g. PennTreeReader will try to read the next tree.
    while (tree != null && (tree.Value() == null || tree.Value().IsEmpty()) && tree.NumChildren() <= 1)
    {
        tree = tree.FirstChild();
    }
    if (tree != null && !tree.Value().Equals(rootLabel))
    {
        tree = tf.NewTreeNode(rootLabel, Java.Util.Collections.SingletonList(tree));
    }
    return tree;
}