/// <summary>
/// Scores a word/tag pair as the sum of the log of <c>ProbWordTag</c> and the
/// log of <c>ProbMorphTag</c> for the morphological tag derived from
/// <paramref name="featureSpec"/>.
/// </summary>
/// <param name="iTW">Supplies the word id and tag id being scored.</param>
/// <param name="loc">Position value forwarded unchanged to <c>ProbWordTag</c>.</param>
/// <param name="word">Word string forwarded to <c>SplitMorphString</c> and <c>ProbWordTag</c>.</param>
/// <param name="featureSpec">Morphological string that is split into lemma and rich morphological tag.</param>
/// <returns>
/// <c>0.0f</c> for the boundary word paired with the boundary tag; otherwise the
/// summed log score as a float when it is greater than -100, and negative
/// infinity for anything else.
/// </returns>
public override float Score(IntTaggedWord iTW, int loc, string word, string featureSpec)
{
    int wordId = iTW.Word();
    int tagId = iTW.Tag();

    // Force the 1-best path to go through the boundary symbol (deterministic tagging).
    int boundaryWordId = wordIndex.IndexOf(LexiconConstants.Boundary);
    int boundaryTagIndex = tagIndex.IndexOf(LexiconConstants.BoundaryTag);
    bool isBoundaryPair = wordId == boundaryWordId && tagId == boundaryTagIndex;
    if (isBoundaryPair)
    {
        return 0.0f;
    }

    // Morphological features. The tag string and lemma id are looked up but do
    // not take part in the score below.
    string tagName = tagIndex.Get(iTW.Tag());
    Pair<string, string> lemmaAndMorph = MorphoFeatureSpecification.SplitMorphString(word, featureSpec);
    string lemmaText = lemmaAndMorph.First();
    int lemmaIndex = wordIndex.IndexOf(lemmaText);
    string fullMorphTag = lemmaAndMorph.Second();

    string morphKey = morphoSpec.StrToFeatures(fullMorphTag).ToString().Trim();
    if (morphKey.Length == 0)
    {
        // No active features survived the reduction: use the placeholder analysis.
        morphKey = NoMorphAnalysis;
    }
    int morphId = morphIndex.AddToIndex(morphKey);

    // Score the individual factors. The lemma factor is currently switched off
    // and contributes a constant 0.
    double logWordGivenTag = Math.Log(ProbWordTag(word, loc, wordId, tagId));
    double logLemmaGivenTag = 0.0;
    double logMorphGivenTag = Math.Log(ProbMorphTag(tagId, morphId));
    double total = logWordGivenTag + logLemmaGivenTag + logMorphGivenTag;

    // Filter low probability taggings.
    if (total > -100.0)
    {
        return (float)total;
    }
    return float.NegativeInfinity;
}
/// <summary>
/// Applies the base class transformation and then, for preterminal nodes when
/// <c>tagSpec</c> is set, rewrites the node category using the morphological
/// analysis stored in the original text of the node's first child.
/// </summary>
/// <param name="t">The node to transform.</param>
/// <param name="root">The root of the tree containing <paramref name="t"/>; forwarded to the base transformation.</param>
/// <returns>The transformed node, with its value (and tag, when the label is an <c>IHasTag</c>) updated.</returns>
/// <exception cref="Exception">
/// Thrown when a preterminal's child label is not a <c>CoreLabel</c> or carries
/// no original text to read the morphological analysis from.
/// </exception>
public override Tree TransformTree(Tree t, Tree root)
{
    // Perform tregex-powered annotations
    t = base.TransformTree(t, root);
    string cat = t.Value();

    // Add morphosyntactic features if this is a POS tag
    if (t.IsPreTerminal() && tagSpec != null)
    {
        if (!(t.FirstChild().Label() is CoreLabel) || ((CoreLabel)t.FirstChild().Label()).OriginalText() == null)
        {
            // string.Format uses indexed placeholders; Java-style "%s" would be
            // emitted literally and the arguments silently dropped.
            throw new Exception(string.Format("{0}: Term lacks morpho analysis: {1}", this.GetType().FullName, t.ToString()));
        }
        string morphoStr = ((CoreLabel)t.FirstChild().Label()).OriginalText();
        Pair<string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(string.Empty, morphoStr);
        MorphoFeatures feats = tagSpec.StrToFeatures(lemmaMorph.Second());
        cat = feats.GetTag(cat);
    }

    // Update the label(s)
    t.SetValue(cat);
    if (t.IsPreTerminal() && t.Label() is IHasTag)
    {
        ((IHasTag)t.Label()).SetTag(cat);
    }
    return t;
}
/// <summary>
/// Replaces the value and tag of a preterminal's label with the alternate tag
/// produced by <paramref name="morpho"/>, when that alternate tag is non-empty.
/// </summary>
/// <param name="t">A preterminal node whose label and first child's label are both <c>CoreLabel</c>s.</param>
/// <param name="morpho">Specification used to turn the morphological string into features.</param>
/// <exception cref="ArgumentException">
/// Thrown when <paramref name="t"/> is not a preterminal, or when its label or
/// its first child's label is not a <c>CoreLabel</c>.
/// </exception>
private static void ReplacePOSTag(Tree t, MorphoFeatureSpecification morpho)
{
    if (!t.IsPreTerminal())
    {
        throw new ArgumentException("Can only operate on preterminals");
    }

    CoreLabel posLabel = t.Label() as CoreLabel;
    if (posLabel == null)
    {
        throw new ArgumentException("Only operates on CoreLabels");
    }

    Tree leaf = t.Children()[0];
    CoreLabel leafLabel = leaf.Label() as CoreLabel;
    if (leafLabel == null)
    {
        throw new ArgumentException("Only operates on CoreLabels");
    }

    // Morphological analysis: use the leaf's original text when present,
    // otherwise synthesize one from the POS value and the POS subcategory.
    string analysis = leafLabel.OriginalText();
    if (string.IsNullOrEmpty(analysis))
    {
        string posValue = posLabel.Value();
        string subCategory = leafLabel.Category();
        string suffix;
        if (string.IsNullOrEmpty(subCategory))
        {
            suffix = "---";
        }
        else
        {
            suffix = "-" + subCategory + "--";
        }
        analysis = posValue + suffix;
    }

    MorphoFeatures features = morpho.StrToFeatures(analysis);
    string altTag = features.GetAltTag();
    if (string.IsNullOrEmpty(altTag))
    {
        // No alternate tag available: leave the label untouched.
        return;
    }
    posLabel.SetValue(altTag);
    posLabel.SetTag(altTag);
}
/// <summary>
/// Maps a raw POS tag to a universal POS tag annotated with morphological features.
/// </summary>
/// <remarks>
/// First map to the LDC short tags. Then map to the Universal POS. Then add
/// morphological annotations.
/// </remarks>
/// <param name="posTag">The raw POS tag; surrounding whitespace is trimmed before lookup.</param>
/// <param name="terminal">Not read by this method.</param>
/// <returns>
/// The trimmed raw tag when no LDC short tag is known for it; otherwise the
/// result of <c>GetTag</c> applied to the universal tag (or to the short tag
/// when no universal mapping exists).
/// </returns>
public override string Map(string posTag, string terminal)
{
    string rawTag = posTag.Trim();

    // Check membership before indexing: if the map is a .NET dictionary, its
    // indexer throws for a missing key instead of returning null, which would
    // make the fallback below unreachable.
    string shortTag;
    if (tagsToEscape.Contains(rawTag))
    {
        shortTag = rawTag;
    }
    else if (tagMap.Contains(rawTag))
    {
        shortTag = tagMap[rawTag];
    }
    else
    {
        shortTag = null;
    }

    if (shortTag == null)
    {
        System.Console.Error.Printf("%s: No LDC shortened tag for %s%n", this.GetType().FullName, rawTag);
        return rawTag;
    }

    // Same ordering concern as above: test for the key first, then read it.
    string universalTag;
    if (universalMap.Contains(shortTag))
    {
        universalTag = universalMap[shortTag];
    }
    else
    {
        System.Console.Error.Printf("%s: No universal tag for LDC tag %s%n", this.GetType().FullName, shortTag);
        universalTag = shortTag;
    }

    MorphoFeatures feats = new MorphoFeatures(morphoSpec.StrToFeatures(rawTag));
    string functionalTag = feats.GetTag(universalTag);
    return functionalTag;
}
// private static String stripTag(String tag) {
//   if (tag.startsWith("DT")) {
//     String newTag = tag.substring(2, tag.length());
//     return newTag.length() > 0 ? newTag : tag;
//   }
//   return tag;
// }

/// <summary>
/// Command-line entry point. Loads a treebank, counts words, tags, lemmas and
/// morphological tags over its preterminal/terminal yields, and prints summary
/// statistics followed by per-word and per-tag breakdowns to standard output.
/// </summary>
/// <param name="args">
/// Exactly three arguments: the language name, the treebank path, and a
/// comma-separated list of morphological feature type names to activate.
/// When the argument count differs, a usage message is printed and the
/// process exits with code -1.
/// </param>
public static void Main(string[] args)
{
    if (args.Length != 3)
    {
        System.Console.Error.Printf("Usage: java %s language filename features%n", typeof(TreebankFactoredLexiconStats).FullName);
        System.Environment.Exit(-1);
    }
    Language language = Language.ValueOf(args[0]);
    ITreebankLangParserParams tlpp = language.@params;
    if (language.Equals(Language.Arabic))
    {
        string[] options = new string[] { "-arabicFactored" };
        tlpp.SetOptionFlag(options, 0);
    }
    else
    {
        string[] options = new string[] { "-frenchFactored" };
        tlpp.SetOptionFlag(options, 0);
    }
    Treebank tb = tlpp.DiskTreebank();
    tb.LoadPath(args[1]);
    MorphoFeatureSpecification morphoSpec = language.Equals(Language.Arabic) ? new ArabicMorphoFeatureSpecification() : new FrenchMorphoFeatureSpecification();
    string[] features = args[2].Trim().Split(",");
    foreach (string feature in features)
    {
        morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(feature));
    }

    // Counters
    ICounter<string> wordTagCounter = new ClassicCounter<string>(30000);
    ICounter<string> morphTagCounter = new ClassicCounter<string>(500);
    // Counter<String> signatureTagCounter = new ClassicCounter<String>();
    ICounter<string> morphCounter = new ClassicCounter<string>(500);
    ICounter<string> wordCounter = new ClassicCounter<string>(30000);
    ICounter<string> tagCounter = new ClassicCounter<string>(300);
    ICounter<string> lemmaCounter = new ClassicCounter<string>(25000);
    ICounter<string> lemmaTagCounter = new ClassicCounter<string>(25000);
    ICounter<string> richTagCounter = new ClassicCounter<string>(1000);
    ICounter<string> reducedTagCounter = new ClassicCounter<string>(500);
    ICounter<string> reducedTagLemmaCounter = new ClassicCounter<string>(500);
    IDictionary<string, ICollection<string>> wordLemmaMap = Generics.NewHashMap();
    TwoDimensionalIntCounter<string, string> lemmaReducedTagCounter = new TwoDimensionalIntCounter<string, string>(30000);
    TwoDimensionalIntCounter<string, string> reducedTagTagCounter = new TwoDimensionalIntCounter<string, string>(500);
    TwoDimensionalIntCounter<string, string> tagReducedTagCounter = new TwoDimensionalIntCounter<string, string>(300);

    int numTrees = 0;
    foreach (Tree tree in tb)
    {
        foreach (Tree subTree in tree)
        {
            if (!subTree.IsLeaf())
            {
                tlpp.TransformTree(subTree, tree);
            }
        }
        IList<ILabel> pretermList = tree.PreTerminalYield();
        IList<ILabel> yield = tree.Yield();
        System.Diagnostics.Debug.Assert(yield.Count == pretermList.Count);
        int yieldLen = yield.Count;
        for (int i = 0; i < yieldLen; ++i)
        {
            string tag = pretermList[i].Value();
            string word = yield[i].Value();
            string morph = ((CoreLabel)yield[i]).OriginalText();

            // Note: if there is no lemma, then we use the surface form.
            Pair<string, string> lemmaTag = MorphoFeatureSpecification.SplitMorphString(word, morph);
            string lemma = lemmaTag.First();
            string richTag = lemmaTag.Second();

            // WSGDEBUG
            if (tag.Contains("MW"))
            {
                lemma += "-MWE";
            }
            lemmaCounter.IncrementCount(lemma);
            lemmaTagCounter.IncrementCount(lemma + tag);
            richTagCounter.IncrementCount(richTag);
            string reducedTag = morphoSpec.StrToFeatures(richTag).ToString();
            reducedTagCounter.IncrementCount(reducedTag);
            reducedTagLemmaCounter.IncrementCount(reducedTag + lemma);
            wordTagCounter.IncrementCount(word + tag);
            morphTagCounter.IncrementCount(morph + tag);
            morphCounter.IncrementCount(morph);
            wordCounter.IncrementCount(word);
            tagCounter.IncrementCount(tag);

            // The two-dimensional counters below key an empty reduced tag as "NONE".
            reducedTag = reducedTag.Equals(string.Empty) ? "NONE" : reducedTag;
            if (wordLemmaMap.Contains(word))
            {
                wordLemmaMap[word].Add(lemma);
            }
            else
            {
                // Record the lemma on the first sighting too; otherwise a word
                // seen only once ends up with an empty lemma set.
                ICollection<string> lemmas = Generics.NewHashSet(1);
                lemmas.Add(lemma);
                wordLemmaMap[word] = lemmas;
            }
            lemmaReducedTagCounter.IncrementCount(lemma, reducedTag);
            reducedTagTagCounter.IncrementCount(lemma + reducedTag, tag);
            tagReducedTagCounter.IncrementCount(tag, reducedTag);
        }
        ++numTrees;
    }

    // Summary statistics
    System.Console.Out.WriteLine("Language: " + language.ToString());
    System.Console.Out.Printf("#trees:\t%d%n", numTrees);
    System.Console.Out.Printf("#tokens:\t%d%n", (int)wordCounter.TotalCount());
    System.Console.Out.Printf("#words:\t%d%n", wordCounter.KeySet().Count);
    System.Console.Out.Printf("#tags:\t%d%n", tagCounter.KeySet().Count);
    System.Console.Out.Printf("#wordTagPairs:\t%d%n", wordTagCounter.KeySet().Count);
    System.Console.Out.Printf("#lemmas:\t%d%n", lemmaCounter.KeySet().Count);
    System.Console.Out.Printf("#lemmaTagPairs:\t%d%n", lemmaTagCounter.KeySet().Count);
    System.Console.Out.Printf("#feattags:\t%d%n", reducedTagCounter.KeySet().Count);
    System.Console.Out.Printf("#feattag+lemmas:\t%d%n", reducedTagLemmaCounter.KeySet().Count);
    System.Console.Out.Printf("#richtags:\t%d%n", richTagCounter.KeySet().Count);
    System.Console.Out.Printf("#richtag+lemma:\t%d%n", morphCounter.KeySet().Count);
    System.Console.Out.Printf("#richtag+lemmaTagPairs:\t%d%n", morphTagCounter.KeySet().Count);

    // Extra: for every word with exactly one lemma, list the reduced tags of
    // that lemma when there is more than one. Words with zero or several
    // lemmas are collected and reported afterwards.
    System.Console.Out.WriteLine("==================");
    StringBuilder sbNoLemma = new StringBuilder();
    StringBuilder sbMultLemmas = new StringBuilder();
    foreach (KeyValuePair<string, ICollection<string>> wordLemmas in wordLemmaMap)
    {
        string word = wordLemmas.Key;
        ICollection<string> lemmas = wordLemmas.Value;
        if (lemmas.Count == 0)
        {
            sbNoLemma.Append("NO LEMMAS FOR WORD: " + word + "\n");
            continue;
        }
        if (lemmas.Count > 1)
        {
            sbMultLemmas.Append("MULTIPLE LEMMAS: " + word + " " + SetToString(lemmas) + "\n");
            continue;
        }

        // Exactly one lemma remains. An enumerator's Current is not valid
        // before the first MoveNext, so take the element via foreach.
        string lemma = null;
        foreach (string onlyLemma in lemmas)
        {
            lemma = onlyLemma;
            break;
        }

        ICollection<string> reducedTags = lemmaReducedTagCounter.GetCounter(lemma).KeySet();
        if (reducedTags.Count > 1)
        {
            System.Console.Out.Printf("%s --> %s%n", word, lemma);
            foreach (string reducedTag in reducedTags)
            {
                int count = lemmaReducedTagCounter.GetCount(lemma, reducedTag);
                string posTags = SetToString(reducedTagTagCounter.GetCounter(lemma + reducedTag).KeySet());
                System.Console.Out.Printf("\t%s\t%d\t%s%n", reducedTag, count, posTags);
            }
            System.Console.Out.WriteLine();
        }
    }
    System.Console.Out.WriteLine("==================");
    System.Console.Out.WriteLine(sbNoLemma.ToString());
    System.Console.Out.WriteLine(sbMultLemmas.ToString());
    System.Console.Out.WriteLine("==================");

    // Per-tag breakdown of reduced tags, with the tags in sorted order.
    IList<string> tags = new List<string>(tagReducedTagCounter.FirstKeySet());
    tags.Sort();
    foreach (string tag_1 in tags)
    {
        System.Console.Out.WriteLine(tag_1);
        ICollection<string> reducedTags = tagReducedTagCounter.GetCounter(tag_1).KeySet();
        foreach (string reducedTag in reducedTags)
        {
            int count = tagReducedTagCounter.GetCount(tag_1, reducedTag);
            // reducedTag = reducedTag.equals("") ? "NONE" : reducedTag;
            System.Console.Out.Printf("\t%s\t%d%n", reducedTag, count);
        }
        System.Console.Out.WriteLine();
    }
    System.Console.Out.WriteLine("==================");
}