/// <summary>
/// Runs the base transformation on <paramref name="t"/> and then relabels the node.
/// For a preterminal, when <c>tagSpec</c> is set, the new category is derived from the
/// morphological analysis stored as the original text of the terminal's CoreLabel.
/// </summary>
/// <param name="t">The node to transform; its label is updated in place.</param>
/// <param name="root">Root of the enclosing tree; forwarded to the base transformation.</param>
/// <returns>The node returned by the base transformation, with its value (and tag, for preterminals whose label implements IHasTag) updated.</returns>
/// <exception cref="System.Exception">
/// Thrown when <c>tagSpec</c> is set and a preterminal's child either has no CoreLabel
/// or has a CoreLabel whose original text is null.
/// </exception>
public override Tree TransformTree(Tree t, Tree root)
{
    // Perform tregex-powered annotations
    t = base.TransformTree(t, root);
    string cat = t.Value();

    // Add morphosyntactic features if this is a POS tag
    if (t.IsPreTerminal() && tagSpec != null)
    {
        if (!(t.FirstChild().Label() is CoreLabel) || ((CoreLabel)t.FirstChild().Label()).OriginalText() == null)
        {
            // Composite-format placeholders are required here: string.Format does not
            // understand Java-style "%s" and would emit the template verbatim.
            throw new Exception(string.Format("{0}: Term lacks morpho analysis: {1}", this.GetType().FullName, t.ToString()));
        }
        string morphoStr = ((CoreLabel)t.FirstChild().Label()).OriginalText();
        Pair<string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(string.Empty, morphoStr);
        // Only the second element of the pair is fed to the tag specification.
        MorphoFeatures feats = tagSpec.StrToFeatures(lemmaMorph.Second());
        cat = feats.GetTag(cat);
    }

    // Update the label(s)
    t.SetValue(cat);
    if (t.IsPreTerminal() && t.Label() is IHasTag)
    {
        ((IHasTag)t.Label()).SetTag(cat);
    }
    return t;
}
/// <summary>
/// Pairs every leaf of <paramref name="tree"/> with its preterminal and, whenever the
/// French morphological specification produces a non-empty alternate tag for the leaf,
/// writes that tag into the preterminal's value and tag.
/// </summary>
/// <param name="tree">Tree whose preterminal labels are rewritten in place.</param>
private static void ReplacePOSTags(Tree tree)
{
    IList<ILabel> leaves = tree.Yield();
    IList<ILabel> preterminals = tree.PreTerminalYield();
    System.Diagnostics.Debug.Assert(leaves.Count == preterminals.Count);

    MorphoFeatureSpecification frenchSpec = new FrenchMorphoFeatureSpecification();
    for (int pos = 0; pos < leaves.Count; pos++)
    {
        CoreLabel leaf = (CoreLabel)leaves[pos];

        // Morphological analysis; when the leaf carries none, synthesize one from the
        // preterminal value and the leaf's POS subcategory.
        string analysis = leaf.OriginalText();
        if (string.IsNullOrEmpty(analysis))
        {
            string subCategory = leaf.Category();
            string suffix = string.IsNullOrEmpty(subCategory) ? "---" : "-" + subCategory + "--";
            analysis = preterminals[pos].Value() + suffix;
        }

        string altTag = frenchSpec.StrToFeatures(analysis).GetAltTag();
        if (string.IsNullOrEmpty(altTag))
        {
            continue;
        }

        CoreLabel preterminal = (CoreLabel)preterminals[pos];
        preterminal.SetValue(altTag);
        preterminal.SetTag(altTag);
    }
}
/// <summary>
/// For debugging. Reads the file named by the single argument line by line, converts each
/// line with a FrenchMorphoFeatureSpecification (Gen, Num and Per activated) and prints the
/// trimmed line next to the resulting features.
/// </summary>
/// <param name="args">Exactly one element: the path of the file to read.</param>
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        System.Console.Error.Printf("Usage: java %s file%n", typeof(FrenchMorphoFeatureSpecification).FullName);
        System.Environment.Exit(-1);
    }
    try
    {
        BufferedReader br = new BufferedReader(new FileReader(args[0]));
        try
        {
            MorphoFeatureSpecification mfs = new FrenchMorphoFeatureSpecification();
            //Activate all features for debugging
            mfs.Activate(MorphoFeatureSpecification.MorphoFeatureType.Gen);
            mfs.Activate(MorphoFeatureSpecification.MorphoFeatureType.Num);
            mfs.Activate(MorphoFeatureSpecification.MorphoFeatureType.Per);
            for (string line; (line = br.ReadLine()) != null;)
            {
                MorphoFeatures feats = mfs.StrToFeatures(line);
                System.Console.Out.Printf("%s\t%s%n", line.Trim(), feats.ToString());
            }
        }
        finally
        {
            // Close on every path; previously a failure while reading or converting
            // skipped the close and leaked the reader. Kept inside the outer try so a
            // failing Close() is still reported by the handlers below.
            br.Close();
        }
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Adds gender, number and person features to <paramref name="feats"/> based on the
/// characters found in <paramref name="spec"/>. Each feature type is only considered
/// when it is active on this specification, and at most one value is added per type.
/// </summary>
/// <param name="feats">Feature set that receives the extracted values.</param>
/// <param name="spec">Feature tuple to inspect; must not be null.</param>
private void ProcessInflectionalFeaturesHelper(MorphoFeatures feats, string spec)
{
    // Gender: "M" anywhere selects genVals[0], otherwise "F" selects genVals[1].
    if (IsActive(MorphoFeatureSpecification.MorphoFeatureType.Gen))
    {
        if (spec.Contains("M"))
        {
            feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Gen, genVals[0]);
        }
        else if (spec.Contains("F"))
        {
            feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Gen, genVals[1]);
        }
    }

    // Number: decided by the final character only. The comparison is ordinal because
    // these are fixed codes; the single-argument EndsWith overload would compare using
    // the current culture's linguistic rules.
    if (IsActive(MorphoFeatureSpecification.MorphoFeatureType.Num))
    {
        if (spec.EndsWith("S", System.StringComparison.Ordinal))
        {
            feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Num, numVals[0]);
        }
        else if (spec.EndsWith("D", System.StringComparison.Ordinal))
        {
            feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Num, numVals[1]);
        }
        else if (spec.EndsWith("P", System.StringComparison.Ordinal))
        {
            feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Num, numVals[2]);
        }
    }

    // Person: first of "1", "2", "3" found anywhere in the tuple.
    if (IsActive(MorphoFeatureSpecification.MorphoFeatureType.Per))
    {
        if (spec.Contains("1"))
        {
            feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Per, perVals[0]);
        }
        else if (spec.Contains("2"))
        {
            feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Per, perVals[1]);
        }
        else if (spec.Contains("3"))
        {
            feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Per, perVals[2]);
        }
    }
}
/// <summary>
/// Locates the feature tuple inside <paramref name="spec"/> using <c>pFeatureTuple</c>
/// and, if one is found, hands its first capture group to
/// <c>ProcessInflectionalFeaturesHelper</c>. Does nothing when the pattern does not match.
/// </summary>
/// <param name="feats">Feature set that receives the extracted values.</param>
/// <param name="spec">Specification string searched for a feature tuple.</param>
private void ProcessInflectionalFeatures(MorphoFeatures feats, string spec)
{
    Matcher tupleMatcher = pFeatureTuple.Matcher(spec);
    if (!tupleMatcher.Find())
    {
        // No feature tuple present: nothing to extract.
        return;
    }

    string featureTuple = tupleMatcher.Group(1);
    ProcessInflectionalFeaturesHelper(feats, featureTuple);
}
/// <summary>
/// Rewrites the value and tag of a single preterminal with the alternate tag that
/// <paramref name="morpho"/> derives from the morphological analysis of its first child.
/// The label is left untouched when no non-empty alternate tag is produced.
/// </summary>
/// <param name="t">A preterminal node whose label and first child's label are CoreLabels.</param>
/// <param name="morpho">Specification used to convert the analysis string to features.</param>
/// <exception cref="System.ArgumentException">
/// Thrown when <paramref name="t"/> is not a preterminal, or when its label or its first
/// child's label is not a CoreLabel.
/// </exception>
private static void ReplacePOSTag(Tree t, MorphoFeatureSpecification morpho)
{
    if (!t.IsPreTerminal())
    {
        throw new ArgumentException("Can only operate on preterminals");
    }

    CoreLabel tagLabel = t.Label() as CoreLabel;
    if (tagLabel == null)
    {
        throw new ArgumentException("Only operates on CoreLabels");
    }

    Tree terminal = t.Children()[0];
    CoreLabel wordLabel = terminal.Label() as CoreLabel;
    if (wordLabel == null)
    {
        throw new ArgumentException("Only operates on CoreLabels");
    }

    // Morphological analysis; when the terminal carries none, synthesize one from the
    // preterminal value and the terminal's POS subcategory.
    string analysis = wordLabel.OriginalText();
    if (string.IsNullOrEmpty(analysis))
    {
        string subCategory = wordLabel.Category();
        string suffix = string.IsNullOrEmpty(subCategory) ? "---" : "-" + subCategory + "--";
        analysis = tagLabel.Value() + suffix;
    }

    string altTag = morpho.StrToFeatures(analysis).GetAltTag();
    if (!string.IsNullOrEmpty(altTag))
    {
        tagLabel.SetValue(altTag);
        tagLabel.SetTag(altTag);
    }
}
/// <summary>First map to the LDC short tags.</summary>
/// <remarks>
/// First map to the LDC short tags. Then map to the Universal POS. Then add
/// morphological annotations.
/// </remarks>
/// <param name="posTag">Raw POS tag; surrounding whitespace is trimmed before lookup.</param>
/// <param name="terminal">Not used by this implementation.</param>
/// <returns>
/// The trimmed raw tag when no short tag is known for it; otherwise the tag produced by
/// the morphological features of the raw tag applied to the universal tag (or to the
/// short tag when no universal mapping exists).
/// </returns>
public override string Map(string posTag, string terminal)
{
    string rawTag = posTag.Trim();

    // Tags listed in tagsToEscape bypass the short-tag table.
    // NOTE(review): if tagMap's indexer throws for a missing key instead of returning
    // null, the null check below is never reached for unknown tags — confirm against
    // the declared type of tagMap.
    string shortTag = tagsToEscape.Contains(rawTag) ? rawTag : tagMap[rawTag];
    if (shortTag == null)
    {
        System.Console.Error.Printf("%s: No LDC shortened tag for %s%n", this.GetType().FullName, rawTag);
        return rawTag;
    }

    // Test membership before indexing so the fallback below is reachable even when the
    // map's indexer throws for a missing key.
    string universalTag;
    if (universalMap.Contains(shortTag))
    {
        universalTag = universalMap[shortTag];
    }
    else
    {
        System.Console.Error.Printf("%s: No universal tag for LDC tag %s%n", this.GetType().FullName, shortTag);
        universalTag = shortTag;
    }

    MorphoFeatures feats = new MorphoFeatures(morphoSpec.StrToFeatures(rawTag));
    string functionalTag = feats.GetTag(universalTag);
    return functionalTag;
}
/// <summary>For debugging.</summary>
/// <remarks>
/// For debugging. Converts a set of long tags (BAMA analyses as in the ATB) to their morpho
/// feature specification. The input file should have one long tag per line.
/// </remarks>
/// <param name="args">
/// Two elements: the input file name, and a comma-separated list of feature type names
/// to activate on the ArabicMorphoFeatureSpecification.
/// </param>
public static void Main(string[] args)
{
    if (args.Length != 2)
    {
        System.Console.Error.Printf("Usage: java %s filename feats%n", typeof(ArabicMorphoFeatureSpecification).FullName);
        System.Environment.Exit(-1);
    }

    MorphoFeatureSpecification fSpec = new ArabicMorphoFeatureSpecification();
    string[] feats = args[1].Split(",");
    foreach (string feat in feats)
    {
        MorphoFeatureSpecification.MorphoFeatureType fType = MorphoFeatureSpecification.MorphoFeatureType.ValueOf(feat);
        fSpec.Activate(fType);
    }

    File fName = new File(args[0]);
    try
    {
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(fName)));
        int nLine = 0;
        try
        {
            for (string line; (line = br.ReadLine()) != null; nLine++)
            {
                MorphoFeatures mFeats = fSpec.StrToFeatures(line.Trim());
                System.Console.Out.Printf("%s\t%s%n", line.Trim(), mFeats.ToString());
            }
        }
        finally
        {
            // Close on every path; previously a failure while reading or converting
            // skipped the close and leaked the reader. Kept inside the outer try so a
            // failing Close() is still reported by the handlers below.
            br.Close();
        }
        // Only reached when the whole file was read and the reader closed cleanly.
        System.Console.Out.Printf("%nRead %d lines%n", nLine);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Converts a morphological specification string into a MorphoFeatures object carrying
/// an alternate tag and, when the <c>Other</c> feature type is active, a sub-category
/// feature. Branches are tested in order and the first match wins; for most branches
/// phi features are added afterwards via <c>AddPhiFeatures</c>.
/// </summary>
/// <param name="spec">The specification string; may be null or empty.</param>
/// <returns>
/// A new MorphoFeatures instance. It carries no alternate tag and no features when
/// <paramref name="spec"/> is null, empty, or matches none of the known prefixes.
/// </returns>
public override MorphoFeatures StrToFeatures(string spec)
{
    MorphoFeatures feats = new MorphoFeatures();

    //Usually this is the boundary symbol
    if (spec == null || spec.Equals(string.Empty))
    {
        return feats;
    }

    bool isOtherActive = IsActive(MorphoFeatureSpecification.MorphoFeatureType.Other);

    // Prefix tests are ordinal: these are fixed tag codes, and the single-argument
    // StartsWith overload would compare using the current culture's linguistic rules.
    // "ADV" must be tested before the more general "A" prefix.
    if (spec.StartsWith("ADV", System.StringComparison.Ordinal))
    {
        feats.SetAltTag("ADV");
        if (spec.Contains("int"))
        {
            if (isOtherActive)
            {
                feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Other, "advint");
            }
            feats.SetAltTag("ADVWH");
        }
    }
    else if (spec.StartsWith("A", System.StringComparison.Ordinal))
    {
        feats.SetAltTag("ADJ");
        if (spec.Contains("int"))
        {
            if (isOtherActive)
            {
                feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Other, "adjint");
            }
            feats.SetAltTag("ADJWH");
        }
        AddPhiFeatures(feats, spec);
    }
    else if (spec.Equals("CC") || spec.Equals("C-C"))
    {
        if (isOtherActive)
        {
            feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Other, "Cc");
        }
        feats.SetAltTag("CC");
    }
    else if (spec.Equals("CS") || spec.Equals("C-S"))
    {
        if (isOtherActive)
        {
            feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Other, "Cs");
        }
        feats.SetAltTag("CS");
    }
    else if (spec.StartsWith("CL", System.StringComparison.Ordinal))
    {
        feats.SetAltTag("CL");
        if (spec.Contains("suj") || spec.Equals("CL-S-3fp"))
        {
            //"CL-S-3fp" is equivalent to suj
            if (isOtherActive)
            {
                feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Other, "Sbj");
            }
            feats.SetAltTag("CLS");
        }
        else if (spec.Contains("obj"))
        {
            if (isOtherActive)
            {
                feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Other, "Obj");
            }
            feats.SetAltTag("CLO");
        }
        else if (spec.Contains("refl"))
        {
            if (isOtherActive)
            {
                feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Other, "Rfl");
            }
            feats.SetAltTag("CLR");
        }
        AddPhiFeatures(feats, spec);
    }
    else if (spec.StartsWith("D", System.StringComparison.Ordinal))
    {
        feats.SetAltTag("DET");
        if (spec.Contains("int"))
        {
            if (isOtherActive)
            {
                feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Other, "dint");
            }
            feats.SetAltTag("DETWH");
        }
        AddPhiFeatures(feats, spec);
    }
    else if (spec.StartsWith("N", System.StringComparison.Ordinal))
    {
        feats.SetAltTag("N");
        //TODO These are usually N-card...make these CD?
        if (spec.Contains("P"))
        {
            if (isOtherActive)
            {
                feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Other, "Np");
            }
            feats.SetAltTag("NPP");
        }
        else if (spec.Contains("C"))
        {
            if (isOtherActive)
            {
                feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Other, "Nc");
            }
            feats.SetAltTag("NC");
        }
        AddPhiFeatures(feats, spec);
    }
    else if (spec.StartsWith("PRO", System.StringComparison.Ordinal))
    {
        feats.SetAltTag("PRO");
        if (spec.Contains("int"))
        {
            if (isOtherActive)
            {
                feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Other, "Ni");
            }
            feats.SetAltTag("PROWH");
        }
        else if (spec.Contains("rel"))
        {
            if (isOtherActive)
            {
                feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Other, "Nr");
            }
            feats.SetAltTag("PROREL");
        }
        AddPhiFeatures(feats, spec);
    }
    else if (spec.StartsWith("V", System.StringComparison.Ordinal))
    {
        feats.SetAltTag("V");
        if (spec.Contains("Y"))
        {
            if (isOtherActive)
            {
                // NOTE(review): "Vp" is also the feature used for the "K" (VPP) branch
                // below, so the two are indistinguishable by feature value — confirm
                // this is intended before changing it.
                feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Other, "Vp");
            }
            feats.SetAltTag("VIMP");
        }
        else if (spec.Contains("W"))
        {
            if (isOtherActive)
            {
                feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Other, "Vf");
            }
            feats.SetAltTag("VINF");
        }
        else if (spec.Contains("S") || spec.Contains("T"))
        {
            if (isOtherActive)
            {
                feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Other, "Vs");
            }
            feats.SetAltTag("VS");
        }
        else if (spec.Contains("K"))
        {
            if (isOtherActive)
            {
                feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Other, "Vp");
            }
            feats.SetAltTag("VPP");
        }
        else if (spec.Contains("G"))
        {
            if (isOtherActive)
            {
                feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Other, "Vr");
            }
            feats.SetAltTag("VPR");
        }
        AddPhiFeatures(feats, spec);
    }
    else if (spec.Equals("P") || spec.Equals("I"))
    {
        // These two specs are used verbatim as the alternate tag.
        feats.SetAltTag(spec);
    }

    // Any other spec is deliberately left unmapped and is not reported.
    return feats;
}
/// <summary>
/// Adds gender, person and number features to <paramref name="feats"/> from one
/// hyphen-separated field of <paramref name="spec"/>. The field used is the last one,
/// except for a three-field spec of the form <c>PRO-x-neg</c>, where the middle field is
/// used. Each feature type is only considered when it is active.
/// </summary>
/// <param name="feats">Feature set that receives the extracted values.</param>
/// <param name="spec">Hyphen-separated specification string; must not be null.</param>
private void AddPhiFeatures(MorphoFeatures feats, string spec)
{
    // Split on runs of hyphens. The previous call passed the regex text "\\-+" to
    // string.Split, which treats its argument as a literal separator, so the spec was
    // never split and the whole string was scanned for feature letters. Dropping empty
    // entries collapses consecutive hyphens and ignores trailing ones.
    string[] toks = spec.Split(new char[] { '-' }, System.StringSplitOptions.RemoveEmptyEntries);
    if (toks.Length == 0)
    {
        // Empty or hyphen-only spec: there is no field to read features from.
        return;
    }

    string morphStr;
    if (toks.Length == 3 && toks[0].Equals("PRO") && toks[2].Equals("neg"))
    {
        morphStr = toks[1];
    }
    else
    {
        morphStr = toks[toks.Length - 1];
    }

    //wsg2011: The analyses have mixed casing....
    // Invariant lowering keeps the result independent of the current culture.
    morphStr = morphStr.ToLowerInvariant();

    if (IsActive(MorphoFeatureSpecification.MorphoFeatureType.Gen))
    {
        if (morphStr.Contains("m"))
        {
            feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Gen, genVals[0]);
        }
        else if (morphStr.Contains("f"))
        {
            feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Gen, genVals[1]);
        }
    }

    if (IsActive(MorphoFeatureSpecification.MorphoFeatureType.Per))
    {
        if (morphStr.Contains("1"))
        {
            feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Per, perVals[0]);
        }
        else if (morphStr.Contains("2"))
        {
            feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Per, perVals[1]);
        }
        else if (morphStr.Contains("3"))
        {
            feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Per, perVals[2]);
        }
    }

    if (IsActive(MorphoFeatureSpecification.MorphoFeatureType.Num))
    {
        if (morphStr.Contains("s"))
        {
            feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Num, numVals[0]);
        }
        else if (morphStr.Contains("p"))
        {
            feats.AddFeature(MorphoFeatureSpecification.MorphoFeatureType.Num, numVals[1]);
        }
    }
}
/// <summary>Convert token to a sequence of datums and add to iobList.</summary>
/// <remarks>
/// One datum is created per character of the (possibly rewritten) token. The first
/// character receives the begin label, the last character the last label, and the
/// characters in between the continuation label; any of these may be replaced by the
/// rewrite label as determined by the rewrite rules and by cross-referencing the
/// rewritten annotation. Nothing is added when the token is empty on entry or becomes
/// empty after rewriting.
/// </remarks>
/// <param name="iobList">List that receives the created datums.</param>
/// <param name="cl">Label passed to CreateDatum; also supplies the rewritten annotation and the begin/end positions.</param>
/// <param name="token">The token text to convert.</param>
/// <param name="tokType">Token type, forwarded to StripSegmentationMarkers.</param>
/// <param name="tokenLabel">Label supplying the raw word and the tag used by the rewrite rules.</param>
/// <param name="lastToken">The previous token; consulted by rewrite rule #2.</param>
/// <param name="applyRewriteRules">Whether the Arabic-specific rewrite rules are applied.</param>
/// <param name="stripRewrites">When true, most rules still rewrite the token text but do not assign the rewrite label.</param>
/// <param name="tf">a TokenizerFactory returning ArabicTokenizers (for determining original segment boundaries)</param>
/// <param name="origText">the original string before tokenization (for determining original segment boundaries)</param>
private static void TokenToDatums(IList<CoreLabel> iobList, CoreLabel cl, string token, IOBUtils.TokenType tokType, CoreLabel tokenLabel, string lastToken, bool applyRewriteRules, bool stripRewrites, ITokenizerFactory<CoreLabel> tf, string origText)
{
    if (token.Length == 0)
    {
        return;
    }

    string lastLabel = ContinuationSymbol;
    string firstLabel = BeginSymbol;

    string rewritten = cl.Get(typeof(ArabicDocumentReaderAndWriter.RewrittenArabicAnnotation));
    bool crossRefRewrites = true;
    if (rewritten == null)
    {
        // No rewritten annotation: nothing to cross-reference against.
        rewritten = token;
        crossRefRewrites = false;
    }
    else
    {
        rewritten = StripSegmentationMarkers(rewritten, tokType);
    }

    if (applyRewriteRules)
    {
        // Apply Arabic-specific re-write rules
        string rawToken = tokenLabel.Word();
        string tag = tokenLabel.Tag();
        MorphoFeatureSpecification featureSpec = new ArabicMorphoFeatureSpecification();
        featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Ngen);
        featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Nnum);
        featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Def);
        featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Tense);
        MorphoFeatures features = featureSpec.StrToFeatures(tag);

        // All prefix/suffix tests below are ordinal so that exact code points are
        // compared; the single-argument overloads use culture-sensitive rules.

        // Rule #1 : ت --> ة
        if (features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Ngen).Equals("F")
            && features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Nnum).Equals("SG")
            && rawToken.EndsWith("ت-", System.StringComparison.Ordinal)
            && !stripRewrites)
        {
            lastLabel = RewriteSymbol;
        }
        else if (rawToken.EndsWith("ة-", System.StringComparison.Ordinal))
        {
            System.Diagnostics.Debug.Assert(token.EndsWith("ة", System.StringComparison.Ordinal));
            token = Sharpen.Runtime.Substring(token, 0, token.Length - 1) + "ت";
            lastLabel = RewriteSymbol;
        }

        // Rule #2 : لل --> ل ال
        if (lastToken.Equals("ل") && features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Def).Equals("D"))
        {
            if (rawToken.StartsWith("-ال", System.StringComparison.Ordinal))
            {
                if (!token.StartsWith("ا", System.StringComparison.Ordinal))
                {
                    log.Info("Bad REWAL: " + rawToken + " / " + token);
                }
                // Drop the first character from both the token and its rewritten form.
                token = Sharpen.Runtime.Substring(token, 1);
                rewritten = Sharpen.Runtime.Substring(rewritten, 1);
                if (!stripRewrites)
                {
                    firstLabel = RewriteSymbol;
                }
            }
            else if (rawToken.StartsWith("-ل", System.StringComparison.Ordinal))
            {
                if (!token.StartsWith("ل", System.StringComparison.Ordinal))
                {
                    log.Info("Bad REWAL: " + rawToken + " / " + token);
                }
                if (!stripRewrites)
                {
                    firstLabel = RewriteSymbol;
                }
            }
            else
            {
                log.Info("Ignoring REWAL: " + rawToken + " / " + token);
            }
        }

        // Rule #3 : ي --> ى
        // Rule #4 : ا --> ى
        if (rawToken.EndsWith("ى-", System.StringComparison.Ordinal))
        {
            // NOTE(review): this null test assumes GetValue returns null for an absent
            // feature, while rules #1 and #2 call Equals directly on GetValue's result;
            // the two usages assume different contracts — confirm which one holds.
            if (features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Tense) != null)
            {
                // verb: ى becomes ا
                token = Sharpen.Runtime.Substring(token, 0, token.Length - 1) + "ا";
            }
            else
            {
                // assume preposition:
                token = Sharpen.Runtime.Substring(token, 0, token.Length - 1) + "ي";
            }
            if (!stripRewrites)
            {
                lastLabel = RewriteSymbol;
            }
        }
        else if (rawToken.Equals("علي-") || rawToken.Equals("-علي-"))
        {
            if (!stripRewrites)
            {
                lastLabel = RewriteSymbol;
            }
        }
    }

    string origWord;
    if (origText == null)
    {
        origWord = tokenLabel.Word();
    }
    else
    {
        origWord = Sharpen.Runtime.Substring(origText, cl.BeginPosition(), cl.EndPosition());
    }

    // Skip leading characters of the original word that IsDeletedCharacter reports as deleted.
    int origIndex = 0;
    while (origIndex < origWord.Length && IsDeletedCharacter(origWord[origIndex], tf))
    {
        ++origIndex;
    }

    // Create datums and add to iobList
    if (token.Length == 0)
    {
        // Rule #2 can strip the only character of the token. Previously this was logged
        // and execution fell through to token[0], which throws; stop here instead.
        log.Info("Rewriting resulted in empty token: " + tokenLabel.Word());
        return;
    }

    string firstChar = token[0].ToString();
    // Start at 0 to make sure we include the whole token according to the tokenizer
    iobList.Add(CreateDatum(cl, firstChar, firstLabel, 0, origIndex + 1));

    int numChars = token.Length;
    if (crossRefRewrites && rewritten.Length != numChars)
    {
        System.Console.Error.Printf("Rewritten annotation doesn't have correct length: %s>>>%s%n", token, rewritten);
        crossRefRewrites = false;
    }

    ++origIndex;
    for (int j = 1; j < numChars; ++j, ++origIndex)
    {
        while (origIndex < origWord.Length && IsDeletedCharacter(origWord[origIndex], tf))
        {
            ++origIndex;
        }
        // Clamp so the datum span never starts past the end of the original word.
        if (origIndex >= origWord.Length)
        {
            origIndex = origWord.Length - 1;
        }

        string charLabel = (j == numChars - 1) ? lastLabel : ContinuationSymbol;
        string thisChar = token[j].ToString();
        if (crossRefRewrites && !rewritten[j].ToString().Equals(thisChar))
        {
            charLabel = RewriteSymbol;
        }
        // Assume all mid-word alef maqsura are supposed to be yah
        if (charLabel == ContinuationSymbol && thisChar.Equals("ى") && j != numChars - 1)
        {
            charLabel = RewriteSymbol;
        }
        iobList.Add(CreateDatum(cl, thisChar, charLabel, origIndex, origIndex + 1));
    }

    // End at endPosition to make sure we include the whole token according to the tokenizer
    if (iobList.Count > 0)
    {
        iobList[iobList.Count - 1].SetEndPosition(cl.EndPosition());
    }
}