/// <summary>Converts a tree to the Morfette training format.</summary>
/// <remarks>
/// Emits one line per leaf, consisting of the word, its lemma and its
/// morphological string separated by single spaces. When a leaf has no
/// original text the value of the matching preterminal is used as the
/// morphological string; when it has no lemma the word itself is used.
/// </remarks>
/// <param name="tree">Tree whose leaves and preterminals are labelled with <c>CoreLabel</c>s.</param>
/// <returns>The concatenated lines, each terminated by the platform newline.</returns>
private static string TreeToMorfette(Tree tree)
{
    StringBuilder sb = new StringBuilder();
    IList<ILabel> yield = tree.Yield();
    IList<ILabel> tagYield = tree.PreTerminalYield();
    System.Diagnostics.Debug.Assert(yield.Count == tagYield.Count);

    int listLen = yield.Count;
    for (int i = 0; i < listLen; ++i)
    {
        CoreLabel token = (CoreLabel)yield[i];
        CoreLabel tag = (CoreLabel)tagYield[i];

        string morphStr = token.OriginalText();
        if (string.IsNullOrEmpty(morphStr))
        {
            morphStr = tag.Value();
        }

        string lemma = token.Lemma();
        if (string.IsNullOrEmpty(lemma))
        {
            lemma = token.Value();
        }

        // .NET composite formatting needs {n} placeholders; the Java-style
        // "%s %s %s%n" pattern would be written out verbatim for every token.
        sb.AppendFormat("{0} {1} {2}", token.Value(), lemma, morphStr);
        sb.AppendLine();
    }
    return sb.ToString();
}
/// <summary>Reads the string value of a single feature component from a label.</summary>
/// <param name="label">Label to read from; when null the <c>Null</c> placeholder is returned for every supported component.</param>
/// <param name="feature">Which component to extract (head word, head tag or the label's own value).</param>
/// <returns>The requested feature string, or <c>Null</c> when <paramref name="label"/> is null.</returns>
/// <exception cref="ArgumentException">The component is not one of the supported kinds.</exception>
public static string GetFeatureFromCoreLabel(CoreLabel label, FeatureFactory.FeatureComponent feature)
{
    bool missing = (label == null);

    // The component is validated first, so an unsupported component throws even for a null label.
    switch (feature)
    {
        case FeatureFactory.FeatureComponent.Headword:
            return missing ? Null : label.Get(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation)).Value();

        case FeatureFactory.FeatureComponent.Headtag:
            return missing ? Null : label.Get(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation)).Value();

        case FeatureFactory.FeatureComponent.Value:
            return missing ? Null : label.Value();

        default:
            throw new ArgumentException("Unexpected feature type: " + feature);
    }
}
/// <summary>
/// Reads trees from the given file and prints one "word lemma morph" row per
/// token to standard output, with an empty line after each tree.
/// </summary>
/// <remarks>
/// The lemma and morphological analysis are obtained by splitting each leaf's
/// original text. When the analysis is missing, empty or "XXX", the value of
/// the corresponding preterminal is printed instead.
/// </remarks>
/// <param name="args">Exactly one element: the path of the tree file.</param>
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        System.Console.Error.Printf("Usage: java %s tree_file%n", typeof(TreeToMorfette).FullName);
        System.Environment.Exit(-1);
    }

    string treeFile = args[0];
    ITreeReaderFactory trf = new FrenchTreeReaderFactory();
    try
    {
        ITreeReader tr = trf.NewTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")));
        try
        {
            Tree tree1;
            while ((tree1 = tr.ReadTree()) != null)
            {
                IList<ILabel> pretermYield = tree1.PreTerminalYield();
                IList<ILabel> yield = tree1.Yield();
                int yieldLen = yield.Count;
                for (int i = 0; i < yieldLen; ++i)
                {
                    CoreLabel rawToken = (CoreLabel)yield[i];
                    string word = rawToken.Value();
                    string morphStr = rawToken.OriginalText();
                    Pair<string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(word, morphStr);
                    string lemma = lemmaMorph.First();
                    string morph = lemmaMorph.Second();
                    if (morph == null || morph.Equals(string.Empty) || morph.Equals("XXX"))
                    {
                        // No usable analysis: fall back to the preterminal's value.
                        morph = ((CoreLabel)pretermYield[i]).Value();
                    }
                    System.Console.Out.Printf("%s %s %s%n", word, lemma, morph);
                }
                System.Console.Out.WriteLine();
            }
        }
        finally
        {
            // Close the reader even when reading or processing throws, so the
            // underlying file stream is not leaked on the error path.
            tr.Close();
        }
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Replaces the tag of a preterminal with the alternate tag that the
/// morphological specification derives for its word.
/// </summary>
/// <remarks>
/// The analysis string is taken from the child's original text. When that is
/// missing, one is built from the preterminal's value followed by
/// "-subcat--" (if the child has a category) or "---" (if it has none).
/// The label is left untouched when the derived alternate tag is null or empty.
/// </remarks>
/// <param name="t">Preterminal node whose label and first child's label are <c>CoreLabel</c>s.</param>
/// <param name="morpho">Specification used to turn the analysis string into features.</param>
/// <exception cref="ArgumentException">
/// <paramref name="t"/> is not a preterminal, or it or its first child is not labelled with a <c>CoreLabel</c>.
/// </exception>
private static void ReplacePOSTag(Tree t, MorphoFeatureSpecification morpho)
{
    if (!t.IsPreTerminal())
    {
        throw new ArgumentException("Can only operate on preterminals");
    }

    CoreLabel tagLabel = t.Label() as CoreLabel;
    if (tagLabel == null)
    {
        throw new ArgumentException("Only operates on CoreLabels");
    }

    Tree wordNode = t.Children()[0];
    CoreLabel wordLabel = wordNode.Label() as CoreLabel;
    if (wordLabel == null)
    {
        throw new ArgumentException("Only operates on CoreLabels");
    }

    // Morphological analysis: prefer the one stored on the word itself.
    string analysis = wordLabel.OriginalText();
    if (string.IsNullOrEmpty(analysis))
    {
        // Synthesize one from the tag and the POS subcategory, if present.
        string subCategory = wordLabel.Category();
        string suffix = string.IsNullOrEmpty(subCategory) ? "---" : "-" + subCategory + "--";
        analysis = tagLabel.Value() + suffix;
    }

    MorphoFeatures features = morpho.StrToFeatures(analysis);
    if (string.IsNullOrEmpty(features.GetAltTag()))
    {
        return;
    }

    tagLabel.SetValue(features.GetAltTag());
    tagLabel.SetTag(features.GetAltTag());
}
/// <summary>
/// Tells whether the tree contains a preterminal/word pair that satisfies
/// both the part-of-speech and the lexical criteria.
/// </summary>
/// <param name="tree">Tree whose nodes are scanned.</param>
/// <param name="pos">
/// Regular expression to match the part of speech (may be null, in which
/// case any POS is allowed).
/// </param>
/// <param name="word">
/// Regular expression to match the word (may be null, in which case any
/// word is allowed).
/// </param>
/// <returns>True as soon as one matching preterminal is found; false otherwise.</returns>
private static bool ShouldPrintTree(Tree tree, Pattern pos, Pattern word)
{
    foreach (Tree node in tree)
    {
        if (!node.IsPreTerminal())
        {
            continue;
        }

        string tagText = ((CoreLabel)node.Label()).Value();
        string wordText = ((CoreLabel)node.FirstChild().Label()).Value();

        bool posAccepted = pos == null || pos.Matcher(tagText).Find();
        if (!posAccepted)
        {
            continue;
        }

        bool wordAccepted = word == null || word.Matcher(wordText).Find();
        if (wordAccepted)
        {
            return true;
        }
    }
    return false;
}
/// <summary>Finds the leaf of <paramref name="root"/> that serves as the syntactic head of a mention.</summary>
/// <remarks>
/// Strategy, in order:
/// 1. use a constituent of <paramref name="root"/> that exactly spans the mention;
/// 2. if reparsing is allowed, parse the mention inside an "It was ... ." carrier
///    sentence and map the resulting head back onto <paramref name="root"/>;
/// 3. otherwise use the head of the smallest covering span, provided it lies inside the mention;
/// 4. otherwise pick a leaf by scanning the token tags.
/// </remarks>
/// <param name="m">Mention whose start/end indices and original span are read.</param>
/// <param name="root">Parse tree of the sentence containing the mention.</param>
/// <param name="tokens">Tokens of that sentence.</param>
/// <returns>The tree node chosen as head.</returns>
protected internal virtual Tree FindSyntacticHead(Mention m, Tree root, IList<CoreLabel> tokens)
{
    // A trailing possessive marker is excluded from the span, unless it is the whole mention.
    int endIdx = m.endIndex;
    int spanSize = m.originalSpan.Count;
    if (spanSize > 0)
    {
        string lastWord = m.originalSpan[spanSize - 1].Get(typeof(CoreAnnotations.TextAnnotation));
        bool endsWithPossessive = lastWord.Equals("'s") || lastWord.Equals("'");
        if (endsWithPossessive && spanSize != 1)
        {
            endIdx--;
        }
    }

    // Case 1: a constituent matches the span exactly.
    Tree exactMatch = FindTreeWithSpan(root, m.startIndex, endIdx);
    if (exactMatch != null)
    {
        return SafeHead(exactMatch, endIdx);
    }

    if (!allowReparsing)
    {
        // Case 3: look for the smallest span that happens to contain the head.
        Tree wordMatch = FindTreeWithSmallestSpan(root, m.startIndex, endIdx);
        if (wordMatch != null)
        {
            Tree head = SafeHead(wordMatch, endIdx);
            if (head != null)
            {
                int index = ((CoreLabel)head.Label()).Get(typeof(CoreAnnotations.IndexAnnotation)) - 1;
                if (index >= m.startIndex && index < endIdx)
                {
                    return head;
                }
            }
        }

        // Case 4: default to the last word; prefer the last token tagged N*
        // seen before the first token tagged W*.
        int lastNounIdx = endIdx - 1;
        for (int pos = m.startIndex; pos < m.endIndex; pos++)
        {
            string posTag = tokens[pos].Tag();
            if (posTag.StartsWith("N"))
            {
                lastNounIdx = pos;
            }
            else if (tokens[pos].Tag().StartsWith("W"))
            {
                break;
            }
        }
        return root.GetLeaves()[lastNounIdx];
    }

    // Case 2: parse the mention extent embedded in a sentence context,
    // which makes the parser behave better.
    const int addedWords = 2; // "It" and "was"
    int skippedDashes = 0;
    IList<CoreLabel> extentTokens = new List<CoreLabel>();
    extentTokens.Add(InitCoreLabel("It"));
    extentTokens.Add(InitCoreLabel("was"));
    for (int pos = m.startIndex; pos < endIdx; pos++)
    {
        CoreLabel original = tokens[pos];
        if ("-".Equals(original.Word()))
        {
            // Separated dashes confuse the parser, so they are dropped and only counted.
            skippedDashes++;
            continue;
        }
        // Copy the token in case the parser modifies it (e.g. assigns new indices).
        extentTokens.Add((CoreLabel)original.LabelFactory().NewLabel(original));
    }
    extentTokens.Add(InitCoreLabel("."));

    // Constrain the parse to the mention itself: start after "It was" and
    // stop before the final period. Any constituent label is accepted.
    ParserConstraint constraint = new ParserConstraint(addedWords, extentTokens.Count - 1, Pattern.Compile(".*"));
    IList<ParserConstraint> constraints = Java.Util.Collections.SingletonList(constraint);
    Tree localParse = Parse(extentTokens, constraints);
    ConvertToCoreLabels(localParse);

    // Shift indices so that the carrier words at the front are accounted for.
    localParse.IndexSpans(m.startIndex - addedWords);
    Tree subtree = FindPartialSpan(localParse, m.startIndex);

    // endIdx is passed to SafeHead as the right limit, so that the chosen head
    // is not past the end of the mention (i.e. the added period).
    Tree extentHead = SafeHead(subtree, endIdx);
    System.Diagnostics.Debug.Assert((extentHead != null));

    // extentHead lives in the local parse; locate the corresponding leaf in the
    // main tree. Because dashes were removed, its index there may be larger.
    CoreLabel headLabel = (CoreLabel)extentHead.Label();
    Tree realHead = FunkyFindLeafWithApproximateSpan(root, headLabel.Value(), headLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation)), skippedDashes);
    System.Diagnostics.Debug.Assert((realHead != null));
    return realHead;
}
/// <summary>
/// Rewrites the leaf labels of a tree in place, optionally replacing each
/// word by its lemma and/or appending the lemma and morphological analysis
/// to the leaf.
/// </summary>
/// <remarks>
/// Increments the <c>nTokens</c> counter for every leaf and
/// <c>nMorphAnalyses</c> for every leaf that carries a non-empty analysis
/// (the latter only when <paramref name="addMorphoToLeaves"/> is set).
/// </remarks>
/// <param name="tree">Tree whose leaves are labelled with <c>CoreLabel</c>s.</param>
/// <param name="lemmasAsLeaves">When true, the leaf word and value are replaced by the lemma.</param>
/// <param name="addMorphoToLeaves">
/// When true, the leaf becomes value + morpho mark + lemma + lemma mark + analysis.
/// </param>
/// <exception cref="ArgumentException">A leaf label is not a <c>CoreLabel</c>.</exception>
public static void MungeLeaves(Tree tree, bool lemmasAsLeaves, bool addMorphoToLeaves)
{
    IList<ILabel> labels = tree.Yield();
    foreach (ILabel label in labels)
    {
        ++nTokens;
        if (!(label is CoreLabel))
        {
            throw new ArgumentException("Only works with CoreLabels trees");
        }
        CoreLabel coreLabel = (CoreLabel)label;

        // PTB escaping, since the lemma is going to be placed in the leaf.
        string lemma = coreLabel.Lemma();
        if (lemma == null)
        {
            // No lemma, so just use the surface form.
            lemma = coreLabel.Word();
        }
        else if (lemma.Equals("("))
        {
            lemma = "-LRB-";
        }
        else if (lemma.Equals(")"))
        {
            lemma = "-RRB-";
        }

        if (lemmasAsLeaves)
        {
            string escapedLemma = lemma;
            coreLabel.SetWord(escapedLemma);
            coreLabel.SetValue(escapedLemma);
            coreLabel.SetLemma(lemma);
        }

        if (addMorphoToLeaves)
        {
            string morphStr = coreLabel.OriginalText();
            if (string.IsNullOrEmpty(morphStr))
            {
                morphStr = MorphoFeatureSpecification.NoAnalysis;
            }
            else
            {
                ++nMorphAnalyses;
            }

            // Normalize punctuation analyses.
            if (morphStr.StartsWith("PONCT"))
            {
                morphStr = "PUNC";
            }

            // Plain concatenation: string.Format does not understand Java "%s"
            // specifiers and would make every leaf the literal "%s%s%s%s%s".
            string newLeaf = coreLabel.Value() + MorphoFeatureSpecification.MorphoMark + lemma + MorphoFeatureSpecification.LemmaMark + morphStr;
            coreLabel.SetValue(newLeaf);
            coreLabel.SetWord(newLeaf);
        }
    }
}
/// <summary>Finds the syntactic head of the given entity mention.</summary>
/// <remarks>
/// Delegates to <c>OriginalFindSyntacticHead</c> unless the new head finder is
/// enabled. Otherwise an exactly matching constituent is used when one exists;
/// failing that, the mention extent is parsed inside an "It was ... ." carrier
/// sentence and the head found there is mapped back onto <paramref name="root"/>.
/// </remarks>
/// <param name="ent">The entity mention</param>
/// <param name="root">The Tree for the entire sentence in which it occurs.</param>
/// <param name="tokens">The Sentence in which it occurs</param>
/// <returns>
/// The tree object corresponding to the head. This MUST be a child of root.
/// It will be a leaf in the parse tree. The result of the final lookup is
/// returned as-is, including when it is null.
/// </returns>
public virtual Tree FindSyntacticHead(EntityMention ent, Tree root, IList<CoreLabel> tokens)
{
    if (!useNewHeadFinder)
    {
        return OriginalFindSyntacticHead(ent, root, tokens);
    }

    logger.Fine("Searching for tree matching " + ent);
    int extentStart = ent.GetExtentTokenStart();
    int extentEnd = ent.GetExtentTokenEnd();

    // Preferred case: a constituent spans the extent exactly.
    Tree exactMatch = FindTreeWithSpan(root, extentStart, extentEnd);
    if (exactMatch != null)
    {
        logger.Fine("Mention \"" + ent + "\" mapped to tree: " + PrintTree(exactMatch));
        return SafeHead(exactMatch);
    }

    // No exact match: parse the actual extent of the mention, embedded in a
    // sentence context so that the parser works better.
    const int addedWords = 2; // "It" and "was"
    int skippedDashes = 0;
    IList<CoreLabel> extentTokens = new List<CoreLabel>();
    extentTokens.Add(InitCoreLabel("It"));
    extentTokens.Add(InitCoreLabel("was"));
    for (int pos = extentStart; pos < extentEnd; pos++)
    {
        CoreLabel token = tokens[pos];
        if ("-".Equals(token.Word()))
        {
            // Separated dashes confuse the parser, so they are dropped and only counted.
            skippedDashes++;
        }
        else
        {
            extentTokens.Add(token);
        }
    }
    extentTokens.Add(InitCoreLabel("."));

    // Constrain the parse to the extent itself: start after "It was" and
    // stop before the final period. Any constituent label is accepted.
    ParserConstraint constraint = new ParserConstraint(addedWords, extentTokens.Count - 1, ".*");
    IList<ParserConstraint> constraints = Java.Util.Collections.SingletonList(constraint);
    Tree localParse = Parse(extentTokens, constraints);
    logger.Fine("No exact match found. Local parse:\n" + localParse.PennString());
    ConvertToCoreLabels(localParse);

    // Shift indices so that the carrier words at the front are accounted for.
    localParse.IndexSpans(extentStart - addedWords);
    Tree subtree = FindPartialSpan(localParse, extentStart);
    Tree extentHead = SafeHead(subtree);
    logger.Fine("Head is: " + extentHead);
    System.Diagnostics.Debug.Assert((extentHead != null));

    // extentHead belongs to the local parse; find the corresponding leaf in the
    // main tree. Because dashes were removed, its index there may be larger.
    CoreLabel headLabel = (CoreLabel)extentHead.Label();
    Tree realHead = FunkyFindLeafWithApproximateSpan(root, headLabel.Value(), headLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation)), skippedDashes);
    if (realHead != null)
    {
        logger.Fine("Chosen head: " + realHead);
    }
    return realHead;
}
/// <summary>
/// transformTree does all language-specific tree
/// transformations.
/// </summary>
/// <remarks>
/// Appends the enabled annotation suffixes to the node's category and
/// replaces the node's label with a <c>CategoryWordTag</c> carrying the
/// annotated category together with the original word and tag. Any
/// parameterizations should be inside the specific TreebankLangParserParams
/// class. Parent annotation of preterminals is not performed.
/// </remarks>
/// <param name="t">Node to transform; null and leaf nodes are returned unchanged.</param>
/// <param name="root">Root of the tree containing <paramref name="t"/> (not used by this implementation).</param>
/// <returns>The same node <paramref name="t"/>, relabelled in place.</returns>
public override Tree TransformTree(Tree t, Tree root)
{
    if (t == null || t.IsLeaf())
    {
        return t;
    }

    CoreLabel lab = (CoreLabel)t.Label();
    string word = lab.Word();
    string tag = lab.Tag();
    string cat = lab.Value();
    string baseCat = TreebankLanguagePack().BasicCategory(cat);
    IList<string> annotations = new List<string>();

    // Categories -- at present there is no tag annotation.
    if (!t.IsPhrasal())
    {
        // Preterminal case.
        if (markColon && cat.Equals("$.") && (word.Equals(":") || word.Equals(";")))
        {
            annotations.Add("-%colon");
        }
    }
    else
    {
        IList<string> childBasicCats = ChildBasicCats(t);

        // Mark VPs headed by "zu" verbs.
        if (markZuVP && baseCat.Equals("VP"))
        {
            if (childBasicCats.Contains("VZ") || childBasicCats.Contains("VVIZU"))
            {
                annotations.Add("%ZU");
            }
        }

        // Mark relative-clause S nodes (needs a NegraLabel carrying an "RC" edge).
        if (markRC)
        {
            NegraLabel negraLabel = t.Label() as NegraLabel;
            if (negraLabel != null && baseCat.Equals("S") && negraLabel.GetEdge() != null && negraLabel.GetEdge().Equals("RC"))
            {
                annotations.Add("%RC");
            }
        }

        if (markContainsV && ContainsVP(t))
        {
            annotations.Add("%vp");
        }

        if (markLP && LeftPhrasal(t))
        {
            annotations.Add("%LP");
        }

        // The next two depend on functional tags being present on the children.
        if (markKonjParent)
        {
            for (int i = 0; i < childBasicCats.Count; i++)
            {
                if (childBasicCats[i].Contains("-KONJ"))
                {
                    annotations.Add("%konjp");
                    break;
                }
            }
        }

        if (markHDParent)
        {
            for (int i = 0; i < childBasicCats.Count; i++)
            {
                if (childBasicCats[i].Contains("-HD"))
                {
                    annotations.Add("%hdp");
                    break;
                }
            }
        }
    }

    // Put on all the annotations, in the order they were collected.
    string annotatedCat = cat + string.Concat(annotations);
    t.SetLabel(new CategoryWordTag(annotatedCat, word, tag));
    return t;
}