/// <summary>Converts a tree to the Morfette training format.</summary>
        /// <remarks>
        /// One line is produced per leaf, made of three space-separated fields:
        /// the token value, its lemma, and its morphological analysis.
        /// When the token carries no lemma the token value is used instead, and
        /// when it carries no original text the value of the matching
        /// pre-terminal label is used as the analysis.
        /// </remarks>
        /// <param name="tree">Tree whose leaves and pre-terminals are CoreLabels.</param>
        /// <returns>The formatted lines, each terminated by the platform line separator.</returns>
        private static string TreeToMorfette(Tree tree)
        {
            StringBuilder  sb        = new StringBuilder();
            IList <ILabel> wordYield = tree.Yield();
            IList <ILabel> tagYield  = tree.PreTerminalYield();

            System.Diagnostics.Debug.Assert(wordYield.Count == tagYield.Count);
            int listLen = wordYield.Count;

            for (int i = 0; i < listLen; ++i)
            {
                CoreLabel token = (CoreLabel)wordYield[i];
                CoreLabel tag   = (CoreLabel)tagYield[i];

                // The morphological analysis is stored in the original-text slot of the leaf.
                string morphStr = token.OriginalText();
                if (string.IsNullOrEmpty(morphStr))
                {
                    morphStr = tag.Value();
                }
                string lemma = token.Lemma();
                if (string.IsNullOrEmpty(lemma))
                {
                    lemma = token.Value();
                }
                // Java-style "%s"/"%n" specifiers are not understood by string.Format and
                // would be emitted literally, so the line is assembled explicitly.
                sb.Append(token.Value()).Append(' ').Append(lemma).Append(' ').Append(morphStr).AppendLine();
            }
            return(sb.ToString());
        }
Exemplo n.º 2
0
        /// <summary>
        /// Extracts the string form of one feature component from a label.
        /// </summary>
        /// <param name="label">Label to read from; may be null.</param>
        /// <param name="feature">Which component to extract.</param>
        /// <returns>
        /// The value of the head-word annotation, the head-tag annotation or the label
        /// itself, depending on <paramref name="feature"/>; the <c>Null</c> placeholder
        /// when <paramref name="label"/> is null.
        /// </returns>
        /// <exception cref="ArgumentException">The feature component is not one of the handled kinds.</exception>
        public static string GetFeatureFromCoreLabel(CoreLabel label, FeatureFactory.FeatureComponent feature)
        {
            bool missing = (label == null);

            switch (feature)
            {
            case FeatureFactory.FeatureComponent.Headword:
                if (missing)
                {
                    return(Null);
                }
                return(label.Get(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation)).Value());

            case FeatureFactory.FeatureComponent.Headtag:
                if (missing)
                {
                    return(Null);
                }
                return(label.Get(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation)).Value());

            case FeatureFactory.FeatureComponent.Value:
                if (missing)
                {
                    return(Null);
                }
                return(label.Value());

            default:
                // An unknown component is rejected even when the label is null.
                throw new ArgumentException("Unexpected feature type: " + feature);
            }
        }
        /// <summary>
        /// Reads every tree from the file named on the command line and prints one
        /// "word lemma morph" line per leaf, followed by an empty line after each tree.
        /// </summary>
        /// <remarks>
        /// The lemma and morphological analysis are obtained by splitting the original
        /// text of each leaf. When the analysis is missing, empty or "XXX", the value of
        /// the matching pre-terminal is printed instead.
        /// </remarks>
        /// <param name="args">Exactly one element: the path of the tree file.</param>
        public static void Main(string[] args)
        {
            if (args.Length != 1)
            {
                System.Console.Error.Printf("Usage: java %s tree_file%n", typeof(TreeToMorfette).FullName);
                System.Environment.Exit(-1);
            }
            string             treeFile = args[0];
            ITreeReaderFactory trf      = new FrenchTreeReaderFactory();

            try
            {
                ITreeReader tr = trf.NewTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")));
                try
                {
                    Tree tree1;
                    while ((tree1 = tr.ReadTree()) != null)
                    {
                        IList <ILabel> pretermYield = tree1.PreTerminalYield();
                        IList <ILabel> yield        = tree1.Yield();
                        int            yieldLen     = yield.Count;
                        for (int i = 0; i < yieldLen; ++i)
                        {
                            CoreLabel             rawToken   = (CoreLabel)yield[i];
                            string                word       = rawToken.Value();
                            string                morphStr   = rawToken.OriginalText();
                            Pair <string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(word, morphStr);
                            string                lemma      = lemmaMorph.First();
                            string                morph      = lemmaMorph.Second();
                            if (string.IsNullOrEmpty(morph) || morph.Equals("XXX"))
                            {
                                // No usable analysis: fall back to the part-of-speech tag.
                                morph = ((CoreLabel)pretermYield[i]).Value();
                            }
                            System.Console.Out.Printf("%s %s %s%n", word, lemma, morph);
                        }
                        System.Console.Out.WriteLine();
                    }
                }
                finally
                {
                    // Release the reader even when reading a tree fails part-way through.
                    // The finally sits inside the outer try so a failure in Close() is
                    // still reported by the catch clauses below.
                    tr.Close();
                }
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
        /// <summary>
        /// Replaces the value and tag of a pre-terminal's label with the alternate tag
        /// derived from the morphological analysis of its word.
        /// </summary>
        /// <remarks>
        /// The analysis is read from the original text of the child (word) label. When
        /// that is absent, a string is synthesised from the pre-terminal's value and the
        /// child's category. The label is left untouched when the resulting features
        /// carry no alternate tag.
        /// </remarks>
        /// <param name="t">A pre-terminal node; its label and its first child's label must be CoreLabels.</param>
        /// <param name="morpho">Specification used to turn the analysis string into features.</param>
        /// <exception cref="ArgumentException">
        /// <paramref name="t"/> is not a pre-terminal, or one of the two labels is not a CoreLabel.
        /// </exception>
        private static void ReplacePOSTag(Tree t, MorphoFeatureSpecification morpho)
        {
            if (!t.IsPreTerminal())
            {
                throw new ArgumentException("Can only operate on preterminals");
            }
            if (!(t.Label() is CoreLabel))
            {
                throw new ArgumentException("Only operates on CoreLabels");
            }
            CoreLabel tagLabel = (CoreLabel)t.Label();
            Tree      wordNode = t.Children()[0];

            if (!(wordNode.Label() is CoreLabel))
            {
                throw new ArgumentException("Only operates on CoreLabels");
            }
            CoreLabel wordLabel = (CoreLabel)wordNode.Label();

            // Morphological analysis attached to the word, if any.
            string analysis = wordLabel.OriginalText();

            if (string.IsNullOrEmpty(analysis))
            {
                // Build a stand-in analysis: tag, optional POS subcategory, empty remaining fields.
                analysis = tagLabel.Value();
                string subCategory = wordLabel.Category();
                if (string.IsNullOrEmpty(subCategory))
                {
                    analysis += "---";
                }
                else
                {
                    analysis += "-" + subCategory + "--";
                }
            }

            MorphoFeatures features = morpho.StrToFeatures(analysis);
            string         altTag   = features.GetAltTag();

            if (string.IsNullOrEmpty(altTag))
            {
                return;
            }
            tagLabel.SetValue(altTag);
            tagLabel.SetTag(altTag);
        }
 /// <summary>
 /// Tells whether the tree has at least one pre-terminal whose tag and word both
 /// satisfy the supplied patterns.
 /// </summary>
 /// <param name="tree">Tree whose nodes are scanned; pre-terminals and their first children must carry CoreLabels.</param>
 /// <param name="pos">
 /// Regular expression searched for in the part-of-speech tag (may be null,
 /// in which case any tag is accepted)
 /// </param>
 /// <param name="word">
 /// Regular expression searched for in the word (may be null, in which
 /// case any word is accepted)
 /// </param>
 /// <returns>True as soon as one pre-terminal satisfies both criteria; false otherwise.</returns>
 private static bool ShouldPrintTree(Tree tree, Pattern pos, Pattern word)
 {
     foreach (Tree node in tree)
     {
         if (!node.IsPreTerminal())
         {
             continue;
         }
         string tagText  = ((CoreLabel)node.Label()).Value();
         string wordText = ((CoreLabel)node.FirstChild().Label()).Value();

         bool tagAccepted = pos == null || pos.Matcher(tagText).Find();
         if (!tagAccepted)
         {
             continue;
         }
         bool wordAccepted = word == null || word.Matcher(wordText).Find();
         if (wordAccepted)
         {
             return(true);
         }
     }
     return(false);
 }
Exemplo n.º 6
0
        /// <summary>
        /// Locates the tree node that serves as the syntactic head of a mention.
        /// </summary>
        /// <remarks>
        /// Three strategies are tried in order:
        /// (1) a constituent in <paramref name="root"/> that spans the mention exactly;
        /// (2) when <c>allowReparsing</c> is set, a fresh parse of the mention embedded in
        /// an "It was ... ." carrier sentence, whose head is then mapped back onto
        /// <paramref name="root"/>;
        /// (3) otherwise the smallest covering constituent, and failing that a leaf chosen
        /// by scanning the part-of-speech tags of the mention's tokens.
        /// </remarks>
        /// <param name="m">The mention; its startIndex, endIndex and originalSpan are read.</param>
        /// <param name="root">Parse tree of the sentence containing the mention.</param>
        /// <param name="tokens">Tokens of the sentence, indexed like the mention's start/end indices.</param>
        /// <returns>
        /// The head node. On the reparsing path this is whatever
        /// FunkyFindLeafWithApproximateSpan returns, which is only checked for null by a debug assertion.
        /// </returns>
        protected internal virtual Tree FindSyntacticHead(Mention m, Tree root, IList <CoreLabel> tokens)
        {
            // mention ends with 's
            // (or a bare apostrophe): drop that final token from the span, unless it is the only token
            int endIdx = m.endIndex;

            if (m.originalSpan.Count > 0)
            {
                string lastWord = m.originalSpan[m.originalSpan.Count - 1].Get(typeof(CoreAnnotations.TextAnnotation));
                if ((lastWord.Equals("'s") || lastWord.Equals("'")) && m.originalSpan.Count != 1)
                {
                    endIdx--;
                }
            }
            Tree exactMatch = FindTreeWithSpan(root, m.startIndex, endIdx);

            //
            // found an exact match
            //
            if (exactMatch != null)
            {
                return(SafeHead(exactMatch, endIdx));
            }
            // no exact match found
            // in this case, we parse the actual extent of the mention, embedded in a sentence
            // context, so as to make the parser work better :-)
            if (allowReparsing)
            {
                // number of dash tokens left out of the reparsed extent
                int approximateness            = 0;
                IList <CoreLabel> extentTokens = new List <CoreLabel>();
                extentTokens.Add(InitCoreLabel("It"));
                extentTokens.Add(InitCoreLabel("was"));
                // number of carrier words ("It", "was") placed before the mention
                int AddedWords = 2;
                for (int i = m.startIndex; i < endIdx; i++)
                {
                    // Add everything except separated dashes! The separated dashes mess with the parser too badly.
                    CoreLabel label = tokens[i];
                    if (!"-".Equals(label.Word()))
                    {
                        // necessary to copy tokens in case the parser does things like
                        // put new indices on the tokens
                        extentTokens.Add((CoreLabel)label.LabelFactory().NewLabel(label));
                    }
                    else
                    {
                        approximateness++;
                    }
                }
                extentTokens.Add(InitCoreLabel("."));
                // constrain the parse to the part we're interested in.
                // Starting from ADDED_WORDS comes from skipping "It was".
                // -1 to exclude the period.
                // We now let it be any kind of nominal constituent, since there
                // are VP and S ones
                ParserConstraint         constraint  = new ParserConstraint(AddedWords, extentTokens.Count - 1, Pattern.Compile(".*"));
                IList <ParserConstraint> constraints = Java.Util.Collections.SingletonList(constraint);
                Tree tree = Parse(extentTokens, constraints);
                ConvertToCoreLabels(tree);
                // now unnecessary, as parser uses CoreLabels?
                tree.IndexSpans(m.startIndex - AddedWords);
                // remember it has ADDED_WORDS extra words at the beginning
                Tree subtree = FindPartialSpan(tree, m.startIndex);
                // There was a possible problem that with a crazy parse, extentHead could be one of the added words, not a real word!
                // Now we make sure in findPartialSpan that it can't be before the real start, and in safeHead, we disallow something
                // past the right end (that is, just that final period).
                Tree extentHead = SafeHead(subtree, endIdx);
                System.Diagnostics.Debug.Assert((extentHead != null));
                // extentHead is a child in the local extent parse tree. we need to find the corresponding node in the main tree
                // Because we deleted dashes, its index will be >= the index in the extent parse tree
                CoreLabel l        = (CoreLabel)extentHead.Label();
                Tree      realHead = FunkyFindLeafWithApproximateSpan(root, l.Value(), l.Get(typeof(CoreAnnotations.BeginIndexAnnotation)), approximateness);
                System.Diagnostics.Debug.Assert((realHead != null));
                return(realHead);
            }
            // If reparsing wasn't allowed, try to find a span in the tree
            // which happens to have the head
            Tree wordMatch = FindTreeWithSmallestSpan(root, m.startIndex, endIdx);

            if (wordMatch != null)
            {
                Tree head = SafeHead(wordMatch, endIdx);
                if (head != null)
                {
                    // IndexAnnotation is shifted by one relative to the token positions used here;
                    // accept the head only if it lies inside the (trimmed) mention span
                    int index = ((CoreLabel)head.Label()).Get(typeof(CoreAnnotations.IndexAnnotation)) - 1;
                    if (index >= m.startIndex && index < endIdx)
                    {
                        return(head);
                    }
                }
            }
            // If that didn't work, guess that it's the last word
            // ... or rather the last token whose tag starts with "N", stopping the scan at the
            // first token whose tag starts with "W" once a non-"N" token is seen.
            // NOTE(review): this loop runs to m.endIndex, not the trimmed endIdx — confirm that is intended.
            int lastNounIdx = endIdx - 1;

            for (int i_1 = m.startIndex; i_1 < m.endIndex; i_1++)
            {
                if (tokens[i_1].Tag().StartsWith("N"))
                {
                    lastNounIdx = i_1;
                }
                else
                {
                    if (tokens[i_1].Tag().StartsWith("W"))
                    {
                        break;
                    }
                }
            }
            IList <Tree> leaves  = root.GetLeaves();
            Tree         endLeaf = leaves[lastNounIdx];

            return(endLeaf);
        }
        /// <summary>
        /// Rewrites the leaf labels of a tree in place so that they carry lemmas and/or
        /// morphological analyses.
        /// </summary>
        /// <remarks>
        /// For every leaf the token counter is incremented. The lemma defaults to the
        /// surface word when absent; otherwise a lemma that is a bare parenthesis is
        /// replaced by its "-LRB-"/"-RRB-" escape. When both flags are set the lemma
        /// replacement happens first, so the morphology-augmented leaf is built from
        /// the lemma.
        /// </remarks>
        /// <param name="tree">Tree whose leaves must all be CoreLabels.</param>
        /// <param name="lemmasAsLeaves">When true, the word, value and lemma of each leaf are set to the lemma.</param>
        /// <param name="addMorphoToLeaves">
        /// When true, each leaf's value and word become
        /// value + MorphoMark + lemma + LemmaMark + analysis.
        /// </param>
        /// <exception cref="ArgumentException">A leaf label is not a CoreLabel.</exception>
        public static void MungeLeaves(Tree tree, bool lemmasAsLeaves, bool addMorphoToLeaves)
        {
            IList <ILabel> labels = tree.Yield();

            foreach (ILabel label in labels)
            {
                ++nTokens;
                if (!(label is CoreLabel))
                {
                    throw new ArgumentException("Only works with CoreLabels trees");
                }
                CoreLabel coreLabel = (CoreLabel)label;
                string    lemma     = coreLabel.Lemma();
                //PTB escaping since we're going to put this in the leaf
                if (lemma == null)
                {
                    // No lemma, so just add the surface form
                    lemma = coreLabel.Word();
                }
                else if (lemma.Equals("("))
                {
                    lemma = "-LRB-";
                }
                else if (lemma.Equals(")"))
                {
                    lemma = "-RRB-";
                }
                if (lemmasAsLeaves)
                {
                    coreLabel.SetWord(lemma);
                    coreLabel.SetValue(lemma);
                    coreLabel.SetLemma(lemma);
                }
                if (addMorphoToLeaves)
                {
                    string morphStr = coreLabel.OriginalText();
                    if (string.IsNullOrEmpty(morphStr))
                    {
                        morphStr = MorphoFeatureSpecification.NoAnalysis;
                    }
                    else
                    {
                        // Only leaves that actually carry an analysis are counted.
                        ++nMorphAnalyses;
                    }
                    // Normalize punctuation analyses
                    if (morphStr.StartsWith("PONCT"))
                    {
                        morphStr = "PUNC";
                    }
                    // Java-style "%s" specifiers are not understood by string.Format and would
                    // make every leaf the literal text "%s%s%s%s%s", so concatenate instead.
                    string newLeaf = coreLabel.Value() + MorphoFeatureSpecification.MorphoMark + lemma + MorphoFeatureSpecification.LemmaMark + morphStr;
                    coreLabel.SetValue(newLeaf);
                    coreLabel.SetWord(newLeaf);
                }
            }
        }
Exemplo n.º 8
0
        /// <summary>Locates the syntactic head of an entity mention inside a sentence parse.</summary>
        /// <remarks>
        /// Delegates to OriginalFindSyntacticHead unless <c>useNewHeadFinder</c> is set.
        /// Otherwise a constituent spanning the mention's extent exactly is preferred; when
        /// none exists the extent is re-parsed inside an "It was ... ." carrier sentence
        /// (dash tokens left out) and the head found there is mapped back onto
        /// <paramref name="root"/>.
        /// </remarks>
        /// <param name="ent">The entity mention</param>
        /// <param name="root">The Tree for the entire sentence in which it occurs.</param>
        /// <param name="tokens">The tokens of the sentence in which it occurs</param>
        /// <returns>
        /// The tree object corresponding to the head, as found in <paramref name="root"/>;
        /// on the re-parsing path this is the result of FunkyFindLeafWithApproximateSpan
        /// and is returned without a null check.
        /// </returns>
        public virtual Tree FindSyntacticHead(EntityMention ent, Tree root, IList <CoreLabel> tokens)
        {
            if (!useNewHeadFinder)
            {
                return(OriginalFindSyntacticHead(ent, root, tokens));
            }

            logger.Fine("Searching for tree matching " + ent);
            Tree spanMatch = FindTreeWithSpan(root, ent.GetExtentTokenStart(), ent.GetExtentTokenEnd());

            // Best case: a constituent covers the extent exactly.
            if (spanMatch != null)
            {
                logger.Fine("Mention \"" + ent + "\" mapped to tree: " + PrintTree(spanMatch));
                return(SafeHead(spanMatch));
            }

            // Otherwise parse just the extent, wrapped in a small carrier sentence so the
            // parser sees something sentence-like.
            int skippedDashes = 0;
            IList <CoreLabel> carrier = new List <CoreLabel>();

            carrier.Add(InitCoreLabel("It"));
            carrier.Add(InitCoreLabel("was"));
            int prefixWords = 2;

            for (int pos = ent.GetExtentTokenStart(); pos < ent.GetExtentTokenEnd(); pos++)
            {
                CoreLabel current = tokens[pos];
                if ("-".Equals(current.Word()))
                {
                    // Stand-alone dashes confuse the parser; leave them out but remember how many.
                    skippedDashes++;
                    continue;
                }
                carrier.Add(tokens[pos]);
            }
            carrier.Add(InitCoreLabel("."));

            // The constraint covers only the mention: it starts after the carrier prefix and
            // stops before the final period. Any constituent label is accepted.
            ParserConstraint         extentConstraint = new ParserConstraint(prefixWords, carrier.Count - 1, ".*");
            IList <ParserConstraint> constraintList   = Java.Util.Collections.SingletonList(extentConstraint);
            Tree localParse = Parse(carrier, constraintList);

            logger.Fine("No exact match found. Local parse:\n" + localParse.PennString());
            ConvertToCoreLabels(localParse);
            // Shift indices back by the carrier prefix so they line up with the sentence.
            localParse.IndexSpans(ent.GetExtentTokenStart() - prefixWords);
            Tree localExtent = FindPartialSpan(localParse, ent.GetExtentTokenStart());
            Tree localHead   = SafeHead(localExtent);

            logger.Fine("Head is: " + localHead);
            System.Diagnostics.Debug.Assert((localHead != null));

            // The head belongs to the local parse; find its counterpart in the sentence tree.
            // Dropped dashes mean its true index may be larger by up to skippedDashes.
            CoreLabel headLabel    = (CoreLabel)localHead.Label();
            Tree      sentenceHead = FunkyFindLeafWithApproximateSpan(root, headLabel.Value(), headLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation)), skippedDashes);

            if (sentenceHead != null)
            {
                logger.Fine("Chosen head: " + sentenceHead);
            }
            return(sentenceHead);
        }
        /// <summary>
        /// Performs the language-specific transformation of a single tree node by
        /// appending annotation marks to its category.
        /// </summary>
        /// <remarks>
        /// Null nodes and leaves are returned untouched. For phrasal nodes the marks
        /// "%ZU", "%RC", "%vp", "%LP", "%konjp" and "%hdp" are appended, in that order,
        /// each guarded by its own option flag. For any other node the only possible mark
        /// is "-%colon". The node's label is then replaced by a CategoryWordTag built from
        /// the annotated category and the original word and tag. Any parameterization
        /// belongs in the specific TreebankLangParserParams class.
        /// </remarks>
        /// <param name="t">Node to transform; its label must be a CoreLabel unless it is null or a leaf.</param>
        /// <param name="root">Root of the tree containing <paramref name="t"/>; currently not consulted.</param>
        /// <returns>The same node <paramref name="t"/>, relabelled in place.</returns>
        public override Tree TransformTree(Tree t, Tree root)
        {
            if (t == null || t.IsLeaf())
            {
                return(t);
            }

            IList <string> marks     = new List <string>();
            CoreLabel      nodeLabel = (CoreLabel)t.Label();
            string         word      = nodeLabel.Word();
            string         tag       = nodeLabel.Tag();
            string         cat       = nodeLabel.Value();
            string         baseCat   = TreebankLanguagePack().BasicCategory(cat);

            // Parent-category annotation is disabled; only categories are annotated,
            // there is no tag annotation at present.
            if (!t.IsPhrasal())
            {
                // Pre-terminal: distinguish colon/semicolon from other sentence punctuation.
                if (markColon && cat.Equals("$.") && (word.Equals(":") || word.Equals(";")))
                {
                    marks.Add("-%colon");
                }
            }
            else
            {
                IList <string> childCats = ChildBasicCats(t);

                // VPs headed by "zu" verbs
                if (markZuVP && baseCat.Equals("VP") && (childCats.Contains("VZ") || childCats.Contains("VVIZU")))
                {
                    marks.Add("%ZU");
                }

                // Relative-clause S nodes, recognised through the edge label of a NegraLabel
                if (markRC
                    && (t.Label() is NegraLabel)
                    && baseCat.Equals("S")
                    && ((NegraLabel)t.Label()).GetEdge() != null
                    && ((NegraLabel)t.Label()).GetEdge().Equals("RC"))
                {
                    marks.Add("%RC");
                }

                if (markContainsV && ContainsVP(t))
                {
                    marks.Add("%vp");
                }

                if (markLP && LeftPhrasal(t))
                {
                    marks.Add("%LP");
                }

                if (markKonjParent)
                {
                    // Relies on functional tags being present on the children.
                    foreach (string childCat in childCats)
                    {
                        if (!childCat.Contains("-KONJ"))
                        {
                            continue;
                        }
                        marks.Add("%konjp");
                        break;
                    }
                }

                if (markHDParent)
                {
                    // Relies on functional tags being present on the children.
                    foreach (string childCat in childCats)
                    {
                        if (!childCat.Contains("-HD"))
                        {
                            continue;
                        }
                        marks.Add("%hdp");
                        break;
                    }
                }
            }

            // Attach every collected mark to the category, in collection order.
            StringBuilder annotatedCat = new StringBuilder(cat);

            for (int k = 0; k < marks.Count; k++)
            {
                annotatedCat.Append(marks[k]);
            }
            t.SetLabel(new CategoryWordTag(annotatedCat.ToString(), word, tag));
            return(t);
        }