Ejemplo n.º 1
0
 /// <summary>
 /// Checks whether the constraint's state pattern matches the label value of
 /// <paramref name="top"/> or of any node reached by repeatedly descending
 /// through only-child links.
 /// </summary>
 /// <param name="top">Node at which the search starts.</param>
 /// <param name="constraint">Constraint whose <c>state</c> pattern is tested against each node value.</param>
 /// <returns>
 /// <c>true</c> as soon as a node on the single-child chain matches; <c>false</c> once a
 /// non-matching node is reached that does not have exactly one child.
 /// </returns>
 internal static bool ConstraintMatchesTreeTop(Tree top, ParserConstraint constraint)
 {
     // Walk down the unary chain; each step moves to the sole child of the current node.
     for (Tree node = top; ; node = node.Children()[0])
     {
         if (constraint.state.Matcher(node.Value()).Matches())
         {
             return true;
         }
         // Zero children or a branching node ends the unary chain without a match.
         if (node.Children().Length != 1)
         {
             return false;
         }
     }
 }
Ejemplo n.º 2
0
        /// <summary>
        /// Finds the tree node to use as the syntactic head of mention <paramref name="m"/>.
        /// </summary>
        /// <remarks>
        /// Strategies are tried in this order:
        /// 1. a subtree of <paramref name="root"/> whose span is exactly the mention span;
        /// 2. if <c>allowReparsing</c> is set, a fresh parse of the mention tokens wrapped in
        ///    "It was ... ." whose head is mapped back onto <paramref name="root"/>;
        /// 3. otherwise, the head of the smallest matching span, when its index lies inside the mention;
        /// 4. otherwise, the leaf at the last token whose tag starts with "N" (scan stops at a "W" tag),
        ///    defaulting to the last token of the mention.
        /// </remarks>
        /// <param name="m">The mention; its <c>startIndex</c>, <c>endIndex</c> and <c>originalSpan</c> are read.</param>
        /// <param name="root">The tree searched for the head.</param>
        /// <param name="tokens">Tokens indexed by the same positions as the mention's start/end indices.</param>
        /// <returns>The tree node chosen as the head by the first strategy that succeeds.</returns>
        protected internal virtual Tree FindSyntacticHead(Mention m, Tree root, IList <CoreLabel> tokens)
        {
            // If the mention ends with a possessive marker ('s or '), leave that token out of the span
            int endIdx = m.endIndex;

            if (m.originalSpan.Count > 0)
            {
                string lastWord = m.originalSpan[m.originalSpan.Count - 1].Get(typeof(CoreAnnotations.TextAnnotation));
                // A mention consisting solely of the marker keeps its single token
                if ((lastWord.Equals("'s") || lastWord.Equals("'")) && m.originalSpan.Count != 1)
                {
                    endIdx--;
                }
            }
            Tree exactMatch = FindTreeWithSpan(root, m.startIndex, endIdx);

            //
            // found an exact match
            //
            if (exactMatch != null)
            {
                return(SafeHead(exactMatch, endIdx));
            }
            // no exact match found
            // in this case, we parse the actual extent of the mention, embedded in a sentence
            // context, so as to make the parser work better :-)
            if (allowReparsing)
            {
                // Number of dash tokens dropped from the extent; passed on to the final leaf lookup
                int approximateness            = 0;
                IList <CoreLabel> extentTokens = new List <CoreLabel>();
                extentTokens.Add(InitCoreLabel("It"));
                extentTokens.Add(InitCoreLabel("was"));
                // Count of the artificial context words ("It", "was") placed before the mention
                int AddedWords = 2;
                for (int i = m.startIndex; i < endIdx; i++)
                {
                    // Add everything except separated dashes! The separated dashes mess with the parser too badly.
                    CoreLabel label = tokens[i];
                    if (!"-".Equals(label.Word()))
                    {
                        // necessary to copy tokens in case the parser does things like
                        // put new indices on the tokens
                        extentTokens.Add((CoreLabel)label.LabelFactory().NewLabel(label));
                    }
                    else
                    {
                        approximateness++;
                    }
                }
                extentTokens.Add(InitCoreLabel("."));
                // constrain the parse to the part we're interested in.
                // Starting from AddedWords comes from skipping "It was".
                // -1 to exclude the period.
                // We now let it be any kind of nominal constituent, since there
                // are VP and S ones
                ParserConstraint         constraint  = new ParserConstraint(AddedWords, extentTokens.Count - 1, Pattern.Compile(".*"));
                IList <ParserConstraint> constraints = Java.Util.Collections.SingletonList(constraint);
                Tree tree = Parse(extentTokens, constraints);
                ConvertToCoreLabels(tree);
                // now unnecessary, as parser uses CoreLabels?
                tree.IndexSpans(m.startIndex - AddedWords);
                // remember it has AddedWords extra words at the beginning
                Tree subtree = FindPartialSpan(tree, m.startIndex);
                // There was a possible problem that with a crazy parse, extentHead could be one of the added words, not a real word!
                // Now we make sure in FindPartialSpan that it can't be before the real start, and in SafeHead, we disallow something
                // past the right end (that is, just that final period).
                Tree extentHead = SafeHead(subtree, endIdx);
                System.Diagnostics.Debug.Assert((extentHead != null));
                // extentHead is a child in the local extent parse tree. we need to find the corresponding node in the main tree
                // Because we deleted dashes, its index will be >= the index in the extent parse tree
                CoreLabel l        = (CoreLabel)extentHead.Label();
                Tree      realHead = FunkyFindLeafWithApproximateSpan(root, l.Value(), l.Get(typeof(CoreAnnotations.BeginIndexAnnotation)), approximateness);
                System.Diagnostics.Debug.Assert((realHead != null));
                return(realHead);
            }
            // If reparsing wasn't allowed, try to find a span in the tree
            // which happens to have the head
            Tree wordMatch = FindTreeWithSmallestSpan(root, m.startIndex, endIdx);

            if (wordMatch != null)
            {
                Tree head = SafeHead(wordMatch, endIdx);
                if (head != null)
                {
                    // The stored IndexAnnotation is shifted down by one before comparing with the mention bounds
                    int index = ((CoreLabel)head.Label()).Get(typeof(CoreAnnotations.IndexAnnotation)) - 1;
                    // Accept the head only if it lies inside the (possibly shortened) mention span
                    if (index >= m.startIndex && index < endIdx)
                    {
                        return(head);
                    }
                }
            }
            // If that didn't work, guess that it's the last word
            int lastNounIdx = endIdx - 1;

            // Prefer the last token tagged "N*"; stop scanning at the first "W*" tag.
            // NOTE(review): this scan runs to m.endIndex rather than the adjusted endIdx - confirm that is intended.
            for (int i_1 = m.startIndex; i_1 < m.endIndex; i_1++)
            {
                if (tokens[i_1].Tag().StartsWith("N"))
                {
                    lastNounIdx = i_1;
                }
                else
                {
                    if (tokens[i_1].Tag().StartsWith("W"))
                    {
                        break;
                    }
                }
            }
            // Token positions are used directly as indices into the tree's leaf list
            IList <Tree> leaves  = root.GetLeaves();
            Tree         endLeaf = leaves[lastNounIdx];

            return(endLeaf);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Parses an XML description of sentence lattices from <paramref name="stream"/> and
        /// appends one <c>Lattice</c> per sentence element to <c>lattices</c>.
        /// </summary>
        /// <remarks>
        /// Numeric attributes (node ids and edge weights) are parsed with the invariant culture,
        /// so the file is read the same way regardless of the current thread culture.
        /// </remarks>
        /// <param name="stream">Stream containing the XML document.</param>
        /// <returns>
        /// <c>true</c> if the whole document was processed; <c>false</c> if no XML parser could be
        /// obtained or an <c>IOException</c> / <c>SAXException</c> was caught while parsing.
        /// </returns>
        private bool Load(InputStream stream)
        {
            DocumentBuilder parser = XMLUtils.GetXmlParser();

            if (parser == null)
            {
                return(false);
            }
            // Machine-written numbers must not depend on the user's locale (e.g. "," as decimal separator).
            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;
            try
            {
                IDocument xmlDocument = parser.Parse(stream);
                IElement  root        = xmlDocument.GetDocumentElement();
                INodeList sentences   = root.GetElementsByTagName(Sentence);
                for (int i = 0; i < sentences.GetLength(); i++)
                {
                    IElement sentence = (IElement)sentences.Item(i);
                    Lattice  lattice  = new Lattice();
                    //Create the node map: collect the node ids in sorted order
                    ISortedSet <int> nodes    = new TreeSet <int>();
                    INodeList        xmlNodes = sentence.GetElementsByTagName(Node);
                    for (int nodeIdx = 0; nodeIdx < xmlNodes.GetLength(); nodeIdx++)
                    {
                        IElement xmlNode  = (IElement)xmlNodes.Item(nodeIdx);
                        int      nodeName = System.Convert.ToInt32(xmlNode.GetAttribute(NodeId), invariant);
                        nodes.Add(nodeName);
                    }
                    // Maps each node id from the file to its 0-based rank in sorted order
                    IDictionary <int, int> nodeMap = Generics.NewHashMap();
                    int realNodeIdx      = 0;
                    int lastBoundaryNode = -1;
                    foreach (int nodeName_1 in nodes)
                    {
                        if (lastBoundaryNode == -1)
                        {
                            // The first (smallest) node id is expected to be a multiple of NodeOffset
                            System.Diagnostics.Debug.Assert(nodeName_1 % NodeOffset == 0);
                            lastBoundaryNode = realNodeIdx;
                        }
                        else
                        {
                            if (nodeName_1 % NodeOffset == 0)
                            {
                                // Every later multiple of NodeOffset yields a constraint from the first boundary to here
                                ParserConstraint c = new ParserConstraint(lastBoundaryNode, realNodeIdx, ".*");
                                lattice.AddConstraint(c);
                            }
                        }
                        nodeMap[nodeName_1] = realNodeIdx;
                        realNodeIdx++;
                    }
                    //Read the edges
                    INodeList xmlEdges = sentence.GetElementsByTagName(Edge);
                    for (int edgeIdx = 0; edgeIdx < xmlEdges.GetLength(); edgeIdx++)
                    {
                        IElement xmlEdge = (IElement)xmlEdges.Item(edgeIdx);
                        string   segment = xmlEdge.GetAttribute(Segment);
                        //Input weights should be log scale
                        double   weight  = double.Parse(xmlEdge.GetAttribute(Weight), invariant);
                        // Translate the endpoints from file node ids to normalized indices
                        int         from     = System.Convert.ToInt32(xmlEdge.GetAttribute(FromNode), invariant);
                        int         normFrom = nodeMap[from];
                        int         to       = System.Convert.ToInt32(xmlEdge.GetAttribute(ToNode), invariant);
                        int         normTo   = nodeMap[to];
                        LatticeEdge e        = new LatticeEdge(segment, weight, normFrom, normTo);
                        // Set attributes below here
                        INodeList xmlAttrs = xmlEdge.GetElementsByTagName(EAttrNode);
                        for (int attrIdx = 0; attrIdx < xmlAttrs.GetLength(); attrIdx++)
                        {
                            IElement xmlAttr = (IElement)xmlAttrs.Item(attrIdx);
                            string   key     = xmlAttr.GetAttribute(EAttr);
                            string   value   = xmlAttr.GetAttribute(EAttrVal);
                            e.SetAttr(key, value);
                        }
                        lattice.AddEdge(e);
                    }
                    //Configure for parsing in ExhaustivePCFG parser
                    lattice.AddBoundary();
                    lattices.Add(lattice);
                }
            }
            catch (IOException e)
            {
                System.Console.Error.Printf("%s: Error reading XML from input stream.%n", this.GetType().FullName);
                Sharpen.Runtime.PrintStackTrace(e);
                return(false);
            }
            catch (SAXException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
                return(false);
            }
            return(true);
        }
Ejemplo n.º 4
0
 /// <summary>Appends the given parser constraint to this object's <c>constraints</c> collection.</summary>
 /// <param name="c">The constraint to add.</param>
 public virtual void AddConstraint(ParserConstraint c) => constraints.Add(c);
Ejemplo n.º 5
0
        /// <summary>
        /// Locates the syntactic head of an entity mention within the parse tree of its sentence.
        /// </summary>
        /// <remarks>
        /// When <c>useNewHeadFinder</c> is off, the work is delegated to
        /// <c>OriginalFindSyntacticHead</c>. Otherwise an exact span match in <paramref name="root"/>
        /// is tried first; failing that, the mention's tokens (minus stand-alone dashes) are parsed
        /// on their own inside an "It was ... ." frame and the resulting head is mapped back onto
        /// <paramref name="root"/>.
        /// </remarks>
        /// <param name="ent">The entity mention.</param>
        /// <param name="root">The tree for the entire sentence in which the mention occurs.</param>
        /// <param name="tokens">The tokens of the sentence in which the mention occurs.</param>
        /// <returns>
        /// The tree object corresponding to the head. This MUST be a child of root and is meant to
        /// be a leaf of the parse tree. The result of the final leaf lookup is returned unchanged,
        /// even if that lookup produced <c>null</c>.
        /// </returns>
        public virtual Tree FindSyntacticHead(EntityMention ent, Tree root, IList <CoreLabel> tokens)
        {
            if (!useNewHeadFinder)
            {
                return OriginalFindSyntacticHead(ent, root, tokens);
            }

            logger.Fine("Searching for tree matching " + ent);
            Tree spanMatch = FindTreeWithSpan(root, ent.GetExtentTokenStart(), ent.GetExtentTokenEnd());
            if (spanMatch != null)
            {
                // A constituent covers the mention exactly: take its head directly.
                logger.Fine("Mention \"" + ent + "\" mapped to tree: " + PrintTree(spanMatch));
                return SafeHead(spanMatch);
            }

            // No constituent lines up with the mention, so parse the mention extent by itself,
            // wrapped in a minimal sentence frame that gives the parser some context.
            int skippedDashes = 0;
            IList <CoreLabel> framed = new List <CoreLabel>();
            framed.Add(InitCoreLabel("It"));
            framed.Add(InitCoreLabel("was"));
            // Number of frame words placed in front of the mention tokens.
            const int leadingFrameWords = 2;

            for (int pos = ent.GetExtentTokenStart(); pos < ent.GetExtentTokenEnd(); pos++)
            {
                CoreLabel token = tokens[pos];
                if ("-".Equals(token.Word()))
                {
                    // Stand-alone dashes confuse the parser badly; drop them but keep count.
                    skippedDashes++;
                    continue;
                }
                framed.Add(token);
            }
            framed.Add(InitCoreLabel("."));

            // Constrain the parse to the mention itself: start after the frame words and stop
            // before the closing period. Any constituent label is accepted (".*").
            IList <ParserConstraint> mentionOnly = Java.Util.Collections.SingletonList(
                new ParserConstraint(leadingFrameWords, framed.Count - 1, ".*"));
            Tree localParse = Parse(framed, mentionOnly);

            logger.Fine("No exact match found. Local parse:\n" + localParse.PennString());
            ConvertToCoreLabels(localParse);
            // Shift the span indices so the mention tokens line up with their sentence positions
            // despite the extra frame words at the front.
            localParse.IndexSpans(ent.GetExtentTokenStart() - leadingFrameWords);
            Tree mentionSubtree = FindPartialSpan(localParse, ent.GetExtentTokenStart());
            Tree localHead      = SafeHead(mentionSubtree);

            logger.Fine("Head is: " + localHead);
            System.Diagnostics.Debug.Assert(localHead != null);

            // localHead belongs to the local parse; find the matching leaf in the sentence tree.
            // Dropped dashes mean its true index may be larger than the local one, which is why
            // the dash count is passed along as slack.
            CoreLabel headLabel = (CoreLabel)localHead.Label();
            Tree sentenceHead = FunkyFindLeafWithApproximateSpan(
                root,
                headLabel.Value(),
                headLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation)),
                skippedDashes);

            if (sentenceHead != null)
            {
                logger.Fine("Chosen head: " + sentenceHead);
            }
            return sentenceHead;
        }