Example #1
0
        /// <summary>
        /// Parses the given XML file and returns the Ssurgeon patterns it declares.
        /// </summary>
        /// <remarks>
        /// Every element tagged <c>SsurgeonPattern.SsurgeonElemTag</c> is converted with
        /// <c>SsurgeonPatternFromXML</c>. Every element tagged <c>SsurgeonPattern.ResourceTag</c>
        /// is wrapped in a <c>SsurgeonWordlist</c> and registered through <c>AddResource</c>.
        /// </remarks>
        /// <param name="file">The XML file to parse.</param>
        /// <returns>The patterns read from the file, in the order the parser lists them.</returns>
        /// <exception cref="System.Exception"/>
        public virtual IList <SsurgeonPattern> ReadFromFile(File file)
        {
            IList <SsurgeonPattern> retList = new List <SsurgeonPattern>();
            IDocument doc          = DocumentBuilderFactory.NewInstance().NewDocumentBuilder().Parse(file);
            INodeList patternNodes = doc.GetElementsByTagName(SsurgeonPattern.SsurgeonElemTag);

            for (int i = 0; i < patternNodes.GetLength(); i++)
            {
                INode node = patternNodes.Item(i);
                if (node.GetNodeType() == NodeConstants.ElementNode)
                {
                    IElement        elt     = (IElement)node;
                    SsurgeonPattern pattern = SsurgeonPatternFromXML(elt);
                    retList.Add(pattern);
                }
            }
            INodeList resourceNodes = doc.GetElementsByTagName(SsurgeonPattern.ResourceTag);

            for (int j = 0; j < resourceNodes.GetLength(); j++)
            {
                // Read from the resource list itself; indexing the pattern list here
                // would turn pattern elements into word lists.
                INode node = resourceNodes.Item(j);
                if (node.GetNodeType() == NodeConstants.ElementNode)
                {
                    IElement         resourceElt = (IElement)node;
                    SsurgeonWordlist wlRsrc      = new SsurgeonWordlist(resourceElt);
                    AddResource(wlRsrc);
                }
            }
            return(retList);
        }
Example #2
0
        /// <summary>
        /// Walks down from <paramref name="node"/> following <paramref name="nodePath"/>, one child
        /// name per level starting at index <paramref name="cur"/>, and appends to
        /// <paramref name="res"/> every node that matches the last path entry.
        /// </summary>
        /// <param name="node">Node whose children are inspected at this level.</param>
        /// <param name="nodePath">Child names to follow, outermost first.</param>
        /// <param name="cur">Index into <paramref name="nodePath"/> for this level; out-of-range values do nothing.</param>
        /// <param name="res">Receives the matching nodes.</param>
        private static void GetMatchingNodes(INode node, string[] nodePath, int cur, IList <INode> res)
        {
            bool inRange = cur >= 0 && cur < nodePath.Length;
            if (!inRange || !node.HasChildNodes())
            {
                return;
            }

            string    wanted      = nodePath[cur];
            bool      isFinalStep = (cur + 1 == nodePath.Length);
            INodeList kids        = node.GetChildNodes();

            for (int idx = 0; idx < kids.GetLength(); idx++)
            {
                INode kid = kids.Item(idx);
                if (!wanted.Equals(kid.GetNodeName()))
                {
                    continue;
                }
                if (isFinalStep)
                {
                    // End of the path: this child is a result.
                    res.Add(kid);
                    continue;
                }
                // More path entries remain: descend one level.
                GetMatchingNodes(kid, nodePath, cur + 1, res);
            }
        }
Example #3
0
        /// <summary>
        /// Builds a <c>SsurgeonPattern</c> from its root XML element.
        /// </summary>
        /// <remarks>
        /// The uid, notes and semgrex text are read with <c>GetTagText</c>; each element tagged
        /// <c>SsurgeonPattern.EditListElemTag</c> contributes one parsed edit; and, when an element
        /// tagged <c>SsurgeonPattern.PredicateTag</c> is present, its first child element is
        /// assembled into the pattern's predicate.
        /// </remarks>
        /// <param name="elt">Root element of one pattern.</param>
        /// <returns>The assembled pattern.</returns>
        /// <exception cref="System.Exception"/>
        public static SsurgeonPattern SsurgeonPatternFromXML(IElement elt)
        {
            string uid           = GetTagText(elt, SsurgeonPattern.UidElemTag);
            string notes         = GetTagText(elt, SsurgeonPattern.NotesElemTag);
            string semgrexSource = GetTagText(elt, SsurgeonPattern.SemgrexElemTag);

            SsurgeonPattern result = new SsurgeonPattern(uid, SemgrexPattern.Compile(semgrexSource));
            result.SetNotes(notes);

            // One edit per edit element; non-element nodes are skipped.
            INodeList edits = elt.GetElementsByTagName(SsurgeonPattern.EditListElemTag);
            for (int idx = 0; idx < edits.GetLength(); idx++)
            {
                INode candidate = edits.Item(idx);
                if (candidate.GetNodeType() != NodeConstants.ElementNode)
                {
                    continue;
                }
                string editLine = GetEltText((IElement)candidate);
                result.AddEdit(Edu.Stanford.Nlp.Semgraph.Semgrex.Ssurgeon.Ssurgeon.ParseEditLine(editLine));
            }

            // The predicate is optional.
            IElement predicateElt = GetFirstTag(elt, SsurgeonPattern.PredicateTag);
            if (predicateElt == null)
            {
                return(result);
            }
            result.SetPredicate(AssemblePredFromXML(GetFirstChildElement(predicateElt)));
            return(result);
        }
Example #4
0
        /// <summary>
        /// Walks down from <paramref name="node"/> following <paramref name="nodePath"/>, one
        /// pattern per level starting at index <paramref name="cur"/>, and appends to
        /// <paramref name="res"/> every node whose name fully matches the last pattern.
        /// </summary>
        /// <param name="node">Node whose children are inspected at this level.</param>
        /// <param name="nodePath">Node-name patterns to follow, outermost first.</param>
        /// <param name="cur">Index into <paramref name="nodePath"/> for this level; out-of-range values do nothing.</param>
        /// <param name="res">Receives the matching nodes.</param>
        private static void GetMatchingNodes(INode node, Pattern[] nodePath, int cur, IList <INode> res)
        {
            bool inRange = cur >= 0 && cur < nodePath.Length;
            if (!inRange)
            {
                return;
            }

            Pattern   namePattern = nodePath[cur];
            bool      isFinalStep = (cur + 1 == nodePath.Length);
            INodeList kids        = node.GetChildNodes();

            for (int idx = 0; idx < kids.GetLength(); idx++)
            {
                INode kid = kids.Item(idx);
                if (!namePattern.Matcher(kid.GetNodeName()).Matches())
                {
                    continue;
                }
                if (isFinalStep)
                {
                    // End of the path: this child is a result.
                    res.Add(kid);
                    continue;
                }
                // More patterns remain: descend one level.
                GetMatchingNodes(kid, nodePath, cur + 1, res);
            }
        }
Example #5
0
        /// <summary>Removes every direct child of the given node.</summary>
        /// <param name="e">The node to empty.</param>
        public static void RemoveChildren(INode e)
        {
            INodeList list = e.GetChildNodes();

            // DOM child lists are live: removing item i shifts the later items down by one,
            // so a forward index walk skips every other child. Walking from the end removes
            // all of them, and is equally correct if the list happens to be a snapshot.
            for (int i = list.GetLength() - 1; i >= 0; i--)
            {
                e.RemoveChild(list.Item(i));
            }
        }
        /// <summary>
        /// Rebuilds the word list from its XML element: the id is taken from the "id" attribute
        /// and one word is added for each element returned for the <c>WordElt</c> tag.
        /// </summary>
        /// <param name="rootElt">Root element of the word-list resource.</param>
        public SsurgeonWordlist(IElement rootElt)
        {
            id = rootElt.GetAttribute("id");

            INodeList wordNodes = rootElt.GetElementsByTagName(WordElt);
            int       position  = 0;
            while (position < wordNodes.GetLength())
            {
                INode candidate = wordNodes.Item(position);
                position++;
                if (candidate.GetNodeType() != NodeConstants.ElementNode)
                {
                    continue;
                }
                IElement wordElement = (IElement)candidate;
                words.Add(Edu.Stanford.Nlp.Semgraph.Semgrex.Ssurgeon.Ssurgeon.GetEltText(wordElement));
            }
        }
        /// <summary>Collects the direct children of a node whose node name equals the given name.</summary>
        /// <param name="node">Parent node whose immediate children are examined.</param>
        /// <param name="name">Node name to look for.</param>
        /// <returns>The matching children, in child-list order; empty when none match.</returns>
        protected internal static IList <INode> GetChildrenByName(INode node, string name)
        {
            IList <INode> hits = new List <INode>();
            INodeList     kids = node.GetChildNodes();
            int           idx  = 0;

            while (idx < kids.GetLength())
            {
                INode kid = kids.Item(idx);
                idx++;
                if (!kid.GetNodeName().Equals(name))
                {
                    continue;
                }
                hits.Add(kid);
            }
            return(hits);
        }
        /// <summary>
        /// Concatenates the text content of each direct child of <paramref name="text"/>,
        /// appending one space after every child.
        /// </summary>
        /// <remarks>
        /// Before appending, each child's text is passed through two <c>ReplaceAll</c> calls:
        /// the first rewrites matches of <c>\n(?!\n)</c> (a newline not followed by another
        /// newline, assuming Java-style regex semantics) to a space, the second removes underscores.
        /// </remarks>
        /// <param name="text">Node whose children supply the text.</param>
        /// <returns>The cleaned, space-separated text.</returns>
        public static string GetJustText(INode text)
        {
            INodeList     pieces = text.GetChildNodes();
            StringBuilder joined = new StringBuilder();

            for (int idx = 0; idx < pieces.GetLength(); idx++)
            {
                string cleaned = pieces.Item(idx).GetTextContent();
                cleaned = cleaned.ReplaceAll("\n(?!\n)", " ").ReplaceAll("_", string.Empty);
                // The trailing space keeps adjacent children apart (sentence-splitting fix).
                joined.Append(cleaned).Append(" ");
            }
            return(joined.ToString());
        }
Example #9
0
 /// <summary>
 /// Returns the node value of the element's first child node.
 /// </summary>
 /// <remarks>
 /// An element without children yields the empty string. Any exception raised while
 /// reading is logged as a warning and the empty string is returned instead.
 /// </remarks>
 /// <param name="element">Element whose first child is read.</param>
 /// <returns>The first child's node value, or the empty string as described above.</returns>
 public static string GetEltText(IElement element)
 {
     string text = string.Empty;
     try
     {
         INodeList kids = element.GetChildNodes();
         if (kids.GetLength() > 0)
         {
             text = kids.Item(0).GetNodeValue();
         }
     }
     catch (Exception e)
     {
         log.Warning("Exception e=" + e.Message + " thrown calling getEltText on element=" + element);
     }
     return(text);
 }
Example #10
0
 /// <summary>Finds the first direct child of the given element that is itself an element node.</summary>
 /// <param name="element">Parent element to search.</param>
 /// <returns>
 /// The first element-typed child, or null when there is none or when an exception
 /// (logged as a warning) interrupts the search.
 /// </returns>
 private static IElement GetFirstChildElement(IElement element)
 {
     IElement firstElement = null;
     try
     {
         INodeList kids = element.GetChildNodes();
         int       idx  = 0;
         while (firstElement == null && idx < kids.GetLength())
         {
             INode candidate = kids.Item(idx++);
             if (candidate.GetNodeType() == NodeConstants.ElementNode)
             {
                 firstElement = (IElement)candidate;
             }
         }
     }
     catch (Exception e)
     {
         log.Warning("Error getting first child Element for element=" + element + ", exception=" + e);
     }
     return(firstElement);
 }
        //Silently ignore
        /// <summary>
        /// Reads sentences starting at <c>sentIdx</c> until one converts to a tree, and returns
        /// that tree after normalisation.
        /// </summary>
        /// <remarks>
        /// <c>sentIdx</c> is advanced past every sentence examined. When the normalised tree's
        /// label is a <c>CoreLabel</c>, its <c>SentenceIDAnnotation</c> is set to the sentence
        /// element's <c>AttrNumber</c> attribute.
        /// </remarks>
        /// <returns>The next tree, or null when no sentence list exists or none remain.</returns>
        public virtual Tree ReadTree()
        {
            while (sentences != null && sentIdx < sentences.GetLength())
            {
                INode sentRoot = sentences.Item(sentIdx++);
                Tree  parsed   = GetTreeFromXML(sentRoot);
                if (parsed == null)
                {
                    // This sentence produced no tree; try the next one.
                    continue;
                }

                Tree normalized = treeNormalizer.NormalizeWholeTree(parsed, treeFactory);
                if (normalized.Label() is CoreLabel)
                {
                    string ftbId = ((IElement)sentRoot).GetAttribute(AttrNumber);
                    ((CoreLabel)normalized.Label()).Set(typeof(CoreAnnotations.SentenceIDAnnotation), ftbId);
                }
                return(normalized);
            }
            return(null);
        }
        //Silently ignore
        /// <summary>
        /// Reads sentences starting at <c>sentIdx</c> until one converts to a tree, and returns
        /// that tree after normalisation.
        /// </summary>
        /// <remarks>
        /// <c>sentIdx</c> is advanced past every sentence examined. When the normalised tree's
        /// label is a <c>CoreLabel</c>, its <c>SentenceIDAnnotation</c> is set to the decimal
        /// text of the sentence's index in <c>sentences</c>.
        /// </remarks>
        /// <returns>The next tree, or null when no sentence list exists or none remain.</returns>
        public virtual Tree ReadTree()
        {
            Tree t = null;

            while (t == null && sentences != null && sentIdx < sentences.GetLength())
            {
                int   thisSentenceId = sentIdx++;
                INode sentRoot       = sentences.Item(thisSentenceId);
                t = GetTreeFromXML(sentRoot);
                if (t != null)
                {
                    t = treeNormalizer.NormalizeWholeTree(t, treeFactory);
                    if (t.Label() is CoreLabel)
                    {
                        // int has no static ToString(int); format the index with the invariant
                        // culture so the id does not depend on the machine's locale.
                        string sentenceId = thisSentenceId.ToString(System.Globalization.CultureInfo.InvariantCulture);
                        ((CoreLabel)t.Label()).Set(typeof(CoreAnnotations.SentenceIDAnnotation), sentenceId);
                    }
                }
            }
            return(t);
        }
Example #13
0
        /// <summary>
        /// Depth-first search for the first node named <paramref name="name"/>, starting with
        /// <paramref name="node"/> itself and then descending through its children in order.
        /// </summary>
        /// <param name="node">Root of the subtree to search.</param>
        /// <param name="name">Node name to look for.</param>
        /// <returns>The first matching node, or null when the subtree contains none.</returns>
        protected internal static INode GetChildByName(INode node, string name)
        {
            if (node.GetNodeName().Equals(name))
            {
                // The starting node itself counts as a match.
                return(node);
            }

            INodeList kids = node.GetChildNodes();
            int       idx  = 0;
            while (idx < kids.GetLength())
            {
                INode hit = GetChildByName(kids.Item(idx), name);
                if (hit != null)
                {
                    return(hit);
                }
                idx++;
            }
            return(null);
        }
        /// <summary>
        /// Reads the character list of a document: one <c>Person</c> per "character" node found
        /// directly under the first "characters" element.
        /// </summary>
        /// <remarks>
        /// Each person is built from the "name" attribute, the optional "gender" attribute
        /// (empty string when absent) and the "aliases" attribute split on ';'.
        /// The "name" and "aliases" attributes are required; a missing one raises an exception.
        /// </remarks>
        /// <param name="doc">Parsed XML document containing a "characters" element.</param>
        /// <returns>The persons in the order their nodes appear in the child list.</returns>
        public static IList <Person> ReadXMLCharacterList(IDocument doc)
        {
            IList <Person> personList = new List <Person>();
            INodeList      characters = doc.GetDocumentElement().GetElementsByTagName("characters").Item(0).GetChildNodes();

            for (int i = 0; i < characters.GetLength(); i++)
            {
                INode child = characters.Item(i);
                if (!child.GetNodeName().Equals("character"))
                {
                    continue;
                }

                var attributes = child.GetAttributes();
                // The name is passed through exactly as written. The previous code also built
                // a capitalised copy that was never used and that threw on an empty name.
                string         name    = attributes.GetNamedItem("name").GetNodeValue();
                IList <string> aliases = Arrays.AsList(attributes.GetNamedItem("aliases").GetNodeValue().Split(";"));
                INode          genderAttribute = attributes.GetNamedItem("gender");
                string         gender  = (genderAttribute == null) ? string.Empty : genderAttribute.GetNodeValue();
                personList.Add(new Person(name, gender, aliases));
            }
            return(personList);
        }
Example #15
0
        /// <summary>Collects the direct children of the given element that are element nodes.</summary>
        /// <remarks>
        /// Children of any other node type are ignored. If an exception occurs while scanning,
        /// it is logged as a warning and the children gathered up to that point are returned.
        /// </remarks>
        /// <param name="element">Parent element to scan.</param>
        /// <returns>The element-typed children, in child-list order.</returns>
        private static IList <IElement> GetChildElements(IElement element)
        {
            LinkedList <IElement> collected = new LinkedList <IElement>();

            try
            {
                INodeList kids = element.GetChildNodes();
                int       idx  = 0;
                while (idx < kids.GetLength())
                {
                    INode candidate = kids.Item(idx);
                    idx++;
                    if (candidate.GetNodeType() != NodeConstants.ElementNode)
                    {
                        continue;
                    }
                    collected.Add((IElement)candidate);
                }
            }
            catch (Exception e)
            {
                log.Warning("Exception thrown getting all children for element=" + element + ", e=" + e);
            }
            return(collected);
        }
Example #16
0
        /// <summary>
        /// Depth-first search for the first node that has the given node name and carries the
        /// given attribute with the given value, starting with <paramref name="node"/> itself.
        /// </summary>
        /// <param name="node">Root of the subtree to search.</param>
        /// <param name="name">Required node name.</param>
        /// <param name="attributeName">Name of the attribute that must be present.</param>
        /// <param name="attributeValue">Value the attribute must have.</param>
        /// <returns>The first matching node, or null when the subtree contains none.</returns>
        protected internal static INode GetChildByNameAndAttribute(INode node, string name, string attributeName, string attributeValue)
        {
            INodeList     children  = node.GetChildNodes();
            INamedNodeMap attribs   = node.GetAttributes();
            INode         attribute = null;

            // this node matches
            if (node.GetNodeName().Equals(name) && attribs != null && (attribute = attribs.GetNamedItem(attributeName)) != null && attribute.GetNodeValue().Equals(attributeValue))
            {
                return(node);
            }
            // search children
            for (int i = 0; i < children.GetLength(); i++)
            {
                // Recurse with the name as well: delegating to the attribute-only search
                // would accept descendants whose node name differs from the requested one.
                INode found = GetChildByNameAndAttribute(children.Item(i), name, attributeName, attributeValue);
                if (found != null)
                {
                    return(found);
                }
            }
            // failed
            return(null);
        }
Example #17
0
 /// <summary>
 /// Returns the first element node among those the given element reports for the tag.
 /// </summary>
 /// <param name="element">Element to query with <c>GetElementsByTagName</c>.</param>
 /// <param name="tag">Tag name to look for.</param>
 /// <returns>
 /// The first element-typed hit, or null when there is none or when an exception
 /// (logged as a warning) interrupts the lookup.
 /// </returns>
 private static IElement GetFirstTag(IElement element, string tag)
 {
     IElement firstHit = null;
     try
     {
         INodeList hits = element.GetElementsByTagName(tag);
         int       idx  = 0;
         // An empty list simply skips the loop and leaves the result null.
         while (firstHit == null && idx < hits.GetLength())
         {
             INode candidate = hits.Item(idx++);
             if (candidate.GetNodeType() == NodeConstants.ElementNode)
             {
                 firstHit = (IElement)candidate;
             }
         }
     }
     catch (Exception)
     {
         log.Warning("Error getting first tag " + tag + " under element=" + element);
     }
     return(firstHit);
 }
Example #18
0
        /// <summary>
        /// Parses an XML stream and appends one <c>Lattice</c> per <c>Sentence</c> element to
        /// <c>lattices</c>.
        /// </summary>
        /// <remarks>
        /// For each sentence the node ids are collected into a sorted set and renumbered
        /// consecutively from 0 in ascending id order; edges are then created against the
        /// renumbered ids, with their <c>EAttrNode</c> children copied as edge attributes.
        /// </remarks>
        /// <param name="stream">Stream containing the lattice XML.</param>
        /// <returns>
        /// True when the whole stream was processed; false when no XML parser is available
        /// or when an <c>IOException</c> or <c>SAXException</c> is caught while reading.
        /// </returns>
        private bool Load(InputStream stream)
        {
            DocumentBuilder parser = XMLUtils.GetXmlParser();

            if (parser == null)
            {
                return(false);
            }
            try
            {
                IDocument xmlDocument = parser.Parse(stream);
                IElement  root        = xmlDocument.GetDocumentElement();
                INodeList sentences   = root.GetElementsByTagName(Sentence);
                for (int i = 0; i < sentences.GetLength(); i++)
                {
                    IElement sentence = (IElement)sentences.Item(i);
                    Lattice  lattice  = new Lattice();
                    //Create the node map
                    ISortedSet <int> nodes    = new TreeSet <int>();
                    INodeList        xmlNodes = sentence.GetElementsByTagName(Node);
                    for (int nodeIdx = 0; nodeIdx < xmlNodes.GetLength(); nodeIdx++)
                    {
                        IElement xmlNode  = (IElement)xmlNodes.Item(nodeIdx);
                        int      nodeName = System.Convert.ToInt32(xmlNode.GetAttribute(NodeId));
                        nodes.Add(nodeName);
                    }
                    IDictionary <int, int> nodeMap = Generics.NewHashMap();
                    int realNodeIdx      = 0;
                    int lastBoundaryNode = -1;
                    foreach (int nodeName_1 in nodes)
                    {
                        if (lastBoundaryNode == -1)
                        {
                            // The first node is expected to sit on a NodeOffset boundary.
                            System.Diagnostics.Debug.Assert(nodeName_1 % NodeOffset == 0);
                            lastBoundaryNode = realNodeIdx;
                        }
                        else
                        {
                            if (nodeName_1 % NodeOffset == 0)
                            {
                                // NOTE(review): lastBoundaryNode is only assigned for the first node,
                                // so every constraint starts there - confirm this is intended.
                                ParserConstraint c = new ParserConstraint(lastBoundaryNode, realNodeIdx, ".*");
                                lattice.AddConstraint(c);
                            }
                        }
                        nodeMap[nodeName_1] = realNodeIdx;
                        realNodeIdx++;
                    }
                    //Read the edges
                    INodeList xmlEdges = sentence.GetElementsByTagName(Edge);
                    for (int edgeIdx = 0; edgeIdx < xmlEdges.GetLength(); edgeIdx++)
                    {
                        IElement xmlEdge = (IElement)xmlEdges.Item(edgeIdx);
                        string   segment = xmlEdge.GetAttribute(Segment);
                        // The weight is machine-written data, so parse it with the invariant
                        // culture; the current culture may treat '.' as a group separator.
                        double   weight  = double.Parse(xmlEdge.GetAttribute(Weight), System.Globalization.CultureInfo.InvariantCulture);
                        //Input weights should be log scale
                        int         from     = System.Convert.ToInt32(xmlEdge.GetAttribute(FromNode));
                        int         normFrom = nodeMap[from];
                        int         to       = System.Convert.ToInt32(xmlEdge.GetAttribute(ToNode));
                        int         normTo   = nodeMap[to];
                        LatticeEdge e        = new LatticeEdge(segment, weight, normFrom, normTo);
                        // Set attributes below here
                        INodeList xmlAttrs = xmlEdge.GetElementsByTagName(EAttrNode);
                        for (int attrIdx = 0; attrIdx < xmlAttrs.GetLength(); attrIdx++)
                        {
                            IElement xmlAttr = (IElement)xmlAttrs.Item(attrIdx);
                            string   key     = xmlAttr.GetAttribute(EAttr);
                            string   value   = xmlAttr.GetAttribute(EAttrVal);
                            e.SetAttr(key, value);
                        }
                        lattice.AddEdge(e);
                    }
                    //Configure for parsing in ExhaustivePCFG parser
                    lattice.AddBoundary();
                    lattices.Add(lattice);
                }
            }
            catch (IOException e)
            {
                System.Console.Error.Printf("%s: Error reading XML from input stream.%n", this.GetType().FullName);
                Sharpen.Runtime.PrintStackTrace(e);
                return(false);
            }
            catch (SAXException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
                return(false);
            }
            return(true);
        }
Example #19
0
        /// <summary>
        /// Converts the children of a document element into one core map per TIMEX3 element.
        /// </summary>
        /// <remarks>
        /// The children must be text nodes or TIMEX3 elements containing exactly one child node;
        /// anything else raises an exception. Character offsets are accumulated from the lengths
        /// of the text seen so far. When token offsets are available, each TIMEX3's begin and end
        /// character offsets are mapped to token offsets, falling back to the closest known
        /// character offset when there is no exact entry.
        /// </remarks>
        /// <param name="docElem">Element whose child nodes are walked.</param>
        /// <param name="originalDocument">Annotated document supplying the token offsets.</param>
        /// <returns>One core map per TIMEX3 element, in child-list order.</returns>
        private static IList <ICoreMap> ToTimexCoreMaps(IElement docElem, ICoreMap originalDocument)
        {
            //--Collect Token Offsets
            IDictionary <int, int> beginMap = Generics.NewHashMap();
            IDictionary <int, int> endMap   = Generics.NewHashMap();
            bool haveTokenOffsets           = true;

            foreach (ICoreMap sent in originalDocument.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                foreach (CoreLabel token in sent.Get(typeof(CoreAnnotations.TokensAnnotation)))
                {
                    int tokBegin = token.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
                    int tokEnd   = token.Get(typeof(CoreAnnotations.TokenEndAnnotation));
                    // NOTE(review): both locals are declared int, so these null comparisons can
                    // never be true as written - confirm what token.Get returns for a token
                    // without offsets and adjust the declarations accordingly.
                    if (tokBegin == null || tokEnd == null)
                    {
                        haveTokenOffsets = false;
                    }
                    int charBegin = token.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
                    int charEnd   = token.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                    beginMap[charBegin] = tokBegin;
                    endMap[charEnd]     = tokEnd;
                }
            }
            IList <ICoreMap> timexMaps = new List <ICoreMap>();
            int       offset           = 0;
            INodeList docNodes         = docElem.GetChildNodes();

            for (int i = 0; i < docNodes.GetLength(); i++)
            {
                INode content = docNodes.Item(i);
                if (content is IText)
                {
                    IText text = (IText)content;
                    offset += text.GetWholeText().Length;
                }
                else
                {
                    if (content is IElement)
                    {
                        IElement child = (IElement)content;
                        if (child.GetNodeName().Equals("TIMEX3"))
                        {
                            Timex timex = new Timex(child);
                            if (child.GetChildNodes().GetLength() != 1)
                            {
                                throw new Exception("TIMEX3 should only contain text " + child);
                            }
                            string   timexText = child.GetTextContent();
                            ICoreMap timexMap  = new ArrayCoreMap();
                            timexMap.Set(typeof(TimeAnnotations.TimexAnnotation), timex);
                            timexMap.Set(typeof(CoreAnnotations.TextAnnotation), timexText);
                            int charBegin = offset;
                            timexMap.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), offset);
                            offset += timexText.Length;
                            timexMap.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), offset);
                            int charEnd = offset;
                            //(tokens)
                            if (haveTokenOffsets)
                            {
                                // if no exact match, search around the character offset
                                int tokBegin = FindNearestTokenIndex(beginMap, charBegin);
                                int tokEnd   = FindNearestTokenIndex(endMap, charEnd);
                                timexMap.Set(typeof(CoreAnnotations.TokenBeginAnnotation), tokBegin);
                                timexMap.Set(typeof(CoreAnnotations.TokenEndAnnotation), tokEnd);
                            }
                            timexMaps.Add(timexMap);
                        }
                        else
                        {
                            throw new Exception("unexpected element " + child);
                        }
                    }
                    else
                    {
                        throw new Exception("unexpected content " + content);
                    }
                }
            }
            return(timexMaps);
        }

        /// <summary>
        /// Returns the value stored under <paramref name="charOffset"/> or, when that key is
        /// absent, the value stored under the closest key (the smaller key wins a tie).
        /// </summary>
        /// <remarks>
        /// Uses <c>TryGetValue</c> rather than the indexer because a missing key cannot be
        /// represented as null in an <c>int</c>-valued dictionary.
        /// </remarks>
        /// <exception cref="System.InvalidOperationException">The dictionary is empty.</exception>
        private static int FindNearestTokenIndex(IDictionary <int, int> offsets, int charOffset)
        {
            int tokenIndex;

            if (offsets.TryGetValue(charOffset, out tokenIndex))
            {
                return(tokenIndex);
            }
            bool found        = false;
            long bestDistance = 0;
            int  bestKey      = 0;
            foreach (KeyValuePair <int, int> entry in offsets)
            {
                // Widen to long so the subtraction cannot overflow.
                long distance = System.Math.Abs((long)entry.Key - charOffset);
                if (!found || distance < bestDistance || (distance == bestDistance && entry.Key < bestKey))
                {
                    found        = true;
                    bestDistance = distance;
                    bestKey      = entry.Key;
                    tokenIndex   = entry.Value;
                }
            }
            if (!found)
            {
                throw new System.InvalidOperationException("no token offsets available near character offset " + charOffset);
            }
            return(tokenIndex);
        }
        /// <summary>Parses one ACE specification file into an <c>AceDocument</c>.</summary>
        /// <remarks>
        /// The document id is read from the "DOCID" attribute of the first "document" element.
        /// Every "entity", "relation" and "event" element is then added to the document together
        /// with its direct "entity_mention", "relation_mention" or "event_mention" children.
        /// </remarks>
        /// <param name="f">The ACE XML file to read.</param>
        /// <returns>The document populated with its entities, relations, events and their mentions.</returns>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="Org.Xml.Sax.SAXException"/>
        /// <exception cref="Javax.Xml.Parsers.ParserConfigurationException"/>
        public static AceDocument ParseDocument(File f)
        {
            // parse the Dom document
            IDocument document = ReadDocument(f);
            //
            // create the ACE document object
            //
            INode       docElement = document.GetElementsByTagName("document").Item(0);
            AceDocument aceDoc     = new AceDocument(GetAttributeValue(docElement, "DOCID"));
            //
            // read all entities
            //
            INodeList entities = document.GetElementsByTagName("entity");

            for (int i = 0; i < entities.GetLength(); i++)
            {
                INode node = entities.Item(i);
                //
                // the entity type, subtype, and class
                //
                string id      = GetAttributeValue(node, "ID");
                string type    = GetAttributeValue(node, "TYPE");
                string subtype = GetAttributeValue(node, "SUBTYPE");
                string cls     = GetAttributeValue(node, "CLASS");
                // create the entity
                AceEntity entity = new AceEntity(id, type, subtype, cls);
                aceDoc.AddEntity(entity);
                // fetch all mentions of this entity
                IList <INode> mentions = GetChildrenByName(node, "entity_mention");
                // parse all its mentions
                foreach (INode mention1 in mentions)
                {
                    AceEntityMention mention = ParseEntityMention(mention1);
                    entity.AddMention(mention);
                    aceDoc.AddEntityMention(mention);
                }
            }
            //
            // read all relations
            //
            INodeList relations = document.GetElementsByTagName("relation");

            for (int i_1 = 0; i_1 < relations.GetLength(); i_1++)
            {
                INode node = relations.Item(i_1);
                //
                // the relation type, subtype, tense, and modality
                //
                string id       = GetAttributeValue(node, "ID");
                string type     = GetAttributeValue(node, "TYPE");
                string subtype  = GetAttributeValue(node, "SUBTYPE");
                string modality = GetAttributeValue(node, "MODALITY");
                string tense    = GetAttributeValue(node, "TENSE");
                // create the relation
                AceRelation relation = new AceRelation(id, type, subtype, modality, tense);
                aceDoc.AddRelation(relation);
                // XXX: fetch relation_arguments here!
                // fetch all mentions of this relation
                IList <INode> mentions = GetChildrenByName(node, "relation_mention");
                // traverse all mentions
                foreach (INode mention1 in mentions)
                {
                    AceRelationMention mention = ParseRelationMention(mention1, aceDoc);
                    relation.AddMention(mention);
                    aceDoc.AddRelationMention(mention);
                }
            }
            //
            // read all events
            //
            INodeList events = document.GetElementsByTagName("event");

            for (int i_2 = 0; i_2 < events.GetLength(); i_2++)
            {
                INode node = events.Item(i_2);
                //
                // the event type, subtype, modality, polarity, genericity, and tense
                //
                string id         = GetAttributeValue(node, "ID");
                string type       = GetAttributeValue(node, "TYPE");
                string subtype    = GetAttributeValue(node, "SUBTYPE");
                string modality   = GetAttributeValue(node, "MODALITY");
                string polarity   = GetAttributeValue(node, "POLARITY");
                string genericity = GetAttributeValue(node, "GENERICITY");
                string tense      = GetAttributeValue(node, "TENSE");
                // create the event
                AceEvent @event = new AceEvent(id, type, subtype, modality, polarity, genericity, tense);
                aceDoc.AddEvent(@event);
                // fetch all mentions of this event
                IList <INode> mentions = GetChildrenByName(node, "event_mention");
                // traverse all mentions
                foreach (INode mention1 in mentions)
                {
                    AceEventMention mention = ParseEventMention(mention1, aceDoc);
                    @event.AddMention(mention);
                    aceDoc.AddEventMention(mention);
                }
            }
            return(aceDoc);
        }
        /// <exception cref="System.Exception"/>
        public static XMLToAnnotation.Data ReadXMLFormat(string fileName)
        {
            //Extract character list, gold quote speaker and mention information from the XML document.
            IDocument         doc      = XMLUtils.ReadDocumentFromFile(fileName);
            INode             text     = doc.GetDocumentElement().GetElementsByTagName("text").Item(0);
            string            docText  = GetJustText(text);
            Annotation        document = GetAnnotatedFile(docText, fileName, GetProcessedCoreNLPProperties());
            IList <ICoreMap>  quotes   = document.Get(typeof(CoreAnnotations.QuotationsAnnotation));
            IList <CoreLabel> tokens   = document.Get(typeof(CoreAnnotations.TokensAnnotation));
            IList <XMLToAnnotation.GoldQuoteInfo>      goldList    = new List <XMLToAnnotation.GoldQuoteInfo>();
            IDictionary <int, XMLToAnnotation.Mention> idToMention = new Dictionary <int, XMLToAnnotation.Mention>();
            IList <Person> personList = ReadXMLCharacterList(doc);
            IDictionary <string, IList <Person> > personMap = QuoteAttributionUtils.ReadPersonMap(personList);
            IList <Pair <int, string> >           mentionIdToSpeakerList = new List <Pair <int, string> >();
            //there is at least 1 case in which the XML quote does not match up with the automatically-extracted quote. (Ex: quote by Mr. Collins that begins, "Hunsford, near Westerham, Kent, ...")
            //as the dirty solution, we treat all quotes encapsulated within an XML quote as the same speaker (although this is not 100% accurate!)
            int       quoteIndex = 0;
            INodeList textElems  = text.GetChildNodes();
            int       tokenIndex = 0;

            for (int i = 0; i < textElems.GetLength(); i++)
            {
                INode chapterNode = textElems.Item(i);
                if (chapterNode.GetNodeName().Equals("chapter"))
                {
                    INodeList chapElems = chapterNode.GetChildNodes();
                    for (int j = 0; j < chapElems.GetLength(); j++)
                    {
                        INode child = chapElems.Item(j);
                        if (child.GetNodeName().Equals("quote"))
                        {
                            //search for nested mentions
                            INodeList quoteChildren = child.GetChildNodes();
                            for (int k = 0; k < quoteChildren.GetLength(); k++)
                            {
                                INode quoteChild = quoteChildren.Item(k);
                                if (quoteChild.GetNodeName().Equals("mention"))
                                {
                                    string      mentionText = quoteChild.GetTextContent();
                                    int         id          = System.Convert.ToInt32(Sharpen.Runtime.Substring(quoteChild.GetAttributes().GetNamedItem("id").GetTextContent(), 1));
                                    IList <int> connections = ReadConnection(quoteChild.GetAttributes().GetNamedItem("connection").GetNodeValue());
                                    int         endIndex    = GetEndIndex(tokenIndex, tokens, mentionText);
                                    //                mentions.put(id, new XMLMention(quoteChild.getTextContent(), tokenIndex, endIndex, id, connections));
                                    idToMention[id] = new XMLToAnnotation.Mention(mentionText, tokenIndex, endIndex);
                                    tokenIndex      = endIndex + 1;
                                }
                                else
                                {
                                    string quoteText = quoteChild.GetTextContent();
                                    quoteText = quoteText.ReplaceAll("\n(?!\n)", " ");
                                    //trim unnecessary newlines
                                    quoteText  = quoteText.ReplaceAll("_", string.Empty);
                                    tokenIndex = GetEndIndex(tokenIndex, tokens, quoteText) + 1;
                                }
                            }
                            string quoteText_1 = child.GetTextContent();
                            //              tokenIndex = getEndIndex(tokenIndex, tokens, quoteText) + 1;
                            quoteText_1 = quoteText_1.ReplaceAll("\n(?!\n)", " ");
                            //trim unnecessary newlines
                            quoteText_1 = quoteText_1.ReplaceAll("_", string.Empty);
                            int quotationOffset = 1;
                            if (quoteText_1.StartsWith("``"))
                            {
                                quotationOffset = 2;
                            }
                            IList <int> connections_1 = ReadConnection(child.GetAttributes().GetNamedItem("connection").GetTextContent());
                            int         id_1          = System.Convert.ToInt32(Sharpen.Runtime.Substring(child.GetAttributes().GetNamedItem("id").GetTextContent(), 1));
                            int         mention_id    = null;
                            if (connections_1.Count > 0)
                            {
                                mention_id = connections_1[0];
                            }
                            else
                            {
                                System.Console.Out.WriteLine("quote w/ no mention. ID: " + id_1);
                            }
                            //            Pair<Integer, Integer> mentionPair = idToMentionPair.get(mention_id);
                            mentionIdToSpeakerList.Add(new Pair <int, string>(mention_id, child.GetAttributes().GetNamedItem("speaker").GetTextContent()));
                            string annotatedQuoteText = quotes[quoteIndex].Get(typeof(CoreAnnotations.TextAnnotation));
                            while (!quoteText_1.EndsWith(annotatedQuoteText))
                            {
                                quoteIndex++;
                                annotatedQuoteText = quotes[quoteIndex].Get(typeof(CoreAnnotations.TextAnnotation));
                                mentionIdToSpeakerList.Add(new Pair <int, string>(mention_id, child.GetAttributes().GetNamedItem("speaker").GetTextContent()));
                            }
                            //            idToMentionPair.put(id, new Pair<>(-1, -1));
                            //            imention_id = connections.get(0);
                            //              quotes.add(new XMLQuote(quoteText.substring(quotationOffset, quoteText.length() - quotationOffset), child.getAttributes().getNamedItem("speaker").getTextContent(), id, chapterIndex, mention_id));
                            quoteIndex++;
                        }
                        else
                        {
                            if (child.GetNodeName().Equals("mention"))
                            {
                                string      mentionText = child.GetTextContent();
                                int         id          = System.Convert.ToInt32(Sharpen.Runtime.Substring(child.GetAttributes().GetNamedItem("id").GetTextContent(), 1));
                                IList <int> connections = ReadConnection(child.GetAttributes().GetNamedItem("connection").GetNodeValue());
                                int         endIndex    = GetEndIndex(tokenIndex, tokens, mentionText);
                                idToMention[id] = new XMLToAnnotation.Mention(mentionText, tokenIndex, endIndex);
                                //              mentions.put(id, new XMLMention(child.getTextContent(), tokenIndex, endIndex, id, connections));
                                tokenIndex = endIndex + 1;
                            }
                            else
                            {
                                //#text
                                string nodeText = child.GetTextContent();
                                nodeText = nodeText.ReplaceAll("\n(?!\n)", " ");
                                nodeText = nodeText.ReplaceAll("_", string.Empty);
                                if (tokenIndex >= tokens.Count)
                                {
                                    continue;
                                }
                                tokenIndex = GetEndIndex(tokenIndex, tokens, nodeText) + 1;
                            }
                        }
                    }
                }
            }
            foreach (Pair <int, string> item in mentionIdToSpeakerList)
            {
                XMLToAnnotation.Mention mention = idToMention[item.first];
                if (mention == null)
                {
                    goldList.Add(new XMLToAnnotation.GoldQuoteInfo(-1, -1, item.second, null));
                }
                else
                {
                    goldList.Add(new XMLToAnnotation.GoldQuoteInfo(mention.begin, mention.end, item.second, mention.text));
                }
            }
            //verify
            if (document.Get(typeof(CoreAnnotations.QuotationsAnnotation)).Count != goldList.Count)
            {
                throw new Exception("Quotes size and gold size don't match!");
            }
            return(new XMLToAnnotation.Data(goldList, personList, document));
        }