/// <summary>
/// Builds the core map describing one time expression, registers it on the sentence,
/// and tags every covered token as a DATE entity.
/// </summary>
/// <param name="node">Source of the Timex object (<c>node.timex</c>) and its surface text (<c>node.contents</c>).</param>
/// <param name="tokens">Tokens covered by the expression; the first and last elements are read, so the list must not be empty.</param>
/// <param name="sentence">Sentence whose TimexAnnotations list receives the new map (the list is created when absent).</param>
/// <returns>The newly created timex map.</returns>
private ICoreMap MakeTimexMap(HeidelTimeKBPAnnotator.HeidelTimeOutputReader.TimexNode node, IList <CoreLabel> tokens, ICoreMap sentence)
            {
                ICoreMap timexSpan = new ArrayCoreMap();

                // The expression itself: Timex object and surface text.
                timexSpan.Set(typeof(TimeAnnotations.TimexAnnotation), node.timex);
                timexSpan.Set(typeof(CoreAnnotations.TextAnnotation), node.contents);

                // Character and token extents are taken from the first and last covered tokens.
                CoreLabel firstToken = tokens[0];
                CoreLabel lastToken  = tokens[tokens.Count - 1];
                timexSpan.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), BeginOffset(firstToken));
                timexSpan.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), EndOffset(lastToken));
                timexSpan.Set(typeof(CoreAnnotations.TokenBeginAnnotation), firstToken.Index());
                timexSpan.Set(typeof(CoreAnnotations.TokenEndAnnotation), lastToken.Index());
                timexSpan.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);

                // Create the sentence-level timex list on first use, then append this expression.
                if (sentence.Get(typeof(TimeAnnotations.TimexAnnotations)) == null)
                {
                    sentence.Set(typeof(TimeAnnotations.TimexAnnotations), new List <ICoreMap>());
                }
                sentence.Get(typeof(TimeAnnotations.TimexAnnotations)).Add(timexSpan);

                // Overwrite the NER information of every covered token.
                for (int t = 0; t < tokens.Count; t++)
                {
                    CoreLabel covered = tokens[t];
                    covered.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), "DATE");
                    covered.Set(typeof(CoreAnnotations.NormalizedNamedEntityTagAnnotation), node.timex.Value());
                    covered.Set(typeof(TimeAnnotations.TimexAnnotation), node.timex);
                }
                return timexSpan;
            }
Exemple #2
0
        /// <summary>
        /// Checks what <c>Annotation.ToString()</c> reports for an Annotation built from a
        /// sentence list: first with a sentence that only has tokens, then after the same
        /// sentence has been given an explicit TextAnnotation.
        /// </summary>
        public virtual void TestFromList()
        {
            // A single sentence that initially carries tokens only.
            IList <CoreLabel> tokenList    = SentenceUtils.ToCoreLabelList("This", "is", "a", "test", ".");
            ICoreMap          onlySentence = new ArrayCoreMap();
            onlySentence.Set(typeof(CoreAnnotations.TokensAnnotation), tokenList);

            IList <ICoreMap> sentenceList = Generics.NewArrayList();
            sentenceList.Add(onlySentence);

            // No sentence text yet: the expected document text is the space-joined tokens.
            Annotation fromTokens = new Annotation(sentenceList);
            NUnit.Framework.Assert.AreEqual("This is a test .", fromTokens.ToString());

            // With an explicit sentence text, that text is expected verbatim.
            onlySentence.Set(typeof(CoreAnnotations.TextAnnotation), "This is a test.");
            Annotation fromText = new Annotation(sentenceList);
            NUnit.Framework.Assert.AreEqual("This is a test.", fromText.ToString());
        }
Exemple #3
0
        /// <summary>
        /// Smoke test for dependency-pattern creation: builds one sentence with a hand-written
        /// dependency graph, runs <c>CreatePatterns.GetAllPatterns</c> over it with
        /// <c>patternType=DEP</c>, and prints the graph and the collected per-token patterns.
        /// The test makes no assertions; it only verifies that the pipeline runs.
        /// </summary>
        public virtual void Test()
        {
            Properties props = new Properties();

            props.SetProperty("patternType", "DEP");
            ConstantsAndVariables              constvars      = new ConstantsAndVariables(props, new HashSet <string>(), new Dictionary <string, Type>());
            CreatePatterns <DepPattern>        createPatterns = new CreatePatterns <DepPattern>(props, constvars);
            IDictionary <string, DataInstance> sents          = new Dictionary <string, DataInstance>();
            ICoreMap m = new ArrayCoreMap();

            // Sentence under test: "We present a paper that focuses on semantic graphs applied to language."
            string        graphString = "[present/VBP-2 nsubj>We/PRP-1 dobj>[paper/NN-4 det>a/DT-3] ccomp>[applied/VBN-10 mark>that/IN-5 nsubj>[focuses/NN-6 nmod:on>[graphs/NNS-9 amod>semantic/JJ-8]] nmod:to>language/NN-12]]";
            SemanticGraph graph       = SemanticGraph.ValueOf(graphString);
            IList <string> words      = Arrays.AsList(new string[] { "We", "present", "a", "paper", "that", "focuses", "on", "semantic", "graphs", "applied", "to", "language" });

            // TokensAnnotation holds CoreLabels, so wrap each word in its own label.
            // (A stream mapping with a null mapper function cannot produce these tokens.)
            IList <CoreLabel> tokens = new List <CoreLabel>(words.Count);
            foreach (string word in words)
            {
                CoreLabel token = new CoreLabel();
                token.SetWord(word);
                tokens.Add(token);
            }

            m.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
            m.Set(typeof(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation), graph);
            sents["sent1"] = DataInstance.GetNewInstance(PatternFactory.PatternType.Dep, m);
            createPatterns.GetAllPatterns(sents, props, ConstantsAndVariables.PatternForEachTokenWay.Memory);
            System.Console.Out.WriteLine("graph is " + graph);
            System.Console.Out.WriteLine(PatternsForEachTokenInMemory.patternsForEachToken);
        }
Exemple #4
0
        /// <summary>
        /// Assembles a sentence-level core map from one row of parallel field kinds and raw cell values.
        /// </summary>
        /// <remarks>
        /// The row is scanned four times, in this order: token-level columns (words, lemmas,
        /// POS tags, NER tags), document-level columns (ids, sentence index, gloss),
        /// document character offsets, and finally dependency graphs. The first token-level
        /// column encountered determines how many tokens are created. When no token-level
        /// column is present, only the document-level annotations are set.
        /// Note that a missing sentence index defaults to 0 on the sentence map but to -1 on the tokens.
        /// </remarks>
        /// <param name="fields">Kind of each column, paired position by position with <paramref name="entries"/>.</param>
        /// <param name="entries">Raw cell values; each is passed through <c>TSVUtils.UnescapeSQL</c> before use.</param>
        /// <returns>The populated sentence map.</returns>
        public static ICoreMap ToCoreMap(IList <TSVSentenceIterator.SentenceField> fields, IList <string> entries)
        {
            ICoreMap sentenceMap = new ArrayCoreMap(fields.Count);
            Optional <IList <CoreLabel> > tokens = Optional.Empty();

            // Pass 1: token-level columns.
            foreach (Pair <TSVSentenceIterator.SentenceField, string> column in Iterables.Zip(fields, entries))
            {
                TSVSentenceIterator.SentenceField kind = column.first;
                string cell = TSVUtils.UnescapeSQL(column.second);
                switch (kind)
                {
                case TSVSentenceIterator.SentenceField.Words:
                {
                    IList <string> words = TSVUtils.ParseArray(cell);
                    tokens = TokensOrFresh(tokens, words.Count);
                    IList <CoreLabel> labels = tokens.Get();
                    // Synthetic character positions: words laid out with one separator character between them.
                    int cursor = 0;
                    for (int w = 0; w < words.Count; w++)
                    {
                        CoreLabel label = labels[w];
                        string    word  = words[w];
                        label.SetValue(word);
                        label.SetWord(word);
                        label.SetBeginPosition(cursor);
                        label.SetEndPosition(cursor + word.Length);
                        cursor += word.Length + 1;
                    }
                    break;
                }

                case TSVSentenceIterator.SentenceField.Lemmas:
                {
                    IList <string> lemmas = TSVUtils.ParseArray(cell);
                    tokens = TokensOrFresh(tokens, lemmas.Count);
                    for (int k = 0; k < lemmas.Count; k++)
                    {
                        tokens.Get()[k].SetLemma(lemmas[k]);
                    }
                    break;
                }

                case TSVSentenceIterator.SentenceField.PosTags:
                {
                    IList <string> tags = TSVUtils.ParseArray(cell);
                    tokens = TokensOrFresh(tokens, tags.Count);
                    for (int k = 0; k < tags.Count; k++)
                    {
                        tokens.Get()[k].SetTag(tags[k]);
                    }
                    break;
                }

                case TSVSentenceIterator.SentenceField.NerTags:
                {
                    IList <string> nerTags = TSVUtils.ParseArray(cell);
                    tokens = TokensOrFresh(tokens, nerTags.Count);
                    for (int k = 0; k < nerTags.Count; k++)
                    {
                        tokens.Get()[k].SetNER(nerTags[k]);
                    }
                    break;
                }

                default:
                {
                    // Not a token-level column.
                    break;
                }
                }
            }

            // Pass 2: document-level columns.
            Optional <string> docId         = Optional.Empty();
            Optional <string> sentenceId    = Optional.Empty();
            Optional <int>    sentenceIndex = Optional.Empty();

            foreach (Pair <TSVSentenceIterator.SentenceField, string> column in Iterables.Zip(fields, entries))
            {
                TSVSentenceIterator.SentenceField kind = column.first;
                string cell = TSVUtils.UnescapeSQL(column.second);
                if (kind == TSVSentenceIterator.SentenceField.Id)
                {
                    sentenceId = Optional.Of(cell);
                }
                else if (kind == TSVSentenceIterator.SentenceField.DocId)
                {
                    docId = Optional.Of(cell);
                }
                else if (kind == TSVSentenceIterator.SentenceField.SentenceIndex)
                {
                    sentenceIndex = Optional.Of(System.Convert.ToInt32(cell));
                }
                else if (kind == TSVSentenceIterator.SentenceField.Gloss)
                {
                    // Turn the two-character escapes back into real newline and tab characters.
                    sentenceMap.Set(typeof(CoreAnnotations.TextAnnotation), cell.Replace("\\n", "\n").Replace("\\t", "\t"));
                }
            }
            sentenceMap.Set(typeof(CoreAnnotations.SentenceIDAnnotation), sentenceId.OrElse("-1"));
            sentenceMap.Set(typeof(CoreAnnotations.DocIDAnnotation), docId.OrElse("???"));
            sentenceMap.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex.OrElse(0));

            // Everything below needs tokens.
            if (!tokens.IsPresent())
            {
                return sentenceMap;
            }
            IList <CoreLabel> allTokens = tokens.Get();

            // Pass 3: document character offsets replace the synthetic positions from pass 1.
            foreach (Pair <TSVSentenceIterator.SentenceField, string> column in Iterables.Zip(fields, entries))
            {
                TSVSentenceIterator.SentenceField kind = column.first;
                string cell = TSVUtils.UnescapeSQL(column.second);
                if (kind == TSVSentenceIterator.SentenceField.DocCharBegin)
                {
                    IList <string> begins = TSVUtils.ParseArray(cell);
                    for (int b = 0; b < allTokens.Count; b++)
                    {
                        allTokens[b].SetBeginPosition(System.Convert.ToInt32(begins[b]));
                    }
                }
                else if (kind == TSVSentenceIterator.SentenceField.DocCharEnd)
                {
                    IList <string> ends = TSVUtils.ParseArray(cell);
                    for (int e = 0; e < allTokens.Count; e++)
                    {
                        allTokens[e].SetEndPosition(System.Convert.ToInt32(ends[e]));
                    }
                }
            }

            // Stamp document information and positional indices onto every token.
            for (int pos = 0; pos < allTokens.Count; pos++)
            {
                CoreLabel tok = allTokens[pos];
                tok.Set(typeof(CoreAnnotations.DocIDAnnotation), docId.OrElse("???"));
                tok.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex.OrElse(-1));
                tok.Set(typeof(CoreAnnotations.IndexAnnotation), pos + 1);
                tok.Set(typeof(CoreAnnotations.TokenBeginAnnotation), pos);
                tok.Set(typeof(CoreAnnotations.TokenEndAnnotation), pos + 1);
            }

            // Attach the tokens to the sentence, then pass 4: dependency graphs parsed against them.
            sentenceMap.Set(typeof(CoreAnnotations.TokensAnnotation), allTokens);
            sentenceMap.Set(typeof(CoreAnnotations.TokenBeginAnnotation), 0);
            sentenceMap.Set(typeof(CoreAnnotations.TokenEndAnnotation), allTokens.Count);
            foreach (Pair <TSVSentenceIterator.SentenceField, string> column in Iterables.Zip(fields, entries))
            {
                TSVSentenceIterator.SentenceField kind = column.first;
                string cell = TSVUtils.UnescapeSQL(column.second);
                if (kind == TSVSentenceIterator.SentenceField.DependenciesBasic)
                {
                    SemanticGraph graph = TSVUtils.ParseJsonTree(cell, allTokens);
                    sentenceMap.Set(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation), graph);
                }
                else if (kind == TSVSentenceIterator.SentenceField.DependenciesCollapsed)
                {
                    SemanticGraph graph = TSVUtils.ParseJsonTree(cell, allTokens);
                    sentenceMap.Set(typeof(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation), graph);
                }
                else if (kind == TSVSentenceIterator.SentenceField.DependenciesCollapsedCc)
                {
                    SemanticGraph graph = TSVUtils.ParseJsonTree(cell, allTokens);
                    sentenceMap.Set(typeof(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation), graph);
                }
                else if (kind == TSVSentenceIterator.SentenceField.DependenciesAlternate)
                {
                    SemanticGraph graph = TSVUtils.ParseJsonTree(cell, allTokens);
                    sentenceMap.Set(typeof(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation), graph);
                }
            }
            return sentenceMap;
        }

        /// <summary>
        /// Returns <paramref name="current"/> unchanged when it already holds a token list;
        /// otherwise returns a new list containing <paramref name="count"/> empty CoreLabels.
        /// </summary>
        private static Optional <IList <CoreLabel> > TokensOrFresh(Optional <IList <CoreLabel> > current, int count)
        {
            if (current.IsPresent())
            {
                return current;
            }
            IList <CoreLabel> fresh = new List <CoreLabel>(count);
            for (int n = 0; n < count; n++)
            {
                fresh.Add(new CoreLabel());
            }
            return Optional.Of(fresh);
        }
Exemple #5
0
        /// <summary>
        /// Converts the TIMEX3 child elements of <paramref name="docElem"/> into timex core maps,
        /// each carrying the Timex object, its text, and its character offsets.
        /// </summary>
        /// <remarks>
        /// Character offsets are accumulated by summing the lengths of the text nodes and TIMEX3
        /// contents seen so far. When every token of <paramref name="originalDocument"/> has both
        /// token offsets, each timex map also receives token begin/end annotations, found by looking
        /// up the token whose character offset matches exactly or, failing that, is nearest.
        /// </remarks>
        /// <param name="docElem">Element whose children must be text nodes or TIMEX3 elements.</param>
        /// <param name="originalDocument">Document providing the sentences and tokens used for offset alignment.</param>
        /// <returns>One core map per TIMEX3 element, in document order.</returns>
        /// <exception cref="System.Exception">A child is neither text nor a TIMEX3 element, or a TIMEX3 element does not have exactly one child node.</exception>
        /// <exception cref="System.InvalidOperationException">Token alignment is requested but the document contributed no token offsets.</exception>
        private static IList <ICoreMap> ToTimexCoreMaps(IElement docElem, ICoreMap originalDocument)
        {
            //--Collect Token Offsets
            IDictionary <int, int> beginMap = Generics.NewHashMap();
            IDictionary <int, int> endMap   = Generics.NewHashMap();
            bool haveTokenOffsets           = true;

            foreach (ICoreMap sent in originalDocument.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                foreach (CoreLabel token in sent.Get(typeof(CoreAnnotations.TokensAnnotation)))
                {
                    // Read the annotations as objects so that a missing value shows up as null;
                    // a plain int can never compare equal to null.
                    object tokenBegin = token.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
                    object tokenEnd   = token.Get(typeof(CoreAnnotations.TokenEndAnnotation));
                    if (tokenBegin == null || tokenEnd == null)
                    {
                        // The maps are only consulted when all tokens have offsets, so there is nothing to record.
                        haveTokenOffsets = false;
                        continue;
                    }
                    object tokenCharBegin = token.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
                    object tokenCharEnd   = token.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                    if (tokenCharBegin != null)
                    {
                        beginMap[(int)tokenCharBegin] = (int)tokenBegin;
                    }
                    if (tokenCharEnd != null)
                    {
                        endMap[(int)tokenCharEnd] = (int)tokenEnd;
                    }
                }
            }

            IList <ICoreMap> timexMaps = new List <ICoreMap>();
            int       offset           = 0;
            INodeList docNodes         = docElem.GetChildNodes();

            for (int i = 0; i < docNodes.GetLength(); i++)
            {
                INode content = docNodes.Item(i);
                if (content is IText)
                {
                    // Plain text only advances the running character offset.
                    IText text = (IText)content;
                    offset += text.GetWholeText().Length;
                    continue;
                }
                if (!(content is IElement))
                {
                    throw new Exception("unexpected content " + content);
                }
                IElement child = (IElement)content;
                if (!child.GetNodeName().Equals("TIMEX3"))
                {
                    throw new Exception("unexpected element " + child);
                }

                Timex timex = new Timex(child);
                if (child.GetChildNodes().GetLength() != 1)
                {
                    throw new Exception("TIMEX3 should only contain text " + child);
                }
                string   timexText = child.GetTextContent();
                ICoreMap timexMap  = new ArrayCoreMap();
                timexMap.Set(typeof(TimeAnnotations.TimexAnnotation), timex);
                timexMap.Set(typeof(CoreAnnotations.TextAnnotation), timexText);
                int charBegin = offset;
                timexMap.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), charBegin);
                offset += timexText.Length;
                int charEnd = offset;
                timexMap.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), charEnd);
                //(tokens)
                if (haveTokenOffsets)
                {
                    int tokBegin = NearestTokenOffset(beginMap, charBegin);
                    int tokEnd   = NearestTokenOffset(endMap, charEnd);
                    timexMap.Set(typeof(CoreAnnotations.TokenBeginAnnotation), tokBegin);
                    timexMap.Set(typeof(CoreAnnotations.TokenEndAnnotation), tokEnd);
                }
                timexMaps.Add(timexMap);
            }
            return timexMaps;
        }

        /// <summary>
        /// Looks up the token offset recorded for <paramref name="charOffset"/>; when there is no
        /// exact entry, widens the search one character at a time, trying the lower neighbour
        /// before the higher one at each distance.
        /// </summary>
        private static int NearestTokenOffset(IDictionary <int, int> offsetMap, int charOffset)
        {
            if (offsetMap.Count == 0)
            {
                // With nothing to find the outward search below would never terminate.
                throw new System.InvalidOperationException("no token offsets available to align character offset " + charOffset);
            }
            int found;
            if (offsetMap.TryGetValue(charOffset, out found))
            {
                return found;
            }
            for (int searchStep = 1; ; searchStep++)
            {
                if (offsetMap.TryGetValue(charOffset - searchStep, out found) || offsetMap.TryGetValue(charOffset + searchStep, out found))
                {
                    return found;
                }
            }
        }
Exemple #6
0
        /// <summary>Builds an Annotation that contains exactly one sentence from pre-tokenized columns.</summary>
        /// <remarks>
        /// Tokens receive synthetic character positions (words laid out with one separator character
        /// between them). The graph returned by <paramref name="tree"/> is stored under the basic,
        /// collapsed and CC-processed dependency keys alike; the graph returned by
        /// <paramref name="maltTree"/> is stored as the alternative dependencies. A missing document id
        /// becomes "???" and a missing sentence index becomes -1.
        /// </remarks>
        /// <exception cref="System.ArgumentException"><paramref name="lemmas"/>, <paramref name="pos"/> or <paramref name="ner"/> differs in length from <paramref name="words"/>.</exception>
        private static Annotation ParseSentence(Optional <string> docid, Optional <int> sentenceIndex, string gloss, Func <IList <CoreLabel>, SemanticGraph> tree, Func <IList <CoreLabel>, SemanticGraph> maltTree, IList <string> words, IList <string> lemmas, IList <string> pos, IList <string> ner, Optional <string> sentenceid)
        {
            // Every per-token column has to line up with the words; checked in the order lemmas, pos, ner.
            foreach (IList <string> column in new IList <string>[] { lemmas, pos, ner })
            {
                if (column.Count != words.Count)
                {
                    throw new ArgumentException("Array lengths don't match: " + words.Count + " vs " + column.Count + " (sentence " + sentenceid.OrElse("???") + ")");
                }
            }

            // One CoreLabel per word.
            int tokenCount           = words.Count;
            IList <CoreLabel> tokens = new List <CoreLabel>(tokenCount);
            int charCursor           = 0;
            for (int idx = 0; idx < tokenCount; idx++)
            {
                string    word  = words[idx];
                CoreLabel label = new CoreLabel(12);
                label.SetWord(word);
                label.SetValue(word);
                label.SetBeginPosition(charCursor);
                label.SetEndPosition(charCursor + word.Length);
                charCursor += word.Length + 1;
                label.SetLemma(lemmas[idx]);
                label.SetTag(pos[idx]);
                label.SetNER(ner[idx]);
                label.Set(typeof(CoreAnnotations.DocIDAnnotation), docid.OrElse("???"));
                label.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex.OrElse(-1));
                label.Set(typeof(CoreAnnotations.IndexAnnotation), idx + 1);
                label.Set(typeof(CoreAnnotations.TokenBeginAnnotation), idx);
                label.Set(typeof(CoreAnnotations.TokenEndAnnotation), idx + 1);
                tokens.Add(label);
            }

            // Turn the two-character escapes in the gloss back into real newline and tab characters.
            string plainGloss = gloss.Replace("\\n", "\n").Replace("\\t", "\t");

            // The sentence: tokens, dependency graphs, then document-level information.
            ICoreMap sentence = new ArrayCoreMap(16);
            sentence.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);

            SemanticGraph dependencies = tree.Apply(tokens);
            sentence.Set(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation), dependencies);
            sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation), dependencies);
            sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation), dependencies);

            SemanticGraph alternative = maltTree.Apply(tokens);
            sentence.Set(typeof(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation), alternative);

            sentence.Set(typeof(CoreAnnotations.DocIDAnnotation), docid.OrElse("???"));
            sentence.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex.OrElse(-1));
            sentence.Set(typeof(CoreAnnotations.TextAnnotation), plainGloss);
            sentence.Set(typeof(CoreAnnotations.TokenBeginAnnotation), 0);
            sentence.Set(typeof(CoreAnnotations.TokenEndAnnotation), tokens.Count);

            // The document wraps the single sentence and repeats the tokens and ids at top level.
            Annotation document = new Annotation(plainGloss);
            document.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
            document.Set(typeof(CoreAnnotations.SentencesAnnotation), Java.Util.Collections.SingletonList(sentence));
            document.Set(typeof(CoreAnnotations.DocIDAnnotation), docid.OrElse("???"));
            document.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex.OrElse(-1));
            return document;
        }
        /// <summary>
        /// Reads label-annotated text line by line and returns one core map per tokenized sentence.
        /// </summary>
        /// <remarks>
        /// A line is either <c>id&lt;TAB&gt;text</c> or just text, in which case the zero-based line
        /// number serves as the id. Tokens of the form <c>&lt;CATEGORY&gt;</c> and
        /// <c>&lt;/CATEGORY&gt;</c> (for any allowed category) are not emitted; they switch the current
        /// label to that category or back to the background symbol "O". The current label starts as
        /// "O" on every line and carries over between sentences of the same line. Each sentence map
        /// gets its space-joined text, its tokens, and a document id of the form
        /// <c>prefix + id + "-" + sentenceNumber</c>.
        /// </remarks>
        /// <param name="reader">Source of the lines; read until <c>ReadLine</c> returns null.</param>
        /// <param name="categoriesAllowed">Category names recognised inside the label tags.</param>
        /// <param name="setClassForTheseLabels">Optional map from label to the annotation key under which that label is also stored on the token; may be null.</param>
        /// <param name="setGoldClass">When true, every token receives its current label as GoldAnswerAnnotation.</param>
        /// <param name="sentIDprefix">Prefix prepended to every id.</param>
        /// <returns>All sentences of all lines, in reading order.</returns>
        /// <exception cref="System.IO.IOException"/>
        public static IList <ICoreMap> ParseFile(BufferedReader reader, ICollection <string> categoriesAllowed, IDictionary <string, Type> setClassForTheseLabels, bool setGoldClass, string sentIDprefix)
        {
            string           categoryAlternatives = StringUtils.Join(categoriesAllowed, "|");
            Pattern          startingLabelToken   = Pattern.Compile("<(" + categoryAlternatives + ")>");
            Pattern          endLabelToken        = Pattern.Compile("</(" + categoryAlternatives + ")>");
            string           backgroundSymbol     = "O";
            IList <ICoreMap> sentences            = new List <ICoreMap>();
            // The tokenizer options do not depend on the line, so one factory serves the whole input.
            PTBTokenizer.PTBTokenizerFactory <CoreLabel> tokenizerFactory = PTBTokenizer.PTBTokenizerFactory.NewCoreLabelTokenizerFactory("ptb3Escaping=false,normalizeParentheses=false,escapeForwardSlashAsterisk=false");
            int    lineNum = -1;
            string l;

            while ((l = reader.ReadLine()) != null)
            {
                lineNum++;
                // Split on the first tab only; the result has either one or two parts.
                string[] t = l.Split(new char[] { '\t' }, 2);
                string   id;
                string   text;
                if (t.Length == 2)
                {
                    id   = t[0];
                    text = t[1];
                }
                else
                {
                    text = t[0];
                    id   = lineNum.ToString();
                }
                id = sentIDprefix + id;
                DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text));
                dp.SetTokenizerFactory(tokenizerFactory);
                string label   = backgroundSymbol;
                int    sentNum = -1;
                foreach (IList <IHasWord> sentence in dp)
                {
                    sentNum++;
                    System.Text.StringBuilder sentStr = new System.Text.StringBuilder();
                    IList <CoreLabel>         sent    = new List <CoreLabel>();
                    foreach (IHasWord tokw in sentence)
                    {
                        string  tok             = tokw.Word();
                        Matcher startingMatcher = startingLabelToken.Matcher(tok);
                        if (startingMatcher.Matches())
                        {
                            // Opening tag: following tokens belong to this category.
                            label = startingMatcher.Group(1);
                            continue;
                        }
                        if (endLabelToken.Matcher(tok).Matches())
                        {
                            // Closing tag: back to the background label.
                            label = backgroundSymbol;
                            continue;
                        }

                        CoreLabel c = new CoreLabel();
                        sentStr.Append(' ').Append(tok);
                        c.SetWord(tok);
                        c.SetLemma(tok);
                        c.SetValue(tok);
                        c.Set(typeof(CoreAnnotations.TextAnnotation), tok);
                        c.Set(typeof(CoreAnnotations.OriginalTextAnnotation), tok);
                        if (setGoldClass)
                        {
                            c.Set(typeof(CoreAnnotations.GoldAnswerAnnotation), label);
                        }
                        // Key lookup on the dictionary; a single TryGetValue avoids a second lookup.
                        Type labelClass;
                        if (setClassForTheseLabels != null && setClassForTheseLabels.TryGetValue(label, out labelClass))
                        {
                            c.Set(labelClass, label);
                        }
                        sent.Add(c);
                    }
                    ICoreMap sentcm = new ArrayCoreMap();
                    // Trim removes the leading separator added before the first token.
                    sentcm.Set(typeof(CoreAnnotations.TextAnnotation), sentStr.ToString().Trim());
                    sentcm.Set(typeof(CoreAnnotations.TokensAnnotation), sent);
                    sentcm.Set(typeof(CoreAnnotations.DocIDAnnotation), id + "-" + sentNum);
                    sentences.Add(sentcm);
                }
            }
            return sentences;
        }
Exemple #8
0
        /*
         * Old implementation based on JDOM.
         * No longer maintained due to JDOM licensing issues.
         * private static Annotation toAnnotation(String xml) throws IOException {
         * Element docElem;
         * try {
         * docElem = new SAXBuilder().build(new StringReader(xml)).getRootElement();
         * } catch (JDOMException e) {
         * throw new RuntimeException(String.format("error:\n%s\ninput:\n%s", e, xml));
         * }
         * Element textElem = docElem.getChild("TEXT");
         * StringBuilder text = new StringBuilder();
         * int offset = 0;
         * List<CoreMap> sentences = new ArrayList<CoreMap>();
         * for (Object sentObj: textElem.getChildren("SENT")) {
         * CoreMap sentence = new ArrayCoreMap();
         * sentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, offset);
         * Element sentElem = (Element)sentObj;
         * Tree tree = Tree.valueOf(sentElem.getText());
         * List<CoreLabel> tokens = new ArrayList<CoreLabel>();
         * List<Tree> preTerminals = preTerminals(tree);
         * for (Tree preTerminal: preTerminals) {
         * String posTag = preTerminal.value();
         * for (Tree wordTree: preTerminal.children()) {
         * String word = wordTree.value();
         * CoreLabel token = new CoreLabel();
         * token.set(CoreAnnotations.TextAnnotation.class, word);
         * token.set(CoreAnnotations.TextAnnotation.class, word);
         * token.set(CoreAnnotations.PartOfSpeechAnnotation.class, posTag);
         * token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, offset);
         * offset += word.length();
         * token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, offset);
         * text.append(word);
         * text.append(' ');
         * offset += 1;
         * tokens.add(token);
         * }
         * }
         * if (preTerminals.size() > 0) {
         * text.setCharAt(text.length() - 1, '\n');
         * }
         * sentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, offset - 1);
         * sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
         * sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);
         * sentences.add(sentence);
         * }
         *
         * String docID = docElem.getAttributeValue("id");
         * Matcher matcher = datePattern.matcher(docID);
         * matcher.find();
         * Calendar docDate = new Timex(matcher.group(1)).getDate();
         *
         * Annotation document = new Annotation(text.toString());
         * document.set(CoreAnnotations.DocIDAnnotation.class, docID);
         * document.set(CoreAnnotations.CalendarAnnotation.class, docDate);
         * document.set(CoreAnnotations.SentencesAnnotation.class, sentences);
         * return document;
         * }
         */
        /// <summary>
        /// Parses an XML document into an Annotation with sentences, tokens, parse trees,
        /// a document id and a document date.
        /// </summary>
        /// <remarks>
        /// Each <c>SENT</c> child of the <c>TEXT</c> element is read as a bracketed tree. Every word
        /// under a pre-terminal becomes a token tagged with the pre-terminal's value as its part of
        /// speech. The document text is rebuilt by writing each word followed by a space; the last
        /// space of a sentence that has pre-terminals is replaced by a newline. Character offsets refer
        /// to that rebuilt text. The document date is taken from group 1 of <c>datePattern</c> matched
        /// against the root element's <c>id</c> attribute.
        /// </remarks>
        /// <param name="xml">The XML document as a string.</param>
        /// <returns>The populated document annotation.</returns>
        /// <exception cref="System.Exception">The XML cannot be parsed; the message contains the parser error and the input.</exception>
        /// <exception cref="System.IO.IOException"/>
        private static Annotation ToAnnotation(string xml)
        {
            Element docElem;

            try
            {
                Builder      parser = new Builder();
                StringReader @in    = new StringReader(xml);
                docElem = parser.Build(@in).GetRootElement();
            }
            catch (Exception e)
            {
                // string.Format needs {0}/{1} placeholders; "%s" would be emitted literally.
                // The original exception is chained so its stack trace is not lost.
                throw new Exception(string.Format("error:\n{0}\ninput:\n{1}", e, xml), e);
            }
            Element          textElem         = docElem.GetFirstChildElement("TEXT");
            StringBuilder    text             = new StringBuilder();
            int              offset           = 0;
            IList <ICoreMap> sentences        = new List <ICoreMap>();
            Elements         sentenceElements = textElem.GetChildElements("SENT");

            for (int crtsent = 0; crtsent < sentenceElements.Size(); crtsent++)
            {
                Element  sentElem = sentenceElements.Get(crtsent);
                ICoreMap sentence = new ArrayCoreMap();
                sentence.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), offset);
                Tree tree = Tree.ValueOf(sentElem.GetChild(0).GetValue());
                // XXX ms: is this the same as sentElem.getText() in JDOM?
                IList <CoreLabel> tokens       = new List <CoreLabel>();
                IList <Tree>      preTerminals = PreTerminals(tree);
                foreach (Tree preTerminal in preTerminals)
                {
                    string posTag = preTerminal.Value();
                    foreach (Tree wordTree in preTerminal.Children())
                    {
                        string    word  = wordTree.Value();
                        CoreLabel token = new CoreLabel();
                        token.Set(typeof(CoreAnnotations.TextAnnotation), word);
                        token.Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), posTag);
                        token.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), offset);
                        offset += word.Length;
                        token.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), offset);
                        text.Append(word);
                        text.Append(' ');
                        // Account for the separator just written after the word.
                        offset += 1;
                        tokens.Add(token);
                    }
                }
                if (preTerminals.Count > 0)
                {
                    // End the sentence with a newline instead of the trailing space.
                    Sharpen.Runtime.SetCharAt(text, text.Length - 1, '\n');
                }
                // offset already points past the trailing separator, hence the -1.
                sentence.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), offset - 1);
                sentence.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
                sentence.Set(typeof(TreeCoreAnnotations.TreeAnnotation), tree);
                sentences.Add(sentence);
            }
            string  docID   = docElem.GetAttributeValue("id");
            Matcher matcher = datePattern.Matcher(docID);

            // NOTE(review): the result of Find() is not checked; an id without a date
            // presumably makes Group(1) fail - confirm the intended behavior for such ids.
            matcher.Find();
            Calendar   docDate  = new Timex("DATE", matcher.Group(1)).GetDate();
            Annotation document = new Annotation(text.ToString());

            document.Set(typeof(CoreAnnotations.DocIDAnnotation), docID);
            document.Set(typeof(CoreAnnotations.CalendarAnnotation), docDate);
            document.Set(typeof(CoreAnnotations.SentencesAnnotation), sentences);
            return document;
        }