/// <summary>
/// Checks how an Annotation built from a list of sentence CoreMaps renders itself:
/// without a TextAnnotation the tokens are joined with spaces; with one, that text is used verbatim.
/// </summary>
public virtual void TestFromList()
{
    // One sentence carrying five tokens and, initially, no TextAnnotation.
    ICoreMap sent = new ArrayCoreMap();
    sent.Set(typeof(CoreAnnotations.TokensAnnotation), SentenceUtils.ToCoreLabelList("This", "is", "a", "test", "."));
    IList<ICoreMap> sentenceList = Generics.NewArrayList();
    sentenceList.Add(sent);
    // No text set: rendering falls back to joining the tokens (note the space before ".").
    Annotation doc = new Annotation(sentenceList);
    NUnit.Framework.Assert.AreEqual("This is a test .", doc.ToString());
    // Once TextAnnotation is present, it takes precedence over the token join.
    sent.Set(typeof(CoreAnnotations.TextAnnotation), "This is a test.");
    doc = new Annotation(sentenceList);
    NUnit.Framework.Assert.AreEqual("This is a test.", doc.ToString());
}
/// <summary>
/// Builds a CoreMap describing one HeidelTime TIMEX3 node over the given token span,
/// appends it to the sentence's TimexAnnotations list, and re-tags every covered token as a DATE.
/// </summary>
/// <param name="node">parsed TIMEX3 node from the HeidelTime output</param>
/// <param name="tokens">the (non-empty) tokens covered by the timex, in order</param>
/// <param name="sentence">the sentence CoreMap that receives the new timex entry</param>
/// <returns>the freshly created timex CoreMap</returns>
private ICoreMap MakeTimexMap(HeidelTimeKBPAnnotator.HeidelTimeOutputReader.TimexNode node, IList<CoreLabel> tokens, ICoreMap sentence)
{
    CoreLabel firstTok = tokens[0];
    CoreLabel lastTok = tokens[tokens.Count - 1];
    ICoreMap result = new ArrayCoreMap();
    result.Set(typeof(TimeAnnotations.TimexAnnotation), node.timex);
    result.Set(typeof(CoreAnnotations.TextAnnotation), node.contents);
    result.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), BeginOffset(firstTok));
    result.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), EndOffset(lastTok));
    // NOTE(review): token begin/end come from CoreLabel.Index(), which is conventionally
    // 1-based in CoreNLP while TokenBegin/EndAnnotation are usually 0-based sentence
    // offsets — this mirrors the upstream code; confirm before changing.
    result.Set(typeof(CoreAnnotations.TokenBeginAnnotation), firstTok.Index());
    result.Set(typeof(CoreAnnotations.TokenEndAnnotation), lastTok.Index());
    result.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
    // Lazily create the sentence-level timex list, then append this entry.
    if (sentence.Get(typeof(TimeAnnotations.TimexAnnotations)) == null)
    {
        sentence.Set(typeof(TimeAnnotations.TimexAnnotations), new List<ICoreMap>());
    }
    sentence.Get(typeof(TimeAnnotations.TimexAnnotations)).Add(result);
    // update NER for tokens: every covered token becomes a DATE carrying the timex value.
    foreach (CoreLabel covered in tokens)
    {
        covered.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), "DATE");
        covered.Set(typeof(CoreAnnotations.NormalizedNamedEntityTagAnnotation), node.timex.Value());
        covered.Set(typeof(TimeAnnotations.TimexAnnotation), node.timex);
    }
    return result;
}
/// <summary>
/// Smoke test for dependency-pattern extraction: builds a single sentence with a
/// hand-written semantic graph, runs CreatePatterns over it, and prints the results.
/// There are no assertions — this only verifies the pipeline runs without throwing.
/// </summary>
public virtual void Test()
{
    Properties props = new Properties();
    // "DEP" selects dependency-based (rather than surface) patterns.
    props.SetProperty("patternType", "DEP");
    ConstantsAndVariables constvars = new ConstantsAndVariables(props, new HashSet<string>(), new Dictionary<string, Type>());
    CreatePatterns<DepPattern> createPatterns = new CreatePatterns<DepPattern>(props, constvars);
    IDictionary<string, DataInstance> sents = new Dictionary<string, DataInstance>();
    ICoreMap m = new ArrayCoreMap();
    // NOTE(review): `text` is declared but never used below — likely a leftover from the
    // original Java test; kept as-is.
    string text = "We present a paper that focuses on semantic graphs applied to language.";
    // Dependency parse of the sentence above, in SemanticGraph.valueOf bracket notation.
    string graphString = "[present/VBP-2 nsubj>We/PRP-1 dobj>[paper/NN-4 det>a/DT-3] ccomp>[applied/VBN-10 mark>that/IN-5 nsubj>[focuses/NN-6 nmod:on>[graphs/NNS-9 amod>semantic/JJ-8]] nmod:to>language/NN-12]]";
    SemanticGraph graph = SemanticGraph.ValueOf(graphString);
    //String phrase = "semantic graphs";
    IList<string> tokens = Arrays.AsList(new string[] { "We", "present", "a", "paper", "that", "focuses", "on", "semantic", "graphs", "applied", "to", "language" });
    // NOTE(review): Map(null) is an artifact of automatic Java->C# conversion — the original
    // Java code passed a lambda that wraps each string in a CoreLabel. Confirm the delegate
    // is supplied by the conversion shim; as written this looks like it would fail at runtime.
    m.Set(typeof(CoreAnnotations.TokensAnnotation), tokens.Stream().Map(null).Collect(Collectors.ToList()));
    m.Set(typeof(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation), graph);
    sents["sent1"] = DataInstance.GetNewInstance(PatternFactory.PatternType.Dep, m);
    // Extract all patterns into the in-memory per-token store.
    createPatterns.GetAllPatterns(sents, props, ConstantsAndVariables.PatternForEachTokenWay.Memory);
    System.Console.Out.WriteLine("graph is " + graph);
    System.Console.Out.WriteLine(PatternsForEachTokenInMemory.patternsForEachToken);
}
/// <summary>
/// Reconstructs a sentence CoreMap from one row of a TSV dump: `fields` names the columns
/// and `entries` holds the corresponding (SQL-escaped) values. The row is scanned in several
/// passes: (1) token-level columns (words/lemmas/POS/NER), (2) document-level columns
/// (ids, sentence index, gloss), (3) document character offsets, which override the
/// whitespace-derived offsets computed in pass 1, and (4) dependency trees.
/// </summary>
/// <returns>a CoreMap with token, document, and dependency annotations populated</returns>
public static ICoreMap ToCoreMap(IList<TSVSentenceIterator.SentenceField> fields, IList<string> entries)
{
    ICoreMap map = new ArrayCoreMap(fields.Count);
    // The token list is created lazily by whichever token-level column is seen first,
    // so column order in the TSV does not matter.
    Optional<IList<CoreLabel>> tokens = Optional.Empty();
    // First pass - process all token level stuff.
    foreach (Pair<TSVSentenceIterator.SentenceField, string> entry in Iterables.Zip(fields, entries))
    {
        TSVSentenceIterator.SentenceField field = entry.first;
        string value = TSVUtils.UnescapeSQL(entry.second);
        switch (field)
        {
            case TSVSentenceIterator.SentenceField.Words:
            {
                IList<string> values = TSVUtils.ParseArray(value);
                if (!tokens.IsPresent())
                {
                    tokens = Optional.Of(new List<CoreLabel>(values.Count));
                    for (int i = 0; i < values.Count; i++)
                    {
                        tokens.Get().Add(new CoreLabel());
                    }
                }
                // Provisional character offsets assuming single-space separation;
                // DocCharBegin/DocCharEnd columns (pass 3) override these if present.
                int beginChar = 0;
                for (int i_1 = 0; i_1 < values.Count; i_1++)
                {
                    tokens.Get()[i_1].SetValue(values[i_1]);
                    tokens.Get()[i_1].SetWord(values[i_1]);
                    tokens.Get()[i_1].SetBeginPosition(beginChar);
                    tokens.Get()[i_1].SetEndPosition(beginChar + values[i_1].Length);
                    beginChar += values[i_1].Length + 1;
                }
                break;
            }

            case TSVSentenceIterator.SentenceField.Lemmas:
            {
                IList<string> values = TSVUtils.ParseArray(value);
                if (!tokens.IsPresent())
                {
                    tokens = Optional.Of(new List<CoreLabel>(values.Count));
                    for (int i = 0; i < values.Count; i++)
                    {
                        tokens.Get().Add(new CoreLabel());
                    }
                }
                for (int i_1 = 0; i_1 < values.Count; i_1++)
                {
                    tokens.Get()[i_1].SetLemma(values[i_1]);
                }
                break;
            }

            case TSVSentenceIterator.SentenceField.PosTags:
            {
                IList<string> values = TSVUtils.ParseArray(value);
                if (!tokens.IsPresent())
                {
                    tokens = Optional.Of(new List<CoreLabel>(values.Count));
                    for (int i = 0; i < values.Count; i++)
                    {
                        tokens.Get().Add(new CoreLabel());
                    }
                }
                for (int i_1 = 0; i_1 < values.Count; i_1++)
                {
                    tokens.Get()[i_1].SetTag(values[i_1]);
                }
                break;
            }

            case TSVSentenceIterator.SentenceField.NerTags:
            {
                IList<string> values = TSVUtils.ParseArray(value);
                if (!tokens.IsPresent())
                {
                    tokens = Optional.Of(new List<CoreLabel>(values.Count));
                    for (int i = 0; i < values.Count; i++)
                    {
                        tokens.Get().Add(new CoreLabel());
                    }
                }
                for (int i_1 = 0; i_1 < values.Count; i_1++)
                {
                    tokens.Get()[i_1].SetNER(values[i_1]);
                }
                break;
            }

            default:
            {
                // ignore.
                break;
            }
        }
    }
    // Document specific stuff.
    Optional<string> docId = Optional.Empty();
    Optional<string> sentenceId = Optional.Empty();
    Optional<int> sentenceIndex = Optional.Empty();
    foreach (Pair<TSVSentenceIterator.SentenceField, string> entry_1 in Iterables.Zip(fields, entries))
    {
        TSVSentenceIterator.SentenceField field = entry_1.first;
        string value = TSVUtils.UnescapeSQL(entry_1.second);
        switch (field)
        {
            case TSVSentenceIterator.SentenceField.Id:
            {
                sentenceId = Optional.Of(value);
                break;
            }

            case TSVSentenceIterator.SentenceField.DocId:
            {
                docId = Optional.Of(value);
                break;
            }

            case TSVSentenceIterator.SentenceField.SentenceIndex:
            {
                sentenceIndex = Optional.Of(System.Convert.ToInt32(value));
                break;
            }

            case TSVSentenceIterator.SentenceField.Gloss:
            {
                // The gloss is stored with escaped newlines/tabs; restore them.
                value = value.Replace("\\n", "\n").Replace("\\t", "\t");
                map.Set(typeof(CoreAnnotations.TextAnnotation), value);
                break;
            }

            default:
            {
                // ignore.
                break;
            }
        }
    }
    // High level document stuff
    // NOTE(review): the sentence-level default index here is 0, while the token-level
    // default below is -1 — this asymmetry is preserved from the original; confirm intent.
    map.Set(typeof(CoreAnnotations.SentenceIDAnnotation), sentenceId.OrElse("-1"));
    map.Set(typeof(CoreAnnotations.DocIDAnnotation), docId.OrElse("???"));
    map.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex.OrElse(0));
    // Doc-char: document-level character offsets override the provisional ones from pass 1.
    if (tokens.IsPresent())
    {
        foreach (Pair<TSVSentenceIterator.SentenceField, string> entry_2 in Iterables.Zip(fields, entries))
        {
            TSVSentenceIterator.SentenceField field = entry_2.first;
            string value = TSVUtils.UnescapeSQL(entry_2.second);
            switch (field)
            {
                case TSVSentenceIterator.SentenceField.DocCharBegin:
                {
                    IList<string> values = TSVUtils.ParseArray(value);
                    for (int i = 0; i < tokens.Get().Count; i++)
                    {
                        tokens.Get()[i].SetBeginPosition(System.Convert.ToInt32(values[i]));
                    }
                    break;
                }

                case TSVSentenceIterator.SentenceField.DocCharEnd:
                {
                    IList<string> values = TSVUtils.ParseArray(value);
                    for (int i = 0; i < tokens.Get().Count; i++)
                    {
                        tokens.Get()[i].SetEndPosition(System.Convert.ToInt32(values[i]));
                    }
                    break;
                }

                default:
                {
                    // ignore.
                    break;
                }
            }
        }
    }
    // Final token level stuff: document id, sentence index, and 1-based token indices.
    if (tokens.IsPresent())
    {
        for (int i = 0; i < tokens.Get().Count; i++)
        {
            tokens.Get()[i].Set(typeof(CoreAnnotations.DocIDAnnotation), docId.OrElse("???"));
            tokens.Get()[i].Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex.OrElse(-1));
            tokens.Get()[i].Set(typeof(CoreAnnotations.IndexAnnotation), i + 1);
            tokens.Get()[i].Set(typeof(CoreAnnotations.TokenBeginAnnotation), i);
            tokens.Get()[i].Set(typeof(CoreAnnotations.TokenEndAnnotation), i + 1);
        }
    }
    // Dependency trees
    if (tokens.IsPresent())
    {
        map.Set(typeof(CoreAnnotations.TokensAnnotation), tokens.Get());
        map.Set(typeof(CoreAnnotations.TokenBeginAnnotation), 0);
        map.Set(typeof(CoreAnnotations.TokenEndAnnotation), tokens.Get().Count);
        foreach (Pair<TSVSentenceIterator.SentenceField, string> entry_2 in Iterables.Zip(fields, entries))
        {
            TSVSentenceIterator.SentenceField field = entry_2.first;
            string value = TSVUtils.UnescapeSQL(entry_2.second);
            switch (field)
            {
                case TSVSentenceIterator.SentenceField.DependenciesBasic:
                {
                    SemanticGraph graph = TSVUtils.ParseJsonTree(value, tokens.Get());
                    map.Set(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation), graph);
                    // if (!map.containsKey(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class))
                    //   map.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, graph);
                    // if (!map.containsKey(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class))
                    //   map.set(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class, graph);
                    break;
                }

                case TSVSentenceIterator.SentenceField.DependenciesCollapsed:
                {
                    SemanticGraph graph = TSVUtils.ParseJsonTree(value, tokens.Get());
                    map.Set(typeof(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation), graph);
                    break;
                }

                case TSVSentenceIterator.SentenceField.DependenciesCollapsedCc:
                {
                    SemanticGraph graph = TSVUtils.ParseJsonTree(value, tokens.Get());
                    // if (!map.containsKey(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class))
                    //   map.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, graph);
                    // map.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, graph);
                    map.Set(typeof(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation), graph);
                    break;
                }

                case TSVSentenceIterator.SentenceField.DependenciesAlternate:
                {
                    SemanticGraph graph = TSVUtils.ParseJsonTree(value, tokens.Get());
                    map.Set(typeof(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation), graph);
                    break;
                }

                default:
                {
                    // ignore.
                    break;
                }
            }
        }
    }
    return(map);
}
/// <summary>
/// Extracts TIMEX3 elements from an annotated XML document element into timex CoreMaps.
/// Character offsets are accumulated by walking the element's child nodes in order;
/// token offsets are recovered by looking up character offsets in maps built from the
/// original document's tokens, searching outward from the exact offset when no exact
/// match exists.
/// </summary>
/// <param name="docElem">XML element whose children interleave text nodes and TIMEX3 elements</param>
/// <param name="originalDocument">the annotated document supplying token/character offset pairs</param>
/// <returns>one CoreMap per TIMEX3 element, in document order</returns>
private static IList<ICoreMap> ToTimexCoreMaps(IElement docElem, ICoreMap originalDocument)
{
    //--Collect Token Offsets
    // Map character offset -> token offset for both begin and end positions.
    IDictionary<int, int> beginMap = Generics.NewHashMap();
    IDictionary<int, int> endMap = Generics.NewHashMap();
    bool haveTokenOffsets = true;
    foreach (ICoreMap sent in originalDocument.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        foreach (CoreLabel token in sent.Get(typeof(CoreAnnotations.TokensAnnotation)))
        {
            // NOTE(review): these are declared `int` yet compared to null — in plain C# that
            // comparison is always false. This only works if the converted Get/shim actually
            // yields a nullable/boxed value; confirm against the conversion runtime.
            int tokBegin = token.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
            int tokEnd = token.Get(typeof(CoreAnnotations.TokenEndAnnotation));
            if (tokBegin == null || tokEnd == null)
            {
                haveTokenOffsets = false;
            }
            int charBegin = token.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
            int charEnd = token.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
            beginMap[charBegin] = tokBegin;
            endMap[charEnd] = tokEnd;
        }
    }
    IList<ICoreMap> timexMaps = new List<ICoreMap>();
    // Running character offset through the document's child nodes.
    int offset = 0;
    INodeList docNodes = docElem.GetChildNodes();
    for (int i = 0; i < docNodes.GetLength(); i++)
    {
        INode content = docNodes.Item(i);
        if (content is IText)
        {
            // Plain text between timexes only advances the character offset.
            IText text = (IText)content;
            offset += text.GetWholeText().Length;
        }
        else
        {
            if (content is IElement)
            {
                IElement child = (IElement)content;
                if (child.GetNodeName().Equals("TIMEX3"))
                {
                    Timex timex = new Timex(child);
                    if (child.GetChildNodes().GetLength() != 1)
                    {
                        throw new Exception("TIMEX3 should only contain text " + child);
                    }
                    string timexText = child.GetTextContent();
                    ICoreMap timexMap = new ArrayCoreMap();
                    timexMap.Set(typeof(TimeAnnotations.TimexAnnotation), timex);
                    timexMap.Set(typeof(CoreAnnotations.TextAnnotation), timexText);
                    int charBegin = offset;
                    timexMap.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), offset);
                    offset += timexText.Length;
                    timexMap.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), offset);
                    int charEnd = offset;
                    //(tokens)
                    if (haveTokenOffsets)
                    {
                        // NOTE(review): indexing a Dictionary<int,int> on a missing key throws in
                        // C# (Java's HashMap.get returns null) — this relies on a shim/extension
                        // indexer returning null on miss. Also note the outward search loops
                        // below never terminate if no offset in either direction is mapped.
                        int tokBegin = beginMap[charBegin];
                        int searchStep = 1;
                        //if no exact match, search around the character offset
                        while (tokBegin == null)
                        {
                            tokBegin = beginMap[charBegin - searchStep];
                            if (tokBegin == null)
                            {
                                tokBegin = beginMap[charBegin + searchStep];
                            }
                            searchStep += 1;
                        }
                        searchStep = 1;
                        int tokEnd = endMap[charEnd];
                        while (tokEnd == null)
                        {
                            tokEnd = endMap[charEnd - searchStep];
                            if (tokEnd == null)
                            {
                                tokEnd = endMap[charEnd + searchStep];
                            }
                            searchStep += 1;
                        }
                        timexMap.Set(typeof(CoreAnnotations.TokenBeginAnnotation), tokBegin);
                        timexMap.Set(typeof(CoreAnnotations.TokenEndAnnotation), tokEnd);
                    }
                    timexMaps.Add(timexMap);
                }
                else
                {
                    throw new Exception("unexpected element " + child);
                }
            }
            else
            {
                throw new Exception("unexpected content " + content);
            }
        }
    }
    return(timexMaps);
}
/// <summary>Create an Annotation object (with a single sentence) from the given specification.</summary>
/// <param name="docid">document id, if known</param>
/// <param name="sentenceIndex">index of this sentence within its document, if known</param>
/// <param name="gloss">original sentence text, with "\n"/"\t" escaped</param>
/// <param name="tree">factory producing the primary dependency graph from the tokens</param>
/// <param name="maltTree">factory producing the alternative (Malt) dependency graph</param>
/// <param name="words">surface forms, one per token</param>
/// <param name="lemmas">lemmas, parallel to <paramref name="words"/></param>
/// <param name="pos">POS tags, parallel to <paramref name="words"/></param>
/// <param name="ner">NER tags, parallel to <paramref name="words"/></param>
/// <param name="sentenceid">sentence id, used only in error messages</param>
private static Annotation ParseSentence(Optional<string> docid, Optional<int> sentenceIndex, string gloss, Func<IList<CoreLabel>, SemanticGraph> tree, Func<IList<CoreLabel>, SemanticGraph> maltTree, IList<string> words, IList<string> lemmas, IList<string> pos, IList<string> ner, Optional<string> sentenceid)
{
    // Error checks: every per-token list must be parallel to the word list.
    if (lemmas.Count != words.Count)
    {
        throw new ArgumentException("Array lengths don't match: " + words.Count + " vs " + lemmas.Count + " (sentence " + sentenceid.OrElse("???") + ")");
    }
    if (pos.Count != words.Count)
    {
        throw new ArgumentException("Array lengths don't match: " + words.Count + " vs " + pos.Count + " (sentence " + sentenceid.OrElse("???") + ")");
    }
    if (ner.Count != words.Count)
    {
        throw new ArgumentException("Array lengths don't match: " + words.Count + " vs " + ner.Count + " (sentence " + sentenceid.OrElse("???") + ")");
    }
    // Build the token list, assigning character offsets as if words were space-separated.
    IList<CoreLabel> tokens = new List<CoreLabel>(words.Count);
    int charPos = 0;
    for (int idx = 0; idx < words.Count; ++idx)
    {
        string word = words[idx];
        CoreLabel tok = new CoreLabel(12);
        tok.SetWord(word);
        tok.SetValue(word);
        tok.SetBeginPosition(charPos);
        tok.SetEndPosition(charPos + word.Length);
        charPos += word.Length + 1;  // +1 for the separating space
        tok.SetLemma(lemmas[idx]);
        tok.SetTag(pos[idx]);
        tok.SetNER(ner[idx]);
        tok.Set(typeof(CoreAnnotations.DocIDAnnotation), docid.OrElse("???"));
        tok.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex.OrElse(-1));
        tok.Set(typeof(CoreAnnotations.IndexAnnotation), idx + 1);  // 1-based token index
        tok.Set(typeof(CoreAnnotations.TokenBeginAnnotation), idx);
        tok.Set(typeof(CoreAnnotations.TokenEndAnnotation), idx + 1);
        tokens.Add(tok);
    }
    // Restore the escaped whitespace in the gloss.
    gloss = gloss.Replace("\\n", "\n").Replace("\\t", "\t");
    // Assemble the sentence CoreMap: tokens, both dependency graphs, and metadata.
    ICoreMap sentence = new ArrayCoreMap(16);
    sentence.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
    SemanticGraph primaryGraph = tree.Apply(tokens);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation), primaryGraph);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation), primaryGraph);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation), primaryGraph);
    SemanticGraph altGraph = maltTree.Apply(tokens);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation), altGraph);
    sentence.Set(typeof(CoreAnnotations.DocIDAnnotation), docid.OrElse("???"));
    sentence.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex.OrElse(-1));
    sentence.Set(typeof(CoreAnnotations.TextAnnotation), gloss);
    sentence.Set(typeof(CoreAnnotations.TokenBeginAnnotation), 0);
    sentence.Set(typeof(CoreAnnotations.TokenEndAnnotation), tokens.Count);
    // Wrap the single sentence in a document-level Annotation.
    Annotation doc = new Annotation(gloss);
    doc.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
    doc.Set(typeof(CoreAnnotations.SentencesAnnotation), Java.Util.Collections.SingletonList(sentence));
    doc.Set(typeof(CoreAnnotations.DocIDAnnotation), docid.OrElse("???"));
    doc.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex.OrElse(-1));
    return doc;
}
/// <summary>
/// Reads a file of (optionally tab-separated "id\ttext") lines in which entity spans are
/// marked inline with tags like &lt;CATEGORY&gt;...&lt;/CATEGORY&gt;, tokenizes the text,
/// and returns one CoreMap per sentence with the inline tags stripped and each token
/// labeled with the surrounding category (or the background symbol "O").
/// </summary>
/// <param name="reader">source of input lines</param>
/// <param name="categoriesAllowed">category names recognized as inline tags</param>
/// <param name="setClassForTheseLabels">optional map from label to the annotation class to set for it</param>
/// <param name="setGoldClass">when true, also store the label as the token's gold answer</param>
/// <param name="sentIDprefix">prefix prepended to every generated sentence id</param>
/// <exception cref="System.IO.IOException"/>
public static IList<ICoreMap> ParseFile(BufferedReader reader, ICollection<string> categoriesAllowed, IDictionary<string, Type> setClassForTheseLabels, bool setGoldClass, string sentIDprefix)
{
    // Regexes matching opening and closing inline tags for any allowed category.
    Pattern startingLabelToken = Pattern.Compile("<(" + StringUtils.Join(categoriesAllowed, "|") + ")>");
    Pattern endLabelToken = Pattern.Compile("</(" + StringUtils.Join(categoriesAllowed, "|") + ")>");
    string backgroundSymbol = "O";
    IList<ICoreMap> sentences = new List<ICoreMap>();
    int lineNum = -1;
    string l = null;
    while ((l = reader.ReadLine()) != null)
    {
        lineNum++;
        // Lines may be "id<TAB>text" or bare text (id then falls back to the line number).
        string[] t = l.Split("\t", 2);
        string id = null;
        string text = null;
        if (t.Length == 2)
        {
            id = t[0];
            text = t[1];
        }
        else
        {
            if (t.Length == 1)
            {
                text = t[0];
                id = lineNum.ToString();
            }
        }
        // NOTE(review): if Split ever returns 0 parts, `text` stays null and the
        // StringReader below would throw — presumably unreachable; confirm.
        id = sentIDprefix + id;
        DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text));
        PTBTokenizer.PTBTokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.PTBTokenizerFactory.NewCoreLabelTokenizerFactory("ptb3Escaping=false,normalizeParentheses=false,escapeForwardSlashAsterisk=false");
        dp.SetTokenizerFactory(tokenizerFactory);
        // The current label carries across tokens until a closing tag resets it;
        // note it also carries across sentence boundaries within the same line.
        string label = backgroundSymbol;
        int sentNum = -1;
        foreach (IList<IHasWord> sentence in dp)
        {
            sentNum++;
            string sentStr = string.Empty;
            IList<CoreLabel> sent = new List<CoreLabel>();
            foreach (IHasWord tokw in sentence)
            {
                string tok = tokw.Word();
                Matcher startingMatcher = startingLabelToken.Matcher(tok);
                Matcher endMatcher = endLabelToken.Matcher(tok);
                if (startingMatcher.Matches())
                {
                    //System.out.println("matched starting");
                    // Opening tag: switch the running label; the tag itself is not emitted.
                    label = startingMatcher.Group(1);
                }
                else
                {
                    if (endMatcher.Matches())
                    {
                        //System.out.println("matched end");
                        // Closing tag: revert to the background label.
                        label = backgroundSymbol;
                    }
                    else
                    {
                        // Ordinary token: build a CoreLabel carrying the current label.
                        // NOTE(review): the single-element `toks` list and loop look like a
                        // vestige of former token-splitting logic — behavior is one iteration.
                        CoreLabel c = new CoreLabel();
                        IList<string> toks = new List<string>();
                        toks.Add(tok);
                        foreach (string toksplit in toks)
                        {
                            sentStr += " " + toksplit;
                            c.SetWord(toksplit);
                            c.SetLemma(toksplit);
                            c.SetValue(toksplit);
                            c.Set(typeof(CoreAnnotations.TextAnnotation), toksplit);
                            c.Set(typeof(CoreAnnotations.OriginalTextAnnotation), tok);
                            if (setGoldClass)
                            {
                                c.Set(typeof(CoreAnnotations.GoldAnswerAnnotation), label);
                            }
                            // NOTE(review): Contains on IDictionary is assumed to be a
                            // key-containment shim from the Java conversion — confirm.
                            if (setClassForTheseLabels != null && setClassForTheseLabels.Contains(label))
                            {
                                c.Set(setClassForTheseLabels[label], label);
                            }
                            sent.Add(c);
                        }
                    }
                }
            }
            // One CoreMap per sentence: reconstructed text, tokens, and a derived doc id.
            ICoreMap sentcm = new ArrayCoreMap();
            sentcm.Set(typeof(CoreAnnotations.TextAnnotation), sentStr.Trim());
            sentcm.Set(typeof(CoreAnnotations.TokensAnnotation), sent);
            sentcm.Set(typeof(CoreAnnotations.DocIDAnnotation), id + "-" + sentNum);
            sentences.Add(sentcm);
        }
    }
    return(sentences);
}
/*
 * The old JDOM-based implementation of this method was removed (it was no longer
 * maintained due to JDOM licensing issues); see version control history for the code.
 */
/// <summary>
/// Parses a TimeBank-style XML document (a root element with a TEXT child containing
/// SENT elements holding bracketed parse trees) into an Annotation: per-sentence trees,
/// tokens with POS tags and character offsets, and a document date extracted from the
/// document id.
/// </summary>
/// <param name="xml">the XML document as a string</param>
/// <returns>the assembled Annotation</returns>
/// <exception cref="System.IO.IOException"/>
private static Annotation ToAnnotation(string xml)
{
    Element docElem;
    try
    {
        Builder parser = new Builder();
        StringReader @in = new StringReader(xml);
        docElem = parser.Build(@in).GetRootElement();
    }
    catch (Exception e)
    {
        // BUG FIX: the original used Java-style "%s" placeholders, which .NET's
        // string.Format emits literally — the exception and input were silently
        // dropped from the message. Use composite-format placeholders instead.
        throw new Exception(string.Format("error:\n{0}\ninput:\n{1}", e, xml));
    }
    Element textElem = docElem.GetFirstChildElement("TEXT");
    StringBuilder text = new StringBuilder();
    int offset = 0;  // running character offset into the reconstructed text
    IList<ICoreMap> sentences = new List<ICoreMap>();
    Elements sentenceElements = textElem.GetChildElements("SENT");
    for (int crtsent = 0; crtsent < sentenceElements.Size(); crtsent++)
    {
        Element sentElem = sentenceElements.Get(crtsent);
        ICoreMap sentence = new ArrayCoreMap();
        sentence.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), offset);
        // XXX ms: is this the same as sentElem.getText() in JDOM?
        Tree tree = Tree.ValueOf(sentElem.GetChild(0).GetValue());
        IList<CoreLabel> tokens = new List<CoreLabel>();
        IList<Tree> preTerminals = PreTerminals(tree);
        foreach (Tree preTerminal in preTerminals)
        {
            string posTag = preTerminal.Value();
            foreach (Tree wordTree in preTerminal.Children())
            {
                string word = wordTree.Value();
                CoreLabel token = new CoreLabel();
                // (the original set TextAnnotation twice; the duplicate was removed)
                token.Set(typeof(CoreAnnotations.TextAnnotation), word);
                token.Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), posTag);
                token.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), offset);
                offset += word.Length;
                token.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), offset);
                text.Append(word);
                text.Append(' ');
                offset += 1;  // account for the appended separator space
                tokens.Add(token);
            }
        }
        if (preTerminals.Count > 0)
        {
            // Replace the trailing space of the sentence's last token with a newline.
            Sharpen.Runtime.SetCharAt(text, text.Length - 1, '\n');
        }
        sentence.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), offset - 1);
        sentence.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
        sentence.Set(typeof(TreeCoreAnnotations.TreeAnnotation), tree);
        sentences.Add(sentence);
    }
    // The document id is expected to embed a date matched by datePattern.
    // NOTE(review): Find()'s result is not checked — a non-matching id would make
    // Group(1) throw; this mirrors the original behavior.
    string docID = docElem.GetAttributeValue("id");
    Matcher matcher = datePattern.Matcher(docID);
    matcher.Find();
    Calendar docDate = new Timex("DATE", matcher.Group(1)).GetDate();
    Annotation document = new Annotation(text.ToString());
    document.Set(typeof(CoreAnnotations.DocIDAnnotation), docID);
    document.Set(typeof(CoreAnnotations.CalendarAnnotation), docDate);
    document.Set(typeof(CoreAnnotations.SentencesAnnotation), sentences);
    return(document);
}