/// <summary>
/// Builds the annotation map for a single timex node, appends it to the sentence's
/// timex list, and tags every covered token as a DATE entity.
/// </summary>
/// <param name="node">Timex node supplying the timex object and its text contents.</param>
/// <param name="tokens">Tokens covered by the node; the first and last elements are read, so the list must be non-empty.</param>
/// <param name="sentence">Sentence whose timex list receives the new map; the list is created when absent.</param>
/// <returns>The newly populated timex map.</returns>
private ICoreMap MakeTimexMap(HeidelTimeKBPAnnotator.HeidelTimeOutputReader.TimexNode node, IList<CoreLabel> tokens, ICoreMap sentence)
{
    ICoreMap result = new ArrayCoreMap();
    result.Set(typeof(TimeAnnotations.TimexAnnotation), node.timex);
    result.Set(typeof(CoreAnnotations.TextAnnotation), node.contents);

    // Character and token boundaries both come from the first and last covered token.
    CoreLabel firstToken = tokens[0];
    result.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), BeginOffset(firstToken));
    CoreLabel lastToken = tokens[tokens.Count - 1];
    result.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), EndOffset(lastToken));
    result.Set(typeof(CoreAnnotations.TokenBeginAnnotation), firstToken.Index());
    result.Set(typeof(CoreAnnotations.TokenEndAnnotation), lastToken.Index());
    result.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);

    // Make sure the sentence has a timex list before appending to it.
    if (sentence.Get(typeof(TimeAnnotations.TimexAnnotations)) == null)
    {
        sentence.Set(typeof(TimeAnnotations.TimexAnnotations), new List<ICoreMap>());
    }
    sentence.Get(typeof(TimeAnnotations.TimexAnnotations)).Add(result);

    // Propagate the NER tag, its normalized value, and the timex itself to each token.
    for (int i = 0; i < tokens.Count; i++)
    {
        CoreLabel covered = tokens[i];
        covered.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), "DATE");
        covered.Set(typeof(CoreAnnotations.NormalizedNamedEntityTagAnnotation), node.timex.Value());
        covered.Set(typeof(TimeAnnotations.TimexAnnotation), node.timex);
    }

    return result;
}
/// <summary>
/// Aligns the node sequence parsed from <paramref name="output"/> with the document's
/// tokens and builds a timex map for every timex node whose first and last token share
/// a sentence index.
/// </summary>
/// <param name="document">Document supplying the sentences and tokens annotations.</param>
/// <param name="output">Raw text passed to <c>ToNodeSequence</c> to obtain the nodes.</param>
/// <returns>The timex maps that were created, in node order.</returns>
public virtual IList<ICoreMap> Process(ICoreMap document, string output)
{
    List<ICoreMap> ret = new List<ICoreMap>();
    IList<ICoreMap> sentences = document.Get(typeof(CoreAnnotations.SentencesAnnotation));
    IList<CoreLabel> tokens = document.Get(typeof(CoreAnnotations.TokensAnnotation));
    IList<HeidelTimeKBPAnnotator.HeidelTimeOutputReader.Node> nodes = ToNodeSequence(output);

    int tokenIdx = 0;
    // Text left over from a token that was only partially matched by the previous node.
    string partial = string.Empty;

    foreach (HeidelTimeKBPAnnotator.HeidelTimeOutputReader.Node node in nodes)
    {
        // Text of this node that still has to be matched against tokens.
        string text = node.contents.Trim();

        // Skip past leading *NL* tokens. The bounds check must come first so that
        // trailing *NL* tokens cannot index past the end of the list.
        while (tokenIdx < tokens.Count && tokens[tokenIdx].Word().Equals("*NL*"))
        {
            tokenIdx += 1;
        }

        int tokenEndIdx = tokenIdx;
        foreach (CoreLabel token in tokens.SubList(tokenIdx, tokens.Count))
        {
            if (text.Length == 0)
            {
                break;
            }
            tokenEndIdx++;
            string matchStr = token.OriginalText().Trim();

            // An *NL* token in the middle of a node carries no text; step over it.
            if (Objects.Equals(matchStr, "*NL*"))
            {
                continue;
            }

            // Ordinal comparison: the slicing below relies on exact character counts,
            // which a culture-sensitive prefix test does not guarantee.
            if ((partial + text).StartsWith(matchStr, System.StringComparison.Ordinal))
            {
                // The token is fully covered; consume it from the node text and clear partial.
                text = Sharpen.Runtime.Substring(text, matchStr.Length - partial.Length).Trim();
                partial = string.Empty;
            }
            else if (matchStr.StartsWith(partial + text, System.StringComparison.Ordinal))
            {
                // Partial match: the token extends past this node, so remember
                // how much of it has been matched so far.
                partial = Sharpen.Runtime.Substring(matchStr, 0, partial.Length + text.Length);
                text = string.Empty;
            }
            else
            {
                // This should never happen.
                System.Diagnostics.Debug.Assert(false);
            }
        }

        // Only process time nodes that cover at least one token and whose first and
        // last token share a sentence index. The span check also keeps the indexers
        // below in range when a node matched no tokens.
        if (node is HeidelTimeKBPAnnotator.HeidelTimeOutputReader.TimexNode
            && tokenEndIdx > tokenIdx
            && tokens[tokenIdx].SentIndex() == tokens[tokenEndIdx - 1].SentIndex())
        {
            HeidelTimeKBPAnnotator.HeidelTimeOutputReader.TimexNode timexNode = (HeidelTimeKBPAnnotator.HeidelTimeOutputReader.TimexNode)node;
            ICoreMap sentence = sentences[tokens[tokenIdx].SentIndex()];
            ret.Add(MakeTimexMap(timexNode, tokens.SubList(tokenIdx, tokenEndIdx), sentence));
        }

        if (partial.Length > 0)
        {
            // Move back a token because it is shared between this node and the next.
            tokenIdx = tokenEndIdx - 1;
        }
        else
        {
            tokenIdx = tokenEndIdx;
        }
    }

    return ret;
}