/// <summary>
/// Runs the external HeidelTime jar over the document text and stores the
/// resulting Timex annotations on the document and on each of its sentences.
/// </summary>
/// <remarks>
/// The document text is written to a temporary file, which is handed to a
/// <c>java -jar heideltime.jar</c> subprocess. The captured output is cleaned
/// up, parsed as XML and converted with <c>ToTimexCoreMaps</c>. The temporary
/// file is removed whether or not processing succeeds.
/// </remarks>
/// <param name="document">
/// Document to annotate; it must carry either a Calendar or a DocDate annotation.
/// </param>
/// <exception cref="System.ArgumentException">
/// The document has neither a Calendar nor a DocDate annotation.
/// </exception>
/// <exception cref="System.IO.IOException"/>
public virtual void Annotate(ICoreMap document)
{
    //--Get Date
    //(error checks)
    // Validated before any file is created so a rejected document leaves nothing behind.
    if (!document.ContainsKey(typeof(CoreAnnotations.CalendarAnnotation)) && !document.ContainsKey(typeof(CoreAnnotations.DocDateAnnotation)))
    {
        throw new ArgumentException("CoreMap must have either a Calendar or DocDate annotation");
    }
    //not strictly necessary, technically...
    //(variables)
    Calendar dateCalendar = document.Get(typeof(CoreAnnotations.CalendarAnnotation));
    string pubDate = null;
    if (dateCalendar != null)
    {
        //(case: calendar annotation)
        // NOTE(review): "%TF" is a Java-style format specifier carried over from the
        // original source -- confirm the string.Format in scope honours it.
        pubDate = string.Format("%TF", dateCalendar);
    }
    else
    {
        //(case: docdateannotation)
        // May still be null, in which case no "-dct" argument is passed below.
        pubDate = document.Get(typeof(CoreAnnotations.DocDateAnnotation));
    }

    //--Create Input File
    //(create file)
    File inputFile = File.CreateTempFile("heideltime", ".input");
    IElement outputXML;
    try
    {
        //(write to file)
        PrintWriter inputWriter = new PrintWriter(inputFile);
        try
        {
            inputWriter.Println(document.Get(typeof(CoreAnnotations.TextAnnotation)));
        }
        finally
        {
            // Close even when writing fails so the file handle is not leaked.
            inputWriter.Close();
        }

        //--Build Command
        List<string> args = new List<string>
        {
            "java",
            "-jar",
            this.heideltimePath.GetPath() + "/heideltime.jar",
            "-c",
            this.heideltimePath.GetPath() + "/config.props",
            "-l",
            this.language,
            "-t",
            "NEWS"
        };
        if (pubDate != null)
        {
            args.Add("-dct");
            args.Add(pubDate);
        }
        args.Add(inputFile.GetPath());

        // run HeidelTime on the input file
        ProcessBuilder process = new ProcessBuilder(args);
        StringWriter outputWriter = new StringWriter();
        SystemUtils.Run(process, outputWriter, null);
        string output = outputWriter.GetBuffer().ToString();

        // Drop everything after the closing </DOC> tag and strip the DOCTYPE declaration.
        //TODO TimeML.dtd? FileNotFoundException if we leave it in
        Pattern docClose = Pattern.Compile("</DOC>.*", Pattern.Dotall);
        output = docClose.Matcher(output).ReplaceAll("</DOC>").ReplaceAll("<!DOCTYPE TimeML SYSTEM \"TimeML.dtd\">", string.Empty);

        // Repair the two interleaved-tag forms this code knows about, where a closing
        // </TIMEX3> lands inside the following opening tag.
        Pattern badNestedTimex = Pattern.Compile(Pattern.Quote("<T</TIMEX3>IMEX3"));
        output = badNestedTimex.Matcher(output).ReplaceAll("</TIMEX3><TIMEX3");
        Pattern badNestedTimex2 = Pattern.Compile(Pattern.Quote("<TI</TIMEX3>MEX3"));
        output = badNestedTimex2.Matcher(output).ReplaceAll("</TIMEX3><TIMEX3");
        output = output.ReplaceAll("<TimeML>", string.Empty);

        // parse the HeidelTime output
        try
        {
            outputXML = XMLUtils.ParseElement(output);
        }
        catch (Exception ex)
        {
            // The input file is read here for the diagnostic message, so it must
            // still exist; it is deleted by the enclosing finally afterwards.
            throw new Exception(string.Format("error:\n%s\ninput:\n%s\noutput:\n%s", ex, IOUtils.SlurpFile(inputFile), output), ex);
        }
    }
    finally
    {
        // Always remove the temporary input file, on success and on every failure path.
        inputFile.Delete();
    }

    // get Timex annotations
    IList<ICoreMap> timexAnns = ToTimexCoreMaps(outputXML, document);
    document.Set(typeof(TimeAnnotations.TimexAnnotations), timexAnns);
    if (outputResults)
    {
        System.Console.Out.WriteLine(timexAnns);
    }

    // align Timex annotations to sentences
    // timexIndex only ever moves forward: a single pass over timexAnns is shared
    // across all sentences.
    int timexIndex = 0;
    foreach (ICoreMap sentence in document.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        int sentBegin = BeginOffset(sentence);
        int sentEnd = EndOffset(sentence);

        // skip times before the sentence
        while (timexIndex < timexAnns.Count && BeginOffset(timexAnns[timexIndex]) < sentBegin)
        {
            ++timexIndex;
        }

        // determine times within the sentence
        int sublistBegin = timexIndex;
        int sublistEnd = timexIndex;
        while (timexIndex < timexAnns.Count && sentBegin <= BeginOffset(timexAnns[timexIndex]) && EndOffset(timexAnns[timexIndex]) <= sentEnd)
        {
            ++sublistEnd;
            ++timexIndex;
        }

        // set the sentence timexes
        sentence.Set(typeof(TimeAnnotations.TimexAnnotations), timexAnns.SubList(sublistBegin, sublistEnd));
    }
}
/// <summary>
/// Initializes this instance from an XML element by forwarding to the
/// two-argument <c>Init</c> overload.
/// </summary>
/// <param name="element">
/// Element to initialize from; it is passed on both as a string (via
/// <c>XMLUtils.NodeToString</c> with <c>false</c> as the second argument)
/// and as the element itself.
/// </param>
private void Init(IElement element)
{
    string elementText = XMLUtils.NodeToString(element, false);
    Init(elementText, element);
}