/// <summary>
/// Builds a <c>Timex</c> from an XML string (mainly intended for testing).
/// </summary>
/// <param name="xml">XML text whose parsed element is expected to be named TIMEX3.</param>
/// <returns>A new <c>Timex</c> initialised from the parsed element.</returns>
/// <exception cref="System.ArgumentException">The parsed element is not named TIMEX3.</exception>
public static Timex FromXml(string xml)
{
    IElement parsedElement = XMLUtils.ParseElement(xml);
    bool isTimexNode = "TIMEX3".Equals(parsedElement.GetNodeName());
    if (!isTimexNode)
    {
        throw new ArgumentException("Invalid timex xml: " + xml);
    }

    // Initialise from the parsed element only, not from the raw string: the original
    // input xml is therefore not preserved and its attributes get reordered. That is
    // what lets the xml of a test timex match the xml of an actual timex, since the
    // attribute order can no longer be controlled now that nu.xom is not used.
    Timex result = new Timex();
    result.Init(parsedElement);
    return result;
}
/// <summary>
/// Runs the external HeidelTime jar over the document text, parses the XML it prints,
/// and stores the resulting Timex annotations on the document and on each sentence.
/// </summary>
/// <param name="document">
/// CoreMap providing the text, the sentences, and either a Calendar or a DocDate annotation.
/// </param>
/// <exception cref="System.ArgumentException">
/// The document has neither a Calendar nor a DocDate annotation.
/// </exception>
/// <exception cref="System.IO.IOException"/>
public virtual void Annotate(ICoreMap document)
{
    //--Check Date Annotations
    // Checked before the temp file is created so that a rejected document leaves nothing on disk.
    if (!document.ContainsKey(typeof(CoreAnnotations.CalendarAnnotation)) && !document.ContainsKey(typeof(CoreAnnotations.DocDateAnnotation)))
    {
        throw new ArgumentException("CoreMap must have either a Calendar or DocDate annotation");
    }
    //not strictly necessary, technically...

    //--Get Date
    Calendar dateCalendar = document.Get(typeof(CoreAnnotations.CalendarAnnotation));
    string pubDate;
    if (dateCalendar != null)
    {
        //(case: calendar annotation)
        // NOTE(review): "%TF" is a java.util.Formatter pattern; .NET's string.Format does not
        // interpret it, so this presumably yields the literal text "%TF" instead of a date.
        // Left unchanged because the Calendar API is not visible here -- confirm and replace
        // with the proper yyyy-MM-dd formatting for this Calendar type.
        pubDate = string.Format("%TF", dateCalendar);
    }
    else
    {
        //(case: docdateannotation) -- may still be null, in which case -dct is omitted below
        pubDate = document.Get(typeof(CoreAnnotations.DocDateAnnotation));
    }

    //--Create Input File
    File inputFile = File.CreateTempFile("heideltime", ".input");
    IElement outputXML;
    try
    {
        //(write to file) -- the writer is closed even when the write itself fails
        PrintWriter inputWriter = new PrintWriter(inputFile);
        try
        {
            inputWriter.Println(document.Get(typeof(CoreAnnotations.TextAnnotation)));
        }
        finally
        {
            inputWriter.Close();
        }

        //--Build Command
        List<string> args = new List<string>
        {
            "java",
            "-jar",
            this.heideltimePath.GetPath() + "/heideltime.jar",
            "-c",
            this.heideltimePath.GetPath() + "/config.props",
            "-l",
            this.language,
            "-t",
            "NEWS"
        };
        if (pubDate != null)
        {
            args.Add("-dct");
            args.Add(pubDate);
        }
        args.Add(inputFile.GetPath());

        // run HeidelTime on the input file
        ProcessBuilder process = new ProcessBuilder(args);
        StringWriter outputWriter = new StringWriter();
        SystemUtils.Run(process, outputWriter, null);
        string output = outputWriter.GetBuffer().ToString();

        // drop everything after the closing </DOC> tag, then strip the DOCTYPE declaration
        Pattern docClose = Pattern.Compile("</DOC>.*", Pattern.Dotall);
        //TODO TimeML.dtd? FileNotFoundException if we leave it in
        output = docClose.Matcher(output).ReplaceAll("</DOC>").ReplaceAll("<!DOCTYPE TimeML SYSTEM \"TimeML.dtd\">", string.Empty);

        // repair the two malformed tag interleavings so that the XML parses
        Pattern badNestedTimex = Pattern.Compile(Pattern.Quote("<T</TIMEX3>IMEX3"));
        output = badNestedTimex.Matcher(output).ReplaceAll("</TIMEX3><TIMEX3");
        Pattern badNestedTimex2 = Pattern.Compile(Pattern.Quote("<TI</TIMEX3>MEX3"));
        output = badNestedTimex2.Matcher(output).ReplaceAll("</TIMEX3><TIMEX3");
        output = output.ReplaceAll("<TimeML>", string.Empty);

        // parse the HeidelTime output
        try
        {
            outputXML = XMLUtils.ParseElement(output);
        }
        catch (Exception ex)
        {
            // .NET composite-format placeholders: the Java-style "%s" markers would be emitted
            // literally and the three diagnostic arguments silently dropped.
            throw new Exception(string.Format("error:\n{0}\ninput:\n{1}\noutput:\n{2}", ex, IOUtils.SlurpFile(inputFile), output), ex);
        }
    }
    finally
    {
        // remove the temp file on every path, including failures of the run or the parse
        inputFile.Delete();
    }

    // get Timex annotations
    IList<ICoreMap> timexAnns = ToTimexCoreMaps(outputXML, document);
    document.Set(typeof(TimeAnnotations.TimexAnnotations), timexAnns);
    if (outputResults)
    {
        System.Console.Out.WriteLine(timexAnns);
    }

    // align Timex annotations to sentences; timexIndex only ever moves forward,
    // so each timex is examined against the sentences in a single pass
    int timexIndex = 0;
    foreach (ICoreMap sentence in document.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        int sentBegin = BeginOffset(sentence);
        int sentEnd = EndOffset(sentence);

        // skip times before the sentence
        while (timexIndex < timexAnns.Count && BeginOffset(timexAnns[timexIndex]) < sentBegin)
        {
            ++timexIndex;
        }

        // determine times within the sentence
        int sublistBegin = timexIndex;
        int sublistEnd = timexIndex;
        while (timexIndex < timexAnns.Count && sentBegin <= BeginOffset(timexAnns[timexIndex]) && EndOffset(timexAnns[timexIndex]) <= sentEnd)
        {
            ++sublistEnd;
            ++timexIndex;
        }

        // set the sentence timexes
        sentence.Set(typeof(TimeAnnotations.TimexAnnotations), timexAnns.SubList(sublistBegin, sublistEnd));
    }
}
/// <summary>
/// Runs the external GUTime Perl script (TimeTag.pl) over the document, parses the XML it
/// prints, and stores the resulting Timex annotations on the document and on each sentence.
/// </summary>
/// <param name="document">
/// CoreMap providing the content to tag and the sentences. When it carries neither a Calendar
/// nor a DocDate annotation, the script is invoked with the extra <c>-FDNW</c> flag.
/// </param>
/// <exception cref="System.IO.IOException"/>
public virtual void Annotate(ICoreMap document)
{
    // write input file in GUTime format
    IElement inputXML = ToInputXML(document);
    File inputFile = File.CreateTempFile("gutime", ".input");
    IElement outputXML;
    try
    {
        // the writer is closed even when the write itself fails
        PrintWriter inputWriter = new PrintWriter(inputFile);
        try
        {
            inputWriter.Println(XMLUtils.NodeToString(inputXML, false));
        }
        finally
        {
            inputWriter.Close();
        }

        bool useFirstDate = (!document.ContainsKey(typeof(CoreAnnotations.CalendarAnnotation)) && !document.ContainsKey(typeof(CoreAnnotations.DocDateAnnotation)));

        List<string> args = new List<string>
        {
            "perl",
            "-I" + this.gutimePath.GetPath(),
            new File(this.gutimePath, "TimeTag.pl").GetPath()
        };
        if (useFirstDate)
        {
            args.Add("-FDNW");
        }
        args.Add(inputFile.GetPath());

        // run GUTime on the input file
        ProcessBuilder process = new ProcessBuilder(args);
        StringWriter outputWriter = new StringWriter();
        SystemUtils.Run(process, outputWriter, null);
        string output = outputWriter.GetBuffer().ToString();

        // drop everything after the closing </DOC> tag
        Pattern docClose = Pattern.Compile("</DOC>.*", Pattern.Dotall);
        output = docClose.Matcher(output).ReplaceAll("</DOC>");

        //The TimeTag.pl result file contains next tags which must be removed
        output = output.ReplaceAll("<lex.*?>", string.Empty);
        string[] literalTags = { "</lex>", "<NG>", "</NG>", "<VG>", "</VG>", "<s>", "</s>" };
        foreach (string tag in literalTags)
        {
            output = output.Replace(tag, string.Empty);
        }

        // parse the GUTime output
        try
        {
            outputXML = XMLUtils.ParseElement(output);
        }
        catch (Exception ex)
        {
            // .NET composite-format placeholders: the Java-style "%s" markers would be emitted
            // literally and the three diagnostic arguments silently dropped.
            throw new Exception(string.Format("error:\n{0}\ninput:\n{1}\noutput:\n{2}", ex, IOUtils.SlurpFile(inputFile), output), ex);
        }
    }
    finally
    {
        // remove the temp file on every path, including failures of the run or the parse
        inputFile.Delete();
    }

    // get Timex annotations
    IList<ICoreMap> timexAnns = ToTimexCoreMaps(outputXML, document);
    document.Set(typeof(TimeAnnotations.TimexAnnotations), timexAnns);
    if (outputResults)
    {
        System.Console.Out.WriteLine(timexAnns);
    }

    // align Timex annotations to sentences; timexIndex only ever moves forward,
    // so each timex is examined against the sentences in a single pass
    int timexIndex = 0;
    foreach (ICoreMap sentence in document.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        int sentBegin = BeginOffset(sentence);
        int sentEnd = EndOffset(sentence);

        // skip times before the sentence
        while (timexIndex < timexAnns.Count && BeginOffset(timexAnns[timexIndex]) < sentBegin)
        {
            ++timexIndex;
        }

        // determine times within the sentence
        int sublistBegin = timexIndex;
        int sublistEnd = timexIndex;
        while (timexIndex < timexAnns.Count && sentBegin <= BeginOffset(timexAnns[timexIndex]) && EndOffset(timexAnns[timexIndex]) <= sentEnd)
        {
            ++sublistEnd;
            ++timexIndex;
        }

        // set the sentence timexes
        sentence.Set(typeof(TimeAnnotations.TimexAnnotations), timexAnns.SubList(sublistBegin, sublistEnd));
    }
}