/// <summary>
/// Builds one <see cref="Scenario"/> for every group held by the time manager by running
/// each sentence of the group through dependency-parse based element extraction.
/// Reference: https://stanfordnlp.github.io/CoreNLP/api.html
/// </summary>
/// <param name="manager">Time manager whose <c>Group</c> maps each key to the sentences describing one scenario.</param>
/// <returns>The generated scenarios, in the enumeration order of the group keys.</returns>
public List<Scenario> GenerateScenarios(ScenarioTimeManager manager)
{
    List<Scenario> result = new List<Scenario>();
    string chainId = "S0";

    // Iterate over a snapshot of the keys; every key owns the sentences of one scenario.
    foreach (var groupKey in manager.Group.Keys.ToList())
    {
        List<edu.stanford.nlp.util.CoreMap> groupSentences = manager.Group[groupKey];
        Scenario current = new Scenario(_net, chainId);

        // Process sentence by sentence, accumulating the scenario's element values.
        foreach (edu.stanford.nlp.util.CoreMap sentence in groupSentences)
        {
            ElementExtractByDependencyPrase(sentence, current);
        }

        result.Add(current);

        // Advance the event-chain id by appending "1": "S0", "S01", "S011", ...
        chainId = chainId + "1";
    }

    return result;
}
/// <summary>
/// Converts the tokens of every sentence in every scenario group into word vectors.
/// </summary>
/// <param name="manager">Time manager whose <c>Group</c> maps each key to the sentences describing one scenario.</param>
/// <returns>
/// One vector per token for which <c>_net.ToDouble</c> returned a non-null result,
/// in group / sentence / token order. Sentences without a token list contribute nothing.
/// </returns>
public List<double[]> ToScenarioWordVector(ScenarioTimeManager manager)
{
    List<double[]> vectors = new List<double[]>();

    // Iterate over a snapshot of the keys; every key owns the sentences of one scenario.
    foreach (var key in manager.Group.Keys.ToList())
    {
        List<edu.stanford.nlp.util.CoreMap> group = manager.Group[key];
        foreach (edu.stanford.nlp.util.CoreMap sentence in group)
        {
            java.util.AbstractList tokens = sentence.get(tokensAnnotationClass) as java.util.AbstractList;
            if (tokens == null)
            {
                // The `as` cast yields null when the annotation is absent or is not an
                // AbstractList; skip the sentence instead of failing the whole conversion.
                continue;
            }

            foreach (edu.stanford.nlp.ling.CoreLabel label in tokens)
            {
                double[] vector = _net.ToDouble(label.value());
                if (vector != null)
                {
                    vectors.Add(vector);
                }
            }
        }
    }

    return vectors;
}
/// <summary>
/// Annotates the text through the CoreNLP server and collects the DATE and TIME mentions
/// of every sentence into a <see cref="ScenarioTimeManager"/>, i.e. induces the time
/// series of the described events.
/// </summary>
/// <param name="rawText">Text to process.</param>
/// <param name="beginDateTime">
/// Reference date of the document. When null the current system date is used; setting it
/// to the date the event happened is recommended.
/// </param>
/// <returns>
/// The populated manager, or null when <paramref name="rawText"/> is null or empty or the
/// annotated document carries no sentence list.
/// </returns>
public ScenarioTimeManager InductiveEventTimeSeries(string rawText, string beginDateTime = null)
{
    if (string.IsNullOrEmpty(rawText))
    {
        return null;
    }

    // Annotate the text through the CoreNLP server.
    edu.stanford.nlp.pipeline.StanfordCoreNLPClient pipeline = new edu.stanford.nlp.pipeline.StanfordCoreNLPClient(_props, NLPConfiguration.CoreNLPAddress, Convert.ToInt32(NLPConfiguration.CoreNLPPort));
    edu.stanford.nlp.pipeline.Annotation document = new edu.stanford.nlp.pipeline.Annotation(rawText);

    // Format the reference date with the invariant culture so that the yyyy-MM-dd text
    // handed to CoreNLP does not depend on the calendar of the current culture.
    DateTime referenceDate = beginDateTime != null ? Convert.ToDateTime(beginDateTime) : DateTime.Now;
    string formateDate = referenceDate.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture);
    document.set(docDateAnnotationClass, formateDate);

    // Run the annotators (including the time expressions).
    pipeline.annotate(document);
    java.util.AbstractList sentences = document.get(sentencesAnnotationClass) as java.util.AbstractList;
    if (sentences == null)
    {
        return null;
    }

    // Create the scenario manager (timeline, scenario, info, etc.); the date is re-parsed
    // from the formatted text so that it carries no time-of-day component.
    DateTime documentDate = DateTime.ParseExact(formateDate, "yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture);
    ScenarioTimeManager stManager = new ScenarioTimeManager(documentDate, rawText);

    // 1. Analyse the time series.
    foreach (edu.stanford.nlp.util.CoreMap sentence in sentences)
    {
        // Look through the entity mentions of the sentence for DATE / TIME entities.
        java.util.AbstractList mentions = sentence.get(mentionsAnnotationClass) as java.util.AbstractList;
        if (mentions != null)
        {
            foreach (edu.stanford.nlp.util.CoreMap anno in mentions)
            {
                DateTime dtime;
                int offset;
                if (TryExtractMentionTimeStamp(anno, out dtime, out offset))
                {
                    stManager.AddTimeStamp(dtime, offset);
                }
            }
        }

        stManager.AddSentence(sentence);
    }

    return stManager;
}

/// <summary>
/// Reads the time stamp and begin-token offset of one entity mention, when the mention is a
/// sufficiently confident DATE or TIME entity.
/// Reference: https://nlp.stanford.edu/pubs/lrec2012-sutime.pdf
/// </summary>
/// <param name="anno">Entity mention to inspect.</param>
/// <param name="timeStamp">Receives the parsed time stamp when the method returns true.</param>
/// <param name="offset">Receives the begin-token offset of the mention when the method returns true.</param>
/// <returns>
/// False when the mention is neither DATE nor TIME, its probability is missing or below
/// <c>CONFIDENCE</c>, required annotations are missing, or it is a DATE whose normalized
/// value contains "T"; otherwise true.
/// </returns>
private bool TryExtractMentionTimeStamp(edu.stanford.nlp.util.CoreMap anno, out DateTime timeStamp, out int offset)
{
    timeStamp = default(DateTime);
    offset = 0;

    string entityType = (string)anno.get(entityTypeAnnotation);
    bool isDate = entityType == "DATE"; // date without time
    bool isTime = entityType == "TIME"; // a time point indicating a particular instance on a time scale
    if (!isDate && !isTime)
    {
        // Other entity types are ignored. DURATION in particular is not handled: an earlier
        // draft converted values of the form P<n>D / PT<n>H into a TimeSpan, but that
        // branch is disabled.
        return false;
    }

    // The probability arrives as a Java number whose text form uses '.' as decimal
    // separator, so it has to be parsed with the invariant culture.
    java.util.HashMap probHash = anno.get(namedEntityTagProbsAnnotation) as java.util.HashMap;
    object rawProb = probHash != null ? probHash.get(entityType) : null;
    if (rawProb == null)
    {
        return false;
    }

    double prob = Convert.ToDouble(rawProb.ToString(), System.Globalization.CultureInfo.InvariantCulture);
    if (prob < CONFIDENCE)
    {
        return false;
    }

    java.lang.Integer tokenBegin = anno.get(tokenBeginAnnotation) as java.lang.Integer; // begin offset
    string normalizedTimex = anno.get(normalizedNamedEntityTagAnnotationClass) as string;
    if (tokenBegin == null || normalizedTimex == null)
    {
        return false;
    }

    if (isDate)
    {
        // Only a DATE without a TIME part is recorded here; it may be a summarising date.
        if (normalizedTimex.Contains("T"))
        {
            return false;
        }

        timeStamp = normalizedTimex.ToDateTime();
    }
    else
    {
        timeStamp = normalizedTimex.TimeExpression().ToDateTime();
    }

    offset = tokenBegin.intValue();
    return true;
}