/// <summary>
/// Measures the longest JiaFang, YiFang, contract name and project name found in the
/// training contracts and writes the maxima to the training log.
/// Lengths are taken after <c>EntityWordAnlayzeTool.TrimEnglish</c> has been applied.
/// </summary>
public static void TraningMaxLenth()
{
    MaxJiaFangLength = 0;
    MaxYiFangLength = 0;
    MaxContractNameLength = 0;
    MaxProjectNameLength = 0;

    foreach (var c in TraningDataset.ContractList)
    {
        var TEJiaFang = EntityWordAnlayzeTool.TrimEnglish(c.JiaFang);
        if (TEJiaFang.Length > MaxJiaFangLength)
        {
            MaxJiaFangLength = TEJiaFang.Length;
            MaxJiaFang = TEJiaFang;
        }

        var TEYiFang = EntityWordAnlayzeTool.TrimEnglish(c.YiFang);
        if (TEYiFang.Length > MaxYiFangLength)
        {
            MaxYiFangLength = TEYiFang.Length;
            MaxYiFang = TEYiFang;
        }

        // The contract-name field is split on "、" and every part is measured separately.
        foreach (var cn in c.ContractName.Split("、"))
        {
            var TEContractName = EntityWordAnlayzeTool.TrimEnglish(cn);
            if (TEContractName.Length > MaxContractNameLength)
            {
                MaxContractNameLength = TEContractName.Length;
                MaxContractName = TEContractName;
            }
        }

        // Same treatment for project names; parts containing a comma are skipped.
        foreach (var jn in c.ProjectName.Split("、"))
        {
            if (jn.Contains(","))
            {
                continue;
            }
            var TEProjectName = EntityWordAnlayzeTool.TrimEnglish(jn);
            // Compare with the project-name maximum. The previous code compared with
            // MaxContractNameLength, so the recorded value was not the true maximum.
            if (TEProjectName.Length > MaxProjectNameLength)
            {
                MaxProjectNameLength = TEProjectName.Length;
                MaxProjectName = TEProjectName;
            }
        }
    }

    Program.Training.WriteLine("最大甲方(除去英语)长度:" + MaxJiaFangLength);
    Program.Training.WriteLine("最大甲方(除去英语):" + MaxJiaFang);
    Program.Training.WriteLine("最大乙方(除去英语)长度:" + MaxYiFangLength);
    Program.Training.WriteLine("最大乙方(除去英语):" + MaxYiFang);
    Program.Training.WriteLine("最大合同(除去英语)长度:" + MaxContractNameLength);
    Program.Training.WriteLine("最大合同(除去英语):" + MaxContractName);
    Program.Training.WriteLine("最大工程(除去英语)长度:" + MaxProjectNameLength);
    Program.Training.WriteLine("最大工程(除去英语):" + MaxProjectName);
    // Sample project names kept from the original notes:
    //   新建北京至石家庄铁路客运专线石家庄枢纽(北京局代建部分)站场工程一个标段
    //   新建大塔至四眼井铁路吴四圪堵至四眼井段站前工程wssg-1标段
}
/// <summary>
/// Builds the stock-change records for this announcement. Table extraction is tried
/// first; when it yields nothing, a single record is assembled from the holder name
/// and the change end date.
/// </summary>
/// <returns>The extracted records; may be empty.</returns>
public List<struStockChange> Extract()
{
    // The result is not used here; the call is kept as in the original.
    LocateDateRange(root);

    var holder = GetHolderName();
    bool hasBothNames = !String.IsNullOrEmpty(holder.FullName) && !String.IsNullOrEmpty(holder.ShortName);
    if (hasBothNames)
    {
        companynamelist.Add(new struCompanyName()
        {
            secFullName = holder.FullName,
            secShortName = holder.ShortName
        });
    }

    List<struStockChange> list = ExtractFromTable();
    //list = ExtractFromTableByContent();
    if (list.Count > 0)
    {
        // Original note: returning the table result directly slightly improves the
        // score (recall-related).
        return list;
    }

    var stockchange = new struStockChange();
    // Announcement id
    stockchange.id = Id;

    stockchange.HolderFullName = holder.FullName.NormalizeTextResult();
    int trimmedLength = EntityWordAnlayzeTool.TrimEnglish(stockchange.HolderFullName).Length;
    if (trimmedLength > ContractTraning.MaxYiFangLength)
    {
        stockchange.HolderFullName = String.Empty;
    }
    stockchange.HolderShortName = holder.ShortName;

    stockchange.ChangeEndDate = GetChangeEndDate(root);
    DateTime parsedDate;
    // An unparseable date is kept only in debug mode.
    if (!DateTime.TryParse(stockchange.ChangeEndDate, out parsedDate) && !Program.IsDebugMode)
    {
        stockchange.ChangeEndDate = String.Empty;
    }

    if (string.IsNullOrEmpty(stockchange.HolderFullName) || string.IsNullOrEmpty(stockchange.ChangeEndDate))
    {
        return list;
    }

    bool looksLikeHolder = !stockchange.HolderFullName.Contains("增持")
                           && !stockchange.HolderFullName.Contains("减持");
    if (looksLikeHolder)
    {
        list.Add(stockchange);
    }
    return list;
}
/// <summary>
/// Looks for the project name in three stages: text following a leading keyword,
/// text inside “…” quotes ending with "标段" or "标", and finally
/// <c>BussinessLogic.GetProjectName</c>.
/// </summary>
/// <param name="root">Parsed HTML root passed to the extractors.</param>
/// <returns>The first accepted candidate (trimmed), or "" when nothing is found.</returns>
static string GetProjectName(MyRootHtmlNode root)
{
    var Extractor = new EntityProperty();

    // Stage 1: text that follows one of these keywords.
    Extractor.LeadingWordList = new string[] { "项目名称:", "工程名称:", "中标项目:", "合同标的:", "工程内容:" };
    Extractor.Extract(root);
    foreach (var candidate in Extractor.CandidateWord)
    {
        var trimmed = candidate.Trim();
        if (EntityWordAnlayzeTool.TrimEnglish(trimmed).Length <= ContractTraning.MaxContractNameLength)
        {
            Program.Logger.WriteLine("项目名称候补词(关键字):[" + candidate + "]");
            return trimmed;
        }
    }

    // Stage 2: quoted text whose content ends with "标段" or "标".
    var sectionMark = new EntityProperty.struMarkFeature();
    sectionMark.MarkStartWith = "“";
    sectionMark.MarkEndWith = "”";
    sectionMark.InnerEndWith = "标段";

    var bidMark = new EntityProperty.struMarkFeature();
    bidMark.MarkStartWith = "“";
    bidMark.MarkEndWith = "”";
    bidMark.InnerEndWith = "标";

    Extractor.MarkFeature = new EntityProperty.struMarkFeature[] { sectionMark, bidMark };
    Extractor.Extract(root);
    foreach (var candidate in Extractor.CandidateWord)
    {
        var trimmed = candidate.Trim();
        if (EntityWordAnlayzeTool.TrimEnglish(trimmed).Length <= ContractTraning.MaxContractNameLength)
        {
            Program.Logger.WriteLine("工程名称候补词(《XXX》):[" + candidate + "]");
            return trimmed;
        }
    }

    // Stage 3: fall back to the business-logic lookup.
    var fallback = BussinessLogic.GetProjectName(root);
    return fallback.Count > 0 ? fallback[0] : "";
}
/// <summary>
/// Runs the surrounding-word analysis of the project name on four fixed
/// major-contract training announcements.
/// </summary>
public static void RunWordAnlayze()
{
    var htmlFolder = Program.DocBase + @"\FDDC_announcements_round1_train_20180518\round1_train_20180518\重大合同\html\";
    var sampleIds = new string[] { "1044779", "1450", "1042224", "917362" };

    foreach (var sampleId in sampleIds)
    {
        var root = HTMLEngine.Anlayze(htmlFolder + sampleId + ".html");
        // The first training record registered for this id is used.
        var sample = TraningDataset.GetContractById(sampleId)[0];
        EntityWordAnlayzeTool.AnlayzeEntitySurroundWords(root, sample.ProjectName);
    }
}
/// <summary>
/// Analyses the entities' own characteristics: for JiaFang, YiFang, contract name and
/// project name in turn, feeds every training value to
/// <c>EntityWordAnlayzeTool.PutEntityWordPerperty</c> and writes the collected
/// statistics to the training log.
/// </summary>
public static void EntityWordPerperty()
{
    // Constructed as in the original although not referenced afterwards.
    // NOTE(review): whether the constructor has side effects is not visible here.
    var segmenter = new PosSegmenter();

    // One report section: heading, reset the tool, feed the values, dump the statistics.
    Action<string, Action> report = (heading, feed) =>
    {
        Program.Training.WriteLine(heading);
        EntityWordAnlayzeTool.Init();
        feed();
        EntityWordAnlayzeTool.WriteFirstAndLengthWordToLog();
    };

    report("甲方统计:", () =>
    {
        foreach (var contract in TraningDataset.ContractList)
        {
            EntityWordAnlayzeTool.PutEntityWordPerperty(contract.JiaFang);
        }
    });

    report("乙方统计:", () =>
    {
        foreach (var contract in TraningDataset.ContractList)
        {
            EntityWordAnlayzeTool.PutEntityWordPerperty(contract.YiFang);
        }
    });

    report("合同统计:", () =>
    {
        foreach (var contract in TraningDataset.ContractList)
        {
            EntityWordAnlayzeTool.PutEntityWordPerperty(contract.ContractName);
        }
    });

    report("工程统计:", () =>
    {
        foreach (var contract in TraningDataset.ContractList)
        {
            EntityWordAnlayzeTool.PutEntityWordPerperty(contract.ProjectName);
        }
    });
}
/// <summary>
/// Ad-hoc smoke test: runs the stock-change, private-placement and major-contract
/// extractors on a fixed set of training announcements and exercises a few helpers
/// (list-number normalization, money parsing, main-word sentence extraction).
/// All announcement paths are resolved under <c>Program.DocBase</c>.
/// </summary>
public static void ContractTest()
{
    var TrainRoot = Program.DocBase + @"\FDDC_announcements_round1_train_20180518\round1_train_20180518";
    var StockChangeHtml = TrainRoot + @"\增减持\html\";
    var IncreaseStockHtml = TrainRoot + @"\定增\html\";
    var ContractHtml = TrainRoot + @"\重大合同\html\";

    StockChange.Extract(StockChangeHtml + "20526193.html");
    StockChange.Extract(StockChangeHtml + "20596890.html");
    StockChange.Extract(StockChangeHtml + "1018217.html");
    StockChange.Extract(StockChangeHtml + "314146.html");

    IncreaseStock.Extract(IncreaseStockHtml + "7880.html");

    // Results are not checked in code; the locals exist for inspection.
    var x1 = Normalizer.NormalizeItemListNumber("(4)2012 年 4 月,公司与中国华西企业股份");
    var x2 = Normalizer.NormalizeItemListNumber("4 、承包方式: 从深化设计、制作、运输、");
    var x3 = Normalizer.NormalizeItemListNumber("4、承包方式: 从深化设计、制作、运输、");

    Contract.Extract(ContractHtml + "1153.html");
    Contract.Extract(ContractHtml + "1008828.html");
    Contract.Extract(ContractHtml + "3620.html");
    Contract.Extract(ContractHtml + "1518.html");
    Contract.Extract(ContractHtml + "1120707.html");
    Contract.Extract(ContractHtml + "1044779.html");
    Contract.Extract(ContractHtml + "1450.html");
    Contract.Extract(ContractHtml + "1042224.html");
    Contract.Extract(ContractHtml + "917362.html");

    // Previously a hard-coded absolute path (E:\WorkSpace2018\...) that only worked on
    // one machine; it now goes through Program.DocBase like every other call here.
    IncreaseStock.Extract(IncreaseStockHtml + "7880.html");

    // Numeric amount test
    var TestString = "中标价为人民币共计16928.79754万元(大写:人民币壹亿陆仟玖佰贰拾捌万柒仟玖佰柒拾伍元肆角整)。";
    var Result = Utility.SeekMoney(TestString);
    //Console.WriteLine(Result.Item1);
    TestString = "安徽盛运环保(集团)股份有限公司";
    //Result = Utility.GetStringBefore(TestString, "有限公司");
    //Console.WriteLine(Result);

    Contract.Extract(ContractHtml + "5258.html");

    var x0 = "在此之前,2003年6月30日,本公司曾与MICROS US和MICROS Singapore(以下简称 “MICROS”)签订了《技术许可与代理协议》,并分别于2005年11月、2006年12月和2007年 10月与MICROS相继签署了第一、二、三次补充协议。";
    var t0 = EntityWordAnlayzeTool.GetMainWordSentence(x0);
    // Sentence pair kept from the original notes (input, then the reduced form):
    //   在此之前,2003年6月30日,本公司曾与MICROS US和MICROS Singapore(以下简称 “MICROS”)签订了《技术许可与代理协议》,并分别于2005年11月、2006年12月和2007年 10月与MICROS相继签署了第一、二、三次补充协议。
    //   在此之前,2003年6月30日,本公司 与MICROS US和MICROS Singapore(以下简称 “MICROS”)签订 《技术许可与代理协议》,并 于2005年11月、2006年12月和2007年 10月与MICROS 签署 第一、二、三次补充协议。
}
/// <summary>
/// Walks every HTML file of the major-contract training set and runs the
/// surrounding-word analysis for the JiaFang value of the matching training record.
/// Files without a training record, or whose record has an empty JiaFang, are skipped.
/// </summary>
public static void AnlayzeEntitySurroundWords()
{
    var ContractPath_TRAIN = Program.DocBase + @"\FDDC_announcements_round1_train_20180518\round1_train_20180518\重大合同";
    Console.WriteLine("前导词:甲方");

    var htmlFiles = System.IO.Directory.GetFiles(ContractPath_TRAIN + @"\html\");
    foreach (var filename in htmlFiles)
    {
        // The announcement id is the file name without its ".html" part.
        var Id = new System.IO.FileInfo(filename).Name.Replace(".html", "");

        var records = TraningDataset.GetContractById(Id);
        if (records.Count > 0)
        {
            var contract = records.First();
            if (contract.JiaFang != "")
            {
                var root = HTMLEngine.Anlayze(filename);
                EntityWordAnlayzeTool.AnlayzeEntitySurroundWords(root, contract.JiaFang);
            }
        }
    }
}
/// <summary>
/// Post-processes a raw company full name: pulls a short name out of a
/// "(以下简称 …)"-style suffix, cuts the suffix and several surrounding words,
/// replaces the name with the registered full name when the lookup finds one,
/// and strips leading tokens.
/// </summary>
/// <param name="FullName">Raw candidate text for a company full name.</param>
/// <returns>
/// A <c>struCompanyName</c> with the cleaned full name; Score is 80 and the short name
/// is set when one was found, otherwise Score is 60.
/// </returns>
public static struCompanyName AfterProcessFullName(string FullName)
{
    var abbreviation = String.Empty;
    var CompanyNameTrailingwords = new string[] { "(以下简称", "(下称", "(以下称", "(简称", "(以下简称", "(下称", "(以下称", "(简称" };

    // Bracket normalization is not done for now.
    foreach (var marker in CompanyNameTrailingwords)
    {
        if (!FullName.Contains(marker))
        {
            continue;
        }

        // Get the short name from quoted text inside the brackets.
        foreach (var bracketItem in RegularTool.GetChineseBrackets(FullName))
        {
            var quoted = RegularTool.GetChineseQuotation(bracketItem);
            if (quoted.Count == 0)
            {
                continue;
            }
            abbreviation = quoted.First();
            if (!String.IsNullOrEmpty(abbreviation))
            {
                // Drops the first and last character — presumably the surrounding
                // quotation marks; confirm against RegularTool.GetChineseQuotation.
                abbreviation = abbreviation.Substring(1, abbreviation.Length - 2);
            }
        }
        FullName = Utility.GetStringBefore(FullName, marker);
    }

    if (FullName.Contains("及其"))
    {
        FullName = Utility.GetStringBefore(FullName, "及其");
    }
    if (FullName.Contains("股东"))
    {
        FullName = Utility.GetStringAfter(FullName, "股东");
    }
    if (FullName.Contains("一致行动人"))
    {
        FullName = Utility.GetStringAfter(FullName, "一致行动人");
    }

    // If the text is a known short name, use the registered full name instead.
    var registeredFullName = CompanyNameLogic.GetCompanyNameByShortName(FullName).secFullName;
    if (!String.IsNullOrEmpty(registeredFullName))
    {
        FullName = registeredFullName;
    }

    // Remove leading tokens.
    FullName = EntityWordAnlayzeTool.TrimLeadingUL(FullName);
    FullName = CutOtherLeadingWords(FullName);

    if (abbreviation == String.Empty)
    {
        return new struCompanyName() { secFullName = FullName, Score = 60 };
    }
    return new struCompanyName() { secFullName = FullName, secShortName = abbreviation, Score = 80 };
}
/// <summary>
/// Gets the project name. Sources are tried in order: leading keywords in the text
/// file, the same keywords in the HTML, quoted strings ending with "工程"/"标段",
/// “…” marks ending with "标段"/"标", and finally dependency-parse keywords.
/// </summary>
/// <returns>The first accepted candidate, or <c>String.Empty</c> when none is found.</returns>
string GetProjectName()
{
    // Text following these keywords (highest priority).
    var ExtractorText = new ExtractPropertyByText();
    ExtractorText.LeadingColonKeyWordList = new string[] { "项目名称:", "工程名称:", "中标项目:", "合同标的:", "工程内容:" };
    ExtractorText.ExtractFromTextFile(TextFileName);
    foreach (var item in ExtractorText.CandidateWord)
    {
        var ProjectName = item.Value.Trim();
        if (EntityWordAnlayzeTool.TrimEnglish(ProjectName).Length > ContractTraning.MaxContractNameLength)
        {
            continue;
        }
        ProjectName = TrimJianCheng(ProjectName);
        if (ProjectName == String.Empty)
        {
            continue;
        }
        if (!Program.IsMultiThreadMode)
        {
            Program.Logger.WriteLine("项目名称候补词(关键字):[" + ProjectName + "]");
        }
        return ProjectName;
    }

    // Same keywords, searched in the HTML.
    var Extractor = new ExtractPropertyByHTML();
    Extractor.LeadingColonKeyWordList = ExtractorText.LeadingColonKeyWordList;
    // The extraction has to run before CandidateWord is read; this call was missing,
    // so the loop below iterated over candidates that were never produced.
    Extractor.Extract(root);
    foreach (var item in Extractor.CandidateWord)
    {
        var ProjectName = item.Value.Trim();
        if (EntityWordAnlayzeTool.TrimEnglish(ProjectName).Length > ContractTraning.MaxContractNameLength)
        {
            continue;
        }
        ProjectName = TrimJianCheng(ProjectName);
        if (ProjectName == String.Empty)
        {
            continue;
        }
        if (!Program.IsMultiThreadMode)
        {
            Program.Logger.WriteLine("项目名称候补词(关键字):[" + ProjectName + "]");
        }
        return ProjectName;
    }

    // Quoted strings already collected for the document.
    foreach (var bracket in quotationList)
    {
        if (bracket.Value.EndsWith("工程") || bracket.Value.EndsWith("标段"))
        {
            return bracket.Value;
        }
    }

    // Text inside “…” whose content ends with "标段" or "标".
    var MarkFeature = new ExtractPropertyByHTML.struMarkFeature();
    MarkFeature.MarkStartWith = "“";
    MarkFeature.MarkEndWith = "”";
    MarkFeature.InnerEndWith = "标段";

    var MarkFeatureConfirm = new ExtractPropertyByHTML.struMarkFeature();
    MarkFeatureConfirm.MarkStartWith = "“";
    MarkFeatureConfirm.MarkEndWith = "”";
    MarkFeatureConfirm.InnerEndWith = "标";

    Extractor.MarkFeature = new ExtractPropertyByHTML.struMarkFeature[] { MarkFeature, MarkFeatureConfirm };
    Extractor.Extract(root);
    foreach (var item in Extractor.CandidateWord)
    {
        var ProjectName = item.Value.Trim();
        if (EntityWordAnlayzeTool.TrimEnglish(ProjectName).Length > ContractTraning.MaxContractNameLength)
        {
            continue;
        }
        if (!Program.IsMultiThreadMode)
        {
            // Log the candidate text itself rather than the candidate object.
            Program.Logger.WriteLine("工程名称候补词(《XXX》):[" + item.Value + "]");
        }
        return ProjectName;
    }

    // Dependency-parse based extraction between a start word and an end word.
    var ExtractDP = new ExtractPropertyByDP();
    var KeyList = new List<ExtractPropertyByDP.DPKeyWord>();
    KeyList.Add(new ExtractPropertyByDP.DPKeyWord()
    {
        StartWord = new string[] { "确定为", "确定", "中标", "参与", "发布", "为" },
        StartDPValue = new string[] { LTPTrainingDP.核心关系, LTPTrainingDP.定中关系, LTPTrainingDP.并列关系 },
        EndWord = new string[] { "采购", "项目", "工程", "标段" },
        EndDPValue = new string[] { }
    });
    ExtractDP.StartWithKey(KeyList, Dplist);
    foreach (var item in ExtractDP.CandidateWord)
    {
        var ProjectName = item.Value.Trim();
        if (EntityWordAnlayzeTool.TrimEnglish(ProjectName).Length > ContractTraning.MaxProjectNameLength)
        {
            continue;
        }
        // Candidates of four characters or fewer are rejected.
        if (ProjectName.Length <= 4)
        {
            continue;
        }
        if (!Program.IsMultiThreadMode)
        {
            Program.Logger.WriteLine("工程候补词:[" + ProjectName + "]");
        }
        return ProjectName;
    }

    return String.Empty;
}
/// <summary>
/// Gets JiaFang (party A). Leading-colon keywords are tried first and win outright;
/// otherwise candidates from bidding phrases and contract-signing phrases are
/// collected and the most company-like one is returned.
/// </summary>
/// <returns>
/// The first leading-keyword candidate when one exists, otherwise the result of
/// <c>CompanyNameLogic.MostLikeCompanyName</c> over the collected candidates.
/// </returns>
public string GetJiaFang()
{
    // Highest-confidence extraction.
    EntityProperty e = new EntityProperty();
    e.ExcludeContainsWordList = new string[] { "招标代理" };
    e.LeadingColonKeyWordList = new string[] {
        "甲方:", "合同买方:",
        "发包人:", "发包单位:", "发包方:", "发包机构:", "发包人名称:",
        "招标人:", "招标单位:", "招标方:", "招标机构:", "招标人名称:",
        "业主:", "业主单位:", "业主方:", "业主机构:", "业主名称:",
        "采购单位:", "采购单位名称:", "采购人:", "采购人名称:", "采购方:", "采购方名称:"
    };
    e.CandidatePreprocess = (x => {
        x = Normalizer.ClearTrailing(x);
        return CompanyNameLogic.AfterProcessFullName(x).secFullName;
    });
    e.MaxLength = ContractTraning.MaxJiaFangLength;
    e.MaxLengthCheckPreprocess = EntityWordAnlayzeTool.TrimEnglish;
    e.MinLength = 3;
    e.Extract(this);

    // No Distinct on the list itself: per the original note, a higher frequency means
    // higher credibility. Several different JiaFang may mean there is none.
    // The log is guarded like every other Logger call in this method, so nothing is
    // written in multi-thread mode.
    if (!Program.IsMultiThreadMode && e.LeadingColonKeyWordCandidate.Distinct().Count() > 1)
    {
        foreach (var candidate in e.LeadingColonKeyWordCandidate)
        {
            Program.Logger.WriteLine("发现多个甲方:" + candidate);
        }
    }
    if (e.LeadingColonKeyWordCandidate.Count > 0)
    {
        return e.LeadingColonKeyWordCandidate[0];
    }

    // Bidding phrases
    var Extractor = new ExtractPropertyByHTML();
    var CandidateWord = new List<String>();
    var StartArray = new string[] { "招标单位", "业主", "收到", "接到" };
    var EndArray = new string[] { "发来", "发出", "的中标" };
    Extractor.StartEndFeature = Utility.GetStartEndStringArray(StartArray, EndArray);
    Extractor.Extract(root);
    foreach (var item in Extractor.CandidateWord)
    {
        var JiaFang = CompanyNameLogic.AfterProcessFullName(item.Value.Trim());
        if (JiaFang.secFullName.Contains("招标代理"))
        {
            continue;   // special business rule
        }
        JiaFang.secFullName = JiaFang.secFullName.Replace("业主", String.Empty).Trim();
        JiaFang.secFullName = JiaFang.secFullName.Replace("招标单位", String.Empty).Trim();
        if (EntityWordAnlayzeTool.TrimEnglish(JiaFang.secFullName).Length > ContractTraning.MaxJiaFangLength)
        {
            continue;
        }
        if (JiaFang.secFullName.Length < 3)
        {
            continue;   // the actual length is used to exclude the all-English case
        }
        if (!Program.IsMultiThreadMode)
        {
            Program.Logger.WriteLine("甲方候补词(招标):[" + JiaFang.secFullName + "]");
        }
        CandidateWord.Add(JiaFang.secFullName);
    }

    // Contract-signing phrases
    Extractor = new ExtractPropertyByHTML();
    StartArray = new string[] { "与", "与业主" };
    EndArray = new string[] { "签署", "签订" };
    Extractor.StartEndFeature = Utility.GetStartEndStringArray(StartArray, EndArray);
    Extractor.Extract(root);
    foreach (var item in Extractor.CandidateWord)
    {
        var JiaFang = CompanyNameLogic.AfterProcessFullName(item.Value.Trim());
        JiaFang.secFullName = JiaFang.secFullName.Replace("业主", String.Empty).Trim();
        if (JiaFang.secFullName.Contains("招标代理"))
        {
            continue;   // special business rule
        }
        if (EntityWordAnlayzeTool.TrimEnglish(JiaFang.secFullName).Length > ContractTraning.MaxJiaFangLength)
        {
            continue;
        }
        if (JiaFang.secFullName.Length < 3)
        {
            continue;   // the actual length is used to exclude the all-English case
        }
        if (!Program.IsMultiThreadMode)
        {
            Program.Logger.WriteLine("甲方候补词(合同):[" + JiaFang.secFullName + "]");
        }
        CandidateWord.Add(JiaFang.secFullName);
    }

    return CompanyNameLogic.MostLikeCompanyName(CandidateWord);
}
/// <summary>
/// Gets the contract name by configuring an <c>EntityProperty</c> extractor
/// (leading keyword, quotation trailing words, dependency-parse keywords,
/// start/end phrases, candidate preprocessors, exclusion lists), running it on this
/// document and returning its confidence-evaluated result.
/// </summary>
/// <returns>The value returned by <c>EntityProperty.EvaluateCI</c>.</returns>
string GetContractName()
{
    var e = new EntityProperty();
    e.PropertyName = "合同名称";
    e.PropertyType = EntityProperty.enmType.NER;
    e.MaxLength = ContractTraning.MaxContractNameLength;
    e.MinLength = 5;

    // In training mode the keyword list could instead be built from
    // ContractTraning.ContractNameLeadingDict (entries with Value >= 40, key + ":").
    e.LeadingColonKeyWordList = new string[] { "合同名称:" };
    e.QuotationTrailingWordList = new string[] { "协议书", "合同书", "确认书", "合同", "协议" };
    e.QuotationTrailingWordList_IsSkipBracket = true;   // for now only true can be chosen

    var dpKeyWord = new ExtractPropertyByDP.DPKeyWord()
    {
        // Obtained through SRL training.
        StartWord = new string[] { "签署", "签订" },
        StartDPValue = new string[] { LTPTrainingDP.核心关系, LTPTrainingDP.定中关系, LTPTrainingDP.并列关系 },
        EndWord = new string[] { "补充协议", "合同书", "合同", "协议书", "协议", },
        EndDPValue = new string[] { LTPTrainingDP.核心关系, LTPTrainingDP.定中关系, LTPTrainingDP.并列关系, LTPTrainingDP.动宾关系, LTPTrainingDP.主谓关系 }
    };
    var KeyList = new List<ExtractPropertyByDP.DPKeyWord>();
    KeyList.Add(dpKeyWord);
    e.DpKeyWordList = KeyList;

    // Obtained through context training.
    var StartArray = new string[] { "签署了", "签订了" };
    var EndArray = new string[] { "合同" };
    e.ExternalStartEndStringFeature = Utility.GetStartEndStringArray(StartArray, EndArray);
    e.ExternalStartEndStringFeatureCandidatePreprocess = x => x + "合同";
    e.MaxLengthCheckPreprocess = str => EntityWordAnlayzeTool.TrimEnglish(str);

    // Highest confidence level: dedicated preprocessor.
    e.LeadingColonKeyWordCandidatePreprocess = str => Normalizer.ClearTrailing(TrimJianCheng(str));

    e.CandidatePreprocess = str =>
    {
        var cleaned = Normalizer.ClearTrailing(TrimJianCheng(str));
        var rightQuoteIdx = cleaned.IndexOf("”");
        // For text like “XXX”合同 (a right quote exists but is not the last character)
        // the leading quote is kept; in every other case it is stripped.
        if (rightQuoteIdx == -1 || rightQuoteIdx == cleaned.Length - 1)
        {
            cleaned = cleaned.TrimStart("“".ToCharArray());
        }
        cleaned = cleaned.TrimStart("《".ToCharArray());
        cleaned = cleaned.TrimEnd("》".ToCharArray());
        cleaned = cleaned.TrimEnd("”".ToCharArray());
        return cleaned;
    };

    e.ExcludeContainsWordList = new string[] { "日常经营重大合同" };
    // Original note: the list below is not well founded.
    e.ExcludeEqualsWordList = new string[] { "合同", "重大合同", "项目合同", "终止协议", "经营合同", "特别重大合同", "相关项目合同" };
    e.Extract(this);

    // Check whether the candidates include the labelled target
    // (not usable on the test set).
    var labelled = TraningDataset.ContractList.Where(x => x.id == this.Id);
    if (labelled.Count() > 0)
    {
        var expectedName = labelled.First().ContractName;
        if (!String.IsNullOrEmpty(expectedName))
        {
            e.CheckIsCandidateContainsTarget(expectedName);
        }
    }

    // Confidence
    e.Confidence = ContractTraning.ContractES.GetStardardCI();
    return e.EvaluateCI();
}
/// <summary>
/// Looks for the contract name in three stages: text inside 《…》 ending with "合同" or
/// "确认书", text following "合同名称:", and text between "签署了" and "合同".
/// </summary>
/// <param name="root">Parsed HTML root passed to the extractors.</param>
/// <returns>The first accepted candidate (trimmed), or "" when nothing is found.</returns>
static string GetContractName(MyRootHtmlNode root)
{
    int maxLength = ContractTraning.MaxContractNameLength;

    // Stage 1: book-title marks.
    var contractMark = new EntityProperty.struMarkFeature();
    contractMark.MarkStartWith = "《";
    contractMark.MarkEndWith = "》";
    contractMark.InnerEndWith = "合同";

    var confirmationMark = new EntityProperty.struMarkFeature();
    confirmationMark.MarkStartWith = "《";
    confirmationMark.MarkEndWith = "》";
    confirmationMark.InnerEndWith = "确认书";

    var Extractor = new EntityProperty();
    Extractor.MarkFeature = new EntityProperty.struMarkFeature[] { contractMark, confirmationMark };
    Extractor.Extract(root);
    foreach (var candidate in Extractor.CandidateWord)
    {
        var trimmed = candidate.Trim();
        if (EntityWordAnlayzeTool.TrimEnglish(trimmed).Length <= maxLength)
        {
            Program.Logger.WriteLine("合同名称候补词(《XXX》):[" + candidate + "]");
            return trimmed;
        }
    }

    // Stage 2: text after the keyword.
    Extractor = new EntityProperty();
    Extractor.LeadingWordList = new string[] { "合同名称:" };
    Extractor.Extract(root);
    foreach (var candidate in Extractor.CandidateWord)
    {
        var trimmed = candidate.Trim();
        if (EntityWordAnlayzeTool.TrimEnglish(trimmed).Length <= maxLength)
        {
            Program.Logger.WriteLine("合同名称候补词(关键字):[" + candidate + "]");
            return trimmed;
        }
    }

    // Stage 3: text between a start phrase and an end phrase.
    Extractor = new EntityProperty();
    var StartArray = new string[] { "签署了" };
    var EndArray = new string[] { "合同" };
    Extractor.StartEndFeature = Utility.GetStartEndStringArray(StartArray, EndArray);
    Extractor.Extract(root);
    foreach (var candidate in Extractor.CandidateWord)
    {
        var trimmed = candidate.Trim();
        if (EntityWordAnlayzeTool.TrimEnglish(trimmed).Length <= maxLength)
        {
            Program.Logger.WriteLine("合同候补词(合同):[" + candidate + "]");
            return trimmed;
        }
    }

    return "";
}
/// <summary>
/// Looks for JiaFang (party A) in three stages: text following a leading keyword,
/// text inside bidding phrases, and text inside contract-signing phrases. The first
/// candidate that passes the length checks is returned.
/// </summary>
/// <param name="root">Parsed HTML root passed to the extractors.</param>
/// <returns>The accepted candidate, or "" when nothing is found.</returns>
static string GetJiaFang(MyRootHtmlNode root)
{
    // Stage 1: text that follows one of these keywords.
    var Extractor = new EntityProperty();
    Extractor.LeadingWordList = new string[] {
        "甲方:",
        "发包人:", "发包单位:", "发包方:", "发包机构:", "发包人名称:",
        "招标人:", "招标单位:", "招标方:", "招标机构:", "招标人名称:",
        "业主:", "业主单位:", "业主方:", "业主机构:", "业主名称:",
        "采购单位:", "采购人:", "采购人名称:", "采购方:"
    };
    Extractor.Extract(root);
    foreach (var candidate in Extractor.CandidateWord)
    {
        var name = AfterProcessJiaFang(candidate.Trim());
        // Too long after removing English, or shorter than 3 characters
        // (the actual length excludes the all-English case): reject.
        if (EntityWordAnlayzeTool.TrimEnglish(name).Length > ContractTraning.MaxJiaFangLength || name.Length < 3)
        {
            continue;
        }
        Program.Logger.WriteLine("甲方候补词(关键字):[" + name + "]");
        return name;
    }

    // Stage 2: bidding phrases.
    Extractor = new EntityProperty();
    var StartArray = new string[] { "招标单位", "业主", "收到", "接到" };
    var EndArray = new string[] { "发来", "发出", "的中标" };
    Extractor.StartEndFeature = Utility.GetStartEndStringArray(StartArray, EndArray);
    Extractor.Extract(root);
    foreach (var candidate in Extractor.CandidateWord)
    {
        var name = AfterProcessJiaFang(candidate.Trim()).Replace("业主", "").Trim();
        if (EntityWordAnlayzeTool.TrimEnglish(name).Length > ContractTraning.MaxJiaFangLength || name.Length < 3)
        {
            continue;
        }
        Program.Logger.WriteLine("甲方候补词(招标):[" + name + "]");
        return name;
    }

    // Stage 3: contract-signing phrases.
    Extractor = new EntityProperty();
    StartArray = new string[] { "与", "与业主" };
    EndArray = new string[] { "签署", "签订" };
    Extractor.StartEndFeature = Utility.GetStartEndStringArray(StartArray, EndArray);
    Extractor.Extract(root);
    foreach (var candidate in Extractor.CandidateWord)
    {
        var name = AfterProcessJiaFang(candidate.Trim()).Replace("业主", "").Trim();
        if (EntityWordAnlayzeTool.TrimEnglish(name).Length > ContractTraning.MaxJiaFangLength || name.Length < 3)
        {
            continue;
        }
        Program.Logger.WriteLine("甲方候补词(合同):[" + name + "]");
        return name;
    }

    return "";
}
/// <summary>
/// Measures the longest JiaFang, YiFang, contract name and project name found in the
/// training contracts (lengths taken after <c>EntityWordAnlayzeTool.TrimEnglish</c>),
/// tracks the smallest parseable contract upper-limit amount, and writes all values to
/// the training log.
/// </summary>
public static void TraningMaxLenth()
{
    MaxJiaFangLength = 0;
    MaxYiFangLength = 0;
    MaxContractNameLength = 0;
    MaxProjectNameLength = 0;

    foreach (var c in TraningDataset.ContractList)
    {
        var TEJiaFang = EntityWordAnlayzeTool.TrimEnglish(c.JiaFang);
        if (TEJiaFang.Length > MaxJiaFangLength)
        {
            MaxJiaFangLength = TEJiaFang.Length;
            MaxJiaFang = TEJiaFang;
        }

        var TEYiFang = EntityWordAnlayzeTool.TrimEnglish(c.YiFang);
        if (TEYiFang.Length > MaxYiFangLength)
        {
            MaxYiFangLength = TEYiFang.Length;
            MaxYiFang = TEYiFang;
        }

        // The contract-name field is split on "、" and every part is measured separately.
        foreach (var cn in c.ContractName.Split("、"))
        {
            var TEContractName = EntityWordAnlayzeTool.TrimEnglish(cn);
            if (TEContractName.Length > MaxContractNameLength)
            {
                MaxContractNameLength = TEContractName.Length;
                MaxContractName = TEContractName;
            }
        }

        // MinAmount is only lowered here, never reset.
        // NOTE(review): its starting value is set outside this method — confirm.
        if (!string.IsNullOrEmpty(c.ContractMoneyUpLimit))
        {
            var m = 0.0;
            if (double.TryParse(c.ContractMoneyUpLimit, out m) && m < MinAmount)
            {
                MinAmount = m;
            }
        }

        // Same treatment for project names; parts containing a comma are skipped.
        foreach (var jn in c.ProjectName.Split("、"))
        {
            if (jn.Contains(","))
            {
                continue;
            }
            var TEProjectName = EntityWordAnlayzeTool.TrimEnglish(jn);
            // Compare with the project-name maximum. The previous code compared with
            // MaxContractNameLength, so the recorded value was not the true maximum.
            if (TEProjectName.Length > MaxProjectNameLength)
            {
                MaxProjectNameLength = TEProjectName.Length;
                MaxProjectName = TEProjectName;
            }
        }
    }

    Program.Training.WriteLine("最大甲方(除去英语)长度:" + MaxJiaFangLength);
    Program.Training.WriteLine("最大甲方(除去英语):" + MaxJiaFang);
    Program.Training.WriteLine("最大乙方(除去英语)长度:" + MaxYiFangLength);
    Program.Training.WriteLine("最大乙方(除去英语):" + MaxYiFang);
    Program.Training.WriteLine("最大合同(除去英语)长度:" + MaxContractNameLength);
    Program.Training.WriteLine("最大合同(除去英语):" + MaxContractName);
    Program.Training.WriteLine("最大工程(除去英语)长度:" + MaxProjectNameLength);
    Program.Training.WriteLine("最大工程(除去英语):" + MaxProjectName);
    Program.Training.WriteLine("最小金额:" + MinAmount);
}