/// <summary>
/// Gets the project name for the current document.
/// </summary>
/// <remarks>
/// Runs the extractor twice: first with colon keywords and quotation trailing words only,
/// then again with additional start/end context strings. If both passes produce nothing,
/// the quotation list is searched for a bid-candidate announcement title.
/// </remarks>
/// <returns>The project name, or an empty string when none is found.</returns>
string GetProjectName()
{
    var extractor = new EntityProperty
    {
        PropertyName = "工程名称",
        LeadingColonKeyWordList = new string[] { "项目名称:", "工程名称:", "中标项目:", "合同标的:", "工程内容:" },
        LeadingColonKeyWordCandidatePreprocess = TrimEndJianCheng,
        QuotationTrailingWordList_IsSkipBracket = true,
        QuotationTrailingWordList = new string[]
        {
            "标段施工项目", "标段土建工程", "标段施工总承包", "标段的工程", "标段工程", "标段施工总价承包",
            "标段施工总承包工程", "标段施工工程", "标段土建工程建设项目", "标段站前工程", "标段工程(施工)",
            "工程施工工程", "项目施工工程", "施工工程", "工程项目", "工程标段", "标段的施工项目", "标段项目",
            "标段施工", "招标采购项目", "招标活动", "采购活动", "招标项目", "项目", "采购", "总承包", "工程",
            "标段", "标",
        }
    };

    // First pass: colon keywords and quotation trailing words only.
    extractor.Extract(this);
    var firstPass = extractor.EvaluateCI();
    if (!String.IsNullOrEmpty(firstPass))
    {
        return firstPass;
    }

    // Second pass: the same extractor, now also configured with start/end context strings.
    extractor.ExternalStartEndStringFeature = Utility.GetStartEndStringArray(
        new string[] { "公司为", "参与了", "确定为" },
        new string[] { "的中标单位", "的公开招投标", "的中标人", "候选人" });
    extractor.Extract(this);
    var secondPass = extractor.EvaluateCI();
    if (!String.IsNullOrEmpty(secondPass))
    {
        // Prefer the candidate with a "项目" suffix when that longer form is found at least once.
        var withSuffix = secondPass + "项目";
        return ExtractPropertyByHTML.FindWordCnt(withSuffix, root).Count >= 1 ? withSuffix : secondPass;
    }

    // Last resort: the text in front of the announcement title inside a quotation.
    const string announcementTitle = "推荐的中标候选人公示";
    foreach (var quotation in quotationList)
    {
        if (quotation.Value.Contains(announcementTitle))
        {
            return Utility.GetStringBefore(quotation.Value, announcementTitle);
        }
    }

    return string.Empty;
}
/// <summary>
/// Gets the project name from the given document root, trying three sources in order:
/// leading keywords, quoted marks, and finally <c>BussinessLogic.GetProjectName</c>.
/// </summary>
/// <param name="root">The document root passed to the extractor.</param>
/// <returns>The first acceptable candidate (trimmed), or an empty string.</returns>
static string GetProjectName(MyRootHtmlNode root)
{
    var extractor = new EntityProperty();

    // Source 1: candidates produced by the leading keywords.
    extractor.LeadingWordList = new string[] { "项目名称:", "工程名称:", "中标项目:", "合同标的:", "工程内容:" };
    extractor.Extract(root);
    foreach (var candidate in extractor.CandidateWord)
    {
        var trimmed = candidate.Trim();
        // Length is measured after TrimEnglish; over-long candidates are skipped.
        if (EntityWordAnlayzeTool.TrimEnglish(trimmed).Length <= ContractTraning.MaxContractNameLength)
        {
            Program.Logger.WriteLine("项目名称候补词(关键字):[" + candidate + "]");
            return trimmed;
        }
    }

    // Source 2: the same extractor, additionally configured with two “…” mark features
    // whose inner text ends with "标段" or "标".
    extractor.MarkFeature = new EntityProperty.struMarkFeature[]
    {
        new EntityProperty.struMarkFeature { MarkStartWith = "“", MarkEndWith = "”", InnerEndWith = "标段" },
        new EntityProperty.struMarkFeature { MarkStartWith = "“", MarkEndWith = "”", InnerEndWith = "标" }
    };
    extractor.Extract(root);
    foreach (var candidate in extractor.CandidateWord)
    {
        var trimmed = candidate.Trim();
        if (EntityWordAnlayzeTool.TrimEnglish(trimmed).Length <= ContractTraning.MaxContractNameLength)
        {
            Program.Logger.WriteLine("工程名称候补词(《XXX》):[" + candidate + "]");
            return trimmed;
        }
    }

    // Source 3: first entry returned by the business-logic helper.
    var fallback = BussinessLogic.GetProjectName(root);
    return fallback.Count > 0 ? fallback[0] : "";
}
/// <summary>
/// Gets the project name for the current document.
/// </summary>
/// <remarks>
/// Configures colon keywords, quotation trailing words and start/end context strings,
/// runs a single extraction, and falls back to the quotation list when the evaluation is empty.
/// </remarks>
/// <returns>The project name, or an empty string when none is found.</returns>
string GetProjectName()
{
    var extractor = new EntityProperty
    {
        PropertyName = "工程名称",
        LeadingColonKeyWordList = new string[] { "项目名称:", "工程名称:", "中标项目:", "合同标的:", "工程内容:" },
        LeadingColonKeyWordCandidatePreprocess = TrimEndJianCheng,
        QuotationTrailingWordList = new string[] { "工程", "标段", "标", "招标活动", "项目", "采购" },
        ExternalStartEndStringFeature = Utility.GetStartEndStringArray(
            new string[] { "公司为", "参与了", "确定为" },
            new string[] { "的中标单位", "的公开招投标", "的中标人", "候选人" })
    };
    extractor.Extract(this);

    var evaluated = extractor.EvaluateCI();
    if (!String.IsNullOrEmpty(evaluated))
    {
        return evaluated;
    }

    // Fallback: the text in front of the announcement title inside a quotation.
    const string announcementTitle = "推荐的中标候选人公示";
    foreach (var quotation in quotationList)
    {
        if (quotation.Value.Contains(announcementTitle))
        {
            return Utility.GetStringBefore(quotation.Value, announcementTitle);
        }
    }

    return string.Empty;
}
/// <summary>
/// Gets the contract amount from the text following the amount keywords.
/// </summary>
/// <param name="root">The document root passed to the extractor.</param>
/// <returns>
/// The amount (Item1) of the first entry whose Item2 is "人民币" or "元"; otherwise the amount of the
/// first entry found; an empty string when <c>Utility.SeekMoney</c> yields nothing.
/// </returns>
static string GetMoney(HTMLEngine.MyRootHtmlNode root)
{
    var extractor = new EntityProperty
    {
        // Candidates are taken from the text after these keywords.
        LeadingWordList = new string[] { "中标金额", "中标价", "合同金额", "合同总价", "订单总金额" }
    };
    extractor.Extract(root);

    var amounts = new List<Tuple<String, String>>();
    foreach (var candidate in extractor.CandidateWord)
    {
        amounts.AddRange(Utility.SeekMoney(candidate));
    }
    if (amounts.Count == 0)
    {
        return "";
    }

    // Start from the first entry; only the first "人民币"/"元" entry may override it,
    // and only when its amount is not the empty string.
    var chosen = amounts[0].Item1;
    foreach (var amount in amounts)
    {
        if (amount.Item2 == "人民币" || amount.Item2 == "元")
        {
            if (amount.Item1 != "")
            {
                chosen = amount.Item1;
            }
            break;
        }
    }

    Program.Logger.WriteLine("金额候补词:[" + chosen + "]");
    return chosen;
}
/// <summary>
/// Gets the contract name for the current document.
/// </summary>
/// <returns>The value returned by <c>EvaluateCI()</c> after extraction.</returns>
string GetContractName()
{
    var extractor = new EntityProperty
    {
        PropertyName = "合同名称",
        PropertyType = EntityProperty.enmType.NER,
        MaxLength = ContractTraning.ContractES.MaxLength,
        MinLength = ContractTraning.ContractES.MinLength,
        LeadingColonKeyWordList = new string[] { "合同名称:" },
        QuotationTrailingWordList = new string[] { "协议书", "合同书", "确认书", "合同", "协议" },
        // Only true is supported for now.
        QuotationTrailingWordList_IsSkipBracket = true
    };

    // Dependency-parsing key words.
    var signingRule = new ExtractPropertyByDP.DPKeyWord
    {
        StartWord = new string[] { "签署", "签订" },
        // Obtained from SRL training.
        StartDPValue = new string[] { LTPTrainingDP.核心关系, LTPTrainingDP.定中关系, LTPTrainingDP.并列关系 },
        EndWord = new string[] { "补充协议", "合同书", "合同", "协议书", "协议", },
        EndDPValue = new string[]
        {
            LTPTrainingDP.核心关系, LTPTrainingDP.定中关系, LTPTrainingDP.并列关系,
            LTPTrainingDP.动宾关系, LTPTrainingDP.主谓关系
        }
    };
    extractor.DpKeyWordList = new List<ExtractPropertyByDP.DPKeyWord> { signingRule };

    // Start/end context strings (obtained from context training); the end word "合同"
    // is appended back onto each such candidate.
    extractor.ExternalStartEndStringFeature = Utility.GetStartEndStringArray(
        new string[] { "签署了", "签订了" },
        new string[] { "合同" });
    extractor.ExternalStartEndStringFeatureCandidatePreprocess = text => text + "合同";

    // The maximum-length check is applied to the result of TrimEnglish.
    extractor.MaxLengthCheckPreprocess = text => Utility.TrimEnglish(text);

    // Highest confidence level: dedicated preprocessor for colon-keyword candidates.
    extractor.LeadingColonKeyWordCandidatePreprocess = text => Normalizer.ClearTrailing(TrimEndJianCheng(text));

    extractor.CandidatePreprocess = text =>
    {
        var cleaned = Normalizer.ClearTrailing(TrimEndJianCheng(text));
        var closingQuoteAt = cleaned.IndexOf("”");
        // For a name such as “XXX”合同 the closing quote exists but is not the last character;
        // in that case the leading quote is kept.
        var closingQuoteInside = closingQuoteAt != -1 && closingQuoteAt != cleaned.Length - 1;
        if (!closingQuoteInside)
        {
            cleaned = cleaned.TrimStart('“');
        }
        return cleaned.TrimStart('《').TrimEnd('》').TrimEnd('”');
    };

    extractor.ExcludeContainsWordList = new string[] { "日常经营重大合同" };
    // The basis for the list below is weak; the proper approach would be to detect 【尚未签署】.
    extractor.ExcludeEqualsWordList = new string[]
    {
        "若干项重大合同", "中标合同", "正式合同", "合同", "重大合同", "项目合同", "终止协议",
        "经营合同", "特别重大合同", "相关项目合同"
    };

    extractor.Extract(this);
    // Colon keywords take priority.
    return extractor.EvaluateCI();
}
/// <summary>
/// Gets the contract name for the current document.
/// </summary>
/// <returns>The value returned by <c>EvaluateCI()</c> after extraction.</returns>
string GetContractName()
{
    var extractor = new EntityProperty
    {
        PropertyName = "合同名称",
        PropertyType = EntityProperty.enmType.NER,
        MaxLength = 200,
        MinLength = 4,
        LeadingColonKeyWordList = new string[] { "合同名称:" },
        // NOTE(review): "补充协议" is listed before the longer "经营合同补充协议"; if matching is
        // order-sensitive the longer entry may never be preferred - confirm against EntityProperty.
        QuotationTrailingWordList = new string[]
        {
            "商务合同补充协议", "承包合同补充协议", "补充协议", "经营合同补充协议",
            "协议书", "合同书", "确认书", "合同", "协议"
        },
        QuotationTrailingWordList_IsSkipBracket = false
    };

    // Start/end context strings (obtained from context training); the end word "合同"
    // is appended back onto each such candidate.
    extractor.ExternalStartEndStringFeature = Utility.GetStartEndStringArray(
        new string[] { "签署了", "签订了" },
        new string[] { "合同" });
    extractor.ExternalStartEndStringFeatureCandidatePreprocess = text => text + "合同";

    // The maximum-length check is applied to the result of TrimEnglish.
    extractor.MaxLengthCheckPreprocess = text => Utility.TrimEnglish(text);

    // Highest confidence level: dedicated preprocessor for colon-keyword candidates.
    extractor.LeadingColonKeyWordCandidatePreprocess = text => Normalizer.ClearTrailing(TrimEndJianCheng(text));

    extractor.CandidatePreprocess = text =>
    {
        var cleaned = Normalizer.ClearTrailing(TrimEndJianCheng(text));
        var closingQuoteAt = cleaned.IndexOf("”");
        // For a name such as “XXX”合同 the closing quote exists but is not the last character;
        // in that case the leading quote is kept.
        var closingQuoteInside = closingQuoteAt != -1 && closingQuoteAt != cleaned.Length - 1;
        if (!closingQuoteInside)
        {
            cleaned = cleaned.TrimStart('“');
        }
        return cleaned.TrimStart('《').TrimEnd('》').TrimEnd('”');
    };

    extractor.ExcludeContainsWordList = new string[] { "日常经营重大合同" };
    // The basis for the list below is weak; the proper approach would be to detect 【尚未签署】.
    extractor.ExcludeEqualsWordList = new string[]
    {
        "若干项重大合同", "中标合同", "正式合同", "合同", "重大合同", "项目合同", "终止协议",
        "经营合同", "特别重大合同", "相关项目合同"
    };

    extractor.Extract(this);
    // Colon keywords take priority.
    return extractor.EvaluateCI();
}
/// <summary>
/// Gets the subscription method.
/// </summary>
/// <param name="root">
/// Not read by this method; extraction runs against <c>this</c>.
/// NOTE(review): confirm whether <paramref name="root"/> was meant to be used.
/// </param>
/// <returns>The entries of <c>WordMapResult</c> joined with <c>Utility.SplitChar</c>.</returns>
string getBuyMethod(HTMLEngine.MyRootHtmlNode root)
{
    var property = new EntityProperty();
    // Keyword "现金认购" is mapped to the value "现金".
    property.KeyWordMap.Add("现金认购", "现金");
    property.Extract(this);

    var joined = string.Join(Utility.SplitChar, property.WordMapResult);
    if (!Program.IsMultiThreadMode)
    {
        Program.Logger.WriteLine("认购方式:" + joined);
    }
    return joined;
}
/// <summary>
/// Gets the company's full name from text that precedes the trailing word "公司董事会".
/// </summary>
/// <remarks>
/// The candidate list is reversed before the first entry is taken. The suffix "公司" is appended
/// to the candidate so that the returned value matches the full name written to the log;
/// previously the log showed the name with "公司" while the method returned it without.
/// </remarks>
/// <param name="root">The document root passed to the extractor.</param>
/// <returns>The candidate (trimmed) followed by "公司", or an empty string when there is none.</returns>
public static string GetCompanyFullName(HTMLEngine.MyRootHtmlNode root)
{
    var extractor = new EntityProperty();
    extractor.TrailingWordList = new string[] { "公司董事会" };
    extractor.Extract(root);
    extractor.CandidateWord.Reverse();

    foreach (var candidate in extractor.CandidateWord)
    {
        // Return exactly what is logged, mirroring how GetYiFang re-appends its suffix.
        var fullName = candidate.Trim() + "公司";
        Program.Logger.WriteLine("全称:[" + fullName + "]");
        return fullName;
    }

    return "";
}
/// <summary>
/// Gets the end date of the change period.
/// </summary>
/// <param name="root">The document root passed to the extractor.</param>
/// <returns>
/// The first candidate with "日" appended, passed through <c>Normalizer.NormailizeDate</c>;
/// an empty string when there is no candidate.
/// </returns>
static string GetChangeEndDate(HTMLEngine.MyRootHtmlNode root)
{
    var extractor = new EntityProperty
    {
        StartEndFeature = Utility.GetStartEndStringArray(
            new string[] { "截止", "截至" },
            new string[] { "日" })
    };
    extractor.Extract(root);

    foreach (var candidate in extractor.CandidateWord)
    {
        Program.Logger.WriteLine("候补变动截止日期:[" + candidate + "]");
        // The end word "日" is added back before normalizing.
        return Normalizer.NormailizeDate(candidate + "日");
    }

    return "";
}
/// <summary>
/// Gets the subscription method.
/// </summary>
/// <param name="root">
/// Not read by this method; extraction runs against <c>this</c>.
/// NOTE(review): confirm whether <paramref name="root"/> was meant to be used.
/// </param>
/// <returns>The extractor's <c>WordMapResult</c>.</returns>
string getBuyMethod(HTMLEngine.MyRootHtmlNode root)
{
    var property = new EntityProperty();
    // Keyword "现金认购" is mapped to the value "现金".
    property.KeyWordMap.Add("现金认购", "现金");
    property.Extract(this);

    // Log only when there is a result and multi-thread mode is off.
    var shouldLog = !String.IsNullOrEmpty(property.WordMapResult) && !Program.IsMultiThreadMode;
    if (shouldLog)
    {
        Program.Logger.WriteLine("认购方式:" + property.WordMapResult);
    }

    return property.WordMapResult;
}
/// <summary>
/// Gets the company's short name from the text following "股票简称" or "证券简称"
/// (fixed collocation, e.g. "股票简称:东方电气").
/// </summary>
/// <remarks>
/// Only the first candidate is used. Colons are removed, the text is trimmed, and it is then
/// cut in front of each delimiter in turn whenever <c>Utility.GetStringBefore</c> returns a
/// non-empty head.
/// </remarks>
/// <param name="root">The document root passed to the extractor.</param>
/// <returns>The cleaned short name, or an empty string when there is no candidate.</returns>
public static string GetCompanyShortName(HTMLEngine.MyRootHtmlNode root)
{
    var extractor = new EntityProperty();
    extractor.LeadingWordList = new string[] { "股票简称", "证券简称" };
    extractor.Extract(root);

    // Applied in this order; each cut works on the result of the previous one.
    var delimiters = new string[] { "、", ")", "公告", "股票", "证券", " " };

    foreach (var candidate in extractor.CandidateWord)
    {
        // The original removed the ASCII colon twice (the second call was a no-op);
        // strip the full-width colon as well so neither form leaks into the name.
        var shortName = candidate.Replace(":", "").Replace(":", "").Trim();

        foreach (var delimiter in delimiters)
        {
            var head = Utility.GetStringBefore(shortName, delimiter);
            if (head != "")
            {
                shortName = head;
            }
        }

        FDDC.Program.Logger.WriteLine("简称:[" + shortName + "]");
        return shortName;
    }

    return "";
}
/// <summary>
/// Gets party B (乙方) of the contract.
/// </summary>
/// <remarks>
/// Order of preference: the first leading-keyword candidate; then, if a "有限公司董事会"
/// trailing-word candidate exists, the first subsidiary in <c>companynamelist</c> or else that
/// candidate with "有限公司" appended; finally the last entry of <c>companynamelist</c>.
/// </remarks>
/// <param name="root">The document root passed to the extractor.</param>
/// <returns>The party B name, or an empty string when nothing is available.</returns>
static string GetYiFang(HTMLEngine.MyRootHtmlNode root)
{
    // Candidates following these labels. Labels currently not enabled:
    // "中标单位:", "中标人:", "乙方(供方):", "承包人:", "承包方:", "中标方:", "中标人名称:".
    var labelExtractor = new EntityProperty();
    labelExtractor.LeadingWordList = new string[] { "供应商名称:", "乙方:" };
    labelExtractor.Extract(root);
    foreach (var candidate in labelExtractor.CandidateWord)
    {
        Program.Logger.WriteLine("乙方候补词(关键字):[" + candidate + "]");
        return candidate.Trim();
    }

    // Candidates in front of "有限公司董事会"; the list is reversed before the first entry is taken.
    var boardExtractor = new EntityProperty();
    boardExtractor.TrailingWordList = new string[] { "有限公司董事会" };
    boardExtractor.Extract(root);
    boardExtractor.CandidateWord.Reverse();
    foreach (var candidate in boardExtractor.CandidateWord)
    {
        // When a subsidiary is known, it takes precedence over the extracted candidate.
        foreach (var company in companynamelist)
        {
            if (company.isSubCompany)
            {
                return company.secFullName;
            }
        }

        Program.Logger.WriteLine("乙方候补词(关键字):[" + candidate + "有限公司]");
        return candidate.Trim() + "有限公司";
    }

    var knownCompanies = companynamelist.Count;
    return knownCompanies > 0 ? companynamelist[knownCompanies - 1].secFullName : "";
}
/// <summary>
/// Gets the shareholder's full name from text between the configured start and end strings.
/// </summary>
/// <param name="root">The document root passed to the extractor.</param>
/// <returns>
/// The first candidate containing "简称" if there is one; otherwise the first candidate;
/// an empty string when there are no candidates.
/// </returns>
static string GetHolderFullName(HTMLEngine.MyRootHtmlNode root)
{
    var extractor = new EntityProperty
    {
        StartEndFeature = Utility.GetStartEndStringArray(
            new string[] { "接到", "收到", "股东" },
            new string[] { "的", "通知", "告知函", "减持", "增持", "《" })
    };
    extractor.Extract(root);

    var candidates = extractor.CandidateWord;
    foreach (var candidate in candidates)
    {
        if (!candidate.Contains("简称"))
        {
            continue;
        }
        Program.Logger.WriteLine("候补股东全称修正:[" + candidate + "]");
        return candidate;
    }

    return candidates.Count > 0 ? candidates[0] : "";
}
/// <summary>
/// Gets party A (甲方) of the contract.
/// </summary>
/// <remarks>
/// 1. Colon-keyword candidates win outright: the first one is returned.
/// 2. Otherwise the result of <c>SearchJiaFang()</c> is kept as the NER party A, unless it
///    equals <paramref name="YiFang"/>; short names are mapped to full names via <c>companynamelist</c>.
/// 3. Two start/end extractions (bidding wording, contract wording) collect further candidates.
/// 4. With an NER party A, a collected candidate ending with it is preferred, else the NER value;
///    without one, <c>CompanyNameLogic.MostLikeCompanyName</c> picks from the collected candidates.
/// </remarks>
/// <param name="YiFang">
/// Party B, used to reject an NER result identical to it. May be null or empty; the original
/// code threw <see cref="NullReferenceException"/> for null because <c>YiFang.Equals</c> was
/// called unconditionally.
/// </param>
/// <returns>The selected party A name.</returns>
string GetJiaFang(String YiFang)
{
    const int maxNameLength = 32;
    const int minNameLength = 3;

    // Highest-confidence extraction: labelled values.
    EntityProperty e = new EntityProperty();
    e.ExcludeContainsWordList = new string[] { "招标代理" };
    e.LeadingColonKeyWordList = new string[]
    {
        "甲方:", "合同买方:", "发包人:", "发包单位:", "发包方:", "发包机构:", "发包人名称:",
        "招标人:", "招标单位:", "招标方:", "招标机构:", "招标人名称:", "项目招标人:", "业主:",
        "业主单位:", "业主方:", "业主机构:", "业主名称:", "采购单位:", "采购单位名称:", "采购人:",
        "采购人名称:", "采购方:", "采购方名称:"
    };
    e.CandidatePreprocess = x => CompanyNameLogic.AfterProcessFullName(Normalizer.ClearTrailing(x)).secFullName;
    e.MaxLength = maxNameLength;
    e.MaxLengthCheckPreprocess = Utility.TrimEnglish;
    e.MinLength = minNameLength;
    e.Extract(this);

    // Distinct is deliberately not applied to the list itself: the more often a value occurs,
    // the more credible it is. Several different values may mean there is no party A at all.
    if (e.LeadingColonKeyWordCandidate.Distinct().Count() > 1)
    {
        foreach (var candidate in e.LeadingColonKeyWordCandidate)
        {
            Program.Logger.WriteLine("发现多个甲方:" + candidate);
        }
    }
    if (e.LeadingColonKeyWordCandidate.Count > 0)
    {
        return e.LeadingColonKeyWordCandidate[0];
    }

    var ner = SearchJiaFang();
    var NerJia = String.Empty;
    if (!String.IsNullOrEmpty(ner))
    {
        foreach (var cn in companynamelist)
        {
            if (cn.secShortName == ner)
            {
                ner = cn.secFullName;
            }
        }

        // Accept the NER result unless it is identical to party B. The null/empty test must
        // guard the Equals call, otherwise a null YiFang raises NullReferenceException.
        if (String.IsNullOrEmpty(YiFang) || !YiFang.Equals(ner))
        {
            NerJia = ner;
        }
    }

    var CandidateWord = new List<String>();

    // Bidding wording.
    var Extractor = new ExtractPropertyByHTML();
    var StartArray = new string[] { "招标单位", "业主", "收到", "接到" };
    var EndArray = new string[] { "发来", "发出", "的中标" };
    Extractor.StartEndFeature = Utility.GetStartEndStringArray(StartArray, EndArray);
    Extractor.Extract(root);
    foreach (var item in Extractor.CandidateWord)
    {
        var JiaFang = CompanyNameLogic.AfterProcessFullName(item.Value.Trim());
        if (JiaFang.secFullName.Contains("招标代理"))
        {
            continue; // special business rule
        }
        JiaFang.secFullName = JiaFang.secFullName.Replace("业主", String.Empty).Trim();
        JiaFang.secFullName = JiaFang.secFullName.Replace("招标单位", String.Empty).Trim();
        if (Utility.TrimEnglish(JiaFang.secFullName).Length > maxNameLength)
        {
            continue;
        }
        if (JiaFang.secFullName.Length < minNameLength)
        {
            continue; // the actual length is used here to rule out all-English values
        }
        if (!Program.IsMultiThreadMode)
        {
            Program.Logger.WriteLine("甲方候补词(招标):[" + JiaFang.secFullName + "]");
        }
        CandidateWord.Add(JiaFang.secFullName);
    }

    // Contract wording.
    Extractor = new ExtractPropertyByHTML();
    StartArray = new string[] { "与", "与业主" };
    EndArray = new string[] { "签署", "签订" };
    Extractor.StartEndFeature = Utility.GetStartEndStringArray(StartArray, EndArray);
    Extractor.Extract(root);
    foreach (var item in Extractor.CandidateWord)
    {
        var JiaFang = CompanyNameLogic.AfterProcessFullName(item.Value.Trim());
        JiaFang.secFullName = JiaFang.secFullName.Replace("业主", String.Empty).Trim();
        if (JiaFang.secFullName.Contains("招标代理"))
        {
            continue; // special business rule
        }
        if (Utility.TrimEnglish(JiaFang.secFullName).Length > maxNameLength)
        {
            continue;
        }
        if (JiaFang.secFullName.Length < minNameLength)
        {
            continue; // the actual length is used here to rule out all-English values
        }
        if (!Program.IsMultiThreadMode)
        {
            Program.Logger.WriteLine("甲方候补词(合同):[" + JiaFang.secFullName + "]");
        }
        CandidateWord.Add(JiaFang.secFullName);
    }

    if (String.IsNullOrEmpty(NerJia))
    {
        return CompanyNameLogic.MostLikeCompanyName(CandidateWord);
    }

    // In principle the NER party A is used. A collected candidate may still be the better
    // answer, e.g. NER gives "(集团)有限公司" while the real name is "XXXX(集团)有限公司".
    foreach (var c in CandidateWord)
    {
        if (c.EndsWith(NerJia))
        {
            return c;
        }
    }
    return NerJia;
}
/// <summary>
/// Gets party A (甲方) from the given document root, trying three sources in order:
/// labelled values, bidding wording, and contract wording.
/// </summary>
/// <remarks>
/// In every stage a candidate is accepted only if its length after <c>TrimEnglish</c> does not
/// exceed <c>ContractTraning.MaxJiaFangLength</c> and its actual length is at least 3.
/// </remarks>
/// <param name="root">The document root passed to the extractor.</param>
/// <returns>The first accepted candidate, or an empty string.</returns>
static string GetJiaFang(MyRootHtmlNode root)
{
    // Stage 1: text following one of these labels.
    var extractor = new EntityProperty();
    extractor.LeadingWordList = new string[]
    {
        "甲方:", "发包人:", "发包单位:", "发包方:", "发包机构:", "发包人名称:", "招标人:",
        "招标单位:", "招标方:", "招标机构:", "招标人名称:", "业主:", "业主单位:", "业主方:",
        "业主机构:", "业主名称:", "采购单位:", "采购人:", "采购人名称:", "采购方:"
    };
    extractor.Extract(root);
    foreach (var candidate in extractor.CandidateWord)
    {
        var name = AfterProcessJiaFang(candidate.Trim());
        var tooLong = EntityWordAnlayzeTool.TrimEnglish(name).Length > ContractTraning.MaxJiaFangLength;
        // The actual length is used for the lower bound to rule out all-English values.
        if (tooLong || name.Length < 3)
        {
            continue;
        }
        Program.Logger.WriteLine("甲方候补词(关键字):[" + name + "]");
        return name;
    }

    // Stage 2: bidding wording.
    extractor = new EntityProperty();
    extractor.StartEndFeature = Utility.GetStartEndStringArray(
        new string[] { "招标单位", "业主", "收到", "接到" },
        new string[] { "发来", "发出", "的中标" });
    extractor.Extract(root);
    foreach (var candidate in extractor.CandidateWord)
    {
        var name = AfterProcessJiaFang(candidate.Trim()).Replace("业主", "").Trim();
        var tooLong = EntityWordAnlayzeTool.TrimEnglish(name).Length > ContractTraning.MaxJiaFangLength;
        if (tooLong || name.Length < 3)
        {
            continue;
        }
        Program.Logger.WriteLine("甲方候补词(招标):[" + name + "]");
        return name;
    }

    // Stage 3: contract wording.
    extractor = new EntityProperty();
    extractor.StartEndFeature = Utility.GetStartEndStringArray(
        new string[] { "与", "与业主" },
        new string[] { "签署", "签订" });
    extractor.Extract(root);
    foreach (var candidate in extractor.CandidateWord)
    {
        var name = AfterProcessJiaFang(candidate.Trim()).Replace("业主", "").Trim();
        var tooLong = EntityWordAnlayzeTool.TrimEnglish(name).Length > ContractTraning.MaxJiaFangLength;
        if (tooLong || name.Length < 3)
        {
            continue;
        }
        Program.Logger.WriteLine("甲方候补词(合同):[" + name + "]");
        return name;
    }

    return "";
}
/// <summary>
/// Gets the contract name from the given document root, trying three sources in order:
/// 《…》 marks, the "合同名称:" label, and text between "签署了" and "合同".
/// </summary>
/// <remarks>
/// In every stage a candidate is skipped when its length after <c>TrimEnglish</c> exceeds
/// <c>ContractTraning.MaxContractNameLength</c>.
/// </remarks>
/// <param name="root">The document root passed to the extractor.</param>
/// <returns>The first accepted candidate (trimmed), or an empty string.</returns>
static string GetContractName(MyRootHtmlNode root)
{
    // Stage 1: mark features delimited by 《 and 》 whose inner text ends with "合同" or "确认书".
    var extractor = new EntityProperty();
    extractor.MarkFeature = new EntityProperty.struMarkFeature[]
    {
        new EntityProperty.struMarkFeature { MarkStartWith = "《", MarkEndWith = "》", InnerEndWith = "合同" },
        new EntityProperty.struMarkFeature { MarkStartWith = "《", MarkEndWith = "》", InnerEndWith = "确认书" }
    };
    extractor.Extract(root);
    foreach (var candidate in extractor.CandidateWord)
    {
        var name = candidate.Trim();
        if (EntityWordAnlayzeTool.TrimEnglish(name).Length <= ContractTraning.MaxContractNameLength)
        {
            Program.Logger.WriteLine("合同名称候补词(《XXX》):[" + candidate + "]");
            return name;
        }
    }

    // Stage 2: text following the label.
    extractor = new EntityProperty();
    extractor.LeadingWordList = new string[] { "合同名称:" };
    extractor.Extract(root);
    foreach (var candidate in extractor.CandidateWord)
    {
        var name = candidate.Trim();
        if (EntityWordAnlayzeTool.TrimEnglish(name).Length <= ContractTraning.MaxContractNameLength)
        {
            Program.Logger.WriteLine("合同名称候补词(关键字):[" + candidate + "]");
            return name;
        }
    }

    // Stage 3: contract wording.
    extractor = new EntityProperty();
    extractor.StartEndFeature = Utility.GetStartEndStringArray(
        new string[] { "签署了" },
        new string[] { "合同" });
    extractor.Extract(root);
    foreach (var candidate in extractor.CandidateWord)
    {
        var name = candidate.Trim();
        if (EntityWordAnlayzeTool.TrimEnglish(name).Length <= ContractTraning.MaxContractNameLength)
        {
            Program.Logger.WriteLine("合同候补词(合同):[" + candidate + "]");
            return name;
        }
    }

    return "";
}
/// <summary>
/// Gets party A (甲方) of the contract.
/// </summary>
/// <remarks>
/// Colon-keyword candidates win outright (the first one is returned). Otherwise candidates are
/// collected from bidding wording and contract wording and handed to
/// <c>CompanyNameLogic.MostLikeCompanyName</c>.
/// </remarks>
/// <returns>The selected party A name.</returns>
public string GetJiaFang()
{
    // Highest-confidence extraction: labelled values.
    var labelled = new EntityProperty
    {
        ExcludeContainsWordList = new string[] { "招标代理" },
        LeadingColonKeyWordList = new string[]
        {
            "甲方:", "合同买方:", "发包人:", "发包单位:", "发包方:", "发包机构:", "发包人名称:",
            "招标人:", "招标单位:", "招标方:", "招标机构:", "招标人名称:", "业主:", "业主单位:",
            "业主方:", "业主机构:", "业主名称:", "采购单位:", "采购单位名称:", "采购人:",
            "采购人名称:", "采购方:", "采购方名称:"
        },
        CandidatePreprocess = x => CompanyNameLogic.AfterProcessFullName(Normalizer.ClearTrailing(x)).secFullName,
        MaxLength = ContractTraning.JiaFangES.MaxLength,
        MaxLengthCheckPreprocess = Utility.TrimEnglish,
        MinLength = 3
    };
    labelled.Extract(this);

    // Distinct is deliberately not applied to the list itself: the more often a value occurs,
    // the more credible it is. Several different values may mean there is no party A at all.
    if (labelled.LeadingColonKeyWordCandidate.Distinct().Count() > 1)
    {
        foreach (var duplicate in labelled.LeadingColonKeyWordCandidate)
        {
            Program.Logger.WriteLine("发现多个甲方:" + duplicate);
        }
    }
    if (labelled.LeadingColonKeyWordCandidate.Count > 0)
    {
        return labelled.LeadingColonKeyWordCandidate[0];
    }

    var collected = new List<String>();

    // Bidding wording.
    var htmlExtractor = new ExtractPropertyByHTML();
    htmlExtractor.StartEndFeature = Utility.GetStartEndStringArray(
        new string[] { "招标单位", "业主", "收到", "接到" },
        new string[] { "发来", "发出", "的中标" });
    htmlExtractor.Extract(root);
    foreach (var hit in htmlExtractor.CandidateWord)
    {
        var company = CompanyNameLogic.AfterProcessFullName(hit.Value.Trim());
        if (company.secFullName.Contains("招标代理"))
        {
            continue; // special business rule
        }
        company.secFullName = company.secFullName
            .Replace("业主", String.Empty).Trim()
            .Replace("招标单位", String.Empty).Trim();
        // The upper bound is checked after TrimEnglish; the lower bound uses the actual length
        // to rule out all-English values.
        if (Utility.TrimEnglish(company.secFullName).Length > ContractTraning.JiaFangES.MaxLength
            || company.secFullName.Length < 3)
        {
            continue;
        }
        if (!Program.IsMultiThreadMode)
        {
            Program.Logger.WriteLine("甲方候补词(招标):[" + company.secFullName + "]");
        }
        collected.Add(company.secFullName);
    }

    // Contract wording.
    htmlExtractor = new ExtractPropertyByHTML();
    htmlExtractor.StartEndFeature = Utility.GetStartEndStringArray(
        new string[] { "与", "与业主" },
        new string[] { "签署", "签订" });
    htmlExtractor.Extract(root);
    foreach (var hit in htmlExtractor.CandidateWord)
    {
        var company = CompanyNameLogic.AfterProcessFullName(hit.Value.Trim());
        company.secFullName = company.secFullName.Replace("业主", String.Empty).Trim();
        if (company.secFullName.Contains("招标代理"))
        {
            continue; // special business rule
        }
        if (Utility.TrimEnglish(company.secFullName).Length > ContractTraning.JiaFangES.MaxLength
            || company.secFullName.Length < 3)
        {
            continue;
        }
        if (!Program.IsMultiThreadMode)
        {
            Program.Logger.WriteLine("甲方候补词(合同):[" + company.secFullName + "]");
        }
        collected.Add(company.secFullName);
    }

    return CompanyNameLogic.MostLikeCompanyName(collected);
}
/// <summary>
/// Gets the contract name for the current document.
/// </summary>
/// <remarks>
/// After extraction, the labelled contract whose <c>id</c> equals <c>this.Id</c> (if any) is
/// looked up in <c>TraningDataset.ContractList</c> and its name is passed to
/// <c>CheckIsCandidateContainsTarget</c>. The lookup is done in a single pass that stops at the
/// first match, instead of counting every match and then querying again for the first one.
/// </remarks>
/// <returns>The value returned by <c>EvaluateCI()</c>.</returns>
string GetContractName()
{
    var e = new EntityProperty();
    e.PropertyName = "合同名称";
    e.PropertyType = EntityProperty.enmType.NER;
    e.MaxLength = ContractTraning.MaxContractNameLength;
    e.MinLength = 5;

    // In training mode the keyword list was derived from ContractTraning.ContractNameLeadingDict:
    // entries with Value >= 40 (threshold 40%), each key suffixed with ":".
    e.LeadingColonKeyWordList = new string[] { "合同名称:" };
    e.QuotationTrailingWordList = new string[] { "协议书", "合同书", "确认书", "合同", "协议" };
    e.QuotationTrailingWordList_IsSkipBracket = true; // only true is supported for now

    // Dependency-parsing key words.
    var KeyList = new List<ExtractPropertyByDP.DPKeyWord>();
    KeyList.Add(new ExtractPropertyByDP.DPKeyWord()
    {
        StartWord = new string[] { "签署", "签订" },
        // Obtained from SRL training.
        StartDPValue = new string[] { LTPTrainingDP.核心关系, LTPTrainingDP.定中关系, LTPTrainingDP.并列关系 },
        EndWord = new string[] { "补充协议", "合同书", "合同", "协议书", "协议", },
        EndDPValue = new string[]
        {
            LTPTrainingDP.核心关系, LTPTrainingDP.定中关系, LTPTrainingDP.并列关系,
            LTPTrainingDP.动宾关系, LTPTrainingDP.主谓关系
        }
    });
    e.DpKeyWordList = KeyList;

    // Start/end context strings (obtained from context training); the end word "合同"
    // is appended back onto each such candidate.
    var StartArray = new string[] { "签署了", "签订了" };
    var EndArray = new string[] { "合同" };
    e.ExternalStartEndStringFeature = Utility.GetStartEndStringArray(StartArray, EndArray);
    e.ExternalStartEndStringFeatureCandidatePreprocess = x => x + "合同";

    // The maximum-length check is applied to the result of TrimEnglish.
    e.MaxLengthCheckPreprocess = str => EntityWordAnlayzeTool.TrimEnglish(str);

    // Highest confidence level: dedicated preprocessor for colon-keyword candidates.
    e.LeadingColonKeyWordCandidatePreprocess = str => Normalizer.ClearTrailing(TrimJianCheng(str));

    e.CandidatePreprocess = str =>
    {
        var c = Normalizer.ClearTrailing(TrimJianCheng(str));
        var RightQMarkIdx = c.IndexOf("”");
        if (!(RightQMarkIdx != -1 && RightQMarkIdx != c.Length - 1))
        {
            // For a name such as “XXX”合同 the closing quote exists but is not the last
            // character; only in the other cases is the leading quote removed.
            c = c.TrimStart("“".ToCharArray());
        }
        c = c.TrimStart("《".ToCharArray());
        c = c.TrimEnd("》".ToCharArray()).TrimEnd("”".ToCharArray());
        return c;
    };

    e.ExcludeContainsWordList = new string[] { "日常经营重大合同" };
    // The basis for the list below is weak.
    e.ExcludeEqualsWordList = new string[]
    {
        "合同", "重大合同", "项目合同", "终止协议", "经营合同", "特别重大合同", "相关项目合同"
    };
    e.Extract(this);

    // Check whether the candidates contain the labelled answer (not usable on the test set).
    // Only the first matching contract is considered, hence the unconditional break.
    foreach (var contract in TraningDataset.ContractList.Where(x => x.id == this.Id))
    {
        var contractname = contract.ContractName;
        if (!String.IsNullOrEmpty(contractname))
        {
            e.CheckIsCandidateContainsTarget(contractname);
        }
        break;
    }

    // Confidence.
    e.Confidence = ContractTraning.ContractES.GetStardardCI();
    return e.EvaluateCI();
}