/// <summary>
/// Determines the project name of this document.
/// </summary>
/// <remarks>
/// Three attempts are made, in order:
/// 1. an <c>EntityProperty</c> configured with labelled fields ("项目名称:" etc.) and
///    quoted titles ending in one of the listed suffixes;
/// 2. the same <c>EntityProperty</c>, extended with start/end string markers and extracted again;
/// 3. the text in front of "推荐的中标候选人公示" inside a quoted title.
/// </remarks>
/// <returns>The project name, or an empty string when none of the attempts yields one.</returns>
string GetProjectName()
{
    var projectProperty = new EntityProperty
    {
        PropertyName = "工程名称",
        LeadingColonKeyWordList = new[] { "项目名称:", "工程名称:", "中标项目:", "合同标的:", "工程内容:" },
        // Labelled candidates are passed through TrimEndJianCheng before they are stored.
        LeadingColonKeyWordCandidatePreprocess = TrimEndJianCheng,
        // NOTE(review): the effect of this flag is defined inside EntityProperty, not visible here.
        QuotationTrailingWordList_IsSkipBracket = true,
        QuotationTrailingWordList = new[]
        {
            "标段施工项目", "标段土建工程", "标段施工总承包", "标段的工程", "标段工程", "标段施工总价承包",
            "标段施工总承包工程", "标段施工工程", "标段土建工程建设项目", "标段站前工程", "标段工程(施工)",
            "工程施工工程", "项目施工工程", "施工工程", "工程项目", "工程标段", "标段的施工项目", "标段项目",
            "标段施工", "招标采购项目", "招标活动", "采购活动", "招标项目", "项目", "采购", "总承包", "工程",
            "标段", "标",
        },
    };

    // First attempt: labelled fields and quoted titles only.
    projectProperty.Extract(this);
    var projectName = projectProperty.EvaluateCI();
    if (!string.IsNullOrEmpty(projectName))
    {
        return projectName;
    }

    // Second attempt: add the start/end markers and run the extraction once more on the same object.
    projectProperty.ExternalStartEndStringFeature = Utility.GetStartEndStringArray(
        new[] { "公司为", "参与了", "确定为" },
        new[] { "的中标单位", "的公开招投标", "的中标人", "候选人" });
    projectProperty.Extract(this);
    projectName = projectProperty.EvaluateCI();
    if (!string.IsNullOrEmpty(projectName))
    {
        // Prefer the variant with a trailing "项目" when that exact text occurs in the document.
        var nameWithSuffix = projectName + "项目";
        return ExtractPropertyByHTML.FindWordCnt(nameWithSuffix, root).Count >= 1
            ? nameWithSuffix
            : projectName;
    }

    // Last resort: a quoted title that contains the candidate-announcement phrase.
    const string announcementPhrase = "推荐的中标候选人公示";
    foreach (var quotation in quotationList)
    {
        if (!quotation.Value.Contains(announcementPhrase))
        {
            continue;
        }
        return Utility.GetStringBefore(quotation.Value, announcementPhrase);
    }

    return string.Empty;
}
/// <summary>
/// Returns the distinct values of <c>KeyWordMap</c> whose key is found at least once in
/// <paramref name="root"/>, in the order the map enumerates them.
/// </summary>
/// <param name="root">Root node of the parsed document that is searched for each key.</param>
/// <returns>The matched values without duplicates; empty when no key is found.</returns>
List <string> ExtractByKeyWordMap(HTMLEngine.MyRootHtmlNode root)
{
    var matchedValues = new List<string>();
    foreach (var mapping in KeyWordMap)
    {
        // Skip keys that do not occur in the document.
        var occurrenceCount = ExtractPropertyByHTML.FindWordCnt(mapping.Key, root).Count;
        if (occurrenceCount <= 0)
        {
            continue;
        }

        // Several keys may map to the same value; report each value once.
        if (matchedValues.Contains(mapping.Value))
        {
            continue;
        }
        matchedValues.Add(mapping.Value);
    }
    return matchedValues;
}
/// <summary>
/// Returns the distinct values of <c>KeyWordMap</c> whose key is reported as present in
/// <paramref name="root"/> by <c>ExtractPropertyByHTML.HasWord</c>.
/// </summary>
/// <param name="root">Root node of the parsed document that is searched for each key.</param>
/// <returns>The matched values without duplicates; empty when no key is present.</returns>
List <string> ExtractByKeyWordMap(HTMLEngine.MyRootHtmlNode root)
{
    var hits = new List<string>();
    foreach (var mapping in KeyWordMap)
    {
        // The presence test runs for every key; the duplicate test only for keys that are present.
        if (ExtractPropertyByHTML.HasWord(mapping.Key, root) && !hits.Contains(mapping.Value))
        {
            hits.Add(mapping.Value);
        }
    }
    return hits;
}
/// <summary>
/// Determines party B (乙方) of the contract.
/// </summary>
/// <remarks>
/// Sources are tried in this order and the first hit wins:
/// 1. the text following the label "乙方:" in the text file;
/// 2. the full name of the first entry in <c>companynamelist</c> flagged as a subsidiary;
/// 3. a phrase extracted with the trailing word "有限公司董事会", with "有限公司" appended;
/// 4. <c>AnnouceCompanyName</c>.
/// </remarks>
/// <returns>The party B name chosen by the first source that produces one.</returns>
string GetYiFang()
{
    // 1. Labelled field. Only "乙方:" is active; the original author left further labels
    //    (supplier / winning bidder / contractor variants) disabled.
    var textExtractor = new ExtractPropertyByText();
    textExtractor.LeadingColonKeyWordList = new[] { "乙方:" };
    textExtractor.ExtractFromTextFile(TextFileName);
    foreach (var labelled in textExtractor.CandidateWord)
    {
        var partyB = labelled.Value.Trim();
        if (!Program.IsMultiThreadMode)
        {
            Program.Logger.WriteLine("乙方候补词(关键字):[" + partyB + "]");
        }
        // Only the first labelled candidate is ever used.
        return partyB;
    }

    // 2. A subsidiary, when one is known, takes precedence over the remaining sources.
    foreach (var company in companynamelist)
    {
        if (!company.isSubCompany)
        {
            continue;
        }
        return company.secFullName;
    }

    // 3. Text in front of "有限公司董事会"; the company suffix is put back afterwards.
    var htmlExtractor = new ExtractPropertyByHTML();
    htmlExtractor.TrailingWordList = new[] { "有限公司董事会" };
    htmlExtractor.Extract(root);
    // NOTE(review): this reverses the candidates only if CandidateWord is a List<T>
    // (in-place Reverse); with a LINQ sequence the result would be discarded - confirm.
    htmlExtractor.CandidateWord.Reverse();
    foreach (var signed in htmlExtractor.CandidateWord)
    {
        var boardCompany = signed.Value.Trim() + "有限公司";
        if (!Program.IsMultiThreadMode)
        {
            Program.Logger.WriteLine("乙方候补词(关键字):[" + boardCompany + "]");
        }
        return boardCompany;
    }

    // 4. Nothing found in the document body.
    return AnnouceCompanyName;
}
/// <summary>
/// Locates share counts in the document: comma-grouped digit sequences (for example
/// "1,234,567") searched with the trailing word "股".
/// </summary>
/// <param name="root">Root node of the parsed document; every sentence of every paragraph is scanned.</param>
/// <returns>All matches reported by <c>RegularExFinder</c>, in document order.</returns>
public static List <LocAndValue <String> > LocateStockNumber(HTMLEngine.MyRootHtmlNode root)
{
    var shareCountFeature = new ExtractProperyBase.struRegularExpressFeature();
    // Requires at least one comma group, so plain digit runs without commas do not match.
    shareCountFeature.RegularExpress = @"\d+(,\d+)+";
    shareCountFeature.TrailingWordList = new List<string> { "股" };

    var matches = new List<LocAndValue<String>>();
    foreach (var paragraph in root.Children)
    {
        foreach (var sentence in paragraph.Children)
        {
            // "|" is handed to the finder as the separator it uses inside the result values.
            matches.AddRange(
                ExtractPropertyByHTML.RegularExFinder(sentence.PositionId, sentence.Content, shareCountFeature, "|"));
        }
    }
    return matches;
}
/// <summary>
/// Determines party A (甲方) of the contract.
/// </summary>
/// <remarks>
/// Labelled fields ("甲方:", "招标人:", "采购人:" ...) are trusted most: when at least one labelled
/// candidate exists, the first one is returned. Otherwise candidates are gathered from two
/// start/end phrase patterns (tender wording and contract-signing wording) and
/// <c>CompanyNameLogic.MostLikeCompanyName</c> picks the result.
/// </remarks>
/// <returns>The selected party A name.</returns>
public string GetJiaFang()
{
    // Highest-confidence source: labelled fields.
    var labelled = new EntityProperty
    {
        ExcludeContainsWordList = new[] { "招标代理" },
        LeadingColonKeyWordList = new[]
        {
            "甲方:", "合同买方:", "发包人:", "发包单位:", "发包方:", "发包机构:", "发包人名称:",
            "招标人:", "招标单位:", "招标方:", "招标机构:", "招标人名称:", "业主:", "业主单位:",
            "业主方:", "业主机构:", "业主名称:", "采购单位:", "采购单位名称:", "采购人:",
            "采购人名称:", "采购方:", "采购方名称:"
        },
        CandidatePreprocess = raw =>
            CompanyNameLogic.AfterProcessFullName(Normalizer.ClearTrailing(raw)).secFullName,
        MaxLength = ContractTraning.JiaFangES.MaxLength,
        MaxLengthCheckPreprocess = Utility.TrimEnglish,
        MinLength = 3,
    };
    labelled.Extract(this);

    // The list is deliberately not de-duplicated: repeated occurrences indicate higher confidence.
    // Several different names may mean that there is in fact no party A.
    var labelledNames = labelled.LeadingColonKeyWordCandidate;
    if (labelledNames.Distinct().Count() > 1)
    {
        foreach (var name in labelledNames)
        {
            Program.Logger.WriteLine("发现多个甲方:" + name);
        }
    }
    if (labelledNames.Count > 0)
    {
        return labelledNames[0];
    }

    var fallbackNames = new List<String>();

    // Tender wording: "<start> ... <end>", e.g. "收到 ... 发来".
    var tenderExtractor = new ExtractPropertyByHTML();
    tenderExtractor.StartEndFeature = Utility.GetStartEndStringArray(
        new[] { "招标单位", "业主", "收到", "接到" },
        new[] { "发来", "发出", "的中标" });
    tenderExtractor.Extract(root);
    foreach (var hit in tenderExtractor.CandidateWord)
    {
        var name = CompanyNameLogic.AfterProcessFullName(hit.Value.Trim()).secFullName;
        // Business rule: tender agencies are never party A (checked before the words are stripped).
        if (name.Contains("招标代理"))
        {
            continue;
        }
        name = name.Replace("业主", String.Empty).Trim();
        name = name.Replace("招标单位", String.Empty).Trim();
        // Upper bound ignores English characters; lower bound uses the real length so that
        // all-English names are rejected.
        if (Utility.TrimEnglish(name).Length > ContractTraning.JiaFangES.MaxLength || name.Length < 3)
        {
            continue;
        }
        if (!Program.IsMultiThreadMode)
        {
            Program.Logger.WriteLine("甲方候补词(招标):[" + name + "]");
        }
        fallbackNames.Add(name);
    }

    // Contract wording: "与 ... 签署/签订".
    var contractExtractor = new ExtractPropertyByHTML();
    contractExtractor.StartEndFeature = Utility.GetStartEndStringArray(
        new[] { "与", "与业主" },
        new[] { "签署", "签订" });
    contractExtractor.Extract(root);
    foreach (var hit in contractExtractor.CandidateWord)
    {
        var name = CompanyNameLogic.AfterProcessFullName(hit.Value.Trim()).secFullName;
        // Here "业主" is stripped first and the agency rule is applied to the stripped name.
        name = name.Replace("业主", String.Empty).Trim();
        if (name.Contains("招标代理"))
        {
            continue;
        }
        if (Utility.TrimEnglish(name).Length > ContractTraning.JiaFangES.MaxLength || name.Length < 3)
        {
            continue;
        }
        if (!Program.IsMultiThreadMode)
        {
            Program.Logger.WriteLine("甲方候补词(合同):[" + name + "]");
        }
        fallbackNames.Add(name);
    }

    return CompanyNameLogic.MostLikeCompanyName(fallbackNames);
}
/// <summary>
/// Runs every extraction rule configured on this instance against <paramref name="doc"/>
/// and stores the accepted candidates in the candidate list that belongs to each rule.
/// </summary>
/// <remarks>
/// When <c>KeyWordMap</c> is not empty the property is a pure keyword type: only the keyword
/// lookup runs and the method returns. Otherwise each of the following rules runs when its
/// configuration is not null: leading-colon keywords (text file, then HTML), quoted titles
/// with a given ending, dependency-parse keywords, and start/end string features
/// (text file, then HTML).
/// </remarks>
/// <param name="doc">The announcement document to extract from.</param>
public void Extract(AnnouceDocument doc)
{
    // Pure keyword type.
    if (KeyWordMap.Count != 0)
    {
        var candidate = ExtractByKeyWordMap(doc.root);
        if (candidate.Count == 1)
        {
            WordMapResult = candidate.First();
        }
        if (candidate.Count > 1)
        {
            // Ambiguous: more than one mapped value matched, so no result is stored.
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine("找到纯关键字类型两个关键字");
            }
        }
        return;
    }

    if (LeadingColonKeyWordList != null)
    {
        // Fixed leading labels such as "项目名:". These candidates are not subject to the
        // generic checks (e.g. min/max length); they have their own preprocessor.
        var ExtractorText = new ExtractPropertyByText();
        // The text version may contain extra spaces, so the HTML version is checked as well.
        ExtractorText.LeadingColonKeyWordList = LeadingColonKeyWordList;
        ExtractorText.ExtractFromTextFile(doc.TextFileName);
        foreach (var item in ExtractorText.CandidateWord)
        {
            var PropertyValue = item.Value;
            if (LeadingColonKeyWordCandidatePreprocess != null)
            {
                PropertyValue = LeadingColonKeyWordCandidatePreprocess(PropertyValue);
            }
            if (String.IsNullOrEmpty(PropertyValue))
            {
                continue;
            }
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
            }
            LeadingColonKeyWordCandidate.Add(PropertyValue);
        }

        var Extractor = new ExtractPropertyByHTML();
        Extractor.LeadingColonKeyWordList = ExtractorText.LeadingColonKeyWordList;
        Extractor.Extract(doc.root);
        // Fix: iterate the HTML extractor's candidates. The previous code looped over
        // ExtractorText.CandidateWord again, so the HTML results were never read.
        foreach (var item in Extractor.CandidateWord)
        {
            var PropertyValue = item.Value;
            if (LeadingColonKeyWordCandidatePreprocess != null)
            {
                PropertyValue = LeadingColonKeyWordCandidatePreprocess(PropertyValue);
            }
            if (String.IsNullOrEmpty(PropertyValue))
            {
                continue;
            }
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
            }
            // Values already found in the text file are not added a second time.
            if (!LeadingColonKeyWordCandidate.Contains(PropertyValue))
            {
                LeadingColonKeyWordCandidate.Add(PropertyValue);
            }
        }
    }

    // Quoted titles (book-title marks and quotation marks).
    if (QuotationTrailingWordList != null)
    {
        foreach (var bracket in doc.quotationList)
        {
            foreach (var word in QuotationTrailingWordList)
            {
                // A title that ends with several listed words is added once per matching word.
                if (bracket.Value.EndsWith(word))
                {
                    var PropertyValue = CheckCandidate(bracket.Value);
                    if (String.IsNullOrEmpty(PropertyValue))
                    {
                        continue;
                    }
                    if (!Program.IsMultiThreadMode)
                    {
                        Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
                    }
                    QuotationTrailingCandidate.Add(PropertyValue);
                }
            }
        }
    }

    // Dependency parsing.
    if (DpKeyWordList != null)
    {
        var ExtractDP = new ExtractPropertyByDP();
        ExtractDP.StartWithKey(DpKeyWordList, doc.Dplist);
        foreach (var item in ExtractDP.CandidateWord)
        {
            var PropertyValue = CheckCandidate(item.Value);
            if (String.IsNullOrEmpty(PropertyValue))
            {
                continue;
            }
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
            }
            DpKeyWordCandidate.Add(PropertyValue);
        }
    }

    // Start/end string features.
    if (ExternalStartEndStringFeature != null)
    {
        var ExtractorTEXT = new ExtractPropertyByText();
        ExtractorTEXT.StartEndFeature = ExternalStartEndStringFeature;
        ExtractorTEXT.ExtractFromTextFile(doc.TextFileName);
        foreach (var item in ExtractorTEXT.CandidateWord)
        {
            var PropertyValue = item.Value;
            if (ExternalStartEndStringFeatureCandidatePreprocess != null)
            {
                PropertyValue = ExternalStartEndStringFeatureCandidatePreprocess(PropertyValue);
            }
            PropertyValue = CheckCandidate(PropertyValue);
            if (String.IsNullOrEmpty(PropertyValue))
            {
                continue;
            }
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
            }
            ExternalStartEndStringFeatureCandidate.Add(PropertyValue);
        }

        // Covers the cases that cannot be extracted from the text file.
        var ExtractorHTML = new ExtractPropertyByHTML();
        ExtractorHTML.StartEndFeature = ExternalStartEndStringFeature;
        ExtractorHTML.Extract(doc.root);
        foreach (var item in ExtractorHTML.CandidateWord)
        {
            var PropertyValue = item.Value;
            if (ExternalStartEndStringFeatureCandidatePreprocess != null)
            {
                PropertyValue = ExternalStartEndStringFeatureCandidatePreprocess(PropertyValue);
            }
            PropertyValue = CheckCandidate(PropertyValue);
            if (String.IsNullOrEmpty(PropertyValue))
            {
                continue;
            }
            if (!Program.IsMultiThreadMode)
            {
                Program.Logger.WriteLine(this.PropertyName + ":[" + PropertyValue + "]");
            }
            if (!ExternalStartEndStringFeatureCandidate.Contains(PropertyValue))
            {
                ExternalStartEndStringFeatureCandidate.Add(PropertyValue);
            }
        }
    }
}
/// <summary>
/// Determines the project name of this document.
/// </summary>
/// <remarks>
/// Sources are tried in order and the first accepted candidate is returned:
/// 1. labelled fields ("项目名称:" etc.) in the text file;
/// 2. the same labels in the HTML tree;
/// 3. a quoted title ending in "工程" or "标段";
/// 4. text inside “ ” that ends in "标段" or "标";
/// 5. dependency-parse phrases between the configured start and end words.
/// </remarks>
/// <returns>The project name, or an empty string when no source yields one.</returns>
string GetProjectName()
{
    // 1. Labelled fields in the text file (highest priority).
    var ExtractorText = new ExtractPropertyByText();
    ExtractorText.LeadingColonKeyWordList = new string[] { "项目名称:", "工程名称:", "中标项目:", "合同标的:", "工程内容:" };
    ExtractorText.ExtractFromTextFile(TextFileName);
    foreach (var item in ExtractorText.CandidateWord)
    {
        var ProjectName = item.Value.Trim();
        if (EntityWordAnlayzeTool.TrimEnglish(ProjectName).Length > ContractTraning.MaxContractNameLength)
        {
            continue;
        }
        var Trimmed = TrimJianCheng(ProjectName);
        if (Trimmed == String.Empty)
        {
            continue;
        }
        ProjectName = Trimmed;
        if (!Program.IsMultiThreadMode)
        {
            Program.Logger.WriteLine("项目名称候补词(关键字):[" + ProjectName + "]");
        }
        return(ProjectName);
    }

    // 2. The same labels in the HTML tree.
    var Extractor = new ExtractPropertyByHTML();
    Extractor.LeadingColonKeyWordList = ExtractorText.LeadingColonKeyWordList;
    // Fix: the extraction has to run before the candidates are read. Previously this call was
    // missing, so the loop below never saw any candidate.
    Extractor.Extract(root);
    foreach (var item in Extractor.CandidateWord)
    {
        var ProjectName = item.Value.Trim();
        if (EntityWordAnlayzeTool.TrimEnglish(ProjectName).Length > ContractTraning.MaxContractNameLength)
        {
            continue;
        }
        var Trimmed = TrimJianCheng(ProjectName);
        if (Trimmed == String.Empty)
        {
            continue;
        }
        ProjectName = Trimmed;
        if (!Program.IsMultiThreadMode)
        {
            Program.Logger.WriteLine("项目名称候补词(关键字):[" + ProjectName + "]");
        }
        return(ProjectName);
    }

    // 3. Quoted titles that end like a project name.
    foreach (var bracket in quotationList)
    {
        if (bracket.Value.EndsWith("工程") || bracket.Value.EndsWith("标段"))
        {
            return(bracket.Value);
        }
    }

    // 4. Text inside “ ” ending in "标段" or "标".
    var MarkFeature = new ExtractPropertyByHTML.struMarkFeature();
    MarkFeature.MarkStartWith = "“";
    MarkFeature.MarkEndWith = "”";
    MarkFeature.InnerEndWith = "标段";
    var MarkFeatureConfirm = new ExtractPropertyByHTML.struMarkFeature();
    MarkFeatureConfirm.MarkStartWith = "“";
    MarkFeatureConfirm.MarkEndWith = "”";
    MarkFeatureConfirm.InnerEndWith = "标";
    // A fresh extractor is used so that the labelled candidates already rejected in step 2
    // are not mixed into (or extracted again with) the mark candidates.
    var MarkExtractor = new ExtractPropertyByHTML();
    MarkExtractor.MarkFeature = new ExtractPropertyByHTML.struMarkFeature[] { MarkFeature, MarkFeatureConfirm };
    MarkExtractor.Extract(root);
    foreach (var item in MarkExtractor.CandidateWord)
    {
        var ProjectName = item.Value.Trim();
        if (EntityWordAnlayzeTool.TrimEnglish(ProjectName).Length > ContractTraning.MaxContractNameLength)
        {
            continue;
        }
        if (!Program.IsMultiThreadMode)
        {
            // NOTE(review): this logs the candidate object itself rather than ProjectName,
            // unlike the other log lines - confirm that is intended.
            Program.Logger.WriteLine("工程名称候补词(《XXX》):[" + item + "]");
        }
        return(ProjectName);
    }

    // 5. Dependency-parse phrases.
    var ExtractDP = new ExtractPropertyByDP();
    var KeyList = new List <ExtractPropertyByDP.DPKeyWord>();
    KeyList.Add(new ExtractPropertyByDP.DPKeyWord()
    {
        StartWord = new string[] { "确定为", "确定", "中标", "参与", "发布", "为" },
        StartDPValue = new string[] { LTPTrainingDP.核心关系, LTPTrainingDP.定中关系, LTPTrainingDP.并列关系 },
        EndWord = new string[] { "采购", "项目", "工程", "标段" },
        EndDPValue = new string[] { }
    });
    ExtractDP.StartWithKey(KeyList, Dplist);
    foreach (var item in ExtractDP.CandidateWord)
    {
        var ProjectName = item.Value.Trim();
        // NOTE(review): this step uses MaxProjectNameLength while the steps above use
        // MaxContractNameLength - confirm which limit is intended.
        if (EntityWordAnlayzeTool.TrimEnglish(ProjectName).Length > ContractTraning.MaxProjectNameLength)
        {
            continue;
        }
        // Very short phrases are not accepted as project names.
        if (ProjectName.Length <= 4)
        {
            continue;
        }
        if (!Program.IsMultiThreadMode)
        {
            Program.Logger.WriteLine("工程候补词:[" + ProjectName + "]");
        }
        return(ProjectName);
    }

    return(String.Empty);
}
/// <summary>
/// Extracts (target, company) pairs from the explanation (definitions) table held in
/// <c>ExplainDict</c>.
/// </summary>
/// <remarks>
/// The keys in <paramref name="ExplainKeys"/> are tried in the given order. For each key the
/// entries of <c>ExplainDict</c> are scanned; as soon as one entry has produced at least one
/// pair, the list is returned.
/// </remarks>
/// <param name="CompanyAtExplainTable">Companies whose short and full names are used to recognise the owner of a target.</param>
/// <param name="ExplainKeys">Explanation-table keys to look for, in priority order.</param>
/// <returns>The pairs found for the first matching entry; an empty list when nothing matches.</returns>
private List <(string Target, string Company)> ExtractTargetFromExplainTable(List <struCompanyName> CompanyAtExplainTable, string[] ExplainKeys)
{
    var AllCompanyName = new List <String>();
    foreach (var item in CompanyAtExplainTable)
    {
        if (!String.IsNullOrEmpty(item.secShortName))
        {
            AllCompanyName.Add(item.secShortName);
        }
        if (!String.IsNullOrEmpty(item.secFullName))
        {
            AllCompanyName.Add(item.secFullName);
        }
    }
    // Feature for equity stakes: a company name, a percentage, then one of the trailing words.
    var targetRegular = new ExtractProperyBase.struRegularExpressFeature()
    {
        LeadingWordList = AllCompanyName,
        RegularExpress = RegularTool.PercentExpress,
        TrailingWordList = new string[] { "的股权", "股权", "的权益", "权益", "的股份", "股份" }.ToList()
    };
    // Other (non-percentage) kinds of targets; the first entry contained in the text wins.
    var OtherTargets = new string[] { "资产及负债", "资产和负债", "主要资产和部分负债", "主要资产及部分负债",
        "经营性资产及负债", "经营性资产和负债", "应收账款和其他应收款", "负债", "债权", "全部权益",
        "经营性资产", "非股权类资产", "资产、负债、业务", "直属资产", "普通股股份", "土地使用权",
        "使用权", "房产" };
    var TargetAndCompanyList = new List <(string Target, string Comany)>();
    foreach (var Rplkey in ExplainKeys)
    {
        // The most likely keys come first.
        foreach (var ExplainDictItem in ExplainDict)
        {
            // An entry key may hold several aliases; fall back to the slash split when the
            // default separator yields a single piece.
            var keys = ExplainDictItem.Key.Split(Utility.SplitChar);
            var keys2 = ExplainDictItem.Key.Split(new char[] { '/', '/' });
            if (keys.Length == 1 && keys2.Length > 1)
            {
                keys = keys2;
            }
            // Same idea for the value, with ";" as the fallback separator.
            var values = ExplainDictItem.Value.Split(Utility.SplitChar);
            var values2 = ExplainDictItem.Value.Split(";");
            if (values.Length == 1 && values2.Length > 1)
            {
                values = values2;
            }
            // Keys may begin with the character "拟", which has to be stripped.
            var SearchKey = keys.Select((x) => { return(x.StartsWith("拟") ? x.Substring(1) : x); });
            SearchKey = SearchKey.Select(x => x.Trim()).ToArray();
            if (SearchKey.Contains(Rplkey))
            {
                // For these three keys, a value that is exactly a known company name is
                // reported as "100%股权" of that company and returned immediately.
                if (Rplkey.Equals("交易标的") || Rplkey.Equals("标的资产") || Rplkey.Equals("标的公司"))
                {
                    foreach (var cn in companynamelist)
                    {
                        if (ExplainDictItem.Value.Equals(cn.secFullName) || ExplainDictItem.Value.Equals(cn.secShortName))
                        {
                            var extra = ("100%股权", ExplainDictItem.Value);
                            TargetAndCompanyList.Add(extra);
                            Console.WriteLine(Id + ":100%股权" + ExplainDictItem.Value);
                            return(TargetAndCompanyList);
                        }
                    }
                }
                foreach (var targetRecordItem in values)
                {
                    var SingleItemList = Utility.CutByPOSConection(targetRecordItem);
                    foreach (var SingleItem in SingleItemList)
                    {
                        // Normalise: drop spaces and "合计", then keep only the text after the
                        // holding markers ("持有的", "持有", "所持"), applied in that order.
                        var targetAndcompany = SingleItem.Trim().Replace(" ", "");
                        targetAndcompany = targetAndcompany.Trim().Replace("合计", "");
                        if (targetAndcompany.Contains("持有的"))
                        {
                            targetAndcompany = Utility.GetStringAfter(targetAndcompany, "持有的");
                        }
                        if (targetAndcompany.Contains("持有"))
                        {
                            targetAndcompany = Utility.GetStringAfter(targetAndcompany, "持有");
                        }
                        if (targetAndcompany.Contains("所持"))
                        {
                            targetAndcompany = Utility.GetStringAfter(targetAndcompany, "所持");
                        }
                        // Separate the company name from the transaction target.
                        var ExpResult = ExtractPropertyByHTML.RegularExFinder(0, targetAndcompany, targetRegular, "|");
                        if (ExpResult.Count == 0)
                        {
                            // No percentage stake found: other kinds of targets.
                            if (!String.IsNullOrEmpty(GetOtherOwnerByExplainTable(targetAndcompany)))
                            {
                                var extra = (targetAndcompany, GetOtherOwnerByExplainTable(targetAndcompany));
                                if (!TargetAndCompanyList.Contains(extra))
                                {
                                    TargetAndCompanyList.Add(extra);
                                }
                            }
                            else
                            {
                                foreach (var rc in CompanyAtExplainTable)
                                {
                                    var IsFullNameHit = false;
                                    // The asset text may contain the company name.
                                    if (!String.IsNullOrEmpty(rc.secFullName) && targetAndcompany.Contains(rc.secFullName))
                                    {
                                        foreach (var ot in OtherTargets)
                                        {
                                            if (targetAndcompany.Contains(ot))
                                            {
                                                IsFullNameHit = true;
                                                TargetAndCompanyList.Add((ot, rc.secFullName));
                                                break;
                                            }
                                        }
                                    }
                                    if (!IsFullNameHit)
                                    {
                                        // Matched through the short name; the pair still records secFullName.
                                        // NOTE(review): secFullName is not checked for empty on this path - confirm.
                                        if (!String.IsNullOrEmpty(rc.secShortName) && targetAndcompany.Contains(rc.secShortName))
                                        {
                                            foreach (var ot in OtherTargets)
                                            {
                                                if (targetAndcompany.Contains(ot))
                                                {
                                                    IsFullNameHit = true;
                                                    TargetAndCompanyList.Add((ot, rc.secFullName));
                                                    break;
                                                }
                                            }
                                        }
                                    }
                                    // The "XXXX持有的XXXX" form; possibly no longer needed.
                                    // Only used while nothing has been collected yet: the text after the
                                    // company name becomes the target.
                                    if (TargetAndCompanyList.Count == 0 && !String.IsNullOrEmpty(rc.secFullName) && targetAndcompany.StartsWith(rc.secFullName))
                                    {
                                        var extra = (targetAndcompany.Substring(rc.secFullName.Length), rc.secFullName);
                                        if (!TargetAndCompanyList.Contains(extra))
                                        {
                                            TargetAndCompanyList.Add(extra);
                                        }
                                        break;
                                    }
                                    if (TargetAndCompanyList.Count == 0 && !String.IsNullOrEmpty(rc.secShortName) && targetAndcompany.StartsWith(rc.secShortName))
                                    {
                                        var extra = (targetAndcompany.Substring(rc.secShortName.Length), rc.secShortName);
                                        if (!TargetAndCompanyList.Contains(extra))
                                        {
                                            TargetAndCompanyList.Add(extra);
                                        }
                                        break;
                                    }
                                }
                            }
                        }
                        else
                        {
                            foreach (var r in ExpResult)
                            {
                                // The result value is split on "|": element 0 is used as the company,
                                // elements 1 and 2 are concatenated into the target.
                                var arr = r.Value.Split("|");
                                var target = arr[1] + arr[2];
                                var targetCompany = arr[0];
                                if (targetCompany.Contains("持有的"))
                                {
                                    targetCompany = Utility.GetStringAfter(targetCompany, "持有的");
                                }
                                if (targetCompany.Contains("持有"))
                                {
                                    targetCompany = Utility.GetStringAfter(targetCompany, "持有");
                                }
                                if (targetCompany.Contains("所持"))
                                {
                                    targetCompany = Utility.GetStringAfter(targetCompany, "所持");
                                }
                                var extra = (target.Replace(" ", ""), targetCompany.Replace(" ", ""));
                                if (!TargetAndCompanyList.Contains(extra))
                                {
                                    TargetAndCompanyList.Add(extra);
                                }
                            }
                        }
                    }
                }
                // Stop at the first entry that produced any pair.
                if (TargetAndCompanyList.Count != 0)
                {
                    return(TargetAndCompanyList);
                }
            }
        }
    }
    return(TargetAndCompanyList);
}
/// <summary>
/// Extracts (target, company) pairs for percentage stakes from the explanation table in
/// <c>ExplainDict</c>, without relying on a list of known company names.
/// </summary>
/// <remarks>
/// Entries of <c>ExplainDict</c> are scanned in their own order. For the first entry/key
/// combination that yields at least one pair, the distinct pairs are returned. The company is
/// the text in front of the percentage match, reduced to what follows the holding markers.
/// </remarks>
/// <param name="ExplainKeys">Explanation-table keys an entry must contain to be considered.</param>
/// <returns>The distinct pairs of the first productive entry; an empty list otherwise.</returns>
private List <(string Target, string Company)> ExtractExtend(string[] ExplainKeys)
{
    var stakeFeature = new ExtractProperyBase.struRegularExpressFeature();
    stakeFeature.RegularExpress = RegularTool.PercentExpress;
    stakeFeature.TrailingWordList = new List<string> { "的股权", "股权", "的权益", "权益", "的股份", "股份" };

    // Applied in this order; each marker that is present cuts the text down to what follows it.
    var holdingMarkers = new[] { "持有的", "持有", "所持" };

    foreach (var entry in ExplainDict)
    {
        var pairs = new List<(string Target, string Company)>();

        // Split key and value into alternatives; use the fallback separator only when the
        // default one leaves a single piece.
        var keyParts = entry.Key.Split(Utility.SplitChar);
        var slashKeyParts = entry.Key.Split(new char[] { '/', '/' });
        if (keyParts.Length == 1 && slashKeyParts.Length > 1)
        {
            keyParts = slashKeyParts;
        }
        var valueParts = entry.Value.Split(Utility.SplitChar);
        var semicolonValueParts = entry.Value.Split(";");
        if (valueParts.Length == 1 && semicolonValueParts.Length > 1)
        {
            valueParts = semicolonValueParts;
        }

        foreach (var explainKey in ExplainKeys)
        {
            if (!keyParts.Contains(explainKey))
            {
                continue;
            }

            foreach (var rawValue in valueParts)
            {
                var compactValue = rawValue.Replace(" ", string.Empty);
                foreach (var fragment in compactValue.Split(Utility.SplitChar))
                {
                    foreach (var clause in Utility.CutByPOSConection(fragment))
                    {
                        foreach (var match in ExtractPropertyByHTML.RegularExFinder(0, clause, stakeFeature, "|"))
                        {
                            // Elements 1 and 2 of the "|"-separated value form the target.
                            var pieces = match.Value.Split("|");
                            var target = pieces[1] + pieces[2];

                            // Everything in front of the match is the owner description.
                            var owner = clause.Substring(0, match.StartIdx);
                            foreach (var marker in holdingMarkers)
                            {
                                if (owner.Contains(marker))
                                {
                                    owner = Utility.GetStringAfter(owner, marker);
                                }
                            }
                            pairs.Add((target, owner));
                        }
                    }
                }
            }

            if (pairs.Count != 0)
            {
                return pairs.Distinct().ToList();
            }
        }
    }

    return new List<(string Target, string Company)>();
}
/// <summary>
/// Extracts (target, company) pairs from the explanation table held in <c>ReplacementDict</c>.
/// </summary>
/// <remarks>
/// Only companies that come from the explanation table (<c>positionId == -1</c>) and have a short
/// name are considered. The replacement keys are tried in the listed order; the pairs of the
/// first entry that produces any are returned.
/// </remarks>
/// <returns>The pairs found, or an empty list.</returns>
List <(string Target, string Comany)> getTargetListFromReplaceTable()
{
    var ReplaceCompany = new List <String>();
    foreach (var c in companynamelist)
    {
        // positionId == -1 marks a company taken from the explanation table.
        if (c.positionId == -1)
        {
            if (!String.IsNullOrEmpty(c.secShortName))
            {
                ReplaceCompany.Add(c.secShortName);
            }
        }
    }
    var TargetAndCompanyList = new List <(string Target, string Comany)>();
    // Feature for equity stakes: a company short name, a percentage, then a trailing word.
    var targetRegular = new ExtractProperyBase.struRegularExpressFeature()
    {
        LeadingWordList = ReplaceCompany,
        RegularExpress = RegularTool.PercentExpress,
        TrailingWordList = new string[] { "的股权", "股权", "的权益", "权益" }.ToList()
    };
    // The trailing figures are carried over from the original source
    // (presumably observed frequency and count - unverified).
    var ReplacementKeys = new string[]
    {
        "交易标的",         // 09% 00303
        "标的资产",         // 15% 00464
        "本次交易",         // 12% 00369
        "本次重组",         // 09% 00297
        "拟购买资产",       // 07% 00221
        "本次重大资产重组", // 07% 00219
        "置入资产",         // 03% 00107
        "本次发行",         // 02% 00070
        "拟注入资产",       // 02% 00068
        "目标资产"          // 02% 00067
    };
    foreach (var Rplkey in ReplacementKeys)
    {
        // The most likely keys come first.
        foreach (var item in ReplacementDict)
        {
            // Fall back to the alternative separator only when the default one yields one piece.
            var keys = item.Key.Split(Utility.SplitChar);
            var keys2 = item.Key.Split("/");
            if (keys.Length == 1 && keys2.Length > 1)
            {
                keys = keys2;
            }
            var values = item.Value.Split(Utility.SplitChar);
            var values2 = item.Value.Split(";");
            if (values.Length == 1 && values2.Length > 1)
            {
                values = values2;
            }
            if (keys.Contains(Rplkey))
            {
                foreach (var value in values)
                {
                    var targetAndcompany = value.Trim();
                    // Separate the company name from the transaction target.
                    var ExpResult = ExtractPropertyByHTML.RegularExFinder(0, value, targetRegular, "|");
                    if (ExpResult.Count == 0)
                    {
                        // Other kinds of targets: "<company><target>".
                        foreach (var rc in ReplaceCompany)
                        {
                            if (targetAndcompany.StartsWith(rc))
                            {
                                // Fix: cut the trimmed string that StartsWith was tested on. Cutting
                                // the untrimmed value shifted the result by the amount of leading
                                // whitespace.
                                var extra = (targetAndcompany.Substring(rc.Length), rc);
                                if (!TargetAndCompanyList.Contains(extra))
                                {
                                    TargetAndCompanyList.Add(extra);
                                }
                                break;
                            }
                        }
                    }
                    else
                    {
                        foreach (var r in ExpResult)
                        {
                            // "|"-separated value: element 0 is the company, 1 and 2 form the target.
                            var arr = r.Value.Split("|");
                            var extra = (arr[1] + arr[2], arr[0]);
                            if (!TargetAndCompanyList.Contains(extra))
                            {
                                TargetAndCompanyList.Add(extra);
                            }
                        }
                    }
                }
                // Stop at the first entry that produced any pair.
                if (TargetAndCompanyList.Count != 0)
                {
                    return(TargetAndCompanyList);
                }
            }
        }
    }
    return(TargetAndCompanyList);
}
/// <summary>
/// Extracts (target, company) pairs from the explanation table held in <c>ExplainDict</c>.
/// </summary>
/// <remarks>
/// The keys in <paramref name="ExplainKeys"/> are tried in the given order; the pairs of the
/// first entry that produces any are returned.
/// </remarks>
/// <param name="CompanyAtExplainTable">Companies whose short and full names are used to recognise the owner of a target.</param>
/// <param name="ExplainKeys">Explanation-table keys to look for, in priority order.</param>
/// <returns>The pairs found, or an empty list.</returns>
private List <(string Target, string Comany)> ExtractFromExplainTable(List <struCompanyName> CompanyAtExplainTable, string[] ExplainKeys)
{
    var AllCompanyName = new List <String>();
    foreach (var item in CompanyAtExplainTable)
    {
        if (!String.IsNullOrEmpty(item.secShortName))
        {
            AllCompanyName.Add(item.secShortName);
        }
        if (!String.IsNullOrEmpty(item.secFullName))
        {
            AllCompanyName.Add(item.secFullName);
        }
    }
    // Feature for equity stakes: a company name, a percentage, then a trailing word.
    var targetRegular = new ExtractProperyBase.struRegularExpressFeature()
    {
        LeadingWordList = AllCompanyName,
        RegularExpress = RegularTool.PercentExpress,
        TrailingWordList = new string[] { "的股权", "股权", "的权益", "权益" }.ToList()
    };
    // Non-percentage targets recognised when the text also names a company.
    var OtherTargets = new string[] { "资产及负债", "直属资产" };
    var TargetAndCompanyList = new List <(string Target, string Comany)>();
    foreach (var Rplkey in ExplainKeys)
    {
        // The most likely keys come first.
        foreach (var item in ExplainDict)
        {
            // Fall back to the alternative separator only when the default one yields one piece.
            var keys = item.Key.Split(Utility.SplitChar);
            var keys2 = item.Key.Split("/");
            if (keys.Length == 1 && keys2.Length > 1)
            {
                keys = keys2;
            }
            var values = item.Value.Split(Utility.SplitChar);
            var values2 = item.Value.Split(";");
            if (values.Length == 1 && values2.Length > 1)
            {
                values = values2;
            }
            // Keys may begin with the character "拟", which has to be stripped.
            var SearchKey = keys.Select((x) => { return(x.StartsWith("拟") ? x.Substring(1) : x); });
            SearchKey = SearchKey.Select(x => x.Trim()).ToArray();
            if (SearchKey.Contains(Rplkey))
            {
                foreach (var targetRecordItem in values)
                {
                    var SingleItemList = Utility.CutByPOSConection(targetRecordItem);
                    foreach (var SingleItem in SingleItemList)
                    {
                        var targetAndcompany = SingleItem.Trim().Replace(" ", "");
                        // Separate the company name from the transaction target.
                        var ExpResult = ExtractPropertyByHTML.RegularExFinder(0, targetAndcompany, targetRegular, "|");
                        if (ExpResult.Count == 0)
                        {
                            // Other kinds of targets.
                            foreach (var rc in CompanyAtExplainTable)
                            {
                                var IsFullNameHit = false;
                                if (!String.IsNullOrEmpty(rc.secFullName) && targetAndcompany.Contains(rc.secFullName))
                                {
                                    foreach (var ot in OtherTargets)
                                    {
                                        if (targetAndcompany.Contains(ot))
                                        {
                                            IsFullNameHit = true;
                                            // Fix: the tuple is (Target, Comany); the elements were
                                            // previously added in the opposite order.
                                            TargetAndCompanyList.Add((ot, rc.secFullName));
                                            break;
                                        }
                                    }
                                }
                                if (!IsFullNameHit)
                                {
                                    if (!String.IsNullOrEmpty(rc.secShortName) && targetAndcompany.Contains(rc.secShortName))
                                    {
                                        foreach (var ot in OtherTargets)
                                        {
                                            if (targetAndcompany.Contains(ot))
                                            {
                                                IsFullNameHit = true;
                                                // Fix: same element order as above.
                                                TargetAndCompanyList.Add((ot, rc.secShortName));
                                                break;
                                            }
                                        }
                                    }
                                }
                                // "<company><target>" form, used only while nothing has been collected.
                                // Fix: the remainder is cut from the normalised string that StartsWith
                                // was tested on; cutting the raw item misaligned the result whenever
                                // the item contained spaces.
                                if (TargetAndCompanyList.Count == 0 && !String.IsNullOrEmpty(rc.secFullName) && targetAndcompany.StartsWith(rc.secFullName))
                                {
                                    var extra = (targetAndcompany.Substring(rc.secFullName.Length), rc.secFullName);
                                    if (!TargetAndCompanyList.Contains(extra))
                                    {
                                        TargetAndCompanyList.Add(extra);
                                    }
                                    break;
                                }
                                if (TargetAndCompanyList.Count == 0 && !String.IsNullOrEmpty(rc.secShortName) && targetAndcompany.StartsWith(rc.secShortName))
                                {
                                    var extra = (targetAndcompany.Substring(rc.secShortName.Length), rc.secShortName);
                                    if (!TargetAndCompanyList.Contains(extra))
                                    {
                                        TargetAndCompanyList.Add(extra);
                                    }
                                    break;
                                }
                            }
                        }
                        else
                        {
                            foreach (var r in ExpResult)
                            {
                                // "|"-separated value: element 0 is the company, 1 and 2 form the target.
                                var arr = r.Value.Split("|");
                                var extra = (arr[1] + arr[2], arr[0]);
                                if (!TargetAndCompanyList.Contains(extra))
                                {
                                    TargetAndCompanyList.Add(extra);
                                }
                            }
                        }
                    }
                }
                // Stop at the first entry that produced any pair.
                if (TargetAndCompanyList.Count != 0)
                {
                    return(TargetAndCompanyList);
                }
            }
        }
    }
    return(TargetAndCompanyList);
}
/// <summary>
/// Determines party A (甲方) of the contract, taking the already known party B into account.
/// </summary>
/// <remarks>
/// Labelled fields ("甲方:", "招标人:", "采购人:" ...) are trusted most: when at least one labelled
/// candidate exists, the first one is returned. Otherwise the result of <c>SearchJiaFang</c>
/// (when it differs from party B) is combined with candidates gathered from tender and
/// contract-signing wording.
/// </remarks>
/// <param name="YiFang">Party B as determined earlier; may be null or empty when unknown.</param>
/// <returns>The selected party A name.</returns>
string GetJiaFang(String YiFang)
{
    // Highest-confidence source: labelled fields.
    EntityProperty e = new EntityProperty();
    e.ExcludeContainsWordList = new string[] { "招标代理" };
    e.LeadingColonKeyWordList = new string[]
    {
        "甲方:", "合同买方:", "发包人:", "发包单位:", "发包方:", "发包机构:", "发包人名称:",
        "招标人:", "招标单位:", "招标方:", "招标机构:", "招标人名称:", "项目招标人:", "业主:",
        "业主单位:", "业主方:", "业主机构:", "业主名称:", "采购单位:", "采购单位名称:", "采购人:",
        "采购人名称:", "采购方:", "采购方名称:"
    };
    e.CandidatePreprocess = (x =>
    {
        x = Normalizer.ClearTrailing(x);
        return(CompanyNameLogic.AfterProcessFullName(x).secFullName);
    });
    e.MaxLength = 32;
    e.MaxLengthCheckPreprocess = Utility.TrimEnglish;
    e.MinLength = 3;
    e.Extract(this);
    // The list is deliberately not de-duplicated: repeated occurrences indicate higher confidence.
    // Several different names may mean that there is in fact no party A.
    if (e.LeadingColonKeyWordCandidate.Distinct().Count() > 1)
    {
        foreach (var candidate in e.LeadingColonKeyWordCandidate)
        {
            Program.Logger.WriteLine("发现多个甲方:" + candidate);
        }
    }
    if (e.LeadingColonKeyWordCandidate.Count > 0)
    {
        return(e.LeadingColonKeyWordCandidate[0]);
    }

    // Candidate from SearchJiaFang, expanded from short name to full name when possible.
    var ner = SearchJiaFang();
    var NerJia = String.Empty;
    if (!String.IsNullOrEmpty(ner))
    {
        foreach (var cn in companynamelist)
        {
            if (cn.secShortName == ner)
            {
                ner = cn.secFullName;
            }
        }
        // Accept it unless it is the same company as party B.
        // Fix: the two conditions are combined so that a null YiFang is no longer dereferenced
        // (previously YiFang.Equals ran even after the null-or-empty check had matched).
        if (String.IsNullOrEmpty(YiFang) || !YiFang.Equals(ner))
        {
            NerJia = ner;
        }
    }

    // Tender wording: "<start> ... <end>", e.g. "收到 ... 发来".
    var Extractor = new ExtractPropertyByHTML();
    var CandidateWord = new List <String>();
    var StartArray = new string[] { "招标单位", "业主", "收到", "接到" };
    var EndArray = new string[] { "发来", "发出", "的中标" };
    Extractor.StartEndFeature = Utility.GetStartEndStringArray(StartArray, EndArray);
    Extractor.Extract(root);
    foreach (var item in Extractor.CandidateWord)
    {
        var JiaFang = CompanyNameLogic.AfterProcessFullName(item.Value.Trim());
        if (JiaFang.secFullName.Contains("招标代理"))
        {
            continue;   // Business rule: tender agencies are never party A.
        }
        JiaFang.secFullName = JiaFang.secFullName.Replace("业主", String.Empty).Trim();
        JiaFang.secFullName = JiaFang.secFullName.Replace("招标单位", String.Empty).Trim();
        if (Utility.TrimEnglish(JiaFang.secFullName).Length > 32)
        {
            continue;
        }
        if (JiaFang.secFullName.Length < 3)
        {
            continue;   // The real length is used so that all-English names are rejected.
        }
        if (!Program.IsMultiThreadMode)
        {
            Program.Logger.WriteLine("甲方候补词(招标):[" + JiaFang.secFullName + "]");
        }
        CandidateWord.Add(JiaFang.secFullName);
    }

    // Contract wording: "与 ... 签署/签订".
    Extractor = new ExtractPropertyByHTML();
    StartArray = new string[] { "与", "与业主" };
    EndArray = new string[] { "签署", "签订" };
    Extractor.StartEndFeature = Utility.GetStartEndStringArray(StartArray, EndArray);
    Extractor.Extract(root);
    foreach (var item in Extractor.CandidateWord)
    {
        var JiaFang = CompanyNameLogic.AfterProcessFullName(item.Value.Trim());
        JiaFang.secFullName = JiaFang.secFullName.Replace("业主", String.Empty).Trim();
        if (JiaFang.secFullName.Contains("招标代理"))
        {
            continue;   // Business rule: tender agencies are never party A.
        }
        if (Utility.TrimEnglish(JiaFang.secFullName).Length > 32)
        {
            continue;
        }
        if (JiaFang.secFullName.Length < 3)
        {
            continue;   // The real length is used so that all-English names are rejected.
        }
        if (!Program.IsMultiThreadMode)
        {
            Program.Logger.WriteLine("甲方候补词(合同):[" + JiaFang.secFullName + "]");
        }
        CandidateWord.Add(JiaFang.secFullName);
    }

    if (!String.IsNullOrEmpty(NerJia))
    {
        // In principle the SearchJiaFang result wins. A phrase candidate that ends with it is
        // preferred, though, because the search result may be truncated, e.g.
        // "(集团)有限公司" where the real name is "XXXX(集团)有限公司".
        foreach (var c in CandidateWord)
        {
            if (c.EndsWith(NerJia))
            {
                return(c);
            }
        }
        return(NerJia);
    }
    else
    {
        return(CompanyNameLogic.MostLikeCompanyName(CandidateWord));
    }
}