/// <summary>
/// Searches the explanation dictionary for entries whose key list contains one of
/// <paramref name="ExplainKeys"/> and extracts "percentage + equity word" targets from the
/// entry values, pairing each target with the text that precedes it.
/// </summary>
/// <param name="ExplainKeys">Explanation keys to look for in each dictionary entry.</param>
/// <returns>
/// The distinct (target, company) pairs collected for the first dictionary entry that yields
/// any; an empty list when nothing is found.
/// </returns>
private List<(string Target, string Company)> ExtractExtend(string[] ExplainKeys)
{
    // Percentage expression that must be followed by one of the equity/share words.
    var equityFeature = new ExtractProperyBase.struRegularExpressFeature()
    {
        RegularExpress = RegularTool.PercentExpress,
        TrailingWordList = new List<string> { "的股权", "股权", "的权益", "权益", "的股份", "股份" }
    };

    // Holder phrases; only the text after them is kept as the company. Order matters:
    // the longer "持有的" has to be tried before "持有".
    var holderMarkers = new[] { "持有的", "持有", "所持" };

    // Original note: the most likely candidates come first.
    foreach (var entry in ExplainDict)
    {
        var found = new List<(string Target, string Company)>();

        // Fall back to '/' as the key separator when the default separator did not split anything.
        var keysByDefault = entry.Key.Split(Utility.SplitChar);
        var keysBySlash = entry.Key.Split('/');
        var keyParts = (keysByDefault.Length == 1 && keysBySlash.Length > 1) ? keysBySlash : keysByDefault;

        // Same fallback for the values, using ';'.
        var valuesByDefault = entry.Value.Split(Utility.SplitChar);
        var valuesBySemicolon = entry.Value.Split(";");
        var valueParts = (valuesByDefault.Length == 1 && valuesBySemicolon.Length > 1) ? valuesBySemicolon : valuesByDefault;

        foreach (var explainKey in ExplainKeys)
        {
            if (!keyParts.Contains(explainKey))
            {
                continue;
            }

            foreach (var rawValue in valueParts)
            {
                var compact = rawValue.Replace(" ", string.Empty);
                foreach (var fragment in compact.Split(Utility.SplitChar))
                {
                    foreach (var phrase in Utility.CutByPOSConection(fragment))
                    {
                        foreach (var hit in ExtractPropertyByHTML.RegularExFinder(0, phrase, equityFeature, "|"))
                        {
                            // hit.Value is '|'-separated; parts 1 and 2 together form the target text.
                            var parts = hit.Value.Split("|");
                            var target = parts[1] + parts[2];

                            // Everything in front of the match is taken as the company text.
                            var company = phrase.Substring(0, hit.StartIdx);
                            foreach (var marker in holderMarkers)
                            {
                                if (company.Contains(marker))
                                {
                                    company = Utility.GetStringAfter(company, marker);
                                }
                            }

                            found.Add((target, company));
                        }
                    }
                }
            }

            if (found.Count != 0)
            {
                return found.Distinct().ToList();
            }
        }
    }

    return new List<(string Target, string Company)>();
}
/// <summary>
/// Extracts (target, company) pairs from the explanation table.
/// </summary>
/// <remarks>
/// For every key in <paramref name="ExplainKeys"/> the explanation dictionary is scanned for an
/// entry whose key list contains that key (a leading "拟" on a dictionary key is ignored).
/// Each value of a matching entry is cut into single items, and each item is analysed:
/// first with the "company + percentage + equity word" expression, and when that finds nothing,
/// by looking for one of the fixed non-equity targets or for a company name at the start of the item.
/// </remarks>
/// <param name="CompanyAtExplainTable">
/// Companies whose short and full names are used both as leading words of the percentage
/// expression and for the fallback matching.
/// </param>
/// <param name="ExplainKeys">Explanation keys to search for, tried in the given order.</param>
/// <returns>
/// The pairs gathered from the first matching explanation entry that yields any;
/// an empty list when nothing could be extracted.
/// </returns>
private List<(string Target, string Comany)> ExtractFromExplainTable(List<struCompanyName> CompanyAtExplainTable, string[] ExplainKeys)
{
    var AllCompanyName = new List<String>();
    foreach (var item in CompanyAtExplainTable)
    {
        if (!String.IsNullOrEmpty(item.secShortName))
        {
            AllCompanyName.Add(item.secShortName);
        }
        if (!String.IsNullOrEmpty(item.secFullName))
        {
            AllCompanyName.Add(item.secFullName);
        }
    }

    // Equity extraction: company name + percentage + equity word.
    var targetRegular = new ExtractProperyBase.struRegularExpressFeature()
    {
        LeadingWordList = AllCompanyName,
        RegularExpress = RegularTool.PercentExpress,
        TrailingWordList = new string[] { "的股权", "股权", "的权益", "权益" }.ToList()
    };

    // Non-equity targets that are recognised literally.
    var OtherTargets = new string[] { "资产及负债", "直属资产" };
    var TargetAndCompanyList = new List<(string Target, string Comany)>();

    // Every result goes through this helper so the list never holds duplicates.
    void AddUnique((string Target, string Comany) pair)
    {
        if (!TargetAndCompanyList.Contains(pair))
        {
            TargetAndCompanyList.Add(pair);
        }
    }

    foreach (var Rplkey in ExplainKeys)
    {
        // Original note: the most likely candidates come first.
        foreach (var item in ExplainDict)
        {
            // Fall back to '/' as the key separator when the default separator did not split anything.
            var keys = item.Key.Split(Utility.SplitChar);
            var keys2 = item.Key.Split("/");
            if (keys.Length == 1 && keys2.Length > 1)
            {
                keys = keys2;
            }

            // Same fallback for the values, using ';'.
            var values = item.Value.Split(Utility.SplitChar);
            var values2 = item.Value.Split(";");
            if (values.Length == 1 && values2.Length > 1)
            {
                values = values2;
            }

            // Keys may carry a leading "拟" that has to be removed before comparing.
            var SearchKey = keys
                .Select(x => x.StartsWith("拟", StringComparison.Ordinal) ? x.Substring(1) : x)
                .Select(x => x.Trim())
                .ToArray();
            if (!SearchKey.Contains(Rplkey))
            {
                continue;
            }

            foreach (var targetRecordItem in values)
            {
                // Known problem cases of this cut, as listed by the original author:
                // "和" inside a company name, spaces, swap-in/swap-out wording,
                // "assets and liabilities", and "owned by"/"held by" phrases.
                var SingleItemList = Utility.CutByPOSConection(targetRecordItem);
                foreach (var SingleItem in SingleItemList)
                {
                    var targetAndcompany = SingleItem.Trim().Replace(" ", "");

                    // Separate the company name from the transaction target.
                    var ExpResult = ExtractPropertyByHTML.RegularExFinder(0, targetAndcompany, targetRegular, "|");
                    if (ExpResult.Count != 0)
                    {
                        foreach (var r in ExpResult)
                        {
                            // '|'-separated: part 0 is used as the company, parts 1 + 2 as the target.
                            var arr = r.Value.Split("|");
                            AddUnique((arr[1] + arr[2], arr[0]));
                        }
                        continue;
                    }

                    // Other kinds of target. Which literal target the item mentions does not
                    // depend on the company, so it is looked up once per item.
                    var otherTarget = OtherTargets.FirstOrDefault(ot => targetAndcompany.Contains(ot));
                    foreach (var rc in CompanyAtExplainTable)
                    {
                        var hasFullName = !String.IsNullOrEmpty(rc.secFullName);
                        var hasShortName = !String.IsNullOrEmpty(rc.secShortName);

                        if (otherTarget != null)
                        {
                            // The full name wins over the short name when both occur in the item.
                            string owner = null;
                            if (hasFullName && targetAndcompany.Contains(rc.secFullName))
                            {
                                owner = rc.secFullName;
                            }
                            else if (hasShortName && targetAndcompany.Contains(rc.secShortName))
                            {
                                owner = rc.secShortName;
                            }

                            if (owner != null)
                            {
                                // Tuple order is (Target, Comany): the target comes first.
                                AddUnique((otherTarget, owner));
                            }
                        }

                        // Last resort, only while nothing has been found at all: the item starts
                        // with a company name and the remainder is taken as the target. The
                        // remainder is cut from the same normalised string the prefix test ran
                        // on, so the offset is always consistent.
                        if (TargetAndCompanyList.Count == 0 && hasFullName &&
                            targetAndcompany.StartsWith(rc.secFullName, StringComparison.Ordinal))
                        {
                            AddUnique((targetAndcompany.Substring(rc.secFullName.Length), rc.secFullName));
                            break;
                        }
                        if (TargetAndCompanyList.Count == 0 && hasShortName &&
                            targetAndcompany.StartsWith(rc.secShortName, StringComparison.Ordinal))
                        {
                            AddUnique((targetAndcompany.Substring(rc.secShortName.Length), rc.secShortName));
                            break;
                        }
                    }
                }
            }

            if (TargetAndCompanyList.Count != 0)
            {
                return TargetAndCompanyList;
            }
        }
    }

    return TargetAndCompanyList;
}