/// <summary>
/// Program entry point. Opens the log writers, loads the dictionaries and runs the
/// training pre-processing step. The method currently returns right after
/// <c>Traning()</c>; the evaluation / extraction code below that return is kept but unreachable.
/// </summary>
/// <param name="args">Command-line arguments (not read by this method).</param>
static void Main(string[] args) {
    try {
        Logger = new StreamWriter("Log.log");
        // Global encoding: register the code-page encoding provider.
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
        // Dictionary of company full names, short names and former names.
        CompanyNameLogic.LoadCompanyName(@"Resources" + Path.DirectorySeparatorChar + "FDDC_announcements_company_name_20180531.json");
        // Read in the publish dates of the stock increase / decrease announcements.
        StockChange.ImportPublishTime();
        // Place-name correction dictionary for the Jieba segmenter.
        PosNS.ImportNS(@"Resources" + Path.DirectorySeparatorChar + "ns.dict");
        CIRecord = new StreamWriter("CI.log");
        // Pre-processing.
        Traning();
        return;

        // ---- Unreachable while the early return above is in place ----
        Evaluator = new StreamWriter("Evaluator.log");
        Score = new StreamWriter(@"Result" + Path.DirectorySeparatorChar + "Score" + Path.DirectorySeparatorChar + "score" + System.DateTime.Now.ToString("yyyyMMddHHmmss") + ".txt");
        // Debug shortcut (disabled): run the Contract extractor on one training HTML file and return.
        Extract();
    } finally {
        // Close (and thereby flush) every writer that was actually opened, on every exit
        // path. Previously the early return skipped the Close() calls, so buffered log
        // output could be lost.
        CIRecord?.Close();
        Score?.Close();
        Evaluator?.Close();
        Logger?.Close();
    }
}
/// <summary>
/// Program entry point. Opens the log writers and runs <c>QuickTestArea()</c>.
/// The method currently returns right after the quick test; the dictionary loading,
/// training and extraction code below that return is kept but unreachable.
/// </summary>
/// <param name="args">Command-line arguments (not read by this method).</param>
static void Main(string[] args) {
    try {
        if (Environment.OSVersion.Platform == System.PlatformID.Unix) {
            // The static variables are already fixed at this point, so the path cannot be
            // changed here; only report the value in use.
            Console.WriteLine("Switch Doc Path To:" + DocBase);
        }
        // Log file.
        Logger = new StreamWriter("Log.log");
        // Share the log writer with the entity-property extractor.
        EntityProperty.Logger = Logger;
        // Global encoding: register the code-page encoding provider.
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
        CIRecord = new StreamWriter("CI.log");
        QuickTestArea();
        return;

        // ---- Unreachable while the early return above is in place ----
        // Disabled: PDFToTXT.GetPdf2TxtBatchFile();
        // Dictionary of company full names, short names and former names.
        CompanyNameLogic.LoadCompanyName("Resources" + Path.DirectorySeparatorChar + "FDDC_announcements_company_name_20180531.json");
        // Place-name correction dictionary for the Jieba segmenter.
        PosNS.ImportNS("Resources" + Path.DirectorySeparatorChar + "ns.dict");
        // Pre-processing.
        Traning();
        Evaluator = new StreamWriter("Evaluator.log");
        Score = new StreamWriter("Result" + Path.DirectorySeparatorChar + "Score" + Path.DirectorySeparatorChar + "score" + System.DateTime.Now.ToString("yyyyMMddHHmmss") + ".txt");
        Extract();
    } finally {
        // Close (and thereby flush) every writer that was actually opened, on every exit
        // path. Previously the early return skipped the Close() calls, so buffered log
        // output could be lost.
        CIRecord?.Close();
        Score?.Close();
        Evaluator?.Close();
        Logger?.Close();
    }
}
/// <summary>
/// Quick test area: a scratch method for ad-hoc experiments.
/// Only the NER company-name lookup at the top runs; everything after the first
/// <c>return</c> is unreachable and kept for reference.
/// </summary>
private static void QuickTestArea() {
    var nerParagraphs = LTPTrainingNER.GetParagraghList(StockChangePath_TEST + "/ner/18877033.xml");
    CompanyNameLogic.GetCompanyNameByNerInfo(nerParagraphs);
    return;

    // ---- Unreachable scratch code ----
    var sampleSentence = "爱康科技向爱康实业、爱康国际、苏州度金、天地国际、钨业研究支付现金购买其合计持有爱康光电100%股权";
    var segmenter = new PosSegmenter();
    var segmentedWords = segmenter.Cut(sampleSentence);

    var separator = Path.DirectorySeparatorChar;
    var timestamp = System.DateTime.Now.ToString("yyyyMMddHHmmss");
    Evaluator = new StreamWriter("Evaluator.log");
    Score = new StreamWriter("Result" + separator + "Score" + separator + "score" + timestamp + ".txt");
    // Disabled: evaluate a reorganization result file, close Score / Evaluator,
    // and initialise the reorganization training data set.

    ReOrganizationTraning.EvaluateMethodList = new[] {
        "收益法", "资产基础法", "市场法", "市场比较法", "估值法", "成本法",
        "现金流折现法", "现金流折现法", "剩余法", "内含价值调整法", "可比公司市净率法", "重置成本法",
        "收益现值法", "基础资产法", "假设清偿法", "成本逼近法", "单项资产加和法", "成本加和法",
        "基准地价修正法", "收益还原法", "现金流量法", "单项资产加总法", "折现现金流量法", "基准地价系数修正法"
    }.ToList();

    // Disabled: the TextFileName / NerXMLFileName assignments for this sample.
    var sample = new Reorganization {
        Id = "748379",
        HTMLFileName = ReorganizationPath_TEST + "/html/1759374.html"
    };
    sample.Init();
    var extracted = sample.Extract();
    var firstRecordText = extracted[0].ConvertToString();
}
/// <summary>
/// Resolves a company name to a (full name, short name) pair.
/// The document's own company list is searched first, then the global dictionary,
/// and finally the document text via <c>GetShortNameByFullName</c>.
/// </summary>
/// <param name="doc">Announcement document whose <c>companynamelist</c> is searched.</param>
/// <param name="word">Raw company name (full or short); spaces are removed before matching.</param>
/// <returns>
/// The resolved full name and short name. Both are empty when <paramref name="word"/>
/// is null, empty or consists only of spaces.
/// </returns>
public static (String FullName, String ShortName) NormalizeCompanyName(AnnouceDocument doc, string word) {
    if (String.IsNullOrEmpty(word)) {
        return (String.Empty, String.Empty);
    }
    var fullname = word.Replace(" ", String.Empty);
    if (fullname.Length == 0) {
        // A spaces-only word would otherwise match an arbitrary entry below:
        // Contains("") is always true and "" equals the short name of entries that have none.
        return (String.Empty, String.Empty);
    }
    var shortname = String.Empty;
    foreach (var companyname in doc.companynamelist) {
        if (companyname.secFullName == fullname) {
            // Note: two entries may share the same full name while one of them has no short name.
            if (shortname == String.Empty && !String.IsNullOrEmpty(companyname.secShortName)) {
                shortname = companyname.secShortName;
                break;
            }
        }
        if (companyname.secShortName == fullname) {
            fullname = companyname.secFullName;
            shortname = companyname.secShortName;
            break;
        }
        // The input may be a short name while the extracted company info only holds the
        // full name: infer the relationship when the full name strictly contains the input.
        // (No break here: later entries may still give an exact match.)
        if (companyname.secFullName != null
            && companyname.secFullName.Contains(fullname)
            && companyname.secFullName.Length > fullname.Length) {
            fullname = companyname.secFullName;
            shortname = word;
        }
    }
    if (string.IsNullOrEmpty(shortname)) {
        // Fall back to the global dictionary.
        shortname = CompanyNameLogic.GetCompanyNameByFullName(fullname).secShortName;
    }
    if (string.IsNullOrEmpty(shortname)) {
        // Last resort: look the name up in the document text itself
        // (original intent: find the name followed by a quoted "short name" marker).
        shortname = GetShortNameByFullName(fullname, doc);
        if (!string.IsNullOrEmpty(shortname)) {
            Console.WriteLine(fullname + ":" + shortname);
        }
    }
    return (fullname, shortname);
}
/// <summary>
/// Resolves a company name to a (full name, short name) pair.
/// The document's own company list is searched first, then the global dictionary.
/// </summary>
/// <param name="doc">Announcement document whose <c>companynamelist</c> is searched.</param>
/// <param name="word">Raw company name (full or short); spaces are removed before matching.</param>
/// <returns>
/// The resolved full name and short name. Both are empty when <paramref name="word"/>
/// is null, empty or consists only of spaces.
/// </returns>
public static (String FullName, String ShortName) NormalizeCompanyName(AnnouceDocument doc, string word) {
    if (String.IsNullOrEmpty(word)) {
        return (String.Empty, String.Empty);
    }
    var fullname = word.Replace(" ", String.Empty);
    if (fullname.Length == 0) {
        // A spaces-only word would otherwise match an arbitrary entry below:
        // Contains("") is always true and "" equals the short name of entries that have none.
        return (String.Empty, String.Empty);
    }
    var shortname = String.Empty;
    foreach (var companyname in doc.companynamelist) {
        if (companyname.secFullName == fullname) {
            // Note: two entries may share the same full name while one of them has no short name.
            if (shortname == String.Empty && !String.IsNullOrEmpty(companyname.secShortName)) {
                shortname = companyname.secShortName;
                break;
            }
        }
        if (companyname.secShortName == fullname) {
            fullname = companyname.secFullName;
            shortname = companyname.secShortName;
            break;
        }
        // The input may be a short name while the extracted company info only holds the
        // full name: infer the relationship when the full name strictly contains the input.
        // (No break here: later entries may still give an exact match.)
        if (companyname.secFullName != null
            && companyname.secFullName.Contains(fullname)
            && companyname.secFullName.Length > fullname.Length) {
            fullname = companyname.secFullName;
            shortname = word;
        }
    }
    if (shortname == String.Empty) {
        // Fall back to the global dictionary.
        shortname = CompanyNameLogic.GetCompanyNameByFullName(fullname).secShortName;
    }
    return (fullname, shortname);
}
/// <summary>
/// Builds one contract record for the current announcement: detects the contract type,
/// then fills Party B, Party A, project name, contract name, amount and consortium
/// members, applying the clean-up rules for each field in turn.
/// </summary>
/// <returns>The populated <c>ContractRec</c>.</returns>
ContractRec ExtractSingle() {
    // Contract type: decided by the first node whose content mentions "中标" or "合同".
    contractType = String.Empty;
    string DetectContractType() {
        foreach (var paragraph in root.Children) {
            foreach (var node in paragraph.Children) {
                if (node.Content.Contains("中标")) {
                    return "中标";
                }
                if (node.Content.Contains("合同")) {
                    return "合同";
                }
            }
        }
        return String.Empty;
    }
    contractType = DetectContractType();
    if (contractType == String.Empty) {
        Console.WriteLine("contractType Null:" + Id);
    }

    var contract = new ContractRec();
    // Announcement ID
    contract.Id = Id;

    // Party B
    var yiFang = GetYiFang();
    if (yiFang.Contains("本公司")) {
        yiFang = string.Empty;
    }
    yiFang = CompanyNameLogic.AfterProcessFullName(yiFang).secFullName;
    yiFang = yiFang.NormalizeTextResult();
    // Brackets are removed as required by the rules.
    yiFang = RegularTool.TrimBrackets(yiFang);
    if (yiFang.Length < 3) {
        yiFang = string.Empty;
    }
    contract.YiFang = yiFang;

    // Party A
    var jiaFang = GetJiaFang(contract.YiFang);
    if (jiaFang.Contains("本公司")) {
        jiaFang = string.Empty;
    }
    jiaFang = CompanyNameLogic.AfterProcessFullName(jiaFang).secFullName;
    jiaFang = jiaFang.NormalizeTextResult();
    if (jiaFang.Contains("简称")) {
        jiaFang = Utility.GetStringBefore(jiaFang, "(");
    }
    // Organisation list from NER: fall back to "国家电网公司" when the extracted
    // Party A is not a recognised organisation but that one is.
    if (Nerlist != null) {
        var organisationNames = Nerlist
            .Where(n => n.Type == LTPTrainingNER.enmNerType.Ni)
            .Select(n => n.RawData)
            .ToList();
        if (!organisationNames.Contains(jiaFang) && organisationNames.Contains("国家电网公司")) {
            jiaFang = "国家电网公司";
        }
    }
    contract.JiaFang = jiaFang;

    // Project name
    var projectName = GetProjectName().NormalizeTextResult();
    if (projectName.StartsWith("“") && projectName.EndsWith("”")) {
        projectName = projectName.TrimStart('“').TrimEnd('”');
    }
    // NOTE(review): the name ENDS with the marker yet GetStringAfter is used; this
    // presumably yields an empty string. Confirm whether GetStringBefore was intended.
    if (projectName.EndsWith(",签约双方")) {
        projectName = Utility.GetStringAfter(projectName, ",签约双方");
    }
    if (projectName.Contains("(以下简称")) {
        projectName = Utility.GetStringAfter(projectName, "(以下简称");
    }
    if (projectName.EndsWith(")")) {
        if (projectName.Contains("(招标编号")) {
            projectName = Utility.GetStringBefore(projectName, "(招标编号");
        }
        if (projectName.Contains("(合同编号")) {
            projectName = Utility.GetStringBefore(projectName, "(合同编号");
        }
    }
    // Special-case clean-ups.
    projectName = projectName
        .Replace("的推荐中标", "")
        .Replace("<1>", "1、")
        .Replace("“", "")
        .Replace("”", "");
    contract.ProjectName = projectName;

    // Contract name
    var contractName = GetContractName();
    if (contractName.StartsWith("“") && contractName.EndsWith("”")) {
        contractName = contractName.TrimStart('“').TrimEnd('”');
    }
    // Strip the book-title marks.
    contractName = contractName.Replace("《", String.Empty).Replace("》", String.Empty);
    contractName = contractName.NormalizeTextResult();
    if (contractName.Contains("(以下简称")) {
        contractName = Utility.GetStringAfter(contractName, "(以下简称");
    }
    contractName = ExtendContractName(contractName);
    contract.ContractName = contractName;

    // For a purchase agreement the project name is cleared, unless it names a bid
    // section (that case is still a TODO in the original and leaves the name untouched).
    if (contract.ContractName.Contains("采购") && !contract.ProjectName.Contains("标段")) {
        contract.ProjectName = string.Empty;
    }

    // Amount: upper and lower limit are both set to the same formatted value.
    var money = GetMoney();
    var formattedAmount = MoneyUtility.Format(money.MoneyAmount, String.Empty);
    contract.ContractMoneyUpLimit = formattedAmount;
    contract.ContractMoneyDownLimit = contract.ContractMoneyUpLimit;

    // Consortium members
    var unionMember = GetUnionMember(contract);
    unionMember = unionMember.NormalizeTextResult();
    // Brackets are removed as required by the rules.
    contract.UnionMember = RegularTool.TrimBrackets(unionMember);

    // When Party B holds several names, the first stays Party B and the rest
    // becomes the consortium members.
    var yiFangParts = contract.YiFang.Split(Utility.SplitChar);
    if (yiFangParts.Length > 1) {
        contract.UnionMember = Utility.GetStringAfter(contract.YiFang, Utility.SplitChar);
        contract.YiFang = yiFangParts[0];
        Console.WriteLine("联合体:" + contract.UnionMember);
    }
    return contract;
}
/// <summary>
/// Gets Party A of the contract.
/// The leading-colon keyword extraction is tried first and its first candidate is
/// returned when there is one. Otherwise candidates are collected from the bid-notice
/// and contract-signing sentence patterns and the most likely company name is returned.
/// </summary>
/// <returns>
/// The first leading-colon candidate, or the result of
/// <c>CompanyNameLogic.MostLikeCompanyName</c> over the pattern candidates.
/// </returns>
public string GetJiaFang() {
    // Highest-confidence extraction.
    EntityProperty e = new EntityProperty();
    e.ExcludeContainsWordList = new string[] { "招标代理" };
    e.LeadingColonKeyWordList = new string[] {
        "甲方:", "合同买方:",
        "发包人:", "发包单位:", "发包方:", "发包机构:", "发包人名称:",
        "招标人:", "招标单位:", "招标方:", "招标机构:", "招标人名称:",
        "业主:", "业主单位:", "业主方:", "业主机构:", "业主名称:",
        "采购单位:", "采购单位名称:", "采购人:", "采购人名称:", "采购方:", "采购方名称:"
    };
    e.CandidatePreprocess = (x => {
        x = Normalizer.ClearTrailing(x);
        return CompanyNameLogic.AfterProcessFullName(x).secFullName;
    });
    e.MaxLength = ContractTraning.JiaFangES.MaxLength;
    e.MaxLengthCheckPreprocess = Utility.TrimEnglish;
    e.MinLength = 3;
    e.Extract(this);

    // No Distinct is applied to the candidates themselves: the more often a value
    // occurs, the more credible it is. Several different values may mean that there
    // is no Party A at all, so they are logged.
    // The shared logger is only written outside multi-thread mode, like every other
    // log call in this method.
    if (!Program.IsMultiThreadMode && e.LeadingColonKeyWordCandidate.Distinct().Count() > 1) {
        foreach (var candidate in e.LeadingColonKeyWordCandidate) {
            Program.Logger.WriteLine("发现多个甲方:" + candidate);
        }
    }
    if (e.LeadingColonKeyWordCandidate.Count > 0) {
        return e.LeadingColonKeyWordCandidate[0];
    }

    var CandidateWord = new List<String>();

    // Applies the length filters shared by both patterns, logs and stores the candidate.
    void AddCandidate(string name, string logPrefix) {
        if (Utility.TrimEnglish(name).Length > ContractTraning.JiaFangES.MaxLength) {
            return;
        }
        if (name.Length < 3) {
            return;     // The real length is used here to rule out all-English values.
        }
        if (!Program.IsMultiThreadMode) {
            Program.Logger.WriteLine(logPrefix + name + "]");
        }
        CandidateWord.Add(name);
    }

    // Bid-notice pattern.
    var Extractor = new ExtractPropertyByHTML();
    var StartArray = new string[] { "招标单位", "业主", "收到", "接到" };
    var EndArray = new string[] { "发来", "发出", "的中标" };
    Extractor.StartEndFeature = Utility.GetStartEndStringArray(StartArray, EndArray);
    Extractor.Extract(root);
    foreach (var item in Extractor.CandidateWord) {
        var name = CompanyNameLogic.AfterProcessFullName(item.Value.Trim()).secFullName;
        if (name.Contains("招标代理")) {
            continue;   // Special business rule: bidding agents are never Party A.
        }
        name = name.Replace("业主", String.Empty).Trim();
        name = name.Replace("招标单位", String.Empty).Trim();
        AddCandidate(name, "甲方候补词(招标):[");
    }

    // Contract-signing pattern.
    Extractor = new ExtractPropertyByHTML();
    StartArray = new string[] { "与", "与业主" };
    EndArray = new string[] { "签署", "签订" };
    Extractor.StartEndFeature = Utility.GetStartEndStringArray(StartArray, EndArray);
    Extractor.Extract(root);
    foreach (var item in Extractor.CandidateWord) {
        var name = CompanyNameLogic.AfterProcessFullName(item.Value.Trim()).secFullName;
        // The order differs from the bid pattern on purpose: the marker is removed
        // BEFORE the agent check, exactly as in the original code.
        name = name.Replace("业主", String.Empty).Trim();
        if (name.Contains("招标代理")) {
            continue;   // Special business rule: bidding agents are never Party A.
        }
        AddCandidate(name, "甲方候补词(合同):[");
    }
    return CompanyNameLogic.MostLikeCompanyName(CandidateWord);
}
/// <summary>
/// Post-processes a raw company-name string: captures a quoted short name from a
/// "hereinafter referred to as" bracket, cuts that bracket and other trailing /
/// leading noise from the full name, and maps a known short name to its full name.
/// </summary>
/// <param name="FullName">Raw company name text; <c>null</c> is treated as empty.</param>
/// <returns>
/// A <c>struCompanyName</c> with the cleaned full name. When a short name was captured
/// it is returned too with <c>Score = 80</c>; otherwise <c>Score = 60</c>.
/// </returns>
public static struCompanyName AfterProcessFullName(string FullName) {
    if (FullName == null) {
        // Previously a null input threw on the first Contains call.
        FullName = String.Empty;
    }
    var ShortName = String.Empty;
    var CompanyNameTrailingwords = new string[] {
        "(以下简称", "(下称", "(以下称", "(简称",
        "(以下简称", "(下称", "(以下称", "(简称"
    };
    // Brackets are not normalised for now.
    foreach (var trailing in CompanyNameTrailingwords) {
        if (FullName.Contains(trailing)) {
            // Capture the short name from the quoted text inside the brackets.
            var BracketsList = RegularTool.GetChineseBrackets(FullName);
            foreach (var bracketItem in BracketsList) {
                var ShortNameList = RegularTool.GetChineseQuotation(bracketItem);
                if (ShortNameList.Count > 0) {
                    ShortName = ShortNameList.First();
                    if (!String.IsNullOrEmpty(ShortName)) {
                        // Drop the first and last character (presumably the quotation
                        // marks). A match shorter than two characters cannot hold both,
                        // and Substring(1, -1) would throw.
                        ShortName = ShortName.Length >= 2
                            ? ShortName.Substring(1, ShortName.Length - 2)
                            : String.Empty;
                    }
                }
            }
            FullName = Utility.GetStringBefore(FullName, trailing);
        }
    }
    if (FullName.Contains("及其")) {
        FullName = Utility.GetStringBefore(FullName, "及其");
    }
    if (FullName.Contains("股东")) {
        FullName = Utility.GetStringAfter(FullName, "股东");
    }
    if (FullName.Contains("一致行动人")) {
        FullName = Utility.GetStringAfter(FullName, "一致行动人");
    }
    // If the text is a known short name, use the dictionary's full name (single lookup).
    var dictionaryFullName = CompanyNameLogic.GetCompanyNameByShortName(FullName).secFullName;
    if (!String.IsNullOrEmpty(dictionaryFullName)) {
        FullName = dictionaryFullName;
    }
    // Remove leading noise.
    FullName = EntityWordAnlayzeTool.TrimLeadingUL(FullName);
    FullName = CutOtherLeadingWords(FullName);
    if (ShortName != String.Empty) {
        return new struCompanyName() {
            secFullName = FullName,
            secShortName = ShortName,
            Score = 80
        };
    } else {
        return new struCompanyName() {
            secFullName = FullName,
            Score = 60
        };
    }
}
/// <summary>
/// Builds one contract record: detects the contract type, then fills Party A,
/// Party B, project name, contract name, amount and consortium members.
/// </summary>
/// <param name="root">Root HTML node of the announcement, scanned for the contract type.</param>
/// <param name="Id">Announcement ID stored on the record.</param>
/// <returns>The populated <c>struContract</c>.</returns>
struContract ExtractSingle(MyRootHtmlNode root, String Id) {
    // Contract type: decided by the first node whose content mentions "中标" or "合同".
    contractType = String.Empty;
    foreach (var paragrah in root.Children) {
        foreach (var item in paragrah.Children) {
            if (item.Content.Contains("中标")) {
                contractType = "中标";
                break;
            }
            if (item.Content.Contains("合同")) {
                contractType = "合同";
                break;
            }
        }
        if (contractType != String.Empty) {
            break;
        }
    }
    if (contractType == String.Empty) {
        Console.WriteLine("contractType Null:" + Id);
    }

    var contract = new struContract();
    // Announcement ID
    contract.id = Id;

    // Party A
    contract.JiaFang = GetJiaFang();
    contract.JiaFang = CompanyNameLogic.AfterProcessFullName(contract.JiaFang).secFullName;
    contract.JiaFang = contract.JiaFang.NormalizeTextResult();
    // Nerlist may not be available; skip the fallback instead of throwing.
    if (Nerlist != null && !Nerlist.Contains(contract.JiaFang)) {
        // As a special organisation, "国家电网公司" is normally Party A.
        if (Nerlist.Contains("国家电网公司")) {
            contract.JiaFang = "国家电网公司";
        }
    }

    // Party B
    contract.YiFang = GetYiFang();
    contract.YiFang = CompanyNameLogic.AfterProcessFullName(contract.YiFang).secFullName;
    contract.YiFang = contract.YiFang.NormalizeTextResult();
    // Brackets are removed as required by the rules.
    contract.YiFang = RegularTool.TrimBrackets(contract.YiFang);

    // Project name
    contract.ProjectName = GetProjectName();
    if (contract.ProjectName.StartsWith("“") && contract.ProjectName.EndsWith("”")) {
        contract.ProjectName = contract.ProjectName.TrimStart("“".ToCharArray()).TrimEnd("”".ToCharArray());
    }
    // NOTE(review): the name ENDS with the marker yet GetStringAfter is used; this
    // presumably yields an empty string. Confirm whether GetStringBefore was intended.
    if (contract.ProjectName.EndsWith(",签约双方")) {
        contract.ProjectName = Utility.GetStringAfter(contract.ProjectName, ",签约双方");
    }
    if (contract.ProjectName.Contains("(以下简称")) {
        contract.ProjectName = Utility.GetStringAfter(contract.ProjectName, "(以下简称");
    }
    contract.ProjectName = contract.ProjectName.NormalizeTextResult();

    // Contract name
    if (contractType == "中标") {
        // According to the data analysis, the project name is filled in for bid
        // notices and the contract name for contracts.
        contract.ContractName = String.Empty;
    } else {
        contract.ContractName = GetContractName();
        if (contract.ContractName.StartsWith("“") && contract.ContractName.EndsWith("”")) {
            contract.ContractName = contract.ContractName.TrimStart("“".ToCharArray()).TrimEnd("”".ToCharArray());
        }
        // Strip the book-title marks.
        contract.ContractName = contract.ContractName.Replace("《", String.Empty).Replace("》", String.Empty);
        if (contract.ContractName.Contains("(以下简称")) {
            contract.ContractName = Utility.GetStringAfter(contract.ContractName, "(以下简称");
        }
        contract.ContractName = contract.ContractName.NormalizeTextResult();
    }

    // Amount: upper and lower limit are both set to the same formatted value.
    var money = GetMoney();
    contract.ContractMoneyUpLimit = MoneyUtility.Format(money.MoneyAmount, String.Empty);
    contract.ContractMoneyDownLimit = contract.ContractMoneyUpLimit;

    // Consortium members
    contract.UnionMember = GetUnionMember(contract.JiaFang, contract.YiFang);
    contract.UnionMember = contract.UnionMember.NormalizeTextResult();
    // Brackets are removed as required by the rules.
    contract.UnionMember = RegularTool.TrimBrackets(contract.UnionMember);
    return contract;
}
/// <summary>
/// Extracts stock-change records from the announcement's tables by matching column
/// header titles (holder name, change end date, change price, change number).
/// If no table has a holder column, the search is repeated without it and the holder
/// name is taken from <c>GetHolderName()</c>. For each holder, the record with the
/// latest end date is then completed with the holding figures from <c>GetHolderAfter()</c>.
/// </summary>
/// <returns>The extracted records; an empty list when nothing usable is found.</returns>
List<RecordBase> ExtractFromTable() {
    var StockHolderRule = new TableSearchTitleRule();
    StockHolderRule.Name = "股东全称";
    StockHolderRule.Title = new string[] { "股东名称", "名称", "增持主体", "增持人", "减持主体", "减持人", "姓名" }.ToList();
    StockHolderRule.IsTitleEq = true;
    StockHolderRule.IsRequire = true;

    var ChangeDateRule = new TableSearchTitleRule();
    ChangeDateRule.Name = "变动截止日期";
    ChangeDateRule.Title = new string[] {
        "买卖时间", "日期", "减持期间", "增持期间", "减持股份期间", "增持股份期间",
        "减持时间", "增持时间", "减持股份时间", "增持股份时间", "买入时间", "卖出时间"
    }.ToList();
    ChangeDateRule.IsTitleEq = false;
    ChangeDateRule.Normalize = NormailizeEndChangeDate;

    var ChangePriceRule = new TableSearchTitleRule();
    ChangePriceRule.Name = "变动价格";
    ChangePriceRule.Title = new string[] {
        "买入均价", "卖出均价", "成交均价", "减持价格", "增持价格",
        "减持股均价", "增持股均价", "减持均", "增持均", "价格区间"
    }.ToList();
    ChangePriceRule.IsTitleEq = false;
    ChangePriceRule.Normalize = (x, y) => {
        var prices = RegularTool.GetRegular(x, RegularTool.MoneyExpress);
        if (prices.Count == 0) {
            if (x.Contains("元")) {
                return Utility.GetStringBefore(x, "元");
            }
        } else {
            // For a price range take the highest price, assuming that the last
            // number found is the largest one.
            return prices.Last().RawData;
        }
        return x;
    };

    var ChangeNumberRule = new TableSearchTitleRule();
    ChangeNumberRule.Name = "变动数量";
    ChangeNumberRule.Title = new string[] {
        "成交数量", "减持股数", "增持股数", "减持数量", "增持数量", "买入股份数", "卖出股份数", "股数"
    }.ToList();
    ChangeNumberRule.IsTitleEq = false;
    ChangeNumberRule.Normalize = NumberUtility.NormalizerStockNumber;

    var Rules = new List<TableSearchTitleRule>();
    Rules.Add(StockHolderRule);
    Rules.Add(ChangeDateRule);
    Rules.Add(ChangePriceRule);
    Rules.Add(ChangeNumberRule);
    var result = HTMLTable.GetMultiInfoByTitleRules(root, Rules, false);

    if (result.Count == 0) {
        // Nothing was extracted: retry without the holder column, requiring the date column.
        Rules.Clear();
        ChangeDateRule.IsRequire = true;
        Rules.Add(ChangeDateRule);
        Rules.Add(ChangePriceRule);
        Rules.Add(ChangeNumberRule);
        result = HTMLTable.GetMultiInfoByTitleRules(root, Rules, false);
        if (result.Count == 0) {
            return new List<RecordBase>();
        }
        var NewResult = new List<CellInfo[]>();
        var Name = GetHolderName();
        if (String.IsNullOrEmpty(Name.FullName) && String.IsNullOrEmpty(Name.ShortName)) {
            return new List<RecordBase>();
        }
        // Prepend the holder name so the rows have the same four-column layout as above.
        foreach (var item in result) {
            NewResult.Add(new CellInfo[] {
                new CellInfo() { RawData = String.IsNullOrEmpty(Name.FullName) ? Name.ShortName : Name.FullName },
                item[0],
                item[1],
                item[2]
            });
        }
        result = NewResult;
    }

    var holderafterlist = GetHolderAfter();
    var stockchangelist = new List<RecordBase>();
    foreach (var rec in result) {
        var stockchange = new StockChangeRec();
        stockchange.Id = Id;

        var ModifyName = rec[0].RawData;
        // A long name in a table may have been cut by a page break;
        // the holdings-after-change list is used to restore it.
        if (!holderafterlist.Select((z) => { return z.Name; }).ToList().Contains(ModifyName)) {
            foreach (var item in holderafterlist) {
                if (item.Name.EndsWith("先生")) {
                    break;      // Special handling with no underlying logic (per the original author).
                }
                if (item.Name.StartsWith(ModifyName) && !item.Name.Equals(ModifyName)) {
                    ModifyName = item.Name;
                    break;
                }
                if (item.Name.EndsWith(ModifyName) && !item.Name.Equals(ModifyName)) {
                    ModifyName = item.Name;
                    break;
                }
            }
        }

        var Name = CompanyNameLogic.NormalizeCompanyName(this, ModifyName);
        stockchange.HolderFullName = Name.FullName.NormalizeTextResult();
        stockchange.HolderShortName = Name.ShortName;
        if (stockchange.HolderFullName.Contains("简称")) {
            stockchange.HolderShortName = Utility.GetStringAfter(stockchange.HolderFullName, "简称");
            stockchange.HolderShortName = stockchange.HolderShortName.Replace(")", String.Empty).Replace("“", String.Empty).Replace("”", String.Empty);
            stockchange.HolderFullName = Utility.GetStringBefore(stockchange.HolderFullName, "(");
        }

        stockchange.ChangeEndDate = rec[1].RawData;
        if (!DateTime.TryParse(stockchange.ChangeEndDate, out _)) {
            // The date cannot be parsed: outside debug mode it is dropped.
            if (!Program.IsDebugMode) {
                stockchange.ChangeEndDate = String.Empty;
            }
        }

        if (!String.IsNullOrEmpty(rec[2].RawData)) {
            // Price ranges are ignored.
            if (!(rec[2].RawData.Contains("-") || rec[2].RawData.Contains("~") || rec[2].RawData.Contains("至"))) {
                stockchange.ChangePrice = rec[2].RawData.Replace(" ", String.Empty);
                stockchange.ChangePrice = stockchange.ChangePrice.Replace("*", "");
                stockchange.ChangePrice = stockchange.ChangePrice.NormalizeNumberResult();
            }
        }
        if (!RegularTool.IsUnsign(stockchange.ChangePrice)) {
            if (!String.IsNullOrEmpty(stockchange.ChangePrice)) {
                Console.WriteLine("Error ChangePrice:[" + stockchange.ChangePrice + "]");
            }
            stockchange.ChangePrice = String.Empty;
        }

        if (!String.IsNullOrEmpty(rec[3].RawData)) {
            stockchange.ChangeNumber = rec[3].RawData.Replace(" ", String.Empty);
            stockchange.ChangeNumber = stockchange.ChangeNumber.NormalizeNumberResult();
            if (!RegularTool.IsUnsign(stockchange.ChangeNumber)) {
                if (!String.IsNullOrEmpty(stockchange.ChangeNumber)) {
                    Console.WriteLine("Error ChangeNumber:[" + stockchange.ChangeNumber + "]");
                }
                stockchange.ChangeNumber = String.Empty;
            }
        }

        // Nearly every valid record has a holder name and an end date. Dropping the
        // others may hurt the few records without an end date but helps the overall score.
        if (string.IsNullOrEmpty(stockchange.HolderFullName) || string.IsNullOrEmpty(stockchange.ChangeEndDate)) {
            continue;
        }
        if (stockchange.ChangeNumber == "0" || stockchange.ChangePrice == "0") {
            continue;
        }
        stockchangelist.Add(stockchange);
    }

    // Sort key for the end date; unparsable values (possible in debug mode) sort first.
    DateTime EndDateKey(RecordBase record) {
        return DateTime.TryParse(((StockChangeRec)record).ChangeEndDate, out var parsed) ? parsed : DateTime.MinValue;
    }

    // For every distinct holder, attach the holdings after the change to the latest record.
    var namelist = stockchangelist.Select(x => ((StockChangeRec)x).HolderFullName).Distinct().ToList();
    var newRec = new List<StockChangeRec>();
    foreach (var name in namelist) {
        var stocklist = stockchangelist.Where((x) => { return ((StockChangeRec)x).HolderFullName == name; }).ToList();
        // The original comparator compared x with x (always 0), so the list was never
        // ordered by date. OrderBy is stable: records with equal dates keep table order.
        var last = (StockChangeRec)stocklist.OrderBy(EndDateKey).Last();
        for (int i = 0; i < holderafterlist.Count; i++) {
            var after = holderafterlist[i];
            after.Name = after.Name.Replace(" ", "");
            if (after.Name == last.HolderFullName || after.Name == last.HolderShortName) {
                // The record is removed here and re-added (updated) after the loop.
                stockchangelist.Remove(last);
                last.HoldNumberAfterChange = after.Count;
                last.HoldPercentAfterChange = after.Percent;
                newRec.Add(last);
            }
        }
    }

    if (holderafterlist.Count != namelist.Count) {
        if (!Program.IsMultiThreadMode) {
            Program.Logger.WriteLine("增持者数量确认!");
        }
    }
    stockchangelist.AddRange(newRec);
    return stockchangelist;
}
/// <summary>
/// Gets Party A of the contract.
/// The leading-colon keyword extraction is tried first and its first candidate is
/// returned when there is one. Otherwise a NER-based Party A (from <c>SearchJiaFang</c>)
/// is combined with candidates collected from the bid-notice and contract-signing
/// sentence patterns.
/// </summary>
/// <param name="YiFang">
/// Party B as already extracted; a NER result equal to it is not used as Party A.
/// May be null or empty.
/// </param>
/// <returns>
/// The first leading-colon candidate; else the pattern candidate ending with the NER
/// result, or the NER result itself; else the result of
/// <c>CompanyNameLogic.MostLikeCompanyName</c> over the pattern candidates.
/// </returns>
string GetJiaFang(String YiFang) {
    // Highest-confidence extraction.
    EntityProperty e = new EntityProperty();
    e.ExcludeContainsWordList = new string[] { "招标代理" };
    e.LeadingColonKeyWordList = new string[] {
        "甲方:", "合同买方:",
        "发包人:", "发包单位:", "发包方:", "发包机构:", "发包人名称:",
        "招标人:", "招标单位:", "招标方:", "招标机构:", "招标人名称:", "项目招标人:",
        "业主:", "业主单位:", "业主方:", "业主机构:", "业主名称:",
        "采购单位:", "采购单位名称:", "采购人:", "采购人名称:", "采购方:", "采购方名称:"
    };
    e.CandidatePreprocess = (x => {
        x = Normalizer.ClearTrailing(x);
        return CompanyNameLogic.AfterProcessFullName(x).secFullName;
    });
    e.MaxLength = 32;
    e.MaxLengthCheckPreprocess = Utility.TrimEnglish;
    e.MinLength = 3;
    e.Extract(this);

    // No Distinct is applied to the candidates themselves: the more often a value
    // occurs, the more credible it is. Several different values may mean that there
    // is no Party A at all, so they are logged.
    // The shared logger is only written outside multi-thread mode, like every other
    // log call in this method.
    if (!Program.IsMultiThreadMode && e.LeadingColonKeyWordCandidate.Distinct().Count() > 1) {
        foreach (var candidate in e.LeadingColonKeyWordCandidate) {
            Program.Logger.WriteLine("发现多个甲方:" + candidate);
        }
    }
    if (e.LeadingColonKeyWordCandidate.Count > 0) {
        return e.LeadingColonKeyWordCandidate[0];
    }

    // NER-based Party A.
    var ner = SearchJiaFang();
    var NerJia = String.Empty;
    if (!String.IsNullOrEmpty(ner)) {
        // Map a short name to its full name using the document's company list.
        foreach (var cn in companynamelist) {
            if (cn.secShortName == ner) {
                ner = cn.secFullName;
            }
        }
        // Use the NER result unless it is Party B itself. The null / empty check is
        // combined with the comparison so a null YiFang no longer throws.
        if (String.IsNullOrEmpty(YiFang) || !YiFang.Equals(ner)) {
            NerJia = ner;
        }
    }

    var CandidateWord = new List<String>();

    // Applies the length filters shared by both patterns, logs and stores the candidate.
    void AddCandidate(string name, string logPrefix) {
        if (Utility.TrimEnglish(name).Length > 32) {
            return;
        }
        if (name.Length < 3) {
            return;     // The real length is used here to rule out all-English values.
        }
        if (!Program.IsMultiThreadMode) {
            Program.Logger.WriteLine(logPrefix + name + "]");
        }
        CandidateWord.Add(name);
    }

    // Bid-notice pattern.
    var Extractor = new ExtractPropertyByHTML();
    var StartArray = new string[] { "招标单位", "业主", "收到", "接到" };
    var EndArray = new string[] { "发来", "发出", "的中标" };
    Extractor.StartEndFeature = Utility.GetStartEndStringArray(StartArray, EndArray);
    Extractor.Extract(root);
    foreach (var item in Extractor.CandidateWord) {
        var name = CompanyNameLogic.AfterProcessFullName(item.Value.Trim()).secFullName;
        if (name.Contains("招标代理")) {
            continue;   // Special business rule: bidding agents are never Party A.
        }
        name = name.Replace("业主", String.Empty).Trim();
        name = name.Replace("招标单位", String.Empty).Trim();
        AddCandidate(name, "甲方候补词(招标):[");
    }

    // Contract-signing pattern.
    Extractor = new ExtractPropertyByHTML();
    StartArray = new string[] { "与", "与业主" };
    EndArray = new string[] { "签署", "签订" };
    Extractor.StartEndFeature = Utility.GetStartEndStringArray(StartArray, EndArray);
    Extractor.Extract(root);
    foreach (var item in Extractor.CandidateWord) {
        var name = CompanyNameLogic.AfterProcessFullName(item.Value.Trim()).secFullName;
        // The order differs from the bid pattern on purpose: the marker is removed
        // BEFORE the agent check, exactly as in the original code.
        name = name.Replace("业主", String.Empty).Trim();
        if (name.Contains("招标代理")) {
            continue;   // Special business rule: bidding agents are never Party A.
        }
        AddCandidate(name, "甲方候补词(合同):[");
    }

    if (!String.IsNullOrEmpty(NerJia)) {
        // In principle the NER result wins. A pattern candidate that ends with it may
        // be the complete name, though: NER can return only the tail of a company
        // name (e.g. "(集团)有限公司" where the real name has a prefix before it).
        foreach (var c in CandidateWord) {
            if (c.EndsWith(NerJia)) {
                return c;
            }
        }
        return NerJia;
    } else {
        return CompanyNameLogic.MostLikeCompanyName(CandidateWord);
    }
}