/// <summary>
/// Extracts text wrapped between marker pairs: for every entry in <c>MarkFeature</c>,
/// collects the strings enclosed by its start/end marks and hands the extractor to
/// <c>SearchNormalContent</c>.
/// </summary>
/// <param name="root">Root node passed through to <c>SearchNormalContent</c>.</param>
void ExtractByMarkFeature(MyRootHtmlNode root)
{
    foreach (var feature in MarkFeature)
    {
        Func<String, List<String>> extractor = (text) =>
        {
            var accepted = new List<String>();
            var candidates = RegularTool.GetMultiValueBetweenMark(text, feature.MarkStartWith, feature.MarkEndWith);
            foreach (var candidate in candidates)
            {
                // An inner prefix/suffix constraint applies only when it is configured (non-null).
                bool prefixOk = feature.InnerStartWith == null || candidate.StartsWith(feature.InnerStartWith);
                if (!prefixOk)
                {
                    continue;
                }

                bool suffixOk = feature.InnerEndWith == null || candidate.EndsWith(feature.InnerEndWith);
                if (suffixOk)
                {
                    accepted.Add(candidate);
                }
            }
            return accepted;
        };

        SearchNormalContent(root, extractor);
    }
}
/// <summary>
/// Ad-hoc console experiment for Jieba segmentation and part-of-speech tagging.
/// Prints "word:flag" for every token of a fixed test string.
/// </summary>
public static void RunWordAnlayze()
{
    var companyName = "华陆工程(科技)有限责任公司";

    // Register two full company names as custom dictionary words.
    var customSegmenter = new JiebaSegmenter();
    customSegmenter.AddWord("华陆工程科技有限责任公司");
    customSegmenter.AddWord("中煤陕西榆林能源化工有限公司");

    var customTagger = new PosSegmenter(customSegmenter);
    var c = customTagger.Cut(companyName);   // result is not consumed further

    companyName = companyName.NormalizeTextResult();
    companyName = RegularTool.TrimBrackets(companyName);

    // A disabled experiment used to live here: it fed four training announcements
    // (ids 1044779, 1450, 1042224, 917362) through Surround.AnlayzeEntitySurroundWords
    // with the known project names and then called WriteTop(10).

    var probe = "承运市";
    var defaultTagger = new JiebaNet.Segmenter.PosSeg.PosSegmenter();
    foreach (var token in defaultTagger.Cut(probe))
    {
        Console.WriteLine(token.Word + ":" + token.Flag);
    }
}
/// <summary>
/// Builds a NER word record from one element string by reading the values found
/// between double quotes.
/// </summary>
/// <param name="element">Raw element text containing the quoted attribute values.</param>
public struWordNER(string element)
{
    var fields = RegularTool.GetMultiValueBetweenMark(element, "\"", "\"");

    // The first quoted value is always the id.
    id = int.Parse(fields[0]);

    switch (fields.Count)
    {
        case 4:
            // Complete record: id, cont, pos, ne.
            cont = fields[1];
            pos = fields[2];
            ne = fields[3];
            break;

        case 3:
            // Only three values were extracted; the content is taken to be a literal
            // double quote and the remaining values shift left by one.
            cont = "\"";
            pos = fields[1];
            ne = fields[2];
            break;

        default:
            cont = "";
            pos = "";
            ne = "";
            break;
    }
}
/// <summary>
/// Locates dates: scans every sentence of every paragraph and records each date found.
/// </summary>
/// <param name="root">Parsed document whose paragraphs and sentences are scanned.</param>
/// <returns>
/// One entry per parsable date, carrying the sentence's PositionId, the type tag "日期"
/// and the value returned by <c>DateUtility.GetWorkDay</c>.
/// </returns>
public static List<LocAndValue<DateTime>> LocateDate(HTMLEngine.MyRootHtmlNode root)
{
    var list = new List<LocAndValue<DateTime>>();
    foreach (var paragraph in root.Children)
    {
        foreach (var sentence in paragraph.Children)
        {
            // Upper-case numerals are converted and spaces removed before matching.
            var text = DateUtility.ConvertUpperToLower(sentence.Content).Replace(" ", String.Empty);

            foreach (var strDate in DateUtility.GetDate(text))
            {
                var numbers = RegularTool.GetNumberList(strDate);

                // Guard: year, month and day are all required. Indexing a shorter list
                // would throw ArgumentOutOfRangeException.
                if (numbers.Count < 3)
                {
                    continue;
                }

                if (int.TryParse(numbers[0], out int year) &&
                    int.TryParse(numbers[1], out int month) &&
                    int.TryParse(numbers[2], out int day))
                {
                    list.Add(new LocAndValue<DateTime>()
                    {
                        Loc = sentence.PositionId,
                        Type = "日期",
                        Value = DateUtility.GetWorkDay(year, month, day)
                    });
                }
            }
        }
    }
    return list;
}
/// <summary>
/// Scratch routine exercising the RegularTool / DateUtility helpers and a
/// look-behind / look-ahead regular expression on sample announcement sentences.
/// </summary>
public static void RegularExpress()
{
    // Values between Chinese double quotes.
    var quotedSample = "宏润建设集团股份有限公司(以下简称“公司”)于2014年1月7日收到西安市建设工程中标通知书,“西安市地铁四号线工程(航天东路站—北客站)土建施工D4TJSG-5标”项目由公司中标承建,工程中标价49,290万元。";
    var x0 = RegularTool.GetMultiValueBetweenMark(quotedSample, "“", "”");

    // Date written with Chinese numerals.
    var d1 = DateUtility.GetDate("河北先河环保科技股份有限公司董事会二○一二年十一月三十日");
    Console.WriteLine(d1);

    // Single value between two keywords.
    var signSample = "公司第五届董事会第七次会议审议通过了《关于公司与神华铁路货车运输有限责任公司签订企业自用货车购置供货合同的议案》,2014年1月20日,公司与神华铁路货车运输有限责任公司签署了《企业自用货车购置供货合同》。";
    var x2 = RegularTool.GetValueBetweenString(signSample, "与", "签订");

    // Multiple values between two keywords.
    var multiSample = "2010年12月3日,中工国际工程股份有限公司与委内瑞拉农业土地部下属的委内瑞拉农业公司签署了委内瑞拉农副产品加工设备制造厂工业园项目商务合同,与委内瑞拉农签署了委内瑞拉奥里合同。";
    var x = RegularTool.GetMultiValueBetweenString(multiSample, "与", "签署");

    // Shortest text between one of several leading and trailing keywords.
    var noticeSamples = new string[]
    {
        "收到贵州高速公路开发总公司发出的通知",
        "接到贵州高速公路开发总公司发出的通知",
        "收到贵州高速公路开发总公司发出的告知",
        "接到贵州高速公路开发总公司发出的告知"
    };
    var leading = "收到|接到";
    var trailing = "通知|告知";
    var between = new Regex("(?<=(" + leading + "))[.\\s\\S]*?(?=(" + trailing + "))",
                            RegexOptions.Multiline | RegexOptions.Singleline);
    foreach (var sample in noticeSamples)
    {
        Console.WriteLine(between.Match(sample).Value);
    }
}
/// <summary>
/// Extracts the content of book-title marks (《…》) and Chinese double quotes (“…”)
/// from every sentence.
/// </summary>
/// <param name="root">Parsed HTML document.</param>
/// <param name="IsSkipBracket">
/// When true, a match whose text is contained in one of the sentence's
/// Chinese-bracket segments is ignored.
/// </param>
/// <returns>
/// One entry per match with the surrounding marks stripped. Within a sentence all
/// book-title matches come before the quotation matches.
/// </returns>
public static List<LocAndValue<String>> LocateQuotation(HTMLEngine.MyRootHtmlNode root, bool IsSkipBracket = true)
{
    // Built once instead of twice per sentence; the array order defines the output order.
    var markPatterns = new Regex[]
    {
        new Regex(@"\《.*?\》"),
        new Regex(@"\“.*?\”")
    };

    var list = new List<LocAndValue<String>>();
    foreach (var paragraph in root.Children)
    {
        foreach (var sentence in paragraph.Children)
        {
            var text = sentence.Content;
            var bracketList = RegularTool.GetChineseBrackets(text);

            foreach (var pattern in markPatterns)
            {
                foreach (Match item in pattern.Matches(text))
                {
                    if (IsSkipBracket && bracketList.Any(bracket => bracket.Contains(item.Value)))
                    {
                        continue;
                    }

                    list.Add(new LocAndValue<String>()
                    {
                        Loc = sentence.PositionId,
                        Type = "字符",
                        // Drop the opening and closing mark.
                        Value = item.Value.Substring(1, item.Value.Length - 2)
                    });
                }
            }
        }
    }
    return list;
}
/// <summary>
/// Builds an SRL word record from one element string by reading the values found
/// between double quotes.
/// </summary>
/// <param name="element">Raw element text containing the quoted attribute values.</param>
public struWordSRL(string element)
{
    var fields = RegularTool.GetMultiValueBetweenMark(element, "\"", "\"");

    // Six values form a complete record. Otherwise the element is logged, the
    // content is left empty and the remaining values are read one position earlier.
    bool complete = fields.Count == 6;
    if (!complete)
    {
        Console.WriteLine(element);
    }
    int shift = complete ? 1 : 0;

    id = int.Parse(fields[0]);
    cont = complete ? fields[1] : String.Empty;
    pos = fields[1 + shift];
    ne = fields[2 + shift];
    parent = fields[3 + shift];
    relate = fields[4 + shift];
    args = new List<struWordSRLARG>();
}
/// <summary>
/// Reads a text file and, for every start/end keyword pair in <c>StartEndFeature</c>,
/// adds each string found between the pair on any non-empty line to <c>CandidateWord</c>.
/// </summary>
/// <param name="filename">Path of the text file to scan.</param>
void ExtractByStartEndStringFeature(string filename)
{
    // Load the non-empty lines. The using block releases the file handle even when
    // reading throws.
    var lines = new List<String>();
    using (var reader = new StreamReader(filename))
    {
        string current;
        while ((current = reader.ReadLine()) != null)
        {
            if (!String.IsNullOrEmpty(current))
            {
                lines.Add(current);
            }
        }
    }

    foreach (var word in StartEndFeature)
    {
        foreach (var textLine in lines)
        {
            var matches = RegularTool.GetMultiValueBetweenString(textLine, word.StartWith, word.EndWith);
            foreach (var item in matches)
            {
                // Only the value is recorded here; no location is set.
                CandidateWord.Add(new LocAndValue<string>() { Value = item });
            }
        }
    }
}
/// <summary>
/// Reads a text file and, for every keyword in
/// <c>LeadingColonKeyWordListInChineseBrackets</c>, adds the values returned by
/// <c>RegularTool.GetValueInChineseBracketsLeadingKeyWord</c> to <c>CandidateWord</c>.
/// </summary>
/// <param name="filename">Path of the text file to scan.</param>
public void ExtractTextByInChineseBracketsColonKeyWord(string filename)
{
    // Load the non-empty lines. The using block releases the file handle even when
    // reading throws.
    var lines = new List<String>();
    using (var reader = new StreamReader(filename))
    {
        string current;
        while ((current = reader.ReadLine()) != null)
        {
            if (!String.IsNullOrEmpty(current))
            {
                lines.Add(current);
            }
        }
    }

    // Loc is the index among the non-empty lines, not the physical line number.
    for (int lineIdx = 0; lineIdx < lines.Count; lineIdx++)
    {
        var textLine = lines[lineIdx];
        foreach (var word in LeadingColonKeyWordListInChineseBrackets)
        {
            var result = RegularTool.GetValueInChineseBracketsLeadingKeyWord(textLine, word);
            foreach (var item in result)
            {
                CandidateWord.Add(new LocAndValue<string>() { Loc = lineIdx, Value = item });
            }
        }
    }
}
/// <summary>
/// Locates percentage expressions (matches of <c>RegularTool.PercentExpress</c>)
/// in every sentence.
/// </summary>
/// <param name="root">Parsed HTML document.</param>
/// <returns>
/// One entry per match carrying the sentence's PositionId, the description "百分比",
/// the matched text and its start index within the sentence content.
/// </returns>
public static List<LocAndValue<String>> LocatePercent(HTMLEngine.MyRootHtmlNode root)
{
    var list = new List<LocAndValue<String>>();

    // The pattern does not depend on the sentence, so the Regex is built once.
    var percentRegex = new Regex(RegularTool.PercentExpress);

    foreach (var paragraph in root.Children)
    {
        foreach (var sentence in paragraph.Children)
        {
            foreach (Match item in percentRegex.Matches(sentence.Content))
            {
                list.Add(new LocAndValue<String>()
                {
                    Loc = sentence.PositionId,
                    Description = "百分比",
                    Value = item.Value,
                    StartIdx = item.Index
                });
            }
        }
    }
    return list;
}
/// <summary>
/// Final extraction run: processes the contract, stock-change and reorganization
/// data sets in turn and writes one result file per data set under /home/118_4/submit.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main_FINAL(string[] args)
{
    Logger = new StreamWriter("Log.log");
    // Share the log writer with the entity-property extractor.
    EntityProperty.Logger = Logger;
    // Register the code-page encoding provider globally.
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
    // Place-name correction dictionary for Jieba segmentation.
    PosNS.ImportNS("Resources" + Path.DirectorySeparatorChar + "ns.dict");

    if (!Directory.Exists("/home/118_4/submit"))
    {
        Directory.CreateDirectory("/home/118_4/submit");
    }

    // ---- Contracts ----
    // NOTE(review): the result writers are never closed in this method; presumably
    // Run<T> flushes and closes the writer it receives — confirm.
    Console.WriteLine("Start To Extract Info Contract TRAIN");
    StreamWriter ResultCSV = new StreamWriter(@"/home/118_4/submit/hetong.txt", false, utf8WithoutBom);
    Run<Contract>(@"/home/data/hetong", @"/home/118_4/temp/hetong", ResultCSV);
    Console.WriteLine("Complete Extract Info Contract");

    // ---- Stock changes ----
    Console.WriteLine("Start To Extract Info StockChange TRAIN");
    Console.WriteLine("读取增减持信息:" + "/home/data/zengjianchi/zengjianchi_public.csv");
    // The using block releases the CSV handle even if a row fails to parse.
    using (var sr = new StreamReader("/home/data/zengjianchi/zengjianchi_public.csv"))
    {
        sr.ReadLine();   // skip header
        while (!sr.EndOfStream)
        {
            // Column 0 holds the announcement date, column 1 the announcement id.
            var line = sr.ReadLine().Split(",");
            var numbers = RegularTool.GetNumberList(line[0]);
            int year = int.Parse(numbers[0]);
            int month = int.Parse(numbers[1]);
            int day = int.Parse(numbers[2]);
            var AnnouceDate = new DateTime(year, month, day);
            PublishTime.Add(line[1], AnnouceDate);
        }
    }
    Console.WriteLine("读取增减持信息:" + PublishTime.Count);
    ResultCSV = new StreamWriter(@"/home/118_4/submit/zengjianchi.txt", false, utf8WithoutBom);
    Run<StockChange>(@"/home/data/zengjianchi", @"/home/118_4/temp/zengjianchi", ResultCSV);
    Console.WriteLine("Complete Extract Info StockChange");

    // ---- Reorganizations ----
    Console.WriteLine("Start To Extract Info Reorganization TRAIN");
    // Hard-coded evaluation-method list used in place of a training result.
    // NOTE(review): "现金流折现法" appears twice; kept as-is because the count is logged.
    Console.WriteLine("加载替代训练结果");
    ReOrganizationTraning.EvaluateMethodList = new string[]
    {
        "收益法", "资产基础法", "市场法", "市场比较法", "估值法", "成本法",
        "现金流折现法", "现金流折现法", "剩余法", "内含价值调整法", "可比公司市净率法",
        "重置成本法", "收益现值法", "基础资产法", "假设清偿法", "成本逼近法",
        "单项资产加和法", "成本加和法", "基准地价修正法", "收益还原法", "现金流量法",
        "单项资产加总法", "折现现金流量法", "基准地价系数修正法"
    }.ToList();
    Console.WriteLine("加载替代训练结果:" + ReOrganizationTraning.EvaluateMethodList.Count);
    ResultCSV = new StreamWriter(@"/home/118_4/submit/chongzu.txt", false, utf8WithoutBom);
    Run<Reorganization>(@"/home/data/chongzu", "", ResultCSV);
    Console.WriteLine("Complete Extract Info Reorganization");

    Logger.Close();
}
/// <summary>
/// Extracts several contract records from an announcement that covers more than
/// one award (e.g. "中标通知书6份" or a "履行进展情况" section).
/// </summary>
/// <returns>
/// One record per later sentence that holds exactly one money entity; the parties
/// are copied from the single-record extraction. Empty when no multi-award hint is found.
/// </returns>
List<RecordBase> ExtractMultiCommon()
{
    var mainRecord = ExtractSingle();
    var records = new List<RecordBase>();
    bool multiDetected = false;

    foreach (var paragraph in root.Children)
    {
        foreach (var sentence in paragraph.Children)
        {
            if (!multiDetected)
            {
                // Still looking for a hint that the announcement covers several awards.
                var normalized = NumberUtility.ConvertUpperToLower(sentence.Content).Replace(" ", "");
                var noticeCount = RegularTool.GetRegular(normalized, "中标通知书\\d份");
                if (noticeCount.Count == 1)
                {
                    Console.WriteLine(Id + ":" + noticeCount[0].RawData + "[" + normalized + "]");
                    multiDetected = true;
                }
                if (sentence.Content.Contains("履行进展情况"))
                {
                    Console.WriteLine(Id + ":履行进展情况");
                    multiDetected = true;
                }
                continue;
            }

            // After the hint: a sentence with exactly one money entity becomes a record.
            if (!nermap.ParagraghlocateDict.ContainsKey(sentence.PositionId))
            {
                continue;
            }
            var entities = nermap.ParagraghlocateDict[sentence.PositionId];
            if (entities.moneylist.Count != 1)
            {
                continue;
            }

            var amount = MoneyUtility.Format(entities.moneylist.First().Value.MoneyAmount, String.Empty);
            var record = new ContractRec();
            record.Id = Id;
            record.JiaFang = mainRecord.JiaFang;
            record.YiFang = mainRecord.YiFang;
            record.ContractMoneyUpLimit = amount;
            record.ContractMoneyDownLimit = amount;
            records.Add(record);
        }
    }
    return records;
}
/// <summary>
/// Normalizes a lock-up period expression to a number of months where one of the
/// known shapes is recognised; otherwise returns the space-stripped, trimmed input.
/// </summary>
/// <param name="orgString">Raw cell or sentence text.</param>
/// <param name="TitleWord">Not used by this normalizer.</param>
/// <returns>The month count as a string, or the cleaned input when nothing matches.</returns>
static string NormalizerFreezeYear(string orgString, string TitleWord)
{
    orgString = orgString.Replace(" ", "");
    if (orgString.Equals("十二"))
    {
        return "12";
    }

    int months;

    // Number following "日起".
    var afterStart = Utility.GetStringAfter(orgString, "日起");
    if (int.TryParse(afterStart, out months))
    {
        return months.ToString();
    }

    // Number preceding "个月".
    var beforeUnit = Utility.GetStringBefore(orgString, "个月");
    if (int.TryParse(beforeUnit, out months))
    {
        return months.ToString();
    }

    // Value between "日起" and "个月".
    var between = RegularTool.GetValueBetweenString(orgString, "日起", "个月");
    if (between.Equals("十二"))
    {
        return "12";
    }
    if (int.TryParse(between, out months))
    {
        return months.ToString();
    }

    // Fixed literal forms. (A second check for "十二" used to sit here; it could
    // never be reached because that value already returned above.)
    switch (orgString)
    {
        case "十二个月":
        case "1年":
            return "12";
        case "3年":
            return "36";
    }

    return orgString.Trim();
}
/// <summary>
/// For every Chinese-bracket segment of <paramref name="OrgString"/>, strips the
/// first and last character and returns the text following <paramref name="KeyWord"/>.
/// </summary>
/// <param name="OrgString">Text to scan.</param>
/// <param name="KeyWord">Leading keyword inside the brackets.</param>
/// <returns>The values that are not the empty string, in segment order.</returns>
public static List<String> GetValueInChineseBracketsLeadingKeyWord(string OrgString, String KeyWord)
{
    var values = new List<String>();
    foreach (var bracketed in RegularTool.GetChineseBrackets(OrgString))
    {
        // Remove the enclosing bracket characters before searching for the keyword.
        var inner = bracketed.Substring(1, bracketed.Length - 2);
        var tail = Utility.GetStringAfter(inner, KeyWord);
        if (tail == String.Empty)
        {
            continue;
        }
        values.Add(tail);
    }
    return values;
}
/// <summary>
/// Extracts text enclosed by start/end keyword pairs: for every entry in
/// <c>StartEndFeature</c>, hands an extractor based on
/// <c>RegularTool.GetMultiValueBetweenString</c> to <c>SearchNormalContent</c>.
/// </summary>
/// <param name="root">Root node passed through to <c>SearchNormalContent</c>.</param>
void ExtractByStartEndStringFeature(MyRootHtmlNode root)
{
    // NOTE(review): the list is cleared here but nothing in this method adds to it.
    // The previous code built a struStartEndResultDetail (Feature + CandidateWord)
    // per call and discarded it; that dead local has been removed. If the detail
    // was meant to be appended to StartEndResultList, restore it with an explicit Add.
    StartEndResultList.Clear();

    foreach (var word in StartEndFeature)
    {
        Func<String, List<String>> ExtractMethod =
            (x) => RegularTool.GetMultiValueBetweenString(x, word.StartWith, word.EndWith);
        SearchNormalContent(root, ExtractMethod);
    }
}
/// <summary>
/// Builds an SRL argument record from one element string by reading the values
/// found between double quotes.
/// </summary>
/// <param name="element">Raw element text containing the quoted attribute values.</param>
public struWordSRLARG(string element)
{
    var fields = RegularTool.GetMultiValueBetweenMark(element, "\"", "\"");

    // With exactly three values the type is missing, so begin/end sit one slot earlier.
    bool typeMissing = fields.Count == 3;
    int beginSlot = typeMissing ? 1 : 2;

    id = int.Parse(fields[0]);
    type = typeMissing ? "" : fields[1];
    Begin = int.Parse(fields[beginSlot]);
    End = int.Parse(fields[beginSlot + 1]);
    cont = string.Empty;
}
/// <summary>
/// Locates dates: records, for every sentence in which <c>RegularTool.GetDate</c>
/// finds a date, the sentence's PositionId together with that date.
/// </summary>
/// <param name="root">Parsed HTML document.</param>
/// <returns>One entry per sentence that contains a date.</returns>
public static List<LocAndValue> LocateDate(HTMLEngine.MyRootHtmlNode root)
{
    var list = new List<LocAndValue>();
    foreach (var paragraph in root.Children)
    {
        foreach (var sentence in paragraph.Children)
        {
            var text = Utility.ConvertUpperDateToLittle(sentence.Content).Replace(" ", "");

            // Run the date extraction once and reuse the result (it used to run twice).
            var date = RegularTool.GetDate(text);
            if (!String.IsNullOrEmpty(date))
            {
                list.Add(new LocAndValue() { Loc = sentence.PositionId, Value = date });
            }
        }
    }
    return list;
}
/// <summary>
/// Builds a dependency-parse word record from one element string by reading the
/// values found between double quotes.
/// </summary>
/// <param name="element">Raw element text containing the quoted attribute values.</param>
public struWordDP(string element)
{
    var fields = RegularTool.GetMultiValueBetweenMark(element, "\"", "\"");

    // Five values form a complete record. Otherwise the content is taken to be a
    // literal double quote and the remaining values are read one position earlier.
    bool complete = fields.Count == 5;
    int shift = complete ? 1 : 0;

    id = int.Parse(fields[0]);
    cont = complete ? fields[1] : "\"";
    pos = fields[1 + shift];
    parent = int.Parse(fields[2 + shift]);
    relate = fields[3 + shift];
}
/// <summary>
/// Extracts a single contract record (parties, project name, contract name,
/// amount and consortium members) from the current announcement.
/// </summary>
/// <returns>The populated contract record.</returns>
ContractRec ExtractSingle()
{
    // Contract type: decided by the first sentence mentioning "中标" or "合同";
    // "中标" wins when a sentence contains both.
    contractType = String.Empty;
    foreach (var sentence in root.Children.SelectMany(p => p.Children))
    {
        if (sentence.Content.Contains("中标"))
        {
            contractType = "中标";
            break;
        }
        if (sentence.Content.Contains("合同"))
        {
            contractType = "合同";
            break;
        }
    }
    if (contractType == String.Empty)
    {
        Console.WriteLine("contractType Null:" + Id);
    }

    var contract = new ContractRec();
    // Announcement id
    contract.Id = Id;

    // Party B (YiFang)
    var yiFang = GetYiFang();
    if (yiFang.Contains("本公司"))
    {
        yiFang = string.Empty;
    }
    yiFang = CompanyNameLogic.AfterProcessFullName(yiFang).secFullName;
    yiFang = yiFang.NormalizeTextResult();
    // Brackets are removed as required by the output rules.
    yiFang = RegularTool.TrimBrackets(yiFang);
    if (yiFang.Length < 3)
    {
        yiFang = string.Empty;
    }
    contract.YiFang = yiFang;

    // Party A (JiaFang)
    var jiaFang = GetJiaFang(yiFang);
    if (jiaFang.Contains("本公司"))
    {
        jiaFang = string.Empty;
    }
    jiaFang = CompanyNameLogic.AfterProcessFullName(jiaFang).secFullName;
    jiaFang = jiaFang.NormalizeTextResult();
    if (jiaFang.Contains("简称"))
    {
        jiaFang = Utility.GetStringBefore(jiaFang, "(");
    }
    // Organisation entities: when Party A is not a recognised organisation but
    // "国家电网公司" is, use the latter.
    if (Nerlist != null)
    {
        var organisations = Nerlist.Where((n) => n.Type == LTPTrainingNER.enmNerType.Ni).Select((m) => m.RawData);
        if (!organisations.Contains(jiaFang) && organisations.Contains("国家电网公司"))
        {
            jiaFang = "国家电网公司";
        }
    }
    contract.JiaFang = jiaFang;

    // Project name
    var projectName = GetProjectName();
    projectName = projectName.NormalizeTextResult();
    if (projectName.StartsWith("“") && projectName.EndsWith("”"))
    {
        projectName = projectName.TrimStart("“".ToCharArray()).TrimEnd("”".ToCharArray());
    }
    // NOTE(review): GetStringAfter on a string that *ends with* the marker looks
    // like it yields nothing; GetStringBefore may have been intended — confirm.
    if (projectName.EndsWith(",签约双方"))
    {
        projectName = Utility.GetStringAfter(projectName, ",签约双方");
    }
    // NOTE(review): this keeps the text after "(以下简称", whereas Party A keeps the
    // text before the bracket — confirm which side is wanted.
    if (projectName.Contains("(以下简称"))
    {
        projectName = Utility.GetStringAfter(projectName, "(以下简称");
    }
    if (projectName.EndsWith(")"))
    {
        if (projectName.Contains("(招标编号"))
        {
            projectName = Utility.GetStringBefore(projectName, "(招标编号");
        }
        if (projectName.Contains("(合同编号"))
        {
            projectName = Utility.GetStringBefore(projectName, "(合同编号");
        }
    }
    projectName = projectName.Replace("的推荐中标", "");
    // Special cases
    projectName = projectName.Replace("<1>", "1、");
    projectName = projectName.Replace("“", "");
    projectName = projectName.Replace("”", "");

    // Contract name
    var contractName = GetContractName();
    if (contractName.StartsWith("“") && contractName.EndsWith("”"))
    {
        contractName = contractName.TrimStart("“".ToCharArray()).TrimEnd("”".ToCharArray());
    }
    // Remove book-title marks.
    contractName = contractName.Replace("《", String.Empty).Replace("》", String.Empty);
    contractName = contractName.NormalizeTextResult();
    if (contractName.Contains("(以下简称"))
    {
        contractName = Utility.GetStringAfter(contractName, "(以下简称");
    }
    contractName = ExtendContractName(contractName);
    contract.ContractName = contractName;

    // For purchase agreements the project name is cleared unless it mentions a bid section.
    if (contractName.Contains("采购") && !projectName.Contains("标段"))
    {
        projectName = string.Empty;
    }
    contract.ProjectName = projectName;

    // Amount: the same value is used for the upper and lower limit.
    var money = GetMoney();
    var amount = MoneyUtility.Format(money.MoneyAmount, String.Empty);
    contract.ContractMoneyUpLimit = amount;
    contract.ContractMoneyDownLimit = amount;

    // Consortium members
    var unionMember = GetUnionMember(contract);
    unionMember = unionMember.NormalizeTextResult();
    // Brackets are removed as required by the output rules.
    unionMember = RegularTool.TrimBrackets(unionMember);
    contract.UnionMember = unionMember;

    // When Party B holds several names, the first stays Party B and the rest
    // become the consortium members.
    var yiFangParts = yiFang.Split(Utility.SplitChar);
    if (yiFangParts.Length > 1)
    {
        contract.UnionMember = Utility.GetStringAfter(yiFang, Utility.SplitChar);
        contract.YiFang = yiFangParts[0];
        Console.WriteLine("联合体:" + contract.UnionMember);
    }
    return contract;
}
/// <summary>
/// Locates date ranges in every sentence of every paragraph.
/// </summary>
/// <remarks>
/// A range string is accepted when it contains 4, 5 or 6 numbers:
/// 6 = start Y/M/D + end Y/M/D; 5 = start Y/M/D + end M/D (year shared);
/// 4 = start Y/M/D + end D (year and month shared). A side whose numbers do not
/// parse as integers is left at <c>default(DateTime)</c>, and the entry is still added.
/// </remarks>
/// <param name="root">Parsed HTML document.</param>
/// <returns>One entry per accepted range, typed "日期范围".</returns>
public static List<LocAndValue<(DateTime StartDate, DateTime EndDate)>> LocateDateRange(HTMLEngine.MyRootHtmlNode root)
{
    // Converts one Y/M/D triple; returns default(DateTime) when a part is not an integer.
    DateTime ToWorkDay(string y, string m, string d)
    {
        if (int.TryParse(y, out int year) && int.TryParse(m, out int month) && int.TryParse(d, out int day))
        {
            return DateUtility.GetWorkDay(year, month, day);
        }
        return new DateTime();
    }

    var list = new List<LocAndValue<(DateTime StartDate, DateTime EndDate)>>();
    foreach (var paragraph in root.Children)
    {
        foreach (var sentence in paragraph.Children)
        {
            var text = DateUtility.ConvertUpperToLower(sentence.Content).Replace(" ", String.Empty);
            foreach (var strDate in DateUtility.GetRangeDate(text))
            {
                var numbers = RegularTool.GetNumberList(strDate);
                int count = numbers.Count;
                if (count < 4 || count > 6)
                {
                    continue;
                }

                // The end date borrows whatever the shorter forms leave out.
                var endYear = count == 6 ? numbers[3] : numbers[0];
                var endMonth = count >= 5 ? numbers[count - 2] : numbers[1];
                var endDay = numbers[count - 1];

                DateTime start = ToWorkDay(numbers[0], numbers[1], numbers[2]);
                DateTime end = ToWorkDay(endYear, endMonth, endDay);

                list.Add(new LocAndValue<(DateTime StartDate, DateTime EndDate)>()
                {
                    Loc = sentence.PositionId,
                    Type = "日期范围",
                    Value = (start, end)
                });
            }
        }
    }
    return list;
}
/// <summary>
/// Splits a delimited party string into individual company or person names.
/// </summary>
/// <param name="OrgString">Raw text; spaces are removed before splitting on <c>Utility.SplitChar</c>.</param>
/// <returns>
/// The recognised names in input order. An empty list when the input is null or
/// empty, or as soon as any item cannot be resolved to a company or person.
/// </returns>
public List<String> GetCompanys(string OrgString)
{
    // Cuts a name at "等" and then at "自然人".
    string StripQualifier(string word)
    {
        if (word.Contains("等"))
        {
            word = Utility.GetStringBefore(word, "等");
        }
        if (word.Contains("自然人"))
        {
            word = Utility.GetStringBefore(word, "自然人");
        }
        return word;
    }

    var Rtn = new List<String>();
    if (String.IsNullOrEmpty(OrgString))
    {
        return Rtn;
    }

    OrgString = OrgString.Replace(" ", "");
    var Items = OrgString.Split(Utility.SplitChar);

    // A trailing "等" on the last item of a list of more than three is dropped.
    int lastIdx = Items.Length - 1;
    if (Items.Length > 3 && Items[lastIdx].EndsWith("等"))
    {
        Items[lastIdx] = Items[lastIdx].Substring(0, Items[lastIdx].Length - 1);
    }

    foreach (var SingleItem in Items)
    {
        var candidate = SingleItem;
        if (candidate.Equals("交易对方"))
        {
            continue;
        }

        // An item containing "名" with exactly one number: keep the text before the number.
        var number = RegularTool.GetNumberList(candidate);
        if (number.Count == 1 && candidate.Contains("名"))
        {
            candidate = Utility.GetStringBefore(candidate, number[0]);
        }

        if (IsCompanyOrPerson(candidate))
        {
            Rtn.Add(candidate);
            continue;
        }

        // The item may join two names with "和" or "及"; try splitting on the first one found.
        var AndIdx = candidate.IndexOf("和");
        if (AndIdx == -1)
        {
            AndIdx = candidate.IndexOf("及");
        }

        bool splittable = AndIdx != -1 && AndIdx != 0 && AndIdx != (candidate.Length - 1);
        if (!splittable)
        {
            Console.WriteLine("无法匹配任何公司或者自然人:" + candidate);
            return new List<String>();
        }

        var FirstWord = StripQualifier(candidate.Substring(0, AndIdx));
        var Secondword = StripQualifier(candidate.Substring(AndIdx + 1));
        if (IsCompanyOrPerson(FirstWord) && IsCompanyOrPerson(Secondword))
        {
            Rtn.Add(FirstWord);
            Rtn.Add(Secondword);
        }
        else
        {
            Console.WriteLine("无法匹配任何公司或者自然人:" + FirstWord + "|" + Secondword);
            return new List<String>();
        }
    }

    return Rtn;
}
/// <summary>
/// Normalizes a date expression to "yyyy-MM-dd".
/// </summary>
/// <remarks>
/// Shapes tried in order:
/// 1. six numbers (a range) — the last three are used as Y/M/D;
/// 2. five numbers with 年/月/日 present — year from the first number, month/day from the last two;
/// 3. "Y年M月D[日]";
/// 4. "Y/M/D", "Y.M.D" or "Y-M-D".
/// A shape whose numbers do not form a valid calendar date is skipped rather than
/// throwing from the DateTime constructor.
/// </remarks>
/// <param name="orgString">Raw date text.</param>
/// <param name="keyword">Not used by this normalizer.</param>
/// <returns>The normalized date, or the cleaned input when no shape matches.</returns>
public static string NormailizeDate(string orgString, string keyword = "")
{
    orgString = orgString.Trim().Replace(",", "");
    var NumberList = RegularTool.GetNumberList(orgString);
    string formatted;

    if (NumberList.Count == 6 &&
        TryFormat(NumberList[3], NumberList[4], NumberList[5], out formatted))
    {
        return formatted;
    }

    if (NumberList.Count == 5 &&
        orgString.IndexOf("年") != -1 && orgString.IndexOf("月") != -1 && orgString.IndexOf("日") != -1 &&
        TryFormat(NumberList[0], NumberList[3], NumberList[4], out formatted))
    {
        return formatted;
    }

    // "日" is optional here: it is stripped from the day part when present.
    if (orgString.Contains("年") && orgString.Contains("月"))
    {
        String Year = Utility.GetStringBefore(orgString, "年");
        String Month = RegularTool.GetValueBetweenString(orgString, "年", "月");
        String Day = Utility.GetStringAfter(orgString, "月").Replace("日", "");
        if (TryFormat(Year, Month, Day, out formatted))
        {
            return formatted;
        }
    }

    foreach (var sc in new string[] { "/", ".", "-" })
    {
        var SplitArray = orgString.Split(sc);
        if (SplitArray.Length == 3 &&
            TryFormat(SplitArray[0], SplitArray[1], SplitArray[2], out formatted))
        {
            return formatted;
        }
    }

    return orgString;

    // Parses and validates one Y/M/D triple; false when a part is not an integer
    // or the triple is not a real calendar date.
    bool TryFormat(string y, string m, string d, out string result)
    {
        result = null;
        if (!int.TryParse(y, out int year) || !int.TryParse(m, out int month) || !int.TryParse(d, out int day))
        {
            return false;
        }
        if (year < 1 || year > 9999 || month < 1 || month > 12)
        {
            return false;
        }
        if (day < 1 || day > DateTime.DaysInMonth(year, month))
        {
            return false;
        }
        result = new DateTime(year, month, day).ToString("yyyy-MM-dd");
        return true;
    }
}
/// <summary>
/// Parses an announcement HTML file into a tree: root → paragraphs → sentences.
/// </summary>
/// <remarks>
/// Resets the static table / detail-item state, loads the document, turns every
/// child <c>div</c> of the first <c>div[@type='pdf']</c> into a paragraph node,
/// splits a trailing date off the last sentence, links siblings, and finally calls
/// <c>Adjust</c> when a companion text file exists.
/// </remarks>
/// <param name="htmlfile">Path of the HTML file.</param>
/// <returns>The populated root node.</returns>
public static MyRootHtmlNode Anlayze(string htmlfile)
{
    TableId = 0;
    DetailItemId = 0;
    TableList = new Dictionary<int, List<String>>();
    DetailItemList = new Dictionary<int, List<String>>();

    // The first element is normally a DIV such as <div title="关于重大合同中标的公告" type="pdf">.
    var doc = new HtmlDocument();
    doc.Load(htmlfile);
    // NOTE(review): node[0] below fails if the document has no div[@type='pdf'];
    // left unchanged so callers see the same failure as before.
    var node = doc.DocumentNode.SelectNodes("//div[@type='pdf']");
    var root = new MyRootHtmlNode();
    root.Content = node[0].Attributes["title"].Value;

    // Every child DIV on the second level is a paragraph; other nodes (#text) are skipped.
    foreach (var SecondLayerNode in node[0].ChildNodes)
    {
        if (SecondLayerNode.Name != "div")
        {
            continue;
        }

        var title = SecondLayerNode.Attributes.Contains("title")
            ? SecondLayerNode.Attributes["title"].Value
            : SecondLayerNode.InnerText;

        var secondNode = new MyHtmlNode();
        secondNode.Content = title;
        AnlayzeParagraph(SecondLayerNode, secondNode);
        FindContentWithList(secondNode.Children);

        // Link neighbouring sentences in both directions.
        for (int i = 0; i < secondNode.Children.Count - 1; i++)
        {
            secondNode.Children[i].NextBrother = secondNode.Children[i + 1];
            secondNode.Children[i + 1].PreviewBrother = secondNode.Children[i];
        }
        root.Children.Add(secondNode);
    }

    // Last paragraph: when the final sentence ends with a date preceded by other
    // text, split it into "text" + "date" sentences (e.g. contract announcement 1232951).
    // Guard: a document without paragraphs used to make Last() throw.
    if (root.Children.Count > 0)
    {
        var LastParagrah = root.Children.Last();
        if (LastParagrah.Children.Count > 0)
        {
            var LastSentence = LastParagrah.Children.Last().Content;
            var sentence = Utility.ConvertUpperDateToLittle(LastSentence);
            var strDate = RegularTool.GetDate(sentence);
            if (!String.IsNullOrEmpty(strDate))
            {
                var strBefore = Utility.GetStringBefore(sentence, strDate);
                // The cut point is taken from the original sentence: four characters
                // (the year) before the last "年". Guard: skip the split when that
                // position does not exist instead of letting Substring throw.
                int yearMarkIdx = LastSentence.LastIndexOf("年");
                if (!String.IsNullOrEmpty(strBefore) && yearMarkIdx >= 4)
                {
                    // Replace the tail sentence by its two parts.
                    LastParagrah.Children.RemoveAt(LastParagrah.Children.Count - 1);
                    strBefore = LastSentence.Substring(0, yearMarkIdx - 4);
                    LastParagrah.Children.Add(new MyHtmlNode() { Content = strBefore });
                    LastParagrah.Children.Add(new MyHtmlNode() { Content = strDate });
                }
            }
        }
    }

    // Link neighbouring paragraphs in both directions.
    for (int i = 0; i < root.Children.Count - 1; i++)
    {
        root.Children[i].NextBrother = root.Children[i + 1];
        root.Children[i + 1].PreviewBrother = root.Children[i];
    }

    root.TableList = TableList;
    root.DetailItemList = DetailItemList;

    // Every "html" in the path is replaced, so both a directory named html and the
    // file extension change to txt.
    var txtfilename = htmlfile.Replace("html", "txt");
    if (File.Exists(txtfilename))
    {
        Adjust(root, txtfilename);
    }
    return root;
}
/// <summary>
/// Extracts stock-change records from tables by matching column header titles.
/// </summary>
/// <remarks>
/// Columns searched: holder name, change end date, change price, change quantity.
/// When no table has a holder column, the search is repeated without it and the
/// holder name from <c>GetHolderName</c> is used for every row. For each holder,
/// the record with the latest end date is enriched with the post-change holding
/// figures from <c>GetHolderAfter</c>.
/// </remarks>
/// <returns>The extracted records; empty when nothing usable is found.</returns>
List<RecordBase> ExtractFromTable()
{
    // ---- Column rules ----
    var StockHolderRule = new TableSearchTitleRule();
    StockHolderRule.Name = "股东全称";
    StockHolderRule.Title = new string[] { "股东名称", "名称", "增持主体", "增持人", "减持主体", "减持人", "姓名" }.ToList();
    StockHolderRule.IsTitleEq = true;
    StockHolderRule.IsRequire = true;

    var ChangeDateRule = new TableSearchTitleRule();
    ChangeDateRule.Name = "变动截止日期";
    ChangeDateRule.Title = new string[] { "买卖时间", "日期", "减持期间", "增持期间", "减持股份期间", "增持股份期间",
                                          "减持时间", "增持时间", "减持股份时间", "增持股份时间", "买入时间", "卖出时间" }.ToList();
    ChangeDateRule.IsTitleEq = false;
    ChangeDateRule.Normalize = NormailizeEndChangeDate;

    var ChangePriceRule = new TableSearchTitleRule();
    ChangePriceRule.Name = "变动价格";
    ChangePriceRule.Title = new string[] { "买入均价", "卖出均价", "成交均价", "减持价格", "增持价格",
                                           "减持股均价", "增持股均价", "减持均", "增持均", "价格区间" }.ToList();
    ChangePriceRule.IsTitleEq = false;
    ChangePriceRule.Normalize = (x, y) =>
    {
        var prices = RegularTool.GetRegular(x, RegularTool.MoneyExpress);
        if (prices.Count == 0)
        {
            if (x.Contains("元"))
            {
                return Utility.GetStringBefore(x, "元");
            }
        }
        else
        {
            // For a price range take the highest price, assuming the last number is the largest.
            return prices.Last().RawData;
        }
        return x;
    };

    var ChangeNumberRule = new TableSearchTitleRule();
    ChangeNumberRule.Name = "变动数量";
    ChangeNumberRule.Title = new string[] { "成交数量", "减持股数", "增持股数", "减持数量", "增持数量",
                                            "买入股份数", "卖出股份数", "股数" }.ToList();
    ChangeNumberRule.IsTitleEq = false;
    ChangeNumberRule.Normalize = NumberUtility.NormalizerStockNumber;

    var Rules = new List<TableSearchTitleRule>();
    Rules.Add(StockHolderRule);
    Rules.Add(ChangeDateRule);
    Rules.Add(ChangePriceRule);
    Rules.Add(ChangeNumberRule);
    var result = HTMLTable.GetMultiInfoByTitleRules(root, Rules, false);

    if (result.Count == 0)
    {
        // Nothing extracted: retry without the holder column, requiring the date column.
        Rules.Clear();
        ChangeDateRule.IsRequire = true;
        Rules.Add(ChangeDateRule);
        Rules.Add(ChangePriceRule);
        Rules.Add(ChangeNumberRule);
        result = HTMLTable.GetMultiInfoByTitleRules(root, Rules, false);
        if (result.Count == 0)
        {
            return new List<RecordBase>();
        }

        var NewResult = new List<CellInfo[]>();
        var Name = GetHolderName();
        if (String.IsNullOrEmpty(Name.FullName) && String.IsNullOrEmpty(Name.ShortName))
        {
            return new List<RecordBase>();
        }
        // Prepend a synthetic holder cell so rows have the same four-column layout.
        foreach (var item in result)
        {
            NewResult.Add(new CellInfo[]
            {
                new CellInfo() { RawData = String.IsNullOrEmpty(Name.FullName) ? Name.ShortName : Name.FullName },
                item[0], item[1], item[2]
            });
        }
        result = NewResult;
    }

    // ---- Rows → records ----
    var holderafterlist = GetHolderAfter();
    var stockchangelist = new List<RecordBase>();
    foreach (var rec in result)
    {
        var stockchange = new StockChangeRec();
        stockchange.Id = Id;

        // A long name in a table may be cut by a page break; the summary list is
        // used to recover the full name by prefix/suffix match.
        var ModifyName = rec[0].RawData;
        if (!holderafterlist.Select((z) => { return z.Name; }).ToList().Contains(ModifyName))
        {
            foreach (var item in holderafterlist)
            {
                if (item.Name.EndsWith("先生"))
                {
                    break;   // special case kept from the original; no further rationale is given
                }
                if (item.Name.StartsWith(ModifyName) && !item.Name.Equals(ModifyName))
                {
                    ModifyName = item.Name;
                    break;
                }
                if (item.Name.EndsWith(ModifyName) && !item.Name.Equals(ModifyName))
                {
                    ModifyName = item.Name;
                    break;
                }
            }
        }

        var Name = CompanyNameLogic.NormalizeCompanyName(this, ModifyName);
        stockchange.HolderFullName = Name.FullName.NormalizeTextResult();
        stockchange.HolderShortName = Name.ShortName;
        if (stockchange.HolderFullName.Contains("简称"))
        {
            stockchange.HolderShortName = Utility.GetStringAfter(stockchange.HolderFullName, "简称");
            stockchange.HolderShortName = stockchange.HolderShortName.Replace(")", String.Empty).Replace("“", String.Empty).Replace("”", String.Empty);
            stockchange.HolderFullName = Utility.GetStringBefore(stockchange.HolderFullName, "(");
        }

        stockchange.ChangeEndDate = rec[1].RawData;
        DateTime x;
        if (!DateTime.TryParse(stockchange.ChangeEndDate, out x))
        {
            // Unparsable date: blanked outside debug mode.
            if (!Program.IsDebugMode)
            {
                stockchange.ChangeEndDate = String.Empty;
            }
        }

        if (!String.IsNullOrEmpty(rec[2].RawData))
        {
            // Price ranges are not taken over.
            if (!(rec[2].RawData.Contains("-") || rec[2].RawData.Contains("~") || rec[2].RawData.Contains("至")))
            {
                stockchange.ChangePrice = rec[2].RawData.Replace(" ", String.Empty);
                stockchange.ChangePrice = stockchange.ChangePrice.Replace("*", "");
                stockchange.ChangePrice = stockchange.ChangePrice.NormalizeNumberResult();
            }
        }
        if (!RegularTool.IsUnsign(stockchange.ChangePrice))
        {
            if (!String.IsNullOrEmpty(stockchange.ChangePrice))
            {
                Console.WriteLine("Error ChangePrice:[" + stockchange.ChangePrice + "]");
            }
            stockchange.ChangePrice = String.Empty;
        }

        if (!String.IsNullOrEmpty(rec[3].RawData))
        {
            stockchange.ChangeNumber = rec[3].RawData.Replace(" ", String.Empty);
            stockchange.ChangeNumber = stockchange.ChangeNumber.NormalizeNumberResult();
            if (!RegularTool.IsUnsign(stockchange.ChangeNumber))
            {
                if (!String.IsNullOrEmpty(stockchange.ChangeNumber))
                {
                    Console.WriteLine("Error ChangeNumber:[" + stockchange.ChangeNumber + "]");
                }
                stockchange.ChangeNumber = String.Empty;
            }
        }

        // Nearly all valid records have a holder name and an end date; dropping the
        // rest hurts a few records but improves the overall score.
        if (string.IsNullOrEmpty(stockchange.HolderFullName) || string.IsNullOrEmpty(stockchange.ChangeEndDate))
        {
            continue;
        }
        if (stockchange.ChangeNumber == "0" || stockchange.ChangePrice == "0")
        {
            continue;
        }
        stockchangelist.Add(stockchange);
    }

    // ---- Attach post-change holdings to the latest record of each holder ----
    var namelist = stockchangelist.Select(x => ((StockChangeRec)x).HolderFullName).Distinct().ToList();
    var newRec = new List<StockChangeRec>();
    foreach (var name in namelist)
    {
        var stocklist = stockchangelist.Where((x) => { return ((StockChangeRec)x).HolderFullName == name; }).ToList();

        // Order by end date so that Last() is the most recent change.
        // (The previous comparer compared each record with itself, so nothing was sorted.)
        stocklist.Sort((a, b) => string.CompareOrdinal(((StockChangeRec)a).ChangeEndDate,
                                                       ((StockChangeRec)b).ChangeEndDate));
        var last = (StockChangeRec)stocklist.Last();

        for (int i = 0; i < holderafterlist.Count; i++)
        {
            var after = holderafterlist[i];
            after.Name = after.Name.Replace(" ", "");
            if (after.Name == last.HolderFullName || after.Name == last.HolderShortName)
            {
                // Per the original comment the record is a struct and cannot be edited
                // in place, so it is removed, updated and re-added at the end.
                stockchangelist.Remove(last);
                last.HoldNumberAfterChange = after.Count;
                last.HoldPercentAfterChange = after.Percent;
                newRec.Add(last);
            }
        }
    }

    if (holderafterlist.Count != namelist.Count)
    {
        if (!Program.IsMultiThreadMode)
        {
            Program.Logger.WriteLine("增持者数量确认!");
        }
    }

    stockchangelist.AddRange(newRec);
    return stockchangelist;
}
/// <summary>
/// Scans a text for money amounts and returns each amount together with its currency.
/// </summary>
/// <remarks>
/// The text is normalised first: spaces are removed and Chinese / full-width digits are mapped
/// to ASCII digits. The scan then repeatedly picks the earliest keyword from <c>CurrencyList</c>
/// and collects the amount characters written immediately before it. When no keyword is left,
/// a yen sign followed by digits is tried once (reported as "人民币") and the scan stops.
/// </remarks>
/// <param name="OrgString">Text to scan. Null or empty yields an empty list.</param>
/// <returns>The (amount, currency) pairs in the order they were found.</returns>
public static List<(String MoneyAmount, String MoneyCurrency)> SeekMoney(string OrgString)
{
    var MoneyList = new List<(String MoneyAmount, String MoneyCurrency)>();
    if (String.IsNullOrEmpty(OrgString))
    {
        return MoneyList;
    }

    OrgString = OrgString.Replace(" ", String.Empty);
    OrgString = OrgString.Replace("〇", "0");
    OrgString = OrgString.Replace("○", "0");

    // Special handling for this HTML corpus: Chinese numerals 一..九 become 1..9.
    const string chineseDigits = "一二三四五六七八九";
    for (int d = 0; d < chineseDigits.Length; d++)
    {
        OrgString = OrgString.Replace(chineseDigits[d], (char)('1' + d));
    }
    // NOTE(review): a plain textual substitution, so "二十" becomes "210". Kept as-is.
    OrgString = OrgString.Replace("十", "10");

    // Special handling for this HTML corpus: full-width digits (U+FF10..U+FF19) become ASCII.
    // The previous code replaced each ASCII digit with itself, which had no effect; escapes are
    // used so the full-width characters cannot be lost to text normalisation again.
    for (int d = 0; d <= 9; d++)
    {
        OrgString = OrgString.Replace((char)('\uFF10' + d), (char)('0' + d));
    }

    // Both the half-width (U+00A5) and full-width (U+FFE5) yen signs are recognised.
    var yenSigns = new[] { '\u00A5', '\uFFE5' };
    const string fullWidthComma = "\uFF0C";

    // Hoisted: the same pattern was previously rebuilt on every loop iteration.
    var leadingDigits = new Regex(@"^\d+");

    // A character that may be part of an amount. '惩' is accepted as a special case for this
    // data set (per the original author's note).
    bool IsAmountChar(string single)
    {
        return single == "." || single == "," || single == fullWidthComma
            || single == "万" || single == "惩" || single == "亿"
            || leadingDigits.IsMatch(single);
    }

    var LastIndex = 0;
    var detectString = OrgString;
    while (true)
    {
        bool IsCurrencyMark = false;
        detectString = detectString.Substring(LastIndex);

        // Several keywords may be present; take the one that occurs first.
        var MoneyCurrency = String.Empty;
        var MinIdx = -1;
        foreach (var Currency in CurrencyList)
        {
            int idx = detectString.IndexOf(Currency, StringComparison.Ordinal);
            if (idx != -1 && (MinIdx == -1 || idx < MinIdx))
            {
                MoneyCurrency = Currency;
                MinIdx = idx;
            }
        }

        if (MoneyCurrency == String.Empty)
        {
            int currencyMarkIdx = detectString.IndexOfAny(yenSigns);
            if (currencyMarkIdx == -1)
            {
                break;
            }
            IsCurrencyMark = true;
            MoneyCurrency = "人民币";

            // Walk over the digits/commas that follow the yen sign. LastIndex becomes the index
            // just past that run. It is always assigned here, so a yen sign at the very end no
            // longer leaves a stale value behind, and a run that reaches the end of the text
            // keeps its last digit.
            int k = currencyMarkIdx + 1;
            while (k < detectString.Length)
            {
                var s = detectString.Substring(k, 1);
                if (!(RegularTool.IsNumeric(s) || s == "," || s == fullWidthComma))
                {
                    break;
                }
                k++;
            }
            LastIndex = k;
        }
        else
        {
            LastIndex = MinIdx;
        }

        // Collect amount characters backwards from LastIndex.
        var MoneyAmount = String.Empty;
        for (int i = LastIndex - 1; i >= 0; i--)
        {
            var SingleChar = detectString.Substring(i, 1);
            if (IsAmountChar(SingleChar))
            {
                MoneyAmount = SingleChar + MoneyAmount;
                continue;
            }

            // First non-amount character: the amount is everything between it and LastIndex.
            MoneyAmount = String.Empty;
            if (LastIndex == i + 1)
            {
                break; // nothing between this character and the currency marker
            }
            MoneyAmount = detectString.Substring(i + 1, LastIndex - i - 1);
            MoneyAmount = Normalizer.NormalizeNumberResult(MoneyAmount);
            if (leadingDigits.IsMatch(MoneyAmount))
            {
                // For now an amount must start with Arabic digits.
                MoneyList.Add((MoneyAmount, MoneyCurrency));
            }
            MoneyAmount = String.Empty;
            break;
        }

        // Still non-empty only when the amount ran back to the start of the text.
        // NOTE(review): this path adds the raw text without normalisation or the digit check,
        // unlike the path above. Left unchanged; confirm whether that is intended.
        if (MoneyAmount != String.Empty)
        {
            MoneyList.Add((MoneyAmount, MoneyCurrency));
        }

        if (!IsCurrencyMark)
        {
            // Resume right after the keyword that was just handled.
            LastIndex += MoneyCurrency.Length;
        }
        else if (MoneyAmount == String.Empty)
        {
            // Yen-sign mode makes no further progress; stop here to avoid looping forever.
            break;
        }
    }
    return MoneyList;
}
/// <summary>
/// Cleans up an extracted company full name and picks up the short name declared next to it.
/// </summary>
/// <remarks>
/// Steps, in order: cut the name at a "(hereinafter referred to as ...)" marker and take the
/// quoted short name from the bracket; keep the part before "及其"; keep the part after "股东"
/// and after "一致行动人"; replace the name by the registered full name when
/// <c>GetCompanyNameByShortName</c> knows it; strip leading words.
/// </remarks>
/// <param name="FullName">Raw full name as extracted from the document.</param>
/// <returns>
/// The cleaned name. Score is 80 and <c>secShortName</c> is set when a short name was found,
/// otherwise Score is 60.
/// </returns>
public static struCompanyName AfterProcessFullName(string FullName)
{
    var ShortName = String.Empty;

    // Brackets are not normalised yet, so both widths are listed. The full-width entries are
    // written with \uFF08 because the list previously held the same four entries twice.
    var CompanyNameTrailingwords = new string[]
    {
        "\uFF08以下简称", "\uFF08下称", "\uFF08以下称", "\uFF08简称",
        "(以下简称", "(下称", "(以下称", "(简称"
    };

    foreach (var trailing in CompanyNameTrailingwords)
    {
        if (!FullName.Contains(trailing))
        {
            continue;
        }

        // Take the short name from the quoted text inside the brackets.
        foreach (var bracketItem in RegularTool.GetChineseBrackets(FullName))
        {
            var ShortNameList = RegularTool.GetChineseQuotation(bracketItem);
            if (ShortNameList.Count == 0)
            {
                continue;
            }
            ShortName = ShortNameList.First();
            // Drop the first and last character (presumably the quotation marks returned by
            // GetChineseQuotation). The length guard avoids an exception on one character.
            if (!String.IsNullOrEmpty(ShortName) && ShortName.Length >= 2)
            {
                ShortName = ShortName.Substring(1, ShortName.Length - 2);
            }
        }
        FullName = Utility.GetStringBefore(FullName, trailing);
    }

    if (FullName.Contains("及其"))
    {
        FullName = Utility.GetStringBefore(FullName, "及其");
    }
    if (FullName.Contains("股东"))
    {
        FullName = Utility.GetStringAfter(FullName, "股东");
    }
    if (FullName.Contains("一致行动人"))
    {
        FullName = Utility.GetStringAfter(FullName, "一致行动人");
    }

    // Looked up once; the previous code performed the same lookup twice.
    var registeredFullName = CompanyNameLogic.GetCompanyNameByShortName(FullName).secFullName;
    if (!String.IsNullOrEmpty(registeredFullName))
    {
        FullName = registeredFullName;
    }

    // Remove leading words.
    FullName = EntityWordAnlayzeTool.TrimLeadingUL(FullName);
    FullName = CutOtherLeadingWords(FullName);

    if (ShortName != String.Empty)
    {
        return new struCompanyName() { secFullName = FullName, secShortName = ShortName, Score = 80 };
    }
    return new struCompanyName() { secFullName = FullName, Score = 60 };
}
/// <summary>
/// Builds one contract record for an announcement.
/// </summary>
/// <remarks>
/// The contract type ("中标" or "合同") is taken from the first content item that mentions
/// either word and is stored in <c>contractType</c>. Party A, party B, project name, contract
/// name, money and union members are then read through the <c>Get*</c> helpers and cleaned up.
/// </remarks>
/// <param name="root">Parsed announcement; its paragraphs are scanned for the contract type.</param>
/// <param name="Id">Announcement id, copied into the result.</param>
/// <returns>The populated contract record.</returns>
struContract ExtractSingle(MyRootHtmlNode root, String Id)
{
    // Removes a surrounding pair of curly quotation marks, if both are present.
    string StripCurlyQuotes(string text)
    {
        if (text.StartsWith("“") && text.EndsWith("”"))
        {
            return text.TrimStart("“".ToCharArray()).TrimEnd("”".ToCharArray());
        }
        return text;
    }

    // Keeps the text before a "(以下简称" marker. The short-name clause is not part of the name.
    // The previous code kept the text AFTER the marker, i.e. the short-name clause itself.
    // Both bracket widths are accepted.
    string CutShortNameClause(string text)
    {
        foreach (var marker in new[] { "\uFF08以下简称", "(以下简称" })
        {
            int markerIdx = text.IndexOf(marker, StringComparison.Ordinal);
            if (markerIdx >= 0)
            {
                text = text.Substring(0, markerIdx);
            }
        }
        return text;
    }

    // Decide the contract type from the first item mentioning "中标" or "合同".
    contractType = String.Empty;
    foreach (var paragrah in root.Children)
    {
        foreach (var item in paragrah.Children)
        {
            if (item.Content.Contains("中标"))
            {
                contractType = "中标";
                break;
            }
            if (item.Content.Contains("合同"))
            {
                contractType = "合同";
                break;
            }
        }
        if (contractType != String.Empty)
        {
            break;
        }
    }
    if (contractType == String.Empty)
    {
        Console.WriteLine("contractType Null:" + Id);
    }

    var contract = new struContract();
    // Announcement id
    contract.id = Id;

    // Party A
    contract.JiaFang = GetJiaFang();
    contract.JiaFang = CompanyNameLogic.AfterProcessFullName(contract.JiaFang).secFullName;
    contract.JiaFang = contract.JiaFang.NormalizeTextResult();
    // State Grid is a special case: when it is among the recognised entities and the extracted
    // party A is not, it is normally party A.
    if (!Nerlist.Contains(contract.JiaFang) && Nerlist.Contains("国家电网公司"))
    {
        contract.JiaFang = "国家电网公司";
    }

    // Party B
    contract.YiFang = GetYiFang();
    contract.YiFang = CompanyNameLogic.AfterProcessFullName(contract.YiFang).secFullName;
    contract.YiFang = contract.YiFang.NormalizeTextResult();
    // Brackets are removed as required by the rules.
    contract.YiFang = RegularTool.TrimBrackets(contract.YiFang);

    // Project name
    contract.ProjectName = StripCurlyQuotes(GetProjectName());
    // Drop a trailing ",签约双方" (either comma width). The previous code took the text after
    // the suffix, which discarded the project name.
    foreach (var suffix in new[] { "\uFF0C签约双方", ",签约双方" })
    {
        if (contract.ProjectName.EndsWith(suffix, StringComparison.Ordinal))
        {
            contract.ProjectName = contract.ProjectName.Substring(0, contract.ProjectName.Length - suffix.Length);
        }
    }
    contract.ProjectName = CutShortNameClause(contract.ProjectName);
    contract.ProjectName = contract.ProjectName.NormalizeTextResult();

    // Contract name
    if (contractType == "中标")
    {
        // Per the data analysis: the project name is filled in for bid-winning announcements,
        // the contract name for contract announcements.
        contract.ContractName = String.Empty;
    }
    else
    {
        contract.ContractName = StripCurlyQuotes(GetContractName());
        // Remove book-title marks.
        contract.ContractName = contract.ContractName.Replace("《", String.Empty).Replace("》", String.Empty);
        contract.ContractName = CutShortNameClause(contract.ContractName);
        contract.ContractName = contract.ContractName.NormalizeTextResult();
    }

    // Money: upper and lower limit are the same value.
    var money = GetMoney();
    contract.ContractMoneyUpLimit = MoneyUtility.Format(money.MoneyAmount, String.Empty);
    contract.ContractMoneyDownLimit = contract.ContractMoneyUpLimit;

    // Union members
    contract.UnionMember = GetUnionMember(contract.JiaFang, contract.YiFang);
    contract.UnionMember = contract.UnionMember.NormalizeTextResult();
    // Brackets are removed as required by the rules.
    contract.UnionMember = RegularTool.TrimBrackets(contract.UnionMember);
    return contract;
}
/// <summary>
/// Collects "holding after the change" rows from five-column tables whose header mentions
/// "增持后持股" or "减持后持股".
/// </summary>
/// <remarks>
/// For each table in <c>root.TableList</c> the first row containing one of the keywords is
/// taken as the header. Tables without such a row, or without exactly five columns, are
/// ignored. Below the header, a row is accepted when column 4 is numeric and column 5 is a
/// percentage.
/// </remarks>
/// <returns>One entry per accepted row: name (column 1), share count and percentage.</returns>
List<struHoldAfter> GetHolderAfter2ndStep()
{
    var HoldList = new List<struHoldAfter>();
    var keyword = new string[] { "增持后持股", "减持后持股" };

    // Trims the cell and removes thousands separators, half-width then full-width. The
    // full-width comma is written as \uFF0C because the code previously removed the same
    // comma twice.
    string StripSeparators(string cell)
    {
        cell = cell.Trim().Replace(",", String.Empty);
        return cell.Trim().Replace("\uFF0C", String.Empty);
    }

    foreach (var table in root.TableList)
    {
        var mt = new HTMLTable(table.Value);

        // Locate the header row. Rows and columns are 1-based, so both bounds are inclusive;
        // the column loop used to stop one short and never looked at the last column.
        var HeaderRowNo = -1;
        for (int RowCount = 1; RowCount <= mt.RowCount && HeaderRowNo == -1; RowCount++)
        {
            for (int ColumnCount = 1; ColumnCount <= mt.ColumnCount; ColumnCount++)
            {
                var value = mt.CellValue(RowCount, ColumnCount);
                if (keyword.Any(key => value.Contains(key)))
                {
                    HeaderRowNo = RowCount;
                    break;
                }
            }
        }

        // Only five-column tables with a header are handled here.
        if (HeaderRowNo == -1 || mt.ColumnCount != 5)
        {
            continue;
        }

        // Header cells do not change per row, so they are read once.
        var Title4 = mt.CellValue(HeaderRowNo, 4);
        var Title5 = mt.CellValue(HeaderRowNo, 5).Replace(" ", "");
        // Accept the unit suffix with either half-width or full-width brackets / percent sign.
        var comparableTitle5 = Title5.Replace('\uFF08', '(').Replace('\uFF09', ')').Replace('\uFF05', '%');
        bool percentInTitle = comparableTitle5.Contains("增持后持股比例(%)")
                           || comparableTitle5.Contains("减持后持股比例(%)");

        for (int rowno = HeaderRowNo + 1; rowno <= mt.RowCount; rowno++)
        {
            var value1 = mt.CellValue(rowno, 1);
            var value4 = StripSeparators(mt.CellValue(rowno, 4));
            var value5 = StripSeparators(mt.CellValue(rowno, 5));

            // The header already carries the unit, so bare numbers get the percent sign.
            if (percentInTitle && !value5.Contains("%"))
            {
                value5 += "%";
            }

            if (RegularTool.IsNumeric(value4) && RegularTool.IsPercent(value5))
            {
                HoldList.Add(new struHoldAfter()
                {
                    Name = value1,
                    Count = getAfterstock(Title4, value4),
                    Percent = getAfterpercent(value5),
                    Used = false
                });
            }
        }
    }
    return HoldList;
}
/// <summary>
/// Regular-expression search with optional leading and trailing word filters.
/// </summary>
/// <remarks>
/// Every match of <c>regularfeature.RegularExpress</c> in <paramref name="OrgString"/> is kept
/// only when (a) <c>LeadingWordList</c> is null or one of its words stands directly before the
/// match, and (b) <c>TrailingWordList</c> is null or one of its words stands directly after it.
/// In each list the first word that fits wins.
/// </remarks>
/// <param name="loc">Location value copied into every result.</param>
/// <param name="OrgString">Text to search.</param>
/// <param name="regularfeature">Pattern plus the optional leading / trailing word lists.</param>
/// <param name="SplitChar">Separator placed between leading word, match and trailing word.</param>
/// <returns>
/// One entry per accepted match. Value is leading + separator + match + separator + trailing;
/// StartIdx points at the start of the leading word (or of the match when there is none).
/// </returns>
public static List<LocAndValue<String>> RegularExFinder(int loc, string OrgString, struRegularExpressFeature regularfeature, string SplitChar = "")
{
    var found = new List<LocAndValue<String>>();

    foreach (var hit in RegularTool.GetRegular(OrgString, regularfeature.RegularExpress))
    {
        // Leading word: must sit immediately in front of the match.
        string prefix = "";
        if (regularfeature.LeadingWordList != null)
        {
            bool prefixMatched = false;
            foreach (var candidate in regularfeature.LeadingWordList)
            {
                int start = hit.Index - candidate.Length;
                if (start >= 0 && String.CompareOrdinal(OrgString, start, candidate, 0, candidate.Length) == 0)
                {
                    prefix = candidate;
                    prefixMatched = true;
                    break;
                }
            }
            if (!prefixMatched)
            {
                continue;
            }
        }

        // Trailing word: must sit immediately behind the match.
        string suffix = "";
        if (regularfeature.TrailingWordList != null)
        {
            bool suffixMatched = false;
            int tailStart = hit.Index + hit.Length;
            foreach (var candidate in regularfeature.TrailingWordList)
            {
                if (tailStart + candidate.Length <= OrgString.Length
                    && String.CompareOrdinal(OrgString, tailStart, candidate, 0, candidate.Length) == 0)
                {
                    suffix = candidate;
                    suffixMatched = true;
                    break;
                }
            }
            if (!suffixMatched)
            {
                continue;
            }
        }

        found.Add(new LocAndValue<String>()
        {
            Value = prefix + SplitChar + hit.RawData + SplitChar + suffix,
            StartIdx = hit.Index - prefix.Length,
            Loc = loc
        });
    }

    return found;
}
/// <summary>
/// Extracts the end date from a date or date-range text and formats it.
/// </summary>
/// <remarks>
/// Spaces and half-width commas are removed first. The supported shapes are tried in order:
/// two full dates; a full date followed by month/day; a full date followed by a day; month/day
/// only (the year comes from <paramref name="BaseDate"/>); year/month only; a single
/// "年 / 月 / 日" date; eight digits (yyyyMMdd); and three numbers separated by "/", "." or "-".
/// When nothing fits, the cleaned input text is returned unchanged.
/// </remarks>
/// <param name="orgString">Date text. Null is returned as-is.</param>
/// <param name="BaseDate">Announcement date, used when the text carries no year or no day.</param>
/// <param name="format">Output format passed to <c>ToString</c>.</param>
/// <returns>The formatted end date, or the cleaned input when it cannot be interpreted.</returns>
public static string GetRangeDateEndDate(string orgString, DateTime BaseDate, string format = "yyyy-MM-dd")
{
    if (orgString == null)
    {
        return orgString;
    }

    orgString = orgString.Replace(" ", "");
    orgString = orgString.Trim().Replace(",", String.Empty);

    // Parses the three parts and formats the result of DateUtility.GetWorkDay.
    // Returns false when any part is not an integer.
    bool TryWorkDay(string yearText, string monthText, string dayText, out string formatted)
    {
        formatted = null;
        if (int.TryParse(yearText, out int y) && int.TryParse(monthText, out int m) && int.TryParse(dayText, out int d))
        {
            formatted = DateUtility.GetWorkDay(y, m, d).ToString(format);
            return true;
        }
        return false;
    }

    bool hasYear = orgString.IndexOf("年") != -1;
    bool hasMonth = orgString.IndexOf("月") != -1;
    bool hasDay = orgString.IndexOf("日") != -1;

    var NumberList = RegularTool.GetNumberList(orgString);
    string result;

    // yyyy年MM月dd日 - yyyy年MM月dd日
    if (NumberList.Count == 6)
    {
        if (TryWorkDay(NumberList[3], NumberList[4], NumberList[5], out result))
        {
            return result;
        }
    }

    // yyyy年MM月dd日 - MM月dd日
    if (NumberList.Count == 5 && hasYear && hasMonth && hasDay)
    {
        if (TryWorkDay(NumberList[0], NumberList[3], NumberList[4], out result))
        {
            return result;
        }
    }

    // yyyy年MM月dd日 - dd日
    if (NumberList.Count == 4 && hasYear && hasMonth && hasDay)
    {
        if (TryWorkDay(NumberList[0], NumberList[1], NumberList[3], out result))
        {
            return result;
        }
    }

    if (NumberList.Count == 2)
    {
        // MM月dd日: the year is taken from the announcement date.
        if (hasMonth && hasDay)
        {
            // The previous test was BaseDate.Year == 0, which can never be true because
            // DateTime.Year is at least 1. An unset (default) base date is what is meant.
            if (BaseDate == DateTime.MinValue)
            {
                return orgString;
            }
            if (int.TryParse(NumberList[0], out int month) && int.TryParse(NumberList[1], out int day))
            {
                return DateUtility.GetWorkDay(BaseDate.Year, month, day).ToString(format);
            }
        }

        // yyyy年MM月, no day given.
        // This mainly serves shareholder increase/decrease announcements: a few of them publish
        // only the month of the change end date. The rule for that case is:
        //   - month earlier than the announcement month: last trading day of that month;
        //   - month equal to the announcement month: the announcement date itself.
        if (hasYear && hasMonth)
        {
            if (int.TryParse(NumberList[0], out int year) && int.TryParse(NumberList[1], out int month))
            {
                if (year == BaseDate.Year && month == BaseDate.Month)
                {
                    return BaseDate.ToString(format);
                }
                // Day -1 presumably asks GetWorkDay for the month's last working day. TODO confirm.
                return DateUtility.GetWorkDay(year, month, -1).ToString(format);
            }
        }

        // A four-digit year and a month without the "年" character.
        if (hasMonth)
        {
            if (NumberList[0].Length != 4)
            {
                return orgString;
            }
            if (int.TryParse(NumberList[0], out int year) && int.TryParse(NumberList[1], out int month))
            {
                return DateUtility.GetWorkDay(year, month, -1).ToString(format);
            }
        }
    }

    // yyyy年MM月dd日 as a single date. "日" is optional because it is stripped from the day
    // part below (the previous condition tested "月" twice).
    if (orgString.Contains("年") && orgString.Contains("月"))
    {
        String Year = Utility.GetStringBefore(orgString, "年");
        String Month = RegularTool.GetValueBetweenString(orgString, "年", "月");
        String Day = Utility.GetStringAfter(orgString, "月").Replace("日", String.Empty);
        if (TryWorkDay(Year, Month, Day, out result))
        {
            return result;
        }
    }

    // yyyyMMdd
    if (RegularTool.IsInt(orgString) && orgString.Length == 8)
    {
        if (int.TryParse(orgString.Substring(0, 4), out int year)
            && int.TryParse(orgString.Substring(4, 2), out int month)
            && int.TryParse(orgString.Substring(6, 2), out int day))
        {
            // The year check used to be inverted (year < 1900 || year > 2100), so ordinary dates
            // were skipped. Month and day are validated as well, because any eight-digit number
            // reaches this branch.
            bool plausible = year >= 1900 && year <= 2100
                          && month >= 1 && month <= 12
                          && day >= 1 && day <= DateTime.DaysInMonth(year, month);
            if (plausible)
            {
                return DateUtility.GetWorkDay(year, month, day).ToString(format);
            }
        }
    }

    // yyyy/MM/dd, yyyy.MM.dd, yyyy-MM-dd
    var SplitChar = new string[] { "/", ".", "-" };
    foreach (var sc in SplitChar)
    {
        var SplitArray = orgString.Split(sc);
        if (SplitArray.Length == 3)
        {
            if (TryWorkDay(SplitArray[0], SplitArray[1], SplitArray[2], out result))
            {
                return result;
            }
        }
    }

    return orgString;
}