/// <summary>
/// Locates dates: walks every sentence of every paragraph under <paramref name="root"/>,
/// extracts date expressions from the sentence text and records each one that parses.
/// </summary>
/// <param name="root">Root node; its Children are treated as paragraphs and their Children as sentences.</param>
/// <returns>
/// One entry per parsed date, with Loc set to the sentence's PositionId, Type set to "日期"
/// and Value set to the result of <c>DateUtility.GetWorkDay</c> for the parsed year/month/day.
/// </returns>
public static List<LocAndValue<DateTime>> LocateDate(HTMLEngine.MyRootHtmlNode root)
{
    var list = new List<LocAndValue<DateTime>>();
    foreach (var paragraph in root.Children)
    {
        foreach (var sentence in paragraph.Children)
        {
            // The sentence text is passed through DateUtility.ConvertUpperToLower and
            // stripped of spaces before date expressions are searched for.
            var text = DateUtility.ConvertUpperToLower(sentence.Content).Replace(" ", String.Empty);
            foreach (var strDate in DateUtility.GetDate(text))
            {
                var numbers = RegularTool.GetNumberList(strDate);

                // A match that does not carry year, month and day cannot be indexed below;
                // skip it instead of throwing ArgumentOutOfRangeException.
                if (numbers.Count < 3)
                {
                    continue;
                }

                if (int.TryParse(numbers[0], out int year) &&
                    int.TryParse(numbers[1], out int month) &&
                    int.TryParse(numbers[2], out int day))
                {
                    list.Add(new LocAndValue<DateTime>()
                    {
                        Loc = sentence.PositionId,
                        Type = "日期",
                        Value = DateUtility.GetWorkDay(year, month, day)
                    });
                }
            }
        }
    }
    return list;
}
/// <summary>
/// Final extraction run: processes the contract, stock-change and reorganization
/// data sets in turn and writes one result file per data set under /home/118_4/submit.
/// </summary>
/// <param name="args">Command-line arguments; not used by this method.</param>
static void Main_FINAL(string[] args)
{
    Logger = new StreamWriter("Log.log");
    // Share the log writer with the entity property extractor.
    EntityProperty.Logger = Logger;
    // Register the code-page encoding provider globally.
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
    // Place-name correction dictionary for the Jieba word segmenter.
    PosNS.ImportNS("Resources" + Path.DirectorySeparatorChar + "ns.dict");

    if (!Directory.Exists("/home/118_4/submit"))
    {
        Directory.CreateDirectory("/home/118_4/submit");
    }

    // NOTE(review): the result writers created below are never closed in this method.
    // Presumably Run<T> flushes/closes them - confirm against Run<T>.
    Console.WriteLine("Start To Extract Info Contract TRAIN");
    StreamWriter ResultCSV = new StreamWriter(@"/home/118_4/submit/hetong.txt", false, utf8WithoutBom);
    Run<Contract>(@"/home/data/hetong", @"/home/118_4/temp/hetong", ResultCSV);
    Console.WriteLine("Complete Extract Info Contract");

    Console.WriteLine("Start To Extract Info StockChange TRAIN");
    Console.WriteLine("读取增减持信息:" + "/home/data/zengjianchi/zengjianchi_public.csv");
    // The using block releases the file handle even when a row fails to parse.
    using (var sr = new StreamReader("/home/data/zengjianchi/zengjianchi_public.csv"))
    {
        sr.ReadLine(); // Skip header
        while (!sr.EndOfStream)
        {
            var rawLine = sr.ReadLine();
            // A blank line (for example a trailing newline) has no columns to read.
            if (String.IsNullOrWhiteSpace(rawLine))
            {
                continue;
            }

            // Column 0 holds the announcement date, column 1 the announcement id.
            var line = rawLine.Split(",");
            var numbers = RegularTool.GetNumberList(line[0]);
            int year = int.Parse(numbers[0]);
            int month = int.Parse(numbers[1]);
            int day = int.Parse(numbers[2]);
            var AnnouceDate = new DateTime(year, month, day);
            PublishTime.Add(line[1], AnnouceDate);
        }
    }
    Console.WriteLine("读取增减持信息:" + PublishTime.Count);
    ResultCSV = new StreamWriter(@"/home/118_4/submit/zengjianchi.txt", false, utf8WithoutBom);
    Run<StockChange>(@"/home/data/zengjianchi", @"/home/118_4/temp/zengjianchi", ResultCSV);
    Console.WriteLine("Complete Extract Info StockChange");

    Console.WriteLine("Start To Extract Info Reorganization TRAIN");
    // Hard-coded list used in place of a trained result.
    Console.WriteLine("加载替代训练结果");
    ReOrganizationTraning.EvaluateMethodList = new string[]
    {
        // NOTE(review): "现金流折现法" appears twice; kept as-is so the logged count is unchanged.
        "收益法", "资产基础法", "市场法", "市场比较法", "估值法", "成本法",
        "现金流折现法", "现金流折现法", "剩余法", "内含价值调整法", "可比公司市净率法", "重置成本法",
        "收益现值法", "基础资产法", "假设清偿法", "成本逼近法", "单项资产加和法", "成本加和法",
        "基准地价修正法", "收益还原法", "现金流量法", "单项资产加总法", "折现现金流量法", "基准地价系数修正法"
    }.ToList();
    Console.WriteLine("加载替代训练结果:" + ReOrganizationTraning.EvaluateMethodList.Count);
    ResultCSV = new StreamWriter(@"/home/118_4/submit/chongzu.txt", false, utf8WithoutBom);
    Run<Reorganization>(@"/home/data/chongzu", "", ResultCSV);
    Console.WriteLine("Complete Extract Info Reorganization");

    Logger.Close();
}
/// <summary>
/// Locates date ranges: walks every sentence of every paragraph under <paramref name="root"/>
/// and records a (start, end) pair for each range expression carrying 6, 5 or 4 numbers.
/// </summary>
/// <param name="root">Root node; its Children are treated as paragraphs and their Children as sentences.</param>
/// <returns>
/// One entry per accepted range, with Loc set to the sentence's PositionId and Type set to "日期范围".
/// A side of the range whose numbers do not parse is left at the default DateTime value.
/// </returns>
public static List<LocAndValue<(DateTime StartDate, DateTime EndDate)>> LocateDateRange(HTMLEngine.MyRootHtmlNode root)
{
    // Converts three numeric strings through DateUtility.GetWorkDay, or yields the
    // default DateTime when any of them is not an int.
    DateTime ToWorkDayOrDefault(string y, string m, string d)
    {
        if (int.TryParse(y, out int yy) && int.TryParse(m, out int mm) && int.TryParse(d, out int dd))
        {
            return DateUtility.GetWorkDay(yy, mm, dd);
        }
        return new DateTime();
    }

    var ranges = new List<LocAndValue<(DateTime StartDate, DateTime EndDate)>>();
    foreach (var paragraph in root.Children)
    {
        foreach (var sentence in paragraph.Children)
        {
            var text = sentence.Content;
            text = DateUtility.ConvertUpperToLower(text).Replace(" ", String.Empty);

            foreach (var rangeText in DateUtility.GetRangeDate(text))
            {
                var parts = RegularTool.GetNumberList(rangeText);

                // The start date always comes from parts 0..2. Where the end date's
                // year/month/day come from depends on how many numbers were written.
                int endYearAt;
                int endMonthAt;
                int endDayAt;
                switch (parts.Count)
                {
                    case 6: // full start date, full end date
                        endYearAt = 3; endMonthAt = 4; endDayAt = 5;
                        break;
                    case 5: // end date reuses the start year
                        endYearAt = 0; endMonthAt = 3; endDayAt = 4;
                        break;
                    case 4: // end date reuses the start year and month
                        endYearAt = 0; endMonthAt = 1; endDayAt = 3;
                        break;
                    default:
                        continue;
                }

                var start = ToWorkDayOrDefault(parts[0], parts[1], parts[2]);
                var end = ToWorkDayOrDefault(parts[endYearAt], parts[endMonthAt], parts[endDayAt]);

                ranges.Add(new LocAndValue<(DateTime StartDate, DateTime EndDate)>()
                {
                    Loc = sentence.PositionId,
                    Type = "日期范围",
                    Value = (start, end)
                });
            }
        }
    }
    return ranges;
}
/// <summary>
/// Splits <paramref name="OrgString"/> into items and returns those recognised by
/// <c>IsCompanyOrPerson</c>. An item that is not recognised as a whole is split once at
/// "和" (or, failing that, "及") and both halves must then be recognised.
/// </summary>
/// <param name="OrgString">Raw text listing companies and/or persons; spaces are removed first.</param>
/// <returns>
/// The recognised names in input order. An empty list when the input is null or empty,
/// or as soon as any item cannot be matched (a message is written to the console).
/// </returns>
public List<String> GetCompanys(string OrgString)
{
    var companies = new List<String>();
    if (String.IsNullOrEmpty(OrgString))
    {
        return companies;
    }

    // Cuts a word at "等" and then at "自然人" when those markers are present.
    string CutAtMarkers(string word)
    {
        if (word.Contains("等"))
        {
            word = Utility.GetStringBefore(word, "等");
        }
        if (word.Contains("自然人"))
        {
            word = Utility.GetStringBefore(word, "自然人");
        }
        return word;
    }

    OrgString = OrgString.Replace(" ", "");
    var pieces = OrgString.Split(Utility.SplitChar);

    // With more than three items, a trailing "等" on the last item is dropped.
    var lastAt = pieces.Length - 1;
    if (pieces.Length > 3 && pieces[lastAt].EndsWith("等"))
    {
        pieces[lastAt] = pieces[lastAt].Substring(0, pieces[lastAt].Length - 1);
    }

    foreach (var piece in pieces)
    {
        if (piece.Equals("交易对方"))
        {
            continue;
        }

        var candidate = piece;

        // An item holding exactly one number together with "名" keeps only the text before the number.
        var digits = RegularTool.GetNumberList(candidate);
        if (digits.Count == 1 && candidate.Contains("名"))
        {
            candidate = Utility.GetStringBefore(candidate, digits[0]);
        }

        if (IsCompanyOrPerson(candidate))
        {
            companies.Add(candidate);
            continue;
        }

        // The item may join two names with "和" or "及"; the connector must not sit at either end.
        var splitAt = candidate.IndexOf("和");
        if (splitAt == -1)
        {
            splitAt = candidate.IndexOf("及");
        }

        var splittable = splitAt > 0 && splitAt != (candidate.Length - 1);
        if (!splittable)
        {
            Console.WriteLine("无法匹配任何公司或者自然人:" + candidate);
            return new List<String>();
        }

        var left = CutAtMarkers(candidate.Substring(0, splitAt));
        var right = CutAtMarkers(candidate.Substring(splitAt + 1));

        if (!(IsCompanyOrPerson(left) && IsCompanyOrPerson(right)))
        {
            Console.WriteLine("无法匹配任何公司或者自然人:" + left + "|" + right);
            return new List<String>();
        }

        companies.Add(left);
        companies.Add(right);
    }

    return companies;
}
/// <summary>
/// Normalises a date expression to "yyyy-MM-dd". Tried in order:
/// six numbers (the second date, numbers 3..5, is used); five numbers with 年/月/日
/// (year from number 0, month and day from numbers 3 and 4); a "Y年M月D日" form;
/// and "Y/M/D", "Y.M.D" or "Y-M-D".
/// </summary>
/// <param name="orgString">Raw date text; it is trimmed and its commas are removed.</param>
/// <param name="keyword">Not used by this method; kept for existing callers.</param>
/// <returns>
/// The normalised date, or the cleaned-up input when no form yields a valid calendar date.
/// Impossible dates (for example month 13 or February 30) no longer throw; they are
/// treated as "not a date" and the input is returned.
/// </returns>
public static string NormailizeDate(string orgString, string keyword = "")
{
    // Formats y/m/d only when all three are ints that form a real calendar date,
    // so the DateTime constructor below can never throw ArgumentOutOfRangeException.
    bool TryFormat(string y, string m, string d, out string formatted)
    {
        formatted = String.Empty;
        if (!int.TryParse(y, out int year) || !int.TryParse(m, out int month) || !int.TryParse(d, out int day))
        {
            return false;
        }
        if (year < 1 || year > 9999 || month < 1 || month > 12)
        {
            return false;
        }
        if (day < 1 || day > DateTime.DaysInMonth(year, month))
        {
            return false;
        }
        formatted = new DateTime(year, month, day).ToString("yyyy-MM-dd");
        return true;
    }

    orgString = orgString.Trim().Replace(",", "");
    var NumberList = RegularTool.GetNumberList(orgString);
    string result;

    // Two full dates: the second one is the value returned.
    if (NumberList.Count == 6 &&
        TryFormat(NumberList[3], NumberList[4], NumberList[5], out result))
    {
        return result;
    }

    // Full date followed by month/day: the second date reuses the first year.
    if (NumberList.Count == 5 &&
        orgString.IndexOf("年") != -1 && orgString.IndexOf("月") != -1 && orgString.IndexOf("日") != -1 &&
        TryFormat(NumberList[0], NumberList[3], NumberList[4], out result))
    {
        return result;
    }

    // "Y年M月D日" form; a trailing "日" is optional, as in the original condition.
    if (orgString.Contains("年") && orgString.Contains("月"))
    {
        String Year = Utility.GetStringBefore(orgString, "年");
        String Month = RegularTool.GetValueBetweenString(orgString, "年", "月");
        String Day = Utility.GetStringAfter(orgString, "月").Replace("日", "");
        if (TryFormat(Year, Month, Day, out result))
        {
            return result;
        }
    }

    // Separator-delimited forms.
    var SplitChar = new string[] { "/", ".", "-" };
    foreach (var sc in SplitChar)
    {
        var SplitArray = orgString.Split(sc);
        if (SplitArray.Length == 3 &&
            TryFormat(SplitArray[0], SplitArray[1], SplitArray[2], out result))
        {
            return result;
        }
    }

    return orgString;
}
/// <summary>
/// Extracts the end date of a date or date-range expression and formats it with
/// <paramref name="format"/>. Dates are passed through <c>DateUtility.GetWorkDay</c>.
/// </summary>
/// <param name="orgString">Raw text; spaces and commas are removed before parsing.</param>
/// <param name="BaseDate">
/// Announcement date. It supplies the year for "M月D日" input and is returned as-is when
/// "Y年M月" names the announcement's own month. A default(DateTime) means "unknown".
/// </param>
/// <param name="format">DateTime format string used for the result.</param>
/// <returns>The formatted end date, or the cleaned-up input when no form matches.</returns>
public static string GetRangeDateEndDate(string orgString, DateTime BaseDate, string format = "yyyy-MM-dd")
{
    // Parses y/m/d and formats the resulting work day; false when any part is not an int.
    bool TryWorkDay(string y, string m, string d, out string formatted)
    {
        formatted = String.Empty;
        if (int.TryParse(y, out int year) && int.TryParse(m, out int month) && int.TryParse(d, out int day))
        {
            formatted = DateUtility.GetWorkDay(year, month, day).ToString(format);
            return true;
        }
        return false;
    }

    orgString = orgString.Replace(" ", "");
    orgString = orgString.Trim().Replace(",", String.Empty);

    var NumberList = RegularTool.GetNumberList(orgString);
    bool hasYear = orgString.IndexOf("年") != -1;
    bool hasMonth = orgString.IndexOf("月") != -1;
    bool hasDay = orgString.IndexOf("日") != -1;
    string result;

    // XXXX年XX月XX日 - XXXX年XX月XX日
    if (NumberList.Count == 6 &&
        TryWorkDay(NumberList[3], NumberList[4], NumberList[5], out result))
    {
        return result;
    }

    // XXXX年XX月XX日 - XX月XX日
    if (NumberList.Count == 5 && hasYear && hasMonth && hasDay &&
        TryWorkDay(NumberList[0], NumberList[3], NumberList[4], out result))
    {
        return result;
    }

    // XXXX年XX月XX日 - XX日
    if (NumberList.Count == 4 && hasYear && hasMonth && hasDay &&
        TryWorkDay(NumberList[0], NumberList[1], NumberList[3], out result))
    {
        return result;
    }

    if (NumberList.Count == 2)
    {
        // XX月XX日: the year comes from BaseDate.
        if (hasMonth && hasDay)
        {
            // DateTime.Year is never 0, so the original "Year == 0" test could not fire;
            // an unset base date is the default DateTime value.
            if (BaseDate == default(DateTime))
            {
                return orgString;
            }
            if (int.TryParse(NumberList[0], out int month) && int.TryParse(NumberList[1], out int day))
            {
                return DateUtility.GetWorkDay(BaseDate.Year, month, day).ToString(format);
            }
        }

        if (hasYear && hasMonth)
        {
            /*
             * Mainly used when extracting "shareholder holding increase/decrease" announcements.
             * For the "change end date" field a few announcements give only the month, not the day.
             * The rule for that case:
             *   - if the month is before the announcement month, the end date is the last
             *     trading day of that month;
             *   - if the month is the announcement month, the end date is the announcement date.
             */
            if (int.TryParse(NumberList[0], out int year) && int.TryParse(NumberList[1], out int month))
            {
                if (year == BaseDate.Year && month == BaseDate.Month)
                {
                    return BaseDate.ToString(format);
                }
                // NOTE(review): day -1 presumably asks GetWorkDay for the month's last trading day - confirm.
                return DateUtility.GetWorkDay(year, month, -1).ToString(format);
            }
        }

        if (hasMonth)
        {
            String Year = NumberList[0];
            // Only a four-digit first number is accepted as a year here.
            if (Year.Length != 4)
            {
                return orgString;
            }
            if (int.TryParse(Year, out int year) && int.TryParse(NumberList[1], out int month))
            {
                return DateUtility.GetWorkDay(year, month, -1).ToString(format);
            }
        }
    }

    // XXXX年XX月XX日 (a trailing "日" is optional, as in the original condition)
    if (orgString.Contains("年") && orgString.Contains("月"))
    {
        String Year = Utility.GetStringBefore(orgString, "年");
        String Month = RegularTool.GetValueBetweenString(orgString, "年", "月");
        String Day = Utility.GetStringAfter(orgString, "月").Replace("日", String.Empty);
        if (TryWorkDay(Year, Month, Day, out result))
        {
            return result;
        }
    }

    // yyyyMMdd written as eight digits.
    if (RegularTool.IsInt(orgString) && orgString.Length == 8)
    {
        String Year = orgString.Substring(0, 4);
        String Month = orgString.Substring(4, 2);
        String Day = orgString.Substring(6, 2);
        if (int.TryParse(Year, out int year) && int.TryParse(Month, out int month) && int.TryParse(Day, out int day))
        {
            // Accept only plausible years. The original condition was inverted and
            // accepted only years outside 1900..2100.
            if (year >= 1900 && year <= 2100)
            {
                return DateUtility.GetWorkDay(year, month, day).ToString(format);
            }
        }
    }

    // Separator-delimited forms.
    var SplitChar = new string[] { "/", ".", "-" };
    foreach (var sc in SplitChar)
    {
        var SplitArray = orgString.Split(sc);
        if (SplitArray.Length == 3 &&
            TryWorkDay(SplitArray[0], SplitArray[1], SplitArray[2], out result))
        {
            return result;
        }
    }

    return orgString;
}
/// <summary>
/// Normalises a lock-up period expression to a number of months, returned as a string.
/// </summary>
/// <param name="orgString">Raw text; spaces are removed before matching.</param>
/// <param name="TitleWord">Not used by this method.</param>
/// <returns>
/// The month count when one of the known forms matches; otherwise the space-stripped,
/// trimmed input.
/// </returns>
static string NormalizerFreezeYear(string orgString, string TitleWord)
{
    var text = orgString.Replace(" ", String.Empty);
    if (text.Equals("十二"))
    {
        return "12";
    }

    int months;

    // A plain integer after "日起".
    var candidate = Utility.GetStringAfter(text, "日起");
    if (int.TryParse(candidate, out months))
    {
        return months.ToString();
    }

    // A plain integer before "个月".
    candidate = Utility.GetStringBefore(text, "个月");
    if (int.TryParse(candidate, out months))
    {
        return months.ToString();
    }

    // Whatever sits between "日起" and "个月".
    candidate = RegularTool.GetValueBetweenString(text, "日起", "个月");
    if (candidate.Equals("十二"))
    {
        return "12";
    }
    if (int.TryParse(candidate, out months))
    {
        return months.ToString();
    }

    // Fixed spellings.
    switch (text)
    {
        case "十二个月":
        case "1年":
            return "12";
        case "3年":
            return "36";
    }

    // A range written as two full dates, e.g. 自2007年2月3日至2010年2月2日止: use the year difference.
    var numbers = RegularTool.GetNumberList(text);
    if (numbers.Count == 6 &&
        int.TryParse(numbers[3], out int endYear) &&
        int.TryParse(numbers[0], out int startYear))
    {
        switch (endYear - startYear)
        {
            case 1:
                return "12";
            case 3:
                return "36";
        }

        // Any other year difference is logged for manual confirmation when not in multi-thread mode.
        if (!Program.IsMultiThreadMode)
        {
            Program.Logger.WriteLine("限售期确认:" + text);
        }
    }

    return text.Trim();
}