/// <summary>
/// Segments every paragraph of the document and prints the segments that
/// surround each occurrence of the keyword.
/// </summary>
/// <param name="root">Parsed HTML document whose paragraphs are scanned.</param>
/// <param name="KeyWord">Segment to look for; compared with <c>Equals</c> against every segment.</param>
public static void Anlayze(HTMLEngine.MyRootHtmlNode root, string KeyWord) {
    // Number of segments reported on each side of the keyword.
    const int ContextSize = 5;

    Console.WriteLine("关键字:[" + KeyWord + "]");
    foreach (var paragrah in root.Children) {
        // Cut() is called with its default mode (the original comment calls it the precise mode).
        var segments = segmenter.Cut(paragrah.FullText).ToList();
        Console.WriteLine("【精确模式】:{0}", string.Join("/ ", segments));

        // Look for the keyword among the segments.
        for (int i = 0; i < segments.Count; i++) {
            if (!segments[i].Equals(KeyWord)) {
                continue;
            }

            var startInx = Math.Max(0, i - ContextSize);
            // Exclusive upper bound. The former bound (i + 5) reported only four
            // trailing segments although five are reported on the leading side.
            var endInx = Math.Min(i + 1 + ContextSize, segments.Count);

            for (int s = startInx; s < i; s++) {
                Console.WriteLine("前导关键字:[" + segments[s] + "]");
            }
            Console.WriteLine("关键字:[" + KeyWord + "]");
            for (int s = i + 1; s < endInx; s++) {
                Console.WriteLine("后续关键字:[" + segments[s] + "]");
            }
        }
    }
}
/// <summary>
/// Collects every match of <c>RegularTool.PercentExpress</c> in the sentences of the document.
/// </summary>
/// <param name="root">Parsed HTML document.</param>
/// <returns>One entry per match, carrying the sentence position, the matched text and its start index.</returns>
public static List<LocAndValue<String>> LocatePercent(HTMLEngine.MyRootHtmlNode root) {
    var list = new List<LocAndValue<String>>();
    // The pattern does not depend on the sentence, so the regex is built once
    // instead of once per sentence.
    var percentRegex = new Regex(RegularTool.PercentExpress);
    foreach (var paragrah in root.Children) {
        foreach (var sentence in paragrah.Children) {
            var OrgString = sentence.Content;
            foreach (Match item in percentRegex.Matches(OrgString)) {
                list.Add(new LocAndValue<String>() {
                    Loc = sentence.PositionId,
                    Description = "百分比",
                    Value = item.Value,
                    StartIdx = item.Index
                });
            }
        }
    }
    return list;
}
/// <summary>
/// For every table cell (from row 2 on) whose normalized text equals the normalized
/// keyword, counts the header (row 1) of that cell's column in <c>TrainingTitleResult</c>.
/// </summary>
/// <param name="root">Parsed HTML document whose tables are scanned.</param>
/// <param name="KeyWord">Value to look for in the table cells.</param>
public static void PutTrainingItem(HTMLEngine.MyRootHtmlNode root, string KeyWord) {
    // A document without a table list has nothing to contribute.
    if (root.TableList == null) {
        return;
    }
    foreach (var Table in root.TableList) {
        var t = new HTMLTable(Table.Value);
        // Start from the second row; the first row supplies the titles.
        // NOTE(review): both loops stop before RowCount / ColumnCount — confirm these
        // are exclusive bounds for the indexing CellValue uses.
        for (int RowNo = 2; RowNo < t.RowCount; RowNo++) {
            for (int ColNo = 1; ColNo < t.ColumnCount; ColNo++) {
                if (!t.CellValue(RowNo, ColNo).NormalizeKey().Equals(KeyWord.NormalizeKey())) {
                    continue;
                }
                var title = t.CellValue(1, ColNo);
                if (TrainingTitleResult.ContainsKey(title)) {
                    TrainingTitleResult[title]++;
                } else {
                    TrainingTitleResult.Add(title, 1);
                }
            }
        }
    }
}
/// <summary>
/// Stock increase/decrease training: collects the table values found under the
/// "减持方式" / "增持方式" titles and writes the ten most frequent ones to the training log.
/// </summary>
/// <param name="TraningCnt">Maximum number of announcements (distinct ids) to train on.</param>
public static void Traning(int TraningCnt = int.MaxValue) {
    var ChangeMethodTool = new TableAnlayzeTool();
    var PreviewId = String.Empty;
    var PreviewRoot = new HTMLEngine.MyRootHtmlNode();
    int Cnt = 0;
    // Use the platform separator instead of a hard-coded backslash.
    string sep = System.IO.Path.DirectorySeparatorChar.ToString();
    foreach (var stockchange in TraningDataset.StockChangeList) {
        if (!PreviewId.Equals(stockchange.Id)) {
            // Stop before opening a document beyond the limit. The former check ran
            // after parsing the N-th document and left it unused, so only N-1
            // documents were actually trained on.
            if (Cnt == TraningCnt) {
                break;
            }
            var htmlfile = Program.DocBase + sep + "FDDC_announcements_round1_train_20180518"
                           + sep + "增减持" + sep + "html" + sep + stockchange.Id + ".html";
            PreviewRoot = new HTMLEngine().Anlayze(htmlfile, "");
            PreviewId = stockchange.Id;
            Cnt++;
        }
        // Consecutive records sharing an id reuse the already parsed document.
        ChangeMethodTool.PutValueTrainingItem(PreviewRoot, new string[] { "减持方式", "增持方式" }.ToList());
    }
    var rank = Utility.FindTop(10, ChangeMethodTool.TrainingValueResult);
    Program.Training.WriteLine("增减持方式");
    foreach (var rec in rank) {
        Program.Training.WriteLine(rec.ToString());
    }
}
/// <summary>
/// Counts, over the increase-stock training set, which table titles head a column
/// containing the publish target, then prints the titles whose count reaches the
/// tenth-highest count.
/// </summary>
public static void TrainingIncreaseTarget() {
    TraningDataset.InitIncreaseStock();
    var PreviewId = "";
    var PreviewRoot = new HTMLEngine.MyRootHtmlNode();
    foreach (var increase in TraningDataset.IncreaseStockList) {
        // Parse the announcement only when the id changes. The former condition was
        // inverted and PreviewId was never updated, so no file was ever loaded.
        if (!PreviewId.Equals(increase.id)) {
            var htmlfile = Program.DocBase + @"\FDDC_announcements_round1_train_20180518\round1_train_20180518\定增\html\" + increase.id + ".html";
            PreviewRoot = HTMLEngine.Anlayze(htmlfile);
            PreviewId = increase.id;
        }
        TableAnlayzeTool.PutTrainingItem(PreviewRoot, increase.PublishTarget);
    }

    List<int> Rank = TableAnlayzeTool.TrainingTitleResult.Values.ToList();
    if (Rank.Count == 0) {
        // Nothing was counted; there is no threshold to compute.
        return;
    }
    Rank.Sort();
    Rank.Reverse();
    // Threshold is the 10th highest count, or the lowest one when fewer than ten
    // titles exist (indexing Rank[9] directly threw in that case).
    var Top10 = Rank[Math.Min(9, Rank.Count - 1)];
    foreach (var title in TableAnlayzeTool.TrainingTitleResult) {
        if (title.Value >= Top10) {
            Console.WriteLine(title.Key + ":" + title.Value);
        }
    }
}
/// <summary>
/// Locates every occurrence of each custom word in the sentences of the document.
/// </summary>
/// <param name="root">Parsed HTML document.</param>
/// <param name="CustomerWord">Words to search for; null or empty entries are skipped.</param>
/// <param name="description">Description stored on every located entry.</param>
/// <returns>One entry per non-overlapping occurrence, with the sentence position and start index.</returns>
public static List<LocAndValue<String>> LocateCustomerWord(HTMLEngine.MyRootHtmlNode root, List<String> CustomerWord, string description = "字符") {
    var list = new List<LocAndValue<String>>();
    foreach (var paragrah in root.Children) {
        foreach (var sentence in paragrah.Children) {
            var OrgString = sentence.Content;
            foreach (var word in CustomerWord) {
                // An empty word matches at every position without advancing the scan,
                // which previously made this loop run forever.
                if (String.IsNullOrEmpty(word)) {
                    continue;
                }
                // Ordinal search: the returned index and word.Length are then both
                // plain char offsets, which a culture-sensitive search does not guarantee.
                int matchIdx = OrgString.IndexOf(word, 0, StringComparison.Ordinal);
                while (matchIdx != -1) {
                    list.Add(new LocAndValue<String>() {
                        Loc = sentence.PositionId,
                        Description = description,
                        Value = word,
                        StartIdx = matchIdx
                    });
                    matchIdx = OrgString.IndexOf(word, matchIdx + word.Length, StringComparison.Ordinal);
                }
            }
        }
    }
    return list;
}
/// <summary>
/// Finds the titles (row 1) of the columns that contain the keyword and counts
/// them in <c>TrainingTitleResult</c>.
/// </summary>
/// <param name="root">Parsed HTML document whose tables are scanned.</param>
/// <param name="KeyWord">Value to look for; compared after <c>NormalizeTextResult</c> on both sides.</param>
public void PutTitleTrainingItem(HTMLEngine.MyRootHtmlNode root, string KeyWord) {
    // Same guard as PutTitleTrainingItemWithCodition: no table list, nothing to do.
    if (root.TableList == null) {
        return;
    }
    foreach (var Table in root.TableList) {
        var t = new HTMLTable(Table.Value);
        // Start from the second row; the first row supplies the titles.
        // NOTE(review): both loops stop before RowCount / ColumnCount — confirm these
        // are exclusive bounds for the indexing CellValue uses.
        for (int RowNo = 2; RowNo < t.RowCount; RowNo++) {
            for (int ColNo = 1; ColNo < t.ColumnCount; ColNo++) {
                var title = t.CellValue(1, ColNo).Replace(" ", "");
                if (String.IsNullOrEmpty(title)) {
                    continue;
                }
                var value = t.CellValue(RowNo, ColNo);
                // Optional per-tool conversion of the raw cell text before comparison.
                if (Transform != null) {
                    value = Transform(value, title);
                }
                if (!value.NormalizeTextResult().Equals(KeyWord.NormalizeTextResult())) {
                    continue;
                }
                if (TrainingTitleResult.ContainsKey(title)) {
                    TrainingTitleResult[title]++;
                } else {
                    TrainingTitleResult.Add(title, 1);
                }
            }
        }
    }
}
/// <summary>
/// Extracts the text enclosed in book-title marks (《…》) and in curly double quotes (“…”).
/// </summary>
/// <param name="root">Parsed HTML document.</param>
/// <param name="IsSkipBracket">When true, a match contained in one of the items returned by
/// <c>RegularTool.GetChineseBrackets</c> for the sentence is ignored.</param>
/// <returns>One entry per kept match; the value is the match without its first and last character.</returns>
public static List<LocAndValue<String>> LocateQuotation(HTMLEngine.MyRootHtmlNode root, bool IsSkipBracket = true) {
    var list = new List<LocAndValue<String>>();
    // Both patterns are handled identically, so they share one loop; the regexes are
    // built once rather than once per sentence. Order is kept: title marks first.
    var quotePatterns = new Regex[] { new Regex(@"\《.*?\》"), new Regex(@"\“.*?\”") };
    foreach (var paragrah in root.Children) {
        foreach (var sentence in paragrah.Children) {
            var OrgString = sentence.Content;
            var BracketList = RegularTool.GetChineseBrackets(OrgString);
            foreach (var pattern in quotePatterns) {
                foreach (Match item in pattern.Matches(OrgString)) {
                    if (IsSkipBracket) {
                        bool IsContentInBracket = false;
                        foreach (var bracketItem in BracketList) {
                            if (bracketItem.Contains(item.Value)) {
                                IsContentInBracket = true;
                                break;
                            }
                        }
                        if (IsContentInBracket) {
                            continue;
                        }
                    }
                    list.Add(new LocAndValue<String>() {
                        Loc = sentence.PositionId,
                        Type = "字符",
                        // Drop the opening and closing mark.
                        Value = item.Value.Substring(1, item.Value.Length - 2)
                    });
                }
            }
        }
    }
    return list;
}
/// <summary>
/// Locates dates in the sentences of the document.
/// </summary>
/// <param name="root">Parsed HTML document.</param>
/// <returns>One entry per date string whose first three numbers parse as integers; the value
/// is whatever <c>DateUtility.GetWorkDay</c> returns for them.</returns>
public static List<LocAndValue<DateTime>> LocateDate(HTMLEngine.MyRootHtmlNode root) {
    var list = new List<LocAndValue<DateTime>>();
    foreach (var paragrah in root.Children) {
        foreach (var sentence in paragrah.Children) {
            var OrgString = DateUtility.ConvertUpperToLower(sentence.Content).Replace(" ", String.Empty);
            var datelist = DateUtility.GetDate(OrgString);
            foreach (var strDate in datelist) {
                var DateNumberList = RegularTool.GetNumberList(strDate);
                // Year, month and day are all required; skip strings that yield fewer
                // numbers instead of failing on the indexer.
                if (DateNumberList.Count < 3) {
                    continue;
                }
                int year;
                int month;
                int day;
                if (int.TryParse(DateNumberList[0], out year) &&
                    int.TryParse(DateNumberList[1], out month) &&
                    int.TryParse(DateNumberList[2], out day)) {
                    list.Add(new LocAndValue<DateTime>() {
                        Loc = sentence.PositionId,
                        Type = "日期",
                        Value = DateUtility.GetWorkDay(year, month, day)
                    });
                }
            }
        }
    }
    return list;
}
/// <summary>
/// Private-placement training: for the publish target, the increase number and the
/// increase money, counts the table titles heading the columns that contain the value,
/// and writes the ten most frequent titles of each to the training log.
/// </summary>
/// <param name="TraningCnt">Maximum number of announcements (distinct ids) to train on.</param>
public static void Training(int TraningCnt = int.MaxValue) {
    var TargetTool = new TableAnlayzeTool();
    var IncreaseNumberTool = new TableAnlayzeTool();
    IncreaseNumberTool.Transform = NumberUtility.NormalizerStockNumber;
    var IncreaseMoneyTool = new TableAnlayzeTool();
    IncreaseMoneyTool.Transform = MoneyUtility.Format;

    var PreviewId = String.Empty;
    var PreviewRoot = new HTMLEngine.MyRootHtmlNode();
    int Cnt = 0;
    // Use the platform separator instead of a hard-coded backslash.
    string sep = System.IO.Path.DirectorySeparatorChar.ToString();
    foreach (var increase in TraningDataset.IncreaseStockList) {
        if (!PreviewId.Equals(increase.Id)) {
            // Stop before opening a document beyond the limit. The former check ran
            // after parsing the N-th document and left it unused, so only N-1
            // documents were actually trained on.
            if (Cnt == TraningCnt) {
                break;
            }
            var htmlfile = Program.DocBase + sep + "FDDC_announcements_round1_train_20180518"
                           + sep + "定增" + sep + "html" + sep + increase.Id + ".html";
            PreviewRoot = new HTMLEngine().Anlayze(htmlfile, "");
            PreviewId = increase.Id;
            Cnt++;
        }
        // Consecutive records sharing an id reuse the already parsed document.
        TargetTool.PutTitleTrainingItem(PreviewRoot, increase.PublishTarget);
        IncreaseNumberTool.PutTitleTrainingItem(PreviewRoot, increase.IncreaseNumber);
        IncreaseMoneyTool.PutTitleTrainingItem(PreviewRoot, increase.IncreaseMoney);
    }

    var rank = Utility.FindTop(10, TargetTool.TrainingTitleResult);
    Program.Training.WriteLine("增发对象");
    foreach (var rec in rank) {
        Program.Training.WriteLine(rec.ToString());
    }

    rank = Utility.FindTop(10, IncreaseNumberTool.TrainingTitleResult);
    Program.Training.WriteLine("增发数量");
    foreach (var rec in rank) {
        Program.Training.WriteLine(rec.ToString());
    }

    rank = Utility.FindTop(10, IncreaseMoneyTool.TrainingTitleResult);
    Program.Training.WriteLine("增发金额");
    foreach (var rec in rank) {
        Program.Training.WriteLine(rec.ToString());
    }
}
/// <summary>
/// Runs <c>GetCompany</c> over every sentence of the document, accumulating into one list.
/// </summary>
/// <param name="root">Parsed HTML document.</param>
/// <returns>The list that was handed to <c>GetCompany</c> for every sentence.</returns>
public static List<struCompanyName> GetCompanyNameByCutWordFromHTML(HTMLEngine.MyRootHtmlNode root) {
    var companies = new List<struCompanyName>();
    foreach (var block in root.Children) {
        foreach (var line in block.Children) {
            GetCompany(companies, line.Content, line.PositionId);
        }
    }
    return companies;
}
/// <summary>
/// Extracts candidates that are followed by "公司董事会" and returns the first candidate
/// found after the <c>Reverse()</c> call, or an empty string when there is none.
/// </summary>
/// <param name="root">Parsed HTML document.</param>
/// <returns>The chosen candidate exactly as extracted.</returns>
public static string GetCompanyFullName(HTMLEngine.MyRootHtmlNode root) {
    var Extractor = new EntityProperty();
    Extractor.TrailingWordList = new string[] { "公司董事会" };
    Extractor.Extract(root);
    Extractor.CandidateWord.Reverse();

    var fullName = "";
    foreach (var candidate in Extractor.CandidateWord) {
        // NOTE(review): the log line appends "公司" but the returned value does not —
        // confirm which form callers expect.
        Program.Logger.WriteLine("全称:[" + candidate + "公司]");
        fullName = candidate;
        break;
    }
    return fullName;
}
/// <summary>
/// Locates money amounts in the sentences of the document.
/// </summary>
/// <param name="root">Parsed HTML document.</param>
/// <returns>The result list.</returns>
// NOTE(review): the SeekMoney result is never added to the list, so the returned list
// is always empty here — confirm whether this is an unfinished stub.
public static List<LocAndValue> LocateMoney(HTMLEngine.MyRootHtmlNode root) {
    var located = new List<LocAndValue>();
    foreach (var block in root.Children) {
        foreach (var line in block.Children) {
            var normalized = Utility.ConvertUpperDateToLittle(line.Content).Replace(" ", "");
            var Money = Utility.SeekMoney(normalized);
        }
    }
    return located;
}
/// <summary>
/// Returns the distinct <c>KeyWordMap</c> values whose key is found at least once in the document.
/// </summary>
/// <param name="root">Parsed HTML document.</param>
/// <returns>Mapped values in the order their keys first matched.</returns>
List<string> ExtractByKeyWordMap(HTMLEngine.MyRootHtmlNode root) {
    var matched = new List<string>();
    foreach (var entry in KeyWordMap) {
        var hits = ExtractPropertyByHTML.FindWordCnt(entry.Key, root).Count;
        // Keep a value only on its first hit so the result has no duplicates.
        if (hits > 0 && !matched.Contains(entry.Value)) {
            matched.Add(entry.Value);
        }
    }
    return matched;
}
/// <summary>
/// Returns the distinct <c>KeyWordMap</c> values whose key is reported present in the
/// document by <c>ExtractPropertyByHTML.HasWord</c>.
/// </summary>
/// <param name="root">Parsed HTML document.</param>
/// <returns>Mapped values in the order their keys first matched.</returns>
List<string> ExtractByKeyWordMap(HTMLEngine.MyRootHtmlNode root) {
    var matched = new List<string>();
    foreach (var entry in KeyWordMap) {
        if (!ExtractPropertyByHTML.HasWord(entry.Key, root)) {
            continue;
        }
        // Keep a value only on its first hit so the result has no duplicates.
        if (matched.Contains(entry.Value)) {
            continue;
        }
        matched.Add(entry.Value);
    }
    return matched;
}
/// <summary>
/// Counts the table titles under which the evaluate method appears (restricted to rows
/// that also contain the target company) and writes the top ten titles and the top ten
/// condition titles to the training log.
/// </summary>
/// <param name="TraningCnt">Maximum number of announcements (distinct ids with an existing HTML file) to train on.</param>
public static void GetEvaluateMethodTitle(int TraningCnt = int.MaxValue) {
    var TargetTool = new TableAnlayzeTool();
    var PreviewId = String.Empty;
    var PreviewRoot = new HTMLEngine.MyRootHtmlNode();
    int Cnt = 0;
    foreach (var ReOrg in TraningDataset.ReorganizationList) {
        if (!PreviewId.Equals(ReOrg.Id)) {
            var htmlfile = Program.ReorganizationPath_TRAIN + Path.DirectorySeparatorChar + @"html" + Path.DirectorySeparatorChar + ReOrg.Id + ".html";
            if (!System.IO.File.Exists(htmlfile)) {
                continue;
            }
            // Stop before opening a document beyond the limit. The former check ran
            // after parsing the N-th document and left it unused, so only N-1
            // documents were actually trained on.
            if (Cnt == TraningCnt) {
                break;
            }
            PreviewRoot = new HTMLEngine().Anlayze(htmlfile, "");
            PreviewId = ReOrg.Id;
            Cnt++;
        }
        if (!String.IsNullOrEmpty(ReOrg.EvaluateMethod)) {
            TargetTool.PutTitleTrainingItemWithCodition(PreviewRoot, ReOrg.EvaluateMethod, ReOrg.TargetCompany);
        }
    }

    var rank = Utility.FindTop(10, TargetTool.TrainingTitleResult);
    Program.Training.WriteLine("评估方法");
    foreach (var rec in rank) {
        Program.Training.WriteLine(rec.ToString());
    }

    Program.Training.WriteLine("标的");
    rank = Utility.FindTop(10, TargetTool.TrainingTitleCondition);
    foreach (var rec in rank) {
        Program.Training.WriteLine(rec.ToString());
    }
    Program.Training.Flush();
}
/// <summary>
/// Finds the table titles under which the trade counterparties appear and writes the
/// top ten titles, followed by every entry of <c>WholeHeaderRow</c>, to the training log.
/// </summary>
/// <param name="TraningCnt">Maximum number of announcements (distinct ids with an existing HTML file) to train on.</param>
public static void GetTradeCompanyTitle(int TraningCnt = int.MaxValue) {
    var TargetTool = new TableAnlayzeTool();
    var PreviewId = String.Empty;
    var PreviewRoot = new HTMLEngine.MyRootHtmlNode();
    int Cnt = 0;
    // Platform separator, matching how GetEvaluateMethodTitle builds the same path.
    string sep = System.IO.Path.DirectorySeparatorChar.ToString();
    foreach (var ReOrg in TraningDataset.ReorganizationList) {
        if (!PreviewId.Equals(ReOrg.Id)) {
            var htmlfile = Program.ReorganizationPath_TRAIN + sep + "html" + sep + ReOrg.Id + ".html";
            if (!System.IO.File.Exists(htmlfile)) {
                continue;
            }
            // Stop before opening a document beyond the limit. The former check ran
            // after parsing the N-th document and left it unused, so only N-1
            // documents were actually trained on.
            if (Cnt == TraningCnt) {
                break;
            }
            PreviewRoot = new HTMLEngine().Anlayze(htmlfile, "");
            PreviewId = ReOrg.Id;
            Cnt++;
        }
        // A record without counterparties has nothing to look up (and Split would
        // fail on null).
        if (String.IsNullOrEmpty(ReOrg.TradeCompany)) {
            continue;
        }
        foreach (var item in ReOrg.TradeCompany.Split(Utility.SplitChar)) {
            TargetTool.PutTitleTrainingItem(PreviewRoot, item);
        }
    }

    var rank = Utility.FindTop(10, TargetTool.TrainingTitleResult);
    Program.Training.WriteLine("交易对象");
    foreach (var rec in rank) {
        Program.Training.WriteLine(rec.ToString());
    }
    foreach (var item in TargetTool.WholeHeaderRow) {
        Program.Training.WriteLine(item);
    }
    Program.Training.Flush();
}
/// <summary>
/// Locates share counts: comma-grouped numbers (pattern <c>\d+(,\d+)+</c>) searched with
/// "股" as the trailing word.
/// </summary>
/// <param name="root">Parsed HTML document.</param>
/// <returns>Everything <c>ExtractPropertyByHTML.RegularExFinder</c> reports, sentence by sentence.</returns>
public static List<LocAndValue<String>> LocateStockNumber(HTMLEngine.MyRootHtmlNode root) {
    var stockNumberFeature = new ExtractProperyBase.struRegularExpressFeature();
    stockNumberFeature.RegularExpress = @"\d+(,\d+)+";
    stockNumberFeature.TrailingWordList = new List<string> { "股" };

    var located = new List<LocAndValue<String>>();
    foreach (var block in root.Children) {
        foreach (var line in block.Children) {
            located.AddRange(
                ExtractPropertyByHTML.RegularExFinder(line.PositionId, line.Content, stockNumberFeature, "|"));
        }
    }
    return located;
}
/// <summary>
/// Fixed-collocation lookup of the company short name: takes the first candidate that
/// follows "股票简称" or "证券简称" (e.g. "股票简称:东方电气") and trims it at a list of markers.
/// </summary>
/// <param name="root">Parsed HTML document.</param>
/// <returns>The trimmed short name of the first candidate, or an empty string when there is none.</returns>
public static string GetCompanyShortName(HTMLEngine.MyRootHtmlNode root) {
    var Extractor = new EntityProperty();
    Extractor.LeadingWordList = new string[] { "股票简称", "证券简称" };
    Extractor.Extract(root);

    // Applied in this order; each marker cuts the name only when
    // Utility.GetStringBefore returns something other than "".
    string[] cutMarkers = { "、", ")", "公告", "股票", "证券", " " };

    foreach (var item in Extractor.CandidateWord) {
        var ShortName = item.Replace(":", "").Replace(":", "").Trim();
        foreach (var marker in cutMarkers) {
            // Evaluate the helper once per marker instead of twice.
            var head = Utility.GetStringBefore(ShortName, marker);
            if (head != "") {
                ShortName = head;
            }
        }
        FDDC.Program.Logger.WriteLine("简称:[" + ShortName + "]");
        // Only the first candidate is used.
        return ShortName;
    }
    return "";
}
/// <summary>
/// Locates every occurrence of each custom word in the sentences of the document,
/// after removing spaces from the sentence text.
/// </summary>
/// <param name="root">Parsed HTML document.</param>
/// <param name="CustomerWord">Words to search for; null or empty entries are skipped.</param>
/// <param name="description">Description stored on every located entry.</param>
/// <returns>One entry per non-overlapping occurrence, with the sentence position and start index.</returns>
/// <exception cref="System.ArgumentException">A single word produced more than 5000 hits in one sentence.</exception>
public static List<LocAndValue<String>> LocateCustomerWord(HTMLEngine.MyRootHtmlNode root, List<String> CustomerWord, string description = "字符") {
    var list = new List<LocAndValue<String>>();
    foreach (var paragrah in root.Children) {
        foreach (var sentence in paragrah.Children) {
            var OrgString = sentence.Content.Replace(" ", "");
            foreach (var word in CustomerWord) {
                if (String.IsNullOrEmpty(word)) {
                    continue;
                }
                int Count = 0;
                // Ordinal search, evaluated once per hit: the returned index and
                // word.Length are then both plain char offsets, which a
                // culture-sensitive search does not guarantee.
                int matchIdx = OrgString.IndexOf(word, 0, StringComparison.Ordinal);
                while (matchIdx != -1) {
                    list.Add(new LocAndValue<String>() {
                        Loc = sentence.PositionId,
                        Description = description,
                        Value = word,
                        StartIdx = matchIdx
                    });
                    Count++;
                    if (Count > 5000) {
                        // Endless-loop protection kept from the original.
                        Console.WriteLine("OrgString:" + OrgString);
                        Console.WriteLine("word:[" + word + "]");
                        throw new System.ArgumentException();
                    }
                    matchIdx = OrgString.IndexOf(word, matchIdx + word.Length, StringComparison.Ordinal);
                }
            }
        }
    }
    return list;
}
/// <summary>
/// Locates dates: records, per sentence, the non-empty result of <c>RegularTool.GetDate</c>.
/// </summary>
/// <param name="root">Parsed HTML document.</param>
/// <returns>At most one entry per sentence, holding the sentence position and the date text.</returns>
public static List<LocAndValue> LocateDate(HTMLEngine.MyRootHtmlNode root) {
    var list = new List<LocAndValue>();
    foreach (var paragrah in root.Children) {
        foreach (var sentence in paragrah.Children) {
            var OrgString = Utility.ConvertUpperDateToLittle(sentence.Content).Replace(" ", "");
            // Evaluate the extraction once and reuse it for both the check and the value.
            var dateText = RegularTool.GetDate(OrgString);
            if (!String.IsNullOrEmpty(dateText)) {
                list.Add(new LocAndValue() {
                    Loc = sentence.PositionId,
                    Value = dateText
                });
            }
        }
    }
    return list;
}
/// <summary>
/// Segments every paragraph (after <c>NormalizeKey</c>) with the keyword registered as a
/// word, and logs the segments surrounding each occurrence of the keyword.
/// </summary>
/// <param name="root">Parsed HTML document whose paragraphs are scanned.</param>
/// <param name="KeyWord">Segment to look for; compared with <c>Equals</c> against every segment.</param>
public static void AnlayzeEntitySurroundWords(HTMLEngine.MyRootHtmlNode root, string KeyWord) {
    // Number of segments reported on each side of the keyword.
    const int ContextSize = 5;

    Program.Training.WriteLine("关键字:[" + KeyWord + "]");
    JiebaSegmenter segmenter = new JiebaSegmenter();
    segmenter.AddWord(KeyWord);
    foreach (var paragrah in root.Children) {
        // Cut() is called with its default mode (the original comment calls it the precise mode).
        var segments = segmenter.Cut(paragrah.FullText.NormalizeKey()).ToList();

        // Look for the keyword among the segments.
        for (int i = 0; i < segments.Count; i++) {
            if (!segments[i].Equals(KeyWord)) {
                continue;
            }

            var startInx = Math.Max(0, i - ContextSize);
            // Exclusive upper bound. The former bound (i + 5) reported only four
            // trailing segments although five are reported on the leading side.
            var endInx = Math.Min(i + 1 + ContextSize, segments.Count);

            for (int s = startInx; s < i; s++) {
                Program.Training.WriteLine("前导关键字:[" + segments[s] + "]");
                if (segments[s] == ":") {
                    // Everything in the leading window before the colon, joined together.
                    var leading = string.Concat(segments.Skip(startInx).Take(s - startInx));
                    Console.WriteLine("冒号前导词:" + leading);
                }
            }
            Program.Training.WriteLine("关键字:[" + KeyWord + "]");
            for (int s = i + 1; s < endInx; s++) {
                Program.Training.WriteLine("后续关键字:[" + segments[s] + "]");
            }
        }
    }
}
/// <summary>
/// Counts, in <c>TrainingValueResult</c>, the cell values found under the given titles.
/// </summary>
/// <param name="root">Parsed HTML document whose tables are scanned.</param>
/// <param name="TitleKeyWord">Titles (row 1, spaces removed) whose column values are counted.</param>
public void PutValueTrainingItem(HTMLEngine.MyRootHtmlNode root, List<string> TitleKeyWord) {
    // Same guard as PutTitleTrainingItemWithCodition: no table list, nothing to do.
    if (root.TableList == null) {
        return;
    }
    foreach (var Table in root.TableList) {
        var t = new HTMLTable(Table.Value);
        // Start from the second row; the first row supplies the titles.
        // NOTE(review): both loops stop before RowCount / ColumnCount — confirm these
        // are exclusive bounds for the indexing CellValue uses.
        for (int RowNo = 2; RowNo < t.RowCount; RowNo++) {
            for (int ColNo = 1; ColNo < t.ColumnCount; ColNo++) {
                var title = t.CellValue(1, ColNo).Replace(" ", "");
                if (String.IsNullOrEmpty(title)) {
                    continue;
                }
                var value = t.CellValue(RowNo, ColNo).NormalizeTextResult();
                if (string.IsNullOrEmpty(value)) {
                    continue;
                }
                // Each matching entry of TitleKeyWord counts the value once.
                foreach (var key in TitleKeyWord) {
                    if (!title.Equals(key)) {
                        continue;
                    }
                    if (TrainingValueResult.ContainsKey(value)) {
                        TrainingValueResult[value]++;
                    } else {
                        TrainingValueResult.Add(value, 1);
                    }
                }
            }
        }
    }
}
/// <summary>
/// Lexical analysis for project names: for every word "标段", "工程" or "项目", walks
/// backwards to the nearest word flagged "ns" and joins the words from there up to the
/// keyword.
/// </summary>
/// <param name="root">Parsed HTML document.</param>
/// <returns>One joined name per keyword that has a preceding "ns" word in the same sentence.</returns>
public static List<String> GetProjectName(HTMLEngine.MyRootHtmlNode root) {
    string[] projectKeywords = { "标段", "工程", "项目" };
    var tagger = new PosSegmenter();
    var found = new List<String>();
    foreach (var block in root.Children) {
        foreach (var line in block.Children) {
            var words = tagger.Cut(line.Content).ToList();
            for (int end = 0; end < words.Count; end++) {
                if (Array.IndexOf(projectKeywords, words[end].Word) < 0) {
                    continue;
                }
                // Search backwards for a place-name word ("ns") to start the name at.
                for (int start = end; start >= 0; start--) {
                    if (words[start].Flag != "ns") {
                        continue;
                    }
                    var span = words.Skip(start).Take(end - start + 1).Select(w => w.Word);
                    found.Add(string.Concat(span));
                    // The nearest place name wins; stop looking further back.
                    break;
                }
            }
        }
    }
    return found;
}
/// <summary>
/// Locates date ranges. A range string yields 6 numbers (both dates complete), 5 numbers
/// (end date reuses the start year) or 4 numbers (end date reuses the start year and month);
/// any other count is ignored.
/// </summary>
/// <param name="root">Parsed HTML document.</param>
/// <returns>One entry per accepted range string. A side whose numbers do not parse as integers
/// is left at <c>new DateTime()</c>.</returns>
public static List<LocAndValue<(DateTime StartDate, DateTime EndDate)>> LocateDateRange(HTMLEngine.MyRootHtmlNode root) {
    // Turns three numeric strings into a work day; stays at new DateTime() when any part
    // fails to parse.
    DateTime ToWorkDay(String y, String m, String d) {
        int year;
        int month;
        int day;
        if (int.TryParse(y, out year) && int.TryParse(m, out month) && int.TryParse(d, out day)) {
            return DateUtility.GetWorkDay(year, month, day);
        }
        return new DateTime();
    }

    var ranges = new List<LocAndValue<(DateTime StartDate, DateTime EndDate)>>();
    foreach (var block in root.Children) {
        foreach (var line in block.Children) {
            var text = DateUtility.ConvertUpperToLower(line.Content).Replace(" ", String.Empty);
            foreach (var strDate in DateUtility.GetRangeDate(text)) {
                var parts = RegularTool.GetNumberList(strDate);

                // Pick the end-date components according to how many numbers were found.
                String endYear;
                String endMonth;
                String endDay;
                switch (parts.Count) {
                    case 6:
                        endYear = parts[3];
                        endMonth = parts[4];
                        endDay = parts[5];
                        break;
                    case 5:
                        endYear = parts[0];
                        endMonth = parts[3];
                        endDay = parts[4];
                        break;
                    case 4:
                        endYear = parts[0];
                        endMonth = parts[1];
                        endDay = parts[3];
                        break;
                    default:
                        continue;
                }

                DateTime startDate = ToWorkDay(parts[0], parts[1], parts[2]);
                DateTime endDate = ToWorkDay(endYear, endMonth, endDay);
                ranges.Add(new LocAndValue<(DateTime StartDate, DateTime EndDate)>() {
                    Loc = line.PositionId,
                    Type = "日期范围",
                    Value = (startDate, endDate)
                });
            }
        }
    }
    return ranges;
}
/// <summary>
/// Finds company names by part-of-speech segmentation: a name runs from the nearest
/// preceding word flagged "ns" up to "有限公司" (or "有限" followed by "合伙"); the short
/// name is the text between “ and ” when the word "简称" follows.
/// </summary>
/// <param name="root">Parsed HTML document.</param>
/// <returns>One entry per anchor word for which a full name could be built.</returns>
public static List<struCompanyName> GetCompanyNameByCutWord(HTMLEngine.MyRootHtmlNode root) {
    var posSeg = new PosSegmenter();
    var namelist = new List<struCompanyName>();
    foreach (var paragrah in root.Children) {
        foreach (var sentence in paragrah.Children) {
            var words = posSeg.Cut(sentence.Content).ToList();
            for (int baseInd = 0; baseInd < words.Count; baseInd++) {
                var FullName = "";
                var ShortName = "";
                bool isAnchor =
                    words[baseInd].Word == "有限公司" ||
                    (words[baseInd].Word == "有限" && baseInd != words.Count - 1 && words[baseInd + 1].Word == "合伙");
                if (!isAnchor) {
                    continue;
                }

                // Look backwards for a place name that starts the company name.
                for (int NRIdx = baseInd; NRIdx > -1; NRIdx--) {
                    if (words[NRIdx].Flag == "ns") {
                        FullName = "";
                        for (int companyFullNameInd = NRIdx; companyFullNameInd <= baseInd; companyFullNameInd++) {
                            FullName += words[companyFullNameInd].Word;
                        }
                        if (words[baseInd].Word == "有限") {
                            // "合伙" is guaranteed by the anchor test; the word after it is
                            // appended only when it exists (the unguarded access could run
                            // past the end of the sentence).
                            FullName += words[baseInd + 1].Word;
                            if (baseInd + 2 < words.Count) {
                                FullName += words[baseInd + 2].Word;
                            }
                        }
                        break; // nearest place name wins
                    }
                }

                // Look forwards for a short name introduced by "简称".
                for (int JCIdx = baseInd; JCIdx < words.Count; JCIdx++) {
                    if (words[JCIdx].Word.Equals("简称")) {
                        var ShortNameStart = -1;
                        var ShortNameEnd = -1;
                        for (int ShortNameIdx = baseInd; ShortNameIdx < words.Count; ShortNameIdx++) {
                            if (words[ShortNameIdx].Word.Equals("“")) {
                                ShortNameStart = ShortNameIdx + 1;
                            }
                            if (words[ShortNameIdx].Word.Equals("”")) {
                                ShortNameEnd = ShortNameIdx - 1;
                                break;
                            }
                        }
                        if (ShortNameStart != -1 && ShortNameEnd != -1) {
                            ShortName = "";
                            for (int i = ShortNameStart; i <= ShortNameEnd; i++) {
                                ShortName += words[i].Word;
                            }
                        }
                    }
                }

                if (FullName != "") {
                    namelist.Add(new struCompanyName() {
                        secFullName = FullName,
                        secShortName = ShortName
                    });
                }
            }
        }
    }
    return namelist;
}
/// <summary>
/// Locates money amounts: records every item <c>MoneyUtility.SeekMoney</c> returns for each
/// sentence, after <c>ConvertUpperToLower</c> and space removal.
/// </summary>
/// <param name="root">Parsed HTML document.</param>
/// <returns>One entry per amount, tagged with the sentence position.</returns>
public static List<LocAndValue<(String MoneyAmount, String MoneyCurrency)>> LocateMoney(HTMLEngine.MyRootHtmlNode root) {
    var located = new List<LocAndValue<(String MoneyAmount, String MoneyCurrency)>>();
    foreach (var block in root.Children) {
        foreach (var line in block.Children) {
            var normalized = MoneyUtility.ConvertUpperToLower(line.Content).Replace(" ", String.Empty);
            foreach (var amount in MoneyUtility.SeekMoney(normalized)) {
                var entry = new LocAndValue<(String MoneyAmount, String MoneyCurrency)>();
                entry.Loc = line.PositionId;
                entry.Type = "金额";
                entry.Value = amount;
                located.Add(entry);
            }
        }
    }
    return located;
}
/// <summary>
/// Finds company names by part-of-speech segmentation. Anchors are "有限公司", "公司"
/// preceded by "承包", and "有限" followed by "合伙"; the name starts at the nearest preceding
/// place-name word that lies after the previously accepted anchor.
/// </summary>
/// <param name="root">Parsed HTML document.</param>
/// <returns>One entry per anchor for which a full name could be built, with short name,
/// subsidiary flag, sentence position and start word index.</returns>
public static List<struCompanyName> GetCompanyNameByCutWord(HTMLEngine.MyRootHtmlNode root) {
    var posSeg = new PosSegmenter();
    var namelist = new List<struCompanyName>();
    foreach (var paragrah in root.Children) {
        foreach (var sentence in paragrah.Children) {
            if (string.IsNullOrEmpty(sentence.Content)) {
                continue;
            }
            var words = posSeg.Cut(sentence.Content).ToList();
            // Index of the last anchor that produced a name; the backward search never
            // reaches past it.
            var PreviewEndIdx = -1;
            for (int baseInd = 0; baseInd < words.Count; baseInd++) {
                var FullName = "";
                var ShortName = "";
                var IsSubCompany = false;
                var StartIdx = -1;
                bool isAnchor =
                    words[baseInd].Word == "有限公司" ||
                    (words[baseInd].Word == "公司" && baseInd != 0 && words[baseInd - 1].Word == "承包") ||
                    (words[baseInd].Word == "有限" && baseInd != words.Count - 1 && words[baseInd + 1].Word == "合伙");
                if (!isAnchor) {
                    continue;
                }

                // Look backwards for a place name that starts the company name.
                for (int NRIdx = baseInd; NRIdx > PreviewEndIdx; NRIdx--) {
                    if (words[NRIdx].Flag == EntityWordAnlayzeTool.地名) {
                        FullName = "";
                        for (int companyFullNameInd = NRIdx; companyFullNameInd <= baseInd; companyFullNameInd++) {
                            FullName += words[companyFullNameInd].Word;
                        }
                        // A "承包" + "公司" anchor needs nothing appended.
                        if (words[baseInd].Word == "有限") {
                            // "合伙" is guaranteed by the anchor test; the word after it is
                            // appended only when it exists (the unguarded access could run
                            // past the end of the sentence).
                            FullName += words[baseInd + 1].Word;
                            if (baseInd + 2 < words.Count) {
                                FullName += words[baseInd + 2].Word;
                            }
                        }
                        // Subsidiary check: the word right before the name is "子公司".
                        if (NRIdx != 0 && words[NRIdx - 1].Word == "子公司") {
                            IsSubCompany = true;
                        }
                        StartIdx = NRIdx;
                        PreviewEndIdx = baseInd;
                        break; // nearest place name wins
                    }
                }

                // Look forwards for a short name introduced by "简称".
                for (int JCIdx = baseInd; JCIdx < words.Count; JCIdx++) {
                    if (words[JCIdx].Word.Equals("简称")) {
                        var ShortNameStart = -1;
                        var ShortNameEnd = -1;
                        for (int ShortNameIdx = baseInd; ShortNameIdx < words.Count; ShortNameIdx++) {
                            if (words[ShortNameIdx].Word.Equals("“")) {
                                ShortNameStart = ShortNameIdx + 1;
                            }
                            if (words[ShortNameIdx].Word.Equals("”")) {
                                ShortNameEnd = ShortNameIdx - 1;
                                break;
                            }
                        }
                        if (ShortNameStart != -1 && ShortNameEnd != -1) {
                            ShortName = "";
                            for (int i = ShortNameStart; i <= ShortNameEnd; i++) {
                                ShortName += words[i].Word;
                            }
                        }
                    }
                }

                if (FullName != "") {
                    namelist.Add(new struCompanyName() {
                        secFullName = FullName,
                        secShortName = ShortName,
                        isSubCompany = IsSubCompany,
                        positionId = sentence.PositionId,
                        WordIdx = StartIdx
                    });
                }
            }
        }
    }
    return namelist;
}
/// <summary>
/// Conditional title lookup: a row is considered only when one of its cells contains the
/// condition key; within such rows, the titles of the cells equal to the keyword are counted
/// in <c>TrainingTitleResult</c> and the title of the condition cell in
/// <c>TrainingTitleCondition</c>.
/// </summary>
/// <param name="root">Parsed HTML document whose tables are scanned.</param>
/// <param name="KeyWord">Value to look for; compared after <c>NormalizeTextResult</c> on both sides.</param>
/// <param name="ConditionKey">Text that must be contained in some cell of the same row.</param>
public void PutTitleTrainingItemWithCodition(HTMLEngine.MyRootHtmlNode root, string KeyWord, string ConditionKey) {
    if (root.TableList == null) {
        return;
    }
    foreach (var Table in root.TableList) {
        var table = new HTMLTable(Table.Value);
        // Rows start at 2; row 1 supplies the titles.
        // NOTE(review): the loops stop before RowCount / ColumnCount — confirm these are
        // exclusive bounds for the indexing CellValue uses.
        for (int row = 2; row < table.RowCount; row++) {
            // First pass: find the first titled column whose cell contains the condition key.
            string conditionTitle = null;
            for (int col = 1; col < table.ColumnCount; col++) {
                var header = table.CellValue(1, col).Replace(" ", "");
                if (String.IsNullOrEmpty(header)) {
                    continue;
                }
                var cell = table.CellValue(row, col);
                if (cell.NormalizeTextResult().Contains(ConditionKey.NormalizeTextResult())) {
                    conditionTitle = header;
                    break;
                }
            }
            if (conditionTitle == null) {
                continue;
            }

            // Second pass: count the titles of the cells that equal the keyword.
            for (int col = 1; col < table.ColumnCount; col++) {
                var header = table.CellValue(1, col).Replace(" ", "");
                if (String.IsNullOrEmpty(header)) {
                    continue;
                }
                var cell = table.CellValue(row, col);
                if (Transform != null) {
                    cell = Transform(cell, header);
                }
                if (!cell.NormalizeTextResult().Equals(KeyWord.NormalizeTextResult())) {
                    continue;
                }

                if (TrainingTitleResult.ContainsKey(header)) {
                    TrainingTitleResult[header]++;
                } else {
                    TrainingTitleResult.Add(header, 1);
                }

                if (TrainingTitleCondition.ContainsKey(conditionTitle)) {
                    TrainingTitleCondition[conditionTitle]++;
                } else {
                    TrainingTitleCondition.Add(conditionTitle, 1);
                }
            }
        }
    }
}
/// <summary>
/// Finds company names by part-of-speech segmentation. "国家电网" + "公司" is recorded
/// directly. Otherwise the anchors are "有限公司", "公司" preceded by "有限责任" or "承包", and
/// "有限" followed by "合伙"; the name starts at a preceding place-name word, or, failing
/// that, at the word equal to the first two characters of a four-character short name.
/// </summary>
/// <param name="root">Parsed HTML document.</param>
/// <returns>One entry per recognised company, with short name, subsidiary flag, sentence
/// position, start word index and a score of 100.</returns>
public static List<struCompanyName> GetCompanyNameByCutWord(HTMLEngine.MyRootHtmlNode root) {
    var posSeg = new PosSegmenter();
    var namelist = new List<struCompanyName>();
    foreach (var paragrah in root.Children) {
        foreach (var sentence in paragrah.Children) {
            if (string.IsNullOrEmpty(sentence.Content)) {
                continue;
            }
            var words = posSeg.Cut(sentence.Content).ToList();
            // Index of the last anchor that produced a name; the backward search never
            // reaches past it.
            var PreviewEndIdx = -1;
            for (int baseInd = 0; baseInd < words.Count; baseInd++) {
                var FullName = String.Empty;
                var ShortName = String.Empty;
                var IsSubCompany = false;

                // Special case handled without any place-name search.
                if (words[baseInd].Word == "国家电网" && (baseInd + 1) < words.Count && words[baseInd + 1].Word == "公司") {
                    namelist.Add(new struCompanyName() {
                        secFullName = "国家电网公司",
                        positionId = sentence.PositionId,
                        WordIdx = baseInd,
                        Score = 100
                    });
                    continue;
                }

                if (words[baseInd].Word == "有限公司" ||
                    (words[baseInd].Word == "公司" && baseInd != 0 && words[baseInd - 1].Word == "有限责任") ||
                    (words[baseInd].Word == "公司" && baseInd != 0 && words[baseInd - 1].Word == "承包") ||
                    (words[baseInd].Word == "有限" && baseInd != words.Count - 1 && words[baseInd + 1].Word == "合伙")) {

                    // Look forwards for a short name; only the first "简称"/"称" is used.
                    for (int JCIdx = baseInd; JCIdx < words.Count; JCIdx++) {
                        if (words[JCIdx].Word.Equals("简称") || words[JCIdx].Word.Equals("称")) {
                            var ShortNameStart = -1;
                            var ShortNameEnd = -1;
                            for (int ShortNameIdx = JCIdx; ShortNameIdx < words.Count; ShortNameIdx++) {
                                if (words[ShortNameIdx].Word.Equals("“")) {
                                    ShortNameStart = ShortNameIdx + 1;
                                }
                                if (words[ShortNameIdx].Word.Equals("”")) {
                                    ShortNameEnd = ShortNameIdx - 1;
                                    break;
                                }
                            }
                            if (ShortNameStart != -1 && ShortNameEnd != -1) {
                                ShortName = String.Empty;
                                for (int i = ShortNameStart; i <= ShortNameEnd; i++) {
                                    ShortName += words[i].Word;
                                }
                            }
                            break;
                        }
                    }

                    // For a four-character short name, its first two characters serve as a
                    // fallback start marker for the full name.
                    var FirstShortNameWord = String.Empty;
                    if (ShortName.Length == 4) {
                        FirstShortNameWord = ShortName.Substring(0, 2);
                    }

                    var IsMarkClosed = true;
                    var CompanyStartIdx = -1;
                    var FirstShortNameIdx = -1; // position of the word matching the short-name prefix

                    // Look backwards for a place name that starts the company name.
                    for (int NRIdx = baseInd; NRIdx > PreviewEndIdx; NRIdx--) {
                        if (words[NRIdx].Word == FirstShortNameWord) {
                            FirstShortNameIdx = NRIdx; // kept as a fallback
                        }
                        if (words[NRIdx].Flag == LTPTrainingNER.地名 || PosNS.NsDict.Contains(words[NRIdx].Word)) {
                            // Place names may be adjacent (e.g. 上海市嘉定): keep walking back
                            // to the first one of the run.
                            if (NRIdx != 0 && (words[NRIdx - 1].Flag == LTPTrainingNER.地名 || PosNS.NsDict.Contains(words[NRIdx - 1].Word))) {
                                continue;
                            }
                            FullName = String.Empty;
                            for (int companyFullNameInd = NRIdx; companyFullNameInd <= baseInd; companyFullNameInd++) {
                                FullName += words[companyFullNameInd].Word;
                            }
                            // "有限" + "合伙" anchor: append "合伙" and, when present, the next word.
                            if (words[baseInd].Word == "有限") {
                                FullName += words[baseInd + 1].Word;
                                if ((baseInd + 2) < words.Count) {
                                    FullName += words[baseInd + 2].Word;
                                }
                            }
                            // Subsidiary check: the word right before the name is "子公司".
                            if (NRIdx != 0 && words[NRIdx - 1].Word == "子公司") {
                                IsSubCompany = true;
                            }
                            if (IsMarkClosed) {
                                // Not inside an unclosed parenthesis: accept this start.
                                CompanyStartIdx = NRIdx;
                                PreviewEndIdx = baseInd;
                                break;
                            }
                        }
                        if (words[NRIdx].Flag == LTPTrainingNER.词性标点) {
                            // Any punctuation other than parentheses ends the backward search.
                            if (words[NRIdx].Word != "(" && words[NRIdx].Word != ")") {
                                break;
                            }
                            if (words[NRIdx].Word == ")") {
                                IsMarkClosed = false; // walking backwards: now inside a parenthesis
                            }
                            if (words[NRIdx].Word == "(") {
                                IsMarkClosed = true; // left the parenthesis
                            }
                        }
                    }

                    if (CompanyStartIdx == -1) {
                        // No usable place name: fall back to the short-name prefix position.
                        if (FirstShortNameIdx == -1) {
                            continue;
                        }
                        if (posSeg.Cut(ShortName).First().Flag == LTPTrainingNER.地名) {
                            continue;
                        }
                        FullName = String.Empty;
                        for (int NRIdx = FirstShortNameIdx; NRIdx <= baseInd; NRIdx++) {
                            FullName += words[NRIdx].Word;
                        }
                        // "有限" + "合伙" anchor: same bounds check as in the place-name branch
                        // above (this access was previously unguarded and could run past the
                        // end of the sentence).
                        if (words[baseInd].Word == "有限") {
                            FullName += words[baseInd + 1].Word;
                            if ((baseInd + 2) < words.Count) {
                                FullName += words[baseInd + 2].Word;
                            }
                        }
                        // Subsidiary check: the word right before the name is "子公司".
                        if (FirstShortNameIdx != 0 && words[FirstShortNameIdx - 1].Word == "子公司") {
                            IsSubCompany = true;
                        }
                    }

                    if (FullName != String.Empty) {
                        FullName = FullName.Replace(" ", String.Empty).Trim();
                        ShortName = ShortName.Replace(" ", String.Empty).Trim();
                        // "公司" / "本公司" are treated as no short name.
                        if (ShortName == "公司" || ShortName == "本公司") {
                            ShortName = String.Empty;
                        }
                        if (ShortName == String.Empty) {
                            var json = GetCompanyNameByFullName(FullName);
                            ShortName = json.secShortName;
                        }
                        namelist.Add(new struCompanyName() {
                            secFullName = FullName,
                            secShortName = ShortName,
                            isSubCompany = IsSubCompany,
                            positionId = sentence.PositionId,
                            WordIdx = CompanyStartIdx,
                            Score = 100
                        });
                    }
                }
            }
        }
    }
    return namelist;
}