/// <summary>
/// Extracts the text enclosed in book-title marks (《…》) and in Chinese double
/// quotation marks (“…”) from every sentence found under <paramref name="root"/>.
/// </summary>
/// <param name="root">Parsed HTML root; its children are walked as paragraphs, and each paragraph's children as sentences.</param>
/// <param name="IsSkipBracket">
/// When true, a match is dropped if its full text (marks included) occurs inside any
/// item returned by <c>RegularTool.GetChineseBrackets</c> for the same sentence.
/// </param>
/// <returns>
/// One entry per kept match, with the two enclosing marks stripped. Within a sentence,
/// all 《…》 matches are emitted before all “…” matches.
/// </returns>
public static List<LocAndValue<String>> LocateQuotation(HTMLEngine.MyRootHtmlNode root, bool IsSkipBracket = true)
{
    // Built once rather than once per sentence. The array order fixes the
    // per-sentence output order (book titles first, then quotations).
    var quotePatterns = new[]
    {
        new Regex(@"\《.*?\》"),
        new Regex(@"\“.*?\”")
    };

    var list = new List<LocAndValue<String>>();
    foreach (var paragrah in root.Children)
    {
        foreach (var sentence in paragrah.Children)
        {
            var OrgString = sentence.Content;
            var BracketList = RegularTool.GetChineseBrackets(OrgString);

            foreach (var pattern in quotePatterns)
            {
                foreach (Match item in pattern.Matches(OrgString))
                {
                    if (IsSkipBracket)
                    {
                        // The containment test is textual, not positional: a match is
                        // treated as bracketed when any bracket item contains its text.
                        bool IsContentInBracket = false;
                        foreach (var bracketItem in BracketList)
                        {
                            if (bracketItem.Contains(item.Value))
                            {
                                IsContentInBracket = true;
                                break;
                            }
                        }
                        if (IsContentInBracket)
                        {
                            continue;
                        }
                    }

                    list.Add(new LocAndValue<String>()
                    {
                        Loc = sentence.PositionId,
                        Type = "字符",
                        // Drop the opening and closing mark.
                        Value = item.Value.Substring(1, item.Value.Length - 2)
                    });
                }
            }
        }
    }
    return list;
}
/// <summary>
/// Collects every match of the <c>RegularTool.PercentExpress</c> pattern in each
/// sentence found under <paramref name="root"/>.
/// </summary>
/// <param name="root">Root node; its children are walked as paragraphs, and each paragraph's children as sentences.</param>
/// <returns>
/// One entry per match, carrying the sentence's <c>PositionId</c>, the matched text,
/// and the match's start index within the sentence content.
/// </returns>
public static List<LocAndValue<String>> LocatePercent(HTMLEngine.MyRootHtmlNode root)
{
    // Built once instead of once per sentence.
    var percentRegex = new Regex(RegularTool.PercentExpress);

    var list = new List<LocAndValue<String>>();
    foreach (var paragrah in root.Children)
    {
        foreach (var sentence in paragrah.Children)
        {
            // The bracket list previously computed here was never read, so the
            // call has been removed.
            foreach (Match item in percentRegex.Matches(sentence.Content))
            {
                list.Add(new LocAndValue<String>()
                {
                    Loc = sentence.PositionId,
                    Description = "百分比",
                    Value = item.Value,
                    StartIdx = item.Index
                });
            }
        }
    }
    return list;
}
/// <summary>
/// For each item that <c>RegularTool.GetChineseBrackets</c> returns for
/// <paramref name="OrgString"/>, strips the item's first and last character and passes
/// the remainder, together with <paramref name="KeyWord"/>, to <c>Utility.GetStringAfter</c>.
/// </summary>
/// <param name="OrgString">Text to scan for bracketed items.</param>
/// <param name="KeyWord">Keyword forwarded to <c>Utility.GetStringAfter</c>.</param>
/// <returns>
/// The <c>GetStringAfter</c> results in bracket order, omitting those equal to
/// <see cref="String.Empty"/>.
/// </returns>
public static List<String> GetValueInChineseBracketsLeadingKeyWord(string OrgString, String KeyWord)
{
    return RegularTool.GetChineseBrackets(OrgString)
        // Substring(1, Length - 2) removes the opening and closing bracket character.
        .Select(bracketed => Utility.GetStringAfter(bracketed.Substring(1, bracketed.Length - 2), KeyWord))
        .Where(extracted => extracted != String.Empty)
        .ToList();
}
/// <summary>
/// Cleans up a raw company full name and, when the name carries a bracketed
/// "hereinafter referred to as" clause, extracts the quoted short name from it.
/// </summary>
/// <remarks>
/// Steps, in order:
/// 1. For each known short-name marker contained in the name, take the quoted text from
///    the bracket items of the name as the short name (the last bracket item that yields
///    a quotation wins) and cut the name before the marker.
/// 2. Cut the name before "及其", then keep only what follows "股东" and what follows
///    "一致行动人", each only when present.
/// 3. If <c>CompanyNameLogic.GetCompanyNameByShortName</c> yields a non-empty full name
///    for the result, use that instead.
/// 4. Pass the result through <c>EntityWordAnlayzeTool.TrimLeadingUL</c> and
///    <c>CutOtherLeadingWords</c>.
/// </remarks>
/// <param name="FullName">Raw company name text.</param>
/// <returns>
/// A <c>struCompanyName</c> with <c>Score = 80</c> and both names set when a short name
/// was found; otherwise <c>Score = 60</c> with only the full name set.
/// </returns>
public static struCompanyName AfterProcessFullName(string FullName)
{
    var ShortName = String.Empty;

    // Brackets are not normalised before this point, so every marker is listed with
    // both the full-width opening parenthesis (U+FF08, written as an escape so the two
    // variants cannot be confused) and the ASCII one. Previously the four ASCII
    // markers were simply listed twice.
    var CompanyNameTrailingwords = new string[]
    {
        "\uFF08以下简称", "\uFF08下称", "\uFF08以下称", "\uFF08简称",
        "(以下简称", "(下称", "(以下称", "(简称"
    };

    foreach (var trailing in CompanyNameTrailingwords)
    {
        if (!FullName.Contains(trailing))
        {
            continue;
        }

        // Extract the short name from the quoted text inside the bracket items.
        var BracketsList = RegularTool.GetChineseBrackets(FullName);
        foreach (var bracketItem in BracketsList)
        {
            var ShortNameList = RegularTool.GetChineseQuotation(bracketItem);
            if (ShortNameList.Count > 0)
            {
                ShortName = ShortNameList.First();
                if (!String.IsNullOrEmpty(ShortName))
                {
                    // Drop the opening and closing quotation mark.
                    ShortName = ShortName.Substring(1, ShortName.Length - 2);
                }
            }
        }

        FullName = Utility.GetStringBefore(FullName, trailing);
    }

    if (FullName.Contains("及其"))
    {
        FullName = Utility.GetStringBefore(FullName, "及其");
    }
    if (FullName.Contains("股东"))
    {
        FullName = Utility.GetStringAfter(FullName, "股东");
    }
    if (FullName.Contains("一致行动人"))
    {
        FullName = Utility.GetStringAfter(FullName, "一致行动人");
    }

    // Look the name up once; prefer the registered full name when there is one.
    var registeredFullName = CompanyNameLogic.GetCompanyNameByShortName(FullName).secFullName;
    if (!String.IsNullOrEmpty(registeredFullName))
    {
        FullName = registeredFullName;
    }

    // Remove leading parts.
    FullName = EntityWordAnlayzeTool.TrimLeadingUL(FullName);
    FullName = CutOtherLeadingWords(FullName);

    // Null is treated like "no short name", matching the IsNullOrEmpty check above.
    if (!String.IsNullOrEmpty(ShortName))
    {
        return new struCompanyName()
        {
            secFullName = FullName,
            secShortName = ShortName,
            Score = 80
        };
    }

    return new struCompanyName()
    {
        secFullName = FullName,
        Score = 60
    };
}