Ejemplo n.º 1
0
    // Patterns scanned by LocateQuotation, in result order: book-title marks 《…》
    // first, then curly double quotes “…”. Both are non-greedy so that several
    // quoted spans in one sentence are reported separately. Built once and reused
    // instead of being re-created for every sentence.
    private static readonly Regex[] QuotationPatterns =
    {
        new Regex(@"\《.*?\》"),
        new Regex(@"\“.*?\”")
    };

    /// <summary>
    /// Extracts the text enclosed in book-title marks (《…》) and in curly double
    /// quotation marks (“…”) from every sentence of the document.
    /// </summary>
    /// <param name="root">Root node of the parsed HTML; its children are treated as paragraphs and their children as sentences.</param>
    /// <param name="IsSkipBracket">
    /// When true, a match is dropped if its full text (delimiters included) occurs inside
    /// any item returned by <c>RegularTool.GetChineseBrackets</c> for the same sentence.
    /// </param>
    /// <returns>
    /// One entry per kept match, carrying the sentence's <c>PositionId</c>, the type "字符"
    /// and the matched text without its opening and closing delimiter. Within a sentence
    /// all 《…》 matches come before all “…” matches.
    /// </returns>
    public static List <LocAndValue <String> > LocateQuotation(HTMLEngine.MyRootHtmlNode root, bool IsSkipBracket = true)
    {
        var list = new List <LocAndValue <String> >();

        foreach (var paragraph in root.Children)
        {
            foreach (var sentence in paragraph.Children)
            {
                var OrgString = sentence.Content;
                if (String.IsNullOrEmpty(OrgString))
                {
                    // Nothing to match; also avoids handing null to Regex.Matches.
                    continue;
                }

                var BracketList = RegularTool.GetChineseBrackets(OrgString);
                foreach (var pattern in QuotationPatterns)
                {
                    foreach (Match item in pattern.Matches(OrgString))
                    {
                        if (IsSkipBracket)
                        {
                            // The test is textual, not positional: any bracket item that
                            // contains the whole match (delimiters included) suppresses it.
                            bool IsContentInBracket = false;
                            foreach (var bracketItem in BracketList)
                            {
                                if (bracketItem.Contains(item.Value))
                                {
                                    IsContentInBracket = true;
                                    break;
                                }
                            }
                            if (IsContentInBracket)
                            {
                                continue;
                            }
                        }

                        list.Add(new LocAndValue <String>()
                        {
                            Loc   = sentence.PositionId,
                            Type  = "字符",
                            // Strip the single-character opening and closing delimiter.
                            Value = item.Value.Substring(1, item.Value.Length - 2)
                        });
                    }
                }
            }
        }
        return(list);
    }
Ejemplo n.º 2
0
    /// <summary>
    /// Finds every match of <c>RegularTool.PercentExpress</c> in each sentence of the document.
    /// </summary>
    /// <param name="root">Root node of the parsed HTML; its children are treated as paragraphs and their children as sentences.</param>
    /// <returns>
    /// One entry per match, carrying the sentence's <c>PositionId</c>, the description "百分比",
    /// the matched text and the match's start index within the sentence content.
    /// </returns>
    public static List <LocAndValue <String> > LocatePercent(HTMLEngine.MyRootHtmlNode root)
    {
        var list = new List <LocAndValue <String> >();

        // The pattern does not depend on the sentence, so compile it once per call.
        var percentRegex = new Regex(RegularTool.PercentExpress);

        foreach (var paragraph in root.Children)
        {
            foreach (var sentence in paragraph.Children)
            {
                var OrgString = sentence.Content;
                if (OrgString == null)
                {
                    // Regex.Matches rejects null input; a sentence without content has no matches.
                    continue;
                }

                foreach (Match item in percentRegex.Matches(OrgString))
                {
                    list.Add(new LocAndValue <String>()
                    {
                        Loc         = sentence.PositionId,
                        Description = "百分比",
                        Value       = item.Value,
                        StartIdx    = item.Index
                    });
                }
            }
        }
        return(list);
    }
Ejemplo n.º 3
0
    /// <summary>
    /// For every item that <c>RegularTool.GetChineseBrackets</c> returns for
    /// <paramref name="OrgString"/>, removes the item's first and last character and passes
    /// the remainder to <c>Utility.GetStringAfter</c> together with <paramref name="KeyWord"/>.
    /// </summary>
    /// <param name="OrgString">Text to scan for bracketed segments.</param>
    /// <param name="KeyWord">
    /// Keyword handed to <c>Utility.GetStringAfter</c>; presumably the text following it is
    /// returned (confirm against that helper).
    /// </param>
    /// <returns>
    /// The helper results that differ from <c>String.Empty</c>, in the order the bracketed
    /// segments were returned.
    /// </returns>
    public static List <String> GetValueInChineseBracketsLeadingKeyWord(string OrgString, String KeyWord)
    {
        var results = new List <String>();

        foreach (var bracketed in RegularTool.GetChineseBrackets(OrgString))
        {
            // Drop the first and last character (presumably the bracket pair itself).
            var inner = bracketed.Substring(1, bracketed.Length - 2);
            var tail  = Utility.GetStringAfter(inner, KeyWord);
            if (tail == String.Empty)
            {
                continue;
            }
            results.Add(tail);
        }
        return(results);
    }
Ejemplo n.º 4
0
    /// <summary>
    /// Cleans up a raw company full name and, when the text carries a
    /// "hereinafter referred to as" bracket, extracts the quoted short name from it.
    /// </summary>
    /// <param name="FullName">Raw company name text; must not be null.</param>
    /// <returns>
    /// A <c>struCompanyName</c> with the cleaned full name. When a short name was found
    /// it is set as <c>secShortName</c> and <c>Score</c> is 80; otherwise <c>Score</c> is 60.
    /// </returns>
    public static struCompanyName AfterProcessFullName(string FullName)
    {
        var ShortName = String.Empty;

        // Brackets are not normalized for now, so every "hereinafter called" marker is
        // listed in both its full-width "（" and its half-width "(" form.
        var CompanyNameTrailingwords = new string[] {
            "（以下简称", "（下称", "（以下称", "（简称", "(以下简称", "(下称", "(以下称", "(简称"
        };

        foreach (var trailing in CompanyNameTrailingwords)
        {
            if (!FullName.Contains(trailing))
            {
                continue;
            }

            // Extract the short name: take the first quoted item of each bracketed
            // segment and strip its surrounding quote characters. If several segments
            // contain a quotation, the last one wins.
            var BracketsList = RegularTool.GetChineseBrackets(FullName);
            foreach (var bracketItem in BracketsList)
            {
                var ShortNameList = RegularTool.GetChineseQuotation(bracketItem);
                if (ShortNameList.Count > 0)
                {
                    ShortName = ShortNameList.First();
                    if (!String.IsNullOrEmpty(ShortName))
                    {
                        ShortName = ShortName.Substring(1, ShortName.Length - 2);
                    }
                }
            }

            // Cut the full name at the marker (delegated to Utility.GetStringBefore).
            FullName = Utility.GetStringBefore(FullName, trailing);
        }

        // "及其" = "and its ..."
        if (FullName.Contains("及其"))
        {
            FullName = Utility.GetStringBefore(FullName, "及其");
        }
        // "股东" = "shareholder"
        if (FullName.Contains("股东"))
        {
            FullName = Utility.GetStringAfter(FullName, "股东");
        }
        // "一致行动人" = "persons acting in concert"
        if (FullName.Contains("一致行动人"))
        {
            FullName = Utility.GetStringAfter(FullName, "一致行动人");
        }

        // If the remaining text resolves through the short-name lookup, use the
        // full name it reports. The lookup is performed once and its result reused.
        var ResolvedFullName = CompanyNameLogic.GetCompanyNameByShortName(FullName).secFullName;
        if (!String.IsNullOrEmpty(ResolvedFullName))
        {
            FullName = ResolvedFullName;
        }

        // Remove leading characters/words.
        FullName = EntityWordAnlayzeTool.TrimLeadingUL(FullName);
        FullName = CutOtherLeadingWords(FullName);

        // A null or empty short name counts as "not found".
        if (!String.IsNullOrEmpty(ShortName))
        {
            return(new struCompanyName()
            {
                secFullName = FullName, secShortName = ShortName, Score = 80
            });
        }

        return(new struCompanyName()
        {
            secFullName = FullName, Score = 60
        });
    }