Exemple #1
0
        /// <summary>
        /// Scoring function for candidate author nodes (higher is better).
        /// Each node gets a possibility value built from text heuristics and
        /// id/class name hints; the values are summed, adjusted by path-level
        /// factors and divided by the number of nodes.
        /// </summary>
        /// <param name="Nodes">Candidate nodes to score. Enumerated exactly once.</param>
        /// <param name="Strategy">Supplies MaxLenAuthor, MediaType, List_BestAvgAuthorLen and Language used by the heuristics.</param>
        /// <param name="BaseItemCount">Not used by this method; kept for signature compatibility.</param>
        /// <param name="PathLevel">Path level; every level above 1 deducts 10% from the score.</param>
        /// <param name="PathUsingName">True when the path is based on id/class rather than an ordinal; adds 20% to the score.</param>
        /// <returns>
        /// The adjusted sum of per-node possibilities divided by the total node count
        /// (nodes skipped for being too long still count in the divisor).
        /// Returns 0 when <paramref name="Nodes"/> is empty.
        /// </returns>
        internal static double AuthorNodeRelPatternScore(IEnumerable <HtmlNode> Nodes, ListStrategy Strategy, int BaseItemCount, int PathLevel, bool PathUsingName)
        {
            double AvgPossibility = 0;

            // Counted inside the loop so the sequence is enumerated only once and
            // an empty sequence can be detected before dividing.
            int NodeCount = 0;

            foreach (HtmlNode Node in Nodes)
            {
                NodeCount++;

                string Text = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(Node));
                if (Text.Length > Strategy.MaxLenAuthor)
                {
                    // Too long to be an author name: contributes 0 but still counts in the average.
                    continue;
                }

                // Forum pages start with a small base possibility.
                double Posibility = Strategy.MediaType == Enums.MediaType.Forum ? 0.2 : 0;

                // Characters typical of dates or click counts were hit. The node cannot be
                // ruled out entirely (unusual names exist), so the possibility is set to 0.2.
                if (TextCleaner.CountCharInSet(Text, FrequentCharSet_MustNotMediaAuthorDate, false) > 0)
                {
                    Posibility = 0.2;
                }

                // Looks like a date/time: more than 60% of the characters are date characters.
                if (TextCleaner.CountCharInSet(Text, FrequentCharSet_Dates, false) > Text.Length * 0.6)
                {
                    Posibility *= 0.3;
                }

                // Label check: author / selected by / editor / reporter.
                Match PrefixMatch = Regex.Match(Text, @"作\s*者|选\s*稿|编\s*辑|记\s*者");
                if (PrefixMatch.Success)
                {
                    // Keep only what follows the label, without leading separators.
                    Text = Text.Substring(PrefixMatch.Index + PrefixMatch.Length).TrimStart(':', ':', ']', '】', ' ');

                    // Cut at the first space.
                    int SpaceIndex = Text.IndexOf(' ');
                    if (SpaceIndex > 0)
                    {
                        Text = Text.Substring(0, SpaceIndex);
                    }

                    Posibility = 1;
                }

                // Remove stop words.
                Text = TextCleaner.RemoveStopWords(Text, StopWords);

                // Hit on one of the top-10 surnames.
                if (Regex.IsMatch(Text, @"[李王张刘陈杨赵黄周吴]"))
                {
                    if (Posibility > 0)
                    {
                        Posibility *= 1.1;
                    }
                    else
                    {
                        Posibility = 0.6;
                    }
                }

                // Weight by special words in the node's class and id.
                bool   IDClassNameMatched;
                double IDClassScore = IDClassNameScore(Node, AuthorClassNames_Must, AuthorClassNames_MustNot, out IDClassNameMatched);
                if (IDClassNameMatched)
                {
                    Posibility = (Posibility == 0 ? 1 : Posibility) * IDClassScore;
                }

                // "Reply" in id or class zeroes the node (only the main post's author is wanted).
                IDClassNameScore(Node, ReplyClassNames_Must, null, out IDClassNameMatched);
                if (IDClassNameMatched) // set as soon as any reply name is hit
                {
                    Posibility = 0;
                }

                // Length check against the strategy's best average author length.
                if (Text.Length * 3 < Strategy.List_BestAvgAuthorLen || Text.Length > Strategy.List_BestAvgAuthorLen * 3)
                {
                    Posibility /= 3; // more than 3x off: reduce to one third
                }
                else
                {
                    // Rate is between 1 and 3.
                    double Rate = Text.Length >= Strategy.List_BestAvgAuthorLen ? Text.Length / (double)Strategy.List_BestAvgAuthorLen
                        : Strategy.List_BestAvgAuthorLen / (double)Text.Length;
                    // The factor shrinks as Rate grows (1.1 at Rate 1, 0.4 at Rate 3).
                    Posibility *= (0.4 + 0.35 * (3 - Rate));
                }

                // On forums, an ASCII letter, digit or underscore adds 50%.
                // The character class deliberately contains no commas: inside [...] a comma
                // is a literal, so the previous "[a-z,0-9,_]" also rewarded commas.
                if (Strategy.MediaType == Enums.MediaType.Forum && Regex.IsMatch(Text, @"[a-z0-9_]", RegexOptions.IgnoreCase))
                {
                    Posibility *= 1.5;
                }

                // A non-name separator (including space) appears: multiply by 0.6.
                if (TextCleaner.CountCharInSet(Text, NoneNameSeperatorSet, Strategy.Language == Language.CHINESE) > 0)
                {
                    Posibility *= 0.6;
                }

                // Accumulate this node's possibility.
                AvgPossibility += Posibility;
            }

            // No nodes: avoid 0.0 / 0, which would yield NaN.
            if (NodeCount == 0)
            {
                return 0;
            }

            // Adjustment 1: lower levels are better; each level above 1 deducts 10%.
            // NOTE(review): the factor becomes negative for PathLevel > 11 — confirm callers never pass that.
            if (PathLevel > 1)
            {
                AvgPossibility *= (1 - 0.1 * (PathLevel - 1));
            }

            // Adjustment 2: paths using id/class beat ordinal paths, +20%.
            if (PathUsingName)
            {
                AvgPossibility *= 1.2;
            }

            return(AvgPossibility / NodeCount);
        }
Exemple #2
0
        /// <summary>
        /// Scoring function for candidate media-name nodes (higher is better).
        /// Each node gets a possibility value built from text heuristics and
        /// id/class name hints; the values are summed, adjusted by path-level
        /// and media-type factors and divided by the number of nodes.
        /// </summary>
        /// <param name="Nodes">Candidate nodes to score. Enumerated exactly once.</param>
        /// <param name="Strategy">Supplies MaxLenMedia, MediaType and List_BestAvgMediaLen used by the heuristics.</param>
        /// <param name="BaseItemCount">Not used by this method; kept for signature compatibility.</param>
        /// <param name="PathLevel">Path level; every level above 1 deducts 10% from the score.</param>
        /// <param name="PathUsingName">True when the path is based on id/class rather than an ordinal; adds 20% to the score.</param>
        /// <returns>
        /// The adjusted sum of per-node possibilities divided by the total node count
        /// (nodes skipped for being too long still count in the divisor).
        /// Returns 0 when <paramref name="Nodes"/> is empty.
        /// </returns>
        internal static double MediaNodeRelPatternScore(IEnumerable <HtmlNode> Nodes, ListStrategy Strategy, int BaseItemCount, int PathLevel, bool PathUsingName)
        {
            double AvgPossibility = 0;

            // Counted inside the loop so the sequence is enumerated only once and
            // an empty sequence can be detected before dividing.
            int NodeCount = 0;

            foreach (HtmlNode Node in Nodes)
            {
                NodeCount++;

                string Text = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(Node));
                if (Text.Length > Strategy.MaxLenMedia)
                {
                    // Too long to be a media name: contributes 0 but still counts in the average.
                    continue;
                }

                // Web-news pages start with a small base possibility.
                double Posibility = Strategy.MediaType == MediaType.WebNews ? 0.1 : 0;

                // More than one character typical of dates or click counts was hit. The node
                // cannot be ruled out entirely (unusual newspaper names exist), so the
                // possibility is set to 0.2.
                if (TextCleaner.CountCharInSet(Text, FrequentCharSet_MustNotMediaAuthorDate, false) > 1)
                {
                    Posibility = 0.2;
                }

                // Looks like a date/time: more than 60% of the characters are date characters.
                if (TextCleaner.CountCharInSet(Text, FrequentCharSet_Dates, false) > Text.Length * 0.6)
                {
                    Posibility *= 0.3;
                }

                // Label check: keep only what follows the media prefix, without leading separators.
                Match PrefixMatch = Regex.Match(Text, MediaPrefixRegex);
                if (PrefixMatch.Success)
                {
                    Text       = Text.Substring(PrefixMatch.Index + PrefixMatch.Length).TrimStart(':', ':', ']', '】', ' ');
                    Posibility = 1;
                }

                // Remove stop words.
                Text = TextCleaner.RemoveStopWords(Text, StopWords);

                // Media-name characters (newspaper / station / site / periodical).
                // NOTE(review): the original comment said "check the ending", but the pattern
                // is unanchored and matches anywhere in the text — confirm the intent.
                if (Regex.IsMatch(Text, @"[报台网刊]"))
                {
                    if (Posibility > 0)
                    {
                        Posibility = Math.Max(1, Posibility * 1.5);
                    }
                    else
                    {
                        Posibility = 0.8;
                    }
                }

                // Weight by special words in the node's class and id.
                bool   IDClassNameMatched;
                double IDClassScore = IDClassNameScore(Node, MediaClassNames_Must, MediaClassNames_MustNot, out IDClassNameMatched);
                if (IDClassNameMatched)
                {
                    Posibility = (Posibility == 0 ? 1 : Posibility) * IDClassScore;
                }

                // Length check against the strategy's best average media-name length.
                if (Text.Length * 3 < Strategy.List_BestAvgMediaLen || Text.Length > Strategy.List_BestAvgMediaLen * 3)
                {
                    Posibility /= 3; // more than 3x off: reduce to one third
                }
                else
                {
                    // Rate is between 1 and 3.
                    double Rate = Text.Length >= Strategy.List_BestAvgMediaLen ? Text.Length / (double)Strategy.List_BestAvgMediaLen
                        : Strategy.List_BestAvgMediaLen / (double)Text.Length;
                    // The factor shrinks as Rate grows (1.1 at Rate 1, 0.4 at Rate 3).
                    Posibility *= (0.4 + 0.35 * (3 - Rate));
                }

                // A non-name separator (including space) appears: multiply by 0.6.
                if (TextCleaner.CountCharInSet(Text, NoneNameSeperatorSet, true) > 0)
                {
                    Posibility *= 0.6;
                }

                // Text detected as English: multiply by 0.6 again.
                if (LanguageUtility.DetectedLanguage(Text) == Enums.Language.ENGLISH)
                {
                    Posibility *= 0.6;
                }

                // Accumulate this node's possibility.
                AvgPossibility += Posibility;
            }

            // No nodes: avoid 0.0 / 0, which would yield NaN.
            if (NodeCount == 0)
            {
                return 0;
            }

            // Adjustment 1: lower levels are better; each level above 1 deducts 10%.
            // NOTE(review): the factor becomes negative for PathLevel > 11 — confirm callers never pass that.
            if (PathLevel > 1)
            {
                AvgPossibility *= (1 - 0.1 * (PathLevel - 1));
            }

            // Adjustment 2: paths using id/class beat ordinal paths, +20%.
            if (PathUsingName)
            {
                AvgPossibility *= 1.2;
            }

            // Forum pages: halve the media score.
            if (Strategy.MediaType == Enums.MediaType.Forum)
            {
                AvgPossibility /= 2;
            }

            return(AvgPossibility / NodeCount);
        }