コード例 #1
0
        /// <summary>
        /// Title-pattern scoring formula for the case where the item count is unknown
        /// (the larger the score, the better).
        /// </summary>
        /// <param name="Nodes">Unused parameter.</param>
        /// <param name="Pattern">Pattern being scored; its ItemCount, AverageTextLength and LeastCommonAncestor are read.</param>
        /// <param name="Strategy">Not referenced by this method.</param>
        /// <param name="BaseItemCount">Unused parameter.</param>
        /// <returns>The weighted score after the id/class name adjustments.</returns>
        public double TitleScore_UnkownItemCount(IEnumerable <HtmlNode> Nodes, HtmlPattern Pattern, ListStrategy Strategy, int BaseItemCount = 0)
        {
            // Base score: item count weighted by 4 plus average text length weighted by 3.
            double result = /*model.VisualScore * 100 + */ Pattern.ItemCount * 4 + Pattern.AverageTextLength * 3;

            // Adjust by whether the ancestor's class/id contains special keywords.
            // Only the "matched" flag is consumed; the numeric return value is not needed here.
            bool nameMatched;

            // A hit on the "must" keyword list raises the score by 20%.
            IDClassNameScore(Pattern.LeastCommonAncestor, TitleClassNames_Must, null, out nameMatched);
            if (nameMatched)
            {
                result *= 1.2;
            }

            // A hit on the "must not" keyword list halves the score.
            IDClassNameScore(Pattern.LeastCommonAncestor, null, TitleClassNames_MustNot, out nameMatched);
            if (nameMatched)
            {
                result /= 2;
            }

            return result;
        }
コード例 #2
0
        /// <summary>
        /// Title-pattern scoring formula for the case where the item count is known
        /// (the larger the score, the better).
        /// </summary>
        /// <remarks>
        /// The result is a product of: a title-length score (10-100), an item-count multiplier,
        /// optional RelXPath multipliers (only when <c>Pattern.RelXPath</c> is non-empty),
        /// id/class keyword adjustments, and the value returned by <c>HtmlTagNameScore</c>.
        /// </remarks>
        /// <param name="Nodes">Not used by this method.</param>
        /// <param name="Pattern">Pattern being scored.</param>
        /// <param name="Strategy">
        /// Supplies <c>List_BestAvgTitleLen</c> and, when <paramref name="BaseItemCount"/> is not positive,
        /// <c>List_BestItemCount</c>.
        /// </param>
        /// <param name="BaseItemCount">
        /// Number of base items (ItemBase). When it is &lt;= 0, <c>Strategy.List_BestItemCount</c> is used instead.
        /// </param>
        /// <returns>The computed score.</returns>
        public double TitleScore_WithItemCount(IEnumerable <HtmlNode> Nodes, HtmlPattern Pattern, ListStrategy Strategy, int BaseItemCount)
        {
            double Score = 0;

            // If no best item count was specified, fall back to the one provided by the strategy.
            bool NoItemCount = false;

            if (BaseItemCount <= 0)
            {
                BaseItemCount = Strategy.List_BestItemCount;
                NoItemCount   = true;
            }

            // Title length is scored 10-100. Rate is the larger-over-smaller ratio between the
            // pattern's average text length and the strategy's best average title length, so a
            // value above 3 means "less than 1/3 or more than 3x" and gets the minimum of 10.
            double Rate = Pattern.AverageTextLength > Strategy.List_BestAvgTitleLen ? Pattern.AverageTextLength / (double)Strategy.List_BestAvgTitleLen : Strategy.List_BestAvgTitleLen / (double)Pattern.AverageTextLength;

            if (Rate > 3)
            {
                Score = 10;
            }
            else if (Rate > 2)
            {
                // Ratio between 2x and 3x: falls linearly from 70 down to 10.
                Score = 10 + 60 * (3 - Rate);
            }
            else
            {
                // Ratio within 2x: 70 up to 100.
                Score = 70 + 30 * (2 - Rate);
            }

            // Next, weigh the likelihood based on the item count; anything between 1/3 and 3x of the
            // base count is acceptable, with the likelihood decreasing proportionally.
            // fix 20141216: in general more items is still better (example: two lists with similar
            // average title length, one with 9 items and one with 50, both a bit over 2x, are hard
            // to tell apart).
            Rate = Pattern.ItemCount > BaseItemCount ? Pattern.ItemCount / (double)BaseItemCount : BaseItemCount / (double)Pattern.ItemCount;
            if (!NoItemCount || Pattern.ItemCount > BaseItemCount)
            {
                // These are multipliers applied to the length score, not absolute points.
                if (Rate > 3) // more than 3x: multiplier 5
                {
                    Score *= 5;
                }
                else if (Rate > 2) // 2x-3x: multiplier 5 to 70
                {
                    Score *= (5 + 65 * (3 - Rate));
                }
                else // within 2x: multiplier 70 to 100
                {
                    Score *= 70 + 30 * (2 - Rate);
                }
            }
            else
            {
                // No item count was specified and the pattern does not have more items than the
                // strategy's best count: use reduced multipliers.
                // NOTE(review): unlike the branch above, these constants are not continuous at
                // Rate == 3 (2.5 vs 3) or at Rate == 2 (32.5 vs 35). Left unchanged because it may
                // be deliberate tuning - confirm.
                if (Rate > 3) // more than 3x: multiplier 3
                {
                    Score *= 3;
                }
                else if (Rate > 2) // 2x-3x: multiplier 2.5 to 32.5
                {
                    Score *= (2.5 + 30 * (3 - Rate));
                }
                else // within 2x: multiplier 35 to 50
                {
                    Score *= 35 + 15 * (2 - Rate);
                }
            }

            // Extra weighting when a relative XPath has already been extracted.
            if (!string.IsNullOrEmpty(Pattern.RelXPath))
            {
                // The fewer XPath levels the better; every additional level costs score.
                double LevelFactor;
                if (Pattern.RelXPathLevel < 4)
                {
                    // Levels below 4: bonus of 0.3 per level under 3 (level 3 is neutral).
                    LevelFactor = 1 + 0.3 * (3 - Pattern.RelXPathLevel);
                }
                else
                {
                    // Level 4 is neutral; each level beyond it costs 0.2.
                    LevelFactor = 1 - 0.2 * (Pattern.RelXPathLevel - 4);

                    // Clamp at zero: from level 10 on the raw factor is negative, which would flip
                    // the sign of the score so that the later bonuses lower it and the later
                    // penalties raise it.
                    if (LevelFactor < 0)
                    {
                        LevelFactor = 0;
                    }
                }
                Score *= LevelFactor;

                // Paths addressed by id/class are better than paths addressed by index: double the score.
                if (Pattern.RelXPathUsingName)
                {
                    Score *= 2;
                }
            }

            // Adjust by whether the ancestor's class/id contains special keywords.
            // Only the "matched" flag is consumed; the numeric return value is not needed here.
            bool IDClassNameMatched;

            // A hit on the "must" keyword list raises the score by 20%.
            IDClassNameScore(Pattern.LeastCommonAncestor, TitleClassNames_Must, null, out IDClassNameMatched);
            if (IDClassNameMatched)
            {
                Score *= 1.2;
            }

            // A hit on the "must not" keyword list halves the score.
            IDClassNameScore(Pattern.LeastCommonAncestor, null, TitleClassNames_MustNot, out IDClassNameMatched);
            if (IDClassNameMatched)
            {
                Score /= 2;
            }

            // Title-specific tag weighting on the relative XPath, or on the full XPath when no
            // relative one exists. Per the original note, a path under h1 gets a 50% bonus -
            // the helper is not visible here, so confirm against HtmlTagNameScore.
            Score *= HtmlTagNameScore(string.IsNullOrEmpty(Pattern.RelXPath) ? Pattern.XPath : Pattern.RelXPath);

            return(Score);
        }