/// <summary>
/// Title-pattern scoring formula for the case where the expected item count is unknown.
/// A larger score is better.
/// </summary>
/// <param name="Nodes">Not read by this formula.</param>
/// <param name="Pattern">Candidate pattern; its ItemCount, AverageTextLength and LeastCommonAncestor are read.</param>
/// <param name="Strategy">Not read by this formula.</param>
/// <param name="BaseItemCount">Not read by this formula.</param>
/// <returns>The weighted score of the pattern.</returns>
public double TitleScore_UnkownItemCount(IEnumerable<HtmlNode> Nodes, HtmlPattern Pattern, ListStrategy Strategy, int BaseItemCount = 0)
{
    // Base value: 4 points per item plus 3 points per unit of average text length.
    double total = Pattern.ItemCount * 4 + Pattern.AverageTextLength * 3;

    // Weight by special keywords in the id/class of the pattern's least common ancestor.
    // Only the "matched" flag is consumed; the numeric result of IDClassNameScore is ignored.
    bool preferredHit;
    IDClassNameScore(Pattern.LeastCommonAncestor, TitleClassNames_Must, null, out preferredHit);
    if (preferredHit)
    {
        // A hit on the "must" keyword list raises the score by 20%.
        total *= 1.2;
    }

    bool rejectedHit;
    IDClassNameScore(Pattern.LeastCommonAncestor, null, TitleClassNames_MustNot, out rejectedHit);
    if (rejectedHit)
    {
        // A hit on the "must not" keyword list halves the score.
        total /= 2;
    }

    return total;
}
/// <summary>
/// Title-pattern scoring formula for the case where an expected item count is available.
/// A larger score is better. The result is a product of: a title-length fit (10-100),
/// an item-count fit factor, an optional relative-XPath factor, id/class keyword weights,
/// and the factor returned by HtmlTagNameScore.
/// </summary>
/// <param name="Nodes">Not read by this formula.</param>
/// <param name="Pattern">Candidate pattern being scored.</param>
/// <param name="Strategy">Supplies the reference average title length (List_BestAvgTitleLen) and the fallback item count (List_BestItemCount).</param>
/// <param name="BaseItemCount">Expected number of items. When zero or negative, Strategy.List_BestItemCount is used instead.</param>
/// <returns>The combined score of the pattern.</returns>
public double TitleScore_WithItemCount(IEnumerable<HtmlNode> Nodes, HtmlPattern Pattern, ListStrategy Strategy, int BaseItemCount)
{
    double score;

    // If the caller did not specify an item count, take the preferred one from the strategy.
    bool noItemCount = false;
    if (BaseItemCount <= 0)
    {
        BaseItemCount = Strategy.List_BestItemCount;
        noItemCount = true;
    }

    // Title-length fit, 10-100 points. "rate" is the symmetric ratio (always the larger value
    // over the smaller) between the pattern's average text length and the reference length.
    double rate = Pattern.AverageTextLength > Strategy.List_BestAvgTitleLen
        ? Pattern.AverageTextLength / (double)Strategy.List_BestAvgTitleLen
        : Strategy.List_BestAvgTitleLen / (double)Pattern.AverageTextLength;
    if (double.IsNaN(rate))
    {
        // 0 / 0 (both lengths are zero) yields NaN, which would fail every comparison below,
        // land in the "best fit" branch and poison the whole score. Treat it as the worst fit,
        // the same bucket a single zero operand (ratio = +Infinity) already falls into.
        rate = double.PositiveInfinity;
    }

    if (rate > 3)
    {
        // Off by more than a factor of 3: flat 10 points.
        score = 10;
    }
    else if (rate > 2)
    {
        // Between 2x and 3x: falls linearly from 70 down to 10.
        score = 10 + 60 * (3 - rate);
    }
    else
    {
        // Within 2x: 70 to 100 points.
        score = 70 + 30 * (2 - rate);
    }

    // Item-count fit: the same symmetric ratio, this time between the pattern's item count
    // and the expected count.
    // Fix 2014-12-16: more items is generally better. Example: two lists with similar average
    // title length, one with 9 items and one with 50, both a bit over 2x away from the
    // preferred count, were hard to tell apart.
    rate = Pattern.ItemCount > BaseItemCount
        ? Pattern.ItemCount / (double)BaseItemCount
        : BaseItemCount / (double)Pattern.ItemCount;
    if (double.IsNaN(rate))
    {
        // Same 0 / 0 guard as above (zero items against a zero expected count).
        rate = double.PositiveInfinity;
    }

    if (!noItemCount || Pattern.ItemCount > BaseItemCount)
    {
        // An explicit count was given, or the pattern has more items than the preferred count.
        if (rate > 3)
        {
            // More than 3x off: factor 5.
            score *= 5;
        }
        else if (rate > 2)
        {
            // 2x to 3x off: factor 5 to 70.
            score *= 5 + 65 * (3 - rate);
        }
        else
        {
            // Within 2x: factor 70 to 100.
            score *= 70 + 30 * (2 - rate);
        }
    }
    else
    {
        // No explicit count and no more items than the preferred count:
        // use roughly half of the factors above.
        if (rate > 3)
        {
            // More than 3x off: factor 3.
            score *= 3;
        }
        else if (rate > 2)
        {
            // 2x to 3x off: factor 2.5 to 32.5.
            score *= 2.5 + 30 * (3 - rate);
        }
        else
        {
            // Within 2x: factor 35 to 50.
            score *= 35 + 15 * (2 - rate);
        }
    }

    // Only applies when a relative XPath has already been extracted.
    if (!string.IsNullOrEmpty(Pattern.RelXPath))
    {
        // Fewer XPath levels is better.
        if (Pattern.RelXPathLevel < 4)
        {
            // Levels below 4: bonus of 30% per level under 3 (level 3 itself is neutral).
            score *= 1 + 0.3 * (3 - Pattern.RelXPathLevel);
        }
        else
        {
            // Levels 4 and up: 20% penalty per level above 4. The factor is floored at 0 so
            // that very deep paths (level > 9) cannot turn the score negative, which would
            // invert the meaning of every later bonus and penalty multiplier.
            double depthFactor = 1 - 0.2 * (Pattern.RelXPathLevel - 4);
            if (depthFactor < 0)
            {
                depthFactor = 0;
            }
            score *= depthFactor;
        }

        // Per the original note: an XPath that uses id/class names beats one that uses
        // positional indexes, so the score is doubled.
        if (Pattern.RelXPathUsingName)
        {
            score *= 2;
        }
    }

    // Weight by special keywords in the id/class of the pattern's least common ancestor.
    // Only the "matched" flag is consumed; the numeric result of IDClassNameScore is ignored.
    bool idClassNameMatched;
    IDClassNameScore(Pattern.LeastCommonAncestor, TitleClassNames_Must, null, out idClassNameMatched);
    if (idClassNameMatched)
    {
        // A hit on the "must" keyword list raises the score by 20%.
        score *= 1.2;
    }

    IDClassNameScore(Pattern.LeastCommonAncestor, null, TitleClassNames_MustNot, out idClassNameMatched);
    if (idClassNameMatched)
    {
        // A hit on the "must not" keyword list halves the score.
        score /= 2;
    }

    // Tag-name factor computed from the relative XPath when present, otherwise the full XPath.
    // NOTE(review): the original comment says titles located under an h1 get +50%; that logic
    // lives in HtmlTagNameScore, which is not visible here - confirm.
    score *= HtmlTagNameScore(string.IsNullOrEmpty(Pattern.RelXPath) ? Pattern.XPath : Pattern.RelXPath);

    return score;
}