/// <summary>
/// Internal helper that records, for every tracked id/class-name key, the ancestor depth at
/// which that key is found above the given nodes.
/// </summary>
/// <remarks>
/// A key is tracked when <paramref name="stencilfeature"/>'s <c>IdClassnameRecord</c> holds 1 for it.
/// For each node the ancestor chain is walked for at most 15 steps (the node itself is depth 1)
/// and the first depth at which <c>XPathUtility.IDClassContain</c> reports the key is noted.
/// The walk stops as soon as the parent of the current node is a top node, so that last node
/// is not inspected. A key whose depth differs between nodes, or which is not found, ends up
/// recorded as 15.
/// </remarks>
/// <param name="feature">Feature whose <c>IdClassnameRecord</c> is updated in place.</param>
/// <param name="Nodes">Nodes to inspect; processing stops at the first node that
/// <c>XPathUtility.isTopNode</c> reports as a top node.</param>
/// <param name="stencilfeature">Feature whose <c>IdClassnameRecord</c> selects the keys to track (value 1).</param>
/// <param name="forListPage">Not read by this method.</param>
/// <returns>The same <paramref name="feature"/> instance that was passed in.</returns>
internal static Feature CheckforIdorClassName(Feature feature, IEnumerable<HtmlNode> Nodes, Feature stencilfeature, bool forListPage)
{
    // Deepest level inspected per node; also the fallback depth written for unreliable keys.
    const int MaxDepth = 15;

    int nodesTested = 0;
    Dictionary<string, int> depthByKey = new Dictionary<string, int>();

    foreach (HtmlNode node in Nodes)
    {
        HtmlNode current = node;
        if (XPathUtility.isTopNode(current))
        {
            break;
        }

        // Every tracked key starts at 0, meaning "not found yet" for this node.
        depthByKey = new Dictionary<string, int>();
        foreach (string key in stencilfeature.IdClassnameRecord.Keys)
        {
            if (stencilfeature.IdClassnameRecord[key] == 1)
            {
                depthByKey.Add(key, 0);
            }
        }

        for (int depth = 1; depth <= MaxDepth; depth++)
        {
            if (XPathUtility.isTopNode(current.ParentNode))
            {
                break;
            }

            // Enumerate the stencil's keys (not depthByKey) so the dictionary being written
            // is never the one being enumerated.
            foreach (string key in stencilfeature.IdClassnameRecord.Keys)
            {
                if (stencilfeature.IdClassnameRecord[key] == 1
                    && depthByKey[key] == 0
                    && XPathUtility.IDClassContain(current, key))
                {
                    depthByKey[key] = depth;
                }
            }

            current = current.ParentNode;
        }

        foreach (string key in depthByKey.Keys)
        {
            if (nodesTested != 0 && feature.IdClassnameRecord[key] != depthByKey[key])
            {
                // Nodes disagree on the depth: mark the key as unreliable.
                // Previously this 0 was overwritten by the assignment below on every pass,
                // so a disagreement was never recorded and the last node always won.
                feature.IdClassnameRecord[key] = 0;
            }
            else
            {
                feature.IdClassnameRecord[key] = depthByKey[key];
            }
        }

        nodesTested++;
    }

    // Keys left at 0 (not found, or inconsistent between nodes) fall back to the maximum depth.
    foreach (string key in depthByKey.Keys)
    {
        if (feature.IdClassnameRecord[key] == 0)
        {
            feature.IdClassnameRecord[key] = MaxDepth;
        }
    }

    return feature;
}
/// <summary>
/// Scores a node by the id/class-name keywords found on it and on its ancestors; higher is better.
/// </summary>
/// <remarks>
/// The node itself has weight 1 and each step up multiplies the weight by 0.8. Per level:
/// a positive-only hit adds 1.5 x weight, a hit on both lists adds 0.5 x weight, and a
/// negative-only hit subtracts the weight, except on the node itself, where it ends the
/// scoring immediately with a result of 0.
/// NOTE(review): earlier comments described a score of 1 per hit, halving per level and a
/// result of 1 when nothing matches; the code uses 1.5 / 0.5, a 0.8 decay and returns 0
/// when nothing matches. Confirm which is intended.
/// </remarks>
/// <param name="Node">Node to score; <c>null</c> yields 0.</param>
/// <param name="MustKeywords">Positive keywords, passed to <c>XPathUtility.IDClassContain</c>.</param>
/// <param name="MustNotKeywords">Negative keywords, passed to <c>XPathUtility.IDClassContain</c>.</param>
/// <param name="Matched">Set to <c>true</c> when any keyword of either list was hit on any inspected level.</param>
/// <param name="UpLevel">Number of ancestor levels inspected in addition to the node itself.</param>
/// <returns>The accumulated score.</returns>
static double IDClassNameScore(HtmlNode Node, IEnumerable<string> MustKeywords, IEnumerable<string> MustNotKeywords, out bool Matched, int UpLevel = 2)
{
    Matched = false;
    if (Node == null)
    {
        return 0;
    }

    double total = 0;
    double weight = 1;
    HtmlNode current = Node;

    for (int depth = 0; ; depth++)
    {
        bool positive = XPathUtility.IDClassContain(current, MustKeywords);
        bool negative = XPathUtility.IDClassContain(current, MustNotKeywords);

        // Any hit, positive or negative, raises the flag.
        if (positive || negative)
        {
            Matched = true;
        }

        if (positive)
        {
            // Positive only: 1.5; positive and negative together: 0.5.
            total += weight * (negative ? 0.5 : 1.5);
        }
        else if (negative)
        {
            if (depth == 0)
            {
                // Negative-only hit on the node itself rules it out immediately.
                return 0;
            }

            total -= weight;
        }

        // Move one level up with a reduced weight.
        weight *= 0.8;
        current = current.ParentNode;

        if (XPathUtility.isTopNode(current) || depth >= UpLevel)
        {
            break;
        }
    }

    return total;
}
/// <summary>
/// Checks the tag name of a node and of its ancestors, giving a bonus when one of them is
/// listed in <c>HighLevelTagName</c>.
/// </summary>
/// <param name="Node">Node to inspect; <c>null</c> yields 1.</param>
/// <param name="UpLevel">Number of ancestor levels inspected in addition to the node itself.</param>
/// <returns>1.5 when an inspected node's tag name is in <c>HighLevelTagName</c>; otherwise 1.</returns>
static double HtmlTagNameScore(HtmlNode Node, int UpLevel = 2)
{
    if (Node == null)
    {
        return 1;
    }

    HtmlNode current = Node;

    // Inspect the node itself (level 0) plus UpLevel ancestors. The previous condition
    // (Level++ < UpLevel) counted the node itself as one of the levels, so it looked at one
    // node fewer than documented, and at none at all when UpLevel was 0.
    for (int level = 0; !XPathUtility.isTopNode(current) && level <= UpLevel; level++)
    {
        // Invariant lower-casing keeps the comparison independent of the current culture.
        if (HighLevelTagName.Contains(current.Name.ToLowerInvariant()))
        {
            return 1.5;
        }

        current = current.ParentNode;
    }

    return 1;
}