Beispiel #1
0
        /// <summary>
        /// Internal helper that matches Id/ClassName keys against a set of nodes.
        /// For every key whose value in <paramref name="stencilfeature"/>'s IdClassnameRecord is 1,
        /// the level (1 = the node itself, 2 = its parent, ...) at which the key is first matched
        /// is written to <paramref name="feature"/>'s IdClassnameRecord, as long as every tested
        /// node agrees on that level. Keys that are never matched, or whose level differs between
        /// nodes, end up with the maximum level (15).
        /// </summary>
        /// <param name="feature">Feature whose IdClassnameRecord is updated in place.</param>
        /// <param name="Nodes">Nodes to test; processing stops at the first node reported as a top node.</param>
        /// <param name="stencilfeature">Template feature; only keys whose record value equals 1 are tracked.</param>
        /// <param name="forListPage">Not used by this method.</param>
        /// <returns>The <paramref name="feature"/> instance that was passed in.</returns>
        internal static Feature CheckforIdorClassName(Feature feature, IEnumerable<HtmlNode> Nodes, Feature stencilfeature, bool forListPage)
        {
            // Highest level examined per node; also the value stored for unmatched/inconsistent keys.
            const int MaxLevel = 15;

            int nodestested = 0;
            Dictionary<string, int> temp = new Dictionary<string, int>();

            foreach (HtmlNode node in Nodes)
            {
                HtmlNode Node = node;
                if (XPathUtility.isTopNode(Node))
                {
                    break;
                }

                // Fresh per-node map: tracked key -> level of first match (0 = not matched yet).
                temp = new Dictionary<string, int>();
                foreach (string key in stencilfeature.IdClassnameRecord.Keys)
                {
                    if (stencilfeature.IdClassnameRecord[key] == 1)
                    {
                        temp.Add(key, 0);
                    }
                }

                // Walk upwards from the node, stopping before a node whose parent is a top node.
                for (int i = 1; i <= MaxLevel; i++)
                {
                    if (XPathUtility.isTopNode(Node.ParentNode))
                    {
                        break;
                    }
                    foreach (string key in stencilfeature.IdClassnameRecord.Keys)
                    {
                        if (stencilfeature.IdClassnameRecord[key] == 1 && temp[key] == 0 && XPathUtility.IDClassContain(Node, key))
                        {
                            temp[key] = i;
                        }
                    }
                    Node = Node.ParentNode;
                }

                foreach (string key in temp.Keys)
                {
                    if (nodestested != 0 && feature.IdClassnameRecord[key] != temp[key])
                    {
                        // Nodes disagree on the level for this key: mark it as inconsistent.
                        // Previously this reset was immediately overwritten by an unconditional
                        // assignment, so the consistency check had no effect.
                        feature.IdClassnameRecord[key] = 0;
                    }
                    else
                    {
                        feature.IdClassnameRecord[key] = temp[key];
                    }
                }
                nodestested++;
            }

            // Unmatched or inconsistent keys (value 0) fall back to the maximum level.
            foreach (string key in temp.Keys)
            {
                if (feature.IdClassnameRecord[key] == 0)
                {
                    feature.IdClassnameRecord[key] = MaxLevel;
                }
            }
            return feature;
        }
Beispiel #2
0
        /// <summary>
        /// Scores a node by its ID and class name; a larger score is better.
        /// The node itself and up to <paramref name="UpLevel"/> ancestors are examined. Each level
        /// is weighted by 0.8 times the weight of the level below it (the node itself has weight 1).
        /// Per level: positive keywords only adds 1.5 x weight, both positive and negative adds
        /// 0.5 x weight, negative only subtracts the weight.
        /// </summary>
        /// <param name="Node">Node to score; a null node scores 0.</param>
        /// <param name="MustKeywords">Positive keywords looked up via XPathUtility.IDClassContain.</param>
        /// <param name="MustNotKeywords">Negative keywords looked up via XPathUtility.IDClassContain.</param>
        /// <param name="Matched">Set to true when any positive or negative keyword matched at any examined level.</param>
        /// <param name="UpLevel">How many levels above the node (not counting the node itself) are examined.</param>
        /// <returns>
        /// The accumulated score; 0 when nothing matched, and 0 immediately when the node itself
        /// matches only negative keywords.
        /// </returns>
        static double IDClassNameScore(HtmlNode Node, IEnumerable<string> MustKeywords, IEnumerable<string> MustNotKeywords, out bool Matched, int UpLevel = 2)
        {
            Matched = false;
            if (Node == null)
            {
                return 0;
            }

            double total = 0;
            double weight = 1;
            HtmlNode current = Node;

            for (int depth = 0; ; depth++)
            {
                bool positive = XPathUtility.IDClassContain(current, MustKeywords);
                bool negative = XPathUtility.IDClassContain(current, MustNotKeywords);

                // Any hit, positive or negative, is reported to the caller.
                if (positive || negative)
                {
                    Matched = true;
                }

                if (positive)
                {
                    // Positive hit: full bonus when clean, reduced bonus when a negative keyword also hit.
                    total += weight * (negative ? 0.5 : 1.5);
                }
                else if (negative)
                {
                    if (depth == 0)
                    {
                        // The node itself carries only negative keywords: rule it out straight away.
                        return 0;
                    }
                    total -= weight;
                }

                // Move one level up with a reduced weight.
                weight *= 0.8;
                current = current.ParentNode;

                if (XPathUtility.isTopNode(current) || depth >= UpLevel)
                {
                    break;
                }
            }

            return total;
        }
Beispiel #3
0
        /// <summary>
        /// Checks the tag name of a node and gives a bonus when the node, or a close ancestor,
        /// has a tag name contained in HighLevelTagName (tags such as H1, per the original comment).
        /// </summary>
        /// <param name="Node">Node to check; a null node yields 1.</param>
        /// <param name="UpLevel">
        /// Maximum number of nodes examined, starting with <paramref name="Node"/> itself and moving
        /// to its parents. NOTE(review): the original comment described this as levels above the
        /// node excluding itself, but the loop counts the node itself - confirm which is intended.
        /// </param>
        /// <returns>1 when no tag name matched; 1.5 when one did.</returns>
        static double HtmlTagNameScore(HtmlNode Node, int UpLevel = 2)
        {
            if (Node == null)
            {
                return 1;
            }
            int Level = 0;

            while (!XPathUtility.isTopNode(Node) && Level++ < UpLevel)
            {
                // Tag names are identifiers, not linguistic text: lower-case them with the invariant
                // culture so the lookup does not depend on the current locale (e.g. Turkish 'I').
                if (HighLevelTagName.Contains(Node.Name.ToLowerInvariant()))
                {
                    return 1.5;
                }

                Node = Node.ParentNode;
            }

            return 1;
        }