Beispiel #1
0
        /// <summary>
        /// 进行匹配判断Id或ClassName的函数,内部使用
        /// </summary>
        /// <param name="feature"></param>
        /// <param name="Nodes"></param>
        /// <returns></returns>
        internal static Feature CheckforIdorClassName(Feature feature, IEnumerable <HtmlNode> Nodes, Feature stencilfeature, bool forListPage)
        {
            int nodestested = 0;
            Dictionary <string, int> temp = new Dictionary <string, int>();

            foreach (HtmlNode node in Nodes)
            {
                HtmlNode Node = node;
                if (XPathUtility.isTopNode(Node))
                {
                    break;
                }

                temp = new Dictionary <string, int>();
                foreach (string key in stencilfeature.IdClassnameRecord.Keys)
                {
                    if (stencilfeature.IdClassnameRecord[key] == 1)
                    {
                        temp.Add(key, 0);
                    }
                }

                for (int i = 1; i < 16; i++)
                {
                    if (XPathUtility.isTopNode(Node.ParentNode))
                    {
                        break;
                    }
                    foreach (string key in stencilfeature.IdClassnameRecord.Keys)
                    {
                        if (stencilfeature.IdClassnameRecord[key] == 1 && temp[key] == 0 && XPathUtility.IDClassContain(Node, key))
                        {
                            temp[key] = i;
                        }
                    }
                    Node = Node.ParentNode;
                }
                foreach (string key in temp.Keys)
                {
                    if (nodestested != 0 && feature.IdClassnameRecord[key] != temp[key])
                    {
                        feature.IdClassnameRecord[key] = 0;
                    }
                    feature.IdClassnameRecord[key] = temp[key];
                }
                nodestested++;
            }
            foreach (string key in temp.Keys)
            {
                if (feature.IdClassnameRecord[key] == 0)
                {
                    feature.IdClassnameRecord[key] = 15;
                }
            }
            return(feature);
        }
        public FieldPatternScore(string DatePath, string ViewPath, string ReplyPath, string MediaPath, string AuthorPath, double CrossScore)
        {
            this.DatePath   = DatePath;
            this.ViewPath   = ViewPath;
            this.ReplyPath  = ReplyPath;
            this.MediaPath  = MediaPath;
            this.AuthorPath = AuthorPath;
            this.Score      = CrossScore;

            int CountName = 0, CountLevel = 0;

            if (!string.IsNullOrEmpty(DatePath))
            {
                if (XPathUtility.isXPathUsingName(DatePath))
                {
                    CountName++;
                }
                CountLevel += XPathUtility.CountXPathLevel(DatePath);
            }
            if (!string.IsNullOrEmpty(ViewPath))
            {
                if (XPathUtility.isXPathUsingName(ViewPath))
                {
                    CountName++;
                }
                CountLevel += XPathUtility.CountXPathLevel(ViewPath);
            }
            if (!string.IsNullOrEmpty(ReplyPath))
            {
                if (XPathUtility.isXPathUsingName(ReplyPath))
                {
                    CountName++;
                }
                CountLevel += XPathUtility.CountXPathLevel(ReplyPath);
            }
            if (!string.IsNullOrEmpty(MediaPath))
            {
                if (XPathUtility.isXPathUsingName(MediaPath))
                {
                    CountName++;
                }
                CountLevel += XPathUtility.CountXPathLevel(MediaPath);
            }
            if (!string.IsNullOrEmpty(AuthorPath))
            {
                if (XPathUtility.isXPathUsingName(AuthorPath))
                {
                    CountName++;
                }
                CountLevel += XPathUtility.CountXPathLevel(AuthorPath);
            }
            this.CountLevel     = CountLevel;
            this.CountUsingName = CountName;
        }
Beispiel #3
0
        /// <summary>
        /// 根据相对路径XPath从单一Item的BaseNode节点提取某一个字段的Node的InnerText
        /// </summary>
        /// <param name="BaseNode">一个Item的根节点</param>
        /// <param name="RelXPath">相对XPath路径</param>
        /// <param name="CleanConnectionMark">是否清洗文本</param>
        /// <returns></returns>
        internal static string ExtractInnerTextFromBaseNode(HtmlNode BaseNode, string RelXPath, int postion, bool CleanConnectionMark = true)
        {
            if (BaseNode == null)
            {
                return(null);
            }

            if (string.IsNullOrWhiteSpace(RelXPath))
            {
                if (CleanConnectionMark)
                {
                    return(TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(BaseNode)));
                }
                else
                {
                    return(TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(BaseNode), true, true, true, false, true, false));
                }
            }

            string innerTextValue = "";

            try
            {
                HtmlNodeNavigator navigator = (HtmlNodeNavigator)BaseNode.CreateNavigator();
                var node = navigator.SelectSingleNode(RelXPath);
                innerTextValue = node.Value;
            }
            catch (Exception ex)
            { }

            if (string.IsNullOrWhiteSpace(innerTextValue))
            {
                IEnumerable <HtmlNode> MatchNodes = BaseNode.SelectNodes(RelXPath);
                if (MatchNodes != null)
                {
                    MatchNodes = MatchNodes.Where(n => !string.IsNullOrEmpty(XPathUtility.InnerTextNonDescendants(n)));
                }
                if (!string.IsNullOrWhiteSpace(RelXPath) && (MatchNodes == null || MatchNodes.Count() == 0))
                {
                    return(null);
                }

                innerTextValue = XPathUtility.InnerTextNonDescendants(MatchNodes.First());
            }

            if (CleanConnectionMark)
            {
                return(TextCleaner.FullClean(innerTextValue));
            }
            else
            {
                return(TextCleaner.FullClean(innerTextValue, true, true, true, false, true, false));
            }
        }
Beispiel #4
0
        /// <summary>
        /// 对一个Node的ID和ClassName打分,越大越好
        /// </summary>
        /// <param name="Node"></param>
        /// <param name="Must">如果命中则1分</param>
        /// <param name="MustNot">如果命中则0分</param>
        /// <param name="Matched">是否命中任何类名</param>
        /// <param name="UpLevel">向上追溯几个级别(不含自身),上一级得分对折累加在自己的得分上</param>
        /// <returns></returns>
        static double IDClassNameScore(HtmlNode Node, IEnumerable <string> MustKeywords, IEnumerable <string> MustNotKeywords, out bool Matched, int UpLevel = 2)
        {
            Matched = false;
            if (Node == null)
            {
                return(0);
            }
            int    Level      = 0;
            double Score      = 0;
            double LevelScore = 1;

            do
            {
                bool Must    = XPathUtility.IDClassContain(Node, MustKeywords);
                bool MustNot = XPathUtility.IDClassContain(Node, MustNotKeywords);
                if (Must && !MustNot)
                {
                    Score += LevelScore * 1.5;     //命中正面关键词且无负面
                }
                if (Must && MustNot)
                {
                    Score += LevelScore * 0.5;   //正负面都命中
                }
                if (!Must && MustNot)
                {
                    if (Level == 0) //只有负面
                    {
                        Matched = true;
                        return(0);   //第一级则直接返回不可能
                    }
                    else
                    {
                        Score -= LevelScore;
                    }
                }

                if (Must || MustNot) //任何命中要有标志
                {
                    Matched = true;
                }

                //向上一级
                LevelScore *= 0.8;
                Node        = Node.ParentNode;
            } while (!XPathUtility.isTopNode(Node) && Level++ < UpLevel);

            //什么都没有命中保持1
            return(Score);
        }
Beispiel #5
0
        /// <summary>
        /// 检查Node的名称标签,对H1之类加权
        /// </summary>
        /// <param name="Node"></param>
        /// <param name="UpLevel">向上追溯几个级别(不含自身)</param>
        /// <returns>1表示没有命中,>1表示命中,及加分</returns>
        static double HtmlTagNameScore(HtmlNode Node, int UpLevel = 2)
        {
            if (Node == null)
            {
                return(1);
            }
            int Level = 0;

            while (!XPathUtility.isTopNode(Node) && Level++ < UpLevel)
            {
                if (HighLevelTagName.Contains(Node.Name.ToLower()))
                {
                    return(1.5);
                }
                else
                {
                    Node = Node.ParentNode;
                }
            }

            return(1);
        }
Beispiel #6
0
        /// <summary>
        /// 根据相对路径XPath从单一Item的BaseNode节点提取某一个字段的Node的InnerText
        /// </summary>
        /// <param name="BaseNode">一个Item的根节点</param>
        /// <param name="RelXPath">相对XPath路径</param>
        /// <param name="CleanConnectionMark">是否清洗文本</param>
        /// <returns></returns>
        internal static string ExtractInnerTextFromBaseNode(HtmlNode BaseNode, string RelXPath, int postion, bool CleanConnectionMark = true)
        {
            if (BaseNode == null)
            {
                return(null);
            }

            if (string.IsNullOrWhiteSpace(RelXPath) && postion == 0)
            {
                if (CleanConnectionMark)
                {
                    return(TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(BaseNode)));
                }
                else
                {
                    return(TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(BaseNode), true, true, true, false, true, false));
                }
            }

            IEnumerable <HtmlNode> MatchNodes = BaseNode.SelectNodes(RelXPath);

            if (MatchNodes != null)
            {
                MatchNodes = MatchNodes.Where(n => !string.IsNullOrEmpty(XPathUtility.InnerTextNonDescendants(n)));
            }
            if (!string.IsNullOrWhiteSpace(RelXPath) && (MatchNodes == null || MatchNodes.Count() <= postion))
            {
                return(null);
            }

            if (CleanConnectionMark)
            {
                return(TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(MatchNodes.ElementAt(postion))));
            }
            else
            {
                return(TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(MatchNodes.ElementAt(postion)), true, true, true, false, true, false));
            }
        }
Beispiel #7
0
 /// <summary>
 /// 验证标题是否合法
 /// </summary>
 /// <param name="Title"></param>
 /// <returns></returns>
 public bool ValidateTitle(HtmlNode Title)
 {
     return(ValidateTitle(XPathUtility.InnerTextNonDescendants(Title)) && !(Title.Attributes["href"] == null) && HTMLCleaner.isUrlGood(Title.Attributes["href"].Value));
 }
Beispiel #8
0
        /// <summary>
        /// Author节点的打分函数(越大越好)
        /// </summary>
        /// <param name="Nodes"></param>
        /// <param name="Pattern"></param>
        /// <param name="Strategy"></param>
        /// <param name="BaseItemCount"></param>
        /// <returns></returns>
        internal static double AuthorNodeRelPatternScore(IEnumerable <HtmlNode> Nodes, ListStrategy Strategy, int BaseItemCount, int PathLevel, bool PathUsingName)
        {
            double AvgPossibility = 0;

            foreach (HtmlNode Node in Nodes)
            {
                string Text = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(Node));
                if (Text.Length > Strategy.MaxLenAuthor)
                {
                    continue;
                }

                double Posibility = Strategy.MediaType == Enums.MediaType.Forum ? 0.2 : 0;

                //检查是否命中日期或点击数的特殊字符,还不能全部排除,可能有特殊人名,本节点的可能性记为0继续下一个
                if (TextCleaner.CountCharInSet(Text, FrequentCharSet_MustNotMediaAuthorDate, false) > 0)
                {
                    Posibility = 0.2;
                }

                //是否是日期时间
                if (TextCleaner.CountCharInSet(Text, FrequentCharSet_Dates, false) > Text.Length * 0.6)
                {
                    Posibility *= 0.3;
                }

                //检查开头
                Match PrefixMatch = Regex.Match(Text, @"作\s*者|选\s*稿|编\s*辑|记\s*者");
                if (PrefixMatch.Success)
                {
                    Text = Text.Substring(PrefixMatch.Index + PrefixMatch.Length).TrimStart(':', ':', ']', '】', ' ');

                    //在第一个空格处截断
                    int SpaceIndex = Text.IndexOf(' ');
                    if (SpaceIndex > 0)
                    {
                        Text = Text.Substring(0, SpaceIndex);
                    }

                    Posibility = 1;
                }

                //禁用词替换
                Text = TextCleaner.RemoveStopWords(Text, StopWords);

                //检查Top10姓氏命中
                if (Regex.IsMatch(Text, @"[李王张刘陈杨赵黄周吴]"))
                {
                    if (Posibility > 0)
                    {
                        Posibility *= 1.1;
                    }
                    else
                    {
                        Posibility = 0.6;
                    }
                }

                //根据class和id是否有特殊字样加权
                bool   IDClassNameMatched;
                double IDClassScore = IDClassNameScore(Node, AuthorClassNames_Must, AuthorClassNames_MustNot, out IDClassNameMatched);
                if (IDClassNameMatched)
                {
                    Posibility = (Posibility == 0 ? 1 : Posibility) * IDClassScore;
                }

                //ID或Class中出现Reply则为0(只要主贴作者)
                IDClassNameScore(Node, ReplyClassNames_Must, null, out IDClassNameMatched);
                if (IDClassNameMatched) //有一个命中reply,这个标志就被设定了
                {
                    Posibility = 0;
                }

                //长度检查
                if (Text.Length * 3 < Strategy.List_BestAvgAuthorLen || Text.Length > Strategy.List_BestAvgAuthorLen * 3)
                {
                    Posibility /= 3; //超过3倍降权到1/3
                }
                else
                {
                    //Rate在1-3之间
                    double Rate = Text.Length >= Strategy.List_BestAvgAuthorLen ? Text.Length / (double)Strategy.List_BestAvgAuthorLen
                        : Strategy.List_BestAvgAuthorLen / (double)Text.Length;
                    //随着Rate提高降低
                    Posibility *= (0.4 + 0.35 * (3 - Rate));
                }

                //论坛出现英文字母、数字或下划线,+50%可能性
                if (Strategy.MediaType == Enums.MediaType.Forum && Regex.IsMatch(Text, @"[a-z,0-9,_]+", RegexOptions.IgnoreCase))
                {
                    Posibility *= 1.5;
                }

                //非名称分隔符(包含空格)出现,6折
                if (TextCleaner.CountCharInSet(Text, NoneNameSeperatorSet, Strategy.Language == Language.CHINESE) > 0)
                {
                    Posibility *= 0.6;
                }

                //累加节点的可能性
                AvgPossibility += Posibility;
            }

            //微调1:级别约小的越好,2级以上每多一个级别扣10%可能性
            if (PathLevel > 1)
            {
                AvgPossibility *= (1 - 0.1 * (PathLevel - 1));
            }

            //微调2:用ID和class的好于用序号的,增加20%可能性
            if (PathUsingName)
            {
                AvgPossibility *= 1.2;
            }

            return(AvgPossibility / Nodes.Count());
        }
Beispiel #9
0
        /// <summary>
        /// 媒体名称节点的打分函数(越大越好)
        /// </summary>
        /// <param name="Nodes"></param>
        /// <param name="Pattern"></param>
        /// <param name="Strategy"></param>
        /// <param name="BaseItemCount"></param>
        /// <returns></returns>
        internal static double MediaNodeRelPatternScore(IEnumerable <HtmlNode> Nodes, ListStrategy Strategy, int BaseItemCount, int PathLevel, bool PathUsingName)
        {
            double AvgPossibility = 0;

            foreach (HtmlNode Node in Nodes)
            {
                string Text = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(Node));
                if (Text.Length > Strategy.MaxLenMedia)
                {
                    continue;
                }

                double Posibility = Strategy.MediaType == MediaType.WebNews ? 0.1 : 0;

                //检查是否命中日期或点击数的特殊字符,还不能全部排除,可能有特殊的报纸名称哦,本节点的可能性记为0继续下一个
                if (TextCleaner.CountCharInSet(Text, FrequentCharSet_MustNotMediaAuthorDate, false) > 1)
                {
                    Posibility = 0.2;
                }

                //是否是日期时间
                if (TextCleaner.CountCharInSet(Text, FrequentCharSet_Dates, false) > Text.Length * 0.6)
                {
                    Posibility *= 0.3;
                }

                //检查开头
                Match PrefixMatch = Regex.Match(Text, MediaPrefixRegex);
                if (PrefixMatch.Success)
                {
                    Text       = Text.Substring(PrefixMatch.Index + PrefixMatch.Length).TrimStart(':', ':', ']', '】', ' ');
                    Posibility = 1;
                }

                //禁用词替换
                Text = TextCleaner.RemoveStopWords(Text, StopWords);

                //检查结尾
                if (Regex.IsMatch(Text, @"[报台网刊]"))
                {
                    if (Posibility > 0)
                    {
                        Posibility = Math.Max(1, Posibility * 1.5);
                    }
                    else
                    {
                        Posibility = 0.8;
                    }
                }

                //根据class和id是否有特殊字样加权
                bool   IDClassNameMatched;
                double IDClassScore = IDClassNameScore(Node, MediaClassNames_Must, MediaClassNames_MustNot, out IDClassNameMatched);
                if (IDClassNameMatched)
                {
                    Posibility = (Posibility == 0 ? 1 : Posibility) * IDClassScore;
                }

                //长度检查
                if (Text.Length * 3 < Strategy.List_BestAvgMediaLen || Text.Length > Strategy.List_BestAvgMediaLen * 3)
                {
                    Posibility /= 3; //超过3倍降权到1/3
                }
                else
                {
                    //Rate在1-3之间
                    double Rate = Text.Length >= Strategy.List_BestAvgMediaLen ? Text.Length / (double)Strategy.List_BestAvgMediaLen
                        : Strategy.List_BestAvgMediaLen / (double)Text.Length;
                    //随着Rate提高降低
                    Posibility *= (0.4 + 0.35 * (3 - Rate));
                }

                //非名称分隔符(包含空格)出现,6折
                if (TextCleaner.CountCharInSet(Text, NoneNameSeperatorSet, true) > 0)
                {
                    Posibility *= 0.6;
                }

                //英文出现,再6折
                if (LanguageUtility.DetectedLanguage(Text) == Enums.Language.ENGLISH)
                {
                    Posibility *= 0.6;
                }

                //累加节点的可能性
                AvgPossibility += Posibility;
            }

            //微调1:级别约小的越好,2级以上每多一个级别扣10%可能性
            if (PathLevel > 1)
            {
                AvgPossibility *= (1 - 0.1 * (PathLevel - 1));
            }

            //微调2:用ID和class的好于用序号的,增加20%可能性
            if (PathUsingName)
            {
                AvgPossibility *= 1.2;
            }

            //论坛媒体减分
            if (Strategy.MediaType == Enums.MediaType.Forum)
            {
                AvgPossibility /= 2;
            }

            return(AvgPossibility / Nodes.Count());
        }
Beispiel #10
0
        /// <summary>
        /// ViewReply节点的打分函数(越大越好)
        /// </summary>
        /// <param name="Nodes"></param>
        /// <param name="Pattern"></param>
        /// <param name="Strategy"></param>
        /// <param name="BaseItemCount"></param>
        /// <param name="Contain2Numbers">元素中是否包含了2个数字</param>
        /// <param name="MustBeReply">是否一定是评论(class或内容有明确指示的情况)</param>
        /// <param name="AvgNumber">平均数,用于比较是View或Reply;如果在一个Element中有两个数字,则为长度2的数组,均值大的是第一个</param>
        /// <returns></returns>
        internal static double ViewNodeRelPatternScore(IEnumerable <HtmlNode> Nodes, ListStrategy Strategy, int BaseItemCount, int PathLevel, bool PathUsingName, out bool Contain2Numbers, out bool MustBeReply, out double[] AvgNumber)
        {
            Contain2Numbers = false;
            MustBeReply     = false;
            AvgNumber       = null;

            //平均字符密度的检查
            double AvgCharFreq = 0;

            foreach (HtmlNode Node in Nodes)
            {
                string Text = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(Node), true, true, true, false, true, false);
                if (Text.Length > Strategy.MaxLenView)
                {
                    continue;
                }

                //检查是否命中日期的特殊字符
                if (TextCleaner.CountCharInSet(Text, FrequentCharSet_MustDate, false) > 0)
                {
                    return(0);
                }
                //检查字符集密度, 不合格直接可能性为0
                int CharCount = TextCleaner.CountCharInSet(Text, FrequentCharSet_ViewReply);
                AvgCharFreq += CharCount / (double)Text.Length;
            }
            AvgCharFreq = AvgCharFreq / Nodes.Count();
            if (AvgCharFreq < FrequentChars_Min || AvgCharFreq > FrequentChars_Max)
            {
                return(0);
            }

            //尝试提取其中的连续数字再解析,统计每个Text有几个数字,及其极值分布
            int        TotalCount = 0;
            int        ParsedNode = 0;
            List <int> Ints0      = new List <int>(BaseItemCount);
            List <int> Ints1      = new List <int>(BaseItemCount);

            int    ViewNameMatched = 0;
            double IDClassScore    = 0;

            foreach (HtmlNode Node in Nodes)
            {
                string Text = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(Node), true, true, true, false, true, false);
                if (Text.Length > Strategy.MaxLenView)
                {
                    continue;
                }

                MatchCollection digiText = Regex.Matches(Text, @"\d{1,9}");

                if (digiText.Count == 0)
                {
                    continue;
                }
                //数量超过2则不可能
                if (digiText.Count > 2)
                {
                    return(0);
                }

                TotalCount += digiText.Count;
                //记录每一个Parse出来的int
                int Val0 = int.Parse(digiText[0].Captures[0].Value);
                Ints0.Add(Val0);
                if (digiText.Count > 1)
                {
                    Ints1.Add(int.Parse(digiText[1].Captures[0].Value));
                }

                //根据class和id是否有特殊字样加权
                bool   IDClassNameMatched;
                double s = IDClassNameScore(Node, ViewClassNames_Must, ViewClassNames_MustNot, out IDClassNameMatched);
                if (IDClassNameMatched)
                {
                    IDClassScore += s;
                    ViewNameMatched++;

                    if (s > 0) //正面命中的话,看看是不是reply
                    {
                        IDClassNameScore(Node, ReplyClassNames_Must, null, out IDClassNameMatched);
                    }
                    if (IDClassNameMatched) //有一个命中reply,这个标志就被设定了
                    {
                        MustBeReply = true;
                    }
                }

                //根据内容关键词来判断是否MustBeReply
                foreach (string ReplyKeyword in ReplyWords)
                {
                    if (Text.Contains(ReplyKeyword))
                    {
                        MustBeReply = true;
                    }
                }

                ParsedNode++;
            }


            //标志本元素包含两个数字
            if (Ints1.Count > 0 && Ints1.Count == Ints0.Count)
            {
                AvgNumber = new double[2];
                double Sum0 = Ints0.Sum();
                double Sum1 = Ints1.Sum();
                AvgNumber[0] = (Sum0 > Sum1 ? Sum0 : Sum1) / ParsedNode;
                AvgNumber[1] = (Sum0 > Sum1 ? Sum1 : Sum0) / ParsedNode;

                Contain2Numbers = true;
            }
            else if (Ints0.Count > 0 && ParsedNode > 0)
            {
                AvgNumber    = new double[1];
                AvgNumber[0] = Ints0.Sum() / ParsedNode;
            }
            else
            {
                AvgCharFreq = 0;
            }

            //如果每个Text只有一个数字,加分20%-50%
            if (TotalCount == Nodes.Count())
            {
                AvgCharFreq *= (Ints0.Max() <= 31) ? 1.2 : 1.5;
            }
            else if (Ints0.Count > 0)
            {
                //有两个数字的检查极值
                if (Ints0.Max() <= 12 && Ints1.Max() <= 31 || Ints0.Max() <= 31 && Ints1.Max() <= 12)
                {
                    AvgCharFreq *= 0.8;
                }
                else //排除日期可能,得分翻倍
                {
                    AvgCharFreq *= 2;
                }
            }

            //根据class和id是否有特殊字样加权
            if (ViewNameMatched > 0)
            {
                IDClassScore /= ViewNameMatched;
                AvgCharFreq   = (AvgCharFreq == 0 ? 1 : AvgCharFreq) * IDClassScore;
            }

            //微调1:级别约小的越好,2级以上每多一个级别扣10%可能性
            if (PathLevel > 1)
            {
                AvgCharFreq *= (1 - 0.1 * (PathLevel - 1));
            }

            //微调2:用ID和class的好于用序号的,增加20%可能性
            if (PathUsingName)
            {
                AvgCharFreq *= 1.2;
            }

            return(AvgCharFreq);
        }
Beispiel #11
0
        /// <summary>
        /// 日期节点Pattern的评估函数(考虑时差和数量,可以理解为0-1之间的可能性数值,越大越好)
        /// </summary>
        /// <param name="Nodes"></param>
        /// <param name="Pattern"></param>
        /// <param name="Strategy"></param>
        /// <param name="BaseItemCount"></param>
        /// <returns></returns>
        internal static double DateNodeRelPatternScore(IEnumerable <HtmlNode> Nodes, ListStrategy Strategy, int BaseItemCount, int PathLevel, bool PathUsingName)
        {
            //直接用DateTimeParser.Parser了,所以不用考察字符密度了
            double SumDiff    = 0;
            int    ParseCount = 0;

            //所有日期和Now的差距综合
            foreach (HtmlNode Node in Nodes)
            {
                string Text = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(Node), true, true, true);
                if (Text.Length > Strategy.MaxLenDate)
                {
                    continue;
                }

                DateTime?Val = DateTimeParser.Parser(Text);
                if (Val != null)
                {
                    ParseCount++;
                    DateTime d = (DateTime)Val;

                    //根据class和id是否有特殊字样加权
                    bool   IDClassNameMatched;
                    double IDClassScore = IDClassNameScore(Node, DateClassNames_Must, DateClassNames_MustNot, out IDClassNameMatched);

                    if (d.Hour == 0 && d.Minute == 0 && d.Second == 0)
                    {
                        //只有日期,得分为距离Now的时差
                        SumDiff += Math.Abs((DateTime.Now - d).TotalDays / (IDClassNameMatched ? IDClassScore : 1));
                    }
                    else
                    {
                        //有精确时间,则只计入一半的时差
                        SumDiff += Math.Abs((DateTime.Now - d).TotalDays / 2 / (IDClassNameMatched ? IDClassScore : 1));
                    }
                }
            }

            double Possibility = 0;

            //先计算数量的可能性,在1/3-3倍之间都可以,可能性等比递减
            if (ParseCount * 3 < BaseItemCount || BaseItemCount * 3 < ParseCount)
            {
                return(0);
            }
            if (ParseCount >= BaseItemCount)
            {
                Possibility = 1 / (ParseCount / BaseItemCount); //倍数的倒数
            }
            else
            {
                Possibility = 1 / (BaseItemCount / ParseCount);
            }

            //根据平均时差来计算可能性
            double AvgDiff = SumDiff / ParseCount;

            if (AvgDiff > 180) //180以后统一为30%
            {
                Possibility *= 0.3;
            }
            else if (AvgDiff > 7) //1周以后为30%-100%,等比递减
            {
                Possibility *= (0.3 + 0.004046 * (180 - AvgDiff));
            }
            else if (AvgDiff > 1) //1周以内100%-150%
            {
                Possibility *= (1 + 0.08333 * (7 - AvgDiff));
            }
            else //1天以内的,150%-200%,便于区分原帖和评论的时间
            {
                Possibility *= (1.5 + 0.5 * (1 - AvgDiff));
            }

            //微调1:级别约小的越好,2级以上每多一个级别扣20%可能性
            if (PathLevel > 1)
            {
                Possibility *= (1 - 0.2 * (PathLevel - 1));
            }

            //微调2:用ID和class的好于用序号的,增加20%可能性
            if (PathUsingName)
            {
                Possibility *= 1.2;
            }

            return(Possibility);
        }
Beispiel #12
0
        public Feature GetFeature_ItemPage(IEnumerable <HtmlNode> Nodes, int ItemCount, Feature stencilfeature)
        {
            if (Nodes == null || ItemCount == 0)
            {
                return(null);
            }
            Feature feature = new Feature(0);

            feature.FigureFeatures["ItemCount"] = Nodes.Count();
            int[]    TextLen = new int[Nodes.Count()];
            int[]    DigiLen = new int[Nodes.Count()];
            double[] Diff    = new double[Nodes.Count()];
            int      i       = 0;

            int[] intone      = new int[Nodes.Count()];
            bool  havetwonums = true;
            int   DigitCount  = 0;

            foreach (HtmlNode node in Nodes)
            {
                string Text = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(node));
                TextLen[i] = Text.Length;
                DigiLen[i] = TextCleaner.CountDigitChars(Text);
                if (Nodes.Count() >= ItemCount * 0.8 && Nodes.Count() <= ItemCount * 1.2 && (stencilfeature.FigureFeatures["AvgDateDistance"] == 1 || stencilfeature.FigureFeatures["DateParseCount"] == 1 || stencilfeature.FigureFeatures["DateCountRate"] == 1) && Text.Length > 1)
                {
                    if (Text.Contains("秒前") && Text.Length < 5)
                    {
                        Text = "昨日";
                    }
                    DateTime?Val = DateTimeParser.Parser(Text);
                    if (Val != null)
                    {
                        double diff = Math.Abs((DateTime.Now - (DateTime)Val).TotalDays);
                        if (diff < 4096 && Text.Length < Threshold.MaxDateLength)
                        {
                            Diff[i] = diff;
                            feature.FigureFeatures["DateParseCount"] += 1;
                        }
                    }
                }
                string          Textfordigit = TextCleaner.FullClean(XPathUtility.InnerTextNonDescendants(node), true, true, true, false, true, false);
                MatchCollection digiText     = Regex.Matches(Textfordigit, @"\d{1,9}");
                switch (digiText.Count)
                {
                case 1:
                    DigitCount++;
                    intone[i]   = int.Parse(digiText[0].Captures[0].Value);
                    havetwonums = false;
                    break;

                case 2:
                    DigitCount++;
                    intone[i] = int.Parse(digiText[0].Captures[0].Value) - int.Parse(digiText[1].Captures[0].Value);
                    break;

                default:
                    havetwonums = false;
                    break;
                }
                if (TextLen.Sum() < 50)
                {
                    feature = CheckforChars(feature, Text, stencilfeature, false);
                }
                i += 1;
            }

            //ID和CLASS NAME的识别
            feature = CheckforIdorClassName(feature, Nodes, stencilfeature, false);
            if (stencilfeature.FigureFeatures["DigitCountRate"] == 1)
            {
                feature.FigureFeatures["DigitCountRate"] = 10 * DigitCount / ItemCount;
            }
            if (stencilfeature.FigureFeatures["AvgTextLen"] == 1)
            {
                feature.FigureFeatures["AvgTextLen"] = TextLen.Average();
            }
            if (stencilfeature.FigureFeatures["AllTextLen"] == 1)
            {
                feature.FigureFeatures["AllTextLen"] = TextLen.Sum();
            }
            if (stencilfeature.FigureFeatures["AvgDateDistance"] == 1)
            {
                feature.FigureFeatures["AvgDateDistance"] = Diff.Average();
            }
            intone = intone.Where(inton => inton > 0).ToArray();
            if (stencilfeature.FigureFeatures["AvgNumber"] == 1)
            {
                feature.FigureFeatures["AvgNumber"] = intone.Count() == 0 ? 0 : Math.Log(intone.Where(inton => inton > 0).Average(), 2);
            }
            if (stencilfeature.FigureFeatures["DateCountRate"] == 1 && feature.FigureFeatures["ItemCount"] != 0)
            {
                feature.FigureFeatures["DateCountRate"] = 10 * feature.FigureFeatures["DateParseCount"] / ItemCount;
            }
            if (stencilfeature.FigureFeatures["RateTitleDigits"] == 1)
            {
                feature.FigureFeatures["RateTitleDigits"] = TextLen.Sum() + DigiLen.Sum() == 0 ? 0 : 10 * (double)(DigiLen.Sum()) / (double)(TextLen.Sum() + DigiLen.Sum());
            }
            if (stencilfeature.BoolFeatures["twonuminregularshape"] == 1)
            {
                feature.BoolFeatures["twonuminregularshape"] = (havetwonums && (intone.Where(k => k > 0).Count() == 0 || intone.Where(k => k < 0).Count() == 0)) ? 0 : 1;
            }

            //曾经考虑过把数字特征的方差也统计进来,或者把标准差与平均值之比放进来。有用吗
            return(feature);
        }