예제 #1
0
        /// <summary>
        /// Checks whether a list pattern can plausibly be applied to a page supplied as raw HTML.
        /// This is a coarse sanity check that only rules out obviously impossible matches.
        /// Sample path carried over from the original comment:
        /// /html/body/div[4]/div[1]/div[6]/div[1]/ul/div[1]/div[2]/ol/li/a
        /// The original author notes that this method and the node-based overload were copied from ListStrategy.
        /// </summary>
        /// <param name="Url">Address of the page being checked; a null or empty value makes the check fail.</param>
        /// <param name="HTML">Page source that is parsed into a root node.</param>
        /// <param name="XPath">Pattern whose ItemRootXPath and TitleXPath are probed; null makes the check fail.</param>
        /// <returns>
        /// true when the title nodes score above the threshold and the node-based overload also accepts
        /// the page; false otherwise, including when any required input is missing.
        /// </returns>
        public bool ValidateListXPath(string Url, string HTML, XpathPattern XPath)
        {
            // The node-based overload rejects a missing Url or pattern anyway, so bail out before
            // anything is dereferenced (previously a null Url or XPath could throw here).
            if (string.IsNullOrEmpty(Url) || XPath == null)
            {
                return(false);
            }

            // Neither query below can run with a blank expression.
            if (string.IsNullOrWhiteSpace(XPath.ItemRootXPath) || string.IsNullOrWhiteSpace(XPath.TitleXPath))
            {
                return(false);
            }

            // Get the root node (some sites omit the html tag and start directly at head).
            HtmlNode rootNode = HtmlUtility.getSafeHtmlRootNode(HTML, true, true);

            if (rootNode == null)
            {
                return(false);
            }

            HtmlNodeCollection rootNodes = rootNode.SelectNodes(XPath.ItemRootXPath);

            if (rootNodes == null)
            {
                return(false);
            }

            // One title node per item root; roots without a title are dropped.
            // Materialised once so the query is not re-run for every Count()/First() call.
            List<HtmlNode> foundTitles = rootNodes
                                         .Select(f => f.SelectSingleNode(XPath.TitleXPath))
                                         .Where(f => f != null)
                                         .ToList();

            if (foundTitles.Count == 0)
            {
                return(false);
            }

            // Only titles that actually carry text take part in the scoring.
            List<HtmlNode> TitleNodes = foundTitles.Where(a => !string.IsNullOrEmpty(a.InnerText)).ToList();
            double         Score      = ScoreforListTitle(TitleNodes);

            // tieba.baidu.com pages are accepted with a fixed lower bound of 100 instead of the general threshold.
            return((Score > Threshold.LeastTitleScore || (Url.Contains("tieba.baidu.com") && Score > 100)) && ValidateListXPath(Url, rootNode, XPath));
        }
예제 #2
0
        /// <summary>
        /// Extracts list items when no parsed HtmlNode is available yet. If Html is not supplied either,
        /// the page source is downloaded through the default HttpHelper.
        /// </summary>
        /// <param name="Url">Page address; used to download the source when Html is blank and passed on to the node-based overload.</param>
        /// <param name="Pattern">XPath pattern handed to the node-based overload.</param>
        /// <param name="Html">Page source; when null or whitespace it is fetched with HttpHelper.GetHttpContent(Url).</param>
        /// <returns>
        /// The articles produced by the node-based overload, or an empty list when Pattern is null
        /// or no root node could be obtained from the page source.
        /// </returns>
        public static List <Article> ExtractItemFromList(string Url, XpathPattern Pattern, string Html = null)
        {
            // Nothing can be extracted without a pattern; checked first so no download is wasted.
            if (Pattern == null)
            {
                return(new List<Article>());
            }

            if (string.IsNullOrWhiteSpace(Html))
            {
                Html = HttpHelper.GetHttpContent(Url);
            }

            HtmlNode rootNode = HtmlUtility.getSafeHtmlRootNode(Html, true, true);

            // The root node can come back null (other callers in this file check for it);
            // report "no items" instead of passing the null on.
            if (rootNode == null)
            {
                return(new List<Article>());
            }

            return(ExtractItemFromList(Url, rootNode, Pattern));
        }
예제 #3
0
        /// <summary>
        /// Parses a list page: deserializes the JSON pattern and extracts the articles it matches from the page source.
        /// </summary>
        /// <param name="Html">Source of the list page.</param>
        /// <param name="Pattern">JSON text that is deserialized into an XpathPattern.</param>
        /// <param name="Url">Page address; passed on to the extraction and included in the error log.</param>
        /// <param name="RecogNextPage">Not read by this method; kept so existing callers keep compiling.</param>
        /// <returns>
        /// null when Html or Pattern is blank or the pattern cannot be deserialized; otherwise an
        /// ArticleList holding the extracted articles and their count (empty when the page yields no root node).
        /// </returns>
        public static ArticleList ParseList(string Html, string Pattern, string Url = null, bool RecogNextPage = true)
        {
            // Input check
            if (string.IsNullOrWhiteSpace(Html) || string.IsNullOrWhiteSpace(Pattern))
            {
                return(null);
            }

            // Check that the pattern text has the expected format
            XpathPattern xpathPattern = null;

            try
            {
                xpathPattern = JsonConvert.DeserializeObject<XpathPattern>(Pattern);
            }
            catch (Exception ex)
            {
                Logger.Error(string.Format("Pattern 的格式不符合 Xpath Parser 的定义,请检查!Url:{0}, Pattern:{1}.", Url, Pattern), ex);
            }

            // An unusable pattern is treated like invalid input; continuing would dereference null.
            if (xpathPattern == null)
            {
                return(null);
            }

            HtmlNode htmlNode = HtmlUtility.getSafeHtmlRootNode(Html, true, true);

            // A page without a root node simply contributes no articles.
            List<Article> articles = htmlNode == null
                                     ? new List<Article>()
                                     : ExtractItemFromList(Url, htmlNode, xpathPattern);

            ArticleList articleList = new ArticleList();

            articleList.Articles = articles;
            // Count mirrors the extracted articles (the original assigned the property to itself).
            articleList.Count = articles?.Count ?? 0;

            return(articleList);
        }
예제 #4
0
        /// <summary>
        /// Checks whether a list pattern can plausibly be applied to an already parsed page.
        /// This is a coarse sanity check that only rules out obviously impossible matches.
        /// </summary>
        /// <param name="Url">Address of the page; null or empty fails the check.</param>
        /// <param name="RootNode">Root node of the parsed page; null fails the check.</param>
        /// <param name="XPath">Pattern to validate; null fails the check.</param>
        /// <returns>
        /// true when at least three items are extracted and every configured field is filled on a large
        /// enough share of them (title+url, date, media and author 90%, view 40%, reply 10%); false otherwise.
        /// </returns>
        public bool ValidateListXPath(string Url, HtmlNode RootNode, XpathPattern XPath)
        {
            bool missingInput = RootNode == null || XPath == null || string.IsNullOrEmpty(Url);

            if (missingInput)
            {
                return(false);
            }

            List<Article> items = XpathParser.ExtractItemFromList(Url, RootNode, XPath);

            // Fewer than three extracted items is not considered a list.
            if (items == null || items.Count < 3)
            {
                return(false);
            }

            int total = items.Count;

            // Title and url are always required.
            int withTitle = items.Count(a => !string.IsNullOrEmpty(a.Title) && !string.IsNullOrEmpty(a.Url));

            if (withTitle < total * 0.9)
            {
                return(false);
            }

            // Every remaining field is only checked when the pattern defines an XPath for it.
            if (!string.IsNullOrEmpty(XPath.DateXPath))
            {
                // NOTE(review): if Article.PubDate is a non-nullable DateTime this comparison is always
                // true and the date check can never fail - confirm against the Article type.
                int withDate = items.Count(a => a.PubDate != null);

                if (withDate < total * 0.9)
                {
                    return(false);
                }
            }

            if (!string.IsNullOrEmpty(XPath.ViewXPath))
            {
                // Only the first ViewData entry of each article is looked at.
                int withView = items.Count(a => a.ViewDataList?.FirstOrDefault()?.View >= 0);

                if (withView < total * 0.4)
                {
                    return(false);
                }
            }

            if (!string.IsNullOrEmpty(XPath.ReplyXPath))
            {
                int withReply = items.Count(a => a.ViewDataList?.FirstOrDefault()?.Reply >= 0);

                if (withReply < total * 0.1)
                {
                    return(false);
                }
            }

            if (!string.IsNullOrEmpty(XPath.MediaNameXPath))
            {
                int withMedia = items.Count(a => !string.IsNullOrEmpty(a.MediaName));

                if (withMedia < total * 0.9)
                {
                    return(false);
                }
            }

            if (!string.IsNullOrEmpty(XPath.AuthorXPath))
            {
                int withAuthor = items.Count(a => !string.IsNullOrEmpty(a.Author));

                if (withAuthor < total * 0.9)
                {
                    return(false);
                }
            }

            return(true);
        }
예제 #5
0
        /// <summary>
        /// Completes an article from its detail page: content, title, media name and author are filled in
        /// when still empty, and the publication time is replaced when the page offers a better value.
        /// </summary>
        /// <param name="Html">Source of the detail page.</param>
        /// <param name="Pattern">JSON text that is deserialized into an XpathPattern.</param>
        /// <param name="Url">Page address; passed to the content cleaner and included in error logs.</param>
        /// <param name="BaseArticle">Article to complete; fields that already hold a value are left untouched (except PubDate, see below).</param>
        /// <returns>
        /// false when Html or Pattern is blank, BaseArticle is null, the pattern cannot be deserialized or the
        /// page yields no root node; true otherwise, even if individual fields failed to parse (those failures are logged).
        /// </returns>
        public static bool ParseItem(string Html, string Pattern, string Url, ref Article BaseArticle)
        {
            // Input check
            if (string.IsNullOrWhiteSpace(Html) || string.IsNullOrWhiteSpace(Pattern) || BaseArticle == null)
            {
                return(false);
            }

            // Check that the pattern text has the expected format
            XpathPattern xpathPattern = null;

            try
            {
                xpathPattern = JsonConvert.DeserializeObject<XpathPattern>(Pattern);
            }
            catch (Exception ex)
            {
                Logger.Error(string.Format("Pattern 的格式不符合 Xpath Parser 的定义,请检查!Url:{0}, Pattern:{1}.", Url, Pattern), ex);
            }

            // Without a usable pattern there is nothing to apply; continuing would dereference null.
            if (xpathPattern == null)
            {
                return(false);
            }

            HtmlNode itempagenode = HtmlUtility.getSafeHtmlRootNode(Html, true, true);

            // Without a root node every extraction below would only throw and be logged.
            if (itempagenode == null)
            {
                return(false);
            }

            // Extract the article body
            if (string.IsNullOrEmpty(BaseArticle.HtmlContent) && !string.IsNullOrWhiteSpace(xpathPattern.ItemContentXPath))
            {
                try
                {
                    BaseArticle.HtmlContent = HTMLCleaner.CleanContent(itempagenode.SelectNodes(xpathPattern.ItemContentXPath), Url, true);
                    BaseArticle.Content     = HTMLCleaner.CleanHTML(BaseArticle.HtmlContent, false);
                }
                catch (Exception ex)
                {
                    Logger.Error(string.Format("从详情页解析正文出错,Url:{0}, Pattern:{1}.", Url, xpathPattern.ItemContentXPath), ex);
                }
            }

            // Title
            if (string.IsNullOrEmpty(BaseArticle.Title) && !string.IsNullOrWhiteSpace(xpathPattern.ItemTitleXPath))
            {
                try
                {
                    BaseArticle.Title = TextCleaner.FullClean(HTMLCleaner.GetCleanInnerText(itempagenode.SelectSingleNode(xpathPattern.ItemTitleXPath)));
                }
                catch (Exception ex)
                {
                    Logger.Error(string.Format("从详情页解析标题出错,Url:{0}, Pattern:{1}.", Url, xpathPattern.ItemTitleXPath), ex);
                }
            }

            // Publication time
            if (!string.IsNullOrWhiteSpace(xpathPattern.ItemDateXPath))
            {
                try
                {
                    DateTime Pubdate = DateTimeParser.Parser(HTMLCleaner.GetCleanInnerText(itempagenode.SelectSingleNode(xpathPattern.ItemDateXPath)));

                    // A page value is only ever accepted when its year is after 2000.
                    if (BaseArticle.PubDate <= DateTime.MinValue.AddYears(1) && Pubdate.Year > 2000) // existing time is too old (effectively unset)
                    {
                        BaseArticle.PubDate = Pubdate;
                    }
                    else if (BaseArticle.PubDate.Hour == 0 && BaseArticle.PubDate.Minute == 0 && (Pubdate.Hour != 0 || Pubdate.Minute != 0) && Pubdate.Year > 2000) // existing time has no hour/minute but the page value does
                    {
                        BaseArticle.PubDate = Pubdate;
                    }
                    else if (Pubdate.Year > 2000 && (Pubdate.Hour != 0 || Pubdate.Minute != 0) && (BaseArticle.PubDate - Pubdate) > new TimeSpan(0, 1, 59) && BaseArticle.PubDate >= DateTime.Now.AddMinutes(-10)) // existing time is within the last 10 minutes yet more than 1m59s later than the page value
                    {
                        BaseArticle.PubDate = Pubdate;
                    }
                }
                catch (Exception ex)
                {
                    // The original logged this as a title failure with the content XPath; report the date XPath instead.
                    Logger.Error(string.Format("从详情页解析时间出错,Url:{0}, Pattern:{1}.", Url, xpathPattern.ItemDateXPath), ex);
                }
            }

            // Media name
            if (string.IsNullOrEmpty(BaseArticle.MediaName) && !string.IsNullOrWhiteSpace(xpathPattern.ItemMediaNameXPath))
            {
                try
                {
                    BaseArticle.MediaName = TextCleaner.FullClean(HTMLCleaner.GetCleanInnerText(itempagenode.SelectSingleNode(xpathPattern.ItemMediaNameXPath)));
                }
                catch (Exception ex)
                {
                    Logger.Error(string.Format("从详情页解析媒体出错,Url:{0}, Pattern:{1}.", Url, xpathPattern.ItemMediaNameXPath), ex);
                }
            }

            // Author
            if (string.IsNullOrEmpty(BaseArticle.Author) && !string.IsNullOrWhiteSpace(xpathPattern.ItemAuthorXPath))
            {
                try
                {
                    BaseArticle.Author = TextCleaner.FullClean(HTMLCleaner.GetCleanInnerText(itempagenode.SelectSingleNode(xpathPattern.ItemAuthorXPath)));
                }
                catch (Exception ex)
                {
                    Logger.Error(string.Format("从详情页解析作者出错,Url:{0}, Pattern:{1}.", Url, xpathPattern.ItemAuthorXPath), ex);
                }
            }

            return(true);
        }
예제 #6
0
        /// <summary>
        /// Extracts the article entries of a list page using the per-field XPaths of the given pattern.
        /// </summary>
        /// <param name="Url">Page address; item links are made absolute against it.</param>
        /// <param name="RootNode">Root node of the parsed document.</param>
        /// <param name="Path">List pattern whose XPaths drive the extraction. It is only read, never modified.</param>
        /// <returns>
        /// The articles that have a title and a usable link, in document order. Links containing ".pdf"
        /// or '@' are skipped. The list is empty when RootNode or Path is null, when ItemRootXPath or
        /// TitleXPath is blank, or when nothing matches.
        /// </returns>
        public static List <Article> ExtractItemFromList(string Url, HtmlNode RootNode, XpathPattern Path)
        {
            List<Article> Content = new List<Article>();

            // Nothing can be located without a document, an item-root expression and a title expression.
            if (RootNode == null || Path == null ||
                string.IsNullOrWhiteSpace(Path.ItemRootXPath) || string.IsNullOrWhiteSpace(Path.TitleXPath))
            {
                return(Content);
            }

            HtmlNodeCollection rootNodes = RootNode.SelectNodes(Path.ItemRootXPath);

            if (rootNodes == null)
            {
                return(Content);
            }

            // Links come from UrlXPath when one is configured, otherwise from the title nodes.
            // The pattern itself is left untouched: the original overwrote Path.TitleXPath with
            // Path.UrlXPath here, which altered the caller's pattern and, with a blank UrlXPath,
            // made every root node after the first one be skipped.
            string linkXPath = string.IsNullOrWhiteSpace(Path.UrlXPath) ? Path.TitleXPath : Path.UrlXPath;

            foreach (HtmlNode BaseNode in rootNodes)
            {
                // Normally each BaseNode holds one item, but some sites put several into one node.
                if (BaseNode.SelectNodes(Path.TitleXPath) == null)
                {
                    continue;
                }

                HtmlNodeCollection linkMatches = BaseNode.SelectNodes(linkXPath);

                if (linkMatches == null)
                {
                    continue;
                }

                // Only nodes carrying a non-empty href can yield an item. Built once per BaseNode
                // instead of re-filtering for every index.
                List<HtmlNode> linkedNodes = linkMatches
                                             .Where(n => n.Attributes.Contains("href") && !string.IsNullOrEmpty(n.Attributes["href"].Value))
                                             .ToList();

                // Index i pairs the i-th linked node with index i passed to the field helpers,
                // exactly as the original did. NOTE(review): presumably the helper picks the i-th
                // match of the XPath - confirm against ExtractInnerTextFromBaseNode.
                for (int i = 0; i < linkedNodes.Count; i++)
                {
                    string title = ExtractInnerTextFromBaseNode(BaseNode, Path.TitleXPath, i);

                    if (title == null)
                    {
                        continue;
                    }

                    string itemUrl = ResolveItemUrl(linkedNodes[i].Attributes["href"].Value, Url);

                    // Items without a usable link are never returned, so skip the remaining fields.
                    if (string.IsNullOrWhiteSpace(itemUrl))
                    {
                        continue;
                    }

                    Article item = new Article();
                    item.Title = title;
                    item.Url   = itemUrl;

                    if (!string.IsNullOrWhiteSpace(Path.MediaNameXPath))
                    {
                        string media = ExtractInnerTextFromBaseNode(BaseNode, Path.MediaNameXPath, i);
                        media          = ExtractSegmentFromInnerText(media, MediaPrefixRegex);
                        item.MediaName = HTMLCleaner.CleanMediaName(media); // cleanup
                    }

                    if (!string.IsNullOrWhiteSpace(Path.AuthorXPath))
                    {
                        string author = ExtractInnerTextFromBaseNode(BaseNode, Path.AuthorXPath, i);
                        author      = ExtractSegmentFromInnerText(author, AuthorPrefixRegex);
                        item.Author = HTMLCleaner.CleanAuthor(author); // cleanup
                    }

                    if (!string.IsNullOrWhiteSpace(Path.DateXPath))
                    {
                        item.PubDate = DateTimeParser.Parser(ExtractInnerTextFromBaseNode(BaseNode, Path.DateXPath, i));
                    }

                    if (!string.IsNullOrWhiteSpace(Path.AbsTractXPath))
                    {
                        item.AbsTract = ExtractInnerTextFromBaseNode(BaseNode, Path.AbsTractXPath, i);
                    }

                    if (!string.IsNullOrWhiteSpace(Path.ViewXPath) || !string.IsNullOrWhiteSpace(Path.ReplyXPath))
                    {
                        if (item.ViewDataList == null)
                        {
                            item.ViewDataList = new List<ViewData>();
                        }

                        item.ViewDataList.Add(ExtractViewData(BaseNode, Path, i));
                    }

                    Content.Add(item);
                }
            }

            return(Content);
        }

        /// <summary>
        /// Turns a raw href into the absolute item URL, or null when the link must be skipped
        /// (contains ".pdf", contains '@' after resolution, or cannot be processed).
        /// </summary>
        private static string ResolveItemUrl(string href, string pageUrl)
        {
            try
            {
                if (href.Contains(".pdf"))
                {
                    return(null);
                }

                // Links of the form javascript:openArticle('...') carry the real target between (' and ').
                if (href.StartsWith("javascript:openArticle", StringComparison.Ordinal))
                {
                    href = href.Substring(href.IndexOf("('", StringComparison.Ordinal) + 2);
                    href = href.Substring(0, href.IndexOf("')", StringComparison.Ordinal));
                }

                // NOTE(review): the original resolved the URL twice in a row; kept as is because it
                // is not visible here whether AbsoluteUrl is idempotent - confirm and drop one call.
                href = HtmlUtility.AbsoluteUrl(href, pageUrl, true);
                href = HtmlUtility.AbsoluteUrl(href, pageUrl, true);

                return(href.Contains('@') ? null : href);
            }
            catch (Exception)
            {
                // Best effort: a link that cannot be processed only drops its own item.
                return(null);
            }
        }

        /// <summary>
        /// Reads the view and reply counters of the item at the given index into a new ViewData
        /// stamped with the current time.
        /// </summary>
        private static ViewData ExtractViewData(HtmlNode BaseNode, XpathPattern Path, int index)
        {
            ViewData currentViewData = new ViewData();

            currentViewData.FetchTime = DateTime.Now;

            // View count; only queried when a view XPath is configured.
            if (!string.IsNullOrWhiteSpace(Path.ViewXPath))
            {
                string ViewString = ExtractInnerTextFromBaseNode(BaseNode, Path.ViewXPath, index, false);

                if (!string.IsNullOrEmpty(ViewString))
                {
                    MatchCollection digiText = Regex.Matches(ViewString, @"\d{1,9}");

                    if (digiText.Count == 1)
                    {
                        currentViewData.View = int.Parse(digiText[0].Captures[0].Value);
                    }
                    else if (digiText.Count > 1 && Path.ViewXPath == Path.ReplyXPath) // view and reply share one cell; several numbers are likely here, not recommended
                    {
                        int a = int.Parse(digiText[0].Captures[0].Value);
                        int b = int.Parse(digiText[1].Captures[0].Value);

                        // Of the first two numbers, the larger is taken as views and the smaller as replies.
                        currentViewData.View  = a >= b ? a : b;
                        currentViewData.Reply = a >= b ? b : a;
                    }
                }
            }

            // Reply count from its own cell.
            if (!string.IsNullOrEmpty(Path.ReplyXPath) && Path.ViewXPath != Path.ReplyXPath)
            {
                string ReplyString = ExtractInnerTextFromBaseNode(BaseNode, Path.ReplyXPath, index, false);

                if (!string.IsNullOrEmpty(ReplyString))
                {
                    MatchCollection digiText = Regex.Matches(ReplyString, @"\d{1,9}");

                    if (digiText.Count > 0)
                    {
                        currentViewData.Reply = int.Parse(digiText[0].Captures[0].Value);
                    }
                }
            }

            return(currentViewData);
        }