Ejemplo n.º 1
0
        /// <summary>
        /// Scans the anchors of a page for a "next page" candidate and, when the
        /// chosen candidate differs from the current address, stores it in
        /// <paramref name="Url"/>.
        /// </summary>
        /// <param name="Html">Markup of the page to inspect.</param>
        /// <param name="Pattern">Not read by this method.</param>
        /// <param name="Url">
        /// On entry, the address of the current page; hrefs are resolved against it.
        /// On return, replaced by the link of the candidate picked by
        /// <c>TrianNextUrl</c> when that link differs from the incoming value.
        /// </param>
        /// <returns>
        /// <c>false</c> when the picked candidate's link equals the incoming
        /// <paramref name="Url"/>, or when <paramref name="Url"/> is empty after
        /// processing; <c>true</c> otherwise.
        /// </returns>
        public static bool validateNextPage(string Html, string Pattern, ref string Url)
        {
            HtmlNode htmlNode = HtmlUtility.getSafeHtmlRootNode(Html);

            // SelectNodes yields null (not an empty collection) when no anchor
            // matches, so fall back to an empty sequence instead of crashing.
            IEnumerable<HtmlNode> anchorNodes = htmlNode.SelectNodes("//a[@href]") ?? Enumerable.Empty<HtmlNode>();
            List<testNextUrl> candidates = new List<testNextUrl>();

            // 1-based position of the anchor among all anchors, counted whether
            // or not the anchor becomes a candidate.
            int position = 0;

            foreach (HtmlNode anchor in anchorNodes)
            {
                position++;

                // Candidate anchors are those whose text contains a digit, a
                // Chinese numeral, or the "page" character.
                if (!Regex.IsMatch(anchor.InnerText, @".*[一二三四五六七八九十\d页].*"))
                {
                    continue;
                }

                testNextUrl candidate = new testNextUrl();
                candidate.index   = position;
                candidate.urlText = HTMLCleaner.CleanHTML(anchor.InnerText, true);
                candidate.urlLink = HtmlUtility.AbsoluteUrl(anchor.Attributes["href"].Value, Url, true);
                candidates.Add(candidate);
            }

            if (candidates.Count > 0)
            {
                // Evaluate the selection once and reuse the result.
                string nextLink = TrianNextUrl(candidates).urlLink;

                if (Url.Equals(nextLink))
                {
                    // The best candidate points back at the current page.
                    return false;
                }

                Url = nextLink;
            }

            return Url.Length != 0;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Extracts a collection of articles from a list page using the per-field
        /// XPath expressions carried by <paramref name="Path"/>.
        /// </summary>
        /// <param name="Url">Address of the list page; item links are resolved against it.</param>
        /// <param name="RootNode">Root node of the parsed document.</param>
        /// <param name="Path">
        /// XPath pattern. <c>ItemRootXPath</c> selects the item containers; the other
        /// members are evaluated relative to each container. The pattern object is
        /// not modified by this method.
        /// </param>
        /// <returns>
        /// The articles that ended up with a non-blank URL. Empty when no container
        /// matches or when <c>TitleXPath</c> is blank.
        /// </returns>
        public static List<Article> ExtractItemFromList(string Url, HtmlNode RootNode, XpathPattern Path)
        {
            List<Article> Content = new List<Article>();

            // SelectNodes returns null when nothing matches.
            HtmlNodeCollection rootNodes = RootNode.SelectNodes(Path.ItemRootXPath);

            if (rootNodes == null || rootNodes.Count == 0 || string.IsNullOrWhiteSpace(Path.TitleXPath))
            {
                return Content;
            }

            // Links come from UrlXPath when it is configured, otherwise from
            // TitleXPath. Titles are read through the same XPath so that index i
            // addresses the same match for both. This is kept in a local: writing
            // it back into Path would blank TitleXPath whenever UrlXPath is blank
            // and would alter the caller's pattern object.
            string linkXPath = string.IsNullOrWhiteSpace(Path.UrlXPath) ? Path.TitleXPath : Path.UrlXPath;

            foreach (HtmlNode BaseNode in rootNodes)
            {
                // Normally each BaseNode holds one item, but some sites put several in one node.
                if (BaseNode.SelectNodes(Path.TitleXPath) == null)
                {
                    continue;
                }

                HtmlNodeCollection linkCandidates = BaseNode.SelectNodes(linkXPath);
                if (linkCandidates == null)
                {
                    continue;
                }

                // Nodes carrying an href attribute determine how many items this BaseNode yields.
                List<HtmlNode> hrefNodes = linkCandidates.Where(n => n.Attributes.Contains("href")).ToList();

                // Non-empty href values, filtered once instead of on every loop iteration.
                List<string> usableHrefs = hrefNodes
                    .Select(n => n.Attributes["href"].Value)
                    .Where(h => !string.IsNullOrEmpty(h))
                    .ToList();

                if (usableHrefs.Count == 0)
                {
                    continue;
                }

                int singleNodeItemCount = hrefNodes.Count;
                Article[] articleNodeItems = new Article[singleNodeItemCount];

                for (int i = 0; i < singleNodeItemCount; i++)
                {
                    Article item = new Article();
                    articleNodeItems[i] = item;

                    item.Title = ExtractInnerTextFromBaseNode(BaseNode, linkXPath, i);

                    // There may be fewer non-empty hrefs than href nodes; an item
                    // without a matching href keeps a null Url and is dropped below.
                    if (item.Title != null && i < usableHrefs.Count)
                    {
                        try
                        {
                            string href = usableHrefs[i];

                            if (href.Contains(".pdf"))
                            {
                                // PDF links are not articles; Url stays null so the item is dropped.
                                continue;
                            }

                            if (href.StartsWith("javascript:openArticle", StringComparison.Ordinal))
                            {
                                // Keep only the quoted argument of javascript:openArticle('...').
                                href = href.Substring(href.IndexOf("('", StringComparison.Ordinal) + 2);
                                href = href.Substring(0, href.IndexOf("')", StringComparison.Ordinal));
                            }

                            href = HtmlUtility.AbsoluteUrl(href, Url, true);

                            if (href.Contains('@'))
                            {
                                // Links containing '@' are skipped; Url stays null so the item is dropped.
                                continue;
                            }

                            item.Url = href;
                        }
                        catch (Exception)
                        {
                            // Best effort: a link that cannot be parsed or resolved
                            // leaves the item without a Url, which drops it below.
                            item.Url = null;
                        }
                    }

                    if (!string.IsNullOrWhiteSpace(Path.MediaNameXPath))
                    {
                        item.MediaName = ExtractInnerTextFromBaseNode(BaseNode, Path.MediaNameXPath, i);
                        item.MediaName = ExtractSegmentFromInnerText(item.MediaName, MediaPrefixRegex);
                        item.MediaName = HTMLCleaner.CleanMediaName(item.MediaName); // cleanup
                    }

                    if (!string.IsNullOrWhiteSpace(Path.AuthorXPath))
                    {
                        item.Author = ExtractInnerTextFromBaseNode(BaseNode, Path.AuthorXPath, i);
                        item.Author = ExtractSegmentFromInnerText(item.Author, AuthorPrefixRegex);
                        item.Author = HTMLCleaner.CleanAuthor(item.Author); // cleanup
                    }

                    if (!string.IsNullOrWhiteSpace(Path.DateXPath))
                    {
                        item.PubDate = DateTimeParser.Parser(ExtractInnerTextFromBaseNode(BaseNode, Path.DateXPath, i));
                    }

                    if (!string.IsNullOrWhiteSpace(Path.AbsTractXPath))
                    {
                        item.AbsTract = ExtractInnerTextFromBaseNode(BaseNode, Path.AbsTractXPath, i);
                    }

                    // View-count / reply-count extraction.
                    if (!string.IsNullOrWhiteSpace(Path.ViewXPath) || !string.IsNullOrWhiteSpace(Path.ReplyXPath))
                    {
                        ViewData currentViewData = new ViewData();
                        currentViewData.FetchTime = DateTime.Now;

                        // This branch is also entered for reply-only patterns, so
                        // only query the view XPath when it is actually configured.
                        string ViewString = string.IsNullOrWhiteSpace(Path.ViewXPath)
                            ? string.Empty
                            : ExtractInnerTextFromBaseNode(BaseNode, Path.ViewXPath, i, false);

                        if (!string.IsNullOrEmpty(ViewString))
                        {
                            // At most 9 digits per match, so int.Parse cannot overflow.
                            MatchCollection viewDigits = Regex.Matches(ViewString, @"\d{1,9}");
                            if (viewDigits.Count == 1)
                            {
                                currentViewData.View = int.Parse(viewDigits[0].Value);
                            }
                            else if (viewDigits.Count > 1 && Path.ViewXPath == Path.ReplyXPath)
                            {
                                // View and Reply share one cell; several numbers are likely
                                // here, so this setup is not recommended. The larger of the
                                // first two numbers is taken as the view count.
                                int a = int.Parse(viewDigits[0].Value);
                                int b = int.Parse(viewDigits[1].Value);
                                currentViewData.View  = a >= b ? a : b;
                                currentViewData.Reply = a >= b ? b : a;
                            }
                        }

                        // Reply count from its own XPath.
                        if (!string.IsNullOrEmpty(Path.ReplyXPath) && Path.ViewXPath != Path.ReplyXPath)
                        {
                            string ReplyString = ExtractInnerTextFromBaseNode(BaseNode, Path.ReplyXPath, i, false);
                            if (!string.IsNullOrEmpty(ReplyString))
                            {
                                MatchCollection replyDigits = Regex.Matches(ReplyString, @"\d{1,9}");
                                if (replyDigits.Count > 0)
                                {
                                    currentViewData.Reply = int.Parse(replyDigits[0].Value);
                                }
                            }
                        }

                        if (item.ViewDataList == null)
                        {
                            item.ViewDataList = new List<ViewData>();
                        }

                        item.ViewDataList.Add(currentViewData);
                    }
                }

                // Only items that ended up with a usable Url are returned.
                Content.AddRange(articleNodeItems.Where(f => !string.IsNullOrWhiteSpace(f.Url)));
            }

            return Content;
        }