/// <summary>
/// Looks for a pagination link among the anchors of <paramref name="Html"/> and, when one is
/// chosen, stores its absolute address in <paramref name="Url"/>.
/// </summary>
/// <param name="Html">Markup of the page to inspect.</param>
/// <param name="Pattern">Not used by this method; kept so existing callers keep compiling.</param>
/// <param name="Url">
/// On input, the address of the current page (also used as the base for relative hrefs).
/// On output, the link chosen by <c>TrianNextUrl</c> when a candidate was found; otherwise unchanged.
/// </param>
/// <returns>
/// <c>false</c> when the chosen link equals the incoming <paramref name="Url"/> or when
/// <paramref name="Url"/> ends up null/empty; <c>true</c> otherwise (including the case where
/// no candidate anchor was found and <paramref name="Url"/> is left untouched).
/// </returns>
public static bool validateNextPage(string Html, string Pattern, ref string Url)
{
    HtmlNode htmlNode = HtmlUtility.getSafeHtmlRootNode(Html);

    // SelectNodes yields null rather than an empty collection when no anchor matches,
    // so it must be checked before enumerating.
    HtmlNodeCollection anchors = htmlNode.SelectNodes("//a[@href]");

    List<testNextUrl> candidates = new List<testNextUrl>();
    if (anchors != null)
    {
        int position = 0;
        foreach (HtmlNode anchor in anchors)
        {
            // The index counts every anchor, not only the candidates.
            position++;

            // Keep only anchors whose text contains a digit, a Chinese numeral or the "page" character.
            if (!Regex.IsMatch(anchor.InnerText, @".*[一二三四五六七八九十\d页].*"))
            {
                continue;
            }

            testNextUrl candidate = new testNextUrl();
            candidate.index = position;
            candidate.urlText = HTMLCleaner.CleanHTML(anchor.InnerText, true);
            candidate.urlLink = HtmlUtility.AbsoluteUrl(anchor.Attributes["href"].Value, Url, true);
            candidates.Add(candidate);
        }
    }

    if (candidates.Count > 0)
    {
        // Evaluate the selection once instead of once per use.
        string nextLink = TrianNextUrl(candidates).urlLink;
        if (string.Equals(Url, nextLink))
        {
            return false;
        }

        Url = nextLink;
    }

    return !string.IsNullOrEmpty(Url);
}
/// <summary>
/// Extracts the article items of a list page by applying the field XPaths of
/// <paramref name="Path"/> underneath every node matched by its <c>ItemRootXPath</c>.
/// </summary>
/// <param name="Url">Address of the list page; used as the base when making item links absolute.</param>
/// <param name="RootNode">Root node of the parsed document.</param>
/// <param name="Path">XPath pattern describing where the item root and the individual fields are located. It is not modified.</param>
/// <returns>
/// The extracted articles, in document order. Only items with a non-blank resolved URL are
/// returned; links containing ".pdf" and resolved links containing '@' are excluded.
/// The list is empty when nothing matches or when <c>TitleXPath</c> is blank.
/// </returns>
public static List<Article> ExtractItemFromList(string Url, HtmlNode RootNode, XpathPattern Path)
{
    List<Article> Content = new List<Article>();

    // SelectNodes yields null (not an empty collection) when nothing matches.
    HtmlNodeCollection rootNodes = RootNode.SelectNodes(Path.ItemRootXPath);
    if (rootNodes == null || rootNodes.Count == 0 || string.IsNullOrWhiteSpace(Path.TitleXPath))
    {
        return Content;
    }

    bool hasUrlXPath = !string.IsNullOrWhiteSpace(Path.UrlXPath);

    foreach (HtmlNode BaseNode in rootNodes)
    {
        // Normally every base node holds one item, but some sites put several under one node.
        HtmlNodeCollection titleNodes = BaseNode.SelectNodes(Path.TitleXPath);
        if (titleNodes == null)
        {
            continue;
        }

        // Links come from UrlXPath when configured, otherwise from the title nodes themselves.
        HtmlNodeCollection linkCandidates = hasUrlXPath ? BaseNode.SelectNodes(Path.UrlXPath) : titleNodes;
        if (linkCandidates == null)
        {
            continue;
        }

        // The item count is the number of nodes carrying an href attribute; the i-th item
        // takes the i-th non-empty href, so both sequences are materialized once up front.
        List<HtmlNode> hrefNodes = linkCandidates.Where(n => n.Attributes.Contains("href")).ToList();
        List<string> hrefs = hrefNodes
            .Select(n => n.Attributes["href"].Value)
            .Where(v => !string.IsNullOrEmpty(v))
            .ToList();
        if (hrefs.Count == 0)
        {
            continue;
        }

        for (int i = 0; i < hrefNodes.Count; i++)
        {
            string title = ExtractInnerTextFromBaseNode(BaseNode, Path.TitleXPath, i);
            if (title == null || i >= hrefs.Count)
            {
                // No title, or fewer non-empty hrefs than href nodes: the item cannot get a URL.
                continue;
            }

            string itemUrl = ResolveListItemUrl(hrefs[i], Url);
            if (string.IsNullOrWhiteSpace(itemUrl))
            {
                continue;
            }

            Article item = new Article();
            item.Title = title;
            item.Url = itemUrl;

            if (!string.IsNullOrWhiteSpace(Path.MediaNameXPath))
            {
                string mediaName = ExtractInnerTextFromBaseNode(BaseNode, Path.MediaNameXPath, i);
                mediaName = ExtractSegmentFromInnerText(mediaName, MediaPrefixRegex);
                item.MediaName = HTMLCleaner.CleanMediaName(mediaName);
            }

            if (!string.IsNullOrWhiteSpace(Path.AuthorXPath))
            {
                string author = ExtractInnerTextFromBaseNode(BaseNode, Path.AuthorXPath, i);
                author = ExtractSegmentFromInnerText(author, AuthorPrefixRegex);
                item.Author = HTMLCleaner.CleanAuthor(author);
            }

            if (!string.IsNullOrWhiteSpace(Path.DateXPath))
            {
                item.PubDate = DateTimeParser.Parser(ExtractInnerTextFromBaseNode(BaseNode, Path.DateXPath, i));
            }

            if (!string.IsNullOrWhiteSpace(Path.AbsTractXPath))
            {
                item.AbsTract = ExtractInnerTextFromBaseNode(BaseNode, Path.AbsTractXPath, i);
            }

            ViewData viewData = ExtractListItemViewData(BaseNode, Path, i);
            if (viewData != null)
            {
                if (item.ViewDataList == null)
                {
                    item.ViewDataList = new List<ViewData>();
                }

                item.ViewDataList.Add(viewData);
            }

            Content.Add(item);
        }
    }

    return Content;
}

/// <summary>
/// Turns a raw href of a list item into an absolute URL, or returns null when the link
/// must not produce an item (PDF link, link containing '@', malformed script link, or any failure).
/// </summary>
private static string ResolveListItemUrl(string href, string baseUrl)
{
    if (href.Contains(".pdf"))
    {
        return null;
    }

    try
    {
        // javascript:openArticle('target') links carry the real target between (' and ').
        if (href.StartsWith("javascript:openArticle", StringComparison.Ordinal))
        {
            int open = href.IndexOf("('", StringComparison.Ordinal);
            if (open < 0)
            {
                return null;
            }

            href = href.Substring(open + 2);
            int close = href.IndexOf("')", StringComparison.Ordinal);
            if (close < 0)
            {
                return null;
            }

            href = href.Substring(0, close);
        }

        // Resolved once; re-resolving an already absolute URL is assumed to be a no-op.
        string absolute = HtmlUtility.AbsoluteUrl(href, baseUrl, true);
        if (absolute == null || absolute.IndexOf('@') >= 0)
        {
            return null;
        }

        return absolute;
    }
    catch (Exception)
    {
        // Best effort: a link that cannot be resolved simply yields no item.
        return null;
    }
}

/// <summary>
/// Reads the view and reply counters of the item at <paramref name="index"/>; returns null
/// when neither a view nor a reply XPath is configured.
/// </summary>
private static ViewData ExtractListItemViewData(HtmlNode baseNode, XpathPattern path, int index)
{
    bool hasViewXPath = !string.IsNullOrWhiteSpace(path.ViewXPath);
    bool hasReplyXPath = !string.IsNullOrWhiteSpace(path.ReplyXPath);
    if (!hasViewXPath && !hasReplyXPath)
    {
        return null;
    }

    ViewData data = new ViewData();
    data.FetchTime = DateTime.Now;

    // View count. Only queried when a view XPath actually exists.
    if (hasViewXPath)
    {
        string viewText = ExtractInnerTextFromBaseNode(baseNode, path.ViewXPath, index, false);
        if (!string.IsNullOrEmpty(viewText))
        {
            MatchCollection numbers = Regex.Matches(viewText, @"\d{1,9}");
            int first;
            int second;
            if (numbers.Count == 1)
            {
                if (TryParseDigitRun(numbers[0], out first))
                {
                    data.View = first;
                }
            }
            else if (numbers.Count > 1 && path.ViewXPath == path.ReplyXPath)
            {
                // View and reply share one cell: the larger number is taken as the view count.
                // Several numbers in one cell are ambiguous, so this layout is discouraged.
                if (TryParseDigitRun(numbers[0], out first) && TryParseDigitRun(numbers[1], out second))
                {
                    data.View = first >= second ? first : second;
                    data.Reply = first >= second ? second : first;
                }
            }
        }
    }

    // Reply count from its own cell.
    if (hasReplyXPath && path.ViewXPath != path.ReplyXPath)
    {
        string replyText = ExtractInnerTextFromBaseNode(baseNode, path.ReplyXPath, index, false);
        if (!string.IsNullOrEmpty(replyText))
        {
            MatchCollection numbers = Regex.Matches(replyText, @"\d{1,9}");
            int reply;
            if (numbers.Count > 0 && TryParseDigitRun(numbers[0], out reply))
            {
                data.Reply = reply;
            }
        }
    }

    return data;
}

/// <summary>
/// Parses a matched digit run without throwing: \d also matches non-ASCII digits
/// (for example full-width ones), which integer parsing rejects.
/// </summary>
private static bool TryParseDigitRun(Match match, out int value)
{
    return int.TryParse(match.Value, out value);
}