/// <summary>
/// Recursively appends subsequent pages of a multipage article.
/// </summary>
/// <remarks>
/// Each call increments the page counter, fetches <paramref name="url"/>, transcodes it,
/// and appends the resulting "readInner" content to the "readInner" element of
/// <paramref name="document"/>. Once the page counter exceeds the configured maximum,
/// a "View Next Page" link to <paramref name="url"/> is appended instead of the page content.
/// </remarks>
/// <param name="document">Compiled document that receives the additional pages.</param>
/// <param name="url">Url of the page to fetch and append.</param>
private void AppendNextPage(XDocument document, string url)
{
    _curPageNum++;

    var contentDiv = document.GetElementById("readInner");

    if (contentDiv == null)
    {
        // There is no container to append the next page to.
        return;
    }

    if (_curPageNum > _MaxPages)
    {
        // Build the link as elements rather than parsing concatenated markup:
        // an XDocument cannot be added as a child of an element, and a raw URL
        // containing '&', '<' or quotes would not survive string-built XML.
        var nextPageLinkDiv =
            new XElement(
                "div",
                new XAttribute("style", "text-align: center"),
                new XElement(
                    "a",
                    new XAttribute("href", url ?? string.Empty),
                    "View Next Page"));

        contentDiv.Add(nextPageLinkDiv);
        return;
    }

    string nextContent = _urlFetcher.Fetch(url);

    if (string.IsNullOrEmpty(nextContent))
    {
        return;
    }

    bool mainContentExtracted;
    string extractedTitle;
    string nextPageLink;

    var nextDocument = _transcoder.TranscodeToXml(nextContent, url, out mainContentExtracted, out extractedTitle, out nextPageLink);
    var nextInner = nextDocument.GetElementById("readInner");

    if (nextInner == null)
    {
        // The fetched page produced no content container; nothing to append.
        return;
    }

    // Drop the first direct <h1> child of the fetched page's content, if any.
    var header = nextInner.Element("h1");

    if (header != null)
    {
        header.Remove();
    }

    /*
     * Anti-duplicate mechanism. Take the first paragraph of the new page and compare
     * its text against the text of everything collected so far. If the existing
     * content already contains that text, the page is probably a duplicate.
     * Only paragraphs whose inner HTML is longer than 100 characters are considered.
     */
    var firstP = nextInner.GetElementsByTagName("p").FirstOrDefault();

    if (firstP != null && firstP.GetInnerHtml().Length > 100)
    {
        // TODO: This test could probably be improved to compare the actual markup.
        string existingContent = contentDiv.Value;
        string innerHtml = firstP.Value;

        // Ordinal search: this is an exact text comparison, not a linguistic one.
        if (!string.IsNullOrEmpty(existingContent)
            && !string.IsNullOrEmpty(innerHtml)
            && existingContent.IndexOf(innerHtml, StringComparison.Ordinal) >= 0)
        {
            _parsedPages.Add(url);
            return;
        }
    }

    /* Add the content to the existing html */
    var nextDiv = new XElement("div");

    if (_pageSeparatorBuilder != null)
    {
        nextDiv.SetInnerHtml(_pageSeparatorBuilder(_curPageNum));
    }

    nextDiv.SetId(_PageIdPrefix + _curPageNum);
    nextDiv.SetClass("page");
    nextDiv.Add(nextInner.Nodes());
    contentDiv.Add(nextDiv);
    _parsedPages.Add(url);

    /* Only continue if there is a next page link and it has not been processed already */
    if (!string.IsNullOrEmpty(nextPageLink) && !_parsedPages.Contains(nextPageLink))
    {
        AppendNextPage(document, nextPageLink);
    }
}
/// <summary>
/// Looks up the article content element in <paramref name="document"/> by treating
/// <paramref name="articleContentElementHint"/> as an element id.
/// </summary>
/// <param name="document">Document to search.</param>
/// <param name="articleContentElementHint">Id of the element expected to hold the article content.</param>
/// <returns>
/// The result of the id lookup (presumably null when no element matches - confirm against GetElementById).
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="articleContentElementHint"/> is null or empty.</exception>
private static XElement TryFindArticleContentElement(XDocument document, string articleContentElementHint)
{
    if (document == null)
    {
        throw new ArgumentNullException("document");
    }

    bool hintMissing = articleContentElementHint == null || articleContentElementHint.Length == 0;

    if (hintMissing)
    {
        throw new ArgumentException("Argument can't be null nor empty.", "articleContentElementHint");
    }

    // The hint is matched against element ids. Matching by tag name / content node
    // (GetElementsByTagName(hint).FirstOrDefault()) only works for HTML5 pages;
    // for Chinese websites, filtering by div id currently gives better results.
    XElement articleContentElement = document.GetElementById(articleContentElementHint);

    return articleContentElement;
}