/// <summary>
        /// Recursively appends subsequent pages of a multipage article.
        /// </summary>
        /// <remarks>
        /// Each call increments the page counter, fetches <paramref name="url"/>, transcodes it,
        /// and appends the content of its "readInner" element to the "readInner" element of
        /// <paramref name="document"/> inside a new div. Once the page counter exceeds the page
        /// limit, a "View Next Page" link is appended instead and no further pages are fetched.
        /// A page whose first paragraph text already occurs in the compiled content is treated as
        /// a duplicate and is not appended.
        /// </remarks>
        /// <param name="document">Compiled document; new pages are appended to its "readInner" element.</param>
        /// <param name="url">Url of the page to fetch and append.</param>
        private void AppendNextPage(XDocument document, string url)
        {
            _curPageNum++;

            var contentDiv = document.GetElementById("readInner");

            if (contentDiv == null)
            {
                // There is no container to append pages to.
                return;
            }

            if (_curPageNum > _MaxPages)
            {
                // Page limit reached: link to the next page instead of fetching it.
                // The link is built as an element tree rather than parsed from a concatenated
                // string, so characters such as '&' or '\'' in the url are escaped correctly,
                // and an XElement (not an XDocument, which an element cannot contain) is added.
                contentDiv.Add(
                    new XElement(
                        "div",
                        new XAttribute("style", "text-align: center"),
                        new XElement(
                            "a",
                            new XAttribute("href", url ?? string.Empty),
                            "View Next Page")));
                return;
            }

            string nextContent = _urlFetcher.Fetch(url);

            if (string.IsNullOrEmpty(nextContent))
            {
                return;
            }

            bool mainContentExtracted;
            string extractedTitle;
            string nextPageLink;
            var nextDocument = _transcoder.TranscodeToXml(nextContent, url, out mainContentExtracted, out extractedTitle, out nextPageLink);
            var nextInner = nextDocument.GetElementById("readInner");

            if (nextInner == null)
            {
                // The transcoded page has no content container, so there is nothing to append.
                return;
            }

            // Drop the page's own heading, if it has one, before merging its content.
            var header = nextInner.Element("h1");

            if (header != null)
            {
                header.Remove();
            }

            /*
             * Anti-duplicate mechanism. Take the first paragraph of the new page and compare its
             * text against the text of everything compiled so far. If the compiled content already
             * contains that text, the new page is probably a duplicate.
             */
            var firstP = nextInner.GetElementsByTagName("p").FirstOrDefault();

            if (firstP != null && firstP.GetInnerHtml().Length > 100)
            {
                // TODO: This test could probably be improved to compare the actual markup.
                string existingText = contentDiv.Value;
                string firstParagraphText = firstP.Value;

                // Ordinal search: this is an exact-text check, not a linguistic comparison.
                if (!string.IsNullOrEmpty(existingText)
                    && !string.IsNullOrEmpty(firstParagraphText)
                    && existingText.IndexOf(firstParagraphText, StringComparison.Ordinal) != -1)
                {
                    _parsedPages.Add(url);
                    return;
                }
            }

            /* Add the content to the existing html */
            var nextDiv = new XElement("div");

            if (_pageSeparatorBuilder != null)
            {
                // The separator markup goes in first so that it precedes the page content.
                nextDiv.SetInnerHtml(_pageSeparatorBuilder(_curPageNum));
            }

            nextDiv.SetId(_PageIdPrefix + _curPageNum);
            nextDiv.SetClass("page");
            nextDiv.Add(nextInner.Nodes());
            contentDiv.Add(nextDiv);
            _parsedPages.Add(url);

            /* Only continue if we haven't already seen the next page */
            if (!string.IsNullOrEmpty(nextPageLink) && !_parsedPages.Contains(nextPageLink))
            {
                AppendNextPage(document, nextPageLink);
            }
        }
예제 #2
0
        /// <summary>
        /// Looks up the article content element of a document by the id given as a hint.
        /// </summary>
        /// <param name="document">Document to search.</param>
        /// <param name="articleContentElementHint">Id of the element expected to hold the article content.</param>
        /// <returns>The result of <c>document.GetElementById(articleContentElementHint)</c>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="articleContentElementHint"/> is null or empty.</exception>
        private static XElement TryFindArticleContentElement(XDocument document, string articleContentElementHint)
        {
            if (document == null)
            {
                throw new ArgumentNullException("document");
            }

            bool hintIsMissing = string.IsNullOrEmpty(articleContentElementHint);

            if (hintIsMissing)
            {
                throw new ArgumentException("Argument can't be null nor empty.", "articleContentElementHint");
            }

            // The hint is treated as an element id. An earlier variant looked the element up by
            // tag name (GetElementsByTagName(...).FirstOrDefault()), but matching on tag name or
            // content node only works for HTML5 pages; filtering by div id currently gives
            // better results on Chinese websites.
            XElement articleContentElement = document.GetElementById(articleContentElementHint);

            return articleContentElement;
        }