/// <summary>
/// Downloads the page behind <paramref name="link"/> and builds a chapter from the first
/// <c>div</c> element whose <c>class</c> attribute contains "post_body".
/// </summary>
/// <param name="link">Chapter link; its <c>Url</c> is fetched and copied onto the result.</param>
/// <param name="options">Retrieval options; not read by this implementation.</param>
/// <param name="token">Cancellation token forwarded to the download and to the parser.</param>
/// <returns>
/// A chapter carrying the link's URL and the cleaned-up inner HTML of the post body,
/// or <c>null</c> when the page has no matching element.
/// </returns>
public override async Task<WebNovelChapter> GetChapterAsync(ChapterLink link, ChapterRetrievalOptions options = default(ChapterRetrievalOptions), CancellationToken token = default(CancellationToken))
{
    string html = await GetWebPageAsync(link.Url, token);
    IHtmlDocument document = await Parser.ParseAsync(html, token);

    // First <div> in document order whose class attribute mentions "post_body".
    IElement postBody = document.All.FirstOrDefault(el =>
        el.LocalName == "div"
        && el.HasAttribute("class")
        && el.GetAttribute("class").Contains("post_body"));

    if (postBody == null)
    {
        return null;
    }

    // Run the cleanup helpers on the element; the call order is kept exactly as before.
    RemoveNavigation(postBody);
    RemoveDonation(postBody);
    ExpandSpoilers(postBody);
    RemoveEmptyTags(postBody);

    var cleaned = CleanupHTML(postBody.InnerHtml);

    return new WebNovelChapter
    {
        Url = link.Url,
        Content = cleaned
    };
}
/// <summary>
/// Lazily turns anchor elements into chapter links.
/// </summary>
/// <param name="baseUrl">Base used to resolve each <c>href</c> into an absolute URL.</param>
/// <param name="linkElements">Candidate elements; only those whose local name is "a" are considered.</param>
/// <param name="linkFilter">Optional extra predicate applied before the anchor check; ignored when <c>null</c>.</param>
/// <returns>
/// One link per anchor that has non-blank text, an <c>href</c> attribute and a non-empty
/// resolved URL. The link name is the HTML-decoded text content.
/// </returns>
protected virtual IEnumerable<ChapterLink> CollectChapterLinks(string baseUrl, IEnumerable<IElement> linkElements, Func<IElement, bool> linkFilter = null)
{
    IEnumerable<IElement> candidates = linkFilter == null
        ? linkElements
        : linkElements.Where(linkFilter);

    foreach (IElement anchor in candidates.Where(el => el.LocalName == "a"))
    {
        // Anchors without visible text or without a target cannot become chapter links.
        bool hasText = !string.IsNullOrWhiteSpace(anchor.TextContent);
        if (!hasText || !anchor.HasAttribute("href"))
        {
            continue;
        }

        string absoluteUrl = UrlHelper.ToAbsoluteUrl(baseUrl, anchor.GetAttribute("href"));
        if (!string.IsNullOrEmpty(absoluteUrl))
        {
            yield return new ChapterLink
            {
                Name = WebUtility.HtmlDecode(anchor.TextContent),
                Url = absoluteUrl
            };
        }
    }
}
/// <summary>
/// Downloads the page behind <paramref name="link"/>, parses it with <c>ParseChapter</c> and,
/// when an element matching <c>TitleClasses</c> exists, uses its trimmed text as the chapter name.
/// </summary>
/// <param name="link">Chapter link; its <c>Url</c> is fetched and copied onto the result.</param>
/// <param name="options">Retrieval options; not read by this implementation.</param>
/// <param name="token">Cancellation token forwarded to the download, the parser and <c>ParseChapter</c>.</param>
/// <returns>The parsed chapter, or <c>null</c> when <c>ParseChapter</c> yields no chapter.</returns>
public override async Task<WebNovelChapter> GetChapterAsync(
    ChapterLink link,
    ChapterRetrievalOptions options = default(ChapterRetrievalOptions),
    CancellationToken token = default(CancellationToken))
{
    string baseContent = await GetWebPageAsync(link.Url, token);
    IHtmlDocument doc = await Parser.ParseAsync(baseContent, token);

    // The title is looked up before ParseChapter runs, as in the original ordering
    // (NOTE(review): ParseChapter may modify the document — confirm before reordering).
    IElement titleElement = doc.DocumentElement.FirstWhereHasClass(TitleClasses);

    WebNovelChapter chapter = ParseChapter(doc.DocumentElement, token);
    if (chapter == null)
    {
        // Report "no chapter" as null instead of failing on the property assignments below.
        return null;
    }

    chapter.Url = link.Url;

    if (titleElement != null)
    {
        chapter.ChapterName = titleElement.Text().Trim();
    }

    return chapter;
}
/// <summary>
/// Lazily builds chapter links from container elements, using the first descendant anchor
/// of each container.
/// </summary>
/// <param name="baseUrl">Not used by this override; the <c>href</c> value is taken as-is.</param>
/// <param name="linkElements">Container elements that are searched for an anchor.</param>
/// <param name="linkFilter">Not used by this override.</param>
/// <returns>
/// One link per container whose first anchor has both a <c>title</c> and an <c>href</c>
/// attribute. The name comes from <c>title</c>, and <c>Unknown</c> is always <c>false</c>.
/// </returns>
protected override IEnumerable<ChapterLink> CollectChapterLinks(string baseUrl, IEnumerable<IElement> linkElements, Func<IElement, bool> linkFilter = null)
{
    foreach (IElement container in linkElements)
    {
        IElement anchor = container.Descendents<IElement>().FirstOrDefault(p => p.LocalName == "a");

        bool usable = anchor != null
            && anchor.HasAttribute("title")
            && anchor.HasAttribute("href");

        if (!usable)
        {
            continue;
        }

        yield return new ChapterLink
        {
            Name = anchor.GetAttribute("title"),
            Url = anchor.GetAttribute("href"),
            Unknown = false
        };
    }
}
/// <summary>
/// Downloads the page behind <paramref name="link"/> and assembles the chapter text from the
/// <c>sentence</c> elements found under the chapter container.
/// </summary>
/// <param name="link">Chapter link; its <c>Url</c> is fetched and copied onto the result.</param>
/// <param name="options">Retrieval options; not read by this implementation.</param>
/// <param name="token">Cancellation token forwarded to the download and to the parser.</param>
/// <returns>
/// A chapter whose content is the inner HTML of each matching element joined with
/// "&lt;br/&gt;&lt;br/&gt;", or <c>null</c> when the page has no element matching
/// <c>ChapterClasses</c>. The title and next-chapter URL may be <c>null</c> when absent.
/// </returns>
public override async Task<WebNovelChapter> GetChapterAsync(ChapterLink link, ChapterRetrievalOptions options = default(ChapterRetrievalOptions), CancellationToken token = default(CancellationToken))
{
    string content = await GetWebPageAsync(link.Url, token);
    IHtmlDocument doc = await Parser.ParseAsync(content, token);

    IElement titleElement = doc.DocumentElement.FirstWhereHasClass(ChapterTitleClasses);
    IElement chapterElement = doc.DocumentElement.FirstWhereHasClass(ChapterClasses);

    if (chapterElement == null)
    {
        // No chapter container on the page: report "no chapter" rather than handing a
        // null element to WhereHasClass.
        return null;
    }

    var chContentElements = chapterElement.WhereHasClass(
        ChapterContentClasses,
        element => element.LocalName == "sentence");

    string contents = string.Join("<br/><br/>", chContentElements.Select(p => p.InnerHtml));

    // The pager link is optional; a missing link leaves NextChapterUrl null.
    string nextChapter = doc.QuerySelector("ul.pager > li.next > a")?.GetAttribute("href");

    return new WebNovelChapter
    {
        // Record the source URL, as the other GetChapterAsync implementations do.
        Url = link.Url,
        ChapterName = titleElement?.TextContent,
        Content = contents,
        NextChapterUrl = nextChapter
    };
}
/// <summary>
/// Lazily converts every element that has an <c>href</c> attribute into a chapter link and
/// flags whether its title looks like a chapter name.
/// </summary>
/// <param name="baseUrl">Not used by this override; URLs are resolved against the <c>BaseUrl</c> member.</param>
/// <param name="linkElements">Candidate elements.</param>
/// <param name="linkFilter">Not used by this override.</param>
/// <returns>
/// One link per element with an <c>href</c>. <c>Unknown</c> is <c>false</c> when the
/// HTML-decoded title contains any entry of <c>PossibleChapterNameParts</c>
/// (culture-sensitive, case-insensitive), and <c>true</c> otherwise.
/// </returns>
protected override IEnumerable<ChapterLink> CollectChapterLinks(string baseUrl, IEnumerable<IElement> linkElements, Func<IElement, bool> linkFilter = null)
{
    foreach (IElement candidate in linkElements.Where(el => el.HasAttribute("href")))
    {
        string title = WebUtility.HtmlDecode(candidate.TextContent);
        string absoluteUrl = UrlHelper.ToAbsoluteUrl(BaseUrl, candidate.GetAttribute("href"));

        bool looksLikeChapter = PossibleChapterNameParts.Any(
            part => title.IndexOf(part, StringComparison.CurrentCultureIgnoreCase) >= 0);

        yield return new ChapterLink
        {
            Name = title,
            Url = absoluteUrl,
            Unknown = !looksLikeChapter
        };
    }
}
/// <summary>
/// Default chapter retrieval entry point. This base implementation has no behavior of its
/// own and always throws.
/// </summary>
/// <param name="link">Link of the chapter to retrieve; not read here.</param>
/// <param name="options">Retrieval options; not read here.</param>
/// <param name="token">Cancellation token; not read here.</param>
/// <returns>Never returns.</returns>
/// <exception cref="NotImplementedException">
/// Always thrown, synchronously at call time rather than through the returned task.
/// </exception>
public virtual Task<WebNovelChapter> GetChapterAsync(
    ChapterLink link,
    ChapterRetrievalOptions options = default(ChapterRetrievalOptions),
    CancellationToken token = default(CancellationToken))
{
    throw new NotImplementedException();
}
/// <summary>
/// Downloads the page behind <paramref name="link"/>, parses it with <c>ParseChapter</c> and
/// appends the content of every additional page reported by <c>GetPagedChapterUrls</c>.
/// </summary>
/// <param name="link">Chapter link; its <c>Url</c> is fetched and copied onto the result.</param>
/// <param name="options">Retrieval options; not read by this implementation.</param>
/// <param name="token">Cancellation token forwarded to every download, parse and <c>ParseChapter</c> call.</param>
/// <returns>
/// The chapter with its next-chapter URL resolved against the link URL and the content of all
/// parsable follow-up pages appended, or <c>null</c> when the first page yields no chapter.
/// </returns>
public override async Task<WebNovelChapter> GetChapterAsync(ChapterLink link, ChapterRetrievalOptions options = default(ChapterRetrievalOptions), CancellationToken token = default(CancellationToken))
{
    string content = await GetWebPageAsync(link.Url, token);
    IHtmlDocument doc = await Parser.ParseAsync(content, token);

    // Page URLs are requested before ParseChapter runs, as in the original ordering
    // (NOTE(review): ParseChapter may modify the document — confirm before reordering).
    var paged = GetPagedChapterUrls(doc.DocumentElement);

    WebNovelChapter chapter = ParseChapter(doc.DocumentElement, token);
    if (chapter == null)
    {
        return null;
    }

    chapter.Url = link.Url;
    chapter.NextChapterUrl = UrlHelper.ToAbsoluteUrl(link.Url, chapter.NextChapterUrl);

    foreach (var page in paged)
    {
        string pageContent = await GetWebPageAsync(page, token);
        IHtmlDocument pageDoc = await Parser.ParseAsync(pageContent, token);

        // ParseChapter can return null (see the check above); skip such a page instead of
        // failing the whole chapter with a NullReferenceException.
        WebNovelChapter pageChapter = ParseChapter(pageDoc.DocumentElement, token);
        if (pageChapter == null)
        {
            continue;
        }

        chapter.Content += pageChapter.Content;
    }

    return chapter;
}
/// <summary>
/// Downloads the page behind <paramref name="link"/> and returns the inner HTML of the
/// element with id "mw-content-text" after rewriting its links and linked images.
/// </summary>
/// <remarks>
/// Every anchor with an <c>href</c> gets that value HTML-decoded and resolved against
/// <c>BaseUrl</c>. When the anchor wraps an <c>img</c>, all image attributes except
/// <c>width</c> and <c>height</c> are removed. The linked page is then downloaded, and the
/// first anchor inside a <c>div</c> with class "fullMedia" supplies the new image <c>src</c>.
/// </remarks>
/// <param name="link">Chapter link; its <c>Url</c> is fetched and copied onto the result.</param>
/// <param name="options">Retrieval options; not read by this implementation.</param>
/// <param name="token">Cancellation token forwarded to every download and parse.</param>
/// <returns>The chapter, or <c>null</c> when the page has no "mw-content-text" element.</returns>
public override async Task<WebNovelChapter> GetChapterAsync(ChapterLink link, ChapterRetrievalOptions options = default(ChapterRetrievalOptions), CancellationToken token = default(CancellationToken))
{
    string html = await GetWebPageAsync(link.Url, token);
    IHtmlDocument document = await Parser.ParseAsync(html, token);

    IElement body = document.GetElementById("mw-content-text");
    if (body == null)
    {
        return null;
    }

    document.GetElementById("toc")?.Remove();
    RemoveTables(body);

    foreach (IElement anchor in body.Descendents<IElement>().Where(p => p.LocalName == "a"))
    {
        if (!anchor.HasAttribute("href"))
        {
            continue;
        }

        string decodedHref = WebUtility.HtmlDecode(anchor.GetAttribute("href"));
        anchor.SetAttribute("href", UrlHelper.ToAbsoluteUrl(BaseUrl, decodedHref));

        IElement thumbnail = anchor.Descendents<IElement>().FirstOrDefault(p => p.LocalName == "img");
        if (thumbnail == null)
        {
            continue;
        }

        // Snapshot the attributes first: removing while enumerating the live collection is unsafe.
        var removable = thumbnail.Attributes
            .Where(p => p.LocalName != "width" && p.LocalName != "height")
            .ToList();
        foreach (var attribute in removable)
        {
            thumbnail.RemoveAttribute(attribute.Name);
        }

        // The (now absolute) href is fetched as a page that should link to the full image.
        string imagePageUrl = anchor.GetAttribute("href");
        string imagePageHtml = await GetWebPageAsync(imagePageUrl, token);
        IHtmlDocument imageDocument = await Parser.ParseAsync(imagePageHtml, token);

        // First anchor of the first "fullMedia" div; null if that div holds no anchor.
        IElement fullImageAnchor = imageDocument.Descendents<IElement>()
            .Where(e => e.LocalName == "div"
                        && e.HasAttribute("class")
                        && e.GetAttribute("class") == "fullMedia")
            .Select(e => e.Descendents<IElement>().FirstOrDefault(p => p.LocalName == "a"))
            .FirstOrDefault();

        if (fullImageAnchor == null || !fullImageAnchor.HasAttribute("href"))
        {
            continue;
        }

        string fullImageHref = fullImageAnchor.GetAttribute("href");
        thumbnail.SetAttribute("src", UrlHelper.ToAbsoluteUrl(BaseUrl, fullImageHref));
    }

    return new WebNovelChapter
    {
        Url = link.Url,
        Content = body.InnerHtml
    };
}