// Example valid JSON configuration for this parser:
// {
//     "Parser": "RegExp",
//     "BaseAddress": "http://beyondtheimpossible.org/comic/1-before-the-beginning-2/",
//     "NextButtonSelector": "(?:href=\"(\\S+)\")? class=\"comic-nav-base comic-nav-next\">",
//     "ChapterTitleSelector": "class=\"post-title\">([^<]*)<",
//     "ChapterContentSelector": "<div class=\"entry\">((?:.|\n)*)<div class=\"post-extras\">",
//     "Author": "Ffurla",
//     "Date": "2016-03-18T13:24:36.2855417+01:00",
//     "Title": "Beyond the Impossible",
//     "Description": null
// },

/// <summary>
/// Downloads pages one after another, starting at <paramref name="nextPageUrl"/> (or
/// <c>entry.BaseAddress</c> when it is null), extracts title and content with the entry's
/// regular expressions and adds each page to <paramref name="ebook"/>.
/// </summary>
/// <param name="entry">Configuration holding the base address and the three regular expressions.</param>
/// <param name="ebook">The book that receives the pages through <c>AddPage</c>.</param>
/// <param name="nextPageUrl">Optional URL to start from instead of the entry's base address.</param>
/// <exception cref="NotSupportedException">The entry's content type is not <c>Text</c>.</exception>
/// <exception cref="WebException">A request failed without an HTTP response (for example a connection failure).</exception>
protected override void ScrapeWebPage(WebComicEntry entry, EPubDocument ebook, string nextPageUrl = null)
{
    if (entry.Content != WebComicEntry.ContentType.Text)
    {
        throw new NotSupportedException(
            $"This parser does not support the following content type {Enum.GetName(typeof(WebComicEntry.ContentType), entry.Content)}");
    }

    do
    {
        string title = string.Empty;
        string content = string.Empty;
        var currentUrl = nextPageUrl ?? entry.BaseAddress;
        try
        {
            using (var wc = new WebClient() { Encoding = Encoding.UTF8 })
            {
                var s = WebUtility.HtmlDecode(wc.DownloadString(currentUrl));

                // Capture group 1 of each selector carries the value of interest.
                // A missing "next" link ends the loop after this page.
                var m = Regex.Match(s, entry.NextButtonSelector);
                nextPageUrl = m.Groups[1].Success ? m.Groups[1].Value : string.Empty;

                m = Regex.Match(s, entry.ChapterTitleSelector);
                if (m.Groups[1].Success)
                {
                    title = m.Groups[1].Value;
                    content = $"<h1>{title}</h1>";
                }

                m = Regex.Match(s, entry.ChapterContentSelector);
                if (m.Groups[1].Success)
                {
                    content += m.Groups[1].Value;
                }

                this.AddPage(ebook, content, title, currentUrl);
            }
        }
        catch (WebException ex)
        {
            // ex.Response is null when the request never produced a response, and may be
            // a non-HTTP response type. Surface the original failure in those cases
            // instead of masking it with a NullReferenceException or InvalidCastException.
            var httpResponse = ex.Response as HttpWebResponse;
            if (httpResponse == null)
            {
                throw;
            }

            if (httpResponse.StatusCode == HttpStatusCode.NotFound)
            {
                return;
            }

            // Any other HTTP status is swallowed; when the download itself failed,
            // nextPageUrl is unchanged and the loop condition decides whether to request again.
        }
    }
    while (!nextPageUrl.IsEmpty());
}
/// <summary>
/// Appends <paramref name="content"/> to the queue of additional messages kept for
/// <paramref name="entry"/>, creating that queue the first time the entry is seen.
/// The whole operation runs while holding a lock on <c>AdditionalMessageDisplay</c>.
/// </summary>
/// <param name="entry">The entry the message belongs to.</param>
/// <param name="content">The message text to enqueue.</param>
public static void AddAdditionalMessageDisplay(WebComicEntry entry, string content)
{
    lock (AdditionalMessageDisplay)
    {
        var hasQueue = AdditionalMessageDisplay.ContainsKey(entry);
        if (hasQueue == false)
        {
            AdditionalMessageDisplay[entry] = new Queue<string>();
        }

        var pending = AdditionalMessageDisplay[entry];
        pending.Enqueue(content);
    }
}
/// <summary>
/// Appends the markup for the iterator's current node to <paramref name="content"/>.
/// Nodes whose name is listed in <c>entry.ImageTags</c> are downloaded and registered as
/// images; every other node is emitted as an element containing the node's value.
/// </summary>
/// <param name="ebook">The document that receives the downloaded image data.</param>
/// <param name="iter">Iterator positioned on the node to process; it is not advanced here.</param>
/// <param name="wc">Web client passed on to <c>DownloadImage</c>.</param>
/// <param name="imagesDir">Directory where images are saved when progress saving is enabled.</param>
/// <param name="p">Page record whose <c>ImagesPath</c> collects the saved image paths.</param>
/// <param name="content">The markup accumulated so far.</param>
/// <param name="entry">Configuration providing the image tags and their source attributes.</param>
/// <returns><paramref name="content"/> with the markup for the current node appended.</returns>
private string HandleMixedContent(
    Document ebook,
    XPathNodeIterator iter,
    WebClient wc,
    string imagesDir,
    Page p,
    string content,
    WebComicEntry entry)
{
    if (entry.ImageTags.Contains(iter.Current.Name))
    {
        // A source attribute of "." means the image link is the value of the tag
        // itself rather than one of its attributes.
        var sourceAttribute = entry.ImageSourceAttributes[iter.Current.Name];
        string url = sourceAttribute == "."
            ? iter.Current.Value
            : iter.Current.GetAttribute(sourceAttribute, string.Empty);

        string imageName;
        using (MemoryStream memImg = new MemoryStream())
        {
            this.DownloadImage(wc, memImg, url);
            imageName = $"image{this.ImageCounter}.png";

            // ToArray copies only the bytes actually written. GetBuffer would hand out
            // the whole backing array, including unused capacity past the image data.
            byte[] imageBytes = memImg.ToArray();
            ebook.AddImageData(imageName, imageBytes);

            if (!Settings.Instance.CommandLineOptions.SaveProgressFolder.IsEmpty())
            {
                Unless(Directory.Exists(imagesDir), () => Directory.CreateDirectory(imagesDir));
                var imagePath = Path.Combine(imagesDir, imageName);
                File.WriteAllBytes(imagePath, imageBytes);
                p.ImagesPath.Add(imagePath);
            }
        }

        // Image processing
        content += $"<img src=\"{imageName}\" alt=\"\"/>";
        this.ImageCounter++;
    }
    else
    {
        // Text processing
        content += $"<{iter.Current.Name}>{iter.Current.Value}</{iter.Current.Name}>";
    }

    return content;
}
/// <summary>
/// Entry point for scraping one web comic: prepares the working directory, restores
/// previously saved pages when a Pages.json file exists, then scrapes and generates the
/// book unless it already exists and no redownload was requested.
/// </summary>
/// <param name="entry">The web comic to scrape; its title is used to derive file and folder names.</param>
public void StartScraping(WebComicEntry entry)
{
    this._entry = entry;
    var ebook = new Document();
    bool existing;
    string nextPageUrl = null;

    // Strip characters listed in InvalidChars so the title can be used as a file/folder name.
    this._sanitizedName = new string(entry.Title.Where(c => !InvalidChars.Contains(c)).ToArray());
    string outputName = this.DetectBestName(this._sanitizedName, out existing);
    this._workingDirPath = Path.Combine(
        Settings.Instance.CommandLineOptions.SaveProgressFolder,
        this._sanitizedName);

    if (!Settings.Instance.CommandLineOptions.SaveProgressFolder.IsEmpty()
        && !Directory.Exists(this._workingDirPath))
    {
        Directory.CreateDirectory(this._workingDirPath);
    }

    var progressFile = Path.Combine(this._workingDirPath, "Pages.json");
    if (File.Exists(progressFile))
    {
        var temp = JsonConvert.DeserializeObject<List<Page>>(File.ReadAllText(progressFile));

        // Deserialization yields null for an empty file or a literal "null" document;
        // treat that the same as "no saved pages" instead of failing on temp.Any().
        if (temp != null && temp.Any())
        {
            this.Pages = temp;
            nextPageUrl = this.RecoverProgress(ebook);
        }
    }

    if (!existing || Settings.Instance.CommandLineOptions.Redownload)
    {
        ebook.AddStylesheetData("style.css", Resources.style);
        this.SetMetadata(ebook);
        this.ScrapeWebPage(this._entry, ebook, nextPageUrl);
        ebook.Generate(outputName);
        ConsoleDisplay.MainMessage(entry, "Finished Compiling book");
    }
    else
    {
        ConsoleDisplay.MainMessage(entry, "Book already compiled");
    }
}
/// <summary>
/// Scrapes the web comic described by <paramref name="entry"/> into <paramref name="ebook"/>.
/// Each concrete parser supplies its own implementation.
/// </summary>
/// <param name="entry">The web comic configuration to scrape.</param>
/// <param name="ebook">The document that receives the scraped content.</param>
/// <param name="nextPageUrl">
/// Optional URL to start from. NOTE(review): the overrides visible in this code fall back to
/// <c>entry.BaseAddress</c> when this is null - confirm for any other implementation.
/// </param>
protected abstract void ScrapeWebPage(WebComicEntry entry, Document ebook, string nextPageUrl = null);
// Valid JSON configuration for this class:
// {
//     "Parser": "XPath",
//     "BaseAddress": "http://beyondtheimpossible.org/comic/1-before-the-beginning-2/",
//     "NextButtonSelector": "//@href[@class='comic-nav-base comic-nav-next']",
//     "ChapterTitleSelector": "//*[@class='post-title']",
//     "ChapterContentSelector": "//*[@class='entry']",
//     "Author": "Ffurla",
//     "Date": "2016-03-18T13:24:36.2855417+01:00",
//     "Title": "Beyond the Impossible",
//     "Description": null
// }

/// <summary>
/// Downloads pages one after another, starting at <paramref name="nextPageUrl"/> (or
/// <c>entry.BaseAddress</c> when it is null), selects title, content and next-page link
/// with the entry's XPath expressions and adds the result to <paramref name="ebook"/>
/// according to the entry's content type (text, image or mixed).
/// </summary>
/// <param name="entry">Configuration holding the base address, selectors and content type.</param>
/// <param name="ebook">The document that receives pages and images.</param>
/// <param name="nextPageUrl">Optional URL to start from instead of the entry's base address.</param>
/// <exception cref="WebException">A request failed without an HTTP response (for example a connection failure).</exception>
protected override void ScrapeWebPage(WebComicEntry entry, Document ebook, string nextPageUrl = null)
{
    // http://htmlagilitypack.codeplex.com/wikipage?title=Examples
    do
    {
        string content = string.Empty;
        string title;
        var currentUrl = nextPageUrl ?? entry.BaseAddress;
        try
        {
            using (var wc = new WebClient())
            {
                using (var ms = new MemoryStream(wc.DownloadData(currentUrl)))
                {
                    HtmlDocument hDoc = new HtmlDocument();
                    hDoc.Load(ms, true);
                    XPathNavigator xNav = hDoc.CreateNavigator();

                    try
                    {
                        title = xNav.SelectSingleNode(entry.ChapterTitleSelector).Value;
                    }
                    catch
                    {
                        // Deliberate best effort: any failure to read the title (including
                        // no matching node) falls back to no title or a generated one.
                        if (entry.IgnoreMissingChapterName)
                        {
                            title = null;
                        }
                        else
                        {
                            ConsoleDisplay.AddAdditionalMessageDisplay(
                                entry,
                                $"Title not found for page {this.PageCounter}, replacing with default value");
                            title = WebUtility.HtmlEncode($"Chapter - {this.PageCounter}");
                        }
                    }

                    XPathNodeIterator xIter = xNav.Select(entry.ChapterContentSelector);
                    if (entry.Content == WebComicEntry.ContentType.Text)
                    {
                        content += $"<h1>{title}</h1>";
                        while (xIter.MoveNext())
                        {
                            content += $"<{xIter.Current.Name}>{xIter.Current.Value}</{xIter.Current.Name}>";
                        }

                        this.AddPage(ebook, content, title, currentUrl, entry.IgnoreMissingChapterName);
                    }
                    else if (entry.Content == WebComicEntry.ContentType.Image)
                    {
                        while (xIter.MoveNext())
                        {
                            this.AddImage(ebook, wc, xIter.Current.Value, currentUrl);
                        }
                    }
                    else if (entry.Content == WebComicEntry.ContentType.Mixed)
                    {
                        while (xIter.MoveNext())
                        {
                            var subIter = xIter.Current.SelectChildren(XPathNodeType.Element);
                            this.AddCompositePage(ebook, subIter, title, wc, currentUrl, entry);
                        }
                    }

                    var tempNextPageUrl = xNav.SelectSingleNode(entry.NextButtonSelector)?.Value;
                    if (tempNextPageUrl == null)
                    {
                        // No next-page link: the end of the book.
                        return;
                    }

                    try
                    {
                        // Constructed only to validate that the link is an absolute URI.
                        new Uri(tempNextPageUrl);
                        nextPageUrl = tempNextPageUrl;
                    }
                    catch (UriFormatException)
                    {
                        // Not an absolute URI: expand it through the entry's address pattern.
                        nextPageUrl = string.Format(entry.AddressPattern, tempNextPageUrl);
                    }
                }
            }
        }
        catch (WebException ex)
        {
            // ex.Response is null when the request never produced a response, and may be
            // a non-HTTP response type. Surface the original failure in those cases
            // instead of masking it with a NullReferenceException or InvalidCastException.
            var httpResponse = ex.Response as HttpWebResponse;
            if (httpResponse == null)
            {
                throw;
            }

            if (httpResponse.StatusCode == HttpStatusCode.NotFound)
            {
                return;
            }

            // Any other HTTP status is swallowed and the loop condition decides what happens next.
        }
    }
    while (!nextPageUrl.IsEmpty());
}
/// <summary>
/// Builds one mixed-content page from the nodes of <paramref name="iter"/>, adds it to
/// <paramref name="ebook"/> with a navigation point and, when
/// <c>SaveProgressFolder.IsEmpty()</c> is false, also writes the page file and the
/// updated Pages.json. Increments <c>NavCounter</c> and <c>PageCounter</c>.
/// </summary>
/// <param name="ebook">The document that receives the page, its images and its navigation point.</param>
/// <param name="iter">Iterator over the candidate nodes of the page.</param>
/// <param name="title">Page title; inserted into the template as given, then HTML-decoded for later use.</param>
/// <param name="wc">Web client handed to <c>HandleMixedContent</c>.</param>
/// <param name="currentUrl">URL of the page being processed, stored on the page record.</param>
/// <param name="entry">Configuration providing the included tags and the interrupt tag.</param>
protected void AddCompositePage(
    Document ebook,
    XPathNodeIterator iter,
    string title,
    WebClient wc,
    string currentUrl,
    WebComicEntry entry)
{
    var markup = this.PageTemplate.Replace("%%TITLE%%", title);
    var body = string.Empty;
    title = WebUtility.HtmlDecode(title);

    var pagesDir = Path.Combine(this.WorkingDirPath, entry.Title, "Pages");
    var imagesDir = Path.Combine(this.WorkingDirPath, entry.Title, "Images");

    var record = new Page();
    record.Title = title;
    record.Order = this.PageCounter;
    record.Type = WebComicEntry.ContentType.Mixed;
    record.PageUrl = currentUrl;
    record.ImagesPath = new List<string>();

    while (iter.MoveNext())
    {
        var tagName = iter.Current.Name;
        var isInterruptTag = entry.InteruptAtTag == tagName;

        if (entry.IncludeTags.Contains(tagName))
        {
            // The node itself is one of the included tags.
            body = this.HandleMixedContent(ebook, iter, wc, imagesDir, record, body, entry);
        }
        else if (!isInterruptTag)
        {
            // Otherwise search its descendants for any included tag,
            // using a union expression of the form ".//a|.//b".
            var unionExpr = new StringBuilder();
            foreach (var tag in entry.IncludeTags)
            {
                unionExpr.Append($".//{tag}|");
            }

            var nested = iter.Current.Select(unionExpr.ToString().TrimEnd('|'));
            while (nested.MoveNext())
            {
                body = this.HandleMixedContent(ebook, nested, wc, imagesDir, record, body, entry);
            }
        }

        // Stop at the configured interrupt tag.
        if (isInterruptTag)
        {
            break;
        }
    }

    markup = markup.Replace("%%CONTENT%%", body);
    string pageName = $"page{this.PageCounter}.xhtml";

    var saveProgress = !Settings.Instance.CommandLineOptions.SaveProgressFolder.IsEmpty();
    if (saveProgress)
    {
        Unless(Directory.Exists(pagesDir), () => Directory.CreateDirectory(pagesDir));
        var pagePath = Path.Combine(pagesDir, pageName);
        record.Path = pagePath;
        File.WriteAllText(pagePath, markup);
        this.Pages.Add(record);

        var progressFile = Path.Combine(this.WorkingDirPath, "Pages.json");
        File.WriteAllText(progressFile, JsonConvert.SerializeObject(this.Pages));
    }

    ebook.AddXhtmlData(pageName, markup);

    string navLabel = title.IsEmpty() ? $"Page {this.PageCounter}" : title;
    ebook.AddNavPoint(navLabel, pageName, this.NavCounter++);

    ConsoleDisplay.MainMessage(entry, $"Completed Page {this.PageCounter}");
    this.PageCounter++;
}
/// <summary>
/// Assigns <paramref name="content"/> as the main message stored for <paramref name="entry"/>
/// in <c>MainMessageDisplay</c>.
/// </summary>
/// <param name="entry">The entry the message belongs to.</param>
/// <param name="content">The message text.</param>
public static void MainMessage(WebComicEntry entry, string content)
    => MainMessageDisplay[entry] = content;