Example #1
0
        // Example Valid JSON configuration for the class
        // {
        // "Parser": "RegExp",
        // "BaseAddress": "http://beyondtheimpossible.org/comic/1-before-the-beginning-2/",
        // "NextButtonSelector": "(?:href=\"(\\S+)\")? class=\"comic-nav-base comic-nav-next\">",
        // "ChapterTitleSelector": "class=\"post-title\">([^<]*)<",
        // "ChapterContentSelector": "<div class=\"entry\">((?:.|\n)*)<div class=\"post-extras\">",
        // "Author": "Ffurla",
        // "Date": "2016-03-18T13:24:36.2855417+01:00",
        // "Title": "Beyond the Impossible",
        // "Description": null
        // },

        /// <summary>
        /// Downloads the comic page by page, extracting the next-page link, the chapter
        /// title and the chapter content with the regular expressions configured on
        /// <paramref name="entry"/>, and adds each page to <paramref name="ebook"/>.
        /// </summary>
        /// <param name="entry">Comic configuration (selectors, base address, content type).</param>
        /// <param name="ebook">Book the scraped pages are added to.</param>
        /// <param name="nextPageUrl">
        /// URL of the first page to fetch; when null, <c>entry.BaseAddress</c> is used.
        /// </param>
        /// <exception cref="NotSupportedException">
        /// The entry's content type is not <c>Text</c>.
        /// </exception>
        /// <exception cref="WebException">
        /// A download failed without any HTTP response (e.g. DNS failure or timeout).
        /// </exception>
        protected override void ScrapeWebPage(WebComicEntry entry, EPubDocument ebook, string nextPageUrl = null)
        {
            if (entry.Content != WebComicEntry.ContentType.Text)
            {
                throw new NotSupportedException(
                          $"This parser does not support the following content type {Enum.GetName(typeof(WebComicEntry.ContentType), entry.Content)}");
            }

            do
            {
                string title      = string.Empty;
                string content    = string.Empty;
                var    currentUrl = nextPageUrl ?? entry.BaseAddress;
                try
                {
                    using (var wc = new WebClient()
                    {
                        Encoding = Encoding.UTF8
                    })
                    {
                        // The whole page is HTML-decoded before any selector is applied.
                        var html = WebUtility.HtmlDecode(wc.DownloadString(currentUrl));

                        // Capture group 1 of each selector carries the value of interest.
                        var nextMatch = Regex.Match(html, entry.NextButtonSelector);
                        nextPageUrl = nextMatch.Groups[1].Success
                            ? nextMatch.Groups[1].Value
                            : string.Empty; // no link found: this is the last page

                        var titleMatch = Regex.Match(html, entry.ChapterTitleSelector);
                        if (titleMatch.Groups[1].Success)
                        {
                            title   = titleMatch.Groups[1].Value;
                            content = $"<h1>{title}</h1>";
                        }

                        var contentMatch = Regex.Match(html, entry.ChapterContentSelector);
                        if (contentMatch.Groups[1].Success)
                        {
                            content += contentMatch.Groups[1].Value;
                        }

                        this.AddPage(ebook, content, title, currentUrl);
                    }
                }
                catch (WebException ex)
                {
                    // ex.Response is null when no response was received at all; the previous
                    // direct cast then failed with a NullReferenceException that hid the
                    // actual network error. Surface the original exception instead.
                    var response = ex.Response as HttpWebResponse;
                    if (response == null)
                    {
                        throw;
                    }

                    if (response.StatusCode == HttpStatusCode.NotFound)
                    {
                        return;
                    }

                    // NOTE(review): any other HTTP error falls through and the loop fetches
                    // the same URL again with no delay or retry limit - confirm this is intended.
                }
            }while (!nextPageUrl.IsEmpty());
        }
Example #2
0
        /// <summary>
        /// Queues an additional message for the given entry, creating the entry's
        /// message queue on first use.
        /// </summary>
        /// <param name="entry">Entry the message belongs to.</param>
        /// <param name="content">Message text to enqueue.</param>
        public static void AddAdditionalMessageDisplay(WebComicEntry entry, string content)
        {
            // The dictionary itself is used as the lock object for the whole update.
            lock (AdditionalMessageDisplay)
            {
                Queue <string> pending;
                if (!AdditionalMessageDisplay.TryGetValue(entry, out pending))
                {
                    pending = new Queue <string>();
                    AdditionalMessageDisplay[entry] = pending;
                }

                pending.Enqueue(content);
            }
        }
Example #3
0
        /// <summary>
        /// Appends the XHTML for the node currently selected by <paramref name="iter"/> to
        /// <paramref name="content"/>. Nodes whose name is listed in <c>entry.ImageTags</c>
        /// are downloaded and embedded as images; every other node is emitted as a text
        /// element wrapping the node's value.
        /// </summary>
        /// <param name="ebook">Book receiving the image data.</param>
        /// <param name="iter">Iterator positioned on the node to process.</param>
        /// <param name="wc">Web client passed to <c>DownloadImage</c>.</param>
        /// <param name="imagesDir">Directory images are saved to when a progress folder is configured.</param>
        /// <param name="p">Page whose <c>ImagesPath</c> list records saved image files.</param>
        /// <param name="content">XHTML accumulated so far for the page.</param>
        /// <param name="entry">Comic configuration (image tags and their source attributes).</param>
        /// <returns><paramref name="content"/> with the markup for the current node appended.</returns>
        private string HandleMixedContent(
            Document ebook,
            XPathNodeIterator iter,
            WebClient wc,
            string imagesDir,
            Page p,
            string content,
            WebComicEntry entry)
        {
            if (entry.ImageTags.Contains(iter.Current.Name))
            {
                // If the image link is in the value of the tag instead of an attribute, a dot should be used to indicate that
                var sourceAttribute = entry.ImageSourceAttributes[iter.Current.Name];
                string url = sourceAttribute == "."
                    ? iter.Current.Value
                    : iter.Current.GetAttribute(sourceAttribute, string.Empty);

                var imageName = $"image{this.ImageCounter}.png";

                using (MemoryStream memImg = new MemoryStream())
                {
                    this.DownloadImage(wc, memImg, url);

                    // ToArray() copies exactly the bytes written to the stream. GetBuffer()
                    // exposes the whole backing array, whose unused capacity would end up
                    // as trailing garbage in the stored image.
                    byte[] imageBytes = memImg.ToArray();
                    ebook.AddImageData(imageName, imageBytes);

                    if (!Settings.Instance.CommandLineOptions.SaveProgressFolder.IsEmpty())
                    {
                        Unless(Directory.Exists(imagesDir), () => Directory.CreateDirectory(imagesDir));
                        var imagePath = Path.Combine(imagesDir, imageName);
                        File.WriteAllBytes(imagePath, imageBytes);
                        p.ImagesPath.Add(imagePath);
                    }
                }

                // Image processing
                content += $"<img src=\"{imageName}\" alt=\"\"/>";
                this.ImageCounter++;
            }
            else
            {
                // Text processing
                content += $"<{iter.Current.Name}>{iter.Current.Value}</{iter.Current.Name}>";
            }

            return content;
        }
        /// <summary>
        /// Entry point for scraping one comic: resolves the output name and working
        /// directory, restores previously saved pages from <c>Pages.json</c> when present,
        /// then scrapes the comic and generates the book unless it already exists and no
        /// redownload was requested.
        /// </summary>
        /// <param name="entry">Comic configuration to scrape.</param>
        public void StartScraping(WebComicEntry entry)
        {
            this._entry = entry;
            var    ebook = new Document();
            bool   existing;
            string nextPageUrl = null;

            // Strip characters listed in InvalidChars from the title.
            this._sanitizedName = new string(entry.Title.Where(c => !InvalidChars.Contains(c)).ToArray());

            string outputName = this.DetectBestName(this._sanitizedName, out existing);

            this._workingDirPath = Path.Combine(Settings.Instance.CommandLineOptions.SaveProgressFolder, this._sanitizedName);

            if (!Settings.Instance.CommandLineOptions.SaveProgressFolder.IsEmpty() &&
                !Directory.Exists(this._workingDirPath))
            {
                Directory.CreateDirectory(this._workingDirPath);
            }

            var pagesJsonPath = Path.Combine(this._workingDirPath, "Pages.json");
            if (File.Exists(pagesJsonPath))
            {
                var savedPages = JsonConvert.DeserializeObject <List <Page> >(File.ReadAllText(pagesJsonPath));

                // DeserializeObject yields null for an empty file or a literal "null";
                // treat that the same as "no saved progress" instead of crashing.
                if (savedPages != null && savedPages.Any())
                {
                    this.Pages  = savedPages;
                    nextPageUrl = this.RecoverProgress(ebook);
                }
            }

            if (!existing || Settings.Instance.CommandLineOptions.Redownload)
            {
                ebook.AddStylesheetData("style.css", Resources.style);
                this.SetMetadata(ebook);
                this.ScrapeWebPage(this._entry, ebook, nextPageUrl);
                ebook.Generate(outputName);
                ConsoleDisplay.MainMessage(entry, "Finished Compiling book");
            }
            else
            {
                ConsoleDisplay.MainMessage(entry, "Book already compiled");
            }
        }
 /// <summary>
 /// Parser-specific scraping routine; each concrete parser overrides this to fetch the
 /// comic's pages and add them to <paramref name="ebook"/>.
 /// </summary>
 /// <param name="entry">Comic configuration to scrape.</param>
 /// <param name="ebook">Book the scraped pages are added to.</param>
 /// <param name="nextPageUrl">
 /// URL to start from, or null when there is no recovered progress. Presumably
 /// implementations fall back to the entry's base address in that case - verify
 /// against each override.
 /// </param>
 protected abstract void ScrapeWebPage(WebComicEntry entry, Document ebook, string nextPageUrl = null);
Example #6
0
        // Valid JSON configuration for this class
        // {
        // "Parser": "XPath",
        // "BaseAddress": "http://beyondtheimpossible.org/comic/1-before-the-beginning-2/",
        // "NextButtonSelector": "//@href[@class='comic-nav-base comic-nav-next']",
        // "ChapterTitleSelector": "//*[@class='post-title']",
        // "ChapterContentSelector": "//*[@class='entry']",
        // "Author": "Ffurla",
        // "Date": "2016-03-18T13:24:36.2855417+01:00",
        // "Title": "Beyond the Impossible",
        // "Description": null
        // }
        /// <summary>
        /// Downloads the comic page by page, locating the chapter title, the chapter
        /// content and the next-page link with the XPath selectors configured on
        /// <paramref name="entry"/>, and adds each page to <paramref name="ebook"/>
        /// according to the entry's content type (text, image or mixed).
        /// </summary>
        /// <param name="entry">Comic configuration (selectors, base address, content type).</param>
        /// <param name="ebook">Book the scraped pages are added to.</param>
        /// <param name="nextPageUrl">
        /// URL of the first page to fetch; when null, <c>entry.BaseAddress</c> is used.
        /// </param>
        /// <exception cref="WebException">
        /// A download failed without any HTTP response (e.g. DNS failure or timeout).
        /// </exception>
        protected override void ScrapeWebPage(WebComicEntry entry, Document ebook, string nextPageUrl = null)
        {
            // http://htmlagilitypack.codeplex.com/wikipage?title=Examples
            do
            {
                string content = string.Empty;
                string title;
                var    currentUrl = nextPageUrl ?? entry.BaseAddress;
                try
                {
                    using (var wc = new WebClient())
                    using (var ms = new MemoryStream(wc.DownloadData(currentUrl)))
                    {
                        HtmlDocument hDoc = new HtmlDocument();
                        hDoc.Load(ms, true);
                        XPathNavigator xNav = hDoc.CreateNavigator();

                        // A selector that cannot be evaluated is handled like a missing title.
                        XPathNavigator titleNode;
                        try
                        {
                            titleNode = xNav.SelectSingleNode(entry.ChapterTitleSelector);
                        }
                        catch (XPathException)
                        {
                            titleNode = null;
                        }

                        if (titleNode != null)
                        {
                            title = titleNode.Value;
                        }
                        else if (entry.IgnoreMissingChapterName)
                        {
                            title = null;
                        }
                        else
                        {
                            ConsoleDisplay.AddAdditionalMessageDisplay(
                                entry,
                                $"Title not found for page {this.PageCounter}, replacing with default value");
                            title = WebUtility.HtmlEncode($"Chapter - {this.PageCounter}");
                        }

                        XPathNodeIterator xIter = xNav.Select(entry.ChapterContentSelector);

                        if (entry.Content == WebComicEntry.ContentType.Text)
                        {
                            content += $"<h1>{title}</h1>";

                            while (xIter.MoveNext())
                            {
                                content += $"<{xIter.Current.Name}>{xIter.Current.Value}</{xIter.Current.Name}>";
                            }

                            this.AddPage(ebook, content, title, currentUrl, entry.IgnoreMissingChapterName);
                        }
                        else if (entry.Content == WebComicEntry.ContentType.Image)
                        {
                            while (xIter.MoveNext())
                            {
                                this.AddImage(ebook, wc, xIter.Current.Value, currentUrl);
                            }
                        }
                        else if (entry.Content == WebComicEntry.ContentType.Mixed)
                        {
                            while (xIter.MoveNext())
                            {
                                var subIter = xIter.Current.SelectChildren(XPathNodeType.Element);
                                this.AddCompositePage(ebook, subIter, title, wc, currentUrl, entry);
                            }
                        }

                        var tempNextPageUrl = xNav.SelectSingleNode(entry.NextButtonSelector)?.Value;
                        if (tempNextPageUrl == null)
                        {
                            // No next-page link: the end of the book.
                            return;
                        }

                        // An absolute link is followed as-is; anything else is substituted
                        // into the entry's address pattern.
                        Uri parsedLink;
                        nextPageUrl = Uri.TryCreate(tempNextPageUrl, UriKind.Absolute, out parsedLink)
                            ? tempNextPageUrl
                            : string.Format(entry.AddressPattern, tempNextPageUrl);
                    }
                }
                catch (WebException ex)
                {
                    // ex.Response is null when no response was received at all; the previous
                    // direct cast then failed with a NullReferenceException that hid the
                    // actual network error. Surface the original exception instead.
                    var response = ex.Response as HttpWebResponse;
                    if (response == null)
                    {
                        throw;
                    }

                    if (response.StatusCode == HttpStatusCode.NotFound)
                    {
                        return;
                    }

                    // NOTE(review): any other HTTP error falls through and the loop fetches
                    // the same URL again with no delay or retry limit - confirm this is intended.
                }
            }while (!nextPageUrl.IsEmpty());
        }
Example #7
0
        /// <summary>
        /// Builds one mixed (text and image) page from the nodes yielded by
        /// <paramref name="iter"/>, adds it to <paramref name="ebook"/> with a navigation
        /// point and, when a progress folder is configured, saves the page and the updated
        /// page list to disk.
        /// </summary>
        /// <param name="ebook">Book the page is added to.</param>
        /// <param name="iter">Iterator over the candidate nodes of the page.</param>
        /// <param name="title">Page title; substituted into the template as given and HTML-decoded for the page record and navigation point.</param>
        /// <param name="wc">Web client forwarded to <c>HandleMixedContent</c>.</param>
        /// <param name="currentUrl">URL the page was scraped from.</param>
        /// <param name="entry">Comic configuration (included tags, interrupt tag, title).</param>
        protected void AddCompositePage(
            Document ebook,
            XPathNodeIterator iter,
            string title,
            WebClient wc,
            string currentUrl,
            WebComicEntry entry)
        {
            var page    = this.PageTemplate.Replace("%%TITLE%%", title);
            var content = string.Empty;

            title = WebUtility.HtmlDecode(title);

            var pagesDir  = Path.Combine(this.WorkingDirPath, entry.Title, "Pages");
            var imagesDir = Path.Combine(this.WorkingDirPath, entry.Title, "Images");
            var p         = new Page()
            {
                Title      = title,
                Order      = this.PageCounter,
                Type       = WebComicEntry.ContentType.Mixed,
                PageUrl    = currentUrl,
                ImagesPath = new List <string>()
            };

            // XPath union matching any included tag below a node. It does not depend on
            // the node, so it is built once, on first use, rather than per iteration.
            string includedDescendantsExpr = null;

            while (iter.MoveNext())
            {
                if (entry.IncludeTags.Contains(iter.Current.Name))
                {
                    content = this.HandleMixedContent(ebook, iter, wc, imagesDir, p, content, entry);
                }
                else if (entry.InteruptAtTag != iter.Current.Name)
                {
                    if (includedDescendantsExpr == null)
                    {
                        var exprBuilder = new StringBuilder();
                        foreach (var tag in entry.IncludeTags)
                        {
                            exprBuilder.Append($".//{tag}|");
                        }

                        includedDescendantsExpr = exprBuilder.ToString().TrimEnd('|');
                    }

                    // The node itself is not included: collect included tags nested inside it.
                    var subIter = iter.Current.Select(includedDescendantsExpr);

                    while (subIter.MoveNext())
                    {
                        content = this.HandleMixedContent(ebook, subIter, wc, imagesDir, p, content, entry);
                    }
                }

                // Stop at the configured interrupt tag
                if (entry.InteruptAtTag == iter.Current.Name)
                {
                    break;
                }
            }

            page = page.Replace("%%CONTENT%%", content);

            string pageName = $"page{this.PageCounter}.xhtml";

            if (!Settings.Instance.CommandLineOptions.SaveProgressFolder.IsEmpty())
            {
                Unless(Directory.Exists(pagesDir), () => Directory.CreateDirectory(pagesDir));
                var pagePath = Path.Combine(pagesDir, pageName);
                p.Path = pagePath;
                File.WriteAllText(pagePath, page);

                this.Pages.Add(p);

                // Rewrite the page list after every page.
                File.WriteAllText(
                    Path.Combine(this.WorkingDirPath, "Pages.json"),
                    JsonConvert.SerializeObject(this.Pages));
            }

            ebook.AddXhtmlData(pageName, page);
            ebook.AddNavPoint(title.IsEmpty() ? $"Page {this.PageCounter}" : title, pageName, this.NavCounter++);
            ConsoleDisplay.MainMessage(entry, $"Completed Page {this.PageCounter}");
            this.PageCounter++;
        }
Example #8
0
 /// <summary>
 /// Sets the main status message for the given entry, replacing any previous one.
 /// </summary>
 /// <param name="entry">Entry the message belongs to.</param>
 /// <param name="content">Message text to store.</param>
 public static void MainMessage(WebComicEntry entry, string content) =>
     MainMessageDisplay[entry] = content;