/// <summary>
/// Starts an asynchronous download of <paramref name="downloadPage"/> into
/// <paramref name="filePath"/> and blocks the caller until the transfer finishes
/// or ten seconds have elapsed, whichever comes first.
/// </summary>
/// <param name="downloadPage">Absolute URL of the resource to download.</param>
/// <param name="filePath">Local path the downloaded content is written to.</param>
/// <param name="book">Book whose <c>fileType</c> is appended to the progress label text.</param>
/// <remarks>
/// Failures are written to <see cref="System.Diagnostics.Trace"/> and are never thrown to
/// the caller. If the ten-second cap is reached the method returns while the transfer
/// may still be running.
/// NOTE(review): if this is called on a UI thread the completion callback may not be
/// delivered while that thread is blocked, in which case the wait always runs to the
/// cap - confirm the calling thread.
/// </remarks>
public void Download(string downloadPage, string filePath, EBook book)
{
    // Upper bound on how long the caller is held up.
    const int maxWaitMilliseconds = 10000;

    WebClient wc = null;
    bool started = false;

    try
    {
        // Intentionally not disposed: the completion callback can still fire after the
        // wait below has timed out, and it must be able to call Set() safely.
        ManualResetEventSlim finished = new ManualResetEventSlim(false);

        wc = new WebClient();
        wc.Headers.Add("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0)");

        wc.DownloadProgressChanged += (sender, args) =>
        {
            lblDownloadPercent.Text = "Downloaded " + args.BytesReceived + "b of " + args.TotalBytesToReceive + "b " + book.fileType;
        };

        wc.DownloadFileCompleted += (sender, args) =>
        {
            try
            {
                if (args.Cancelled)
                {
                    System.Diagnostics.Trace.TraceError("Download of {0} was cancelled.", downloadPage);
                }
                else
                {
                    if (args.Error != null)
                    {
                        System.Diagnostics.Trace.TraceError("Download of {0} failed: {1}", downloadPage, args.Error);
                    }

                    // The flag is set for every non-cancelled completion, as before.
                    _completed = true;
                }
            }
            finally
            {
                // The transfer is over, so the client can be released and the waiter woken.
                wc.Dispose();
                finished.Set();
            }
        };

        wc.DownloadFileAsync(new Uri(downloadPage), filePath);
        started = true;

        // Return as soon as the transfer completes instead of always sleeping the full cap.
        finished.Wait(maxWaitMilliseconds);
    }
    catch (Exception e)
    {
        System.Diagnostics.Trace.TraceError("Download of {0} could not be started: {1}", downloadPage, e);

        // If the transfer never started, the completion callback will not run to dispose the client.
        if (!started && wc != null)
        {
            wc.Dispose();
        }
    }
}
/// <summary>
/// Requests the page at <paramref name="hostSiteSearchPage"/> followed by
/// <paramref name="target"/> and returns the response body as text.
/// </summary>
/// <param name="hostSiteSearchPage">Leading part of the URL (for example a search page ending in a query prefix).</param>
/// <param name="target">Text appended verbatim to <paramref name="hostSiteSearchPage"/>; no escaping is applied here.</param>
/// <returns>The full response body read from the server.</returns>
public string ProcessWebRequest(string hostSiteSearchPage, string target)
{
    Uri websiteURI = new Uri(hostSiteSearchPage + target);

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(websiteURI);
    request.UserAgent = "A .NET Web Crawler";

    // Stacked usings release the reader, the stream and the response even if reading throws.
    using (WebResponse response = request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(stream))
    {
        return reader.ReadToEnd();
    }
}
/// <summary>
/// Main workflow: reads the title, author and download folder from the form, searches
/// the host site, downloads the book file (plus a cover image for .pdf/.epub) and hands
/// the book and the Kindle e-mail address to <c>EmailHandler.SendEmail</c>.
/// </summary>
/// <remarks>
/// Failures inside the workflow are written to the trace output and appended to the log
/// text box; they are not rethrown. When no download page or download link is found,
/// "No link results" is logged and nothing is downloaded or e-mailed.
/// TODO: init
/// </remarks>
public void DownloadBook()
{
    const string hostSiteSearchPage = "http://libgen.io/search.php?req=";

    EBook book = new EBook();
    WebRequestHandler wrh = new WebRequestHandler();
    HTMLScraper scraper = new HTMLScraper();
    EmailHandler emailHandler = new EmailHandler();

    try
    {
        tbLog.Text += "Initializing";
        lblDownloadPercent.Text = "Downloading";

        book.bookTitle = tbBookName.Text;
        book.author = tbAuthor.Text;

        string downloadFolder = tbDownloadFolder.Text;
        Directory.CreateDirectory(downloadFolder);

        // Collapse every run of whitespace into '+' so the text can be used as the search query.
        book.uriBookTitle = Regex.Replace(book.bookTitle + " - " + book.author, @"\s+", "+");

        // e.g. C:\users\Alex\Downloads\Outliers - Malcolm Gladwell
        // NOTE(review): book.fileType is read here before GetBookLinks has run and is
        // appended again when fileLocation is built below - confirm EBook's initial
        // fileType is empty, otherwise the extension ends up doubled.
        string filePath = downloadFolder + "\\" + book.bookTitle + " - " + book.author + book.fileType;
        book.coverImageLocation = downloadFolder + "\\" + book.bookTitle + " Cover";

        // HTML of the initial search results page.
        string htmlData = wrh.ProcessWebRequest(hostSiteSearchPage, book.uriBookTitle);

        // Presumably sets book.fileType as a side effect (per the original note); returns the download page URL.
        string downloadPage = scraper.GetBookLinks(htmlData, book);
        book.fileLocation = filePath + book.fileType;

        if (string.IsNullOrEmpty(downloadPage))
        {
            tbLog.Text += "No link results";
            return;
        }

        htmlData = wrh.ProcessWebRequest(downloadPage, "");
        string downloadLink = scraper.GetDownloadLink(htmlData);

        // Treat null like empty so a missing link never reaches the download or e-mail steps.
        if (string.IsNullOrEmpty(downloadLink))
        {
            tbLog.Text += "No link results";
            return;
        }

        lblDownloadPercent.Text = "Downloading " + book.fileType;

        // Download the EBook
        Download(downloadLink, book.fileLocation, book);

        if (book.fileType == ".pdf" || book.fileType == ".epub")
        {
            string coverImageHref = scraper.GetCoverImage(book);

            // Only attempt the cover download when a cover URL was actually found.
            if (!string.IsNullOrEmpty(coverImageHref))
            {
                Download(coverImageHref, book.coverImageLocation, book);
            }
        }

        emailHandler.SendEmail(book, tbKindleEmail.Text);
    }
    catch (Exception ex)
    {
        // Surface the failure instead of silently discarding it.
        System.Diagnostics.Trace.TraceError("DownloadBook failed: {0}", ex);
        tbLog.Text += Environment.NewLine + "Download failed: " + ex.Message;
    }
}
/// <summary>
/// Passes <paramref name="targetURL"/> to <c>HTMLScraper.GetArticleContent</c> and returns
/// the resulting string as a JSON response; GET requests are allowed.
/// </summary>
/// <param name="targetURL">Value forwarded unchanged to the scraper.</param>
/// <returns>A JSON result wrapping the string returned by the scraper.</returns>
public ActionResult Index(string targetURL)
{
    string scrapedContent = HTMLScraper.GetArticleContent(targetURL);

    ActionResult payload = Json(scrapedContent, JsonRequestBehavior.AllowGet);
    return payload;
}
/// <summary>
/// Runs a Google search for the submitted terms, asks <c>HTMLScraper.GetResults</c> where
/// the submitted URL appears in the returned HTML, stores the summary on the model and
/// redirects to the <c>SearchResults</c> action of the <c>Home</c> controller.
/// </summary>
/// <param name="formData">Submitted form values; its <c>SearchResult</c> property is overwritten with the summary text.</param>
/// <returns>A redirect to <c>Home/SearchResults</c> carrying <paramref name="formData"/> as route values.</returns>
public ActionResult SubmitScrape(SearchResultViewModel formData)
{
    // The terms come straight from the form, so they must be escaped before being placed
    // in the query string; otherwise spaces, '&' or '#' would truncate the query or
    // inject extra parameters. A null value becomes an empty query, as it did before.
    string escapedTerms = System.Uri.EscapeDataString(formData.SearchTerms ?? string.Empty);

    string urlAddress = string.Format(
        "https://www.google.co.uk/search?num={0}&q={1}",
        formData.SearchResultAmount,
        escapedTerms);

    string rawHTML = _webService.GetHTML(urlAddress);

    formData.SearchResult = string.Format(
        "The URL appeared in the search results in position(s): {0}",
        HTMLScraper.GetResults(rawHTML, formData.SearchURL));

    return RedirectToAction("SearchResults", "Home", formData);
}