/// <summary>
/// Downloads the book page at <c>webPageModel.Url</c>, parses it, and fills the model with
/// the book's metadata, cover image and the list of downloadable files.
/// </summary>
/// <param name="webPageModel">
/// Page model whose <c>Url</c> identifies the page to fetch. It is populated in place.
/// </param>
/// <returns>The same <paramref name="webPageModel"/> instance, populated.</returns>
/// <exception cref="ArgumentNullException"><paramref name="webPageModel"/> is null.</exception>
/// <exception cref="TargetNotFindException">
/// The page contains no usable download link, or one of the required elements
/// (detail list, description, title, cover image) is missing.
/// </exception>
/// <exception cref="FormatException">The year or page-count field is not an integer.</exception>
/// <remarks>
/// Exceptions raised by <c>HttpDownloader.DownloadHtmlPage</c> propagate unchanged.
/// All lookups and parsing happen before the model is touched, so a failure while reading
/// the page does not leave the model half-populated.
/// </remarks>
public async Task<EbookPageModel> FindEbookUrl(EbookPageModel webPageModel)
{
    if (webPageModel == null)
    {
        throw new ArgumentNullException("webPageModel");
    }

    // Index 7 is the highest <dd> position read below, so at least 8 items are required.
    const int requiredTermCount = 8;

    string html = await HttpDownloader.DownloadHtmlPage(webPageModel.Url);

    IBrowsingContext context = BrowsingContext.New();
    IDocument document = await context.OpenAsync(response => response.Content(html));

    // ---- Locate and validate everything first -------------------------------------------

    List<IElement> downloadLinkElements = document.QuerySelectorAll("span.download-links").ToList();
    if (downloadLinkElements.Count == 0)
    {
        throw new TargetNotFindException("无法在HTML中找到下载链接");
    }

    // Keep only the spans that really carry a link; a span without an <a href> cannot be downloaded.
    List<string> downloadUrls = new List<string>();
    foreach (IElement downloadLinkElement in downloadLinkElements)
    {
        IElement anchor = downloadLinkElement.QuerySelector("a");
        string href = anchor == null ? null : anchor.GetAttribute("href");
        if (!string.IsNullOrEmpty(href))
        {
            downloadUrls.Add(href);
        }
    }

    if (downloadUrls.Count == 0)
    {
        throw new TargetNotFindException("无法在HTML中找到下载链接");
    }

    IElement detail = document.QuerySelector("div.book-detail");
    if (detail == null)
    {
        throw new TargetNotFindException("无法在HTML中找到图书详情");
    }

    List<IElement> terms = detail.QuerySelectorAll("dd").ToList();
    if (terms.Count < requiredTermCount)
    {
        throw new TargetNotFindException("图书详情字段不完整");
    }

    IElement descriptionElement = document.QuerySelector("div.entry-content");
    IElement titleElement = document.QuerySelector("h1.single-title");
    if (descriptionElement == null || titleElement == null)
    {
        throw new TargetNotFindException("无法在HTML中找到图书标题或简介");
    }

    IElement headerElement = document.QuerySelector("header.entry-header");
    IElement coverElement = headerElement == null ? null : headerElement.QuerySelector("img");
    if (coverElement == null)
    {
        throw new TargetNotFindException("无法在HTML中找到图书封面图片");
    }

    // Parse the numeric fields before mutating the model; a malformed value throws FormatException.
    int year = Int32.Parse(terms[2].InnerHtml);
    int pages = Int32.Parse(terms[3].InnerHtml);

    // ---- Populate the model -------------------------------------------------------------

    // <dd> positions: 0 authors, 1 ISBN, 2 year, 3 pages, 4 language, 5 file size, 7 categories.
    // Position 6 (file format) is intentionally not read here.
    webPageModel.Author = string.Join(";", terms[0].QuerySelectorAll("a").Select(x => x.InnerHtml));
    webPageModel.ISBN = terms[1].InnerHtml;
    webPageModel.Year = year;
    webPageModel.Pages = pages;
    webPageModel.Language = terms[4].InnerHtml;
    webPageModel.FileSize = terms[5].InnerHtml;
    webPageModel.Category = string.Join(";", terms[7].QuerySelectorAll("a").Select(x => x.InnerHtml));

    webPageModel.Description = descriptionElement.InnerHtml;
    webPageModel.Title = titleElement.InnerHtml;

    // The subtitle is optional: the first <h4> on the page, when there is one.
    IElement subTitleElement = document.QuerySelector("h4");
    if (subTitleElement != null)
    {
        webPageModel.SubTitle = subTitleElement.InnerHtml;
    }

    // One timestamp names the cover image and is the prefix of every file name for this book.
    string fileUniqueName = TimeHelper.GetTimeStamp();
    webPageModel.Image = new EbookImageModel(fileUniqueName, coverElement.GetAttribute("src"));

    // File models are created after the metadata is set because they receive the page model.
    List<EbookFileModel> ebooks = new List<EbookFileModel>();
    for (int i = 0; i < downloadUrls.Count; i++)
    {
        // File names are numbered from 1: "<timestamp>_1", "<timestamp>_2", ...
        string fileName = fileUniqueName + "_" + (i + 1);
        ebooks.Add(new EbookFileModel(webPageModel, downloadUrls[i], fileName));
    }

    webPageModel.EBooks = ebooks;
    return webPageModel;
}
/// <summary>
/// Parses a book page through <c>parser.FindEbookUrl</c>, downloads each of its e-book files
/// and its cover image, and records the outcome on the page model.
/// </summary>
/// <param name="webpage">Page model handed to <c>parser.FindEbookUrl</c>.</param>
/// <remarks>
/// A failed file or image download is written to the debug output and skipped. When no
/// e-book file could be downloaded the method returns before fetching the image or filling
/// the summary fields. Exceptions thrown by <c>parser.FindEbookUrl</c> are not caught here.
/// </remarks>
private static async Task ParseAndDownload(EbookPageModel webpage)
{
    EbookPageModel page = await parser.FindEbookUrl(webpage);

    // Download the e-book files; each failure is logged and the loop moves on to the next file.
    int downloadedCount = 0;
    foreach (EbookFileModel ebook in page.EBooks)
    {
        try
        {
            byte[] content = await HttpDownloader.DownloadBytes(ebook.DownloadUrl);
            FileWriter.WriteToFile(content, ebook.FileSavePath);
            downloadedCount++;
            ebook.IsDownloaded = true;
            DownloadFileCount++;
            Debug.WriteLine("下载文件数量=" + DownloadFileCount);
        }
        catch (TaskCanceledException)
        {
            Debug.WriteLine("下载图书文件超时");
        }
        catch (HttpRequestException ex)
        {
            Debug.WriteLine("访问图书下载连接失败:" + ex.Message);
        }
        catch (Exception ex)
        {
            Debug.WriteLine(ex.Message);
        }
    }

    // Nothing was saved, so there is nothing further to record for this page.
    if (downloadedCount == 0)
    {
        return;
    }

    // Download the cover image.
    try
    {
        byte[] imageContent = await HttpDownloader.DownloadBytes(page.Image.ImageUrl);
        FileWriter.WriteToFile(imageContent, page.Image.ImageSavePath);

        // ImagePath is only assigned when the image was saved; on failure it is left untouched.
        page.ImagePath = page.Image.ImageSavePath;
    }
    catch (TaskCanceledException)
    {
        Debug.WriteLine("下载图片超时");
    }
    catch (HttpRequestException ex)
    {
        Debug.WriteLine("访问图片连接失败:" + ex.Message);
    }
    catch (Exception ex)
    {
        Debug.WriteLine(ex.Message);
    }

    // Fill the summary properties from the files that were actually downloaded.
    EbookFileModel[] downloaded = page.EBooks.Where(x => x.IsDownloaded).ToArray();
    page.FilePaths = string.Join(", ", downloaded.Select(x => x.FileSavePath));
    page.FileFormat = string.Join(", ", downloaded.Select(x => x.FileExtention));
    page.FileCount = downloaded.Length;

    // Write to the database (currently disabled).
    //using (var db=new MyAppDbContext())
    //{
    //    db.Blogs.Add(page);
    //    db.SaveChanges();
    //    RecordCount++;
    //    Debug.WriteLine("写入记录条数=" + RecordCount);
    //}
}