/// <summary>
/// Requests every category URL listed in <paramref name="options"/> and, for each one that
/// answers with HTTP 200, extracts its article links and passes them to <c>ProcessAllPages</c>.
/// </summary>
/// <param name="options">
/// Provides <c>UrlsAndPatterns</c> (category URL mapped to the configuration source handed to
/// <c>SerializationHelper.DeserializeFrom</c>) and the <c>TargetFolder</c> forwarded to <c>ProcessAllPages</c>.
/// </param>
public void Download(DownloadServiceOption options)
{
    var requestWorker = new WebRequestWorker();

    foreach (var entry in options.UrlsAndPatterns)
    {
        var categoryUrl = entry.Key;
        var patternConfig = entry.Value;
        Console.WriteLine("Start processing " + categoryUrl);

        var response = requestWorker.DownloadResponse(new CrawlingOption(categoryUrl));
        if (response.StatusCode != HttpStatusCode.OK)
        {
            // Report the failure and move on to the next category.
            Console.WriteLine(response.StatusDescription);
            continue;
        }

        // Both patterns come from the same deserialized configuration list.
        var settings = SerializationHelper.DeserializeFrom<List<DataItem>>(patternConfig);
        var pagePattern = settings.Get(Constants.PagePattern).Value;
        var filePattern = settings.Get(Constants.FilePattern).Value;

        var categoryHtml = response.ReadAsText();
        var articleLinks = GetArticleLinks(categoryHtml, pagePattern);
        ProcessAllPages(articleLinks, filePattern, options.TargetFolder);
    }
}
/// <summary>
/// Walks the URL/configuration pairs in <paramref name="options"/>; each URL is downloaded and,
/// when the status is OK, its article links are collected and processed via <c>ProcessAllPages</c>.
/// Any other status is reported through its status description on the console.
/// </summary>
/// <param name="options">Carries the URL-to-configuration map and the target folder.</param>
public void Download(DownloadServiceOption options)
{
    var crawler = new WebRequestWorker();

    foreach (var pair in options.UrlsAndPatterns)
    {
        Console.WriteLine("Start processing " + pair.Key);
        var listing = crawler.DownloadResponse(new CrawlingOption(pair.Key));

        if (listing.StatusCode != HttpStatusCode.OK)
        {
            Console.WriteLine(listing.StatusDescription);
        }
        else
        {
            var items = SerializationHelper.DeserializeFrom<List<DataItem>>(pair.Value);

            // Pattern used to find article pages on the listing.
            var articlePattern = items.Get(Constants.PagePattern).Value;
            // Pattern forwarded to ProcessAllPages for the files on each page.
            var attachmentPattern = items.Get(Constants.FilePattern).Value;

            ProcessAllPages(
                GetArticleLinks(listing.ReadAsText(), articlePattern),
                attachmentPattern,
                options.TargetFolder);
        }
    }
}
/// <summary>
/// Calls <c>DhtnDownloadService.Download</c> with a hard-coded URL, pattern and target folder.
/// There are no assertions, so this only fails when the call throws.
/// </summary>
public void CanDownloadFiles()
{
    var urlsAndPatterns = new Dictionary<string, string>();
    urlsAndPatterns.Add("http://tuyensinh.tnu.edu.vn/article/details/363", @"/article/Download/\d+");

    var options = new DownloadServiceOption();
    // NOTE(review): verbatim string with doubled backslashes, so the path really contains "\\" — confirm this is intended.
    options.TargetFolder = @"D:\\Wip\\Practices\\OpenSource\\ts2015\\DHTN";
    options.UrlsAndPatterns = urlsAndPatterns;

    var downloader = new DhtnDownloadService();
    downloader.Download(options);
}
/// <summary>
/// Builds a <c>DhxdDownloadService</c> from hard-coded constructor arguments and runs
/// <c>Download</c> for one URL. Nothing is asserted; the test passes as long as no exception escapes.
/// </summary>
public void CanDownloadFiles()
{
    var downloader = new DhxdDownloadService(
        "*****@*****.**",
        @"D:\Wip\Practices\ts2015\Docs\GoogleServiceAccount\tuyensinhquocgia-f36c1d0e7788.p12",
        "TSQG");

    // One page URL mapped to the pattern for the links of interest on that page.
    var urlsAndPatterns = new Dictionary<string, string>();
    urlsAndPatterns.Add(
        "http://tuyensinh.nuce.edu.vn/tin-tuc/thong-bao-diem-chuan-va-danh-sach-trung-tuyen-dai-hoc-he-chinh-quy",
        @".*\/(.*?.pdf)$|google.com");

    var options = new DownloadServiceOption();
    options.TargetFolder = @"E:\Projects\github\ts2015\DHXD-HN";
    options.UrlsAndPatterns = urlsAndPatterns;

    downloader.Download(options);
}
/// <summary>
/// Smoke-runs <c>DhxdDownloadService.Download</c> with a fixed URL, link pattern and target folder.
/// The method contains no assertions.
/// </summary>
public void CanDownloadFiles()
{
    var pageUrl = "http://tuyensinh.nuce.edu.vn/tin-tuc/thong-bao-diem-chuan-va-danh-sach-trung-tuyen-dai-hoc-he-chinh-quy";
    var linkPattern = @".*\/(.*?.pdf)$|google.com";

    var service = new DhxdDownloadService(
        "*****@*****.**",
        @"D:\Wip\Practices\ts2015\Docs\GoogleServiceAccount\tuyensinhquocgia-f36c1d0e7788.p12",
        "TSQG");

    var request = new DownloadServiceOption
    {
        TargetFolder = @"E:\Projects\github\ts2015\DHXD-HN",
        UrlsAndPatterns = new Dictionary<string, string> { { pageUrl, linkPattern } }
    };

    service.Download(request);
}
/// <summary>
/// Runs a <c>DhdlPageDownloadService</c> download for every page link, one page per call.
/// </summary>
/// <param name="pages">Page links to process.</param>
/// <param name="filePattern">Pattern paired with each page link in the per-page options.</param>
/// <param name="targetFolder">Folder passed through as <c>TargetFolder</c> of the per-page options.</param>
protected override void ProcessAllPages(IEnumerable<string> pages, string filePattern, string targetFolder)
{
    // A single service instance is shared by all pages.
    var downloader = new DhdlPageDownloadService();

    foreach (var pageUrl in pages)
    {
        var singlePage = new Dictionary<string, string>();
        singlePage.Add(pageUrl, filePattern);

        var pageOptions = new DownloadServiceOption
        {
            TargetFolder = targetFolder,
            UrlsAndPatterns = singlePage
        };

        downloader.Download(pageOptions);
    }
}
/// <summary>
/// Hands each page link to an <c>HvtcPageDownloadService</c>, wrapping it in its own
/// <c>DownloadServiceOption</c> together with the file pattern and target folder.
/// </summary>
/// <param name="pages">Links of the pages to download from.</param>
/// <param name="filePattern">Pattern stored as the value for each page link.</param>
/// <param name="targetFolder">Destination folder set on every per-page option object.</param>
protected override void ProcessAllPages(IEnumerable<string> pages, string filePattern, string targetFolder)
{
    var pageService = new HvtcPageDownloadService();

    foreach (var link in pages)
    {
        // Each call receives a map containing exactly one link -> pattern entry.
        var linkToPattern = new Dictionary<string, string> { { link, filePattern } };

        var perPageOptions = new DownloadServiceOption();
        perPageOptions.TargetFolder = targetFolder;
        perPageOptions.UrlsAndPatterns = linkToPattern;

        pageService.Download(perPageOptions);
    }
}
/// <summary>
/// Downloads one file link and writes it into <c>options.TargetFolder</c>, skipping the link
/// when a file with the resolved name already exists there.
/// </summary>
/// <param name="options">Supplies the target folder and the URL-to-pattern map.</param>
/// <param name="file">Link of the file to download.</param>
/// <param name="kvp">Entry whose key is matched against <c>options.UrlsAndPatterns</c> to find the pattern for this link.</param>
/// <remarks>
/// Any exception is written to the console rather than rethrown, so a failing link does not
/// propagate to the caller.
/// </remarks>
private void ProcessDownload(DownloadServiceOption options, string file, KeyValuePair<string, IEnumerable<string>> kvp)
{
    try
    {
        var pattern = options.UrlsAndPatterns.First(x => x.Key == kvp.Key).Value;
        var fileName = GetFileName(file, pattern);

        // Cheap pre-check: when the name is already known, avoid the download entirely.
        if (!string.IsNullOrWhiteSpace(fileName) && File.Exists(Path.Combine(options.TargetFolder, fileName)))
        {
            Console.WriteLine(" -> Exists");
            return;
        }

        var fileResult = _worker.DownloadResponse(new CrawlingOption(file));

        // Same status handling as Download: never persist the body of a failed response.
        if (fileResult.StatusCode != HttpStatusCode.OK)
        {
            Console.WriteLine(fileResult.StatusDescription);
            return;
        }

        // Fall back to the final response URI when the link itself did not yield a name.
        if (string.IsNullOrWhiteSpace(fileName) && !string.IsNullOrWhiteSpace(fileResult.ResponseUri))
        {
            fileName = Path.GetFileName(fileResult.ResponseUri);
        }

        // Without a name there is nothing sensible to write; report it instead of letting
        // Path.Combine / File.WriteAllBytes fail with an unrelated-looking exception.
        if (string.IsNullOrWhiteSpace(fileName))
        {
            Console.WriteLine(file + " -> Skipped (cannot determine file name)");
            return;
        }

        var filePath = Path.Combine(options.TargetFolder, fileName);
        if (File.Exists(filePath))
        {
            Console.WriteLine(" -> Exists");
            return;
        }

        // CreateDirectory does nothing when the directory already exists, so no Exists check is needed.
        Directory.CreateDirectory(options.TargetFolder);
        File.WriteAllBytes(filePath, fileResult.Content);
        Console.WriteLine(fileName + " -> Done");
    }
    catch (Exception ex)
    {
        // Deliberate best-effort: log and let the caller continue.
        Console.WriteLine(ex.ToString());
    }
}
/// <summary>
/// Downloads every page listed in <paramref name="options"/>, collects the file links found on
/// each page that answered with HTTP 200, and then processes the links one by one while
/// printing a running "current/total" counter.
/// </summary>
/// <param name="options">
/// Provides <c>UrlsAndPatterns</c> (page URL mapped to the file-link pattern) and is forwarded
/// unchanged to <c>ProcessDownload</c>.
/// </param>
public void Download(DownloadServiceOption options)
{
    var fileLinks = new Dictionary<string, IEnumerable<string>>();

    foreach (var urlAndPattern in options.UrlsAndPatterns)
    {
        var uri = urlAndPattern.Key;
        var filePattern = urlAndPattern.Value;
        Console.WriteLine("Start getting " + uri);

        var articleResult = _worker.DownloadResponse(new CrawlingOption(uri));
        if (articleResult.StatusCode == HttpStatusCode.OK)
        {
            // Materialize once: the links are enumerated twice below (for the total and for
            // the download loop), which would re-run a deferred query each time.
            fileLinks.Add(uri, GetFileLinks(articleResult.ReadAsText(), filePattern).ToList());
        }
        else
        {
            Console.WriteLine(articleResult.StatusDescription);
        }
    }

    var count = 0;
    var total = fileLinks.Values.Sum(links => links.Count());

    foreach (var kvp in fileLinks)
    {
        foreach (var file in kvp.Value)
        {
            count += 1;
            Console.WriteLine("Process " + count + "/" + total);
            ProcessDownload(options, file, kvp);
        }
    }
}