/// <summary>
/// Asynchronously downloads one previously-failed file described by <paramref name="fileModel"/>.
/// On success the entry is removed from <c>concurrentListOfUrls</c>; on failure the stored URL is
/// rewritten with the next candidate file extension so a later retry pass can attempt it again.
/// </summary>
/// <param name="fileModel">Holds the failing URL and its ordering index.</param>
private async Task Task_DownloadFileAsync(InvalidFileModel fileModel)
{
    try
    {
        using (WebClient webClient = new WebClient())
        {
            if (!useCustomNumbering)
            {
                // Keep the file name taken from the link itself.
                await webClient.DownloadFileTaskAsync(
                    new Uri(fileModel.Url),
                    $@"{folderPathToSaveTo}\{TextHelper.GetImageNameFromLink(fileModel.Url)}");
            }
            else
            {
                // Zero-padded sequential name (e.g. 007.jpg) preserves gallery order.
                await webClient.DownloadFileTaskAsync(
                    new Uri(fileModel.Url),
                    $@"{folderPathToSaveTo}\{string.Format("{0:D3}", fileModel.Index)}{TextHelper.GetImageExtensionFromLink(fileModel.Url)}");
            }
            // Download succeeded - drop the entry from the retry dictionary.
            concurrentListOfUrls.TryRemove(fileModel.Index, out string value);
        }
    }
    catch (Exception)
    {
        // Download failed: rewrite the URL with the NEXT candidate extension.
        // Strip the leading '.' first - enum member names cannot contain '.', so
        // Enum.TryParse(".jpg", ...) could never match (the DoWork handler already
        // does this; made consistent here).
        var extension = TextHelper.GetImageExtensionFromLink(fileModel.Url).Replace(".", "");
        Enum.TryParse(extension, out Enums.FileExtensions matchingEnum);
        // Pre-increment: the old post-increment (matchingEnum++) evaluated to the
        // CURRENT value, so the URL was rewritten with the same extension and the
        // retry loop could never make progress.
        concurrentListOfUrls[fileModel.Index] = TextHelper.ChangeFileExtension(fileModel.Url, (++matchingEnum).ToString());
    }
}
/// <summary>
/// Synchronously downloads one image from a (index, url) pair. On success the pair is
/// removed from <c>concurrentListOfUrls</c>; on failure the stored URL is rewritten with
/// the next candidate file extension so the caller's retry loop can try it again.
/// </summary>
/// <param name="imageLink">Key = ordering index, Value = image URL.</param>
private void DownloadFile(KeyValuePair<long, string> imageLink)
{
    try
    {
        using (WebClient client = new WebClient())
        {
            if (!_useNumbering)
            {
                // Keep the file name taken from the link itself.
                client.DownloadFile(imageLink.Value, $@"{folderPathToSaveTo}\{TextHelper.GetImageNameFromLink(imageLink.Value)}");
            }
            else
            {
                // Zero-padded sequential name (e.g. 007.jpg) preserves gallery order.
                client.DownloadFile(imageLink.Value, $@"{folderPathToSaveTo}\{string.Format("{0:D3}", imageLink.Key)}{TextHelper.GetImageExtensionFromLink(imageLink.Value)}");
            }
            // Download succeeded - drop the entry from the retry dictionary.
            concurrentListOfUrls.TryRemove(imageLink.Key, out string value);
        }
    }
    catch (Exception)
    {
        // Download failed: rewrite the URL with the NEXT candidate extension.
        // Strip the leading '.' - Enum.TryParse(".jpg", ...) can never match an
        // enum member name (the DoWork handler already strips it; made consistent).
        var extension = TextHelper.GetImageExtensionFromLink(imageLink.Value).Replace(".", "");
        Enum.TryParse(extension, out Enums.FileExtensions matchingEnum);
        // Pre-increment: the old post-increment evaluated to the CURRENT value, so
        // the URL kept its extension and the retry could loop forever.
        concurrentListOfUrls[imageLink.Key] = TextHelper.ChangeFileExtension(imageLink.Value, (++matchingEnum).ToString());
    }
}
#region Utility
/// <summary>
/// Probes <paramref name="url"/> and, on a non-OK answer, iterates through candidate
/// file extensions (up to 3 attempts) until one responds 200 OK.
/// The original version leaked every <see cref="HttpWebResponse"/>; with the default
/// of two pooled connections per host, the third call blocked until time-out - the
/// exact symptom documented here before. Responses are now disposed.
/// </summary>
/// <param name="url">Candidate image URL.</param>
/// <returns>A URL that answered 200 OK, or the last candidate tried.</returns>
private string ValidateUrl(string url)
{
    if (galleryHasMultipleMediaTypes)
    {
        // Crude rate-limit so the host does not interpret the probes as an attack.
        Thread.Sleep(5000);
        bool validLink = false;
        int extensionIndex = 0;
        do
        {
            var request = HttpWebRequest.Create(new Uri(url));
            try
            {
                // Dispose the response, otherwise its connection stays checked out
                // of the pool (ServicePoint default: 2 per host) and later calls hang.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    if (response.StatusCode == HttpStatusCode.OK || extensionIndex == 3)
                    {
                        validLink = true;
                    }
                    else
                    {
                        // Iterate the next possible file extension.
                        extensionIndex++;
                        url = TextHelper.ChangeFileExtension(url, ((FileExtensions)extensionIndex).ToString());
                    }
                }
            }
            catch (WebException)
            {
                // GetResponse throws for non-2xx status codes, so the 404 case lands
                // here rather than in the else-branch above. Give up after 3 tries.
                if (extensionIndex == 3)
                {
                    validLink = true;
                }
                else
                {
                    extensionIndex++;
                    url = TextHelper.ChangeFileExtension(url, ((FileExtensions)extensionIndex).ToString());
                }
            }
        } while (!validLink);
    }
    return url;
}
/// <summary>
/// Synchronously downloads one image. On success the entry is removed from
/// <c>concurrentListOfUrls</c>; on failure the stored URL is rewritten with the next
/// candidate file extension so the caller's retry loop can try it again.
/// https://stackoverflow.com/questions/3826370/how-can-i-validate-a-url-in-c-sharp-to-avoid-404-errors
/// </summary>
/// <param name="url">Image URL to fetch.</param>
/// <param name="index">Ordering index used for the numbered file name and as dictionary key.</param>
private void DownloadFile(string url, long index)
{
    try
    {
        using (WebClient client = new WebClient())
        {
            if (!useCustomNumbering)
            {
                // Keep the file name taken from the link itself.
                client.DownloadFile(url, $@"{folderPathToSaveTo}\{TextHelper.GetImageNameFromLink(url)}");
            }
            else
            {
                // Zero-padded sequential name (e.g. 007.jpg) preserves gallery order.
                client.DownloadFile(url, $@"{folderPathToSaveTo}\{string.Format("{0:D3}", index)}{TextHelper.GetImageExtensionFromLink(url)}");
            }
            // Download succeeded - drop the entry from the retry dictionary.
            concurrentListOfUrls.TryRemove(index, out string value);
        }
    }
    catch (Exception)
    {
        // Download failed: rewrite the URL with the NEXT candidate extension.
        // Strip the leading '.' - Enum.TryParse(".jpg", ...) can never match an
        // enum member name (the DoWork handler already strips it; made consistent).
        var extension = TextHelper.GetImageExtensionFromLink(url).Replace(".", "");
        Enum.TryParse(extension, out Enums.FileExtensions matchingEnum);
        // Pre-increment: the old post-increment evaluated to the CURRENT value, so
        // the URL kept its extension and the retry could loop forever.
        concurrentListOfUrls[index] = TextHelper.ChangeFileExtension(url, (++matchingEnum).ToString());
    }
}
/// <summary>
/// Drains <c>concurrentListOfUrls</c> with up to 10 parallel workers, repeating the
/// whole pass until the dictionary is empty. Failed URLs are rewritten with the next
/// candidate extension and picked up by the next do-while pass.
/// https://stackoverflow.com/a/13638087
/// </summary>
/// <param name="urls">Unused; kept so the existing signature stays compatible.</param>
private void ParallelDoWhile_Download(List<string> urls)
{
    do
    {
        Parallel.ForEach(
            concurrentListOfUrls,
            new ParallelOptions { MaxDegreeOfParallelism = 10 },
            (kvp, state, index) =>
            {
                try
                {
                    using (WebClient client = new WebClient())
                    {
                        // Synchronous DownloadFile on purpose. The previous version used
                        // an async lambda (async void to Parallel.ForEach, which cannot
                        // await it) around WebClient.DownloadFileAsync (event-based, it
                        // returns immediately) - so entries were removed before the
                        // download finished and the WebClient was disposed mid-transfer.
                        if (!useCustomNumbering)
                        {
                            client.DownloadFile(kvp.Value, $@"{folderPathToSaveTo}\{TextHelper.GetImageNameFromLink(kvp.Value)}");
                        }
                        else
                        {
                            client.DownloadFile(kvp.Value, $@"{folderPathToSaveTo}\{string.Format("{0:D3}", kvp.Key)}{TextHelper.GetImageExtensionFromLink(kvp.Value)}");
                        }
                        // Remove the item from the ConcurrentDictionary if we get this far.
                        concurrentListOfUrls.TryRemove(kvp.Key, out string value);
                    }
                }
                catch (Exception)
                {
                    // Transform the file extension in the URL for the next pass.
                    // Strip the leading '.' so Enum.TryParse can actually match.
                    var extension = TextHelper.GetImageExtensionFromLink(kvp.Value).Replace(".", "");
                    Enum.TryParse(extension, out Enums.FileExtensions matchingEnum);
                    // Pre-increment: post-increment returned the current extension,
                    // leaving the URL unchanged forever.
                    concurrentListOfUrls[kvp.Key] = TextHelper.ChangeFileExtension(kvp.Value, (++matchingEnum).ToString());
                }
            });
    } while (concurrentListOfUrls.Count > 0);
}
/// <summary>
/// BackgroundWorker entry point: scrapes the comic's image links, then drains
/// <c>concurrentListOfUrls</c> with up to 10 parallel WebClient downloads, reporting
/// percentage progress after each completed file. Failed URLs are rewritten with the
/// next candidate extension and retried on the next outer-loop pass.
/// https://docs.microsoft.com/en-us/dotnet/api/system.componentmodel.backgroundworker?view=netcore-3.1
/// as suggested from
/// https://www.codeproject.com/Questions/5268310/How-do-I-download-multiple-file-with-downloadfilea
/// </summary>
/// <param name="sender">The BackgroundWorker raising DoWork.</param>
/// <param name="e">Carries the cancel flag and receives the scrape result.</param>
private void scraperBackgroundWorker_DoWork(object sender, DoWorkEventArgs e)
{
    BackgroundWorker worker = sender as BackgroundWorker;
    _comicModel.ComicLink = txtLinkToComic.Text;
    var resultModel = _scraper.Scrape_Comic(_comicModel, _isProperScrape);
    concurrentListOfUrls = resultModel.Item2;
    numberOfImagesToDownload = concurrentListOfUrls.Count;
    // Unit of work per image. Use 100.0: the old (1 / count) * 100 was integer
    // division and always evaluated to 0 for more than one image. Guarding count > 0
    // also avoids a DivideByZeroException on an empty scrape.
    // NOTE(review): assumes unitOfScrapingProgress/scrapingProgress are declared as
    // double - the (int) cast at ReportProgress suggests so; confirm field types.
    unitOfScrapingProgress = numberOfImagesToDownload > 0 ? 100.0 / numberOfImagesToDownload : 100;
    scrapingProgress = 0;
    // while (not do-while): an empty dictionary skips the download phase entirely.
    while (concurrentListOfUrls.Count > 0)
    {
        if (worker.CancellationPending)
        {
            // Returning here is essential: the old code set Cancel but kept
            // looping, so a cancel request never actually stopped the worker.
            e.Cancel = true;
            return;
        }
        Parallel.ForEach(
            concurrentListOfUrls,
            new ParallelOptions { MaxDegreeOfParallelism = 10 },
            (kvp, state, index) =>
            {
                try
                {
                    using (WebClient client = new WebClient())
                    {
                        if (!_useNumbering)
                        {
                            client.DownloadFile(kvp.Value, $@"{folderPathToSaveTo}\{TextHelper.GetImageNameFromLink(kvp.Value)}");
                        }
                        else
                        {
                            client.DownloadFile(kvp.Value, $@"{folderPathToSaveTo}\{string.Format("{0:D3}", kvp.Key)}{TextHelper.GetImageExtensionFromLink(kvp.Value)}");
                        }
                        concurrentListOfUrls.TryRemove(kvp.Key, out string value);
                        // Multiply by 100.0 BEFORE dividing: the old all-integer
                        // expression truncated to 0 until the very last file.
                        scrapingProgress = (numberOfImagesToDownload - concurrentListOfUrls.Count) * 100.0 / numberOfImagesToDownload;
                        worker.ReportProgress((int)(scrapingProgress > 100 ? 100 : scrapingProgress));
                    }
                }
                catch (Exception)
                {
                    // Transform data: advance the URL to the next candidate file
                    // extension; the outer while-loop will retry it.
                    var extension = TextHelper.GetImageExtensionFromLink(kvp.Value).Replace(".", "");
                    Enum.TryParse(extension, out Enums.FileExtensions matchingEnum);
                    var newLink = TextHelper.ChangeFileExtension(kvp.Value, matchingEnum.Next().ToString());
                    concurrentListOfUrls.TryUpdate(kvp.Key, newLink, kvp.Value);
                }
            });
    }
    e.Result = resultModel.Item1;
}
/// <summary>
/// Method to extract image links from scraped HTML, following paginated galleries
/// (via the profile's query string) until an error page or a duplicate page is seen.
/// </summary>
/// <param name="model">The site profile</param>
/// <param name="limit">The number of items to scrape. Pass -1 if to return all.</param>
/// <returns>Index -> image URL pairs accumulated over every scraped page.</returns>
private ConcurrentDictionary<long, string> Get_Nodes(ComicModel model, int limit)
{
    int pageCount = 1;
    // Only galleries that declare a paging query string are checked across pages.
    bool continueCheckingForImagesOverMultiplePages = !string.IsNullOrEmpty(model.QueryString);
    int imagesPerPage = 0;
    bool doNotAddToList;
    ConcurrentDictionary<long, string> keyValuePairs = new ConcurrentDictionary<long, string>();
    // Hoisted out of the page loop: when this counter was reset per page, every page
    // produced keys 1..N, so TryAdd silently dropped every page after the first
    // (while the ElementAt-based dedupe math below assumed accumulation).
    int index = 0;
    do
    {
        doNotAddToList = false;
        pageCount++;
        // Get the collection of candidate nodes.
        // NOTE(review): SelectNodes returns null when the XPath matches nothing -
        // nodes.Descendants would then throw; confirm the XPath always matches.
        var nodes = documentToScrape
            .DocumentNode
            .SelectNodes($@"{model.XPath}");
        var scrapedImageLinks = new Dictionary<long, string>();
        var srcAttribute = model.TagNameInsideImage.Equals(string.Empty)
            ? Constants.DefaultAttributeToLookForInImage
            : model.TagNameInsideImage;
        // string.IsNullOrEmpty: the old (bool)(model.TagToLookFor?.Equals(string.Empty))
        // cast threw InvalidOperationException whenever TagToLookFor was null.
        var srcTag = string.IsNullOrEmpty(model.TagToLookFor)
            ? Constants.DefaultTagToLookFor
            : model.TagToLookFor;
        // Links to process are found here.
        var linksToProcess = nodes.Descendants(srcTag)
            .Select(img => WebUtility.HtmlDecode(img.GetAttributeValue(srcAttribute, null)))
            .Where(s => !String.IsNullOrEmpty(s))
            .Take(limit > 0 ? limit : int.MaxValue)
            .ToList();
        // Some websites break their own code, but we can still attempt to grab URLs
        // from text via Regex.
        if (model.DoubleCheckLinks)
        {
            foreach (var item in nodes.Descendants("a"))
            {
                var link = TextHelper.GetUrlFromText(item.OuterHtml);
                if (!linksToProcess.Contains(link))
                {
                    linksToProcess.Add(link);
                }
            }
        }
        // https://stackoverflow.com/a/17158393
        foreach (var item in linksToProcess)
        {
            index++;
            StringBuilder baseUrl = new StringBuilder();
            StringBuilder finalUrl = new StringBuilder();
            var url = string.Empty;
            if (model.AppendDomain)
            {
                baseUrl.Append(model.Link);
            }
            if (!model.ReplaceString.Equals(string.Empty) || model.RemoveDimensions || !model.ReplaceTextInImageNames.Equals(string.Empty))
            {
                if (!model.ReplaceString.Equals(string.Empty))
                {
                    url = item.Replace(model.ReplaceString, model.ReplaceWith);
                }
                // Will remove the last instance of the string provided.
                if (!model.ReplaceTextInImageNames.Equals(string.Empty))
                {
                    // First grab the file extension...
                    var ext = TextHelper.GetImageExtensionFromLink(item);
                    // ...then merge the replaced text with the extension.
                    url = TextHelper.ReplaceLastInstance(item, model.ReplaceTextInImageNames, model.ReplaceTextInImageNamesWith) + ext;
                }
                if (model.RemoveDimensions)
                {
                    url = string.IsNullOrEmpty(url)
                        ? item.RemoveDimensionsFromLink()
                        : url.RemoveDimensionsFromLink();
                }
            }
            else
            {
                baseUrl.Append(item);
            }
            if (!string.IsNullOrEmpty(model.ReplaceImageExtensionWith))
            {
                finalUrl.Append(TextHelper.ChangeFileExtension(baseUrl.ToString() + url, model.ReplaceImageExtensionWith));
            }
            else
            {
                finalUrl.Append(baseUrl.ToString() + url);
            }
            scrapedImageLinks.Add(index, finalUrl.ToString());
        }
        if (imagesPerPage < scrapedImageLinks.Count)
        {
            imagesPerPage = scrapedImageLinks.Count;
        }
        if (continueCheckingForImagesOverMultiplePages && scrapeAddress != null)
        {
            var resultModel = PrepareDocumentFromUri(new Uri($@"{scrapeAddress}?{model.QueryString}={pageCount}"));
            // To detect sites that redirect instead of serving a custom error page
            // for non-existent pages, we compare image names across pages.
            if (resultModel.Result == Enums.ResultTypes.Error || scrapedImageLinks.Count - imagesPerPage < 0)
            {
                // Page not found (custom error page).
                continueCheckingForImagesOverMultiplePages = false;
            }
            else if (pageCount > 2
                && (keyValuePairs.ElementAt(0).Value.Equals(scrapedImageLinks.ElementAt(0).Value)
                    || scrapedImageLinks.ElementAt(scrapedImageLinks.Count - 1).Value.Equals(keyValuePairs.ElementAt(((pageCount - 2) * imagesPerPage) - 1).Value)))
            {
                // Duplicate page: case 1) very first stored image equals the newest
                // page's first image; case 2) newest page's last image equals the
                // previous page's last image. Compare .Value (the image URL) - keys
                // no longer repeat per page, so KeyValuePair equality would never match.
                continueCheckingForImagesOverMultiplePages = false;
                doNotAddToList = true;
            }
        }
        if (!doNotAddToList)
        {
            foreach (var imageLink in scrapedImageLinks)
            {
                keyValuePairs.TryAdd(imageLink.Key, imageLink.Value);
            }
        }
    } while (continueCheckingForImagesOverMultiplePages);
    return keyValuePairs;
}