Code Example #1
File: Scraper.cs Project: InhumanPwnage/ComicScraper
        private async Task Task_DownloadFileAsync(InvalidFileModel fileModel)
        {
            try
            {
                using (WebClient webClient = new WebClient())
                {
                    if (!useCustomNumbering)
                    {
                        await webClient.DownloadFileTaskAsync(new Uri(fileModel.Url), $@"{folderPathToSaveTo}\{TextHelper.GetImageNameFromLink(fileModel.Url)}");
                    }
                    else
                    {
                        await webClient.DownloadFileTaskAsync(new Uri(fileModel.Url), $@"{folderPathToSaveTo}\{string.Format("{0:D3}", fileModel.Index)}{TextHelper.GetImageExtensionFromLink(fileModel.Url)}");
                    }

                    //remove model from bag
                    //concurrentListOfInvalidUrls.FirstOrDefault(x => x.Index == fileModel.Index).
                    concurrentListOfUrls.TryRemove(fileModel.Index, out string value);
                }
            }
            catch (Exception)
            {
                //transform data: advance the URL to the next candidate extension so the retry actually changes something
                Enum.TryParse(TextHelper.GetImageExtensionFromLink(fileModel.Url), out Enums.FileExtensions matchingEnum);

                concurrentListOfUrls[fileModel.Index] = TextHelper.ChangeFileExtension(fileModel.Url, (++matchingEnum).ToString());
            }
        }
Code Example #2
        private void DownloadFile(KeyValuePair <long, string> imageLink) //string url, long index
        {
            try
            {
                using (WebClient client = new WebClient())
                {
                    if (!_useNumbering)
                    {
                        client.DownloadFile(imageLink.Value, $@"{folderPathToSaveTo}\{TextHelper.GetImageNameFromLink(imageLink.Value)}");
                    }
                    else
                    {
                        client.DownloadFile(imageLink.Value, $@"{folderPathToSaveTo}\{string.Format("{0:D3}", imageLink.Key)}{TextHelper.GetImageExtensionFromLink(imageLink.Value)}");
                    }
                    // OR
                    //client.DownloadFileAsync(url, $@"{folderPathToSaveTo}\{Path.GetFileName(uri.LocalPath.Split('/').Last())}");
                    concurrentListOfUrls.TryRemove(imageLink.Key, out string value);
                }
            }
            catch (Exception)
            {
                //transform data: advance the URL to the next candidate extension so the retry actually changes something
                Enum.TryParse(TextHelper.GetImageExtensionFromLink(imageLink.Value), out Enums.FileExtensions matchingEnum);

                concurrentListOfUrls[imageLink.Key] = TextHelper.ChangeFileExtension(imageLink.Value, (++matchingEnum).ToString());
            }
        }
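The catch blocks above recover from a failed download by rewriting the URL with the next Enums.FileExtensions value; Code Example #6 further down does the same through a Next() extension method. That helper is not shown in these excerpts, but a generic wrap-around version might look like the following sketch (illustrative only, not the repository's actual code):

        public static class EnumExtensions
        {
            /// <summary>
            /// Returns the next defined value of an enum, wrapping back to the first value after the last one.
            /// </summary>
            public static T Next<T>(this T current) where T : struct, Enum
            {
                T[] values    = (T[])Enum.GetValues(typeof(T));
                int nextIndex = (Array.IndexOf(values, current) + 1) % values.Length;

                return values[nextIndex];
            }
        }

With a FileExtensions enum of, say, jpg, png, gif and webp, FileExtensions.jpg.Next() yields png and the last member wraps back to jpg, so a retry loop can keep cycling through candidate extensions instead of running off the end of the enum.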
Code Example #3
File: Scraper.cs Project: InhumanPwnage/ComicScraper
        //public (ResultModel, List<string>) Test_Scrape_Comic(ComicModel model)
        //{
        //    ResultModel modelToReturn = new ResultModel()
        //    {
        //        Occurrence = DateTime.Now,
        //        Data = Constants.Error,
        //        Result = Enums.ResultTypes.Error
        //    };

        //    List<string> listOfImageLinks = new List<string>();

        //    try
        //    {
        //        listOfImageLinks = Get_Nodes(model, 5);

        //        //return the list of images
        //        modelToReturn.Data = $@"{Get_ComicName(model)} has {listOfImageLinks.Count} items.{Environment.NewLine}{listOfImageLinks[0]}{Environment.NewLine}{listOfImageLinks[listOfImageLinks.Count/2]}{Environment.NewLine}{listOfImageLinks[listOfImageLinks.Count-1]}{Environment.NewLine} . . .";
        //        modelToReturn.Result = Enums.ResultTypes.Success;
        //    }
        //    catch (Exception ex)
        //    {
        //        modelToReturn.Data = $"{ex.Message}\n{ex.StackTrace}";
        //    }

        //    return (modelToReturn, listOfImageLinks);
        //}


        #region Utility

        /// <summary>
        /// Don't use this: it will time out on the 3rd call. The host will probably think this is a DoS attack since the requests happen in sequence; the only way I can see this working is if we re-use connections.
        /// </summary>
        /// <param name="url"></param>
        /// <returns></returns>
        private string ValidateUrl(string url)
        {
            if (galleryHasMultipleMediaTypes)
            {
                Thread.Sleep(5000);
                bool validLink      = false;
                int  extensionIndex = 0;

                do
                {
                    var             request  = HttpWebRequest.Create(new Uri(url));
                    HttpWebResponse response = (HttpWebResponse)request.GetResponse();
                    //var type = response.ContentType;
                    if (response.StatusCode == HttpStatusCode.OK || extensionIndex == 3)
                    {
                        validLink = true;
                    }
                    else
                    {
                        //iterate the next possible file extension
                        extensionIndex++;

                        url = TextHelper.ChangeFileExtension(url, ((FileExtensions)extensionIndex).ToString());
                    }
                }while (!validLink);
            }

            return(url);
        }
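The time-out described in the summary is what happens when HttpWebResponse objects are never disposed: each open response holds one of the two connections .NET allows per host by default, so the third sequential request blocks until it times out. Below is a sketch of the same extension-probing loop with the response disposed, using a HEAD request since only the status code matters; the method name and the retry limit are illustrative, not taken from the repository.

        private string ValidateUrlWithDisposal(string url)
        {
            int extensionIndex = 0;

            while (true)
            {
                var request    = (HttpWebRequest)WebRequest.Create(new Uri(url));
                request.Method = "HEAD"; //the status code is enough; no need to download the body

                try
                {
                    //disposing the response returns its connection to the pool,
                    //so repeated calls no longer exhaust the per-host connection limit
                    using (var response = (HttpWebResponse)request.GetResponse())
                    {
                        if (response.StatusCode == HttpStatusCode.OK)
                        {
                            return url;
                        }
                    }
                }
                catch (WebException)
                {
                    //404s surface as a WebException; fall through and try the next extension
                }

                if (++extensionIndex > 3)
                {
                    return url; //give up after cycling through the candidate extensions
                }

                url = TextHelper.ChangeFileExtension(url, ((FileExtensions)extensionIndex).ToString());
            }
        }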
Code Example #4
File: Scraper.cs Project: InhumanPwnage/ComicScraper
        /// <summary>
        /// https://stackoverflow.com/questions/3826370/how-can-i-validate-a-url-in-c-sharp-to-avoid-404-errors
        /// </summary>
        /// <param name="url"></param>
        /// <param name="index"></param>
        private void DownloadFile(string url, long index)
        {
            try
            {
                using (WebClient client = new WebClient())
                {
                    if (!useCustomNumbering)
                    {
                        client.DownloadFile(url, $@"{folderPathToSaveTo}\{TextHelper.GetImageNameFromLink(url)}");
                    }
                    else
                    {
                        client.DownloadFile(url, $@"{folderPathToSaveTo}\{string.Format("{0:D3}", index)}{TextHelper.GetImageExtensionFromLink(url)}");
                    }
                    // OR
                    //client.DownloadFileAsync(url, $@"{folderPathToSaveTo}\{Path.GetFileName(uri.LocalPath.Split('/').Last())}");
                    concurrentListOfUrls.TryRemove(index, out string value);
                }
            }
            catch (Exception)
            {
                //InvalidFileModel invalidFileModel = new InvalidFileModel()
                //{
                //    Url = url,
                //    Index = index
                //};

                //concurrentListOfInvalidUrls.Add(invalidFileModel);
                //transform data: advance the URL to the next candidate extension so the retry actually changes something
                Enum.TryParse(TextHelper.GetImageExtensionFromLink(url), out Enums.FileExtensions matchingEnum);

                concurrentListOfUrls[index] = TextHelper.ChangeFileExtension(url, (++matchingEnum).ToString());
            }


            //using (var httpClient = new HttpClient())
            //{
            //    var imageBytes = httpClient.get(uri);
            //    File.WriteAllBytes($@"{folderPathToSaveTo}\{Path.GetFileName(uri.LocalPath.Split('/').Last())}", imageBytes);
            //}

            //using (var sr = new StreamReader(HttpWebRequest.Create(url).GetResponse().GetResponseStream()))
            //{
            //    Uri uri = new Uri(url);

            //    using (var sw = new StreamWriter($@"{folderPathToSaveTo}\{Path.GetFileName(uri.LocalPath.Split('/').Last())}"))
            //    {
            //        sw.Write(sr.ReadToEnd());
            //    }
            //}
        }
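The commented-out block above calls httpClient.get(uri), which does not exist on HttpClient; the working call is GetByteArrayAsync. Below is a sketch of that alternative with a single shared client (so sockets are reused instead of recreated per download); the field and method names are illustrative, not from the repository.

        private static readonly HttpClient sharedHttpClient = new HttpClient();

        private async Task DownloadFileWithHttpClientAsync(string url, long index)
        {
            //download the whole response body as raw bytes
            var imageBytes = await sharedHttpClient.GetByteArrayAsync(new Uri(url));

            var fileName = useCustomNumbering
                ? $"{index:D3}{TextHelper.GetImageExtensionFromLink(url)}"
                : TextHelper.GetImageNameFromLink(url);

            File.WriteAllBytes($@"{folderPathToSaveTo}\{fileName}", imageBytes);

            concurrentListOfUrls.TryRemove(index, out _);
        }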
Code Example #5
File: Scraper.cs Project: InhumanPwnage/ComicScraper
        /// <summary>
        /// https://stackoverflow.com/a/13638087
        /// </summary>
        /// <param name="urls"></param>
        private void ParallelDoWhile_Download(List <string> urls)
        {
            do
            {
                Parallel.ForEach(
                    concurrentListOfUrls, new ParallelOptions {
                    MaxDegreeOfParallelism = 10
                }, async(kvp, state, index) =>
                {
                    try
                    {
                        using (WebClient client = new WebClient())
                        {
                            if (!useCustomNumbering)
                            {
                                await client.DownloadFileTaskAsync(new Uri(kvp.Value), $@"{folderPathToSaveTo}\{TextHelper.GetImageNameFromLink(kvp.Value)}");
                            }
                            else
                            {
                                await client.DownloadFileTaskAsync(new Uri(kvp.Value), $@"{folderPathToSaveTo}\{string.Format("{0:D3}", kvp.Key)}{TextHelper.GetImageExtensionFromLink(kvp.Value)}");
                            }

                            //remove the item from the ConcurrentDictionary if we get this far
                            concurrentListOfUrls.TryRemove(kvp.Key, out string value);
                        }
                    }
                    catch (Exception)
                    {
                        //inside the exception we transform the file extension in the URL to the next candidate, so the next pass retries a different link
                        Enum.TryParse(TextHelper.GetImageExtensionFromLink(kvp.Value), out Enums.FileExtensions matchingEnum);

                        concurrentListOfUrls[kvp.Key] = TextHelper.ChangeFileExtension(kvp.Value, (++matchingEnum).ToString());
                    }
                }
                    );
            }while (concurrentListOfUrls.Count > 0);
        }
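Parallel.ForEach never awaits an async lambda: the async (kvp, state, index) delegate compiles to async void, so each iteration returns to the loop before its download has finished, and the outer do/while is what eventually picks up the leftovers. Below is a sketch of an awaited alternative that only completes when every file is on disk; the method name is illustrative, not from the repository.

        private async Task DownloadAllAsync()
        {
            var downloads = concurrentListOfUrls.Select(async kvp =>
            {
                using (var client = new WebClient())
                {
                    var fileName = useCustomNumbering
                        ? $"{kvp.Key:D3}{TextHelper.GetImageExtensionFromLink(kvp.Value)}"
                        : TextHelper.GetImageNameFromLink(kvp.Value);

                    //DownloadFileTaskAsync can be awaited, so the WebClient is not disposed mid-download
                    await client.DownloadFileTaskAsync(new Uri(kvp.Value), $@"{folderPathToSaveTo}\{fileName}");

                    concurrentListOfUrls.TryRemove(kvp.Key, out _);
                }
            });

            //completes only when every download task has finished; failed entries stay
            //in concurrentListOfUrls so the caller can transform their extension and retry
            await Task.WhenAll(downloads);
        }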
Code Example #6
        /// <summary>
        /// https://docs.microsoft.com/en-us/dotnet/api/system.componentmodel.backgroundworker?view=netcore-3.1
        /// as suggested by
        /// https://www.codeproject.com/Questions/5268310/How-do-I-download-multiple-file-with-downloadfilea
        /// </summary>
        /// <param name="sender"></param>
        /// <param name="e"></param>
        private void scraperBackgroundWorker_DoWork(object sender, DoWorkEventArgs e)
        {
            BackgroundWorker worker = sender as BackgroundWorker;

            _comicModel.ComicLink = txtLinkToComic.Text;

            var resultModel = _scraper.Scrape_Comic(_comicModel, _isProperScrape);

            concurrentListOfUrls     = resultModel.Item2;
            numberOfImagesToDownload = concurrentListOfUrls.Count;

            //find out how much a unit of work is
            unitOfScrapingProgress = 100 / concurrentListOfUrls.Count; //share of total progress represented by one image
            scrapingProgress       = 0;

            do
            {
                if (worker.CancellationPending)
                {
                    e.Cancel = true;
                    break; //stop the outer loop; nothing else in this iteration downloads the remaining files
                }
                else
                {
                    //Parallel_Download();

                    Parallel.ForEach(
                        concurrentListOfUrls, new ParallelOptions {
                        MaxDegreeOfParallelism = 10
                    }, (kvp, state, index) =>                                                                             //DownloadFile(kvp)
                    {
                        try
                        {
                            using (WebClient client = new WebClient())
                            {
                                if (!_useNumbering)
                                {
                                    client.DownloadFile(kvp.Value, $@"{folderPathToSaveTo}\{TextHelper.GetImageNameFromLink(kvp.Value)}");
                                }
                                else
                                {
                                    client.DownloadFile(kvp.Value, $@"{folderPathToSaveTo}\{string.Format("{0:D3}", kvp.Key)}{TextHelper.GetImageExtensionFromLink(kvp.Value)}");
                                }
                                // OR
                                //client.DownloadFileAsync(url, $@"{folderPathToSaveTo}\{Path.GetFileName(uri.LocalPath.Split('/').Last())}");
                                concurrentListOfUrls.TryRemove(kvp.Key, out string value);

                                scrapingProgress = ((numberOfImagesToDownload - concurrentListOfUrls.Count) * 100) / numberOfImagesToDownload; //multiply before dividing so integer division doesn't floor the percentage to 0
                                //Console.WriteLine($@"({numberOfImagesToDownload} - {concurrentListOfUrls.Count}) / {numberOfImagesToDownload} = {scrapingProgress}");

                                //scrapingProgress += unitOfScrapingProgress;
                                worker.ReportProgress((int)(scrapingProgress > 100 ? 100 : scrapingProgress));
                            }
                        }
                        catch (Exception)
                        {
                            var extension = TextHelper.GetImageExtensionFromLink(kvp.Value).Replace(".", "");

                            //transform data
                            Enum.TryParse(extension, out Enums.FileExtensions matchingEnum);

                            var newLink = TextHelper.ChangeFileExtension(kvp.Value, matchingEnum.Next().ToString());

                            concurrentListOfUrls.TryUpdate(kvp.Key, newLink, kvp.Value);
                        }
                    }
                        );
                }
            }while (concurrentListOfUrls.Count > 0);

            e.Result = resultModel.Item1;
        }
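For context, the DoWork handler above only runs once the BackgroundWorker has been configured and started; the snippet below is a minimal sketch of how the form might wire it up (progressBar and ShowResult are placeholders, not names from the repository).

            //somewhere in the form's initialization
            scraperBackgroundWorker.WorkerReportsProgress      = true;
            scraperBackgroundWorker.WorkerSupportsCancellation = true;

            scraperBackgroundWorker.DoWork          += scraperBackgroundWorker_DoWork;
            scraperBackgroundWorker.ProgressChanged += (s, args) => progressBar.Value = args.ProgressPercentage;

            scraperBackgroundWorker.RunWorkerCompleted += (s, args) =>
            {
                //the Result set in DoWork is only readable when the worker was not cancelled and did not fault
                if (!args.Cancelled && args.Error == null)
                {
                    ShowResult(args.Result);
                }
            };

            //start the scrape without blocking the UI thread; CancelAsync() requests cancellation
            scraperBackgroundWorker.RunWorkerAsync();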
Code Example #7
File: Scraper.cs Project: InhumanPwnage/ComicScraper
        /// <summary>
        /// Method to extract links from scraped HTML
        /// </summary>
        /// <param name="model">The site profile</param>
        /// <param name="limit">The number of items to scrape. Pass -1 if to return all.</param>
        /// <returns></returns>
        private ConcurrentDictionary <long, string> Get_Nodes(ComicModel model, int limit)
        {
            int  pageCount = 1;
            bool continueCheckingForImagesOverMultiplePages = model.QueryString != null && model.QueryString != string.Empty;
            //var listOfImageLinks = new List<string>();
            int  imagesPerPage = 0;
            bool doNotAddToList;
            ConcurrentDictionary <long, string> keyValuePairs = new ConcurrentDictionary <long, string>();

            do
            {
                doNotAddToList = false;
                pageCount++;

                //get the collection
                var nodes = documentToScrape
                            .DocumentNode
                            .SelectNodes($@"{model.XPath}");

                var scrapedImageLinks = new Dictionary <long, string>();

                var srcAttribute = model.TagNameInsideImage.Equals(string.Empty) ? Constants.DefaultAttributeToLookForInImage : model.TagNameInsideImage;
                var srcTag       = string.IsNullOrEmpty(model.TagToLookFor) ? Constants.DefaultTagToLookFor : model.TagToLookFor;

                //var imgs = nodes.Descendants(srcTag);
                //var decodedImgs = imgs.Select(img => WebUtility.HtmlDecode(img.GetAttributeValue(srcAttribute, null)));
                int index = 0;

                //links to process are found here
                var linksToProcess = nodes.Descendants(srcTag)
                                     .Select(img => WebUtility.HtmlDecode(img.GetAttributeValue(srcAttribute, null)))
                                     .Where(s => !String.IsNullOrEmpty(s))
                                     .Take(limit > 0 ? limit : int.MaxValue)
                                     .ToList();

                //some websites break their own code, but we can still attempt to grab URLs from text via Regex
                if (model.DoubleCheckLinks)
                {
                    foreach (var item in nodes.Descendants("a"))
                    {
                        var link = TextHelper.GetUrlFromText(item.OuterHtml);

                        if (!linksToProcess.Contains(link))
                        {
                            linksToProcess.Add(link);
                        }
                    }
                }


                //https://stackoverflow.com/a/17158393
                foreach (var item in linksToProcess)
                {
                    index++;
                    StringBuilder baseUrl  = new StringBuilder();
                    StringBuilder finalUrl = new StringBuilder();

                    var url = string.Empty;

                    if (model.AppendDomain)
                    {
                        baseUrl.Append(model.Link);
                    }

                    if (!model.ReplaceString.Equals(string.Empty) || model.RemoveDimensions || !model.ReplaceTextInImageNames.Equals(string.Empty))
                    {
                        if (!model.ReplaceString.Equals(string.Empty))
                        {
                            url = item.Replace(model.ReplaceString, model.ReplaceWith);
                        }

                        //will remove last instance of string provided
                        if (!model.ReplaceTextInImageNames.Equals(string.Empty))
                        {
                            //first grab the file extension
                            var ext = TextHelper.GetImageExtensionFromLink(item);

                            //merge the replaced text with the extension
                            url = TextHelper.ReplaceLastInstance(item, model.ReplaceTextInImageNames, model.ReplaceTextInImageNamesWith) + ext;
                        }

                        if (model.RemoveDimensions)
                        {
                            if (string.IsNullOrEmpty(url))
                            {
                                url = item.RemoveDimensionsFromLink();
                            }
                            else
                            {
                                url = url.RemoveDimensionsFromLink();
                            }
                        }
                    }
                    else
                    {
                        baseUrl.Append(item);
                    }

                    if (!string.IsNullOrEmpty(model.ReplaceImageExtensionWith))
                    {
                        finalUrl.Append(TextHelper.ChangeFileExtension(baseUrl.ToString() + url, model.ReplaceImageExtensionWith));
                    }
                    else
                    {
                        finalUrl.Append(baseUrl.ToString() + url);
                    }

                    //concurrentListOfUrls.TryAdd(index, finalUrl.ToString());
                    scrapedImageLinks.Add(index, finalUrl.ToString());
                }

                if (imagesPerPage < scrapedImageLinks.Count)
                {
                    imagesPerPage = scrapedImageLinks.Count;
                }

                if (continueCheckingForImagesOverMultiplePages && scrapeAddress != null)
                {
                    var resultModel = PrepareDocumentFromUri(new Uri($@"{scrapeAddress}?{model.QueryString}={pageCount}"));

                    //to detect sites that redirect (instead of serving a custom error page) for non-existent pages when scraping images over multiple pages, we compare image names
                    if (resultModel.Result == Enums.ResultTypes.Error || scrapedImageLinks.Count - imagesPerPage < 0)
                    {
                        //page not found (custom error page)
                        continueCheckingForImagesOverMultiplePages = false;
                    }
                    else if (pageCount > 2 &&
                             (keyValuePairs.ElementAt(0).Equals(scrapedImageLinks.ElementAt(0)) ||
                              scrapedImageLinks.ElementAt(scrapedImageLinks.Count - 1).Equals(keyValuePairs.ElementAt(((pageCount - 2) * imagesPerPage) - 1))))
                    {
                        //image names matched: case 1) the newest page's first image equals the very first image collected, or case 2) the newest page's last image equals the last image of the previous page
                        continueCheckingForImagesOverMultiplePages = false;
                        doNotAddToList = true;
                    }
                }

                if (!doNotAddToList)
                {
                    foreach (var imageLink in scrapedImageLinks)
                    {
                        //keyValuePairs.AddRange(scrapedImageLinks.Select(x => x.Value));
                        keyValuePairs.TryAdd(imageLink.Key, imageLink.Value);
                    }
                }
            }while (continueCheckingForImagesOverMultiplePages);

            return(keyValuePairs);
        }
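A hypothetical usage sketch: because of the Take(limit > 0 ? limit : int.MaxValue) call, -1 (or any non-positive limit) returns every matched link, which can then be handed to one of the download loops above (variable names are illustrative).

            var links = Get_Nodes(model, -1); //-1 returns every matched link

            Console.WriteLine($"Found {links.Count} image links for {model.ComicLink}");

            foreach (var kvp in links.OrderBy(x => x.Key))
            {
                Console.WriteLine($"{kvp.Key:D3} -> {kvp.Value}");
            }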