Exemple #1
0
        /// <summary>
        /// Downloads a known zip archive to a fixed local path and verifies both that the
        /// download call reported success and that the file is present on disk afterwards.
        /// </summary>
        public void TestFile()
        {
            // Arrange
            const string sourceUrl  = "https://github.com/gabriele-ricci-kyklos/FantasyFootballStatistics/archive/master.zip";
            const string targetPath = @"C:\temp\master.zip";

            // Act
            bool downloaded = WebDataRetriever.TryDownloadFile(sourceUrl, targetPath);

            // Assert
            Assert.IsTrue(downloaded);
            Assert.IsTrue(File.Exists(targetPath));
        }
        /// <summary>
        /// Downloads the zip archive for the requested architecture and writes a copy of it
        /// into <paramref name="backupPath"/> before returning the downloaded bytes.
        /// </summary>
        /// <param name="arch">Target architecture; only <c>X86</c> and <c>X64</c> are accepted.</param>
        /// <param name="backupPath">Folder in which the local copy of the archive is stored. It is created when missing.</param>
        /// <param name="progress">Optional progress sink that is handed to the downloader.</param>
        /// <returns>The raw content of the downloaded zip archive.</returns>
        /// <exception cref="ArgumentException">
        /// Thrown when <paramref name="arch"/> is not supported, or when the download fails
        /// (the original failure is available as the inner exception).
        /// </exception>
        private async Task <byte[]> RetrieveZipFileWithBackup(Architecture arch, string backupPath, IProgress <double> progress = null)
        {
            backupPath.AssertHasText(nameof(backupPath));

            if (arch != Architecture.X86 && arch != Architecture.X64)
            {
                throw new ArgumentException($"Architecture {arch} not supported");
            }

            string url      = GetDownloadUrl(arch);
            string fileName = Utilities.GetFileNameFromUriString(url);

            Logger.Info("Downloading the zip archive");

            byte[] zipArchive = null;
            try
            {
                zipArchive = await WebDataRetriever.DownloadFileAsync(url, progress);
            }
            catch (Exception ex)
            {
                // Callers receive an ArgumentException for download failures; the real cause is kept as InnerException.
                throw new ArgumentException("An error occurred while downloading the file", ex);
            }
            finally
            {
                Logger.Info("Finished downloading the zip archive");
            }

            // File.WriteAllBytes does not create missing folders, so make sure the backup folder exists first;
            // otherwise a completed download would be lost to a DirectoryNotFoundException.
            // Directory.CreateDirectory does nothing when the folder is already there.
            Directory.CreateDirectory(backupPath);

            string localPath = Path.Combine(backupPath, fileName);

            Logger.Info($"Writing the zip archive locally to {backupPath}");
            File.WriteAllBytes(localPath, zipArchive);
            Logger.Info("The zip archive has been successfully saved locally");

            return zipArchive;
        }
Exemple #3
0
        /// <summary>
        /// Requests a well-known page and verifies that some content (non-null) comes back.
        /// </summary>
        public void TestPage()
        {
            const string pageUrl = "https://google.it/";

            string pageContent = WebDataRetriever.RetrievePage(pageUrl);

            Assert.IsNotNull(pageContent);
        }
Exemple #4
0
        /// <summary>
        /// Walks the site at <paramref name="root"/>, starting from "/" plus any additional URLs supplied, and writes every
        /// retrieved resource into <paramref name="destination"/>. Html and css content is passed through a content rewriter
        /// and any relative URLs that the rewriter reports are queued for retrieval as well; everything else is written as
        /// binary content without rewriting.
        /// </summary>
        /// <param name="root">Absolute base URL that every relative URL is resolved against.</param>
        /// <param name="destination">Existing folder that the generated files are written to.</param>
        /// <param name="urlRewriter">Maps a requested URL on to the URL (and so the file name) that it will have in the output.</param>
        /// <param name="optionalRewriteIntercepter">
        /// When non-null, this wraps the default html and css content rewriters. It is not applied to binary content.
        /// </param>
        /// <param name="optionalAdditionalUrlsToRetrieve">Extra URLs to retrieve in addition to "/"; null entries are ignored.</param>
        /// <exception cref="ArgumentNullException">
        /// Thrown when <paramref name="root"/>, <paramref name="destination"/> or <paramref name="urlRewriter"/> is null.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// Thrown when <paramref name="root"/> is not absolute or when <paramref name="destination"/> does not exist.
        /// </exception>
        public static void Regenerate(
            Uri root,
            DirectoryInfo destination,
            UrlRewriter urlRewriter,
            Func <IRewriteContent, IRewriteContent> optionalRewriteIntercepter,
            IEnumerable <Uri> optionalAdditionalUrlsToRetrieve = null)
        {
            if (root == null)
            {
                throw new ArgumentNullException(nameof(root));
            }
            if (!root.IsAbsoluteUri)
            {
                throw new ArgumentException("root must be an absolute url");
            }
            if (destination == null)
            {
                throw new ArgumentNullException(nameof(destination));
            }
            destination.Refresh();
            if (!destination.Exists)
            {
                throw new ArgumentException("The specified destination folder does not exist");
            }
            if (urlRewriter == null)
            {
                throw new ArgumentNullException(nameof(urlRewriter));
            }

            var webRequester = new WebDataRetriever();

            IRewriteContent htmlContentRewriter = new HtmlContentRewriter(urlRewriter);
            IRewriteContent cssContentRewriter  = new CssContentRewriter(urlRewriter);
            if (optionalRewriteIntercepter != null)
            {
                htmlContentRewriter = optionalRewriteIntercepter(htmlContentRewriter);
                cssContentRewriter  = optionalRewriteIntercepter(cssContentRewriter);
            }

            var processedUrls = new HashSet <Uri>();
            var urlsToProcess = new HashSet <Uri>
            {
                new Uri("/", UriKind.Relative)
            };

            if (optionalAdditionalUrlsToRetrieve != null)
            {
                foreach (var url in optionalAdditionalUrlsToRetrieve)
                {
                    if (url != null)
                    {
                        urlsToProcess.Add(url);
                    }
                }
            }

            // Rewritten html / css content may reference further URLs, so keep going round until a pass adds nothing new
            while (urlsToProcess.Any(u => !processedUrls.Contains(u)))
            {
                // Enumerate a snapshot (ToArray) because urlsToProcess is added to inside the loop
                foreach (var url in urlsToProcess.ToArray().Where(u => !processedUrls.Contains(u)))
                {
                    var urlToRequest = new Uri(root, url);
                    var rewrittenUrl = urlRewriter(url);
                    if (rewrittenUrl.IsAbsoluteUri)
                    {
                        // Absolute rewritten URLs are marked as processed without being downloaded
                        processedUrls.Add(url);
                        continue;
                    }

                    var rewrittenUrlString = rewrittenUrl.ToString();
                    if (rewrittenUrlString.Contains("?"))
                    {
                        // If the rewritten URL still contains QueryString content then don't pull it down - presumably it varies by QueryString (if
                        // we want to always return the same content then the URL Rewriter should have pulled the QueryString content into the page
                        // name)
                        processedUrls.Add(url);
                        continue;
                    }

                    var pageName = GetDestinationPageName(rewrittenUrlString);

                    // If it's html or css content then pass it through the appropriate default content rewriter and then the optionalRewriteIntercepter
                    // (if non-null). Otherwise just pull it as binary content, regardless of whether it's an image or a javascript file (no custom
                    // rewriting is done in this case, the optionalRewriteIntercepter is not used even if it isn't null)
                    IRewriteContent contentRewriter;
                    if (pageName.EndsWith(".html", StringComparison.InvariantCultureIgnoreCase))
                    {
                        contentRewriter = htmlContentRewriter;
                    }
                    else if (pageName.EndsWith(".css", StringComparison.InvariantCultureIgnoreCase))
                    {
                        contentRewriter = cssContentRewriter;
                    }
                    else
                    {
                        contentRewriter = null;
                    }

                    // 2021-04-07 DWR: Most page names won't have any URL-encoded characters because they are generally formed as "URL friendly" names
                    // but there are some, such as the Archive-by-Tag pages, that DO have characters that will be URL-encoded in the content that is
                    // extracted now but we want the file name that's written to be the NON-encoded version
                    pageName = HttpUtility.UrlDecode(pageName.TrimStart('\\'));

                    // NOTE(review): pageName is derived from URLs found in downloaded content and is not checked for ".." segments, so this
                    // relies on the crawled site being trusted not to reference paths that would resolve outside of the destination folder
                    var destinationFile = new FileInfo(Path.Combine(destination.FullName, pageName));
                    if (!destinationFile.Directory.Exists)
                    {
                        destinationFile.Directory.Create();
                    }
                    if (contentRewriter == null)
                    {
                        File.WriteAllBytes(
                            destinationFile.FullName,
                            webRequester.GetBinary(urlToRequest)
                            );
                    }
                    else
                    {
                        var rewrittenContent = contentRewriter.Rewrite(
                            webRequester.GetText(urlToRequest),
                            urlToRequest
                            );
                        File.WriteAllText(
                            destinationFile.FullName,
                            rewrittenContent.Content
                            );
                        foreach (var urlToAdd in rewrittenContent.ReferencedRelativeUrls)
                        {
                            urlsToProcess.Add(urlToAdd);
                        }
                    }

                    processedUrls.Add(url);
                }
            }

            // TODO: Have to deal with any files referenced by javascript!
        }

        /// <summary>
        /// Translates a rewritten relative URL into the backslash-separated file name (still URL-encoded and possibly with a
        /// leading backslash) that the content should be saved under. An empty path becomes "index.html" and ".html" is
        /// appended when the final path segment has nothing that looks like a file extension.
        /// </summary>
        private static string GetDestinationPageName(string rewrittenUrlString)
        {
            var pageName = rewrittenUrlString.Split('?')[0].Split('#')[0].Replace('/', '\\').TrimEnd('\\');
            if (pageName == "")
            {
                return "index.html";
            }

            // The forward slashes have already been replaced with backslashes above, so the final segment has to be found by
            // splitting on '\\' - splitting on '/' would return the whole path and a dot in a parent folder name (eg. "v1.0\page")
            // would be mistaken for a file extension
            var lastUrlSegment = pageName.Split('\\').Last();
            if (!lastUrlSegment.Contains(".") || lastUrlSegment.Split('.').Last().Any(c => char.IsUpper(c))) // TODO: Hack workaround to catch "ASP.Net" and make it "ASP.Net.html"
            {
                pageName += ".html";
            }
            return pageName;
        }