Beispiel #1
        /// <summary>
        /// Decides whether the resource described by <paramref name="urlContentTypeInfo"/> should be
        /// downloaded and handled as a page resource.
        /// </summary>
        /// <param name="urlContentTypeInfo">Content type information for the URL being considered;
        /// explicitly tested for null below.</param>
        /// <returns>NOTE(review): the return statements are not visible in this excerpt, so the result
        /// for a null argument and for each content type must be confirmed against the full source.</returns>
        private bool IsDownloadablePageResource(UrlContentTypeInfo urlContentTypeInfo)
            // Guard against a missing content type description before inspecting it.
            if (urlContentTypeInfo == null)

            // We should download pages that are web pages, css, or js files and treat them as web pages!
        /// <summary>
        /// Produces the body text for an HTTP response, choosing how to do so from the response's
        /// content type.
        /// </summary>
        /// <param name="resp">The response whose ContentType header is inspected.</param>
        /// <returns>NOTE(review): the case bodies and return statements are not visible in this
        /// excerpt; confirm what is returned for each content type and when ContentType is
        /// null or empty.</returns>
        private static string GetBodyText(HttpWebResponse resp)
            // Only attempt to parse the header when the server actually sent a content type.
            if (resp.ContentType != null && resp.ContentType.Length > 0)
                IDictionary contentTypeData = MimeHelper.ParseContentType(resp.ContentType, true);
                // The entry stored under the empty-string key is read as the main MIME type
                // (presumably the value before any ';' parameters - TODO confirm against MimeHelper).
                string      mainType        = (string)contentTypeData[""];
                switch (mainType)
                case "text/plain":

                case "text/html":
        /// <summary>
        /// Gets the character set associated with the WebResponse.
        /// </summary>
        /// <param name="response">The WebResponse to inspect for a character set.</param>
        /// <param name="characterSet">When this method returns, contains the character set associated with the
        /// WebResponse if the character set is explicitly specified; otherwise, null. This parameter is passed
        /// uninitialized.</param>
        /// <returns>true if the WebResponse explicitly specifies a character set; otherwise, false.</returns>
        private bool TryGetCharacterSet(HttpWebResponse response, out string characterSet)
            // A very applicable comment from MSDN on why not to use the HttpWebResponse.CharacterSet property:
            // "As stated in a previous comment, many web servers are poorly configured and don't include the charset
            // in their content type header, e. g. they just return "text/html". In theory, user agents should treat
            // it as ISO-8859-1, as recommended by W3C. This is what the CharacterSet property actually does: It
            // always returns ISO-8859-1 if the charset is not specified, although often the content has a different
            // encoding (which of course HttpWebResponse cannot know).
            // In real life however, in case of a missing charset definition in the HTTP header user agents look into
            // the markup, and usually a meta tag can be found that contains the correct encoding, like "utf-8". To
            // implement this pragmatic approach, it would IMHO be much more convenient if the CharacterSet would
            // default to an empty string, then you know, that the encoding is not specified and you need a workaround
            // to determine the correct encoding to use.
            // The only workaround that I could find for me was to parse ContentType myself to extract the character
            // set, and ignore ContentEncoding and CharacterSet because they are useless."

            string contentType = response.ContentType;

            // Without a Content-Type header there is nothing to parse.
            if (!String.IsNullOrEmpty(contentType))
                IDictionary values = MimeHelper.ParseContentType(contentType, true);

                // Key under which the parsed header is expected to expose the charset parameter.
                const string charset = "charset";
                if (values.Contains(charset))
                    // 'as' yields null when the stored value is not a string, so the check below
                    // also rejects non-string entries.
                    characterSet = values[charset] as string;
                    if (!String.IsNullOrEmpty(characterSet))
                        // Debug-only sanity check: an explicitly specified charset should match what
                        // the framework reports, ignoring case.
                        Debug.Assert(characterSet.Equals(response.CharacterSet, StringComparison.OrdinalIgnoreCase),
                                     "CharacterSet was parsed incorrectly!");

            // No explicit charset was found; the out parameter is set to null on this path.
            characterSet = null;
        /// <summary>
        /// Produces the body text for an HTTP response, choosing how to do so from the response's
        /// content type.
        /// </summary>
        /// <param name="resp">The response whose ContentType header is inspected and whose body is
        /// passed to DecodeBody.</param>
        /// <returns>NOTE(review): most of the case bodies and all return statements are missing from
        /// this excerpt; confirm the returned value for each content type.</returns>
        private static string GetBodyText(HttpWebResponse resp)
            // Only attempt to parse the header when the server actually sent a content type.
            if (resp.ContentType != null && resp.ContentType.Length > 0)
                IDictionary contentTypeData = MimeHelper.ParseContentType(resp.ContentType, true);
                // The entry stored under the empty-string key is read as the main MIME type
                // (presumably the value before any ';' parameters - TODO confirm against MimeHelper).
                string      mainType        = (string)contentTypeData[""];
                switch (mainType)
                case "text/plain":

                case "text/html":
                                       // Tail of a nested call whose beginning is not visible here; it
                                       // receives the decoded response body as one of its arguments.
                                       DecodeBody(resp), true))));
        /// <summary>
        /// Applies the configured capture limits and filters to a URL whose content type information
        /// is already known: the URL-only overload, the page count limit, the file size limit and the
        /// download filter (pages only, or pages and documents).
        /// </summary>
        /// <param name="info">Content type information for the URL; its FinalUrl, ContentLength and
        /// ContentType are read. NOTE(review): it is dereferenced without a null check.</param>
        /// <returns>NOTE(review): the return statements are not visible in this excerpt; presumably
        /// false when any check below rejects the URL and true otherwise - confirm.</returns>
        public bool ShouldDownloadThisUrl(UrlContentTypeInfo info)
            string url = info.FinalUrl;

            // Defer to the URL-only overload first.
            if (!ShouldDownloadThisUrl(url))

            // If we've exceeded the maximum number of pages that we're allowed to download
            // (the "> -1" test suggests a negative maximum means "no limit" - TODO confirm)
            if (LimitNumberOfPages && MaxNumberOfPagesToDownload > -1 && _currentPageCount >= MaxNumberOfPagesToDownload)

            // If this file is too large
            // TODO: Should this apply to files or also web pages?  Currently applies to web pages too
            // 1048576 = 1024 * 1024, so MaxFileSizeToDownload is presumably expressed in megabytes.
            // NOTE(review): if MaxFileSizeToDownload is a 32-bit int, this multiplication can overflow
            // for large limits - confirm the declared type.
            if (LimitSizeOfFile && MaxFileSizeToDownload > 0 && info.ContentLength > MaxFileSizeToDownload * 1048576)

            // If we should only download pages and this isn't a page, filter it out
            if (DownloadFilter == SiteCaptureDownloadFilter.Pages && !MimeHelper.IsContentTypeWebPage(info.ContentType))

            // If we should only download pages and documents and this is neither a document nor a page, filter it out
            if (DownloadFilter == SiteCaptureDownloadFilter.PagesAndDocuments && (!MimeHelper.IsContentTypeDocument(info.ContentType) && !MimeHelper.IsContentTypeWebPage(info.ContentType)))

        /// <summary>
        /// Download a reference, providing progress
        /// </summary>
        /// <param name="reference">The reference to download</param>
        /// <param name="fileStorage">The storage to download the reference into</param>
        /// <param name="progressHost">The progressHost to provide feedback to</param>
        private void DownloadReference(ReferenceToDownload reference, FileBasedSiteStorage fileStorage, IProgressHost progressHost)
            if (IsBase64EmbededImage(reference.AbsoluteUrl))
            UrlDownloadToFile downloader;
            string            fullPath;

            downloader           = new UrlDownloadToFile();
            downloader.TimeoutMs = 30000;

            if (progressHost.CancelRequested)
                throw new OperationCancelledException();

            // make sure that the directory exists
            fullPath = Path.Combine(fileStorage.BasePath, reference.RelativePath);

            string directory = Path.GetDirectoryName(fullPath);

            if (!Directory.Exists(directory))

            // Make sure there aren't any conflicts
            lock (this)
                string newFileName = Path.GetFileName(fullPath);
                    fullPath    = PathHelper.GetNonConflictingPath(fullPath);
                    newFileName = Path.GetFileName(fullPath);

                    reference.SetFileName(ref newFileName);
                } while (newFileName != Path.GetFileName(fullPath));

                using (File.Open(fullPath, FileMode.Create, FileAccess.ReadWrite, FileShare.None))

            string downloadUrl = reference.AbsoluteUrl;

            if (UrlHelper.IsFileUrl(downloadUrl))
                downloadUrl = HttpUtility.UrlDecode(downloadUrl);

            downloader.Url            = downloadUrl.Trim();
            downloader.FilePath       = fullPath;
            downloader.ShowSecurityUI = true;
            if (CredentialsContext != null)
                downloader.CredentialsContext = CredentialsContext;

            catch (COMException e)
                // If the file couldn't be downloaded, this doesn't matter.  But log it
                Trace.WriteLine("Didn't download file: " + downloader.Url + " " + e.ToString());

            // Fix the filename of the downloaded entity to have the correct extension
            string contentType = downloader.ContentType;

            if (contentType != null && File.Exists(fullPath))
                string suggestedExtension = MimeHelper.GetExtensionFromContentType(contentType);
                if (suggestedExtension == null)
                    suggestedExtension = UrlHelper.GetExtensionForUrl(downloader.FinalUrl);

                if (Path.GetExtension(fullPath) != suggestedExtension)
                    string newFilePath = Path.ChangeExtension(fullPath, suggestedExtension);
                    string newFileName = Path.GetFileName(newFilePath);

                    // Try to reset the name until we can both agree
                    while (true)
                        newFilePath = PathHelper.GetNonConflictingPath(newFilePath);
                        newFileName = Path.GetFileName(newFilePath);

                        FileHelper.Rename(fullPath, newFilePath);
                        reference.SetFileName(ref newFileName);
                        if (newFileName != Path.GetFileName(newFilePath))
                            catch (Exception e) { Debug.Fail("Unable to delete failed temp file: " + e.ToString()); }

                    fullPath = newFilePath;

            // Handle anything special we need to do for stylesheet and js file references
            if (Path.GetExtension(fullPath) == ".css" && File.Exists(fullPath))
                string fileContents = string.Empty;
                using (StreamReader reader = new StreamReader(fullPath))
                    fileContents = reader.ReadToEnd();

                if (fileContents != string.Empty)
                    LightWeightCSSReplacer cssReplacer = new LightWeightCSSReplacer(fileContents);
                    // fix up references
                    foreach (ReferenceToDownload referenceInfo in _referencesToDownload.Values)
                        cssReplacer.AddUrlToReplace(new UrlToReplace(referenceInfo.UrlToReplace, referenceInfo.FileName));

                    string newCss = cssReplacer.DoReplace();
                    using (StreamWriter writer = new StreamWriter(fullPath, false))