Example #1
0
        /// <summary>
        /// Decides whether a resource should be downloaded and handled as a web page.
        /// </summary>
        /// <param name="urlContentTypeInfo">Content type details for the resource; may be null.</param>
        /// <returns>false when no info is supplied; otherwise whatever
        /// MimeHelper.IsContentTypeWebPage reports for the info's content type.</returns>
        private bool IsDownloadablePageResource(UrlContentTypeInfo urlContentTypeInfo)
        {
            // Author's stated intent: web pages, css and js files are all downloaded and treated as
            // web pages. The actual classification is delegated to MimeHelper.
            return urlContentTypeInfo != null
                   && MimeHelper.IsContentTypeWebPage(urlContentTypeInfo.ContentType);
        }
        /// <summary>
        /// Returns the decoded body of a response whose main content type is text/plain or text/html.
        /// </summary>
        /// <param name="resp">The response whose body should be read.</param>
        /// <returns>The result of DecodeBody for text/plain and text/html responses; an empty string
        /// for any other content type or when the response declares no content type.</returns>
        private static string GetBodyText(HttpWebResponse resp)
        {
            string header = resp.ContentType;
            if (string.IsNullOrEmpty(header))
            {
                return "";
            }

            // The main type is read from the empty-string key of the parsed content type
            IDictionary parsed      = MimeHelper.ParseContentType(header, true);
            string      primaryType = (string)parsed[""];

            bool isTextual = primaryType == "text/plain" || primaryType == "text/html";
            return isTextual ? DecodeBody(resp) : "";
        }
        /// <summary>
        /// Looks up the character set that the response explicitly declares in its Content-Type header.
        /// </summary>
        /// <param name="response">The response whose Content-Type header is inspected.</param>
        /// <param name="characterSet">On return, the declared character set when one is explicitly
        /// present and non-empty; otherwise, null. This parameter is passed uninitialized.</param>
        /// <returns>true if the response explicitly declares a non-empty character set; otherwise, false.</returns>
        private bool TryGetCharacterSet(HttpWebResponse response, out string characterSet)
        {
            // Why HttpWebResponse.CharacterSet is not used directly -- summarised from a reader comment on
            // http://msdn.microsoft.com/en-us/library/system.net.httpwebresponse.characterset(v=VS.80).aspx
            //
            // Many web servers are poorly configured and omit the charset from their content type header,
            // e.g. they just return "text/html". According to that comment, the CharacterSet property then
            // always reports ISO-8859-1 (the W3C-recommended default), even though the content is often in
            // a different encoding that HttpWebResponse cannot know about. Real user agents instead look
            // into the markup, where a meta tag usually names the correct encoding (like "utf-8"). Since
            // CharacterSet cannot signal "not specified", the workaround is to parse ContentType ourselves
            // and ignore ContentEncoding and CharacterSet.

            const string CharsetKey = "charset";

            string header = response.ContentType;
            if (!String.IsNullOrEmpty(header))
            {
                IDictionary parameters = MimeHelper.ParseContentType(header, true);
                string      declared   = parameters.Contains(CharsetKey) ? parameters[CharsetKey] as string : null;

                if (!String.IsNullOrEmpty(declared))
                {
                    characterSet = declared;

                    // When a charset really is present, the framework's own parsing should agree with ours
                    Debug.Assert(characterSet.Equals(response.CharacterSet, StringComparison.OrdinalIgnoreCase),
                                 "CharacterSet was parsed incorrectly!");
                    return true;
                }
            }

            // No header, no charset parameter, or an empty charset value
            characterSet = null;
            return false;
        }
        /// <summary>
        /// Extracts the body of a textual response. text/plain bodies are returned as decoded;
        /// text/html bodies are passed through LightWeightHTMLThinner2.Thin,
        /// HTMLDocumentHelper.HTMLToPlainText and StringHelper.CompressExcessWhitespace, in that order.
        /// </summary>
        /// <param name="resp">The response whose body should be read.</param>
        /// <returns>The processed body for text/plain and text/html responses; an empty string for
        /// any other content type or when the response declares no content type.</returns>
        private static string GetBodyText(HttpWebResponse resp)
        {
            string header = resp.ContentType;
            if (string.IsNullOrEmpty(header))
            {
                return "";
            }

            // The main type is read from the empty-string key of the parsed content type
            IDictionary parsed      = MimeHelper.ParseContentType(header, true);
            string      primaryType = (string)parsed[""];

            if (primaryType == "text/plain")
            {
                return DecodeBody(resp);
            }

            if (primaryType == "text/html")
            {
                // Innermost call runs first: decode -> thin -> plain text -> whitespace compression
                return StringHelper.CompressExcessWhitespace(
                    HTMLDocumentHelper.HTMLToPlainText(
                        LightWeightHTMLThinner2.Thin(DecodeBody(resp), true)));
            }

            return "";
        }
        /// <summary>
        /// Determines whether the resource described by <paramref name="info"/> should be downloaded.
        /// The checks run in order: the URL-based filter, the page-count limit, the file-size limit
        /// and finally the content-type filter selected by DownloadFilter.
        /// </summary>
        /// <param name="info">Final URL, content type and content length of the candidate resource.
        /// Must not be null; it is dereferenced immediately.</param>
        /// <returns>true if none of the enabled checks rejects the resource; otherwise, false.</returns>
        public bool ShouldDownloadThisUrl(UrlContentTypeInfo info)
        {
            // The size limit is scaled by 1048576 (i.e. treated as megabytes). The factor is a long so
            // the multiplication is performed in 64-bit arithmetic; with a 32-bit limit, a value of
            // 2048 or more would otherwise overflow silently and corrupt the comparison.
            const long BytesPerMegabyte = 1048576L;

            string url = info.FinalUrl;

            // Apply the URL-only overload first
            if (!ShouldDownloadThisUrl(url))
            {
                return false;
            }

            // If we've reached the maximum number of pages that we're allowed to download
            if (LimitNumberOfPages && MaxNumberOfPagesToDownload > -1 && _currentPageCount >= MaxNumberOfPagesToDownload)
            {
                return false;
            }

            // If this file is too large
            // TODO: Should this apply to files or also web pages?  Currently applies to web pages too
            if (LimitSizeOfFile && MaxFileSizeToDownload > 0 && info.ContentLength > MaxFileSizeToDownload * BytesPerMegabyte)
            {
                return false;
            }

            // If we should only download pages and this isn't a page, filter it out
            if (DownloadFilter == SiteCaptureDownloadFilter.Pages && !MimeHelper.IsContentTypeWebPage(info.ContentType))
            {
                return false;
            }

            // If we should only download pages and documents and this is neither, filter it out
            if (DownloadFilter == SiteCaptureDownloadFilter.PagesAndDocuments
                && !MimeHelper.IsContentTypeDocument(info.ContentType)
                && !MimeHelper.IsContentTypeWebPage(info.ContentType))
            {
                return false;
            }

            return true;
        }
        /// <summary>
        /// Download a reference, providing progress. After the download attempt the file's extension
        /// is adjusted to match the content type reported by the downloader, and a resulting .css file
        /// is rewritten through LightWeightCSSReplacer with one UrlToReplace entry per value in
        /// _referencesToDownload.
        /// </summary>
        /// <param name="reference">The reference to download</param>
        /// <param name="fileStorage">The storage to download the reference into</param>
        /// <param name="progressHost">The progressHost to provide feedback to</param>
        /// <exception cref="OperationCancelledException">Thrown when <paramref name="progressHost"/>
        /// reports CancelRequested before the download is started.</exception>
        private void DownloadReference(ReferenceToDownload reference, FileBasedSiteStorage fileStorage, IProgressHost progressHost)
        {
            // Nothing is downloaded when the helper classifies the URL as a base64 embedded image
            // (presumably an inline data: URI -- confirm against IsBase64EmbededImage)
            if (IsBase64EmbededImage(reference.AbsoluteUrl))
            {
                return;
            }
            UrlDownloadToFile downloader;
            string            fullPath;

            downloader           = new UrlDownloadToFile();
            // Presumably milliseconds given the property name, i.e. a 30 second timeout
            downloader.TimeoutMs = 30000;

            // Honor cancellation before touching the file system
            if (progressHost.CancelRequested)
            {
                throw new OperationCancelledException();
            }

            // make sure that the directory exists
            fullPath = Path.Combine(fileStorage.BasePath, reference.RelativePath);

            string directory = Path.GetDirectoryName(fullPath);

            if (!Directory.Exists(directory))
            {
                Directory.CreateDirectory(directory);
            }

            // Make sure there aren't any conflicts
            // Path selection and placeholder creation happen under a lock on this instance
            // (presumably so concurrent downloads cannot settle on the same path -- confirm)
            lock (this)
            {
                // This initial value is overwritten inside the loop before it is read
                string newFileName = Path.GetFileName(fullPath);
                do
                {
                    fullPath    = PathHelper.GetNonConflictingPath(fullPath);
                    newFileName = Path.GetFileName(fullPath);

                    // SetFileName takes the name by ref and may hand back a different one; the loop
                    // repeats until the reference and the path agree on the file name.
                    // NOTE(review): fullPath is not updated from the name SetFileName hands back, so
                    // the next iteration starts from the same fullPath -- confirm the loop terminates
                    // when SetFileName changes the name.
                    reference.SetFileName(ref newFileName);
                } while (newFileName != Path.GetFileName(fullPath));

                // Create (or truncate) an empty file at the chosen path and close it immediately
                // (presumably to reserve the name while still holding the lock -- confirm)
                using (File.Open(fullPath, FileMode.Create, FileAccess.ReadWrite, FileShare.None))
                {
                }
            }

            string downloadUrl = reference.AbsoluteUrl;

            // File URLs are URL-decoded before being handed to the downloader
            if (UrlHelper.IsFileUrl(downloadUrl))
            {
                downloadUrl = HttpUtility.UrlDecode(downloadUrl);
            }

            downloader.Url            = downloadUrl.Trim();
            downloader.FilePath       = fullPath;
            downloader.ShowSecurityUI = true;
            // Only override the downloader's credentials context when one has been configured
            if (CredentialsContext != null)
            {
                downloader.CredentialsContext = CredentialsContext;
            }

            try
            {
                downloader.Download(progressHost);
            }
            catch (COMException e)
            {
                // If the file couldn't be downloaded, this doesn't matter.  But log it
                // Only COMException is swallowed here; execution continues with the steps below
                Trace.WriteLine("Didn't download file: " + downloader.Url + " " + e.ToString());
            }

            // Fix the filename of the downloaded entity to have the correct extension
            string contentType = downloader.ContentType;

            if (contentType != null && File.Exists(fullPath))
            {
                // Prefer the extension derived from the content type; fall back to the final URL's extension
                string suggestedExtension = MimeHelper.GetExtensionFromContentType(contentType);
                if (suggestedExtension == null)
                {
                    suggestedExtension = UrlHelper.GetExtensionForUrl(downloader.FinalUrl);
                }

                // Ordinal, case-sensitive comparison of the current and suggested extensions
                if (Path.GetExtension(fullPath) != suggestedExtension)
                {
                    string newFilePath = Path.ChangeExtension(fullPath, suggestedExtension);
                    // This initial value is overwritten inside the loop before it is read
                    string newFileName = Path.GetFileName(newFilePath);

                    // Try to reset the name until we can both agree
                    while (true)
                    {
                        newFilePath = PathHelper.GetNonConflictingPath(newFilePath);
                        newFileName = Path.GetFileName(newFilePath);

                        FileHelper.Rename(fullPath, newFilePath);
                        reference.SetFileName(ref newFileName);
                        if (newFileName != Path.GetFileName(newFilePath))
                        {
                            // The reference handed back a different name: remove the file just produced
                            // at newFilePath and try again.
                            // NOTE(review): the retry calls Rename with the same fullPath again -- verify
                            // that FileHelper.Rename leaves fullPath usable for a second attempt.
                            try
                            {
                                File.Delete(newFilePath);
                            }
                            catch (Exception e) { Debug.Fail("Unable to delete failed temp file: " + e.ToString()); }
                        }
                        else
                        {
                            break;
                        }
                    }

                    fullPath = newFilePath;
                }
            }

            // Handle anything special we need to do for stylesheet and js file references
            // (only .css files are processed below; the extension comparison is case-sensitive)
            if (Path.GetExtension(fullPath) == ".css" && File.Exists(fullPath))
            {
                string fileContents = string.Empty;
                using (StreamReader reader = new StreamReader(fullPath))
                    fileContents = reader.ReadToEnd();

                // Empty stylesheets are left untouched
                if (fileContents != string.Empty)
                {
                    LightWeightCSSReplacer cssReplacer = new LightWeightCSSReplacer(fileContents);
                    // fix up references
                    // Each known reference's UrlToReplace is paired with its FileName
                    foreach (ReferenceToDownload referenceInfo in _referencesToDownload.Values)
                    {
                        cssReplacer.AddUrlToReplace(new UrlToReplace(referenceInfo.UrlToReplace, referenceInfo.FileName));
                    }

                    // Overwrite the stylesheet in place with the replacer's output
                    string newCss = cssReplacer.DoReplace();
                    using (StreamWriter writer = new StreamWriter(fullPath, false))
                        writer.Write(newCss);
                }
            }
        }