/// <summary>
/// Tells whether a resource should be downloaded and handled as a web page.
/// </summary>
/// <param name="urlContentTypeInfo">Content type information for the resource; may be null.</param>
/// <returns>
/// false when <paramref name="urlContentTypeInfo"/> is null; otherwise whatever
/// MimeHelper.IsContentTypeWebPage reports for its content type.
/// </returns>
private bool IsDownloadablePageResource(UrlContentTypeInfo urlContentTypeInfo)
{
    // Original intent (per the author's note): web pages, css and js files are all downloaded
    // and treated as web pages. The actual classification is delegated to MimeHelper.
    return urlContentTypeInfo != null
        && MimeHelper.IsContentTypeWebPage(urlContentTypeInfo.ContentType);
}
/// <summary>
/// Returns the decoded body of a response whose main content type is text/plain or text/html.
/// </summary>
/// <param name="resp">The response to read the body from.</param>
/// <returns>
/// The result of DecodeBody for text/plain and text/html responses; an empty string when the
/// content type is missing, empty, or any other type.
/// </returns>
private static string GetBodyText(HttpWebResponse resp)
{
    // Nothing to decide on without a content type header.
    if (string.IsNullOrEmpty(resp.ContentType))
    {
        return "";
    }

    // The main type is stored under the empty-string key of the parsed content type data.
    IDictionary parsedContentType = MimeHelper.ParseContentType(resp.ContentType, true);
    string mainType = (string)parsedContentType[""];

    bool isTextual = mainType == "text/plain" || mainType == "text/html";
    if (isTextual)
    {
        return DecodeBody(resp);
    }

    return "";
}
/// <summary>
/// Attempts to read an explicitly specified character set from the response's Content-Type header.
/// </summary>
/// <param name="response">The response whose content type is inspected.</param>
/// <param name="characterSet">
/// On return, the character set named in the content type when one is explicitly present and
/// non-empty; otherwise null. This parameter is passed uninitialized.
/// </param>
/// <returns>true if the response explicitly specifies a character set; otherwise, false.</returns>
private bool TryGetCharacterSet(HttpWebResponse response, out string characterSet)
{
    // Why HttpWebResponse.CharacterSet is not used directly (paraphrasing a community comment on
    // http://msdn.microsoft.com/en-us/library/system.net.httpwebresponse.characterset(v=VS.80).aspx):
    // many servers omit the charset and send just "text/html". CharacterSet then reports
    // ISO-8859-1 as the W3C-recommended default, even though the content is often encoded
    // differently (typically declared in a meta tag in the markup). That makes it impossible to
    // tell "explicitly ISO-8859-1" apart from "not specified", so the content type is parsed here
    // instead and only an explicit charset counts.
    const string charsetKey = "charset";

    string explicitCharset = null;
    string contentTypeHeader = response.ContentType;
    if (!String.IsNullOrEmpty(contentTypeHeader))
    {
        IDictionary parsed = MimeHelper.ParseContentType(contentTypeHeader, true);
        if (parsed.Contains(charsetKey))
        {
            explicitCharset = parsed[charsetKey] as string;
        }
    }

    if (String.IsNullOrEmpty(explicitCharset))
    {
        characterSet = null;
        return false;
    }

    characterSet = explicitCharset;

    // Sanity check: an explicit charset should match what the framework parsed (ignoring case).
    Debug.Assert(characterSet.Equals(response.CharacterSet, StringComparison.OrdinalIgnoreCase), "CharacterSet was parsed incorrectly!");
    return true;
}
/// <summary>
/// Returns the body of a text response as plain text.
/// </summary>
/// <param name="resp">The response to read the body from.</param>
/// <returns>
/// For text/plain, the result of DecodeBody. For text/html, the decoded body passed through
/// LightWeightHTMLThinner2.Thin, HTMLDocumentHelper.HTMLToPlainText and
/// StringHelper.CompressExcessWhitespace, in that order. An empty string when the content
/// type is missing, empty, or any other type.
/// </returns>
private static string GetBodyText(HttpWebResponse resp)
{
    // Nothing to decide on without a content type header.
    if (string.IsNullOrEmpty(resp.ContentType))
    {
        return "";
    }

    // The main type is stored under the empty-string key of the parsed content type data.
    IDictionary parsedContentType = MimeHelper.ParseContentType(resp.ContentType, true);
    string mainType = (string)parsedContentType[""];

    if (mainType == "text/plain")
    {
        return DecodeBody(resp);
    }

    if (mainType == "text/html")
    {
        // Thin the markup first, convert what is left to plain text, then tidy the whitespace.
        return StringHelper.CompressExcessWhitespace(
            HTMLDocumentHelper.HTMLToPlainText(
                LightWeightHTMLThinner2.Thin(DecodeBody(resp), true)));
    }

    return "";
}
/// <summary>
/// Decides whether the resource described by <paramref name="info"/> should be downloaded.
/// </summary>
/// <remarks>
/// The checks run in this order and the first failing one rejects the resource:
/// the URL-based ShouldDownloadThisUrl overload (using the final URL), the page-count limit,
/// the file-size limit, and finally the content-type download filter.
/// </remarks>
/// <param name="info">Final URL, content length and content type of the candidate resource.</param>
/// <returns>true if every applicable check passes; otherwise, false.</returns>
public bool ShouldDownloadThisUrl(UrlContentTypeInfo info)
{
    // MaxFileSizeToDownload is expressed in megabytes. The multiplier is a long so the product
    // is computed in 64-bit arithmetic; with a 32-bit multiplier a limit of 2048 MB or more
    // would silently wrap around and make the size check meaningless.
    const long bytesPerMegabyte = 1048576L;

    string url = info.FinalUrl;
    if (!ShouldDownloadThisUrl(url))
    {
        return false;
    }

    // If we've exceeded the maximum number of pages that we're allowed to download
    if (LimitNumberOfPages && MaxNumberOfPagesToDownload > -1 && _currentPageCount >= MaxNumberOfPagesToDownload)
    {
        return false;
    }

    // If this file is too large
    // TODO: Should this apply to files or also web pages? Currently applies to web pages too
    if (LimitSizeOfFile && MaxFileSizeToDownload > 0 && info.ContentLength > MaxFileSizeToDownload * bytesPerMegabyte)
    {
        return false;
    }

    // If we should only download pages and this isn't a page, filter it out
    if (DownloadFilter == SiteCaptureDownloadFilter.Pages && !MimeHelper.IsContentTypeWebPage(info.ContentType))
    {
        return false;
    }

    // If we should only download pages and documents and this is neither, filter it out
    if (DownloadFilter == SiteCaptureDownloadFilter.PagesAndDocuments &&
        (!MimeHelper.IsContentTypeDocument(info.ContentType) && !MimeHelper.IsContentTypeWebPage(info.ContentType)))
    {
        return false;
    }

    return true;
}
/// <summary>
/// Downloads a single reference into the site storage, reporting progress as it goes.
/// </summary>
/// <remarks>
/// In order: returns immediately for references that IsBase64EmbededImage recognises; reserves a
/// non-conflicting target file under the storage base path; downloads into it; renames the file
/// when the downloader's content type suggests a different extension; and, for .css files,
/// runs the file through LightWeightCSSReplacer with every entry of _referencesToDownload.
/// </remarks>
/// <param name="reference">The reference to download</param>
/// <param name="fileStorage">The storage to download the reference into</param>
/// <param name="progressHost">The progressHost to provide feedback to; also polled for cancellation</param>
private void DownloadReference(ReferenceToDownload reference, FileBasedSiteStorage fileStorage, IProgressHost progressHost)
{
    if (IsBase64EmbededImage(reference.AbsoluteUrl))
    {
        return;
    }

    UrlDownloadToFile downloader = new UrlDownloadToFile();
    downloader.TimeoutMs = 30000;

    if (progressHost.CancelRequested)
    {
        throw new OperationCancelledException();
    }

    // The target directory has to exist before a file can be reserved in it.
    string targetPath = Path.Combine(fileStorage.BasePath, reference.RelativePath);
    string targetDirectory = Path.GetDirectoryName(targetPath);
    if (!Directory.Exists(targetDirectory))
    {
        Directory.CreateDirectory(targetDirectory);
    }

    // Pick a file name that does not conflict on disk and that the reference accepts unchanged,
    // then claim it by creating the (empty) file while still holding the lock.
    lock (this)
    {
        string acceptedName = Path.GetFileName(targetPath);
        while (true)
        {
            targetPath = PathHelper.GetNonConflictingPath(targetPath);
            acceptedName = Path.GetFileName(targetPath);
            reference.SetFileName(ref acceptedName);
            if (acceptedName == Path.GetFileName(targetPath))
            {
                break;
            }
        }

        using (File.Open(targetPath, FileMode.Create, FileAccess.ReadWrite, FileShare.None))
        {
        }
    }

    // File URLs are decoded before being handed to the downloader.
    string sourceUrl = reference.AbsoluteUrl;
    if (UrlHelper.IsFileUrl(sourceUrl))
    {
        sourceUrl = HttpUtility.UrlDecode(sourceUrl);
    }

    downloader.Url = sourceUrl.Trim();
    downloader.FilePath = targetPath;
    downloader.ShowSecurityUI = true;
    if (CredentialsContext != null)
    {
        downloader.CredentialsContext = CredentialsContext;
    }

    try
    {
        downloader.Download(progressHost);
    }
    catch (COMException downloadError)
    {
        // A failed download is not fatal for the capture; record it and carry on.
        Trace.WriteLine("Didn't download file: " + downloader.Url + " " + downloadError.ToString());
    }

    // Give the downloaded file the extension that matches its content type, falling back to
    // the extension of the final URL when the content type does not map to one.
    string reportedContentType = downloader.ContentType;
    if (reportedContentType != null && File.Exists(targetPath))
    {
        string preferredExtension = MimeHelper.GetExtensionFromContentType(reportedContentType)
            ?? UrlHelper.GetExtensionForUrl(downloader.FinalUrl);

        if (Path.GetExtension(targetPath) != preferredExtension)
        {
            string renamedPath = Path.ChangeExtension(targetPath, preferredExtension);
            string renamedName = Path.GetFileName(renamedPath);

            // Keep trying until the name on disk and the name the reference keeps are the same.
            bool namesAgree;
            do
            {
                renamedPath = PathHelper.GetNonConflictingPath(renamedPath);
                renamedName = Path.GetFileName(renamedPath);
                FileHelper.Rename(targetPath, renamedPath);
                reference.SetFileName(ref renamedName);

                namesAgree = renamedName == Path.GetFileName(renamedPath);
                if (!namesAgree)
                {
                    try
                    {
                        File.Delete(renamedPath);
                    }
                    catch (Exception deleteError)
                    {
                        Debug.Fail("Unable to delete failed temp file: " + deleteError.ToString());
                    }
                }
            }
            while (!namesAgree);

            targetPath = renamedPath;
        }
    }

    // Stylesheets get their URLs rewritten; everything else is finished at this point.
    if (Path.GetExtension(targetPath) != ".css" || !File.Exists(targetPath))
    {
        return;
    }

    string originalCss = string.Empty;
    using (StreamReader cssReader = new StreamReader(targetPath))
    {
        originalCss = cssReader.ReadToEnd();
    }

    if (originalCss.Length != 0)
    {
        LightWeightCSSReplacer replacer = new LightWeightCSSReplacer(originalCss);

        // Register every known reference so its URL is replaced by its file name.
        foreach (ReferenceToDownload knownReference in _referencesToDownload.Values)
        {
            replacer.AddUrlToReplace(new UrlToReplace(knownReference.UrlToReplace, knownReference.FileName));
        }

        string rewrittenCss = replacer.DoReplace();
        using (StreamWriter cssWriter = new StreamWriter(targetPath, false))
        {
            cssWriter.Write(rewrittenCss);
        }
    }
}