/// <summary>
/// Determine the content type of a URL using inexpensive operations as well as
/// potentially using network IO to determine content type. This will return the
/// correct content type.
/// </summary>
/// <param name="url">The url for which to check content type</param>
/// <param name="timeOutMs">MS to execute before timing out</param>
/// <returns>The content type</returns>
public static UrlContentTypeInfo ExpensivelyGetUrlContentType(string url, int timeOutMs)
{
    string contentType = null;
    UrlContentTypeInfo urlContentTypeInfo = null;

    // If the url ends with .pdf, treat it as a PDF file no matter what the server says!
    if (UrlHelper.GetExtensionForUrl(url) == ".pdf")
    {
        contentType = GuessContentTypeLocally(url);
        return new UrlContentTypeInfo(contentType, url);
    }

    // Cheapest first: look in the browser cache
    urlContentTypeInfo = GetContentTypeFromBrowserCache(url);
    if (urlContentTypeInfo != null)
        return urlContentTypeInfo;

    // Next, ask the server that hosts the url
    urlContentTypeInfo = GetContentTypeUsingNetworkIO(url, timeOutMs);
    if (urlContentTypeInfo != null)
        return urlContentTypeInfo;

    // Finally, guess based upon the file extension
    contentType = GuessContentTypeLocally(url);
    if (contentType != null)
        return new UrlContentTypeInfo(contentType, url);

    return null;
}
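// A minimal usage sketch (not part of the original source): resolve a content type
// with a 5-second network budget, falling back to the inexpensive path when the
// expensive lookup comes back empty. "ResolveContentTypeWithFallback" and the
// timeout value are hypothetical; note that ExpensivelyGetUrlContentType already
// falls back to a local guess internally, so the extra call here is belt-and-suspenders.
private static UrlContentTypeInfo ResolveContentTypeWithFallback(string url)
{
    UrlContentTypeInfo info = ExpensivelyGetUrlContentType(url, 5000);
    if (info == null)
        info = InexpensivelyGetUrlContentType(url);
    return info;
}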
/// <summary>
/// Determines whether a resource should be downloaded and treated as a web page
/// </summary>
/// <param name="urlContentTypeInfo">The content type info for the resource</param>
/// <returns>True if the resource should be treated as a web page</returns>
private bool IsDownloadablePageResource(UrlContentTypeInfo urlContentTypeInfo)
{
    if (urlContentTypeInfo == null)
        return false;

    // We should download pages that are web pages, css, or js files and treat them as web pages!
    return MimeHelper.IsContentTypeWebPage(urlContentTypeInfo.ContentType);
}
/// <summary>
/// Determine the content type of a URL using only inexpensive operations like looking
/// in the cache or guessing based upon extension. This may not always return
/// the correct content type, especially for redirected URLs.
/// </summary>
/// <param name="url">The url for which to check content type</param>
/// <returns>The content type</returns>
public static UrlContentTypeInfo InexpensivelyGetUrlContentType(string url)
{
    UrlContentTypeInfo contentTypeInfo = GetContentTypeFromBrowserCache(url);
    if (contentTypeInfo == null)
    {
        string contentType = GuessContentTypeLocally(url);
        contentTypeInfo = new UrlContentTypeInfo(contentType, url);
    }
    return contentTypeInfo;
}
/// <summary>
/// Retrieve the content type by requesting the content type from the server hosting the URL
/// </summary>
/// <param name="url">The url for which to check content type</param>
/// <param name="timeOutMs">The duration in MS that the operation will execute before failing</param>
/// <returns>The content type</returns>
private static UrlContentTypeInfo GetContentTypeUsingNetworkIO(string url, int timeOutMs)
{
    UrlContentTypeInfo contentType = null;

    // file:// urls never require network IO; guess locally instead
    if (UrlHelper.IsFileUrl(url))
    {
        string content = GuessContentTypeLocally(url);
        if (content != null)
            contentType = new UrlContentTypeInfo(content, url);
    }

    if (contentType == null)
    {
        WebRequestWithCache webRequest = new WebRequestWithCache(url);
        WebResponse response;
        if (timeOutMs == -1)
            response = webRequest.GetHeadOnly();
        else
            response = webRequest.GetHeadOnly(timeOutMs);

        if (response != null && !string.IsNullOrEmpty(response.ContentType))
        {
            // The character encoding can be appended to the content type after a
            // semicolon, e.g. "text/html; charset=utf-8"
            string contentTypeString = response.ContentType;
            string contentEncodingString = null;
            if (contentTypeString.IndexOf(";", StringComparison.OrdinalIgnoreCase) > 0)
            {
                string[] contentTypeParts = contentTypeString.Split(';');
                contentTypeString = contentTypeParts[0].Trim();
                contentEncodingString = contentTypeParts[1].Trim();
            }
            contentType = new UrlContentTypeInfo(contentTypeString, contentEncodingString,
                UrlHelper.SafeToAbsoluteUri(response.ResponseUri), Convert.ToInt32(response.ContentLength));
        }
    }

    return contentType;
}
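// A standalone sketch of the Content-Type split used above (illustrative helper,
// not part of the original source): a raw header value like "text/html; charset=utf-8"
// is divided at the first semicolon into the media type and the optional charset part.
private static void SplitContentTypeHeader(string headerValue, out string mediaType, out string encodingPart)
{
    mediaType = headerValue.Trim();
    encodingPart = null;
    int semicolon = headerValue.IndexOf(';');
    if (semicolon > 0)
    {
        mediaType = headerValue.Substring(0, semicolon).Trim();      // e.g. "text/html"
        encodingPart = headerValue.Substring(semicolon + 1).Trim();  // e.g. "charset=utf-8"
    }
}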
public bool ShouldDownloadThisUrl(UrlContentTypeInfo info)
{
    string url = info.FinalUrl;

    if (!ShouldDownloadThisUrl(url))
        return false;

    // If we've exceeded the maximum number of pages that we're allowed to download, filter it out
    if (LimitNumberOfPages && MaxNumberOfPagesToDownload > -1 && _currentPageCount >= MaxNumberOfPagesToDownload)
        return false;

    // If this file is too large, filter it out (MaxFileSizeToDownload is in megabytes; 1 MB = 1048576 bytes)
    // TODO: Should this apply only to files, or to web pages as well? Currently it applies to web pages too
    if (LimitSizeOfFile && MaxFileSizeToDownload > 0 && info.ContentLength > MaxFileSizeToDownload * 1048576)
        return false;

    // If we should only download pages and this isn't a page, filter it out
    if (DownloadFilter == SiteCaptureDownloadFilter.Pages && !MimeHelper.IsContentTypeWebPage(info.ContentType))
        return false;

    // If we should only download pages and documents and this is neither, filter it out
    if (DownloadFilter == SiteCaptureDownloadFilter.PagesAndDocuments &&
        !MimeHelper.IsContentTypeDocument(info.ContentType) && !MimeHelper.IsContentTypeWebPage(info.ContentType))
        return false;

    return true;
}
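// A distilled sketch of the size check above (hypothetical helper, not part of the
// original source): the configured limit is interpreted as megabytes, so it is
// converted to bytes (1 MB = 1048576 bytes) before comparing to the Content-Length.
private static bool ExceedsSizeLimit(long contentLengthBytes, int maxFileSizeMb)
{
    return maxFileSizeMb > 0 && contentLengthBytes > (long)maxFileSizeMb * 1048576;
}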
/// <summary>
/// Looks up the content type in browser cache
/// </summary>
/// <param name="url">The url for which to check content type</param>
/// <returns>The content type</returns>
private static UrlContentTypeInfo GetContentTypeFromBrowserCache(string url)
{
    UrlContentTypeInfo contentType = null;

    // throw out the query string and other bits of the url, if we can
    if (UrlHelper.IsUrl(url))
    {
        Uri uri = new Uri(url);
        // by using the absolute uri, we're more likely to hit the cache
        url = UrlHelper.SafeToAbsoluteUri(uri);
    }

    // Get the header for this URL out of the cache and see if we
    // can get the content type out of the header
    Internet_Cache_Entry_Info info;
    if (WinInet.GetUrlCacheEntryInfo(url, out info))
    {
        // Get the header string for the info struct
        string header = Marshal.PtrToStringAnsi(info.lpHeaderInfo);

        // scan through the lines until we find the content type and content length lines
        if (header != null)
        {
            string contentTypeString = null;
            string contentLengthString = null;
            string contentEncodingString = null;

            string[] lines = header.Split('\n');
            foreach (string line in lines)
            {
                if (line.IndexOf(":", StringComparison.OrdinalIgnoreCase) > -1)
                {
                    string[] parts = line.Split(':');
                    if (parts[0].ToUpperInvariant() == "CONTENT-TYPE")
                    {
                        // be aware the character encoding can be appended to the end of this line
                        // following a semicolon
                        if (parts[1].IndexOf(";", StringComparison.OrdinalIgnoreCase) > -1)
                        {
                            string[] subParts = parts[1].Split(';');
                            contentTypeString = subParts[0].Trim();
                            contentEncodingString = subParts[1].Trim();
                        }
                        else
                            contentTypeString = parts[1].Trim();
                    }
                    else if (parts[0].ToUpperInvariant() == "CONTENT-LENGTH")
                    {
                        contentLengthString = parts[1].Trim();
                    }

                    if (contentTypeString != null && contentLengthString != null)
                        break;
                }
            }

            // Not every cached header includes a Content-Length, so guard the parse
            int contentLength = contentLengthString != null
                ? int.Parse(contentLengthString, CultureInfo.InvariantCulture)
                : 0;
            contentType = new UrlContentTypeInfo(contentTypeString, contentEncodingString, url, contentLength);
        }
    }

    return contentType;
}
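// An isolated sketch of the header scan above (hypothetical helper, not part of the
// original source): cached headers arrive as a single string of newline-separated
// "Name: value" lines, so finding a header is a split-and-compare over those lines.
// Unlike the Split(':') above, Substring preserves values that themselves contain colons.
private static string FindCachedHeaderValue(string rawHeaders, string headerName)
{
    foreach (string line in rawHeaders.Split('\n'))
    {
        int colon = line.IndexOf(':');
        if (colon > 0 && line.Substring(0, colon).Trim().Equals(headerName, StringComparison.OrdinalIgnoreCase))
            return line.Substring(colon + 1).Trim();
    }
    return null; // header not present
}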
/// <summary>
/// Actually downloads the pages
/// </summary>
private PageToDownload[] DownloadPages(IProgressHost progress, string url, LightWeightHTMLDocument lightWeightDocument, PageToDownload parentPageToDownload)
{
    // Check for cancel
    if (progress.CancelRequested)
        throw new OperationCancelledException();

    _currentDepth++;
    ArrayList downloadedPages = new ArrayList();

    // Set up our progress
    int thisPageTicks = FIRSTPAGETICKS;
    if (_context.Depth == _currentDepth)
        thisPageTicks = TOTALTICKS;
    ProgressTick firstPagedownloadProgress = new ProgressTick(progress, thisPageTicks, TOTALTICKS);

    string safeUrl = UrlHelper.GetUrlWithoutAnchorIdentifier(url);

    // Look up the content type of this pageToDownload
    UrlContentTypeInfo headerInfo = null;
    if (_headerInfo.ContainsKey(safeUrl))
    {
        headerInfo = (UrlContentTypeInfo)_headerInfo[safeUrl];
    }
    else
    {
        if (lightWeightDocument != null)
            headerInfo = new UrlContentTypeInfo("text/html", url);
        else if (headerInfo == null && !_context.IsTimedOutUrl(url) && _context.ShouldDownloadThisUrl(url))
        {
            progress.UpdateProgress(string.Format(CultureInfo.CurrentCulture, Res.Get(StringId.ProgressDeterminingType), url));
            if (lightWeightDocument == null)
                headerInfo = ContentTypeHelper.ExpensivelyGetUrlContentType(url, _context.TimeoutMS);
            else
                headerInfo = ContentTypeHelper.InexpensivelyGetUrlContentType(url);
        }
        _headerInfo.Add(safeUrl, headerInfo);
    }

    // If this is a web page and we should download it, do it!
    if ((lightWeightDocument != null && IsDownloadablePageResource(headerInfo)) ||
        (lightWeightDocument == null && IsDownloadablePageResource(headerInfo) && _context.ShouldDownloadThisUrl(headerInfo)))
    {
        bool downloadWorked = false;
        int downloadAttempts = -1;
        bool timedOut = true;

        ProgressTick pageDownloadProgress = new ProgressTick(firstPagedownloadProgress, 80, 100);
        // Make sure we are retrying the correct number of times, and only after timeouts
        while (!downloadWorked && downloadAttempts++ < _context.RetryCount && timedOut)
        {
            timedOut = false;
            pageDownloadProgress.UpdateProgress(0, 1);
            try
            {
                // If we haven't downloaded this page yet, download it
                PageToDownload thisPageToDownload = null;
                if (!_context.UrlAlreadyDownloaded(safeUrl))
                {
                    if (lightWeightDocument == null)
                        thisPageToDownload = DownloadUrl(url, parentPageToDownload, pageDownloadProgress);
                    else
                    {
                        LightWeightHTMLDocument htmlDoc = lightWeightDocument;

                        // Only redownload if we absolutely need to
                        if (htmlDoc.HasFramesOrStyles && (htmlDoc.Frames == null || htmlDoc.StyleResourcesUrls == null))
                        {
                            string html = htmlDoc.GenerateHtml();
                            string tempFile = TempFileManager.Instance.CreateTempFile("temp.htm");
                            using (StreamWriter writer = new StreamWriter(tempFile, false, Encoding.UTF8))
                                writer.Write(html);
                            using (HTMLDocumentDownloader downloader = new HTMLDocumentDownloader(_parentControl, UrlHelper.GetLocalFileUrl(tempFile), htmlDoc.Title, _context.CookieString, _context.TimeoutMS, false))
                            {
                                downloader.DownloadHTMLDocument(pageDownloadProgress);
                                htmlDoc.UpdateBasedUponHTMLDocumentData(downloader.HtmlDocument, url);
                            }
                        }

                        thisPageToDownload = new PageToDownload(htmlDoc, url, null, parentPageToDownload);
                        if (htmlDoc.StyleResourcesUrls != null)
                            foreach (HTMLDocumentHelper.ResourceUrlInfo styleUrl in htmlDoc.StyleResourcesUrls)
                                thisPageToDownload.AddReference(new ReferenceToDownload(styleUrl.ResourceUrl, thisPageToDownload, styleUrl.ResourceAbsoluteUrl));
                    }

                    // Add this page to our lists
                    _context.AddPageToDownload(safeUrl, thisPageToDownload, true);
                    downloadedPages.Add(thisPageToDownload);
                }
                else
                {
                    thisPageToDownload = (PageToDownload)_context.CreatedPageToDownloadTable[safeUrl];
                }

                // If we're downloading a site, add a second copy of the root page in the references subdir.
                // This way, if the root page gets renamed, links back to it will still work correctly.
                // This is a bit of a hack, but otherwise we'd need to escape urls whenever we output
                // the site and change the root file name
                if (thisPageToDownload.IsRootPage && _context.Depth > 0)
                {
                    PageToDownload copyOfThisPageToDownload = new PageToDownload(thisPageToDownload.LightWeightHTMLDocument.Clone(), thisPageToDownload.UrlToReplace, thisPageToDownload.FileName, thisPageToDownload);
                    downloadedPages.Add(copyOfThisPageToDownload);
                }

                // enumerate the frames of this page and add them to the list of pages
                PageToDownload[] subFramesToDownload = GetFramePagesToDownload(thisPageToDownload);
                downloadedPages.AddRange(subFramesToDownload);
                foreach (PageToDownload pageToDownload in subFramesToDownload)
                    _context.AddPageToDownload(pageToDownload.AbsoluteUrl, pageToDownload, false);

                // Now drill down based upon the depth configuration
                if (_context.ShouldContinue(_currentDepth))
                {
                    ProgressTick otherPagesdownloadProgress = new ProgressTick(progress, TOTALTICKS - thisPageTicks, TOTALTICKS);
                    downloadedPages.AddRange(GetSubPagesToDownload(otherPagesdownloadProgress, downloadedPages, thisPageToDownload));
                }

                downloadWorked = true;
                firstPagedownloadProgress.UpdateProgress(1, 1);
            }
            catch (OperationTimedOutException)
            {
                timedOut = true;
            }
            catch (WebPageDownloaderException htex)
            {
                HandleException(new Exception(htex.Message, htex));
            }
            catch (Exception ex)
            {
                HandleException(new Exception(String.Format(CultureInfo.CurrentCulture, "{0} could not be downloaded", _url), ex));
            }
        }

        // If we never got the download to succeed, add it to the list of timed out Urls
        if (!downloadWorked && timedOut)
        {
            _context.AddTimedOutUrl(_url);
            firstPagedownloadProgress.UpdateProgress(1, 1);
        }
    }
    // If it isn't a page, we'll just add the file to the reference list for the parent page.
    // There is no final else because we could be looking at a reference that
    // should not be downloaded (in which case we just ignore it)
    else if (headerInfo != null && _context.ShouldDownloadThisUrl(headerInfo))
    {
        parentPageToDownload.AddReference(new ReferenceToDownload(url, parentPageToDownload));
        progress.UpdateProgress(1, 1);
    }

    progress.UpdateProgress(1, 1);
    _currentDepth--;
    return (PageToDownload[])downloadedPages.ToArray(typeof(PageToDownload));
}
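// A distilled sketch of the retry policy in DownloadPages above (names are
// illustrative, not part of the original source): the download is attempted up to
// retryCount + 1 times, but the loop is only re-entered when the failure was a
// timeout. This sketch lets other exceptions propagate; the original additionally
// catches and reports them via HandleException, which likewise ends the attempts.
private static bool TryDownloadWithRetries(Action download, int retryCount)
{
    for (int attempt = 0; attempt <= retryCount; attempt++)
    {
        try
        {
            download();
            return true; // success, no more attempts needed
        }
        catch (OperationTimedOutException)
        {
            // a timeout is the only failure worth retrying
        }
    }
    return false; // every attempt timed out
}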