/// <summary>
        /// Resolves the content type of a URL, trying the cheap sources first and falling
        /// back to network IO when they produce nothing. The lookup order is: the ".pdf"
        /// extension shortcut, the browser cache, a request to the server, and finally a
        /// local guess.
        /// </summary>
        /// <param name="url">The url for which to check content type</param>
        /// <param name="timeOutMs">MS to execute the network lookup before timing out</param>
        /// <returns>The content type info, or null when no source could supply a content type</returns>
        public static UrlContentTypeInfo ExpensivelyGetUrlContentType(string url, int timeOutMs)
        {
            // A ".pdf" extension wins over anything the cache or the server would report.
            if (UrlHelper.GetExtensionForUrl(url) == ".pdf")
            {
                return new UrlContentTypeInfo(GuessContentTypeLocally(url), url);
            }

            // Browser cache first; the network is only consulted on a cache miss.
            UrlContentTypeInfo resolved = GetContentTypeFromBrowserCache(url)
                                          ?? GetContentTypeUsingNetworkIO(url, timeOutMs);

            if (resolved == null)
            {
                // Last resort: guess locally from the url itself.
                string guessedContentType = GuessContentTypeLocally(url);
                if (guessedContentType != null)
                {
                    resolved = new UrlContentTypeInfo(guessedContentType, url);
                }
            }

            return resolved;
        }
        /// <summary>
        /// Browser-cache lookup for a url's content type. This variant performs no lookup
        /// and always reports a cache miss.
        /// </summary>
        /// <param name="url">The url for which to check content type (unused here)</param>
        /// <returns>Always null</returns>
        private static UrlContentTypeInfo GetContentTypeFromBrowserCache(string url)
        {
            // Nothing is read from the cache in this variant; callers always see a miss.
            return null;
        }
// Example #3
// 0
        /// <summary>
        /// Tells whether the described resource should be downloaded and treated as a web page.
        /// </summary>
        /// <param name="urlContentTypeInfo">Header info for the resource; may be null</param>
        /// <returns>
        /// False when no info is available; otherwise whatever
        /// MimeHelper.IsContentTypeWebPage reports for the info's content type
        /// </returns>
        private bool IsDownloadablePageResource(UrlContentTypeInfo urlContentTypeInfo)
        {
            // Web pages, css, or js files are downloaded and treated as web pages;
            // a missing header info is never downloadable.
            return urlContentTypeInfo != null
                   && MimeHelper.IsContentTypeWebPage(urlContentTypeInfo.ContentType);
        }
        /// <summary>
        /// Resolves the content type of a URL without any network IO: the browser cache is
        /// consulted first and, on a miss, the type is guessed locally. The result may be
        /// wrong, especially for redirected URLs.
        /// </summary>
        /// <param name="url">The url for which to check content type</param>
        /// <returns>
        /// The cached content type info when available; otherwise a new info built from the
        /// local guess (which is passed through as-is, even when the guess is null)
        /// </returns>
        public static UrlContentTypeInfo InexpensivelyGetUrlContentType(string url)
        {
            UrlContentTypeInfo cachedInfo = GetContentTypeFromBrowserCache(url);
            if (cachedInfo != null)
            {
                return cachedInfo;
            }

            // Cache miss: fall back to a local guess for this url.
            string guessedContentType = GuessContentTypeLocally(url);
            return new UrlContentTypeInfo(guessedContentType, url);
        }
        /// <summary>
        /// Retrieve the content type by requesting the headers from the server hosting the URL.
        /// File URLs are first resolved with a local guess; the header request is only issued
        /// when that guess yields nothing (or the url is not a file url).
        /// </summary>
        /// <param name="url">The url for which to check content type</param>
        /// <param name="timeOutMs">
        /// The duration in MS that the operation will execute before failing; -1 selects the
        /// GetHeadOnly overload that takes no explicit timeout
        /// </param>
        /// <returns>The content type info, or null when no content type could be obtained</returns>
        private static UrlContentTypeInfo GetContentTypeUsingNetworkIO(string url, int timeOutMs)
        {
            // Local files never need a request if their type can be guessed.
            if (UrlHelper.IsFileUrl(url))
            {
                string localContentType = GuessContentTypeLocally(url);
                if (localContentType != null)
                {
                    return new UrlContentTypeInfo(localContentType, url);
                }
            }

            WebRequestWithCache webRequest = new WebRequestWithCache(url);

            WebResponse response;
            if (timeOutMs == -1)
            {
                response = webRequest.GetHeadOnly();
            }
            else
            {
                response = webRequest.GetHeadOnly(timeOutMs);
            }

            if (response == null)
            {
                return null;
            }

            try
            {
                string contentTypeString = response.ContentType;
                if (string.IsNullOrEmpty(contentTypeString))
                {
                    return null;
                }

                // The header may carry parameters after a semicolon ("text/html; charset=utf-8").
                // Keep the media type and the first parameter, trimmed so that the values match
                // what the browser-cache lookup produces.
                string contentEncodingString = null;
                if (contentTypeString.IndexOf(';') > 0)
                {
                    string[] contentTypeParts = contentTypeString.Split(';');
                    contentTypeString = contentTypeParts[0];
                    contentEncodingString = contentTypeParts[1].Trim();
                }
                contentTypeString = contentTypeString.Trim();

                // ContentLength is a long; Convert.ToInt32 would throw OverflowException for
                // responses larger than int.MaxValue bytes, so clamp instead.
                long reportedLength = response.ContentLength;
                int contentLength = reportedLength > int.MaxValue ? int.MaxValue : (int)reportedLength;

                return new UrlContentTypeInfo(contentTypeString, contentEncodingString, UrlHelper.SafeToAbsoluteUri(response.ResponseUri), contentLength);
            }
            finally
            {
                // Only the headers are needed; release the response so its connection is not leaked.
                response.Close();
            }
        }
        /// <summary>
        /// Decides whether the resource described by <paramref name="info"/> should be downloaded.
        /// The url-only overload is applied to the final url first, followed by the page-count
        /// limit, the file-size limit and the download filter.
        /// </summary>
        /// <param name="info">Header info of the resource; a null value is never downloaded</param>
        /// <returns>True when every check passes; otherwise false</returns>
        public bool ShouldDownloadThisUrl(UrlContentTypeInfo info)
        {
            // Without header info none of the checks below can be evaluated.
            if (info == null)
            {
                return false;
            }

            string url = info.FinalUrl;

            if (!ShouldDownloadThisUrl(url))
            {
                return false;
            }

            // If we've exceeded the maximum number of pages that we're allowed to download
            if (LimitNumberOfPages && MaxNumberOfPagesToDownload > -1 && _currentPageCount >= MaxNumberOfPagesToDownload)
            {
                return false;
            }

            // If this file is too large
            // TODO: Should this apply to files or also web pages?  Currently applies to web pages too
            // The limit is scaled by 1048576 (bytes per megabyte). The factor is a long so that the
            // product cannot wrap around in 32-bit arithmetic for limits of 2048 or more, which
            // would otherwise turn the limit negative and filter out every resource.
            const long bytesPerMegabyte = 1048576L;
            if (LimitSizeOfFile && MaxFileSizeToDownload > 0 && info.ContentLength > MaxFileSizeToDownload * bytesPerMegabyte)
            {
                return false;
            }

            // If we should only download pages and this isn't a page, filter it out
            if (DownloadFilter == SiteCaptureDownloadFilter.Pages && !MimeHelper.IsContentTypeWebPage(info.ContentType))
            {
                return false;
            }

            // If we should only download pages and documents and this is neither, filter it out
            if (DownloadFilter == SiteCaptureDownloadFilter.PagesAndDocuments && (!MimeHelper.IsContentTypeDocument(info.ContentType) && !MimeHelper.IsContentTypeWebPage(info.ContentType)))
            {
                return false;
            }

            return true;
        }
        /// <summary>
        /// Cheap content type lookup for a URL: uses the browser cache entry when there is one
        /// and otherwise a local guess. No network IO is performed, so the answer may be
        /// wrong, especially for redirected URLs.
        /// </summary>
        /// <param name="url">The url for which to check content type</param>
        /// <returns>
        /// The info found in the browser cache, or a new info wrapping the local guess
        /// (the guess is not checked for null)
        /// </returns>
        public static UrlContentTypeInfo InexpensivelyGetUrlContentType(string url)
        {
            // The right-hand side is only evaluated when the cache has no entry.
            return GetContentTypeFromBrowserCache(url)
                   ?? new UrlContentTypeInfo(GuessContentTypeLocally(url), url);
        }
        /// <summary>
        /// Retrieve the content type by requesting the headers from the server hosting the URL.
        /// For file URLs a local guess is tried first and the request is skipped when it succeeds.
        /// </summary>
        /// <param name="url">The url for which to check content type</param>
        /// <param name="timeOutMs">
        /// The duration in MS that the operation will execute before failing; -1 selects the
        /// GetHeadOnly overload that takes no explicit timeout
        /// </param>
        /// <returns>The content type info, or null when no content type could be obtained</returns>
        private static UrlContentTypeInfo GetContentTypeUsingNetworkIO(string url, int timeOutMs)
        {
            UrlContentTypeInfo contentType = null;

            if (UrlHelper.IsFileUrl(url))
            {
                string content = GuessContentTypeLocally(url);
                if (content != null)
                {
                    contentType = new UrlContentTypeInfo(content, url);
                }
            }

            if (contentType == null)
            {
                WebRequestWithCache webRequest = new WebRequestWithCache(url);
                WebResponse response = (timeOutMs == -1)
                    ? webRequest.GetHeadOnly()
                    : webRequest.GetHeadOnly(timeOutMs);

                if (response != null)
                {
                    try
                    {
                        string contentTypeString = response.ContentType;
                        if (!string.IsNullOrEmpty(contentTypeString))
                        {
                            // Split off the first parameter that follows the media type
                            // ("text/html; charset=utf-8") and trim both halves so that the
                            // values match what the browser-cache lookup produces.
                            string contentEncodingString = null;
                            if (contentTypeString.IndexOf(';') > 0)
                            {
                                string[] contentTypeParts = contentTypeString.Split(';');
                                contentTypeString = contentTypeParts[0];
                                contentEncodingString = contentTypeParts[1].Trim();
                            }
                            contentTypeString = contentTypeString.Trim();

                            // ContentLength is a long; clamp it rather than letting
                            // Convert.ToInt32 throw OverflowException for huge responses.
                            long reportedLength = response.ContentLength;
                            int contentLength = reportedLength > int.MaxValue ? int.MaxValue : (int)reportedLength;

                            contentType = new UrlContentTypeInfo(contentTypeString, contentEncodingString, UrlHelper.SafeToAbsoluteUri(response.ResponseUri), contentLength);
                        }
                    }
                    finally
                    {
                        // Only the headers are needed; release the response so its connection is not leaked.
                        response.Close();
                    }
                }
            }
            return contentType;
        }
        /// <summary>
        /// Looks up the content type in browser cache by reading the headers stored with the
        /// url's cache entry.
        /// </summary>
        /// <param name="url">The url for which to check content type</param>
        /// <returns>
        /// The content type info built from the cached Content-Type / Content-Length headers,
        /// or null when there is no cache entry, no stored header, or no Content-Type header
        /// </returns>
        private static UrlContentTypeInfo GetContentTypeFromBrowserCache(string url)
        {
            // throw out the query string and other bits of the url, if we can
            if (UrlHelper.IsUrl(url))
            {
                Uri uri = new Uri(url);
                // by using the absolute uri, we're more likely to hit the cache
                url = UrlHelper.SafeToAbsoluteUri(uri);
            }

            // Get the header for this URL out of the cache and see if we
            // can get the content type out of the header
            Internet_Cache_Entry_Info info;
            if (!WinInet.GetUrlCacheEntryInfo(url, out info))
            {
                return null;
            }

            // Get the header string for the info struct
            string header = Marshal.PtrToStringAnsi(info.lpHeaderInfo);
            if (header == null)
            {
                return null;
            }

            string contentTypeString = null;
            string contentLengthString = null;
            string contentEncodingString = null;

            // scan through the lines until both headers of interest have been found
            foreach (string line in header.Split('\n'))
            {
                int colonIndex = line.IndexOf(':');
                if (colonIndex < 0)
                {
                    continue;
                }

                // Split on the first colon only, so the value is kept whole.
                string headerName = line.Substring(0, colonIndex).Trim();
                string headerValue = line.Substring(colonIndex + 1).Trim();

                if (string.Equals(headerName, "Content-Type", StringComparison.OrdinalIgnoreCase))
                {
                    // be aware the character encoding can be appended to the end of this line
                    // following a semicolon; the check must look at the header VALUE
                    // (the header name never contains one)
                    if (headerValue.IndexOf(';') > -1)
                    {
                        string[] subParts = headerValue.Split(';');
                        contentTypeString = subParts[0].Trim();
                        contentEncodingString = subParts[1].Trim();
                    }
                    else
                    {
                        contentTypeString = headerValue;
                    }
                }
                else if (string.Equals(headerName, "Content-Length", StringComparison.OrdinalIgnoreCase))
                {
                    contentLengthString = headerValue;
                }

                if (contentTypeString != null && contentLengthString != null)
                {
                    break;
                }
            }

            // A cache entry without a usable Content-Type tells us nothing; report a miss so
            // that callers can fall back to their other lookups.
            if (string.IsNullOrEmpty(contentTypeString))
            {
                return null;
            }

            // Content-Length may be missing, malformed or larger than an int. Use -1 for
            // "unknown" and clamp oversized values instead of throwing.
            // NOTE(review): -1 mirrors WebResponse.ContentLength's "unknown" value; confirm that
            // UrlContentTypeInfo consumers treat it the same way.
            int contentLength = -1;
            long parsedLength;
            if (long.TryParse(contentLengthString, NumberStyles.Integer, CultureInfo.InvariantCulture, out parsedLength) && parsedLength >= 0)
            {
                contentLength = parsedLength > int.MaxValue ? int.MaxValue : (int)parsedLength;
            }

            return new UrlContentTypeInfo(contentTypeString, contentEncodingString, url, contentLength);
        }
// Example #10
// 0
        /// <summary>
        /// Actually downloads the pages: determines the content type of the url, downloads it
        /// as a page when it qualifies (retrying after timeouts), collects its frames and,
        /// depth permitting, its sub pages. Resources that are not pages are added as
        /// references of the parent page instead.
        /// </summary>
        /// <param name="progress">Progress host used for cancellation checks and progress updates</param>
        /// <param name="url">The url to download</param>
        /// <param name="lightWeightDocument">An already available document for the url, or null to download it</param>
        /// <param name="parentPageToDownload">The page that refers to this url; receives the reference when the url is not a page</param>
        /// <returns>The pages collected by this call (including frames and sub pages)</returns>
        /// <exception cref="OperationCancelledException">Thrown when cancellation was requested on entry</exception>
        private PageToDownload[] DownloadPages(IProgressHost progress, string url, LightWeightHTMLDocument lightWeightDocument, PageToDownload parentPageToDownload)
        {
            // Check for cancel
            if (progress.CancelRequested)
            {
                throw new OperationCancelledException();
            }

            // Track the recursion depth for the duration of this call (restored at the end).
            // NOTE(review): the decrement is not in a finally block, so an exception escaping
            // before the end of the method leaves the depth incremented - confirm intended.
            _currentDepth++;
            ArrayList downloadedPages = new ArrayList();

            // Set up our progress
            int thisPageTicks = FIRSTPAGETICKS;

            // At the configured depth this page takes all of the ticks
            if (_context.Depth == _currentDepth)
            {
                thisPageTicks = TOTALTICKS;
            }
            ProgressTick firstPagedownloadProgress = new ProgressTick(progress, thisPageTicks, TOTALTICKS);

            // Header info and downloaded pages are keyed by the url without its anchor
            string safeUrl = UrlHelper.GetUrlWithoutAnchorIdentifier(url);

            // Look up the content type of this pageToDownload
            UrlContentTypeInfo headerInfo = null;

            if (_headerInfo.ContainsKey(safeUrl))
            {
                headerInfo = (UrlContentTypeInfo)_headerInfo[safeUrl];
            }
            else
            {
                if (lightWeightDocument != null)
                {
                    // A supplied document is treated as html without any lookup
                    headerInfo = new UrlContentTypeInfo("text/html", url);
                }
                else if (headerInfo == null && !_context.IsTimedOutUrl(url) && _context.ShouldDownloadThisUrl(url))
                {
                    progress.UpdateProgress(string.Format(CultureInfo.CurrentCulture, Res.Get(StringId.ProgressDeterminingType), url));
                    // NOTE(review): lightWeightDocument is always null in this branch (the non-null
                    // case is handled above), so the else below appears to be unreachable.
                    if (lightWeightDocument == null)
                    {
                        headerInfo = ContentTypeHelper.ExpensivelyGetUrlContentType(url, _context.TimeoutMS);
                    }
                    else
                    {
                        headerInfo = ContentTypeHelper.InexpensivelyGetUrlContentType(url);
                    }
                }
                // Remember the result (possibly null) so the lookup is not repeated for this url
                _headerInfo.Add(safeUrl, headerInfo);
            }

            // If this is a web page and we should download it, do it!
            if ((lightWeightDocument != null && IsDownloadablePageResource(headerInfo)) ||
                (lightWeightDocument == null && IsDownloadablePageResource(headerInfo) && _context.ShouldDownloadThisUrl(headerInfo))
                )
            {
                bool downloadWorked   = false;
                int  downloadAttempts = -1;
                bool timedOut         = true;

                // Make sure we are retrying the correct number of times: the counter starts at -1
                // and is post-incremented in the condition, and the loop only repeats after a timeout
                ProgressTick pageDownloadProgress = new ProgressTick(firstPagedownloadProgress, 80, 100);
                while (!downloadWorked && downloadAttempts++ < _context.RetryCount && timedOut)
                {
                    timedOut = false;

                    pageDownloadProgress.UpdateProgress(0, 1);
                    try
                    {
                        // If we haven't downloaded this page yet download it
                        PageToDownload thisPageToDownload = null;

                        if (!_context.UrlAlreadyDownloaded(safeUrl))
                        {
                            if (lightWeightDocument == null)
                            {
                                thisPageToDownload = DownloadUrl(url, parentPageToDownload, pageDownloadProgress);
                            }
                            else
                            {
                                LightWeightHTMLDocument htmlDoc = lightWeightDocument;

                                // Only redownload if we absolutely need to
                                if (htmlDoc.HasFramesOrStyles && (htmlDoc.Frames == null || htmlDoc.StyleResourcesUrls == null))
                                {
                                    // Write the generated html to a temp file and run it through the
                                    // document downloader, then update the document from the result
                                    string html     = htmlDoc.GenerateHtml();
                                    string tempFile = TempFileManager.Instance.CreateTempFile("temp.htm");
                                    using (StreamWriter writer = new StreamWriter(tempFile, false, Encoding.UTF8))
                                        writer.Write(html);
                                    using (HTMLDocumentDownloader downloader = new HTMLDocumentDownloader(_parentControl, UrlHelper.GetLocalFileUrl(tempFile), htmlDoc.Title, _context.CookieString, _context.TimeoutMS, false))
                                    {
                                        downloader.DownloadHTMLDocument(pageDownloadProgress);

                                        htmlDoc.UpdateBasedUponHTMLDocumentData(downloader.HtmlDocument, url);
                                    }
                                }
                                thisPageToDownload = new PageToDownload(htmlDoc, url, null, parentPageToDownload);
                                // Register the document's style resources as references of the page
                                if (htmlDoc.StyleResourcesUrls != null)
                                {
                                    foreach (HTMLDocumentHelper.ResourceUrlInfo styleUrl in htmlDoc.StyleResourcesUrls)
                                    {
                                        thisPageToDownload.AddReference(new ReferenceToDownload(styleUrl.ResourceUrl, thisPageToDownload, styleUrl.ResourceAbsoluteUrl));
                                    }
                                }
                            }
                            // Add this page to our lists
                            _context.AddPageToDownload(safeUrl, thisPageToDownload, true);
                            downloadedPages.Add(thisPageToDownload);
                        }
                        else
                        {
                            // Already downloaded: reuse the page created earlier for this url
                            thisPageToDownload = (PageToDownload)_context.CreatedPageToDownloadTable[safeUrl];
                        }

                        // If we're downloading a site, add a second copy of the root page in the references subdir
                        // This way, if the root page gets renamed, links back to it will still work correctly
                        // This is a bit of a hack, but otherwise, we'll need to escape urls whenever we output
                        // the site and change the root file name
                        if (thisPageToDownload.IsRootPage && _context.Depth > 0)
                        {
                            PageToDownload copyOfThisPageToDownload = new PageToDownload(thisPageToDownload.LightWeightHTMLDocument.Clone(), thisPageToDownload.UrlToReplace, thisPageToDownload.FileName, thisPageToDownload);
                            downloadedPages.Add(copyOfThisPageToDownload);
                        }

                        // enumerate the frames of this page and add them to the list of pages
                        PageToDownload[] subFramesToDownload = GetFramePagesToDownload(thisPageToDownload);
                        downloadedPages.AddRange(subFramesToDownload);
                        foreach (PageToDownload pageToDownload in subFramesToDownload)
                        {
                            _context.AddPageToDownload(pageToDownload.AbsoluteUrl, pageToDownload, false);
                        }

                        // Now drill down based upon the depth configuration
                        if (_context.ShouldContinue(_currentDepth))
                        {
                            // Sub pages get the ticks that this page did not use
                            ProgressTick otherPagesdownloadProgress = new ProgressTick(progress, TOTALTICKS - thisPageTicks, TOTALTICKS);
                            downloadedPages.AddRange(GetSubPagesToDownload(otherPagesdownloadProgress, downloadedPages, thisPageToDownload));
                        }
                        downloadWorked = true;
                        firstPagedownloadProgress.UpdateProgress(1, 1);
                    }
                    catch (OperationTimedOutException)
                    {
                        // A timeout is the only failure that triggers another attempt
                        timedOut = true;
                    }
                    catch (WebPageDownloaderException htex)
                    {
                        HandleException(new Exception(htex.Message, htex));
                    }
                    catch (Exception ex)
                    {
                        // NOTE(review): the message uses the _url field rather than the url parameter - confirm intended
                        HandleException(new Exception(String.Format(CultureInfo.CurrentCulture, "{0} could not be downloaded", _url), ex));
                    }
                }

                // If we never got the download to succeed, add it to the list of timed out Urls
                if (!downloadWorked && timedOut)
                {
                    // NOTE(review): records the _url field while IsTimedOutUrl above is checked
                    // with the url parameter - confirm which url is meant
                    _context.AddTimedOutUrl(_url);
                    firstPagedownloadProgress.UpdateProgress(1, 1);
                }
            }
            // If it isn't a page we'll just add the file to the reference list for the parent page
            // There is not an else, because we could be looking at a reference, but a reference that
            // should not be downloaded (in which case we just ignore it)
            else if (headerInfo != null && _context.ShouldDownloadThisUrl(headerInfo))
            {
                // NOTE(review): parentPageToDownload is dereferenced without a null check here
                parentPageToDownload.AddReference(new ReferenceToDownload(url, parentPageToDownload));
                progress.UpdateProgress(1, 1);
            }

            // Always report completion for this call (a second time for the reference branch above)
            progress.UpdateProgress(1, 1);

            _currentDepth--;
            return((PageToDownload[])downloadedPages.ToArray(typeof(PageToDownload)));
        }
        /// <summary>
        /// Checks whether a resource qualifies for download as a page.
        /// </summary>
        /// <param name="urlContentTypeInfo">Header info for the resource; may be null</param>
        /// <returns>
        /// False for a null info; otherwise the result of MimeHelper.IsContentTypeWebPage
        /// for the info's content type
        /// </returns>
        private bool IsDownloadablePageResource(UrlContentTypeInfo urlContentTypeInfo)
        {
            bool hasHeaderInfo = urlContentTypeInfo != null;

            // Pages that are web pages, css, or js files are downloaded and treated as web pages;
            // the content type is only inspected when header info is present.
            return hasHeaderInfo && MimeHelper.IsContentTypeWebPage(urlContentTypeInfo.ContentType);
        }
        /// <summary>
        /// Actually downloads the pages: determines the content type of the url, downloads it
        /// as a page when it qualifies (retrying after timeouts), collects its frames and,
        /// depth permitting, its sub pages. Resources that are not pages are added as
        /// references of the parent page instead.
        /// </summary>
        /// <param name="progress">Progress host used for cancellation checks and progress updates</param>
        /// <param name="url">The url to download</param>
        /// <param name="lightWeightDocument">An already available document for the url, or null to download it</param>
        /// <param name="parentPageToDownload">The page that refers to this url; receives the reference when the url is not a page</param>
        /// <returns>The pages collected by this call (including frames and sub pages)</returns>
        /// <exception cref="OperationCancelledException">Thrown when cancellation was requested on entry</exception>
        private PageToDownload[] DownloadPages(IProgressHost progress, string url, LightWeightHTMLDocument lightWeightDocument, PageToDownload parentPageToDownload)
        {
            // Check for cancel
            if (progress.CancelRequested)
                throw new OperationCancelledException();

            // Track the recursion depth for the duration of this call (restored at the end).
            // NOTE(review): the decrement is not in a finally block, so an exception escaping
            // before the end of the method leaves the depth incremented - confirm intended.
            _currentDepth++;
            ArrayList downloadedPages = new ArrayList();

            // Set up our progress; at the configured depth this page takes all of the ticks
            int thisPageTicks = FIRSTPAGETICKS;
            if (_context.Depth == _currentDepth)
                thisPageTicks = TOTALTICKS;
            ProgressTick firstPagedownloadProgress = new ProgressTick(progress, thisPageTicks, TOTALTICKS);

            // Header info and downloaded pages are keyed by the url without its anchor
            string safeUrl = UrlHelper.GetUrlWithoutAnchorIdentifier(url);

            // Look up the content type of this pageToDownload
            UrlContentTypeInfo headerInfo = null;
            if (_headerInfo.ContainsKey(safeUrl))
            {
                headerInfo = (UrlContentTypeInfo)_headerInfo[safeUrl];
            }
            else
            {
                // A supplied document is treated as html without any lookup
                if (lightWeightDocument != null)
                    headerInfo = new UrlContentTypeInfo("text/html", url);
                else if (headerInfo == null && !_context.IsTimedOutUrl(url) && _context.ShouldDownloadThisUrl(url))
                {
                    progress.UpdateProgress(string.Format(CultureInfo.CurrentCulture, Res.Get(StringId.ProgressDeterminingType), url));
                    // NOTE(review): lightWeightDocument is always null in this branch (the non-null
                    // case is handled above), so the else below appears to be unreachable.
                    if (lightWeightDocument == null)
                        headerInfo = ContentTypeHelper.ExpensivelyGetUrlContentType(url, _context.TimeoutMS);
                    else
                        headerInfo = ContentTypeHelper.InexpensivelyGetUrlContentType(url);
                }
                // Remember the result (possibly null) so the lookup is not repeated for this url
                _headerInfo.Add(safeUrl, headerInfo);
            }

            // If this is a web page and we should download it, do it!
            if ((lightWeightDocument != null && IsDownloadablePageResource(headerInfo)) ||
                (lightWeightDocument == null && IsDownloadablePageResource(headerInfo) && _context.ShouldDownloadThisUrl(headerInfo))
                )
            {
                bool downloadWorked = false;
                int downloadAttempts = -1;
                bool timedOut = true;

                // Make sure we are retrying the correct number of times: the counter starts at -1
                // and is post-incremented in the condition, and the loop only repeats after a timeout
                ProgressTick pageDownloadProgress = new ProgressTick(firstPagedownloadProgress, 80, 100);
                while (!downloadWorked && downloadAttempts++ < _context.RetryCount && timedOut)
                {
                    timedOut = false;

                    pageDownloadProgress.UpdateProgress(0, 1);
                    try
                    {
                        // If we haven't downloaded this page yet download it
                        PageToDownload thisPageToDownload = null;

                        if (!_context.UrlAlreadyDownloaded(safeUrl))
                        {
                            if (lightWeightDocument == null)
                                thisPageToDownload = DownloadUrl(url, parentPageToDownload, pageDownloadProgress);
                            else
                            {
                                LightWeightHTMLDocument htmlDoc = lightWeightDocument;

                                // Only redownload if we absolutely need to
                                if (htmlDoc.HasFramesOrStyles && (htmlDoc.Frames == null || htmlDoc.StyleResourcesUrls == null))
                                {

                                    // Write the generated html to a temp file and run it through the
                                    // document downloader, then update the document from the result
                                    string html = htmlDoc.GenerateHtml();
                                    string tempFile = TempFileManager.Instance.CreateTempFile("temp.htm");
                                    using (StreamWriter writer = new StreamWriter(tempFile, false, Encoding.UTF8))
                                        writer.Write(html);
                                    using (HTMLDocumentDownloader downloader = new HTMLDocumentDownloader(_parentControl, UrlHelper.GetLocalFileUrl(tempFile), htmlDoc.Title, _context.CookieString, _context.TimeoutMS, false))
                                    {
                                        downloader.DownloadHTMLDocument(pageDownloadProgress);

                                        htmlDoc.UpdateBasedUponHTMLDocumentData(downloader.HtmlDocument, url);
                                    }
                                }
                                thisPageToDownload = new PageToDownload(htmlDoc, url, null, parentPageToDownload);
                                // Register the document's style resources as references of the page
                                if (htmlDoc.StyleResourcesUrls != null)
                                    foreach (HTMLDocumentHelper.ResourceUrlInfo styleUrl in htmlDoc.StyleResourcesUrls)
                                        thisPageToDownload.AddReference(new ReferenceToDownload(styleUrl.ResourceUrl, thisPageToDownload, styleUrl.ResourceAbsoluteUrl));
                            }
                            // Add this page to our lists
                            _context.AddPageToDownload(safeUrl, thisPageToDownload, true);
                            downloadedPages.Add(thisPageToDownload);

                        }
                        else
                            // Already downloaded: reuse the page created earlier for this url
                            thisPageToDownload = (PageToDownload)_context.CreatedPageToDownloadTable[safeUrl];

                        // If we're downloading a site, add a second copy of the root page in the references subdir
                        // This way, if the root page gets renamed, links back to it will still work correctly
                        // This is a bit of a hack, but otherwise, we'll need to escape urls whenever we output
                        // the site and change the root file name
                        if (thisPageToDownload.IsRootPage && _context.Depth > 0)
                        {
                            PageToDownload copyOfThisPageToDownload = new PageToDownload(thisPageToDownload.LightWeightHTMLDocument.Clone(), thisPageToDownload.UrlToReplace, thisPageToDownload.FileName, thisPageToDownload);
                            downloadedPages.Add(copyOfThisPageToDownload);
                        }

                        // enumerate the frames of this page and add them to the list of pages
                        PageToDownload[] subFramesToDownload = GetFramePagesToDownload(thisPageToDownload);
                        downloadedPages.AddRange(subFramesToDownload);
                        foreach (PageToDownload pageToDownload in subFramesToDownload)
                            _context.AddPageToDownload(pageToDownload.AbsoluteUrl, pageToDownload, false);

                        // Now drill down based upon the depth configuration
                        if (_context.ShouldContinue(_currentDepth))
                        {
                            // Sub pages get the ticks that this page did not use
                            ProgressTick otherPagesdownloadProgress = new ProgressTick(progress, TOTALTICKS - thisPageTicks, TOTALTICKS);
                            downloadedPages.AddRange(GetSubPagesToDownload(otherPagesdownloadProgress, downloadedPages, thisPageToDownload));
                        }
                        downloadWorked = true;
                        firstPagedownloadProgress.UpdateProgress(1, 1);

                    }
                    catch (OperationTimedOutException)
                    {
                        // A timeout is the only failure that triggers another attempt
                        timedOut = true;
                    }
                    catch (WebPageDownloaderException htex)
                    {
                        HandleException(new Exception(htex.Message, htex));
                    }
                    catch (Exception ex)
                    {
                        // NOTE(review): the message uses the _url field rather than the url parameter - confirm intended
                        HandleException(new Exception(String.Format(CultureInfo.CurrentCulture, "{0} could not be downloaded", _url), ex));
                    }
                }

                // If we never got the download to succeed, add it to the list of timed out Urls
                if (!downloadWorked && timedOut)
                {
                    // NOTE(review): records the _url field while IsTimedOutUrl above is checked
                    // with the url parameter - confirm which url is meant
                    _context.AddTimedOutUrl(_url);
                    firstPagedownloadProgress.UpdateProgress(1, 1);

                }
            }
            // If it isn't a page we'll just add the file to the reference list for the parent page
            // There is not an else, because we could be looking at a reference, but a reference that
            // should not be downloaded (in which case we just ignore it)
            else if (headerInfo != null && _context.ShouldDownloadThisUrl(headerInfo))
            {
                // NOTE(review): parentPageToDownload is dereferenced without a null check here
                parentPageToDownload.AddReference(new ReferenceToDownload(url, parentPageToDownload));
                progress.UpdateProgress(1, 1);
            }

            // Always report completion for this call (a second time for the reference branch above)
            progress.UpdateProgress(1, 1);

            _currentDepth--;
            return (PageToDownload[])downloadedPages.ToArray(typeof(PageToDownload));
        }
        /// <summary>
        /// Looks up the content type of a URL by parsing the response headers stored in
        /// the browser (WinInet) cache entry for that URL.
        /// </summary>
        /// <param name="url">The url for which to check content type</param>
        /// <returns>
        /// The content type information parsed from the cached headers, or null when the
        /// URL has no cache entry, the entry has no header text, or the headers do not
        /// contain a usable Content-Type line (so callers can fall back to other strategies).
        /// When no parseable Content-Length header is present the content length is
        /// reported as -1.
        /// </returns>
        private static UrlContentTypeInfo GetContentTypeFromBrowserCache(string url)
        {
            // normalize the url if we can; by using the absolute uri, we're more likely to hit the cache
            if (UrlHelper.IsUrl(url))
            {
                url = UrlHelper.SafeToAbsoluteUri(new Uri(url));
            }

            // Get the header for this URL out of the cache and see if we
            // can get the content type out of the header
            Internet_Cache_Entry_Info info;
            if (!WinInet.GetUrlCacheEntryInfo(url, out info))
            {
                return null;
            }

            // PtrToStringAnsi returns null for a null pointer, i.e. no header text was cached
            string header = Marshal.PtrToStringAnsi(info.lpHeaderInfo);
            if (header == null)
            {
                return null;
            }

            string contentTypeString     = null;
            string contentLengthString   = null;
            string contentEncodingString = null;

            // scan through the lines until we have found both headers we care about
            foreach (string line in header.Split('\n'))
            {
                // Split on the FIRST colon only, so a value that itself contains a colon is kept intact
                int colonIndex = line.IndexOf(':');
                if (colonIndex < 0)
                {
                    continue;
                }

                string name  = line.Substring(0, colonIndex).Trim();
                string value = line.Substring(colonIndex + 1).Trim();

                if (string.Equals(name, "Content-Type", StringComparison.OrdinalIgnoreCase))
                {
                    // be aware the character encoding can be appended to the end of the VALUE
                    // following a semicolon (e.g. "text/html; charset=utf-8")
                    if (value.IndexOf(';') > -1)
                    {
                        string[] subParts = value.Split(';');
                        contentTypeString     = subParts[0].Trim();
                        contentEncodingString = subParts[1].Trim();
                    }
                    else
                    {
                        contentTypeString = value;
                    }
                }
                else if (string.Equals(name, "Content-Length", StringComparison.OrdinalIgnoreCase))
                {
                    contentLengthString = value;
                }

                if (contentTypeString != null && contentLengthString != null)
                {
                    break;
                }
            }

            // Without a content type the cache told us nothing useful; report a miss
            if (string.IsNullOrEmpty(contentTypeString))
            {
                return null;
            }

            // Content-Length is optional; a missing, malformed or out-of-range value must not throw
            int contentLength;
            if (!int.TryParse(contentLengthString, NumberStyles.Integer, CultureInfo.InvariantCulture, out contentLength))
            {
                contentLength = -1;
            }

            return new UrlContentTypeInfo(contentTypeString, contentEncodingString, url, contentLength);
        }
        /// <summary>
        /// Decides whether the resource described by the given content type info should
        /// be downloaded, applying the url check, the page-count limit, the file-size
        /// limit and the download filter in that order.
        /// </summary>
        /// <param name="info">Content type information for the candidate url</param>
        /// <returns>true if every check passes, otherwise false</returns>
        public bool ShouldDownloadThisUrl(UrlContentTypeInfo info)
        {
            // The url-only overload gets the first say
            string finalUrl = info.FinalUrl;
            if (!ShouldDownloadThisUrl(finalUrl))
            {
                return false;
            }

            // Stop once the maximum number of pages we're allowed to download has been reached
            if (LimitNumberOfPages && MaxNumberOfPagesToDownload > -1 && _currentPageCount >= MaxNumberOfPagesToDownload)
            {
                return false;
            }

            // Reject content that is too large (limit is scaled by 1048576, presumably megabytes to bytes)
            // TODO: Should this apply to files or also web pages?  Currently applies to web pages too
            if (LimitSizeOfFile && MaxFileSizeToDownload > 0 && info.ContentLength > MaxFileSizeToDownload * 1048576)
            {
                return false;
            }

            // Pages only: anything that isn't a web page is filtered out
            if (DownloadFilter == SiteCaptureDownloadFilter.Pages)
            {
                if (!MimeHelper.IsContentTypeWebPage(info.ContentType))
                {
                    return false;
                }
            }

            // Pages and documents: the content must be one or the other
            if (DownloadFilter == SiteCaptureDownloadFilter.PagesAndDocuments)
            {
                bool isDocumentOrPage = MimeHelper.IsContentTypeDocument(info.ContentType) || MimeHelper.IsContentTypeWebPage(info.ContentType);
                if (!isDocumentOrPage)
                {
                    return false;
                }
            }

            return true;
        }