Exemplo n.º 1
0
        /// <summary>
        /// Gets pagesToDownloadFactories for a set of pagesToDownload
        /// </summary>
        /// <param name="pagesToDownload">The pagesToDownload to get factories for</param>
        /// <param name="context">The context controlling the factories</param>
        /// <returns>An array of PageToDownloadFactories</returns>
        private string[] GetSubPagesToDownload(PageToDownload[] pagesToDownload, PageToDownload parentPage)
        {
            ArrayList subItemsToDownload = new ArrayList();

            foreach (PageToDownload pageToDownload in pagesToDownload)
            {
                foreach (UrlInfo urlInfo in pageToDownload.LightWeightHTMLDocument.Anchors)
                {
                    if (urlInfo != null)
                    {
                        if (ShouldAddUrl(urlInfo.Url, pagesToDownload, parentPage))
                        {
                            subItemsToDownload.Add(urlInfo.Url);
                        }
                    }
                }

                LightWeightTag[] tags = pageToDownload.LightWeightHTMLDocument.GetTagsByName("AREA");
                foreach (LightWeightTag tag in tags)
                {
                    string url = tag.BeginTag.GetAttributeValue("href");
                    if (url != null)
                    {
                        if (ShouldAddUrl(url, pagesToDownload, parentPage))
                        {
                            subItemsToDownload.Add(url);
                        }
                    }
                }
            }

            return((string[])subItemsToDownload.ToArray(typeof(string)));
        }
 public string GetRelativeUrlForReferencedPage(PageToDownload pageBeingReferenced)
 {
     if (pageBeingReferenced.IsRootPage)
     {
         if (IsRootPage)
         {
             return(pageBeingReferenced.FileName);
         }
         else
         {
             return("../" + pageBeingReferenced.FileName);
         }
     }
     else
     {
         if (IsRootPage)
         {
             return(DirectoryToken + "/" + pageBeingReferenced.FileName);
         }
         else
         {
             return(pageBeingReferenced.FileName);
         }
     }
 }
Exemplo n.º 3
0
 public PageToDownload(LightWeightHTMLDocument htmlDocument, string url, string rootFileName, PageToDownload parentInfo)
 {
     ParentInfo = parentInfo;
     _lightweightHTMLDocument = htmlDocument;
     _urlToReplace = url;
     _anchor = UrlHelper.GetAnchorIdentifier(url);
     m_fileName = rootFileName;
 }
 public PageToDownload(LightWeightHTMLDocument htmlDocument, string url, string rootFileName, PageToDownload parentInfo)
 {
     ParentInfo = parentInfo;
     _lightweightHTMLDocument = htmlDocument;
     _urlToReplace            = url;
     _anchor    = UrlHelper.GetAnchorIdentifier(url);
     m_fileName = rootFileName;
 }
        public void AddPageToDownload(string url, PageToDownload pageToDownload, bool countAsPageDownload)
        {
            if (countAsPageDownload)
            {
                _currentPageCount++;
            }

            if (!CreatedPageToDownloadTable.ContainsKey(url))
            {
                CreatedPageToDownloadTable.Add(url, pageToDownload);
            }
        }
Exemplo n.º 6
0
        /// <summary>
        /// Returns an array of page download infos representing the information
        /// required to download a given IHTMLDocument2
        /// </summary>
        /// <param name="rootDocument">The IHTMLDocument for which to fetch the infos</param>
        /// <param name="parentInfo">The Parent PageDownloadInfo for the subitem</param>
        /// <returns>Array of PageDownloadInfo</returns>
        private static PageToDownload[] GetFramePagesToDownload(PageToDownload parentPageToDownload)
        {
            ArrayList subFrames = new ArrayList();

            if (parentPageToDownload.LightWeightHTMLDocument.Frames != null)
            {
                foreach (LightWeightHTMLDocument frameDocument in parentPageToDownload.LightWeightHTMLDocument.Frames)
                {
                    PageToDownload subFramePageToDownload = new PageToDownload(frameDocument, frameDocument.Url, null, parentPageToDownload);
                    subFrames.Add(subFramePageToDownload);
                    subFrames.AddRange(GetFramePagesToDownload(subFramePageToDownload));
                }
            }
            return((PageToDownload[])subFrames.ToArray(typeof(PageToDownload)));
        }
 public string GetRelativeUrlForReference(PageToDownload referencingPage)
 {
     if (!referencingPage.IsRootPage && ParentInfo.IsRootPage)
     {
         return(FileName);
     }
     else if (referencingPage.IsRootPage && !ParentInfo.IsRootPage)
     {
         return(ParentInfo.DirectoryToken + "/references/" + FileName);
     }
     else
     {
         return
             (RelativeUrl);
     }
 }
Exemplo n.º 8
0
        public string Capture(int timeoutMs)
        {
            // flag indicating whether we should continue with the capture
            bool continueCapture = true;

            // request the page
            HttpWebResponse response = RequestPage(TargetUrl, timeoutMs);

            OnHeadersReceived(response.Headers, ref continueCapture);
            if (!continueCapture)
            {
                throw new OperationCancelledException();
            }

            // transfer it to a stream
            MemoryStream pageStream = new MemoryStream();

            using (Stream responseStream = response.GetResponseStream())
                StreamHelper.Transfer(responseStream, pageStream);
            pageStream.Seek(0, SeekOrigin.Begin);

            // allow filter on content
            OnContentReceived(new StreamReader(pageStream).ReadToEnd(), ref continueCapture);
            if (!continueCapture)
            {
                throw new OperationCancelledException();
            }
            pageStream.Seek(0, SeekOrigin.Begin);

            // Read the stream into a lightweight HTML doc. We use from LightWeightHTMLDocument.FromIHTMLDocument2
            // instead of LightWeightHTMLDocument.FromStream because from stream improperly shoves a saveFrom declaration
            // above the docType (bug 289357)
            IHTMLDocument2          doc  = HTMLDocumentHelper.StreamToHTMLDoc(pageStream, TargetUrl, false);
            LightWeightHTMLDocument ldoc = LightWeightHTMLDocument.FromIHTMLDocument2(doc, TargetUrl, true);

            // download references
            FileBasedSiteStorage       siteStorage = new FileBasedSiteStorage(DestinationPath, "index.htm");
            PageToDownload             page        = new PageToDownload(ldoc, TargetUrl, siteStorage.RootFile);
            PageAndReferenceDownloader downloader  = new PageAndReferenceDownloader(new PageToDownload[] { page }, siteStorage);

            downloader.Download(new TimeoutProgressHost(timeoutMs));

            // return path to captured page
            return(Path.Combine(DestinationPath, siteStorage.RootFile));
        }
Exemplo n.º 9
0
        /// <summary>
        /// Used to actually commit the HTML to disk
        /// </summary>
        /// <param name="pageInfo">The PageToDownload to write</param>
        /// <param name="downloadedReferences">A hashtable of download references</param>
        /// <param name="storage">The storage to write the file into</param>
        private void WriteHtmlToDisk(PageToDownload pageInfo, FileBasedSiteStorage storage)
        {
            // Set the character set for this document
            pageInfo.LightWeightHTMLDocument.MetaData.Charset = Encoding.UTF8.WebName;
            string html = string.Empty;

            // Replace references to any URL that we downloaded!
            foreach (PageToDownload pageToDownload in _pagesToDownload)
            {
                if (!pageToDownload.IsRootPage)
                {
                    pageInfo.LightWeightHTMLDocument.AddUrlToEscape(new UrlToReplace(pageToDownload.UrlToReplace, pageInfo.GetRelativeUrlForReferencedPage(pageToDownload)));
                }
            }

            foreach (ReferenceToDownload referenceToDownload in _referencesToDownload.Values)
            {
                ReferenceToDownload downloadedReference = ((ReferenceToDownload)_referencesToDownload[referenceToDownload.AbsoluteUrl]);

                // Since we consolidated references, replace the UrToReplace from the original reference
                // with the relativePath to the reference that actually got downloaded
                string path = downloadedReference.GetRelativeUrlForReference(pageInfo);
                pageInfo.LightWeightHTMLDocument.AddUrlToEscape(new UrlToReplace(referenceToDownload.AbsoluteUrl, path));
            }


            html = pageInfo.LightWeightHTMLDocument.GenerateHtml();

            // finally, write the html out to disk
            string destination = Path.Combine(_siteStorage.BasePath, pageInfo.RelativePath);
            Stream htmlStream  = _siteStorage.Open(destination, AccessMode.Write);

            using (StreamWriter writer = new StreamWriter(htmlStream, Encoding.UTF8))
            {
                writer.Write(html);
            }

            // if this is the entry page, write the path token and root file name
            if (pageInfo.IsRootPage)
            {
                this._pathToken       = pageInfo.ReferencedFileRelativePath;
                _siteStorage.RootFile = pageInfo.FileName;
            }
        }
Exemplo n.º 10
0
        public string Capture(int timeoutMs)
        {
            // flag indicating whether we should continue with the capture
            bool continueCapture = true ;

            // request the page
            HttpWebResponse response = RequestPage(TargetUrl, timeoutMs);
            OnHeadersReceived(response.Headers, ref continueCapture) ;
            if ( !continueCapture )
                throw new OperationCancelledException() ;

            // transfer it to a stream
            MemoryStream pageStream = new MemoryStream();
            using ( Stream responseStream = response.GetResponseStream() )
                StreamHelper.Transfer(responseStream, pageStream);
            pageStream.Seek(0, SeekOrigin.Begin) ;

            // allow filter on content
            OnContentReceived( new StreamReader(pageStream).ReadToEnd(), ref continueCapture ) ;
            if ( !continueCapture )
                throw new OperationCancelledException() ;
            pageStream.Seek(0, SeekOrigin.Begin) ;

            // Read the stream into a lightweight HTML doc. We use from LightWeightHTMLDocument.FromIHTMLDocument2
            // instead of LightWeightHTMLDocument.FromStream because from stream improperly shoves a saveFrom declaration
            // above the docType (bug 289357)
            IHTMLDocument2 doc = HTMLDocumentHelper.StreamToHTMLDoc(pageStream, TargetUrl, false);
            LightWeightHTMLDocument ldoc = LightWeightHTMLDocument.FromIHTMLDocument2(doc, TargetUrl, true);

            // download references
            FileBasedSiteStorage siteStorage = new FileBasedSiteStorage(DestinationPath, "index.htm");
            PageToDownload page = new PageToDownload(ldoc, TargetUrl, siteStorage.RootFile);
            PageAndReferenceDownloader downloader = new PageAndReferenceDownloader(new PageToDownload[]{page}, siteStorage) ;
            downloader.Download(new TimeoutProgressHost(timeoutMs)) ;

            // return path to captured page
            return Path.Combine(DestinationPath, siteStorage.RootFile) ;
        }
Exemplo n.º 11
0
        private PageToDownload DownloadUrl(string url, PageToDownload parent, IProgressHost progress)
        {
            PageToDownload thisPageToDownload = null;

            // Download the current page
            LightWeightHTMLDocument lightWeightDoc = null;

            using (HTMLDocumentDownloader downloader = new HTMLDocumentDownloader(_parentControl, url, null, _context.CookieString, _context.TimeoutMS, true))
            {
                downloader.DownloadHTMLDocument(progress);
                lightWeightDoc     = LightWeightHTMLDocument.FromIHTMLDocument2(downloader.HtmlDocument, downloader.Url);
                thisPageToDownload = new PageToDownload(lightWeightDoc, url, null, parent);
                // Reset the url in the event that a redirect occurred
                thisPageToDownload.AbsoluteUrl = downloader.Url;
            }

            foreach (HTMLDocumentHelper.ResourceUrlInfo styleUrl in lightWeightDoc.StyleResourcesUrls)
            {
                thisPageToDownload.AddReference(new ReferenceToDownload(styleUrl.ResourceUrl, thisPageToDownload, styleUrl.ResourceAbsoluteUrl));
            }

            return(thisPageToDownload);
        }
Exemplo n.º 12
0
        private bool ShouldAddUrl(string url, PageToDownload[] pagesToDownload, PageToDownload parentPage)
        {
            // Filter it if its already in the list
            for (int i = 0; i < pagesToDownload.Length; i++)
            {
                if (pagesToDownload[i] != null && UrlHelper.UrlsAreEqual(url, pagesToDownload[i].AbsoluteUrl))
                {
                    return(false);
                }
            }

            // Filter it if it is one of this pages parents
            PageToDownload currentParent = parentPage;

            while (currentParent != null)
            {
                if (UrlHelper.UrlsAreEqual(url, currentParent.AbsoluteUrl))
                {
                    return(false);
                }
                currentParent = currentParent.ParentInfo;
            }
            return(true);
        }
        public void AddPageToDownload(string url, PageToDownload pageToDownload, bool countAsPageDownload)
        {
            if (countAsPageDownload)
                _currentPageCount++;

            if (!CreatedPageToDownloadTable.ContainsKey(url))
                CreatedPageToDownloadTable.Add(url, pageToDownload);
        }
Exemplo n.º 14
0
 public string GetRelativeUrlForReferencedPage(PageToDownload pageBeingReferenced)
 {
     if (pageBeingReferenced.IsRootPage)
     {
         if (IsRootPage)
             return pageBeingReferenced.FileName;
         else
             return "../" + pageBeingReferenced.FileName;
     }
     else
     {
         if (IsRootPage)
             return DirectoryToken + "/" + pageBeingReferenced.FileName;
         else
             return pageBeingReferenced.FileName;
     }
 }
        /// <summary>
        /// Gets pagesToDownloadFactories for a set of pagesToDownload
        /// </summary>
        /// <param name="pagesToDownload">The pagesToDownlaod to get factories for</param>
        /// <param name="context">The context controlling the factories</param>
        /// <returns>An array of PageToDownloadFactories</returns>
        private string[] GetSubPagesToDownload(PageToDownload[] pagesToDownload, PageToDownload parentPage)
        {
            ArrayList subItemsToDownload = new ArrayList();
            foreach (PageToDownload pageToDownload in pagesToDownload)
            {

                foreach (UrlInfo urlInfo in pageToDownload.LightWeightHTMLDocument.Anchors)
                {
                    if (urlInfo != null)
                    {
                        if (ShouldAddUrl(urlInfo.Url, pagesToDownload, parentPage))
                            subItemsToDownload.Add(urlInfo.Url);
                    }
                }

                LightWeightTag[] tags = pageToDownload.LightWeightHTMLDocument.GetTagsByName("AREA");
                foreach (LightWeightTag tag in tags)
                {
                    string url = tag.BeginTag.GetAttributeValue("href");
                    if (url != null)
                    {
                        if (ShouldAddUrl(url, pagesToDownload, parentPage))
                            subItemsToDownload.Add(url);
                    }
                }
            }

            return (string[])subItemsToDownload.ToArray(typeof(string));
        }
 /// <summary>
 /// Constructs a new asynchrous page download.
 /// </summary>
 /// <param name="pagesToDownload">The array of pagesToDownload</param>
 /// <param name="siteStorage">The File based site storage into which to place the page and references</param>
 /// <param name="target">The ISynchronizeInvoke that will be used to call back</param>
 /// <param name="throwOnFailure">Indicates whether downloader should throw on failure, or just
 /// log the failure and continue</param>
 public PageAndReferenceDownloader(PageToDownload[] pagesToDownload, FileBasedSiteStorage siteStorage, bool throwOnFailure)
 {
     _siteStorage = siteStorage;
     _pagesToDownload = pagesToDownload;
     _throwOnFailure = throwOnFailure;
 }
        private bool ShouldAddUrl(string url, PageToDownload[] pagesToDownload, PageToDownload parentPage)
        {
            // Filter it if its already in the list
            for (int i = 0; i < pagesToDownload.Length; i++)
                if (pagesToDownload[i] != null && UrlHelper.UrlsAreEqual(url, pagesToDownload[i].AbsoluteUrl))
                    return false;

            // Filter it if it is one of this pages parents
            PageToDownload currentParent = parentPage;
            while (currentParent != null)
            {
                if (UrlHelper.UrlsAreEqual(url, currentParent.AbsoluteUrl))
                    return false;
                currentParent = currentParent.ParentInfo;
            }
            return true;

        }
        /// <summary>
        /// Used to actually commit the HTML to disk
        /// </summary>
        /// <param name="pageInfo">The PageToDownload to write</param>
        /// <param name="downloadedReferences">A hashtable of download references</param>
        /// <param name="storage">The storage to write the file into</param>
        private void WriteHtmlToDisk(PageToDownload pageInfo, FileBasedSiteStorage storage)
        {
            // Set the character set for this document
            pageInfo.LightWeightHTMLDocument.MetaData.Charset = Encoding.UTF8.WebName;
            string html = string.Empty;

            // Replace references to any URL that we downloaded!
            foreach (PageToDownload pageToDownload in _pagesToDownload)
                if (!pageToDownload.IsRootPage)
                    pageInfo.LightWeightHTMLDocument.AddUrlToEscape(new UrlToReplace(pageToDownload.UrlToReplace, pageInfo.GetRelativeUrlForReferencedPage(pageToDownload)));

            foreach (ReferenceToDownload referenceToDownload in _referencesToDownload.Values)
            {
                ReferenceToDownload downloadedReference = ((ReferenceToDownload)_referencesToDownload[referenceToDownload.AbsoluteUrl]);

                // Since we consolidated references, replace the UrToReplace from the original reference
                // with the relativePath to the reference that actually got downloaded
                string path = downloadedReference.GetRelativeUrlForReference(pageInfo);
                pageInfo.LightWeightHTMLDocument.AddUrlToEscape(new UrlToReplace(referenceToDownload.AbsoluteUrl, path));
            }

            html = pageInfo.LightWeightHTMLDocument.GenerateHtml();

            // finally, write the html out to disk
            string destination = Path.Combine(_siteStorage.BasePath, pageInfo.RelativePath);
            Stream htmlStream = _siteStorage.Open(destination, AccessMode.Write);
            using (StreamWriter writer = new StreamWriter(htmlStream, Encoding.UTF8))
            {
                writer.Write(html);
            }

            // if this is the entry page, write the path token and root file name
            if (pageInfo.IsRootPage)
            {
                this._pathToken = pageInfo.ReferencedFileRelativePath;
                _siteStorage.RootFile = pageInfo.FileName;
            }
        }
 /// <summary>
 /// Constructs a new asynchrous page download.
 /// </summary>
 /// <param name="pagesToDownload">The array of pagesToDownload</param>
 /// <param name="siteStorage">The File based site storage into which to place the page and references</param>
 /// <param name="target">The ISynchronizeInvoke that will be used to call back</param>
 public PageAndReferenceDownloader(PageToDownload[] pagesToDownload, FileBasedSiteStorage siteStorage)
     : this(pagesToDownload, siteStorage, true)
 {
 }
        private PageToDownload[] GetSubPagesToDownload(IProgressHost progress, ArrayList downloadedPagesToScan, PageToDownload parentPage)
        {
            ArrayList subPages = new ArrayList();
            // enumerate the other downloads to do (if we're scanning)
            string[] subUrlsToDownload;
            if (_context.SelectedUrlsToDownload.Count < 1)
                subUrlsToDownload = GetSubPagesToDownload((PageToDownload[])downloadedPagesToScan.ToArray(typeof(PageToDownload)), parentPage);
            else
                subUrlsToDownload = (string[])_context.SelectedUrlsToDownload.ToArray(typeof(string));

            // do the other downloads, passing the context controlling depth
            foreach (string subUrl in subUrlsToDownload)
            {
                if (_context.ShouldContinue(_currentDepth))
                {
                    ProgressTick tick = new ProgressTick(progress, 1, subUrlsToDownload.Length);
                    subPages.AddRange(DownloadPages(tick, subUrl, null, parentPage));
                }
            }
            return (PageToDownload[])subPages.ToArray(typeof(PageToDownload));
        }
 /// <summary>
 /// Returns an array of page download infos representing the information
 /// required to download a given IHTMLDocument2
 /// </summary>
 /// <param name="rootDocument">The IHTMLDocument for which to fetch the infos</param>
 /// <param name="parentInfo">The Parent PageDownloadInfo for the subitem</param>
 /// <returns>Array of PageDownloadInfo</returns>
 private static PageToDownload[] GetFramePagesToDownload(PageToDownload parentPageToDownload)
 {
     ArrayList subFrames = new ArrayList();
     if (parentPageToDownload.LightWeightHTMLDocument.Frames != null)
     {
         foreach (LightWeightHTMLDocument frameDocument in parentPageToDownload.LightWeightHTMLDocument.Frames)
         {
             PageToDownload subFramePageToDownload = new PageToDownload(frameDocument, frameDocument.Url, null, parentPageToDownload);
             subFrames.Add(subFramePageToDownload);
             subFrames.AddRange(GetFramePagesToDownload(subFramePageToDownload));
         }
     }
     return (PageToDownload[])subFrames.ToArray(typeof(PageToDownload));
 }
 public string GetRelativeUrlForReference(PageToDownload referencingPage)
 {
     if (!referencingPage.IsRootPage && ParentInfo.IsRootPage)
         return FileName;
     else if (referencingPage.IsRootPage && !ParentInfo.IsRootPage)
         return ParentInfo.DirectoryToken + "/references/" + FileName;
     else
         return
             RelativeUrl;
 }
 /// <summary>
 /// Constructs a new reference download info
 /// </summary>
 /// <param name="url">The Url of the reference</param>
 /// <param name="parentInfo">The parent info that references this url</param>
 public ReferenceToDownload(string url, PageToDownload parentPageToDownload)
 {
     m_urlToReplace = url;
     ParentInfo     = parentPageToDownload;
 }
 /// <summary>
 /// Constructs a new reference download info
 /// </summary>
 /// <param name="url">The Url of the reference</param>
 /// <param name="parentInfo">The parent info that references this url</param>
 public ReferenceToDownload(string url, PageToDownload parentPageToDownload)
 {
     m_urlToReplace = url;
     ParentInfo = parentPageToDownload;
 }
Exemplo n.º 25
0
        /// <summary>
        /// Actually downloads the pages
        /// </summary>
        private PageToDownload[] DownloadPages(IProgressHost progress, string url, LightWeightHTMLDocument lightWeightDocument, PageToDownload parentPageToDownload)
        {
            // Check for cancel
            if (progress.CancelRequested)
            {
                throw new OperationCancelledException();
            }

            _currentDepth++;
            ArrayList downloadedPages = new ArrayList();

            // Set up our progress
            int thisPageTicks = FIRSTPAGETICKS;

            if (_context.Depth == _currentDepth)
            {
                thisPageTicks = TOTALTICKS;
            }
            ProgressTick firstPagedownloadProgress = new ProgressTick(progress, thisPageTicks, TOTALTICKS);

            string safeUrl = UrlHelper.GetUrlWithoutAnchorIdentifier(url);

            // Look up the content type of this pageToDownload
            UrlContentTypeInfo headerInfo = null;

            if (_headerInfo.ContainsKey(safeUrl))
            {
                headerInfo = (UrlContentTypeInfo)_headerInfo[safeUrl];
            }
            else
            {
                if (lightWeightDocument != null)
                {
                    headerInfo = new UrlContentTypeInfo("text/html", url);
                }
                else if (headerInfo == null && !_context.IsTimedOutUrl(url) && _context.ShouldDownloadThisUrl(url))
                {
                    progress.UpdateProgress(string.Format(CultureInfo.CurrentCulture, Res.Get(StringId.ProgressDeterminingType), url));
                    if (lightWeightDocument == null)
                    {
                        headerInfo = ContentTypeHelper.ExpensivelyGetUrlContentType(url, _context.TimeoutMS);
                    }
                    else
                    {
                        headerInfo = ContentTypeHelper.InexpensivelyGetUrlContentType(url);
                    }
                }
                _headerInfo.Add(safeUrl, headerInfo);
            }

            // If this is a web page and we should download it, do it!
            if ((lightWeightDocument != null && IsDownloadablePageResource(headerInfo)) ||
                (lightWeightDocument == null && IsDownloadablePageResource(headerInfo) && _context.ShouldDownloadThisUrl(headerInfo))
                )
            {
                bool downloadWorked   = false;
                int  downloadAttempts = -1;
                bool timedOut         = true;

                // Max sure we are retrying the correct number of times
                ProgressTick pageDownloadProgress = new ProgressTick(firstPagedownloadProgress, 80, 100);
                while (!downloadWorked && downloadAttempts++ < _context.RetryCount && timedOut)
                {
                    timedOut = false;

                    pageDownloadProgress.UpdateProgress(0, 1);
                    try
                    {
                        // If we haven't downloaded this page yet download it
                        PageToDownload thisPageToDownload = null;

                        if (!_context.UrlAlreadyDownloaded(safeUrl))
                        {
                            if (lightWeightDocument == null)
                            {
                                thisPageToDownload = DownloadUrl(url, parentPageToDownload, pageDownloadProgress);
                            }
                            else
                            {
                                LightWeightHTMLDocument htmlDoc = lightWeightDocument;

                                // Only redownload if we absolutely need to
                                if (htmlDoc.HasFramesOrStyles && (htmlDoc.Frames == null || htmlDoc.StyleResourcesUrls == null))
                                {
                                    string html     = htmlDoc.GenerateHtml();
                                    string tempFile = TempFileManager.Instance.CreateTempFile("temp.htm");
                                    using (StreamWriter writer = new StreamWriter(tempFile, false, Encoding.UTF8))
                                        writer.Write(html);
                                    using (HTMLDocumentDownloader downloader = new HTMLDocumentDownloader(_parentControl, UrlHelper.GetLocalFileUrl(tempFile), htmlDoc.Title, _context.CookieString, _context.TimeoutMS, false))
                                    {
                                        downloader.DownloadHTMLDocument(pageDownloadProgress);

                                        htmlDoc.UpdateBasedUponHTMLDocumentData(downloader.HtmlDocument, url);
                                    }
                                }
                                thisPageToDownload = new PageToDownload(htmlDoc, url, null, parentPageToDownload);
                                if (htmlDoc.StyleResourcesUrls != null)
                                {
                                    foreach (HTMLDocumentHelper.ResourceUrlInfo styleUrl in htmlDoc.StyleResourcesUrls)
                                    {
                                        thisPageToDownload.AddReference(new ReferenceToDownload(styleUrl.ResourceUrl, thisPageToDownload, styleUrl.ResourceAbsoluteUrl));
                                    }
                                }
                            }
                            // Add this page to our lists
                            _context.AddPageToDownload(safeUrl, thisPageToDownload, true);
                            downloadedPages.Add(thisPageToDownload);
                        }
                        else
                        {
                            thisPageToDownload = (PageToDownload)_context.CreatedPageToDownloadTable[safeUrl];
                        }

                        // If we're downloading a site, add a second copy of the root page in the references subdir
                        // This was, if the root page gets renamed, links back to it will still work correctly
                        // This is a bit of a hack, but otherwise, we'll need to escape urls whenever we output
                        // the site and change the root file name
                        if (thisPageToDownload.IsRootPage && _context.Depth > 0)
                        {
                            PageToDownload copyOfThisPageToDownload = new PageToDownload(thisPageToDownload.LightWeightHTMLDocument.Clone(), thisPageToDownload.UrlToReplace, thisPageToDownload.FileName, thisPageToDownload);
                            downloadedPages.Add(copyOfThisPageToDownload);
                        }

                        // enumerate the frames of this page and add them to the list of pages
                        PageToDownload[] subFramesToDownload = GetFramePagesToDownload(thisPageToDownload);
                        downloadedPages.AddRange(subFramesToDownload);
                        foreach (PageToDownload pageToDownload in subFramesToDownload)
                        {
                            _context.AddPageToDownload(pageToDownload.AbsoluteUrl, pageToDownload, false);
                        }

                        // Now drill down based upon the depth configuration
                        if (_context.ShouldContinue(_currentDepth))
                        {
                            ProgressTick otherPagesdownloadProgress = new ProgressTick(progress, TOTALTICKS - thisPageTicks, TOTALTICKS);
                            downloadedPages.AddRange(GetSubPagesToDownload(otherPagesdownloadProgress, downloadedPages, thisPageToDownload));
                        }
                        downloadWorked = true;
                        firstPagedownloadProgress.UpdateProgress(1, 1);
                    }
                    catch (OperationTimedOutException)
                    {
                        timedOut = true;
                    }
                    catch (WebPageDownloaderException htex)
                    {
                        HandleException(new Exception(htex.Message, htex));
                    }
                    catch (Exception ex)
                    {
                        HandleException(new Exception(String.Format(CultureInfo.CurrentCulture, "{0} could not be downloaded", _url), ex));
                    }
                }

                // If we never got the download to succeed, add it to the list of timed out Urls
                if (!downloadWorked && timedOut)
                {
                    _context.AddTimedOutUrl(_url);
                    firstPagedownloadProgress.UpdateProgress(1, 1);
                }
            }
            // If it isn't a page we'll just add the file to the reference list for the parent page
            // There is not an else, because we could be looking at a reference, but a reference that
            // should not be downloaded (in which case we just ignore it)
            else if (headerInfo != null && _context.ShouldDownloadThisUrl(headerInfo))
            {
                parentPageToDownload.AddReference(new ReferenceToDownload(url, parentPageToDownload));
                progress.UpdateProgress(1, 1);
            }

            progress.UpdateProgress(1, 1);

            _currentDepth--;
            return((PageToDownload[])downloadedPages.ToArray(typeof(PageToDownload)));
        }
 public ReferenceToDownload(string url, PageToDownload parentPageToDownload, string absoluteUrl) : this(url, parentPageToDownload)
 {
     m_absoluteUrl = absoluteUrl;
 }
 public ReferenceToDownload(string url, PageToDownload parentPageToDownload, string absoluteUrl) : this(url, parentPageToDownload)
 {
     m_absoluteUrl = absoluteUrl;
 }
        /// <summary>
        /// Actually downloads the pages
        /// </summary>
        private PageToDownload[] DownloadPages(IProgressHost progress, string url, LightWeightHTMLDocument lightWeightDocument, PageToDownload parentPageToDownload)
        {
            // Check for cancel
            if (progress.CancelRequested)
                throw new OperationCancelledException();

            _currentDepth++;
            ArrayList downloadedPages = new ArrayList();

            // Set up our progress
            int thisPageTicks = FIRSTPAGETICKS;
            if (_context.Depth == _currentDepth)
                thisPageTicks = TOTALTICKS;
            ProgressTick firstPagedownloadProgress = new ProgressTick(progress, thisPageTicks, TOTALTICKS);

            string safeUrl = UrlHelper.GetUrlWithoutAnchorIdentifier(url);

            // Look up the content type of this pageToDownload
            UrlContentTypeInfo headerInfo = null;
            if (_headerInfo.ContainsKey(safeUrl))
            {
                headerInfo = (UrlContentTypeInfo)_headerInfo[safeUrl];
            }
            else
            {
                if (lightWeightDocument != null)
                    headerInfo = new UrlContentTypeInfo("text/html", url);
                else if (headerInfo == null && !_context.IsTimedOutUrl(url) && _context.ShouldDownloadThisUrl(url))
                {
                    progress.UpdateProgress(string.Format(CultureInfo.CurrentCulture, Res.Get(StringId.ProgressDeterminingType), url));
                    if (lightWeightDocument == null)
                        headerInfo = ContentTypeHelper.ExpensivelyGetUrlContentType(url, _context.TimeoutMS);
                    else
                        headerInfo = ContentTypeHelper.InexpensivelyGetUrlContentType(url);
                }
                _headerInfo.Add(safeUrl, headerInfo);
            }

            // If this is a web page and we should download it, do it!
            if ((lightWeightDocument != null && IsDownloadablePageResource(headerInfo)) ||
                (lightWeightDocument == null && IsDownloadablePageResource(headerInfo) && _context.ShouldDownloadThisUrl(headerInfo))
                )
            {
                bool downloadWorked = false;
                int downloadAttempts = -1;
                bool timedOut = true;

                // Max sure we are retrying the correct number of times
                ProgressTick pageDownloadProgress = new ProgressTick(firstPagedownloadProgress, 80, 100);
                while (!downloadWorked && downloadAttempts++ < _context.RetryCount && timedOut)
                {
                    timedOut = false;

                    pageDownloadProgress.UpdateProgress(0, 1);
                    try
                    {
                        // If we haven't downloaded this page yet download it
                        PageToDownload thisPageToDownload = null;

                        if (!_context.UrlAlreadyDownloaded(safeUrl))
                        {
                            if (lightWeightDocument == null)
                                thisPageToDownload = DownloadUrl(url, parentPageToDownload, pageDownloadProgress);
                            else
                            {
                                LightWeightHTMLDocument htmlDoc = lightWeightDocument;

                                // Only redownload if we absolutely need to
                                if (htmlDoc.HasFramesOrStyles && (htmlDoc.Frames == null || htmlDoc.StyleResourcesUrls == null))
                                {

                                    string html = htmlDoc.GenerateHtml();
                                    string tempFile = TempFileManager.Instance.CreateTempFile("temp.htm");
                                    using (StreamWriter writer = new StreamWriter(tempFile, false, Encoding.UTF8))
                                        writer.Write(html);
                                    using (HTMLDocumentDownloader downloader = new HTMLDocumentDownloader(_parentControl, UrlHelper.GetLocalFileUrl(tempFile), htmlDoc.Title, _context.CookieString, _context.TimeoutMS, false))
                                    {
                                        downloader.DownloadHTMLDocument(pageDownloadProgress);

                                        htmlDoc.UpdateBasedUponHTMLDocumentData(downloader.HtmlDocument, url);
                                    }
                                }
                                thisPageToDownload = new PageToDownload(htmlDoc, url, null, parentPageToDownload);
                                if (htmlDoc.StyleResourcesUrls != null)
                                    foreach (HTMLDocumentHelper.ResourceUrlInfo styleUrl in htmlDoc.StyleResourcesUrls)
                                        thisPageToDownload.AddReference(new ReferenceToDownload(styleUrl.ResourceUrl, thisPageToDownload, styleUrl.ResourceAbsoluteUrl));
                            }
                            // Add this page to our lists
                            _context.AddPageToDownload(safeUrl, thisPageToDownload, true);
                            downloadedPages.Add(thisPageToDownload);

                        }
                        else
                            thisPageToDownload = (PageToDownload)_context.CreatedPageToDownloadTable[safeUrl];

                        // If we're downloading a site, add a second copy of the root page in the references subdir
                        // This was, if the root page gets renamed, links back to it will still work correctly
                        // This is a bit of a hack, but otherwise, we'll need to escape urls whenever we output
                        // the site and change the root file name
                        if (thisPageToDownload.IsRootPage && _context.Depth > 0)
                        {
                            PageToDownload copyOfThisPageToDownload = new PageToDownload(thisPageToDownload.LightWeightHTMLDocument.Clone(), thisPageToDownload.UrlToReplace, thisPageToDownload.FileName, thisPageToDownload);
                            downloadedPages.Add(copyOfThisPageToDownload);
                        }

                        // enumerate the frames of this page and add them to the list of pages
                        PageToDownload[] subFramesToDownload = GetFramePagesToDownload(thisPageToDownload);
                        downloadedPages.AddRange(subFramesToDownload);
                        foreach (PageToDownload pageToDownload in subFramesToDownload)
                            _context.AddPageToDownload(pageToDownload.AbsoluteUrl, pageToDownload, false);

                        // Now drill down based upon the depth configuration
                        if (_context.ShouldContinue(_currentDepth))
                        {
                            ProgressTick otherPagesdownloadProgress = new ProgressTick(progress, TOTALTICKS - thisPageTicks, TOTALTICKS);
                            downloadedPages.AddRange(GetSubPagesToDownload(otherPagesdownloadProgress, downloadedPages, thisPageToDownload));
                        }
                        downloadWorked = true;
                        firstPagedownloadProgress.UpdateProgress(1, 1);

                    }
                    catch (OperationTimedOutException)
                    {
                        timedOut = true;
                    }
                    catch (WebPageDownloaderException htex)
                    {
                        HandleException(new Exception(htex.Message, htex));
                    }
                    catch (Exception ex)
                    {
                        HandleException(new Exception(String.Format(CultureInfo.CurrentCulture, "{0} could not be downloaded", _url), ex));
                    }
                }

                // If we never got the download to succeed, add it to the list of timed out Urls
                if (!downloadWorked && timedOut)
                {
                    _context.AddTimedOutUrl(_url);
                    firstPagedownloadProgress.UpdateProgress(1, 1);

                }
            }
            // If it isn't a page we'll just add the file to the reference list for the parent page
            // There is not an else, because we could be looking at a reference, but a reference that
            // should not be downloaded (in which case we just ignore it)
            else if (headerInfo != null && _context.ShouldDownloadThisUrl(headerInfo))
            {
                parentPageToDownload.AddReference(new ReferenceToDownload(url, parentPageToDownload));
                progress.UpdateProgress(1, 1);
            }

            progress.UpdateProgress(1, 1);

            _currentDepth--;
            return (PageToDownload[])downloadedPages.ToArray(typeof(PageToDownload));
        }
Exemplo n.º 29
0
        private PageToDownload[] GetSubPagesToDownload(IProgressHost progress, ArrayList downloadedPagesToScan, PageToDownload parentPage)
        {
            ArrayList subPages = new ArrayList();

            // enumerate the other downloads to do (if we're scanning)
            string[] subUrlsToDownload;
            if (_context.SelectedUrlsToDownload.Count < 1)
            {
                subUrlsToDownload = GetSubPagesToDownload((PageToDownload[])downloadedPagesToScan.ToArray(typeof(PageToDownload)), parentPage);
            }
            else
            {
                subUrlsToDownload = (string[])_context.SelectedUrlsToDownload.ToArray(typeof(string));
            }

            // do the other downloads, passing the context controlling depth
            foreach (string subUrl in subUrlsToDownload)
            {
                if (_context.ShouldContinue(_currentDepth))
                {
                    ProgressTick tick = new ProgressTick(progress, 1, subUrlsToDownload.Length);
                    subPages.AddRange(DownloadPages(tick, subUrl, null, parentPage));
                }
            }
            return((PageToDownload[])subPages.ToArray(typeof(PageToDownload)));
        }
        private PageToDownload DownloadUrl(string url, PageToDownload parent, IProgressHost progress)
        {
            PageToDownload thisPageToDownload = null;

            // Download the current page
            LightWeightHTMLDocument lightWeightDoc = null;

            using (HTMLDocumentDownloader downloader = new HTMLDocumentDownloader(_parentControl, url, null, _context.CookieString, _context.TimeoutMS, true))
            {
                downloader.DownloadHTMLDocument(progress);
                lightWeightDoc = LightWeightHTMLDocument.FromIHTMLDocument2(downloader.HtmlDocument, downloader.Url);
                thisPageToDownload = new PageToDownload(lightWeightDoc, url, null, parent);
                // Reset the url in the event that a redirect occurred
                thisPageToDownload.AbsoluteUrl = downloader.Url;
            }

            foreach (HTMLDocumentHelper.ResourceUrlInfo styleUrl in lightWeightDoc.StyleResourcesUrls)
                thisPageToDownload.AddReference(new ReferenceToDownload(styleUrl.ResourceUrl, thisPageToDownload, styleUrl.ResourceAbsoluteUrl));

            return thisPageToDownload;
        }