/// <summary>
/// Gets the urls of the sub pages to download for a set of pagesToDownload
/// </summary>
/// <param name="pagesToDownload">The pagesToDownload to get sub page urls for</param>
/// <param name="parentPage">The parent page of the pagesToDownload</param>
/// <returns>An array of urls</returns>
private string[] GetSubPagesToDownload(PageToDownload[] pagesToDownload, PageToDownload parentPage)
{
    ArrayList subItemsToDownload = new ArrayList();
    foreach (PageToDownload pageToDownload in pagesToDownload)
    {
        // Collect the urls of any anchors that pass the filter
        foreach (UrlInfo urlInfo in pageToDownload.LightWeightHTMLDocument.Anchors)
        {
            if (urlInfo != null && ShouldAddUrl(urlInfo.Url, pagesToDownload, parentPage))
                subItemsToDownload.Add(urlInfo.Url);
        }

        // Image maps can also link to sub pages, so collect their AREA hrefs too
        LightWeightTag[] tags = pageToDownload.LightWeightHTMLDocument.GetTagsByName("AREA");
        foreach (LightWeightTag tag in tags)
        {
            string url = tag.BeginTag.GetAttributeValue("href");
            if (url != null && ShouldAddUrl(url, pagesToDownload, parentPage))
                subItemsToDownload.Add(url);
        }
    }
    return (string[])subItemsToDownload.ToArray(typeof(string));
}
public string GetRelativeUrlForReferencedPage(PageToDownload pageBeingReferenced)
{
    if (pageBeingReferenced.IsRootPage)
    {
        if (IsRootPage)
            return pageBeingReferenced.FileName;
        else
            return "../" + pageBeingReferenced.FileName;
    }
    else
    {
        if (IsRootPage)
            return DirectoryToken + "/" + pageBeingReferenced.FileName;
        else
            return pageBeingReferenced.FileName;
    }
}
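// Editor's note: an illustrative walk-through of the four cases above, assuming a
// hypothetical layout where the root page sits next to a supporting directory named
// by DirectoryToken (e.g. "index_files") that holds the sub pages:
//
//   root referencing root   ->  "index.htm"               (same directory)
//   sub  referencing root   ->  "../index.htm"            (up out of the supporting dir)
//   root referencing sub    ->  "index_files/page2.htm"   (down into the supporting dir)
//   sub  referencing sub    ->  "page2.htm"               (siblings in the supporting dir)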
public PageToDownload(LightWeightHTMLDocument htmlDocument, string url, string rootFileName, PageToDownload parentInfo)
{
    ParentInfo = parentInfo;
    _lightweightHTMLDocument = htmlDocument;
    _urlToReplace = url;
    _anchor = UrlHelper.GetAnchorIdentifier(url);
    m_fileName = rootFileName;
}
public void AddPageToDownload(string url, PageToDownload pageToDownload, bool countAsPageDownload)
{
    if (countAsPageDownload)
        _currentPageCount++;

    if (!CreatedPageToDownloadTable.ContainsKey(url))
        CreatedPageToDownloadTable.Add(url, pageToDownload);
}
/// <summary>
/// Returns an array of PageToDownloads representing the frames (and any
/// nested frames) of a given parent page
/// </summary>
/// <param name="parentPageToDownload">The parent PageToDownload whose frames should be enumerated</param>
/// <returns>Array of PageToDownload</returns>
private static PageToDownload[] GetFramePagesToDownload(PageToDownload parentPageToDownload)
{
    ArrayList subFrames = new ArrayList();
    if (parentPageToDownload.LightWeightHTMLDocument.Frames != null)
    {
        foreach (LightWeightHTMLDocument frameDocument in parentPageToDownload.LightWeightHTMLDocument.Frames)
        {
            PageToDownload subFramePageToDownload = new PageToDownload(frameDocument, frameDocument.Url, null, parentPageToDownload);
            subFrames.Add(subFramePageToDownload);
            // Recurse so nested framesets get flattened into the list as well
            subFrames.AddRange(GetFramePagesToDownload(subFramePageToDownload));
        }
    }
    return (PageToDownload[])subFrames.ToArray(typeof(PageToDownload));
}
public string GetRelativeUrlForReference(PageToDownload referencingPage)
{
    if (!referencingPage.IsRootPage && ParentInfo.IsRootPage)
        return FileName;
    else if (referencingPage.IsRootPage && !ParentInfo.IsRootPage)
        return ParentInfo.DirectoryToken + "/references/" + FileName;
    else
        return RelativeUrl;
}
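// Editor's note: an illustrative sketch of the three branches above, under the same
// hypothetical layout as GetRelativeUrlForReferencedPage (a "references" subdirectory
// inside the supporting directory, e.g. "index_files/references"):
//
//   sub page referencing a reference owned by the root page  ->  "style.css"
//   root page referencing a sub page's reference             ->  "index_files/references/style.css"
//   every other combination                                  ->  the reference's own RelativeUrl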
public string Capture(int timeoutMs)
{
    // flag indicating whether we should continue with the capture
    bool continueCapture = true;

    // request the page
    HttpWebResponse response = RequestPage(TargetUrl, timeoutMs);
    OnHeadersReceived(response.Headers, ref continueCapture);
    if (!continueCapture)
        throw new OperationCancelledException();

    // transfer it to a stream
    MemoryStream pageStream = new MemoryStream();
    using (Stream responseStream = response.GetResponseStream())
        StreamHelper.Transfer(responseStream, pageStream);
    pageStream.Seek(0, SeekOrigin.Begin);

    // allow filter on content
    OnContentReceived(new StreamReader(pageStream).ReadToEnd(), ref continueCapture);
    if (!continueCapture)
        throw new OperationCancelledException();
    pageStream.Seek(0, SeekOrigin.Begin);

    // Read the stream into a lightweight HTML doc. We use LightWeightHTMLDocument.FromIHTMLDocument2
    // instead of LightWeightHTMLDocument.FromStream because FromStream improperly shoves a saveFrom
    // declaration above the docType (bug 289357)
    IHTMLDocument2 doc = HTMLDocumentHelper.StreamToHTMLDoc(pageStream, TargetUrl, false);
    LightWeightHTMLDocument ldoc = LightWeightHTMLDocument.FromIHTMLDocument2(doc, TargetUrl, true);

    // download references
    FileBasedSiteStorage siteStorage = new FileBasedSiteStorage(DestinationPath, "index.htm");
    PageToDownload page = new PageToDownload(ldoc, TargetUrl, siteStorage.RootFile);
    PageAndReferenceDownloader downloader = new PageAndReferenceDownloader(new PageToDownload[] { page }, siteStorage);
    downloader.Download(new TimeoutProgressHost(timeoutMs));

    // return path to captured page
    return Path.Combine(DestinationPath, siteStorage.RootFile);
}
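// Editor's note: a hypothetical usage sketch. The enclosing class and its constructor
// are not shown in this file, so the class name and constructor arguments below are
// assumptions made for illustration only:
//
//   var capture = new WebPageCapture("http://example.com/", @"C:\captures\example");
//   string indexFile = capture.Capture(30000);   // 30 second timeout
//   // indexFile now points at "index.htm" on disk, with all downloaded
//   // references rewritten to relative paths alongside it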
/// <summary>
/// Used to actually commit the HTML to disk
/// </summary>
/// <param name="pageInfo">The PageToDownload to write</param>
/// <param name="storage">The storage to write the file into</param>
private void WriteHtmlToDisk(PageToDownload pageInfo, FileBasedSiteStorage storage)
{
    // Set the character set for this document
    pageInfo.LightWeightHTMLDocument.MetaData.Charset = Encoding.UTF8.WebName;

    string html = string.Empty;

    // Replace references to any URL that we downloaded!
    foreach (PageToDownload pageToDownload in _pagesToDownload)
    {
        if (!pageToDownload.IsRootPage)
            pageInfo.LightWeightHTMLDocument.AddUrlToEscape(new UrlToReplace(pageToDownload.UrlToReplace, pageInfo.GetRelativeUrlForReferencedPage(pageToDownload)));
    }

    foreach (ReferenceToDownload referenceToDownload in _referencesToDownload.Values)
    {
        ReferenceToDownload downloadedReference = (ReferenceToDownload)_referencesToDownload[referenceToDownload.AbsoluteUrl];

        // Since we consolidated references, replace the UrlToReplace from the original reference
        // with the relative path to the reference that actually got downloaded
        string path = downloadedReference.GetRelativeUrlForReference(pageInfo);
        pageInfo.LightWeightHTMLDocument.AddUrlToEscape(new UrlToReplace(referenceToDownload.AbsoluteUrl, path));
    }
    html = pageInfo.LightWeightHTMLDocument.GenerateHtml();

    // finally, write the html out to disk
    string destination = Path.Combine(_siteStorage.BasePath, pageInfo.RelativePath);
    Stream htmlStream = _siteStorage.Open(destination, AccessMode.Write);
    using (StreamWriter writer = new StreamWriter(htmlStream, Encoding.UTF8))
    {
        writer.Write(html);
    }

    // if this is the entry page, write the path token and root file name
    if (pageInfo.IsRootPage)
    {
        this._pathToken = pageInfo.ReferencedFileRelativePath;
        _siteStorage.RootFile = pageInfo.FileName;
    }
}
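// Editor's note: the rewriting above works by registering url substitutions rather than
// editing the markup directly; GenerateHtml() is what applies them. As an illustration
// (hypothetical urls and file names), a page whose source contained
//   <img src="http://example.com/images/logo.gif">
// would, after AddUrlToEscape(new UrlToReplace("http://example.com/images/logo.gif",
// "index_files/logo.gif")), be emitted with src="index_files/logo.gif".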
private PageToDownload DownloadUrl(string url, PageToDownload parent, IProgressHost progress)
{
    PageToDownload thisPageToDownload = null;

    // Download the current page
    LightWeightHTMLDocument lightWeightDoc = null;
    using (HTMLDocumentDownloader downloader = new HTMLDocumentDownloader(_parentControl, url, null, _context.CookieString, _context.TimeoutMS, true))
    {
        downloader.DownloadHTMLDocument(progress);
        lightWeightDoc = LightWeightHTMLDocument.FromIHTMLDocument2(downloader.HtmlDocument, downloader.Url);
        thisPageToDownload = new PageToDownload(lightWeightDoc, url, null, parent);

        // Reset the url in the event that a redirect occurred
        thisPageToDownload.AbsoluteUrl = downloader.Url;
    }
    foreach (HTMLDocumentHelper.ResourceUrlInfo styleUrl in lightWeightDoc.StyleResourcesUrls)
        thisPageToDownload.AddReference(new ReferenceToDownload(styleUrl.ResourceUrl, thisPageToDownload, styleUrl.ResourceAbsoluteUrl));

    return thisPageToDownload;
}
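// Editor's note: the page is constructed with the originally requested url (so existing
// links to it can still be matched and replaced), while AbsoluteUrl is reset to
// downloader.Url to reflect any redirect. Illustrative example (hypothetical urls):
// requesting "http://example.com/a" that redirects to "http://example.com/b" leaves
// UrlToReplace as ".../a" but sets AbsoluteUrl to ".../b", which is the value the
// duplicate/cycle checks in ShouldAddUrl compare against.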
private bool ShouldAddUrl(string url, PageToDownload[] pagesToDownload, PageToDownload parentPage)
{
    // Filter it out if it's already in the list
    for (int i = 0; i < pagesToDownload.Length; i++)
    {
        if (pagesToDownload[i] != null && UrlHelper.UrlsAreEqual(url, pagesToDownload[i].AbsoluteUrl))
            return false;
    }

    // Filter it out if it is one of this page's parents (prevents cycles)
    PageToDownload currentParent = parentPage;
    while (currentParent != null)
    {
        if (UrlHelper.UrlsAreEqual(url, currentParent.AbsoluteUrl))
            return false;
        currentParent = currentParent.ParentInfo;
    }
    return true;
}
/// <summary>
/// Constructs a new asynchronous page downloader.
/// </summary>
/// <param name="pagesToDownload">The array of pagesToDownload</param>
/// <param name="siteStorage">The file based site storage into which to place the page and references</param>
/// <param name="throwOnFailure">Indicates whether the downloader should throw on failure, or just
/// log the failure and continue</param>
public PageAndReferenceDownloader(PageToDownload[] pagesToDownload, FileBasedSiteStorage siteStorage, bool throwOnFailure)
{
    _siteStorage = siteStorage;
    _pagesToDownload = pagesToDownload;
    _throwOnFailure = throwOnFailure;
}
/// <summary>
/// Constructs a new asynchronous page downloader that throws on failure.
/// </summary>
/// <param name="pagesToDownload">The array of pagesToDownload</param>
/// <param name="siteStorage">The file based site storage into which to place the page and references</param>
public PageAndReferenceDownloader(PageToDownload[] pagesToDownload, FileBasedSiteStorage siteStorage)
    : this(pagesToDownload, siteStorage, true)
{
}
private PageToDownload[] GetSubPagesToDownload(IProgressHost progress, ArrayList downloadedPagesToScan, PageToDownload parentPage)
{
    ArrayList subPages = new ArrayList();

    // enumerate the other downloads to do (if we're scanning),
    // otherwise use the urls that were explicitly selected
    string[] subUrlsToDownload;
    if (_context.SelectedUrlsToDownload.Count < 1)
        subUrlsToDownload = GetSubPagesToDownload((PageToDownload[])downloadedPagesToScan.ToArray(typeof(PageToDownload)), parentPage);
    else
        subUrlsToDownload = (string[])_context.SelectedUrlsToDownload.ToArray(typeof(string));

    // do the other downloads, passing the context controlling depth
    foreach (string subUrl in subUrlsToDownload)
    {
        if (_context.ShouldContinue(_currentDepth))
        {
            ProgressTick tick = new ProgressTick(progress, 1, subUrlsToDownload.Length);
            subPages.AddRange(DownloadPages(tick, subUrl, null, parentPage));
        }
    }
    return (PageToDownload[])subPages.ToArray(typeof(PageToDownload));
}
/// <summary>
/// Constructs a new reference download info
/// </summary>
/// <param name="url">The Url of the reference</param>
/// <param name="parentPageToDownload">The parent page that references this url</param>
public ReferenceToDownload(string url, PageToDownload parentPageToDownload)
{
    m_urlToReplace = url;
    ParentInfo = parentPageToDownload;
}
/// <summary>
/// Actually downloads the pages
/// </summary>
private PageToDownload[] DownloadPages(IProgressHost progress, string url, LightWeightHTMLDocument lightWeightDocument, PageToDownload parentPageToDownload)
{
    // Check for cancel
    if (progress.CancelRequested)
        throw new OperationCancelledException();

    _currentDepth++;
    ArrayList downloadedPages = new ArrayList();

    // Set up our progress
    int thisPageTicks = FIRSTPAGETICKS;
    if (_context.Depth == _currentDepth)
        thisPageTicks = TOTALTICKS;
    ProgressTick firstPagedownloadProgress = new ProgressTick(progress, thisPageTicks, TOTALTICKS);

    string safeUrl = UrlHelper.GetUrlWithoutAnchorIdentifier(url);

    // Look up the content type of this pageToDownload
    UrlContentTypeInfo headerInfo = null;
    if (_headerInfo.ContainsKey(safeUrl))
    {
        headerInfo = (UrlContentTypeInfo)_headerInfo[safeUrl];
    }
    else
    {
        if (lightWeightDocument != null)
            headerInfo = new UrlContentTypeInfo("text/html", url);
        else if (headerInfo == null && !_context.IsTimedOutUrl(url) && _context.ShouldDownloadThisUrl(url))
        {
            progress.UpdateProgress(string.Format(CultureInfo.CurrentCulture, Res.Get(StringId.ProgressDeterminingType), url));
            if (lightWeightDocument == null)
                headerInfo = ContentTypeHelper.ExpensivelyGetUrlContentType(url, _context.TimeoutMS);
            else
                headerInfo = ContentTypeHelper.InexpensivelyGetUrlContentType(url);
        }
        _headerInfo.Add(safeUrl, headerInfo);
    }

    // If this is a web page and we should download it, do it!
    if ((lightWeightDocument != null && IsDownloadablePageResource(headerInfo)) ||
        (lightWeightDocument == null && IsDownloadablePageResource(headerInfo) && _context.ShouldDownloadThisUrl(headerInfo)))
    {
        bool downloadWorked = false;
        int downloadAttempts = -1;
        bool timedOut = true;

        ProgressTick pageDownloadProgress = new ProgressTick(firstPagedownloadProgress, 80, 100);

        // Make sure we are retrying the correct number of times
        while (!downloadWorked && downloadAttempts++ < _context.RetryCount && timedOut)
        {
            timedOut = false;

            pageDownloadProgress.UpdateProgress(0, 1);
            try
            {
                // If we haven't downloaded this page yet, download it
                PageToDownload thisPageToDownload = null;

                if (!_context.UrlAlreadyDownloaded(safeUrl))
                {
                    if (lightWeightDocument == null)
                        thisPageToDownload = DownloadUrl(url, parentPageToDownload, pageDownloadProgress);
                    else
                    {
                        LightWeightHTMLDocument htmlDoc = lightWeightDocument;

                        // Only redownload if we absolutely need to
                        if (htmlDoc.HasFramesOrStyles && (htmlDoc.Frames == null || htmlDoc.StyleResourcesUrls == null))
                        {
                            string html = htmlDoc.GenerateHtml();
                            string tempFile = TempFileManager.Instance.CreateTempFile("temp.htm");
                            using (StreamWriter writer = new StreamWriter(tempFile, false, Encoding.UTF8))
                                writer.Write(html);
                            using (HTMLDocumentDownloader downloader = new HTMLDocumentDownloader(_parentControl, UrlHelper.GetLocalFileUrl(tempFile), htmlDoc.Title, _context.CookieString, _context.TimeoutMS, false))
                            {
                                downloader.DownloadHTMLDocument(pageDownloadProgress);
                                htmlDoc.UpdateBasedUponHTMLDocumentData(downloader.HtmlDocument, url);
                            }
                        }

                        thisPageToDownload = new PageToDownload(htmlDoc, url, null, parentPageToDownload);
                        if (htmlDoc.StyleResourcesUrls != null)
                        {
                            foreach (HTMLDocumentHelper.ResourceUrlInfo styleUrl in htmlDoc.StyleResourcesUrls)
                                thisPageToDownload.AddReference(new ReferenceToDownload(styleUrl.ResourceUrl, thisPageToDownload, styleUrl.ResourceAbsoluteUrl));
                        }
                    }

                    // Add this page to our lists
                    _context.AddPageToDownload(safeUrl, thisPageToDownload, true);
                    downloadedPages.Add(thisPageToDownload);
                }
                else
                {
                    thisPageToDownload = (PageToDownload)_context.CreatedPageToDownloadTable[safeUrl];
                }

                // If we're downloading a site, add a second copy of the root page in the references subdir.
                // This way, if the root page gets renamed, links back to it will still work correctly.
                // This is a bit of a hack, but otherwise we'd need to escape urls whenever we output
                // the site and change the root file name
                if (thisPageToDownload.IsRootPage && _context.Depth > 0)
                {
                    PageToDownload copyOfThisPageToDownload = new PageToDownload(thisPageToDownload.LightWeightHTMLDocument.Clone(), thisPageToDownload.UrlToReplace, thisPageToDownload.FileName, thisPageToDownload);
                    downloadedPages.Add(copyOfThisPageToDownload);
                }

                // enumerate the frames of this page and add them to the list of pages
                PageToDownload[] subFramesToDownload = GetFramePagesToDownload(thisPageToDownload);
                downloadedPages.AddRange(subFramesToDownload);
                foreach (PageToDownload pageToDownload in subFramesToDownload)
                    _context.AddPageToDownload(pageToDownload.AbsoluteUrl, pageToDownload, false);

                // Now drill down based upon the depth configuration
                if (_context.ShouldContinue(_currentDepth))
                {
                    ProgressTick otherPagesdownloadProgress = new ProgressTick(progress, TOTALTICKS - thisPageTicks, TOTALTICKS);
                    downloadedPages.AddRange(GetSubPagesToDownload(otherPagesdownloadProgress, downloadedPages, thisPageToDownload));
                }
                downloadWorked = true;
                firstPagedownloadProgress.UpdateProgress(1, 1);
            }
            catch (OperationTimedOutException)
            {
                timedOut = true;
            }
            catch (WebPageDownloaderException htex)
            {
                HandleException(new Exception(htex.Message, htex));
            }
            catch (Exception ex)
            {
                HandleException(new Exception(String.Format(CultureInfo.CurrentCulture, "{0} could not be downloaded", url), ex));
            }
        }

        // If we never got the download to succeed, add it to the list of timed out urls
        if (!downloadWorked && timedOut)
        {
            _context.AddTimedOutUrl(url);
            firstPagedownloadProgress.UpdateProgress(1, 1);
        }
    }
    // If it isn't a page, we'll just add the file to the reference list for the parent page.
    // There is no else, because we could be looking at a reference, but a reference that
    // should not be downloaded (in which case we just ignore it)
    else if (headerInfo != null && _context.ShouldDownloadThisUrl(headerInfo))
    {
        parentPageToDownload.AddReference(new ReferenceToDownload(url, parentPageToDownload));
        progress.UpdateProgress(1, 1);
    }

    progress.UpdateProgress(1, 1);
    _currentDepth--;
    return (PageToDownload[])downloadedPages.ToArray(typeof(PageToDownload));
}
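// Editor's note on the retry loop above: downloadAttempts starts at -1 and is
// post-incremented in the while condition, so with a hypothetical RetryCount of 3
// the body runs at most four times (the initial try plus three retries). Only
// OperationTimedOutException re-arms the loop by setting timedOut back to true;
// other failures are routed to HandleException and the loop exits because
// timedOut remains false.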
public ReferenceToDownload(string url, PageToDownload parentPageToDownload, string absoluteUrl)
    : this(url, parentPageToDownload)
{
    m_absoluteUrl = absoluteUrl;
}