/// <summary>
/// Builds a LightWeightHTMLDocument from a live IHTMLDocument2, attaching the frames,
/// style references, DocType and SavedFrom values read from that document.
/// </summary>
/// <param name="htmlDocument">The document to convert.</param>
/// <param name="url">The url of the document; also used for url conversion and the style reference lookup.</param>
/// <param name="name">The name given to the resulting document.</param>
/// <param name="escapePaths">When true, the HTML is passed through LightWeightHTMLUrlToAbsolute.ConvertToAbsolute.</param>
/// <param name="escapeEmptyString">Forwarded to ConvertToAbsolute when escapePaths is true.</param>
/// <returns>The parsed document, or null when no HTML string could be obtained from htmlDocument.</returns>
private static LightWeightHTMLDocument FromIHTMLDocument2(IHTMLDocument2 htmlDocument, string url, string name, bool escapePaths, bool escapeEmptyString)
{
    string rawHtml = HTMLDocumentHelper.HTMLDocToString(htmlDocument);
    if (rawHtml == null)
    {
        return null;
    }

    string documentHtml = escapePaths
        ? LightWeightHTMLUrlToAbsolute.ConvertToAbsolute(rawHtml, url, true, escapeEmptyString)
        : rawHtml;

    LightWeightHTMLDocument result = new LightWeightHTMLDocument(documentHtml, url, name);

    // Frames are taken from the live document rather than from the HTML text
    LightWeightHTMLDocument[] frames = GetLightWeightDocumentForFrames(htmlDocument);
    result.SetFrames(frames);

    // Style references
    result.SetStyleReferences(HTMLDocumentHelper.GetStyleReferencesForDocument(htmlDocument, url));

    // DocType and SavedFrom
    HTMLDocumentHelper.SpecialHeaders headers = HTMLDocumentHelper.GetSpecialHeaders(htmlDocument);
    result._docType = headers.DocType;
    result._savedFrom = headers.SavedFrom;

    result.Parse();
    return result;
}
/// <summary>
/// Creates a page to download for the given document and url.
/// </summary>
/// <param name="htmlDocument">The lightweight document stored for this page.</param>
/// <param name="url">Stored as the url to replace; its anchor identifier is extracted and stored as well.</param>
/// <param name="rootFileName">The file name stored for this page.</param>
/// <param name="parentInfo">Assigned to ParentInfo.</param>
public PageToDownload(LightWeightHTMLDocument htmlDocument, string url, string rootFileName, PageToDownload parentInfo)
{
    this.ParentInfo = parentInfo;
    this._lightweightHTMLDocument = htmlDocument;

    // Keep both the url as given and the anchor portion extracted from it
    this._urlToReplace = url;
    this._anchor = UrlHelper.GetAnchorIdentifier(url);

    this.m_fileName = rootFileName;
}
/// <summary>
/// Creates a LightWeightHTMLDocument from an IHTMLDocument2.
/// </summary>
/// <param name="htmlDocument">The document to convert; may be null.</param>
/// <param name="url">The url of the document.</param>
/// <returns>
/// Null when htmlDocument is null; otherwise the result of the three-argument overload
/// called with true as its last argument (presumably escapePaths; that overload is not visible here).
/// </returns>
public static LightWeightHTMLDocument FromIHTMLDocument2(IHTMLDocument2 htmlDocument, string url)
{
    return htmlDocument == null
        ? null
        : LightWeightHTMLDocument.FromIHTMLDocument2(htmlDocument, url, true);
}
/// <summary>
/// Creates an unnamed LightWeightHTMLDocument from an IHTMLDocument2.
/// </summary>
/// <param name="htmlDocument">The document to convert; may be null.</param>
/// <param name="url">The url of the document.</param>
/// <param name="escapePaths">Forwarded to the five-argument overload.</param>
/// <param name="escapeEmptyString">Forwarded to the five-argument overload.</param>
/// <returns>Null when htmlDocument is null; otherwise the document built with a null name.</returns>
public static LightWeightHTMLDocument FromIHTMLDocument2(IHTMLDocument2 htmlDocument, string url, bool escapePaths, bool escapeEmptyString)
{
    return htmlDocument == null
        ? null
        : LightWeightHTMLDocument.FromIHTMLDocument2(htmlDocument, url, null, escapePaths, escapeEmptyString);
}
/// <summary>
/// Creates a LightWeightHTMLDocument from a stream of HTML.
/// </summary>
/// <remarks>
/// A seekable stream is first decoded with Encoding.Default. If the document's metadata
/// declares a charset that maps to a different code page, the stream is rewound and decoded
/// again with that encoding. A non-seekable stream is copied to a temp file and loaded
/// through FromFile instead. On the seekable path the stream is closed when the reader is disposed.
/// </remarks>
/// <param name="stream">The stream containing the HTML.</param>
/// <param name="url">The url of the document.</param>
/// <param name="name">The name given to the resulting document.</param>
/// <returns>The document read from the stream.</returns>
/// <exception cref="ArgumentNullException">stream is null.</exception>
public static LightWeightHTMLDocument FromStream(Stream stream, string url, string name)
{
    if (stream == null)
    {
        throw new ArgumentNullException("stream");
    }

    if (!stream.CanSeek)
    {
        // The path below needs Seek to re-read the content, so copy to a temp file first
        string filePath = TempFileManager.Instance.CreateTempFile();
        using (FileStream file = new FileStream(filePath, FileMode.Open))
        {
            StreamHelper.Transfer(stream, file);
        }
        return LightWeightHTMLDocument.FromFile(filePath, url, name);
    }

    Encoding currentEncoding = Encoding.Default;
    LightWeightHTMLDocument lwDoc;
    using (StreamReader reader = new StreamReader(stream, currentEncoding))
    {
        lwDoc = LightWeightHTMLDocument.FromString(reader.ReadToEnd(), url, name, true);

        // If there is no metadata that disagrees with our encoding, just return the DOM read with default decoding
        LightWeightHTMLMetaData metaData = new LightWeightHTMLMetaData(lwDoc);
        if (metaData.Charset != null)
        {
            try
            {
                // Compare code pages rather than object references: two Encoding instances
                // can describe the same code page, and re-reading then would be wasted work.
                Encoding encoding = Encoding.GetEncoding(metaData.Charset);
                if (encoding.CodePage != currentEncoding.CodePage)
                {
                    // The declared charset differs from the encoding used above, reread with it
                    reader.DiscardBufferedData();
                    stream.Seek(0, SeekOrigin.Begin);
                    using (StreamReader reader2 = new StreamReader(stream, encoding))
                    {
                        lwDoc = LightWeightHTMLDocument.FromString(reader2.ReadToEnd(), url, name, true);
                    }
                }
            }
            catch (NotSupportedException)
            {
                // The encoding isn't supported on this system
            }
            catch (ArgumentException)
            {
                // The encoding isn't an encoding that the OS even knows about (it's probably
                // not well formatted or misspelled or something)
            }
        }
    }
    return lwDoc;
}
/// <summary>
/// Builds a LightWeightHTMLDocument for each subframe embedded in the given document.
/// </summary>
/// <param name="htmlDocument">
/// The document whose frames are enumerated. It is cast to IOleContainer, which requires
/// that the document is the root document in the browser.
/// </param>
/// <returns>The documents created for the frames (an empty array when there are none).</returns>
public static LightWeightHTMLDocument[] GetLightWeightDocumentForFrames(IHTMLDocument2 htmlDocument)
{
    ArrayList frameLightWeightDocuments = new ArrayList();

    // Get the IOleContainer for the html document (this requires that
    // the document is the root document in the browser)
    IOleContainer oleContainer = (IOleContainer)htmlDocument;

    // Enumerate the controls in the browser
    IEnumUnknown enumUnknown;
    oleContainer.EnumObjects(OLECONTF.EMBEDDINGS, out enumUnknown);

    // Iterate through the controls
    object unknown;
    while (HRESULT.S_OK == enumUnknown.Next(1, out unknown, IntPtr.Zero))
    {
        // Only subframes should cast to IWebBrowser2
        IWebBrowser2 webBrowser = unknown as IWebBrowser2;
        if (webBrowser == null)
        {
            continue;
        }

        // A subframe normally also exposes the base frame implementation, but the
        // 'as' cast can yield null, so it must be checked before it is used below.
        IHTMLFrameBase frameBase = unknown as IHTMLFrameBase;

        try
        {
            IHTMLDocument2 frameDocument = webBrowser.Document as IHTMLDocument2;
            if (frameDocument != null)
            {
                // Fall back to an unnamed document when no frame base is available
                string frameName = frameBase != null ? frameBase.name : null;
                LightWeightHTMLDocument document = LightWeightHTMLDocument.FromIHTMLDocument2(frameDocument, frameDocument.url, frameName);
                if (document != null)
                {
                    frameLightWeightDocuments.Add(document);
                }
            }
        }
        catch (InvalidCastException)
        {
            // Substitute an empty document identified by the frame's location url
            string html = "<HTML></HTML>";
            LightWeightHTMLDocument document = LightWeightHTMLDocument.FromString(html, webBrowser.LocationURL, webBrowser.LocationURL, true);
            if (document != null)
            {
                frameLightWeightDocuments.Add(document);
            }
        }
    }

    return (LightWeightHTMLDocument[])frameLightWeightDocuments.ToArray(typeof(LightWeightHTMLDocument));
}
/// <summary>
/// Creates a LightWeightHTMLDocumentData from a data object, if the data object
/// carries the lightweight HTML document format.
/// </summary>
/// <param name="iDataObject">The data object to read from.</param>
/// <returns>
/// A LightWeightHTMLDocumentData wrapping the document stored in the data object,
/// or null when the format is not present.
/// </returns>
public static LightWeightHTMLDocumentData Create(IDataObject iDataObject)
{
    // Note: an unused local that captured iDataObject.GetFormats() was removed here.
    if (!OleDataObjectHelper.GetDataPresentSafe(iDataObject, LightWeightHTMLDataObject.LIGHTWEIGHTHTMLDOCUMENTFORMAT))
    {
        return null;
    }

    LightWeightHTMLDocument document = (LightWeightHTMLDocument)iDataObject.GetData(LightWeightHTMLDataObject.LIGHTWEIGHTHTMLDOCUMENTFORMAT);
    return new LightWeightHTMLDocumentData(document);
}
/// <summary>
/// Constructs a new LightWeightHTMLMetaData for a LightWeightHTMLDocument
/// </summary>
/// <param name="HTMLDocument">The LightWeightHTMLDocument for which to fetch metadata.</param>
public LightWeightHTMLMetaData(LightWeightHTMLDocument HTMLDocument)
{
    m_HTMLDocument = HTMLDocument;
    _docType = HTMLDocument.DocType;
    _savedFrom = HTMLDocument.SavedFrom;

    // The base is the href of the first HTMLTokens.Base tag that has one
    LightWeightTag[] baseTags = HTMLDocument.GetTagsByName(HTMLTokens.Base);
    for (int index = 0; index < baseTags.Length; index++)
    {
        Attr href = baseTags[index].BeginTag.GetAttribute(HTMLTokens.Href);
        if (href == null)
        {
            continue;
        }

        _base = href.Value;
        break;
    }
}
/// <summary>
/// Creates a LightWeightHTMLDocument from a string of HTML.
/// </summary>
/// <param name="html">The HTML text.</param>
/// <param name="baseUrl">The url of the document; also used for url conversion and the special header lookup.</param>
/// <param name="name">The name given to the resulting document.</param>
/// <param name="escapePaths">When true, the HTML is passed through LightWeightHTMLUrlToAbsolute.ConvertToAbsolute.</param>
/// <returns>The parsed document.</returns>
public static LightWeightHTMLDocument FromString(string html, string baseUrl, string name, bool escapePaths)
{
    string documentHtml = escapePaths
        ? LightWeightHTMLUrlToAbsolute.ConvertToAbsolute(html, baseUrl)
        : html;

    LightWeightHTMLDocument document = new LightWeightHTMLDocument(documentHtml, baseUrl, name);

    // DocType and SavedFrom are read from the (possibly converted) HTML text
    HTMLDocumentHelper.SpecialHeaders headers = HTMLDocumentHelper.GetSpecialHeaders(documentHtml, baseUrl);
    document._docType = headers.DocType;
    document._savedFrom = headers.SavedFrom;

    document.Parse();
    return document;
}
/// <summary>
/// Requests TargetUrl, downloads the page and its references into DestinationPath,
/// and returns the path to the captured page.
/// </summary>
/// <param name="timeoutMs">Timeout in milliseconds passed to the page request and to the download progress host.</param>
/// <returns>The path of the captured root file inside DestinationPath.</returns>
/// <exception cref="OperationCancelledException">
/// The headers or content callback set continueCapture to false.
/// </exception>
public string Capture(int timeoutMs)
{
    // flag indicating whether we should continue with the capture
    bool continueCapture = true;

    MemoryStream pageStream = new MemoryStream();

    // request the page; the response is disposed even when the capture is cancelled
    // by the headers callback, so that it is not left open
    using (HttpWebResponse response = RequestPage(TargetUrl, timeoutMs))
    {
        OnHeadersReceived(response.Headers, ref continueCapture);
        if (!continueCapture)
        {
            throw new OperationCancelledException();
        }

        // transfer it to a stream
        using (Stream responseStream = response.GetResponseStream())
        {
            StreamHelper.Transfer(responseStream, pageStream);
        }
    }
    pageStream.Seek(0, SeekOrigin.Begin);

    // allow filter on content (the reader is intentionally not disposed,
    // since disposing it would close pageStream, which is read again below)
    OnContentReceived(new StreamReader(pageStream).ReadToEnd(), ref continueCapture);
    if (!continueCapture)
    {
        throw new OperationCancelledException();
    }
    pageStream.Seek(0, SeekOrigin.Begin);

    // Read the stream into a lightweight HTML doc. We use LightWeightHTMLDocument.FromIHTMLDocument2
    // instead of LightWeightHTMLDocument.FromStream because FromStream improperly shoves a savedFrom
    // declaration above the docType (bug 289357)
    IHTMLDocument2 doc = HTMLDocumentHelper.StreamToHTMLDoc(pageStream, TargetUrl, false);
    LightWeightHTMLDocument ldoc = LightWeightHTMLDocument.FromIHTMLDocument2(doc, TargetUrl, true);

    // download references
    FileBasedSiteStorage siteStorage = new FileBasedSiteStorage(DestinationPath, "index.htm");
    PageToDownload page = new PageToDownload(ldoc, TargetUrl, siteStorage.RootFile);
    PageAndReferenceDownloader downloader = new PageAndReferenceDownloader(new PageToDownload[] { page }, siteStorage);
    downloader.Download(new TimeoutProgressHost(timeoutMs));

    // return path to captured page
    return Path.Combine(DestinationPath, siteStorage.RootFile);
}
/// <summary>
/// Downloads the document at the given url and wraps it in a PageToDownload,
/// adding a reference for each of the document's style resources.
/// </summary>
/// <param name="url">The url to download.</param>
/// <param name="parent">The parent passed to the created PageToDownload.</param>
/// <param name="progress">Progress host passed to the document downloader.</param>
/// <returns>The page created for the downloaded document.</returns>
private PageToDownload DownloadUrl(string url, PageToDownload parent, IProgressHost progress)
{
    PageToDownload thisPageToDownload;
    LightWeightHTMLDocument lightWeightDoc;

    // Download the current page
    using (HTMLDocumentDownloader downloader = new HTMLDocumentDownloader(_parentControl, url, null, _context.CookieString, _context.TimeoutMS, true))
    {
        downloader.DownloadHTMLDocument(progress);
        lightWeightDoc = LightWeightHTMLDocument.FromIHTMLDocument2(downloader.HtmlDocument, downloader.Url);
        thisPageToDownload = new PageToDownload(lightWeightDoc, url, null, parent);

        // Reset the url in the event that a redirect occurred
        thisPageToDownload.AbsoluteUrl = downloader.Url;
    }

    // Style resources may be absent; DownloadPages guards the same property the same way.
    // (A null lightWeightDoc still fails here, as before, so the caller's error handling sees it.)
    if (lightWeightDoc.StyleResourcesUrls != null)
    {
        foreach (HTMLDocumentHelper.ResourceUrlInfo styleUrl in lightWeightDoc.StyleResourcesUrls)
        {
            thisPageToDownload.AddReference(new ReferenceToDownload(styleUrl.ResourceUrl, thisPageToDownload, styleUrl.ResourceAbsoluteUrl));
        }
    }

    return thisPageToDownload;
}
/// <summary>
/// Fetches the HTMLMetaData
/// </summary>
/// <param name="cacheSettings">Cache settings passed to the web request when getting the response stream.</param>
/// <returns>
/// The metadata of the document read from the response stream, or null when
/// no document could be read from it.
/// </returns>
private LightWeightHTMLMetaData GetMetaData(WebRequestWithCache.CacheSettings cacheSettings)
{
    WebRequestWithCache webRequest = new WebRequestWithCache(m_url);

    LightWeightHTMLDocument document = null;

    // Dispose the response stream on every path: FromStream only closes it on its
    // seekable path, and not at all if it throws before wrapping it in a reader.
    using (Stream stream = webRequest.GetResponseStream(cacheSettings))
    {
        try
        {
            document = LightWeightHTMLDocument.FromStream(stream, m_url);
        }
        catch (Exception e)
        {
            // Best effort: a failure to read the document results in null metadata
            Debug.Fail("Couldn't get metadata from stream: " + e.Message);
        }
    }

    if (document == null)
    {
        return null;
    }
    return new LightWeightHTMLMetaData(document);
}
/// <summary>
/// Creates a LightWeightHTMLDocument from an IHTMLDocument2, passing true for escapeEmptyString.
/// </summary>
/// <param name="htmlDocument">The document to convert.</param>
/// <param name="url">The url of the document.</param>
/// <param name="name">The name given to the resulting document.</param>
/// <param name="escapePaths">Forwarded to the five-argument overload.</param>
/// <returns>The result of the five-argument overload.</returns>
private static LightWeightHTMLDocument FromIHTMLDocument2(IHTMLDocument2 htmlDocument, string url, string name, bool escapePaths)
{
    const bool escapeEmptyString = true;
    return FromIHTMLDocument2(htmlDocument, url, name, escapePaths, escapeEmptyString);
}
/// <summary>
/// Creates a data object that carries the given document under the
/// LIGHTWEIGHTHTMLDOCUMENTFORMAT format.
/// </summary>
/// <param name="lightWeightDocument">The document to store in the data object.</param>
public LightWeightHTMLDataObject(LightWeightHTMLDocument lightWeightDocument)
{
    DataObject documentDataObject = new DataObject(LIGHTWEIGHTHTMLDOCUMENTFORMAT, lightWeightDocument);
    IDataObject = documentDataObject;
}
/// <summary>
/// Creates a named LightWeightHTMLDocument from an IHTMLDocument2, passing true for escapePaths.
/// </summary>
/// <param name="htmlDocument">The document to convert.</param>
/// <param name="url">The url of the document.</param>
/// <param name="name">The name given to the resulting document.</param>
/// <returns>The result of the four-argument overload.</returns>
private static LightWeightHTMLDocument FromIHTMLDocument2(IHTMLDocument2 htmlDocument, string url, string name)
{
    const bool escapePaths = true;
    return FromIHTMLDocument2(htmlDocument, url, name, escapePaths);
}
/// <summary>
/// Actually downloads the pages
/// </summary>
/// <param name="progress">Progress host that is checked for cancellation and updated as work completes.</param>
/// <param name="url">The url of the page (or reference) to process.</param>
/// <param name="lightWeightDocument">An already available document for the url, or null to download it.</param>
/// <param name="parentPageToDownload">The page that refers to this url; non-page resources are added to it as references.</param>
/// <returns>The pages created while processing this url (an empty array when there are none).</returns>
/// <exception cref="OperationCancelledException">progress reports that cancel was requested.</exception>
private PageToDownload[] DownloadPages(IProgressHost progress, string url, LightWeightHTMLDocument lightWeightDocument, PageToDownload parentPageToDownload)
{
    // Check for cancel
    if (progress.CancelRequested)
    {
        throw new OperationCancelledException();
    }

    _currentDepth++;
    try
    {
        ArrayList downloadedPages = new ArrayList();

        // Set up our progress
        int thisPageTicks = FIRSTPAGETICKS;
        if (_context.Depth == _currentDepth)
        {
            thisPageTicks = TOTALTICKS;
        }
        ProgressTick firstPagedownloadProgress = new ProgressTick(progress, thisPageTicks, TOTALTICKS);

        string safeUrl = UrlHelper.GetUrlWithoutAnchorIdentifier(url);

        // Look up the content type of this pageToDownload
        UrlContentTypeInfo headerInfo = null;
        if (_headerInfo.ContainsKey(safeUrl))
        {
            headerInfo = (UrlContentTypeInfo)_headerInfo[safeUrl];
        }
        else
        {
            if (lightWeightDocument != null)
            {
                // We already have the document, so it is known to be HTML
                headerInfo = new UrlContentTypeInfo("text/html", url);
            }
            else if (!_context.IsTimedOutUrl(url) && _context.ShouldDownloadThisUrl(url))
            {
                // No document is available on this branch, so the content type must be fetched
                progress.UpdateProgress(string.Format(CultureInfo.CurrentCulture, Res.Get(StringId.ProgressDeterminingType), url));
                headerInfo = ContentTypeHelper.ExpensivelyGetUrlContentType(url, _context.TimeoutMS);
            }
            _headerInfo.Add(safeUrl, headerInfo);
        }

        // If this is a web page and we should download it, do it!
        // (A supplied document is always processed; otherwise the context decides.)
        bool isPageToDownload = IsDownloadablePageResource(headerInfo)
            && (lightWeightDocument != null || _context.ShouldDownloadThisUrl(headerInfo));

        if (isPageToDownload)
        {
            bool downloadWorked = false;
            int downloadAttempts = -1;
            bool timedOut = true;

            // Make sure we are retrying the correct number of times
            ProgressTick pageDownloadProgress = new ProgressTick(firstPagedownloadProgress, 80, 100);
            while (!downloadWorked && downloadAttempts++ < _context.RetryCount && timedOut)
            {
                timedOut = false;

                pageDownloadProgress.UpdateProgress(0, 1);
                try
                {
                    // If we haven't downloaded this page yet download it
                    PageToDownload thisPageToDownload = null;

                    if (!_context.UrlAlreadyDownloaded(safeUrl))
                    {
                        if (lightWeightDocument == null)
                        {
                            thisPageToDownload = DownloadUrl(url, parentPageToDownload, pageDownloadProgress);
                        }
                        else
                        {
                            LightWeightHTMLDocument htmlDoc = lightWeightDocument;

                            // Only redownload if we absolutely need to
                            if (htmlDoc.HasFramesOrStyles && (htmlDoc.Frames == null || htmlDoc.StyleResourcesUrls == null))
                            {
                                string html = htmlDoc.GenerateHtml();
                                string tempFile = TempFileManager.Instance.CreateTempFile("temp.htm");
                                using (StreamWriter writer = new StreamWriter(tempFile, false, Encoding.UTF8))
                                {
                                    writer.Write(html);
                                }

                                using (HTMLDocumentDownloader downloader = new HTMLDocumentDownloader(_parentControl, UrlHelper.GetLocalFileUrl(tempFile), htmlDoc.Title, _context.CookieString, _context.TimeoutMS, false))
                                {
                                    downloader.DownloadHTMLDocument(pageDownloadProgress);
                                    htmlDoc.UpdateBasedUponHTMLDocumentData(downloader.HtmlDocument, url);
                                }
                            }

                            thisPageToDownload = new PageToDownload(htmlDoc, url, null, parentPageToDownload);
                            if (htmlDoc.StyleResourcesUrls != null)
                            {
                                foreach (HTMLDocumentHelper.ResourceUrlInfo styleUrl in htmlDoc.StyleResourcesUrls)
                                {
                                    thisPageToDownload.AddReference(new ReferenceToDownload(styleUrl.ResourceUrl, thisPageToDownload, styleUrl.ResourceAbsoluteUrl));
                                }
                            }
                        }

                        // Add this page to our lists
                        _context.AddPageToDownload(safeUrl, thisPageToDownload, true);
                        downloadedPages.Add(thisPageToDownload);
                    }
                    else
                    {
                        thisPageToDownload = (PageToDownload)_context.CreatedPageToDownloadTable[safeUrl];
                    }

                    // If we're downloading a site, add a second copy of the root page in the references subdir
                    // This way, if the root page gets renamed, links back to it will still work correctly
                    // This is a bit of a hack, but otherwise, we'll need to escape urls whenever we output
                    // the site and change the root file name
                    if (thisPageToDownload.IsRootPage && _context.Depth > 0)
                    {
                        PageToDownload copyOfThisPageToDownload = new PageToDownload(thisPageToDownload.LightWeightHTMLDocument.Clone(), thisPageToDownload.UrlToReplace, thisPageToDownload.FileName, thisPageToDownload);
                        downloadedPages.Add(copyOfThisPageToDownload);
                    }

                    // enumerate the frames of this page and add them to the list of pages
                    PageToDownload[] subFramesToDownload = GetFramePagesToDownload(thisPageToDownload);
                    downloadedPages.AddRange(subFramesToDownload);
                    foreach (PageToDownload pageToDownload in subFramesToDownload)
                    {
                        _context.AddPageToDownload(pageToDownload.AbsoluteUrl, pageToDownload, false);
                    }

                    // Now drill down based upon the depth configuration
                    if (_context.ShouldContinue(_currentDepth))
                    {
                        ProgressTick otherPagesdownloadProgress = new ProgressTick(progress, TOTALTICKS - thisPageTicks, TOTALTICKS);
                        downloadedPages.AddRange(GetSubPagesToDownload(otherPagesdownloadProgress, downloadedPages, thisPageToDownload));
                    }
                    downloadWorked = true;
                    firstPagedownloadProgress.UpdateProgress(1, 1);
                }
                catch (OperationTimedOutException)
                {
                    timedOut = true;
                }
                catch (WebPageDownloaderException htex)
                {
                    HandleException(new Exception(htex.Message, htex));
                }
                catch (Exception ex)
                {
                    HandleException(new Exception(String.Format(CultureInfo.CurrentCulture, "{0} could not be downloaded", _url), ex));
                }
            }

            // If we never got the download to succeed, add it to the list of timed out Urls.
            // Record the url that actually timed out (not the factory's root url), since
            // IsTimedOutUrl above is queried with the url being processed.
            if (!downloadWorked && timedOut)
            {
                _context.AddTimedOutUrl(url);
                firstPagedownloadProgress.UpdateProgress(1, 1);
            }
        }
        // If it isn't a page we'll just add the file to the reference list for the parent page
        // There is not an else, because we could be looking at a reference, but a reference that
        // should not be downloaded (in which case we just ignore it)
        else if (headerInfo != null && _context.ShouldDownloadThisUrl(headerInfo))
        {
            parentPageToDownload.AddReference(new ReferenceToDownload(url, parentPageToDownload));
            progress.UpdateProgress(1, 1);
        }

        progress.UpdateProgress(1, 1);

        return (PageToDownload[])downloadedPages.ToArray(typeof(PageToDownload));
    }
    finally
    {
        // Restore the depth even when an exception escapes, so later calls see the right depth
        _currentDepth--;
    }
}
/// <summary>
/// Creates a new PageToDownloadFactory based upon an existing LightWeightHTMLDocument
/// </summary>
/// <param name="lightWeightHTMLDocument">The document that represents this page; its Url becomes the factory's url</param>
/// <param name="downloadContext">Context passed on to the two-argument constructor</param>
/// <param name="parentControl">The control passed on to the two-argument constructor</param>
public PageToDownloadFactory(LightWeightHTMLDocument lightWeightHTMLDocument, PageDownloadContext downloadContext, Control parentControl)
    : this(downloadContext, parentControl)
{
    this._lightWeightHTMLDocument = lightWeightHTMLDocument;
    this._url = lightWeightHTMLDocument.Url;
}
/// <summary>
/// Creates a page to download by calling the four-argument constructor with a null parent.
/// </summary>
/// <param name="htmlDocument">The lightweight document for this page.</param>
/// <param name="url">The url of this page.</param>
/// <param name="rootFileName">The file name for this page.</param>
public PageToDownload(LightWeightHTMLDocument htmlDocument, string url, string rootFileName)
    : this(htmlDocument, url, rootFileName, null)
{
}
/// <summary>
/// Handles a begin tag: registers url substitutions for frame tags, records the tag
/// in the tag table, and tracks title and anchor tags before passing the tag on to
/// the base implementation.
/// </summary>
/// <param name="tag">The begin tag being processed; a null tag is only passed on to the base implementation.</param>
protected override void OnBeginTag(BeginTag tag)
{
    if (tag == null)
    {
        base.OnBeginTag(tag);
        return;
    }

    // Reset any frame urls.
    // The HTML in this document often has incorrect urls for frames, while the frames
    // enumeration is accurate. So when a frame tag's name matches a document from the
    // frames enumeration, substitute that document's url for the tag's src.
    if (tag.NameEquals(HTMLTokens.Frame))
    {
        Attr frameName = tag.GetAttribute(HTMLTokens.Name);
        if (frameName != null && this._frames != null)
        {
            LightWeightHTMLDocument frameDoc = GetFrameDocumentByName(frameName.Value);
            if (frameDoc != null)
            {
                Attr src = tag.GetAttribute(HTMLTokens.Src);
                if (src != null && src.Value != frameDoc.Url)
                {
                    Generator.AddSubstitionUrl(new UrlToReplace(src.Value, frameDoc.Url));
                }
            }
        }
    }

    LightWeightTag currentTag = new LightWeightTag(tag);

    // Append the tag to the array stored under its upper-cased name
    string key = tag.Name.ToUpper(CultureInfo.InvariantCulture);
    LightWeightTag[] existingTags = _tagTable.ContainsKey(key)
        ? (LightWeightTag[])_tagTable[key]
        : new LightWeightTag[0];
    LightWeightTag[] grownTags = new LightWeightTag[existingTags.Length + 1];
    existingTags.CopyTo(grownTags, 0);
    grownTags[existingTags.Length] = currentTag;
    _tagTable[key] = grownTags;

    if (tag.NameEquals(HTMLTokens.Title) && !tag.Complete)
    {
        // Accumulate the title text
        _nextTextIsTitleText = true;
    }
    else if (tag.NameEquals(HTMLTokens.A) && !tag.Complete && tag.GetAttribute(HTMLTokens.Href) != null)
    {
        if (_collectingForTag == null)
        {
            _collectingForTag = currentTag;
        }
        else
        {
            // Already collecting for an anchor: this is another anchor begin tag, so go one level deeper
            _collectingForTagDepth++;
        }
    }

    base.OnBeginTag(tag);
}
/// <summary>
/// Converts a live IHTMLDocument2 into a parsed LightWeightHTMLDocument that carries the
/// frames, style references, DocType and SavedFrom values of the source document.
/// </summary>
/// <param name="htmlDocument">The document to convert.</param>
/// <param name="url">The url of the document; also used for url conversion and the style reference lookup.</param>
/// <param name="name">The name given to the resulting document.</param>
/// <param name="escapePaths">When true, the HTML is passed through LightWeightHTMLUrlToAbsolute.ConvertToAbsolute.</param>
/// <param name="escapeEmptyString">Forwarded to ConvertToAbsolute when escapePaths is true.</param>
/// <returns>The parsed document, or null when no HTML string could be obtained from htmlDocument.</returns>
private static LightWeightHTMLDocument FromIHTMLDocument2(IHTMLDocument2 htmlDocument, string url, string name, bool escapePaths, bool escapeEmptyString)
{
    string html = HTMLDocumentHelper.HTMLDocToString(htmlDocument);
    if (html == null)
    {
        return null;
    }

    if (escapePaths)
    {
        html = LightWeightHTMLUrlToAbsolute.ConvertToAbsolute(html, url, true, escapeEmptyString);
    }

    LightWeightHTMLDocument document = new LightWeightHTMLDocument(html, url, name);

    // Frames, enumerated from the live document
    LightWeightHTMLDocument[] frameDocuments = GetLightWeightDocumentForFrames(htmlDocument);
    document.SetFrames(frameDocuments);

    // Style references
    document.SetStyleReferences(HTMLDocumentHelper.GetStyleReferencesForDocument(htmlDocument, url));

    // DocType and SavedFrom
    HTMLDocumentHelper.SpecialHeaders headers = HTMLDocumentHelper.GetSpecialHeaders(htmlDocument);
    document._docType = headers.DocType;
    document._savedFrom = headers.SavedFrom;

    document.Parse();
    return document;
}
/// <summary>
/// Sets the frame documents of this document.
/// </summary>
/// <remarks>Changed this from private to public</remarks>
/// <param name="frames">The frame documents to store.</param>
public void SetFrames(LightWeightHTMLDocument[] frames)
{
    this._frames = frames;
}
/// <summary>
/// Actually downloads the pages
/// </summary>
/// <param name="progress">Progress host that is checked for cancellation and updated as work completes.</param>
/// <param name="url">The url of the page (or reference) to process.</param>
/// <param name="lightWeightDocument">An already available document for the url, or null to download it.</param>
/// <param name="parentPageToDownload">The page that refers to this url; non-page resources are added to it as references.</param>
/// <returns>The pages created while processing this url (an empty array when there are none).</returns>
/// <exception cref="OperationCancelledException">progress reports that cancel was requested.</exception>
private PageToDownload[] DownloadPages(IProgressHost progress, string url, LightWeightHTMLDocument lightWeightDocument, PageToDownload parentPageToDownload)
{
    // Check for cancel
    if (progress.CancelRequested)
    {
        throw new OperationCancelledException();
    }

    _currentDepth++;
    try
    {
        ArrayList downloadedPages = new ArrayList();

        // Set up our progress
        int thisPageTicks = FIRSTPAGETICKS;
        if (_context.Depth == _currentDepth)
        {
            thisPageTicks = TOTALTICKS;
        }
        ProgressTick firstPagedownloadProgress = new ProgressTick(progress, thisPageTicks, TOTALTICKS);

        string safeUrl = UrlHelper.GetUrlWithoutAnchorIdentifier(url);

        // Look up the content type of this pageToDownload
        UrlContentTypeInfo headerInfo = null;
        if (_headerInfo.ContainsKey(safeUrl))
        {
            headerInfo = (UrlContentTypeInfo)_headerInfo[safeUrl];
        }
        else
        {
            if (lightWeightDocument != null)
            {
                // We already have the document, so it is known to be HTML
                headerInfo = new UrlContentTypeInfo("text/html", url);
            }
            else if (!_context.IsTimedOutUrl(url) && _context.ShouldDownloadThisUrl(url))
            {
                // No document is available on this branch, so the content type must be fetched
                progress.UpdateProgress(string.Format(CultureInfo.CurrentCulture, Res.Get(StringId.ProgressDeterminingType), url));
                headerInfo = ContentTypeHelper.ExpensivelyGetUrlContentType(url, _context.TimeoutMS);
            }
            _headerInfo.Add(safeUrl, headerInfo);
        }

        // If this is a web page and we should download it, do it!
        // (A supplied document is always processed; otherwise the context decides.)
        bool isPageToDownload = IsDownloadablePageResource(headerInfo)
            && (lightWeightDocument != null || _context.ShouldDownloadThisUrl(headerInfo));

        if (isPageToDownload)
        {
            bool downloadWorked = false;
            int downloadAttempts = -1;
            bool timedOut = true;

            // Make sure we are retrying the correct number of times
            ProgressTick pageDownloadProgress = new ProgressTick(firstPagedownloadProgress, 80, 100);
            while (!downloadWorked && downloadAttempts++ < _context.RetryCount && timedOut)
            {
                timedOut = false;

                pageDownloadProgress.UpdateProgress(0, 1);
                try
                {
                    // If we haven't downloaded this page yet download it
                    PageToDownload thisPageToDownload = null;

                    if (!_context.UrlAlreadyDownloaded(safeUrl))
                    {
                        if (lightWeightDocument == null)
                        {
                            thisPageToDownload = DownloadUrl(url, parentPageToDownload, pageDownloadProgress);
                        }
                        else
                        {
                            LightWeightHTMLDocument htmlDoc = lightWeightDocument;

                            // Only redownload if we absolutely need to
                            if (htmlDoc.HasFramesOrStyles && (htmlDoc.Frames == null || htmlDoc.StyleResourcesUrls == null))
                            {
                                string html = htmlDoc.GenerateHtml();
                                string tempFile = TempFileManager.Instance.CreateTempFile("temp.htm");
                                using (StreamWriter writer = new StreamWriter(tempFile, false, Encoding.UTF8))
                                {
                                    writer.Write(html);
                                }

                                using (HTMLDocumentDownloader downloader = new HTMLDocumentDownloader(_parentControl, UrlHelper.GetLocalFileUrl(tempFile), htmlDoc.Title, _context.CookieString, _context.TimeoutMS, false))
                                {
                                    downloader.DownloadHTMLDocument(pageDownloadProgress);
                                    htmlDoc.UpdateBasedUponHTMLDocumentData(downloader.HtmlDocument, url);
                                }
                            }

                            thisPageToDownload = new PageToDownload(htmlDoc, url, null, parentPageToDownload);
                            if (htmlDoc.StyleResourcesUrls != null)
                            {
                                foreach (HTMLDocumentHelper.ResourceUrlInfo styleUrl in htmlDoc.StyleResourcesUrls)
                                {
                                    thisPageToDownload.AddReference(new ReferenceToDownload(styleUrl.ResourceUrl, thisPageToDownload, styleUrl.ResourceAbsoluteUrl));
                                }
                            }
                        }

                        // Add this page to our lists
                        _context.AddPageToDownload(safeUrl, thisPageToDownload, true);
                        downloadedPages.Add(thisPageToDownload);
                    }
                    else
                    {
                        thisPageToDownload = (PageToDownload)_context.CreatedPageToDownloadTable[safeUrl];
                    }

                    // If we're downloading a site, add a second copy of the root page in the references subdir
                    // This way, if the root page gets renamed, links back to it will still work correctly
                    // This is a bit of a hack, but otherwise, we'll need to escape urls whenever we output
                    // the site and change the root file name
                    if (thisPageToDownload.IsRootPage && _context.Depth > 0)
                    {
                        PageToDownload copyOfThisPageToDownload = new PageToDownload(thisPageToDownload.LightWeightHTMLDocument.Clone(), thisPageToDownload.UrlToReplace, thisPageToDownload.FileName, thisPageToDownload);
                        downloadedPages.Add(copyOfThisPageToDownload);
                    }

                    // enumerate the frames of this page and add them to the list of pages
                    PageToDownload[] subFramesToDownload = GetFramePagesToDownload(thisPageToDownload);
                    downloadedPages.AddRange(subFramesToDownload);
                    foreach (PageToDownload pageToDownload in subFramesToDownload)
                    {
                        _context.AddPageToDownload(pageToDownload.AbsoluteUrl, pageToDownload, false);
                    }

                    // Now drill down based upon the depth configuration
                    if (_context.ShouldContinue(_currentDepth))
                    {
                        ProgressTick otherPagesdownloadProgress = new ProgressTick(progress, TOTALTICKS - thisPageTicks, TOTALTICKS);
                        downloadedPages.AddRange(GetSubPagesToDownload(otherPagesdownloadProgress, downloadedPages, thisPageToDownload));
                    }
                    downloadWorked = true;
                    firstPagedownloadProgress.UpdateProgress(1, 1);
                }
                catch (OperationTimedOutException)
                {
                    timedOut = true;
                }
                catch (WebPageDownloaderException htex)
                {
                    HandleException(new Exception(htex.Message, htex));
                }
                catch (Exception ex)
                {
                    HandleException(new Exception(String.Format(CultureInfo.CurrentCulture, "{0} could not be downloaded", _url), ex));
                }
            }

            // If we never got the download to succeed, add it to the list of timed out Urls.
            // Record the url that actually timed out (not the factory's root url), since
            // IsTimedOutUrl above is queried with the url being processed.
            if (!downloadWorked && timedOut)
            {
                _context.AddTimedOutUrl(url);
                firstPagedownloadProgress.UpdateProgress(1, 1);
            }
        }
        // If it isn't a page we'll just add the file to the reference list for the parent page
        // There is not an else, because we could be looking at a reference, but a reference that
        // should not be downloaded (in which case we just ignore it)
        else if (headerInfo != null && _context.ShouldDownloadThisUrl(headerInfo))
        {
            parentPageToDownload.AddReference(new ReferenceToDownload(url, parentPageToDownload));
            progress.UpdateProgress(1, 1);
        }

        progress.UpdateProgress(1, 1);

        return (PageToDownload[])downloadedPages.ToArray(typeof(PageToDownload));
    }
    finally
    {
        // Restore the depth even when an exception escapes, so later calls see the right depth
        _currentDepth--;
    }
}
/// <summary>
/// Wraps the given document. Private; instances are obtained through Create.
/// </summary>
/// <param name="document">The document to store.</param>
private LightWeightHTMLDocumentData(LightWeightHTMLDocument document)
{
    this._document = document;
}
/// <summary>
/// Creates a LightWeightHTMLDocument from a string of HTML.
/// </summary>
/// <param name="html">The HTML text.</param>
/// <param name="baseUrl">The url of the document.</param>
/// <returns>
/// The result of the three-argument overload called with true as its last argument
/// (presumably escapePaths; that overload is not visible here).
/// </returns>
public static LightWeightHTMLDocument FromString(string html, string baseUrl)
{
    LightWeightHTMLDocument document = LightWeightHTMLDocument.FromString(html, baseUrl, true);
    return document;
}
/// <summary>
/// Parses a string of HTML into a LightWeightHTMLDocument.
/// </summary>
/// <param name="html">The HTML text.</param>
/// <param name="baseUrl">The url of the document; also used for url conversion and the special header lookup.</param>
/// <param name="name">The name given to the resulting document.</param>
/// <param name="escapePaths">When true, the HTML is passed through LightWeightHTMLUrlToAbsolute.ConvertToAbsolute.</param>
/// <returns>The parsed document.</returns>
public static LightWeightHTMLDocument FromString(string html, string baseUrl, string name, bool escapePaths)
{
    string sourceHtml = escapePaths
        ? LightWeightHTMLUrlToAbsolute.ConvertToAbsolute(html, baseUrl)
        : html;

    LightWeightHTMLDocument result = new LightWeightHTMLDocument(sourceHtml, baseUrl, name);

    // DocType and SavedFrom are read from the (possibly converted) HTML text
    HTMLDocumentHelper.SpecialHeaders specialHeaders = HTMLDocumentHelper.GetSpecialHeaders(sourceHtml, baseUrl);
    result._docType = specialHeaders.DocType;
    result._savedFrom = specialHeaders.SavedFrom;

    result.Parse();
    return result;
}