/// <summary>
/// Builds a LightWeightHTMLDocument from a live IHTMLDocument2, attaching its frames,
/// style references, doc type and saved-from header before parsing it.
/// </summary>
/// <param name="htmlDocument">The source document to convert.</param>
/// <param name="url">Url passed to the new document and to the path/style helpers.</param>
/// <param name="name">Name given to the new document.</param>
/// <param name="escapePaths">When true, the HTML is passed through LightWeightHTMLUrlToAbsolute.ConvertToAbsolute first.</param>
/// <param name="escapeEmptyString">Forwarded to ConvertToAbsolute; only used when escapePaths is true.</param>
/// <returns>The parsed document, or null when the source document yields no HTML string.</returns>
private static LightWeightHTMLDocument FromIHTMLDocument2(IHTMLDocument2 htmlDocument, string url, string name, bool escapePaths, bool escapeEmptyString)
        {
            string rawHtml = HTMLDocumentHelper.HTMLDocToString(htmlDocument);
            if (rawHtml == null)
            {
                return null;
            }

            string documentHtml = escapePaths
                ? LightWeightHTMLUrlToAbsolute.ConvertToAbsolute(rawHtml, url, true, escapeEmptyString)
                : rawHtml;

            LightWeightHTMLDocument result = new LightWeightHTMLDocument(documentHtml, url, name);

            // Frames and style references are read from the live document, not from the string
            result.SetFrames(GetLightWeightDocumentForFrames(htmlDocument));
            result.SetStyleReferences(HTMLDocumentHelper.GetStyleReferencesForDocument(htmlDocument, url));

            // Carry over the doc type and saved-from headers
            HTMLDocumentHelper.SpecialHeaders headers = HTMLDocumentHelper.GetSpecialHeaders(htmlDocument);
            result._docType = headers.DocType;
            result._savedFrom = headers.SavedFrom;

            result.Parse();
            return result;
        }
 /// <summary>
 /// Creates a page to download for the given document and url, linked to the page that referenced it.
 /// </summary>
 /// <param name="htmlDocument">The document backing this page.</param>
 /// <param name="url">The url to replace; its anchor identifier is extracted and stored as well.</param>
 /// <param name="rootFileName">File name stored for this page.</param>
 /// <param name="parentInfo">The parent page.</param>
 public PageToDownload(LightWeightHTMLDocument htmlDocument, string url, string rootFileName, PageToDownload parentInfo)
 {
     this.ParentInfo = parentInfo;

     this._lightweightHTMLDocument = htmlDocument;
     this.m_fileName = rootFileName;

     // Keep the full url and remember its anchor separately
     this._urlToReplace = url;
     this._anchor = UrlHelper.GetAnchorIdentifier(url);
 }
Пример #3
0
 /// <summary>
 /// Creates a page to download for the given document and url, linked to the page that referenced it.
 /// </summary>
 /// <param name="htmlDocument">The document backing this page.</param>
 /// <param name="url">The url to replace; its anchor identifier is extracted and stored as well.</param>
 /// <param name="rootFileName">File name stored for this page.</param>
 /// <param name="parentInfo">The parent page.</param>
 public PageToDownload(LightWeightHTMLDocument htmlDocument, string url, string rootFileName, PageToDownload parentInfo)
 {
     this.ParentInfo = parentInfo;

     this._lightweightHTMLDocument = htmlDocument;
     this.m_fileName = rootFileName;

     // Keep the full url and remember its anchor separately
     this._urlToReplace = url;
     this._anchor = UrlHelper.GetAnchorIdentifier(url);
 }
        /// <summary>
        /// Converts an IHTMLDocument2 into a LightWeightHTMLDocument using the three-argument overload with true.
        /// </summary>
        /// <param name="htmlDocument">The source document; may be null.</param>
        /// <param name="url">Url associated with the document.</param>
        /// <returns>The converted document, or null when htmlDocument is null.</returns>
        public static LightWeightHTMLDocument FromIHTMLDocument2(IHTMLDocument2 htmlDocument, string url)
        {
            return htmlDocument != null
                ? LightWeightHTMLDocument.FromIHTMLDocument2(htmlDocument, url, true)
                : null;
        }
        /// <summary>
        /// Converts an IHTMLDocument2 into an unnamed LightWeightHTMLDocument (the name is passed as null).
        /// </summary>
        /// <param name="htmlDocument">The source document; may be null.</param>
        /// <param name="url">Url associated with the document.</param>
        /// <param name="escapePaths">Forwarded to the full overload.</param>
        /// <param name="escapeEmptyString">Forwarded to the full overload.</param>
        /// <returns>The converted document, or null when htmlDocument is null.</returns>
        public static LightWeightHTMLDocument FromIHTMLDocument2(IHTMLDocument2 htmlDocument, string url, bool escapePaths, bool escapeEmptyString)
        {
            return htmlDocument != null
                ? LightWeightHTMLDocument.FromIHTMLDocument2(htmlDocument, url, null, escapePaths, escapeEmptyString)
                : null;
        }
        /// <summary>
        /// Reads HTML from a stream and parses it into a LightWeightHTMLDocument.
        /// </summary>
        /// <remarks>
        /// A non-seekable stream is first copied to a temp file and parsed from there.
        /// A seekable stream is decoded with Encoding.Default; if the parsed metadata declares a
        /// charset that maps to a different encoding, the stream is rewound to its start and read
        /// again with that encoding. On the seekable path the stream is wrapped in a StreamReader
        /// inside a using block, so it is closed when this method returns.
        /// </remarks>
        /// <param name="stream">The stream containing the HTML.</param>
        /// <param name="url">Url associated with the document.</param>
        /// <param name="name">Name given to the document.</param>
        /// <returns>The parsed document.</returns>
        public static LightWeightHTMLDocument FromStream(Stream stream, string url, string name)
        {
            if (!stream.CanSeek)
            {
                // Can't rewind for a second decoding pass, so spool the content to a temp file
                string filePath = TempFileManager.Instance.CreateTempFile();
                using (FileStream file = new FileStream(filePath, FileMode.Open))
                {
                    StreamHelper.Transfer(stream, file);
                }

                return LightWeightHTMLDocument.FromFile(filePath, url, name);
            }

            Encoding currentEncoding = Encoding.Default;
            LightWeightHTMLDocument lwDoc;
            using (StreamReader reader = new StreamReader(stream, currentEncoding))
            {
                lwDoc = LightWeightHTMLDocument.FromString(reader.ReadToEnd(), url, name, true);

                // If there is no metadata that disagrees with our encoding, just return the DOM read with default decoding
                LightWeightHTMLMetaData metaData = new LightWeightHTMLMetaData(lwDoc);
                if (metaData.Charset != null)
                {
                    try
                    {
                        Encoding encoding = Encoding.GetEncoding(metaData.Charset);

                        // Compare by value: GetEncoding may hand back a different instance that is
                        // equivalent to the default encoding, in which case a re-read is pointless.
                        if (!encoding.Equals(currentEncoding))
                        {
                            // The declared charset differs from the one used so far; reread with it
                            reader.DiscardBufferedData();
                            stream.Seek(0, SeekOrigin.Begin);

                            using (StreamReader reader2 = new StreamReader(stream, encoding))
                            {
                                lwDoc = LightWeightHTMLDocument.FromString(reader2.ReadToEnd(), url, name, true);
                            }
                        }
                    }
                    catch (NotSupportedException)
                    {
                        // The encoding isn't supported on this system; keep the default decoding
                    }
                    catch (ArgumentException)
                    {
                        // The charset name isn't one the OS knows about (probably malformed or
                        // misspelled); keep the default decoding
                    }
                }
            }

            return lwDoc;
        }
        /// <summary>
        /// Enumerates the embedded objects of an HTML document and returns a LightWeightHTMLDocument
        /// for each one that is a browser frame.
        /// </summary>
        /// <param name="htmlDocument">The document whose frames are enumerated; it is cast to IOleContainer.</param>
        /// <returns>One document per frame found; empty when there are none.</returns>
        public static LightWeightHTMLDocument[] GetLightWeightDocumentForFrames(IHTMLDocument2 htmlDocument)
        {
            ArrayList frameLightWeightDocuments = new ArrayList();

            // Get the IOleContainer for the html document (this requires that
            // the document is the root document in the browser)
            IOleContainer oleContainer = (IOleContainer)htmlDocument;
            IEnumUnknown enumUnknown;

            // Enumerate the controls in the browser
            oleContainer.EnumObjects(OLECONTF.EMBEDDINGS, out enumUnknown);

            // Iterate through the controls, one at a time, until the enumerator stops returning S_OK
            object unknown;
            while (HRESULT.S_OK == enumUnknown.Next(1, out unknown, IntPtr.Zero))
            {
                // Only subframes should cast to IWebBrowser2
                IWebBrowser2 webBrowser = unknown as IWebBrowser2;
                if (webBrowser == null)
                {
                    continue;
                }

                // A subframe normally also exposes the base frame interface, which carries its name
                IHTMLFrameBase frameBase = unknown as IHTMLFrameBase;

                try
                {
                    IHTMLDocument2 frameDocument = webBrowser.Document as IHTMLDocument2;
                    if (frameDocument != null)
                    {
                        // Fall back to a null name rather than crashing when the object
                        // does not implement IHTMLFrameBase
                        string frameName = (frameBase != null) ? frameBase.name : null;

                        LightWeightHTMLDocument document = LightWeightHTMLDocument.FromIHTMLDocument2(frameDocument, frameDocument.url, frameName);
                        if (document != null)
                        {
                            frameLightWeightDocuments.Add(document);
                        }
                    }
                }
                catch (InvalidCastException)
                {
                    // The frame's document could not be accessed; record an empty placeholder
                    // document keyed by the frame's location url
                    string html = "<HTML></HTML>";
                    LightWeightHTMLDocument document = LightWeightHTMLDocument.FromString(html, webBrowser.LocationURL, webBrowser.LocationURL, true);
                    if (document != null)
                    {
                        frameLightWeightDocuments.Add(document);
                    }
                }
            }

            return (LightWeightHTMLDocument[])frameLightWeightDocuments.ToArray(typeof(LightWeightHTMLDocument));
        }
        /// <summary>
        /// Creates a LightWeightHTMLDocumentData from a data object when it carries the
        /// lightweight HTML document format.
        /// </summary>
        /// <param name="iDataObject">The data object to inspect.</param>
        /// <returns>The wrapped document data, or null when the format is not present.</returns>
        public static LightWeightHTMLDocumentData Create(IDataObject iDataObject)
        {
            if (!OleDataObjectHelper.GetDataPresentSafe(iDataObject, LightWeightHTMLDataObject.LIGHTWEIGHTHTMLDOCUMENTFORMAT))
            {
                return null;
            }

            LightWeightHTMLDocument document = (LightWeightHTMLDocument)iDataObject.GetData(LightWeightHTMLDataObject.LIGHTWEIGHTHTMLDOCUMENTFORMAT);
            return new LightWeightHTMLDocumentData(document);
        }
 /// <summary>
 /// Constructs metadata for a LightWeightHTMLDocument, capturing its doc type, its
 /// saved-from header and the href of the first base tag that has one.
 /// </summary>
 /// <param name="HTMLDocument">The LightWeightHTMLDocument for which to fetch metadata.</param>
 public LightWeightHTMLMetaData(LightWeightHTMLDocument HTMLDocument)
 {
     m_HTMLDocument = HTMLDocument;
     _docType = HTMLDocument.DocType;
     _savedFrom = HTMLDocument.SavedFrom;

     // Use the first base tag that actually carries an href
     LightWeightTag[] baseTags = HTMLDocument.GetTagsByName(HTMLTokens.Base);
     for (int index = 0; index < baseTags.Length; index++)
     {
         Attr hrefAttribute = baseTags[index].BeginTag.GetAttribute(HTMLTokens.Href);
         if (hrefAttribute == null)
         {
             continue;
         }

         _base = hrefAttribute.Value;
         break;
     }
 }
 /// <summary>
 /// Constructs metadata for a LightWeightHTMLDocument, capturing its doc type, its
 /// saved-from header and the href of the first base tag that has one.
 /// </summary>
 /// <param name="HTMLDocument">The LightWeightHTMLDocument for which to fetch metadata.</param>
 public LightWeightHTMLMetaData(LightWeightHTMLDocument HTMLDocument)
 {
     m_HTMLDocument = HTMLDocument;
     _docType = HTMLDocument.DocType;
     _savedFrom = HTMLDocument.SavedFrom;

     // Use the first base tag that actually carries an href
     LightWeightTag[] baseTags = HTMLDocument.GetTagsByName(HTMLTokens.Base);
     for (int index = 0; index < baseTags.Length; index++)
     {
         Attr hrefAttribute = baseTags[index].BeginTag.GetAttribute(HTMLTokens.Href);
         if (hrefAttribute == null)
         {
             continue;
         }

         _base = hrefAttribute.Value;
         break;
     }
 }
        /// <summary>
        /// Parses an HTML string into a LightWeightHTMLDocument.
        /// </summary>
        /// <param name="html">The HTML text.</param>
        /// <param name="baseUrl">Url passed to the document and to the path/header helpers.</param>
        /// <param name="name">Name given to the document.</param>
        /// <param name="escapePaths">When true, the HTML is passed through LightWeightHTMLUrlToAbsolute.ConvertToAbsolute first.</param>
        /// <returns>The parsed document with its doc type and saved-from header set.</returns>
        public static LightWeightHTMLDocument FromString(string html, string baseUrl, string name, bool escapePaths)
        {
            string documentHtml = escapePaths
                ? LightWeightHTMLUrlToAbsolute.ConvertToAbsolute(html, baseUrl)
                : html;

            LightWeightHTMLDocument document = new LightWeightHTMLDocument(documentHtml, baseUrl, name);

            // The special headers are read from the (possibly converted) HTML text
            HTMLDocumentHelper.SpecialHeaders headers = HTMLDocumentHelper.GetSpecialHeaders(documentHtml, baseUrl);
            document._docType = headers.DocType;
            document._savedFrom = headers.SavedFrom;

            document.Parse();
            return document;
        }
Пример #12
0
        /// <summary>
        /// Requests TargetUrl, lets the header and content callbacks veto the capture, then
        /// downloads the page and its references into DestinationPath.
        /// </summary>
        /// <param name="timeoutMs">Timeout, in milliseconds, used for the request and for the reference download.</param>
        /// <returns>The path of the captured root page (DestinationPath combined with the storage root file).</returns>
        /// <exception cref="OperationCancelledException">
        /// Thrown when either the headers callback or the content callback clears the continue flag.
        /// </exception>
        public string Capture(int timeoutMs)
        {
            // flag indicating whether we should continue with the capture
            bool continueCapture = true;

            // buffer that receives the full page content
            MemoryStream pageStream = new MemoryStream();

            // request the page; the using block releases the response on every path, including
            // cancellation from the headers callback and failures during the transfer
            using (HttpWebResponse response = RequestPage(TargetUrl, timeoutMs))
            {
                OnHeadersReceived(response.Headers, ref continueCapture);
                if (!continueCapture)
                {
                    throw new OperationCancelledException();
                }

                // transfer it to a stream
                using (Stream responseStream = response.GetResponseStream())
                {
                    StreamHelper.Transfer(responseStream, pageStream);
                }
            }
            pageStream.Seek(0, SeekOrigin.Begin);

            // allow filter on content
            // (the reader is deliberately not disposed: disposing it would close pageStream,
            // which is rewound and read again below)
            OnContentReceived(new StreamReader(pageStream).ReadToEnd(), ref continueCapture);
            if (!continueCapture)
            {
                throw new OperationCancelledException();
            }
            pageStream.Seek(0, SeekOrigin.Begin);

            // Read the stream into a lightweight HTML doc. We use LightWeightHTMLDocument.FromIHTMLDocument2
            // instead of LightWeightHTMLDocument.FromStream because FromStream improperly shoves a savedFrom
            // declaration above the docType (bug 289357)
            IHTMLDocument2 doc = HTMLDocumentHelper.StreamToHTMLDoc(pageStream, TargetUrl, false);
            LightWeightHTMLDocument ldoc = LightWeightHTMLDocument.FromIHTMLDocument2(doc, TargetUrl, true);

            // download references
            FileBasedSiteStorage siteStorage = new FileBasedSiteStorage(DestinationPath, "index.htm");
            PageToDownload page = new PageToDownload(ldoc, TargetUrl, siteStorage.RootFile);
            PageAndReferenceDownloader downloader = new PageAndReferenceDownloader(new PageToDownload[] { page }, siteStorage);

            downloader.Download(new TimeoutProgressHost(timeoutMs));

            // return path to captured page
            return Path.Combine(DestinationPath, siteStorage.RootFile);
        }
Пример #13
0
        /// <summary>
        /// Downloads the HTML document at a url and wraps it in a PageToDownload, adding one
        /// reference per style resource of the downloaded document.
        /// </summary>
        /// <param name="url">The url to download; also used as the page's url to replace.</param>
        /// <param name="parent">The page that referenced this url.</param>
        /// <param name="progress">Progress host passed to the downloader.</param>
        /// <returns>The page created for the downloaded document.</returns>
        private PageToDownload DownloadUrl(string url, PageToDownload parent, IProgressHost progress)
        {
            PageToDownload page;
            LightWeightHTMLDocument document;

            using (HTMLDocumentDownloader htmlDownloader = new HTMLDocumentDownloader(_parentControl, url, null, _context.CookieString, _context.TimeoutMS, true))
            {
                htmlDownloader.DownloadHTMLDocument(progress);

                document = LightWeightHTMLDocument.FromIHTMLDocument2(htmlDownloader.HtmlDocument, htmlDownloader.Url);
                page = new PageToDownload(document, url, null, parent);

                // The downloader's url may differ from the requested one after a redirect
                page.AbsoluteUrl = htmlDownloader.Url;
            }

            // Register every style resource of the document as a reference of the page
            foreach (HTMLDocumentHelper.ResourceUrlInfo styleInfo in document.StyleResourcesUrls)
            {
                ReferenceToDownload styleReference = new ReferenceToDownload(styleInfo.ResourceUrl, page, styleInfo.ResourceAbsoluteUrl);
                page.AddReference(styleReference);
            }

            return page;
        }
Пример #14
0
        /// <summary>
        /// Fetches the HTMLMetaData for m_url by requesting it and parsing the response stream.
        /// </summary>
        /// <param name="cacheSettings">Cache settings passed to the web request.</param>
        /// <returns>
        /// The metadata of the parsed document, or null when the document could not be read
        /// (the failure is reported through Debug.Fail and otherwise swallowed).
        /// </returns>
        private LightWeightHTMLMetaData GetMetaData(WebRequestWithCache.CacheSettings cacheSettings)
        {
            WebRequestWithCache webRequest = new WebRequestWithCache(m_url);
            LightWeightHTMLDocument document = null;

            // Dispose the response stream here so it is released even when parsing fails or
            // the parser does not close it itself; disposing an already closed stream is harmless.
            using (Stream stream = webRequest.GetResponseStream(cacheSettings))
            {
                try
                {
                    document = LightWeightHTMLDocument.FromStream(stream, m_url);
                }
                catch (Exception e)
                {
                    // Best effort: metadata is optional, so report in debug builds and carry on
                    Debug.Fail("Couldn't get metadata from stream: " + e.Message);
                }
            }

            if (document == null)
            {
                return null;
            }

            return new LightWeightHTMLMetaData(document);
        }
 /// <summary>
 /// Converts a named IHTMLDocument2 by calling the full overload with escapeEmptyString set to true.
 /// </summary>
 private static LightWeightHTMLDocument FromIHTMLDocument2(IHTMLDocument2 htmlDocument, string url, string name, bool escapePaths)
 {
     const bool escapeEmptyString = true;
     return LightWeightHTMLDocument.FromIHTMLDocument2(htmlDocument, url, name, escapePaths, escapeEmptyString);
 }
 /// <summary>
 /// Wraps a LightWeightHTMLDocument in a DataObject registered under LIGHTWEIGHTHTMLDOCUMENTFORMAT.
 /// </summary>
 /// <param name="lightWeightDocument">The document to expose through the data object.</param>
 public LightWeightHTMLDataObject(LightWeightHTMLDocument lightWeightDocument)
 {
     DataObject documentData = new DataObject(LIGHTWEIGHTHTMLDOCUMENTFORMAT, lightWeightDocument);
     this.IDataObject = documentData;
 }
 /// <summary>
 /// Converts a named IHTMLDocument2 by calling the four-argument overload with escapePaths set to true.
 /// </summary>
 private static LightWeightHTMLDocument FromIHTMLDocument2(IHTMLDocument2 htmlDocument, string url, string name)
 {
     const bool escapePaths = true;
     return LightWeightHTMLDocument.FromIHTMLDocument2(htmlDocument, url, name, escapePaths);
 }
        /// <summary>
        /// Actually downloads the pages: processes one url (downloading it, or adopting the supplied
        /// document), then gathers its frame pages and, depth permitting, its sub pages.
        /// </summary>
        /// <param name="progress">Progress host; checked for cancellation on entry and updated as work completes.</param>
        /// <param name="url">Url of the page (or reference) to process.</param>
        /// <param name="lightWeightDocument">An already available document for the url, or null to download it.</param>
        /// <param name="parentPageToDownload">The page that referred to this url; non-page urls are added to it as references.</param>
        /// <returns>The pages gathered while processing this url (possibly an empty array).</returns>
        /// <exception cref="OperationCancelledException">Thrown when cancellation was requested on entry.</exception>
        private PageToDownload[] DownloadPages(IProgressHost progress, string url, LightWeightHTMLDocument lightWeightDocument, PageToDownload parentPageToDownload)
        {
            // Check for cancel
            if (progress.CancelRequested)
                throw new OperationCancelledException();

            // Track how deep we are; decremented again just before returning.
            // NOTE(review): the decrement is not in a finally block, so an exception escaping this
            // method (e.g. from the content type lookup below) would leave the depth incremented - confirm.
            _currentDepth++;
            ArrayList downloadedPages = new ArrayList();

            // Set up our progress: this page gets all the ticks when we are at the configured depth
            int thisPageTicks = FIRSTPAGETICKS;
            if (_context.Depth == _currentDepth)
                thisPageTicks = TOTALTICKS;
            ProgressTick firstPagedownloadProgress = new ProgressTick(progress, thisPageTicks, TOTALTICKS);

            // The url without its anchor is the key used for the header cache and the downloaded-page lookups
            string safeUrl = UrlHelper.GetUrlWithoutAnchorIdentifier(url);

            // Look up the content type of this pageToDownload
            UrlContentTypeInfo headerInfo = null;
            if (_headerInfo.ContainsKey(safeUrl))
            {
                headerInfo = (UrlContentTypeInfo)_headerInfo[safeUrl];
            }
            else
            {
                // A supplied document is treated as text/html without any lookup
                if (lightWeightDocument != null)
                    headerInfo = new UrlContentTypeInfo("text/html", url);
                else if (headerInfo == null && !_context.IsTimedOutUrl(url) && _context.ShouldDownloadThisUrl(url))
                {
                    progress.UpdateProgress(string.Format(CultureInfo.CurrentCulture, Res.Get(StringId.ProgressDeterminingType), url));
                    // NOTE(review): lightWeightDocument is always null on this branch (the 'if' above
                    // handled the non-null case), so the else arm below looks unreachable - confirm.
                    if (lightWeightDocument == null)
                        headerInfo = ContentTypeHelper.ExpensivelyGetUrlContentType(url, _context.TimeoutMS);
                    else
                        headerInfo = ContentTypeHelper.InexpensivelyGetUrlContentType(url);
                }
                // Cache the result; headerInfo is still null here when neither branch above ran
                _headerInfo.Add(safeUrl, headerInfo);
            }

            // If this is a web page and we should download it, do it!
            if ((lightWeightDocument != null && IsDownloadablePageResource(headerInfo)) ||
                (lightWeightDocument == null && IsDownloadablePageResource(headerInfo) && _context.ShouldDownloadThisUrl(headerInfo))
                )
            {
                bool downloadWorked = false;
                int downloadAttempts = -1;
                bool timedOut = true;

                // Make sure we are retrying the correct number of times: the counter starts at -1 and is
                // post-incremented in the loop condition, and the loop only repeats after a timeout
                ProgressTick pageDownloadProgress = new ProgressTick(firstPagedownloadProgress, 80, 100);
                while (!downloadWorked && downloadAttempts++ < _context.RetryCount && timedOut)
                {
                    timedOut = false;

                    pageDownloadProgress.UpdateProgress(0, 1);
                    try
                    {
                        // If we haven't downloaded this page yet download it
                        PageToDownload thisPageToDownload = null;

                        if (!_context.UrlAlreadyDownloaded(safeUrl))
                        {
                            if (lightWeightDocument == null)
                                thisPageToDownload = DownloadUrl(url, parentPageToDownload, pageDownloadProgress);
                            else
                            {
                                LightWeightHTMLDocument htmlDoc = lightWeightDocument;

                                // Only redownload if we absolutely need to: the document reports frames or
                                // styles but its frame list or style url list has not been populated
                                if (htmlDoc.HasFramesOrStyles && (htmlDoc.Frames == null || htmlDoc.StyleResourcesUrls == null))
                                {

                                    // Write the generated HTML to a temp file and load it through the downloader,
                                    // then update the document from the loaded result
                                    string html = htmlDoc.GenerateHtml();
                                    string tempFile = TempFileManager.Instance.CreateTempFile("temp.htm");
                                    using (StreamWriter writer = new StreamWriter(tempFile, false, Encoding.UTF8))
                                        writer.Write(html);
                                    using (HTMLDocumentDownloader downloader = new HTMLDocumentDownloader(_parentControl, UrlHelper.GetLocalFileUrl(tempFile), htmlDoc.Title, _context.CookieString, _context.TimeoutMS, false))
                                    {
                                        downloader.DownloadHTMLDocument(pageDownloadProgress);

                                        htmlDoc.UpdateBasedUponHTMLDocumentData(downloader.HtmlDocument, url);
                                    }
                                }
                                thisPageToDownload = new PageToDownload(htmlDoc, url, null, parentPageToDownload);
                                // Register each style resource as a reference of the page
                                if (htmlDoc.StyleResourcesUrls != null)
                                    foreach (HTMLDocumentHelper.ResourceUrlInfo styleUrl in htmlDoc.StyleResourcesUrls)
                                        thisPageToDownload.AddReference(new ReferenceToDownload(styleUrl.ResourceUrl, thisPageToDownload, styleUrl.ResourceAbsoluteUrl));
                            }
                            // Add this page to our lists
                            _context.AddPageToDownload(safeUrl, thisPageToDownload, true);
                            downloadedPages.Add(thisPageToDownload);

                        }
                        else
                            // Already downloaded: reuse the page recorded for this url
                            thisPageToDownload = (PageToDownload)_context.CreatedPageToDownloadTable[safeUrl];

                        // If we're downloading a site, add a second copy of the root page in the references subdir
                        // This way, if the root page gets renamed, links back to it will still work correctly
                        // This is a bit of a hack, but otherwise, we'll need to escape urls whenever we output
                        // the site and change the root file name
                        if (thisPageToDownload.IsRootPage && _context.Depth > 0)
                        {
                            PageToDownload copyOfThisPageToDownload = new PageToDownload(thisPageToDownload.LightWeightHTMLDocument.Clone(), thisPageToDownload.UrlToReplace, thisPageToDownload.FileName, thisPageToDownload);
                            downloadedPages.Add(copyOfThisPageToDownload);
                        }

                        // enumerate the frames of this page and add them to the list of pages
                        PageToDownload[] subFramesToDownload = GetFramePagesToDownload(thisPageToDownload);
                        downloadedPages.AddRange(subFramesToDownload);
                        foreach (PageToDownload pageToDownload in subFramesToDownload)
                            _context.AddPageToDownload(pageToDownload.AbsoluteUrl, pageToDownload, false);

                        // Now drill down based upon the depth configuration
                        if (_context.ShouldContinue(_currentDepth))
                        {
                            // Sub pages share whatever ticks this page did not claim
                            ProgressTick otherPagesdownloadProgress = new ProgressTick(progress, TOTALTICKS - thisPageTicks, TOTALTICKS);
                            downloadedPages.AddRange(GetSubPagesToDownload(otherPagesdownloadProgress, downloadedPages, thisPageToDownload));
                        }
                        downloadWorked = true;
                        firstPagedownloadProgress.UpdateProgress(1, 1);

                    }
                    catch (OperationTimedOutException)
                    {
                        // A timeout is the only failure that makes the loop try again
                        timedOut = true;
                    }
                    catch (WebPageDownloaderException htex)
                    {
                        HandleException(new Exception(htex.Message, htex));
                    }
                    catch (Exception ex)
                    {
                        // NOTE(review): the message is formatted with the _url field rather than the
                        // url parameter being processed - confirm this is intended.
                        HandleException(new Exception(String.Format(CultureInfo.CurrentCulture, "{0} could not be downloaded", _url), ex));
                    }
                }

                // If we never got the download to succeed, add it to the list of timed out Urls
                if (!downloadWorked && timedOut)
                {
                    // NOTE(review): this records the _url field, while the timed-out check above
                    // (IsTimedOutUrl) is made against the url parameter - confirm which is intended.
                    _context.AddTimedOutUrl(_url);
                    firstPagedownloadProgress.UpdateProgress(1, 1);

                }
            }
            // If it isn't a page we'll just add the file to the reference list for the parent page
            // There is not an else, because we could be looking at a reference, but a reference that
            // should not be downloaded (in which case we just ignore it)
            else if (headerInfo != null && _context.ShouldDownloadThisUrl(headerInfo))
            {
                // NOTE(review): parentPageToDownload is dereferenced without a null check -
                // confirm callers never reach this branch with a null parent.
                parentPageToDownload.AddReference(new ReferenceToDownload(url, parentPageToDownload));
                progress.UpdateProgress(1, 1);
            }

            // Mark this call's progress as complete on every path
            progress.UpdateProgress(1, 1);

            _currentDepth--;
            return (PageToDownload[])downloadedPages.ToArray(typeof(PageToDownload));
        }
 /// <summary>
 /// Creates a new PageToDownloadFactory for an existing document; the factory url is taken
 /// from the document's Url.
 /// </summary>
 /// <param name="lightWeightHTMLDocument">The document that represents the page.</param>
 /// <param name="downloadContext">Context passed on to the two-argument constructor.</param>
 /// <param name="parentControl">Control passed on to the two-argument constructor.</param>
 public PageToDownloadFactory(LightWeightHTMLDocument lightWeightHTMLDocument, PageDownloadContext downloadContext, Control parentControl)
     : this(downloadContext, parentControl)
 {
     this._lightWeightHTMLDocument = lightWeightHTMLDocument;
     this._url = lightWeightHTMLDocument.Url;
 }
Пример #20
0
 /// <summary>
 /// Creates a page to download that has no parent, by delegating to the four-argument
 /// constructor with a null parent.
 /// </summary>
 public PageToDownload(LightWeightHTMLDocument htmlDocument, string url, string rootFileName)
     : this(htmlDocument, url, rootFileName, null)
 {
     // Nothing else to initialize
 }
        /// <summary>
        /// Handles a begin tag: queues a url substitution for frames whose src differs from the
        /// matching frame document, records the tag in the tag table under its upper-cased name,
        /// and updates the title / anchor collection state. The base handler is always called.
        /// </summary>
        /// <param name="tag">The begin tag being processed; may be null.</param>
        protected override void OnBeginTag(BeginTag tag)
        {
            if (tag == null)
            {
                base.OnBeginTag(tag);
                return;
            }

            // Reset any frame urls.
            // The HTML in this document may carry incorrect urls for frames, whereas the frames
            // enumeration is accurate; when a frame document with the same name exists and its
            // url differs from the src attribute, substitute the frame document's url.
            if (tag.NameEquals(HTMLTokens.Frame))
            {
                Attr nameAttribute = tag.GetAttribute(HTMLTokens.Name);

                LightWeightHTMLDocument frameDocument = null;
                if (nameAttribute != null && this._frames != null)
                {
                    frameDocument = GetFrameDocumentByName(nameAttribute.Value);
                }

                if (frameDocument != null)
                {
                    Attr srcAttribute = tag.GetAttribute(HTMLTokens.Src);
                    if (srcAttribute != null && srcAttribute.Value != frameDocument.Url)
                    {
                        Generator.AddSubstitionUrl(new UrlToReplace(srcAttribute.Value, frameDocument.Url));
                    }
                }
            }

            // Append the tag to the array stored under its upper-cased name
            LightWeightTag newTag = new LightWeightTag(tag);
            string tableKey = tag.Name.ToUpper(CultureInfo.InvariantCulture);

            LightWeightTag[] existingTags = _tagTable.ContainsKey(tableKey)
                ? (LightWeightTag[])_tagTable[tableKey]
                : new LightWeightTag[0];

            LightWeightTag[] extendedTags = new LightWeightTag[existingTags.Length + 1];
            existingTags.CopyTo(extendedTags, 0);
            extendedTags[existingTags.Length] = newTag;
            _tagTable[tableKey] = extendedTags;

            if (tag.NameEquals(HTMLTokens.Title) && !tag.Complete)
            {
                // The text that follows an open title tag is the title
                _nextTextIsTitleText = true;
            }
            else if (tag.NameEquals(HTMLTokens.A) && !tag.Complete && tag.GetAttribute(HTMLTokens.Href) != null)
            {
                if (_collectingForTag == null)
                {
                    // Start collecting for this anchor
                    _collectingForTag = newTag;
                }
                else
                {
                    // Another open anchor while already collecting: track the nesting depth
                    _collectingForTagDepth++;
                }
            }

            base.OnBeginTag(tag);
        }
        /// <summary>
        /// Builds a LightWeightHTMLDocument from a live IHTMLDocument2, attaching its frames,
        /// style references, doc type and saved-from header before parsing it.
        /// </summary>
        /// <param name="htmlDocument">The source document to convert.</param>
        /// <param name="url">Url passed to the new document and to the path/style helpers.</param>
        /// <param name="name">Name given to the new document.</param>
        /// <param name="escapePaths">When true, the HTML is passed through LightWeightHTMLUrlToAbsolute.ConvertToAbsolute first.</param>
        /// <param name="escapeEmptyString">Forwarded to ConvertToAbsolute; only used when escapePaths is true.</param>
        /// <returns>The parsed document, or null when the source document yields no HTML string.</returns>
        private static LightWeightHTMLDocument FromIHTMLDocument2(IHTMLDocument2 htmlDocument, string url, string name, bool escapePaths, bool escapeEmptyString)
        {
            string rawHtml = HTMLDocumentHelper.HTMLDocToString(htmlDocument);
            if (rawHtml == null)
            {
                return null;
            }

            string documentHtml = escapePaths
                ? LightWeightHTMLUrlToAbsolute.ConvertToAbsolute(rawHtml, url, true, escapeEmptyString)
                : rawHtml;

            LightWeightHTMLDocument result = new LightWeightHTMLDocument(documentHtml, url, name);

            // Frames and style references are read from the live document, not from the string
            result.SetFrames(GetLightWeightDocumentForFrames(htmlDocument));
            result.SetStyleReferences(HTMLDocumentHelper.GetStyleReferencesForDocument(htmlDocument, url));

            // Carry over the doc type and saved-from headers
            HTMLDocumentHelper.SpecialHeaders headers = HTMLDocumentHelper.GetSpecialHeaders(htmlDocument);
            result._docType = headers.DocType;
            result._savedFrom = headers.SavedFrom;

            result.Parse();
            return result;
        }
 /// <summary>
 /// Replaces the frame documents associated with this document.
 /// (Visibility was changed from private to public.)
 /// </summary>
 /// <param name="frames">The frame documents to store.</param>
 public void SetFrames(LightWeightHTMLDocument[] frames)
 {
     this._frames = frames;
 }
 /// <summary>
 /// Creates a page to download that has no parent, by delegating to the four-argument
 /// constructor with a null parent.
 /// </summary>
 public PageToDownload(LightWeightHTMLDocument htmlDocument, string url, string rootFileName)
     : this(htmlDocument, url, rootFileName, null)
 {
     // Nothing else to initialize
 }
Пример #25
0
        /// <summary>
        /// Actually downloads the pages
        /// </summary>
        private PageToDownload[] DownloadPages(IProgressHost progress, string url, LightWeightHTMLDocument lightWeightDocument, PageToDownload parentPageToDownload)
        {
            // Check for cancel
            if (progress.CancelRequested)
            {
                throw new OperationCancelledException();
            }

            _currentDepth++;
            ArrayList downloadedPages = new ArrayList();

            // Set up our progress
            int thisPageTicks = FIRSTPAGETICKS;

            if (_context.Depth == _currentDepth)
            {
                thisPageTicks = TOTALTICKS;
            }
            ProgressTick firstPagedownloadProgress = new ProgressTick(progress, thisPageTicks, TOTALTICKS);

            string safeUrl = UrlHelper.GetUrlWithoutAnchorIdentifier(url);

            // Look up the content type of this pageToDownload
            UrlContentTypeInfo headerInfo = null;

            if (_headerInfo.ContainsKey(safeUrl))
            {
                headerInfo = (UrlContentTypeInfo)_headerInfo[safeUrl];
            }
            else
            {
                if (lightWeightDocument != null)
                {
                    headerInfo = new UrlContentTypeInfo("text/html", url);
                }
                else if (headerInfo == null && !_context.IsTimedOutUrl(url) && _context.ShouldDownloadThisUrl(url))
                {
                    progress.UpdateProgress(string.Format(CultureInfo.CurrentCulture, Res.Get(StringId.ProgressDeterminingType), url));
                    if (lightWeightDocument == null)
                    {
                        headerInfo = ContentTypeHelper.ExpensivelyGetUrlContentType(url, _context.TimeoutMS);
                    }
                    else
                    {
                        headerInfo = ContentTypeHelper.InexpensivelyGetUrlContentType(url);
                    }
                }
                _headerInfo.Add(safeUrl, headerInfo);
            }

            // If this is a web page and we should download it, do it!
            if ((lightWeightDocument != null && IsDownloadablePageResource(headerInfo)) ||
                (lightWeightDocument == null && IsDownloadablePageResource(headerInfo) && _context.ShouldDownloadThisUrl(headerInfo))
                )
            {
                bool downloadWorked   = false;
                int  downloadAttempts = -1;
                bool timedOut         = true;

                // Max sure we are retrying the correct number of times
                ProgressTick pageDownloadProgress = new ProgressTick(firstPagedownloadProgress, 80, 100);
                while (!downloadWorked && downloadAttempts++ < _context.RetryCount && timedOut)
                {
                    timedOut = false;

                    pageDownloadProgress.UpdateProgress(0, 1);
                    try
                    {
                        // If we haven't downloaded this page yet download it
                        PageToDownload thisPageToDownload = null;

                        if (!_context.UrlAlreadyDownloaded(safeUrl))
                        {
                            if (lightWeightDocument == null)
                            {
                                thisPageToDownload = DownloadUrl(url, parentPageToDownload, pageDownloadProgress);
                            }
                            else
                            {
                                LightWeightHTMLDocument htmlDoc = lightWeightDocument;

                                // Only redownload if we absolutely need to
                                if (htmlDoc.HasFramesOrStyles && (htmlDoc.Frames == null || htmlDoc.StyleResourcesUrls == null))
                                {
                                    string html     = htmlDoc.GenerateHtml();
                                    string tempFile = TempFileManager.Instance.CreateTempFile("temp.htm");
                                    using (StreamWriter writer = new StreamWriter(tempFile, false, Encoding.UTF8))
                                        writer.Write(html);
                                    using (HTMLDocumentDownloader downloader = new HTMLDocumentDownloader(_parentControl, UrlHelper.GetLocalFileUrl(tempFile), htmlDoc.Title, _context.CookieString, _context.TimeoutMS, false))
                                    {
                                        downloader.DownloadHTMLDocument(pageDownloadProgress);

                                        htmlDoc.UpdateBasedUponHTMLDocumentData(downloader.HtmlDocument, url);
                                    }
                                }
                                thisPageToDownload = new PageToDownload(htmlDoc, url, null, parentPageToDownload);
                                if (htmlDoc.StyleResourcesUrls != null)
                                {
                                    foreach (HTMLDocumentHelper.ResourceUrlInfo styleUrl in htmlDoc.StyleResourcesUrls)
                                    {
                                        thisPageToDownload.AddReference(new ReferenceToDownload(styleUrl.ResourceUrl, thisPageToDownload, styleUrl.ResourceAbsoluteUrl));
                                    }
                                }
                            }
                            // Add this page to our lists
                            _context.AddPageToDownload(safeUrl, thisPageToDownload, true);
                            downloadedPages.Add(thisPageToDownload);
                        }
                        else
                        {
                            thisPageToDownload = (PageToDownload)_context.CreatedPageToDownloadTable[safeUrl];
                        }

                        // If we're downloading a site, add a second copy of the root page in the references subdir
                        // This was, if the root page gets renamed, links back to it will still work correctly
                        // This is a bit of a hack, but otherwise, we'll need to escape urls whenever we output
                        // the site and change the root file name
                        if (thisPageToDownload.IsRootPage && _context.Depth > 0)
                        {
                            PageToDownload copyOfThisPageToDownload = new PageToDownload(thisPageToDownload.LightWeightHTMLDocument.Clone(), thisPageToDownload.UrlToReplace, thisPageToDownload.FileName, thisPageToDownload);
                            downloadedPages.Add(copyOfThisPageToDownload);
                        }

                        // enumerate the frames of this page and add them to the list of pages
                        PageToDownload[] subFramesToDownload = GetFramePagesToDownload(thisPageToDownload);
                        downloadedPages.AddRange(subFramesToDownload);
                        foreach (PageToDownload pageToDownload in subFramesToDownload)
                        {
                            _context.AddPageToDownload(pageToDownload.AbsoluteUrl, pageToDownload, false);
                        }

                        // Now drill down based upon the depth configuration
                        if (_context.ShouldContinue(_currentDepth))
                        {
                            ProgressTick otherPagesdownloadProgress = new ProgressTick(progress, TOTALTICKS - thisPageTicks, TOTALTICKS);
                            downloadedPages.AddRange(GetSubPagesToDownload(otherPagesdownloadProgress, downloadedPages, thisPageToDownload));
                        }
                        downloadWorked = true;
                        firstPagedownloadProgress.UpdateProgress(1, 1);
                    }
                    catch (OperationTimedOutException)
                    {
                        timedOut = true;
                    }
                    catch (WebPageDownloaderException htex)
                    {
                        HandleException(new Exception(htex.Message, htex));
                    }
                    catch (Exception ex)
                    {
                        HandleException(new Exception(String.Format(CultureInfo.CurrentCulture, "{0} could not be downloaded", _url), ex));
                    }
                }

                // If we never got the download to succeed, add it to the list of timed out Urls
                if (!downloadWorked && timedOut)
                {
                    _context.AddTimedOutUrl(_url);
                    firstPagedownloadProgress.UpdateProgress(1, 1);
                }
            }
            // If it isn't a page we'll just add the file to the reference list for the parent page
            // There is not an else, because we could be looking at a reference, but a reference that
            // should not be downloaded (in which case we just ignore it)
            else if (headerInfo != null && _context.ShouldDownloadThisUrl(headerInfo))
            {
                parentPageToDownload.AddReference(new ReferenceToDownload(url, parentPageToDownload));
                progress.UpdateProgress(1, 1);
            }

            progress.UpdateProgress(1, 1);

            _currentDepth--;
            return((PageToDownload[])downloadedPages.ToArray(typeof(PageToDownload)));
        }
 /// <summary>
 /// Private constructor: wraps the supplied document by storing it in the
 /// <c>_document</c> field.
 /// </summary>
 /// <param name="document">The lightweight HTML document to hold a reference to.</param>
 private LightWeightHTMLDocumentData(LightWeightHTMLDocument document)
 {
     this._document = document;
 }
 /// <summary>
 /// Private constructor: keeps a reference to the given document in the
 /// <c>_document</c> field. No copy is made here.
 /// </summary>
 /// <param name="document">The lightweight HTML document to store.</param>
 private LightWeightHTMLDocumentData(LightWeightHTMLDocument document)
 {
     this._document = document;
 }
 /// <summary>
 /// Creates a data object that carries the given lightweight document under the
 /// <c>LIGHTWEIGHTHTMLDOCUMENTFORMAT</c> format identifier and exposes it through
 /// the <c>IDataObject</c> member.
 /// </summary>
 /// <param name="lightWeightDocument">The document to place inside the data object.</param>
 public LightWeightHTMLDataObject(LightWeightHTMLDocument lightWeightDocument)
 {
     // Build the payload first, then publish it through the IDataObject member.
     DataObject payload = new DataObject(LIGHTWEIGHTHTMLDOCUMENTFORMAT, lightWeightDocument);
     IDataObject = payload;
 }
 /// <summary>
 /// Convenience overload: forwards to the three-argument
 /// <c>LightWeightHTMLDocument.FromString</c> overload, passing <c>true</c> as its
 /// final argument (presumably the "escape paths" switch; confirm against that overload).
 /// </summary>
 /// <param name="html">The HTML source text.</param>
 /// <param name="baseUrl">The base URL handed through to the overload.</param>
 /// <returns>The document produced by the three-argument overload.</returns>
 public static LightWeightHTMLDocument FromString(string html, string baseUrl)
 {
     const bool finalArgument = true;
     LightWeightHTMLDocument result = LightWeightHTMLDocument.FromString(html, baseUrl, finalArgument);
     return result;
 }
        /// <summary>
        /// Builds and parses a <c>LightWeightHTMLDocument</c> from an HTML string.
        /// </summary>
        /// <param name="html">The HTML source text.</param>
        /// <param name="baseUrl">
        /// The URL associated with the document; also passed to
        /// <c>LightWeightHTMLUrlToAbsolute.ConvertToAbsolute</c> when <paramref name="escapePaths"/> is set.
        /// </param>
        /// <param name="name">The name given to the new document.</param>
        /// <param name="escapePaths">
        /// When true, the HTML is first run through
        /// <c>LightWeightHTMLUrlToAbsolute.ConvertToAbsolute</c>; when false it is used as supplied.
        /// </param>
        /// <returns>
        /// A document whose DocType and SavedFrom values were taken from the special headers
        /// of the (possibly converted) HTML and on which <c>Parse</c> has already been called.
        /// </returns>
        public static LightWeightHTMLDocument FromString(string html, string baseUrl, string name, bool escapePaths)
        {
            string sourceHtml = escapePaths
                ? LightWeightHTMLUrlToAbsolute.ConvertToAbsolute(html, baseUrl)
                : html;

            LightWeightHTMLDocument document = new LightWeightHTMLDocument(sourceHtml, baseUrl, name);

            // The headers are read from the same HTML the document was built from.
            HTMLDocumentHelper.SpecialHeaders headers = HTMLDocumentHelper.GetSpecialHeaders(sourceHtml, baseUrl);
            document._docType = headers.DocType;
            document._savedFrom = headers.SavedFrom;

            document.Parse();
            return document;
        }
Пример #31
0
 /// <summary>
 /// Creates a PageToDownloadFactory for a page that is already available as a
 /// lightweight HTML document. The shared setup is delegated to the
 /// (downloadContext, parentControl) constructor; this overload then records the
 /// document and takes the factory URL from the document's <c>Url</c>.
 /// </summary>
 /// <param name="lightWeightHTMLDocument">
 /// The document that represents this page. Its <c>Url</c> property is read here,
 /// so it must not be null.
 /// </param>
 /// <param name="downloadContext">Context that controls factory behavior.</param>
 /// <param name="parentControl">The control that should be used as a parent.</param>
 public PageToDownloadFactory(LightWeightHTMLDocument lightWeightHTMLDocument, PageDownloadContext downloadContext, Control parentControl)
     : this(downloadContext, parentControl)
 {
     this._lightWeightHTMLDocument = lightWeightHTMLDocument;
     this._url = lightWeightHTMLDocument.Url;
 }