/// <summary>
/// Downloads a webpage from a blog.
/// </summary>
/// <param name="blogHomepageUrl">Absolute URL of the blog homepage to download.</param>
/// <param name="progress">Progress host used for cancellation checks and status updates.</param>
/// <returns>A MemoryStream, positioned at the beginning, containing the raw response bytes.</returns>
private MemoryStream DownloadBlogPage(string blogHomepageUrl, IProgressHost progress)
{
    ProgressTick tick = new ProgressTick(progress, 50, 100);
    if (progress.CancelRequested)
    {
        throw new OperationCancelledException();
    }
    tick.UpdateProgress(0, 0, Res.Get(StringId.ProgressDownloadingWeblogEditingStyle));

    MemoryStream memStream = new MemoryStream();

    // Dispose the response as well as its stream so the underlying connection is
    // released promptly (the original code leaked the HttpWebResponse).
    using (HttpWebResponse resp = _pageDownloader(blogHomepageUrl, 60000))
    using (Stream respStream = resp.GetResponseStream())
    {
        StreamHelper.Transfer(respStream, memStream);
    }

    tick.UpdateProgress(100, 100);

    // Rewind so the caller can read the HTML from the start.
    // (The original seeked twice in a row; once is sufficient.)
    memStream.Seek(0, SeekOrigin.Begin);
    return memStream;
}
/// <summary>
/// Downloads a webpage from a blog and searches for TEMPORARY_POST_TITLE_GUID.
/// Retries repeatedly because the temporary post can take a long time to appear
/// on the blog home page.
/// </summary>
/// <param name="blogPageUrl">Absolute URL of the blog page to poll.</param>
/// <param name="progress">Progress host used for cancellation checks and status updates.</param>
/// <returns>Stream containing document which contains TEMPORARY_POST_TITLE_GUID.</returns>
private Stream DownloadBlogPage(string blogPageUrl, IProgressHost progress)
{
    ProgressTick tick = new ProgressTick(progress, 50, 100);
    MemoryStream memStream = new MemoryStream();
    IHTMLDocument2 doc2 = null;

    // WinLive 221984: Theme detection timing out intermittently on WordPress.com
    // The temp post *often* takes more than a minute to show up on the blog home page.
    // The download progress dialog has a cancel button, we'll try a lot before giving up.
    for (int i = 0; i < 30 && doc2 == null; i++)
    {
        if (progress.CancelRequested)
        {
            throw new OperationCancelledException();
        }
        tick.UpdateProgress(0, 0, Res.Get(StringId.ProgressDownloadingWeblogEditingStyle));

        // Sleep to give the post enough time to show up.
        // The first 10 attempts use a 1 second delay; the remaining 20 use a
        // 10 second delay, so we poll for up to 210s (10s + 200s) in total
        // before considering the operation timed out.
        // (The original comment claimed 5 minutes; the loop actually allows ~3.5.)
        Thread.Sleep(i < 10 ? 1000 : 10000);

        // Add random parameter to URL to bypass cache
        var urlRandom = UrlHelper.AppendQueryParameters(blogPageUrl, new string[] { Guid.NewGuid().ToString() });

        memStream = new MemoryStream();
        // Dispose the response as well as its stream so each polling attempt
        // releases its connection (the original leaked the HttpWebResponse).
        using (HttpWebResponse resp = _pageDownloader(urlRandom, 60000))
        using (Stream respStream = resp.GetResponseStream())
        {
            StreamHelper.Transfer(respStream, memStream);
        }

        // Read in the HTML file and determine if it contains the title element.
        memStream.Seek(0, SeekOrigin.Begin);
        doc2 = HTMLDocumentHelper.GetHTMLDocumentFromStream(memStream, urlRandom);
        if (HTMLDocumentHelper.FindElementContainingText(doc2, TEMPORARY_POST_TITLE_GUID) == null)
        {
            // Marker not found yet; null out doc2 so the loop tries again.
            doc2 = null;
        }
    }

    if (doc2 == null)
    {
        throw new OperationTimedOutException();
    }

    tick.UpdateProgress(100, 100);

    // Rewind the stream for the caller.
    memStream.Seek(0, SeekOrigin.Begin);
    return memStream;
}
/// <summary>
/// Downloads the supporting files (styles, images, etc.) referenced by the blog
/// template HTML into <c>_blogTemplateDir</c> and rewrites the template so those
/// references resolve via absolute file:/// paths to the downloaded copies.
/// </summary>
/// <param name="templateContents">The template HTML to process.</param>
/// <param name="templateUrl">URL the template came from; used to resolve relative references and to select credentials.</param>
/// <param name="progress">Progress host for status updates.</param>
/// <returns>Full path to the rewritten template file on disk.</returns>
private string DownloadTemplateFiles(string templateContents, string templateUrl, IProgressHost progress)
{
    progress.UpdateProgress(Res.Get(StringId.ProgressDownloadingSupportingFiles));
    FileBasedSiteStorage files = new FileBasedSiteStorage(_blogTemplateDir);

    // convert the string to a stream
    MemoryStream templateStream = new MemoryStream();
    StreamWriter writer = new StreamWriter(templateStream, Encoding.UTF8);
    writer.Write(templateContents);
    writer.Flush();
    // NOTE(review): writer is deliberately not disposed -- disposing it would close
    // templateStream, which is still read below. MemoryStream holds no unmanaged
    // resources, so this is benign.
    templateStream.Seek(0, SeekOrigin.Begin);

    //read the stream into a lightweight HTML. Note that we use from LightWeightHTMLDocument.FromIHTMLDocument2
    //instead of LightWeightHTMLDocument.FromStream because from stream improperly shoves a saveFrom declaration
    //above the docType (bug 289357)
    IHTMLDocument2 doc = HTMLDocumentHelper.StreamToHTMLDoc(templateStream, templateUrl, true);
    LightWeightHTMLDocument ldoc = LightWeightHTMLDocument.FromIHTMLDocument2(doc, templateUrl, true, false);

    PageDownloadContext downloadContext = new PageDownloadContext(0);
    ApplyCredentials(downloadContext, templateUrl);

    using (PageToDownloadFactory downloadFactory = new PageToDownloadFactory(ldoc, downloadContext, _parentControl))
    {
        //calculate the dependent styles and resources
        ProgressTick tick = new ProgressTick(progress, 50, 100);
        downloadFactory.CreatePagesToDownload(tick);
        tick.UpdateProgress(100, 100);

        //download the dependent styles and resources
        tick = new ProgressTick(progress, 50, 100);
        PageAndReferenceDownloader downloader = new PageAndReferenceDownloader(downloadFactory.PagesToDownload, files);
        this.ApplyCredentials(downloader, templateUrl);
        downloader.Download(tick);
        tick.UpdateProgress(100, 100);

        //Expand out the relative paths in the downloaded HTML file with absolute paths.
        //Note: this is necessary so that template resources are not improperly resolved relative
        //      to the location of the file the editor is editing.
        string blogTemplateFile = Path.Combine(_blogTemplateDir, files.RootFile);
        // Move the downloaded file aside, then copy it back while replacing the
        // downloader's path token with the absolute file:/// path.
        string origFile = blogTemplateFile + ".token";
        File.Move(blogTemplateFile, origFile);
        string absPath = String.Format(CultureInfo.InvariantCulture, "file:///{0}/{1}", _blogTemplateDir.Replace('\\', '/'), downloader.PathToken);
        TextHelper.ReplaceInFile(origFile, downloader.PathToken, blogTemplateFile, absPath);
        File.Delete(origFile);

        //fix up the files
        FixupDownloadedFiles(blogTemplateFile, files, downloader.PathToken);

        //complete the progress.
        progress.UpdateProgress(100, 100);

        // Persist the absolute path used for the token expansion in a ".path"
        // sidecar file. NOTE(review): consumer of this sidecar is not visible
        // here -- presumably used later to relocate/undo the expansion; confirm
        // against readers of "*.path".
        File.WriteAllText(blogTemplateFile + ".path", absPath);

        return (blogTemplateFile);
    }
}
/// <summary>
/// Runs the worker method under a progress tick sized for this operation,
/// honoring cancellation both before and after the work, then invokes the
/// completion callback (if one was supplied) with the worker's result.
/// </summary>
public void DoWork()
{
    ProgressTick tick = new ProgressTick(ParentProgress, ProgressSize, TotalProgressTicks);

    // Bail out early if the user already cancelled.
    if (tick.CancelRequested)
    {
        throw new OperationCancelledException();
    }

    _operationResult = WorkerMethod(tick);

    // The worker may have returned normally after a cancel request; honor it here.
    if (tick.CancelRequested)
    {
        throw new OperationCancelledException();
    }

    // Complete this operation's slice of the overall progress.
    tick.UpdateProgress(100, 100);

    if (CompletedMethod != null)
    {
        CompletedMethod(_operationResult);
    }
}
/// <summary>
/// Executes the worker method against a progress tick scoped to this operation.
/// Cancellation is checked both before starting and after the worker returns;
/// on success the completion callback (if any) receives the operation result.
/// </summary>
public void DoWork()
{
    ProgressTick operationProgress = new ProgressTick(ParentProgress, ProgressSize, TotalProgressTicks);

    // Respect a cancel request issued before any work begins.
    if (operationProgress.CancelRequested)
        throw new OperationCancelledException();

    _operationResult = WorkerMethod(operationProgress);

    // The worker may have completed normally despite a pending cancel; check again.
    if (operationProgress.CancelRequested)
        throw new OperationCancelledException();

    operationProgress.UpdateProgress(100, 100); //complete progress for the operation

    if (CompletedMethod != null)
        CompletedMethod(_operationResult);
}
/// <summary>
/// Actually downloads the pages: resolves the content type for <paramref name="url"/>
/// (or reuses <paramref name="lightWeightDocument"/> when supplied), downloads it with
/// retries, records it and its frames, and recurses into sub-pages while the context's
/// depth configuration allows. Non-page resources are added as references on the parent.
/// </summary>
/// <param name="progress">Progress host for cancellation checks and status updates.</param>
/// <param name="url">Url of the page or resource to process.</param>
/// <param name="lightWeightDocument">Already-parsed document for the url, or null to force a download.</param>
/// <param name="parentPageToDownload">Parent page; receives this url as a reference when it is not a downloadable page.</param>
/// <returns>All pages discovered starting from this url (including frames and sub-pages).</returns>
private PageToDownload[] DownloadPages(IProgressHost progress, string url, LightWeightHTMLDocument lightWeightDocument, PageToDownload parentPageToDownload)
{
    // Check for cancel
    if (progress.CancelRequested)
    {
        throw new OperationCancelledException();
    }

    // Track recursion depth (decremented on exit) so the depth configuration
    // can stop the drill-down.
    _currentDepth++;
    ArrayList downloadedPages = new ArrayList();

    // Set up our progress: the deepest page gets all the ticks, otherwise the
    // first page gets FIRSTPAGETICKS and sub-pages get the remainder.
    int thisPageTicks = FIRSTPAGETICKS;
    if (_context.Depth == _currentDepth)
    {
        thisPageTicks = TOTALTICKS;
    }
    ProgressTick firstPagedownloadProgress = new ProgressTick(progress, thisPageTicks, TOTALTICKS);

    // Anchor-less url used as the cache/bookkeeping key throughout.
    string safeUrl = UrlHelper.GetUrlWithoutAnchorIdentifier(url);

    // Look up the content type of this pageToDownload (cached per safeUrl in _headerInfo)
    UrlContentTypeInfo headerInfo = null;
    if (_headerInfo.ContainsKey(safeUrl))
    {
        headerInfo = (UrlContentTypeInfo)_headerInfo[safeUrl];
    }
    else
    {
        if (lightWeightDocument != null)
        {
            // We already have a parsed document, so we know it's HTML.
            headerInfo = new UrlContentTypeInfo("text/html", url);
        }
        else if (headerInfo == null && !_context.IsTimedOutUrl(url) && _context.ShouldDownloadThisUrl(url))
        {
            progress.UpdateProgress(string.Format(CultureInfo.CurrentCulture, Res.Get(StringId.ProgressDeterminingType), url));
            if (lightWeightDocument == null)
            {
                headerInfo = ContentTypeHelper.ExpensivelyGetUrlContentType(url, _context.TimeoutMS);
            }
            else
            {
                // NOTE(review): unreachable -- this else-if only runs when
                // lightWeightDocument is null (see the branch above), so the
                // inexpensive path here is dead code.
                headerInfo = ContentTypeHelper.InexpensivelyGetUrlContentType(url);
            }
        }
        // A null headerInfo is cached too, so skipped urls are not re-probed.
        _headerInfo.Add(safeUrl, headerInfo);
    }

    // If this is a web page and we should download it, do it!
    if ((lightWeightDocument != null && IsDownloadablePageResource(headerInfo)) ||
        (lightWeightDocument == null && IsDownloadablePageResource(headerInfo) && _context.ShouldDownloadThisUrl(headerInfo)))
    {
        bool downloadWorked = false;
        int downloadAttempts = -1;
        bool timedOut = true;

        // Max sure we are retrying the correct number of times
        ProgressTick pageDownloadProgress = new ProgressTick(firstPagedownloadProgress, 80, 100);
        // Retry only on timeouts: timedOut is reset each pass and only set again
        // by the OperationTimedOutException handler below.
        while (!downloadWorked && downloadAttempts++ < _context.RetryCount && timedOut)
        {
            timedOut = false;
            pageDownloadProgress.UpdateProgress(0, 1);
            try
            {
                // If we haven't downloaded this page yet download it
                PageToDownload thisPageToDownload = null;
                if (!_context.UrlAlreadyDownloaded(safeUrl))
                {
                    if (lightWeightDocument == null)
                    {
                        thisPageToDownload = DownloadUrl(url, parentPageToDownload, pageDownloadProgress);
                    }
                    else
                    {
                        LightWeightHTMLDocument htmlDoc = lightWeightDocument;

                        // Only redownload if we absolutely need to
                        if (htmlDoc.HasFramesOrStyles && (htmlDoc.Frames == null || htmlDoc.StyleResourcesUrls == null))
                        {
                            // Round-trip the document through a temp file so the
                            // HTMLDocumentDownloader can resolve its frames and
                            // style resources.
                            string html = htmlDoc.GenerateHtml();
                            string tempFile = TempFileManager.Instance.CreateTempFile("temp.htm");
                            using (StreamWriter writer = new StreamWriter(tempFile, false, Encoding.UTF8))
                                writer.Write(html);
                            using (HTMLDocumentDownloader downloader = new HTMLDocumentDownloader(_parentControl, UrlHelper.GetLocalFileUrl(tempFile), htmlDoc.Title, _context.CookieString, _context.TimeoutMS, false))
                            {
                                downloader.DownloadHTMLDocument(pageDownloadProgress);
                                htmlDoc.UpdateBasedUponHTMLDocumentData(downloader.HtmlDocument, url);
                            }
                        }
                        thisPageToDownload = new PageToDownload(htmlDoc, url, null, parentPageToDownload);
                        if (htmlDoc.StyleResourcesUrls != null)
                        {
                            foreach (HTMLDocumentHelper.ResourceUrlInfo styleUrl in htmlDoc.StyleResourcesUrls)
                            {
                                thisPageToDownload.AddReference(new ReferenceToDownload(styleUrl.ResourceUrl, thisPageToDownload, styleUrl.ResourceAbsoluteUrl));
                            }
                        }
                    }

                    // Add this page to our lists
                    _context.AddPageToDownload(safeUrl, thisPageToDownload, true);
                    downloadedPages.Add(thisPageToDownload);
                }
                else
                {
                    thisPageToDownload = (PageToDownload)_context.CreatedPageToDownloadTable[safeUrl];
                }

                // If we're downloading a site, add a second copy of the root page in the references subdir
                // This was, if the root page gets renamed, links back to it will still work correctly
                // This is a bit of a hack, but otherwise, we'll need to escape urls whenever we output
                // the site and change the root file name
                if (thisPageToDownload.IsRootPage && _context.Depth > 0)
                {
                    PageToDownload copyOfThisPageToDownload = new PageToDownload(thisPageToDownload.LightWeightHTMLDocument.Clone(), thisPageToDownload.UrlToReplace, thisPageToDownload.FileName, thisPageToDownload);
                    downloadedPages.Add(copyOfThisPageToDownload);
                }

                // enumerate the frames of this page and add them to the list of pages
                PageToDownload[] subFramesToDownload = GetFramePagesToDownload(thisPageToDownload);
                downloadedPages.AddRange(subFramesToDownload);
                foreach (PageToDownload pageToDownload in subFramesToDownload)
                {
                    _context.AddPageToDownload(pageToDownload.AbsoluteUrl, pageToDownload, false);
                }

                // Now drill down based upon the depth configuration
                if (_context.ShouldContinue(_currentDepth))
                {
                    ProgressTick otherPagesdownloadProgress = new ProgressTick(progress, TOTALTICKS - thisPageTicks, TOTALTICKS);
                    downloadedPages.AddRange(GetSubPagesToDownload(otherPagesdownloadProgress, downloadedPages, thisPageToDownload));
                }

                downloadWorked = true;
                firstPagedownloadProgress.UpdateProgress(1, 1);
            }
            catch (OperationTimedOutException)
            {
                // Trigger another pass of the retry loop.
                timedOut = true;
            }
            catch (WebPageDownloaderException htex)
            {
                HandleException(new Exception(htex.Message, htex));
            }
            catch (Exception ex)
            {
                // NOTE(review): the message uses the field _url rather than the
                // url parameter being processed here -- confirm this is intentional.
                HandleException(new Exception(String.Format(CultureInfo.CurrentCulture, "{0} could not be downloaded", _url), ex));
            }
        }

        // If we never got the download to succeed, add it to the list of timed out Urls
        // NOTE(review): _url (field) is recorded, not the url parameter -- verify.
        if (!downloadWorked && timedOut)
        {
            _context.AddTimedOutUrl(_url);
            firstPagedownloadProgress.UpdateProgress(1, 1);
        }
    }
    // If it isn't a page we'll just add the file to the reference list for the parent page
    // There is not an else, because we could be looking at a reference, but a reference that
    // should not be downloaded (in which case we just ignore it)
    else if (headerInfo != null && _context.ShouldDownloadThisUrl(headerInfo))
    {
        parentPageToDownload.AddReference(new ReferenceToDownload(url, parentPageToDownload));
        progress.UpdateProgress(1, 1);
    }

    progress.UpdateProgress(1, 1);
    _currentDepth--;
    return ((PageToDownload[])downloadedPages.ToArray(typeof(PageToDownload)));
}
/// <summary>
/// Downloads the pages and their references, providing progress feedback.
/// Phases: (1) reserve placeholder files and collect unique references,
/// (2) download all references on two worker threads, (3) write each page's
/// HTML to disk.
/// </summary>
/// <param name="progressHost">The progresshost to use for feedback</param>
/// <returns>this</returns>
public object Download(IProgressHost progressHost)
{
    // Prepare the list of references to download
    progressHost.UpdateProgress(Res.Get(StringId.ProgressPreparingListOfFiles));
    foreach (PageToDownload pageToDownload in _pagesToDownload)
    {
        // Lay down a placeholder file with the correct file name
        try
        {
            string destination = Path.Combine(_siteStorage.BasePath, pageToDownload.RelativePath);
            destination = PathHelper.GetNonConflictingPath(destination);
            pageToDownload.FileName = Path.GetFileName(destination);
            // Opening for write and immediately closing creates an empty file,
            // reserving the (non-conflicting) name before downloads begin.
            using (Stream htmlStream = _siteStorage.Open(destination, AccessMode.Write))
            {
            }
        }
        catch (Exception e)
        {
            HandleException(e);
        }

        foreach (ReferenceToDownload reference in pageToDownload.References)
        {
            // Don't add the same item more than once
            if (!_referencesToDownload.ContainsKey(reference.AbsoluteUrl))
            {
                _referencesToDownload.Add(reference.AbsoluteUrl, reference);
            }
        }
    }

    // Enqueue the work items
    progressHost.UpdateProgress(Res.Get(StringId.ProgressStartingDownloadOfReferences));
    // References share 8000 of the 10000 overall progress ticks, split evenly;
    // each work item gets its own joint progress host.
    IProgressHost[] progressHosts = new JointProgressHosts(progressHost, _referencesToDownload.Count, 8000, 10000).ProgressHosts;
    int tickNum = 0;
    foreach (ReferenceToDownload reference in _referencesToDownload.Values)
    {
        workQueue.Enqueue(new DownloadWorkItem(reference, _siteStorage, progressHosts[tickNum++]));
    }

    // Start up the parallel execution of the downloads (two worker threads
    // draining workQueue via WorkerThreadStart; Execute blocks until done).
    ParallelExecution parallelExecution = new ParallelExecution(new ThreadStart(WorkerThreadStart), 2);
    parallelExecution.Execute();
    parallelExecution = null;

    // Now go through and get HTML for each page, and emit the HTML to disk
    // (the remaining 2000/10000 ticks).
    ProgressTick allPagesProgress = new ProgressTick(progressHost, 2000, 10000);
    for (int i = 0; i < _pagesToDownload.Length; i++)
    {
        try
        {
            allPagesProgress.UpdateProgress(i, _pagesToDownload.Length, string.Format(CultureInfo.CurrentCulture, Res.Get(StringId.ProgressSaving), _pagesToDownload[i].FileName));
            WriteHtmlToDisk(_pagesToDownload[i], _siteStorage);
        }
        catch (Exception e)
        {
            HandleException(e);
        }
        if (allPagesProgress.CancelRequested)
        {
            throw new OperationCancelledException();
        }
    }

    // We're complete!
    progressHost.UpdateProgress(1, 1, Res.Get(StringId.ProgressDownloadFinished));
    return (this);
}