/// <summary>
/// Runs the staticization pipeline: creates one context per page, generates the HTML files,
/// then applies the registered behaviors and validations to each generated document.
/// </summary>
/// <param name="pages">
/// Pages to staticize. Key is the absolute URL of the page, Value is the local path the page is
/// saved to. URLs and paths must be unique. The sequence is enumerated exactly once.
/// </param>
/// <param name="stepTaken">
/// Status object that is updated as the run progresses; pass an instance so that other threads
/// can observe the staticization status.
/// </param>
/// <returns>The staticization status; the same reference as <paramref name="stepTaken"/>.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="pages"/> or <paramref name="stepTaken"/> is null.
/// </exception>
public StaticizeStepStatus Staticize(IEnumerable<KeyValuePair<Uri, String>> pages, StaticizeStepStatus stepTaken)
{
    if (pages == null)
    {
        throw new ArgumentNullException("pages");
    }
    if (stepTaken == null)
    {
        throw new ArgumentNullException("stepTaken");
    }

    #region Initialization
    stepTaken.Step = StaticizeStep.Initialize;

    // Materialize the sequence once. Counting it and then enumerating it again would
    // run a lazy query twice and could yield a different number of items the second time,
    // overrunning the array or leaving null entries in it.
    KeyValuePair<Uri, String>[] pageList = pages.ToArray();
    int pageCount = pageList.Length;
    stepTaken.pageCount = pageCount;

    // Create the Context objects, one per page.
    HtmlStaticizeContext[] entries = new HtmlStaticizeContext[pageCount];
    for (int i = 0; i < pageCount; i++)
    {
        entries[i] = new HtmlStaticizeContext
        {
            uri = pageList[i].Key,
            fileName = pageList[i].Value,
        };
    }
    stepTaken.Init(entries);
    AddValidation(GenerationSuccessfulValidation.Instance);
    #endregion

    // Generate the HTML files.
    stepTaken.Step = StaticizeStep.GenerationHtml;
    Generate(entries, stepTaken);
    stepTaken.Step = StaticizeStep.GenerationHtmlCompleted;

    #region Validation
    stepTaken.Step = StaticizeStep.Validation;
    if (
        (m_Behaviors != null && m_Behaviors.Count > 0)
        || (m_Validations != null && m_Validations.Count > 0)
        )
    {
        for (int j = 0; j < entries.Length; j++)
        {
            var entry = entries[j];

            // A non-null generationError means the HTML for this page could not be generated,
            // so there is no document to process.
            if (entry.generationError != null)
            {
                ReportPageFailure(entry, stepTaken, "生成HTML期间发生错误:{0}\r\n{1}\r\n", entry.generationError);
                continue;
            }

            // Try to load the generated file into a document DOM.
            var doc = new HtmlAgilityPack.HtmlDocument();
            try
            {
                doc.Load(entry.fileName, System.Text.Encoding.UTF8);
            }
            catch (Exception ex)
            {
                // Loading the document failed; remember why and skip behaviors/validations.
                entry.DocumentLoadError = ex;
                ReportPageFailure(entry, stepTaken, "加载HTML文档树期间发生错误:{0}\r\n{1}\r\n", ex);
                continue;
            }

            if (m_Behaviors != null && m_Behaviors.Count > 0)
            {
                for (int k = 0; k < m_Behaviors.Count; k++)
                {
                    m_Behaviors[k].Process(doc, entry);
                }
            }

            if (m_Validations != null && m_Validations.Count > 0)
            {
                Validate(doc, entry, stepTaken);
            }

            stepTaken.AddValidatedPageCount();
        }
    }
    stepTaken.Step = StaticizeStep.ValidationCompleted;
    #endregion

    // Copy the errors collected on each context into the overall status.
    {
        var all = stepTaken.Errors;
        for (int i = 0; i < entries.Length; i++)
        {
            var items = entries[i].Errors;
            if (items != null && items.Count > 0)
            {
                all.AddRange(items);
            }
        }
    }

    stepTaken.Step = StaticizeStep.Completed;
    return stepTaken;
}

/// <summary>
/// Records a page whose HTML could not be generated or loaded: stores a single validation
/// result on the entry, adds it to the status' validation errors and counts the page as validated.
/// </summary>
/// <param name="entry">The page context that failed.</param>
/// <param name="stepTaken">The status object to report the failure to.</param>
/// <param name="messageFormat">Format string; {0} receives the exception message, {1} the full exception text.</param>
/// <param name="ex">The exception that caused the failure.</param>
private static void ReportPageFailure(HtmlStaticizeContext entry, StaticizeStepStatus stepTaken, string messageFormat, Exception ex)
{
    var vd = new ValidationResult()
    {
        ValidationType = ValidationType.Tag,
        Uri = entry.uri,
        Name = "页面HTML是否成功生成。",
        Message = string.Format(messageFormat, ex.Message, ex.ToString()),
        Exception = ex,
    };
    entry.validationResults = new ValidationResult[] { vd };
    stepTaken.ValidationErrors.Add(vd);
    // Use the same counter method as the success path instead of incrementing the field directly.
    stepTaken.AddValidatedPageCount();
}
/// <summary>
/// Generates the HTML files: downloads every entry's URL to its local file name,
/// processing the entries through <c>Parallel.ForEach</c>.
/// </summary>
/// <param name="entries">The page contexts to download.</param>
/// <param name="step">Status object whose generated-page counter is advanced after each successful download.</param>
void Generate(HtmlStaticizeContext[] entries, StaticizeStepStatus step)
{
    // Note: an earlier sequential variant that reused one WebClient for all entries
    // was replaced by this parallel version with one WebClient per entry.
    System.Threading.Tasks.Parallel.ForEach(entries, page => DownloadPage(page, step));
}

/// <summary>
/// Downloads a single page. A failure does not propagate; it is stored on the page context.
/// </summary>
/// <param name="page">The page context providing the URL and the target file name.</param>
/// <param name="step">Status object whose generated-page counter is advanced on success.</param>
void DownloadPage(HtmlStaticizeContext page, StaticizeStepStatus step)
{
    using (var client = new WebClient())
    {
        try
        {
            client.DownloadFile(page.uri, page.fileName);
            step.AddGeneratedPageCount();
        }
        catch (Exception failure)
        {
            // The error is recorded on the page context only, not on the shared step status.
            page.generationError = failure;
            page.Errors.Add(failure);
        }
    }
}
/// <summary>
/// Runs the registered validations against a loaded document and records any results
/// on both the page context and the overall status.
/// </summary>
/// <param name="doc">The loaded HTML document to validate.</param>
/// <param name="context">The page context the document belongs to; receives the validation results.</param>
/// <param name="stepTaken">Status object whose validation errors are extended with the results.</param>
void Validate(HtmlAgilityPack.HtmlDocument doc, HtmlStaticizeContext context, StaticizeStepStatus stepTaken)
{
    // Nothing to do when no validations are registered.
    if (this.m_Validations == null)
    {
        return;
    }

    var findings = m_Validations.Validate(doc, context);
    if (findings == null || findings.Count <= 0)
    {
        return;
    }

    // Append to results already stored on the context, or adopt the new collection as-is.
    if (context.validationResults != null)
    {
        context.validationResults.AddRange(findings);
    }
    else
    {
        context.validationResults = findings;
    }

    stepTaken.ValidationErrors.AddRange(findings);
}
/// <summary>
/// Staticizes a fixed list of public pages into a fresh batch folder under the application
/// base directory and saves the validation results next to the generated files.
/// </summary>
public void StaticizeTest1()
{
    // Batch id
    String batchId = CreateBatchId();

    // Output folder
    string outputDirectory = System.IO.Path.Combine(AppDomain.CurrentDomain.BaseDirectory, batchId);
    System.IO.Directory.CreateDirectory(outputDirectory);

    // The URLs below are generated as static HTML files; the files end up under bin.
    var urls = new[] {
        "http://www.zhihu.com/question/25519625",
        "http://www.zhihu.com/question/27232313",
        "http://www.zhihu.com/question/31291872",
        "http://www.zhihu.com/question/31293043",
        "http://www.zhihu.com/question/31318753",
        "http://cn.bing.com/",
        "http://36kr.com/"
    };

    // Note: images, CSS and JS referenced by relative paths (i.e. without http://host/) are
    // downloaded automatically by Staticize and placed in the folder. Resources referenced by an
    // absolute URL, e.g. http://img3.douban.com/misc/mixed_static/7011201580a8cbed.css, are not downloaded.
    List<KeyValuePair<Uri, String>> pages = new List<KeyValuePair<Uri, string>>(urls.Length);

    // Start at index 0 so the first URL is included as well; the file name carries the
    // array index, so previously generated names (zihu-1 ... zihu-6) keep their URL.
    for (int i = 0; i < urls.Length; i++)
    {
        string outputFile = System.IO.Path.Combine(outputDirectory, string.Concat("zihu-", i.ToString(), ".html"));
        pages.Add(new KeyValuePair<Uri, String>(new Uri(urls[i]), outputFile));
    }

    CreateDirectory(pages, outputDirectory);

    Staticizer staticize = new Staticizer();
    staticize.AddBehavior(
        new ImageResourcesDownloadBehavior(outputDirectory)
        );

    // Example of registering additional validations (disabled):
    //staticize.AddValidation(
    //    // verify that the CSS files exist
    //    ValidationProjection.HasCssLink("/resources/css/jquery-ui-themes.css"),
    //    ValidationProjection.HasCssLink("/resources/css/axure_rp_page.css"),
    //    // verify that the page's main DOM element (by id) exists
    //    ValidationProjection.HasElement("main_container"),
    //    // verify that the JS files exist
    //    ValidationProjection.HasScriptLink("/data/sitemap.js"),
    //    ValidationProjection.HasScriptLink("/resources/scripts/jquery-1.7.1.min.js"),
    //    ValidationProjection.HasScriptLink("/resources/scripts/axutils.js"),
    //    ValidationProjection.HasScriptLink("/resources/scripts/jquery-ui-1.8.10.custom.min.js"),
    //    ValidationProjection.HasScriptLink("/resources/scripts/axurerp_beforepagescript.js"),
    //    ValidationProjection.HasScriptLink("/resources/scripts/messagecenter.js")
    //    );
    //staticize.AddValidation(
    //    // verify that the resources referenced by the HTML document exist
    //    ValidationProjection.ResourcesExisting(outputDirectory),
    //    // XPath
    //    ValidationProjection.XPathEquals("main_template.html", "main_container"),
    //    ValidationProjection.InternalALinkExisting(outputDirectory)
    //    );

    var stepTaken = new StaticizeStepStatus();
    var staticizeResults = staticize.Staticize(pages, stepTaken);

    var validationResults = staticizeResults.GetValidationResults();
    validationResults.Save(System.IO.Path.Combine(outputDirectory, "validationResults.txt"));
}