网页静态化时的上下文信息。
提供对爬行网页过程中状态信息存储的支持。
提供错误信息列表。
/// <summary>
/// Runs every validation in the sequence against a single HTML document.
/// </summary>
/// <param name="validations">The validations to run. When null, the method returns null.</param>
/// <param name="document">The parsed HTML document handed to each validation.</param>
/// <param name="context">The context of the page being validated; its Uri is copied into every result.</param>
/// <returns>
/// One <see cref="ValidationResult"/> per validation that returned a non-empty message
/// (the list is empty when all of them pass), or null when <paramref name="validations"/> is null.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="document"/> or <paramref name="context"/> is null.</exception>
public static IList<ValidationResult> Validate(this IEnumerable<IValidation> validations, HtmlAgilityPack.HtmlDocument document, HtmlStaticizeContext context)
{
    if (document == null)
    {
        throw new ArgumentNullException("document");
    }

    if (context == null)
    {
        throw new ArgumentNullException("context");
    }

    if (validations == null)
    {
        return null;
    }

    var failures = new List<ValidationResult>();
    foreach (var validation in validations)
    {
        // A null or empty message means this validation passed.
        var message = validation.Validate(document, context);
        if (string.IsNullOrEmpty(message))
        {
            continue;
        }

        var failure = new ValidationResult
        {
            Uri = context.Uri,
            ValidationType = validation.Type,
            Name = validation.Name,
            Message = message,
        };
        failures.Add(failure);
    }

    return failures;
}
/// <summary>
/// Checks that every local hyperlink (&lt;a href&gt;) in the document points at a file that exists on disk.
/// </summary>
/// <param name="document">The parsed HTML document whose anchors are inspected.</param>
/// <param name="status">The context of the page; its Uri supplies the directory used to resolve page-relative hrefs.</param>
/// <returns>
/// null when the document has no anchors or no broken local link was found;
/// otherwise the concatenated error messages, one per broken link.
/// </returns>
public string Validate(HtmlAgilityPack.HtmlDocument document, HtmlStaticizeContext status)
{
    // Directory of the current page; prepended to hrefs that are relative to the page.
    string documentDir = status.Uri.GetFileDirectory();

    var nodes = document.DocumentNode.SelectNodes("//a");
    if (nodes == null || nodes.Count == 0)
    {
        return null;
    }

    var errorMessage = new StringBuilder();
    foreach (var aNode in nodes)
    {
        string href = aNode.GetAttributeValue("href", null);

        // Skip anchors without a target and in-page fragment links.
        if (string.IsNullOrWhiteSpace(href) || href[0] == '#')
        {
            continue;
        }

        // External links have to be recognised on the raw href. Once documentDir is
        // prepended below, "http://host/x" no longer parses as an absolute URI with
        // a host and would be treated as a local file.
        Uri rawUri;
        if (Uri.TryCreate(href, UriKind.Absolute, out rawUri) && !string.IsNullOrEmpty(rawUri.Host))
        {
            continue;
        }

        // The href is relative to the current page, e.g. <a href="1.html" />.
        if (!href[0].IsDirectorySeparator())
        {
            href = documentDir + href;
        }

        Uri uri;
        if (!Uri.TryCreate(href, UriKind.RelativeOrAbsolute, out uri))
        {
            continue;
        }

        // A check against the site's own domain names could be added here.
        if (uri.IsAbsoluteUri && !string.IsNullOrEmpty(uri.Host))
        {
            continue;
        }

        string local = fileReslover.ResloveLocalPath(uri);
        if (string.IsNullOrEmpty(local))
        {
            continue;
        }

        string localPath = System.IO.Path.Combine(searchDirectory, local);

        // "files" caches File.Exists results per path so each path hits the disk once.
        bool isExisting;
        if (!files.TryGetValue(localPath, out isExisting))
        {
            isExisting = System.IO.File.Exists(localPath);
            try
            {
                files.Add(localPath, isExisting);
            }
            catch (Exception)
            {
                // Deliberate best effort: failing to cache the result must not fail the validation.
                // NOTE(review): presumably guards against concurrent writers of the cache - confirm.
            }
        }

        if (!isExisting)
        {
            errorMessage.AppendFormat("本地不存在链接 \"{0}\" 所指向的文件 \"{1}\"。", uri.ToString(), localPath);
        }
    }

    return errorMessage.Length == 0 ? null : errorMessage.ToString();
}
/// <summary>
/// Runs the configured document predicate against the document.
/// </summary>
/// <param name="document">The HtmlDocument to validate.</param>
/// <param name="status">The context of the page; not read by this validation.</param>
/// <returns>null when the predicate returns true; otherwise the configured error message.</returns>
/// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
public string Validate(HtmlAgilityPack.HtmlDocument document, HtmlStaticizeContext status)
{
    if (document == null)
    {
        throw new ArgumentNullException("document");
    }

    bool passed = documentValidation.Invoke(document);
    if (passed)
    {
        return null;
    }

    return errorMessage;
}
/// <summary>
/// Checks that every resource URI in the list has a corresponding file under the output directory.
/// </summary>
/// <param name="list">The resource URIs to check. When null, the method returns null.</param>
/// <param name="status">
/// The context of the page. Each missing resource is added to its Resources.NotExistsFiles,
/// and its GenerationError (if any) is appended to the error text.
/// </param>
/// <returns>null when no resource is missing; otherwise the accumulated error text.</returns>
string Validate(IList<Uri> list, HtmlStaticizeContext status)
{
    if (list == null)
    {
        return null;
    }

    var errorMessage = new StringBuilder();
    foreach (var uri in list)
    {
        String fileName = fileReslover.ResloveLocalPath(uri);
        if (String.IsNullOrEmpty(fileName))
        {
            continue;
        }

        String physicalFilePath = System.IO.Path.Combine(outputDir, fileName);

        bool fileExists = false;
        bool hasKey = false;
        try
        {
            // Patch for concurrent modification of the cache: a failed lookup is treated as a miss.
            hasKey = exisitingFiles.TryGetValue(physicalFilePath, out fileExists);
        }
        catch (Exception)
        {
        }

        // Cached as existing: nothing to report.
        if (hasKey && fileExists)
        {
            continue;
        }

        // Only hit the disk when the cache has no answer; a cached "missing" is trusted.
        if (!hasKey)
        {
            fileExists = System.IO.File.Exists(physicalFilePath);
        }

        if (!fileExists)
        {
            status.Resources.NotExistsFiles.Add(uri, physicalFilePath);
            errorMessage.AppendFormat("资源 \"{0}\" 未能在本地预期的路径 \"{1}\" 中找到。\r\n", uri.ToString(), physicalFilePath);

            var ex = status.GenerationError;
            if (ex != null)
            {
                errorMessage.AppendLine("这可能是由于请求文件时发生异常造成的,以下是异常信息:");
                errorMessage.AppendFormat("{0}:\r\n{1}\r\n\r\n", ex.Message, ex.ToString());
            }
        }

        // Cache the result only when the key was absent. Adding a key that is already
        // present would throw and be swallowed on every repeated miss.
        if (!hasKey)
        {
            try
            {
                // Patch for concurrent modification: another writer may have added the key meanwhile.
                exisitingFiles.Add(physicalFilePath, fileExists);
            }
            catch (Exception)
            {
            }
        }
    }

    // Text is appended only for missing files, so an empty builder means everything was found.
    return errorMessage.Length == 0 ? null : errorMessage.ToString();
}
/// <summary>
/// Validates the CSS, JavaScript and image resources referenced by the page.
/// </summary>
/// <param name="document">The parsed HTML document; not read by this validation.</param>
/// <param name="status">The context of the page; its Resources hold the referenced resource URIs.</param>
/// <returns>
/// null when all three resource checks return null or empty; otherwise the three results
/// (CSS, JavaScript, images) joined with line breaks.
/// </returns>
public string Validate(HtmlAgilityPack.HtmlDocument document, HtmlStaticizeContext status)
{
    var resources = status.Resources;

    String cssReport = Validate(resources.ReferenceCsses, status);
    String jsReport = Validate(resources.ReferenceJavascripts, status);
    String imageReport = Validate(resources.ReferenceImages, status);

    bool nothingToReport = String.IsNullOrEmpty(cssReport)
        && String.IsNullOrEmpty(imageReport)
        && String.IsNullOrEmpty(jsReport);
    if (nothingToReport)
    {
        return null;
    }

    return String.Format("{0}\r\n{1}\r\n{2}", cssReport, jsReport, imageReport);
}
/// <summary>
/// Reports the errors recorded on the context while the page was generated and while its document was loaded.
/// </summary>
/// <param name="document">The parsed HTML document; not read by this validation.</param>
/// <param name="status">The context whose GenerationError and DocumentLoadError are reported.</param>
/// <returns>null when neither error is set; otherwise the formatted error text.</returns>
string IValidation.Validate(HtmlAgilityPack.HtmlDocument document, HtmlStaticizeContext status)
{
    var report = new StringBuilder();

    var generationError = status.GenerationError;
    if (generationError != null)
    {
        report.AppendFormat("生成HTML期间发生错误:{0}\r\n{1}\r\n", generationError.Message, generationError.ToString());
    }

    var loadError = status.DocumentLoadError;
    if (loadError != null)
    {
        report.AppendFormat("加载HTML文档树期间发生错误:{0}\r\n{1}\r\n", loadError.Message, loadError.ToString());
    }

    if (report.Length == 0)
    {
        return null;
    }

    return report.ToString();
}
/// <summary>
/// Checks that each configured element id exists in the document and sits at its expected XPath.
/// </summary>
/// <param name="document">The parsed HTML document to inspect.</param>
/// <param name="status">The context of the page; not read by this validation.</param>
/// <returns>null when every element is found at its expected XPath; otherwise the accumulated error text.</returns>
public string Validate(HtmlAgilityPack.HtmlDocument document, HtmlStaticizeContext status)
{
    var report = new StringBuilder();

    foreach (var id in this.elementXPath.Keys)
    {
        String expectedXPath = elementXPath[id];
        var element = document.GetElementbyId(id);

        if (element == null)
        {
            report.AppendFormat("\r\n元素 \"{0}\" 在文档中不存在。", id);
        }
        else if (element.XPath != expectedXPath)
        {
            // Include the line number and source HTML so the mismatch can be located.
            report.AppendFormat(
                "\r\n元素 \"{0}\" XPath 不匹配,应为\"{1}\",但实际为\"{2}\"。\r\n行号:{3}\r\n源HTML:\r\n{4}\r\n",
                id,
                expectedXPath,
                element.XPath,
                element.Line.ToString(),
                element.OuterHtml);
        }
    }

    if (report.Length == 0)
    {
        return null;
    }

    return report.ToString();
}
/// <summary>
/// Calls the base implementation, then records the parsed URIs as JavaScript references of the page.
/// </summary>
/// <param name="resourceUris">The parsed resource URIs.</param>
/// <param name="context">The context of the page; the URIs are appended to its Resources.ReferenceJavascripts.</param>
protected override void OnResourceParsed(Uri[] resourceUris, HtmlStaticizeContext context)
{
    base.OnResourceParsed(resourceUris, context);

    var javascriptReferences = context.Resources.ReferenceJavascripts;
    javascriptReferences.AddRange(resourceUris);
}
/// <summary>
/// Staticizes the given pages: downloads each page to its local file, then runs the
/// registered behaviors and validations against every downloaded document.
/// </summary>
/// <param name="pages">
/// The pages to staticize. Key is the absolute URL of the page, Value is the local path the page
/// is saved to. Both URLs and paths must be unique.
/// </param>
/// <param name="stepTaken">
/// The status object updated while the work progresses; it lets other threads observe the
/// current step and counters.
/// </param>
/// <returns>The same instance that was passed in as <paramref name="stepTaken"/>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="pages"/> or <paramref name="stepTaken"/> is null.</exception>
public StaticizeStepStatus Staticize(IEnumerable<KeyValuePair<Uri, String>> pages, StaticizeStepStatus stepTaken)
{
    if (pages == null)
    {
        throw new ArgumentNullException("pages");
    }
    if (stepTaken == null)
    {
        throw new ArgumentNullException("stepTaken");
    }

    #region Initialization
    stepTaken.Step = StaticizeStep.Initialize;

    // Materialize the sequence once. Counting and then enumerating it again would run a
    // deferred query twice and could overflow the array if the second pass yielded more items.
    KeyValuePair<Uri, String>[] pageArray = pages.ToArray();
    int pageCount = pageArray.Length;
    stepTaken.pageCount = pageCount;

    // One context object per page.
    HtmlStaticizeContext[] entries = new HtmlStaticizeContext[pageCount];
    for (int i = 0; i < pageCount; i++)
    {
        entries[i] = new HtmlStaticizeContext
        {
            uri = pageArray[i].Key,
            fileName = pageArray[i].Value,
        };
    }

    stepTaken.Init(entries);
    AddValidation(GenerationSuccessfulValidation.Instance);
    #endregion

    // Generate the HTML files.
    stepTaken.Step = StaticizeStep.GenerationHtml;
    Generate(entries, stepTaken);
    stepTaken.Step = StaticizeStep.GenerationHtmlCompleted;

    #region Validation
    stepTaken.Step = StaticizeStep.Validation;
    if (
        (m_Behaviors != null && m_Behaviors.Count > 0)
        || (m_Validations != null && m_Validations.Count > 0)
        )
    {
        for (int j = 0; j < entries.Length; j++)
        {
            var entry = entries[j];

            // A non-null generationError means the HTML for this page could not be generated.
            if (entry.generationError != null)
            {
                var ex = entry.generationError;
                var vd = new ValidationResult()
                {
                    ValidationType = ValidationType.Tag,
                    Uri = entry.uri,
                    Name = "页面HTML是否成功生成。",
                    Message = string.Format("生成HTML期间发生错误:{0}\r\n{1}\r\n", ex.Message, ex.ToString()),
                    Exception = ex,
                };
                entry.validationResults = new ValidationResult[] { vd };
                stepTaken.ValidationErrors.Add(vd);
                // Use the same counter method as every other branch instead of touching the field directly.
                stepTaken.AddValidatedPageCount();
                continue;
            }

            // Load the document DOM from the downloaded file.
            var doc = new HtmlAgilityPack.HtmlDocument();
            try
            {
                doc.Load(entry.fileName, System.Text.Encoding.UTF8);
            }
            catch (Exception ex)
            {
                // Loading the document failed: record it as the only validation result of this page.
                entry.DocumentLoadError = ex;
                var vd = new ValidationResult()
                {
                    ValidationType = ValidationType.Tag,
                    Uri = entry.uri,
                    Name = "页面HTML是否成功生成。",
                    Message = string.Format("加载HTML文档树期间发生错误:{0}\r\n{1}\r\n", ex.Message, ex.ToString()),
                    Exception = ex,
                };
                entry.validationResults = new ValidationResult[] { vd };
                stepTaken.ValidationErrors.Add(vd);
                stepTaken.AddValidatedPageCount();
                continue;
            }

            // Behaviors run before validations.
            if (m_Behaviors != null && m_Behaviors.Count > 0)
            {
                for (int k = 0; k < m_Behaviors.Count; k++)
                {
                    m_Behaviors[k].Process(doc, entry);
                }
            }

            if (m_Validations != null && m_Validations.Count > 0)
            {
                Validate(doc, entry, stepTaken);
            }

            stepTaken.AddValidatedPageCount();
        }
    }
    stepTaken.Step = StaticizeStep.ValidationCompleted;
    #endregion

    // Copy the errors collected on each page context into the overall status.
    {
        var all = stepTaken.Errors;
        for (int i = 0; i < entries.Length; i++)
        {
            var items = entries[i].Errors;
            if (items != null && items.Count > 0)
            {
                all.AddRange(items);
            }
        }
    }

    stepTaken.Step = StaticizeStep.Completed;
    return stepTaken;
}
/// <summary>
/// Runs the registered validations against one page and records any failures.
/// </summary>
/// <param name="doc">The parsed HTML document of the page.</param>
/// <param name="context">The context of the page; failures are stored in its validationResults.</param>
/// <param name="stepTaken">The overall status; failures are also appended to its ValidationErrors.</param>
void Validate(HtmlAgilityPack.HtmlDocument doc, HtmlStaticizeContext context, StaticizeStepStatus stepTaken)
{
    if (this.m_Validations == null)
    {
        return;
    }

    var failures = m_Validations.Validate(doc, context);
    if (failures == null || failures.Count == 0)
    {
        return;
    }

    // Keep results the context already holds; otherwise adopt the new list as-is.
    if (context.validationResults != null)
    {
        context.validationResults.AddRange(failures);
    }
    else
    {
        context.validationResults = failures;
    }

    stepTaken.ValidationErrors.AddRange(failures);
}
/// <summary>
/// Downloads every page to its local file, processing the entries in parallel.
/// </summary>
/// <param name="entries">The page contexts; each supplies the source uri and the target fileName.</param>
/// <param name="step">The overall status; its generated-page counter is advanced after each successful download.</param>
void Generate(HtmlStaticizeContext[] entries, StaticizeStepStatus step)
{
    System.Threading.Tasks.Parallel.ForEach(entries, page =>
    {
        using (var client = new WebClient())
        {
            try
            {
                client.DownloadFile(page.uri, page.fileName);
                step.AddGeneratedPageCount();
            }
            catch (Exception downloadError)
            {
                // A failed download is recorded on the page's own context and does not stop the other pages.
                page.generationError = downloadError;
                page.Errors.Add(downloadError);
            }
        }
    });
}
/// <summary>
/// Finds the resources referenced by the document and saves each of them under the output
/// directory, either by copying from a local source or by downloading.
/// </summary>
/// <param name="document">The parsed HTML document whose resource nodes are inspected.</param>
/// <param name="context">
/// The context of the page. Its Uri is used to resolve resource URLs, and download failures
/// are appended to its Errors.
/// </param>
public void Process(HtmlAgilityPack.HtmlDocument document, HtmlStaticizeContext context)
{
    Uri documentUri = context.Uri;
    String baseUrl = documentUri.GetParent();

    var nodes = document.DocumentNode.SelectNodes(resourcesNodeSelectPath);
    if (nodes == null || nodes.Count == 0)
    {
        return;
    }

    var srcAttributes = GetSrcAttributes(nodes);
    if (srcAttributes == null || srcAttributes.Count() == 0)
    {
        return;
    }

    var parsedSrcUris = ParseResourcesUris(documentUri, baseUrl, srcAttributes);
    if (parsedSrcUris == null || parsedSrcUris.Length == 0)
    {
        return;
    }

    // Let subclasses see the resolved URIs before anything is saved.
    OnResourceParsed(parsedSrcUris, context);

    for (int i = 0; i < parsedSrcUris.Length; i++)
    {
        var uri = parsedSrcUris[i];
        String localPath = fileReslover.ResloveLocalPath(uri);
        if (String.IsNullOrEmpty(localPath))
        {
            continue;
        }

        // Make sure the target directory exists; created directories are remembered in "directories".
        var localDirectory = System.IO.Path.Combine(outputDirectory, System.IO.Path.GetDirectoryName(localPath));
        if (!directories.ContainsKey(localDirectory) && !System.IO.Directory.Exists(localDirectory))
        {
            System.IO.Directory.CreateDirectory(localDirectory);
            // Concurrency patch: a failure to record the directory is ignored.
            try
            {
                directories.Add(localDirectory, null);
            }
            catch (Exception)
            {
            }
        }

        String saveFile = System.IO.Path.Combine(outputDirectory, localPath);
        // Concurrency patch: a failure to record the file is ignored.
        try
        {
            // Skip files that were already handled, to avoid concurrent writes to the same file.
            if (files.ContainsKey(saveFile))
            {
                continue;
            }
            files.Add(saveFile, null);
        }
        catch (Exception)
        {
        }

        if (System.IO.File.Exists(saveFile))
        {
            continue;
        }

        if (this.fileReslover.TryCopyFromLocal(uri, saveFile))
        {
            continue;
        }

        using (System.Net.WebClient wc = new System.Net.WebClient())
        {
            try
            {
                wc.DownloadFile(uri, saveFile);
            }
            catch (Exception ex)
            {
                // Record the download failure first so it is never lost.
                context.Errors.Add(new ResourcesDownloadException(String.Format(@"下载资源 ""{0}"" 时发生异常。", uri.ToString()), ex)
                {
                    Url = uri,
                });

                // WebClient may leave an empty local file behind when the download fails; remove it.
                // The cleanup is guarded so that a failing delete cannot abort the remaining resources.
                try
                {
                    System.IO.File.Delete(saveFile);
                }
                catch (Exception cleanupError)
                {
                    context.Errors.Add(cleanupError);
                }
            }
        }
    }
}
/// <summary>
/// Callback invoked once the resource URLs have been parsed successfully, just before they are downloaded.
/// The base implementation does nothing.
/// </summary>
/// <param name="resourceUris">The parsed resource URLs.</param>
/// <param name="context">The context of the page the resources belong to.</param>
protected virtual void OnResourceParsed(Uri[] resourceUris, HtmlStaticizeContext context) { }