网页静态化时的上下文信息。

提供对爬行网页过程中状态信息存储的支持。

提供错误信息列表。

 /// <summary>
 /// Runs every validation in the sequence against one HTML document and collects the failures.
 /// </summary>
 /// <param name="validations">The validations to run. When null, the method returns null.</param>
 /// <param name="document">The parsed HTML document to check.</param>
 /// <param name="context">Context of the page being checked; its Uri is copied into each result.</param>
 /// <returns>
 /// One <see cref="ValidationResult"/> for each validation that produced a non-empty message
 /// (an empty list when none did), or null when <paramref name="validations"/> is null.
 /// </returns>
 /// <exception cref="ArgumentNullException">document or context is null.</exception>
 public static IList<ValidationResult> Validate(this IEnumerable<IValidation> validations, HtmlAgilityPack.HtmlDocument document, HtmlStaticizeContext context)
 {
     // Argument checks run before the null-sequence shortcut, so a null document/context
     // is reported even when there is nothing to validate.
     if (document == null)
     {
         throw new ArgumentNullException("document");
     }
     if (context == null)
     {
         throw new ArgumentNullException("context");
     }
     if (validations == null)
     {
         return null;
     }

     var failures = new List<ValidationResult>();
     foreach (var validation in validations)
     {
         var message = validation.Validate(document, context);
         if (string.IsNullOrEmpty(message))
         {
             // A null or empty message means this validation passed.
             continue;
         }

         var failure = new ValidationResult
         {
             Uri = context.Uri,
             ValidationType = validation.Type,
             Name = validation.Name,
             Message = message,
         };
         failures.Add(failure);
     }
     return failures;
 }
        /// <summary>
        /// Checks that every local link (&lt;a href&gt;) in the document resolves to a file that exists on disk.
        /// </summary>
        /// <param name="document">The parsed HTML document whose anchors are checked.</param>
        /// <param name="status">Context of the page; its Uri supplies the directory used to resolve page-relative links.</param>
        /// <returns>null when no broken link was found; otherwise the concatenated error messages.</returns>
        /// <exception cref="ArgumentNullException">document or status is null.</exception>
        public string Validate(HtmlAgilityPack.HtmlDocument document, HtmlStaticizeContext status)
        {
            if (document == null)
            {
                throw new ArgumentNullException("document");
            }
            if (status == null)
            {
                throw new ArgumentNullException("status");
            }

            // Directory of the current page; prepended to links that are relative to the page.
            string documentDir = status.Uri.GetFileDirectory();
            var errorMessage = new StringBuilder();
            var htmlNode = document.DocumentNode;
            var nodes = htmlNode.SelectNodes("//a");
            if (nodes == null || nodes.Count == 0)
            {
                return null;
            }
            foreach (var aNode in nodes)
            {
                string href = aNode.GetAttributeValue("href", null);
                // Anchors without a target and in-page fragments have nothing to check.
                if (string.IsNullOrWhiteSpace(href) || href[0] == '#')
                {
                    continue;
                }
                // The href is relative to the current page, e.g. <a href="1.html" />
                if (!href[0].IsDirectorySeparator())
                {
                    // A link that is already an absolute URL on some host (http://..., mailto:...)
                    // must not be prefixed with the page directory: the prefixed string would start
                    // with the page directory, so the host check below could not recognise the
                    // link's own host and the external link would be looked up as a local file.
                    Uri absolute;
                    if (Uri.TryCreate(href, UriKind.Absolute, out absolute) && !string.IsNullOrEmpty(absolute.Host))
                    {
                        continue;
                    }
                    href = documentDir + href;
                }

                Uri uri;
                if (!Uri.TryCreate(href, UriKind.RelativeOrAbsolute, out uri))
                {
                    continue;
                }
                // A check against the site's own domain names could be added here.
                if (uri.IsAbsoluteUri && !string.IsNullOrEmpty(uri.Host))
                {
                    continue;
                }
                string local = fileReslover.ResloveLocalPath(uri);
                if (string.IsNullOrEmpty(local))
                {
                    continue;
                }
                string localPath = System.IO.Path.Combine(searchDirectory, local);
                bool isExisting;
                if (!files.TryGetValue(localPath, out isExisting))
                {
                    isExisting = System.IO.File.Exists(localPath);
                    try
                    {
                        // Remember the lookup so the same path is not probed on disk again.
                        files.Add(localPath, isExisting);
                    }
                    catch (Exception)
                    {
                        // Best effort: a failed cache insert only costs a repeated disk check later.
                    }
                }
                if (!isExisting)
                {
                    errorMessage.AppendFormat("本地不存在链接 \"{0}\" 所指向的文件 \"{1}\"。", uri.ToString(), localPath);
                }
            }
            return errorMessage.Length == 0 ? null : errorMessage.ToString();
        }
 /// <summary>
 /// Runs the configured document predicate against the document.
 /// </summary>
 /// <param name="document">The HtmlDocument to validate.</param>
 /// <param name="status">Context of the page being validated; not read by this validation.</param>
 /// <returns>null when the predicate accepts the document; otherwise the configured error message.</returns>
 /// <exception cref="ArgumentNullException">document is null.</exception>
 public string Validate(HtmlAgilityPack.HtmlDocument document, HtmlStaticizeContext status)
 {
     if (document == null)
     {
         throw new ArgumentNullException("document");
     }

     bool passed = documentValidation.Invoke(document);
     if (passed)
     {
         return null;
     }
     return errorMessage;
 }
 /// <summary>
 /// Checks that every resource URL in the list resolves to a file that exists under the output directory.
 /// </summary>
 /// <param name="list">Resource URLs to check. When null, the method returns null.</param>
 /// <param name="status">Context of the page; missing resources are recorded in its Resources.NotExistsFiles.</param>
 /// <returns>null when no resource is missing; otherwise the accumulated error text.</returns>
 string Validate(IList<Uri> list, HtmlStaticizeContext status)
 {
     if (list == null)
     {
         return null;
     }
     Boolean isValid = true;
     var errorMessage = new StringBuilder();
     foreach (var uri in list)
     {
         String fileName = fileReslover.ResloveLocalPath(uri);
         if (String.IsNullOrEmpty(fileName))
         {
             // No local path for this URL: nothing to check.
             continue;
         }
         String physicalFilePath = System.IO.Path.Combine(outputDir, fileName);
         bool fileExists = false;
         bool hasKey = false;
         try
         {
             // Guard against concurrent modification of the shared cache.
             hasKey = exisitingFiles.TryGetValue(physicalFilePath, out fileExists);
         }
         catch (Exception)
         {
             // Treated as a cache miss; the disk is consulted below.
         }
         if (hasKey && fileExists)
         {
             // Already known to exist.
             continue;
         }
         if (!hasKey)
         {
             fileExists = System.IO.File.Exists(physicalFilePath);
         }
         if (!fileExists)
         {
             isValid = false;
             status.Resources.NotExistsFiles.Add(uri, physicalFilePath);
             errorMessage.AppendFormat("资源 \"{0}\" 未能在本地预期的路径 \"{1}\" 中找到。\r\n", uri.ToString(), physicalFilePath);
             var ex = status.GenerationError;
             if (ex != null)
             {
                 errorMessage.AppendLine("这可能是由于请求文件时发生异常造成的,以下是异常信息:");
                 errorMessage.AppendFormat("{0}:\r\n{1}\r\n\r\n", ex.Message, ex.ToString());
             }
         }
         if (!hasKey)
         {
             // Only insert on a cache miss. Re-adding a key that is already cached (a known
             // missing file) would throw and rely on the swallowed exception for control flow.
             try
             {
                 // Guard against concurrent modification of the shared cache.
                 exisitingFiles.Add(physicalFilePath, fileExists);
             }
             catch (Exception)
             {
                 // Best effort: a failed insert only costs a repeated disk check later.
             }
         }
     }
     return isValid ? null : errorMessage.ToString();
 }
 /// <summary>
 /// Validates the CSS, JavaScript and image resources referenced by the page.
 /// </summary>
 /// <param name="document">The parsed HTML document; not read by this validation.</param>
 /// <param name="status">Context of the page; its Resources lists supply the URLs to check.</param>
 /// <returns>
 /// null when all three checks produce no message; otherwise the three results
 /// (CSS, JavaScript, image) joined with line breaks.
 /// </returns>
 public string Validate(HtmlAgilityPack.HtmlDocument document, HtmlStaticizeContext status)
 {
     var referenced = status.Resources;
     String cssErrors = Validate(referenced.ReferenceCsses, status);
     String scriptErrors = Validate(referenced.ReferenceJavascripts, status);
     String imageErrors = Validate(referenced.ReferenceImages, status);

     bool anyFailure =
         !String.IsNullOrEmpty(cssErrors) ||
         !String.IsNullOrEmpty(imageErrors) ||
         !String.IsNullOrEmpty(scriptErrors);
     if (!anyFailure)
     {
         return null;
     }
     return String.Format("{0}\r\n{1}\r\n{2}", cssErrors, scriptErrors, imageErrors);
 }
 /// <summary>
 /// Reports the errors recorded on the context while the page was generated or its document was loaded.
 /// </summary>
 /// <param name="document">The parsed HTML document; not read by this validation.</param>
 /// <param name="status">Context whose GenerationError and DocumentLoadError are inspected.</param>
 /// <returns>null when neither error is set; otherwise a description of the recorded error(s).</returns>
 string IValidation.Validate(HtmlAgilityPack.HtmlDocument document, HtmlStaticizeContext status)
 {
     var report = new StringBuilder();

     var generationError = status.GenerationError;
     if (generationError != null)
     {
         report.AppendFormat("生成HTML期间发生错误:{0}\r\n{1}\r\n", generationError.Message, generationError.ToString());
     }

     var loadError = status.DocumentLoadError;
     if (loadError != null)
     {
         report.AppendFormat("加载HTML文档树期间发生错误:{0}\r\n{1}\r\n", loadError.Message, loadError.ToString());
     }

     if (report.Length == 0)
     {
         return null;
     }
     return report.ToString();
 }
Beispiel #7
0
 /// <summary>
 /// Checks that each configured element id exists in the document at its expected XPath.
 /// </summary>
 /// <param name="document">The parsed HTML document to inspect.</param>
 /// <param name="status">Context of the page; not read by this validation.</param>
 /// <returns>null when every element is present at its expected XPath; otherwise the accumulated error text.</returns>
 public string Validate(HtmlAgilityPack.HtmlDocument document, HtmlStaticizeContext status)
 {
     var report = new StringBuilder();
     foreach (var elementId in this.elementXPath.Keys)
     {
         String expectedXPath = elementXPath[elementId];
         var node = document.GetElementbyId(elementId);
         if (node == null)
         {
             report.AppendFormat("\r\n元素 \"{0}\" 在文档中不存在。", elementId);
         }
         else if (node.XPath != expectedXPath)
         {
             report.AppendFormat("\r\n元素 \"{0}\" XPath 不匹配,应为\"{1}\",但实际为\"{2}\"。\r\n行号:{3}\r\n源HTML:\r\n{4}\r\n", elementId, expectedXPath, node.XPath, node.Line.ToString(), node.OuterHtml);
         }
     }

     if (report.Length == 0)
     {
         return null;
     }
     return report.ToString();
 }
 /// <summary>
 /// Runs the base handler, then records the parsed resource URLs as JavaScript references of the page.
 /// </summary>
 /// <param name="resourceUris">The parsed resource URLs.</param>
 /// <param name="context">Context of the page the resources were found in.</param>
 protected override void OnResourceParsed(Uri[] resourceUris, HtmlStaticizeContext context)
 {
     base.OnResourceParsed(resourceUris, context);

     var scriptReferences = context.Resources.ReferenceJavascripts;
     scriptReferences.AddRange(resourceUris);
 }
Beispiel #9
0
        /// <summary>
        /// Staticizes the given pages: downloads each page's HTML to its local file, then runs the
        /// registered behaviors and validations against every page that was generated and loaded successfully.
        /// </summary>
        /// <param name="pages">Pages to staticize. Key is the absolute URL of the page, Value is the local path the page is saved to. Keys and values must be unique.</param>
        /// <param name="stepTaken">Status object updated as the work progresses; it lets other threads read the staticize status.</param>
        /// <returns>The staticize status; the same reference that was passed in as <paramref name="stepTaken"/>.</returns>
        /// <exception cref="ArgumentNullException">pages or stepTaken is null.</exception>
        public StaticizeStepStatus Staticize(IEnumerable<KeyValuePair<Uri, String>> pages, StaticizeStepStatus stepTaken)
        {
            if (pages == null)
            {
                throw new ArgumentNullException("pages");
            }
            if (stepTaken == null)
            {
                throw new ArgumentNullException("stepTaken");
            }
            #region Initialization
            stepTaken.Step = StaticizeStep.Initialize;

            // Enumerate the sequence exactly once. Counting it and then iterating it again could
            // disagree for a lazy or changing sequence, overflowing the array or leaving null slots.
            KeyValuePair<Uri, String>[] pageList = pages.ToArray();
            int pageCount = pageList.Length;
            stepTaken.pageCount = pageCount;
            // Create the Context objects, one per page.
            HtmlStaticizeContext[] entries = new HtmlStaticizeContext[pageCount];
            for (int i = 0; i < pageCount; i++)
            {
                var address = pageList[i];
                entries[i] = new HtmlStaticizeContext
                {
                    uri = address.Key,
                    fileName = address.Value,
                };
            }
            stepTaken.Init(entries);

            AddValidation(GenerationSuccessfulValidation.Instance);

            #endregion

            stepTaken.Step = StaticizeStep.GenerationHtml;

            // Generate the HTML files.
            Generate(entries, stepTaken);
            stepTaken.Step = StaticizeStep.GenerationHtmlCompleted;

            #region Validation

            stepTaken.Step = StaticizeStep.Validation;
            if (
                (m_Behaviors != null && m_Behaviors.Count > 0)
                || (m_Validations != null && m_Validations.Count > 0)
                )
            {
                for (int j = 0; j < entries.Length; j++)
                {
                    var entry = entries[j];
                    // A non-null generationError means the HTML for this page could not be generated.
                    if (entry.generationError != null)
                    {
                        var ex = entry.generationError;
                        var vd = new ValidationResult()
                        {
                            ValidationType = ValidationType.Tag,
                            Uri = entry.uri,
                            Name = "页面HTML是否成功生成。",
                            Message = string.Format("生成HTML期间发生错误:{0}\r\n{1}\r\n", ex.Message, ex.ToString()),
                            Exception = ex,
                        };
                        entry.validationResults = new ValidationResult[] { vd };
                        stepTaken.ValidationErrors.Add(vd);
                        // NOTE(review): the other paths call AddValidatedPageCount(); confirm whether
                        // this direct increment is intentional.
                        stepTaken.validatedPageCount++;
                        continue;
                    }
                    // Load the document DOM from the generated file.
                    var doc = new HtmlAgilityPack.HtmlDocument();
                    try
                    {
                        doc.Load(entry.fileName, System.Text.Encoding.UTF8);
                    }
                    catch (Exception ex)
                    {
                        // Loading the document failed; record it as the page's only validation result.
                        entry.DocumentLoadError = ex;
                        var vd = new ValidationResult()
                        {
                            ValidationType = ValidationType.Tag,
                            Uri = entry.uri,
                            Name = "页面HTML是否成功生成。",
                            Message = string.Format("加载HTML文档树期间发生错误:{0}\r\n{1}\r\n", ex.Message, ex.ToString()),
                            Exception = ex,
                        };
                        entry.validationResults = new ValidationResult[] { vd };
                        stepTaken.ValidationErrors.Add(vd);
                        stepTaken.AddValidatedPageCount();
                        continue;
                    }

                    // Behaviors run before validations.
                    if (m_Behaviors != null && m_Behaviors.Count > 0)
                    {
                        for (int k = 0; k < m_Behaviors.Count; k++)
                        {
                            m_Behaviors[k].Process(doc, entry);
                        }
                    }
                    if (m_Validations != null && m_Validations.Count > 0)
                    {
                        Validate(doc, entry, stepTaken);
                    }
                    stepTaken.AddValidatedPageCount();
                }
            }
            stepTaken.Step = StaticizeStep.ValidationCompleted;

            #endregion

            // Copy the errors collected on each page context into the overall status.
            {
                var all = stepTaken.Errors;
                for (int i = 0; i < entries.Length; i++)
                {
                    var items = entries[i].Errors;
                    if (items != null && items.Count > 0)
                    {
                        all.AddRange(items);
                    }
                }
            }

            stepTaken.Step = StaticizeStep.Completed;
            return stepTaken;
        }
Beispiel #10
0
 /// <summary>
 /// Runs the registered validations against one page and stores any failures on both the
 /// page context and the overall status.
 /// </summary>
 /// <param name="doc">The parsed HTML document of the page.</param>
 /// <param name="context">Context of the page; failures are stored in (or appended to) its validationResults.</param>
 /// <param name="stepTaken">Overall status; failures are appended to its ValidationErrors.</param>
 void Validate(HtmlAgilityPack.HtmlDocument doc, HtmlStaticizeContext context, StaticizeStepStatus stepTaken)
 {
     if (this.m_Validations == null)
     {
         return;
     }

     var failures = m_Validations.Validate(doc, context);
     if (failures == null || failures.Count == 0)
     {
         return;
     }

     if (context.validationResults != null)
     {
         context.validationResults.AddRange(failures);
     }
     else
     {
         context.validationResults = failures;
     }
     stepTaken.ValidationErrors.AddRange(failures);
 }
Beispiel #11
0
 /// <summary>
 /// Downloads every page to its local file, processing the entries with Parallel.ForEach.
 /// </summary>
 /// <param name="entries">Page contexts; each supplies the URL to fetch and the file to write.</param>
 /// <param name="step">Overall status; its generated-page counter is advanced after each successful download.</param>
 void Generate(HtmlStaticizeContext[] entries, StaticizeStepStatus step)
 {
     Action<HtmlStaticizeContext> download = page =>
     {
         using (var client = new WebClient())
         {
             try
             {
                 client.DownloadFile(page.uri, page.fileName);
                 step.AddGeneratedPageCount();
             }
             catch (Exception failure)
             {
                 // A failed download is recorded on the page itself, not on the shared status.
                 page.generationError = failure;
                 page.Errors.Add(failure);
             }
         }
     };

     System.Threading.Tasks.Parallel.ForEach(entries, download);
 }
        /// <summary>
        /// Finds the resources referenced by the document and makes sure each one has a local copy
        /// under the output directory, copying it locally or downloading it when it is missing.
        /// </summary>
        /// <param name="document">The parsed HTML document to scan for resource references.</param>
        /// <param name="context">Context of the page; download failures are appended to its Errors.</param>
        public void Process(HtmlAgilityPack.HtmlDocument document, HtmlStaticizeContext context)
        {
            Uri documentUri = context.Uri;
            String baseUrl = documentUri.GetParent();

            var htmlNode = document.DocumentNode;
            var nodes = htmlNode.SelectNodes(resourcesNodeSelectPath);
            if (nodes == null || nodes.Count == 0)
            {
                return;
            }
            var srcAttributes = GetSrcAttributes(nodes);
            if (srcAttributes == null || srcAttributes.Count() == 0)
            {
                return;
            }
            var parsedSrcUris = ParseResourcesUris(documentUri, baseUrl, srcAttributes);
            if (parsedSrcUris == null || parsedSrcUris.Length == 0)
            {
                return;
            }

            // Let subclasses see the parsed URLs before anything is fetched.
            OnResourceParsed(parsedSrcUris, context);

            for (int i = 0; i < parsedSrcUris.Length; i++)
            {
                var uri = parsedSrcUris[i];
                String localPath = fileReslover.ResloveLocalPath(uri);
                if (String.IsNullOrEmpty(localPath))
                {
                    continue;
                }
                // GetDirectoryName returns null for a root path, and Path.Combine rejects null;
                // fall back to the output directory itself in that case.
                String relativeDirectory = System.IO.Path.GetDirectoryName(localPath) ?? String.Empty;
                var localDirectory = System.IO.Path.Combine(outputDirectory, relativeDirectory);
                if (!directories.ContainsKey(localDirectory) && !System.IO.Directory.Exists(localDirectory))
                {
                    System.IO.Directory.CreateDirectory(localDirectory);
                    // Guard against concurrent modification of the shared cache.
                    try
                    {
                        directories.Add(localDirectory, null);
                    }
                    catch (Exception)
                    {
                        // Best effort: the directory already exists on disk at this point.
                    }
                }

                String saveFile = System.IO.Path.Combine(outputDirectory, localPath);
                // Guard against concurrent modification of the shared cache.
                try
                {
                    // The same file was already handled: skip it to avoid writing it concurrently.
                    if (files.ContainsKey(saveFile))
                    {
                        continue;
                    }
                    files.Add(saveFile, null);
                }
                catch (Exception)
                {
                    // Best effort: fall through to the on-disk existence check below.
                }
                if (System.IO.File.Exists(saveFile))
                {
                    continue;
                }
                if (this.fileReslover.TryCopyFromLocal(uri, saveFile))
                {
                    continue;
                }
                using (System.Net.WebClient wc = new System.Net.WebClient())
                {
                    try
                    {
                        wc.DownloadFile(uri, saveFile);
                    }
                    catch (Exception ex)
                    {
                        // WebClient can leave an empty file behind when the download fails; remove it.
                        // The cleanup is itself guarded so that a failed delete neither hides the
                        // download error nor aborts the remaining resources of this page.
                        try
                        {
                            System.IO.File.Delete(saveFile);
                        }
                        catch (System.IO.IOException)
                        {
                            // The file is in use or otherwise undeletable; the download error below still gets recorded.
                        }
                        catch (UnauthorizedAccessException)
                        {
                            // No permission to delete; the download error below still gets recorded.
                        }
                        context.Errors.Add(new ResourcesDownloadException(String.Format(@"下载资源 ""{0}"" 时发生异常。", uri.ToString()), ex)
                        {
                            Url = uri,
                        });
                    }
                }
            }
        }
 /// <summary>
 /// Callback invoked once resource URLs have been parsed successfully, just before they are downloaded.
 /// This implementation does nothing.
 /// </summary>
 /// <param name="resourceUris">The parsed resource URLs.</param>
 /// <param name="context">Context of the page the resources were found in.</param>
 protected virtual void OnResourceParsed(Uri[] resourceUris, HtmlStaticizeContext context)
 {
 }