/// <summary>
/// Analyses the given content pages and returns the path expression selected from
/// the analysis results, or an empty string when the analysis produced nothing.
/// </summary>
/// <param name="IndexPageUrl">Index (list) page the content pages belong to.</param>
/// <param name="ContentPageUrls">Content pages used for the analysis.</param>
/// <returns>The selected expression, or an empty string when there are no analysis results.</returns>
public string GetPathExpression(ListPageContentUrl IndexPageUrl, System.Collections.Generic.IEnumerable<ListPageContentUrl> ContentPageUrls)
{
    // Run the analysis for every content page.
    List<XMLDocuentAnalyseEntity> xdoms = ParsPageUrlsToXDocuments(IndexPageUrl, ContentPageUrls);

    // Group the results by the expression each page produced.
    var groups = xdoms.GroupBy(p => p.Expression);
    if (!groups.Any())
    {
        return("");
    }

    // Pick the longest expression. OrderBy is stable, so among equally long keys
    // the one encountered last wins.
    string express = groups.OrderBy(p => p.Key.Length).Last().Key;

    if (express.Length > 0)
    {
        var xpathArr = express.Split('/').ToList();
        int lastIndex = xpathArr.Count - 1;

        // Drop a trailing "#..." segment (e.g. a text node). RemoveAt is used instead of
        // Remove(value) because Remove deletes the FIRST equal segment, which corrupts
        // paths that contain the same segment more than once.
        if (xpathArr[lastIndex].StartsWith("#", System.StringComparison.Ordinal))
        {
            xpathArr.RemoveAt(lastIndex);
            express = string.Join("/", xpathArr.ToArray());
        }
    }

    // The expression is treated as valid when it contains neither '&' nor '"'.
    bool isValid = !(express.Contains("&") || express.Contains("\""));
    if (isValid)
    {
        if (IndexPageUrl.Url.ToString() == this.IndexPageUrl.Url.ToString())
        {
            // Same index page as this instance: remember the expression as well.
            return(PathExpression = express);
        }

        return(express);
    }

    // Fallback: let GetValidaXpath choose an expression from all analysed pages and time it.
    // NOTE(review): unlike the valid branch above, this path stores PathExpression
    // without comparing IndexPageUrl to this.IndexPageUrl — confirm that is intended.
    System.Diagnostics.Stopwatch stopwatch = System.Diagnostics.Stopwatch.StartNew();
    PathExpression = express = GetValidaXpath(from p in xdoms
                                              select new ExpressionEntity
                                              {
                                                  Expression = p.Expression,
                                              });
    stopwatch.Stop();
    System.Diagnostics.Debug.WriteLine("运行 GetValidaXpath 时间为" + stopwatch.ElapsedMilliseconds);
    return(PathExpression);
}
/// <summary>
/// Runs <c>FillContentUrl</c> in parallel for up to the first 10 content page URLs
/// and collects the resulting analysis entities.
/// </summary>
/// <param name="indexPageUrl">List (index) page passed through to each analysis.</param>
/// <param name="ContentPageUrls">Content page URLs; only the first 10 are analysed.</param>
/// <returns>
/// The entities of the tasks that ran to completion. Pages whose analysis failed are
/// reported through <c>UDPGroup.SendStrGB2312</c> and left out of the result.
/// </returns>
List<XMLDocuentAnalyseEntity> ParsPageUrlsToXDocuments(ListPageContentUrl indexPageUrl, System.Collections.Generic.IEnumerable<ListPageContentUrl> ContentPageUrls)
{
    List<XMLDocuentAnalyseEntity> result = new List<XMLDocuentAnalyseEntity>();

    // Analyse at most 10 URLs.
    List<ListPageContentUrl> analyseUrls = ContentPageUrls.Take(10).ToList();

    // One task per URL; the URL is handed over as task state so the lambda
    // does not capture the loop variable.
    var tasks = new System.Threading.Tasks.Task<XMLDocuentAnalyseEntity>[analyseUrls.Count];
    for (int k = 0; k < tasks.Length; k++)
    {
        tasks[k] = System.Threading.Tasks.Task.Factory.StartNew<XMLDocuentAnalyseEntity>(
            (url) => FillContentUrl(indexPageUrl, (ListPageContentUrl)url),
            analyseUrls[k]);
    }

    try
    {
        // Wait for all analyses to finish.
        System.Threading.Tasks.Task.WaitAll(tasks);
    }
    catch (System.AggregateException)
    {
        // WaitAll throws as soon as any task faulted or was canceled. Each failure is
        // inspected and reported per task below, so nothing is lost by continuing here.
    }

    foreach (var task in tasks)
    {
        // Only a task that ran to completion has a usable Result; IsCompleted is also
        // true for faulted/canceled tasks, where reading Result would rethrow.
        if (task.Status == System.Threading.Tasks.TaskStatus.RanToCompletion)
        {
            result.Add(task.Result);
        }

        // Report every failure and mark it as observed.
        if (task.Exception != null)
        {
            task.Exception.Handle((ex) =>
            {
                System.Diagnostics.UDPGroup.SendStrGB2312(ex.Message + "|||||" + ex.StackTrace);
                return(true);
            });
        }

        // Release the task once it has reached a final state.
        if (task.Status == System.Threading.Tasks.TaskStatus.RanToCompletion ||
            task.Status == System.Threading.Tasks.TaskStatus.Faulted ||
            task.Status == System.Threading.Tasks.TaskStatus.Canceled)
        {
            task.Dispose();
        }
    }

    analyseUrls.Clear();
    return(result);
}
/// <summary>
/// Analyses the given content pages and returns the path expression selected from
/// the analysis results, or an empty string when the analysis produced nothing.
/// </summary>
/// <param name="IndexPageUrl">Index (list) page the content pages belong to.</param>
/// <param name="ContentPageUrls">Content pages used for the analysis.</param>
/// <returns>The selected expression, or an empty string when there are no analysis results.</returns>
public string GetPathExpression(ListPageContentUrl IndexPageUrl, System.Collections.Generic.IEnumerable<ListPageContentUrl> ContentPageUrls)
{
    // Run the analysis for every content page.
    List<XMLDocuentAnalyseEntity> xdoms = ParsPageUrlsToXDocuments(IndexPageUrl, ContentPageUrls);

    // Group the results by the expression each page produced.
    var groups = xdoms.GroupBy(p => p.Expression);
    if (!groups.Any())
    {
        return("");
    }

    // Pick the longest expression. OrderBy is stable, so among equally long keys
    // the one encountered last wins.
    string express = groups.OrderBy(p => p.Key.Length).Last().Key;

    if (express.Length > 0)
    {
        var xpathArr = express.Split('/').ToList();
        int lastIndex = xpathArr.Count - 1;

        // Drop a trailing "#..." segment (e.g. a text node). RemoveAt is used instead of
        // Remove(value) because Remove deletes the FIRST equal segment, which corrupts
        // paths that contain the same segment more than once.
        if (xpathArr[lastIndex].StartsWith("#", System.StringComparison.Ordinal))
        {
            xpathArr.RemoveAt(lastIndex);
            express = string.Join("/", xpathArr.ToArray());
        }
    }

    if (IndexPageUrl.Url.ToString() == this.IndexPageUrl.Url.ToString())
    {
        // Same index page as this instance: remember the expression as well.
        return(PathExpression = express);
    }

    return(express);
}
/// <summary>
/// Downloads a content page, loads it into an <c>HtmlDocument</c> and wraps it in an
/// analysis entity that is then handed to <c>GetMainContentExpression</c>.
/// </summary>
/// <param name="indexPageUrl">List (index) page the content page belongs to.</param>
/// <param name="ContentPageUrl">Content page to download and analyse.</param>
/// <returns>The populated analysis entity.</returns>
XMLDocuentAnalyseEntity FillContentUrl(ListPageContentUrl indexPageUrl, ListPageContentUrl ContentPageUrl)
{
    // Main content placeholder; nothing in this method ever assigns a different value.
    string mainContent = "";

    // Address of the page to read.
    string pageurl = ContentPageUrl.Url.ToString();

    // Constructed from the page address; the instance itself is not used afterwards.
    Uri baseUri = new Uri(pageurl);

    // Element tags handed to FiltrateHTML.
    string[] elementTags = { "p", "span", "strong", "font", "h1", "tbody", "o:p", "dd", "tr", "table" };

    #region Load the page into an HtmlDocument
    // Fetch the raw HTML of the page.
    string rawHtml = pageurl.GetWeb();

    // First pass: parse the raw HTML and take the parser's serialised form.
    HtmlDocument htmlDom = new HtmlDocument();
    htmlDom.LoadHtml(rawHtml);
    string normalizedHtml = htmlDom.DocumentNode.OuterHtml;

    // Second pass: reload the filtered HTML so the listed tags get closed.
    string filteredHtml = normalizedHtml.FiltrateHTML(elementTags);
    htmlDom.LoadHtml(filteredHtml);
    #endregion

    // NOTE(review): mainContent is always empty here, so this flag is always false,
    // both initially and after the analysis below — confirm whether the entity's own
    // content was meant to be checked instead.
    bool contentLoaded = mainContent.Length > 50;

    XMLDocuentAnalyseEntity entity = new XMLDocuentAnalyseEntity()
    {
        BaseUrlObject = ContentPageUrl,
        HtmlDocument = htmlDom,
        // Holds the parser-normalised HTML, not the raw download.
        OriginContent = normalizedHtml,
        PageTitle = ContentPageUrl.Title,
        Content = mainContent,
        IsGetContentSuccess = contentLoaded,
        IndexPageUrl = indexPageUrl,
        XmlParseConten = "<data>" + normalizedHtml + "</data>",
        Expression = "",
    };

    #region Analyse the document content
    GetMainContentExpression(entity);
    #endregion

    // Reset the load state after the analysis.
    entity.IsGetContentSuccess = contentLoaded;

    return(entity);
}
/// <summary>
/// Not implemented in this type; every call throws.
/// </summary>
/// <param name="IndexPageUrl">Unused.</param>
/// <param name="ContentPageUrls">Unused.</param>
/// <returns>Never returns normally.</returns>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public string GetPathExpression(ListPageContentUrl IndexPageUrl, IEnumerable<ListPageContentUrl> ContentPageUrls)
{
    throw new NotImplementedException();
}
/// <summary>
/// List pages do not support this method; every call throws.
/// </summary>
/// <param name="IndexPageUrl">Unused.</param>
/// <param name="ContentPageUrls">Unused.</param>
/// <returns>Never returns normally.</returns>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public string GetPathExpression(ListPageContentUrl IndexPageUrl, System.Collections.Generic.IEnumerable<ListPageContentUrl> ContentPageUrls)
{
    throw new NotImplementedException("列表页面不支持此方法");
}
/// <summary>
/// Gets the indexes used when several chapter contents under one title are merged,
/// by forwarding the item's <c>index</c> to the other <c>GetTitleIndexs</c> overload.
/// </summary>
/// <param name="Item">Item whose <c>index</c> value is looked up.</param>
/// <returns>The list returned by the overload for <c>Item.index</c>.</returns>
List<int> GetTitleIndexs(ListPageContentUrl Item)
{
    var itemIndex = Item.index;
    List<int> titleIndexes = GetTitleIndexs(itemIndex);
    return(titleIndexes);
}
/// <summary>
/// Downloads a content page, loads it into an <c>HtmlDocument</c> and wraps it in an
/// analysis entity that is then handed to <c>GetMainContentExpression</c>.
/// </summary>
/// <param name="indexPageUrl">List (index) page the content page belongs to.</param>
/// <param name="ContentPageUrl">Content page to download and analyse.</param>
/// <returns>The populated analysis entity.</returns>
XMLDocuentAnalyseEntity FillContentUrl(ListPageContentUrl indexPageUrl, ListPageContentUrl ContentPageUrl)
{
    // Main content placeholder; nothing in this method ever assigns a different value.
    string mainContent = "";

    // Address of the page to read.
    string pageurl = ContentPageUrl.Url.ToString();

    // Constructed from the page address; the instance itself is not used afterwards.
    Uri baseUri = new Uri(pageurl);

    // Element tags handed to FiltrateHTML.
    string[] elementTags = { "p", "span", "strong", "font", "h1", "tbody", "o:p", "dd", "tr", "table" };

    #region Load the page into an HtmlDocument
    // Fetch the raw HTML of the page.
    string rawHtml = pageurl.GetWeb();

    HtmlDocument htmlDom = new HtmlDocument();
    htmlDom.LoadHtml(rawHtml);

    // An earlier comment here noted that some articles triggered an endless-loop
    // exception at this point; a try/catch for OverflowException that returned an
    // entity with an empty expression had been written and then disabled.
    string normalizedHtml = htmlDom.DocumentNode.OuterHtml;

    // Reload the filtered HTML so the listed tags get closed.
    string filteredHtml = normalizedHtml.FiltrateHTML(elementTags);
    htmlDom.LoadHtml(filteredHtml);
    #endregion

    // NOTE(review): mainContent is always empty here, so this flag is always false,
    // both initially and after the analysis below — confirm whether the entity's own
    // content was meant to be checked instead.
    bool contentLoaded = mainContent.Length > 50;

    XMLDocuentAnalyseEntity entity = new XMLDocuentAnalyseEntity()
    {
        BaseUrlObject = ContentPageUrl,
        HtmlDocument = htmlDom,
        // Holds the parser-normalised HTML, not the raw download.
        OriginContent = normalizedHtml,
        PageTitle = ContentPageUrl.Title,
        Content = mainContent,
        IsGetContentSuccess = contentLoaded,
        IndexPageUrl = indexPageUrl,
        XmlParseConten = "<data>" + normalizedHtml + "</data>",
        Expression = "",
    };

    #region Analyse the document content
    GetMainContentExpression(entity);
    #endregion

    // Reset the load state after the analysis.
    entity.IsGetContentSuccess = contentLoaded;

    return(entity);
}