Beispiel #1
0
        /// <summary>
        /// Derives the path expression for a set of content pages. The pages are analysed,
        /// the longest expression found is selected, and a trailing text-node step
        /// (a final segment starting with '#') is stripped from it.
        /// </summary>
        /// <param name="IndexPageUrl">Index (list) page the content pages belong to.</param>
        /// <param name="ContentPageUrls">Content pages to sample for the analysis.</param>
        /// <returns>
        /// The selected expression, or the result of <c>GetValidaXpath</c> when the selected
        /// expression contains '&amp;' or '"'. An empty string when the analysis produced no results.
        /// </returns>
        public string GetPathExpression(ListPageContentUrl IndexPageUrl, System.Collections.Generic.IEnumerable <ListPageContentUrl> ContentPageUrls)
        {
            // Analysis results for the sampled pages.
            List<XMLDocuentAnalyseEntity> xdoms = ParsPageUrlsToXDocuments(IndexPageUrl, ContentPageUrls);

            // No results means no groups; bail out before building the grouping query.
            if (xdoms.Count == 0)
            {
                return "";
            }

            // Group by expression and take the longest key. OrderBy is stable, so among
            // keys of equal length the one encountered last wins.
            string express = xdoms.GroupBy(p => p.Expression)
                                  .OrderBy(g => g.Key.Length)
                                  .Last()
                                  .Key;

            if (express.Length > 0)
            {
                List<string> steps = express.Split('/').ToList();
                int lastStep = steps.Count - 1;

                // A final step starting with '#' is a text node; drop it.
                if (steps[lastStep].StartsWith("#", System.StringComparison.Ordinal))
                {
                    // RemoveAt, not Remove(value): Remove deletes the FIRST equal segment,
                    // which is the wrong one when the same step name occurs earlier in the path.
                    steps.RemoveAt(lastStep);
                    express = string.Join("/", steps.ToArray());
                }
            }

            // An expression containing '&' or '"' is treated as invalid.
            bool isValid = !(express.Contains("&") || express.Contains("\""));

            if (isValid)
            {
                // Only remember the expression when it was computed for this instance's own index page.
                if (IndexPageUrl.Url.ToString() == this.IndexPageUrl.Url.ToString())
                {
                    PathExpression = express;
                }

                return express;
            }

            // Fall back to GetValidaXpath over every analysed expression, timing the call for diagnostics.
            // NOTE(review): unlike the valid branch, this stores the result in PathExpression
            // regardless of which index page was passed in — confirm that is intended.
            System.Diagnostics.Stopwatch stopwatch = System.Diagnostics.Stopwatch.StartNew();
            PathExpression = express = GetValidaXpath(xdoms.Select(p => new ExpressionEntity
            {
                Expression = p.Expression,
            }));
            stopwatch.Stop();

            System.Diagnostics.Debug.WriteLine("运行  GetValidaXpath 时间为" + stopwatch.ElapsedMilliseconds);

            return PathExpression;
        }
Beispiel #2
0
        /// <summary>
        /// Runs <c>FillContentUrl</c> in parallel for at most the first ten content pages
        /// and collects the entities of the tasks that finished successfully.
        /// </summary>
        /// <param name="indexPageUrl">Index (list) page, passed through to <c>FillContentUrl</c>.</param>
        /// <param name="ContentPageUrls">Content pages; only the first ten are analysed.</param>
        /// <returns>
        /// One entity per page that was processed without error. Pages whose task failed are
        /// reported through <c>UDPGroup.SendStrGB2312</c> and left out of the list.
        /// </returns>
        List <XMLDocuentAnalyseEntity> ParsPageUrlsToXDocuments(ListPageContentUrl indexPageUrl, System.Collections.Generic.IEnumerable <ListPageContentUrl> ContentPageUrls)
        {
            List<XMLDocuentAnalyseEntity> result = new List<XMLDocuentAnalyseEntity>();

            // Sample at most ten pages.
            List<ListPageContentUrl> analyseUrls = ContentPageUrls.Take(10).ToList();

            // One task per sampled page; the page is handed over as the task state object.
            System.Threading.Tasks.Task[] tasks = new System.Threading.Tasks.Task<XMLDocuentAnalyseEntity>[analyseUrls.Count];
            for (int k = 0; k < tasks.Length; k++)
            {
                tasks[k] = System.Threading.Tasks.Task.Factory.StartNew<XMLDocuentAnalyseEntity>((url) =>
                {
                    return FillContentUrl(indexPageUrl, (ListPageContentUrl)url);
                }, analyseUrls[k]);
            }

            try
            {
                // WaitAll returns (or throws) only after every task has reached a final state.
                System.Threading.Tasks.Task.WaitAll(tasks);
            }
            catch (System.AggregateException)
            {
                // WaitAll rethrows the failures of all tasks combined. They are inspected
                // and logged per task below, so one bad page must not abort the whole batch.
            }

            foreach (System.Threading.Tasks.Task task in tasks)
            {
                if (task.Status == System.Threading.Tasks.TaskStatus.RanToCompletion)
                {
                    // Only a successfully finished task has a Result that can be read without throwing.
                    result.Add(((System.Threading.Tasks.Task<XMLDocuentAnalyseEntity>)task).Result);
                }
                else if (task.Exception != null)
                {
                    // Report every inner exception and mark it as handled.
                    task.Exception.Handle((ex) =>
                    {
                        System.Diagnostics.UDPGroup.SendStrGB2312(ex.Message + "|||||" + ex.StackTrace);
                        return true;
                    });
                }

                // Release the task's resources once it is in a final state.
                if (task.IsCompleted)
                {
                    task.Dispose();
                }
            }

            analyseUrls.Clear();

            return result;
        }
Beispiel #3
0
        /// <summary>
        /// Derives the path expression for a set of content pages. The pages are analysed,
        /// the longest expression found is selected, and a trailing text-node step
        /// (a final segment starting with '#') is stripped from it.
        /// </summary>
        /// <param name="IndexPageUrl">Index (list) page the content pages belong to.</param>
        /// <param name="ContentPageUrls">Content pages to sample for the analysis.</param>
        /// <returns>The selected expression, or an empty string when the analysis produced no results.</returns>
        public string GetPathExpression(ListPageContentUrl IndexPageUrl, System.Collections.Generic.IEnumerable <ListPageContentUrl> ContentPageUrls)
        {
            // Analysis results for the sampled pages.
            List<XMLDocuentAnalyseEntity> xdoms = ParsPageUrlsToXDocuments(IndexPageUrl, ContentPageUrls);

            // No results means no groups; bail out before building the grouping query.
            if (xdoms.Count == 0)
            {
                return "";
            }

            // Group by expression and take the longest key. OrderBy is stable, so among
            // keys of equal length the one encountered last wins.
            string express = xdoms.GroupBy(p => p.Expression)
                                  .OrderBy(g => g.Key.Length)
                                  .Last()
                                  .Key;

            if (express.Length > 0)
            {
                List<string> steps = express.Split('/').ToList();
                int lastStep = steps.Count - 1;

                // A final step starting with '#' is a text node; drop it.
                if (steps[lastStep].StartsWith("#", System.StringComparison.Ordinal))
                {
                    // RemoveAt, not Remove(value): Remove deletes the FIRST equal segment,
                    // which is the wrong one when the same step name occurs earlier in the path.
                    steps.RemoveAt(lastStep);
                    express = string.Join("/", steps.ToArray());
                }
            }

            // Only remember the expression when it was computed for this instance's own index page.
            if (IndexPageUrl.Url.ToString() == this.IndexPageUrl.Url.ToString())
            {
                PathExpression = express;
            }

            return express;
        }
Beispiel #4
0
        /// <summary>
        /// Downloads one content page, runs its HTML through two parsing passes and wraps the
        /// outcome in an <c>XMLDocuentAnalyseEntity</c>, on which <c>GetMainContentExpression</c>
        /// is then invoked.
        /// </summary>
        /// <param name="indexPageUrl">Stored unchanged in the entity's <c>IndexPageUrl</c>.</param>
        /// <param name="ContentPageUrl">Page to download; its <c>Url</c> and <c>Title</c> are read.</param>
        /// <returns>The populated entity.</returns>
        XMLDocuentAnalyseEntity FillContentUrl(ListPageContentUrl indexPageUrl, ListPageContentUrl ContentPageUrl)
        {
            string address = ContentPageUrl.Url.ToString();

            // Tags handed to FiltrateHTML in the second parsing pass.
            string[] filterTags = { "p", "span", "strong", "font", "h1", "tbody", "o:p", "dd", "tr", "table" };

            // The Uri instance is never read afterwards; constructing it still rejects
            // a malformed address before the download starts.
            Uri baseUri = new Uri(address);

            // First pass: parse the downloaded markup and re-serialise it.
            string markup = address.GetWeb();
            HtmlDocument dom = new HtmlDocument();
            dom.LoadHtml(markup);
            markup = dom.DocumentNode.OuterHtml;

            // Second pass: reload the filtered markup (per the original comment, to close some tags).
            dom.LoadHtml(markup.FiltrateHTML(filterTags));

            // NOTE(review): this local is never assigned anything but "", so the success
            // flag computed from it is always false, both before and after
            // GetMainContentExpression. Possibly the entity's Content was meant — confirm.
            string mainText = "";
            bool longEnough = mainText.Length > 50;

            XMLDocuentAnalyseEntity page = new XMLDocuentAnalyseEntity();
            page.BaseUrlObject = ContentPageUrl;
            page.HtmlDocument = dom;
            // NOTE(review): this is the re-serialised markup, not the raw download — confirm intended.
            page.OriginContent = markup;
            page.PageTitle = ContentPageUrl.Title;
            page.Content = mainText;
            page.IsGetContentSuccess = longEnough;
            page.IndexPageUrl = indexPageUrl;
            page.XmlParseConten = "<data>" + markup + "</data>";
            page.Expression = "";

            // Let the analysis step work on the entity.
            GetMainContentExpression(page);

            // Reset the flag after the analysis step, exactly as computed above.
            page.IsGetContentSuccess = longEnough;

            return page;
        }
Beispiel #5
0
 /// <summary>
 /// Not implemented here: every call throws <see cref="NotImplementedException"/>.
 /// </summary>
 /// <param name="IndexPageUrl">Unused.</param>
 /// <param name="ContentPageUrls">Unused.</param>
 /// <returns>Never returns.</returns>
 /// <exception cref="NotImplementedException">Always thrown.</exception>
 public string GetPathExpression(ListPageContentUrl IndexPageUrl, IEnumerable <ListPageContentUrl> ContentPageUrls)
 {
     throw new NotImplementedException();
 }
Beispiel #6
0
 /// <summary>
 /// List pages do not support this method: every call throws
 /// <see cref="NotImplementedException"/>.
 /// </summary>
 /// <param name="IndexPageUrl">Unused.</param>
 /// <param name="ContentPageUrls">Unused.</param>
 /// <returns>Never returns.</returns>
 /// <exception cref="NotImplementedException">Always thrown.</exception>
 public string GetPathExpression(ListPageContentUrl IndexPageUrl, System.Collections.Generic.IEnumerable <ListPageContentUrl> ContentPageUrls)
 {
     // The message says that list pages do not support this method.
     throw new NotImplementedException("列表页面不支持此方法");
 }
Beispiel #7
0
 /// <summary>
 /// Gets the indexes used when several chapter contents under one title are merged,
 /// by passing the item's <c>index</c> member on to <c>GetTitleIndexs</c>
 /// (presumably an overload that takes the index directly — not visible here).
 /// </summary>
 /// <param name="Item">Entry whose <c>index</c> member is forwarded.</param>
 /// <returns>The list produced by the forwarded call.</returns>
 List <int> GetTitleIndexs(ListPageContentUrl Item)
 {
     var itemIndex = Item.index;

     return GetTitleIndexs(itemIndex);
 }
Beispiel #8
0
        /// <summary>
        /// Downloads one content page, runs its HTML through two parsing passes and wraps the
        /// outcome in an <c>XMLDocuentAnalyseEntity</c>, on which <c>GetMainContentExpression</c>
        /// is then invoked.
        /// </summary>
        /// <param name="indexPageUrl">Stored unchanged in the entity's <c>IndexPageUrl</c>.</param>
        /// <param name="ContentPageUrl">Page to download; its <c>Url</c> and <c>Title</c> are read.</param>
        /// <returns>The populated entity.</returns>
        XMLDocuentAnalyseEntity FillContentUrl(ListPageContentUrl indexPageUrl, ListPageContentUrl ContentPageUrl)
        {
            string address = ContentPageUrl.Url.ToString();

            // Tags handed to FiltrateHTML in the second parsing pass.
            string[] filterTags = { "p", "span", "strong", "font", "h1", "tbody", "o:p", "dd", "tr", "table" };

            // The Uri instance is never read afterwards; constructing it still rejects
            // a malformed address before the download starts.
            Uri baseUri = new Uri(address);

            // First pass: parse the downloaded markup.
            string markup = address.GetWeb();
            HtmlDocument dom = new HtmlDocument();
            dom.LoadHtml(markup);

            // Known issue recorded by the original author: some articles trigger an
            // "infinite loop" exception in the two statements below. A handler for
            // OverflowException that returned a partially filled entity used to sit
            // around them and is currently disabled, so such a failure propagates.
            markup = dom.DocumentNode.OuterHtml;
            // Second pass: reload the filtered markup (per the original comment, to close some tags).
            dom.LoadHtml(markup.FiltrateHTML(filterTags));

            // NOTE(review): this local is never assigned anything but "", so the success
            // flag computed from it is always false, both before and after
            // GetMainContentExpression. Possibly the entity's Content was meant — confirm.
            string mainText = "";
            bool longEnough = mainText.Length > 50;

            XMLDocuentAnalyseEntity page = new XMLDocuentAnalyseEntity();
            page.BaseUrlObject = ContentPageUrl;
            page.HtmlDocument = dom;
            // NOTE(review): this is the re-serialised markup, not the raw download — confirm intended.
            page.OriginContent = markup;
            page.PageTitle = ContentPageUrl.Title;
            page.Content = mainText;
            page.IsGetContentSuccess = longEnough;
            page.IndexPageUrl = indexPageUrl;
            page.XmlParseConten = "<data>" + markup + "</data>";
            page.Expression = "";

            // Let the analysis step work on the entity.
            GetMainContentExpression(page);

            // Reset the flag after the analysis step, exactly as computed above.
            page.IsGetContentSuccess = longEnough;

            return page;
        }