protected void Page_Load(object sender, EventArgs e)
    {
        Skybot.Collections.Analyse.SingleListPageAnalyse ListPageAnalyse = new Skybot.Collections.Analyse.SingleListPageAnalyse("https://www.biquge5200.cc/76_76271/");

        //处理url
        Filter114Zw(ListPageAnalyse);

        XMLDocuentAnalyse documentAnalyse = new Skybot.Collections.Analyse.XMLDocuentAnalyse()
        {
            IndexPageUrl = new Skybot.Collections.Analyse.ListPageContentUrl()
            {
                index = 0, Title = "书名", Url = new Uri(ListPageAnalyse.IndexPageUrl)
            }
        };


        //初始化内容分析器
        var regex = documentAnalyse.GetPathExpression(documentAnalyse.IndexPageUrl, ListPageAnalyse.ListPageContentUrls);
    }
Example #2
0
    /// <summary>
    /// 得到文章的内容
    /// </summary>
    /// <param name="o">记录</param>
    /// <param name="entities">数据库操作实体</param>
    private void GetContents(TygModel.书名表 o, TygModel.Entities entities)
    {
        //得到
        Skybot.Collections.Analyse.SingleListPageAnalyse ListPageAnalyse = null;
        Skybot.Collections.Analyse.XMLDocuentAnalyse     documentAnalyse = null;

        //打印状态
        System.Diagnostics.Debug.WriteLine("开始采集:" + o.书名 + " 目录:" + o.采集用的URL1);

        //得到转换后的对应分析器
        List <Task> tasks = new List <Task>();

        tasks.Add(Task.Factory.StartNew(() =>
        {
            ListPageAnalyse = new Skybot.Collections.Analyse.SingleListPageAnalyse(o.采集用的URL1);
            documentAnalyse = new Skybot.Collections.Analyse.XMLDocuentAnalyse()
            {
                IndexPageUrl = new Skybot.Collections.Analyse.ListPageContentUrl()
                {
                    index = 0, Title = o.书名, Url = new Uri(o.采集用的URL1)
                }
            };

            try
            {
                //初始化内容分析器
                documentAnalyse.GetPathExpression(documentAnalyse.IndexPageUrl, ListPageAnalyse.ListPageContentUrls);
            }
            catch (Exception)
            {
            }
        }));


        try
        {
            //等待添加页面列表分析完成 5 分钟没有完成则表示超时
            System.Threading.Tasks.Task.WaitAll(tasks.ToArray(), TimeSpan.FromMinutes(5));
        }
        catch (AggregateException ex)
        {
            ex.Handle((exx) =>
            {
                System.Diagnostics.Debug.WriteLine(exx.Message + "|||||" + exx.StackTrace);
                return(true);
            });
        }


        //如果数据有效
        if (ListPageAnalyse != null && documentAnalyse != null && documentAnalyse.PathExpression != null)
        {
            //得到已经存在的记录
            var docentitys = o.文章表.ToList();
            var docs       = docentitys.Select(p => p.章节名.Trim());

            //打印状态
            System.Diagnostics.Debug.WriteLine("内容共:" + ListPageAnalyse.ListPageContentUrls.Count + "条记录");

            //将章节列表索引转换成为扩展实体数据
            List <UrlExtentEntity> entitys = ListPageAnalyse.ListPageContentUrls.Select(p => new UrlExtentEntity()
            {
                index = p.index, Indexs = p.Indexs, Title = p.Title, Url = p.Url
            }).ToList();

            //开始采集数据
            for (int k = 0; k < entitys.Count; k++)
            {
                //上一条记录
                UrlExtentEntity PreviousItem = null; // entitys[k - 1];
                //当前记录
                UrlExtentEntity CurrentItem = null;  //  entitys[k];
                //下一条记录
                UrlExtentEntity NextItem = null;     //  entitys[k + 1];

                //当前记录
                CurrentItem = entitys[k];
                //看看记录是不是已经存在了 如果存在则不进行更新
                if (!docs.Contains(CurrentItem.Title.Trim()))
                {
                    #region  值

                    //上一条记录
                    if (k - 1 >= 0)
                    {
                        PreviousItem = entitys[k - 1];
                        //如果上一章节已经存在了则就用数据库中的记录
                        var Temprecords = docentitys.Where(p => p.章节名.Trim() == PreviousItem.Title.Trim());
                        if (Temprecords.Count() > 0)
                        {
                            PreviousItem.Token = Temprecords.ElementAt(0).本记录GUID.ToString();
                            //设置本记录ID
                            CurrentItem.Token = Temprecords.ElementAt(0).一章.Value.ToString();
                        }
                    }


                    //下一条记录
                    if (k + 1 < entitys.Count)
                    {
                        NextItem = entitys[k + 1];
                    }
                    #endregion

                    //当前章节的GUID
                    if (CurrentItem.Token.Length < 10)
                    {
                        CurrentItem.Token = Guid.NewGuid().ToString();
                    }

                    //产生下一章节的GUID
                    if (NextItem != null && NextItem.Token.Length < 10)
                    {
                        NextItem.Token = Guid.NewGuid().ToString();

                        //如果上一章节已经存在了则就用数据库中的记录
                        var Temprecords = docentitys.Where(p => p.章节名.Trim() == CurrentItem.Title.Trim());
                        if (Temprecords.Count() > 0)
                        {
                            NextItem.Token = Temprecords.ElementAt(0).一章.Value.ToString();
                        }
                    }
                    //得到内容
                    string content = "";

                    try
                    {
                        content = documentAnalyse.GetContent(CurrentItem.Url.GetWeb());
                    }
                    catch
                    {
                    }
                    //添加到数据库
                    entities.AddTo文章表(new TygModel.文章表()
                    {
                        GUID     = o.GUID,
                        书名       = o.书名,
                        分类标识     = o.分类标识,
                        本记录GUID  = Guid.Parse(CurrentItem.Token),
                        创建时间     = DateTime.Now,
                        分类名称     = o.分类表.分类名称,
                        章节名      = CurrentItem.Title,
                        一章       = PreviousItem == null ? Guid.Empty :Guid.Parse(PreviousItem.Token),
                        一章       = NextItem == null ? Guid.Empty : Guid.Parse(NextItem.Token),
                        最后访问时间   = DateTime.Now,
                        内容       = content,
                        采集用的URL1 = CurrentItem.Url.ToString()
                    });
                    //打印状态
                    System.Diagnostics.Debug.WriteLine("采集内容:" + CurrentItem.Title + "成功,正文长度:" + content.Length);
                }
                else
                {
                    //如果记录的内容比较少则更新内容
                    var Temprecords = docentitys.Where(p => p.章节名.Trim() == CurrentItem.Title.Trim());
                    if (Temprecords.Count() > 0)
                    {
                        //如果内容无效
                        if ( //中文小于6
                            string.Join("", System.Text.RegularExpressions.Regex.Matches(Temprecords.ElementAt(0).内容, @"[\u4e00-\u9fa5\d\w123456789~!!·#¥%……—*()——+/”》“‘’,;。、?,:…《]+[\u4e00-\u9fa5123456789~!!·#¥%……—*(!)——+/”》“‘,’\r\n;。、?,:…《]", System.Text.RegularExpressions.RegexOptions.Multiline)
                                        .Cast <System.Text.RegularExpressions.Match>().Select(p => p.Value).ToArray()
                                        ).Length < 6
                            )
                        {
                            //更新记录
                            var reccords = o.文章表.Where(p => p.章节名.Trim() == CurrentItem.Title.Trim());
                            if (reccords.Count() > 0)
                            {
                                var record = reccords.ElementAt(0);
                                try
                                {
                                    record.采集用的URL1 = CurrentItem.Url.ToString();
                                    record.内容       = documentAnalyse.GetContent(CurrentItem.Url.GetWeb());
                                }
                                catch { }
                                //打印状态
                                System.Diagnostics.Debug.WriteLine("更新记录:" + CurrentItem.Title + "成功,正文长度:" + record.内容.Length);
                            }
                        }
                    }
                }
            }
        }
    }