protected void Page_Load(object sender, EventArgs e)
    {
        Skybot.Collections.Analyse.SingleListPageAnalyse ListPageAnalyse = new Skybot.Collections.Analyse.SingleListPageAnalyse("https://www.biquge5200.cc/76_76271/");

        //处理url
        Filter114Zw(ListPageAnalyse);

        XMLDocuentAnalyse documentAnalyse = new Skybot.Collections.Analyse.XMLDocuentAnalyse()
        {
            IndexPageUrl = new Skybot.Collections.Analyse.ListPageContentUrl()
            {
                index = 0, Title = "书名", Url = new Uri(ListPageAnalyse.IndexPageUrl)
            }
        };


        //初始化内容分析器
        var regex = documentAnalyse.GetPathExpression(documentAnalyse.IndexPageUrl, ListPageAnalyse.ListPageContentUrls);
    }
    /// <summary>
    /// 114 中文网过滤.
    /// 1.去重
    ///     如果开始出现了 后面再出现就删除后面的.
    ///
    /// 2.url ID 排序
    /// 3.包含索引页面的url 才能正确排序  不包含索引URL的直接删除
    /// </summary>
    public void Filter114Zw(Skybot.Collections.Analyse.SingleListPageAnalyse analyse)
    {
        List <ListPageContentUrl> urls = new List <ListPageContentUrl>();

        //string x= string.Join("/", new Uri(analyse.IndexPageUrl).Segments);

        //包含索引页面的url
        var bookurls = analyse.ListPageContentUrls.Where(p => p.Url.ToString().Contains(analyse.IndexPageUrl.Replace("index.html", "")));
        var urlsu    = from p in bookurls
                       select new
        {
            Index = Get114Index(p.Url, analyse.IndexPageUrl),
            page  = p,
        };
        int index = 0;
        var urlsx = (from p in urlsu
                     where int.TryParse(p.Index, out index)
                     select new
        {
            Index = int.Parse(p.Index),
            Page = p.page
        }).ToList();

        int[] order = urlsx.Select(p => p.Index).Take(3).ToArray();
        if (order.Length > 2)
        {
            //如果第一个的索引ID大于第二个的索引ID,并且第二个索引小于第三个索引则移除第一个索引
            if (order[0] > order[1] && order[2] > order[1])
            {
                urlsx.RemoveAt(0);
            }
        }
        //处理url 内容页面列表完成
        //  urlsx.Take(3).ToList();

        analyse.ListPageContentUrls = urlsx.Select(p => p.Page).ToList();
    }
Пример #3
0
    /// <summary>
    /// 得到文章的内容
    /// </summary>
    /// <param name="o">记录</param>
    /// <param name="entities">数据库操作实体</param>
    private void GetContents(TygModel.书名表 o, TygModel.Entities entities)
    {
        //得到
        Skybot.Collections.Analyse.SingleListPageAnalyse ListPageAnalyse = null;
        Skybot.Collections.Analyse.XMLDocuentAnalyse     documentAnalyse = null;

        //打印状态
        System.Diagnostics.Debug.WriteLine("开始采集:" + o.书名 + " 目录:" + o.采集用的URL1);

        //得到转换后的对应分析器
        List <Task> tasks = new List <Task>();

        tasks.Add(Task.Factory.StartNew(() =>
        {
            ListPageAnalyse = new Skybot.Collections.Analyse.SingleListPageAnalyse(o.采集用的URL1);
            documentAnalyse = new Skybot.Collections.Analyse.XMLDocuentAnalyse()
            {
                IndexPageUrl = new Skybot.Collections.Analyse.ListPageContentUrl()
                {
                    index = 0, Title = o.书名, Url = new Uri(o.采集用的URL1)
                }
            };

            try
            {
                //初始化内容分析器
                documentAnalyse.GetPathExpression(documentAnalyse.IndexPageUrl, ListPageAnalyse.ListPageContentUrls);
            }
            catch (Exception)
            {
            }
        }));


        try
        {
            //等待添加页面列表分析完成 5 分钟没有完成则表示超时
            System.Threading.Tasks.Task.WaitAll(tasks.ToArray(), TimeSpan.FromMinutes(5));
        }
        catch (AggregateException ex)
        {
            ex.Handle((exx) =>
            {
                System.Diagnostics.Debug.WriteLine(exx.Message + "|||||" + exx.StackTrace);
                return(true);
            });
        }


        //如果数据有效
        if (ListPageAnalyse != null && documentAnalyse != null && documentAnalyse.PathExpression != null)
        {
            //得到已经存在的记录
            var docentitys = o.文章表.ToList();
            var docs       = docentitys.Select(p => p.章节名.Trim());

            //打印状态
            System.Diagnostics.Debug.WriteLine("内容共:" + ListPageAnalyse.ListPageContentUrls.Count + "条记录");

            //将章节列表索引转换成为扩展实体数据
            List <UrlExtentEntity> entitys = ListPageAnalyse.ListPageContentUrls.Select(p => new UrlExtentEntity()
            {
                index = p.index, Indexs = p.Indexs, Title = p.Title, Url = p.Url
            }).ToList();

            //开始采集数据
            for (int k = 0; k < entitys.Count; k++)
            {
                //上一条记录
                UrlExtentEntity PreviousItem = null; // entitys[k - 1];
                //当前记录
                UrlExtentEntity CurrentItem = null;  //  entitys[k];
                //下一条记录
                UrlExtentEntity NextItem = null;     //  entitys[k + 1];

                //当前记录
                CurrentItem = entitys[k];
                //看看记录是不是已经存在了 如果存在则不进行更新
                if (!docs.Contains(CurrentItem.Title.Trim()))
                {
                    #region  值

                    //上一条记录
                    if (k - 1 >= 0)
                    {
                        PreviousItem = entitys[k - 1];
                        //如果上一章节已经存在了则就用数据库中的记录
                        var Temprecords = docentitys.Where(p => p.章节名.Trim() == PreviousItem.Title.Trim());
                        if (Temprecords.Count() > 0)
                        {
                            PreviousItem.Token = Temprecords.ElementAt(0).本记录GUID.ToString();
                            //设置本记录ID
                            CurrentItem.Token = Temprecords.ElementAt(0).一章.Value.ToString();
                        }
                    }


                    //下一条记录
                    if (k + 1 < entitys.Count)
                    {
                        NextItem = entitys[k + 1];
                    }
                    #endregion

                    //当前章节的GUID
                    if (CurrentItem.Token.Length < 10)
                    {
                        CurrentItem.Token = Guid.NewGuid().ToString();
                    }

                    //产生下一章节的GUID
                    if (NextItem != null && NextItem.Token.Length < 10)
                    {
                        NextItem.Token = Guid.NewGuid().ToString();

                        //如果上一章节已经存在了则就用数据库中的记录
                        var Temprecords = docentitys.Where(p => p.章节名.Trim() == CurrentItem.Title.Trim());
                        if (Temprecords.Count() > 0)
                        {
                            NextItem.Token = Temprecords.ElementAt(0).一章.Value.ToString();
                        }
                    }
                    //得到内容
                    string content = "";

                    try
                    {
                        content = documentAnalyse.GetContent(CurrentItem.Url.GetWeb());
                    }
                    catch
                    {
                    }
                    //添加到数据库
                    entities.AddTo文章表(new TygModel.文章表()
                    {
                        GUID     = o.GUID,
                        书名       = o.书名,
                        分类标识     = o.分类标识,
                        本记录GUID  = Guid.Parse(CurrentItem.Token),
                        创建时间     = DateTime.Now,
                        分类名称     = o.分类表.分类名称,
                        章节名      = CurrentItem.Title,
                        一章       = PreviousItem == null ? Guid.Empty :Guid.Parse(PreviousItem.Token),
                        一章       = NextItem == null ? Guid.Empty : Guid.Parse(NextItem.Token),
                        最后访问时间   = DateTime.Now,
                        内容       = content,
                        采集用的URL1 = CurrentItem.Url.ToString()
                    });
                    //打印状态
                    System.Diagnostics.Debug.WriteLine("采集内容:" + CurrentItem.Title + "成功,正文长度:" + content.Length);
                }
                else
                {
                    //如果记录的内容比较少则更新内容
                    var Temprecords = docentitys.Where(p => p.章节名.Trim() == CurrentItem.Title.Trim());
                    if (Temprecords.Count() > 0)
                    {
                        //如果内容无效
                        if ( //中文小于6
                            string.Join("", System.Text.RegularExpressions.Regex.Matches(Temprecords.ElementAt(0).内容, @"[\u4e00-\u9fa5\d\w123456789~!!·#¥%……—*()——+/”》“‘’,;。、?,:…《]+[\u4e00-\u9fa5123456789~!!·#¥%……—*(!)——+/”》“‘,’\r\n;。、?,:…《]", System.Text.RegularExpressions.RegexOptions.Multiline)
                                        .Cast <System.Text.RegularExpressions.Match>().Select(p => p.Value).ToArray()
                                        ).Length < 6
                            )
                        {
                            //更新记录
                            var reccords = o.文章表.Where(p => p.章节名.Trim() == CurrentItem.Title.Trim());
                            if (reccords.Count() > 0)
                            {
                                var record = reccords.ElementAt(0);
                                try
                                {
                                    record.采集用的URL1 = CurrentItem.Url.ToString();
                                    record.内容       = documentAnalyse.GetContent(CurrentItem.Url.GetWeb());
                                }
                                catch { }
                                //打印状态
                                System.Diagnostics.Debug.WriteLine("更新记录:" + CurrentItem.Title + "成功,正文长度:" + record.内容.Length);
                            }
                        }
                    }
                }
            }
        }
    }