Ejemplo n.º 1
0
        /// <summary>
        /// Constructor. Creates the status object, the book helper bound to the configured
        /// target URL, and the two XML-RPC proxies pointing at the Google and Baidu ping endpoints.
        /// </summary>
        public CollectBaseV2()
        {
            const string googlePingUrl = "http://blogsearch.google.com/ping/RPC2";
            const string baiduPingUrl  = "http://ping.baidu.com/ping/RPC2";

            // Status holder first, then the client built from the TargetUrl setting.
            CollectStatus = new StatusObject();
            var targetUrl = RulesOperate.GetSetting().TargetUrl;
            BH = new Voodoo.Basement.Client.BookHelper(targetUrl);

            // One XML-RPC proxy per ping endpoint.
            googleProxy = XmlRpcProxyGen.Create<IMath>();
            googleProxy.Url = googlePingUrl;

            baiduProxy = XmlRpcProxyGen.Create<IMath>();
            baiduProxy.Url = baiduPingUrl;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Replaces image chapters with text chapters.
        /// </summary>
        /// <remarks>
        /// For every book returned by <c>BH.SearchBook</c>, the chapters returned by
        /// <c>BH.ChapterSearch</c> are matched by title similarity against the chapter list scraped
        /// from each non-image collect rule. When the best match scores above 0.8 and the downloaded
        /// content is non-blank and contains no <c>&lt;img </c> tag, the text is saved through
        /// <c>BH.ChapterEdit</c> and the chapter is dropped from the pending list. Books with at
        /// least one replaced chapter are regenerated through <c>BH.CreateChapters</c>.
        /// </remarks>
        public void CollectText()
        {
            this.CollectStatus.Status = "正在获取系统书籍列表";
            Status_Chage();
            var books = BH.SearchBook("", "", "");

            foreach (var book in books)
            {
                this.CollectStatus.BookTitle = book.Title;
                Status_Chage();

                #region Book info
                BookAndChapter bc = new BookAndChapter();
                bc.Author    = book.Author;
                bc.BookTitle = book.Title;
                bc.Class     = book.ClassName;
                bc.ClassID   = book.ClassID;
                bc.ID        = book.ID;
                bc.Intro     = book.Intro;
                bc.Status    = book.Status;
                bc.Chapters  = new List<Chapter>();
                #endregion

                #region Image chapters that need replacing
                this.CollectStatus.Status = "正在获取需要处理的章节";
                Status_Chage();
                var chapters = BH.ChapterSearch(book.Title, "", true); // all image chapters of this book

                if (chapters.Count == 0)
                {
                    continue;
                }

                foreach (var chapter in chapters)
                {
                    bc.Chapters.Add(new Chapter()
                    {
                        IsImageChapter = true,
                        IsVip          = true,
                        Title          = chapter.Title,
                        id             = chapter.ID
                    });
                }
                #endregion

                // Only rules for text (non-image) sites are used as replacement sources.
                var rules = RulesOperate.GetBookRules().Where(p => p.IsImageSite == false);

                #region Try every rule in turn
                foreach (var rule in rules)
                {
                    // Replaced chapters are removed from bc.Chapters below, so an empty list means
                    // nothing is left to do; skip the remaining sites instead of querying them.
                    if (bc.Chapters.Count == 0)
                    {
                        break;
                    }

                    BookAndChapter siteBook = new BookAndChapter();

                    #region Search for the book
                    this.CollectStatus.Status = string.Format("正在从{0}搜索书籍", rule.SiteName);
                    Status_Chage();

                    string searchHtml;
                    string searchUrl;
                    if (string.Equals(rule.SearchMethod, "get", System.StringComparison.OrdinalIgnoreCase))
                    {
                        // The site's search form is submitted with GET.
                        // NOTE(review): the title is always URL-encoded as gb2312 here, while the POST
                        // branch uses rule.CharSet — confirm whether this should follow rule.CharSet.
                        searchUrl  = rule.SearchPageUrl + "?" + string.Format(rule.SearchPars, bc.BookTitle.UrlEncode(Encoding.GetEncoding("gb2312")));
                        searchHtml = Url.GetHtml(searchUrl, rule.CharSet);
                    }
                    else
                    {
                        // The site's search form is submitted with POST.
                        searchUrl  = rule.SearchPageUrl;
                        searchHtml = Url.Post(
                            string.Format(rule.SearchPars, bc.BookTitle).ParamToNameValueCollection(),
                            rule.SearchPageUrl,
                            Encoding.GetEncoding(rule.CharSet),
                            new System.Net.CookieContainer(),
                            "*.*",
                            rule.Url,
                            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.2 Safari/535.11"
                            );
                    }
                    #endregion

                    #region Open the book info page
                    this.CollectStatus.Status = string.Format("正在从{0}打开书籍", rule.SiteName);
                    Status_Chage();

                    string bookInfoHtml;
                    string bookUrl;
                    if (searchHtml.IsMatch(rule.BookInfoUrl))
                    {
                        CollectStatus.Status = "打开书籍信息页";
                        Status_Chage();
                        bookUrl      = searchHtml.GetMatchGroup(rule.BookInfoUrl).Groups["url"].Value.AppendToDomain(rule.Url);
                        bookInfoHtml = Url.GetHtml(bookUrl, rule.CharSet);
                    }
                    else
                    {
                        // No book link in the result: presumably the site redirected the search
                        // straight to the book info page, so the search response is used as that page.
                        bookInfoHtml = searchHtml;
                        bookUrl      = searchUrl;
                    }
                    #endregion

                    #region Fetch the chapter list
                    this.CollectStatus.Status = string.Format("正在从{0}打开章节列表", rule.SiteName);
                    Status_Chage();

                    // Extract the chapter-list link only when the pattern matches; otherwise the
                    // book page itself is treated as the chapter list.
                    string chapterListUrl;
                    if (Regex.IsMatch(bookInfoHtml, rule.ChapterListUrl))
                    {
                        chapterListUrl = bookInfoHtml.GetMatchGroup(rule.ChapterListUrl).Groups["url"].Value.AppendToDomain(rule.Url);
                    }
                    else
                    {
                        chapterListUrl = bookUrl;
                    }

                    CollectStatus.Status = "打开章节列表";
                    Status_Chage();
                    string chapterListHtml = chapterListUrl == bookUrl
                        ? bookInfoHtml // already downloaded, avoid a second request
                        : Url.GetHtml(chapterListUrl, rule.CharSet);

                    // Every match of the chapter pattern becomes one site chapter, numbered in page order.
                    siteBook.Chapters = new List<Chapter>();
                    int index = 0;
                    for (var chapterMatch = chapterListHtml.GetMatchGroup(rule.ChapterNameAndUrl);
                         chapterMatch.Success;
                         chapterMatch = chapterMatch.NextMatch())
                    {
                        siteBook.Chapters.Add(new Chapter()
                        {
                            Title = chapterMatch.Groups["title"].Value,
                            Url   = chapterMatch.Groups["url"].Value.AppendToDomain(chapterListUrl),
                            Index = index
                        });
                        index++;
                    }
                    #endregion

                    #region Replace each pending image chapter with text
                    this.CollectStatus.Status = string.Format("正在从{0}处理章节", rule.SiteName);
                    Status_Chage();

                    // bc.Chapters is reassigned (not mutated) inside the loop, so this enumeration
                    // keeps running over the list instance it started with.
                    foreach (Chapter c in bc.Chapters)
                    {
                        this.CollectStatus.ChapterTitle = c.Title;
                        Status_Chage();
                        if (!c.Content.IsNullOrEmpty())
                        {
                            // Already has content; nothing to collect for this chapter.
                            continue;
                        }

                        // Pick the site chapter whose title is most similar to the pending one.
                        var bestMatch = siteBook.Chapters
                            .Select(n => new { n.Url, weight = n.Title.GetSimilarityWith(c.Title) })
                            .OrderByDescending(p => p.weight)
                            .FirstOrDefault();

                        // Only a similarity above 0.8 counts as the same chapter.
                        if (bestMatch == null || !(bestMatch.weight > (0.8).ToDecimal()))
                        {
                            continue;
                        }

                        this.CollectStatus.ChapterTitle = c.Title;
                        this.CollectStatus.Status       = "正在采集";
                        Status_Chage();

                        string contentHtml = Url.GetHtml(bestMatch.Url, rule.CharSet);

                        // Concatenate every matched content fragment longer than 200 characters;
                        // shorter fragments are ignored (presumably page noise — confirm).
                        StringBuilder contentBuilder = new StringBuilder();
                        for (Match contentMatch = contentHtml.GetMatchGroup(rule.ChapterContent);
                             contentMatch.Success;
                             contentMatch = contentMatch.NextMatch())
                        {
                            string fragment = contentMatch.Groups["content"].Value;
                            if (fragment.Length > 200)
                            {
                                contentBuilder.Append(fragment);
                            }
                        }

                        string content = Filter(contentBuilder.ToString());

                        // Blank content means nothing usable was scraped. Saving it would overwrite the
                        // chapter with empty text and stop later sites from retrying it.
                        bool hasText  = content != null && content.Trim().Length > 0;
                        bool hasImage = hasText && content.IndexOf("<img ", System.StringComparison.OrdinalIgnoreCase) >= 0;
                        if (hasText && !hasImage)
                        {
                            c.Content  = content;
                            bc.Changed = true;

                            this.CollectStatus.Status = "章节保存到系统";
                            Status_Chage();
                            BH.ChapterEdit(c.id, c.Title, c.Content, false, false);

                            // Done: drop the chapter from the pending list for the remaining sites.
                            bc.Chapters = bc.Chapters.Where(p => p.id != c.id).ToList();
                        }

                        this.CollectStatus.ChapterleftCout--;
                        Status_Chage();
                    }
                    #endregion
                }
                #endregion

                #region Regenerate chapters
                if (bc.Changed)
                {
                    CollectStatus.Status = "正在生成章节";
                    Status_Chage();
                    BH.CreateChapters(bc.ID);
                }
                #endregion
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Collects a book by its title.
        /// </summary>
        /// <remarks>
        /// Steps: fetch the book from Qidian, load the local copy, work out which chapters come after
        /// the local last chapter, download free chapters from Qidian, run the text-site rules and
        /// then the image-site rules for whatever is still missing, submit the result, publish the
        /// blog entry, and finally generate the pages when any chapter was collected.
        /// </remarks>
        /// <param name="BookTitle">Title of the book to collect.</param>
        public void CollectBookByTitle(string BookTitle)
        {
            this.CollectStatus.BookTitle = BookTitle;
            Status_Chage();
            CollectStatus.ChapterTitle = "";

            // 1. Fetch the book from Qidian.
            CollectStatus.Status = "从起点搜索";
            Status_Chage();
            this.QidianBook = GetQidianBook(BookTitle);
            if (QidianBook.ID <= 0)
            {
                return; // the book was not found on Qidian
            }
            if (QidianBook.Chapters.Count == 0)
            {
                CollectStatus.Status = "没能打开章节列表";
                Status_Chage();
                return;
            }

            // 2. Load the local book.
            CollectStatus.Status = "从本地检查";
            Status_Chage();
            this.LocalBook = GetLocalBook(QidianBook);

            // 3. Work out which chapters still need collecting.
            var localLastTitle = LocalBook.LastChapter.Title;
            if (localLastTitle.IsNullOrEmpty())
            {
                // The local book has no chapters yet: everything from Qidian is needed.
                BookNeedCollect = QidianBook;
            }
            else
            {
                // Normalize the local title once, then locate the first Qidian chapter that matches.
                var normalizedLastTitle = localLastTitle.ReplaceSynonyms();
                var lastCollected = QidianBook.Chapters.FirstOrDefault(p => p.Title.ReplaceSynonyms() == normalizedLastTitle);
                if (lastCollected == null)
                {
                    CollectStatus.Status = "最新章节在起点中不存在";
                    Status_Chage();
                    return;
                }

                // Keep only the chapters that come after the local last chapter.
                BookNeedCollect          = QidianBook;
                BookNeedCollect.Url      = LocalBook.Url;
                BookNeedCollect.Chapters = BookNeedCollect.Chapters.Where(p => p.Index > lastCollected.Index).ToList();
            }

            BookNeedCollect.ID      = LocalBook.ID;
            BookNeedCollect.Class   = LocalBook.Class;
            BookNeedCollect.ClassID = LocalBook.ClassID;

            // Total and remaining chapter counts both start at the number of pending chapters.
            int pendingCount = BookNeedCollect.Chapters.Count;
            CollectStatus.ChapterCount    = pendingCount;
            CollectStatus.ChapterleftCout = pendingCount;
            Status_Chage();

            // Free (non-VIP) chapters that still lack content are read directly from Qidian.
            // The list is materialized first because the loop fills in Content, which the filter reads.
            var freeChaptersMissingContent = BookNeedCollect.Chapters
                .Where(p => p.IsVip == false && p.Content.IsNullOrEmpty())
                .ToList();
            foreach (Chapter c in freeChaptersMissingContent)
            {
                CollectStatus.ChapterTitle = c.Title;
                CollectStatus.Status       = "正在从起点采集";
                Status_Chage();
                c.Content = GetQidianNormalChapter(c.Url);
            }

            // 4. Run the collect rules: text sites first, then image sites.
            var Rules = RulesOperate.GetBookRules();
            RunCollectRules(Rules.Where(p => p.IsImageSite == false), rule => CollectChapter(BookNeedCollect, rule));
            RunCollectRules(Rules.Where(p => p.IsImageSite == true), rule => CollectChapterFromLuoqiu(BookNeedCollect, rule));

            // 5. Submit to the target site.
            CollectStatus.Status = "保存到目标站点";
            Status_Chage();
            SubmitBook(BookNeedCollect);

            // 6. Publish the blog entry.
            PublishBlog(BookNeedCollect);

            // 7. Collection finished: generate the book pages when something was collected.
            CollectStatus.Status = "采集完成,正在生成";
            Status_Chage();
            if (BookNeedCollect.Chapters.Count > 0)
            {
                CreatePage(BookNeedCollect.ID.ToS(), BookNeedCollect.ClassID);
            }
            CollectStatus.Status = string.Format("书籍《{0}》完成", BookTitle);
            Status_Chage();
        }

        /// <summary>
        /// Runs <paramref name="collect"/> for each rule until <c>BookNeedCollect</c> has no chapters
        /// or no chapter with empty content is left.
        /// </summary>
        /// <param name="rules">Rules to try, in order.</param>
        /// <param name="collect">Collection routine invoked for each rule that is actually run.</param>
        private void RunCollectRules(IEnumerable<CollectRule> rules, System.Action<CollectRule> collect)
        {
            foreach (CollectRule rule in rules)
            {
                CollectStatus.Status = "开始采集-" + rule.SiteName;
                Status_Chage();

                // Nothing to collect at all: stop trying further rules.
                if (BookNeedCollect.Chapters.Count == 0)
                {
                    CollectStatus.Status = "没有任何章节需要采集";
                    Status_Chage();
                    Thread.Sleep(500);
                    break;
                }

                // No pending chapter has empty content, i.e. everything has been collected already.
                if (!BookNeedCollect.Chapters.Any(p => p.Content.IsNullOrEmpty()))
                {
                    CollectStatus.Status = "章节全部采集完成";
                    Status_Chage();
                    Thread.Sleep(100);
                    break;
                }

                collect(rule);
            }
        }