/// <summary>
/// Constructor. Creates the status object, the book helper bound to the
/// target URL taken from the current settings, and the two XML-RPC proxies
/// pointed at the Google blog-search and Baidu ping endpoints.
/// </summary>
public CollectBaseV2()
{
    CollectStatus = new StatusObject();

    // The book helper talks to whatever target URL the settings specify.
    var settings = RulesOperate.GetSetting();
    BH = new Voodoo.Basement.Client.BookHelper(settings.TargetUrl);

    // Google blog-search ping endpoint.
    googleProxy = XmlRpcProxyGen.Create<IMath>();
    googleProxy.Url = "http://blogsearch.google.com/ping/RPC2";

    // Baidu ping endpoint.
    baiduProxy = XmlRpcProxyGen.Create<IMath>();
    baiduProxy.Url = "http://ping.baidu.com/ping/RPC2";
}
/// <summary>
/// Replaces image chapters with text chapters.
/// For every book returned by the book helper, the image chapters are looked
/// up and each non-image collection rule (site) is tried in turn: the book is
/// searched on the site, its chapter list is parsed, and for every pending
/// chapter the most similar site chapter (similarity above 0.8) is downloaded.
/// Text that is non-empty and contains no &lt;img tag is saved through
/// <c>BH.ChapterEdit</c>; books with at least one replaced chapter get their
/// chapters regenerated through <c>BH.CreateChapters</c>.
/// </summary>
public void CollectText()
{
    this.CollectStatus.Status = "正在获取系统书籍列表";
    Status_Chage();
    var books = BH.SearchBook("", "", "");

    // The text-site rules do not depend on the book, so fetch them once.
    var textRules = RulesOperate.GetBookRules().Where(p => p.IsImageSite == false).ToList();

    foreach (var book in books)
    {
        this.CollectStatus.BookTitle = book.Title;
        Status_Chage();

        // Book information.
        BookAndChapter bc = new BookAndChapter();
        bc.Author = book.Author;
        bc.BookTitle = book.Title;
        bc.Class = book.ClassName;
        bc.ClassID = book.ClassID;
        bc.ID = book.ID;
        bc.Intro = book.Intro;
        bc.Status = book.Status;
        bc.Chapters = new List<Chapter>();

        // Image chapters that still need a text replacement.
        this.CollectStatus.Status = "正在获取需要处理的章节";
        Status_Chage();
        var chapters = BH.ChapterSearch(book.Title, "", true);
        if (chapters.Count == 0)
        {
            continue;
        }
        foreach (var chapter in chapters)
        {
            bc.Chapters.Add(new Chapter()
            {
                IsImageChapter = true,
                IsVip = true,
                Title = chapter.Title,
                id = chapter.ID
            });
        }

        // Try each text site until nothing is left to replace.
        foreach (var rule in textRules)
        {
            if (!bc.Chapters.Any())
            {
                // Every pending chapter was already replaced; querying the
                // remaining sites would only cost network requests.
                break;
            }

            List<Chapter> siteChapters = FetchTextSiteChapterList(bc, rule);
            ReplaceImageChaptersFromSite(bc, siteChapters, rule);
        }

        // Regenerate the chapters when at least one was replaced.
        if (bc.Changed)
        {
            CollectStatus.Status = "正在生成章节";
            Status_Chage();
            BH.CreateChapters(bc.ID);
        }
    }
}

/// <summary>
/// Searches the book on the site described by <paramref name="rule"/>, opens
/// its info page and chapter list, and returns the chapters parsed from that list.
/// </summary>
/// <param name="book">Book whose title is searched for.</param>
/// <param name="rule">Collection rule of the site to query.</param>
/// <returns>Site chapters (title, URL, running index); empty when nothing matched.</returns>
private List<Chapter> FetchTextSiteChapterList(BookAndChapter book, CollectRule rule)
{
    // Search the book.
    this.CollectStatus.Status = string.Format("正在从{0}搜索书籍", rule.SiteName);
    Status_Chage();

    string searchHtml;
    string searchUrl;
    if (string.Equals(rule.SearchMethod, "get", System.StringComparison.OrdinalIgnoreCase))
    {
        // The site search is submitted with GET.
        // NOTE(review): the title is always URL-encoded as gb2312 here, while
        // the page itself is read with rule.CharSet; confirm this is intended.
        searchUrl = rule.SearchPageUrl + "?"
            + string.Format(rule.SearchPars, book.BookTitle.UrlEncode(Encoding.GetEncoding("gb2312")));
        searchHtml = Url.GetHtml(searchUrl, rule.CharSet);
    }
    else
    {
        // The site search is submitted with POST.
        searchUrl = rule.SearchPageUrl;
        searchHtml = Url.Post(
            string.Format(rule.SearchPars, book.BookTitle).ParamToNameValueCollection(),
            rule.SearchPageUrl,
            Encoding.GetEncoding(rule.CharSet),
            new System.Net.CookieContainer(),
            "*.*",
            rule.Url,
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.2 Safari/535.11"
            );
    }

    // Open the book info page.
    this.CollectStatus.Status = string.Format("正在从{0}打开书籍", rule.SiteName);
    Status_Chage();

    string bookInfoHtml;
    string bookUrl;
    if (searchHtml.IsMatch(rule.BookInfoUrl))
    {
        CollectStatus.Status = "打开书籍信息页";
        Status_Chage();
        bookUrl = searchHtml.GetMatchGroup(rule.BookInfoUrl).Groups["url"].Value.AppendToDomain(rule.Url);
        bookInfoHtml = Url.GetHtml(bookUrl, rule.CharSet);
    }
    else
    {
        // No info link in the result: treat the search response as the info page.
        bookInfoHtml = searchHtml;
        bookUrl = searchUrl;
    }

    // Locate the chapter list page.
    this.CollectStatus.Status = string.Format("正在从{0}打开章节列表", rule.SiteName);
    Status_Chage();

    // Only build the list URL when the info page actually links to one;
    // otherwise the info page itself is used as the chapter list.
    string chapterListUrl = Regex.IsMatch(bookInfoHtml, rule.ChapterListUrl)
        ? bookInfoHtml.GetMatchGroup(rule.ChapterListUrl).Groups["url"].Value.AppendToDomain(rule.Url)
        : bookUrl;

    CollectStatus.Status = "打开章节列表";
    Status_Chage();
    string chapterListHtml = chapterListUrl == bookUrl
        ? bookInfoHtml
        : Url.GetHtml(chapterListUrl, rule.CharSet);

    // Parse the chapter list.
    List<Chapter> siteChapters = new List<Chapter>();
    int index = 0;
    for (Match m = chapterListHtml.GetMatchGroup(rule.ChapterNameAndUrl); m.Success; m = m.NextMatch())
    {
        siteChapters.Add(new Chapter()
        {
            Title = m.Groups["title"].Value,
            Url = m.Groups["url"].Value.AppendToDomain(chapterListUrl),
            Index = index
        });
        index++;
    }
    return siteChapters;
}

/// <summary>
/// For every pending chapter of <paramref name="book"/>, finds the most similar
/// chapter in <paramref name="siteChapters"/> and, when the similarity is above
/// 0.8, downloads it and saves usable text in place of the image chapter.
/// </summary>
/// <param name="book">Book whose pending chapters are processed; replaced chapters are removed from its list.</param>
/// <param name="siteChapters">Chapters parsed from the site's chapter list.</param>
/// <param name="rule">Collection rule of the site being processed.</param>
private void ReplaceImageChaptersFromSite(BookAndChapter book, List<Chapter> siteChapters, CollectRule rule)
{
    this.CollectStatus.Status = string.Format("正在从{0}处理章节", rule.SiteName);
    Status_Chage();

    // Only matches with a similarity above 0.8 are collected.
    var similarityThreshold = (0.8).ToDecimal();

    // Iterate over a snapshot: book.Chapters is reassigned while chapters are saved.
    foreach (Chapter c in book.Chapters.ToList())
    {
        this.CollectStatus.ChapterTitle = c.Title;
        Status_Chage();

        if (!c.Content.IsNullOrEmpty())
        {
            // Already has content, nothing to collect.
            continue;
        }

        // Best-matching site chapter by title similarity. OrderByDescending is
        // stable, so ties resolve to the earliest site chapter.
        var best = siteChapters
            .Select(n => new { Chapter = n, Weight = n.Title.GetSimilarityWith(c.Title) })
            .OrderByDescending(p => p.Weight)
            .FirstOrDefault();
        if (best == null || !(best.Weight > similarityThreshold))
        {
            continue;
        }

        this.CollectStatus.ChapterTitle = c.Title;
        this.CollectStatus.Status = "正在采集";
        Status_Chage();

        string content = DownloadTextChapterContent(best.Chapter, rule);

        // Save only real text. An empty result (content pattern did not match,
        // or every fragment was too short) must not overwrite the chapter, and
        // content that still contains an image tag is not a text chapter.
        bool usable = !string.IsNullOrEmpty(content)
            && content.IndexOf("<img ", System.StringComparison.OrdinalIgnoreCase) < 0;
        if (usable)
        {
            c.Content = content;
            book.Changed = true;

            this.CollectStatus.Status = "章节保存到系统";
            Status_Chage();
            BH.ChapterEdit(c.id, c.Title, c.Content, false, false);

            // Done: drop the chapter from the pending list.
            book.Chapters = book.Chapters.Where(p => p.id != c.id).ToList();
        }

        this.CollectStatus.ChapterleftCout--;
        Status_Chage();
    }
}

/// <summary>
/// Downloads a site chapter page and returns its filtered text, built from
/// every content match longer than 200 characters.
/// </summary>
/// <param name="siteChapter">Site chapter whose URL is downloaded.</param>
/// <param name="rule">Collection rule supplying the charset and content pattern.</param>
/// <returns>The result of <c>Filter</c> applied to the concatenated fragments.</returns>
private string DownloadTextChapterContent(Chapter siteChapter, CollectRule rule)
{
    string html = Url.GetHtml(siteChapter.Url, rule.CharSet);

    StringBuilder text = new StringBuilder();
    for (Match m = html.GetMatchGroup(rule.ChapterContent); m.Success; m = m.NextMatch())
    {
        string fragment = m.Groups["content"].Value;
        if (fragment.Length > 200)
        {
            text.Append(fragment);
        }
    }

    return Filter(text.ToString());
}
/// <summary>
/// Collects a book by its title.
/// Steps: fetch the book from Qidian, fetch the local book, work out which
/// chapters come after the local last chapter, download the non-VIP chapters
/// from Qidian, try the text sites and then the image sites for the rest,
/// submit the result, publish the blog post and generate the pages.
/// Returns early (after a status update where one is set) when the book is
/// not found on Qidian, its chapter list is empty, or the local last chapter
/// cannot be located in the Qidian chapter list.
/// </summary>
/// <param name="BookTitle">Title of the book to collect.</param>
public void CollectBookByTitle(string BookTitle)
{
    this.CollectStatus.BookTitle = BookTitle;
    Status_Chage();
    CollectStatus.ChapterTitle = "";

    // 1. Fetch the book from Qidian.
    CollectStatus.Status = "从起点搜索";
    Status_Chage();
    this.QidianBook = GetQidianBook(BookTitle);
    if (QidianBook.ID <= 0)
    {
        // The book was not found on Qidian.
        return;
    }
    if (QidianBook.Chapters.Count == 0)
    {
        CollectStatus.Status = "没能打开章节列表";
        Status_Chage();
        return;
    }

    // 2. Fetch the local book.
    CollectStatus.Status = "从本地检查";
    Status_Chage();
    this.LocalBook = GetLocalBook(QidianBook);

    // 3. Work out which chapters still need collecting.
    var localLastTitle = LocalBook.LastChapter.Title;
    if (localLastTitle.IsNullOrEmpty())
    {
        // The local book has no chapters yet: collect everything.
        BookNeedCollect = QidianBook;
    }
    else
    {
        // Normalize the local title once, then find the Qidian chapter that
        // corresponds to the local last chapter in a single pass.
        var normalizedLocalTitle = localLastTitle.ReplaceSynonyms();
        var lastCollected = QidianBook.Chapters
            .FirstOrDefault(p => p.Title.ReplaceSynonyms() == normalizedLocalTitle);
        if (lastCollected == null)
        {
            CollectStatus.Status = "最新章节在起点中不存在";
            Status_Chage();
            return;
        }

        BookNeedCollect = QidianBook;
        BookNeedCollect.Url = LocalBook.Url;
        BookNeedCollect.Chapters = BookNeedCollect.Chapters
            .Where(p => p.Index > lastCollected.Index)
            .ToList();
    }

    BookNeedCollect.ID = LocalBook.ID;
    BookNeedCollect.Class = LocalBook.Class;
    BookNeedCollect.ClassID = LocalBook.ClassID;

    // Total and remaining chapter counts for the status display.
    var pendingCount = BookNeedCollect.Chapters.Count();
    CollectStatus.ChapterCount = pendingCount;
    CollectStatus.ChapterleftCout = pendingCount;
    Status_Chage();

    // Non-VIP chapters can be read directly from Qidian. Only chapters that
    // still lack content are downloaded, so existing content is not fetched again.
    foreach (Chapter c in BookNeedCollect.Chapters
        .Where(p => p.IsVip == false && p.Content.IsNullOrEmpty())
        .ToList())
    {
        CollectStatus.ChapterTitle = c.Title;
        CollectStatus.Status = "正在从起点采集";
        Status_Chage();
        c.Content = GetQidianNormalChapter(c.Url);
    }

    // 4. Try the collection sites: text sites first, then image sites.
    var Rules = RulesOperate.GetBookRules();
    foreach (CollectRule rule in Rules.Where(p => p.IsImageSite == false))
    {
        if (!BeginSiteCollection(rule))
        {
            break;
        }
        CollectChapter(BookNeedCollect, rule);
    }
    foreach (CollectRule rule in Rules.Where(p => p.IsImageSite == true))
    {
        if (!BeginSiteCollection(rule))
        {
            break;
        }
        CollectChapterFromLuoqiu(BookNeedCollect, rule);
    }

    // 5. Submit to the target site.
    CollectStatus.Status = "保存到目标站点";
    Status_Chage();
    SubmitBook(BookNeedCollect);

    // 6. Publish the blog post.
    PublishBlog(BookNeedCollect);

    // 7. Collection finished: generate the book pages.
    CollectStatus.Status = "采集完成,正在生成";
    Status_Chage();
    if (BookNeedCollect.Chapters.Count > 0)
    {
        CreatePage(BookNeedCollect.ID.ToS(), BookNeedCollect.ClassID);
    }

    CollectStatus.Status = string.Format("书籍《{0}》完成", BookTitle);
    Status_Chage();
}

/// <summary>
/// Announces the start of collection from a site and reports whether
/// <c>BookNeedCollect</c> still has chapters without content.
/// </summary>
/// <param name="rule">Collection rule of the site about to be used.</param>
/// <returns>
/// <c>true</c> when at least one chapter still lacks content; <c>false</c>
/// (after a status update and a short pause) when there is nothing left to do.
/// </returns>
private bool BeginSiteCollection(CollectRule rule)
{
    CollectStatus.Status = "开始采集-" + rule.SiteName;
    Status_Chage();

    // No chapters at all need collecting.
    if (BookNeedCollect.Chapters.Count == 0)
    {
        CollectStatus.Status = "没有任何章节需要采集";
        Status_Chage();
        Thread.Sleep(500);
        return false;
    }

    // Every chapter already has content, so collection is complete.
    if (!BookNeedCollect.Chapters.Any(p => p.Content.IsNullOrEmpty()))
    {
        CollectStatus.Status = "章节全部采集完成";
        Status_Chage();
        Thread.Sleep(100);
        return false;
    }

    return true;
}