/// <summary>
/// Crawls the paging (list) page for <c>CurrentWorkingPage</c>, enqueues every listed
/// article that is not yet stored into <c>ArticleQueueToCrawl</c>, and then advances
/// <c>CurrentWorkingPage</c>.
/// </summary>
/// <remarks>
/// When <c>Mode</c> is "normal" and a listed article is already stored, the number of
/// stored articles with a smaller ArticleId (same Game / TargetSite / CategoryId) divided
/// by <c>PagingSize()</c> becomes the page jump. The jump that is finally applied is the
/// one set by the last processed article, and the page always advances by at least 1.
/// Failures on a single article are logged and do not stop the loop.
/// </remarks>
public void CollectArticleList()
{
    using (var context = new TeraArticleDataContext())
    {
        var jumpPagingSize = 1;
        var address = MakePagingPageAddress(CurrentWorkingPage);

        foreach (var article in ParsePagingPage(address.CrawlIt(Encoding.UTF8)))
        {
            try
            {
                // Entries with ArticleId 0 are skipped; the original author flagged these
                // as a known, unresolved problem and chose to ignore them.
                if (article.ArticleId == 0)
                {
                    continue;
                }

                // Ask the database once whether this article is already stored and reuse
                // the answer below (previously the identical query could run twice).
                var alreadyStored = context.Articles.Any(e =>
                    e.Game == article.Game &&
                    e.TargetSite == article.TargetSite &&
                    e.CategoryId == article.CategoryId &&
                    e.ArticleId == article.ArticleId);

                // Existing data found in normal mode: work out how many pages to skip.
                if (alreadyStored && Mode == "normal")
                {
                    var prevArticleCount = context.Articles
                        .Where(e => e.Game == article.Game)
                        .Where(e => e.TargetSite == article.TargetSite)
                        .Where(e => e.CategoryId == article.CategoryId)
                        .Count(e => e.ArticleId < article.ArticleId);

                    jumpPagingSize = prevArticleCount / PagingSize();
                    continue;
                }

                jumpPagingSize = 1;

                if (!alreadyStored)
                {
                    ArticleQueueToCrawl.Enqueue(article);
                }
            }
            catch (Exception ex)
            {
                Logger.Log("Error occurred during CollectArticleList for ArticleID: {0} and Link: {1}", article.ArticleId, article.Link);
                Logger.Log(ex);
            }
        }

        // Never stay on the same page, even if the computed jump is 0 or negative.
        CurrentWorkingPage += Math.Max(1, jumpPagingSize);
    }
}
/// <summary>
/// Drains <c>ArticleQueueToCrawl</c>: for each queued article it downloads and parses the
/// article page, stores the article, then crawls and stores its comments.
/// </summary>
/// <remarks>
/// The article and its comments are saved through separate data contexts, so an article
/// that was already submitted stays stored even if crawling or saving its comments fails.
/// Any exception for a single article is logged and the loop moves on to the next one.
/// The thread sleeps before and after each article to throttle requests.
/// </remarks>
public void CrawlArticles()
{
    Article article;

    // Dequeue directly until the queue is empty. Enumerating the queue while dequeuing
    // from it (as before) relied on a snapshot and could busy-spin forever if another
    // consumer emptied the queue first.
    while (ArticleQueueToCrawl.TryDequeue(out article))
    {
        // Took an item from the queue; take a short breather before hitting the site.
        Thread.Sleep(1000 * _rand.Next(3, 5));

        try
        {
            var address = MakeArticlePageAddress(article.ArticleId);
            article.RawHtml = address.CrawlIt(encoding, headerCollection, cookieContainer);
            article.CrawledTime = DateTime.Now;
            ParseArticlePage(article);

            using (var context = new TeraArticleDataContext())
            {
                context.Articles.InsertOnSubmit(article);
                context.SubmitChanges();
            }

            IList<Comment> comments = CrawlComments(article);

            using (var context = new TeraArticleDataContext())
            {
                context.Comments.InsertAllOnSubmit(comments);
                context.SubmitChanges();
            }
        }
        catch (Exception ex)
        {
            Logger.Log("Error occurred ArticleID: {0}, Link: {1}", article.ArticleId, article.Link);
            Logger.Log(ex);
        }

        // Work for this article is done; take another short breather.
        Thread.Sleep(1000 * _rand.Next(1, 5));
    }
}