Example #1
        public void CollectArticleList()
        {
            using (var context = new TeraArticleDataContext())
            {
                var jumpPagingSize = 1;
                var address = MakePagingPageAddress(CurrentWorkingPage);
                foreach (var article in ParsePagingPage(address.CrawlIt(Encoding.UTF8)))
                {
                    try
                    {
                        // Something is wrong with this entry; nothing useful to do but skip it.
                        if (article.ArticleId == 0) continue;

                        // If the article is already stored, work out how many paging pages can be skipped.
                        if (
                            Mode == "normal" &&
                            context.Articles.Any(e =>
                                e.Game == article.Game &&
                                e.TargetSite == article.TargetSite &&
                                e.CategoryId == article.CategoryId &&
                                e.ArticleId == article.ArticleId))
                        {
                            var prevArticleCount = context.Articles
                                .Where(e => e.Game == article.Game)
                                .Where(e => e.TargetSite == article.TargetSite)
                                .Where(e => e.CategoryId == article.CategoryId)
                                .Count(e => e.ArticleId < article.ArticleId);
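                            // Integer division may yield 0 here; Math.Max(1, ...) below still advances by at least one page.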
                            jumpPagingSize = prevArticleCount / PagingSize();

                            continue;
                        }
                        else
                        {
                            jumpPagingSize = 1;
                        }

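                        // Enqueue only articles that are not already stored.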
                        if (!context.Articles.Any(e =>
                                e.Game == article.Game &&
                                e.TargetSite == article.TargetSite &&
                                e.CategoryId == article.CategoryId &&
                                e.ArticleId == article.ArticleId))
                        {
                            ArticleQueueToCrawl.Enqueue(article);
                        }
                    }
                    catch (Exception ex)
                    {
                        Logger.Log("Error occurred during CollectArticleList for ArticleID: {0} and Link: {1}", article.ArticleId, article.Link);
                        Logger.Log(ex);
                    }
                }

                CurrentWorkingPage += Math.Max(1, jumpPagingSize);
            }
        }
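A minimal usage sketch for the method above, assuming it is driven from a simple paging loop; the driver method name and its page-limit parameter are hypothetical and not part of the original example.

        // Hypothetical driver: walk the listing pages up to a caller-supplied limit.
        // CollectArticleList advances CurrentWorkingPage by at least one per call,
        // so this loop always terminates.
        public void CollectAllPages(int lastPageToVisit)
        {
            while (CurrentWorkingPage <= lastPageToVisit)
            {
                CollectArticleList();
            }
        }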
Example #2
        public void CrawlArticles()
        {
            while (ArticleQueueToCrawl.Count > 0)
            {
                foreach (var item in ArticleQueueToCrawl)
                    //ThreadPool.QueueUserWorkItem(item =>
                {
                    Article article;
                    IList<Comment> comments = null;

                    // TryDequeue can fail if another consumer drains the queue first;
                    // bail out instead of spinning in a busy-wait.
                    if (!ArticleQueueToCrawl.TryDequeue(out article))
                        break;

                    // Take a short breather after pulling the article off the queue.
                    Thread.Sleep(1000 * _rand.Next(3, 5));

                    try
                    {
                        var address = MakeArticlePageAddress(article.ArticleId);
                        article.RawHtml = address.CrawlIt(encoding, headerCollection, cookieContainer);
                        article.CrawledTime = DateTime.Now;
                        ParseArticlePage(article);
                        using (var context = new TeraArticleDataContext())
                        {
                            context.Articles.InsertOnSubmit(article);
                            context.SubmitChanges();
                        }

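                        // Comments are crawled and persisted in a second, separate data context.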
                        comments = CrawlComments(article);
                        using (var context = new TeraArticleDataContext())
                        {
                            context.Comments.InsertAllOnSubmit(comments);
                            context.SubmitChanges();
                        }
                    }
                    catch (Exception ex)
                    {
                        Logger.Log("Error occurred ArticleID: {0}, Link: {1}", article.ArticleId, article.Link);
                        Logger.Log(ex);
                    }

                    // Take a short breather after finishing this article.
                    Thread.Sleep(1000 * _rand.Next(1, 5));

                    //}, ArticleQueueToCrawl);
                }
            }
        }
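For context, a sketch of the shared state both methods above appear to rely on; the field names, types, modifiers, and the PagingSize value are assumptions inferred from usage (System.Collections.Concurrent is assumed to be imported), and the remaining helpers (address builders, parsers, Logger) are omitted.

        // Hypothetical supporting members, inferred from how the two methods use them.
        protected readonly ConcurrentQueue<Article> ArticleQueueToCrawl = new ConcurrentQueue<Article>();
        private readonly Random _rand = new Random();

        protected int CurrentWorkingPage = 1;
        protected string Mode = "normal";

        // Number of articles listed per paging page on the target site; placeholder value.
        protected virtual int PagingSize()
        {
            return 30;
        }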