/// <summary>
/// Runs the crawl loop for the category identified by <c>CategoryId</c>.
/// Resumes after the highest <c>ArticleId</c> already stored in
/// <paramref name="database"/>, then fetches articles one id at a time,
/// idling whenever the site has nothing newer to offer.
/// </summary>
/// <remarks>
/// This method blocks the calling thread and never returns normally: the
/// loop has no exit condition. A failure while fetching, parsing or saving
/// an article is logged and that article id is skipped (it is not retried).
/// </remarks>
/// <param name="database">
/// Database used to look up the last stored article and to persist each
/// crawled article.
/// </param>
public void Start(Database database)
{
    // Code page passed to Encoding.GetEncoding when decoding fetched pages.
    const int PageCodePage = 51949;
    // Timeout handed to CrawlIt (5 seconds, per the original comment).
    const int CrawlTimeoutMs = 5000;
    // Pause between two consecutive article fetches.
    const int PerArticleSleepMs = 3 * 1000;
    // Pause when the site has no article newer than the last crawled one.
    const int NoNewArticleSleepMs = 10 * 60 * 1000;

    // Resume from the newest article already stored; start from 0 when the table is empty.
    var lastArticle = database.ExecuteReader<Article>("SELECT * FROM Article ORDER BY ArticleId DESC LIMIT 1");
    if (lastArticle.Count == 0)
    {
        _lastCrawledArticleId = 0;
    }
    else
    {
        _lastCrawledArticleId = lastArticle[0].ArticleId;
    }

    var lastArticleId = GetLastArticleId();

    while (true)
    {
        // Pick the article to crawl next.
        var nextArticleId = _lastCrawledArticleId + 1;

        // Wait until the site actually has this article. The check runs BEFORE
        // the fetch and uses '>' rather than '==': if the crawler starts up
        // already caught up (or the newest id is ever below the next id), an
        // equality test after the fetch would never match and the loop would
        // keep requesting non-existent article ids forever.
        while (nextArticleId > lastArticleId)
        {
            // The cached value may be stale, so refresh it before sleeping.
            lastArticleId = GetLastArticleId();
            if (nextArticleId > lastArticleId)
            {
                // Nothing new yet: rest for 10 minutes.
                Thread.Sleep(NoNewArticleSleepMs);
            }
        }

        // Build the article URL.
        var targetUrl = MakeArticleUrl(CategoryId, nextArticleId);

        try
        {
            // Fetch the page.
            var rawHtml = targetUrl.CrawlIt(Encoding.GetEncoding(PageCodePage), CrawlTimeoutMs);

            // Parse the page and extract the fields of interest.
            var htmlDoc = new HtmlDocument();
            htmlDoc.LoadHtml(rawHtml);

            var article = new Article
            {
                CategoryId = CategoryId,
                ArticleId = nextArticleId,
                RawHtml = rawHtml,
                CrawlingTime = DateTime.Now,
            };

            article.IsDeleted = htmlDoc.IsDeletedArticle();
            if (!article.IsDeleted)
            {
                // Only non-deleted articles get their content fields extracted.
                article.Author = htmlDoc.ExtractAutor();
                article.WriteTime = htmlDoc.ExtractWrittenTime();
                article.Title = htmlDoc.ExtractTitle();
                article.Content = htmlDoc.ExtractContent();
            }

            // Save to the database.
            database.SyncData<Article>(article);
        }
        catch (Exception ex)
        {
            // Best effort: log the failing URL and the error, then move on to
            // the next id so that one bad article cannot stall the crawler.
            LogHelper.Log(new Exception(targetUrl));
            LogHelper.Log(ex);
        }

        // Rest for 3 seconds after each article.
        Thread.Sleep(PerArticleSleepMs);

        _lastCrawledArticleId++;
    }
}