public void Crawl_MinCrawlDelayGreaterThanZero_CallsDomainRateLimiter()
{
    Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
    Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

    CrawledPage homePage = new CrawledPage(_rootUri) { Content = new PageContent { Text = "content here" } };
    CrawledPage page1 = new CrawledPage(uri1);
    CrawledPage page2 = new CrawledPage(uri2);

    List<Uri> links = new List<Uri> { uri1, uri2 };

    _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
    _fakeHttpRequester.Setup(f => f.MakeRequest(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
    _fakeHttpRequester.Setup(f => f.MakeRequest(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page2);
    _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldRecrawlPage(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = false });

    // A crawl delay above zero means the IDomainRateLimiter is expected to be called
    _dummyConfiguration.MinCrawlDelayPerDomainMilliSeconds = 1;

    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    _unitUnderTest.Crawl(_rootUri);

    // Once for the root page and once for each of the two links
    _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Exactly(3));
}
public async Task RunCrawl(string aUri, string aOutputFolder, CrawlConfiguration crawlConfig)
{
    _outputfolder = aOutputFolder;
    CancellationTokenSource cancellationTokenSource = new CancellationTokenSource();

    PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig);
    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    // Crawl() is synchronous, so run it on a background thread and await its completion.
    await Task.Run(() =>
    {
        CrawlResult result = crawler.Crawl(new Uri(aUri), cancellationTokenSource);
        OnMessageReceived(result.ErrorOccurred
            ? $"Crawl of {result.RootUri.AbsoluteUri} completed with error: {result.ErrorException.Message}"
            : $"Crawl of {result.RootUri.AbsoluteUri} completed without error.");
    }, cancellationTokenSource.Token);
}
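A minimal usage sketch for the method above. The runner instance name (crawlRunner), the target URL, the output folder, and the configuration values are illustrative assumptions, not part of the original code; the CrawlConfiguration properties shown are the same ones used elsewhere in these examples.

// Hypothetical caller; names and values below are placeholders for illustration only.
var config = new CrawlConfiguration
{
    MaxPagesToCrawl = 100,
    MaxConcurrentThreads = 5,
    CrawlTimeoutSeconds = 300
};
await crawlRunner.RunCrawl("https://example.com", @"C:\crawl-output", config);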
static void Main(string[] args)
{
    // Create a crawler instance
    IWebCrawler crawler = new PoliteWebCrawler();

    // To create a crawler instance with options:
    // var crawlConfig = new CrawlConfiguration();
    // crawlConfig.CrawlTimeoutSeconds = 1000;
    // crawlConfig.MaxConcurrentThreads = 10;
    // crawlConfig.MaxPagesToCrawl = 10;
    // crawlConfig.UserAgentString = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0";
    // IWebCrawler crawler = new PoliteWebCrawler(crawlConfig);

    // Set up event handlers
    crawler.PageCrawlStartingAsync += (s, e) =>
    {
        Console.WriteLine("Starting : {0}", e.PageToCrawl);
    };
    crawler.PageCrawlCompletedAsync += (s, e) =>
    {
        CrawledPage pg = e.CrawledPage;
        string fn = pg.Uri.Segments[pg.Uri.Segments.Length - 1];
        File.WriteAllText(fn, pg.Content.Text);
        //var hdoc = pg.HtmlDocument; // HtmlAgilityPack HtmlDocument
        Console.WriteLine("Completed : {0}", pg.Uri.AbsoluteUri);
    };

    // Start the crawl
    string siteUrl = "http://www.naver.com";
    Uri uri = new Uri(siteUrl);
    crawler.Crawl(uri);
}
public void Crawl_MinCrawlDelayDelayZero_DomainRateLimiterNotCalled()
{
    Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
    Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

    CrawledPage homePage = new CrawledPage(_rootUri) { Content = new PageContent { Text = "content here" } };
    CrawledPage page1 = new CrawledPage(uri1);
    CrawledPage page2 = new CrawledPage(uri2);

    List<Uri> links = new List<Uri> { uri1, uri2 };

    _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
    _fakeHttpRequester.Setup(f => f.MakeRequest(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
    _fakeHttpRequester.Setup(f => f.MakeRequest(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page2);
    _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    _unitUnderTest.Crawl(_rootUri);

    _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Never());
}
public string Execute()
{
    using (var db = new SongRecommendContext())
    {
        if (db.BaseWordCollectingSong.Find(SongId) != null)
        {
            return "이미 추가된 곡입니다"; // "This song has already been added"
        }
    }

    //---------------------------
    // Crawler setup
    //---------------------------
    var pageRequester = new PageRequester(_config);
    var crawler = new PoliteWebCrawler(_config, null, null, null, pageRequester, null, null, null, null);
    crawler.PageCrawlCompletedAsync += ProcessPageCrawlCompletedAsync;

    //---------------------------
    // Start crawling
    //---------------------------
    crawler.Crawl(new Uri($"https://www.melon.com/song/detail.htm?songId={SongId}"));

    return _message;
}
private void button_crawl_Click(object sender, EventArgs e)
{
    CrawlConfiguration crawlConfig = new CrawlConfiguration();
    crawlConfig.CrawlTimeoutSeconds = 100;
    crawlConfig.MaxConcurrentThreads = 10;
    crawlConfig.MaxPagesToCrawl = 1000;
    crawlConfig.UserAgentString = "abot v1.0 http://code.google.com/p/abot";

    PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);
    crawler.PageCrawlCompleted += crawler_ProcessPageCrawlCompleted;

    // This is synchronous; it will not go to the next line until the crawl has completed
    CrawlResult result = crawler.Crawl(new Uri("https://belaruspartisan.by/"));

    if (result.ErrorOccurred)
    {
        MessageBox.Show("Crawl of " + result.RootUri.AbsoluteUri + " completed with error: " + result.ErrorException.Message);
    }
    else
    {
        MessageBox.Show("Crawl of " + result.RootUri.AbsoluteUri + " completed without error.");
    }
}
public void Crawl_MinCrawlDelayDelayZero_StillCallsDomainRateLimiter()
{
    CrawledPage homePage = new CrawledPage(_rootUri) { Content = new PageContent { Text = "content here" } };

    _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    _unitUnderTest.Crawl(_rootUri);

    _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Exactly(1));
}
public int DoCrawl()
{
    CrawlConfiguration CConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();
    CConfig.MaxConcurrentThreads = maxConcurrentThreads;
    CConfig.MaxPagesToCrawl = maxPagesToCrawl;
    CConfig.CrawlTimeoutSeconds = crawlTimeoutSeconds;
    CConfig.HttpRequestTimeoutInSeconds = httpRequestTimeoutInSeconds;
    CConfig.LoginUser = loginUser;
    CConfig.LoginPassword = loginPassword;

    Console.WriteLine("Doing Crawl With Slack " + (slackBotEnabled ? "Enabled" : "Disabled"));

    PoliteWebCrawler crawler = new PoliteWebCrawler(CConfig, null, null, null, null, null, null, null, null);
    //PoliteWebCrawler crawler = new PoliteWebCrawler();

    errors = new List<Errors>();

    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    // This is synchronous; it will not go to the next line until the crawl has completed
    CrawlResult result = crawler.Crawl(new Uri(URL));

    if (result.ErrorOccurred)
    {
        Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
    }
    else
    {
        Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
    }

    IEnumerable<Errors> EnumList = errors.AsEnumerable();
    for (int i = 0; i < 525; i++)
    {
        if (EnumList.Where(x => x.ErrorCode == i).Count() != 0)
        {
            returnInt = 1;
            Console.ForegroundColor = ConsoleColor.Yellow;
            Console.WriteLine(i + " (" + getErrorName(i) + ") Errors:");
            slackMessage += i + " (" + getErrorName(i) + ") Errors:\n";
            Console.ForegroundColor = ConsoleColor.Red;
            foreach (Errors err in EnumList.Where(x => x.ErrorCode == i))
            {
                Console.WriteLine(" " + err.ErrorURL);
                slackMessage += " " + err.ErrorURL + "\n";
            }
        }
    }
    Console.ResetColor();

    if (slackMessage == "")
    {
        slackMessage = "No Errors In WebPage!";
    }

    Console.ForegroundColor = ConsoleColor.Green;
    Console.WriteLine("Done");
    Console.ResetColor();
    return returnInt;
}
private void ProcessPageCrawlCompletedAsync(object sender, PageCrawlCompletedArgs e)
{
    var crawledPage = e.CrawledPage;
    var doc = crawledPage.HtmlDocument.DocumentNode;
    var songNodes = doc.SelectNodes("//table/tbody/tr");

    //---------------------------
    // Validate the crawl result
    //---------------------------
    if (songNodes == null || songNodes.Count == 0)
    {
        _isCrawlingSuccess = false;
        return;
    }
    _isCrawlingSuccess = true;

    foreach (var node in songNodes)
    {
        try
        {
            using (var db = new SongRecommendContext())
            {
                //---------------------------
                // Parse song information
                //---------------------------
                var songId = node.SelectSingleNode(".//input[@class='input_check'] | .//input[@class='input_check ']").GetAttributeValue("value", 0);
                var title = node.SelectSingleNode(".//div[@class='ellipsis rank01']//a | .//div[@class='ellipsis rank01']//span[@class='fc_lgray']").InnerText;
                var singer = node.SelectSingleNode(".//div[@class='ellipsis rank02']//span").InnerText;

                if (songId == 0 || db.ProposeSong.Find(songId) != null)
                {
                    continue;
                }

                //---------------------------
                // Fetch the lyrics
                //---------------------------
                HttpClient client = new HttpClient();
                string jsonString = client.GetStringAsync($"https://www.melon.com/song/lyricInfo.json?songId={songId}").Result;
                var lyric = JObject.Parse(jsonString).Value<string>("lyric");
                if (lyric == null || lyric.Length == 0)
                {
                    continue;
                }

                //---------------------------
                // Analyze the suitability rate
                //---------------------------
                var rate = AnalyzeRateSvc.Execute(lyric).Rate;

                //---------------------------
                // Save to the database
                //---------------------------
                if (rate > 70)
                {
                    //---------------------------
                    // Fetch the like count
                    //---------------------------
                    jsonString = client.GetStringAsync($"https://www.melon.com/commonlike/getSongLike.json?contsIds={songId}").Result;
                    var like = 0;
                    try
                    {
                        like = JObject.Parse(jsonString).Value<IEnumerable<JToken>>("contsLike").First().Value<int>("SUMMCNT");
                    }
                    catch { }

                    //---------------------------
                    // Crawler setup
                    //---------------------------
                    var pageRequester = new PageRequester(_config);
                    var crawler = new PoliteWebCrawler(_config, null, null, null, pageRequester, null, null, null, null);
                    crawler.PageCrawlCompletedAsync += ProcessDetailPageCrawlCompletedAsync;

                    //---------------------------
                    // Start crawling
                    //---------------------------
                    crawler.Crawl(new Uri($"https://www.melon.com/song/detail.htm?songId={songId}"));

                    db.ProposeSong.Add(new ProposeSong
                    {
                        SongId = songId,
                        PlayListSeq = PlayListSeq,
                        Title = title,
                        Singer = singer,
                        Lyric = lyric,
                        Rate = rate,
                        Like = like,
                        Genre = _genre,
                        ReleaseDate = _releaseDate,
                        AddDate = DateTime.Now
                    });
                    db.SaveChanges();
                    _successCount++;
                }
            }
        }
        catch { }
    }
}
static void Main(string[] args)
{
    CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();
    crawlConfig.MaxConcurrentThreads = 5; // this overrides the config value
    crawlConfig.MaxCrawlDepth = 0;

    crawler = new PoliteWebCrawler();
    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    //var doc = new HtmlDocument();
    //doc.Load(@"C:\Users\lucao\Downloads\keketest.html");
    //var embedNodes = doc.DocumentNode.SelectSingleNode("//script[contains(text(), 'thunder_url')]");
    //var domain = Regex.Match(embedNodes.InnerText, @".*domain.*'(.*)'").Groups[1].ToString();
    //var thunder_url = Regex.Match(embedNodes.InnerText, ".*thunder_url.*\"(.*)\"").Groups[1].ToString();
    //var downloadMp3Link = domain + thunder_url;

    CrawlResult result;
    for (int i = 58; i > 30; i--)
    {
        DownloadLinkList.Clear();
        Thread.Sleep(60000);

        result = crawler.Crawl(new Uri($"http://www.kekenet.com/Article/15410/List_{i}.shtml"));
        if (result.ErrorOccurred)
        {
            Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
        }
        else
        {
            Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
        }

        if (DownloadLinkList.Count > 0)
        {
            DownloadMP3LinkList.Clear();
            foreach (var link in DownloadLinkList)
            {
                var sub_crawler = new PoliteWebCrawler();
                sub_crawler.PageCrawlStartingAsync += sub_crawler_ProcessPageCrawlStarting;
                sub_crawler.PageCrawlCompletedAsync += sub_crawler_ProcessPageCrawlCompleted;
                sub_crawler.PageCrawlDisallowedAsync += sub_crawler_PageCrawlDisallowed;
                sub_crawler.PageLinksCrawlDisallowedAsync += sub_crawler_PageLinksCrawlDisallowed;
                sub_crawler.Crawl(new Uri(link));
                Thread.Sleep(20000);
                sub_crawler?.Dispose();
            }
        }

        //"http://k6.kekenet.com/Sound/2018/01/scad180110.mp3"
        if (DownloadMP3LinkList.Count > 0)
        {
            foreach (var mp3Link in DownloadMP3LinkList)
            {
                WebClient client = new WebClient();
                Uri ur = new Uri(mp3Link);
                client.DownloadProgressChanged += WebClientDownloadProgressChanged;
                client.DownloadDataCompleted += WebClientDownloadCompleted;
                var file = @"C:\Users\lucao\Downloads\keke\" + mp3Link.Split('/').Last().ToString();
                client.DownloadFile(ur, file);
                Thread.Sleep(60000);
            }
        }
    }
}
public async Task Start(Uri targetUri)
{
    if (!(await IsRemoteServerAlive(targetUri)))
    {
        throw new WebException("No response from external server");
    }

    resultDetails = new ConcurrentBag<TestResultDetail>();
    processedPages = new ConcurrentDictionary<string, byte>();
    result = new TestResult()
    {
        Authority = targetUri.AbsoluteUri,
        TestDate = DateTime.Now,
        Status = 1
    };
    RepositoryInsertRequested?.Invoke(this, new TestResultArgs(result));

    CrawlConfiguration configuration = new CrawlConfiguration()
    {
        MaxPagesToCrawl = MaxPagesToCrawl,
        MaxCrawlDepth = MaxCrawlDepth,
        IsExternalPageCrawlingEnabled = IsExternalPageCrawlingEnabled,
        IsExternalPageLinksCrawlingEnabled = IsExternalPageLinksCrawlingEnabled,
        NumberOfRecurrentRequests = NumberOfRecurrentRequests,
        MaxConcurrentThreads = MaxConcurrentThreads
    };

    PoliteWebCrawler crawler = new PoliteWebCrawler(configuration, crawlDecisionMaker: null, memoryManager: null, scheduler: null, hyperLinkParser: null, domainRateLimiter: null, robotsDotTextFinder: null, threadManager: null, pageRequester: new PageRequesterWithRepeats(configuration));
    crawler.PageRequestSent += Crawler_PageRequestSent;
    crawler.PageResponseReceived += Crawler_PageResponseReceived;
    crawler.PageCrawlCompleted += Crawler_ProcessPageCrawlCompleted;

    crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
    {
        CrawlDecision decision = new CrawlDecision { Allow = true };

        MatchCollection mc = Regex.Matches(pageToCrawl.Uri.AbsoluteUri, @"http[s]?:\/\/");
        if (mc.Count > 1)
        {
            return new CrawlDecision { Allow = false, Reason = "Dont want to crawl external pages" };
        }

        return decision;
    });

    TestStarted?.Invoke(this, new TestResultArgs(result));
    crawler.Crawl(targetUri);

    result.TestResultDetails = resultDetails.ToList();
    result.MinResponseTime = rootMinResponseTime;
    result.MaxResponseTime = rootMaxResponseTime;
    result.MeanResponseTime = rootMeanResponseTime / numberOfPagesCrawled;
    result.Status = 0;

    TestFinished?.Invoke(this, new TestResultArgs(result));
    RepositoryInsertDetailsRequested?.Invoke(this, new TestResultArgs(result));
}
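The ShouldCrawlPage delegate above treats any URL containing more than one scheme prefix as external. A host comparison against the crawl's root URI is another way to express the same "internal pages only" rule; the sketch below is an illustrative variant only, reusing the same Abot types (PageToCrawl, CrawlContext, CrawlDecision) and the same crawler instance that appear in the code above, and it would replace the regex-based delegate rather than be registered alongside it.

// Illustrative variant of the external-page filter above: allow a page only when
// its host matches the host of the crawl's root URI.
crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
{
    bool sameHost = string.Equals(
        pageToCrawl.Uri.Host,
        crawlContext.RootUri.Host,
        StringComparison.OrdinalIgnoreCase);

    return sameHost
        ? new CrawlDecision { Allow = true }
        : new CrawlDecision { Allow = false, Reason = "External host" };
});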
public int Execute()
{
    using (var db = new SongRecommendContext())
    {
        // Query the target songs
        var targetSong = from baseSong in db.BaseWordCollectingSong
                         join proposeSong in db.ProposeSong on baseSong.SongId equals proposeSong.SongId into proposeSongs
                         from defaultPropose in proposeSongs.DefaultIfEmpty()
                         where baseSong.Status == "Tokenized" && defaultPropose == null
                         select baseSong;

        // Calculate the rate
        foreach (var song in targetSong)
        {
            try
            {
                var rateResult = AnalyzeRateSvc.Execute(song.Lyric);
                song.Rate = rateResult.Rate;
                song.Status = "Analyzed";

                if (song.Rate > 70)
                {
                    //---------------------------
                    // Fetch the like count
                    //---------------------------
                    HttpClient client = new HttpClient();
                    var jsonString = client.GetStringAsync($"https://www.melon.com/commonlike/getSongLike.json?contsIds={song.SongId}").Result;
                    var like = 0;
                    try
                    {
                        like = JObject.Parse(jsonString).Value<IEnumerable<JToken>>("contsLike").First().Value<int>("SUMMCNT");
                    }
                    catch { }

                    //---------------------------
                    // Crawler setup
                    //---------------------------
                    var pageRequester = new PageRequester(_config);
                    var crawler = new PoliteWebCrawler(_config, null, null, null, pageRequester, null, null, null, null);
                    crawler.PageCrawlCompletedAsync += ProcessDetailPageCrawlCompletedAsync;

                    //---------------------------
                    // Start crawling
                    //---------------------------
                    crawler.Crawl(new Uri($"https://www.melon.com/song/detail.htm?songId={song.SongId}"));

                    db.ProposeSong.Add(new ProposeSong
                    {
                        SongId = song.SongId,
                        PlayListSeq = song.PlayListSeq,
                        Title = song.Title,
                        Singer = song.Singer,
                        Lyric = song.Lyric,
                        Rate = song.Rate ?? 0,
                        Like = like,
                        Genre = _genre,
                        ReleaseDate = _releaseDate,
                        AddDate = DateTime.Now
                    });
                    _successCount++;
                }
            }
            catch { }
        }

        db.SaveChanges();
        return _successCount;
    }
}
public void StartWebCrawler()
{
    if (File.Exists("FailLog.txt"))
    {
        File.Delete("FailLog.txt");
    }
    FileStream FailLog = new FileStream("FailLog.txt", FileMode.Append, FileAccess.Write, FileShare.ReadWrite);
    StreamWriter sw = new StreamWriter(FailLog, Encoding.Default);

    Stopwatch stopwatch = new Stopwatch();
    stopwatch.Start();

    for (int i = 1; i <= Chalaoshi.MaximumTeacherPage; i++)
    {
        LeftLinkList.Add("https://chalaoshi.cn/teacher/" + i.ToString() + "/");
    }
    LeftLinkList.Remove("https://chalaoshi.cn/teacher/2485/");
    LeftLinkList.Remove("https://chalaoshi.cn/teacher/3433/");

    for (int Loopi = 0; Loopi < 5; Loopi++) // Retry a few passes; anything still not crawled after that is a server-side problem
    {
        if (LeftLinkList.Count == 0)
        {
            break;
        }
        PageLinkList.Clear();
        LeftLinkList.ForEach(i => PageLinkList.Add(i));

        //Task task = new Task(() =>
        //{
        Parallel.For(0, PageLinkList.Count, (i) =>
        {
            var crawler = new PoliteWebCrawler();
            var url = PageLinkList[i];
            Console.WriteLine("Start:" + url);
            crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
            //crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;
            try
            {
                crawler.Crawl(new Uri(url));
            }
            catch (Exception ex)
            {
                sw.WriteLine(url + ex.Message);
            }
            Console.WriteLine("Finish:" + url);
            Thread.Sleep(20); // Give the server a short break
        });
        //});
        //task.Start();
        //task.Wait();
        //Thread.Sleep(200); // Give the server a short break
    }

    if (LeftLinkList.Count > 0)
    {
        for (int i = 0; i < LeftLinkList.Count; i++)
        {
            sw.WriteLine("#{0}#未写入", LeftLinkList[i]); // "#{0}# not written"
        }
        sw.Close();
        FailLog.Close();
    }

    var fileName = $"CLStext_{DateTime.Now.Year.ToString()}_{DateTime.Now.Month.ToString()}_{DateTime.Now.Day}_{DateTime.Now.Hour}_{DateTime.Now.Minute}_{DateTime.Now.Second}.csv";
    FileStream file = new FileStream(fileName, FileMode.Append, FileAccess.Write); // Defined here to get the best performance out of the read/write lock
    StreamWriter streamWriter = new StreamWriter(file, Encoding.Default); // Create the write stream
    for (int i = 0; i < TotalInfo.Count; i++)
    {
        streamWriter.WriteLine(TotalInfo[i]);
    }
    streamWriter.Close();
    file.Close();

    stopwatch.Stop();
    try
    {
        Console.WriteLine("Finished completely.\nTime Consumption:{0}\nPage Number:{1}\nTime Per Page:{2} milliseconds.\nPress any key to continue.", stopwatch.Elapsed, TotalInfo.Count, stopwatch.ElapsedMilliseconds / TotalInfo.Count);
    }
    catch
    {
        Console.WriteLine("当前已爬网页数量为0"); // "The number of crawled pages is currently 0"
    }
    Console.WriteLine(FailedPageCount.ToString());
    Console.ReadLine();
}
static void Main(string[] args)
{
    try
    {
        // Create a crawler instance
        // IWebCrawler crawler = new PoliteWebCrawler();

        // Create a crawler instance with options
        var crawlConfig = new CrawlConfiguration();
        crawlConfig.CrawlTimeoutSeconds = 5000;
        // crawlConfig.MaxConcurrentThreads = 10;
        crawlConfig.MaxConcurrentThreads = 1;
        // crawlConfig.MaxPagesToCrawl = 10;
        crawlConfig.MaxPagesToCrawl = 50;
        crawlConfig.UserAgentString = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0";
        IWebCrawler crawler = new PoliteWebCrawler(crawlConfig);

        // Set up event handlers
        crawler.PageCrawlStartingAsync += (s, e) =>
        {
            Console.WriteLine($"Starting : {e.PageToCrawl}");
        };
        crawler.PageCrawlCompletedAsync += (s, e) =>
        {
            CrawledPage pg = e.CrawledPage;
            HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            string fn = pg.Uri.Segments[pg.Uri.Segments.Length - 1];
            string path = @"C:\Users\yjs3694\source\repos\AbotCrawler\AbotCrawler\bin\Debug\crawl.txt";
            // File.WriteAllText(fn, pg.Content.Text);
            // File.WriteAllText(directory, pg.Content.Text);

            doc.LoadHtml(pg.Content.Text);
            HtmlAgilityPack.HtmlNode singleNode = doc.GetElementbyId("mArticle"); // The element whose ID equals the given tag ID

            // The <a> tags among singleNode's descendants; the leading "." is required to search from the current node
            // HtmlNodeCollection anchors = singleNode.SelectNodes(".//a");

            // Returns the attribute (class) value of the singleNode node
            // string className = singleNode.GetAttributeValue("class", "");

            // HtmlAgilityPack.HtmlNodeCollection article = doc.DocumentNode.SelectNodes("//div[@class='articles']");
            // HtmlAgilityPack.HtmlNodeCollection article = doc.DocumentNode.SelectNodes("div[@class='hotissue_builtin']");

            if (singleNode != null)
            {
                // File.WriteAllText(directory, singleNode.SelectSingleNode(".//article/div[1]/div[0]/div[2]").InnerText);
                //*[@id="mArticle"]/div[2]/div[1]/div[2]/div[1]/ol
                //*[@id="mArticle"]/div[2]/div[1]/div[3]/div[1]/ol
                // var content = singleNode.SelectSingleNode("//div[2]/div[1]/div[2]/div[1]/ol")?.InnerText;
                var content = singleNode.SelectSingleNode("//div[2]/div[1]/div[3]/div[1]/ol")?.InnerText;
                if (content != null)
                {
                    var bbb = content.Replace("\n\n\n", "");
                    // File.WriteAllText(path, bbb);
                    // File.AppendAllText(path, "\n\n\n");
                    File.AppendAllText(path, bbb);
                    File.AppendAllText(path, "\n\n\n");
                }
            }
            //var hdoc = pg.HtmlDocument; // HtmlAgilityPack HtmlDocument
            Console.WriteLine("Completed : {0}", pg.Uri.AbsoluteUri);
        };

        // Start the crawl
        string siteUrl = "http://www.daum.net";
        Uri uri = new Uri(siteUrl);
        for (int i = 0; i < 5; i++)
        {
            crawler.Crawl(uri);
            System.Threading.Thread.Sleep(300);
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex);
    }
}
static int FailedPageCount = 0; // Used for debugging

public void StartWebCrawler()
{
    XmlConfigurator.Configure(); // This line controls whether the logging provided by log4net is used

    /*
     * Single-step approach, now abandoned
     *
     * PoliteWebCrawler[] crawler = new PoliteWebCrawler[Chalaoshi.MaximumTeacherPage];
     * for (int i = 0; i < Chalaoshi.MaximumTeacherPage; i++)
     * {
     *     crawler[i] = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);
     *     crawler[i].PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
     *     crawler[i].PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
     *     crawler[i].PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
     *     crawler[i].PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;
     *
     *     CrawlResult result = crawler[i].Crawl(new Uri($"https://chalaoshi.cn/teacher/{i + 1}/")); // This is synchronous; it will not go to the next line until the crawl has completed
     *
     *     if (result.ErrorOccurred)
     *         Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
     *     else
     *         Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
     * }
     */

    /*
     * Task-based approach did not work; the reason is unknown
     *
     * Task[] TotalTasks = new Task[Chalaoshi.MaximumTeacherPage];
     * Console.WriteLine("Start Crawling");
     * for (var i = 0; i < Chalaoshi.MaximumTeacherPage; i++)
     * {
     *     string url = "https://chalaoshi.cn/teacher/" + (i + 1).ToString() + "/";
     *     TotalTasks[i] = new Task(() =>
     *     {
     *         var crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);
     *         Console.WriteLine("Start:" + url);
     *         crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
     *         crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
     *         crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
     *         crawler.Crawl(new Uri(url));
     *         Console.WriteLine("Finish:" + url);
     *     });
     *     TotalTasks[i].Start();
     * }
     * Task.WaitAll(TotalTasks);
     */

    Stopwatch stopwatch = new Stopwatch();
    stopwatch.Start();

    Task task = new Task(() =>
    {
        Parallel.For(1, Chalaoshi.MaximumTeacherPage + 1, (i) =>
        {
            var crawler = new PoliteWebCrawler();
            string url = "https://chalaoshi.cn/teacher/" + i.ToString() + "/";
            Console.WriteLine("Start:" + url);
            crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;
            crawler.Crawl(new Uri(url));
            Console.WriteLine("Finish:" + url);
            Thread.Sleep(50); // Give the server a breather
        });
    });
    task.Start();
    task.Wait();

    stopwatch.Stop();
    Console.WriteLine("Finished Completely.\nTime Consume:{0}\nPage Number:{1}\nTime Per Page:{2} milliseconds.\nPress any key to continue.", stopwatch.Elapsed, CrawledPageCount, stopwatch.ElapsedMilliseconds / CrawledPageCount);
    Console.WriteLine(FailedPageCount.ToString());
    Console.ReadLine();
}
private static void ParseSucceed(Options options)
{
    CrawlConfiguration crawlConfig = new CrawlConfiguration()
    {
        CrawlTimeoutSeconds = options.Timeout,
        MaxConcurrentThreads = options.ConcurrentPages,
        MaxPagesToCrawl = 0,
        MaxPagesToCrawlPerDomain = 0,
        UserAgentString = options.UserAgentString ?? new Uri(options.Url).Host + " Crawler",
    };

    Dictionary<int, string> fileList = new Dictionary<int, string>(Math.Abs(options.Start - options.End));
    string fileToWrite = options.Output ?? new Uri(options.Url).Host + ".txt";

    if (File.Exists(fileToWrite))
    {
        string[] preliminaryFileList = File.ReadAllLines(fileToWrite);
        for (int i = 0; i < preliminaryFileList.Length; i++)
        {
            Match entry = Regex.Match(preliminaryFileList[i], @"^(\d+) = (.*)$");
            if (entry.Groups[0].Success)
            {
                fileList.Add(int.Parse(entry.Groups[1].Value), entry.Groups[2].Value);
            }
        }
    }

    for (int i = options.Start; i != options.End; i += options.End.CompareTo(i))
    {
        if (!fileList.ContainsKey(i) || (Uri.TryCreate(fileList[i], UriKind.Absolute, out Uri entry) ? options.RetryValid : options.RetryInvalid))
        {
            Uri uri = new Uri(options.Url + i);
            CrawlResult rawResult;
            using (PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, new NobotDotTxtIgnorer()))
                rawResult = crawler.Crawl(uri);

            string processedResult = "invalid file";
            switch (options.ValidationMode)
            {
                case ValidationModes.Redirect:
                    if (!rawResult.ErrorOccurred && rawResult.CrawlContext.RootUri.AbsoluteUri != uri.AbsoluteUri)
                    {
                        processedResult = rawResult.CrawlContext.RootUri.AbsoluteUri;
                    }
                    break;
                case ValidationModes.Valid:
                    if (!rawResult.ErrorOccurred)
                    {
                        processedResult = rawResult.CrawlContext.RootUri.AbsoluteUri;
                    }
                    break;
            }

            //HACK
            if (!fileList.ContainsKey(i) || fileList[i] != processedResult)
            {
                if (!fileList.ContainsKey(i))
                {
                    fileList.Add(i, processedResult);
                }
                else if (fileList[i] != processedResult)
                {
                    fileList[i] = processedResult;
                }
                File.WriteAllLines(fileToWrite, fileList.Select(x => x.Key + " = " + x.Value).ToArray()); //HACK
            }
            Console.WriteLine(i + " = " + fileList[i]);
        }
    }
}
public void Run()
{
    _crawler.Crawl(new Uri("https://www.baidu.com/s?wd=乙肝 症状"));
}
public AnalyzeSongResult Execute()
{
    //---------------------------
    // Fetch the lyrics
    //---------------------------
    HttpClient client = new HttpClient();
    string jsonString = client.GetStringAsync($"https://www.melon.com/song/lyricInfo.json?songId={SongId}").Result;
    var lyric = JObject.Parse(jsonString).Value<string>("lyric");
    if (lyric == null || lyric.Length == 0)
    {
        return null;
    }

    var analyzeResult = AnalyzeRateSvc.Execute(lyric);

    //---------------------------
    // Fetch the like count
    //---------------------------
    jsonString = client.GetStringAsync($"https://www.melon.com/commonlike/getSongLike.json?contsIds={SongId}").Result;
    var like = 0;
    try
    {
        like = JObject.Parse(jsonString).Value<IEnumerable<JToken>>("contsLike").First().Value<int>("SUMMCNT");
    }
    catch { }

    //---------------------------
    // Crawler setup
    //---------------------------
    var pageRequester = new PageRequester(_config);
    var crawler = new PoliteWebCrawler(_config, null, null, null, pageRequester, null, null, null, null);
    crawler.PageCrawlCompletedAsync += ProcessDetailPageCrawlCompletedAsync;

    //---------------------------
    // Start crawling
    //---------------------------
    crawler.Crawl(new Uri($"https://www.melon.com/song/detail.htm?songId={SongId}"));

    var song = new ProposeSong
    {
        SongId = SongId,
        Title = _title,
        Singer = _singer,
        Lyric = lyric,
        Rate = analyzeResult.Rate,
        Like = like,
        Genre = _genre,
        ReleaseDate = _releaseDate,
        AddDate = DateTime.Now
    };

    if (analyzeResult.Rate > 70)
    {
        using (var db = new SongRecommendContext())
        {
            if (db.ProposeSong.Find(SongId) == null)
            {
                db.ProposeSong.Add(song);
                db.SaveChanges();
            }
        }
    }

    var resultLyric = lyric;
    foreach (var word in analyzeResult.Words)
    {
        resultLyric = resultLyric.Replace(word.Word, $@"<span class='v-chip theme--dark light-green darken-2'><span class='v-chip__content tooltip'>{word.Word}<span class='tooltiptext'>{(int)word.Rate}%</span></span></span>");
    }

    var result = new AnalyzeSongResult
    {
        SongId = SongId,
        Title = _title,
        Singer = _singer,
        Lyric = resultLyric,
        Rate = analyzeResult.Rate,
        AlbumCover = _albumCover,
        AlbumName = _albumName
    };
    return result;
}