/// <summary>
/// Consumes a domain while <c>HttpStatusesToProcess</c> is an empty array and the crawled page's
/// <c>HttpWebResponse</c> is null, then verifies that the provider, factory and crawler were each
/// used once and that every fake processor received the crawled page exactly once.
/// </summary>
/// <remarks>
/// NOTE(review): the arrange step calls <c>PageRequester.MakeRequest</c> against
/// http://www.adamthings.com, which presumably performs a real network request - confirm.
/// </remarks>
public void Consume_ValidDomain_AppConfigHttpStatusesToProcessNullWebResponse_CrawlsPerformed()
{
    //Arrange
    CrawlResult crawlOutcome = new CrawlResult();
    crawlOutcome.CrawlContext = GetCrawlContext(_dummyCrawlProcessors);

    PageRequester requester = new PageRequester(new CrawlConfiguration());
    CrawledPage fetchedPage = requester.MakeRequest(new Uri("http://www.adamthings.com"));

    // The scenario under test: no statuses configured and no web response on the page.
    _dummyConfig.HttpStatusesToProcess = new string[0];
    fetchedPage.HttpWebResponse = null;

    _fakeWebCrawlerFactory
        .Setup(factory => factory.CreateInstance())
        .Returns(_fakeWebCrawler.Object);

    _fakeWebCrawler
        .Setup(crawler => crawler.Crawl(It.IsAny<Uri>(), It.IsAny<CancellationTokenSource>()))
        .Returns(crawlOutcome)
        .Callback(() =>
        {
            // Simulate the crawler reporting one completed page while Crawl runs.
            var completedArgs = new PageCrawlCompletedArgs(GetCrawlContext(_dummyCrawlProcessors), fetchedPage);
            _fakeWebCrawler.Raise(crawler => crawler.PageCrawlCompleted += null, completedArgs);
        });

    _fakeProcessorProvider
        .Setup(provider => provider.GetProcessors())
        .Returns(_dummyCrawlProcessors);

    //Act
    Domain domain = new Domain { DomainId = 1, Uri = new Uri("http://www.adamthings.com") };
    _uut.Consume(domain, _dummyCancellationToken);

    //Assert
    _fakeProcessorProvider.Verify(provider => provider.GetProcessors(), Times.Exactly(1));
    _fakeWebCrawlerFactory.Verify(factory => factory.CreateInstance(), Times.Exactly(1));
    _fakeWebCrawler.Verify(
        crawler => crawler.Crawl(It.IsAny<Uri>(), It.IsAny<CancellationTokenSource>()),
        Times.Exactly(1));
    _fakeProcessor1.Verify(
        processor => processor.ProcessCrawledPage(It.IsAny<CrawlContext>(), It.IsAny<CrawledPage>()),
        Times.Exactly(1));
    _fakeProcessor2.Verify(
        processor => processor.ProcessCrawledPage(It.IsAny<CrawlContext>(), It.IsAny<CrawledPage>()),
        Times.Exactly(1));
    _fakeProcessor3.Verify(
        processor => processor.ProcessCrawledPage(It.IsAny<CrawlContext>(), It.IsAny<CrawledPage>()),
        Times.Exactly(1));
}
/// <summary>
/// Makes the first fake processor throw from <c>ProcessCrawledPage</c> and verifies that
/// <c>Consume</c> still completes, with the provider, factory and crawler each used once and all
/// three processors invoked exactly once.
/// </summary>
/// <remarks>
/// NOTE(review): the arrange step calls <c>PageRequester.MakeRequest</c> against
/// http://www.adamthings.com, which presumably performs a real network request - confirm.
/// </remarks>
public void Consume_PageProcessorThrowsException_DoesNotCrash()
{
    //Arrange
    CrawlResult crawlOutcome = new CrawlResult();
    crawlOutcome.CrawlContext = GetCrawlContext(_dummyCrawlProcessors);

    PageRequester requester = new PageRequester(new CrawlConfiguration());
    CrawledPage fetchedPage = requester.MakeRequest(new Uri("http://www.adamthings.com"));

    _fakeWebCrawlerFactory
        .Setup(factory => factory.CreateInstance())
        .Returns(_fakeWebCrawler.Object);

    _fakeWebCrawler
        .Setup(crawler => crawler.Crawl(It.IsAny<Uri>(), It.IsAny<CancellationTokenSource>()))
        .Returns(crawlOutcome)
        .Callback(() =>
        {
            // Simulate the crawler reporting one completed page while Crawl runs.
            var completedArgs = new PageCrawlCompletedArgs(GetCrawlContext(_dummyCrawlProcessors), fetchedPage);
            _fakeWebCrawler.Raise(crawler => crawler.PageCrawlCompleted += null, completedArgs);
        });

    _fakeProcessorProvider
        .Setup(provider => provider.GetProcessors())
        .Returns(_dummyCrawlProcessors);

    // The failure being exercised: processor 1 blows up on the crawled page.
    _fakeProcessor1
        .Setup(processor => processor.ProcessCrawledPage(It.IsAny<CrawlContext>(), fetchedPage))
        .Throws(new Exception("oh no page"));

    //Act
    Domain domain = new Domain { DomainId = 1, Uri = new Uri("http://www.adamthings.com") };
    _uut.Consume(domain, _dummyCancellationToken);

    //Assert
    _fakeProcessorProvider.Verify(provider => provider.GetProcessors(), Times.Exactly(1));
    _fakeWebCrawlerFactory.Verify(factory => factory.CreateInstance(), Times.Exactly(1));
    _fakeWebCrawler.Verify(
        crawler => crawler.Crawl(It.IsAny<Uri>(), It.IsAny<CancellationTokenSource>()),
        Times.Exactly(1));
    _fakeProcessor1.Verify(
        processor => processor.ProcessCrawledPage(It.IsAny<CrawlContext>(), It.IsAny<CrawledPage>()),
        Times.Exactly(1));
    _fakeProcessor2.Verify(
        processor => processor.ProcessCrawledPage(It.IsAny<CrawlContext>(), It.IsAny<CrawledPage>()),
        Times.Exactly(1));
    _fakeProcessor3.Verify(
        processor => processor.ProcessCrawledPage(It.IsAny<CrawlContext>(), It.IsAny<CrawledPage>()),
        Times.Exactly(1));
}
/// <summary>
/// Creates fresh mocks for the HTTP client and the content extractor and builds the
/// <c>PageRequester</c> under test from them and the shared crawl configuration.
/// </summary>
public void SetUp()
{
    var httpClientMock = new Mock<HttpClient>();
    var extractorMock = new Mock<IWebContentExtractor>();

    _fakeHttpClient = httpClientMock;
    _fakeWebContentExtractor = extractorMock;

    _unitUnderTest = new PageRequester(_crawlConfig, extractorMock.Object, httpClientMock.Object);
}
/// <summary>
/// Demo entry point: crawls http://wvtesting2.com/ with a <c>PoliteWebCrawler</c> whose page
/// requester extracts content through a PhantomJS-backed extractor, prints whether the crawl
/// finished with an error, then waits for console input.
/// </summary>
/// <param name="args">Command-line arguments; not read by this method.</param>
static void Main(string[] args)
{
    // Web Extractor is not currently thread-safe, so the crawl is limited to a single thread.
    CrawlConfiguration crawlConfig = new CrawlConfiguration { MaxConcurrentThreads = 1 };

    // Create the PhantomJS instance. This will spawn a new PhantomJS process using phantomjs.exe.
    // Make sure to dispose this instance or you will have a zombie process!
    // NOTE(review): this method never disposes the driver directly; presumably disposing the
    // crawler cascades down to it - confirm.
    IWebDriver phantomDriver = CreatePhantomJsDriver(crawlConfig);

    // Create the content extractor that uses PhantomJS.
    IWebContentExtractor jsExtractor = new JavaScriptContentExtractor(phantomDriver);

    // Create a PageRequester that will use the extractor.
    IPageRequester jsRequester = new PageRequester(crawlConfig, jsExtractor);

    using (IWebCrawler politeCrawler = new PoliteWebCrawler(crawlConfig, null, null, null, jsRequester, null, null, null, null))
    {
        politeCrawler.PageCrawlCompleted += OnPageCrawlCompleted;

        CrawlResult outcome = politeCrawler.Crawl(new Uri("http://wvtesting2.com/"));
        string rootAddress = outcome.RootUri.AbsoluteUri;

        if (!outcome.ErrorOccurred)
        {
            Console.WriteLine("Crawl of {0} completed without error.", rootAddress);
        }
        else
        {
            Console.WriteLine("Crawl of {0} completed with error: {1}", rootAddress, outcome.ErrorException.Message);
        }
    }

    // Keep the console window open until the user presses a key.
    Console.Read();
}
/// <summary>
/// Issues a request to https://google.com/ through a <c>PageRequester</c> built with a
/// non-default configuration and checks that the returned crawled page carries no request
/// exception, the same Uri instance, request/response messages and non-empty content text.
/// </summary>
/// <returns>A task that completes when the assertions have run.</returns>
public async Task MakeRequestAsync_RealCall_ReturnsExpectedCrawledPageObject()
{
    //Arrange
    var config = new CrawlConfiguration()
    {
        IsSslCertificateValidationEnabled = false,
        IsAlwaysLogin = true,
        IsHttpRequestAutomaticDecompressionEnabled = true,
        IsSendingCookiesEnabled = true,
        HttpProtocolVersion = HttpProtocolVersion.Version10
    };
    var google = new Uri("https://google.com/");

    // The using block guarantees the requester is disposed even when an assertion throws;
    // previously a failing assertion skipped the trailing Dispose() call.
    using (var unitUnderTest = new PageRequester(config, new WebContentExtractor()))
    {
        //Act
        var result = await unitUnderTest.MakeRequestAsync(google);

        //Assert
        Assert.IsNull(result.HttpRequestException);
        Assert.AreSame(google, result.Uri);
        Assert.IsNotNull(result.HttpRequestMessage);
        Assert.IsNotNull(result.HttpResponseMessage);
        Assert.IsNotNull(result.Content);
        Assert.AreNotEqual("", result.Content.Text);
    }
}
/// <summary>
/// Requests one page expected to succeed and one Status404 page from the local test site and
/// stores both results in fixture fields for the tests to use.
/// </summary>
public void TestFixtureSetup()
{
    var crawlConfig = new CrawlConfiguration { UserAgentString = "aaa" };
    var requester = new PageRequester(crawlConfig);

    var goodUri = new Uri("http://localhost.fiddler:1111/");
    var badUri = new Uri("http://localhost.fiddler:1111/HttpResponse/Status404");

    _goodPageResult = requester.MakeRequest(goodUri);
    _badPageResult = requester.MakeRequest(badUri);
}
/// <summary>
/// Requests https://diksiyonaryo.ph with a default-configured <c>PageRequester</c> and writes
/// the page Uri and numeric HTTP status to the console.
/// </summary>
/// <returns>A task that completes once the result has been written.</returns>
private async Task DemoPageRequester()
{
    // Dispose the requester when done instead of leaving it for the finalizer.
    using (var pageRequester = new PageRequester(new CrawlConfiguration(), new WebContentExtractor()))
    {
        var result = await pageRequester.MakeRequestAsync(new Uri("https://diksiyonaryo.ph"));

        // The response message may be absent when the request itself failed, so the status is
        // read null-safely rather than dereferenced unconditionally.
        var summary = new
        {
            url = result.Uri,
            status = (int?)result.HttpResponseMessage?.StatusCode
        };

        // Console.WriteLine has no message-template support: the earlier "{result}" + object
        // concatenation printed the placeholder text literally in front of the data.
        Console.WriteLine(summary);
    }
}
/// <summary>
/// Requests http://wvtesting2.com with a default-configured <c>PageRequester</c> and logs the
/// page Uri and numeric HTTP status.
/// </summary>
/// <returns>A task that completes once the result has been logged.</returns>
private static async Task DemoPageRequester()
{
    // Dispose the requester when done instead of leaving it for the finalizer.
    using (var pageRequester = new PageRequester(new CrawlConfiguration(), new WebContentExtractor()))
    {
        var result = await pageRequester.MakeRequestAsync(new Uri("http://wvtesting2.com"));

        // The response message may be absent when the request itself failed, so the status is
        // read null-safely rather than dereferenced unconditionally.
        Log.Logger.Information("{result}", new
        {
            url = result.Uri,
            status = (int?)result.HttpResponseMessage?.StatusCode
        });
    }
}
/// <summary>
/// Requests the site simulator's base address and its Status404 endpoint and stores both
/// crawled pages in fixture fields for the tests to use.
/// </summary>
/// <returns>A task that completes when both requests have finished.</returns>
public async Task TestFixtureSetup()
{
    var settings = new UnitTestConfig();
    var requester = new PageRequester(new CrawlConfiguration { UserAgentString = "aaa" });

    var goodUri = new Uri(settings.SiteSimulatorBaseAddress);
    _goodPageResult = await requester.MakeRequestAsync(goodUri);

    var notFoundAddress = settings.SiteSimulatorBaseAddress + "/HttpResponse/Status404";
    _badPageResult = await requester.MakeRequestAsync(new Uri(notFoundAddress));
}
/// <summary>
/// Requests http://msn.com with a default-configured <c>PageRequester</c> and logs the page Uri
/// and numeric HTTP status.
/// </summary>
/// <returns>A task that completes once the result has been logged.</returns>
private static async Task DemoSinglePageRequest()
{
    // Dispose the requester when done instead of leaving it for the finalizer.
    using (var pageRequester = new PageRequester(new CrawlConfiguration(), new WebContentExtractor()))
    {
        var crawledPage = await pageRequester.MakeRequestAsync(new Uri("http://msn.com"));

        // The response message may be absent when the request itself failed, so the status is
        // read null-safely rather than dereferenced unconditionally.
        Log.Logger.Information("{result}", new
        {
            url = crawledPage.Uri,
            status = (int?)crawledPage.HttpResponseMessage?.StatusCode
        });
    }
}
/// <summary>
/// Requests one page expected to succeed and one Status404 page from the local test site, then
/// builds the <c>RobotsDotTextFinder</c> under test around a mocked page requester.
/// </summary>
public void SetUp()
{
    var crawlConfig = new CrawlConfiguration { UserAgentString = "aaa" };
    var realRequester = new PageRequester(crawlConfig);

    var goodUri = new Uri("http://localhost.fiddler:1111/");
    var badUri = new Uri("http://localhost.fiddler:1111/HttpResponse/Status404");

    _goodPageResult = realRequester.MakeRequest(goodUri);
    _badPageResult = realRequester.MakeRequest(badUri);

    // The finder itself only ever sees the mock, not the real requester used above.
    _fakePageRequester = new Mock<IPageRequester>();
    _uut = new RobotsDotTextFinder(_fakePageRequester.Object);
}
/// <summary>
/// Downloads the page at <paramref name="url"/> and parses ticket information from it.
/// </summary>
/// <param name="url">Address of the tickets page.</param>
/// <param name="cancellationToken">Token passed on to the download and the parse step.</param>
/// <returns>
/// The parsed tickets, or a <c>PerformanceTickets</c> whose description is
/// <c>CommonTags.NoTickets</c> when <paramref name="url"/> is null, empty or equal to
/// <c>CommonTags.NotDefined</c>.
/// </returns>
public async Task<IPerformanceTickets> ParseFromUrl(string url, CancellationToken cancellationToken)
{
    bool hasUsableUrl = !string.IsNullOrEmpty(url) && url != CommonTags.NotDefined;
    if (!hasUsableUrl)
    {
        // Nothing to download: report "no tickets" without touching the network.
        var noTickets = new PerformanceTickets();
        noTickets.Description = CommonTags.NoTickets;
        return noTickets;
    }

    var pageContent = await PageRequester.Request(url, cancellationToken);
    var tickets = await PrivateParse(pageContent, cancellationToken);
    return tickets;
}
/// <summary>
/// Crawls the DJ playlist song list in pages of 50, advancing <c>startIndex</c> until
/// <c>_isCrawlingSuccess</c> is no longer true.
/// </summary>
/// <returns>The value of <c>_successCount</c> after the loop ends.</returns>
/// <remarks>
/// NOTE(review): the loop only stops once something sets <c>_isCrawlingSuccess</c> to false -
/// presumably the page-completed handler; verify it always runs before <c>Crawl</c> returns.
/// </remarks>
public int Execute()
{
    // 1. Crawl the DJ playlist
    _isCrawlingSuccess = true;

    //---------------------------
    // Crawler configuration
    //---------------------------
    // Built once and shared by every page: the settings do not depend on the loop variable,
    // so re-creating the object on each iteration was wasted work.
    var config = new CrawlConfiguration
    {
        CrawlTimeoutSeconds = 0,
        DownloadableContentTypes = "text/html, text/plain",
        HttpServicePointConnectionLimit = 200,
        HttpRequestTimeoutInSeconds = 35,
        HttpRequestMaxAutoRedirects = 7,
        IsExternalPageCrawlingEnabled = false,
        IsExternalPageLinksCrawlingEnabled = false,
        IsUriRecrawlingEnabled = false,
        IsHttpRequestAutoRedirectsEnabled = true,
        IsHttpRequestAutomaticDecompressionEnabled = false,
        IsRespectRobotsDotTextEnabled = false,
        IsRespectMetaRobotsNoFollowEnabled = false,
        IsRespectAnchorRelNoFollowEnabled = false,
        IsForcedLinkParsingEnabled = false,
        /* activate */
        IsSendingCookiesEnabled = true,
        MaxConcurrentThreads = 10,
        MaxPagesToCrawl = 1000,
        MaxPagesToCrawlPerDomain = 0,
        MaxPageSizeInBytes = 0,
        MaxMemoryUsageInMb = 0,
        MaxMemoryUsageCacheTimeInSeconds = 0,
        MaxRobotsDotTextCrawlDelayInSeconds = 5,
        MaxCrawlDepth = 0,
        MinAvailableMemoryRequiredInMb = 0,
        MinCrawlDelayPerDomainMilliSeconds = 1000,
        UserAgentString = "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko"
    };

    for (var startIndex = 1; _isCrawlingSuccess == true; startIndex += 50)
    {
        // A fresh requester and crawler per page, each wired to the page-completed handler.
        var pageRequester = new PageRequester(config);
        var crawler = new PoliteWebCrawler(config, null, null, null, pageRequester, null, null, null, null);
        crawler.PageCrawlCompletedAsync += ProcessPageCrawlCompletedAsync;

        //---------------------------
        // Start crawling
        //---------------------------
        crawler.Crawl(new Uri($"https://www.melon.com/dj/playlist/djplaylist_listsong.htm?startIndex={startIndex}&pageSize=50&plylstSeq={PlayListSeq}"));
    }

    return _successCount;
}
/// <summary>
/// Requests https://www.onet.pl, collects the text of every non-blank
/// <c>&lt;span class="title"&gt;</c> element from the parsed document and logs it together with
/// the page Uri, the numeric HTTP status and the document's <c>article</c> elements.
/// </summary>
/// <returns>A task that completes once the result has been logged.</returns>
private static async Task DemoSinglePageRequest()
{
    // Dispose the requester when done instead of leaving it for the finalizer.
    using (var pageRequester = new PageRequester(new CrawlConfiguration(), new WebContentExtractor()))
    {
        var crawledPage = await pageRequester.MakeRequestAsync(new Uri("https://www.onet.pl"));

        var articleTitles = crawledPage.AngleSharpHtmlDocument.All
            .Where(x => x.LocalName == "span" && x.ClassName == "title" && !string.IsNullOrWhiteSpace(x.TextContent))
            .Select(x => x.TextContent)
            .ToList();

        Log.Logger.Information("{result}", new
        {
            url = crawledPage.Uri,
            // The response message may be absent when the request itself failed.
            status = (int?)crawledPage.HttpResponseMessage?.StatusCode,
            rawResponse = crawledPage.AngleSharpHtmlDocument.QuerySelectorAll("article"),
            // Previously computed and then discarded; now included in the logged result.
            articleTitles
        });
    }
}
/// <summary>
/// Runs a single crawl of the Melon "new" index page with a crawler wired to
/// <c>ProcessPageCrawlCompletedAsync</c>.
/// </summary>
/// <returns>The value of <c>_successCount</c> after the crawl call returns.</returns>
public int Execute()
{
    // 1. Crawl the DJ playlist
    _isCrawlingSuccess = true;

    //---------------------------
    // Crawler configuration
    //---------------------------
    var requester = new PageRequester(_config);
    var politeCrawler = new PoliteWebCrawler(_config, null, null, null, requester, null, null, null, null);
    politeCrawler.PageCrawlCompletedAsync += ProcessPageCrawlCompletedAsync;

    //---------------------------
    // Start crawling
    //---------------------------
    var targetUri = new Uri("https://www.melon.com/new/index.htm#params%5BareaFlg%5D=I&po=pageObj&startIndex=1");
    politeCrawler.Crawl(targetUri);

    return _successCount;
}
/// <summary>
/// Crawls the DJ playlist song list in pages of 50, advancing the start index until
/// <c>_isCrawlingSuccess</c> is no longer true.
/// </summary>
/// <returns>The value of <c>_successCount</c> after the loop ends.</returns>
public int Execute()
{
    // 1. Crawl the DJ playlist
    _isCrawlingSuccess = true;

    var startIndex = 1;
    while (_isCrawlingSuccess == true)
    {
        //---------------------------
        // Crawler configuration
        //---------------------------
        var requester = new PageRequester(_config);
        var politeCrawler = new PoliteWebCrawler(_config, null, null, null, requester, null, null, null, null);
        politeCrawler.PageCrawlCompletedAsync += ProcessPageCrawlCompletedAsync;

        //---------------------------
        // Start crawling
        //---------------------------
        var pageUri = new Uri($"https://www.melon.com/dj/playlist/djplaylist_listsong.htm?startIndex={startIndex}&pageSize=50&plylstSeq={PlayListSeq}");
        politeCrawler.Crawl(pageUri);

        // Move on to the next page of 50 songs.
        startIndex += 50;
    }

    return _successCount;
}
/// <summary>
/// Crawls the Melon chart page for start indexes 1 and 51, stopping early if
/// <c>_isCrawlingSuccess</c> is no longer true.
/// </summary>
/// <returns>The value of <c>_successCount</c> after the loop ends.</returns>
public int Execute()
{
    // 1. Crawl the DJ playlist
    _isCrawlingSuccess = true;

    var startIndex = 1;
    while (_isCrawlingSuccess == true && startIndex <= 51)
    {
        //---------------------------
        // Crawler configuration
        //---------------------------
        var requester = new PageRequester(_config);
        var politeCrawler = new PoliteWebCrawler(_config, null, null, null, requester, null, null, null, null);
        politeCrawler.PageCrawlCompletedAsync += ProcessPageCrawlCompletedAsync;

        //---------------------------
        // Start crawling
        //---------------------------
        var chartUri = new Uri($"https://www.melon.com/chart/index.htm#params%5Bidx%5D={startIndex}");
        politeCrawler.Crawl(chartUri);

        // Move on to the next block of 50 chart entries.
        startIndex += 50;
    }

    return _successCount;
}
/// <summary>
/// Waits on <c>lastCalls</c> for <paramref name="uri"/>, then downloads the page with a
/// short-lived, default-configured <c>PageRequester</c>.
/// </summary>
/// <param name="uri">Address of the page to request.</param>
/// <returns>The crawled page.</returns>
/// <exception cref="InvalidOperationException">
/// The crawled page carries an <c>HttpRequestException</c>; it is attached as the inner exception.
/// </exception>
private async Task<CrawledPage> MakeOneRequestAsync(Uri uri)
{
    // Started before the wait below, so the logged duration includes that wait.
    var stopwatch = Stopwatch.StartNew();

    await lastCalls.WaitForCallAsync(uri).ConfigureAwait(false);

    using (var extractor = new WebContentExtractor())
    using (var requester = new PageRequester(new CrawlConfiguration(), extractor))
    {
        var crawledPage = await requester.MakeRequestAsync(uri).ConfigureAwait(false);

        var requestError = crawledPage.HttpRequestException;
        if (requestError == null)
        {
            logger.LogDebug("Request to {Url} took {Elapsed}", uri, stopwatch.Elapsed);
            return crawledPage;
        }

        throw new InvalidOperationException("HTTP error.", requestError);
    }
}
/// <summary>
/// Crawls the Melon detail page for <c>SongId</c> unless that song already exists in
/// <c>BaseWordCollectingSong</c>.
/// </summary>
/// <returns>
/// A fixed "already added" message when the song is found in the database; otherwise the value
/// of <c>_message</c> after the crawl call returns.
/// </returns>
public string Execute()
{
    bool alreadyCollected;
    using (var db = new SongRecommendContext())
    {
        alreadyCollected = db.BaseWordCollectingSong.Find(SongId) != null;
    }

    if (alreadyCollected)
    {
        return "이미 추가된 곡입니다";
    }

    //---------------------------
    // Crawler configuration
    //---------------------------
    var requester = new PageRequester(_config);
    var politeCrawler = new PoliteWebCrawler(_config, null, null, null, requester, null, null, null, null);
    politeCrawler.PageCrawlCompletedAsync += ProcessPageCrawlCompletedAsync;

    //---------------------------
    // Start crawling
    //---------------------------
    var detailUri = new Uri($"https://www.melon.com/song/detail.htm?songId={SongId}");
    politeCrawler.Crawl(detailUri);

    return _message;
}
/// <summary>
/// Rates every song whose status is "Tokenized" and that has no matching <c>ProposeSong</c> row,
/// marks it "Analyzed", and for songs rated above 70 fetches the like count, crawls the detail
/// page and queues a new <c>ProposeSong</c>. All changes are saved once at the end.
/// </summary>
/// <returns>The value of <c>_successCount</c> after all songs have been handled.</returns>
/// <remarks>
/// NOTE(review): <c>_genre</c> and <c>_releaseDate</c> are presumably filled in by
/// <c>ProcessDetailPageCrawlCompletedAsync</c> during the crawl - confirm that the handler has
/// run by the time they are read here.
/// </remarks>
public int Execute()
{
    using (var db = new SongRecommendContext())
    // One HttpClient for the whole run, disposed at the end. Creating a client per song and
    // never disposing it leaked a handler and its connections on every iteration.
    using (var client = new HttpClient())
    {
        // Query the targets: tokenized songs without a proposal row (left join, keep unmatched).
        var targetSong = from baseSong in db.BaseWordCollectingSong
                         join proposeSong in db.ProposeSong on baseSong.SongId equals proposeSong.SongId into proposeSongs
                         from defaultPropose in proposeSongs.DefaultIfEmpty()
                         where baseSong.Status == "Tokenized" && defaultPropose == null
                         select baseSong;

        // Compute the rate
        foreach (var song in targetSong)
        {
            try
            {
                var rateResult = AnalyzeRateSvc.Execute(song.Lyric);
                song.Rate = rateResult.Rate;
                song.Status = "Analyzed";

                if (song.Rate > 70)
                {
                    //---------------------------
                    // Fetch the like count
                    //---------------------------
                    var jsonString = client.GetStringAsync($"https://www.melon.com/commonlike/getSongLike.json?contsIds={song.SongId}").Result;
                    var like = 0;
                    try
                    {
                        like = JObject.Parse(jsonString).Value<IEnumerable<JToken>>("contsLike").First().Value<int>("SUMMCNT");
                    }
                    catch
                    {
                        // Best effort: an unparsable like payload leaves the count at 0.
                    }

                    //---------------------------
                    // Crawler configuration
                    //---------------------------
                    var pageRequester = new PageRequester(_config);
                    var crawler = new PoliteWebCrawler(_config, null, null, null, pageRequester, null, null, null, null);
                    crawler.PageCrawlCompletedAsync += ProcessDetailPageCrawlCompletedAsync;

                    //---------------------------
                    // Start crawling
                    //---------------------------
                    crawler.Crawl(new Uri($"https://www.melon.com/song/detail.htm?songId={song.SongId}"));

                    db.ProposeSong.Add(new ProposeSong
                    {
                        SongId = song.SongId,
                        PlayListSeq = song.PlayListSeq,
                        Title = song.Title,
                        Singer = song.Singer,
                        Lyric = song.Lyric,
                        Rate = song.Rate ?? 0,
                        Like = like,
                        Genre = _genre,
                        ReleaseDate = _releaseDate,
                        AddDate = DateTime.Now
                    });
                    _successCount++;
                }
            }
            catch
            {
                // Deliberate best effort: a failure on one song must not abort the batch.
                // NOTE(review): the error is discarded; consider logging it.
            }
        }

        db.SaveChanges();
        return _successCount;
    }
}
/// <summary>
/// Handles a crawled playlist page: for every table row it parses the song, fetches its lyrics,
/// rates them, and for ratings above 70 fetches the like count, crawls the song detail page and
/// saves a new <c>ProposeSong</c>.
/// </summary>
/// <param name="sender">Event source; not read by this handler.</param>
/// <param name="e">Event data carrying the crawled page.</param>
/// <remarks>
/// Sets <c>_isCrawlingSuccess</c> to false when the page contains no song rows and to true
/// otherwise.
/// NOTE(review): <c>_genre</c> and <c>_releaseDate</c> are presumably filled in by
/// <c>ProcessDetailPageCrawlCompletedAsync</c> during the nested crawl - confirm that the handler
/// has run by the time they are read here.
/// </remarks>
private void ProcessPageCrawlCompletedAsync(object sender, PageCrawlCompletedArgs e)
{
    var crawledPage = e.CrawledPage;
    var doc = crawledPage.HtmlDocument.DocumentNode;
    var songNodes = doc.SelectNodes("//table/tbody/tr");

    //---------------------------
    // Validate the crawl result
    //---------------------------
    if (songNodes == null || songNodes.Count == 0)
    {
        _isCrawlingSuccess = false;
        return;
    }

    _isCrawlingSuccess = true;

    // One HttpClient for all rows of this page, disposed when the handler finishes. Creating a
    // client per row and never disposing it leaked a handler and its connections on every row.
    using (var client = new HttpClient())
    {
        foreach (var node in songNodes)
        {
            try
            {
                using (var db = new SongRecommendContext())
                {
                    //---------------------------
                    // Parse the song information
                    //---------------------------
                    var songId = node.SelectSingleNode(".//input[@class='input_check'] | .//input[@class='input_check ']").GetAttributeValue("value", 0);
                    var title = node.SelectSingleNode(".//div[@class='ellipsis rank01']//a | .//div[@class='ellipsis rank01']//span[@class='fc_lgray']").InnerText;
                    var singer = node.SelectSingleNode(".//div[@class='ellipsis rank02']//span").InnerText;

                    // Skip rows without an id and songs that were already proposed.
                    if (songId == 0 || db.ProposeSong.Find(songId) != null)
                    {
                        continue;
                    }

                    //---------------------------
                    // Fetch the lyrics
                    //---------------------------
                    string jsonString = client.GetStringAsync($"https://www.melon.com/song/lyricInfo.json?songId={songId}").Result;
                    var lyric = JObject.Parse(jsonString).Value<string>("lyric");
                    if (lyric == null || lyric.Length == 0)
                    {
                        continue;
                    }

                    //---------------------------
                    // Analyze suitability
                    //---------------------------
                    var rate = AnalyzeRateSvc.Execute(lyric).Rate;

                    //---------------------------
                    // Save to the database
                    //---------------------------
                    if (rate > 70)
                    {
                        //---------------------------
                        // Fetch the like count
                        //---------------------------
                        jsonString = client.GetStringAsync($"https://www.melon.com/commonlike/getSongLike.json?contsIds={songId}").Result;
                        var like = 0;
                        try
                        {
                            like = JObject.Parse(jsonString).Value<IEnumerable<JToken>>("contsLike").First().Value<int>("SUMMCNT");
                        }
                        catch
                        {
                            // Best effort: an unparsable like payload leaves the count at 0.
                        }

                        //---------------------------
                        // Crawler configuration
                        //---------------------------
                        var pageRequester = new PageRequester(_config);
                        var crawler = new PoliteWebCrawler(_config, null, null, null, pageRequester, null, null, null, null);
                        crawler.PageCrawlCompletedAsync += ProcessDetailPageCrawlCompletedAsync;

                        //---------------------------
                        // Start crawling
                        //---------------------------
                        crawler.Crawl(new Uri($"https://www.melon.com/song/detail.htm?songId={songId}"));

                        db.ProposeSong.Add(new ProposeSong
                        {
                            SongId = songId,
                            PlayListSeq = PlayListSeq,
                            Title = title,
                            Singer = singer,
                            Lyric = lyric,
                            Rate = rate,
                            Like = like,
                            Genre = _genre,
                            ReleaseDate = _releaseDate,
                            AddDate = DateTime.Now
                        });
                        db.SaveChanges();
                        _successCount++;
                    }
                }
            }
            catch
            {
                // Deliberate best effort: a malformed row or failed request must not stop the
                // remaining rows. NOTE(review): the error is discarded; consider logging it.
            }
        }
    }
}
/// <summary>
/// Builds a fresh <c>PageRequester</c> under test from the shared crawl configuration.
/// </summary>
public void SetUp()
{
    var requester = new PageRequester(_crawlConfig);
    _unitUnderTest = requester;
}
/// <summary>
/// Fetches the lyrics and like count for <c>SongId</c>, rates the lyrics, crawls the song detail
/// page, stores the song as a <c>ProposeSong</c> when its rate exceeds 70 and it is not stored
/// yet, and returns the lyrics with every analyzed word wrapped in highlight markup.
/// </summary>
/// <returns>
/// The analysis result, or <c>null</c> when the lyrics response contains no lyric text.
/// </returns>
/// <remarks>
/// NOTE(review): <c>_title</c>, <c>_singer</c>, <c>_genre</c>, <c>_releaseDate</c>,
/// <c>_albumCover</c> and <c>_albumName</c> are presumably filled in by
/// <c>ProcessDetailPageCrawlCompletedAsync</c> during the crawl - confirm that the handler has
/// run by the time they are read here.
/// </remarks>
public AnalyzeSongResult Execute()
{
    // The client is disposed on every exit path, including the early return below; it was
    // previously created and never disposed.
    using (HttpClient client = new HttpClient())
    {
        //---------------------------
        // Fetch the lyrics
        //---------------------------
        string jsonString = client.GetStringAsync($"https://www.melon.com/song/lyricInfo.json?songId={SongId}").Result;
        var lyric = JObject.Parse(jsonString).Value<string>("lyric");
        if (lyric == null || lyric.Length == 0)
        {
            return null;
        }

        var analyzeResult = AnalyzeRateSvc.Execute(lyric);

        //---------------------------
        // Fetch the like count
        //---------------------------
        jsonString = client.GetStringAsync($"https://www.melon.com/commonlike/getSongLike.json?contsIds={SongId}").Result;
        var like = 0;
        try
        {
            like = JObject.Parse(jsonString).Value<IEnumerable<JToken>>("contsLike").First().Value<int>("SUMMCNT");
        }
        catch
        {
            // Best effort: an unparsable like payload leaves the count at 0.
        }

        //---------------------------
        // Crawler configuration
        //---------------------------
        var pageRequester = new PageRequester(_config);
        var crawler = new PoliteWebCrawler(_config, null, null, null, pageRequester, null, null, null, null);
        crawler.PageCrawlCompletedAsync += ProcessDetailPageCrawlCompletedAsync;

        //---------------------------
        // Start crawling
        //---------------------------
        crawler.Crawl(new Uri($"https://www.melon.com/song/detail.htm?songId={SongId}"));

        var song = new ProposeSong
        {
            SongId = SongId,
            Title = _title,
            Singer = _singer,
            Lyric = lyric,
            Rate = analyzeResult.Rate,
            Like = like,
            Genre = _genre,
            ReleaseDate = _releaseDate,
            AddDate = DateTime.Now
        };

        // Only sufficiently rated songs are stored, and only if not stored already.
        if (analyzeResult.Rate > 70)
        {
            using (var db = new SongRecommendContext())
            {
                if (db.ProposeSong.Find(SongId) == null)
                {
                    db.ProposeSong.Add(song);
                    db.SaveChanges();
                }
            }
        }

        // Wrap each analyzed word in chip/tooltip markup showing its rate as a whole percent.
        // NOTE(review): replacements run over already-inserted markup too, so a word that also
        // occurs inside the markup text would be replaced there as well.
        var resultLyric = lyric;
        foreach (var word in analyzeResult.Words)
        {
            resultLyric = resultLyric.Replace(word.Word, $@"<span class='v-chip theme--dark light-green darken-2'><span class='v-chip__content tooltip'>{word.Word}<span class='tooltiptext'>{(int)word.Rate}%</span></span></span>");
        }

        var result = new AnalyzeSongResult
        {
            SongId = SongId,
            Title = _title,
            Singer = _singer,
            Lyric = resultLyric,
            Rate = analyzeResult.Rate,
            AlbumCover = _albumCover,
            AlbumName = _albumName
        };

        return result;
    }
}