Example #1
        public void Crawl_MinCrawlDelayGreaterThanZero_CallsDomainRateLimiter()
        {
            Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
            Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

            CrawledPage homePage = new CrawledPage(_rootUri)
            {
                Content = new PageContent
                {
                    Text = "content here"
                }
            };
            CrawledPage page1 = new CrawledPage(uri1);
            CrawledPage page2 = new CrawledPage(uri2);

            List <Uri> links = new List <Uri> {
                uri1, uri2
            };

            _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(homePage);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri1, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(page1);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri2, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(page2);
            _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is <CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny <PageToCrawl>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = true
            });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny <CrawledPage>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = true
            });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldRecrawlPage(It.IsAny <CrawledPage>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = false
            });

            _dummyConfiguration.MinCrawlDelayPerDomainMilliSeconds = 1;//BY HAVING A CRAWL DELAY ABOVE ZERO WE EXPECT THE IDOMAINRATELIMITER TO BE CALLED
            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

            _unitUnderTest.Crawl(_rootUri);

            _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny <Uri>()), Times.Exactly(3));//BY HAVING A CRAWL DELAY ABOVE ZERO WE EXPECT THE IDOMAINRATELIMITER TO BE CALLED
        }
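A note on the test above: it verifies that PoliteWebCrawler routes every requested page (the root plus the two discovered links, hence Times.Exactly(3)) through IDomainRateLimiter.RateLimit whenever MinCrawlDelayPerDomainMilliSeconds is above zero. Outside a test the same behaviour is switched on purely through configuration; a minimal sketch, assuming the Abot 1.x API used throughout these examples and a hypothetical target URL:

        static void CrawlPolitely()
        {
            var config = new CrawlConfiguration
            {
                // Any value above zero routes requests through the domain rate limiter.
                MinCrawlDelayPerDomainMilliSeconds = 1000,
                MaxPagesToCrawl = 10
            };

            var crawler = new PoliteWebCrawler(config);
            CrawlResult result = crawler.Crawl(new Uri("http://example.com/")); // hypothetical target

            Console.WriteLine(result.ErrorOccurred
                ? "Crawl completed with error: " + result.ErrorException.Message
                : "Crawl completed without error.");
        }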
Example #2
        public async Task RunCrawl(string aUri, string aOutputFolder, CrawlConfiguration crawlConfig)
        {
            _outputfolder = aOutputFolder;
            CancellationTokenSource cancellationTokenSource = new CancellationTokenSource();

            PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig);

            crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            await Task.Run(() =>
            {
                CrawlResult result = crawler.Crawl(new Uri(aUri),
                                                   cancellationTokenSource);

                OnMessageReceived(
                    result.ErrorOccurred
                        ? $"Crawl of {result.RootUri.AbsoluteUri} completed with error: {result.ErrorException.Message}"
                        : $"Crawl of {result.RootUri.AbsoluteUri} completed without error.");
            }, cancellationTokenSource.Token);
        }
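RunCrawl above passes a CancellationTokenSource into the Crawl overload but never cancels it. The same overload can impose a hard time budget on a crawl; a small sketch, assuming the overload shown above and an arbitrary budget chosen by the caller:

        public CrawlResult CrawlWithTimeout(string url, CrawlConfiguration config, TimeSpan budget)
        {
            var cts = new CancellationTokenSource();
            cts.CancelAfter(budget); // request cancellation once the budget elapses

            var crawler = new PoliteWebCrawler(config);
            // Abot checks the token while crawling and should stop shortly after cancellation is requested.
            return crawler.Crawl(new Uri(url), cts);
        }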
        static void Main(string[] args)
        {
            // Create the crawler instance
            IWebCrawler crawler = new PoliteWebCrawler();

            // To create the crawler instance with options instead:
            // var crawlConfig = new CrawlConfiguration();
            // crawlConfig.CrawlTimeoutSeconds = 1000;
            // crawlConfig.MaxConcurrentThreads = 10;
            // crawlConfig.MaxPagesToCrawl = 10;
            // crawlConfig.UserAgentString = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0";
            // IWebCrawler crawler = new PoliteWebCrawler(crawlConfig);

            // Set up event handlers
            crawler.PageCrawlStartingAsync += (s, e) =>
            {
                Console.WriteLine("Starting : {0}", e.PageToCrawl);
            };

            crawler.PageCrawlCompletedAsync += (s, e) =>
            {
                CrawledPage pg = e.CrawledPage;

                string fn = pg.Uri.Segments[pg.Uri.Segments.Length - 1];
                File.WriteAllText(fn, pg.Content.Text);

                //var hdoc = pg.HtmlDocument; //HtmlAgilityPack HtmlDocument

                Console.WriteLine("Completed : {0}", pg.Uri.AbsoluteUri);
            };

            // Start the crawl
            string siteUrl = "http://www.naver.com";
            Uri    uri     = new Uri(siteUrl);

            crawler.Crawl(uri);
        }
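One caveat in the PageCrawlCompletedAsync handler above: the last URI segment is "/" for the site root and may contain characters that are illegal in file names, so File.WriteAllText can fail. A defensive sketch (System.IO assumed; the "index.html" fallback is an arbitrary choice):

        static string SafeFileName(Uri uri)
        {
            string name = uri.Segments[uri.Segments.Length - 1].Trim('/');
            if (string.IsNullOrEmpty(name))
            {
                name = "index.html"; // arbitrary fallback for the site root
            }

            // Strip characters that are not legal in file names on this platform.
            foreach (char c in Path.GetInvalidFileNameChars())
            {
                name = name.Replace(c, '_');
            }

            return name;
        }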
Example #4
        public void Crawl_MinCrawlDelayDelayZero_DomainRateLimiterNotCalled()
        {
            Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
            Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

            CrawledPage homePage = new CrawledPage(_rootUri)
            {
                Content = new PageContent
                {
                    Text = "content here"
                }
            };
            CrawledPage page1 = new CrawledPage(uri1);
            CrawledPage page2 = new CrawledPage(uri2);

            List <Uri> links = new List <Uri> {
                uri1, uri2
            };

            _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(homePage);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri1, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(page1);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri2, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(page2);
            _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is <CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny <PageToCrawl>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = true
            });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny <CrawledPage>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = true
            });

            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

            _unitUnderTest.Crawl(_rootUri);

            _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny <Uri>()), Times.Never());
        }
        public string Execute()
        {
            using (var db = new SongRecommendContext()) {
                if (db.BaseWordCollectingSong.Find(SongId) != null)
                {
                    return("이미 추가된 곡입니다");
                }
            }

            //---------------------------
            // Crawler setup
            //---------------------------
            var pageRequester = new PageRequester(_config);
            var crawler       = new PoliteWebCrawler(_config, null, null, null, pageRequester, null, null, null, null);

            crawler.PageCrawlCompletedAsync += ProcessPageCrawlCompletedAsync;

            //---------------------------
            // Start crawling
            //---------------------------
            crawler.Crawl(new Uri($"https://www.melon.com/song/detail.htm?songId={SongId}"));

            return(_message);
        }
Example #6
        private void button_crawl_Click(object sender, EventArgs e)
        {
            CrawlConfiguration crawlConfig = new CrawlConfiguration();

            crawlConfig.CrawlTimeoutSeconds  = 100;
            crawlConfig.MaxConcurrentThreads = 10;
            crawlConfig.MaxPagesToCrawl      = 1000;
            crawlConfig.UserAgentString      = "abot v1.0 http://code.google.com/p/abot";

            PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);

            crawler.PageCrawlCompleted += crawler_ProcessPageCrawlCompleted;

            CrawlResult result = crawler.Crawl(new Uri("https://belaruspartisan.by/")); //This is synchronous, it will not go to the next line until the crawl has completed

            if (result.ErrorOccurred)
            {
                MessageBox.Show("Crawl of " + result.RootUri.AbsoluteUri + " completed with error: " + result.ErrorException.Message);
            }
            else
            {
                MessageBox.Show("Crawl of " + result.RootUri.AbsoluteUri + " completed without error.");
            }
        }
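As the inline comment notes, Crawl is synchronous, so the click handler above blocks the WinForms UI thread until the crawl finishes. A hedged variant of the same handler that keeps the UI responsive by running the crawl on a worker thread (same configuration and event wiring, just awaited via Task.Run):

        private async void button_crawl_Click(object sender, EventArgs e)
        {
            CrawlConfiguration crawlConfig = new CrawlConfiguration
            {
                CrawlTimeoutSeconds  = 100,
                MaxConcurrentThreads = 10,
                MaxPagesToCrawl      = 1000
            };

            PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);
            crawler.PageCrawlCompleted += crawler_ProcessPageCrawlCompleted;

            // Run the blocking Crawl call on the thread pool so the UI stays responsive.
            CrawlResult result = await Task.Run(() => crawler.Crawl(new Uri("https://belaruspartisan.by/")));

            MessageBox.Show(result.ErrorOccurred
                ? "Crawl of " + result.RootUri.AbsoluteUri + " completed with error: " + result.ErrorException.Message
                : "Crawl of " + result.RootUri.AbsoluteUri + " completed without error.");
        }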
        public void Crawl_MinCrawlDelayDelayZero_StillCallsDomainRateLimiter()
        {
            CrawledPage homePage = new CrawledPage(_rootUri)
            {
                Content = new PageContent
                {
                    Text = "content here"
                }
            };

            _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(homePage);
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny <PageToCrawl>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = true
            });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny <CrawledPage>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = true
            });

            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

            _unitUnderTest.Crawl(_rootUri);

            _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny <Uri>()), Times.Exactly(1));
        }
        public int DoCrawl()
        {
            CrawlConfiguration CConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();

            CConfig.MaxConcurrentThreads        = maxConcurrentThreads;
            CConfig.MaxPagesToCrawl             = maxPagesToCrawl;
            CConfig.CrawlTimeoutSeconds         = crawlTimeoutSeconds;
            CConfig.HttpRequestTimeoutInSeconds = httpRequestTimeoutInSeconds;
            CConfig.LoginUser     = loginUser;
            CConfig.LoginPassword = loginPassword;

            Console.WriteLine("Doing Crawl With Slack " + (slackBotEnabled ? "Enabled" : "Disabled"));

            PoliteWebCrawler crawler = new PoliteWebCrawler(CConfig, null, null, null, null, null, null, null, null);

            //PoliteWebCrawler crawler = new PoliteWebCrawler();

            errors = new List <Errors>();


            crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            CrawlResult result = crawler.Crawl(new Uri(URL)); //This is synchronous, it will not go to the next line until the crawl has completed

            if (result.ErrorOccurred)
            {
                Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
            }
            else
            {
                Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
            }

            IEnumerable <Errors> EnumList = errors.AsEnumerable();

            for (int i = 0; i < 525; i++)
            {
                if (EnumList.Where(x => x.ErrorCode == i).Count() != 0)
                {
                    returnInt = 1;
                    Console.ForegroundColor = ConsoleColor.Yellow;
                    Console.WriteLine(i + " (" + getErrorName(i) + ") Errors:");
                    slackMessage           += i + " (" + getErrorName(i) + ") Errors:\n";
                    Console.ForegroundColor = ConsoleColor.Red;
                    foreach (Errors err in EnumList.Where(x => x.ErrorCode == i))
                    {
                        Console.WriteLine("   " + err.ErrorURL);
                        slackMessage += "   " + err.ErrorURL + "\n";
                    }
                }
            }

            Console.ResetColor();

            if (slackMessage == "")
            {
                slackMessage = "No Errors In WebPage!";
            }

            Console.ForegroundColor = ConsoleColor.Green;
            Console.WriteLine("Done");
            Console.ResetColor();
            return(returnInt);
        }
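The report loop at the end of DoCrawl scans status codes 0 through 524 and filters the error list once per code. The same report can be produced in a single pass; a sketch using LINQ GroupBy over the same Errors records and the existing getErrorName helper (System.Linq is already in scope for AsEnumerable/Where; Slack message building omitted for brevity):

        private void ReportErrors(List<Errors> errors)
        {
            foreach (var group in errors.GroupBy(err => err.ErrorCode).OrderBy(g => g.Key))
            {
                Console.ForegroundColor = ConsoleColor.Yellow;
                Console.WriteLine(group.Key + " (" + getErrorName(group.Key) + ") Errors:");
                Console.ForegroundColor = ConsoleColor.Red;
                foreach (Errors err in group)
                {
                    Console.WriteLine("   " + err.ErrorURL);
                }
            }
            Console.ResetColor();
        }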
Example #9
        private void ProcessPageCrawlCompletedAsync(object sender, PageCrawlCompletedArgs e)
        {
            var crawledPage = e.CrawledPage;
            var doc         = crawledPage.HtmlDocument.DocumentNode;
            var songNodes   = doc.SelectNodes("//table/tbody/tr");

            //---------------------------
            // Validate the crawl result
            //---------------------------
            if (songNodes == null || songNodes.Count == 0)
            {
                _isCrawlingSuccess = false;
                return;
            }

            _isCrawlingSuccess = true;
            foreach (var node in songNodes)
            {
                try {
                    using (var db = new SongRecommendContext()) {
                        //---------------------------
                        // Parse song information
                        //---------------------------
                        var songId = node.SelectSingleNode(".//input[@class='input_check'] | .//input[@class='input_check ']").GetAttributeValue("value", 0);
                        var title  = node.SelectSingleNode(".//div[@class='ellipsis rank01']//a | .//div[@class='ellipsis rank01']//span[@class='fc_lgray']").InnerText;
                        var singer = node.SelectSingleNode(".//div[@class='ellipsis rank02']//span").InnerText;
                        if (songId == 0 || db.ProposeSong.Find(songId) != null)
                        {
                            continue;
                        }

                        //---------------------------
                        // Fetch lyrics
                        //---------------------------
                        HttpClient client     = new HttpClient();
                        string     jsonString = client.GetStringAsync($"https://www.melon.com/song/lyricInfo.json?songId={songId}").Result;
                        var        lyric      = JObject.Parse(jsonString).Value <string>("lyric");
                        if (lyric == null || lyric.Length == 0)
                        {
                            continue;
                        }

                        //---------------------------
                        // Analyze suitability (rate)
                        //---------------------------
                        var rate = AnalyzeRateSvc.Execute(lyric).Rate;

                        //---------------------------
                        // Save to DB
                        //---------------------------
                        if (rate > 70)
                        {
                            //---------------------------
                            // Fetch like count
                            //---------------------------
                            jsonString = client.GetStringAsync($"https://www.melon.com/commonlike/getSongLike.json?contsIds={songId}").Result;
                            var like = 0;
                            try {
                                like = JObject.Parse(jsonString).Value <IEnumerable <JToken> >("contsLike").First().Value <int>("SUMMCNT");
                            }
                            catch { }

                            //---------------------------
                            // Crawler setup
                            //---------------------------
                            var pageRequester = new PageRequester(_config);
                            var crawler       = new PoliteWebCrawler(_config, null, null, null, pageRequester, null, null, null, null);
                            crawler.PageCrawlCompletedAsync += ProcessDetailPageCrawlCompletedAsync;

                            //---------------------------
                            // Start crawling
                            //---------------------------
                            crawler.Crawl(new Uri($"https://www.melon.com/song/detail.htm?songId={songId}"));

                            db.ProposeSong.Add(new ProposeSong {
                                SongId      = songId,
                                PlayListSeq = PlayListSeq,
                                Title       = title,
                                Singer      = singer,
                                Lyric       = lyric,
                                Rate        = rate,
                                Like        = like,
                                Genre       = _genre,
                                ReleaseDate = _releaseDate,
                                AddDate     = DateTime.Now
                            });
                            db.SaveChanges();
                            _successCount++;
                        }
                    }
                }
                catch {
                }
            }
        }
Example #10
        static void Main(string[] args)
        {
            CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();

            crawlConfig.MaxConcurrentThreads = 5;//this overrides the config value
            crawlConfig.MaxCrawlDepth        = 0;
            crawler = new PoliteWebCrawler(crawlConfig); // use the configuration loaded and adjusted above
            crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            //var doc = new HtmlDocument();
            //doc.Load(@"C:\Users\lucao\Downloads\keketest.html");
            //var embedNodes = doc.DocumentNode.SelectSingleNode("//script[contains(text(), 'thunder_url')]");
            //var domain = Regex.Match(embedNodes.InnerText, @".*domain.*'(.*)'").Groups[1].ToString();
            //var thunder_url = Regex.Match(embedNodes.InnerText, ".*thunder_url.*\"(.*)\"").Groups[1].ToString();
            //var downloadMp3Link = domain + thunder_url;


            CrawlResult result;

            for (int i = 58; i > 30; i--)
            {
                DownloadLinkList.Clear();
                Thread.Sleep(60000);
                result = crawler.Crawl(new Uri($"http://www.kekenet.com/Article/15410/List_{i}.shtml"));
                if (result.ErrorOccurred)
                {
                    Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
                }
                else
                {
                    Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
                }

                if (DownloadLinkList.Count > 0)
                {
                    DownloadMP3LinkList.Clear();
                    foreach (var link in DownloadLinkList)
                    {
                        var sub_crawler = new PoliteWebCrawler();
                        sub_crawler.PageCrawlStartingAsync        += sub_crawler_ProcessPageCrawlStarting;
                        sub_crawler.PageCrawlCompletedAsync       += sub_crawler_ProcessPageCrawlCompleted;
                        sub_crawler.PageCrawlDisallowedAsync      += sub_crawler_PageCrawlDisallowed;
                        sub_crawler.PageLinksCrawlDisallowedAsync += sub_crawler_PageLinksCrawlDisallowed;
                        sub_crawler.Crawl(new Uri(link));
                        Thread.Sleep(20000);
                        sub_crawler?.Dispose();
                    }
                }
                //"http://k6.kekenet.com/Sound/2018/01/scad180110.mp3"
                if (DownloadMP3LinkList.Count > 0)
                {
                    foreach (var mp3Link in DownloadMP3LinkList)
                    {
                        WebClient client = new WebClient();
                        Uri       ur     = new Uri(mp3Link);
                        client.DownloadProgressChanged += WebClientDownloadProgressChanged;
                        client.DownloadDataCompleted   += WebClientDownloadCompleted;
                        var file = @"C:\Users\lucao\Downloads\keke\" + mp3Link.Split('/').Last().ToString();
                        client.DownloadFile(ur, file);
                        Thread.Sleep(60000);
                    }
                }
            }
        }
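A note on the download loop above: WebClient raises DownloadProgressChanged and DownloadDataCompleted only for its asynchronous download methods, so with the synchronous DownloadFile call those handlers never fire. A hedged sketch of a progress-reporting download using DownloadFileTaskAsync, reusing the handler and folder from the example:

        static async Task DownloadWithProgress(string mp3Link)
        {
            using (WebClient client = new WebClient())
            {
                // This event is only raised by the *Async download methods.
                client.DownloadProgressChanged += WebClientDownloadProgressChanged;

                var file = @"C:\Users\lucao\Downloads\keke\" + mp3Link.Split('/').Last();
                await client.DownloadFileTaskAsync(new Uri(mp3Link), file);
            }
        }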
        public async Task Start(Uri targetUri)
        {
            if (!(await IsRemoteServerAlive(targetUri)))
            {
                throw new WebException("No response from external server");
            }

            resultDetails  = new ConcurrentBag <TestResultDetail>();
            processedPages = new ConcurrentDictionary <string, byte>();
            result         = new TestResult()
            {
                Authority = targetUri.AbsoluteUri,
                TestDate  = DateTime.Now,
                Status    = 1
            };
            RepositoryInsertRequested?.Invoke(this, new TestResultArgs(result));

            CrawlConfiguration configuration = new CrawlConfiguration()
            {
                MaxPagesToCrawl = MaxPagesToCrawl,
                MaxCrawlDepth   = MaxCrawlDepth,
                IsExternalPageCrawlingEnabled      = IsExternalPageCrawlingEnabled,
                IsExternalPageLinksCrawlingEnabled = IsExternalPageLinksCrawlingEnabled,
                NumberOfRecurrentRequests          = NumberOfRecurrentRequests,
                MaxConcurrentThreads = MaxConcurrentThreads
            };

            PoliteWebCrawler crawler = new PoliteWebCrawler(configuration, crawlDecisionMaker: null, memoryManager: null, scheduler: null,
                                                            hyperLinkParser: null, domainRateLimiter: null, robotsDotTextFinder: null,
                                                            threadManager: null, pageRequester: new PageRequesterWithRepeats(configuration));

            crawler.PageRequestSent      += Crawler_PageRequestSent;
            crawler.PageResponseReceived += Crawler_PageResponseReceived;
            crawler.PageCrawlCompleted   += Crawler_ProcessPageCrawlCompleted;

            crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
            {
                CrawlDecision decision = new CrawlDecision {
                    Allow = true
                };
                MatchCollection mc = Regex.Matches((pageToCrawl.Uri.AbsoluteUri), @"http[s]?:\/\/");
                if (mc.Count > 1)
                {
                    return new CrawlDecision {
                        Allow = false, Reason = "Dont want to crawl external pages"
                    };
                }

                return(decision);
            });

            TestStarted?.Invoke(this, new TestResultArgs(result));
            crawler.Crawl(targetUri);

            result.TestResultDetails = resultDetails.ToList();
            result.MinResponseTime   = rootMinResponseTime;
            result.MaxResponseTime   = rootMaxResponseTime;
            result.MeanResponseTime  = rootMeanResponseTime / numberOfPagesCrawled;
            result.Status            = 0;

            TestFinished?.Invoke(this, new TestResultArgs(result));
            RepositoryInsertDetailsRequested?.Invoke(this, new TestResultArgs(result));
        }
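The ShouldCrawlPage delegate above rejects any page whose absolute URI contains "http(s)://" more than once, i.e. URLs that embed another absolute URL. A related sketch, not equivalent but often what is wanted, that simply restricts the crawl to the root host (same delegate signature and CrawlDecision type as above):

            crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
            {
                bool sameHost = string.Equals(pageToCrawl.Uri.Host,
                                              crawlContext.RootUri.Host,
                                              StringComparison.OrdinalIgnoreCase);

                return sameHost
                    ? new CrawlDecision { Allow = true }
                    : new CrawlDecision { Allow = false, Reason = "Dont want to crawl external pages" };
            });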
Example #12
        public int Execute()
        {
            using (var db = new SongRecommendContext()) {
                // Look up target songs
                var targetSong = from baseSong in db.BaseWordCollectingSong
                                 join proposeSong in db.ProposeSong
                                 on baseSong.SongId equals proposeSong.SongId into proposeSongs
                                 from defaultPropose in proposeSongs.DefaultIfEmpty()
                                 where baseSong.Status == "Tokenized" && defaultPropose == null
                                 select baseSong;

                // Compute Rate
                foreach (var song in targetSong)
                {
                    try {
                        var rateResult = AnalyzeRateSvc.Execute(song.Lyric);
                        song.Rate   = rateResult.Rate;
                        song.Status = "Analyzed";

                        if (song.Rate > 70)
                        {
                            //---------------------------
                            // Fetch like count
                            //---------------------------
                            HttpClient client     = new HttpClient();
                            var        jsonString = client.GetStringAsync($"https://www.melon.com/commonlike/getSongLike.json?contsIds={song.SongId}").Result;
                            var        like       = 0;
                            try {
                                like = JObject.Parse(jsonString).Value <IEnumerable <JToken> >("contsLike").First().Value <int>("SUMMCNT");
                            }
                            catch { }

                            //---------------------------
                            // Crawler setup
                            //---------------------------
                            var pageRequester = new PageRequester(_config);
                            var crawler       = new PoliteWebCrawler(_config, null, null, null, pageRequester, null, null, null, null);
                            crawler.PageCrawlCompletedAsync += ProcessDetailPageCrawlCompletedAsync;

                            //---------------------------
                            // Start crawling
                            //---------------------------
                            crawler.Crawl(new Uri($"https://www.melon.com/song/detail.htm?songId={song.SongId}"));

                            db.ProposeSong.Add(new ProposeSong {
                                SongId      = song.SongId,
                                PlayListSeq = song.PlayListSeq,
                                Title       = song.Title,
                                Singer      = song.Singer,
                                Lyric       = song.Lyric,
                                Rate        = song.Rate ?? 0,
                                Like        = like,
                                Genre       = _genre,
                                ReleaseDate = _releaseDate,
                                AddDate     = DateTime.Now
                            });
                            _successCount++;
                        }
                    }
                    catch { }
                }

                db.SaveChanges();

                return(_successCount);
            }
        }
Example #13
        public void StartWebCrawler()
        {
            if (File.Exists("FailLog.txt"))
            {
                File.Delete("FailLog.txt");
            }

            FileStream   FailLog = new FileStream("FailLog.txt", FileMode.Append, FileAccess.Write, FileShare.ReadWrite);
            StreamWriter sw      = new StreamWriter(FailLog, Encoding.Default);

            Stopwatch stopwatch = new Stopwatch();

            stopwatch.Start();

            for (int i = 1; i <= Chalaoshi.MaximumTeacherPage; i++)
            {
                LeftLinkList.Add("https://chalaoshi.cn/teacher/" + i.ToString() + "/");
            }

            LeftLinkList.Remove("https://chalaoshi.cn/teacher/2485/");
            LeftLinkList.Remove("https://chalaoshi.cn/teacher/3433/");

            for (int Loopi = 0; Loopi < 5; Loopi++) // retry up to 5 passes; anything still not crawled after that is a server-side problem
            {
                if (LeftLinkList.Count == 0)
                {
                    break;
                }
                PageLinkList.Clear();
                LeftLinkList.ForEach(i => PageLinkList.Add(i));
                //Task task = new Task(() =>
                //{
                Parallel.For(0, PageLinkList.Count, (i) =>
                {
                    var crawler = new PoliteWebCrawler();
                    var url     = PageLinkList[i];
                    Console.WriteLine("Start:" + url);
                    crawler.PageCrawlStartingAsync   += crawler_ProcessPageCrawlStarting;
                    crawler.PageCrawlCompletedAsync  += crawler_ProcessPageCrawlCompleted;
                    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
                    //crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;
                    try
                    {
                        crawler.Crawl(new Uri(url));
                    }
                    catch (Exception ex)
                    {
                        sw.WriteLine(url + ex.Message);
                    }
                    Console.WriteLine("Finish:" + url);
                    Thread.Sleep(20);    // give the server a short break
                });
                //});
                //task.Start();
                //task.Wait();
                //Thread.Sleep(200); // give the server a short break
            }

            if (LeftLinkList.Count > 0)
            {
                for (int i = 0; i < LeftLinkList.Count; i++)
                {
                    sw.WriteLine("#{0}#未写入", LeftLinkList[i]);
                }
                sw.Close();
                FailLog.Close();
            }

            var          fileName     = $"CLStext_{DateTime.Now.Year.ToString()}_{DateTime.Now.Month.ToString()}_{DateTime.Now.Day}_{DateTime.Now.Hour}_{DateTime.Now.Minute}_{DateTime.Now.Second}.csv";
            FileStream   file         = new FileStream(fileName, FileMode.Append, FileAccess.Write); // defined here to get the best performance out of the read/write lock
            StreamWriter streamWriter = new StreamWriter(file, Encoding.Default);                    // create the writer stream

            for (int i = 0; i < TotalInfo.Count; i++)
            {
                streamWriter.WriteLine(TotalInfo[i]);
            }
            streamWriter.Close();
            file.Close();

            stopwatch.Stop();
            try
            {
                Console.WriteLine("Finished completely.\nTime Consumption:{0}\nPage Number:{1}\nTime Per Page:{2} milliseconds.\nPress any key to continue.", stopwatch.Elapsed, TotalInfo.Count, stopwatch.ElapsedMilliseconds / TotalInfo.Count);
            }
            catch
            {
                Console.WriteLine("当前已爬网页数量为0");
            }
            Console.WriteLine(FailedPageCount.ToString());
            Console.ReadLine();
        }
Example #14
        static void Main(string[] args)
        {
            try
            {
                // Create the crawler instance
                // IWebCrawler crawler = new PoliteWebCrawler();

                // Create the crawler instance with explicit options
                var crawlConfig = new CrawlConfiguration();
                crawlConfig.CrawlTimeoutSeconds = 5000;
//                crawlConfig.MaxConcurrentThreads = 10;

                crawlConfig.MaxConcurrentThreads = 1;
//                crawlConfig.MaxPagesToCrawl = 10;
                crawlConfig.MaxPagesToCrawl = 50;
                crawlConfig.UserAgentString = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0";
                IWebCrawler crawler = new PoliteWebCrawler(crawlConfig);

                // Set up event handlers
                crawler.PageCrawlStartingAsync += (s, e) =>
                {
                    Console.WriteLine($"Starting : {e.PageToCrawl}");
                };

                crawler.PageCrawlCompletedAsync += (s, e) =>
                {
                    CrawledPage pg = e.CrawledPage;
                    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();

                    string fn   = pg.Uri.Segments[pg.Uri.Segments.Length - 1];
                    string path = @"C:\Users\yjs3694\source\repos\AbotCrawler\AbotCrawler\bin\Debug\crawl.txt";
//                    File.WriteAllText(fn, pg.Content.Text);
//                    File.WriteAllText(directory, pg.Content.Text);

                    doc.LoadHtml(pg.Content.Text);


                    HtmlAgilityPack.HtmlNode singleNode = doc.GetElementbyId("mArticle"); // the element whose id attribute is "mArticle"

                    // a-tag descendants of singleNode; the leading (.) makes the search start from the current node
                    // HtmlNodeCollection anchors = singleNode.SelectNodes(".//a");

                    // returns the value of singleNode's class attribute
                    //  string className = singleNode.GetAttributeValue("class", "");

//                    HtmlAgilityPack.HtmlNodeCollection article = doc.DocumentNode.SelectNodes("//div[@class='articles']");
//                    HtmlAgilityPack.HtmlNodeCollection article = doc.DocumentNode.SelectNodes("div[@class='hotissue_builtin']");


                    if (singleNode != null)
                    {
//                        File.WriteAllText(directory, singleNode.SelectSingleNode(".//article/div[1]/div[0]/div[2]").InnerText);

                        //*[@id="mArticle"]/div[2]/div[1]/div[2]/div[1]/ol
                        //*[@id="mArticle"]/div[2]/div[1]/div[3]/div[1]/ol

//                        var content = singleNode.SelectSingleNode("//div[2]/div[1]/div[2]/div[1]/ol")?.InnerText;
                        var content = singleNode.SelectSingleNode("//div[2]/div[1]/div[3]/div[1]/ol")?.InnerText;

                        if (content != null)
                        {
                            var bbb = content.Replace("\n\n\n", "");

//                            File.WriteAllText(path, bbb);
//                            File.AppendAllText(path, "\n\n\n");
                            File.AppendAllText(path, bbb);
                            File.AppendAllText(path, "\n\n\n");
                        }
                    }

                    //var hdoc = pg.HtmlDocument; //HtmlAgilityPack HtmlDocument

                    Console.WriteLine("Completed : {0}", pg.Uri.AbsoluteUri);
                };

                // Start the crawl
                string siteUrl = "http://www.daum.net";

                Uri uri = new Uri(siteUrl);

                for (int i = 0; i < 5; i++)
                {
                    crawler.Crawl(uri);
                    System.Threading.Thread.Sleep(300);
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex);
            }
        }
        static int FailedPageCount  = 0; // used for debugging

        public void StartWebCrawler()
        {
            XmlConfigurator.Configure(); // this line controls whether log4net logging is used

            /*
             * Single-step approach (one crawl at a time); abandoned
             *
             * PoliteWebCrawler[] crawler = new PoliteWebCrawler[Chalaoshi.MaximumTeacherPage];
             * for (int i = 0; i < Chalaoshi.MaximumTeacherPage; i++)
             * {
             *  crawler[i] = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);
             *  crawler[i].PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
             *  crawler[i].PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
             *  crawler[i].PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
             *  crawler[i].PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;
             *
             *  CrawlResult result = crawler[i].Crawl(new Uri($"https://chalaoshi.cn/teacher/{i + 1}/")); //This is synchronous, it will not go to the next line until the crawl has completed
             *
             *  if (result.ErrorOccurred)
             *      Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
             *  else
             *      Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
             * }
             */

            /*
             * Task-based approach did not work; reason unknown
             * Task[] TotalTasks = new Task[Chalaoshi.MaximumTeacherPage];
             * Console.WriteLine("Start Crawling");
             * for (var i = 0; i < Chalaoshi.MaximumTeacherPage; i++)
             * {
             *  string url = "https://chalaoshi.cn/teacher/" + (i + 1).ToString() + "/";
             *  TotalTasks[i] = new Task(() =>
             *  {
             *      var crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);
             *      Console.WriteLine("Start:" + url);
             *      crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
             *      crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
             *      crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
             *      crawler.Crawl(new Uri(url));
             *      Console.WriteLine("Finish:" + url);
             *  });
             *  TotalTasks[i].Start();
             * }
             * Task.WaitAll(TotalTasks);
             */

            Stopwatch stopwatch = new Stopwatch();

            stopwatch.Start();

            Task task = new Task(() =>
            {
                Parallel.For(1, Chalaoshi.MaximumTeacherPage + 1, (i) =>
                {
                    var crawler = new PoliteWebCrawler();
                    string url  = "https://chalaoshi.cn/teacher/" + i.ToString() + "/";
                    Console.WriteLine("Start:" + url);
                    crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
                    crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
                    crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
                    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;
                    crawler.Crawl(new Uri(url));
                    Console.WriteLine("Finish:" + url);
                    Thread.Sleep(50); // give the server a breather
                });
            });

            task.Start();
            task.Wait();
            stopwatch.Stop();
            Console.WriteLine("Finished Completely.\nTime Consume:{0}\nPage Number:{1}\nTime Per Page:{2} milliseconds.\nPress any key to continue.", stopwatch.Elapsed, CrawledPageCount, stopwatch.ElapsedMilliseconds / CrawledPageCount);
            Console.WriteLine(FailedPageCount.ToString());
            Console.ReadLine();
        }
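One more caveat: the summary line above divides stopwatch.ElapsedMilliseconds by CrawledPageCount and throws a DivideByZeroException when nothing was crawled; the StartWebCrawler in Example #13 wraps a similar division in a try/catch. A tiny guard sketch using the same fields:

            long timePerPage = CrawledPageCount > 0
                ? stopwatch.ElapsedMilliseconds / CrawledPageCount
                : 0;
            Console.WriteLine("Pages: {0}, Time per page: {1} ms", CrawledPageCount, timePerPage);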
Example #16
        private static void ParseSucceed(Options options)
        {
            CrawlConfiguration crawlConfig = new CrawlConfiguration()
            {
                CrawlTimeoutSeconds      = options.Timeout,
                MaxConcurrentThreads     = options.ConcurrentPages,
                MaxPagesToCrawl          = 0,
                MaxPagesToCrawlPerDomain = 0,
                UserAgentString          = options.UserAgentString ?? new Uri(options.Url).Host + " Crawler",
            };

            Dictionary <int, string> fileList = new Dictionary <int, string>(Math.Abs(options.Start - options.End));
            string fileToWrite = options.Output ?? new Uri(options.Url).Host + ".txt";

            if (File.Exists(fileToWrite))
            {
                string[] preliminaryFileList = File.ReadAllLines(fileToWrite);
                for (int i = 0; i < preliminaryFileList.Length; i++)
                {
                    Match entry = Regex.Match(preliminaryFileList[i], @"^(\d+) = (.*)$");
                    if (entry.Groups[0].Success)
                    {
                        fileList.Add(int.Parse(entry.Groups[1].Value), entry.Groups[2].Value);
                    }
                }
            }

            for (int i = options.Start; i != options.End; i += options.End.CompareTo(i))
            {
                if (!fileList.ContainsKey(i) || (Uri.TryCreate(fileList[i], UriKind.Absolute, out Uri entry) ? options.RetryValid : options.RetryInvalid))
                {
                    Uri         uri = new Uri(options.Url + i);
                    CrawlResult rawResult;
                    using (PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, new NobotDotTxtIgnorer()))
                        rawResult = crawler.Crawl(uri);

                    string processedResult = "invalid file";
                    switch (options.ValidationMode)
                    {
                    case (ValidationModes.Redirect):
                        if (!rawResult.ErrorOccurred && rawResult.CrawlContext.RootUri.AbsoluteUri != uri.AbsoluteUri)
                        {
                            processedResult = rawResult.CrawlContext.RootUri.AbsoluteUri;
                        }
                        break;

                    case (ValidationModes.Valid):
                        if (!rawResult.ErrorOccurred)
                        {
                            processedResult = rawResult.CrawlContext.RootUri.AbsoluteUri;
                        }
                        break;
                    }

                    //HACK
                    if (!fileList.ContainsKey(i) || fileList[i] != processedResult)
                    {
                        if (!fileList.ContainsKey(i))
                        {
                            fileList.Add(i, processedResult);
                        }
                        else if (fileList[i] != processedResult)
                        {
                            fileList[i] = processedResult;
                        }
                        File.WriteAllLines(fileToWrite, fileList.Select(x => x.Key + " = " + x.Value).ToArray()); //HACK
                    }
                    Console.WriteLine(i + " = " + fileList[i]);
                }
            }
        }
Example #17
 public void Run()
 {
     _crawler.Crawl(new Uri("https://www.baidu.com/s?wd=乙肝 症状"));
 }
Example #18
        public AnalyzeSongResult Execute()
        {
            //---------------------------
            // Fetch lyrics
            //---------------------------
            HttpClient client     = new HttpClient();
            string     jsonString = client.GetStringAsync($"https://www.melon.com/song/lyricInfo.json?songId={SongId}").Result;
            var        lyric      = JObject.Parse(jsonString).Value <string>("lyric");

            if (lyric == null || lyric.Length == 0)
            {
                return(null);
            }

            var analyzeResult = AnalyzeRateSvc.Execute(lyric);

            //---------------------------
            // Fetch like count
            //---------------------------
            jsonString = client.GetStringAsync($"https://www.melon.com/commonlike/getSongLike.json?contsIds={SongId}").Result;
            var like = 0;

            try {
                like = JObject.Parse(jsonString).Value <IEnumerable <JToken> >("contsLike").First().Value <int>("SUMMCNT");
            }
            catch { }

            //---------------------------
            // Crawler setup
            //---------------------------
            var pageRequester = new PageRequester(_config);
            var crawler       = new PoliteWebCrawler(_config, null, null, null, pageRequester, null, null, null, null);

            crawler.PageCrawlCompletedAsync += ProcessDetailPageCrawlCompletedAsync;

            //---------------------------
            // Start crawling
            //---------------------------
            crawler.Crawl(new Uri($"https://www.melon.com/song/detail.htm?songId={SongId}"));

            var song = new ProposeSong {
                SongId      = SongId,
                Title       = _title,
                Singer      = _singer,
                Lyric       = lyric,
                Rate        = analyzeResult.Rate,
                Like        = like,
                Genre       = _genre,
                ReleaseDate = _releaseDate,
                AddDate     = DateTime.Now
            };

            if (analyzeResult.Rate > 70)
            {
                using (var db = new SongRecommendContext()) {
                    if (db.ProposeSong.Find(SongId) == null)
                    {
                        db.ProposeSong.Add(song);
                        db.SaveChanges();
                    }
                }
            }

            var resultLyric = lyric;

            foreach (var word in analyzeResult.Words)
            {
                resultLyric = resultLyric.Replace(word.Word, $@"<span class='v-chip theme--dark light-green darken-2'><span class='v-chip__content tooltip'>{word.Word}<span class='tooltiptext'>{(int)word.Rate}%</span></span></span>");
            }

            var result = new AnalyzeSongResult {
                SongId     = SongId,
                Title      = _title,
                Singer     = _singer,
                Lyric      = resultLyric,
                Rate       = analyzeResult.Rate,
                AlbumCover = _albumCover,
                AlbumName  = _albumName
            };

            return(result);
        }