private static void SentimentAnalysisHandler(object sender, PageCrawlCompletedArgs e)
        {
            // Collects parsed body text from each successfully crawled page into the
            // shared allParsedText list for later sentiment analysis.
            CrawledPage crawledPage = e.CrawledPage;

            if (crawledPage.HttpResponseMessage.StatusCode != HttpStatusCode.OK)
            {
                Log.Logger.Debug("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
                return; // Failed request: nothing usable.
            }

            Log.Logger.Debug("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);

            if (string.IsNullOrEmpty(crawledPage.Content.Text))
            {
                Log.Logger.Debug("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
                return; // Nothing to parse.
            }

            // Extract readable text fragments from the raw HTML body.
            // (Removed unused httpStatus local and dead commented-out parser call.)
            var parsedText = ParseRawHTML_bodyText(crawledPage.Content.Text);

            if (parsedText == null)
            {
                Log.Logger.Debug("WARNING: \"parsedText\" is null after parsing.");
                return;
            }

            allParsedText = allParsedText.Concat(parsedText).ToList();
        }
示例#2
0
        private static void PageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            // Hand the completed page's address straight to the scraper.
            ScrapeData(e.CrawledPage.Uri.ToString());
        }
示例#3
0
        private static void PageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            // Parses a news article from each crawled page (when parsing is allowed),
            // records it, and updates the shared word-frequency dictionary.

            // Skip pages that failed at the HTTP level.
            if (e.CrawledPage.HttpRequestException != null)
            {
                return;
            }

            // Parser configuration travels with the crawl via the CrawlBag.
            IParserSettings settings = e.CrawlContext.CrawlBag.Settings;
            if (!settings.IsPageParseAllowed(e.CrawledPage))
            {
                return;
            }

            IParser<NewsData> parser = e.CrawlContext.CrawlBag.Parser;
            var article = parser.Parse(e.CrawledPage.AngleSharpHtmlDocument, e.CrawledPage.Uri.AbsoluteUri);
            if (article == null) // something went wrong
            {
                return;
            }

            news_list.Add(article);

            // Tokenize the article text and fold the tokens into the global counts.
            var tokens = TextProcessingHelper.TextSplittingAndRemovingSymbols(article.Text);
            TextProcessingHelper.CountFrequentWords(ref words_dictionary, tokens);

            Console.WriteLine(e.CrawledPage.Uri);
            Console.WriteLine("=================================");
        }
示例#4
0
        private void ProcessDetailPageCrawlCompletedAsync(object sender, PageCrawlCompletedArgs e)
        {
            // Pulls the genre and release-date fields off the detail page's
            // definition list; each field is null when its label is absent.
            var root = e.CrawledPage.HtmlDocument.DocumentNode;

            // "장르" = genre; the value sits two siblings past the <dt> label.
            var genreNode = root.SelectSingleNode(".//dt[text()='장르']");
            _genre = genreNode?.NextSibling.NextSibling.InnerText;

            // "발매일" = release date; same layout as the genre field.
            var releaseNode = root.SelectSingleNode(".//dt[text()='발매일']");
            _releaseDate = releaseNode?.NextSibling.NextSibling.InnerText;
        }
        static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            // Checks each successfully crawled page for a Google Analytics script block.
            CrawledPage crawledPage = e.CrawledPage;

            if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
                return; // Failed request: HtmlDocument is not reliable — bail out early.
            }

            Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);

            if (string.IsNullOrEmpty(crawledPage.Content.Text))
            {
                Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
                return; // Nothing to inspect.
            }

            // SelectNodes returns null (not an empty collection) when nothing matches.
            var analyticCode = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//script[contains(., 'Google Analytics')]");

            if (analyticCode != null)
            {
                _logger.Info("OK Analytic code exist");
            }
            else
            {
                _logger.Info("Failed code ");
            }
        }
示例#6
0
        private static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            // Extracts quotes (wrapped in &ldquo; ... &rdquo; entities) from
            // quoteText divs on each successfully crawled page and prints them.
            CrawledPage crawledPage = e.CrawledPage;

            if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
            }
            else
            {
                // SelectNodes returns null when no node matches — the original foreach
                // would throw NullReferenceException on quote-free pages.
                var quotes = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//div[@class='quoteText']");
                if (quotes != null)
                {
                    foreach (var quote in quotes)
                    {
                        // Capture the text between the typographic quote entities.
                        var readableQuote = Regex.Match(quote.InnerHtml, @"&ldquo;(.*)&rdquo;").Groups[1].Value;
                        Console.WriteLine(string.Format(CultureInfo.InvariantCulture, "<quote>\n{0}\n</quote>", RemoveUnwantedTags(readableQuote)));
                    }
                }
            }

            if (string.IsNullOrEmpty(crawledPage.Content.Text))
            {
                Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
            }
            // (Removed the unused trailing htmlAgilityPackDocument/angleSharpHtmlDocument locals.)
        }
示例#7
0
        static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            // Mirrors each crawled page to disk under rootPath, trimmed to the region
            // between the "/NEW HEADER" marker and the bottom-footer comment.
            if (e.CrawledPage.Content.Text == "")
            {
                return; // Nothing to save.
            }

            // Strip any query string before mapping the URL onto a file path.
            var fpath = e.CrawledPage.Uri.ToString();
            int query = fpath.IndexOf('?');
            if (query > 0)
            {
                fpath = fpath.Substring(0, query);
            }
            fpath = rootPath + getDirectoryFromUrl(fpath);
            ensurePath(fpath);

            try
            {
                // Never overwrite a previously saved copy.
                if (!File.Exists(fpath))
                {
                    var s    = e.CrawledPage.Content.Text;
                    var find = s.IndexOf("/NEW HEADER");
                    if (find > -1)
                    {
                        s = s.Substring(find); // Drop everything before the header marker.
                    }
                    find = s.IndexOf("<!-- Start Bottom Footer -->");
                    if (find > -1)
                    {
                        s = s.Substring(0, find); // Drop the footer and everything after it.
                    }
                    File.WriteAllText(fpath, s);
                }
            }
            catch (Exception ex)
            {
                // Best-effort save: keep crawling, but record why the write failed
                // instead of swallowing the error silently as the original did.
                Console.WriteLine("Failed to save {0}: {1}", fpath, ex.Message);
            }
        }
        private void ProcessPageCrawlCompletedAsync(object sender, PageCrawlCompletedArgs e)
        {
            // Scrapes song metadata (title, singer) from the crawled Melon page, fetches
            // the lyrics from the lyric JSON endpoint, and stores the combined record.
            // Reports the outcome through _isCrawlingSuccess / _message.
            try {
                using (var db = new SongRecommendContext()) {
                    //---------------------------
                    // Fetch the lyrics
                    //---------------------------
                    // using disposes the client — the original leaked it on every call.
                    // NOTE(review): a shared static HttpClient (or IHttpClientFactory)
                    // would avoid per-call socket churn; .Result blocks the thread and is
                    // tolerated only because this handler is synchronous by contract.
                    using (HttpClient client = new HttpClient())
                    {
                        string jsonString = client.GetStringAsync($"https://www.melon.com/song/lyricInfo.json?songId={SongId}").Result;
                        var    lyric      = JObject.Parse(jsonString).Value <string>("lyric");
                        if (string.IsNullOrEmpty(lyric))
                        {
                            _isCrawlingSuccess = false;
                            _message           = "가사가 없습니다";
                            return;
                        }

                        //---------------------------
                        // Parse the song info from the crawled page
                        //---------------------------
                        var crawledPage = e.CrawledPage;
                        var doc         = crawledPage.HtmlDocument.DocumentNode;

                        // Title is the third child of the song_name div on this layout.
                        var title = doc.SelectSingleNode(".//div[@class='song_name']").ChildNodes[2].InnerText.Trim();
                        if (string.IsNullOrEmpty(title))
                        {
                            _isCrawlingSuccess = false;
                            _message           = "곡명이 없습니다";
                            return;
                        }

                        var singer = doc.SelectSingleNode(".//div[@class='artist']").InnerText.Trim();
                        if (string.IsNullOrEmpty(singer))
                        {
                            _isCrawlingSuccess = false;
                            _message           = "가수명이 없습니다";
                            return;
                        }

                        //---------------------------
                        // Persist to the database
                        //---------------------------
                        db.BaseWordCollectingSong.Add(new BaseWordCollectingSong {
                            SongId = SongId,
                            Title  = title,
                            Singer = singer,
                            // Normalize <br> variants to spaces so the lyric is one searchable line.
                            Lyric  = Regex.Replace(lyric, @"<br>|<br/>|</br>", " ", RegexOptions.IgnoreCase).Trim(),
                            Status = CrawlingStatus.Ready
                        });

                        db.SaveChanges();
                        _isCrawlingSuccess = true;
                        _message           = $"{title} 을 추가했습니다";
                    }
                }
            }
            catch (Exception ex) {
                _isCrawlingSuccess = false;
                _message           = ex.Message;
            }
        }
示例#9
0
        private static void ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            // Record the page as a graph node whose edges are its outgoing links.
            // ParsedLinks is null when the page yielded no links, and so is the edge list.
            var node     = e.CrawledPage.Uri.AbsoluteUri;
            var outgoing = e.CrawledPage.ParsedLinks?.Select(link => link.AbsoluteUri).ToList();

            Graph.AddNode(node, outgoing);
        }
示例#10
0
        static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            // Prints the text of every node on the page that mentions the keyword
            // '小港醫院'.
            CrawledPage crawledPage = e.CrawledPage;

            if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
                return; // No document to search on a failed crawl.
            }

            Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);

            if (string.IsNullOrEmpty(crawledPage.Content.Text))
            {
                Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
                return;
            }

            // SelectNodes returns null (not empty) when nothing matches — the original
            // foreach would throw NullReferenceException on keyword-free pages.
            HtmlNodeCollection keywordContent = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//*[text()[contains(., '小港醫院')]]");
            if (keywordContent == null)
            {
                return;
            }

            foreach (HtmlNode node in keywordContent)
            {
                Console.WriteLine(node.InnerText);
            }
        }
示例#11
0
        async void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            // Scrapes each successfully crawled page and persists the result to Mongo.
            // (async void is acceptable only because this is a top-level event handler.)
            var crawledPage = e.CrawledPage;

            if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
                return;
            }

            Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);

            if (string.IsNullOrEmpty(crawledPage.Content.Text))
            {
                Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
                return;
            }

            var scrapeResult = _websiteSource.Scraper.GetScrapeResult(crawledPage);
            if (scrapeResult == null)
            {
                return;
            }

            // Valid results go to the main collection, invalid ones to the error store.
            if (scrapeResult.IsValid())
            {
                await _mongoManager.AddScrapeResult(scrapeResult);
            }
            else
            {
                await _mongoManager.AddErroredScrapeResult(scrapeResult);
            }
        }
示例#12
0
        private static void Crawler_PageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            // Harvests every double-quoted https URL from the raw page text and
            // schedules the ones the decision maker approves.
            // (Removed the dead debugger-break block and commented-out timing code.)
            const string matchString = @"""(https:[^""]*)";

            MatchCollection matches = Regex.Matches(e.CrawledPage.Content.Text, matchString, RegexOptions.Multiline | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

            foreach (var match in matches.OfType <Match>())
            {
                try
                {
                    var uri = new Uri(match.Groups[1].Value);

                    if (BetterDecisionMaker.ShouldCrawl(uri, null).Allow)
                    {
                        e.CrawlContext.Scheduler.Add(new PageToCrawl(uri));
                    }
                }
                catch (UriFormatException)
                {
                    // Matched text was not a well-formed absolute URI — skip it.
                    // (The original bare catch also hid scheduler errors.)
                }
            }
        }
示例#13
0
        void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            // Collects every image URL on the crawled page and kicks off a download for
            // each, splitting URLs into absolute ("http...") and relative sets first.
            var images = e.CrawledPage.AngleSharpHtmlDocument.Images;

            imageUrls = images.Select(image => image.Source).ToList();
            var validImgUrls   = imageUrls.Where(x => x.StartsWith("http")).ToList();
            var invalidImgUrls = imageUrls.Where(x => !x.StartsWith("http"));

            try
            {
                // NOTE(review): ForEach with an async lambda produces fire-and-forget
                // async-void delegates — exceptions thrown inside SaveImageAsync after
                // its first await are NOT caught by this try/catch, and the downloads
                // may still be in flight when this handler returns. Collecting the
                // Tasks and awaiting Task.WhenAll would make failures observable.
                ValidateImageUrls(invalidImgUrls).ForEach(async url => await SaveImageAsync(url));
            }
            catch (Exception)
            {
                // Only synchronous failures (e.g. from ValidateImageUrls) land here.
                exceptionCounter++;
            }

            try
            {
                // Same caveat as above: only synchronous exceptions are counted.
                validImgUrls.ForEach(async url => await SaveImageAsync(url));
            }
            catch (Exception)
            {
                exceptionCounter++;
            }
        }
        private void _crawler_PageCrawlCompletedAsync(object sender, PageCrawlCompletedArgs e)
        {
            // Routes completed pages by URL shape: trace detail pages are parsed into a
            // label → value map and published; trace data pages are currently ignored.
            CrawledPage crawledPage = e.CrawledPage;

            // Trace detail (data) page: a user/.../traces/... URL that is not a tag,
            // login/referer redirect, or RSS variant.
            if (crawledPage.Uri.AbsoluteUri.Contains("user") && crawledPage.Uri.AbsoluteUri.Contains("traces") && !crawledPage.Uri.AbsoluteUri.Contains("/tag") && !crawledPage.Uri.AbsoluteUri.Contains("login?referer=") && !crawledPage.Uri.AbsoluteUri.Contains("new?referer=") && !crawledPage.Uri.AbsoluteUri.Contains("/rss") && !crawledPage.Uri.AbsoluteUri.Contains("/login?"))
            {
                // The trace id is the last path segment of the URL.
                string                      traceId    = crawledPage.Uri.Segments.Last();
                HtmlNodeCollection          collection = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//tr");
                Dictionary <string, string> props      = new Dictionary <string, string>();
                props.Add("TraceId", traceId);
                // Each <tr> is assumed to hold a label cell at child index 1 and a value
                // cell at child index 3.
                // NOTE(review): this also assumes labels are unique — Dictionary.Add
                // throws on a duplicate; and SelectNodes returns null when the page has
                // no <tr> at all. Confirm both against the site's actual markup.
                foreach (var element in collection)
                {
                    string name  = element.ChildNodes[1].InnerText.Replace(":", "").Replace(" ", "");
                    string value = element.ChildNodes[3].InnerText;
                    props.Add(name, value);
                }
                OnTraceInfoComplete(props);
            }
            // Overview (summary) page — handling is currently disabled.
            else if (crawledPage.Uri.AbsoluteUri.Contains("trace") && crawledPage.Uri.AbsoluteUri.Contains("data"))
            {
                //string tarceId = crawledPage.Uri.Segments[2].Replace("/","");
                //string xmlContent = crawledPage.Content.Text;
                //OnTraceDataComplete(tarceId,xmlContent);
            }
        }
示例#15
0
        //after page is crawled, here's the result
        void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            // Extracts a product name/price string and an image URL from each
            // successfully crawled page and appends them to the output file.
            CrawledPage crawledPage = e.CrawledPage;

            if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
                return; // No usable document on a failed crawl — the original kept going.
            }

            Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);

            if (string.IsNullOrEmpty(crawledPage.Content.Text))
            {
                Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
                return; // Nothing to extract.
            }

            var result         = DataFinder.FindNameAndPrice(crawledPage);
            var resultImageUrl = DataFinder.FindImage(crawledPage);

            // Only write rows where a name/price was actually found.
            if (result != "")
            {
                _fileSystem.AddLine($"{result}\t\t{resultImageUrl}");
            }
        }
示例#16
0
        void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            // Builds a Page model from each successfully crawled page and processes it.
            var    crawledPage = e.CrawledPage;
            string pageUri     = crawledPage.Uri.AbsoluteUri;

            if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                Log($"Crawl of page failed {pageUri}");
                return;
            }

            if (string.IsNullOrEmpty(crawledPage.Content.Text))
            {
                Log($"Page had no content {pageUri}");
                return;
            }

            // Page.CreateFrom performs its own parsing; the locally assigned
            // HtmlAgilityPack/AngleSharp documents were never used and were removed.
            var page = Page.CreateFrom(crawledPage);

            Process(page);
        }
示例#17
0
        private static void sub_crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            // Finds the embedded player script, reconstructs the MP3 download link from
            // its 'domain' and 'thunder_url' variables, and records it.
            CrawledPage crawledPage = e.CrawledPage;

            if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
                return; // Failed page: no document to inspect — the original kept going.
            }

            Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);

            if (string.IsNullOrEmpty(crawledPage.Content.Text))
            {
                Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
                return;
            }

            // SelectSingleNode returns null when the script is absent — the original
            // dereferenced it unconditionally and could throw NullReferenceException.
            var embedNode = crawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//script[contains(text(), 'thunder_url')]");
            if (embedNode == null)
            {
                return;
            }

            var domain      = Regex.Match(embedNode.InnerText, @".*domain.*'(.*)'").Groups[1].Value;
            var thunder_url = Regex.Match(embedNode.InnerText, ".*thunder_url.*\"(.*)\"").Groups[1].Value;

            DownloadMP3LinkList.Add(domain + thunder_url);
        }
        private void CrawlerOnPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            // Collects artist name/url pairs from a paginated listing; pages without a
            // valid non-negative "page" query parameter are skipped.
            var document   = e.CrawledPage.AngleSharpHtmlDocument;
            var pageNumber = -1;
            var rawPage    = HttpUtility.ParseQueryString(e.CrawledPage.Uri.Query).Get("page");

            if (rawPage != null && !int.TryParse(rawPage, out pageNumber))
            {
                pageNumber = -1; // Present but unparsable — treat as missing.
            }

            if (pageNumber < 0)
            {
                return;
            }

            var artistLinks = document.QuerySelectorAll("table>tbody>tr>td>table>tbody>tr>td>a");

            foreach (var artistLink in artistLinks)
            {
                _artistPages.Add(new ArtistPage()
                {
                    Name = artistLink.TextContent,
                    Url  = artistLink.GetAttribute("href"),
                    Page = pageNumber,
                });
            }

            Console.WriteLine("completed [{0}]: {1}", _artistPages.Count, e.CrawledPage.Uri);
        }
示例#19
0
        private void syncPageCrawledCompleted(object sender, PageCrawlCompletedArgs e)
        {
            // Logs each crawl result and, on success, appends the URL to the shared
            // links list that backs listBox1.
            CrawledPage crawledPage = e.CrawledPage;

            if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
            }
            else
            {
                Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);
                links.Add(crawledPage.Uri.AbsoluteUri);
                // NOTE(review): re-assigning the SAME list reference to DataSource does
                // not refresh the ListBox — rebind (set to null first) or use a
                // BindingList. Also confirm this handler runs on the UI thread before
                // touching listBox1; crawler callbacks are often on worker threads.
                listBox1.DataSource = links;
            }

            if (string.IsNullOrEmpty(crawledPage.Content.Text))
            {
                Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
            }

            // Parsed documents are available here but currently unused.
            var htmlAgilityPackDocument = crawledPage.HtmlDocument; //Html Agility Pack parser


            var angleSharpHtmlDocument = crawledPage.AngleSharpHtmlDocument; //
        }
示例#20
0
        private static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            // Scans each successfully crawled page for download links and, when any are
            // found, replaces the shared DownloadLinkList with the fresh set.
            CrawledPage crawledPage = e.CrawledPage;

            if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
                return; // Failed page: HtmlDocument is not usable — the original kept going.
            }

            Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);

            if (string.IsNullOrEmpty(crawledPage.Content.Text))
            {
                Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
                return;
            }

            // (Removed the unused AngleSharp document from the original boilerplate.)
            var list = findAllListForASC(crawledPage.HtmlDocument);

            if (list.Count > 0)
            {
                DownloadLinkList = list;
            }
        }
        private void Crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            // Folds one page's timing results into the aggregate statistics and
            // publishes a per-page completion event exactly once per URI.
            // (Removed the unused pageToCrawl local.)
            TestResultDetail item = e.CrawledPage.PageBag.Item;

            lock (mainLock)
            {
                rootMeanResponseTime += item.MeanResponseTime;
                if (item.MinResponseTime < rootMinResponseTime)
                {
                    rootMinResponseTime = item.MinResponseTime;
                }
                if (item.MaxResponseTime > rootMaxResponseTime)
                {
                    rootMaxResponseTime = item.MaxResponseTime;
                }
                ++numberOfPagesCrawled;
            }

            // TryAdd is atomic; the original ContainsKey-then-TryAdd pair let two
            // concurrent handlers both pass the ContainsKey check and record the same
            // page twice. The handler that wins the TryAdd race does the bookkeeping.
            if (processedPages.TryAdd(item.Uri, 0))
            {
                resultDetails.Add(item);
                PageTestingCompleted?.Invoke(this, new PageTestingCompletedArgs(item));
            }
        }
示例#22
0
        protected override void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            // Parses a single category from the completed page, stores it in the shared
            // CrawlBag collection, and cancels the remainder of the crawl.
            var page = e.CrawledPage;

            if (page.WebException != null || page.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                Console.WriteLine("Crawl of page failed {0}", page.Uri.AbsoluteUri);
            }
            else
            {
                Console.WriteLine("Crawl of page succeeded {0}", page.Uri.AbsoluteUri);
            }

            if (string.IsNullOrEmpty(page.Content.Text))
            {
                Console.WriteLine("Page had no content {0}", page.Uri.AbsoluteUri);
                return;
            }

            var category = ParseCategory(page.Content.Text);
            (e.CrawlContext.CrawlBag.elements as ConcurrentBag <SpiderCategory>).Add(category);

            // One category is all we need — stop the crawler.
            e.CrawlContext.CancellationTokenSource.Cancel();
        }
示例#23
0
        public void Constructor_ValidArg_SetsPublicProperty()
        {
            // Arrange
            var page = new CrawledPage(new Uri("http://aaa.com/"));

            // Act
            var uut = new PageCrawlCompletedArgs(new CrawlContext(), page);

            // Assert: the args expose the exact page instance that was passed in.
            Assert.AreSame(page, uut.CrawledPage);
        }
        private void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            // Feeds each successfully crawled page's HTML into the recipe extractor.
            CrawledPage crawledPage = e.CrawledPage;

            if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
                return; // Failed page: no reliable document — the original kept going.
            }

            Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);

            if (string.IsNullOrEmpty(crawledPage.Content.Text))
            {
                Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
                return; // Nothing to extract recipes from.
            }

            _recipeWebsite.GetRecipeDataFromHTML(crawledPage.HtmlDocument, Recipes);
            //ReceptiteBg(htmlAgilityPackDocument);
            //ReceptiteGotvachBg(htmlAgilityPackDocument);
        }
示例#25
0
        static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            // Log the page and bump the crawl-wide page counter kept in the CrawlBag.
            CrawledPage crawledPage = e.CrawledPage;
            Console.WriteLine($"Crawled {crawledPage.Uri.AbsoluteUri}");

            e.CrawlContext.CrawlBag.Count++;
            Console.WriteLine($"Total pages crawled: {e.CrawlContext.CrawlBag.Count}");
        }
示例#26
0
        private static async void Crawler_PageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            // Resolve a scoped ContentContext and delegate the real work to the
            // application handler. (async void is tolerable only because this is a
            // top-level event handler.)
            using var scope = Program.ServiceProvider.CreateScope();
            var context = scope.ServiceProvider.GetService <ContentContext>();

            await OnPageCrawlCompleted(e, context);
        }
        protected override void OnCrawlerProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            // Loads the rendered page source, logs the first anchor's text as a debug
            // aid, and counts the page.
            var doc = new HtmlDocument();

            doc.LoadHtml(e.Driver.PageSource);
            // SelectSingleNode returns null when the page contains no <a> — the
            // original dereferenced it unconditionally and could throw
            // NullReferenceException.
            Debug.WriteLine(doc.DocumentNode.SelectSingleNode("//a")?.InnerText);
            CrawledPageCount++;
        }
示例#28
0
        static void crawler_PageCrawlCompletedAsync(object sender, PageCrawlCompletedArgs e)
        {
            // Demonstrates re-parsing a crawled page's links and splitting them into
            // internal (same authority as the crawl root) and external sets.
            Console.WriteLine(e.CrawledPage.Uri);

            var parser = new HapHyperLinkParser();
            IEnumerable <Uri> allLinksOnPage = parser.GetLinks(e.CrawledPage);
            IEnumerable <Uri> internalLinks  = allLinksOnPage.Where(link => link.Authority == e.CrawlContext.RootUri.Authority);
            IEnumerable <Uri> externalLinks  = allLinksOnPage.Except(internalLinks);
        }
示例#29
0
        private void CrawlerOnPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            // Scrapes one song's artist/title/lyrics from a lyrics page identified by
            // its "id" query parameter, and stores it when the artist matches the hint.
            // (Removed the unused badLyrics local and dead commented-out code.)
            var parsedQuery = HttpUtility.ParseQueryString(e.CrawledPage.Uri.Query);
            var id          = parsedQuery.Get("id");

            if (id == null)
            {
                return; // Not a song page.
            }

            var html           = e.CrawledPage.AngleSharpHtmlDocument;
            var headerElements = html.QuerySelectorAll("table table table tbody>tr>td>font>a");
            var headers        = headerElements.Select(element => element.TextContent).ToList();
            if (headers.Count != 2)
            {
                return; // Unexpected page layout.
            }

            var artist = headers[1];
            var title  = headers[0];

            // Some layouts carry artist/title in the last two text cells instead.
            var lines = html.QuerySelectorAll("table.tabletext>tbody>tr>td").Select(element => element.TextContent).ToList();
            if (lines.Count > 2)
            {
                artist = lines[lines.Count - 1];
                title  = lines[lines.Count - 2];
                lines  = lines.Take(lines.Count - 2).ToList();
            }

            if (artist.StartsWith(_artistNameHint))
            {
                // Join the lyric lines, skipping known junk lines.
                var sb = new StringBuilder(1024);
                foreach (var line in lines)
                {
                    if (IsBadLyricsLines(line))
                    {
                        continue;
                    }

                    sb.AppendLine(line);
                }

                _songs.Add(new Song()
                {
                    Artist = artist,
                    Id     = id,
                    Lyrics = sb.ToString(),
                    Title  = title,
                });

                Console.Write("."); // progress tick
            }
            else
            {
                Console.Write("-"); // artist mismatch
            }
        }
示例#30
0
        private void Crawler_PageCrawlCompleted(object? sender, PageCrawlCompletedArgs e)
        {
            // Log the page, then print every example extracted from its raw text.
            Logger.Information("{0}", e.CrawledPage.Uri);

            var examples = GetExamples(e.CrawledPage.Content.Text);
            foreach (var example in examples)
            {
                Console.WriteLine(example);
            }
        }