public Shop(CrawlData crawlData)
{
    Name = crawlData.Name;
    Sido = crawlData.CodeSido;
    Sigungu = crawlData.CodeSigungu;
    AddressSido = crawlData.AddressSiDo;
    AddressSigungu = crawlData.AddressSiGunGu;
    Address = crawlData.Address;
    Type = crawlData.Type;
}

public override bool Process(CrawlData crawlData)
{
    try
    {
        DoExtractContent(crawlData);
        DoExtractLinks(crawlData);
        return true;
    }
    catch
    {
        // Any extraction failure is swallowed; the caller only sees the false result.
        return false;
    }
}

private async Task ProcessListAsync(Uri url, TermOfOffice termOfOffice, CrawlData data)
{
    var urls = await GetDeputiesUrlsAsync(url).ConfigureAwait(false);
    foreach (var deputyUrl in urls)
    {
        var deputy = await deputyCrawlerManager.CrawlAsync(deputyUrl, termOfOffice)
            .ConfigureAwait(false);
        data.DataNotifier.NewDeputyFound(deputy);
    }
}

private void DoExtractContent(CrawlData crawlData)
{
    // Remove every configured index-strip region before parsing.
    var text = _indexStrips.Aggregate(
        crawlData.OriginalContent,
        (current, strip) => StripText(current, strip.Key, strip.Value));

    var html = new HtmlDocument(text);
    crawlData.Title = html.FindTagsByName("title").Select(t => t.InnerText).FirstOrDefault();

    var metaData = new CaseInvariantNameValueCollection();
    foreach (var keyValue in html.FindTagsByName("meta")
        .Select(GetKeyValue)
        .Where(keyValue => keyValue.HasValue))
    {
        if (metaData.HasKey(keyValue.Value.Key))
            throw new ApplicationException("Duplicate meta tags: " + keyValue.Value.Key);
        metaData.Append(keyValue.Value.Key, keyValue.Value.Value);
    }

    crawlData.MetaData = metaData.ToDictionary();
    crawlData.FilteredContent = html.ExtractContent();
}

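// GetKeyValue is referenced above but not shown. A minimal sketch of one
// possible shape (assumed, not the original; the HtmlTag element type is also
// an assumption): map a <meta name="..." content="..."> tag to a nullable
// key/value pair, returning null when an attribute is missing so the
// Where(keyValue => keyValue.HasValue) filter above drops it.
private static KeyValuePair<string, string>? GetKeyValue(HtmlTag meta)
{
    var name = meta.Attributes["name"];
    var content = meta.Attributes["content"];
    return string.IsNullOrEmpty(name) || string.IsNullOrEmpty(content)
        ? (KeyValuePair<string, string>?)null
        : new KeyValuePair<string, string>(name, content);
}
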
public async Task CrawlAsync(Uri url, TermOfOffice termOfOffice, CrawlData data)
{
    Require.NotNull(url, nameof(url));
    Require.NotNull(termOfOffice, nameof(termOfOffice));
    Require.NotNull(data, nameof(data));

    logger.LogInformation("Start crawling Orka deputies {Url}.", url);
    var perfWatch = Stopwatch.StartNew();

    var listUrl = await GetListUrlAsync(url).ConfigureAwait(false);
    await ProcessListAsync(listUrl, termOfOffice, data).ConfigureAwait(false);

    logger.LogInformation("Crawling Orka deputies {Url} finished. Took {Elapsed}.", url, perfWatch.Elapsed);
}

public async Task CrawlAsync(Uri url, TermOfOffice termOfOffice, CrawlData data)
{
    Require.NotNull(url, nameof(url));
    Require.NotNull(termOfOffice, nameof(termOfOffice));
    Require.NotNull(data, nameof(data));

    var crawler = deputiesCrawlers.FirstOrDefault(d => d.IsMatch(url));
    if (crawler == null)
    {
        logger.LogWarning("Couldn't match a crawler for url {Url}.", url);
    }
    else
    {
        await crawler.CrawlAsync(url, termOfOffice, data).ConfigureAwait(false);
    }
}

public override bool Process(CrawlData data)
{
    try
    {
        using (var file = new TempFile())
        {
            // Give the temp file the expected extension so the right content
            // filter can be resolved for it.
            file.FileName += "." + Extension;
            File.WriteAllBytes(file.FileName, data.ResponseStream.ToArray());

            using (var filterReader = new FilterReader(file.FileName))
            {
                data.FilteredContent = filterReader.ReadToEnd().Trim();
            }
        }
        return true;
    }
    catch
    {
        return false;
    }
}

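// TempFile is used above but not shown. A minimal sketch under assumed
// semantics: it hands out a unique path in the system temp directory, lets
// the caller rename it (FileName is settable, as the += above requires), and
// best-effort deletes whatever ended up at FileName on dispose.
internal sealed class TempFile : IDisposable
{
    public string FileName { get; set; } =
        Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString("N"));

    public void Dispose()
    {
        try
        {
            if (File.Exists(FileName))
                File.Delete(FileName);
        }
        catch (IOException)
        {
            // Best effort: leave the orphan for OS temp cleanup.
        }
    }
}
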
private async Task WorkAction(
    Browser browser,
    CancellationToken cancellationToken,
    PauseToken pauseToken,
    WorkerRelevantJobData jobData,
    ConcurrentQueue<Uri> queue,
    ConcurrentDictionary<Uri, CrawlData> crawled)
{
    var nextUri = DequeueOrRetry(queue, cancellationToken);
    if (cancellationToken.IsCancellationRequested)
    {
        return;
    }

    // create a new tab on the browser for this thread
    var tab = await browser.NewPageAsync();

    // go to page (AbsoluteUri, not AbsolutePath, so the host is included)
    var response = await tab.GoToAsync(nextUri.AbsoluteUri);
    if (response.Ok)
    {
        // perform the job's actions
        // each action could yield a return value, such as extracted data
        // the url should be added to the crawled collection
    }
    else
    {
        // indicate in the crawled collection this was a failure + reason
        crawled.TryAdd(nextUri, CrawlData.CreateFailureData(response.Status, response.StatusText));
    }

    // if we should look for some links on the page
    if (jobData.LinkEnqueueType != LinkEnqueueType.None)
    {
        // get the page content and just put it in a collection;
        // the parser group will sort through and add the links
    }
}

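// DequeueOrRetry is referenced above but not shown. A minimal sketch under
// assumed semantics: spin on the shared queue, bail out to null when
// cancellation is requested (the caller checks the token right after), and
// back off briefly between attempts so idle workers don't busy-wait.
private static Uri DequeueOrRetry(ConcurrentQueue<Uri> queue, CancellationToken cancellationToken)
{
    Uri nextUri;
    while (!queue.TryDequeue(out nextUri))
    {
        if (cancellationToken.IsCancellationRequested)
        {
            return null;
        }
        Thread.Sleep(100); // small back-off before retrying the empty queue
    }
    return nextUri;
}
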
public abstract bool Process(CrawlData data);
public virtual bool CanProcess(CrawlData data)
{
    return string.Equals(data.MimeType, MimeType, StringComparison.InvariantCultureIgnoreCase);
}

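// How CanProcess and Process are expected to combine: a hypothetical usage
// sketch (the ContentProcessor base-type name and this dispatch helper are
// assumptions, not part of the source). Pick the first processor whose MIME
// type matches the crawled data, and report failure when none can handle it.
public bool ProcessWithFirstMatch(IEnumerable<ContentProcessor> processors, CrawlData data)
{
    var processor = processors.FirstOrDefault(p => p.CanProcess(data));
    return processor != null && processor.Process(data);
}
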
private void DoExtractLinks(CrawlData data)
{
    var text = _followStrips.Aggregate(
        data.OriginalContent,
        (current, strip) => StripText(current, strip.Key, strip.Value));

    var html = new HtmlDocument(text);
    data.Links = html.FindTagsByName("a").Select(a => a.Attributes["href"]).ToArray();
}

private async Task ProcessListAsync(IEnumerable<Uri> urls, TermOfOffice termOfOffice, CrawlData data)
{
    foreach (var deputyUrl in urls)
    {
        var deputy = await deputyCrawlerManager.CrawlAsync(deputyUrl, termOfOffice)
            .ConfigureAwait(false);
        data.DataNotifier.NewDeputyFound(deputy);
    }
}

public override bool Process(CrawlData data)
{
    // Nothing to extract for this content type; just clear the filtered content.
    data.FilteredContent = string.Empty;
    return true;
}