Exemple #1
0
 /// <summary>
 /// Initializes a new <see cref="Shop"/> by copying the relevant fields
 /// from a crawled shop record.
 /// </summary>
 /// <param name="crawlData">Crawl result whose values seed this shop.</param>
 public Shop(CrawlData crawlData)
 {
     Name = crawlData.Name;
     Type = crawlData.Type;
     Sido = crawlData.CodeSido;
     Sigungu = crawlData.CodeSigungu;
     Address = crawlData.Address;
     AddressSido = crawlData.AddressSiDo;
     AddressSigungu = crawlData.AddressSiGunGu;
 }
 /// <summary>
 /// Runs content extraction and then link extraction over the crawl result.
 /// </summary>
 /// <param name="crawlData">Crawl result that is updated in place.</param>
 /// <returns><c>true</c> when both extraction steps succeed; <c>false</c> on any failure.</returns>
 public override bool Process(CrawlData crawlData)
 {
     bool succeeded;
     try
     {
         DoExtractContent(crawlData);
         DoExtractLinks(crawlData);
         succeeded = true;
     }
     catch
     {
         // Best-effort processing: any extraction failure is surfaced only
         // through the boolean result, never as an exception to the caller.
         succeeded = false;
     }
     return succeeded;
 }
Exemple #3
0
        /// <summary>
        /// Resolves the deputy page URLs behind the given list page and crawls
        /// each one, publishing every crawled deputy via the data notifier.
        /// </summary>
        /// <param name="url">List page to read deputy URLs from.</param>
        /// <param name="termOfOffice">Term of office the deputies belong to.</param>
        /// <param name="data">Crawl context carrying the notifier to publish results on.</param>
        private async Task ProcessListAsync(Uri url, TermOfOffice termOfOffice, CrawlData data)
        {
            var deputyUrls = await GetDeputiesUrlsAsync(url).ConfigureAwait(false);

            foreach (var singleUrl in deputyUrls)
            {
                var crawledDeputy = await deputyCrawlerManager
                    .CrawlAsync(singleUrl, termOfOffice)
                    .ConfigureAwait(false);

                data.DataNotifier.NewDeputyFound(crawledDeputy);
            }
        }
        /// <summary>
        /// Strips the configured index sections from the original content, then
        /// extracts the title, meta tags and filtered content into <paramref name="crawlData"/>.
        /// </summary>
        /// <param name="crawlData">Crawl result whose content is parsed and updated in place.</param>
        /// <exception cref="InvalidOperationException">Thrown when two meta tags share the same key.</exception>
        private void DoExtractContent(CrawlData crawlData)
        {
            var text = _indexStrips.Aggregate(crawlData.OriginalContent, (current, strip) => StripText(current, strip.Key, strip.Value));
            var html = new HtmlDocument(text);

            crawlData.Title = html.FindTagsByName("title").Select(t => t.InnerText).FirstOrDefault();

            var metaData = new CaseInvariantNameValueCollection();
            foreach (var keyValue in html.FindTagsByName("meta").Select(GetKeyValue).Where(keyValue => keyValue.HasValue))
            {
                // A duplicate key indicates malformed markup; fail fast rather than
                // silently overwriting an earlier value.
                // InvalidOperationException replaces ApplicationException, which the
                // Framework Design Guidelines advise against throwing; message also
                // gains the missing space after the colon.
                if (metaData.HasKey(keyValue.Value.Key))
                    throw new InvalidOperationException("Duplicate meta tags: " + keyValue.Value.Key);
                metaData.Append(keyValue.Value.Key, keyValue.Value.Value);
            }
            crawlData.MetaData = metaData.ToDictionary();
            crawlData.FilteredContent = html.ExtractContent();
        }
Exemple #5
0
        /// <summary>
        /// Crawls the Orka deputies starting at <paramref name="url"/>: resolves the
        /// deputy list URL, processes every deputy on it, and logs the elapsed time.
        /// </summary>
        /// <param name="url">Starting page of the Orka deputies crawl.</param>
        /// <param name="termOfOffice">Term of office being crawled.</param>
        /// <param name="data">Crawl context receiving results.</param>
        public async Task CrawlAsync(Uri url, TermOfOffice termOfOffice, CrawlData data)
        {
            Require.NotNull(url, nameof(url));
            Require.NotNull(termOfOffice, nameof(termOfOffice));
            Require.NotNull(data, nameof(data));

            logger.LogInformation("Start Orka deputies {Url}.", url);
            var perfWatch = Stopwatch.StartNew();

            var listUrl = await GetListUrlAsync(url).ConfigureAwait(false);

            await ProcessListAsync(listUrl, termOfOffice, data).ConfigureAwait(false);

            logger.LogInformation("Crawling Orka deputies {Url} finished. Took {Elapsed}", url, perfWatch.Elapsed);
        }
Exemple #6
0
        /// <summary>
        /// Dispatches the crawl to the first registered deputies crawler that
        /// matches the URL; logs a warning when no crawler matches.
        /// </summary>
        /// <param name="url">URL to find a matching crawler for.</param>
        /// <param name="termOfOffice">Term of office being crawled.</param>
        /// <param name="data">Crawl context passed through to the matched crawler.</param>
        public async Task CrawlAsync(Uri url, TermOfOffice termOfOffice, CrawlData data)
        {
            Require.NotNull(url, nameof(url));
            Require.NotNull(termOfOffice, nameof(termOfOffice));
            Require.NotNull(data, nameof(data));

            var matchingCrawler = deputiesCrawlers.FirstOrDefault(d => d.IsMatch(url));

            if (matchingCrawler != null)
            {
                await matchingCrawler.CrawlAsync(url, termOfOffice, data).ConfigureAwait(false);
            }
            else
            {
                logger.LogWarning("Couldn't match correct crawler for url {Url}", url);
            }
        }
 /// <summary>
 /// Writes the response bytes to a temporary file carrying the expected
 /// extension, then runs a <c>FilterReader</c> over it to populate the
 /// crawl result's filtered content.
 /// </summary>
 /// <param name="data">Crawl result providing the response stream and receiving the text.</param>
 /// <returns><c>true</c> when filtering succeeds; <c>false</c> on any failure.</returns>
 public override bool Process(CrawlData data)
 {
     try
     {
         using (var file = new TempFile())
         {
             // The filter is chosen by file extension, so the temp file must carry it.
             file.FileName = file.FileName + "." + Extension;
             File.WriteAllBytes(file.FileName, data.ResponseStream.ToArray());

             using (var filterReader = new FilterReader(file.FileName))
             {
                 data.FilteredContent = filterReader.ReadToEnd().Trim();
             }
         }

         return true;
     }
     catch
     {
         // Best-effort: any failure (I/O, missing filter, bad content) is
         // reported as false rather than propagated.
         return false;
     }
 }
Exemple #8
0
        /// <summary>
        /// Worker loop body: dequeues the next URI, navigates the browser tab to it
        /// and records navigation failures in the shared crawled collection.
        /// </summary>
        /// <param name="browser">Browser instance this worker operates on.</param>
        /// <param name="cancellationToken">Token used to stop dequeuing and bail out early.</param>
        /// <param name="pauseToken">Pause token (not yet consumed in this skeleton).</param>
        /// <param name="jobData">Job configuration, including the link-enqueue mode.</param>
        /// <param name="queue">Shared queue of URIs still to visit.</param>
        /// <param name="crawled">Shared map of visited URIs to their crawl results.</param>
        private async Task WorkAction(
            Browser browser,
            CancellationToken cancellationToken,
            PauseToken pauseToken,
            WorkerRelevantJobData jobData,
            ConcurrentQueue <Uri> queue,
            ConcurrentDictionary <Uri, CrawlData> crawled)
        {
            // create a new tab on the browser for this thread

            var nextUri = DequeueOrRetry(queue, cancellationToken);

            if (cancellationToken.IsCancellationRequested)
            {
                return;
            }

            // go to page
            // Fix: navigate with the full absolute URI. AbsolutePath yields only the
            // path portion (e.g. "/page"), dropping scheme and host, so the browser
            // could never resolve the target page.
            var response = await _tab.GoToAsync(nextUri.AbsoluteUri);

            if (response.Ok)
            {
                // perform the jobs actions
                // each action could yield a return value, such as extracted data
                // the url should be added to the crawl collection
            }
            else
            {
                // indicate in the crawled collection this was a failure + reason
                crawled.TryAdd(nextUri, CrawlData.CreateFailureData(response.Status, response.StatusText));
            }

            // if we should look for some links on the page
            if (jobData.LinkEnqueueType != LinkEnqueueType.None)
            {
                // get the page content and just put it in a collection
                // parser group will sort through and add the links
            }
        }
 /// <summary>
 /// Processes the crawl result; implementations report failure by returning <c>false</c>.
 /// </summary>
 public abstract bool Process(CrawlData data);

 /// <summary>
 /// A processor applies when the crawl result's MIME type equals this
 /// processor's <c>MimeType</c>, ignoring case (invariant culture).
 /// </summary>
 public virtual bool CanProcess(CrawlData data)
 {
     // string.Equals states the intent directly (CA2251) and handles nulls the
     // same way string.Compare(...) == 0 did, with the identical comparison mode.
     return string.Equals(data.MimeType, MimeType, StringComparison.InvariantCultureIgnoreCase);
 }
        /// <summary>
        /// Strips the configured follow sections from the original content and
        /// stores the <c>href</c> of every anchor tag in the crawl result's links.
        /// </summary>
        /// <param name="data">Crawl result whose links are populated in place.</param>
        private void DoExtractLinks(CrawlData data)
        {
            var strippedText = _followStrips.Aggregate(
                data.OriginalContent,
                (current, strip) => StripText(current, strip.Key, strip.Value));

            var document = new HtmlDocument(strippedText);
            data.Links = document.FindTagsByName("a")
                                 .Select(anchor => anchor.Attributes["href"])
                                 .ToArray();
        }
Exemple #12
0
        /// <summary>
        /// Crawls each deputy URL in sequence and publishes every crawled deputy
        /// through the data notifier.
        /// </summary>
        /// <param name="urls">Deputy page URLs to crawl.</param>
        /// <param name="termOfOffice">Term of office the deputies belong to.</param>
        /// <param name="data">Crawl context carrying the notifier to publish results on.</param>
        private async Task ProcessListAsync(IEnumerable <Uri> urls, TermOfOffice termOfOffice, CrawlData data)
        {
            foreach (var currentUrl in urls)
            {
                var crawledDeputy = await deputyCrawlerManager
                    .CrawlAsync(currentUrl, termOfOffice)
                    .ConfigureAwait(false);

                data.DataNotifier.NewDeputyFound(crawledDeputy);
            }
        }
 /// <summary>
 /// Blanks the filtered content; this processor always reports success.
 /// </summary>
 /// <param name="data">Crawl result whose filtered content is cleared.</param>
 /// <returns>Always <c>true</c>.</returns>
 public override bool Process(CrawlData data)
 {
     // Nothing to extract for this content type - just clear the filtered text.
     data.FilteredContent = "";
     return true;
 }