コード例 #1
0
 /// <summary>
 /// Enqueues <paramref name="url"/> on <c>UrlsToCrawl</c> when <c>UrlToBeAdded</c>
 /// accepts it; otherwise does nothing.
 /// </summary>
 /// <param name="url">Candidate page to schedule for crawling.</param>
 public void AddUrl(Web.PageCrawlDetail url)
 {
     bool accepted = UrlToBeAdded(url);
     if (!accepted)
     {
         return;
     }

     UrlsToCrawl.Enqueue(url);
 }
コード例 #2
0
        /// <summary>
        /// Decides whether <paramref name="url"/> should be scheduled for crawling.
        /// </summary>
        /// <remarks>
        /// A URL qualifies when all of the following hold:
        /// its authority equals that of <c>Robots.BastUri</c>; its absolute path does not start
        /// with any entry of <c>Robots.DisallowedList</c>; and no entry with the same absolute URI
        /// is present in <c>UrlsToCrawl</c> or <c>UrlsCompleted</c>.
        /// </remarks>
        /// <param name="url">Candidate page; its <c>PageUri</c> is inspected.</param>
        /// <returns><c>true</c> when the URL passes every check; otherwise <c>false</c>.</returns>
        public bool UrlToBeAdded(Web.PageCrawlDetail url)
        {
            Uri candidate = url.PageUri;

            // Cheapest check first: a single string comparison, no enumeration.
            if (Robots.BastUri.Authority != candidate.Authority)
            {
                return false;
            }

            // URL paths are not linguistic text, so compare ordinally rather than with the
            // current culture (the parameterless StartsWith is culture-sensitive).
            // NOTE(review): assumes DisallowedList holds strings; an empty entry would match
            // every path - confirm how the robots parser treats an empty "Disallow:" line.
            string path = candidate.AbsolutePath;
            if (Robots.DisallowedList.Any(prefix => path.StartsWith(prefix, System.StringComparison.Ordinal)))
            {
                return false;
            }

            // Any() stops at the first match instead of counting every element.
            string absoluteUri = candidate.AbsoluteUri;
            if (UrlsToCrawl.Any(x => x.PageUri.AbsoluteUri == absoluteUri))
            {
                return false;
            }

            return !UrlsCompleted.Any(x => x.PageUri.AbsoluteUri == absoluteUri);
        }
コード例 #3
0
        /// <summary>
        /// Downloads the page at <paramref name="url"/>, loads its content and links into the
        /// page object, hands the discovered links to <c>CrawlList</c>, and records the page
        /// in <c>CrawlList.UrlsCompleted</c>.
        /// </summary>
        /// <param name="url">Page to fetch; mutated in place via <c>LoadContent</c> and <c>LoadUris</c>.</param>
        /// <returns>The response body as a string.</returns>
        private async Task <string> ProcessUrl(Web.PageCrawlDetail url)
        {
            Console.WriteLine("url " + url);

            string content;

            // Dispose the response once the body and status code have been read so the
            // underlying connection resources are released promptly.
            // NOTE(review): assumes client is an HttpClient (GetAsync yields a disposable response).
            using (var response = await client.GetAsync(url.PageUri.AbsoluteUri))
            {
                content = await response.Content.ReadAsStringAsync();
                url.LoadContent(content, response.StatusCode);
            }

            url.LoadUris();
            CrawlList.AddUrls(url.AllLinks);

            // The links have been handed to the crawl list; replace them with an empty list
            // so the completed page no longer references them.
            url.AllLinks = new List <Web.PageCrawlDetail>();
            CrawlList.UrlsCompleted.Add(url);
            return(content);
        }
コード例 #4
0
        /// <summary>
        /// Collects the distinct link targets of every <c>&lt;a href&gt;</c> element in
        /// <paramref name="doc"/>, resolved through <c>GetUrifromHREF</c> against
        /// <paramref name="baseUri"/>.
        /// </summary>
        /// <param name="doc">Parsed HTML document to scan.</param>
        /// <param name="baseUri">Base URI passed to <c>GetUrifromHREF</c> for each href value.</param>
        /// <returns>
        /// One <c>PageCrawlDetail</c> per distinct absolute URI, in document order; an empty
        /// list when the document has no matching anchors.
        /// </returns>
        public static List <Web.PageCrawlDetail> GetUris(HtmlAgilityPack.HtmlDocument doc, Uri baseUri)
        {
            List <Web.PageCrawlDetail> uris = new List <Web.PageCrawlDetail>();

            // SelectNodes returns null (not an empty collection) when nothing matches,
            // so a page without links must be handled before iterating.
            var links = doc.DocumentNode.SelectNodes("//a[@href]");
            if (links == null)
            {
                return(uris);
            }

            // Tracks absolute URIs already emitted; replaces a linear rescan of the
            // result list for every link.
            HashSet <string> seen = new HashSet <string>();

            foreach (HtmlAgilityPack.HtmlNode link in links)
            {
                // Get the value of the HREF attribute
                string hrefValue = link.GetAttributeValue("href", string.Empty);

                Uri uri = GetUrifromHREF(baseUri, hrefValue);
                if (uri != null && seen.Add(uri.AbsoluteUri))
                {
                    uris.Add(new Web.PageCrawlDetail(uri));
                }
            }
            return(uris);
        }
コード例 #5
0
        /// <summary>
        /// Crawls starting from <paramref name="startUrl"/>, processing pages until no
        /// download is running and <c>CrawlList</c> has nothing further to hand out.
        /// </summary>
        /// <remarks>
        /// After each completed download the method pauses for <c>SleepTime</c>, then starts
        /// new downloads until <c>maxConcurrentDownload</c> tasks are running or the crawl
        /// list is exhausted. An exception thrown by a download propagates out of this method.
        /// </remarks>
        /// <param name="startUrl">First page to download.</param>
        /// <returns><c>true</c> once the crawl loop has finished.</returns>
        public async Task <bool> Crawl(Web.PageCrawlDetail startUrl)
        {
            runningTasks.Add(ProcessUrl(startUrl));

            while (runningTasks.Any())
            {
                var completedTask = await Task.WhenAny(runningTasks);

                runningTasks.Remove(completedTask);

                // Await the finished task so an exception it captured is rethrown here
                // rather than going unobserved; the page HTML itself is not needed.
                await completedTask;

                // Throttle between batches without blocking the calling thread.
                await Task.Delay(SleepTime);

                while (CrawlList.HasNext() && runningTasks.Count < maxConcurrentDownload)
                {
                    var url = CrawlList.GetNext();
                    runningTasks.Add(ProcessUrl(url));
                }
            }

            return(true);
        }