Пример #1
0
        /// <summary>
        /// Crawls pages breadth-first starting at <paramref name="RootUri"/> and
        /// collects every accepted page into a <see cref="CrawlSummary"/>.
        /// </summary>
        /// <remarks>
        /// The root page is created with depth 1 and crawled once, before the loop;
        /// an exception from that first crawl is not caught here and reaches the caller.
        /// A dequeued page is skipped when its depth exceeds <c>Config.MaxDepth</c>,
        /// when the summary already contains it, or when its absolute URI is empty.
        /// When <c>Config.MaxPagesToCrawl</c> is greater than zero and the summary has
        /// reached that count, the crawl stops.
        /// Accepted pages are added to the summary and reported through
        /// <c>OnPageCrawlCompleted</c> before their links are followed. Pages whose host
        /// differs from the root host are recorded and reported but not crawled.
        /// Exceptions raised while processing a page or one of its links are written
        /// to the console and do not abort the crawl.
        /// </remarks>
        /// <param name="RootUri">URI the crawl starts from; its host bounds which pages are crawled.</param>
        /// <returns>The summary holding the pages accepted during the crawl.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="RootUri"/> is <c>null</c>.</exception>
        public virtual async Task <CrawlSummary> CrawlAsync(Uri RootUri)
        {
            // Fail fast with a descriptive exception instead of a late NullReferenceException.
            if (RootUri == null)
            {
                throw new ArgumentNullException(nameof(RootUri));
            }

            var summary = new CrawlSummary();
            var queue   = new Queue <CrawledPage>();

            var rootPage = new CrawledPage(RootUri, 1);

            // Crawled outside the try/catch below so a failure on the root surfaces to the caller.
            await this.CrawlPageAsync(rootPage);

            queue.Enqueue(rootPage);
            var rootHost = RootUri.Host;

            while (queue.Count > 0)
            {
                var current = queue.Dequeue();

                try
                {
                    if (current.Depth > this.Config.MaxDepth)
                    {
                        continue;
                    }
                    if (this.Config.MaxPagesToCrawl > 0 && summary.PagesCount >= this.Config.MaxPagesToCrawl)
                    {
                        // Nothing further can be accepted once the limit is hit, so stop
                        // instead of dequeuing every remaining entry just to skip it.
                        break;
                    }
                    if (summary.PageExists(current))
                    {
                        continue;
                    }
                    if (string.IsNullOrEmpty(current.URI.AbsoluteUri))
                    {
                        continue;
                    }

                    summary.AddPages(current);
                    OnPageCrawlCompleted(new CrawlEventArgs
                    {
                        CrawledPage = current
                    });

                    // Off-site pages are recorded above but their links are never followed.
                    if (!string.Equals(current.URI.Host, rootHost, StringComparison.OrdinalIgnoreCase))
                    {
                        continue;
                    }

                    // The root page was already crawled before the loop; crawling it again
                    // would repeat the request, so only crawl pages discovered via links.
                    if (!object.ReferenceEquals(current, rootPage))
                    {
                        await this.CrawlPageAsync(current);
                    }

                    foreach (var link in current.Links)
                    {
                        try
                        {
                            var uri = new Uri(link);
                            queue.Enqueue(new CrawledPage(uri, current.Depth + 1));
                        }
                        catch (Exception ex)
                        {
                            // A single malformed link must not stop the remaining links.
                            Console.WriteLine($"[ERROR] Link: '{link}', EX: {ex.Message}");
                        }
                    }
                }
                catch (Exception ex)
                {
                    // Best-effort crawl: log the failing page and carry on with the queue.
                    Console.WriteLine($"[ERROR] Page: '{current.URI}', EX: {ex.Message}");
                }
            }

            return summary;
        }
Пример #2
0
 /// <summary>
 /// Reports whether <c>Pages</c> already holds an entry that
 /// <c>CrawledPageComparer</c> treats as equal to <paramref name="Page"/>.
 /// </summary>
 /// <param name="Page">The page to look for.</param>
 /// <returns><c>true</c> when a matching page is found; otherwise <c>false</c>.</returns>
 public bool PageExists(CrawledPage Page)
 {
     var comparer = new CrawledPageComparer();
     bool found = this.Pages.Contains(Page, comparer);

     return found;
 }