예제 #1
0
        //private readonly Dictionary<int, string> _dicUriPages = new Dictionary<int, string>();

        /// <summary>
        /// Initializes the crawler: remembers its settings, creates the database context and
        /// parses the start page.
        /// </summary>
        /// <param name="dbConStr">Connection string; stored and also passed to <c>CrawlerContext.Create</c>.</param>
        /// <param name="startPage">URI string of the start page; parsed with <see cref="System.Uri"/>.</param>
        /// <param name="hosts">Hosts stored on the crawler. When <c>null</c>, a single-element array holding the start page's host is used.</param>
        /// <param name="crawlerThreadCount">Value stored in <c>_crawlerThreadCount</c>.</param>
        /// <param name="browserCrawlerThreadCount">Value stored in <c>_browserCrawlerThreadCount</c>.</param>
        /// <param name="browserLoadWait">Value stored in <c>_browserLoadWait</c> (unit not visible from this constructor).</param>
        public LightningCrawler(string dbConStr, string startPage, string[] hosts = null, int crawlerThreadCount = 20, int browserCrawlerThreadCount = 5, int browserLoadWait = 0)
        {
            // Plain settings first.
            _dbConStr = dbConStr;
            _crawlerThreadCount = crawlerThreadCount;
            _browserCrawlerThreadCount = browserCrawlerThreadCount;
            _browserLoadWait = browserLoadWait;

            // The database context is created before the start page is parsed.
            _db = CrawlerContext.Create(dbConStr);
            _startPageUri = new System.Uri(startPage);

            // Without an explicit host list, fall back to the start page's own host.
            if (hosts == null)
            {
                _hosts = new string[] { _startPageUri.Host };
            }
            else
            {
                _hosts = hosts;
            }
        }
예제 #2
0
        /// <summary>
        /// Creates a new <see cref="Uri"/> database model from a parsed <see cref="System.Uri"/>.
        /// </summary>
        /// <param name="uri">Parsed URI whose components are copied onto the model.</param>
        /// <returns>A model carrying the URI components, with <c>CreateAt</c> set to the current UTC time.</returns>
        private static Uri NewUriDbModel(System.Uri uri)
        {
            var model = new Uri();

            // URI components are copied as-is (the absolute URI is not truncated).
            model.AbsoluteUri    = uri.AbsoluteUri;
            model.Host           = uri.Host;
            model.AbsolutePath   = uri.AbsolutePath;
            model.Fragment       = uri.Fragment;
            model.Query          = uri.Query;
            model.Scheme         = uri.Scheme;
            model.OriginalString = uri.OriginalString;

            // Creation timestamp, in UTC.
            model.CreateAt = DateTime.UtcNow;

            return model;
        }
예제 #3
0
        /// <summary>
        /// Persists a batch of crawl results.
        /// <para>
        /// Step 1: builds one URI row per crawled page, plus a bare row for every redirect target and
        /// outgoing link that is not already in the batch, and bulk-merges them keyed on <c>AbsoluteUri</c>.
        /// </para>
        /// <para>
        /// Step 2: removes failed crawls from the planned sets so they can be scheduled again, and
        /// records the ids of the merged rows in <c>_dicUriIdMapping</c>.
        /// </para>
        /// <para>
        /// Step 3: bulk-merges the link relations and redirect relations between those URIs.
        /// </para>
        /// </summary>
        /// <param name="list">
        /// Crawl results to save. Each result's <c>AbsoluteUri</c> is expected to be unique within the batch;
        /// a violation only triggers a debugger break when a debugger is attached.
        /// </param>
        private void SaveCrawlResults(List <CrawlResult> list)
        {
            //--------------------------------save pages--------------------------------------

            // Sanity check for developers: only evaluated (and only breaks) under a debugger, so a
            // production run never pops a "attach debugger" prompt or pays for the Distinct() pass.
            if (Debugger.IsAttached && list.Count != list.Select(o => o.AbsoluteUri).Distinct().Count())
            {
                Debugger.Break();
            }

            //crawled pages
            var pages = list.Select(o =>
            {
                var uri = NewUriDbModel(new System.Uri(o.AbsoluteUri));

                uri.FailedAt        = o.FailedAt;
                uri.FailedException = o.FailException;

                uri.StatusCode       = o.StatusCode;
                uri.StatusCodeString = o.StatusCodeStr;
                uri.TimeTaken        = o.TimeTaken;
                uri.CrawledAt        = o.CrawledAt;

                uri.ContentLength = o.ContentLength;
                uri.Content       = o.Content;
                uri.Canonical     = o.Canonical;

                uri.BrowserCrawledAt       = o.BrowserCrawledAt;
                uri.BrowserContent         = o.BrowserContent;
                uri.BrowserFailedAt        = o.BrowserFailedAt;
                uri.BrowserFailedException = o.BrowserFailedException;
                //todo:db new field here

                return(uri);
            }).ToList();

            // Keys (stored AbsoluteUri values) of every row already in `pages`. Replaces the former
            // pages.All(...) scan, which walked the whole list for every single link.
            var knownUris = new HashSet<string>(pages.Select(p => p.AbsoluteUri));

            //their links and redirects
            foreach (var crawlResult in list)
            {
                var redirectTarget = crawlResult.LocationAbsoluteUri;

                if (redirectTarget != null && !knownUris.Contains(redirectTarget))
                {
                    var redirectPage = NewUriDbModel(new System.Uri(redirectTarget));

                    // Dedupe on the key that is actually stored, so the merge never receives two
                    // rows with the same AbsoluteUri.
                    if (knownUris.Add(redirectPage.AbsoluteUri))
                    {
                        pages.Add(redirectPage);
                    }
                }

                if (crawlResult.LinkAbsoluteUris == null)
                {
                    continue;
                }

                foreach (var crawledLink in crawlResult.LinkAbsoluteUris)
                {
                    if (knownUris.Contains(crawledLink.AbsoluteUri))
                    {
                        continue;
                    }

                    Uri linkPage;

                    try
                    {
                        linkPage = NewUriDbModel(new System.Uri(crawledLink.AbsoluteUri));
                    }
                    catch (UriFormatException)
                    {
                        // NOTE(review): this fallback passes the raw string, so it relies on a
                        // NewUriDbModel overload that is not visible in this part of the file - confirm.
                        linkPage = NewUriDbModel(crawledLink.AbsoluteUri);
                    }

                    if (knownUris.Add(linkPage.AbsoluteUri))
                    {
                        pages.Add(linkPage);
                    }
                }
            }

            // Same developer-only sanity check, now over the full set of rows to merge.
            if (Debugger.IsAttached && pages.Count != knownUris.Count)
            {
                Debugger.Break();
            }

            //save
            _db.BulkMerge(pages, options =>
            {
                options.ColumnPrimaryKeyExpression = o => o.AbsoluteUri;

                // Identity columns are written on insert only; an existing row keeps its values.
                options.IgnoreOnMergeUpdateExpression = o => new
                {
                    o.AbsoluteUri,
                    o.AbsolutePath,
                    o.Host,
                    o.Scheme,
                    o.Fragment,
                    o.Query,
                    o.CreateAt,
                    o.OriginalString,
                };

                //The CoalesceOnMergeUpdateExpression allows you to not update any column if the specified value is null and its database
                //value is not null when BulkMerge method is executed.
                options.CoalesceOnMergeUpdateExpression = o => new
                {
                    o.FailedAt,
                    o.FailedException,
                    o.StatusCode,
                    o.StatusCodeString,
                    o.TimeTaken,
                    o.CrawledAt,
                    o.ContentLength,
                    o.Content,
                    o.Canonical,
                    o.BrowserCrawledAt,
                    o.BrowserContent,
                    o.BrowserFailedAt,
                    o.BrowserFailedException,
                    //todo:db new field here
                };
            });


            foreach (var crawlResult in list)
            {
                //if crawl fails, remove from dicPlan so that it can be added again
                if (crawlResult.FailException != null)
                {
                    _dicPlanned.Remove(crawlResult.AbsoluteUri);
                }

                //if browser crawl fails, remove from dicPlannedBrowser so that it can be added again
                if (crawlResult.BrowserFailedException != null)
                {
                    _dicPlannedBrowser.Remove(crawlResult.AbsoluteUri);
                }
            }


            //update uri ids in memory (page.Id is read after the merge above)
            foreach (var page in pages)
            {
                if (!_dicUriIdMapping.ContainsKey(page.AbsoluteUri))
                {
                    _dicUriIdMapping.Add(page.AbsoluteUri, page.Id);
                }
            }


            //--------------------------------save relations--------------------------------------
            var relations         = new List <Relation>();
            var redirectRelations = new List <RedirectRelation>();

            // NOTE(review): the lookups below use the raw URI strings from the crawl results and
            // therefore assume they equal the stored AbsoluteUri keys - confirm against the crawler.
            foreach (var crawlResult in list)
            {
                if (crawlResult.LocationAbsoluteUri != null)
                {
                    // A redirecting page contributes a redirect relation only; its links are not related.
                    redirectRelations.Add(new RedirectRelation()
                    {
                        SourceId      = _dicUriIdMapping[crawlResult.AbsoluteUri],
                        DestinationId = _dicUriIdMapping[crawlResult.LocationAbsoluteUri],
                        CreatedAt     = DateTime.UtcNow,
                    });
                }
                else if (crawlResult.LinkAbsoluteUris != null)
                {
                    foreach (var crawledLink in crawlResult.LinkAbsoluteUris)
                    {
                        relations.Add(new Relation()
                        {
                            ParentId          = _dicUriIdMapping[crawlResult.AbsoluteUri],
                            ChildId           = _dicUriIdMapping[crawledLink.AbsoluteUri],
                            CreatedAt         = DateTime.UtcNow,
                            IsBrowserRequired = crawledLink.IsBrowserRequired,
                        });
                    }
                }
            }

            if (relations.Count > 0)
            {
                _db.BulkMerge(relations, options =>
                {
                    options.IgnoreOnMergeUpdateExpression = o => new
                    {
                        o.CreatedAt,
                        o.IsBrowserRequired,//if a static relation already exists, ignore browser relations
                    };
                    //options.CoalesceOnMergeUpdateExpression = o => o.IsBrowserRequired;
                });
            }
            if (redirectRelations.Count > 0)
            {
                _db.BulkMerge(redirectRelations, options =>
                {
                    options.IgnoreOnMergeUpdateExpression = o => o.CreatedAt;
                });
            }

            //Console.WriteLine($"Saved {list.Count} pages");
        }