//private readonly Dictionary<int, string> _dicUriPages = new Dictionary<int, string>();

/// <summary>
/// Creates a crawler: stores the supplied settings, opens the database context via
/// <c>CrawlerContext.Create</c> and parses the start page into a <see cref="System.Uri"/>.
/// </summary>
/// <param name="dbConStr">Connection string; stored and also passed to <c>CrawlerContext.Create</c>.</param>
/// <param name="startPage">URI string of the start page; parsed with <c>new System.Uri(string)</c>.</param>
/// <param name="hosts">Host list; when <c>null</c>, defaults to a single entry holding the start page's host.</param>
/// <param name="crawlerThreadCount">Stored in <c>_crawlerThreadCount</c> (presumably the number of plain crawler threads).</param>
/// <param name="browserCrawlerThreadCount">Stored in <c>_browserCrawlerThreadCount</c> (presumably the number of browser crawler threads).</param>
/// <param name="browserLoadWait">Stored in <c>_browserLoadWait</c>; unit is not visible here.</param>
public LightningCrawler(
    string dbConStr,
    string startPage,
    string[] hosts = null,
    int crawlerThreadCount = 20,
    int browserCrawlerThreadCount = 5,
    int browserLoadWait = 0)
{
    // Plain settings first.
    _dbConStr = dbConStr;
    _crawlerThreadCount = crawlerThreadCount;
    _browserCrawlerThreadCount = browserCrawlerThreadCount;
    _browserLoadWait = browserLoadWait;

    // The database context is created before the start page is parsed.
    _db = CrawlerContext.Create(dbConStr);

    _startPageUri = new System.Uri(startPage);

    // Without an explicit host list, use only the start page's own host.
    _hosts = hosts != null
        ? hosts
        : new string[] { _startPageUri.Host };
}
/// <summary>
/// Builds a new database <c>Uri</c> model from a framework <see cref="System.Uri"/>,
/// copying its URI components and stamping <c>CreateAt</c> with the current UTC time.
/// </summary>
/// <param name="uri">The parsed URI whose components are copied.</param>
/// <returns>A freshly created model; only the fields assigned below are set here.</returns>
private static Uri NewUriDbModel(System.Uri uri)
{
    var model = new Uri();

    model.AbsoluteUri = uri.AbsoluteUri; //.TruncateMax(MAX_URI_LEN)
    model.Host = uri.Host;
    model.AbsolutePath = uri.AbsolutePath;
    model.Fragment = uri.Fragment;
    model.Query = uri.Query;
    model.Scheme = uri.Scheme;
    model.OriginalString = uri.OriginalString;
    model.CreateAt = DateTime.UtcNow;

    return model;
}
/// <summary>
/// Persists a batch of crawl results in two phases:
/// (1) merges the crawled pages, plus every redirect target and link they reference, into the
/// database keyed by <c>AbsoluteUri</c>; (2) merges the link and redirect relations between them.
/// Between the phases, failed crawls are removed from the planning dictionaries and the in-memory
/// URI-to-id mapping is extended with the merged pages.
/// </summary>
/// <param name="list">Crawl results to save; each <c>AbsoluteUri</c> is expected to occur once.</param>
private void SaveCrawlResults(List<CrawlResult> list)
{
    // Nothing to persist: skip the database round trips entirely.
    if (list.Count == 0)
    {
        return;
    }

    //--------------------------------save pages--------------------------------------
    // Debugging aid: the batch should not contain the same AbsoluteUri twice.
    if (list.Count != list.Select(o => o.AbsoluteUri).Distinct().Count())
    {
        Debugger.Break();
    }

    //crawled pages
    var pages = list.Select(o =>
    {
        var uri = NewUriDbModel(new System.Uri(o.AbsoluteUri));
        uri.FailedAt = o.FailedAt;
        uri.FailedException = o.FailException;
        uri.StatusCode = o.StatusCode;
        uri.StatusCodeString = o.StatusCodeStr;
        uri.TimeTaken = o.TimeTaken;
        uri.CrawledAt = o.CrawledAt;
        uri.ContentLength = o.ContentLength;
        uri.Content = o.Content;
        uri.Canonical = o.Canonical;
        uri.BrowserCrawledAt = o.BrowserCrawledAt;
        uri.BrowserContent = o.BrowserContent;
        uri.BrowserFailedAt = o.BrowserFailedAt;
        uri.BrowserFailedException = o.BrowserFailedException;
        //todo:db new field here
        return uri;
    }).ToList();

    // AbsoluteUri values already present in `pages`. The set always receives the model's own
    // AbsoluteUri (not the raw input string), so a membership test gives the same answer as a
    // linear scan over `pages` would, without the quadratic cost.
    var knownUris = new HashSet<string>(pages.Select(p => p.AbsoluteUri), StringComparer.Ordinal);

    //their links and redirects
    foreach (var crawlResult in list)
    {
        if (crawlResult.LocationAbsoluteUri != null && !knownUris.Contains(crawlResult.LocationAbsoluteUri))
        {
            pages.Add(NewUriDbModel(new System.Uri(crawlResult.LocationAbsoluteUri)));
            knownUris.Add(pages[pages.Count - 1].AbsoluteUri);
        }

        if (crawlResult.LinkAbsoluteUris == null)
        {
            continue;
        }

        foreach (var crawledLink in crawlResult.LinkAbsoluteUris)
        {
            if (knownUris.Contains(crawledLink.AbsoluteUri))
            {
                continue;
            }

            try
            {
                var uri = new System.Uri(crawledLink.AbsoluteUri);
                pages.Add(NewUriDbModel(uri));
            }
            catch (UriFormatException)
            {
                // The link text is not a parsable URI: store it through the string-based
                // NewUriDbModel overload (NOTE(review): that overload is defined outside this view).
                pages.Add(NewUriDbModel(crawledLink.AbsoluteUri));
            }

            knownUris.Add(pages[pages.Count - 1].AbsoluteUri);
        }
    }

    // Debugging aid: after adding links and redirects the page list should still be unique.
    if (pages.Count != pages.Select(o => o.AbsoluteUri).Distinct().Count())
    {
        Debugger.Break();
    }

    //save
    _db.BulkMerge(pages, options =>
    {
        options.ColumnPrimaryKeyExpression = o => o.AbsoluteUri;
        options.IgnoreOnMergeUpdateExpression = o => new
        {
            o.AbsoluteUri,
            o.AbsolutePath,
            o.Host,
            o.Scheme,
            o.Fragment,
            o.Query,
            o.CreateAt,
            o.OriginalString,
        };
        //The CoalesceOnMergeUpdateExpression allows you to not update any column if the specified value is null and its database
        //value is not null when BulkMerge method is executed.
        options.CoalesceOnMergeUpdateExpression = o => new
        {
            o.FailedAt,
            o.FailedException,
            o.StatusCode,
            o.StatusCodeString,
            o.TimeTaken,
            o.CrawledAt,
            o.ContentLength,
            o.Content,
            o.Canonical,
            o.BrowserCrawledAt,
            o.BrowserContent,
            o.BrowserFailedAt,
            o.BrowserFailedException,
            //todo:db new field here
        };
    });

    foreach (var crawlResult in list)
    {
        //if crawl fails, remove from dicPlan so that it can be added again
        if (crawlResult.FailException != null)
        {
            _dicPlanned.Remove(crawlResult.AbsoluteUri);
        }

        //if browser crawl fails, remove from dicPlannedBrowser so that it can be added again
        if (crawlResult.BrowserFailedException != null)
        {
            _dicPlannedBrowser.Remove(crawlResult.AbsoluteUri);
        }
    }

    //update uri ids in memory
    // NOTE(review): relies on page.Id being filled in by the BulkMerge call above - confirm.
    foreach (var page in pages)
    {
        if (!_dicUriIdMapping.ContainsKey(page.AbsoluteUri))
        {
            _dicUriIdMapping.Add(page.AbsoluteUri, page.Id);
        }
    }

    //--------------------------------save relations--------------------------------------
    var relations = new List<Relation>();
    var redirectRelations = new List<RedirectRelation>();

    foreach (var crawlResult in list)
    {
        if (crawlResult.LocationAbsoluteUri != null)
        {
            // A redirect takes precedence: links of a redirecting result are not stored as relations.
            redirectRelations.Add(new RedirectRelation()
            {
                SourceId = _dicUriIdMapping[crawlResult.AbsoluteUri],
                DestinationId = _dicUriIdMapping[crawlResult.LocationAbsoluteUri],
                CreatedAt = DateTime.UtcNow,
            });
        }
        else if (crawlResult.LinkAbsoluteUris != null)
        {
            foreach (var crawledLink in crawlResult.LinkAbsoluteUris)
            {
                relations.Add(new Relation()
                {
                    ParentId = _dicUriIdMapping[crawlResult.AbsoluteUri],
                    ChildId = _dicUriIdMapping[crawledLink.AbsoluteUri],
                    CreatedAt = DateTime.UtcNow,
                    IsBrowserRequired = crawledLink.IsBrowserRequired,
                });
            }
        }
    }

    if (relations.Count > 0)
    {
        _db.BulkMerge(relations, options =>
        {
            options.IgnoreOnMergeUpdateExpression = o => new
            {
                o.CreatedAt,
                o.IsBrowserRequired,//if a static relation already exists, ignore browser relations
            };
            //options.CoalesceOnMergeUpdateExpression = o => o.IsBrowserRequired;
        });
    }

    if (redirectRelations.Count > 0)
    {
        _db.BulkMerge(redirectRelations, options =>
        {
            options.IgnoreOnMergeUpdateExpression = o => o.CreatedAt;
        });
    }

    //Console.WriteLine($"Saved {list.Count} pages");
}