/// <summary>
/// Loads the pages that still need a browser crawl: pages that already have a
/// <c>CrawledAt</c> value and status code 200 but no <c>BrowserCrawledAt</c> value.
/// </summary>
/// <param name="hosts">Host names a page must match to be selected.</param>
/// <returns>
/// One <see cref="CrawlPlan"/> per matching page (only <c>AbsoluteUri</c> is populated),
/// ordered by the page's database id.
/// </returns>
private List<CrawlPlan> GetPagesToBrowserCrawl(string[] hosts)
{
    // A fresh context is created for every call, so it must be released here.
    // The query is fully materialized by ToList() before the context is disposed.
    using (var db = CrawlerContext.Create(_dbConStr))
    {
        Console.WriteLine("loading un-browser-crawled pages...");

        return db.Uri
            .Where(o => o.CrawledAt.HasValue
                        && !o.BrowserCrawledAt.HasValue
                        && o.StatusCode == 200
                        && (o.Scheme == "http" || o.Scheme == "https")
                        && hosts.Contains(o.Host)
                        && o.Fragment == ""
                        // URIs with a query string are included (the Query filter is disabled)
                        && !o.AbsoluteUri.Contains("c!")
                        && !o.AbsoluteUri.Contains("i!")
                        && !o.AbsoluteUri.Contains("a!")
                        && !o.AbsoluteUri.Contains("p!")
                        // skip image resources
                        && !o.AbsoluteUri.EndsWith(".png")
                        && !o.AbsoluteUri.EndsWith(".gif")
                        && !o.AbsoluteUri.EndsWith(".jpg"))
            .OrderBy(o => o.Id)
            .Select(o => new CrawlPlan() { AbsoluteUri = o.AbsoluteUri })
            .ToList();
    }
}
//private readonly Dictionary<int, string> _dicUriPages = new Dictionary<int, string>();

/// <summary>
/// Initializes the crawler: stores the settings, opens a database context for the
/// given connection string and parses the start page URI.
/// </summary>
/// <param name="dbConStr">Connection string passed to <c>CrawlerContext.Create</c>.</param>
/// <param name="startPage">Absolute URI of the start page; parsed with <see cref="System.Uri"/>.</param>
/// <param name="hosts">Host names kept for the crawl; when null, only the start page's host is used.</param>
/// <param name="crawlerThreadCount">Stored as-is; presumably the number of static crawler threads (confirm against its usage).</param>
/// <param name="browserCrawlerThreadCount">Stored as-is; presumably the number of browser crawler threads (confirm against its usage).</param>
/// <param name="browserLoadWait">Stored as-is; unit and meaning are not visible here.</param>
public LightningCrawler(
    string dbConStr,
    string startPage,
    string[] hosts = null,
    int crawlerThreadCount = 20,
    int browserCrawlerThreadCount = 5,
    int browserLoadWait = 0)
{
    // Plain settings
    _browserLoadWait = browserLoadWait;
    _browserCrawlerThreadCount = browserCrawlerThreadCount;
    _crawlerThreadCount = crawlerThreadCount;
    _dbConStr = dbConStr;

    // The context is created before the start page is parsed (same order as before)
    _db = CrawlerContext.Create(dbConStr);

    var parsedStartPage = new System.Uri(startPage);
    _startPageUri = parsedStartPage;

    // Default to the start page's own host when no host list was supplied
    if (hosts == null)
    {
        hosts = new string[] { parsedStartPage.Host };
    }

    _hosts = hosts;
}
//private void SaveCrawlResult(CrawlResult crawlResult) //{ // var stopWatch=new Stopwatch(); // stopWatch.Start(); // var page = _db.Uri.FirstOrDefault(o => o.AbsoluteUri == crawlResult.AbsoluteUri); // if (page == null) Debugger.Break(); // if (page.FailedAt.HasValue) // { // page.FailedAt = crawlResult.FailedAt; // page.FailedException = crawlResult.FailException; // _db.SaveChanges(); // return; // } // if (crawlResult.StatusCode != 200) // { // if (crawlResult.LocationAbsoluteUri != null) // { // var destinationPage = _db.Uri.FirstOrDefault(o => o.AbsoluteUri == crawlResult.LocationAbsoluteUri); // if (destinationPage == null) // { // //Console.WriteLine($"\tadding destination page to db..."); // destinationPage = NewUriDbModel(new System.Uri(crawlResult.LocationAbsoluteUri)); // _db.Uri.Add(destinationPage); // _db.SaveChanges(); // } // else // { // //Console.WriteLine($"\tdestination page already exists in db..."); // } // //add redirect relations // var redirectRelation = _db.RedirectRelation.FirstOrDefault(o => o.SourceId == page.Id && o.DestinationId == destinationPage.Id); // if (redirectRelation == null) // { // //Console.WriteLine($"\tsaving redirect relation to db..."); // redirectRelation = new RedirectRelation() // { // SourceId = page.Id, // DestinationId = destinationPage.Id, // CreatedAt = DateTime.UtcNow, // }; // _db.RedirectRelation.Add(redirectRelation); // _db.SaveChanges(); // } // else // { // //Console.WriteLine($"\tredirect relation already exists in db..."); // } // } // } // if (crawlResult.Doc != null) // { // //find links // var links = crawlResult.Doc.DocumentNode.SelectNodes("//a[@href]"); // //Console.WriteLine($"\tfound {links.Count} child links."); // var linksToSave = new List<System.Uri>(); // foreach (var link in links) // { // var href = link.Attributes["href"].Value; // //if(href.Contains("fortnite-stats/")) Debugger.Break(); // if (href == "" || href.StartsWith("javascript:")) // continue; // var decoded = 
HttpUtility.HtmlDecode(href); // var childUri = Util.GetUriObjectFromUriString(decoded, page.AbsoluteUri); // //no duplicated links // if (linksToSave.All(o => o.AbsoluteUri != childUri.AbsoluteUri)) // linksToSave.Add(childUri); // } // Console.WriteLine($"html links parsed {stopWatch.ElapsedMilliseconds}"); // //save links & relations to db // if (linksToSave.Count > 0) // { // //Console.WriteLine($"\tthere are {linksToSave.Count} unique links"); // //Console.WriteLine($"\tchecking db for existence..."); // var lstUri = linksToSave.Select(o => o.AbsoluteUri).ToList(); // var pages = _db.Uri.Where(o => lstUri.Contains(o.AbsoluteUri)).ToList(); // //Console.WriteLine($"\t{pages.Count} of them are already in db"); // Console.WriteLine($"uri pages fetched {stopWatch.ElapsedMilliseconds}"); // //save pages // //Console.WriteLine($"\tsaving pages..."); // foreach (var linkToSave in linksToSave) // { // var childPage = pages.FirstOrDefault(p => p.AbsoluteUri == linkToSave.AbsoluteUri); // if (childPage == null) // { // childPage = NewUriDbModel(linkToSave); // _db.Uri.Add(childPage); // } // } // _db.SaveChanges(); // Console.WriteLine($"uri pages saved {stopWatch.ElapsedMilliseconds}"); // //save relations // //Console.WriteLine($"\trefetching {lstUri.Count} pages from db..."); // pages = _db.Uri.Where(o => lstUri.Contains(o.AbsoluteUri)).ToList(); // //Console.WriteLine($"\tchecking existing relations..."); // var relations = _db.Relation.Where(o => o.ParentId == page.Id).ToList(); // //Console.WriteLine($"\t{relations.Count} of them are already in db"); // //Console.WriteLine($"\tsaving relations..."); // Console.WriteLine($"relations fetched {stopWatch.ElapsedMilliseconds}"); // foreach (var linkToSave in linksToSave) // { // var childPage = pages.FirstOrDefault(p => p.AbsoluteUri == linkToSave.AbsoluteUri); // var relation = relations.FirstOrDefault(o => o.ChildId == childPage.Id); // if (relation == null) // { // relation = new Relation() // { // ParentId = 
page.Id, // ChildId = childPage.Id, // CreatedAt = DateTime.UtcNow, // }; // _db.Relation.Add(relation); // } // } // _db.SaveChanges(); // Console.WriteLine($"relations saved {stopWatch.ElapsedMilliseconds}"); // } // //find canonical // var canonicalLinks = crawlResult.Doc.DocumentNode.SelectNodes("//link[@rel='canonical']"); // if (canonicalLinks != null && canonicalLinks.Count > 0) // { // var canonicalLinkValue = canonicalLinks[0].Attributes["href"].Value; // //Console.WriteLine($"\tfound canonical"); // page.Canonical = canonicalLinkValue; // Console.WriteLine($"canonical extracted {stopWatch.ElapsedMilliseconds}"); // } // page.ContentLength = crawlResult.Doc.ParsedText.Length; // } // page.CrawledAt = crawlResult.CrawledAt; // page.StatusCode = crawlResult.StatusCode; // page.StatusCodeString = crawlResult.StatusCodeStr; // page.TimeTaken = crawlResult.TimeTaken; // _db.SaveChanges(); // Console.WriteLine($"everything saved {stopWatch.ElapsedMilliseconds}"); //} //private void CrawlPages(CrawlerContext db, string host) //{ // //var pagesToCrawl = GetPagesToCrawl(db, _hosts); // var pagesToCrawl = new List<Uri>(); // Console.WriteLine($"Found {pagesToCrawl.Count} pages to crawl."); // Console.WriteLine(); // var web = new HtmlWeb(); // web.CaptureRedirect = true; // web.PostResponse = (request, response) => HtmlWeb_PostResponse(request, response); // for (var i = 0; i < pagesToCrawl.Count; i++) // { // var page = pagesToCrawl[i]; // Console.WriteLine($"{i + 1}/{pagesToCrawl.Count} {page.AbsoluteUri}"); // var stopWatch = new Stopwatch(); // stopWatch.Start(); // HtmlDocument doc; // try // { // doc = web.Load(page.AbsoluteUri); // } // catch (Exception e) // { // Console.WriteLine(e); // page.FailedAt = DateTime.UtcNow; // page.FailedException = e.ToString(); // db.SaveChanges(); // Console.WriteLine(); // continue; // } // stopWatch.Stop(); // var statusCode = (int)web.StatusCode; // var statusCodeString = web.StatusCode.ToString(); // var timeTaken = 
stopWatch.Elapsed.TotalSeconds; // //not 200 OK // if (web.StatusCode != HttpStatusCode.OK) // { // Console.WriteLine($"\tstatus code = {statusCode}"); // //3xx redirect // if (statusCode / 100 == 3) // { // //Console.WriteLine($"\tfound redirect {statusCode} {_redirectLocation}"); // var locationUri = GetUriObjectFromUriString(_redirectLocation, page.AbsoluteUri); // Console.WriteLine($"\tfound new location {locationUri.AbsoluteUri}"); // //var linkKey = locationUri.AbsoluteUri.TruncateMax(MAX_URI_LEN); // //add destination to db // var destinationPage = db.Uri.FirstOrDefault(o => o.AbsoluteUri == locationUri.AbsoluteUri); // if (destinationPage == null) // { // Console.WriteLine($"\tadding destination page to db..."); // destinationPage = NewUriDbModel(locationUri); // db.Uri.Add(destinationPage); // db.SaveChanges(); // } // else // Console.WriteLine($"\tdestination page already exists in db..."); // //add redirect relations // var redirectRelation = db.RedirectRelation.FirstOrDefault(o => // o.SourceId == page.Id && o.DestinationId == destinationPage.Id); // if (redirectRelation == null) // { // Console.WriteLine($"\tsaving redirect relation to db..."); // redirectRelation = new RedirectRelation() // { // SourceId = page.Id, // DestinationId = destinationPage.Id, // CreatedAt = DateTime.UtcNow, // }; // db.RedirectRelation.Add(redirectRelation); // db.SaveChanges(); // } // else // Console.WriteLine($"\tredirect relation already exists in db..."); // } // //save page info // page.CrawledAt = DateTime.UtcNow; // page.StatusCodeString = statusCodeString; // page.StatusCode = statusCode; // page.TimeTaken = (decimal?)timeTaken; // db.SaveChanges(); // Console.WriteLine(); // continue; // } // //not a document // if (doc.ParsedText == null) // { // page.CrawledAt = DateTime.UtcNow; // page.StatusCodeString = statusCodeString; // page.StatusCode = statusCode; // page.TimeTaken = (decimal?)timeTaken; // db.SaveChanges(); // Console.WriteLine(); // continue; // } // 
var links = doc.DocumentNode.SelectNodes("//a[@href]"); // Console.WriteLine($"\tfound {links.Count} child links."); // var linksToSave = new List<System.Uri>(); // foreach (var link in links) // { // var href = link.Attributes["href"].Value; // if (href == "" || href.StartsWith("javascript:")) // continue; // var decoded = HttpUtility.HtmlDecode(href); // //if (decoded!= href) // // Debugger.Break(); // var childUri = GetUriObjectFromUriString(decoded, page.AbsoluteUri); // //Console.WriteLine($"\t{href}\r\n\t{childUri.AbsoluteUri}"); // //Console.WriteLine(); // //no duplicated links // if (linksToSave.All(o => o.AbsoluteUri != childUri.AbsoluteUri)) // linksToSave.Add(childUri); // } // //save links & relations to db // if (linksToSave.Count > 0) // { // //var uriKeyStrings = linksToSave.Select(o => o.AbsoluteUri.TruncateMax(MAX_URI_LEN)).Distinct().ToList(); // Console.WriteLine($"\tthere are {linksToSave.Count} unique links" // //+$", {uriKeyStrings.Count} unique uri keys" // ); // Console.WriteLine($"\tchecking db for existence..."); // var lstUri = linksToSave.Select(o => o.AbsoluteUri).ToList(); // var pages = db.Uri.Where(o => lstUri.Contains(o.AbsoluteUri)).ToList(); // //var childRelation = db.Relation.Where(o => o.ParentId == page.Id).ToList(); // Console.WriteLine($"\t{pages.Count} of them are already in db"); // //save pages // Console.WriteLine($"\tsaving pages..."); // //var keysAdded = new List<string>(); // foreach (var linkToSave in linksToSave) // { // //var uriKey = linkToSave.AbsoluteUri.TruncateMax(MAX_URI_LEN).ToLower(); // ////not add uris that are different but with same unique keys // //if (keysAdded.Contains(uriKey)) // // continue; // var childPage = pages.FirstOrDefault(p => p.AbsoluteUri == linkToSave.AbsoluteUri); // if (childPage == null) // { // childPage = NewUriDbModel(linkToSave); // //if (linkToSave.AbsoluteUri.Length > MAX_URI_LEN) // // childPage.FullAbsoluteUri = linkToSave.AbsoluteUri; // db.Uri.Add(childPage); // 
//keysAdded.Add(uriKey); // } // } // db.SaveChanges(); // //save relations // Console.WriteLine($"\trefetching {lstUri.Count} pages from db..."); // pages = db.Uri.Where(o => lstUri.Contains(o.AbsoluteUri)).ToList(); // Console.WriteLine($"\tchecking existing relations..."); // var relations = db.Relation.Where(o => o.ParentId == page.Id).ToList(); // Console.WriteLine($"\t{relations.Count} of them are already in db"); // Console.WriteLine($"\tsaving relations..."); // foreach (var linkToSave in linksToSave) // { // //var uriKey = linkToSave.AbsoluteUri.TruncateMax(MAX_URI_LEN); // var childPage = pages.FirstOrDefault(p => p.AbsoluteUri == linkToSave.AbsoluteUri); // var relation = relations.FirstOrDefault(o => o.ChildId == childPage.Id); // if (relation == null) // { // relation = new Relation() // { // ParentId = page.Id, // ChildId = childPage.Id, // CreatedAt = DateTime.UtcNow, // }; // db.Relation.Add(relation); // } // } // db.SaveChanges(); // } // //find canonical // var canonicalLinks = doc.DocumentNode.SelectNodes("//link[@rel='canonical']"); // if (canonicalLinks != null && canonicalLinks.Count > 0) // { // var canonicalLinkValue = canonicalLinks[0].Attributes["href"].Value; // Console.WriteLine($"\tfound canonical" // //+$": {canonicalLinkValue}" // ); // page.Canonical = canonicalLinkValue; // } // //save page info // var contentLength = doc.ParsedText.Length; // var content = doc.ParsedText; // page.CrawledAt = DateTime.UtcNow; // //page.Content = content; // page.ContentLength = contentLength; // page.StatusCodeString = statusCodeString; // page.StatusCode = statusCode; // page.TimeTaken = (decimal?)timeTaken; // db.SaveChanges(); // Console.WriteLine(); // } //}

/// <summary>
/// Loads the pages that have not been crawled yet (no <c>CrawledAt</c> value).
/// </summary>
/// <param name="hosts">Host names a page must match to be selected.</param>
/// <returns>
/// One <see cref="CrawlPlan"/> per matching page (only <c>AbsoluteUri</c> is populated),
/// ordered by the page's database id.
/// </returns>
private List<CrawlPlan> GetPagesToCrawl(string[] hosts)
{
    // A fresh context is created for every call, so it must be released here.
    // The query is fully materialized by ToList() before the context is disposed.
    using (var db = CrawlerContext.Create(_dbConStr))
    {
        Console.WriteLine("loading uncrawled pages...");

        // Filters on the query string and on the "c!" / "i!" / "a!" / "p!" URI markers
        // are intentionally not applied here.
        // To re-crawl pages that answered with a 5xx status instead, replace the first
        // condition with:
        //   o.CrawledAt.HasValue && o.StatusCode.ToString().StartsWith("5")
        return db.Uri
            .Where(o => !o.CrawledAt.HasValue
                        && (o.Scheme == "http" || o.Scheme == "https")
                        && hosts.Contains(o.Host)
                        && o.Fragment == ""
                        // skip image resources
                        && !o.AbsoluteUri.EndsWith(".png")
                        && !o.AbsoluteUri.EndsWith(".gif")
                        && !o.AbsoluteUri.EndsWith(".jpg"))
            .OrderBy(o => o.Id)
            .Select(o => new CrawlPlan() { AbsoluteUri = o.AbsoluteUri })
            .ToList();
    }
}
//const int MAX_URI_LEN = 450;

/// <summary>
/// Program entry point. When the first argument is <c>indexer</c>, the stored page
/// contents are pushed to Elasticsearch and the program exits; otherwise a
/// <see cref="LightningCrawler"/> is created for the configured start page and run.
/// </summary>
/// <param name="args">Command-line arguments; only <c>args[0] == "indexer"</c> is inspected.</param>
static void Main(string[] args)
{
    ThreadPool.SetMinThreads(1000, 1000);

    IConfiguration configuration = new ConfigurationBuilder()
        .AddJsonFile("appsettings.json", optional: true, reloadOnChange: true)
        .Build();

    if (args.Length > 0 && args[0] == "indexer")
    {
        IndexCrawledPages(configuration);
        return;
    }

    const string startPage = "https://www.domain.com/";

    var crawler = new LightningCrawler(
        configuration.GetConnectionString("CrawlerDatabase"),
        startPage,
        new string[]
        {
            "www.domain.com",
            "subdomain1.domain.com",
            "subdomain2.domain.com",
        },
        20,
        2,
        0);
    crawler.Run();
}

/// <summary>
/// Reads every page with stored content from the crawler database, extracts its text
/// and bulk-indexes the resulting documents into the Elasticsearch index <c>uri</c>.
/// </summary>
/// <param name="configuration">
/// Configuration providing the <c>ESConnection</c> section (Host, Username, Password)
/// and the <c>CrawlerDatabase</c> connection string.
/// </param>
private static void IndexCrawledPages(IConfiguration configuration)
{
    var esConfig = configuration.GetSection("ESConnection");
    var settings = new ConnectionSettings(new System.Uri(esConfig["Host"]))
        .DefaultIndex("uri")
        .BasicAuthentication(esConfig["Username"], esConfig["Password"]);
    var elasticClient = new ElasticClient(settings);

    List<UriDocument> uriDocuments;

    // The context is only needed to load the documents; release it right after.
    using (var db = CrawlerContext.Create(configuration.GetConnectionString("CrawlerDatabase")))
    {
        Console.WriteLine($"Fetching db data...");
        uriDocuments = db.Uri
            .Where(o => o.Content != null)
            .OrderBy(o => o.Id)
            .Select(o => new UriDocument()
            {
                AbsoluteUri = o.AbsoluteUri,
                // prefer the browser-rendered content when available
                BrowserHtml = o.BrowserContent ?? o.Content,
                Id = o.Id,
                OriginalUriString = o.OriginalString,
            })
            .ToList();
    }

    Console.WriteLine($"Parsing html and generating doc text...");
    var htmlDoc = new HtmlDocument();
    foreach (var doc in uriDocuments)
    {
        htmlDoc.LoadHtml(doc.BrowserHtml);

        // Content without an <html> element (fragments, non-HTML payloads) yields no
        // match; fall back to the document root instead of dereferencing null.
        var textRoot = htmlDoc.DocumentNode.SelectSingleNode("//html") ?? htmlDoc.DocumentNode;
        doc.BrowserText = textRoot.InnerText.Trim();
    }

    Console.WriteLine($"Saving to ES...");
    elasticClient.BulkAll(uriDocuments, b => b
            .Index("uri")
            // how long to wait between retries
            .BackOffTime("30s")
            // how many retries are attempted if a failure occurs
            .BackOffRetries(2)
            // refresh the index once the bulk operation completes
            .RefreshOnCompleted()
            // how many concurrent bulk requests to make
            .MaxDegreeOfParallelism(Environment.ProcessorCount)
            // number of items per bulk request
            .Size(1000)
            .DroppedDocumentCallback((item, uri) =>
            {
                // if a document cannot be indexed this delegate is called
                Console.WriteLine($"Unable to index: {item} {uri}");
            })
        )
        // Perform the indexing, waiting up to 15 minutes.
        // Whilst the BulkAll calls are asynchronous this is a blocking operation
        .Wait(TimeSpan.FromMinutes(15), next =>
        {
            // report the batch size and a per-result breakdown for each response
            Console.WriteLine($"ES Bulked Items: {next.Items.Count}");

            // string.Join (unlike Aggregate without a seed) does not throw on an empty batch
            var resultSummary = string.Join(
                " ",
                next.Items.GroupBy(o => o.Result).Select(o => o.Key + ":" + o.Count()));
            Console.WriteLine(resultSummary);
        });
}