/// <summary>
/// Picks the next seed-list row to crawl: among rows whose seedList flag is true,
/// the one that sorts first by lastCrawled in ascending order.
/// </summary>
/// <returns>The first matching <c>Url</c> row.</returns>
/// <remarks>
/// <c>First()</c> throws when no row has seedList == true.
/// NOTE(review): where rows with a null lastCrawled land in the ordering is decided
/// by the underlying store — confirm never-crawled seeds are picked first.
/// </remarks>
private Url selectNextUrl()
{
    using (var db = new CrawlerDbContext())
    {
        return db.Urls
            .Where(candidate => candidate.seedList == true)
            .OrderBy(candidate => candidate.lastCrawled)
            .First();
    }
}
/// <summary>
/// Fetches the content for <paramref name="url"/> via webRequest, runs findMatches
/// over it with the class-level pattern, stores every match as child rows, and
/// finally sets lastCrawled on the row(s) whose Id equals <paramref name="url"/>.Id.
/// </summary>
/// <param name="url">
/// The row being crawled. Its Id is used as the ParentId of every inserted child
/// and as the key for the lastCrawled update.
/// </param>
/// <remarks>
/// Each match is inserted once per value of the loop counter from 0 through
/// CONFIG_CRAWL_DEPTH inclusive; the counter is handed to formatUrl.
/// NOTE(review): what formatUrl does with that counter is not visible here.
/// Any exception raised while handling a match is written to the console and the
/// remaining iterations for that match are skipped; the next match is still processed.
/// </remarks>
private void processUrls(Url url)
{
    string pageHtml = webRequest(url);
    var foundLinks = findMatches(pageHtml, pattern);

    foreach (var rawLink in foundLinks)
    {
        try
        {
            // Drop any double quotes wrapped around the raw match.
            string cleaned = rawLink.Trim('"');

            for (int depth = 0; depth <= CONFIG_CRAWL_DEPTH; depth++)
            {
                // Intermediate record; its values are copied into the entity that is saved.
                var draft = new Url()
                {
                    ParentId = url.Id,
                    Id = Guid.NewGuid(),
                    url = formatUrl(cleaned, depth),
                    lastCrawled = null,
                    seedList = null
                };

                // A fresh context per insert, saved immediately.
                using (var db = new CrawlerDbContext())
                {
                    var child = new Url()
                    {
                        ParentId = draft.ParentId,
                        Id = draft.Id,
                        url = draft.url.ToString(),
                        lastCrawled = null,
                        seedList = null
                    };

                    db.Urls.Add(child);
                    db.SaveChanges();

                    Console.WriteLine(string.Format(
                        "--> {0}|{1}|{2}|{3}",
                        child.ParentId.ToString(),
                        child.Id.ToString(),
                        child.url,
                        child.lastCrawled));
                }
            }
        }
        catch (Exception e)
        {
            Console.WriteLine("ERROR: " + e.Message);
        }
    }

    // Update lastCrawled on the row that was just processed.
    using (var db = new CrawlerDbContext())
    {
        var crawledRows = db.Urls.Where(stored => stored.Id == url.Id);

        foreach (var crawled in crawledRows)
        {
            crawled.lastCrawled = DateTime.Now;
        }

        db.SaveChanges();
    }
}
/// <summary>
/// Writes every entry returned by getSeedList() to the Urls table as a seed row
/// (seedList = true, lastCrawled = null).
/// </summary>
/// <remarks>
/// All rows inserted by one call share a single generated ParentId; each row gets
/// its own Id. Every row is added and saved through its own context.
/// </remarks>
private void insertSeedListIntoDatabase()
{
    // Generated once so that every seed from this call carries the same ParentId.
    Guid sharedParentId = Guid.NewGuid();

    foreach (var seedUrl in getSeedList())
    {
        using (var db = new CrawlerDbContext())
        {
            db.Urls.Add(new Url()
            {
                ParentId = sharedParentId,
                Id = Guid.NewGuid(),
                url = seedUrl,
                lastCrawled = null,
                seedList = true
            });
            db.SaveChanges();
        }
    }
}
/// <summary>
/// Walks the entries returned by getSeedList(): each one is stored as a new row,
/// echoed to the console, and then passed straight to processUrls.
/// </summary>
/// <remarks>
/// Every row gets a freshly generated ParentId and Id, and is saved with
/// lastCrawled and seedList both null.
/// </remarks>
public void crawlSeedListOnly()
{
    foreach (var seedUrl in getSeedList())
    {
        var entry = new Url()
        {
            ParentId = Guid.NewGuid(),
            Id = Guid.NewGuid(),
            url = seedUrl,
            lastCrawled = null,
            seedList = null
        };

        Console.WriteLine(string.Format(
            "{0}|{1}|{2}|{3}",
            entry.ParentId,
            entry.Id,
            entry.url,
            entry.lastCrawled));

        // Persist the row before crawling it.
        using (var db = new CrawlerDbContext())
        {
            db.Urls.Add(entry);
            db.SaveChanges();
        }

        processUrls(entry);
    }
}