Example #1
0
 /// <summary>
 /// Returns the first seed-list row (<c>seedList == true</c>) when the rows are
 /// ordered ascending by <c>lastCrawled</c>.
 /// </summary>
 /// <returns>The row at the head of that ordering.</returns>
 /// <remarks>
 /// <c>First()</c> throws <c>InvalidOperationException</c> when no seed-list row exists.
 /// NOTE(review): where rows with a null <c>lastCrawled</c> land in the ordering depends on
 /// the database provider - presumably first, so never-crawled seeds are picked before others; confirm.
 /// </remarks>
 private Url selectNextUrl()
 {
     using (var db = new CrawlerDbContext())
     {
         return db.Urls
                  .Where(candidate => candidate.seedList == true)
                  .OrderBy(candidate => candidate.lastCrawled)
                  .First();
     }
 }
Example #2
0
        /// <summary>
        /// Passes <paramref name="url"/> to <c>webRequest</c>, runs <c>findMatches</c> over the result, stores a
        /// child <c>Url</c> row for every match at every depth from 0 through <c>CONFIG_CRAWL_DEPTH</c>, and
        /// finally stamps <c>lastCrawled</c> on the row(s) whose <c>Id</c> equals <c>url.Id</c>.
        /// </summary>
        /// <param name="url">Row being processed; its <c>Id</c> becomes the <c>ParentId</c> of every stored child.</param>
        /// <remarks>
        /// A failure while storing a match is written to the console and does not stop the remaining
        /// matches, but it does skip the remaining depths of the match that failed. Exceptions thrown by
        /// <c>webRequest</c>, <c>findMatches</c>, or the final <c>lastCrawled</c> update are not caught here.
        /// </remarks>
        private void processUrls(Url url)
        {
            string html        = webRequest(url);
            var    matchesList = findMatches(html, pattern);

            foreach (var match in matchesList)
            {
                try
                {
                    // Strip any double quotes wrapped around the match.
                    string link = match.Trim('"');

                    for (int depth = 0; depth <= CONFIG_CRAWL_DEPTH; depth++)
                    {
                        // Build the row once. ToString() is a no-op on a string but makes a null result
                        // from formatUrl fail here (and get logged below) instead of being stored.
                        var child = new Url()
                        {
                            ParentId = url.Id, Id = Guid.NewGuid(), url = formatUrl(link, depth).ToString(), lastCrawled = null, seedList = null
                        };

                        // A fresh context per insert keeps one failed row from lingering in the change
                        // tracker and breaking the saves that follow it.
                        using (var db = new CrawlerDbContext())
                        {
                            db.Urls.Add(child);
                            db.SaveChanges();
                            Console.WriteLine("--> " + child.ParentId.ToString() + "|" + child.Id.ToString() + "|" + child.url + "|" + child.lastCrawled);
                        }
                    }
                }
                catch (Exception e)
                {
                    // Report the innermost exception: wrapper exceptions raised while saving usually
                    // carry only a generic "see the inner exception" message.
                    Console.WriteLine("ERROR: " + e.GetBaseException().Message);
                }
            }

            // Update lastCrawled
            using (var db = new CrawlerDbContext())
            {
                var crawledRows = db.Urls.Where(row => row.Id == url.Id);

                foreach (var row in crawledRows)
                {
                    // NOTE(review): local time is stored here; confirm whether UTC is expected.
                    row.lastCrawled = DateTime.Now;
                }
                db.SaveChanges();
            }
        }
Example #3
0
        /// <summary>
        /// Stores every entry returned by <c>getSeedList()</c> as a <c>Url</c> row flagged
        /// <c>seedList = true</c>.
        /// </summary>
        /// <remarks>
        /// All rows created by one call share a single freshly generated <c>ParentId</c>; each row gets its
        /// own <c>Id</c> and a null <c>lastCrawled</c>. The rows are added through one context and written
        /// with a single <c>SaveChanges()</c> call rather than one context and one save per seed, so a
        /// failure no longer leaves a partially inserted seed list behind.
        /// </remarks>
        private void insertSeedListIntoDatabase()
        {
            var newSeed = Guid.NewGuid();

            using (var db = new CrawlerDbContext())
            {
                var anyQueued = false;

                foreach (var seed in getSeedList())
                {
                    db.Urls.Add(new Url()
                    {
                        ParentId = newSeed, Id = Guid.NewGuid(), url = seed, lastCrawled = null, seedList = true
                    });
                    anyQueued = true;
                }

                // Skip the save entirely when there is nothing to write.
                if (anyQueued)
                {
                    db.SaveChanges();
                }
            }
        }
Example #4
0
 /// <summary>
 /// For each entry returned by <c>getSeedList()</c>: builds a <c>Url</c> row, prints it to the
 /// console, saves it, and hands it to <c>processUrls</c>.
 /// </summary>
 /// <remarks>
 /// Every row gets its own new <c>ParentId</c> and <c>Id</c>; <c>lastCrawled</c> and
 /// <c>seedList</c> are both left null.
 /// </remarks>
 public void crawlSeedListOnly()
 {
     foreach (var seedAddress in getSeedList())
     {
         var entry = new Url()
         {
             ParentId    = Guid.NewGuid(),
             Id          = Guid.NewGuid(),
             url         = seedAddress,
             lastCrawled = null,
             seedList    = null
         };

         // Print the row before it is saved.
         Console.WriteLine(entry.ParentId + "|" + entry.Id + "|" + entry.url + "|" + entry.lastCrawled);

         // Save the row before processUrls runs, since processUrls later looks it up by Id.
         using (var context = new CrawlerDbContext())
         {
             context.Urls.Add(entry);
             context.SaveChanges();
         }

         processUrls(entry);
     }
 }