/// <summary>
/// Scrapes up to ten queued posts whose status is <c>unscraped</c>.
/// For each queue entry the post page (<c>baseUrl + SiteId</c>) is opened in Chrome,
/// its title, body, rating and date text are read, a <c>Post</c> row is stored and the
/// queue entry is removed. Failures on a single post are logged and the loop moves on.
/// </summary>
/// <remarks>
/// Exceptions thrown while navigating to a post page are not caught here and propagate
/// to the caller; the browser is still shut down in that case.
/// </remarks>
public void Scrape()
{
    using (KopipastaContext db = new KopipastaContext())
    {
        var posts = db.PostsUncreated
            .Where(x => x.Status == PostUnscrapedStatus.unscraped)
            .Take(10)
            .ToList();

        if (posts.Count == 0)
        {
            // Nothing queued: do not start a browser just to close it again.
            return;
        }

        IWebDriver driver = new ChromeDriver(@"C:\Selenium\");
        try
        {
            foreach (var post in posts)
            {
                driver.Navigate().GoToUrl(baseUrl + post.SiteId);
                try
                {
                    log.Info("parsing post №: " + post.SiteId);

                    // Wait (up to 3 seconds) for the title element; the remaining
                    // elements are read immediately afterwards.
                    WebDriverWait wait = new WebDriverWait(driver, TimeSpan.FromSeconds(3));
                    string postTitle = wait.Until(d => d.FindElement(By.CssSelector(titleSelector)).Text);
                    string postBody = driver.FindElement(By.ClassName(bodySelector)).Text;
                    string ratingElementId = ratingBase + post.SiteId;
                    int rating = int.Parse(driver.FindElement(By.Id(ratingElementId)).Text);
                    string dateText = driver.FindElement(By.ClassName(postDate)).Text;

                    var scrapedPost = new Post()
                    {
                        Body = postBody,
                        Header = postTitle,
                        Rating = rating,
                        SiteId = post.SiteId,
                        Created = dateText
                    };

                    try
                    {
                        // Queue the insert and the queue-entry removal together so a single
                        // SaveChanges persists both. Removing after SaveChanges left the
                        // last removal unsaved, so the entry was scraped again next run.
                        // NOTE(review): if SaveChanges fails, the pending changes presumably
                        // stay tracked and are retried by later saves - confirm desired behavior.
                        db.Posts.Add(scrapedPost);
                        db.PostsUncreated.Remove(post);
                        db.SaveChanges();
                    }
                    catch (Exception ex)
                    {
                        log.Error("error saving post to db, post №: " + post.SiteId);
                        log.Error(ex.ToString());
                    }
                }
                catch (System.FormatException ex)
                {
                    log.Error("error on parsing post rating, post №: " + post.SiteId);
                    log.Error(ex.ToString());
                }
                catch (Exception ex)
                {
                    log.Error("error on scraping post №: " + post.SiteId);
                    log.Error(ex.ToString());
                }
            }
        }
        finally
        {
            // Always shut the browser down, even when navigation throws.
            driver.Quit();
        }
    }
}
/// <summary>
/// Timer callback: probes the site for posts that are not queued yet, stores every
/// post that responds as an <c>unscraped</c> queue entry, then runs the scraper.
/// </summary>
/// <param name="source">Event sender (unused).</param>
/// <param name="e">Elapsed-event data (unused).</param>
/// <remarks>
/// Probing starts right after the highest queued <c>SiteId</c> (or at 0 when the queue
/// is empty) and stops at the first 404 response or after <c>Max</c> probes.
/// NOTE(review): only the queue table is consulted for the start index; once the scraper
/// has emptied the queue, probing restarts at 0 - confirm whether the scraped-posts table
/// should be considered as well.
/// </remarks>
protected void OnTimedEvent(object source, ElapsedEventArgs e)
{
    // Disable the timer before doing any work; it is re-enabled in the finally block
    // so an unexpected exception cannot leave it switched off.
    aTimer.Enabled = false;
    try
    {
        using (KopipastaContext db = new KopipastaContext())
        {
            // The highest queued id is already in the table, so start one past it;
            // starting on it would insert a duplicate queue entry.
            int nextPostIndex = db.PostsUncreated.Any()
                ? db.PostsUncreated.Max(x => x.SiteId) + 1
                : 0;

            ServicePointManager.DefaultConnectionLimit = 40;
            ServicePointManager.SetTcpKeepAlive(false, 10, 10);

            int probeCount = 0;
            int newPostsCount = 0;
            bool reachedTop = false;

            while (!reachedTop && probeCount < Max)
            {
                probeCount++;
                string postUrl = BaseURL + nextPostIndex;

                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(postUrl);
                request.Method = "HEAD";
                request.Timeout = 10000;
                request.Headers = new WebHeaderCollection();
                request.KeepAlive = false;
                request.ConnectionGroupName = "KopipastaConnection";

                try
                {
                    // Dispose the response so the underlying connection is released.
                    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                    {
                        if (response.StatusCode == HttpStatusCode.NotFound)
                        {
                            reachedTop = true;
                        }
                        else
                        {
                            PostUnscraped newPost = new PostUnscraped
                            {
                                SiteId = nextPostIndex,
                                Status = Data.PostUnscrapedStatus.unscraped
                            };
                            db.PostsUncreated.Add(newPost);
                            newPostsCount++;
                        }
                    }
                }
                catch (WebException ex)
                {
                    // GetResponse reports HTTP error statuses (including 404) by throwing,
                    // so the end-of-posts check has to happen here.
                    using (HttpWebResponse errorResponse = ex.Response as HttpWebResponse)
                    {
                        if (errorResponse != null && errorResponse.StatusCode == HttpStatusCode.NotFound)
                        {
                            reachedTop = true;
                        }
                        else
                        {
                            LogRequestFailure(request, ex);
                        }
                    }
                }
                catch (Exception ex)
                {
                    LogRequestFailure(request, ex);
                }
                finally
                {
                    // Move on to the next id whether the probe succeeded or failed.
                    nextPostIndex++;
                }
            }

            try
            {
                db.SaveChanges();
                KopipastaLog.WriteEntry(String.Format("saved {0} new posts to database", newPostsCount));
                ConsoleApplication1.Scraper sc = new ConsoleApplication1.Scraper();
                sc.Scrape();
            }
            catch (Exception ex)
            {
                KopipastaLog.WriteEntry("Error");
                KopipastaLog.WriteEntry(ex.ToString(), EventLogEntryType.Error);
            }
        }
    }
    finally
    {
        aTimer.Enabled = true;
    }
}

/// <summary>
/// Logs a failed probe request together with the number of open connections on its
/// service point, then closes the "KopipastaConnection" connection group.
/// </summary>
private void LogRequestFailure(HttpWebRequest request, Exception ex)
{
    ServicePoint sp = request.ServicePoint;
    KopipastaLog.WriteEntry("Error");
    KopipastaLog.WriteEntry(ex.ToString(), EventLogEntryType.Error);
    string currentConnections = "current open connections: " + sp.CurrentConnections.ToString();
    KopipastaLog.WriteEntry(currentConnections, EventLogEntryType.Information);
    sp.CloseConnectionGroup("KopipastaConnection");
}