Example #1
0
        /// <summary>
        /// Scrapes up to ten posts that are still marked <c>unscraped</c>: opens each post page in
        /// Chrome, reads its title, body, rating and date, stores the result in <c>Posts</c> and
        /// removes the matching entry from <c>PostsUncreated</c>.
        /// </summary>
        /// <remarks>
        /// A failure on a single post is logged and the loop continues with the next post.
        /// The browser and the database context are released even if an unexpected exception escapes.
        /// </remarks>
        public void Scrape()
        {
            using (KopipastaContext db = new KopipastaContext())
            {
                var posts = db.PostsUncreated.Where(x => x.Status == PostUnscrapedStatus.unscraped).Take(10).ToList();
                if (posts.Count == 0)
                {
                    // Nothing to do: avoid starting a browser for an empty batch.
                    return;
                }

                IWebDriver driver = new ChromeDriver(@"C:\Selenium\");
                try
                {
                    WebDriverWait wait = new WebDriverWait(driver, TimeSpan.FromSeconds(3));

                    foreach (var post in posts)
                    {
                        try
                        {
                            // Navigation is inside the try so that one unreachable page
                            // does not abort the whole batch.
                            driver.Navigate().GoToUrl(baseUrl + post.SiteId);

                            log.Info("parsing post №: " + post.SiteId);

                            // Wait for the title to appear; the remaining elements are read right after it.
                            string postTitle = wait.Until((d) => { return d.FindElement(By.CssSelector(titleSelector)).Text; });
                            string postBody = driver.FindElement(By.ClassName(bodySelector)).Text;

                            string postRating = ratingBase + post.SiteId;
                            int rating = int.Parse(driver.FindElement(By.Id(postRating)).Text);

                            string dateString = driver.FindElement(By.ClassName(postDate)).Text;

                            var scrapedPost = new Post() { Body = postBody, Header = postTitle, Rating = rating, SiteId = post.SiteId, Created = dateString };

                            try
                            {
                                // Queue the insert and the removal before saving so a single SaveChanges
                                // persists both. A removal queued after the save stays pending until some
                                // later save, and is lost if the context is discarded first.
                                db.Posts.Add(scrapedPost);
                                db.PostsUncreated.Remove(post);
                                db.SaveChanges();
                            }
                            catch (Exception ex)
                            {
                                // NOTE(review): presumably the failed entries stay tracked by the context and
                                // are retried by the next SaveChanges — confirm this is acceptable.
                                log.Error("error saving post to db, post №: " + post.SiteId);
                                log.Error(ex.ToString());
                            }
                        }
                        catch (System.FormatException ex)
                        {
                            log.Error("error on parsing post rating, post №: " + post.SiteId);
                            log.Error(ex.ToString());
                        }
                        catch (Exception ex)
                        {
                            log.Error("error on scraping post №: " + post.SiteId);
                            log.Error(ex.ToString());
                        }
                    }
                }
                finally
                {
                    // Always shut the browser down, otherwise Chrome processes pile up.
                    driver.Quit();
                }
            }
        }
Example #2
0
        /// <summary>
        /// Timer callback: probes the site with HEAD requests for posts that are not queued yet,
        /// adds every post that exists to <c>PostsUncreated</c>, saves the batch and then runs the scraper.
        /// </summary>
        /// <param name="source">Event sender (unused).</param>
        /// <param name="e">Elapsed-event data (unused).</param>
        /// <remarks>
        /// Probing stops at the first 404 response or after <c>Max</c> requests, whichever comes first.
        /// The timer is paused for the duration of the run and is always re-enabled afterwards.
        /// </remarks>
        protected void OnTimedEvent(object source, ElapsedEventArgs e)
        {
            // Pause the timer first so a slow run can never overlap with the next tick.
            aTimer.Enabled = false;
            try
            {
                using (KopipastaContext db = new KopipastaContext())
                {
                    // Start right after the highest id already queued; starting at the highest id
                    // itself would queue that post a second time.
                    // NOTE(review): when PostsUncreated is empty probing restarts at 0 — confirm
                    // whether the highest SiteId in Posts should be taken into account as well.
                    int nextSiteId = db.PostsUncreated.Any()
                        ? db.PostsUncreated.Max(x => x.SiteId) + 1
                        : 0;

                    ServicePointManager.DefaultConnectionLimit = 40;
                    ServicePointManager.SetTcpKeepAlive(false, 10, 10);

                    bool reachedTop = false;
                    int probeCount = 0;     // requests sent in this run, capped by Max
                    int newPostsCount = 0;  // posts actually queued in this run

                    while (!reachedTop && probeCount < Max)
                    {
                        probeCount++;

                        var postUrl = BaseURL + nextSiteId;

                        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(postUrl);
                        request.Method = "HEAD";
                        request.Timeout = 10000;
                        request.Headers = new WebHeaderCollection();
                        request.KeepAlive = false;
                        request.ConnectionGroupName = "KopipastaConnection";

                        try
                        {
                            // Dispose the response so its connection is released back to the service point.
                            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                            {
                                if (response.StatusCode == HttpStatusCode.NotFound)
                                {
                                    reachedTop = true;
                                }
                                else
                                {
                                    PostUnscraped newPost = new PostUnscraped { SiteId = nextSiteId, Status = Data.PostUnscrapedStatus.unscraped };
                                    db.PostsUncreated.Add(newPost);
                                    newPostsCount++;
                                }
                            }
                        }
                        catch (Exception ex)
                        {
                            // GetResponse reports HTTP error statuses such as 404 by throwing a
                            // WebException that carries the response, so "not found" is detected here.
                            WebException webException = ex as WebException;
                            using (HttpWebResponse errorResponse = webException == null ? null : webException.Response as HttpWebResponse)
                            {
                                if (errorResponse != null && errorResponse.StatusCode == HttpStatusCode.NotFound)
                                {
                                    reachedTop = true;
                                }
                                else
                                {
                                    ServicePoint sp = request.ServicePoint;
                                    KopipastaLog.WriteEntry("Error");
                                    KopipastaLog.WriteEntry(ex.ToString(), EventLogEntryType.Error);
                                    string currentConnections = "current open connections: " + sp.CurrentConnections.ToString();
                                    KopipastaLog.WriteEntry(currentConnections, EventLogEntryType.Information);
                                    sp.CloseConnectionGroup("KopipastaConnection");
                                }
                            }
                        }

                        // Move on to the next id whether the probe succeeded or failed.
                        nextSiteId++;
                    }

                    try
                    {
                        db.SaveChanges();
                        KopipastaLog.WriteEntry(String.Format("saved {0} new posts to database", newPostsCount));
                        ConsoleApplication1.Scraper sc = new ConsoleApplication1.Scraper();
                        sc.Scrape();
                    }
                    catch (Exception ex)
                    {
                        KopipastaLog.WriteEntry("Error");
                        KopipastaLog.WriteEntry(ex.ToString());
                    }
                }
            }
            finally
            {
                // Re-enable even when something unexpected escaped, otherwise the service stops polling for good.
                aTimer.Enabled = true;
            }
        }