/// <summary>
/// Opens the "pollitikaNew.db" data store, scrapes a fixed list of ten post URLs
/// (front-page analysis via <see cref="SimpleMultithreadedScrapper"/>), prints
/// repository statistics, and persists the updated store.
/// </summary>
public void PerformSimpleDownloadOfListOfPosts()
{
    ModelRepository repo = new ModelRepository();

    // FIX: removed an unused local (List<string> listOfPosts) that was declared but never read.
    string repoName = "pollitikaNew.db";
    Logger.Info("Opening data store: " + repoName);
    repo.OpenDataStore(repoName);

    // Relative post paths; the scrapper prefixes the site host itself.
    List<string> listToDo = new List<string>
    {
        "/milivoj-petkovic-naredio-da-se-dvije-brigade-hvo-a-povuku-iz-brckog-i-tuzle",
        "/drumovi-ce-pozeljet-turaka-a-turaka-nigdje-biti-nece",
        "/tajne-robnih-zaliha",
        "/bahatost-na-a-svagda-nja",
        "/moc-pozeljnog-razmi-ljanja-ili-wishful-thinking-in-cloud-cuckoo-land",
        "/policija-pobolj-ala-rad",
        "/le-web",
        "/cija-ce-volja-trijumfirati-u-radanju-jednih-nacija",
        "/sustavno-trovanje-gradana-dok-sustav-funkcionira",
        "/o-rusvaju-s-jmbg-om"
    };

    SimpleMultithreadedScrapper.AnalyzeFrontPage_SimpleMultithreaded(listToDo, repo);
    PrintStatistics(repo);
    repo.UpdateDataStore();
}
/// <summary>
/// Scrapes a single batch of posts concurrently. Posts already present in
/// <paramref name="repo"/> are skipped (logged as warnings); the rest are fed to
/// <see cref="CrawlListOfPages"/>, and this method blocks until every spawned task completes.
/// </summary>
/// <param name="listOfPosts">Relative post paths (e.g. "/some-post"); the host prefix is added here.</param>
/// <param name="repo">Repository used for duplicate detection and as the scrape target.</param>
/// <param name="listLoggedBrowsers">Pre-authenticated browsers shared by the crawl tasks.</param>
/// <param name="isFrontPage">Passed through to the per-post analyzer.</param>
/// <param name="fetchCommentVotes">Passed through to the per-post analyzer.</param>
public static void AnalyzeListOfPosts_Multithreaded_OneBatch(List<string> listOfPosts, ModelRepository repo, List<ScrapingBrowser> listLoggedBrowsers, bool isFrontPage, bool fetchCommentVotes)
{
    // BUG FIX: the stopwatch was previously constructed but never started,
    // so the "BATCH DONE IN TIME" log always reported 00:00:00.
    Stopwatch timer = Stopwatch.StartNew();
    ILog log = log4net.LogManager.GetLogger(typeof(Program));

    // Filter out posts we already have; only new ones go into this batch.
    List<string> postsToProcessInBatch = new List<string>();
    foreach (string s in listOfPosts)
    {
        string postUrl = "http://pollitika.com" + s;
        if (repo.PostAlreadyExists(postUrl) == false)
        {
            postsToProcessInBatch.Add(s);
        }
        else
        {
            log.WarnFormat("Post with url {0} ALREADY EXISTS IN DATABASE", s);
        }
    }

    postsToProcessInBatch.Add(null); // adding terminator for CrawlListOfPages

    int k = 0;
    List<Task> listTasks = CrawlListOfPages(
        () => postsToProcessInBatch[k++],
        (url, neki_repo, browser) => SimpleMultithreadedScrapper.MultithreadedAnalyzePost("http://pollitika.com" + url, repo, isFrontPage, fetchCommentVotes, browser),
        1000, repo, listLoggedBrowsers);

    // When CrawlListOfPages returns, some tasks may still be running — poll until all finish.
    log.Debug("Starting wait for tasks to finish!");
    while (listTasks.Count(task => task.IsCompleted == false) > 0)
    {
        Thread.Sleep(1000);
    }

    log.InfoFormat("BATCH DONE {0}, BATCH DONE IN TIME {1}", DateTime.Now, timer.Elapsed);
    // FIX: removed a dead timer.Restart() — the stopwatch is a local going out of scope.
}
/// <summary>
/// Scrapes an arbitrarily long list of posts in batches of 50, logging in a fixed pool of
/// browsers up front, skipping posts already in <paramref name="repo"/>, and persisting the
/// data store after every batch. Blocks until each batch's tasks complete before starting the next.
/// </summary>
/// <param name="listOfPosts">Relative post paths (e.g. "/some-post"); the host prefix is added here.</param>
/// <param name="repo">Repository used for duplicate detection, as the scrape target, and for persistence.</param>
/// <param name="isFrontPage">Passed through to the per-post analyzer.</param>
/// <param name="fetchCommentVotes">Passed through to the per-post analyzer.</param>
public static void AnalyzeListOfPosts_Multithreaded(List<string> listOfPosts, ModelRepository repo, bool isFrontPage, bool fetchCommentVotes)
{
    Stopwatch timer = new Stopwatch();
    ILog log = log4net.LogManager.GetLogger(typeof(Program));

    // Pre-authenticate a fixed pool of browsers shared across all batches.
    List<ScrapingBrowser> listLoggedBrowsers = new List<ScrapingBrowser>();
    log.Info("Logging in browsers");
    const int MaxConcurrentBrowsers = 8;
    for (int i = 0; i < MaxConcurrentBrowsers; i++)
    {
        listLoggedBrowsers.Add(Utility.GetLoggedBrowser());
    }

    int batchInd = 0;
    int batchSize = 50;
    // BUG FIX: was "Count / batchSize + 1", which over-reported the batch total by one
    // whenever Count is an exact multiple of batchSize (e.g. 100 posts -> "of 3").
    // Ceiling division gives the true number of batches.
    int numBatches = (listOfPosts.Count + batchSize - 1) / batchSize;

    timer.Start();
    while (batchInd * batchSize < listOfPosts.Count)
    {
        log.InfoFormat("DOING BATCH {0} of {1}, date: {2}", batchInd + 1, numBatches, DateTime.Now);

        // Collect this batch's posts, skipping ones already stored.
        int startInd = batchInd * batchSize;
        List<string> postsToProcessInBatch = new List<string>();
        for (int ind = startInd; ind < startInd + batchSize && ind < listOfPosts.Count; ind++)
        {
            string postUrl = "http://pollitika.com" + listOfPosts[ind];
            if (repo.PostAlreadyExists(postUrl) == false)
            {
                postsToProcessInBatch.Add(listOfPosts[ind]);
            }
            else
            {
                log.WarnFormat("Post with url {0} ALREADY EXISTS IN DATABASE", listOfPosts[ind]);
            }
        }

        postsToProcessInBatch.Add(null); // adding terminator for CrawlListOfPages

        int k = 0;
        List<Task> listTasks = CrawlListOfPages(
            () => postsToProcessInBatch[k++],
            (url, neki_repo, browser) => SimpleMultithreadedScrapper.MultithreadedAnalyzePost("http://pollitika.com" + url, repo, isFrontPage, fetchCommentVotes, browser),
            1000, repo, listLoggedBrowsers);

        // When CrawlListOfPages returns, some tasks may still be running — poll until all finish.
        log.Debug("Starting wait for tasks to finish!");
        while (listTasks.Count(task => task.IsCompleted == false) > 0)
        {
            Thread.Sleep(1000);
        }

        // Persist after every batch so a crash loses at most one batch of work.
        log.Info("Updating store");
        repo.UpdateDataStore();

        log.InfoFormat("BATCH DONE {0}, BATCH DONE IN TIME {1}", DateTime.Now, timer.Elapsed);
        timer.Restart();
        batchInd++;
    }
}