Пример #1
0
        /// <summary>
        /// Downloads and analyzes a fixed sample list of pollitika.com posts using the
        /// simple multithreaded scraper, prints repository statistics, and persists the store.
        /// </summary>
        public void PerformSimpleDownloadOfListOfPosts()
        {
            // FIX: removed unused local List<string> listOfPosts — it was never read or written.
            ModelRepository repo = new ModelRepository();

            // NOTE(review): data store name is hard-coded here; consider lifting to config.
            string repoName = "pollitikaNew.db";

            Logger.Info("Opening data store: " + repoName);
            repo.OpenDataStore(repoName);

            // Relative post URLs; the scraper prefixes them with http://pollitika.com.
            List <string> listToDo = new List <string>
            {
                "/milivoj-petkovic-naredio-da-se-dvije-brigade-hvo-a-povuku-iz-brckog-i-tuzle",
                "/drumovi-ce-pozeljet-turaka-a-turaka-nigdje-biti-nece",
                "/tajne-robnih-zaliha",
                "/bahatost-na-a-svagda-nja",
                "/moc-pozeljnog-razmi-ljanja-ili-wishful-thinking-in-cloud-cuckoo-land",
                "/policija-pobolj-ala-rad",
                "/le-web",
                "/cija-ce-volja-trijumfirati-u-radanju-jednih-nacija",
                "/sustavno-trovanje-gradana-dok-sustav-funkcionira",
                "/o-rusvaju-s-jmbg-om"
            };

            SimpleMultithreadedScrapper.AnalyzeFrontPage_SimpleMultithreaded(listToDo, repo);

            PrintStatistics(repo);

            // Persist everything fetched in this run.
            repo.UpdateDataStore();
        }
        /// <summary>
        /// Analyzes one batch of posts on worker tasks, skipping posts that are already in
        /// the repository. Blocks until every crawl task has completed.
        /// </summary>
        /// <param name="listOfPosts">Relative post URLs (prefixed with http://pollitika.com).</param>
        /// <param name="repo">Repository used for duplicate checks and result storage.</param>
        /// <param name="listLoggedBrowsers">Pool of already-logged-in browsers shared by the tasks.</param>
        /// <param name="isFrontPage">Passed through to MultithreadedAnalyzePost.</param>
        /// <param name="fetchCommentVotes">Passed through to MultithreadedAnalyzePost.</param>
        public static void AnalyzeListOfPosts_Multithreaded_OneBatch(List <string> listOfPosts, ModelRepository repo, List <ScrapingBrowser> listLoggedBrowsers, bool isFrontPage, bool fetchCommentVotes)
        {
            Stopwatch timer = new Stopwatch();
            ILog      log   = log4net.LogManager.GetLogger(typeof(Program));

            // BUG FIX: the stopwatch was never started, so the "BATCH DONE IN TIME" log line
            // always reported 00:00:00. (The old trailing timer.Restart() on a local variable
            // about to go out of scope was dead code and has been removed.)
            timer.Start();

            List <string> postsToProcessInBatch = new List <string>();

            // Filter out posts already stored so we do not re-crawl them.
            foreach (string s in listOfPosts)
            {
                string postUrl = "http://pollitika.com" + s;
                if (!repo.PostAlreadyExists(postUrl))
                {
                    postsToProcessInBatch.Add(s);
                }
                else
                {
                    log.WarnFormat("Post with url {0} ALREADY EXISTS IN DATABASE", s);
                }
            }

            postsToProcessInBatch.Add(null);    // null terminator tells CrawlListOfPages to stop

            // NOTE(review): the k++ in this producer lambda is only safe if CrawlListOfPages
            // invokes it from a single thread — confirm against its implementation.
            int         k         = 0;
            List <Task> listTasks = CrawlListOfPages(() => postsToProcessInBatch[k++],
                                                     (url, neki_repo, browser) => SimpleMultithreadedScrapper.MultithreadedAnalyzePost("http://pollitika.com" + url, repo, isFrontPage, fetchCommentVotes, browser),
                                                     1000,
                                                     repo,
                                                     listLoggedBrowsers);

            // Poll instead of Task.WaitAll so a faulted task does not throw from here.
            log.Debug("Starting wait for tasks to finish!");
            while (listTasks.Count(task => !task.IsCompleted) > 0)
            {
                Thread.Sleep(1000);
            }

            log.InfoFormat("BATCH DONE {0}, BATCH DONE IN TIME {1}", DateTime.Now, timer.Elapsed);
        }
        /// <summary>
        /// Analyzes a list of posts in batches of 50 on worker tasks, using a fixed pool of
        /// logged-in browsers, and persists the repository after every batch so an
        /// interruption loses at most one batch of work.
        /// </summary>
        /// <param name="listOfPosts">Relative post URLs (prefixed with http://pollitika.com).</param>
        /// <param name="repo">Repository used for duplicate checks and result storage.</param>
        /// <param name="isFrontPage">Passed through to MultithreadedAnalyzePost.</param>
        /// <param name="fetchCommentVotes">Passed through to MultithreadedAnalyzePost.</param>
        public static void AnalyzeListOfPosts_Multithreaded(List <string> listOfPosts, ModelRepository repo, bool isFrontPage, bool fetchCommentVotes)
        {
            Stopwatch timer = new Stopwatch();
            ILog      log   = log4net.LogManager.GetLogger(typeof(Program));

            List <ScrapingBrowser> listLoggedBrowsers = new List <ScrapingBrowser>();

            log.Info("Logging in browsers");
            const int MaxConcurrentBrowsers = 8;

            for (int i = 0; i < MaxConcurrentBrowsers; i++)
            {
                listLoggedBrowsers.Add(Utility.GetLoggedBrowser());
            }

            int batchInd   = 0;
            int batchSize  = 50;
            // BUG FIX: the old "Count / batchSize + 1" over-reported the total batch count by
            // one whenever the list length was an exact multiple of the batch size
            // (e.g. 100 posts -> "of 3" while only 2 batches run). Ceiling division is correct.
            int numBatches = (listOfPosts.Count + batchSize - 1) / batchSize;

            timer.Start();

            while (batchInd * batchSize < listOfPosts.Count)
            {
                log.InfoFormat("DOING BATCH {0} of {1}, date: {2}", batchInd + 1, numBatches, DateTime.Now);

                int           startInd = batchInd * batchSize;
                List <string> postsToProcessInBatch = new List <string>();

                // Filter this batch's slice down to posts not already stored.
                for (int ind = startInd; ind < startInd + batchSize && ind < listOfPosts.Count; ind++)
                {
                    string postUrl = "http://pollitika.com" + listOfPosts[ind];
                    if (!repo.PostAlreadyExists(postUrl))
                    {
                        postsToProcessInBatch.Add(listOfPosts[ind]);
                    }
                    else
                    {
                        log.WarnFormat("Post with url {0} ALREADY EXISTS IN DATABASE", listOfPosts[ind]);
                    }
                }

                postsToProcessInBatch.Add(null);    // null terminator tells CrawlListOfPages to stop

                // NOTE(review): the k++ in this producer lambda is only safe if CrawlListOfPages
                // invokes it from a single thread — confirm against its implementation.
                int         k         = 0;
                List <Task> listTasks = CrawlListOfPages(() => postsToProcessInBatch[k++],
                                                         (url, neki_repo, browser) => SimpleMultithreadedScrapper.MultithreadedAnalyzePost("http://pollitika.com" + url, repo, isFrontPage, fetchCommentVotes, browser),
                                                         1000,
                                                         repo,
                                                         listLoggedBrowsers);

                // Poll instead of Task.WaitAll so a faulted task does not throw from here.
                log.Debug("Starting wait for tasks to finish!");
                while (listTasks.Count(task => !task.IsCompleted) > 0)
                {
                    Thread.Sleep(1000);
                }

                log.Info("Updating store");

                // Persist after each batch so progress survives a crash or abort.
                repo.UpdateDataStore();

                log.InfoFormat("BATCH DONE {0}, BATCH DONE IN TIME {1}", DateTime.Now, timer.Elapsed);
                timer.Restart();

                batchInd++;
            }
        }