Example #1
0
        /// <summary>
        /// Entry point of the crawler.
        /// Sets up logging, optionally loads HTTP proxies, configures the MongoDB
        /// wrapper and its indexes, runs every bootstrap crawl (categories and the
        /// search-term lists), and finally re-queues the distinct app urls found
        /// in past collections.
        /// </summary>
        /// <param name="args">
        /// Command line arguments. When exactly one argument is supplied it is
        /// treated as the path of a text file holding the proxies to load (one
        /// entry per line); with any other argument count the crawler runs
        /// without proxies.
        /// </param>
        /// <remarks>
        /// The process exits with code -100 when the proxies file does not exist
        /// and with code -101 when reading or loading the proxies fails.
        /// </remarks>
        static void Main(string[] args)
        {
            // Setting Up Log
            LogSetup.InitializeLog("PlayStoreCrawler.log", "info");
            _logger = LogManager.GetCurrentClassLogger();

            // Control Variable (Bool - Should the process use proxies? )
            bool isUsingProxies = false;

            // Checking for the need to use HTTP proxies or not
            if (args != null && args.Length == 1)
            {
                _logger.Info("Loading Proxies from File");

                // Setting flag to true
                isUsingProxies = true;

                // Loading proxies from .txt received as argument
                String fPath = args[0];

                // Sanity Check
                if (!File.Exists(fPath))
                {
                    _logger.Fatal("Couldnt find proxies on path : " + fPath);
                    System.Environment.Exit(-100);
                }

                try
                {
                    // Reading Proxies from File. Kept inside the try block so that an
                    // I/O failure (file locked, removed after the check above, access
                    // denied) is logged and mapped to an exit code instead of
                    // surfacing as an unhandled exception.
                    string[] fLines = File.ReadAllLines(fPath, Encoding.GetEncoding("UTF-8"));

                    // Actual Load of Proxies
                    ProxiesLoader.Load(fLines.ToList());
                }
                catch (Exception ex)
                {
                    _logger.Fatal(ex);
                    System.Environment.Exit(-101);
                }
            }

            // Configuring MongoDB Wrapper
            _logger.Info("Setting up MongoDB Collections and Indexes");
            _mongoDB = new MongoDBWrapper();
            string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);

            _mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

            // Ensuring the database collections have the proper indexes
            _mongoDB.EnsureIndex("Url");
            _mongoDB.EnsureIndex("IsBusy", Consts.QUEUED_APPS_COLLECTION);
            _mongoDB.EnsureIndex("Url", Consts.QUEUED_APPS_COLLECTION);

            // Main Flow
            _logger.Info("Started Bootstrapping Steps");

            // Scrapping "Play Store Categories"
            foreach (var categoriesKVP in BootstrapTerms.categoriesAndNames)
            {
                CrawlCategory(categoriesKVP.Key, categoriesKVP.Value, isUsingProxies);
            }

            // Queueing Apps that start with each of the characters from "A" to "Z"
            foreach (var character in BootstrapTerms.charactersSearchTerms)
            {
                CrawlStore(character, isUsingProxies);
            }

            // ... Keep Adding characters / search terms in order to increase the crawler's reach
            // APP CATEGORIES
            foreach (var category in BootstrapTerms.categoriesSearchTerms)
            {
                CrawlStore(category, isUsingProxies);
            }

            // Extra "Random" Search terms to increase even more the crawler's reach
            foreach (var miscTerm in BootstrapTerms.miscSearchTerms)
            {
                CrawlStore(miscTerm, isUsingProxies);
            }

            // Country Names as Search terms to increase even more the crawler's reach
            foreach (var countryName in BootstrapTerms.countryNames)
            {
                CrawlStore(countryName, isUsingProxies);
            }

            _logger.Info("\n\nBootstrapping Apps of Past Collections");

            // Iterating over past collections, accumulating the distinct app urls
            HashSet<String> appUrls = new HashSet<String>();

            foreach (string collection in _mongoDB.GetHistoryOfCollections())
            {
                _logger.Info("Reading Collection [{0}]", collection);

                // The set ignores urls it already holds, so no explicit
                // Contains check is needed before adding.
                appUrls.UnionWith(_mongoDB.FindAllFromCollectionAs<AppModel>(collection).Select(t => t.Url));

                // Running total across all the collections read so far
                _logger.Info("\t=> Distinct Apps Found {0}", appUrls.Count);
            }

            // Adding Distinct Apps to the collection of apps to be processed
            int appsQueued = 0;

            foreach (string appUrl in appUrls)
            {
                _mongoDB.AddToQueue(appUrl);

                // Pre-increment so the logged figure is the number of apps actually
                // queued and the message fires on exact multiples of 10000.
                if (++appsQueued % 10000 == 0)
                {
                    _logger.Info("[Progress] Apps Queued From Past Collections [{0}]", appsQueued);
                }
            }
        }