/// <summary>
        /// Entry point of the worker piece of the process
        /// Notice that you can run as many workers as you want to in order to make the crawling faster
        /// </summary>
        /// <param name="args"></param>
        static void Main(string[] args)
        {
            // Configuring Log Object
            LogSetup.InitializeLog ("PlayStoreWorker.log", "info");
            Logger logger = LogManager.GetCurrentClassLogger ();
            logger.Info ("Worker Started");

            // Control Variable (Bool - Should the process use proxies? )
            bool isUsingProxies = false;

            // Checking for the need to use proxies
            if (args != null && args.Length == 1)
            {
                // Setting flag to true
                isUsingProxies = true;

                // Loading proxies from .txt received as argument
                String fPath = args[0];

                // Sanity Check
                if (!File.Exists (fPath))
                {
                    logger.Fatal ("Couldnt find proxies on path : " + fPath);
                    System.Environment.Exit (-100);
                }

                // Reading Proxies from File
                string[] fLines = File.ReadAllLines (fPath, Encoding.GetEncoding ("UTF-8"));

                try
                {
                    // Actual Load of Proxies
                    ProxiesLoader.Load (fLines.ToList ());
                }
                catch (Exception ex)
                {
                    logger.Fatal (ex);
                    System.Environment.Exit (-101);
                }
            }

            // Parser
            PlayStoreParser parser = new PlayStoreParser();

            // Configuring MongoDB Wrapper
            MongoDBWrapper mongoDB   = new MongoDBWrapper();
            string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
            mongoDB.ConfigureDatabase (Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

            // Creating Instance of Web Requests Server
            WebRequests server = new WebRequests ();

            // Queued App Model
            QueuedApp app;

            // Retry Counter (Used for exponential wait increasing logic)
            int retryCounter = 0;

            // Iterating Over MongoDB Records while no document is found to be processed
            while ((app = mongoDB.FindAndModify ()) != null)
            {
                try
                {
                    // Building APP URL
                    string appUrl = app.Url;

                    // Sanity check of app page url
                    if (app.Url.IndexOf ("http", StringComparison.OrdinalIgnoreCase) < 0)
                    {
                        appUrl = Consts.APP_URL_PREFIX + app.Url;
                    }

                    // Checking if this app is on the database already
                    if (mongoDB.AppProcessed (appUrl))
                    {
                        // Console Feedback, Comment this line to disable if you want to
                        logger.Info ("Duplicated App, skipped.");

                        // Delete it from the queue and continues the loop
                        mongoDB.RemoveFromQueue (app.Url);
                        continue;
                    }

                    // Configuring server and Issuing Request
                    server.Headers.Add (Consts.ACCEPT_LANGUAGE);
                    server.Host              = Consts.HOST;
                    server.UserAgent         = Consts.GITHUBURL;
                    server.Encoding          = "utf-8";
                    server.EncodingDetection = WebRequests.CharsetDetection.DefaultCharset;

                    // Checking for the need to use "HTTP Proxies"
                    if (isUsingProxies)
                    {
                        server.Proxy = ProxiesLoader.GetWebProxy ();
                    }

                    // Issuing HTTP Request
                    string response          = server.Get (appUrl);

                    // Flag Indicating Success while processing and parsing this app
                    bool ProcessingWorked = true;

                    // Sanity Check
                    if (String.IsNullOrEmpty (response) || server.StatusCode != System.Net.HttpStatusCode.OK)
                    {
                        logger.Info ("Error opening app page : " + appUrl);
                        ProcessingWorked = false;

                        // Renewing WebRequest Object to get rid of Cookies
                        server = new WebRequests ();

                        // Fallback time variable
                        double waitTime;

                        // Checking which "Waiting Logic" to use - If there are proxies being used, there's no need to wait too much
                        // If there are no proxies in use, on the other hand, the process must wait more time
                        if (isUsingProxies)
                        {
                            // Waits two seconds everytime
                            waitTime = TimeSpan.FromSeconds (2).TotalMilliseconds;
                        }
                        else
                        {
                            // Increments retry counter
                            retryCounter++;

                            // Checking for maximum retry count
                            if (retryCounter >= 8)
                            {
                                waitTime = TimeSpan.FromMinutes (20).TotalMilliseconds;
                            }
                            else
                            {
                                // Calculating next wait time ( 2 ^ retryCounter seconds)
                                waitTime = TimeSpan.FromSeconds (Math.Pow (2, retryCounter)).TotalMilliseconds;
                            }
                        }

                        // Hiccup to avoid google blocking connections in case of heavy traffic from the same IP
                        logger.Info ("======================================================");
                        logger.Info ("\n\tFallback : " + waitTime + " Seconds");
                        Thread.Sleep (Convert.ToInt32 (waitTime));

                        // If The Status code is "ZERO" (it means 404) - App must be removed from "Queue"
                        if (server.StatusCode == 0)
                        {
                            // Console Feedback
                            logger.Info ("\tApp Not Found (404) - " + app.Url);

                            mongoDB.RemoveFromQueue (app.Url);
                        }
                        logger.Info ("======================================================");
                    }
                    else
                    {
                        // Reseting retry counter
                        retryCounter = 0;

                        // Parsing Useful App Data
                        AppModel parsedApp = parser.ParseAppPage (response, appUrl);

                        List<String> relatedApps = new List<String> ();

                        // Avoiding Exceptions caused by "No Related Apps" situations - Must be treated differently
                        try
                        {

                            // Parsing "Related Apps" and "More From Developer" Apps (URLS Only)
                            foreach (string extraAppUrl in parser.ParseExtraApps (response))
                            {
                                relatedApps.Add (Consts.APP_URL_PREFIX + extraAppUrl);
                            }

                            // Adding "Related Apps" to Apps Model
                            parsedApp.RelatedUrls = relatedApps.Distinct ().ToArray ();
                        }
                        catch
                        {
                            logger.Info ("\tNo Related Apps Found. Skipping");
                        }

                        // Inserting App into Mongo DB Database
                        if (!mongoDB.Insert<AppModel>(parsedApp))
                        {
                            ProcessingWorked = false;
                        }

                        // If the processing failed, do not remove the app from the database, instead, keep it and flag it as not busy
                        // so that other workers can try to process it later
                        if (!ProcessingWorked)
                        {
                            mongoDB.ToggleBusyApp(app, false);
                        }
                        else // On the other hand, if processing worked, removes it from the database
                        {
                            // Console Feedback, Comment this line to disable if you want to
                            Console.ForegroundColor = ConsoleColor.Red;
                            logger.Info ("Inserted App : " + parsedApp.Name);
                            Console.ForegroundColor = ConsoleColor.White;

                            mongoDB.RemoveFromQueue(app.Url);
                        }

                        // Counters for console feedback only
                        int extraAppsCounter = 0, newExtraApps = 0;

                        // Parsing "Related Apps" and "More From Developer" Apps (URLS Only)
                        foreach (string extraAppUrl in relatedApps)
                        {
                            // Incrementing counter of extra apps
                            extraAppsCounter++;

                            // Assembling Full app Url to check with database
                            string fullExtraAppUrl = Consts.APP_URL_PREFIX + extraAppUrl;

                            // Checking if the app was either processed or queued to be processed already
                            if ((!mongoDB.AppProcessed (fullExtraAppUrl)) && (!mongoDB.IsAppOnQueue(extraAppUrl)))
                            {
                                // Incrementing counter of inserted apps
                                newExtraApps++;

                                // Adds it to the queue of apps to be processed
                                mongoDB.AddToQueue (extraAppUrl);
                            }
                        }

                        // Console Feedback
                        logger.Info ("Queued " + newExtraApps + " / " + extraAppsCounter + " related apps");
                    }
                }
                catch (Exception ex)
                {
                    logger.Error (ex);
                }
                finally
                {
                    try
                    {
                        // Toggles Busy status back to false
                        mongoDB.ToggleBusyApp (app, false);
                    }
                    catch (Exception ex)
                    {
                        // Toggle Busy App may raise an exception in case of lack of internet connection, so, i must use this
                        // "inner catch" to avoid it from happenning
                        logger.Error (ex);
                    }
                }
            }
        }
        /// <summary>
        /// Executes a Search using the searchField as the search parameter,
        /// paginates / scrolls the search results to the end adding all the url of apps
        /// it finds to a AWS SQS queue
        /// </summary>
        /// <param name="searchField"></param>
        private static void CrawlStore(string searchField)
        {
            // Console Feedback
            Console.WriteLine("Crawling Search Term : [ " + searchField + " ]");

            // Compiling Regular Expression used to parse the "pagToken" out of the Play Store
            Regex pagTokenRegex = new Regex(@"GAEi+.+\:S\:.{11}\\42", RegexOptions.Compiled);

            // HTML Response
            string response;

            // MongoDB Helper
            // Configuring MongoDB Wrapper
            MongoDBWrapper mongoDB           = new MongoDBWrapper();
            string         fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);

            mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

            // Ensuring the database has the proper indexe
            mongoDB.EnsureIndex("Url");

            // Response Parser
            PlayStoreParser parser = new PlayStoreParser();

            // Executing Web Requests
            using (WebRequests server = new WebRequests())
            {
                // Creating Request Object
                server.Host = Consts.HOST;

                // Executing Initial Request
                response = server.Post(String.Format(Consts.CRAWL_URL, searchField), Consts.INITIAL_POST_DATA);

                // Parsing Links out of Html Page (Initial Request)
                foreach (string url in parser.ParseAppUrls(response))
                {
                    // Checks whether the app have been already processed
                    // or is queued to be processed
                    if ((!mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued(url)))
                    {
                        // Console Feedback
                        Console.WriteLine(" . Queued App");

                        // Than, queue it :)
                        mongoDB.AddToQueue(url);
                        Thread.Sleep(250);  // Hiccup
                    }
                    else
                    {
                        // Console Feedback
                        Console.WriteLine(" . Duplicated App. Skipped");
                    }
                }

                // Executing Requests for more Play Store Links
                int initialSkip       = 48;
                int currentMultiplier = 1;
                int errorsCount       = 0;
                do
                {
                    // Finding pagToken from HTML
                    var rgxMatch = pagTokenRegex.Match(response);

                    // If there's no match, skips it
                    if (!rgxMatch.Success)
                    {
                        break;
                    }

                    // Reading Match from Regex, and applying needed replacements
                    string pagToken = rgxMatch.Value.Replace(":S:", "%3AS%3A").Replace("\\42", String.Empty).Replace(@"\\u003d", String.Empty);

                    // Assembling new PostData with paging values
                    string postData = String.Format(Consts.POST_DATA, pagToken);

                    // Executing request for values
                    response = server.Post(String.Format(Consts.CRAWL_URL, searchField), postData);

                    // Checking Server Status
                    if (server.StatusCode != System.Net.HttpStatusCode.OK)
                    {
                        LogWriter.Error("Http Error", "Status Code [ " + server.StatusCode + " ]");
                        errorsCount++;
                        continue;
                    }

                    // Parsing Links
                    foreach (string url in parser.ParseAppUrls(response))
                    {
                        // Checks whether the app have been already processed
                        // or is queued to be processed
                        if ((!mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued(url)))
                        {
                            // Console Feedback
                            Console.WriteLine(" . Queued App");

                            // Than, queue it :)
                            mongoDB.AddToQueue(url);
                            Thread.Sleep(250);  // Hiccup
                        }
                        else
                        {
                            // Console Feedback
                            Console.WriteLine(" . Duplicated App. Skipped");
                        }
                    }

                    // Incrementing Paging Multiplier
                    currentMultiplier++;
                }  while (parser.AnyResultFound(response) && errorsCount <= Consts.MAX_REQUEST_ERRORS);
            }
        }
Exemple #3
0
        /// <summary>
        /// Entry point of the worker piece of the process
        /// Notice that you can run as many workers as you want to in order to make the crawling faster
        /// </summary>
        /// <param name="args"></param>
        static void Main(string[] args)
        {
            // Configuring Log Object
            LogSetup.InitializeLog("PlayStoreWorker.log", "info");
            Logger logger = LogManager.GetCurrentClassLogger();

            logger.Info("Worker Started");

            // Control Variable (Bool - Should the process use proxies? )
            bool isUsingProxies = false;

            // Checking for the need to use proxies
            if (args != null && args.Length == 1)
            {
                // Setting flag to true
                isUsingProxies = true;

                // Loading proxies from .txt received as argument
                String fPath = args[0];

                // Sanity Check
                if (!File.Exists(fPath))
                {
                    logger.Fatal("Couldnt find proxies on path : " + fPath);
                    System.Environment.Exit(-100);
                }

                // Reading Proxies from File
                string[] fLines = File.ReadAllLines(fPath, Encoding.GetEncoding("UTF-8"));

                try
                {
                    // Actual Load of Proxies
                    ProxiesLoader.Load(fLines.ToList());
                }
                catch (Exception ex)
                {
                    logger.Fatal(ex);
                    System.Environment.Exit(-101);
                }
            }

            // Parser
            PlayStoreParser parser = new PlayStoreParser();

            // Configuring MongoDB Wrapper
            MongoDBWrapper mongoDB           = new MongoDBWrapper();
            string         fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);

            mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

            /*
             * // populate min downloaded & max downloaded
             * int count = 0;
             * var apps = mongoDB.FindAll<AppModel>();
             * foreach(var a in apps)
             * {
             *  a.FillMinAndMaxInstalls();
             ++count;
             *
             *  if((count % 100) == 0)
             *  {
             *      Console.WriteLine("updated {0}", count);
             *  }
             *
             *  if (!mongoDB.UpsertKeyEq<AppModel>(a, "Url", a.Url))
             *  {
             *      Console.WriteLine("UpsertKey failed");
             *  }
             * }
             */

            // Creating Instance of Web Requests Server
            WebRequests server = new WebRequests();

            // Queued App Model
            QueuedApp app;

            // Retry Counter (Used for exponential wait increasing logic)
            int retryCounter = 0;

            // Iterating Over MongoDB Records while no document is found to be processed
            while ((app = mongoDB.FindAndModify()) != null)
            {
                try
                {
                    // Building APP URL
                    string appUrl = app.Url;

                    // Sanity check of app page url
                    if (app.Url.IndexOf("http", StringComparison.OrdinalIgnoreCase) < 0)
                    {
                        appUrl = Consts.APP_URL_PREFIX + app.Url;
                    }

                    // Checking if this app is on the database already
                    if (mongoDB.AppProcessed(appUrl))
                    {
                        // Console Feedback, Comment this line to disable if you want to
                        logger.Info("Duplicated App, skipped.");

                        // Delete it from the queue and continues the loop
                        mongoDB.RemoveFromQueue(app.Url);
                        continue;
                    }

                    // Configuring server and Issuing Request
                    server.Headers.Add(Consts.ACCEPT_LANGUAGE);
                    server.Host              = Consts.HOST;
                    server.UserAgent         = Consts.GITHUBURL;
                    server.Encoding          = "utf-8";
                    server.EncodingDetection = WebRequests.CharsetDetection.DefaultCharset;

                    // Checking for the need to use "HTTP Proxies"
                    if (isUsingProxies)
                    {
                        server.Proxy = ProxiesLoader.GetWebProxy();
                    }

                    // Issuing HTTP Request
                    string response = server.Get(appUrl);

                    // Flag Indicating Success while processing and parsing this app
                    bool ProcessingWorked = true;

                    // Sanity Check
                    if (String.IsNullOrEmpty(response) || server.StatusCode != System.Net.HttpStatusCode.OK)
                    {
                        logger.Info("Error opening app page : " + appUrl);
                        ProcessingWorked = false;

                        if (isUsingProxies)
                        {
                            ProxiesLoader.IncrementCurrentProxy();
                        }

                        // Renewing WebRequest Object to get rid of Cookies
                        server = new WebRequests();

                        // Fallback time variable
                        double waitTime;

                        // Checking which "Waiting Logic" to use - If there are proxies being used, there's no need to wait too much
                        // If there are no proxies in use, on the other hand, the process must wait more time
                        if (isUsingProxies)
                        {
                            // Waits two seconds everytime
                            waitTime = TimeSpan.FromSeconds(2).TotalMilliseconds;
                        }
                        else
                        {
                            // Increments retry counter
                            retryCounter++;

                            // Checking for maximum retry count
                            if (retryCounter >= 8)
                            {
                                waitTime = TimeSpan.FromMinutes(20).TotalMilliseconds;
                            }
                            else
                            {
                                // Calculating next wait time ( 2 ^ retryCounter seconds)
                                waitTime = TimeSpan.FromSeconds(Math.Pow(2, retryCounter)).TotalMilliseconds;
                            }
                        }

                        // Hiccup to avoid google blocking connections in case of heavy traffic from the same IP
                        logger.Info("======================================================");
                        logger.Info("\n\tFallback : " + waitTime + " Seconds");
                        Thread.Sleep(Convert.ToInt32(waitTime));

                        // If The Status code is "ZERO" (it means 404) - App must be removed from "Queue"
                        if (server.StatusCode == 0)
                        {
                            // Console Feedback
                            logger.Info("\tApp Not Found (404) - " + app.Url);

                            mongoDB.RemoveFromQueue(app.Url);
                        }
                        logger.Info("======================================================");
                    }
                    else
                    {
                        // Reseting retry counter
                        retryCounter = 0;

                        // Parsing Useful App Data
                        AppModel parsedApp = parser.ParseAppPage(response, appUrl);

                        // Normalizing URLs
                        if (!String.IsNullOrWhiteSpace(parsedApp.DeveloperPrivacyPolicy))
                        {
                            parsedApp.DeveloperPrivacyPolicy = parsedApp.DeveloperPrivacyPolicy.Replace("https://www.google.com/url?q=", String.Empty);
                        }

                        if (!String.IsNullOrWhiteSpace(parsedApp.DeveloperWebsite))
                        {
                            parsedApp.DeveloperNormalizedDomain = parser.NormalizeDomainName(parsedApp.DeveloperWebsite);
                        }

                        List <String> relatedApps = new List <String> ();

                        // Avoiding Exceptions caused by "No Related Apps" situations - Must be treated differently
                        try
                        {
                            // Parsing "Related Apps" and "More From Developer" Apps (URLS Only)
                            foreach (string extraAppUrl in parser.ParseExtraApps(response))
                            {
                                relatedApps.Add(Consts.APP_URL_PREFIX + extraAppUrl);
                            }

                            // Adding "Related Apps" to Apps Model
                            parsedApp.RelatedUrls = relatedApps.Distinct().ToArray();
                        }
                        catch
                        {
                            logger.Info("\tNo Related Apps Found. Skipping");
                        }

                        // Inserting App into Mongo DB Database
                        if (!mongoDB.UpsertKeyEq <AppModel>(parsedApp, "Url", appUrl))
                        {
                            ProcessingWorked = false;
                        }

                        // If the processing failed, do not remove the app from the database, instead, keep it and flag it as not busy
                        // so that other workers can try to process it later
                        if (!ProcessingWorked)
                        {
                            mongoDB.ToggleBusyApp(app, false);
                        }
                        else // On the other hand, if processing worked, removes it from the database
                        {
                            // Console Feedback, Comment this line to disable if you want to
                            Console.ForegroundColor = ConsoleColor.Red;
                            logger.Info("Inserted App : " + parsedApp.Name);
                            Console.ForegroundColor = ConsoleColor.White;

                            mongoDB.RemoveFromQueue(app.Url);
                        }

                        // Counters for console feedback only
                        int extraAppsCounter = 0, newExtraApps = 0;

                        // Parsing "Related Apps" and "More From Developer" Apps (URLS Only)
                        foreach (string extraAppUrl in relatedApps)
                        {
                            // Incrementing counter of extra apps
                            extraAppsCounter++;

                            // Assembling Full app Url to check with database
                            string fullExtraAppUrl;
                            if (extraAppUrl.IndexOf("https://play.google.com/") >= 0)
                            {
                                fullExtraAppUrl = extraAppUrl;
                            }
                            else
                            {
                                fullExtraAppUrl = Consts.APP_URL_PREFIX + extraAppUrl;
                            }

                            // Checking if the app was either processed or queued to be processed already
                            if ((!mongoDB.AppProcessed(fullExtraAppUrl)) && (!mongoDB.IsAppOnQueue(extraAppUrl)))
                            {
                                // Incrementing counter of inserted apps
                                newExtraApps++;

                                // Adds it to the queue of apps to be processed
                                mongoDB.AddToQueue(extraAppUrl);
                            }
                        }

                        // Console Feedback
                        logger.Info("Queued " + newExtraApps + " / " + extraAppsCounter + " related apps");
                    }
                }
                catch (Exception ex)
                {
                    logger.Error(ex);
                }
                finally
                {
                    try
                    {
                        // Toggles Busy status back to false
                        mongoDB.ToggleBusyApp(app, false);
                    }
                    catch (Exception ex)
                    {
                        // Toggle Busy App may raise an exception in case of lack of internet connection, so, i must use this
                        // "inner catch" to avoid it from happenning
                        logger.Error(ex);
                    }
                }
            }
        }
        /// <summary>
        /// Entry point of the worker piece of the process
        /// Notice that you can run as many workers as you want to in order to make the crawling faster
        /// </summary>
        /// <param name="args"></param>
        static void Main(string[] args)
        {
            // Configuring Log Object Threshold
            LogWriter.Threshold = TLogEventLevel.Information;
            LogWriter.Info("Worker Started");

            // Parser
            PlayStoreParser parser = new PlayStoreParser();

            // Configuring MongoDB Wrapper
            MongoDBWrapper mongoDB           = new MongoDBWrapper();
            string         fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);

            mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

            // Creating Instance of Web Requests Server
            WebRequests server = new WebRequests();

            QueuedApp app;

            // Retry Counter (Used for exponential wait increasing logic)
            int retryCounter = 0;

            // Iterating Over MongoDB Records while no document is found to be processed
            while ((app = mongoDB.FindAndModify()) != null)
            {
                try
                {
                    // Building APP URL
                    string appUrl = Consts.APP_URL_PREFIX + app.Url;

                    // Checking if this app is on the database already
                    if (mongoDB.AppProcessed(appUrl))
                    {
                        // Console Feedback, Comment this line to disable if you want to
                        Console.WriteLine("Duplicated App, skipped.");

                        // Delete it from the queue and continues the loop
                        mongoDB.RemoveFromQueue(app.Url);
                        continue;
                    }

                    // Configuring server and Issuing Request
                    server.Headers.Add(Consts.ACCEPT_LANGUAGE);
                    server.Host              = Consts.HOST;
                    server.Encoding          = "utf-8";
                    server.EncodingDetection = WebRequests.CharsetDetection.DefaultCharset;
                    string response = server.Get(appUrl);

                    // Flag Indicating Success while processing and parsing this app
                    bool ProcessingWorked = true;

                    // Sanity Check
                    if (String.IsNullOrEmpty(response) || server.StatusCode != System.Net.HttpStatusCode.OK)
                    {
                        LogWriter.Info("Error opening app page : " + appUrl);
                        ProcessingWorked = false;

                        // Renewing WebRequest Object to get rid of Cookies
                        server = new WebRequests();

                        // Inc. retry counter
                        retryCounter++;

                        Console.WriteLine("Retrying:" + retryCounter);

                        // Checking for maximum retry count
                        double waitTime;
                        if (retryCounter >= 11)
                        {
                            waitTime = TimeSpan.FromMinutes(35).TotalMilliseconds;

                            // Removing App from the database (this the app page may have expired)
                            mongoDB.RemoveFromQueue(app.Url);
                        }
                        else
                        {
                            // Calculating next wait time ( 2 ^ retryCounter seconds)
                            waitTime = TimeSpan.FromSeconds(Math.Pow(2, retryCounter)).TotalMilliseconds;
                        }

                        // Hiccup to avoid google blocking connections in case of heavy traffic from the same IP
                        Thread.Sleep(Convert.ToInt32(waitTime));
                    }
                    else
                    {
                        // Reseting retry counter
                        retryCounter = 0;

                        // Parsing Useful App Data
                        AppModel parsedApp = parser.ParseAppPage(response, appUrl);

                        // Inserting App into Mongo DB Database
                        if (!mongoDB.Insert <AppModel>(parsedApp))
                        {
                            ProcessingWorked = false;
                        }

                        // If the processing failed, do not remove the app from the database, instead, keep it and flag it as not busy
                        // so that other workers can try to process it later
                        if (!ProcessingWorked)
                        {
                            mongoDB.ToggleBusyApp(app, false);
                        }
                        else // On the other hand, if processing worked, removes it from the database
                        {
                            // Console Feedback, Comment this line to disable if you want to
                            Console.WriteLine("Inserted App : " + parsedApp.Name);

                            mongoDB.RemoveFromQueue(app.Url);
                        }

                        // Counters for console feedback only
                        int extraAppsCounter = 0, newExtraApps = 0;

                        // Parsing "Related Apps" and "More From Developer" Apps (URLS Only)
                        foreach (string extraAppUrl in parser.ParseExtraApps(response))
                        {
                            // Incrementing counter of extra apps
                            extraAppsCounter++;

                            // Assembling Full app Url to check with database
                            string fullExtraAppUrl = Consts.APP_URL_PREFIX + extraAppUrl;

                            // Checking if the app was either processed or queued to be processed already
                            if ((!mongoDB.AppProcessed(fullExtraAppUrl)) && (!mongoDB.IsAppOnQueue(extraAppUrl)))
                            {
                                // Incrementing counter of inserted apps
                                newExtraApps++;

                                // Adds it to the queue of apps to be processed
                                mongoDB.AddToQueue(extraAppUrl);
                            }
                        }

                        // Console Feedback
                        Console.WriteLine("Queued " + newExtraApps + " / " + extraAppsCounter + " related apps");
                    }
                }
                catch (Exception ex)
                {
                    LogWriter.Error(ex);
                }
                finally
                {
                    try
                    {
                        // Toggles Busy status back to false
                        mongoDB.ToggleBusyApp(app, false);
                    }
                    catch (Exception ex)
                    {
                        // Toggle Busy App may raise an exception in case of lack of internet connection, so, i must use this
                        // "inner catch" to avoid it from happenning
                        LogWriter.Error(ex);
                    }
                }
            }
        }
        /// <summary>
        ///
        /// </summary>
        /// <param name="categoryUrl"></param>
        private static void CrawlCategory(string categoryUrl, string categoryName, bool shouldUseProxies)
        {
            // Console Feedback
            _logger.Warn("Crawling Category : [ " + categoryName + " ]");

            // Hashset of urls used to keep track of what's been parsed already
            HashSet <String> foundUrls = new HashSet <String> ();

            // Control variable to avoid "Loop" on pagging
            bool isDonePagging = false;

            // HTML Response
            string response;

            // MongoDB Helper
            // Configuring MongoDB Wrapper
            MongoDBWrapper mongoDB           = new MongoDBWrapper();
            string         fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);

            mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

            // Ensuring the database has the proper indexe
            mongoDB.EnsureIndex("Url");

            // Response Parser
            PlayStoreParser parser = new PlayStoreParser();

            // Executing Web Requests
            using (WebRequests server = new WebRequests())
            {
                // Creating Request Object
                server.Headers.Add(Consts.ACCEPT_LANGUAGE);
                server.Host      = Consts.HOST;
                server.UserAgent = Consts.GITHUBURL;
                server.Encoding  = "utf-8";

                // Executing Initial Request
                response = server.Get(categoryUrl);

                // Parsing Links out of Html Page (Initial Request)
                foreach (string url in parser.ParseAppUrls(response))
                {
                    // Saving found url on local hashset
                    foundUrls.Add(url);

                    // Checks whether the app have been already processed
                    // or is queued to be processed
                    if ((!mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued(url)))
                    {
                        // Than, queue it :)
                        mongoDB.AddToQueue(url);
                    }
                }

                // Executing Requests for more Play Store Links
                int baseSkip          = 60;
                int currentMultiplier = 1;
                int errorsCount       = 0;
                do
                {
                    // Assembling new PostData with paging values
                    string postData = String.Format(Consts.CATEGORIES_POST_DATA, (currentMultiplier * baseSkip), baseSkip);

                    // Executing request for values
                    response = server.Post(String.Format(categoryUrl + "?authuser=0"), postData);

                    // Checking Server Status
                    if (server.StatusCode != System.Net.HttpStatusCode.OK)
                    {
                        _logger.Error("Http Error" + " - Status Code [ " + server.StatusCode + " ]");
                        errorsCount++;
                        continue;
                    }

                    // Parsing Links
                    foreach (string url in parser.ParseAppUrls(response))
                    {
                        // If a certain app is found twice, it means that the "pagging" logic got stuck into a
                        // Loop, so the all the apps for this category were parsed already
                        if (foundUrls.Contains(url))
                        {
                            isDonePagging = true;
                            break;
                        }

                        // Saving found url on local hashset
                        foundUrls.Add(url);

                        // Checks whether the app have been already processed
                        // or is queued to be processed
                        if ((!mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued(url)))
                        {
                            // Than, queue it :)
                            mongoDB.AddToQueue(url);
                        }
                    }

                    // Incrementing Paging Multiplier
                    currentMultiplier++;
                }  while (!isDonePagging && errorsCount <= Consts.MAX_REQUEST_ERRORS);
            }
        }
        /// <summary>
        /// Executes a Search using the searchField as the search parameter, 
        /// paginates / scrolls the search results to the end adding all the url of apps
        /// it finds to a AWS SQS queue
        /// </summary>
        /// <param name="searchField"></param>
        private static void CrawlStore (string searchField)
        {
            // Console Feedback
            Console.WriteLine ("Crawling Search Term : [ " + searchField + " ]");

            // HTML Response
            string response;

            // MongoDB Helper
            // Configuring MongoDB Wrapper
            MongoDBWrapper mongoDB   = new MongoDBWrapper ();
            string fullServerAddress = String.Join (":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
            mongoDB.ConfigureDatabase (Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

            // Response Parser
            PlayStoreParser parser = new PlayStoreParser (); 

            // Executing Web Requests
            using (WebRequests server = new WebRequests ())
            {
                // Creating Request Object
                server.Host = Consts.HOST;

                // Executing Initial Request
                response    = server.Post (Consts.CRAWL_URL, Consts.INITIAL_POST_DATA);

                // Parsing Links out of Html Page (Initial Request)                
                foreach (string url in parser.ParseAppUrls (response))
                {
                    // Checks whether the app have been already processed 
                    // or is queued to be processed
                    if ((!mongoDB.AppProcessed (Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued (url)))
                    {
                        // Console Feedback
                        Console.WriteLine (" . Queued App");

                        // Than, queue it :)
                        mongoDB.AddToQueue (url);
                    }
                    else
                    {
                        // Console Feedback
                        Console.WriteLine (" . Duplicated App. Skipped");
                    }
                }

                // Executing Requests for more Play Store Links
                int initialSkip       = 48;
                int currentMultiplier = 1;
                int errorsCount       = 0;
                do
                {
                    // Assembling new PostData with paging values
                    string postData = String.Format (Consts.POST_DATA, (initialSkip * currentMultiplier));

                    // Executing request for values
                    response = server.Post (Consts.CRAWL_URL, postData);

                    // Checking Server Status
                    if (server.StatusCode != System.Net.HttpStatusCode.OK)
                    {
                        LogWriter.Error ("Http Error", "Status Code [ " + server.StatusCode + " ]");
                        errorsCount++;
                        continue;
                    }

                    // Parsing Links
                    foreach (string url in parser.ParseAppUrls (response))
                    {
                        // Checks whether the app have been already processed 
                        // or is queued to be processed
                        if ((!mongoDB.AppProcessed (Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued (url)))
                        {
                            // Console Feedback
                            Console.WriteLine (" . Queued App");

                            // Than, queue it :)
                            mongoDB.AddToQueue (url);
                        }
                        else
                        {
                            // Console Feedback
                            Console.WriteLine (" . Duplicated App. Skipped");
                        }
                    }

                    // Incrementing Paging Multiplier
                    currentMultiplier++;

                }  while (parser.AnyResultFound (response) && errorsCount <= Consts.MAX_REQUEST_ERRORS);
            }
        }
        /// <summary>
        /// Executes a Search using the searchField as the search parameter, 
        /// paginates / scrolls the search results to the end adding all the url of apps
        /// it finds to a AWS SQS queue
        /// </summary>
        /// <param name="searchField"></param>
        private static void CrawlStore (string searchField)
        {
            // Console Feedback
            Console.WriteLine ("Crawling Search Term : [ " + searchField + " ]");

            // Compiling Regular Expression used to parse the "pagToken" out of the Play Store
            Regex pagTokenRegex = new Regex (@"GAEi+.+\:S\:.{11}\\42", RegexOptions.Compiled);

            // HTML Response
            string response;

            // MongoDB Helper
            // Configuring MongoDB Wrapper
            MongoDBWrapper mongoDB   = new MongoDBWrapper ();
            string fullServerAddress = String.Join (":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
            mongoDB.ConfigureDatabase (Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

            // Ensuring the database has the proper indexe
            mongoDB.EnsureIndex ("Url");

            // Response Parser
            PlayStoreParser parser = new PlayStoreParser (); 

            // Executing Web Requests
            using (WebRequests server = new WebRequests ())
            {
                // Creating Request Object
                server.Host = Consts.HOST;

                // Executing Initial Request
                response    = server.Post (String.Format (Consts.CRAWL_URL, searchField), Consts.INITIAL_POST_DATA);

                // Parsing Links out of Html Page (Initial Request)                
                foreach (string url in parser.ParseAppUrls (response))
                {
                    // Checks whether the app have been already processed 
                    // or is queued to be processed
                    if ((!mongoDB.AppProcessed (Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued (url)))
                    {
                        // Console Feedback
                        Console.WriteLine (" . Queued App");

                        // Than, queue it :)
                        mongoDB.AddToQueue (url);
                        Thread.Sleep (250); // Hiccup
                    }
                    else
                    {
                        // Console Feedback
                        Console.WriteLine (" . Duplicated App. Skipped");
                    }
                }

                // Executing Requests for more Play Store Links
                int initialSkip       = 48;
                int currentMultiplier = 1;
                int errorsCount       = 0;
                do
                {
                    // Finding pagToken from HTML
                    var rgxMatch = pagTokenRegex.Match (response);

                    // If there's no match, skips it
                    if (!rgxMatch.Success)
                    {
                        break;
                    }

                    // Reading Match from Regex, and applying needed replacements
                    string pagToken = rgxMatch.Value.Replace (":S:", "%3AS%3A").Replace("\\42", String.Empty).Replace(@"\\u003d", String.Empty);

                    // Assembling new PostData with paging values
                    string postData = String.Format (Consts.POST_DATA, pagToken);

                    // Executing request for values
                    response = server.Post (String.Format (Consts.CRAWL_URL, searchField), postData);

                    // Checking Server Status
                    if (server.StatusCode != System.Net.HttpStatusCode.OK)
                    {
                        LogWriter.Error ("Http Error", "Status Code [ " + server.StatusCode + " ]");
                        errorsCount++;
                        continue;
                    }

                    // Parsing Links
                    foreach (string url in parser.ParseAppUrls (response))
                    {
                        // Checks whether the app have been already processed 
                        // or is queued to be processed
                        if ((!mongoDB.AppProcessed (Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued (url)))
                        {
                            // Console Feedback
                            Console.WriteLine (" . Queued App");

                            // Than, queue it :)
                            mongoDB.AddToQueue (url);
                            Thread.Sleep (250); // Hiccup
                        }
                        else
                        {
                            // Console Feedback
                            Console.WriteLine (" . Duplicated App. Skipped");
                        }
                    }

                    // Incrementing Paging Multiplier
                    currentMultiplier++;

                }  while (parser.AnyResultFound (response) && errorsCount <= Consts.MAX_REQUEST_ERRORS);
            }
        }
        /// <summary>
        /// Executes a Search using the searchField as the search parameter,
        /// paginates / scrolls the search results to the end adding all the url of apps
        /// it finds to a Mongo "QUEUE" collection
        /// </summary>
        /// <param name="searchField"></param>
        private static void CrawlStore(string searchField, bool shouldUseProxies)
        {
            // Console Feedback
            _logger.Warn("Crawling Search Term : [ " + searchField + " ]");

            // Hashset of urls used to keep track of what's been parsed already
            HashSet <String> foundUrls = new HashSet <String> ();

            // Control variable to avoid "Loop" on pagging
            bool isDonePagging = false;

            // Compiling Regular Expression used to parse the "pagToken" out of the Play Store
            Regex pagTokenRegex = new Regex(@"GAEi+.+\:S\:.{11}\\x22", RegexOptions.Compiled);

            // HTML Response
            string response;

            // Response Parser
            PlayStoreParser parser = new PlayStoreParser();

            // Executing Web Requests
            using (WebRequests server = new WebRequests())
            {
                // Creating Request Object
                server.Headers.Add(Consts.ACCEPT_LANGUAGE);
                server.Host      = Consts.HOST;
                server.UserAgent = Consts.GITHUBURL;
                server.Encoding  = "utf-8";

                // Checking for the need to use "HTTP Proxies"
                if (shouldUseProxies)
                {
                    server.Proxy = ProxiesLoader.GetWebProxy();
                }

                // Executing Initial Request
                response = server.Post(String.Format(Consts.CRAWL_URL, searchField), Consts.INITIAL_POST_DATA);

                // Parsing Links out of Html Page (Initial Request)
                foreach (string url in parser.ParseAppUrls(response))
                {
                    // Checks whether the app have been already processed
                    // or is queued to be processed
                    foundUrls.Add(url);
                    if ((!_mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!_mongoDB.AppQueued(url)))
                    {
                        // Than, queue it :)
                        _mongoDB.AddToQueue(url);
                        Thread.Sleep(250);  // Hiccup
                    }
                }

                // Executing Requests for more Play Store Links
                int initialSkip       = 48;
                int currentMultiplier = 1;
                int errorsCount       = 0;
                do
                {
                    // Finding pagToken from HTML
                    var rgxMatch = pagTokenRegex.Match(response);

                    // If there's no match, skips it
                    if (!rgxMatch.Success)
                    {
                        break;
                    }

                    // Reading Match from Regex, and applying needed replacements
                    string pagToken = rgxMatch.Value.Replace(":S:", "%3AS%3A").Replace("\\x22", String.Empty).Replace(@"\\u003d", String.Empty);

                    // Assembling new PostData with paging values
                    string postData = String.Format(Consts.POST_DATA, pagToken);

                    // Executing request for values
                    response = server.Post(String.Format(Consts.CRAWL_URL, searchField), postData);

                    // Checking Server Status
                    if (server.StatusCode != System.Net.HttpStatusCode.OK)
                    {
                        _logger.Error("Http Error" + " - Status Code [ " + server.StatusCode + " ]");
                        errorsCount++;
                        continue;
                    }

                    // Parsing Links
                    foreach (string url in parser.ParseAppUrls(response))
                    {
                        if (foundUrls.Contains(url))
                        {
                            isDonePagging = true;
                            break;
                        }
                        // Checks whether the app have been already processed
                        foundUrls.Add(url);
                        if ((!_mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!_mongoDB.AppQueued(url)))
                        {
                            // Than, queue it :)
                            _mongoDB.AddToQueue(url);
                            Thread.Sleep(250);  // Hiccup
                        }
                    }

                    // Incrementing Paging Multiplier
                    currentMultiplier++;
                }  while (!isDonePagging && parser.AnyResultFound(response) && errorsCount <= Consts.MAX_REQUEST_ERRORS);
            }
        }
        /// <summary>
        /// Entry point of the worker piece of the process
        /// Notice that you can run as many workers as you want to in order to make the crawling faster
        /// </summary>
        /// <param name="args"></param>
        static void Main(string[] args)
        {
            // Configuring Log Object Threshold
            LogWriter.Threshold = TLogEventLevel.Information;
            LogWriter.Info ("Worker Started");

            // Parser
            PlayStoreParser parser = new PlayStoreParser();

            // Configuring MongoDB Wrapper
            MongoDBWrapper mongoDB = new MongoDBWrapper();
            string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
            mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

            // Creating Instance of Web Requests Server
            WebRequests server = new WebRequests ();
            
            QueuedApp app;

            // Retry Counter (Used for exponential wait increasing logic)
            int retryCounter = 0;

            // Iterating Over MongoDB Records while no document is found to be processed                
            while ((app = mongoDB.FindAndModify ()) != null)
            {
                try
                {
                    // Building APP URL
                    string appUrl = Consts.APP_URL_PREFIX + app.Url;

                    // Checking if this app is on the database already
                    if (mongoDB.AppProcessed(appUrl))
                    {
                        // Console Feedback, Comment this line to disable if you want to
                        Console.WriteLine("Duplicated App, skipped.");

                        // Delete it from the queue and continues the loop
                        mongoDB.RemoveFromQueue (app.Url);
                        continue;
                    }

                    // Configuring server and Issuing Request
                    server.Headers.Add(Consts.ACCEPT_LANGUAGE);
                    server.Host = Consts.HOST;
                    server.Encoding = "utf-8";
                    server.EncodingDetection = WebRequests.CharsetDetection.DefaultCharset;
                    string response = server.Get (appUrl);

                    // Flag Indicating Success while processing and parsing this app
                    bool ProcessingWorked = true;

                    // Sanity Check
                    if (String.IsNullOrEmpty (response) || server.StatusCode != System.Net.HttpStatusCode.OK)
                    {
                        LogWriter.Info ("Error opening app page : " + appUrl);
                        ProcessingWorked = false;
                        
                        // Renewing WebRequest Object to get rid of Cookies
                        server = new WebRequests ();

                        // Inc. retry counter
                        retryCounter++;

                        Console.WriteLine ("Retrying:" + retryCounter);

                        // Checking for maximum retry count
                        double waitTime;
                        if (retryCounter >= 11)
                        {
                            waitTime = TimeSpan.FromMinutes (35).TotalMilliseconds;

                            // Removing App from the database (this the app page may have expired)
                            mongoDB.RemoveFromQueue (appUrl);
                        }
                        else
                        {
                            // Calculating next wait time ( 2 ^ retryCounter seconds)
                            waitTime = TimeSpan.FromSeconds (Math.Pow (2, retryCounter)).TotalMilliseconds;
                        }

                        // Hiccup to avoid google blocking connections in case of heavy traffic from the same IP
                        Thread.Sleep (Convert.ToInt32 (waitTime));
                    }
                    else
                    {
                        // Reseting retry counter
                        retryCounter = 0;

                        // Parsing Useful App Data
                        AppModel parsedApp = parser.ParseAppPage (response, appUrl);

                        // Inserting App into Mongo DB Database
                        if (!mongoDB.Insert<AppModel>(parsedApp))
                        {
                            ProcessingWorked = false;
                        }

                        // If the processing failed, do not remove the app from the database, instead, keep it and flag it as not busy 
                        // so that other workers can try to process it later
                        if (!ProcessingWorked)
                        {
                            mongoDB.ToggleBusyApp(app, false);
                        }
                        else // On the other hand, if processing worked, removes it from the database
                        {
                            // Console Feedback, Comment this line to disable if you want to
                            Console.WriteLine("Inserted App : " + parsedApp.Name);

                            mongoDB.RemoveFromQueue(app.Url);
                        }

                        // Counters for console feedback only
                        int extraAppsCounter = 0, newExtraApps = 0;

                        // Parsing "Related Apps" and "More From Developer" Apps (URLS Only)
                        foreach (string extraAppUrl in parser.ParseExtraApps (response))
                        {
                            // Incrementing counter of extra apps
                            extraAppsCounter++;

                            // Assembling Full app Url to check with database
                            string fullExtraAppUrl = Consts.APP_URL_PREFIX + extraAppUrl;

                            // Checking if the app was either processed or queued to be processed already
                            if ((!mongoDB.AppProcessed (fullExtraAppUrl)) && (!mongoDB.IsAppOnQueue(extraAppUrl)))
                            {
                                // Incrementing counter of inserted apps
                                newExtraApps++;

                                // Adds it to the queue of apps to be processed
                                mongoDB.AddToQueue (extraAppUrl);
                            }
                        }

                        // Console Feedback
                        Console.WriteLine ("Queued " + newExtraApps + " / " + extraAppsCounter + " related apps");
                    }
                }
                catch (Exception ex)
                {
                    LogWriter.Error (ex);
                }
                finally
                {
                    try
                    {
                        // Toggles Busy status back to false
                        mongoDB.ToggleBusyApp(app, false);
                    }
                    catch (Exception ex)
                    {
                        // Toggle Busy App may raise an exception in case of lack of internet connection, so, i must use this
                        // "inner catch" to avoid it from happenning
                        LogWriter.Error (ex);
                    }
                }
            }
        }
        /// <summary>
        /// Entry point of the crawler
        /// </summary>
        /// <param name="args"></param>
        static void Main(string[] args)
        {
            // Setting Up Log
            LogSetup.InitializeLog("PlayStoreCrawler.log", "info");
            _logger = LogManager.GetCurrentClassLogger();

            // Control Variable (Bool - Should the process use proxies? )
            bool isUsingProxies = false;

            // Checking for the need to use HTTP proxies or not
            if (args != null && args.Length == 1)
            {
                _logger.Info("Loading Proxies from File");

                // Setting flag to true
                isUsingProxies = true;

                // Loading proxies from .txt received as argument
                String fPath = args[0];

                // Sanity Check
                if (!File.Exists(fPath))
                {
                    _logger.Fatal("Couldnt find proxies on path : " + fPath);
                    System.Environment.Exit(-100);
                }

                // Reading Proxies from File
                string[] fLines = File.ReadAllLines(fPath, Encoding.GetEncoding("UTF-8"));

                try
                {
                    // Actual Load of Proxies
                    ProxiesLoader.Load(fLines.ToList());
                }
                catch (Exception ex)
                {
                    _logger.Fatal(ex);
                    System.Environment.Exit(-101);
                }
            }

            // Configuring MongoDB Wrapper
            _logger.Info("Setting up MongoDB Collections and Indexes");
            _mongoDB = new MongoDBWrapper();
            string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);

            _mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

            // Ensuring the database collections have the proper indexes
            _mongoDB.EnsureIndex("Url");
            _mongoDB.EnsureIndex("IsBusy", Consts.QUEUED_APPS_COLLECTION);
            _mongoDB.EnsureIndex("Url", Consts.QUEUED_APPS_COLLECTION);

            // Main Flow
            _logger.Info("Started Bootstrapping Steps");

            // Scrapping "Play Store Categories"
            foreach (var categoriesKVP in BootstrapTerms.categoriesAndNames)
            {
                CrawlCategory(categoriesKVP.Key, categoriesKVP.Value, isUsingProxies);
            }

            // Queueing Apps that start with each of the characters from "A" to "Z"
            foreach (var character in BootstrapTerms.charactersSearchTerms)
            {
                CrawlStore(character, isUsingProxies);
            }

            /// ... Keep Adding characters / search terms in order to increase the crawler's reach
            // APP CATEGORIES
            foreach (var category in BootstrapTerms.categoriesSearchTerms)
            {
                CrawlStore(category, isUsingProxies);
            }

            // Extra "Random" Search terms to increase even more the crawler's reach
            foreach (var miscTerm in BootstrapTerms.miscSearchTerms)
            {
                CrawlStore(miscTerm, isUsingProxies);
            }

            // Country Names as Search terms to increase even more the crawler's reach
            foreach (var countryName in BootstrapTerms.countryNames)
            {
                CrawlStore(countryName, isUsingProxies);
            }

            _logger.Info("\n\nBootstrapping Apps of Past Collections");

            // Iterating over past collections
            HashSet <String> appUrls = new HashSet <String> ();

            foreach (string collection in _mongoDB.GetHistoryOfCollections())
            {
                _logger.Info("Reading Collection [{0}]", collection);

                foreach (string app in _mongoDB.FindAllFromCollectionAs <AppModel> (collection).Select(t => t.Url))
                {
                    if (!appUrls.Contains(app))
                    {
                        appUrls.Add(app);
                    }
                }

                _logger.Info("\t=> Distinct Apps Found {0}", appUrls.Count);
            }

            // Adding Distinct Apps to the collection of apps to be processed
            int appsQueued = 0;

            foreach (string appUrl in appUrls)
            {
                _mongoDB.AddToQueue(appUrl);

                if (appsQueued++ % 10000 == 0)
                {
                    _logger.Info("[Progress] Apps Queued From Past Collections [{0}]", appsQueued);
                }
            }
        }
Exemple #11
0
        /// <summary>
        /// 
        /// </summary>
        /// <param name="categoryUrl"></param>
        private static void CrawlCategory (string categoryUrl, string categoryName, bool shouldUseProxies)
        {
            // Console Feedback
            _logger.Warn ("Crawling Category : [ " + categoryName + " ]");

            // Hashset of urls used to keep track of what's been parsed already
            HashSet<String> foundUrls = new HashSet<String> ();

            // Control variable to avoid "Loop" on pagging
            bool isDonePagging = false;

            // HTML Response
            string response;

            // MongoDB Helper
            // Configuring MongoDB Wrapper
            MongoDBWrapper mongoDB   = new MongoDBWrapper ();
            string fullServerAddress = String.Join (":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
            mongoDB.ConfigureDatabase (Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

            // Ensuring the database has the proper indexe
            mongoDB.EnsureIndex ("Url");

            // Response Parser
            PlayStoreParser parser = new PlayStoreParser (); 

            // Executing Web Requests
            using (WebRequests server = new WebRequests ())
            {
                // Creating Request Object
                server.Headers.Add (Consts.ACCEPT_LANGUAGE);
                server.Host      = Consts.HOST;
                server.UserAgent = Consts.GITHUBURL;
                server.Encoding  = "utf-8";

                // Executing Initial Request
                response = server.Get (categoryUrl);

                // Parsing Links out of Html Page (Initial Request)                
                foreach (string url in parser.ParseAppUrls (response))
                {
                    // Saving found url on local hashset
                    foundUrls.Add (url);

                    // Checks whether the app have been already processed 
                    // or is queued to be processed
                    if ((!mongoDB.AppProcessed (Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued (url)))
                    {
                        // Than, queue it :)
                        mongoDB.AddToQueue (url);
                    }
                }

                // Executing Requests for more Play Store Links
                int baseSkip       = 60;
                int currentMultiplier = 1;
                int errorsCount       = 0;
                do
                {
                    // Assembling new PostData with paging values
                    string postData = String.Format (Consts.CATEGORIES_POST_DATA, (currentMultiplier * baseSkip), baseSkip);

                    // Executing request for values
                    response = server.Post (String.Format (categoryUrl + "?authuser=0"), postData);

                    // Checking Server Status
                    if (server.StatusCode != System.Net.HttpStatusCode.OK)
                    {
                        _logger.Error ("Http Error" + " - Status Code [ " + server.StatusCode + " ]");
                        errorsCount++;
                        continue;
                    }

                    // Parsing Links
                    foreach (string url in parser.ParseAppUrls (response))
                    {
                        // If a certain app is found twice, it means that the "pagging" logic got stuck into a 
                        // Loop, so the all the apps for this category were parsed already
                        if (foundUrls.Contains (url))
                        {
                            isDonePagging = true;
                            break;
                        }

                        // Saving found url on local hashset
                        foundUrls.Add (url);

                        // Checks whether the app have been already processed 
                        // or is queued to be processed
                        if ((!mongoDB.AppProcessed (Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued (url)))
                        {
                            // Than, queue it :)
                            mongoDB.AddToQueue (url);
                        }
                    }

                    // Incrementing Paging Multiplier
                    currentMultiplier++;

                }  while (!isDonePagging && errorsCount <= Consts.MAX_REQUEST_ERRORS);
            }
        }
Exemple #12
0
        /// <summary>
        /// Executes a Search using the searchField as the search parameter, 
        /// paginates / scrolls the search results to the end adding all the url of apps
        /// it finds to a Mongo "QUEUE" collection
        /// </summary>
        /// <param name="searchField"></param>
        private static void CrawlStore (string searchField, bool shouldUseProxies)
        {
            // Console Feedback
			_logger.Warn ("Crawling Search Term : [ " + searchField + " ]");

			// Hashset of urls used to keep track of what's been parsed already
			HashSet<String> foundUrls = new HashSet<String> ();

			// Control variable to avoid "Loop" on pagging
			bool isDonePagging = false;

            // Compiling Regular Expression used to parse the "pagToken" out of the Play Store
            Regex pagTokenRegex = new Regex (@"GAEi+.+\:S\:.{11}\\42", RegexOptions.Compiled);

            // HTML Response
            string response;

            // MongoDB Helper
            // Configuring MongoDB Wrapper
            MongoDBWrapper mongoDB   = new MongoDBWrapper ();
            string fullServerAddress = String.Join (":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
            mongoDB.ConfigureDatabase (Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

            // Ensuring the database has the proper indexe
            mongoDB.EnsureIndex ("Url");

            // Response Parser
            PlayStoreParser parser = new PlayStoreParser (); 

            // Executing Web Requests
            using (WebRequests server = new WebRequests ())
            {
                // Creating Request Object
                server.Headers.Add (Consts.ACCEPT_LANGUAGE);
                server.Host      = Consts.HOST;
                server.UserAgent = Consts.GITHUBURL;
                server.Encoding  = "utf-8";

                // Checking for the need to use "HTTP Proxies"
                if (shouldUseProxies)
                {
                    server.Proxy = ProxiesLoader.GetWebProxy ();
                }   

                // Executing Initial Request
                response    = server.Post (String.Format (Consts.CRAWL_URL, searchField), Consts.INITIAL_POST_DATA);

                // Parsing Links out of Html Page (Initial Request)                
                foreach (string url in parser.ParseAppUrls (response))
                {
                    // Checks whether the app have been already processed 
					// or is queued to be processed
					foundUrls.Add (url);
                    if ((!mongoDB.AppProcessed (Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued (url)))
                    {
                        // Than, queue it :)
                        mongoDB.AddToQueue (url);
                        Thread.Sleep (250); // Hiccup
                    }
                }

                // Executing Requests for more Play Store Links
                int initialSkip       = 48;
                int currentMultiplier = 1;
                int errorsCount       = 0;
                do
                {
                    // Finding pagToken from HTML
                    var rgxMatch = pagTokenRegex.Match (response);

                    // If there's no match, skips it
                    if (!rgxMatch.Success)
                    {
                        break;
                    }

                    // Reading Match from Regex, and applying needed replacements
                    string pagToken = rgxMatch.Value.Replace (":S:", "%3AS%3A").Replace("\\42", String.Empty).Replace(@"\\u003d", String.Empty);

                    // Assembling new PostData with paging values
                    string postData = String.Format (Consts.POST_DATA, pagToken);

                    // Executing request for values
                    response = server.Post (String.Format (Consts.CRAWL_URL, searchField), postData);

                    // Checking Server Status
                    if (server.StatusCode != System.Net.HttpStatusCode.OK)
                    {
                        _logger.Error ("Http Error" + " - Status Code [ " + server.StatusCode + " ]");
                        errorsCount++;
                        continue;
                    }

                    // Parsing Links
                    foreach (string url in parser.ParseAppUrls (response))
					{
						if (foundUrls.Contains (url))
						{
							isDonePagging = true;
							break;
						}
                        // Checks whether the app have been already processed 
						foundUrls.Add (url);
                        if ((!mongoDB.AppProcessed (Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued (url)))
                        {
                            // Than, queue it :)
                            mongoDB.AddToQueue (url);
                            Thread.Sleep (250); // Hiccup
                        }
                    }

                    // Incrementing Paging Multiplier
                    currentMultiplier++;

				}  while (!isDonePagging && parser.AnyResultFound (response) && errorsCount <= Consts.MAX_REQUEST_ERRORS);
            }
        }
Exemple #13
0
        /// <summary>
        /// Executes a Search using the searchField as the search parameter,
        /// paginates / scrolls the search results to the end adding all the url of apps
        /// it finds to a AWS SQS queue
        /// </summary>
        /// <param name="searchField"></param>
        private static void CrawlStore(string searchField)
        {
            // Console Feedback
            Console.WriteLine("Crawling Search Term : [ " + searchField + " ]");

            // HTML Response
            string response;

            // MongoDB Helper
            // Configuring MongoDB Wrapper
            MongoDBWrapper mongoDB           = new MongoDBWrapper();
            string         fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);

            mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

            // Response Parser
            PlayStoreParser parser = new PlayStoreParser();

            // Executing Web Requests
            using (WebRequests server = new WebRequests())
            {
                // Creating Request Object
                server.Host = Consts.HOST;

                // Executing Initial Request
                response = server.Post(Consts.CRAWL_URL, Consts.INITIAL_POST_DATA);

                // Parsing Links out of Html Page (Initial Request)
                foreach (string url in parser.ParseAppUrls(response))
                {
                    // Checks whether the app have been already processed
                    // or is queued to be processed
                    if ((!mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued(url)))
                    {
                        // Console Feedback
                        Console.WriteLine(" . Queued App");

                        // Than, queue it :)
                        mongoDB.AddToQueue(url);
                    }
                    else
                    {
                        // Console Feedback
                        Console.WriteLine(" . Duplicated App. Skipped");
                    }
                }

                // Executing Requests for more Play Store Links
                int initialSkip       = 48;
                int currentMultiplier = 1;
                int errorsCount       = 0;
                do
                {
                    // Assembling new PostData with paging values
                    string postData = String.Format(Consts.POST_DATA, (initialSkip * currentMultiplier));

                    // Executing request for values
                    response = server.Post(Consts.CRAWL_URL, postData);

                    // Checking Server Status
                    if (server.StatusCode != System.Net.HttpStatusCode.OK)
                    {
                        LogWriter.Error("Http Error", "Status Code [ " + server.StatusCode + " ]");
                        errorsCount++;
                        continue;
                    }

                    // Parsing Links
                    foreach (string url in parser.ParseAppUrls(response))
                    {
                        // Checks whether the app have been already processed
                        // or is queued to be processed
                        if ((!mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued(url)))
                        {
                            // Console Feedback
                            Console.WriteLine(" . Queued App");

                            // Than, queue it :)
                            mongoDB.AddToQueue(url);
                        }
                        else
                        {
                            // Console Feedback
                            Console.WriteLine(" . Duplicated App. Skipped");
                        }
                    }

                    // Incrementing Paging Multiplier
                    currentMultiplier++;
                }  while (parser.AnyResultFound(response) && errorsCount <= Consts.MAX_REQUEST_ERRORS);
            }
        }