Example #1
0
        /// <summary>
        /// Executes a Search using the searchField as the search parameter,
        /// paginates / scrolls the search results to the end adding all the url of apps
        /// it finds to a Mongo "QUEUE" collection
        /// </summary>
        /// <param name="searchField"></param>
        private static void CrawlStore(string searchField, bool shouldUseProxies)
        {
            // Console Feedback
            _logger.Warn("Crawling Search Term : [ " + searchField + " ]");

            // Hashset of urls used to keep track of what's been parsed already
            HashSet <String> foundUrls = new HashSet <String> ();

            // Control variable to avoid "Loop" on pagging
            bool isDonePagging = false;

            // Compiling Regular Expression used to parse the "pagToken" out of the Play Store
            Regex pagTokenRegex = new Regex(@"GAEi+.+\:S\:.{11}\\x22", RegexOptions.Compiled);

            // HTML Response
            string response;

            // Response Parser
            PlayStoreParser parser = new PlayStoreParser();

            // Executing Web Requests
            using (WebRequests server = new WebRequests())
            {
                // Creating Request Object
                server.Headers.Add(Consts.ACCEPT_LANGUAGE);
                server.Host      = Consts.HOST;
                server.UserAgent = Consts.GITHUBURL;
                server.Encoding  = "utf-8";

                // Checking for the need to use "HTTP Proxies"
                if (shouldUseProxies)
                {
                    server.Proxy = ProxiesLoader.GetWebProxy();
                }

                // Executing Initial Request
                response = server.Post(String.Format(Consts.CRAWL_URL, searchField), Consts.INITIAL_POST_DATA);

                // Parsing Links out of Html Page (Initial Request)
                foreach (string url in parser.ParseAppUrls(response))
                {
                    // Checks whether the app have been already processed
                    // or is queued to be processed
                    foundUrls.Add(url);
                    if ((!_mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!_mongoDB.AppQueued(url)))
                    {
                        // Than, queue it :)
                        _mongoDB.AddToQueue(url);
                        Thread.Sleep(250);  // Hiccup
                    }
                }

                // Executing Requests for more Play Store Links
                int initialSkip       = 48;
                int currentMultiplier = 1;
                int errorsCount       = 0;
                do
                {
                    // Finding pagToken from HTML
                    var rgxMatch = pagTokenRegex.Match(response);

                    // If there's no match, skips it
                    if (!rgxMatch.Success)
                    {
                        break;
                    }

                    // Reading Match from Regex, and applying needed replacements
                    string pagToken = rgxMatch.Value.Replace(":S:", "%3AS%3A").Replace("\\x22", String.Empty).Replace(@"\\u003d", String.Empty);

                    // Assembling new PostData with paging values
                    string postData = String.Format(Consts.POST_DATA, pagToken);

                    // Executing request for values
                    response = server.Post(String.Format(Consts.CRAWL_URL, searchField), postData);

                    // Checking Server Status
                    if (server.StatusCode != System.Net.HttpStatusCode.OK)
                    {
                        _logger.Error("Http Error" + " - Status Code [ " + server.StatusCode + " ]");
                        errorsCount++;
                        continue;
                    }

                    // Parsing Links
                    foreach (string url in parser.ParseAppUrls(response))
                    {
                        if (foundUrls.Contains(url))
                        {
                            isDonePagging = true;
                            break;
                        }
                        // Checks whether the app have been already processed
                        foundUrls.Add(url);
                        if ((!_mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!_mongoDB.AppQueued(url)))
                        {
                            // Than, queue it :)
                            _mongoDB.AddToQueue(url);
                            Thread.Sleep(250);  // Hiccup
                        }
                    }

                    // Incrementing Paging Multiplier
                    currentMultiplier++;
                }  while (!isDonePagging && parser.AnyResultFound(response) && errorsCount <= Consts.MAX_REQUEST_ERRORS);
            }
        }
        /// <summary>
        /// Executes a Search using the searchField as the search parameter,
        /// paginates / scrolls the search results to the end adding all the url of apps
        /// it finds to a AWS SQS queue
        /// </summary>
        /// <param name="searchField"></param>
        private static void CrawlStore(string searchField)
        {
            // Console Feedback
            Console.WriteLine("Crawling Search Term : [ " + searchField + " ]");

            // Compiling Regular Expression used to parse the "pagToken" out of the Play Store
            Regex pagTokenRegex = new Regex(@"GAEi+.+\:S\:.{11}\\42", RegexOptions.Compiled);

            // HTML Response
            string response;

            // MongoDB Helper
            // Configuring MongoDB Wrapper
            MongoDBWrapper mongoDB           = new MongoDBWrapper();
            string         fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);

            mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

            // Ensuring the database has the proper indexe
            mongoDB.EnsureIndex("Url");

            // Response Parser
            PlayStoreParser parser = new PlayStoreParser();

            // Executing Web Requests
            using (WebRequests server = new WebRequests())
            {
                // Creating Request Object
                server.Host = Consts.HOST;

                // Executing Initial Request
                response = server.Post(String.Format(Consts.CRAWL_URL, searchField), Consts.INITIAL_POST_DATA);

                // Parsing Links out of Html Page (Initial Request)
                foreach (string url in parser.ParseAppUrls(response))
                {
                    // Checks whether the app have been already processed
                    // or is queued to be processed
                    if ((!mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued(url)))
                    {
                        // Console Feedback
                        Console.WriteLine(" . Queued App");

                        // Than, queue it :)
                        mongoDB.AddToQueue(url);
                        Thread.Sleep(250);  // Hiccup
                    }
                    else
                    {
                        // Console Feedback
                        Console.WriteLine(" . Duplicated App. Skipped");
                    }
                }

                // Executing Requests for more Play Store Links
                int initialSkip       = 48;
                int currentMultiplier = 1;
                int errorsCount       = 0;
                do
                {
                    // Finding pagToken from HTML
                    var rgxMatch = pagTokenRegex.Match(response);

                    // If there's no match, skips it
                    if (!rgxMatch.Success)
                    {
                        break;
                    }

                    // Reading Match from Regex, and applying needed replacements
                    string pagToken = rgxMatch.Value.Replace(":S:", "%3AS%3A").Replace("\\42", String.Empty).Replace(@"\\u003d", String.Empty);

                    // Assembling new PostData with paging values
                    string postData = String.Format(Consts.POST_DATA, pagToken);

                    // Executing request for values
                    response = server.Post(String.Format(Consts.CRAWL_URL, searchField), postData);

                    // Checking Server Status
                    if (server.StatusCode != System.Net.HttpStatusCode.OK)
                    {
                        LogWriter.Error("Http Error", "Status Code [ " + server.StatusCode + " ]");
                        errorsCount++;
                        continue;
                    }

                    // Parsing Links
                    foreach (string url in parser.ParseAppUrls(response))
                    {
                        // Checks whether the app have been already processed
                        // or is queued to be processed
                        if ((!mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued(url)))
                        {
                            // Console Feedback
                            Console.WriteLine(" . Queued App");

                            // Than, queue it :)
                            mongoDB.AddToQueue(url);
                            Thread.Sleep(250);  // Hiccup
                        }
                        else
                        {
                            // Console Feedback
                            Console.WriteLine(" . Duplicated App. Skipped");
                        }
                    }

                    // Incrementing Paging Multiplier
                    currentMultiplier++;
                }  while (parser.AnyResultFound(response) && errorsCount <= Consts.MAX_REQUEST_ERRORS);
            }
        }
Example #3
0
        /// <summary>
        /// Executes a Search using the searchField as the search parameter,
        /// paginates / scrolls the search results to the end adding all the url of apps
        /// it finds to a AWS SQS queue
        /// </summary>
        /// <param name="searchField"></param>
        private static void CrawlStore(string searchField)
        {
            // Console Feedback
            Console.WriteLine("Crawling Search Term : [ " + searchField + " ]");

            // HTML Response
            string response;

            // MongoDB Helper
            // Configuring MongoDB Wrapper
            MongoDBWrapper mongoDB           = new MongoDBWrapper();
            string         fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);

            mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

            // Response Parser
            PlayStoreParser parser = new PlayStoreParser();

            // Executing Web Requests
            using (WebRequests server = new WebRequests())
            {
                // Creating Request Object
                server.Host = Consts.HOST;

                // Executing Initial Request
                response = server.Post(Consts.CRAWL_URL, Consts.INITIAL_POST_DATA);

                // Parsing Links out of Html Page (Initial Request)
                foreach (string url in parser.ParseAppUrls(response))
                {
                    // Checks whether the app have been already processed
                    // or is queued to be processed
                    if ((!mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued(url)))
                    {
                        // Console Feedback
                        Console.WriteLine(" . Queued App");

                        // Than, queue it :)
                        mongoDB.AddToQueue(url);
                    }
                    else
                    {
                        // Console Feedback
                        Console.WriteLine(" . Duplicated App. Skipped");
                    }
                }

                // Executing Requests for more Play Store Links
                int initialSkip       = 48;
                int currentMultiplier = 1;
                int errorsCount       = 0;
                do
                {
                    // Assembling new PostData with paging values
                    string postData = String.Format(Consts.POST_DATA, (initialSkip * currentMultiplier));

                    // Executing request for values
                    response = server.Post(Consts.CRAWL_URL, postData);

                    // Checking Server Status
                    if (server.StatusCode != System.Net.HttpStatusCode.OK)
                    {
                        LogWriter.Error("Http Error", "Status Code [ " + server.StatusCode + " ]");
                        errorsCount++;
                        continue;
                    }

                    // Parsing Links
                    foreach (string url in parser.ParseAppUrls(response))
                    {
                        // Checks whether the app have been already processed
                        // or is queued to be processed
                        if ((!mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued(url)))
                        {
                            // Console Feedback
                            Console.WriteLine(" . Queued App");

                            // Than, queue it :)
                            mongoDB.AddToQueue(url);
                        }
                        else
                        {
                            // Console Feedback
                            Console.WriteLine(" . Duplicated App. Skipped");
                        }
                    }

                    // Incrementing Paging Multiplier
                    currentMultiplier++;
                }  while (parser.AnyResultFound(response) && errorsCount <= Consts.MAX_REQUEST_ERRORS);
            }
        }