/// <summary>
/// Entry point of the worker piece of the process.
/// Repeatedly takes a queued app from MongoDB, downloads and parses its page, stores the
/// parsed app and queues every related app that is neither processed nor queued yet.
/// Notice that you can run as many workers as you want to in order to make the crawling faster
/// </summary>
/// <param name="args">
/// Optional. When exactly one argument is received it is treated as the path of a text file
/// whose lines are handed to ProxiesLoader.Load, and requests are issued through proxies.
/// </param>
static void Main(string[] args)
{
    // Configuring Log Object
    LogSetup.InitializeLog("PlayStoreWorker.log", "info");
    Logger logger = LogManager.GetCurrentClassLogger();
    logger.Info("Worker Started");

    // Control Variable (Bool - Should the process use proxies? )
    bool isUsingProxies = false;

    // Checking for the need to use proxies
    if (args != null && args.Length == 1)
    {
        isUsingProxies = true;

        // Loading proxies from .txt received as argument
        String fPath = args[0];

        // Sanity Check
        if (!File.Exists(fPath))
        {
            logger.Fatal("Couldnt find proxies on path : " + fPath);
            System.Environment.Exit(-100);
        }

        // Reading Proxies from File
        string[] fLines = File.ReadAllLines(fPath, Encoding.GetEncoding("UTF-8"));

        try
        {
            // Actual Load of Proxies
            ProxiesLoader.Load(fLines.ToList());
        }
        catch (Exception ex)
        {
            logger.Fatal(ex);
            System.Environment.Exit(-101);
        }
    }

    // Parser
    PlayStoreParser parser = new PlayStoreParser();

    // Configuring MongoDB Wrapper
    MongoDBWrapper mongoDB = new MongoDBWrapper();
    string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Creating Instance of Web Requests Server
    WebRequests server = new WebRequests();

    // Queued App Model
    QueuedApp app;

    // Retry Counter (Used for exponential wait increasing logic)
    int retryCounter = 0;

    // Iterating over queued records until FindAndModify returns no document to be processed
    while ((app = mongoDB.FindAndModify()) != null)
    {
        try
        {
            // Building APP URL: queue entries without "http" are relative and need the prefix
            string appUrl = app.Url;

            if (app.Url.IndexOf("http", StringComparison.OrdinalIgnoreCase) < 0)
            {
                appUrl = Consts.APP_URL_PREFIX + app.Url;
            }

            // Checking if this app is on the database already
            if (mongoDB.AppProcessed(appUrl))
            {
                // Console Feedback, Comment this line to disable if you want to
                logger.Info("Duplicated App, skipped.");

                // Delete it from the queue and continues the loop
                mongoDB.RemoveFromQueue(app.Url);
                continue;
            }

            // Configuring server and Issuing Request
            server.Headers.Add(Consts.ACCEPT_LANGUAGE);
            server.Host = Consts.HOST;
            server.UserAgent = Consts.GITHUBURL;
            server.Encoding = "utf-8";
            server.EncodingDetection = WebRequests.CharsetDetection.DefaultCharset;

            // Checking for the need to use "HTTP Proxies"
            if (isUsingProxies)
            {
                server.Proxy = ProxiesLoader.GetWebProxy();
            }

            // Issuing HTTP Request
            string response = server.Get(appUrl);

            // Sanity Check
            if (String.IsNullOrEmpty(response) || server.StatusCode != System.Net.HttpStatusCode.OK)
            {
                logger.Info("Error opening app page : " + appUrl);

                // The status of the failed request must be read BEFORE the request object is
                // replaced below, otherwise the "404" check would inspect the brand new object
                var failedStatusCode = server.StatusCode;

                // Renewing WebRequest Object to get rid of Cookies
                server = new WebRequests();

                // Checking which "Waiting Logic" to use - If there are proxies being used, there's no need to wait too much
                // If there are no proxies in use, on the other hand, the process must wait more time
                TimeSpan waitTime;

                if (isUsingProxies)
                {
                    // Waits two seconds everytime
                    waitTime = TimeSpan.FromSeconds(2);
                }
                else
                {
                    retryCounter++;

                    // Exponential wait ( 2 ^ retryCounter seconds ), capped at 20 minutes from the 8th retry on
                    waitTime = retryCounter >= 8
                        ? TimeSpan.FromMinutes(20)
                        : TimeSpan.FromSeconds(Math.Pow(2, retryCounter));
                }

                // Hiccup to avoid google blocking connections in case of heavy traffic from the same IP
                logger.Info("======================================================");
                logger.Info("\n\tFallback : " + waitTime.TotalSeconds + " Seconds");
                Thread.Sleep(waitTime);

                // If The Status code is "ZERO" (it means 404) - App must be removed from "Queue"
                // NOTE(review): "zero means 404" is taken from the original comment - confirm against WebRequests
                if (failedStatusCode == 0)
                {
                    // Console Feedback
                    logger.Info("\tApp Not Found (404) - " + app.Url);
                    mongoDB.RemoveFromQueue(app.Url);
                }

                logger.Info("======================================================");
            }
            else
            {
                // Reseting retry counter
                retryCounter = 0;

                // Parsing Useful App Data
                AppModel parsedApp = parser.ParseAppPage(response, appUrl);

                // Full (prefixed) urls of the apps related to this one
                List<String> relatedApps = new List<String>();

                // Avoiding Exceptions caused by "No Related Apps" situations - Must be treated differently
                try
                {
                    // Parsing "Related Apps" and "More From Developer" Apps (URLS Only)
                    foreach (string extraAppUrl in parser.ParseExtraApps(response))
                    {
                        relatedApps.Add(Consts.APP_URL_PREFIX + extraAppUrl);
                    }

                    // Adding "Related Apps" to Apps Model
                    parsedApp.RelatedUrls = relatedApps.Distinct().ToArray();
                }
                catch
                {
                    logger.Info("\tNo Related Apps Found. Skipping");
                }

                // Inserting App into Mongo DB Database
                if (!mongoDB.Insert<AppModel>(parsedApp))
                {
                    // If the insert failed, do not remove the app from the queue, instead, keep it and
                    // flag it as not busy so that other workers can try to process it later
                    mongoDB.ToggleBusyApp(app, false);
                }
                else
                {
                    // Console Feedback, Comment this line to disable if you want to
                    Console.ForegroundColor = ConsoleColor.Red;
                    logger.Info("Inserted App : " + parsedApp.Name);
                    Console.ForegroundColor = ConsoleColor.White;

                    mongoDB.RemoveFromQueue(app.Url);
                }

                // Counters for console feedback only
                int extraAppsCounter = 0, newExtraApps = 0;

                // Queueing "Related Apps" and "More From Developer" Apps (URLS Only)
                foreach (string extraAppUrl in relatedApps)
                {
                    extraAppsCounter++;

                    // Entries of relatedApps were already prefixed with APP_URL_PREFIX when they were
                    // collected above, so they must not be prefixed a second time here
                    string fullExtraAppUrl = extraAppUrl;

                    // Checking if the app was either processed or queued to be processed already
                    if ((!mongoDB.AppProcessed(fullExtraAppUrl)) && (!mongoDB.IsAppOnQueue(extraAppUrl)))
                    {
                        newExtraApps++;

                        // Adds it to the queue of apps to be processed
                        mongoDB.AddToQueue(extraAppUrl);
                    }
                }

                // Console Feedback
                logger.Info("Queued " + newExtraApps + " / " + extraAppsCounter + " related apps");
            }
        }
        catch (Exception ex)
        {
            logger.Error(ex);
        }
        finally
        {
            try
            {
                // Toggles Busy status back to false
                mongoDB.ToggleBusyApp(app, false);
            }
            catch (Exception ex)
            {
                // Toggle Busy App may raise an exception in case of lack of internet connection, so
                // this "inner catch" keeps the worker loop alive when that happens
                logger.Error(ex);
            }
        }
    }
}
/// <summary>
/// Executes a Search using the searchField as the search parameter,
/// paginates / scrolls the search results to the end adding all the url of apps
/// it finds to the MongoDB queue (via MongoDBWrapper.AddToQueue)
/// </summary>
/// <param name="searchField">Search term formatted into Consts.CRAWL_URL</param>
private static void CrawlStore(string searchField)
{
    // Console Feedback
    Console.WriteLine("Crawling Search Term : [ " + searchField + " ]");

    // Compiling Regular Expression used to parse the "pagToken" out of the Play Store
    Regex pagTokenRegex = new Regex(@"GAEi+.+\:S\:.{11}\\42", RegexOptions.Compiled);

    // Configuring MongoDB Wrapper
    MongoDBWrapper mongoDB = new MongoDBWrapper();
    string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Ensuring the database has the proper index
    mongoDB.EnsureIndex("Url");

    // Response Parser
    PlayStoreParser parser = new PlayStoreParser();

    // Executing Web Requests
    using (WebRequests server = new WebRequests())
    {
        // Creating Request Object
        server.Host = Consts.HOST;

        // The search url does not change between pages, only the post data does
        string crawlUrl = String.Format(Consts.CRAWL_URL, searchField);

        // Executing Initial Request. "response" always holds the last page that was accepted,
        // which is the page the next paging token is read from
        string response = server.Post(crawlUrl, Consts.INITIAL_POST_DATA);

        // Parsing Links out of Html Page (Initial Request)
        foreach (string url in parser.ParseAppUrls(response))
        {
            // Checks whether the app have been already processed
            // or is queued to be processed
            if ((!mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued(url)))
            {
                // Console Feedback
                Console.WriteLine(" . Queued App");

                // Than, queue it :)
                mongoDB.AddToQueue(url);
                Thread.Sleep(250); // Hiccup
            }
            else
            {
                // Console Feedback
                Console.WriteLine(" . Duplicated App. Skipped");
            }
        }

        // Executing Requests for more Play Store Links
        int errorsCount = 0;

        do
        {
            // Regex.Match throws on a null input, so an empty response simply ends the paging
            if (String.IsNullOrEmpty(response))
            {
                break;
            }

            // Finding pagToken from HTML
            var rgxMatch = pagTokenRegex.Match(response);

            // If there's no match, there is no next page to ask for
            if (!rgxMatch.Success)
            {
                break;
            }

            // Reading Match from Regex, and applying needed replacements
            // NOTE(review): the last Replace uses a verbatim string, so it searches for TWO backslashes
            // before "u003d", unlike the single-backslash "\\42" next to it - confirm which form is intended
            string pagToken = rgxMatch.Value.Replace(":S:", "%3AS%3A").Replace("\\42", String.Empty).Replace(@"\\u003d", String.Empty);

            // Assembling new PostData with paging values
            string postData = String.Format(Consts.POST_DATA, pagToken);

            // Executing request for values
            string pageResponse = server.Post(crawlUrl, postData);

            // Checking Server Status
            if (server.StatusCode != System.Net.HttpStatusCode.OK)
            {
                LogWriter.Error("Http Error", "Status Code [ " + server.StatusCode + " ]");
                errorsCount++;

                // "continue" jumps to the loop condition. The failed body is NOT stored in "response",
                // so the condition and the next token lookup still see the last good page and the
                // same page is requested again until the error budget is exhausted
                continue;
            }

            // Request worked, this page becomes the source of the next paging token
            response = pageResponse;

            // Parsing Links
            foreach (string url in parser.ParseAppUrls(response))
            {
                // Checks whether the app have been already processed
                // or is queued to be processed
                if ((!mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued(url)))
                {
                    // Console Feedback
                    Console.WriteLine(" . Queued App");

                    // Than, queue it :)
                    mongoDB.AddToQueue(url);
                    Thread.Sleep(250); // Hiccup
                }
                else
                {
                    // Console Feedback
                    Console.WriteLine(" . Duplicated App. Skipped");
                }
            }
        } while (parser.AnyResultFound(response) && errorsCount <= Consts.MAX_REQUEST_ERRORS);
    }
}
/// <summary>
/// Entry point of the worker piece of the process.
/// Repeatedly takes a queued app from MongoDB, downloads and parses its page, upserts the
/// parsed app (keyed by "Url") and queues every related app that is neither processed nor queued yet.
/// Notice that you can run as many workers as you want to in order to make the crawling faster
/// </summary>
/// <param name="args">
/// Optional. When exactly one argument is received it is treated as the path of a text file
/// whose lines are handed to ProxiesLoader.Load, and requests are issued through proxies.
/// </param>
static void Main(string[] args)
{
    // Configuring Log Object
    LogSetup.InitializeLog("PlayStoreWorker.log", "info");
    Logger logger = LogManager.GetCurrentClassLogger();
    logger.Info("Worker Started");

    // Control Variable (Bool - Should the process use proxies? )
    bool isUsingProxies = false;

    // Checking for the need to use proxies
    if (args != null && args.Length == 1)
    {
        isUsingProxies = true;

        // Loading proxies from .txt received as argument
        String fPath = args[0];

        // Sanity Check
        if (!File.Exists(fPath))
        {
            logger.Fatal("Couldnt find proxies on path : " + fPath);
            System.Environment.Exit(-100);
        }

        // Reading Proxies from File
        string[] fLines = File.ReadAllLines(fPath, Encoding.GetEncoding("UTF-8"));

        try
        {
            // Actual Load of Proxies
            ProxiesLoader.Load(fLines.ToList());
        }
        catch (Exception ex)
        {
            logger.Fatal(ex);
            System.Environment.Exit(-101);
        }
    }

    // Parser
    PlayStoreParser parser = new PlayStoreParser();

    // Configuring MongoDB Wrapper
    MongoDBWrapper mongoDB = new MongoDBWrapper();
    string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    /*
     * // populate min downloaded & max downloaded
     * int count = 0;
     * var apps = mongoDB.FindAll<AppModel>();
     * foreach(var a in apps)
     * {
     *     a.FillMinAndMaxInstalls(); ++count;
     *
     *     if((count % 100) == 0)
     *     {
     *         Console.WriteLine("updated {0}", count);
     *     }
     *
     *     if (!mongoDB.UpsertKeyEq<AppModel>(a, "Url", a.Url))
     *     {
     *         Console.WriteLine("UpsertKey failed");
     *     }
     * }
     */

    // Creating Instance of Web Requests Server
    WebRequests server = new WebRequests();

    // Queued App Model
    QueuedApp app;

    // Retry Counter (Used for exponential wait increasing logic)
    int retryCounter = 0;

    // Iterating over queued records until FindAndModify returns no document to be processed
    while ((app = mongoDB.FindAndModify()) != null)
    {
        try
        {
            // Building APP URL: queue entries without "http" are relative and need the prefix
            string appUrl = app.Url;

            if (app.Url.IndexOf("http", StringComparison.OrdinalIgnoreCase) < 0)
            {
                appUrl = Consts.APP_URL_PREFIX + app.Url;
            }

            // Checking if this app is on the database already
            if (mongoDB.AppProcessed(appUrl))
            {
                // Console Feedback, Comment this line to disable if you want to
                logger.Info("Duplicated App, skipped.");

                // Delete it from the queue and continues the loop
                mongoDB.RemoveFromQueue(app.Url);
                continue;
            }

            // Configuring server and Issuing Request
            server.Headers.Add(Consts.ACCEPT_LANGUAGE);
            server.Host = Consts.HOST;
            server.UserAgent = Consts.GITHUBURL;
            server.Encoding = "utf-8";
            server.EncodingDetection = WebRequests.CharsetDetection.DefaultCharset;

            // Checking for the need to use "HTTP Proxies"
            if (isUsingProxies)
            {
                server.Proxy = ProxiesLoader.GetWebProxy();
            }

            // Issuing HTTP Request
            string response = server.Get(appUrl);

            // Sanity Check
            if (String.IsNullOrEmpty(response) || server.StatusCode != System.Net.HttpStatusCode.OK)
            {
                logger.Info("Error opening app page : " + appUrl);

                // Moving on to the next proxy after a failed request
                if (isUsingProxies)
                {
                    ProxiesLoader.IncrementCurrentProxy();
                }

                // The status of the failed request must be read BEFORE the request object is
                // replaced below, otherwise the "404" check would inspect the brand new object
                var failedStatusCode = server.StatusCode;

                // Renewing WebRequest Object to get rid of Cookies
                server = new WebRequests();

                // Checking which "Waiting Logic" to use - If there are proxies being used, there's no need to wait too much
                // If there are no proxies in use, on the other hand, the process must wait more time
                TimeSpan waitTime;

                if (isUsingProxies)
                {
                    // Waits two seconds everytime
                    waitTime = TimeSpan.FromSeconds(2);
                }
                else
                {
                    retryCounter++;

                    // Exponential wait ( 2 ^ retryCounter seconds ), capped at 20 minutes from the 8th retry on
                    waitTime = retryCounter >= 8
                        ? TimeSpan.FromMinutes(20)
                        : TimeSpan.FromSeconds(Math.Pow(2, retryCounter));
                }

                // Hiccup to avoid google blocking connections in case of heavy traffic from the same IP
                logger.Info("======================================================");
                logger.Info("\n\tFallback : " + waitTime.TotalSeconds + " Seconds");
                Thread.Sleep(waitTime);

                // If The Status code is "ZERO" (it means 404) - App must be removed from "Queue"
                // NOTE(review): "zero means 404" is taken from the original comment - confirm against WebRequests
                if (failedStatusCode == 0)
                {
                    // Console Feedback
                    logger.Info("\tApp Not Found (404) - " + app.Url);
                    mongoDB.RemoveFromQueue(app.Url);
                }

                logger.Info("======================================================");
            }
            else
            {
                // Reseting retry counter
                retryCounter = 0;

                // Parsing Useful App Data
                AppModel parsedApp = parser.ParseAppPage(response, appUrl);

                // Normalizing URLs: dropping the google redirect prefix from the privacy policy link
                if (!String.IsNullOrWhiteSpace(parsedApp.DeveloperPrivacyPolicy))
                {
                    parsedApp.DeveloperPrivacyPolicy = parsedApp.DeveloperPrivacyPolicy.Replace("https://www.google.com/url?q=", String.Empty);
                }

                if (!String.IsNullOrWhiteSpace(parsedApp.DeveloperWebsite))
                {
                    parsedApp.DeveloperNormalizedDomain = parser.NormalizeDomainName(parsedApp.DeveloperWebsite);
                }

                // Full (prefixed) urls of the apps related to this one
                List<String> relatedApps = new List<String>();

                // Avoiding Exceptions caused by "No Related Apps" situations - Must be treated differently
                try
                {
                    // Parsing "Related Apps" and "More From Developer" Apps (URLS Only)
                    foreach (string extraAppUrl in parser.ParseExtraApps(response))
                    {
                        relatedApps.Add(Consts.APP_URL_PREFIX + extraAppUrl);
                    }

                    // Adding "Related Apps" to Apps Model
                    parsedApp.RelatedUrls = relatedApps.Distinct().ToArray();
                }
                catch
                {
                    logger.Info("\tNo Related Apps Found. Skipping");
                }

                // Upserting App into Mongo DB Database
                if (!mongoDB.UpsertKeyEq<AppModel>(parsedApp, "Url", appUrl))
                {
                    // If the upsert failed, do not remove the app from the queue, instead, keep it and
                    // flag it as not busy so that other workers can try to process it later
                    mongoDB.ToggleBusyApp(app, false);
                }
                else
                {
                    // Console Feedback, Comment this line to disable if you want to
                    Console.ForegroundColor = ConsoleColor.Red;
                    logger.Info("Inserted App : " + parsedApp.Name);
                    Console.ForegroundColor = ConsoleColor.White;

                    mongoDB.RemoveFromQueue(app.Url);
                }

                // Counters for console feedback only
                int extraAppsCounter = 0, newExtraApps = 0;

                // Queueing "Related Apps" and "More From Developer" Apps (URLS Only)
                foreach (string extraAppUrl in relatedApps)
                {
                    extraAppsCounter++;

                    // Assembling Full app Url to check with database, without prefixing twice.
                    // Ordinal comparison: urls are identifiers, not linguistic text
                    string fullExtraAppUrl;

                    if (extraAppUrl.IndexOf("https://play.google.com/", StringComparison.Ordinal) >= 0)
                    {
                        fullExtraAppUrl = extraAppUrl;
                    }
                    else
                    {
                        fullExtraAppUrl = Consts.APP_URL_PREFIX + extraAppUrl;
                    }

                    // Checking if the app was either processed or queued to be processed already
                    if ((!mongoDB.AppProcessed(fullExtraAppUrl)) && (!mongoDB.IsAppOnQueue(extraAppUrl)))
                    {
                        newExtraApps++;

                        // Adds it to the queue of apps to be processed
                        mongoDB.AddToQueue(extraAppUrl);
                    }
                }

                // Console Feedback
                logger.Info("Queued " + newExtraApps + " / " + extraAppsCounter + " related apps");
            }
        }
        catch (Exception ex)
        {
            logger.Error(ex);
        }
        finally
        {
            try
            {
                // Toggles Busy status back to false
                mongoDB.ToggleBusyApp(app, false);
            }
            catch (Exception ex)
            {
                // Toggle Busy App may raise an exception in case of lack of internet connection, so
                // this "inner catch" keeps the worker loop alive when that happens
                logger.Error(ex);
            }
        }
    }
}
/// <summary>
/// Worker entry point: drains the MongoDB queue of apps, one app per iteration.
/// Each app page is downloaded and parsed, the parsed app is inserted into the database and
/// the related apps found on the page are queued when they are neither processed nor queued.
/// Several workers can run side by side to make the crawling faster.
/// </summary>
/// <param name="args">Command line arguments (not read by this method)</param>
static void Main(string[] args)
{
    // Log threshold + startup message
    LogWriter.Threshold = TLogEventLevel.Information;
    LogWriter.Info("Worker Started");

    PlayStoreParser pageParser = new PlayStoreParser();

    // MongoDB wrapper pointing at "server:port"
    MongoDBWrapper mongoDB = new MongoDBWrapper();
    string serverAndPort = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, serverAndPort, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    WebRequests server = new WebRequests();

    // Number of consecutive failed requests, drives the exponential wait
    int failedAttempts = 0;

    // Keeps going until the queue hands out no more documents
    QueuedApp app = mongoDB.FindAndModify();

    for (; app != null; app = mongoDB.FindAndModify())
    {
        try
        {
            string pageUrl = Consts.APP_URL_PREFIX + app.Url;

            // Apps that are already stored only need to leave the queue
            if (mongoDB.AppProcessed(pageUrl))
            {
                Console.WriteLine("Duplicated App, skipped.");
                mongoDB.RemoveFromQueue(app.Url);
                continue;
            }

            // Request setup
            server.Headers.Add(Consts.ACCEPT_LANGUAGE);
            server.Host = Consts.HOST;
            server.Encoding = "utf-8";
            server.EncodingDetection = WebRequests.CharsetDetection.DefaultCharset;

            string html = server.Get(pageUrl);

            bool pageLoaded = !String.IsNullOrEmpty(html) && server.StatusCode == System.Net.HttpStatusCode.OK;

            if (pageLoaded)
            {
                failedAttempts = 0;

                AppModel appData = pageParser.ParseAppPage(html, pageUrl);

                if (mongoDB.Insert<AppModel>(appData))
                {
                    // Stored: the app can leave the queue
                    Console.WriteLine("Inserted App : " + appData.Name);
                    mongoDB.RemoveFromQueue(app.Url);
                }
                else
                {
                    // Not stored: stays queued, flagged as not busy so another worker can retry it
                    mongoDB.ToggleBusyApp(app, false);
                }

                // Feedback-only counters
                int seen = 0;
                int queued = 0;

                // "Related Apps" and "More From Developer" urls found on the page
                foreach (string relatedUrl in pageParser.ParseExtraApps(html))
                {
                    seen++;

                    string relatedFullUrl = Consts.APP_URL_PREFIX + relatedUrl;

                    // Already stored or already queued: nothing to do
                    if (mongoDB.AppProcessed(relatedFullUrl) || mongoDB.IsAppOnQueue(relatedUrl))
                    {
                        continue;
                    }

                    queued++;
                    mongoDB.AddToQueue(relatedUrl);
                }

                Console.WriteLine("Queued " + queued + " / " + seen + " related apps");
            }
            else
            {
                LogWriter.Info("Error opening app page : " + pageUrl);

                // A fresh request object drops the cookies of the failed one
                server = new WebRequests();

                failedAttempts++;
                Console.WriteLine("Retrying:" + failedAttempts);

                double waitMilliseconds;

                if (failedAttempts < 11)
                {
                    // 2 ^ failedAttempts seconds
                    waitMilliseconds = TimeSpan.FromSeconds(Math.Pow(2, failedAttempts)).TotalMilliseconds;
                }
                else
                {
                    // Too many failures in a row: long pause, and the app leaves the queue
                    // (its page may have expired)
                    waitMilliseconds = TimeSpan.FromMinutes(35).TotalMilliseconds;
                    mongoDB.RemoveFromQueue(app.Url);
                }

                // Pause to avoid google blocking connections from this IP
                Thread.Sleep(Convert.ToInt32(waitMilliseconds));
            }
        }
        catch (Exception ex)
        {
            LogWriter.Error(ex);
        }
        finally
        {
            // Busy flag is always cleared; the toggle itself may throw (e.g. no connection),
            // which must not take the worker loop down
            try
            {
                mongoDB.ToggleBusyApp(app, false);
            }
            catch (Exception ex)
            {
                LogWriter.Error(ex);
            }
        }
    }
}
/// <summary>
/// Crawls a Play Store category page: requests the category url, then keeps posting paged
/// requests to it, adding every app url that is neither processed nor queued to the MongoDB queue.
/// Paging stops when an already seen url shows up again, when a page brings no app url at all,
/// or when the number of http errors exceeds Consts.MAX_REQUEST_ERRORS.
/// </summary>
/// <param name="categoryUrl">Url of the category page to crawl</param>
/// <param name="categoryName">Name of the category, used for log feedback only</param>
/// <param name="shouldUseProxies">NOTE(review): not read by this method - confirm whether a proxy should be configured here</param>
private static void CrawlCategory(string categoryUrl, string categoryName, bool shouldUseProxies)
{
    // Console Feedback
    _logger.Warn("Crawling Category : [ " + categoryName + " ]");

    // Hashset of urls used to keep track of what's been parsed already
    HashSet<String> foundUrls = new HashSet<String>();

    // Control variable to avoid "Loop" on pagging
    bool isDonePagging = false;

    // HTML Response
    string response;

    // Configuring MongoDB Wrapper
    MongoDBWrapper mongoDB = new MongoDBWrapper();
    string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Ensuring the database has the proper index
    mongoDB.EnsureIndex("Url");

    // Response Parser
    PlayStoreParser parser = new PlayStoreParser();

    // Executing Web Requests
    using (WebRequests server = new WebRequests())
    {
        // Creating Request Object
        server.Headers.Add(Consts.ACCEPT_LANGUAGE);
        server.Host = Consts.HOST;
        server.UserAgent = Consts.GITHUBURL;
        server.Encoding = "utf-8";

        // Executing Initial Request
        response = server.Get(categoryUrl);

        // Parsing Links out of Html Page (Initial Request)
        foreach (string url in parser.ParseAppUrls(response))
        {
            // Saving found url on local hashset
            foundUrls.Add(url);

            // Checks whether the app have been already processed
            // or is queued to be processed
            if ((!mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued(url)))
            {
                // Than, queue it :)
                mongoDB.AddToQueue(url);
            }
        }

        // Plain concatenation: the url is not a format string, and String.Format would throw
        // on any '{' or '}' it might contain
        string pagingUrl = categoryUrl + "?authuser=0";

        // Executing Requests for more Play Store Links
        int baseSkip = 60;
        int currentMultiplier = 1;
        int errorsCount = 0;

        do
        {
            // Assembling new PostData with paging values
            string postData = String.Format(Consts.CATEGORIES_POST_DATA, (currentMultiplier * baseSkip), baseSkip);

            // Executing request for values
            response = server.Post(pagingUrl, postData);

            // Checking Server Status
            if (server.StatusCode != System.Net.HttpStatusCode.OK)
            {
                _logger.Error("Http Error" + " - Status Code [ " + server.StatusCode + " ]");
                errorsCount++;

                // The multiplier is untouched, so the same page is requested again
                continue;
            }

            // Number of app urls this page brought, used to detect the end of the listing
            int urlsOnPage = 0;

            // Parsing Links
            foreach (string url in parser.ParseAppUrls(response))
            {
                urlsOnPage++;

                // If a certain app is found twice, it means that the "pagging" logic got stuck into a
                // Loop, so all the apps for this category were parsed already
                if (foundUrls.Contains(url))
                {
                    isDonePagging = true;
                    break;
                }

                // Saving found url on local hashset
                foundUrls.Add(url);

                // Checks whether the app have been already processed
                // or is queued to be processed
                if ((!mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued(url)))
                {
                    // Than, queue it :)
                    mongoDB.AddToQueue(url);
                }
            }

            // A successful page without a single app url can never trip the "found twice" check,
            // so without this guard the loop would keep requesting further pages forever
            if (urlsOnPage == 0)
            {
                isDonePagging = true;
            }

            // Incrementing Paging Multiplier
            currentMultiplier++;
        } while (!isDonePagging && errorsCount <= Consts.MAX_REQUEST_ERRORS);
    }
}
/// <summary>
/// Executes a Search request and paginates / scrolls the search results to the end,
/// adding all the url of apps it finds to the MongoDB queue (via MongoDBWrapper.AddToQueue)
/// </summary>
/// <param name="searchField">
/// Search term. NOTE(review): in this method it is only written to the console; the requests
/// post to Consts.CRAWL_URL as is - confirm the term is not supposed to be part of the request.
/// </param>
private static void CrawlStore(string searchField)
{
    // Console Feedback
    Console.WriteLine("Crawling Search Term : [ " + searchField + " ]");

    // Configuring MongoDB Wrapper
    MongoDBWrapper mongoDB = new MongoDBWrapper();
    string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Response Parser
    PlayStoreParser parser = new PlayStoreParser();

    // Executing Web Requests
    using (WebRequests server = new WebRequests())
    {
        // Creating Request Object
        server.Host = Consts.HOST;

        // Executing Initial Request. "response" always holds the last page that was accepted,
        // which is the page the loop condition below is evaluated against
        string response = server.Post(Consts.CRAWL_URL, Consts.INITIAL_POST_DATA);

        // Parsing Links out of Html Page (Initial Request)
        foreach (string url in parser.ParseAppUrls(response))
        {
            // Checks whether the app have been already processed
            // or is queued to be processed
            if ((!mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued(url)))
            {
                // Console Feedback
                Console.WriteLine(" . Queued App");

                // Than, queue it :)
                mongoDB.AddToQueue(url);
            }
            else
            {
                // Console Feedback
                Console.WriteLine(" . Duplicated App. Skipped");
            }
        }

        // Executing Requests for more Play Store Links
        int initialSkip = 48;
        int currentMultiplier = 1;
        int errorsCount = 0;

        do
        {
            // Assembling new PostData with paging values
            string postData = String.Format(Consts.POST_DATA, (initialSkip * currentMultiplier));

            // Executing request for values
            string pageResponse = server.Post(Consts.CRAWL_URL, postData);

            // Checking Server Status
            if (server.StatusCode != System.Net.HttpStatusCode.OK)
            {
                LogWriter.Error("Http Error", "Status Code [ " + server.StatusCode + " ]");
                errorsCount++;

                // "continue" jumps to the loop condition. The failed body is NOT stored in "response"
                // and the multiplier is untouched, so the condition still sees the last good page and
                // the same page is requested again until the error budget is exhausted
                continue;
            }

            // Request worked, this page is the one the loop condition looks at
            response = pageResponse;

            // Parsing Links
            foreach (string url in parser.ParseAppUrls(response))
            {
                // Checks whether the app have been already processed
                // or is queued to be processed
                if ((!mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued(url)))
                {
                    // Console Feedback
                    Console.WriteLine(" . Queued App");

                    // Than, queue it :)
                    mongoDB.AddToQueue(url);
                }
                else
                {
                    // Console Feedback
                    Console.WriteLine(" . Duplicated App. Skipped");
                }
            }

            // Incrementing Paging Multiplier
            currentMultiplier++;
        } while (parser.AnyResultFound(response) && errorsCount <= Consts.MAX_REQUEST_ERRORS);
    }
}
/// <summary>
/// Executes a Search using the searchField as the search parameter,
/// paginates / scrolls the search results to the end adding all the url of apps
/// it finds to the MongoDB queue (via MongoDBWrapper.AddToQueue)
/// </summary>
/// <param name="searchField">Search term formatted into Consts.CRAWL_URL</param>
private static void CrawlStore(string searchField)
{
    // Console Feedback
    Console.WriteLine("Crawling Search Term : [ " + searchField + " ]");

    // Compiling Regular Expression used to parse the "pagToken" out of the Play Store
    Regex pagTokenRegex = new Regex(@"GAEi+.+\:S\:.{11}\\42", RegexOptions.Compiled);

    // Configuring MongoDB Wrapper
    MongoDBWrapper mongoDB = new MongoDBWrapper();
    string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Ensuring the database has the proper index
    mongoDB.EnsureIndex("Url");

    // Response Parser
    PlayStoreParser parser = new PlayStoreParser();

    // Executing Web Requests
    using (WebRequests server = new WebRequests())
    {
        // Creating Request Object
        server.Host = Consts.HOST;

        // Same search url for every page; only the post data changes
        string searchUrl = String.Format(Consts.CRAWL_URL, searchField);

        // Executing Initial Request. From here on "response" is the last page that was accepted,
        // i.e. the page the next paging token has to come from
        string response = server.Post(searchUrl, Consts.INITIAL_POST_DATA);

        // Parsing Links out of Html Page (Initial Request)
        foreach (string url in parser.ParseAppUrls(response))
        {
            // Checks whether the app have been already processed
            // or is queued to be processed
            if ((!mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued(url)))
            {
                // Console Feedback
                Console.WriteLine(" . Queued App");

                // Than, queue it :)
                mongoDB.AddToQueue(url);
                Thread.Sleep(250); // Hiccup
            }
            else
            {
                // Console Feedback
                Console.WriteLine(" . Duplicated App. Skipped");
            }
        }

        // Executing Requests for more Play Store Links
        int errorsCount = 0;

        do
        {
            // An empty response carries no token (and Regex.Match rejects null), so paging ends here
            if (String.IsNullOrEmpty(response))
            {
                break;
            }

            // Finding pagToken from HTML
            var rgxMatch = pagTokenRegex.Match(response);

            // Without a token there is no next page to request
            if (!rgxMatch.Success)
            {
                break;
            }

            // Reading Match from Regex, and applying needed replacements
            // NOTE(review): the last Replace uses a verbatim string, so it searches for TWO backslashes
            // before "u003d", unlike the single-backslash "\\42" next to it - confirm which form is intended
            string pagToken = rgxMatch.Value.Replace(":S:", "%3AS%3A").Replace("\\42", String.Empty).Replace(@"\\u003d", String.Empty);

            // Assembling new PostData with paging values
            string postData = String.Format(Consts.POST_DATA, pagToken);

            // Executing request for values
            string pageResponse = server.Post(searchUrl, postData);

            // Checking Server Status
            if (server.StatusCode != System.Net.HttpStatusCode.OK)
            {
                LogWriter.Error("Http Error", "Status Code [ " + server.StatusCode + " ]");
                errorsCount++;

                // "continue" goes straight to the loop condition. Because the failed body was kept out
                // of "response", the condition and the next token lookup still work on the last good
                // page, so the same page is retried until the error budget runs out
                continue;
            }

            // Page accepted: it is now the source of the next paging token
            response = pageResponse;

            // Parsing Links
            foreach (string url in parser.ParseAppUrls(response))
            {
                // Checks whether the app have been already processed
                // or is queued to be processed
                if ((!mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued(url)))
                {
                    // Console Feedback
                    Console.WriteLine(" . Queued App");

                    // Than, queue it :)
                    mongoDB.AddToQueue(url);
                    Thread.Sleep(250); // Hiccup
                }
                else
                {
                    // Console Feedback
                    Console.WriteLine(" . Duplicated App. Skipped");
                }
            }
        } while (parser.AnyResultFound(response) && errorsCount <= Consts.MAX_REQUEST_ERRORS);
    }
}
/// <summary>
/// Executes a search using the searchField as the search parameter and pages
/// through the results, adding every app url that is neither processed nor
/// queued yet to the queue through <c>_mongoDB.AddToQueue</c>.
/// Paging stops when no paging token is found, when an url shows up a second
/// time, when the parser reports no results, or when too many requests failed.
/// </summary>
/// <param name="searchField">Search term formatted into <c>Consts.CRAWL_URL</c></param>
/// <param name="shouldUseProxies">When true, the requests use the proxy returned by <c>ProxiesLoader.GetWebProxy</c></param>
private static void CrawlStore(string searchField, bool shouldUseProxies)
{
    // Console Feedback
    _logger.Warn("Crawling Search Term : [ " + searchField + " ]");

    // Hashset of urls used to keep track of what's been parsed already
    HashSet<String> foundUrls = new HashSet<String>();

    // Control variable to avoid "Loop" on paging
    bool isDonePagging = false;

    // Regular Expression used to parse the "pagToken" out of the Play Store response
    Regex pagTokenRegex = new Regex(@"GAEi+.+\:S\:.{11}\\x22", RegexOptions.Compiled);

    // Response Parser
    PlayStoreParser parser = new PlayStoreParser();

    // The same url is used by the initial request and by every paging request
    string crawlUrl = String.Format(Consts.CRAWL_URL, searchField);

    using (WebRequests server = new WebRequests())
    {
        // Configuring the request object
        server.Headers.Add(Consts.ACCEPT_LANGUAGE);
        server.Host = Consts.HOST;
        server.UserAgent = Consts.GITHUBURL;
        server.Encoding = "utf-8";

        // Checking for the need to use "HTTP Proxies"
        if (shouldUseProxies)
        {
            server.Proxy = ProxiesLoader.GetWebProxy();
        }

        // Holds the last page that was fetched successfully; the paging token
        // for the next request is always read from it
        string response = server.Post(crawlUrl, Consts.INITIAL_POST_DATA);

        // Parsing Links out of Html Page (Initial Request)
        foreach (string url in parser.ParseAppUrls(response))
        {
            foundUrls.Add(url);

            // Queues the app only if it was neither processed nor queued before
            if ((!_mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!_mongoDB.AppQueued(url)))
            {
                _mongoDB.AddToQueue(url);
                Thread.Sleep(250); // Hiccup
            }
        }

        // Executing Requests for more Play Store Links
        int errorsCount = 0;

        do
        {
            // Finding pagToken in the last successful response
            var rgxMatch = pagTokenRegex.Match(response);

            // Without a token there is no next page to ask for
            if (!rgxMatch.Success)
            {
                break;
            }

            // Reading Match from Regex, and applying needed replacements
            string pagToken = rgxMatch.Value.Replace(":S:", "%3AS%3A").Replace("\\x22", String.Empty).Replace(@"\\u003d", String.Empty);

            // Assembling new PostData with paging values
            string postData = String.Format(Consts.POST_DATA, pagToken);

            // Kept in its own variable so that a failed request does not wipe the
            // page the token came from
            string pageResponse = server.Post(crawlUrl, postData);

            // Checking Server Status
            if (server.StatusCode != System.Net.HttpStatusCode.OK)
            {
                _logger.Error("Http Error" + " - Status Code [ " + server.StatusCode + " ]");
                errorsCount++;

                // "response" is untouched, so the next iteration retries the same token
                continue;
            }

            response = pageResponse;

            // Parsing Links
            foreach (string url in parser.ParseAppUrls(response))
            {
                // An url seen before means the paging started to repeat itself
                if (foundUrls.Contains(url))
                {
                    isDonePagging = true;
                    break;
                }

                foundUrls.Add(url);

                // Queues the app only if it was neither processed nor queued before
                if ((!_mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!_mongoDB.AppQueued(url)))
                {
                    _mongoDB.AddToQueue(url);
                    Thread.Sleep(250); // Hiccup
                }
            }
        } while (!isDonePagging && parser.AnyResultFound(response) && errorsCount <= Consts.MAX_REQUEST_ERRORS);
    }
}
/// <summary>
/// Entry point of the worker piece of the process.
/// Takes queued apps from MongoDB one at a time (until <c>FindAndModify</c> returns null),
/// downloads and parses each app page, stores the parsed app and queues the
/// "extra" apps found on the page.
/// Notice that you can run as many workers as you want to in order to make the crawling faster
/// </summary>
/// <param name="args">Not used by this version of the worker</param>
static void Main(string[] args)
{
    // Configuring Log Object Threshold
    LogWriter.Threshold = TLogEventLevel.Information;
    LogWriter.Info ("Worker Started");

    // Parser
    PlayStoreParser parser = new PlayStoreParser ();

    // Configuring MongoDB Wrapper
    MongoDBWrapper mongoDB = new MongoDBWrapper ();
    string fullServerAddress = String.Join (":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase (Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Creating Instance of Web Requests Server
    WebRequests server = new WebRequests ();

    QueuedApp app;

    // Retry Counter (Used for exponential wait increasing logic).
    // It counts consecutive failed requests and is only reset by a successful one.
    int retryCounter = 0;

    try
    {
        // Iterating Over MongoDB Records until no document is found to be processed
        while ((app = mongoDB.FindAndModify ()) != null)
        {
            try
            {
                // Building APP URL
                string appUrl = Consts.APP_URL_PREFIX + app.Url;

                // Checking if this app is on the database already
                if (mongoDB.AppProcessed (appUrl))
                {
                    // Console Feedback, Comment this line to disable if you want to
                    Console.WriteLine ("Duplicated App, skipped.");

                    // Delete it from the queue and continues the loop
                    mongoDB.RemoveFromQueue (app.Url);
                    continue;
                }

                // Configuring server and Issuing Request
                server.Headers.Add (Consts.ACCEPT_LANGUAGE);
                server.Host = Consts.HOST;
                server.Encoding = "utf-8";
                server.EncodingDetection = WebRequests.CharsetDetection.DefaultCharset;
                string response = server.Get (appUrl);

                // Sanity Check
                if (String.IsNullOrEmpty (response) || server.StatusCode != System.Net.HttpStatusCode.OK)
                {
                    LogWriter.Info ("Error opening app page : " + appUrl);

                    // Renewing WebRequest Object to get rid of Cookies.
                    // The old instance is disposed first so it is not leaked.
                    server.Dispose ();
                    server = new WebRequests ();

                    // Inc. retry counter
                    retryCounter++;
                    Console.WriteLine ("Retrying:" + retryCounter);

                    // Checking for maximum retry count
                    double waitTime;
                    if (retryCounter >= 11)
                    {
                        waitTime = TimeSpan.FromMinutes (35).TotalMilliseconds;

                        // Removing App from the queue (the app page may have expired).
                        // The un-prefixed url is used, exactly like the other
                        // RemoveFromQueue calls of this method.
                        mongoDB.RemoveFromQueue (app.Url);
                    }
                    else
                    {
                        // Calculating next wait time ( 2 ^ retryCounter seconds)
                        waitTime = TimeSpan.FromSeconds (Math.Pow (2, retryCounter)).TotalMilliseconds;
                    }

                    // Hiccup to avoid google blocking connections in case of heavy traffic from the same IP
                    Thread.Sleep (Convert.ToInt32 (waitTime));
                }
                else
                {
                    // Resetting retry counter
                    retryCounter = 0;

                    // Parsing Useful App Data
                    AppModel parsedApp = parser.ParseAppPage (response, appUrl);

                    // Inserting App into Mongo DB Database
                    bool processingWorked = mongoDB.Insert<AppModel> (parsedApp);

                    if (!processingWorked)
                    {
                        // Keeps the app on the queue and flags it as not busy
                        // so that other workers can try to process it later
                        mongoDB.ToggleBusyApp (app, false);
                    }
                    else
                    {
                        // Console Feedback, Comment this line to disable if you want to
                        Console.WriteLine ("Inserted App : " + parsedApp.Name);

                        // Processing worked, so the app leaves the queue
                        mongoDB.RemoveFromQueue (app.Url);
                    }

                    // Counters for console feedback only
                    int extraAppsCounter = 0, newExtraApps = 0;

                    // Parsing "Related Apps" and "More From Developer" Apps (URLS Only)
                    foreach (string extraAppUrl in parser.ParseExtraApps (response))
                    {
                        // Incrementing counter of extra apps
                        extraAppsCounter++;

                        // Assembling Full app Url to check with database
                        string fullExtraAppUrl = Consts.APP_URL_PREFIX + extraAppUrl;

                        // Checking if the app was either processed or queued to be processed already
                        if ((!mongoDB.AppProcessed (fullExtraAppUrl)) && (!mongoDB.IsAppOnQueue (extraAppUrl)))
                        {
                            // Incrementing counter of inserted apps
                            newExtraApps++;

                            // Adds it to the queue of apps to be processed
                            mongoDB.AddToQueue (extraAppUrl);
                        }
                    }

                    // Console Feedback
                    Console.WriteLine ("Queued " + newExtraApps + " / " + extraAppsCounter + " related apps");
                }
            }
            catch (Exception ex)
            {
                LogWriter.Error (ex);
            }
            finally
            {
                try
                {
                    // Toggles Busy status back to false
                    mongoDB.ToggleBusyApp (app, false);
                }
                catch (Exception ex)
                {
                    // Toggle Busy App may raise an exception in case of lack of internet connection,
                    // so this "inner catch" keeps the worker loop alive
                    LogWriter.Error (ex);
                }
            }
        }
    }
    finally
    {
        // Releases the last WebRequests instance once the queue is drained
        server.Dispose ();
    }
}
/// <summary>
/// Entry point of the crawler.
/// Optionally loads proxies, configures MongoDB and its indexes, crawls the
/// bootstrap categories and search terms, and finally re-queues the distinct
/// app urls found on past collections.
/// </summary>
/// <param name="args">
/// When exactly one argument is received, it is used as the path of the text file
/// holding the proxies to load; otherwise no proxies are used
/// </param>
static void Main(string[] args)
{
    // Setting Up Log
    LogSetup.InitializeLog("PlayStoreCrawler.log", "info");
    _logger = LogManager.GetCurrentClassLogger();

    // Control Variable (Bool - Should the process use proxies? )
    bool isUsingProxies = false;

    // Checking for the need to use HTTP proxies or not
    if (args != null && args.Length == 1)
    {
        _logger.Info("Loading Proxies from File");

        // Setting flag to true
        isUsingProxies = true;

        // Loading proxies from .txt received as argument
        String fPath = args[0];

        // Sanity Check
        if (!File.Exists(fPath))
        {
            _logger.Fatal("Couldnt find proxies on path : " + fPath);
            System.Environment.Exit(-100);
        }

        // Reading Proxies from File
        string[] fLines = File.ReadAllLines(fPath, Encoding.GetEncoding("UTF-8"));

        try
        {
            // Actual Load of Proxies
            ProxiesLoader.Load(fLines.ToList());
        }
        catch (Exception ex)
        {
            _logger.Fatal(ex);
            System.Environment.Exit(-101);
        }
    }

    // Configuring MongoDB Wrapper
    _logger.Info("Setting up MongoDB Collections and Indexes");
    _mongoDB = new MongoDBWrapper();
    string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    _mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Ensuring the database collections have the proper indexes
    _mongoDB.EnsureIndex("Url");
    _mongoDB.EnsureIndex("IsBusy", Consts.QUEUED_APPS_COLLECTION);
    _mongoDB.EnsureIndex("Url", Consts.QUEUED_APPS_COLLECTION);

    // Main Flow
    _logger.Info("Started Bootstrapping Steps");

    // Scrapping "Play Store Categories"
    foreach (var categoriesKVP in BootstrapTerms.categoriesAndNames)
    {
        CrawlCategory(categoriesKVP.Key, categoriesKVP.Value, isUsingProxies);
    }

    // Queueing Apps that start with each of the characters from "A" to "Z"
    foreach (var character in BootstrapTerms.charactersSearchTerms)
    {
        CrawlStore(character, isUsingProxies);
    }

    // ... Keep Adding characters / search terms in order to increase the crawler's reach

    // APP CATEGORIES
    foreach (var category in BootstrapTerms.categoriesSearchTerms)
    {
        CrawlStore(category, isUsingProxies);
    }

    // Extra "Random" Search terms to increase even more the crawler's reach
    foreach (var miscTerm in BootstrapTerms.miscSearchTerms)
    {
        CrawlStore(miscTerm, isUsingProxies);
    }

    // Country Names as Search terms to increase even more the crawler's reach
    foreach (var countryName in BootstrapTerms.countryNames)
    {
        CrawlStore(countryName, isUsingProxies);
    }

    _logger.Info("\n\nBootstrapping Apps of Past Collections");

    // Iterating over past collections; the set keeps one entry per distinct url
    HashSet<String> appUrls = new HashSet<String>();

    foreach (string collection in _mongoDB.GetHistoryOfCollections())
    {
        _logger.Info("Reading Collection [{0}]", collection);

        // The set already ignores urls it contains, no prior Contains check is needed
        appUrls.UnionWith(_mongoDB.FindAllFromCollectionAs<AppModel>(collection).Select(t => t.Url));

        _logger.Info("\t=> Distinct Apps Found {0}", appUrls.Count);
    }

    // Adding Distinct Apps to the collection of apps to be processed
    int appsQueued = 0;

    foreach (string appUrl in appUrls)
    {
        _mongoDB.AddToQueue(appUrl);

        // Pre-increment so that the progress is reported at 10000, 20000, ...
        // instead of at 1, 10001, ...
        if (++appsQueued % 10000 == 0)
        {
            _logger.Info("[Progress] Apps Queued From Past Collections [{0}]", appsQueued);
        }
    }
}
/// <summary>
/// Downloads a category page and keeps requesting further pages of it,
/// adding every app url that is neither processed nor queued yet to the queue
/// through <c>mongoDB.AddToQueue</c>.
/// Paging stops when an url shows up a second time or when too many requests failed.
/// </summary>
/// <param name="categoryUrl">Url of the category page to crawl</param>
/// <param name="categoryName">Name of the category, used for log feedback only</param>
/// <param name="shouldUseProxies">When true, the requests use the proxy returned by <c>ProxiesLoader.GetWebProxy</c></param>
private static void CrawlCategory (string categoryUrl, string categoryName, bool shouldUseProxies)
{
    // Console Feedback
    _logger.Warn ("Crawling Category : [ " + categoryName + " ]");

    // Hashset of urls used to keep track of what's been parsed already
    HashSet<String> foundUrls = new HashSet<String> ();

    // Control variable to avoid "Loop" on paging
    bool isDonePagging = false;

    // HTML Response
    string response;

    // Configuring MongoDB Wrapper
    MongoDBWrapper mongoDB = new MongoDBWrapper ();
    string fullServerAddress = String.Join (":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase (Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Ensuring the database has the proper index
    mongoDB.EnsureIndex ("Url");

    // Response Parser
    PlayStoreParser parser = new PlayStoreParser ();

    // Url used by the paging requests. Plain concatenation: the url is data,
    // not a format string, so it must not go through String.Format
    string pagingUrl = categoryUrl + "?authuser=0";

    // Executing Web Requests
    using (WebRequests server = new WebRequests ())
    {
        // Configuring the request object
        server.Headers.Add (Consts.ACCEPT_LANGUAGE);
        server.Host = Consts.HOST;
        server.UserAgent = Consts.GITHUBURL;
        server.Encoding = "utf-8";

        // Checking for the need to use "HTTP Proxies"
        if (shouldUseProxies)
        {
            server.Proxy = ProxiesLoader.GetWebProxy ();
        }

        // Executing Initial Request
        response = server.Get (categoryUrl);

        // Parsing Links out of Html Page (Initial Request)
        foreach (string url in parser.ParseAppUrls (response))
        {
            // Saving found url on local hashset
            foundUrls.Add (url);

            // Queues the app only if it was neither processed nor queued before
            if ((!mongoDB.AppProcessed (Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued (url)))
            {
                mongoDB.AddToQueue (url);
            }
        }

        // Executing Requests for more Play Store Links
        int baseSkip = 60;
        int currentMultiplier = 1;
        int errorsCount = 0;

        do
        {
            // Assembling new PostData with paging values
            string postData = String.Format (Consts.CATEGORIES_POST_DATA, (currentMultiplier * baseSkip), baseSkip);

            // Executing request for values
            response = server.Post (pagingUrl, postData);

            // Checking Server Status
            if (server.StatusCode != System.Net.HttpStatusCode.OK)
            {
                _logger.Error ("Http Error" + " - Status Code [ " + server.StatusCode + " ]");
                errorsCount++;

                // The multiplier is not incremented, so the same page is requested again
                continue;
            }

            // Parsing Links
            foreach (string url in parser.ParseAppUrls (response))
            {
                // If a certain app is found twice, it means that the "paging" logic got stuck into a
                // loop, so all the apps for this category were parsed already
                if (foundUrls.Contains (url))
                {
                    isDonePagging = true;
                    break;
                }

                // Saving found url on local hashset
                foundUrls.Add (url);

                // Queues the app only if it was neither processed nor queued before
                if ((!mongoDB.AppProcessed (Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued (url)))
                {
                    mongoDB.AddToQueue (url);
                }
            }

            // Incrementing Paging Multiplier
            currentMultiplier++;
        } while (!isDonePagging && errorsCount <= Consts.MAX_REQUEST_ERRORS);
    }
}
/// <summary>
/// Executes a search using the searchField as the search parameter and pages
/// through the results, adding every app url that is neither processed nor
/// queued yet to the queue through <c>mongoDB.AddToQueue</c>.
/// Paging stops when no paging token is found, when an url shows up a second
/// time, when the parser reports no results, or when too many requests failed.
/// </summary>
/// <param name="searchField">Search term formatted into <c>Consts.CRAWL_URL</c></param>
/// <param name="shouldUseProxies">When true, the requests use the proxy returned by <c>ProxiesLoader.GetWebProxy</c></param>
private static void CrawlStore (string searchField, bool shouldUseProxies)
{
    // Console Feedback
    _logger.Warn ("Crawling Search Term : [ " + searchField + " ]");

    // Hashset of urls used to keep track of what's been parsed already
    HashSet<String> foundUrls = new HashSet<String> ();

    // Control variable to avoid "Loop" on paging
    bool isDonePagging = false;

    // Regular Expression used to parse the "pagToken" out of the Play Store response
    Regex pagTokenRegex = new Regex (@"GAEi+.+\:S\:.{11}\\42", RegexOptions.Compiled);

    // Configuring MongoDB Wrapper
    MongoDBWrapper mongoDB = new MongoDBWrapper ();
    string fullServerAddress = String.Join (":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase (Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Ensuring the database has the proper index
    mongoDB.EnsureIndex ("Url");

    // Response Parser
    PlayStoreParser parser = new PlayStoreParser ();

    // The same url is used by the initial request and by every paging request
    string crawlUrl = String.Format (Consts.CRAWL_URL, searchField);

    // Executing Web Requests
    using (WebRequests server = new WebRequests ())
    {
        // Configuring the request object
        server.Headers.Add (Consts.ACCEPT_LANGUAGE);
        server.Host = Consts.HOST;
        server.UserAgent = Consts.GITHUBURL;
        server.Encoding = "utf-8";

        // Checking for the need to use "HTTP Proxies"
        if (shouldUseProxies)
        {
            server.Proxy = ProxiesLoader.GetWebProxy ();
        }

        // Holds the last page that was fetched successfully; the paging token
        // for the next request is always read from it
        string response = server.Post (crawlUrl, Consts.INITIAL_POST_DATA);

        // Parsing Links out of Html Page (Initial Request)
        foreach (string url in parser.ParseAppUrls (response))
        {
            foundUrls.Add (url);

            // Queues the app only if it was neither processed nor queued before
            if ((!mongoDB.AppProcessed (Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued (url)))
            {
                mongoDB.AddToQueue (url);
                Thread.Sleep (250); // Hiccup
            }
        }

        // Executing Requests for more Play Store Links
        int errorsCount = 0;

        do
        {
            // Finding pagToken in the last successful response
            var rgxMatch = pagTokenRegex.Match (response);

            // Without a token there is no next page to ask for
            if (!rgxMatch.Success)
            {
                break;
            }

            // Reading Match from Regex, and applying needed replacements
            string pagToken = rgxMatch.Value.Replace (":S:", "%3AS%3A").Replace ("\\42", String.Empty).Replace (@"\\u003d", String.Empty);

            // Assembling new PostData with paging values
            string postData = String.Format (Consts.POST_DATA, pagToken);

            // Kept in its own variable so that a failed request does not wipe the
            // page the token came from
            string pageResponse = server.Post (crawlUrl, postData);

            // Checking Server Status
            if (server.StatusCode != System.Net.HttpStatusCode.OK)
            {
                _logger.Error ("Http Error" + " - Status Code [ " + server.StatusCode + " ]");
                errorsCount++;

                // "response" is untouched, so the next iteration retries the same token
                continue;
            }

            response = pageResponse;

            // Parsing Links
            foreach (string url in parser.ParseAppUrls (response))
            {
                // An url seen before means the paging started to repeat itself
                if (foundUrls.Contains (url))
                {
                    isDonePagging = true;
                    break;
                }

                foundUrls.Add (url);

                // Queues the app only if it was neither processed nor queued before
                if ((!mongoDB.AppProcessed (Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued (url)))
                {
                    mongoDB.AddToQueue (url);
                    Thread.Sleep (250); // Hiccup
                }
            }
        } while (!isDonePagging && parser.AnyResultFound (response) && errorsCount <= Consts.MAX_REQUEST_ERRORS);
    }
}
/// <summary>
/// Executes a search using the searchField as the search parameter and pages
/// through the results (skip-based paging), handing every app url that is
/// neither processed nor queued yet to <c>mongoDB.AddToQueue</c>.
/// </summary>
/// <param name="searchField">Search term formatted into <c>Consts.CRAWL_URL</c></param>
private static void CrawlStore(string searchField)
{
    // Console Feedback
    Console.WriteLine("Crawling Search Term : [ " + searchField + " ]");

    // Configuring MongoDB Wrapper
    MongoDBWrapper mongoDB = new MongoDBWrapper();
    string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Response Parser
    PlayStoreParser parser = new PlayStoreParser();

    // The search term has to be part of the request, otherwise every call
    // would crawl the very same url regardless of searchField.
    // NOTE(review): assumes Consts.CRAWL_URL carries a "{0}" placeholder, as the
    // other CrawlStore variants of this file do - confirm against Consts.
    string crawlUrl = String.Format(Consts.CRAWL_URL, searchField);

    // Executing Web Requests
    using (WebRequests server = new WebRequests())
    {
        server.Host = Consts.HOST;

        // Holds the last page that was fetched successfully
        string response = server.Post(crawlUrl, Consts.INITIAL_POST_DATA);

        // Parsing Links out of Html Page (Initial Request)
        foreach (string url in parser.ParseAppUrls(response))
        {
            // Queues the app only if it was neither processed nor queued before
            if ((!mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued(url)))
            {
                Console.WriteLine(" . Queued App");
                mongoDB.AddToQueue(url);
            }
            else
            {
                Console.WriteLine(" . Duplicated App. Skipped");
            }
        }

        // Executing Requests for more Play Store Links
        int initialSkip = 48;
        int currentMultiplier = 1;
        int errorsCount = 0;

        do
        {
            // Assembling new PostData with paging values
            string postData = String.Format(Consts.POST_DATA, (initialSkip * currentMultiplier));

            // Kept in its own variable so that the loop condition is never
            // evaluated against the body of a failed request
            string pageResponse = server.Post(crawlUrl, postData);

            // Checking Server Status
            if (server.StatusCode != System.Net.HttpStatusCode.OK)
            {
                LogWriter.Error("Http Error", "Status Code [ " + server.StatusCode + " ]");
                errorsCount++;

                // The multiplier is not incremented, so the same page is requested again
                continue;
            }

            response = pageResponse;

            // Parsing Links
            foreach (string url in parser.ParseAppUrls(response))
            {
                // Queues the app only if it was neither processed nor queued before
                if ((!mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued(url)))
                {
                    Console.WriteLine(" . Queued App");
                    mongoDB.AddToQueue(url);
                }
                else
                {
                    Console.WriteLine(" . Duplicated App. Skipped");
                }
            }

            // Incrementing Paging Multiplier
            currentMultiplier++;
        } while (parser.AnyResultFound(response) && errorsCount <= Consts.MAX_REQUEST_ERRORS);
    }
}