/// <summary>
/// Issues the HTTP POST that requests one page of reviews for an app.
/// </summary>
/// <param name="appID">App identifier, used as the second format argument of <c>Consts.REVIEWS_POST_DATA</c>.</param>
/// <param name="reviewsPage">Reviews page number, used as the first format argument of <c>Consts.REVIEWS_POST_DATA</c>.</param>
/// <param name="isUsingProxies">When true, the request is sent through the proxy returned by <c>ProxiesLoader.GetWebProxy()</c>.</param>
/// <returns>The value returned by <c>WebRequests.Post</c> for <c>Consts.REVIEWS_URL</c>.</returns>
public static string GetAppReviews(string appID, int reviewsPage, bool isUsingProxies = false)
{
    using (var request = new WebRequests())
    {
        // Headers, encoding and redirect behaviour of the request
        request.Host = Consts.HOST;
        request.Origin = Consts.ORIGIN;
        request.Encoding = "utf-8";
        request.AllowAutoRedirect = true;
        request.Accept = "*/*";
        request.UserAgent = Consts.USER_AGENT;
        request.ContentType = "application/x-www-form-urlencoded;charset=UTF-8";
        request.EncodingDetection = WebRequests.CharsetDetection.DefaultCharset;
        request.Headers.Add(Consts.ACCEPT_LANGUAGE);

        // A proxy is only attached when the caller asked for it
        if (isUsingProxies)
        {
            request.Proxy = ProxiesLoader.GetWebProxy();
        }

        // Note the argument order: page first, app id second
        string body = String.Format(Consts.REVIEWS_POST_DATA, reviewsPage, appID);

        string reviewsResponse = request.Post(Consts.REVIEWS_URL, body);
        return reviewsResponse;
    }
}
/// <summary>
/// Executes an HTTP GET for <paramref name="url"/> with a fresh <c>WebRequests</c> instance.
/// </summary>
/// <param name="url">Address to request.</param>
/// <param name="useProxies">When true, the request is sent through the proxy returned by <c>ProxiesLoader.GetWebProxy()</c>.</param>
/// <returns>The value returned by <c>WebRequests.Get</c>.</returns>
public string Get(string url, bool useProxies)
{
    using (var request = new WebRequests())
    {
        // Proxy is optional and is configured before the user agent, as before
        if (useProxies)
        {
            request.Proxy = ProxiesLoader.GetWebProxy();
        }

        request.UserAgent = Consts.USER_AGENT;

        return request.Get(url);
    }
}
/// <summary>
/// Requests <c>Consts.ROOT_STORE_URL</c>, repeating the request while the response is null or empty.
/// </summary>
/// <param name="useProxies">When true, a proxy from <c>ProxiesLoader.GetWebProxy()</c> is assigned before every attempt.</param>
/// <returns>
/// The first non-empty response, or the last (null or empty) response once the attempts are exhausted.
/// </returns>
public string GetRootPage(bool useProxies)
{
    const int maxRetries = 100;
    string rootPageHtml = String.Empty;

    using (var request = new WebRequests())
    {
        // "attempts" counts the requests issued so far, including the current one
        for (int attempts = 1; ; attempts++)
        {
            if (useProxies)
            {
                request.Proxy = ProxiesLoader.GetWebProxy();
            }

            rootPageHtml = request.Get(Consts.ROOT_STORE_URL);

            // Stop on the first usable response, or once the initial attempt plus maxRetries retries were spent
            if (!String.IsNullOrEmpty(rootPageHtml) || attempts > maxRetries)
            {
                break;
            }
        }
    }

    return rootPageHtml;
}
/// <summary>
/// Entry point of the "numeric urls" worker: endlessly dequeues numeric page urls, downloads each
/// page, parses the app urls out of it and enqueues them on the app urls queue.
/// A dequeued message is always deleted after it was handled, whether or not handling succeeded.
/// </summary>
/// <param name="args">
/// Optional. When exactly one argument is received it is used as the path of a text file with
/// the proxies to load, and every request is issued through a proxy.
/// </param>
static void Main(string[] args)
{
    // Creating Needed Instances
    RequestsHandler httpClient = new RequestsHandler();
    AppStoreParser parser = new AppStoreParser();

    // Setting up the log
    LogSetup.InitializeLog("Apple_Store_Numerics_Worker.log", "info");
    _logger = LogManager.GetCurrentClassLogger();

    // Loading Config
    _logger.Info("Loading Configurations from App.config");
    LoadConfiguration();

    // Control Variable (Bool - Should the process use proxies? )
    bool shouldUseProxies = false;

    // Exactly one argument means "use proxies"; the argument is the proxies file
    if (args != null && args.Length == 1)
    {
        shouldUseProxies = true;

        String fPath = args[0];

        // Sanity Check
        if (!File.Exists(fPath))
        {
            _logger.Fatal("Couldnt find proxies on path : " + fPath);
            System.Environment.Exit(-100);
        }

        // Reading Proxies from File
        string[] fLines = File.ReadAllLines(fPath, Encoding.GetEncoding("UTF-8"));

        try
        {
            // Actual Load of Proxies
            ProxiesLoader.Load(fLines.ToList());
        }
        catch (Exception ex)
        {
            _logger.Fatal(ex);
            System.Environment.Exit(-101);
        }
    }

    // AWS Queue Handlers
    _logger.Info("Initializing Queues");
    AWSSQSHelper numericUrlQueue = new AWSSQSHelper(_numericUrlsQueueName, _maxMessagesPerDequeue, _awsKey, _awsKeySecret);
    AWSSQSHelper appsUrlQueue = new AWSSQSHelper(_appUrlsQueueName, _maxMessagesPerDequeue, _awsKey, _awsKeySecret);

    // Setting Error Flag to No Error ( 0 )
    System.Environment.ExitCode = 0;

    // Exponent used by the "empty queue" backoff below
    int fallbackWaitTime = 1;

    _logger.Info("Started Processing Numeric Urls");

    do
    {
        try
        {
            // Dequeueing messages from the Queue
            if (!numericUrlQueue.DeQueueMessages())
            {
                Thread.Sleep(_hiccupTime); // Hiccup
                continue;
            }

            // Checking for no message received, and false positives situations
            if (!numericUrlQueue.AnyMessageReceived())
            {
                // Wait time in MILLISECONDS
                int waitTime;

                if (fallbackWaitTime <= 12)
                {
                    // Exponential increase of the wait time: 2 ^ fallbackWaitTime seconds
                    waitTime = Convert.ToInt32(Math.Pow(2, fallbackWaitTime) * 1000);
                }
                else
                {
                    // Resetting the wait after 12 fallbacks
                    waitTime = 2000;
                    fallbackWaitTime = 0;
                }

                fallbackWaitTime++;

                // The label says seconds, so the millisecond value is converted before printing
                Console.WriteLine("Fallback (seconds) => " + (waitTime / 1000));
                Thread.Sleep(waitTime);
                continue;
            }

            // Messages were received: resetting the backoff
            fallbackWaitTime = 1;

            // Iterating over dequeued Messages
            foreach (var numericUrl in numericUrlQueue.GetDequeuedMessages())
            {
                try
                {
                    int retries = 0;
                    string htmlResponse;

                    // Retrying if necessary. The retry counter is driven by the same predicate as the
                    // loop condition, otherwise a whitespace-only response would never increment it
                    // and the loop would never end.
                    do
                    {
                        // Executing Http Request for the Category Url
                        htmlResponse = httpClient.Get(numericUrl.Body, shouldUseProxies);

                        if (String.IsNullOrWhiteSpace(htmlResponse))
                        {
                            _logger.Info("Retrying Request for Category Page");
                            retries++;
                        }
                    } while (String.IsNullOrWhiteSpace(htmlResponse) && retries <= _maxRetries);

                    // All retries failed: moves on. The message is deleted by the finally block.
                    if (String.IsNullOrWhiteSpace(htmlResponse))
                    {
                        continue;
                    }

                    // Feedback
                    _logger.Info("Current page " + numericUrl.Body);

                    foreach (var parsedAppUrl in parser.ParseAppsUrls(htmlResponse))
                    {
                        // Enqueueing App Urls
                        appsUrlQueue.EnqueueMessage(HttpUtility.HtmlDecode(parsedAppUrl));
                    }
                }
                catch (Exception ex)
                {
                    // Logged as an error, like the outer handler does
                    _logger.Error(ex);
                }
                finally
                {
                    // Single place where the message is deleted (success, failed retries or exception)
                    numericUrlQueue.DeleteMessage(numericUrl);
                }
            }
        }
        catch (Exception ex)
        {
            _logger.Error(ex);
        }
    } while (true);
}
/// <summary>
/// Entry point of the worker piece of the process.
/// Notice that you can run as many workers as you want to in order to make the crawling faster.
/// Each iteration takes one queued app from MongoDB (<c>FindAndModify</c>), downloads and parses its
/// page, upserts the parsed app and queues the related apps that are neither processed nor queued yet.
/// The loop ends when <c>FindAndModify</c> returns null.
/// </summary>
/// <param name="args">
/// Optional. When exactly one argument is received it is used as the path of a text file with
/// the proxies to load, and every request is issued through a proxy.
/// </param>
static void Main(string[] args)
{
    // Configuring Log Object
    LogSetup.InitializeLog("PlayStoreWorker.log", "info");
    Logger logger = LogManager.GetCurrentClassLogger();
    logger.Info("Worker Started");

    // Control Variable (Bool - Should the process use proxies? )
    bool isUsingProxies = false;

    // Exactly one argument means "use proxies"; the argument is the proxies file
    if (args != null && args.Length == 1)
    {
        isUsingProxies = true;

        String fPath = args[0];

        // Sanity Check
        if (!File.Exists(fPath))
        {
            logger.Fatal("Couldnt find proxies on path : " + fPath);
            System.Environment.Exit(-100);
        }

        // Reading Proxies from File
        string[] fLines = File.ReadAllLines(fPath, Encoding.GetEncoding("UTF-8"));

        try
        {
            // Actual Load of Proxies
            ProxiesLoader.Load(fLines.ToList());
        }
        catch (Exception ex)
        {
            logger.Fatal(ex);
            System.Environment.Exit(-101);
        }
    }

    // Parser
    PlayStoreParser parser = new PlayStoreParser();

    // Configuring MongoDB Wrapper
    MongoDBWrapper mongoDB = new MongoDBWrapper();
    string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Web requests object, replaced by a fresh instance after every failed request
    WebRequests server = new WebRequests();

    // Queued App Model
    QueuedApp app;

    // Retry Counter (Used for exponential wait increasing logic)
    int retryCounter = 0;

    try
    {
        // Iterating over queued apps until no document is found to be processed
        while ((app = mongoDB.FindAndModify()) != null)
        {
            try
            {
                // Building APP URL - relative urls receive the store prefix
                string appUrl = app.Url;

                if (app.Url.IndexOf("http", StringComparison.OrdinalIgnoreCase) < 0)
                {
                    appUrl = Consts.APP_URL_PREFIX + app.Url;
                }

                // Checking if this app is on the database already
                if (mongoDB.AppProcessed(appUrl))
                {
                    logger.Info("Duplicated App, skipped.");

                    // Delete it from the queue and continues the loop
                    mongoDB.RemoveFromQueue(app.Url);
                    continue;
                }

                // Configuring server and Issuing Request
                // NOTE(review): Headers.Add runs on every iteration against the same instance;
                // presumably WebRequests copes with the repeated header - confirm.
                server.Headers.Add(Consts.ACCEPT_LANGUAGE);
                server.Host = Consts.HOST;
                server.UserAgent = Consts.GITHUBURL;
                server.Encoding = "utf-8";
                server.EncodingDetection = WebRequests.CharsetDetection.DefaultCharset;

                // Checking for the need to use "HTTP Proxies"
                if (isUsingProxies)
                {
                    server.Proxy = ProxiesLoader.GetWebProxy();
                }

                // Issuing HTTP Request
                string response = server.Get(appUrl);

                if (String.IsNullOrEmpty(response) || server.StatusCode != System.Net.HttpStatusCode.OK)
                {
                    logger.Info("Error opening app page : " + appUrl);

                    if (isUsingProxies)
                    {
                        ProxiesLoader.IncrementCurrentProxy();
                    }

                    // The status code must be captured BEFORE the request object is replaced,
                    // otherwise the "status zero" check below would inspect the brand new instance
                    // instead of the request that actually failed
                    var failedStatusCode = server.StatusCode;

                    // Renewing WebRequest Object to get rid of Cookies (releasing the old one first)
                    server.Dispose();
                    server = new WebRequests();

                    // Checking which "Waiting Logic" to use - If there are proxies being used, there's no need to wait too much
                    // If there are no proxies in use, on the other hand, the process must wait more time
                    TimeSpan waitTime;

                    if (isUsingProxies)
                    {
                        // Waits two seconds everytime
                        waitTime = TimeSpan.FromSeconds(2);
                    }
                    else
                    {
                        retryCounter++;

                        // From the 8th consecutive failure on, waits 20 minutes; before that, 2 ^ retryCounter seconds
                        if (retryCounter >= 8)
                        {
                            waitTime = TimeSpan.FromMinutes(20);
                        }
                        else
                        {
                            waitTime = TimeSpan.FromSeconds(Math.Pow(2, retryCounter));
                        }
                    }

                    // Hiccup to avoid google blocking connections in case of heavy traffic from the same IP
                    logger.Info("======================================================");
                    logger.Info("\n\tFallback : " + waitTime.TotalSeconds + " Seconds");

                    Thread.Sleep(waitTime);

                    // If The Status code is "ZERO" (it means 404) - App must be removed from "Queue"
                    if (failedStatusCode == 0)
                    {
                        logger.Info("\tApp Not Found (404) - " + app.Url);
                        mongoDB.RemoveFromQueue(app.Url);
                    }

                    logger.Info("======================================================");
                }
                else
                {
                    // Flag Indicating Success while storing this app
                    bool processingWorked = true;

                    // Resetting retry counter
                    retryCounter = 0;

                    // Parsing Useful App Data
                    AppModel parsedApp = parser.ParseAppPage(response, appUrl);

                    // Normalizing URLs
                    if (!String.IsNullOrWhiteSpace(parsedApp.DeveloperPrivacyPolicy))
                    {
                        parsedApp.DeveloperPrivacyPolicy = parsedApp.DeveloperPrivacyPolicy.Replace("https://www.google.com/url?q=", String.Empty);
                    }

                    if (!String.IsNullOrWhiteSpace(parsedApp.DeveloperWebsite))
                    {
                        parsedApp.DeveloperNormalizedDomain = parser.NormalizeDomainName(parsedApp.DeveloperWebsite);
                    }

                    List<String> relatedApps = new List<String>();

                    // Avoiding Exceptions caused by "No Related Apps" situations - Must be treated differently
                    try
                    {
                        // Parsing "Related Apps" and "More From Developer" Apps (URLS Only)
                        foreach (string extraAppUrl in parser.ParseExtraApps(response))
                        {
                            relatedApps.Add(Consts.APP_URL_PREFIX + extraAppUrl);
                        }

                        // Adding "Related Apps" to Apps Model
                        parsedApp.RelatedUrls = relatedApps.Distinct().ToArray();
                    }
                    catch
                    {
                        logger.Info("\tNo Related Apps Found. Skipping");
                    }

                    // Inserting App into Mongo DB Database
                    if (!mongoDB.UpsertKeyEq<AppModel>(parsedApp, "Url", appUrl))
                    {
                        processingWorked = false;
                    }

                    // If the processing failed, do not remove the app from the database, instead, keep it and flag it as not busy
                    // so that other workers can try to process it later
                    if (!processingWorked)
                    {
                        mongoDB.ToggleBusyApp(app, false);
                    }
                    else
                    {
                        // On the other hand, if processing worked, removes it from the queue
                        Console.ForegroundColor = ConsoleColor.Red;
                        logger.Info("Inserted App : " + parsedApp.Name);
                        Console.ForegroundColor = ConsoleColor.White;

                        mongoDB.RemoveFromQueue(app.Url);
                    }

                    // Counters for console feedback only
                    int extraAppsCounter = 0, newExtraApps = 0;

                    // Queueing "Related Apps" and "More From Developer" Apps (URLS Only)
                    foreach (string extraAppUrl in relatedApps)
                    {
                        extraAppsCounter++;

                        // Assembling Full app Url to check with database (ordinal match: this is a url, not text)
                        string fullExtraAppUrl;

                        if (extraAppUrl.IndexOf("https://play.google.com/", StringComparison.Ordinal) >= 0)
                        {
                            fullExtraAppUrl = extraAppUrl;
                        }
                        else
                        {
                            fullExtraAppUrl = Consts.APP_URL_PREFIX + extraAppUrl;
                        }

                        // Checking if the app was either processed or queued to be processed already
                        if ((!mongoDB.AppProcessed(fullExtraAppUrl)) && (!mongoDB.IsAppOnQueue(extraAppUrl)))
                        {
                            newExtraApps++;

                            // Adds it to the queue of apps to be processed
                            mongoDB.AddToQueue(extraAppUrl);
                        }
                    }

                    // Console Feedback
                    logger.Info("Queued " + newExtraApps + " / " + extraAppsCounter + " related apps");
                }
            }
            catch (Exception ex)
            {
                logger.Error(ex);
            }
            finally
            {
                try
                {
                    // Toggles Busy status back to false
                    mongoDB.ToggleBusyApp(app, false);
                }
                catch (Exception ex)
                {
                    // Toggle Busy App may raise an exception in case of lack of internet connection,
                    // so this inner catch keeps it from escaping the loop
                    logger.Error(ex);
                }
            }
        }
    }
    finally
    {
        // Releasing whichever request object is current when the loop ends
        server.Dispose();
    }
}
/// <summary>
/// Entry point of the reviews worker: for every app returned by <c>FindAndModifyReviews</c>,
/// requests the first page of reviews, parses them, appends the ones whose permalink is not
/// stored yet and saves the record with <c>ReviewsStatus</c> set to "Visited" or "Error".
/// The loop ends when <c>FindAndModifyReviews</c> returns null.
/// </summary>
/// <param name="args">
/// Optional. When exactly one argument is received it is used as the path of a text file with
/// the proxies to load, and every request is issued through a proxy.
/// </param>
static void Main(string[] args)
{
    // Configuring Log Object
    Logger logger = LogManager.GetCurrentClassLogger();

    // Control Variable (Bool - Should the process use proxies? )
    bool isUsingProxies = false;

    // Exactly one argument means "use proxies"; the argument is the proxies file
    if (args != null && args.Length == 1)
    {
        isUsingProxies = true;

        String fPath = args[0];

        // Sanity Check
        if (!File.Exists(fPath))
        {
            logger.Fatal("Couldnt find proxies on path : " + fPath);
            System.Environment.Exit(-100);
        }

        // Reading Proxies from File
        string[] fLines = File.ReadAllLines(fPath, Encoding.GetEncoding("UTF-8"));

        try
        {
            // Actual Load of Proxies
            ProxiesLoader.Load(fLines.ToList());
        }
        catch (Exception ex)
        {
            logger.Fatal(ex);
            System.Environment.Exit(-101);
        }
    }

    // MongoDB instance Creation
    logger.Info("Configuring MonboDB Client");

    // Creating instance of Mongo Handler for the main collection
    MongoDBWrapper mongoClient = new MongoDBWrapper();
    string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoClient.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    logger.Info("Iterating over Apps");

    // Creating Play Store Parser
    PlayStoreParser parser = new PlayStoreParser();

    // App Model
    AppModel appRecord;

    // Finding all the "Apps" that didn't have the reviews visited yet
    while ((appRecord = mongoClient.FindAndModifyReviews()) != null)
    {
        // Error flag of THIS app only. It must start as true on every iteration, otherwise
        // a single failure would flag every following app as "Error".
        bool noError = true;

        // Extracting app ID from URL
        string appId = appRecord.Url.Replace(Consts.PLAY_STORE_PREFIX, String.Empty);

        logger.Info("Processing App [ " + appRecord.Name + " ] ");

        try
        {
            // Console Feedback
            Console.Write("Reviews from : " + appRecord.Name);

            // Issuing Request for Reviews
            string response = ReviewsWrapper.GetAppReviews(appId, 1, isUsingProxies);

            // Checking for Blocking Situation: waits and retries the request once
            if (String.IsNullOrEmpty(response))
            {
                logger.Info("Blocked by Play Store. Sleeping process for 10 seconds before retrying.");

                Thread.Sleep(TimeSpan.FromSeconds(10));

                response = ReviewsWrapper.GetAppReviews(appId, 1, isUsingProxies);

                // Still nothing: the app is flagged as "Error" (by the finally block) instead of
                // dereferencing a null response or recording it as an app without reviews
                if (String.IsNullOrEmpty(response))
                {
                    Console.Write(" - Blocked\n");
                    noError = false;
                    continue;
                }
            }

            // Checking for "No Reviews" app
            if (response.Length < 50)
            {
                logger.Info("No Reviews for this app. Skipping");
                Console.Write(" - No Reviews Found\n");
                continue;
            }

            // Normalizing Response to Proper HTML
            response = ReviewsWrapper.NormalizeResponse(response);

            // List of Reviews
            List<AppReview> reviews = new List<AppReview>();

            // Iterating over Parsed Reviews
            foreach (var review in parser.ParseReviews(response))
            {
                // Adding App Data to the review
                review.appID = appId;
                review.appName = appRecord.Name;
                review.appURL = appRecord.Url;

                // Capture Timestamp to the model
                review.timestamp = DateTime.Now;

                reviews.Add(review);
            }

            // Any Review Found ?
            if (reviews.Count > 0)
            {
                Console.Write(" - " + reviews.Count + " Reviews Found\n");

                if (appRecord.Reviews == null)
                {
                    // No previous list of reviews
                    appRecord.Reviews = reviews;
                }
                else
                {
                    // Previous List found - Appending only the reviews whose permalink is new
                    foreach (var review in reviews)
                    {
                        if (!appRecord.Reviews.Any(t => t.permalink.Equals(review.permalink)))
                        {
                            appRecord.Reviews.Add(review);
                        }
                    }
                }
            }
        }
        catch (Exception ex)
        {
            logger.Error(ex);

            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine("Error : " + ex.Message);
            Console.ForegroundColor = ConsoleColor.White;

            noError = false;
        }
        finally
        {
            // Toggling back the "ReviewsStatus" attribute from the model.
            // Runs for the "continue" paths above as well.
            appRecord.ReviewsStatus = noError ? "Visited" : "Error";
            mongoClient.SaveRecord<AppModel>(appRecord);
        }
    }
}
/// <summary>
/// Entry point of the "app urls" worker: endlessly dequeues app page urls, downloads each page,
/// parses the app data out of it and enqueues the result (as JSON) on the apps data queue.
/// A dequeued message is kept on the queue only when an exception is raised while handling it;
/// messages whose download retries were exhausted are deleted.
/// </summary>
/// <param name="args">
/// Optional. When exactly one argument is received it is used as the path of a text file with
/// the proxies to load, and every request is issued through a proxy.
/// </param>
static void Main(string[] args)
{
    // Creating Needed Instances
    RequestsHandler httpClient = new RequestsHandler();
    AppStoreParser parser = new AppStoreParser();

    // Setting up the log
    LogSetup.InitializeLog("Apple_Store_Urls_Worker.log", "info");
    _logger = LogManager.GetCurrentClassLogger();

    // Loading Config
    _logger.Info("Loading Configurations from App.config");
    LoadConfiguration();

    // Control Variable (Bool - Should the process use proxies? )
    bool shouldUseProxies = false;

    // Exactly one argument means "use proxies"; the argument is the proxies file
    if (args != null && args.Length == 1)
    {
        shouldUseProxies = true;

        String fPath = args[0];

        // Sanity Check
        if (!File.Exists(fPath))
        {
            _logger.Fatal("Couldnt find proxies on path : " + fPath);
            System.Environment.Exit(-100);
        }

        // Reading Proxies from File
        string[] fLines = File.ReadAllLines(fPath, Encoding.GetEncoding("UTF-8"));

        try
        {
            // Actual Load of Proxies
            ProxiesLoader.Load(fLines.ToList());
        }
        catch (Exception ex)
        {
            _logger.Fatal(ex);
            System.Environment.Exit(-101);
        }
    }

    // AWS Queue Handlers
    _logger.Info("Initializing Queues");
    AWSSQSHelper appsUrlQueue = new AWSSQSHelper(_appUrlsQueueName, _maxMessagesPerDequeue, _awsKey, _awsKeySecret);
    AWSSQSHelper appsDataQueue = new AWSSQSHelper(_appsDataQueueName, _maxMessagesPerDequeue, _awsKey, _awsKeySecret);

    // Setting Error Flag to No Error ( 0 )
    System.Environment.ExitCode = 0;

    // Exponent used by the "empty queue" backoff below
    int fallbackWaitTime = 1;

    _logger.Info("Started Processing Individual Apps Urls");

    do
    {
        try
        {
            // Dequeueing messages from the Queue
            if (!appsUrlQueue.DeQueueMessages())
            {
                Thread.Sleep(_hiccupTime); // Hiccup
                continue;
            }

            // Checking for no message received, and false positives situations
            if (!appsUrlQueue.AnyMessageReceived())
            {
                // Wait time in MILLISECONDS
                int waitTime;

                if (fallbackWaitTime <= 12)
                {
                    // Exponential increase of the wait time: 2 ^ fallbackWaitTime seconds
                    waitTime = Convert.ToInt32(Math.Pow(2, fallbackWaitTime) * 1000);
                }
                else
                {
                    // Resetting the wait after 12 fallbacks
                    waitTime = 2000;
                    fallbackWaitTime = 0;
                }

                fallbackWaitTime++;

                // The label says seconds, so the millisecond value is converted before printing
                Console.WriteLine("Fallback (seconds) => " + (waitTime / 1000));
                Thread.Sleep(waitTime);
                continue;
            }

            // Messages were received: resetting the backoff
            fallbackWaitTime = 1;

            // Iterating over dequeued Messages
            foreach (var appUrl in appsUrlQueue.GetDequeuedMessages())
            {
                bool processingWorked = true;

                try
                {
                    int retries = 0;
                    string htmlResponse;

                    // Retrying if necessary. The retry branch uses the same predicate as the loop
                    // condition, otherwise a whitespace-only response would spin forever without
                    // sleeping or counting retries.
                    do
                    {
                        // Executing Http Request for the App Url
                        htmlResponse = httpClient.Get(appUrl.Body, shouldUseProxies);

                        if (String.IsNullOrWhiteSpace(htmlResponse))
                        {
                            // Extending Fallback time, capped at 30 seconds
                            retries++;
                            int sleepTime = Math.Min(retries * _hiccupTime, 30000);

                            _logger.Info("Retrying Request for App Page [ " + sleepTime / 1000 + " ]");

                            Thread.Sleep(sleepTime);
                        }
                    } while (String.IsNullOrWhiteSpace(htmlResponse) && retries <= _maxRetries);

                    // All retries failed: moves on. processingWorked is still true at this point,
                    // so the finally block deletes the message.
                    if (String.IsNullOrWhiteSpace(htmlResponse))
                    {
                        continue;
                    }

                    // Feedback
                    _logger.Info("Current page " + appUrl.Body, "Parsing App Data");

                    // Parsing Data out of the Html Page
                    AppleStoreAppModel parsedApp = parser.ParseAppPage(htmlResponse);
                    parsedApp.url = appUrl.Body;

                    // Enqueueing App Data
                    appsDataQueue.EnqueueMessage(parsedApp.ToJson());

                    // Little Hiccup
                    Thread.Sleep(_hiccupTime);
                }
                catch (Exception ex)
                {
                    _logger.Error(ex);

                    // Keeps the message on the queue
                    processingWorked = false;
                }
                finally
                {
                    // Deleting the message - unless an exception was raised while handling it
                    if (processingWorked)
                    {
                        appsUrlQueue.DeleteMessage(appUrl);
                    }
                }
            }
        }
        catch (Exception ex)
        {
            _logger.Error(ex);
        }
    } while (true);
}
/// <summary>
/// Bootstrapping entry point: downloads the store root page and enqueues every category url
/// parsed out of it on the categories queue.
/// </summary>
/// <param name="args">
/// Optional. When exactly one argument is received it is used as the path of a text file with
/// the proxies to load, and the root page is requested through a proxy.
/// </param>
static void Main(string[] args)
{
    // HTTP handler and HTML parser used by the steps below
    var requests = new RequestsHandler();
    var storeParser = new AppStoreParser();

    // Log comes first, so that every following step can report
    LogSetup.InitializeLog("Apple_Store_Crawler.log", "info");
    _logger = LogManager.GetCurrentClassLogger();

    _logger.Info("Worker Started");

    _logger.Info("Reading Configuration");
    LoadConfiguration();

    // A single argument switches proxies on; that argument is the proxies file
    bool shouldUseProxies = (args != null) && (args.Length == 1);

    if (shouldUseProxies)
    {
        string proxiesPath = args[0];

        // Without the file there is nothing to load: the process ends with code -100
        if (!File.Exists(proxiesPath))
        {
            _logger.Fatal("Couldnt find proxies on path : " + proxiesPath);
            System.Environment.Exit(-100);
        }

        string[] proxyLines = File.ReadAllLines(proxiesPath, Encoding.GetEncoding("UTF-8"));

        try
        {
            ProxiesLoader.Load(proxyLines.ToList());
        }
        catch (Exception loadError)
        {
            // A failed load ends the process with code -101
            _logger.Fatal(loadError);
            System.Environment.Exit(-101);
        }
    }

    _logger.Info("Initializing Queues");
    var categoriesQueue = new AWSSQSHelper(_categoriesQueueName, 10, _awsKey, _awsKeySecret);

    // Step 1 - the root page is the source of all the category urls
    string rootHtml = requests.GetRootPage(shouldUseProxies);

    if (String.IsNullOrWhiteSpace(rootHtml))
    {
        _logger.Info("Error obtaining Root Page HTMl - Aborting", "Timeout Error");
        return;
    }

    // Step 2 - every category url found on the root page goes to the queue
    foreach (var categoryUrl in storeParser.ParseCategoryUrls(rootHtml))
    {
        _logger.Info("Queueing Category : " + categoryUrl);
        categoriesQueue.EnqueueMessage(categoryUrl);
    }

    _logger.Info("End of Bootstrapping phase");
}
/// <summary>
/// Entry point of the crawler: prepares the MongoDB collections and indexes, then crawls the
/// bootstrap categories followed by every bootstrap list of search terms.
/// </summary>
/// <param name="args">
/// Optional. When exactly one argument is received it is used as the path of a text file with
/// the proxies to load, and the crawl is executed through proxies.
/// </param>
static void Main(string[] args)
{
    // Log setup
    LogSetup.InitializeLog("PlayStoreCrawler.log", "info");
    _logger = LogManager.GetCurrentClassLogger();

    // A single argument switches proxies on; that argument is the proxies file
    bool isUsingProxies = (args != null) && (args.Length == 1);

    if (isUsingProxies)
    {
        _logger.Info("Loading Proxies from File");

        string proxiesPath = args[0];

        // Without the file there is nothing to load: the process ends with code -100
        if (!File.Exists(proxiesPath))
        {
            _logger.Fatal("Couldnt find proxies on path : " + proxiesPath);
            System.Environment.Exit(-100);
        }

        string[] proxyLines = File.ReadAllLines(proxiesPath, Encoding.GetEncoding("UTF-8"));

        try
        {
            ProxiesLoader.Load(proxyLines.ToList());
        }
        catch (Exception loadError)
        {
            // A failed load ends the process with code -101
            _logger.Fatal(loadError);
            System.Environment.Exit(-101);
        }
    }

    // MongoDB wrapper shared by the crawl methods
    _logger.Info("Setting up MongoDB Collections and Indexes");
    _mongoDB = new MongoDBWrapper();

    string mongoAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    _mongoDB.ConfigureDatabase(
        Consts.MONGO_USER,
        Consts.MONGO_PASS,
        Consts.MONGO_AUTH_DB,
        mongoAddress,
        Consts.MONGO_TIMEOUT,
        Consts.MONGO_DATABASE,
        Consts.MONGO_COLLECTION);

    // Indexes: "Url" on the default collection, "IsBusy" and "Url" on the queue collection
    _mongoDB.EnsureIndex("Url");
    _mongoDB.EnsureIndex("IsBusy", Consts.QUEUED_APPS_COLLECTION);
    _mongoDB.EnsureIndex("Url", Consts.QUEUED_APPS_COLLECTION);

    _logger.Info("Started Bootstrapping Steps");

    // 1) Play Store categories
    foreach (var categoryEntry in BootstrapTerms.categoriesAndNames)
    {
        CrawlCategory(categoryEntry.Key, categoryEntry.Value, isUsingProxies);
    }

    // 2) Characters used as search terms
    foreach (var letterTerm in BootstrapTerms.charactersSearchTerms)
    {
        CrawlStore(letterTerm, isUsingProxies);
    }

    // More search terms can be added to the lists below to increase the crawler's reach

    // 3) App categories used as search terms
    foreach (var categoryTerm in BootstrapTerms.categoriesSearchTerms)
    {
        CrawlStore(categoryTerm, isUsingProxies);
    }

    // 4) Miscellaneous search terms
    foreach (var extraTerm in BootstrapTerms.miscSearchTerms)
    {
        CrawlStore(extraTerm, isUsingProxies);
    }

    // 5) Country names used as search terms
    foreach (var country in BootstrapTerms.countryNames)
    {
        CrawlStore(country, isUsingProxies);
    }
}
/// <summary>
/// Executes a Search using the searchField as the search parameter,
/// paginates / scrolls the search results to the end adding all the url of apps
/// it finds to a Mongo "QUEUE" collection.
/// Paging stops when no paging token is found on the last successful page, when a page repeats an
/// url that was already seen during this search, when the parser reports no results, or when
/// more than <c>Consts.MAX_REQUEST_ERRORS</c> requests failed.
/// </summary>
/// <param name="searchField">Search term, substituted into <c>Consts.CRAWL_URL</c>.</param>
/// <param name="shouldUseProxies">When true, requests are sent through the proxy returned by <c>ProxiesLoader.GetWebProxy()</c>.</param>
private static void CrawlStore(string searchField, bool shouldUseProxies)
{
    // Console Feedback
    _logger.Warn("Crawling Search Term : [ " + searchField + " ]");

    // Hashset of urls used to keep track of what's been parsed already
    HashSet<String> foundUrls = new HashSet<String>();

    // Control variable to avoid "Loop" on paging
    bool isDonePagging = false;

    // Regular Expression used to parse the "pagToken" out of the Play Store response
    Regex pagTokenRegex = new Regex(@"GAEi+.+\:S\:.{11}\\x22", RegexOptions.Compiled);

    // Response Parser
    PlayStoreParser parser = new PlayStoreParser();

    // The search url is the same for the initial request and for every page
    string crawlUrl = String.Format(Consts.CRAWL_URL, searchField);

    // Executing Web Requests
    using (WebRequests server = new WebRequests())
    {
        // Creating Request Object
        server.Headers.Add(Consts.ACCEPT_LANGUAGE);
        server.Host = Consts.HOST;
        server.UserAgent = Consts.GITHUBURL;
        server.Encoding = "utf-8";

        // Checking for the need to use "HTTP Proxies"
        if (shouldUseProxies)
        {
            server.Proxy = ProxiesLoader.GetWebProxy();
        }

        // Executing Initial Request. "response" always holds the last page that was received successfully.
        string response = server.Post(crawlUrl, Consts.INITIAL_POST_DATA);

        // Without an initial page there is nothing to parse nor to paginate
        // (Regex.Match would throw on a null input)
        if (String.IsNullOrEmpty(response))
        {
            _logger.Error("Empty response for initial request of Search Term : [ " + searchField + " ]");
            return;
        }

        // Parsing Links out of Html Page (Initial Request)
        foreach (string url in parser.ParseAppUrls(response))
        {
            foundUrls.Add(url);

            // Checks whether the app have been already processed or is queued to be processed
            if ((!_mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!_mongoDB.AppQueued(url)))
            {
                // Than, queue it :)
                _mongoDB.AddToQueue(url);
                Thread.Sleep(250); // Hiccup
            }
        }

        // Executing Requests for more Play Store Links
        int errorsCount = 0;

        do
        {
            // Finding pagToken on the last successful page
            var rgxMatch = pagTokenRegex.Match(response);

            // If there's no match, there are no more pages
            if (!rgxMatch.Success)
            {
                break;
            }

            // Reading Match from Regex, and applying needed replacements
            string pagToken = rgxMatch.Value.Replace(":S:", "%3AS%3A").Replace("\\x22", String.Empty).Replace(@"\\u003d", String.Empty);

            // Assembling new PostData with paging values
            string postData = String.Format(Consts.POST_DATA, pagToken);

            // Executing request for values
            string pageResponse = server.Post(crawlUrl, postData);

            // Checking Server Status. A failed request must not replace the last successful page,
            // so that the same page is requested again on the next iteration.
            if (server.StatusCode != System.Net.HttpStatusCode.OK || String.IsNullOrEmpty(pageResponse))
            {
                _logger.Error("Http Error" + " - Status Code [ " + server.StatusCode + " ]");
                errorsCount++;
                continue;
            }

            response = pageResponse;

            // Parsing Links
            foreach (string url in parser.ParseAppUrls(response))
            {
                // An url seen before during this search means the results started to repeat
                if (!foundUrls.Add(url))
                {
                    isDonePagging = true;
                    break;
                }

                // Checks whether the app have been already processed or is queued to be processed
                if ((!_mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!_mongoDB.AppQueued(url)))
                {
                    // Than, queue it :)
                    _mongoDB.AddToQueue(url);
                    Thread.Sleep(250); // Hiccup
                }
            }
        } while (!isDonePagging && parser.AnyResultFound(response) && errorsCount <= Consts.MAX_REQUEST_ERRORS);
    }
}
/// <summary>
/// Entry point of the crawler: crawls the bootstrap categories followed by every bootstrap
/// list of search terms.
/// </summary>
/// <param name="args">
/// Optional. When exactly one argument is received it is used as the path of a text file with
/// the proxies to load, and the crawl is executed through proxies.
/// </param>
static void Main(string[] args)
{
    // Log setup
    LogSetup.InitializeLog("PlayStoreCrawler.log", "info");
    _logger = LogManager.GetCurrentClassLogger();

    // A single argument switches proxies on; that argument is the proxies file
    bool isUsingProxies = (args != null) && (args.Length == 1);

    if (isUsingProxies)
    {
        _logger.Info("Loading Proxies from File");

        string proxiesPath = args[0];

        // Without the file there is nothing to load: the process ends with code -100
        if (!File.Exists(proxiesPath))
        {
            _logger.Fatal("Couldnt find proxies on path : " + proxiesPath);
            System.Environment.Exit(-100);
        }

        string[] proxyLines = File.ReadAllLines(proxiesPath, Encoding.GetEncoding("UTF-8"));

        try
        {
            ProxiesLoader.Load(proxyLines.ToList());
        }
        catch (Exception loadError)
        {
            // A failed load ends the process with code -101
            _logger.Fatal(loadError);
            System.Environment.Exit(-101);
        }
    }

    _logger.Info("Started Bootstrapping Steps");

    // 1) Play Store categories
    foreach (var categoryEntry in BootstrapTerms.categoriesAndNames)
    {
        CrawlCategory(categoryEntry.Key, categoryEntry.Value, isUsingProxies);
    }

    // 2) Characters used as search terms
    foreach (var letterTerm in BootstrapTerms.charactersSearchTerms)
    {
        CrawlStore(letterTerm, isUsingProxies);
    }

    // More search terms can be added to the lists below to increase the crawler's reach

    // 3) App categories used as search terms
    foreach (var categoryTerm in BootstrapTerms.categoriesSearchTerms)
    {
        CrawlStore(categoryTerm, isUsingProxies);
    }

    // 4) Miscellaneous search terms
    foreach (var extraTerm in BootstrapTerms.miscSearchTerms)
    {
        CrawlStore(extraTerm, isUsingProxies);
    }

    // 5) Country names used as search terms
    foreach (var country in BootstrapTerms.countryNames)
    {
        CrawlStore(country, isUsingProxies);
    }
}
/// <summary>
/// Executes a search using <paramref name="searchField"/> as the search parameter and
/// pages through the search results, adding the url of every app that is neither
/// processed nor already queued to a Mongo "QUEUE" collection.
/// Paging stops when no page token can be found in the last successful response, when the
/// parser reports no result for it, or when more than Consts.MAX_REQUEST_ERRORS requests failed.
/// </summary>
/// <param name="searchField">Search term formatted into Consts.CRAWL_URL</param>
/// <param name="shouldUseProxies">When true, requests use the proxy returned by ProxiesLoader.GetWebProxy()</param>
private static void CrawlStore(string searchField, bool shouldUseProxies)
{
    // Console Feedback
    _logger.Info("Crawling Search Term : [ " + searchField + " ]");

    // Regular Expression used to parse the "pagToken" out of the Play Store response
    Regex pagTokenRegex = new Regex(@"GAEi+.+\:S\:.{11}\\42", RegexOptions.Compiled);

    // Configuring MongoDB Wrapper
    MongoDBWrapper mongoDB = new MongoDBWrapper();
    string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Ensuring the database has the proper index
    mongoDB.EnsureIndex("Url");

    // Response Parser
    PlayStoreParser parser = new PlayStoreParser();

    // The search url does not change between pages, so it is built only once
    string searchUrl = String.Format(Consts.CRAWL_URL, searchField);

    // Executing Web Requests
    using (WebRequests server = new WebRequests())
    {
        // Configuring Request Object
        server.Headers.Add(Consts.ACCEPT_LANGUAGE);
        server.Host = Consts.HOST;
        server.UserAgent = Consts.GITHUBURL;
        server.Encoding = "utf-8";

        // Checking for the need to use "HTTP Proxies"
        if (shouldUseProxies)
        {
            server.Proxy = ProxiesLoader.GetWebProxy();
        }

        // Executing Initial Request
        string response = server.Post(searchUrl, Consts.INITIAL_POST_DATA);

        // Without a body there is nothing to parse and no page token to follow
        // (Regex.Match throws on a null input), so this search term is given up on
        if (String.IsNullOrEmpty(response))
        {
            _logger.Error("Empty response for Search Term : [ " + searchField + " ]");
            return;
        }

        // Queueing apps found on the initial page
        QueueUnseenApps(parser, mongoDB, response);

        // Executing Requests for more Play Store Links
        int errorsCount = 0;

        do
        {
            // Finding pagToken within the last successful response
            var rgxMatch = pagTokenRegex.Match(response);

            // No token means there is no further page to request
            if (!rgxMatch.Success)
            {
                break;
            }

            // Reading Match from Regex, and applying needed replacements
            // NOTE(review): the last verbatim literal holds two backslashes before "u003d" - confirm that matches the raw response
            string pagToken = rgxMatch.Value.Replace(":S:", "%3AS%3A").Replace("\\42", String.Empty).Replace(@"\\u003d", String.Empty);

            // Assembling new PostData with paging values
            string postData = String.Format(Consts.POST_DATA, pagToken);

            // The result is kept apart from "response" so that a failed request does not
            // discard the page token needed to retry the same page
            string pageResponse = server.Post(searchUrl, postData);

            // Checking Server Status
            if (server.StatusCode != System.Net.HttpStatusCode.OK)
            {
                _logger.Error("Http Error" + " - Status Code [ " + server.StatusCode + " ]");
                errorsCount++;

                // "continue" jumps to the loop condition, which still sees the last good response
                continue;
            }

            response = pageResponse;

            // Queueing apps found on this page
            QueueUnseenApps(parser, mongoDB, response);

        } while (parser.AnyResultFound(response) && errorsCount <= Consts.MAX_REQUEST_ERRORS);
    }
}

/// <summary>
/// Adds to the queue every app url parsed out of <paramref name="response"/> that is
/// neither processed nor already queued, sleeping 250 ms after each insert.
/// </summary>
/// <param name="parser">Parser used to read the app urls out of the response</param>
/// <param name="mongoDB">Configured wrapper used for the processed / queued checks and the insert</param>
/// <param name="response">Response body of a search request</param>
private static void QueueUnseenApps(PlayStoreParser parser, MongoDBWrapper mongoDB, string response)
{
    foreach (string url in parser.ParseAppUrls(response))
    {
        // Checks whether the app has been already processed or is queued to be processed
        bool alreadyKnown = mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url) || mongoDB.AppQueued(url);

        if (alreadyKnown)
        {
            _logger.Info("Duplicated App. Skipped");
            continue;
        }

        _logger.Info("Queued App");

        // Then, queue it :)
        mongoDB.AddToQueue(url);
        Thread.Sleep(250); // Hiccup
    }
}
/// <summary>
/// Entry point of the worker. Sets up logging and configuration, optionally loads HTTP
/// proxies from the file named by the single command-line argument, and then loops forever:
/// it dequeues character-page urls, downloads each page, and enqueues the numeric urls
/// parsed out of it, following the "last page" link until the parser reports the last page
/// or a page yields no new url. Every dequeued message is deleted once it was handled,
/// whether handling succeeded or not.
/// </summary>
/// <param name="args">
/// Either no usable argument (proxies disabled) or exactly one element: the path of the
/// text file whose lines are handed to <c>ProxiesLoader.Load</c>.
/// </param>
static void Main(string[] args)
{
    // Creating Needed Instances
    RequestsHandler httpClient = new RequestsHandler();
    AppStoreParser parser = new AppStoreParser();

    // Setting up Log
    LogSetup.InitializeLog("Apple_Store_Urls_Worker.log", "info");
    _logger = LogManager.GetCurrentClassLogger();

    // Loading Config
    _logger.Info("Loading Configurations from App.config");
    LoadConfiguration();

    // Control Variable (Bool - Should the process use proxies? )
    bool shouldUseProxies = false;

    // Checking for the need to use proxies
    if (args != null && args.Length == 1)
    {
        // Setting flag to true
        shouldUseProxies = true;

        // Loading proxies from .txt received as argument
        String fPath = args[0];

        // Sanity Check
        if (!File.Exists(fPath))
        {
            _logger.Fatal("Couldnt find proxies on path : " + fPath);
            System.Environment.Exit(-100);
        }

        // Reading Proxies from File
        string[] fLines = File.ReadAllLines(fPath, Encoding.GetEncoding("UTF-8"));

        try
        {
            // Actual Load of Proxies
            ProxiesLoader.Load(fLines.ToList());
        }
        catch (Exception ex)
        {
            _logger.Fatal(ex);
            System.Environment.Exit(-101);
        }
    }

    // AWS Queue Handlers
    _logger.Info("Initializing Queues");
    AWSSQSHelper charactersUrlQueue = new AWSSQSHelper(_characterUrlsQueueName, _maxMessagesPerDequeue, _awsKey, _awsKeySecret);
    AWSSQSHelper numericUrlQueue = new AWSSQSHelper(_numericUrlsQueueName, _maxMessagesPerDequeue, _awsKey, _awsKeySecret);

    // Setting Error Flag to No Error ( 0 )
    System.Environment.ExitCode = 0;

    // Initializing Control Variables
    int fallbackWaitTime = 1;

    _logger.Info("Started Processing Character Urls");

    do
    {
        try
        {
            // Dequeueing messages from the Queue
            if (!charactersUrlQueue.DeQueueMessages())
            {
                Thread.Sleep(_hiccupTime); // Hiccup
                continue;
            }

            // Checking for no message received, and false positives situations
            if (!charactersUrlQueue.AnyMessageReceived())
            {
                // If no message was found, increases the wait time
                int waitTime;

                if (fallbackWaitTime <= 12)
                {
                    // Exponential increase on the wait time (milliseconds), truncated after 12 retries
                    waitTime = Convert.ToInt32(Math.Pow(2, fallbackWaitTime) * 1000);
                }
                else // Resetting wait after 12 fallbacks
                {
                    waitTime = 2000;
                    fallbackWaitTime = 0;
                }

                fallbackWaitTime++;

                // Sleeping before next try. waitTime is in milliseconds (it feeds Thread.Sleep),
                // so it is converted before being reported as seconds
                Console.WriteLine("Fallback (seconds) => " + (waitTime / 1000));
                Thread.Sleep(waitTime);
                continue;
            }

            // Resetting fallback time
            fallbackWaitTime = 1;

            // Iterating over dequeued Messages
            foreach (var characterUrl in charactersUrlQueue.GetDequeuedMessages())
            {
                // Console Feedback
                _logger.Info("Started Parsing Url : " + characterUrl.Body);

                try
                {
                    // Retries Counter
                    int retries = 0;
                    string htmlResponse;

                    // Retrying if necessary. The same "blank response" test drives both the retry
                    // bookkeeping and the loop condition, so a whitespace-only response always
                    // counts as a retry and sleeps instead of spinning
                    do
                    {
                        // Executing Http Request for the Character Url
                        htmlResponse = httpClient.Get(characterUrl.Body, shouldUseProxies);

                        if (String.IsNullOrWhiteSpace(htmlResponse))
                        {
                            _logger.Info("Retrying Request for Character Page");
                            retries++;

                            // Small Hiccup
                            Thread.Sleep(_hiccupTime);
                        }

                    } while (String.IsNullOrWhiteSpace(htmlResponse) && retries <= _maxRetries);

                    // Checking if retries failed
                    if (String.IsNullOrWhiteSpace(htmlResponse))
                    {
                        // Moves on; the "finally" block below deletes the message exactly once
                        continue;
                    }

                    // Hashset of urls processed (to avoid duplicates)
                    HashSet<String> urlsQueued = new HashSet<String>();

                    // Executing Requests and Queueing Urls until there's no other Url to be queued
                    do
                    {
                        // Flag to check whether any url was added on this iteration (avoids endless loop)
                        bool anyNewUrl = false;

                        // Parses the Urls out of the page, html-decoding each of them once
                        foreach (string numericUrl in parser.ParseNumericUrls(htmlResponse).Select(t => HttpUtility.HtmlDecode(t)))
                        {
                            // Add returns false when this url was previously queued
                            if (urlsQueued.Add(numericUrl))
                            {
                                // Enqueueing the same (already decoded) value that was recorded locally
                                numericUrlQueue.EnqueueMessage(numericUrl);
                                anyNewUrl = true;
                            }
                        }

                        // Checking for the need to perform another http request for the next page
                        if (parser.IsLastPage(htmlResponse) || !anyNewUrl)
                        {
                            break; // Breaks "While" Loop
                        }

                        // Feedback
                        // NOTE(review): the message has no placeholder for the second argument; presumably the logger ignores it - confirm
                        _logger.Info("Urls Queued For This Page : " + urlsQueued.Count, "\n\tProcessing Feedback");

                        // If it got to this point, it means that there are more pages to be processed
                        // Parsing URL of the "Last" page (the last that's visible)
                        string lastPageUrl = HttpUtility.HtmlDecode(parser.ParseLastPageUrl(htmlResponse));

                        // Executing Http Request for this Url (with retries)
                        retries = 0;

                        do
                        {
                            // HTTP Get for the Page
                            htmlResponse = httpClient.Get(lastPageUrl, shouldUseProxies);

                            if (String.IsNullOrWhiteSpace(htmlResponse))
                            {
                                _logger.Error("Retrying Request for Last Page");
                                retries++;

                                // Small Hiccup
                                Thread.Sleep(_hiccupTime);
                            }

                        } while (String.IsNullOrWhiteSpace(htmlResponse) && retries <= _maxRetries);

                        // Retries exhausted: there is no page left to parse for this message
                        if (String.IsNullOrWhiteSpace(htmlResponse))
                        {
                            break;
                        }

                    } while (true);
                }
                catch (Exception ex)
                {
                    _logger.Error(ex);
                }
                finally
                {
                    // Runs on success, on failure and on the early "continue" above
                    charactersUrlQueue.DeleteMessage(characterUrl);
                }
            }
        }
        catch (Exception ex)
        {
            _logger.Error(ex);
        }

    } while (true);
}