/// <summary>
/// Entry point of the worker piece of the process.
/// Notice that you can run as many workers as you want to in order to make the crawling faster.
/// Pops queued app urls from MongoDB, downloads and parses each app page, stores the parsed
/// model and queues any related app urls found on the page.
/// </summary>
/// <param name="args">Optional single argument: path to a .txt file listing HTTP proxies (one per line)</param>
static void Main(string[] args)
{
    // Configuring Log Object
    LogSetup.InitializeLog("PlayStoreWorker.log", "info");
    Logger logger = LogManager.GetCurrentClassLogger();
    logger.Info("Worker Started");

    // Control Variable (Bool - Should the process use proxies? )
    bool isUsingProxies = false;

    // Checking for the need to use proxies
    if (args != null && args.Length == 1)
    {
        isUsingProxies = true;

        // Loading proxies from .txt received as argument
        String fPath = args[0];

        // Sanity Check - abort if the proxies file cannot be found
        if (!File.Exists(fPath))
        {
            logger.Fatal("Couldnt find proxies on path : " + fPath);
            System.Environment.Exit(-100);
        }

        // Reading Proxies from File
        string[] fLines = File.ReadAllLines(fPath, Encoding.GetEncoding("UTF-8"));

        try
        {
            // Actual Load of Proxies
            ProxiesLoader.Load(fLines.ToList());
        }
        catch (Exception ex)
        {
            logger.Fatal(ex);
            System.Environment.Exit(-101);
        }
    }

    // Parser
    PlayStoreParser parser = new PlayStoreParser();

    // Configuring MongoDB Wrapper
    MongoDBWrapper mongoDB = new MongoDBWrapper();
    string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Creating Instance of Web Requests Server
    WebRequests server = new WebRequests();

    // Queued App Model
    QueuedApp app;

    // Retry Counter (Used for exponential wait increasing logic)
    int retryCounter = 0;

    // Iterating over MongoDB records until no queued document is left to be processed
    while ((app = mongoDB.FindAndModify()) != null)
    {
        try
        {
            // Building APP URL - prefix relative urls with the Play Store base address
            string appUrl = app.Url;
            if (app.Url.IndexOf("http", StringComparison.OrdinalIgnoreCase) < 0)
            {
                appUrl = Consts.APP_URL_PREFIX + app.Url;
            }

            // Checking if this app is on the database already
            if (mongoDB.AppProcessed(appUrl))
            {
                // Console Feedback, Comment this line to disable if you want to
                logger.Info("Duplicated App, skipped.");

                // Delete it from the queue and continues the loop
                mongoDB.RemoveFromQueue(app.Url);
                continue;
            }

            // Configuring server and Issuing Request
            server.Headers.Add(Consts.ACCEPT_LANGUAGE);
            server.Host = Consts.HOST;
            server.UserAgent = Consts.GITHUBURL; // NOTE(review): User-Agent is set to the project's GitHub url constant - confirm intentional
            server.Encoding = "utf-8";
            server.EncodingDetection = WebRequests.CharsetDetection.DefaultCharset;

            // Checking for the need to use "HTTP Proxies"
            if (isUsingProxies)
            {
                server.Proxy = ProxiesLoader.GetWebProxy();
            }

            // Issuing HTTP Request
            string response = server.Get(appUrl);

            // Flag Indicating Success while processing and parsing this app
            bool processingWorked = true;

            // Sanity Check - an empty body or a non-200 status means the request failed
            if (String.IsNullOrEmpty(response) || server.StatusCode != System.Net.HttpStatusCode.OK)
            {
                logger.Info("Error opening app page : " + appUrl);
                processingWorked = false;

                if (isUsingProxies)
                {
                    ProxiesLoader.IncrementCurrentProxy();
                }

                // BUGFIX: capture the failed status code BEFORE renewing the server object.
                // The original code renewed "server" first and then read server.StatusCode,
                // which inspected the fresh object's default status instead of the failed request's.
                var failedStatusCode = server.StatusCode;

                // Renewing WebRequest Object to get rid of Cookies
                server = new WebRequests();

                // Fallback time variable (milliseconds)
                double waitTime;

                // With proxies in use a short fixed wait is enough (the next request rotates the IP);
                // without proxies the process must back off exponentially to avoid being blocked
                if (isUsingProxies)
                {
                    // Waits two seconds everytime
                    waitTime = TimeSpan.FromSeconds(2).TotalMilliseconds;
                }
                else
                {
                    // Increments retry counter
                    retryCounter++;

                    // Checking for maximum retry count
                    if (retryCounter >= 8)
                    {
                        // Backoff is capped at 20 minutes
                        waitTime = TimeSpan.FromMinutes(20).TotalMilliseconds;
                    }
                    else
                    {
                        // Calculating next wait time ( 2 ^ retryCounter seconds)
                        waitTime = TimeSpan.FromSeconds(Math.Pow(2, retryCounter)).TotalMilliseconds;
                    }
                }

                // Hiccup to avoid google blocking connections in case of heavy traffic from the same IP
                logger.Info("======================================================");
                // BUGFIX: waitTime is in milliseconds - convert before logging it as seconds
                logger.Info("\n\tFallback : " + (waitTime / 1000) + " Seconds");
                Thread.Sleep(Convert.ToInt32(waitTime));

                // If the status code is "ZERO" (it means 404) - App must be removed from "Queue"
                if (failedStatusCode == 0)
                {
                    // Console Feedback
                    logger.Info("\tApp Not Found (404) - " + app.Url);
                    mongoDB.RemoveFromQueue(app.Url);
                }
                logger.Info("======================================================");
            }
            else
            {
                // Reseting retry counter
                retryCounter = 0;

                // Parsing Useful App Data
                AppModel parsedApp = parser.ParseAppPage(response, appUrl);

                // Normalizing URLs
                if (!String.IsNullOrWhiteSpace(parsedApp.DeveloperPrivacyPolicy))
                {
                    parsedApp.DeveloperPrivacyPolicy = parsedApp.DeveloperPrivacyPolicy.Replace("https://www.google.com/url?q=", String.Empty);
                }

                if (!String.IsNullOrWhiteSpace(parsedApp.DeveloperWebsite))
                {
                    parsedApp.DeveloperNormalizedDomain = parser.NormalizeDomainName(parsedApp.DeveloperWebsite);
                }

                List<String> relatedApps = new List<String>();

                // Avoiding Exceptions caused by "No Related Apps" situations - Must be treated differently
                try
                {
                    // Parsing "Related Apps" and "More From Developer" Apps (URLS Only)
                    foreach (string extraAppUrl in parser.ParseExtraApps(response))
                    {
                        relatedApps.Add(Consts.APP_URL_PREFIX + extraAppUrl);
                    }

                    // Adding "Related Apps" to Apps Model (de-duplicated)
                    parsedApp.RelatedUrls = relatedApps.Distinct().ToArray();
                }
                catch
                {
                    logger.Info("\tNo Related Apps Found. Skipping");
                }

                // Inserting App into Mongo DB Database
                if (!mongoDB.UpsertKeyEq<AppModel>(parsedApp, "Url", appUrl))
                {
                    processingWorked = false;
                }

                // If the processing failed, do not remove the app from the queue; keep it flagged
                // as not busy so that other workers can try to process it later
                if (!processingWorked)
                {
                    mongoDB.ToggleBusyApp(app, false);
                }
                else // On the other hand, if processing worked, removes it from the queue
                {
                    // Console Feedback, Comment this line to disable if you want to
                    Console.ForegroundColor = ConsoleColor.Red;
                    logger.Info("Inserted App : " + parsedApp.Name);
                    Console.ForegroundColor = ConsoleColor.White;
                    mongoDB.RemoveFromQueue(app.Url);
                }

                // Counters for console feedback only
                int extraAppsCounter = 0, newExtraApps = 0;

                // Queueing "Related Apps" and "More From Developer" apps (urls only)
                foreach (string extraAppUrl in relatedApps)
                {
                    // Incrementing counter of extra apps
                    extraAppsCounter++;

                    // Assembling full app url to check against the database
                    string fullExtraAppUrl;
                    if (extraAppUrl.IndexOf("https://play.google.com/", StringComparison.Ordinal) >= 0)
                    {
                        fullExtraAppUrl = extraAppUrl;
                    }
                    else
                    {
                        fullExtraAppUrl = Consts.APP_URL_PREFIX + extraAppUrl;
                    }

                    // Checking if the app was either processed or queued to be processed already
                    if ((!mongoDB.AppProcessed(fullExtraAppUrl)) && (!mongoDB.IsAppOnQueue(extraAppUrl)))
                    {
                        // Incrementing counter of inserted apps
                        newExtraApps++;

                        // Adds it to the queue of apps to be processed
                        mongoDB.AddToQueue(extraAppUrl);
                    }
                }

                // Console Feedback
                logger.Info("Queued " + newExtraApps + " / " + extraAppsCounter + " related apps");
            }
        }
        catch (Exception ex)
        {
            logger.Error(ex);
        }
        finally
        {
            try
            {
                // Toggles Busy status back to false
                mongoDB.ToggleBusyApp(app, false);
            }
            catch (Exception ex)
            {
                // ToggleBusyApp may raise an exception in case of lack of internet connection,
                // so this "inner catch" keeps the worker loop alive
                logger.Error(ex);
            }
        }
    }
}
/// <summary>
/// Entry point of the "Numeric Urls" worker: consumes numeric page urls from one SQS queue,
/// downloads each page, and enqueues every parsed app url into the apps-url queue.
/// Runs forever (do/while true); exceptions are logged and the loop continues.
/// </summary>
/// <param name="args">Optional single argument: path to a .txt file listing HTTP proxies (one per line)</param>
static void Main(string[] args)
{
    // Creating Needed Instances
    RequestsHandler httpClient = new RequestsHandler();
    AppStoreParser parser = new AppStoreParser();

    // Loading Configuration
    LogSetup.InitializeLog("Apple_Store_Numerics_Worker.log", "info");
    _logger = LogManager.GetCurrentClassLogger();

    // Loading Config
    _logger.Info("Loading Configurations from App.config");
    LoadConfiguration();

    // Control Variable (Bool - Should the process use proxies? )
    bool shouldUseProxies = false;

    // Checking for the need to use proxies
    if (args != null && args.Length == 1)
    {
        // Setting flag to true
        shouldUseProxies = true;

        // Loading proxies from .txt received as argument
        String fPath = args[0];

        // Sanity Check - abort if the proxies file cannot be found
        if (!File.Exists(fPath))
        {
            _logger.Fatal("Couldnt find proxies on path : " + fPath);
            System.Environment.Exit(-100);
        }

        // Reading Proxies from File
        string[] fLines = File.ReadAllLines(fPath, Encoding.GetEncoding("UTF-8"));

        try
        {
            // Actual Load of Proxies
            ProxiesLoader.Load(fLines.ToList());
        }
        catch (Exception ex)
        {
            _logger.Fatal(ex);
            System.Environment.Exit(-101);
        }
    }

    // AWS Queue Handler - source queue (numeric urls) and sink queue (app urls)
    _logger.Info("Initializing Queues");
    AWSSQSHelper numericUrlQueue = new AWSSQSHelper(_numericUrlsQueueName, _maxMessagesPerDequeue, _awsKey, _awsKeySecret);
    AWSSQSHelper appsUrlQueue = new AWSSQSHelper(_appUrlsQueueName, _maxMessagesPerDequeue, _awsKey, _awsKeySecret);

    // Setting Error Flag to No Error ( 0 )
    System.Environment.ExitCode = 0;

    // Initializing Control Variables
    int fallbackWaitTime = 1;

    _logger.Info("Started Processing Numeric Urls");

    do
    {
        try
        {
            // Dequeueing messages from the Queue
            if (!numericUrlQueue.DeQueueMessages())
            {
                Thread.Sleep(_hiccupTime); // Hiccup
                continue;
            }

            // Checking for no message received, and false positives situations
            if (!numericUrlQueue.AnyMessageReceived())
            {
                // If no message was found, increases the wait time
                int waitTime;

                if (fallbackWaitTime <= 12)
                {
                    // Exponential increase on the wait time, truncated after 12 retries
                    waitTime = Convert.ToInt32(Math.Pow(2, fallbackWaitTime) * 1000);
                }
                else // Reseting Wait after 12 fallbacks
                {
                    waitTime = 2000;
                    fallbackWaitTime = 0;
                }

                fallbackWaitTime++;

                // Sleeping before next try
                // NOTE(review): waitTime is in MILLISECONDS; the "(seconds)" label below is misleading
                Console.WriteLine("Fallback (seconds) => " + waitTime);
                Thread.Sleep(waitTime);
                continue;
            }

            // Reseting fallback time
            fallbackWaitTime = 1;

            // Iterating over dequeued Messages
            foreach (var numericUrl in numericUrlQueue.GetDequeuedMessages())
            {
                try
                {
                    // Retries Counter
                    int retries = 0;
                    string htmlResponse;

                    // Retrying if necessary
                    do
                    {
                        // Executing Http Request for the Category Url
                        htmlResponse = httpClient.Get(numericUrl.Body, shouldUseProxies);

                        if (String.IsNullOrEmpty(htmlResponse))
                        {
                            _logger.Info("Retrying Request for Category Page");
                            retries++;
                        }

                    } while (String.IsNullOrWhiteSpace(htmlResponse) && retries <= _maxRetries);

                    // Checking if retries failed
                    if (String.IsNullOrWhiteSpace(htmlResponse))
                    {
                        // Deletes Message and moves on
                        // NOTE(review): a permanently failing url is dropped here AND deleted again
                        // in the finally block below (double DeleteMessage) - confirm this is intended
                        numericUrlQueue.DeleteMessage(numericUrl);
                        continue;
                    }

                    // Feedback
                    _logger.Info("Current page " + numericUrl.Body);

                    foreach (var parsedAppUrl in parser.ParseAppsUrls(htmlResponse))
                    {
                        // Enqueueing App Urls (html-decoded before being queued)
                        appsUrlQueue.EnqueueMessage(HttpUtility.HtmlDecode(parsedAppUrl));
                    }
                }
                catch (Exception ex)
                {
                    _logger.Info(ex);
                }
                finally
                {
                    // Deleting the message - runs for success and failure alike,
                    // so a message is never retried by another worker
                    numericUrlQueue.DeleteMessage(numericUrl);
                }
            }
        }
        catch (Exception ex)
        {
            _logger.Error(ex);
        }

    } while (true);
}
/// <summary>
/// Entry point of the bootstrapper: fetches the App Store root page, extracts every
/// category url from it and enqueues them on SQS for the downstream workers.
/// </summary>
/// <param name="args">Optional single argument: path to a .txt file listing HTTP proxies (one per line)</param>
static void Main(string[] args)
{
    // Instances needed for the whole bootstrap flow
    RequestsHandler requestHandler = new RequestsHandler();
    AppStoreParser storeParser = new AppStoreParser();

    // Log must be ready before anything gets reported
    LogSetup.InitializeLog("Apple_Store_Crawler.log", "info");
    _logger = LogManager.GetCurrentClassLogger();

    _logger.Info("Worker Started");

    // Pulling settings out of App.config
    _logger.Info("Reading Configuration");
    LoadConfiguration();

    // A single command-line argument means "load HTTP proxies from this file"
    bool useProxies = (args != null && args.Length == 1);

    if (useProxies)
    {
        String proxiesPath = args[0];

        // Bail out immediately when the proxies file is missing
        if (!File.Exists(proxiesPath))
        {
            _logger.Fatal("Couldnt find proxies on path : " + proxiesPath);
            System.Environment.Exit(-100);
        }

        string[] proxyLines = File.ReadAllLines(proxiesPath, Encoding.GetEncoding("UTF-8"));

        try
        {
            ProxiesLoader.Load(proxyLines.ToList());
        }
        catch (Exception ex)
        {
            _logger.Fatal(ex);
            System.Environment.Exit(-101);
        }
    }

    // SQS queue that will receive the category urls
    _logger.Info("Initializing Queues");
    AWSSQSHelper categoriesQueue = new AWSSQSHelper(_categoriesQueueName, 10, _awsKey, _awsKeySecret);

    // Step 1 - Obtain the root page html (source of all the apps)
    var rootHtml = requestHandler.GetRootPage(useProxies);

    // Nothing can be bootstrapped without the root page - abort
    if (String.IsNullOrWhiteSpace(rootHtml))
    {
        _logger.Info("Error obtaining Root Page HTMl - Aborting", "Timeout Error");
        return;
    }

    // Step 2 - Extract each category url from the root page and queue it
    foreach (var url in storeParser.ParseCategoryUrls(rootHtml))
    {
        _logger.Info("Queueing Category : " + url);
        categoriesQueue.EnqueueMessage(url);
    }

    _logger.Info("End of Bootstrapping phase");
}
/// <summary>
/// Entry point of the "App Urls" worker: consumes individual app page urls from SQS,
/// downloads and parses each page, and enqueues the parsed app data (as JSON) on the
/// apps-data queue. Runs forever (do/while true).
/// </summary>
/// <param name="args">Optional single argument: path to a .txt file listing HTTP proxies (one per line)</param>
static void Main(string[] args)
{
    // Creating Needed Instances
    RequestsHandler httpClient = new RequestsHandler();
    AppStoreParser parser = new AppStoreParser();

    // Loading Configuration
    LogSetup.InitializeLog("Apple_Store_Urls_Worker.log", "info");
    _logger = LogManager.GetCurrentClassLogger();

    // Loading Config
    _logger.Info("Loading Configurations from App.config");
    LoadConfiguration();

    // Control Variable (Bool - Should the process use proxies? )
    bool shouldUseProxies = false;

    // Checking for the need to use proxies
    if (args != null && args.Length == 1)
    {
        // Setting flag to true
        shouldUseProxies = true;

        // Loading proxies from .txt received as argument
        String fPath = args[0];

        // Sanity Check - abort if the proxies file cannot be found
        if (!File.Exists(fPath))
        {
            _logger.Fatal("Couldnt find proxies on path : " + fPath);
            System.Environment.Exit(-100);
        }

        // Reading Proxies from File
        string[] fLines = File.ReadAllLines(fPath, Encoding.GetEncoding("UTF-8"));

        try
        {
            // Actual Load of Proxies
            ProxiesLoader.Load(fLines.ToList());
        }
        catch (Exception ex)
        {
            _logger.Fatal(ex);
            System.Environment.Exit(-101);
        }
    }

    // AWS Queue Handler - source queue (app urls) and sink queue (parsed app data)
    _logger.Info("Initializing Queues");
    AWSSQSHelper appsUrlQueue = new AWSSQSHelper(_appUrlsQueueName, _maxMessagesPerDequeue, _awsKey, _awsKeySecret);
    AWSSQSHelper appsDataQueue = new AWSSQSHelper(_appsDataQueueName, _maxMessagesPerDequeue, _awsKey, _awsKeySecret);

    // Setting Error Flag to No Error ( 0 )
    System.Environment.ExitCode = 0;

    // Initializing Control Variables
    int fallbackWaitTime = 1;

    _logger.Info("Started Processing Individual Apps Urls");

    do
    {
        try
        {
            // Dequeueing messages from the Queue
            if (!appsUrlQueue.DeQueueMessages())
            {
                Thread.Sleep(_hiccupTime); // Hiccup
                continue;
            }

            // Checking for no message received, and false positives situations
            if (!appsUrlQueue.AnyMessageReceived())
            {
                // If no message was found, increases the wait time
                int waitTime;

                if (fallbackWaitTime <= 12)
                {
                    // Exponential increase on the wait time, truncated after 12 retries
                    waitTime = Convert.ToInt32(Math.Pow(2, fallbackWaitTime) * 1000);
                }
                else // Reseting Wait after 12 fallbacks
                {
                    waitTime = 2000;
                    fallbackWaitTime = 0;
                }

                fallbackWaitTime++;

                // Sleeping before next try
                // NOTE(review): waitTime is in MILLISECONDS; the "(seconds)" label below is misleading
                Console.WriteLine("Fallback (seconds) => " + waitTime);
                Thread.Sleep(waitTime);
                continue;
            }

            // Reseting fallback time
            fallbackWaitTime = 1;

            // Iterating over dequeued Messages
            foreach (var appUrl in appsUrlQueue.GetDequeuedMessages())
            {
                // Assume success; flipped to false only when an exception is caught below
                bool processingWorked = true;

                try
                {
                    // Retries Counter
                    int retries = 0;
                    string htmlResponse;

                    // Retrying if necessary
                    do
                    {
                        // Executing Http Request for the Category Url
                        // Sample urls kept for manual debugging:
                        //appUrl.Body = "https://itunes.apple.com/us/app/action-run-3d/id632371832?mt=8";
                        //appUrl.Body = "https://itunes.apple.com/us/app/emoji-2-free-new-emoticons/id521863802?mt=8";
                        //appUrl.Body = "https://itunes.apple.com/us/app/candy-crush-saga/id553834731?mt=8";
                        //appUrl.Body = "https://itunes.apple.com/us/app/dba-den-bla-avis/id448605988?mt=8";
                        htmlResponse = httpClient.Get(appUrl.Body, shouldUseProxies);

                        if (String.IsNullOrEmpty(htmlResponse))
                        {
                            // Extending Fallback time - linear backoff, capped at 30 seconds
                            retries++;
                            int sleepTime = retries * _hiccupTime <= 30000 ? retries * _hiccupTime : 30000;

                            _logger.Info("Retrying Request for App Page [ " + sleepTime / 1000 + " ]");
                            Thread.Sleep(sleepTime);
                        }

                    } while (String.IsNullOrWhiteSpace(htmlResponse) && retries <= _maxRetries);

                    // Checking if retries failed
                    if (String.IsNullOrWhiteSpace(htmlResponse))
                    {
                        // NOTE(review): processingWorked is still true here, so the finally
                        // block DELETES this message even though it was never processed -
                        // confirm whether failed urls should be kept for retry instead
                        continue;
                    }

                    // Feedback
                    _logger.Info("Current page " + appUrl.Body, "Parsing App Data");

                    // Parsing Data out of the Html Page
                    AppleStoreAppModel parsedApp = parser.ParseAppPage(htmlResponse);
                    parsedApp.url = appUrl.Body;

                    // Enqueueing App Data (serialized as JSON)
                    appsDataQueue.EnqueueMessage(parsedApp.ToJson());

                    // Little Hiccup between requests
                    Thread.Sleep(_hiccupTime);
                }
                catch (Exception ex)
                {
                    _logger.Error(ex);

                    // Setting Flag to "False" so the message stays on the queue for another attempt
                    processingWorked = false;
                }
                finally
                {
                    // Deleting the message - Only if the processing worked
                    if (processingWorked)
                    {
                        appsUrlQueue.DeleteMessage(appUrl);
                    }
                }
            }
        }
        catch (Exception ex)
        {
            _logger.Error(ex);
        }

    } while (true);
}
/// <summary>
/// Entry point of the crawler
/// </summary>
/// <param name="args">Optional single argument: path to a .txt file listing HTTP proxies (one per line)</param>
static void Main(string[] args)
{
    // Logging comes first so every bootstrap step below is traceable
    LogSetup.InitializeLog("PlayStoreCrawler.log", "info");
    _logger = LogManager.GetCurrentClassLogger();

    // A single command-line argument means "load HTTP proxies from this file"
    bool useProxies = (args != null && args.Length == 1);

    if (useProxies)
    {
        _logger.Info("Loading Proxies from File");

        String proxiesFile = args[0];

        // Bail out immediately when the proxies file is missing
        if (!File.Exists(proxiesFile))
        {
            _logger.Fatal("Couldnt find proxies on path : " + proxiesFile);
            System.Environment.Exit(-100);
        }

        string[] proxyLines = File.ReadAllLines(proxiesFile, Encoding.GetEncoding("UTF-8"));

        try
        {
            ProxiesLoader.Load(proxyLines.ToList());
        }
        catch (Exception ex)
        {
            _logger.Fatal(ex);
            System.Environment.Exit(-101);
        }
    }

    // MongoDB wiring: build the "host:port" address and configure the wrapper
    _logger.Info("Setting up MongoDB Collections and Indexes");
    _mongoDB = new MongoDBWrapper();
    string serverAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    _mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, serverAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Indexes the workers rely on to find / claim queued apps quickly
    _mongoDB.EnsureIndex("Url");
    _mongoDB.EnsureIndex("IsBusy", Consts.QUEUED_APPS_COLLECTION);
    _mongoDB.EnsureIndex("Url", Consts.QUEUED_APPS_COLLECTION);

    // Main Flow
    _logger.Info("Started Bootstrapping Steps");

    // Play Store category pages
    foreach (var categoryPair in BootstrapTerms.categoriesAndNames)
    {
        CrawlCategory(categoryPair.Key, categoryPair.Value, useProxies);
    }

    // Search terms "A" through "Z"
    foreach (var letter in BootstrapTerms.charactersSearchTerms)
    {
        CrawlStore(letter, useProxies);
    }

    /// ... Keep Adding characters / search terms in order to increase the crawler's reach

    // Category names used as plain search terms
    foreach (var categoryTerm in BootstrapTerms.categoriesSearchTerms)
    {
        CrawlStore(categoryTerm, useProxies);
    }

    // Assorted extra search terms to widen the crawl even more
    foreach (var term in BootstrapTerms.miscSearchTerms)
    {
        CrawlStore(term, useProxies);
    }

    // Country names as search terms to widen the crawl even more
    foreach (var country in BootstrapTerms.countryNames)
    {
        CrawlStore(country, useProxies);
    }
}
/// <summary>
/// Entry point of the recorder: consumes parsed app data (JSON) from SQS, buffers it,
/// and batch-inserts it into MongoDB. Every consumed message body is also copied to the
/// "DeadLetter" queue. Runs forever (do/while true).
/// </summary>
/// <param name="args">Unused</param>
static void Main(string[] args)
{
    // Loading Configuration
    LogSetup.InitializeLog("Apple_Store_Recorder.log", "info");
    _logger = LogManager.GetCurrentClassLogger();

    // Loading Config
    _logger.Info("Loading Configurations from App.config");
    LoadConfiguration();

    // Initializing Queue - source of app data plus a backup queue that receives every message body
    _logger.Info("Initializing Queue");
    AWSSQSHelper appsDataQueue = new AWSSQSHelper(_appsDataQueueName, _maxMessagesPerDequeue, _awsKey, _awsKeySecret);
    AWSSQSHelper backup = new AWSSQSHelper("DeadLetter", _maxMessagesPerDequeue, _awsKey, _awsKeySecret);

    // Creating MongoDB Instance
    _logger.Info("Loading MongoDB / Creating Instances");
    MongoDBWrapper mongoDB = new MongoDBWrapper();
    string serverAddr = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    // NOTE(review): timeout is the hard-coded literal 10000 here instead of Consts.MONGO_TIMEOUT - confirm intentional
    mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, serverAddr, 10000, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Setting Error Flag to No Error ( 0 )
    System.Environment.ExitCode = 0;

    // Initializing Control Variables
    int fallbackWaitTime = 1;

    // Buffer of Messages to be recorded
    List<AppleStoreAppModel> recordsBuffer = new List<AppleStoreAppModel>();
    List<Message> messagesBuffer = new List<Message>();

    // Insert Batch Size
    int batchSize = 1000;

    _logger.Info("Started Recording App Data");

    do
    {
        try
        {
            // Dequeueing messages from the Queue
            if (!appsDataQueue.DeQueueMessages())
            {
                Thread.Sleep(_hiccupTime); // Hiccup
                continue;
            }

            // Checking for no message received, and false positives situations
            if (!appsDataQueue.AnyMessageReceived())
            {
                // If no message was found, increases the wait time
                int waitTime;

                if (fallbackWaitTime <= 12)
                {
                    // Exponential increase on the wait time, truncated after 12 retries
                    waitTime = Convert.ToInt32(Math.Pow(2, fallbackWaitTime) * 1000);
                }
                else // Reseting Wait after 12 fallbacks
                {
                    waitTime = 2000;
                    fallbackWaitTime = 0;
                }

                fallbackWaitTime++;

                // Sleeping before next try
                // NOTE(review): waitTime is in MILLISECONDS; the "(seconds)" label below is misleading
                Console.WriteLine("Fallback (seconds) => " + waitTime);
                Thread.Sleep(waitTime);
                continue;
            }

            // Reseting fallback time
            fallbackWaitTime = 1;

            // Iterating over dequeued Messages
            foreach (var appDataMessage in appsDataQueue.GetDequeuedMessages())
            {
                try
                {
                    // Deserializing message
                    var appData = AppleStoreAppModel.FromJson(appDataMessage.Body);

                    // Dumping "Url" to "_id" so the app url is the Mongo document key
                    appData._id = appData.url;

                    // Adding it to the buffer of records to be recorded
                    recordsBuffer.Add(appData);

                    // Adding message to the buffer of messages to be deleted
                    messagesBuffer.Add(appDataMessage);

                    // Is it time to batch insert ?
                    if ((recordsBuffer.Count % batchSize) == 0)
                    {
                        // Batch Insertion
                        mongoDB.BatchInsert<AppleStoreAppModel>(recordsBuffer);

                        // Logging Feedback
                        _logger.Info("\tApps Recorded : " + recordsBuffer.Count);

                        // Deleting Messages
                        // NOTE(review): these messages were ALREADY deleted one-by-one in the
                        // finally block below, so this batch delete is a redundant second delete
                        messagesBuffer.ForEach((msg) => appsDataQueue.DeleteMessage(msg));

                        _logger.Info("\tMessages Deleted: " + messagesBuffer.Count);

                        // Clearing Buffers
                        recordsBuffer.Clear();
                        messagesBuffer.Clear();
                    }
                }
                catch (Exception ex)
                {
                    _logger.Error(ex);
                }
                finally
                {
                    // Deleting the message
                    // NOTE(review): deleting here (before the batch insert flushes) means up to
                    // batchSize-1 buffered records are lost if the process dies before the next
                    // flush; the backup enqueue below copies EVERY message body to "DeadLetter"
                    // unconditionally, including successfully processed ones - confirm intended
                    appsDataQueue.DeleteMessage(appDataMessage);
                    backup.EnqueueMessage(appDataMessage.Body);
                }
            }
        }
        catch (Exception ex)
        {
            _logger.Error(ex);
        }

    } while (true);
}
/// <summary>
/// Entry point of the crawler
/// </summary>
/// <param name="args">Optional single argument: path to a .txt file listing HTTP proxies (one per line)</param>
static void Main(string[] args)
{
    // Logging comes first so the bootstrap steps below are traceable
    LogSetup.InitializeLog("PlayStoreCrawler.log", "info");
    _logger = LogManager.GetCurrentClassLogger();

    // One command-line argument means "load HTTP proxies from this file"
    bool useProxies = (args != null && args.Length == 1);

    if (useProxies)
    {
        _logger.Info("Loading Proxies from File");

        String proxiesPath = args[0];

        // Abort right away when the proxies file cannot be located
        if (!File.Exists(proxiesPath))
        {
            _logger.Fatal("Couldnt find proxies on path : " + proxiesPath);
            System.Environment.Exit(-100);
        }

        string[] proxyLines = File.ReadAllLines(proxiesPath, Encoding.GetEncoding("UTF-8"));

        try
        {
            ProxiesLoader.Load(proxyLines.ToList());
        }
        catch (Exception ex)
        {
            _logger.Fatal(ex);
            System.Environment.Exit(-101);
        }
    }

    // Main Flow
    _logger.Info("Started Bootstrapping Steps");

    // Play Store category pages
    foreach (var categoryPair in BootstrapTerms.categoriesAndNames)
    {
        CrawlCategory(categoryPair.Key, categoryPair.Value, useProxies);
    }

    // Search terms "A" through "Z"
    foreach (var letter in BootstrapTerms.charactersSearchTerms)
    {
        CrawlStore(letter, useProxies);
    }

    /// ... Keep Adding characters / search terms in order to increase the crawler's reach

    // Category names used as plain search terms
    foreach (var categoryTerm in BootstrapTerms.categoriesSearchTerms)
    {
        CrawlStore(categoryTerm, useProxies);
    }

    // Assorted extra search terms to widen the crawl even more
    foreach (var term in BootstrapTerms.miscSearchTerms)
    {
        CrawlStore(term, useProxies);
    }

    // Country names as search terms to widen the crawl even more
    foreach (var country in BootstrapTerms.countryNames)
    {
        CrawlStore(country, useProxies);
    }
}
/// <summary>
/// Entry point of the crawler.
/// Seeds the crawl by running a store search for every term below: single letters,
/// category names, assorted extra terms and country names.
/// </summary>
/// <param name="args">Unused</param>
static void Main(string[] args)
{
    // Setting Up Log
    LogSetup.InitializeLog("PlayStoreCrawler.log", "info");
    _logger = LogManager.GetCurrentClassLogger();

    // Crawling App Store using all characters as the Search Input
    // (original call order preserved: "W" was listed last)
    string[] characterTerms =
    {
        "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
        "N", "O", "P", "Q", "R", "S", "T", "U", "V", "X", "Y", "Z", "W",
    };

    // APP CATEGORIES used as search terms
    string[] categoryTerms =
    {
        "BOOKS", "BUSINESS", "COMICS", "COMMUNICATION", "EDUCATION",
        "ENTERTAINMENT", "FINANCE", "HEALTH", "LIFESTYLE", "LIVE WALLPAPER",
        "MEDIA", "MEDICAL", "MUSIC", "NEWS", "PERSONALIZATION", "PHOTOGRAPHY",
        "PRODUCTIVITY", "SHOPPING", "SOCIAL", "SPORTS", "TOOLS",
        "TRANSPORTATION", "TRAVEL", "WEATHER", "WIDGETS", "ARCADE", "BRAIN",
        "CASUAL", "CARDS", "RACING",
    };

    // Extra "Random" Search terms to increase even more the crawler's reach.
    // BUGFIX: the original chain called CrawlStore twice for "P**N"; duplicate removed.
    string[] miscTerms =
    {
        "INDIE", "ZOMBIE", "CATS", "ROOT", "GPS", "BLUETOOTH", "COMPASS",
        "WALLPAPER", "TORRENT", "P**N", "PLAYER", "WINE", "ANTIVIRUS",
    };

    // Country Names as Search terms to increase even more the crawler's reach.
    // BUGFIX: the original chain repeated several terms because multi-word country
    // names were truncated to their first word ("Congo", "Guinea", "Netherlands",
    // "New", "Saint" x5, "South" x4, "United" x3); each term is now crawled once.
    string[] countryTerms =
    {
        "Afghanistan", "Albania", "Algeria", "American", "Andorra", "Angola",
        "Anguilla", "Antigua", "Argentina", "Armenia", "Aruba", "Australia",
        "Austria", "Azerbaijan", "Bahamas", "Bahrain", "Bangladesh", "Barbados",
        "Belarus", "Belgium", "Belize", "Benin", "Bermuda", "Bhutan", "Bolivia",
        "Bosnia", "Botswana", "Bouvet", "Brazil", "Brunei", "Bulgaria",
        "Burkina", "Burundi", "Cambodia", "Cameroon", "Canada", "Cape",
        "Cayman", "Central", "Chad", "Chile", "China", "Christmas", "Cocos",
        "Colombia", "Comoros", "Congo", "Cook", "Costa", "Croatia", "Cuba",
        "Cyprus", "Czech", "Denmark", "Djibouti", "Dominica", "Dominican",
        "Ecuador", "Egypt", "El", "Equatorial", "Eritrea", "Estonia",
        "Ethiopia", "Falkland", "Faroe", "Fiji", "Finland", "France", "French",
        "Gabon", "Gambia", "Georgia", "Germany", "Ghana", "Gibraltar", "Greece",
        "Greenland", "Grenada", "Guadeloupe", "Guam", "Guatemala", "Guinea",
        "Guyana", "Haiti", "Holy", "Honduras", "Hong", "Hungary", "Iceland",
        "India", "Indonesia", "Iran", "Iraq", "Ireland", "Israel", "Italy",
        "Ivory", "Jamaica", "Japan", "Jordan", "Kazakhstan", "Kenya",
        "Kiribati", "Kuwait", "Kyrgyzstan", "Laos", "Latvia", "Lebanon",
        "Lesotho", "Liberia", "Libya", "Liechtenstein", "Lithuania",
        "Luxembourg", "Macau", "Macedonia", "Madagascar", "Malawi", "Malaysia",
        "Maldives", "Mali", "Malta", "Marshall", "Martinique", "Mauritania",
        "Mauritius", "Mayotte", "Mexico", "Micronesia", "Moldova", "Monaco",
        "Mongolia", "Montenegro", "Montserrat", "Morocco", "Mozambique",
        "Myanmar", "Namibia", "Nauru", "Nepal", "Netherlands", "New",
        "Nicaragua", "Niger", "Nigeria", "Niue", "Norfolk", "North",
        "Northern", "Norway", "Oman", "Pakistan", "Palau", "Panama", "Papua",
        "Paraguay", "Peru", "Philippines", "Pitcairn", "Poland", "Polynesia",
        "Portugal", "Puerto", "Qatar", "Reunion", "Romania", "Russia",
        "Rwanda", "Saint", "Samoa", "San", "Sao", "Saudi", "Senegal", "Serbia",
        "Seychelles", "Sierra", "Singapore", "Slovakia", "Slovenia", "Solomon",
        "Somalia", "South", "Spain", "Sri", "Sudan", "Suriname", "Svalbard",
        "Swaziland", "Sweden", "Switzerland", "Syria", "Taiwan", "Tajikistan",
        "Tanzania", "Thailand", "Timor", "Togo", "Tokelau", "Tonga",
        "Trinidad", "Tunisia", "Turkey", "Turkmenistan", "Turks", "Tuvalu",
        "Uganda", "Ukraine", "United", "Uruguay", "Uzbekistan", "Vanuatu",
        "Venezuela", "Vietnam", "Virgin", "Wallis", "Yemen", "Zambia",
        "Zimbabwe",
    };

    /// ... Keep Adding characters / search terms in order to increase the crawler's reach

    // Crawl every term group in the same order the original call chain used
    foreach (string term in characterTerms)
    {
        CrawlStore(term);
    }

    foreach (string term in categoryTerms)
    {
        CrawlStore(term);
    }

    foreach (string term in miscTerms)
    {
        CrawlStore(term);
    }

    foreach (string term in countryTerms)
    {
        CrawlStore(term);
    }
}
/// <summary>
/// Entry point of the Apple Store "Character Urls" worker.
/// Dequeues character-listing page urls from one SQS queue, crawls each paginated
/// listing (following "last page" links until exhausted) and enqueues every numeric
/// app url found into a second SQS queue. Runs forever; as many workers as needed
/// can be started in parallel.
/// </summary>
/// <param name="args">
/// Optional single argument: path of a .txt file containing HTTP proxies (one per line).
/// When present, all requests are routed through the loaded proxies.
/// </param>
static void Main(string[] args)
{
    // Creating Needed Instances
    RequestsHandler httpClient = new RequestsHandler();
    AppStoreParser parser = new AppStoreParser();

    // Configuring Log Object
    LogSetup.InitializeLog("Apple_Store_Urls_Worker.log", "info");
    _logger = LogManager.GetCurrentClassLogger();

    // Loading Config
    _logger.Info("Loading Configurations from App.config");
    LoadConfiguration();

    // Control Variable (Bool - Should the process use proxies? )
    bool shouldUseProxies = false;

    // Checking for the need to use proxies
    if (args != null && args.Length == 1)
    {
        // Setting flag to true
        shouldUseProxies = true;

        // Loading proxies from .txt received as argument
        String fPath = args[0];

        // Sanity Check
        if (!File.Exists(fPath))
        {
            _logger.Fatal("Couldnt find proxies on path : " + fPath);
            System.Environment.Exit(-100);
        }

        // Reading Proxies from File
        string[] fLines = File.ReadAllLines(fPath, Encoding.GetEncoding("UTF-8"));

        try
        {
            // Actual Load of Proxies
            ProxiesLoader.Load(fLines.ToList());
        }
        catch (Exception ex)
        {
            _logger.Fatal(ex);
            System.Environment.Exit(-101);
        }
    }

    // AWS Queue Handler
    _logger.Info("Initializing Queues");
    AWSSQSHelper charactersUrlQueue = new AWSSQSHelper(_characterUrlsQueueName, _maxMessagesPerDequeue, _awsKey, _awsKeySecret);
    AWSSQSHelper numericUrlQueue = new AWSSQSHelper(_numericUrlsQueueName, _maxMessagesPerDequeue, _awsKey, _awsKeySecret);

    // Setting Error Flag to No Error ( 0 )
    System.Environment.ExitCode = 0;

    // Initialiazing Control Variables
    int fallbackWaitTime = 1;

    _logger.Info("Started Processing Character Urls");

    do
    {
        try
        {
            // Dequeueing messages from the Queue
            if (!charactersUrlQueue.DeQueueMessages())
            {
                Thread.Sleep(_hiccupTime); // Hiccup
                continue;
            }

            // Checking for no message received, and false positives situations
            if (!charactersUrlQueue.AnyMessageReceived())
            {
                // If no message was found, increases the wait time
                int waitTime;

                if (fallbackWaitTime <= 12)
                {
                    // Exponential increase on the wait time, truncated after 12 retries
                    waitTime = Convert.ToInt32(Math.Pow(2, fallbackWaitTime) * 1000);
                }
                else // Reseting Wait after 12 fallbacks
                {
                    waitTime = 2000;
                    fallbackWaitTime = 0;
                }

                fallbackWaitTime++;

                // Sleeping before next try
                // BUGFIX: waitTime is in milliseconds, not seconds - message corrected
                Console.WriteLine("Fallback (milliseconds) => " + waitTime);
                Thread.Sleep(waitTime);
                continue;
            }

            // Reseting fallback time
            fallbackWaitTime = 1;

            // Iterating over dequeued Messages
            foreach (var characterUrl in charactersUrlQueue.GetDequeuedMessages())
            {
                // Console Feedback
                _logger.Info("Started Parsing Url : " + characterUrl.Body);

                try
                {
                    // Retries Counter
                    int retries = 0;
                    string htmlResponse;

                    // Retrying if necessary
                    do
                    {
                        // Executing Http Request for the Category Url
                        htmlResponse = httpClient.Get(characterUrl.Body, shouldUseProxies);

                        if (String.IsNullOrEmpty(htmlResponse))
                        {
                            _logger.Info("Retrying Request for Character Page");
                            retries++;

                            // Small Hiccup
                            Thread.Sleep(_hiccupTime);
                        }

                    } while (String.IsNullOrWhiteSpace(htmlResponse) && retries <= _maxRetries);

                    // Checking if retries failed
                    if (String.IsNullOrWhiteSpace(htmlResponse))
                    {
                        // Deletes Message and moves on
                        charactersUrlQueue.DeleteMessage(characterUrl);
                        continue;
                    }

                    // Hashset of urls processed (to avoid duplicates)
                    HashSet<String> urlsQueued = new HashSet<String>();

                    // Executing Request and Queueing Urls until there's no other Url to be queued
                    do
                    {
                        // Flag to check whether any url was added after the last iteration (avoids endless loop)
                        bool anyNewUrl = false;

                        // If the request worked, parses the Urls out of the page.
                        // Urls are decoded exactly once here.
                        foreach (string numericUrl in parser.ParseNumericUrls(htmlResponse).Select(t => HttpUtility.HtmlDecode(t)))
                        {
                            // Checking if this url was previously queued
                            if (!urlsQueued.Contains(numericUrl))
                            {
                                // Enqueueing Urls
                                // BUGFIX: removed second HtmlDecode call - the url was already
                                // decoded by the Select above; decoding twice corrupts urls that
                                // legitimately contain encoded entities (e.g. "&amp;amp;")
                                numericUrlQueue.EnqueueMessage(numericUrl);

                                // Adding url to the local hashset
                                urlsQueued.Add(numericUrl);
                                anyNewUrl = true;
                            }
                        }

                        // Checking for the need to perform another http request for the next page
                        if (parser.IsLastPage(htmlResponse) || !anyNewUrl)
                        {
                            break; // Breaks "While" Loop
                        }

                        // Feedback
                        // BUGFIX: the previous two-argument Info overload treated the first string
                        // as a format string with no placeholder, silently dropping the second argument
                        _logger.Info("Urls Queued For This Page : " + urlsQueued.Count);

                        // If it got to this point, it means that there are more pages to be processed
                        // Parsing URL of the "Last" page (the last that's visible)
                        string lastPageUrl = HttpUtility.HtmlDecode(parser.ParseLastPageUrl(htmlResponse));

                        // Executing Http Request for this Url (with retries)
                        retries = 0;
                        do
                        {
                            // HTTP Get for the Page
                            htmlResponse = httpClient.Get(lastPageUrl, shouldUseProxies);

                            if (String.IsNullOrEmpty(htmlResponse))
                            {
                                _logger.Error("Retrying Request for Last Page");
                                retries++;

                                // Small Hiccup
                                Thread.Sleep(_hiccupTime);
                            }

                        } while (String.IsNullOrEmpty(htmlResponse) && retries <= _maxRetries);

                        // BUGFIX: if all retries failed, give up on this listing instead of
                        // looping back and parsing an empty / null response
                        if (String.IsNullOrEmpty(htmlResponse))
                        {
                            _logger.Error("Giving up on Last Page after retries : " + lastPageUrl);
                            break;
                        }

                    } while (true);
                }
                catch (Exception ex)
                {
                    _logger.Error(ex);
                }
                finally
                {
                    // Message is always removed from the queue, even on failure,
                    // so a poisonous url cannot block the worker forever
                    charactersUrlQueue.DeleteMessage(characterUrl);
                }
            }
        }
        catch (Exception ex)
        {
            _logger.Error(ex);
        }

    } while (true);
}