static void Main(string[] args)
        {
            // Creating Needed Instances
            RequestsHandler httpClient = new RequestsHandler ();
            AppStoreParser  parser     = new AppStoreParser ();

            // Loading Configuration
            LogSetup.InitializeLog ("Apple_Store_Categories_Worker.log", "info");
            _logger = LogManager.GetCurrentClassLogger ();

            // Loading Config
            _logger.Info ("Loading Configurations from App.config");
            LoadConfiguration ();

            // Control Variable (Bool - Should the process use proxies? )
            bool shouldUseProxies = false;

            // Checking for the need to use proxies
            if (args != null && args.Length == 1)
            {
                // Setting flag to true
                shouldUseProxies = true;

                // Loading proxies from .txt received as argument
                String fPath = args[0];

                // Sanity Check
                if (!File.Exists (fPath))
                {
                    _logger.Fatal ("Couldnt find proxies on path : " + fPath);
                    System.Environment.Exit (-100);
                }

                // Reading Proxies from File
                string[] fLines = File.ReadAllLines (fPath, Encoding.GetEncoding ("UTF-8"));

                try
                {
                    // Actual Load of Proxies
                    ProxiesLoader.Load (fLines.ToList ());
                }
                catch (Exception ex)
                {
                    _logger.Fatal (ex);
                    System.Environment.Exit (-101);
                }
            }

            // AWS Queue Handler
            _logger.Info ("Initializing Queues");
            AWSSQSHelper categoriesUrlQueue = new AWSSQSHelper (_categoriesQueueName   , _maxMessagesPerDequeue, _awsKey, _awsKeySecret);
            AWSSQSHelper charactersUrlQueue = new AWSSQSHelper (_characterUrlsQueueName, _maxMessagesPerDequeue, _awsKey, _awsKeySecret);

            // Setting Error Flag to No Error ( 0 )
            System.Environment.ExitCode = 0;

            // Initialiazing Control Variables
            int fallbackWaitTime = 1;

            _logger.Info ("Started Processing Category Urls");

            do
            {
                try
                {
                    // Dequeueing messages from the Queue
                    if (!categoriesUrlQueue.DeQueueMessages())
                    {
                        Thread.Sleep (_hiccupTime); // Hiccup                   
                        continue;
                    }

                    // Checking for no message received, and false positives situations
                    if (!categoriesUrlQueue.AnyMessageReceived())
                    {
                        // If no message was found, increases the wait time
                        int waitTime;
                        if (fallbackWaitTime <= 12)
                        {
                            // Exponential increase on the wait time, truncated after 12 retries
                            waitTime = Convert.ToInt32 (Math.Pow (2, fallbackWaitTime) * 1000);
                        }
                        else // Reseting Wait after 12 fallbacks
                        {
                            waitTime         = 2000;
                            fallbackWaitTime = 0;
                        }

                        fallbackWaitTime++;

                        // Sleeping before next try
                        _logger.Info ("Fallback (seconds) => " + waitTime);
                        Thread.Sleep (waitTime);
                        continue;
                    }

                    // Reseting fallback time
                    fallbackWaitTime = 1;

                    // Iterating over dequeued Messages
                    foreach (var categoryUrl in categoriesUrlQueue.GetDequeuedMessages())
                    {
                        // Console Feedback
                        _logger.Info ("Started Parsing Category : " + categoryUrl.Body);

                        try
                        {
                            // Retries Counter
                            int retries = 0;
                            string htmlResponse;

                            // Retrying if necessary
                            do
                            {
                                // Executing Http Request for the Category Url
                                htmlResponse = httpClient.Get (categoryUrl.Body, shouldUseProxies);

                                if (String.IsNullOrEmpty (htmlResponse))
                                {
                                    _logger.Error ("Retrying Request for Category Page");
                                    retries++;
                                }

                            } while (String.IsNullOrWhiteSpace (htmlResponse) && retries <= _maxRetries);

                            // Checking if retries failed
                            if (String.IsNullOrWhiteSpace (htmlResponse))
                            {
                                // Deletes Message and moves on
                                categoriesUrlQueue.DeleteMessage (categoryUrl);
                                continue;
                            }

                            // If the request worked, parses the urls out of the page
                            foreach (string characterUrls in parser.ParseCharacterUrls (htmlResponse))
                            {
                                // Enqueueing Urls
                                charactersUrlQueue.EnqueueMessage (HttpUtility.HtmlDecode (characterUrls));
                            }
                        }
                        catch (Exception ex)
                        {
                            _logger.Error (ex);
                        }
                        finally
                        {
                            // Deleting the message
                            categoriesUrlQueue.DeleteMessage(categoryUrl);
                        }
                    }
                }
                catch (Exception ex)
                {
                    _logger.Error (ex);
                }

            } while (true);
        }
        static void Main(string[] args)
        {
            // Creating Needed Instances
            RequestsHandler httpClient = new RequestsHandler ();
            AppStoreParser  parser     = new AppStoreParser ();

            // Loading Configuration
            LogSetup.InitializeLog ("Apple_Store_Urls_Worker.log", "info");
            _logger = LogManager.GetCurrentClassLogger ();

            // Loading Config
            _logger.Info ("Loading Configurations from App.config");
            LoadConfiguration ();

            // Control Variable (Bool - Should the process use proxies? )
            bool shouldUseProxies = false;

            // Checking for the need to use proxies
            if (args != null && args.Length == 1)
            {
                // Setting flag to true
                shouldUseProxies = true;

                // Loading proxies from .txt received as argument
                String fPath = args[0];

                // Sanity Check
                if (!File.Exists (fPath))
                {
                    _logger.Fatal ("Couldnt find proxies on path : " + fPath);
                    System.Environment.Exit (-100);
                }

                // Reading Proxies from File
                string[] fLines = File.ReadAllLines (fPath, Encoding.GetEncoding ("UTF-8"));

                try
                {
                    // Actual Load of Proxies
                    ProxiesLoader.Load (fLines.ToList ());
                }
                catch (Exception ex)
                {
                    _logger.Fatal (ex);
                    System.Environment.Exit (-101);
                }
            }

            // AWS Queue Handler
            _logger.Info ("Initializing Queues");
            AzureSQSHelper charactersUrlQueue = new AzureSQSHelper (_characterUrlsQueueName, _maxMessagesPerDequeue, _azureQueueconn);
            AzureSQSHelper numericUrlQueue    = new AzureSQSHelper(_numericUrlsQueueName  , _maxMessagesPerDequeue, _azureQueueconn);

            // Setting Error Flag to No Error ( 0 )
            System.Environment.ExitCode = 0;

            // Initialiazing Control Variables
            int fallbackWaitTime = 1;

            _logger.Info ("Started Processing Character Urls");

            do
            {
                try
                {
                    // Dequeueing messages from the Queue
                    if (!charactersUrlQueue.DeQueueMessages())
                    {
                        Thread.Sleep (_hiccupTime); // Hiccup
                        continue;
                    }

                    // Checking for no message received, and false positives situations
                    if (!charactersUrlQueue.AnyMessageReceived())
                    {
                        // If no message was found, increases the wait time
                        int waitTime;
                        if (fallbackWaitTime <= 12)
                        {
                            // Exponential increase on the wait time, truncated after 12 retries
                            waitTime = Convert.ToInt32 (Math.Pow (2, fallbackWaitTime) * 1000);
                        }
                        else // Reseting Wait after 12 fallbacks
                        {
                            waitTime         = 2000;
                            fallbackWaitTime = 0;
                        }

                        fallbackWaitTime++;

                        // Sleeping before next try
                        Console.WriteLine ("Fallback (seconds) => " + waitTime);
                        Thread.Sleep (waitTime);
                        continue;
                    }

                    // Reseting fallback time
                    fallbackWaitTime = 1;

                    // Iterating over dequeued Messages
                    foreach (var characterUrl in charactersUrlQueue.GetDequeuedMessages ())
                    {
                        // Console Feedback
                        _logger.Info ("Started Parsing Url : " + characterUrl.AsString);

                        try
                        {
                            // Retries Counter
                            int retries = 0;
                            string htmlResponse;

                            // Retrying if necessary
                            do
                            {
                                // Executing Http Request for the Category Url
                                htmlResponse = httpClient.Get (characterUrl.AsString, shouldUseProxies);

                                if (String.IsNullOrEmpty (htmlResponse))
                                {
                                    _logger.Info ("Retrying Request for Character Page");
                                    retries++;

                                    // Small Hiccup
                                    Thread.Sleep (_hiccupTime);
                                }

                            } while (String.IsNullOrWhiteSpace (htmlResponse) && retries <= _maxRetries);

                            // Checking if retries failed
                            if (String.IsNullOrWhiteSpace (htmlResponse))
                            {
                                // Deletes Message and moves on
                                charactersUrlQueue.DeleteMessage (characterUrl);
                                continue;
                            }

                            // Hashset of urls processed (to avoid duplicates)
                            HashSet<String> urlsQueued = new HashSet<String> ();

                            // Executing Request and Queueing Urls until there's no other Url to be queued
                            do
                            {
                                // Flag to check whether any url was added after the last iteration (avoids endless loop)
                                bool anyNewUrl = false;

                                // If the request worked, parses the Urls out of the page
                                foreach (string numericUrls in parser.ParseNumericUrls (htmlResponse).Select (t => HttpUtility.HtmlDecode (t)))
                                {
                                    // Checking if this url was previously queued
                                    if (!urlsQueued.Contains (numericUrls))
                                    {
                                        // Enqueueing Urls
                                        numericUrlQueue.EnqueueMessage (HttpUtility.HtmlDecode (numericUrls));

                                        // Adding url to the local hashset
                                        urlsQueued.Add (numericUrls);
                                        anyNewUrl = true;
                                    }
                                }

                                // Checking for the need to perform another http request for the next page
                                if (parser.IsLastPage (htmlResponse) || !anyNewUrl)
                                {
                                    break; // Breaks "While" Loop
                                }

                                // Feedback
                                _logger.Info ("Urls Queued For This Page : " + urlsQueued.Count, "\n\tProcessing Feedback");

                                // If it got to this point, it means that there are more pages to be processed
                                // Parsing URL of the "Last" page (the last that's visible)
                                string lastPageUrl = HttpUtility.HtmlDecode (parser.ParseLastPageUrl (htmlResponse));

                                // Executing Http Request for this Url (with retries)
                                retries = 0;
                                do
                                {
                                    // HTTP Get for the Page
                                    htmlResponse = httpClient.Get (lastPageUrl, shouldUseProxies);

                                    if (String.IsNullOrEmpty (htmlResponse))
                                    {
                                        _logger.Error ("Retrying Request for Last Page");
                                        retries++;

                                        // Small Hiccup
                                        Thread.Sleep (_hiccupTime);
                                    }

                                } while (String.IsNullOrEmpty (htmlResponse) && retries <= _maxRetries);

                            } while (true);
                        }
                        catch (Exception ex)
                        {
                            _logger.Error (ex);
                        }
                        finally
                        {
                            charactersUrlQueue.DeleteMessage (characterUrl);
                        }
                    }
                }
                catch (Exception ex)
                {
                    _logger.Error (ex);
                }

            } while (true);
        }
        static void Main(string[] args)
        {
            // Creating Needed Instances
            RequestsHandler httpClient = new RequestsHandler ();
            AppStoreParser  parser     = new AppStoreParser ();

            // Loading Configuration
            LogSetup.InitializeLog ("Apple_Store_Urls_Worker.log", "info");
            _logger = LogManager.GetCurrentClassLogger ();

            // Loading Config
            _logger.Info ("Loading Configurations from App.config");
            LoadConfiguration ();

            // Control Variable (Bool - Should the process use proxies? )
            bool shouldUseProxies = false;

            // Checking for the need to use proxies
            if (args != null && args.Length == 1)
            {
                // Setting flag to true
                shouldUseProxies = true;

                // Loading proxies from .txt received as argument
                String fPath = args[0];

                // Sanity Check
                if (!File.Exists (fPath))
                {
                    _logger.Fatal ("Couldnt find proxies on path : " + fPath);
                    System.Environment.Exit (-100);
                }

                // Reading Proxies from File
                string[] fLines = File.ReadAllLines (fPath, Encoding.GetEncoding ("UTF-8"));

                try
                {
                    // Actual Load of Proxies
                    ProxiesLoader.Load (fLines.ToList ());
                }
                catch (Exception ex)
                {
                    _logger.Fatal (ex);
                    System.Environment.Exit (-101);
                }
            }

            // AWS Queue Handler
            _logger.Info ("Initializing Queues");
            AWSSQSHelper appsUrlQueue  = new AWSSQSHelper (_appUrlsQueueName , _maxMessagesPerDequeue, _awsKey, _awsKeySecret);
            AWSSQSHelper appsDataQueue = new AWSSQSHelper (_appsDataQueueName, _maxMessagesPerDequeue, _awsKey, _awsKeySecret);

            // Setting Error Flag to No Error ( 0 )
            System.Environment.ExitCode = 0;

            // Initialiazing Control Variables
            int fallbackWaitTime = 1;

            _logger.Info ("Started Processing Individual Apps Urls");

            do
            {
                try
                {
                    // Dequeueing messages from the Queue
                    if (!appsUrlQueue.DeQueueMessages ())
                    {
                        Thread.Sleep (_hiccupTime); // Hiccup
                        continue;
                    }

                    // Checking for no message received, and false positives situations
                    if (!appsUrlQueue.AnyMessageReceived ())
                    {
                        // If no message was found, increases the wait time
                        int waitTime;
                        if (fallbackWaitTime <= 12)
                        {
                            // Exponential increase on the wait time, truncated after 12 retries
                            waitTime = Convert.ToInt32 (Math.Pow (2, fallbackWaitTime) * 1000);
                        }
                        else // Reseting Wait after 12 fallbacks
                        {
                            waitTime         = 2000;
                            fallbackWaitTime = 0;
                        }

                        fallbackWaitTime++;

                        // Sleeping before next try
                        Console.WriteLine ("Fallback (seconds) => " + waitTime);
                        Thread.Sleep (waitTime);
                        continue;
                    }

                    // Reseting fallback time
                    fallbackWaitTime = 1;

                    // Iterating over dequeued Messages
                    foreach (var appUrl in appsUrlQueue.GetDequeuedMessages ())
                    {
                        bool processingWorked = true;

                        try
                        {
                            // Retries Counter
                            int retries = 0;
                            string htmlResponse;

                            // Retrying if necessary
                            do
                            {
                                // Executing Http Request for the Category Url
                                //appUrl.Body = "https://itunes.apple.com/us/app/action-run-3d/id632371832?mt=8";
                                //appUrl.Body = "https://itunes.apple.com/us/app/emoji-2-free-new-emoticons/id521863802?mt=8";
                                //appUrl.Body = "https://itunes.apple.com/us/app/candy-crush-saga/id553834731?mt=8";
                                //appUrl.Body = "https://itunes.apple.com/us/app/dba-den-bla-avis/id448605988?mt=8";
                                htmlResponse = httpClient.Get (appUrl.Body, shouldUseProxies);

                                if (String.IsNullOrEmpty (htmlResponse))
                                {
                                    // Extending Fallback time
                                    retries++;
                                    int sleepTime = retries * _hiccupTime <= 30000 ? retries * _hiccupTime : 30000;

                                    _logger.Info ("Retrying Request for App Page [ " + sleepTime / 1000 + " ]");

                                    Thread.Sleep (sleepTime);
                                }

                            } while (String.IsNullOrWhiteSpace (htmlResponse) && retries <= _maxRetries);

                            // Checking if retries failed
                            if (String.IsNullOrWhiteSpace (htmlResponse))
                            {
                                continue;
                            }

                            // Feedback
                            _logger.Info ("Current page " + appUrl.Body, "Parsing App Data");

                            // Parsing Data out of the Html Page
                            AppleStoreAppModel parsedApp = parser.ParseAppPage (htmlResponse);
                            parsedApp.url                = appUrl.Body;

                            // Enqueueing App Data
                            appsDataQueue.EnqueueMessage (parsedApp.ToJson ());

                            // Little Hiccup
                            Thread.Sleep (_hiccupTime);

                        }
                        catch (Exception ex)
                        {
                            _logger.Error (ex);

                            // Setting Flag to "False"
                            processingWorked = false;
                        }
                        finally
                        {
                             //Deleting the message - Only if the processing worked
                            if (processingWorked)
                            {
                                appsUrlQueue.DeleteMessage (appUrl);
                            }
                        }
                    }
                }
                catch (Exception ex)
                {
                    _logger.Error (ex);
                }

            } while (true);
        }
Example #4
0
        static void Main(string[] args)
        {
            // Creating Needed Instances
            RequestsHandler httpClient = new RequestsHandler ();
            AppStoreParser  parser     = new AppStoreParser ();

            // Setting Up Log
            LogSetup.InitializeLog ("Apple_Store_Crawler.log", "info");
            _logger = LogManager.GetCurrentClassLogger ();

            // Starting Flow
            _logger.Info ("Worker Started");

            // Loading Configuration
            _logger.Info ("Reading Configuration");
            LoadConfiguration ();

            // Control Variable (Bool - Should the process use proxies? )
            bool shouldUseProxies = false;

            // Checking for the need to use proxies
            if (args != null && args.Length == 1)
            {
                // Setting flag to true
                shouldUseProxies = true;

                // Loading proxies from .txt received as argument
                String fPath = args[0];

                // Sanity Check
                if (!File.Exists (fPath))
                {
                    _logger.Fatal ("Couldnt find proxies on path : " + fPath);
                    System.Environment.Exit (-100);
                }

                // Reading Proxies from File
                string[] fLines = File.ReadAllLines (fPath, Encoding.GetEncoding ("UTF-8"));

                try
                {
                    // Actual Load of Proxies
                    ProxiesLoader.Load (fLines.ToList ());
                }
                catch (Exception ex)
                {
                    _logger.Fatal (ex);
                    System.Environment.Exit (-101);
                }
            }

            // AWS Queue Handler
            _logger.Info ("Initializing Queues");
            AzureSQSHelper sqsWrapper = new AzureSQSHelper (_categoriesQueueName, 10, _azureQueueconn);

            // Step 1 - Trying to obtain the root page html (source of all the apps)
            var rootPageResponse = httpClient.GetRootPage (shouldUseProxies);

            // Sanity Check
            if (String.IsNullOrWhiteSpace (rootPageResponse))
            {
                _logger.Info ("Error obtaining Root Page HTMl - Aborting", "Timeout Error");
                return;
            }

            // Step 2 - Extracting Category Urls from the Root Page and queueing their Urls
            foreach (var categoryUrl in parser.ParseCategoryUrls (rootPageResponse))
            {
                // Logging Feedback
                _logger.Info ("Queueing Category : " + categoryUrl);

                // Queueing Category Urls
                sqsWrapper.EnqueueMessage (categoryUrl);
            }

            _logger.Info ("End of Bootstrapping phase");

            //Console.ReadLine();
        }