Exemple #1
0
        /// <summary>
        /// Main loop of the crawler worker. Waits for the "START" command, then repeatedly
        /// processes the in-memory sitemap (XML) queue and the URL storage queue until
        /// cancellation is requested. The "CLEAR" command empties both storage queues and
        /// re-initializes the worker; any other command puts the worker in an idle state.
        /// </summary>
        /// <param name="cancellationToken">
        /// Token polled between units of work; once it is signalled the loops stop and the task completes.
        /// </param>
        /// <returns>A task that completes after cancellation has been observed.</returns>
        private async Task RunAsync(CancellationToken cancellationToken)
        {
            const string StartCmd = "START";
            const string ClearCmd = "CLEAR";

            _storageManager = new CrawlrStorageManager(ConfigurationManager.AppSettings["StorageConnectionString"]);

            // Poll the cmd table until "START" is the current command.
            // The token is checked too so the role can shut down while it is still waiting.
            while (!cancellationToken.IsCancellationRequested
                && _storageManager.GetCurrentCmd() != StartCmd)
            {
                await Task.Delay(5000);
            }

            // Leaving the loop without cancellation means "START" was observed, so the command
            // is not queried again here (a second query could race with a command change and
            // leave the components below uninitialized for the main loop).
            if (!cancellationToken.IsCancellationRequested && _crawlrData == null)
            {
                // Set up queues, tables, data helper, status helper
                InitializeCrawlrComponents();
                Startup();
            }

            // Recurring work
            while (!cancellationToken.IsCancellationRequested)
            {
                Trace.TraceInformation("Working");

                // Read the command once per pass so every branch below sees the same value.
                string currentCmd = _storageManager.GetCurrentCmd();

                if (currentCmd == StartCmd)
                {
                    bool processedAny = false;

                    // Process all XMLs (sitemaps) found
                    string nextXml = "";
                    try
                    {
                        while (!cancellationToken.IsCancellationRequested
                            && _crawlrData.NumXmlsQueued > 0
                            && _storageManager.GetCurrentCmd() == StartCmd)
                        {
                            nextXml = _crawlrData.XmlQueue.Dequeue();
                            _crawlrData.NumXmlsQueued--;

                            XmlCrawlr.CrawlXml(ref _crawlrData, ref _storageManager, nextXml);
                            processedAny = true;

                            // Update worker role status
                            _statusManager.UpdateCrawlrStatus("Loading", _crawlrData, _storageManager);
                            _statusManager.UpdateQueueSize(_storageManager, _crawlrData.NumXmlsQueued, _crawlrData.NumUrlsQueued);

                            await Task.Delay(50);
                        }
                    }
                    catch (Exception ex)
                    {
                        RecordCrawlError(nextXml, ex);
                    }

                    // Process all URLs in queue
                    string nextUrl = "";
                    try
                    {
                        while (!cancellationToken.IsCancellationRequested
                            && _storageManager.GetCurrentCmd() == StartCmd)
                        {
                            CloudQueueMessage nextUrlMsg = _storageManager.UrlQueue.GetMessage();
                            if (nextUrlMsg == null)
                            {
                                // No message is currently available. Stop draining instead of
                                // dereferencing null and recording a bogus error for the previous URL.
                                break;
                            }

                            nextUrl = nextUrlMsg.AsString;

                            UrlCrawlr.CrawlUrl(ref _crawlrData, ref _storageManager, nextUrl);
                            processedAny = true;

                            _storageManager.UrlQueue.DeleteMessage(nextUrlMsg);
                            _crawlrData.NumUrlsQueued--;

                            // Update worker role status
                            _statusManager.UpdateCrawlrStatus("Crawling", _crawlrData, _storageManager);
                            _statusManager.UpdateQueueSize(_storageManager, _crawlrData.NumXmlsQueued, _crawlrData.NumUrlsQueued);

                            await Task.Delay(50);
                        }
                    }
                    catch (Exception ex)
                    {
                        RecordCrawlError(nextUrl, ex);
                    }

                    if (!processedAny)
                    {
                        // Nothing was crawled on this pass; back off briefly rather than
                        // re-polling the command table and queue in a tight loop.
                        await Task.Delay(1000);
                    }
                }
                else if (currentCmd == ClearCmd)
                {
                    // If the "CLEAR" command is found, update status.
                    _statusManager.UpdateCrawlrStatus("CLEAR", _crawlrData, _storageManager);
                    _statusManager.UpdateQueueSize(_storageManager, 0, 0);
                    _storageManager.UrlQueue.Clear();
                    _storageManager.XmlQueue.Clear();
                    // Give Azure time to delete tables.
                    await Task.Delay(20000);

                    try
                    {
                        // Idle while waiting for next command.
                        while (!cancellationToken.IsCancellationRequested
                            && _storageManager.GetCurrentCmd() == ClearCmd)
                        {
                            await Task.Delay(10000);
                        }
                    }
                    finally
                    {
                        // Reinitialize worker role, unless the wait ended because the role is stopping.
                        if (!cancellationToken.IsCancellationRequested)
                        {
                            InitializeCrawlrComponents();
                            Startup();
                        }
                    }
                }
                else
                {
                    // Idle worker role (for unimplemented 'pause' functionality).
                    _statusManager.UpdateCrawlrStatus("Idle", _crawlrData, _storageManager);

                    await Task.Delay(5000);
                }
            }

            // Short pause after the work loop ends, kept from the original implementation.
            await Task.Delay(1000);
        }

        /// <summary>
        /// Best-effort write of a crawl failure to the error table. If the write itself
        /// fails, both exceptions are traced instead of being thrown or silently dropped.
        /// </summary>
        /// <param name="source">The sitemap or URL being processed when the failure occurred (may be empty).</param>
        /// <param name="error">The exception raised while crawling.</param>
        private void RecordCrawlError(string source, Exception error)
        {
            try
            {
                ErrorEntity    errorEntity = new ErrorEntity(source, error.ToString());
                TableOperation insertError = TableOperation.InsertOrReplace(errorEntity);
                _storageManager.ErrorTable.Execute(insertError);
            }
            catch (Exception recordingError)
            {
                Trace.TraceError(
                    "Failed to record crawl error for '{0}'. Crawl error: {1}. Recording error: {2}",
                    source,
                    error,
                    recordingError);
            }
        }
Exemple #2
0
        /// <summary>
        /// Main loop of the crawler worker. Initializes the crawler components, waits for the
        /// "START" command, then repeatedly processes the sitemap (XML) queue and the URL queue
        /// until cancellation is requested. The "CLEAR" command clears storage and
        /// re-initializes the worker; any other command puts the worker in an idle state.
        /// </summary>
        /// <param name="cancellationToken">
        /// Token polled between units of work; once it is signalled the loops stop and the task completes.
        /// </param>
        /// <returns>A task that completes after cancellation has been observed.</returns>
        private async Task RunAsync(CancellationToken cancellationToken)
        {
            const string StartCmd = "START";
            const string ClearCmd = "CLEAR";

            // Set up queues, tables, data helper, status helper
            InitializeCrawlrComponents();

            // Poll the cmd table until "START" is the current command.
            // The token is checked too so the role can shut down while it is still waiting.
            while (!cancellationToken.IsCancellationRequested
                && _storageManager.GetCurrentCmd() != StartCmd)
            {
                await Task.Delay(5000);
            }

            // Leaving the loop without cancellation means "START" was observed, so run the
            // startup step (robots.txt download, xmlQueue and _disallowed population).
            if (!cancellationToken.IsCancellationRequested)
            {
                Startup();
            }

            // Recurring work
            while (!cancellationToken.IsCancellationRequested)
            {
                Trace.TraceInformation("Working");

                // Read the command once per pass so every branch below sees the same value.
                string currentCmd = _storageManager.GetCurrentCmd();

                if (currentCmd == StartCmd)
                {
                    bool processedAny = false;

                    // Process all XMLs (sitemaps) found
                    while (!cancellationToken.IsCancellationRequested
                        && _crawlrData.NumXmlsQueued > 0
                        && _storageManager.GetCurrentCmd() == StartCmd)
                    {
                        CloudQueueMessage nextXmlMsg = _storageManager.XmlQueue.GetMessage();
                        if (nextXmlMsg == null)
                        {
                            // The counter says work remains but no message is currently available;
                            // stop this drain instead of dereferencing null.
                            break;
                        }

                        string nextXml = nextXmlMsg.AsString;

                        XmlCrawlr.CrawlXml(ref _crawlrData, ref _storageManager, nextXml);
                        processedAny = true;

                        _storageManager.XmlQueue.DeleteMessage(nextXmlMsg);
                        _crawlrData.NumXmlsQueued--;

                        // Update worker role status
                        _statusManager.UpdateCrawlrStatus("Loading", _crawlrData, _storageManager);
                        _statusManager.UpdateQueueSize(_storageManager, _crawlrData.NumXmlsQueued, _crawlrData.NumUrlsQueued);

                        await Task.Delay(50);
                    }

                    // Process all URLs in queue
                    while (!cancellationToken.IsCancellationRequested
                        && _crawlrData.NumUrlsQueued > 0
                        && _storageManager.GetCurrentCmd() == StartCmd)
                    {
                        CloudQueueMessage nextUrlMsg = _storageManager.UrlQueue.GetMessage();
                        if (nextUrlMsg == null)
                        {
                            // Same guard as for the XML queue: nothing available right now.
                            break;
                        }

                        string nextUrl = nextUrlMsg.AsString;

                        UrlCrawlr.CrawlUrl(ref _crawlrData, ref _storageManager, nextUrl);
                        processedAny = true;

                        _storageManager.UrlQueue.DeleteMessage(nextUrlMsg);
                        _crawlrData.NumUrlsQueued--;

                        // Update worker role status
                        _statusManager.UpdateCrawlrStatus("Crawling", _crawlrData, _storageManager);
                        _statusManager.UpdateQueueSize(_storageManager, _crawlrData.NumXmlsQueued, _crawlrData.NumUrlsQueued);

                        await Task.Delay(50);
                    }

                    if (!processedAny)
                    {
                        // Nothing was crawled on this pass; back off briefly rather than
                        // re-polling the command table and queues in a tight loop.
                        await Task.Delay(1000);
                    }
                }
                else if (currentCmd == ClearCmd)
                {
                    // If the "CLEAR" command is found, clear all queues and tables.
                    _storageManager.ClearAll();
                    _statusManager.UpdateCrawlrStatus("CLEAR", _crawlrData, _storageManager);
                    // Give Azure time to delete tables.
                    await Task.Delay(20000);

                    try
                    {
                        // Idle while waiting for next command.
                        while (!cancellationToken.IsCancellationRequested
                            && _storageManager.GetCurrentCmd() == ClearCmd)
                        {
                            await Task.Delay(10000);
                        }
                    }
                    finally
                    {
                        // Reinitialize worker role, unless the wait ended because the role is stopping.
                        if (!cancellationToken.IsCancellationRequested)
                        {
                            InitializeCrawlrComponents();
                            Startup();
                        }
                    }
                }
                else
                {
                    // Idle worker role (for unimplemented 'pause' functionality).
                    _statusManager.UpdateCrawlrStatus("Idle", _crawlrData, _storageManager);

                    await Task.Delay(5000);
                }
            }

            // Short pause after the work loop ends, kept from the original implementation.
            await Task.Delay(1000);
        }