コード例 #1
0
        /// <summary>
        /// Issues the "CLEAR" command for the configured robots.txt URL and then
        /// calls <c>ClearAll</c> on the storage manager.
        /// </summary>
        /// <returns>
        /// A confirmation message naming the robots.txt URL, or "Error: " followed
        /// by the exception text if any step throws.
        /// </returns>
        public string StopCrawling()
        {
            string outcome;

            try
            {
                // Publish the command first, then wipe storage.
                _storageManager.IssueCmd("CLEAR", _robotsTxtUrl);
                _storageManager.ClearAll();

                outcome = string.Concat("Stopping and clearing crawl of ", _robotsTxtUrl);
            }
            catch (Exception failure)
            {
                // Failures are reported to the caller as text rather than rethrown.
                outcome = string.Concat("Error: ", failure.ToString());
            }

            return outcome;
        }
コード例 #2
0
        /// <summary>
        /// Main worker loop. Initializes the crawler components, waits for the
        /// "START" command, runs <c>Startup</c>, and then repeatedly:
        /// processes queued sitemaps and URLs while the command is "START",
        /// clears storage and waits for a new command on "CLEAR", or idles on
        /// any other command. The loop ends when cancellation is requested.
        /// </summary>
        /// <param name="cancellationToken">Token that signals the worker to stop.</param>
        /// <returns>A task that completes once the loop has exited.</returns>
        private async Task RunAsync(CancellationToken cancellationToken)
        {
            const string StartCmd = "START";
            const string ClearCmd = "CLEAR";

            // Set up queues, tables, data helper, status helper
            InitializeCrawlrComponents();

            // Poll the cmd table until the "START" command appears, but stop
            // polling as soon as shutdown is requested.
            while (!cancellationToken.IsCancellationRequested
                   && _storageManager.GetCurrentCmd() != StartCmd)
            {
                await DelayUnlessCancelledAsync(5000, cancellationToken);
            }

            if (cancellationToken.IsCancellationRequested)
            {
                return;
            }

            // If start cmd given, initialize download of robots.txt
            // and populate the xmlQueue and _disallowed list
            if (_storageManager.GetCurrentCmd() == StartCmd)
            {
                Startup();
            }

            // Recurring work
            while (!cancellationToken.IsCancellationRequested)
            {
                Trace.TraceInformation("Working");

                // Read the command once so every branch below sees the same value.
                string currentCmd = _storageManager.GetCurrentCmd();

                if (currentCmd == StartCmd)
                {
                    // Process all XMLs (sitemaps) found
                    while (_crawlrData.NumXmlsQueued > 0
                           && !cancellationToken.IsCancellationRequested
                           && _storageManager.GetCurrentCmd() == StartCmd)
                    {
                        CloudQueueMessage nextXmlMsg = _storageManager.XmlQueue.GetMessage();
                        if (nextXmlMsg == null)
                        {
                            // The local counter says work remains but the queue
                            // handed nothing back; retry on the next outer pass.
                            break;
                        }

                        string nextXml = nextXmlMsg.AsString;

                        XmlCrawlr.CrawlXml(ref _crawlrData, ref _storageManager, nextXml);

                        _storageManager.XmlQueue.DeleteMessage(nextXmlMsg);
                        _crawlrData.NumXmlsQueued--;

                        // Update worker role status
                        _statusManager.UpdateCrawlrStatus("Loading", _crawlrData, _storageManager);
                        _statusManager.UpdateQueueSize(_storageManager, _crawlrData.NumXmlsQueued, _crawlrData.NumUrlsQueued);

                        await DelayUnlessCancelledAsync(50, cancellationToken);
                    }

                    // Process all URLs in queue
                    while (_crawlrData.NumUrlsQueued > 0
                           && !cancellationToken.IsCancellationRequested
                           && _storageManager.GetCurrentCmd() == StartCmd)
                    {
                        CloudQueueMessage nextUrlMsg = _storageManager.UrlQueue.GetMessage();
                        if (nextUrlMsg == null)
                        {
                            // Same guard as for the XML queue above.
                            break;
                        }

                        string nextUrl = nextUrlMsg.AsString;

                        UrlCrawlr.CrawlUrl(ref _crawlrData, ref _storageManager, nextUrl);

                        _storageManager.UrlQueue.DeleteMessage(nextUrlMsg);
                        _crawlrData.NumUrlsQueued--;

                        // Update worker role status
                        _statusManager.UpdateCrawlrStatus("Crawling", _crawlrData, _storageManager);
                        _statusManager.UpdateQueueSize(_storageManager, _crawlrData.NumXmlsQueued, _crawlrData.NumUrlsQueued);

                        await DelayUnlessCancelledAsync(50, cancellationToken);
                    }

                    // Pause between passes so an empty (or temporarily
                    // unavailable) queue does not turn this loop into a busy
                    // spin against storage.
                    await DelayUnlessCancelledAsync(1000, cancellationToken);
                }
                else if (currentCmd == ClearCmd)
                {
                    // If the "CLEAR" command is found, clear all queues and tables.
                    _storageManager.ClearAll();
                    _statusManager.UpdateCrawlrStatus("CLEAR", _crawlrData, _storageManager);

                    // Give Azure time to delete tables.
                    await DelayUnlessCancelledAsync(20000, cancellationToken);

                    try
                    {
                        // Idle while waiting for next command.
                        while (!cancellationToken.IsCancellationRequested
                               && _storageManager.GetCurrentCmd() == ClearCmd)
                        {
                            await DelayUnlessCancelledAsync(10000, cancellationToken);
                        }
                    }
                    finally
                    {
                        // Reinitialize worker role, unless we are shutting down,
                        // in which case restarting the crawl would be wasted work.
                        if (!cancellationToken.IsCancellationRequested)
                        {
                            InitializeCrawlrComponents();
                            Startup();
                        }
                    }
                }
                else
                {
                    // Idle worker role (for unimplemented 'pause' functionality).
                    _statusManager.UpdateCrawlrStatus("Idle", _crawlrData, _storageManager);

                    await DelayUnlessCancelledAsync(5000, cancellationToken);
                }
            }
        }

        /// <summary>
        /// Waits asynchronously for the given time, returning early (without
        /// throwing) if cancellation is requested. Callers re-check the token.
        /// </summary>
        /// <param name="milliseconds">How long to wait, in milliseconds.</param>
        /// <param name="cancellationToken">Token that cuts the wait short.</param>
        private static async Task DelayUnlessCancelledAsync(int milliseconds, CancellationToken cancellationToken)
        {
            try
            {
                await Task.Delay(milliseconds, cancellationToken);
            }
            catch (OperationCanceledException)
            {
                // Cancellation is reported through the token itself; swallowing
                // it here lets RunAsync finish normally instead of as a
                // cancelled/faulted task.
            }
        }