Пример #1
0
        /// <summary>
        /// Runs the crawl loop: prepares queues, table and counters, waits until the admin
        /// command is no longer "stopCrawl", reads the two robots.txt files, and then loops
        /// forever reacting to the current admin command.
        /// </summary>
        /// <remarks>
        /// Each iteration of the main loop sleeps 50 ms and then:
        /// on "newCrawl", re-creates queues/table/counters, parses every URL in
        /// <c>baseSitemap</c> and switches the stored command to "resumeCrawl";
        /// on "resumeCrawl", processes at most one message from the XML queue and at most one
        /// from the HTML queue; finally calls <c>updatePerformanceCounter()</c>.
        /// The method never returns.
        /// </remarks>
        public override void Run()
        {
            createQueues();
            createTable();
            initializeCounters();

            // Poll once per second until the admin command changes away from "stopCrawl".
            while (checkAdminStatus() == "stopCrawl")
            {
                System.Threading.Thread.Sleep(1000);
            }

            visitedUrls  = new HashSet<string>();
            disallowList = new List<string>();
            baseSitemap  = new List<string>();
            // NOTE(review): readRobots presumably fills disallowList/baseSitemap - confirm.
            readRobots("http://www.cnn.com/robots.txt");
            readRobots("http://bleacherreport.com/robots.txt");

            while (true)
            {
                System.Threading.Thread.Sleep(50);

                if (checkAdminStatus() == "newCrawl")
                {
                    createQueues();
                    createTable();
                    initializeCounters();
                    foreach (string baseSitemapUrl in baseSitemap)
                    {
                        parseXML(baseSitemapUrl);
                    }

                    // Seeding is done: flip the stored admin command to "resumeCrawl".
                    TableOperation getAdminNode       = TableOperation.Retrieve<adminNode>("admin", "command");
                    TableResult    retrievedAdminNode = stattable.Execute(getAdminNode);
                    adminNode      newAdminNode       = (adminNode)retrievedAdminNode.Result;
                    if (newAdminNode != null)
                    {
                        newAdminNode.currentCommand = "resumeCrawl";
                        TableOperation updateAdminNode = TableOperation.Replace(newAdminNode);
                        stattable.Execute(updateAdminNode);
                    }
                }

                // Deliberately a separate "if" (not "else if"): the status is re-read so that a
                // crawl just switched to "resumeCrawl" above starts working in the same iteration.
                if (checkAdminStatus() == "resumeCrawl")
                {
                    CloudQueueMessage currentXmlUrl = xmlqueue.GetMessage();
                    if (currentXmlUrl != null)
                    {
                        parseXML(currentXmlUrl.AsString);
                        xmlqueue.DeleteMessage(currentXmlUrl);
                    }

                    try
                    {
                        CloudQueueMessage currentHtmlUrl = htmlqueue.GetMessage();
                        if (currentHtmlUrl != null)
                        {
                            crawlUrl(currentHtmlUrl.AsString);
                            htmlqueue.DeleteMessage(currentHtmlUrl);
                        }
                    }
                    catch (Exception ex)
                    {
                        // Best effort: one failing page must not stop the crawl loop, but the
                        // failure is recorded instead of being silently discarded.
                        // NOTE(review): the message is not deleted on failure, so it presumably
                        // becomes visible again after its visibility timeout - confirm.
                        System.Diagnostics.Trace.TraceError("Failed to process HTML queue message: {0}", ex);
                    }
                }

                updatePerformanceCounter();
            }
        }
Пример #2
0
        /// <summary>
        /// Writes the initial statistics entities to the stats table: the row counter
        /// (keeping an already stored count), the admin command, and the performance node.
        /// </summary>
        public void initializeCounters()
        {
            // Row counter: carry over the stored count when the entity exists, otherwise start at 0.
            TableResult existingCount = stattable.Execute(
                TableOperation.Retrieve<rowCount>("rowCount", "totalRows"));
            int startingRows = existingCount.Result != null
                ? ((rowCount)existingCount.Result).count
                : 0;
            stattable.Execute(TableOperation.InsertOrReplace(new rowCount(startingRows)));

            // Admin command: the crawler starts out stopped.
            stattable.Execute(TableOperation.InsertOrReplace(new adminNode("stopCrawl")));

            // Performance counters: start from placeholder values.
            stattable.Execute(TableOperation.InsertOrReplace(new performanceNode("None", "None")));
        }
Пример #3
0
        /// <summary>
        /// Sets the stored admin command to "resumeCrawl" when the admin entity
        /// ("admin", "command") exists in the stats table.
        /// </summary>
        /// <returns>Always the message "Resuming crawl...", whether or not an entity was updated.</returns>
        public string resumeCrawl()
        {
            const string reply = "Resuming crawl...";

            createTable();

            TableResult lookup = stattable.Execute(TableOperation.Retrieve<adminNode>("admin", "command"));
            adminNode command = (adminNode)lookup.Result;

            if (command == null)
            {
                // No admin entity stored: nothing to update.
                return reply;
            }

            command.currentCommand = "resumeCrawl";
            stattable.Execute(TableOperation.Replace(command));
            return reply;
        }
Пример #4
0
        /// <summary>
        /// Writes the initial statistics entities to the stats table: a URL row counter
        /// starting at zero, the admin command, and the performance node.
        /// </summary>
        public void initializeCounters()
        {
            // URL row counter: always restarts at 0.
            stattable.Execute(TableOperation.InsertOrReplace(new rowCount(0)));

            // Admin command: the crawler starts out stopped.
            stattable.Execute(TableOperation.InsertOrReplace(new adminNode("stopCrawl")));

            // Performance counters: start from placeholder values.
            stattable.Execute(TableOperation.InsertOrReplace(new performanceNode("None", "None")));
        }