Esempio n. 1
0
 /// <summary>
 /// Calls DeleteIfExists on Table, LoadQueue, CrawlQueue and StopQueue, in that order.
 /// </summary>
 public void ClearEverything()
 {
     Table.DeleteIfExists();
     LoadQueue.DeleteIfExists();
     CrawlQueue.DeleteIfExists();
     StopQueue.DeleteIfExists();
 }
Esempio n. 2
0
        /// <summary>
        /// Builds a new CrawlQueue with its Id and GroupId taken from the arguments.
        /// </summary>
        /// <param name="id">Value assigned to Id.</param>
        /// <param name="groupId">Value assigned to GroupId.</param>
        /// <returns>The newly constructed CrawlQueue.</returns>
        public static CrawlQueue CreateCrawlQueue(int id, int groupId)
        {
            return new CrawlQueue
            {
                Id      = id,
                GroupId = groupId,
            };
        }
Esempio n. 3
0
        /// <summary>
        /// Refreshes the crawl queue's attributes, updates the shared Status entity with the
        /// current State, the next CPU and memory counter readings, the given URL and the
        /// queue's approximate message count, then upserts that entity into AdminStatusTable.
        /// </summary>
        /// <param name="url">URL forwarded to Status.UpdateStatus.</param>
        private void UpdateDashboard(string url)
        {
            // Fetch attributes first so ApproximateMessageCount reflects the queue's current state.
            CrawlQueue.FetchAttributes();
            Status.UpdateStatus(State, (int)CPUCount.NextValue(), (int)MemCount.NextValue(), url, CrawlQueue.ApproximateMessageCount.ToString());
            TableOperation insertOperation = TableOperation.InsertOrReplace(Status);

            // The write remains non-blocking, but a faulted task is now observed and traced
            // instead of being dropped silently.
            AdminStatusTable.ExecuteAsync(insertOperation).ContinueWith(
                failed => System.Diagnostics.Trace.TraceError("Dashboard status update failed: {0}", failed.Exception),
                System.Threading.Tasks.TaskContinuationOptions.OnlyOnFaulted);
        }
Esempio n. 4
0
 /// <summary>
 /// Calls DeleteIfExists on SiteDataTable, LoadQueue, CrawlQueue, StopQueue,
 /// AdminStatusTable and ErrorQueue, in that order.
 /// </summary>
 public void ClearEverything()
 {
     SiteDataTable.DeleteIfExists();
     LoadQueue.DeleteIfExists();
     CrawlQueue.DeleteIfExists();
     StopQueue.DeleteIfExists();
     AdminStatusTable.DeleteIfExists();
     ErrorQueue.DeleteIfExists();
 }
Esempio n. 5
0
        /// <summary>
        /// Downloads the XML document at <paramref name="URL"/> and enqueues the links it lists.
        /// "sitemap" elements are used when present, otherwise "url" elements. For each element
        /// the first child's text is the link; when the last child's text differs from the link,
        /// the second child's text is treated as a date and checked with CheckLinkIsRecent.
        /// Links that pass all checks go to LoadQueue when they end with "xml", otherwise to
        /// CrawlQueue.
        /// </summary>
        /// <param name="URL">Address of the XML document to download.</param>
        private void ProcessXML(string URL)
        {
            HttpWebRequest httpRequest = (HttpWebRequest)WebRequest.Create(URL);

            httpRequest.Timeout   = 10000;   // 10 secs
            httpRequest.UserAgent = "Code Sample Web Client";

            XmlDocument xmlDoc = new XmlDocument();

            // Dispose the response and reader as soon as the document is loaded so the
            // underlying connection is released even if parsing throws.
            using (HttpWebResponse webResponse = (HttpWebResponse)httpRequest.GetResponse())
            using (var stream = new StreamReader(webResponse.GetResponseStream()))
            {
                xmlDoc.Load(stream);
            }

            // Get elements
            XmlNodeList elements = xmlDoc.GetElementsByTagName("sitemap");

            if (elements.Count == 0)
            {
                elements = xmlDoc.GetElementsByTagName("url");
            }

            for (int i = 0; i < elements.Count; i++)
            {
                XmlNode entry      = elements[i];
                XmlNode firstChild = entry.ChildNodes[0];

                // The node-list indexer yields null for an element without children;
                // such an entry carries no link, so skip it instead of dereferencing null.
                if (firstChild == null)
                {
                    continue;
                }

                var  link        = firstChild.InnerText;
                bool correctDate = true;
                if (entry.LastChild.InnerText != link)
                {
                    var date = entry.ChildNodes[1].InnerText;
                    correctDate = CheckLinkIsRecent(link, date);
                }
                if (CheckLinkDomain(link) && CheckLinkIsCorrectType(link) && CheckIfAllowed(link) && correctDate)
                {
                    CloudQueueMessage linkMessage = new CloudQueueMessage(link);
                    if (link.EndsWith("xml"))
                    {
                        LoadQueue.AddMessage(linkMessage);
                    }
                    else
                    {
                        CrawlQueue.AddMessage(linkMessage);
                    }
                }
            }
        }
Esempio n. 6
0
 /// <summary>
 /// Forwards <paramref name="crawlQueue"/> to the base class's AddObject under the name "CrawlQueue".
 /// </summary>
 /// <param name="crawlQueue">Object handed to AddObject.</param>
 public void AddToCrawlQueue(CrawlQueue crawlQueue)
 {
     const string targetName = "CrawlQueue";

     base.AddObject(targetName, crawlQueue);
 }
Esempio n. 7
0
 /// <summary>
 /// Creates the controller: passes <paramref name="config"/> to the base constructor,
 /// stores the plugin collection and crawl queue, and instantiates a new HtmlWeb parser.
 /// </summary>
 /// <param name="plugins">Plugin collection kept in _plugins.</param>
 /// <param name="crawlQueue">Crawl queue kept in _crawlQueue.</param>
 /// <param name="config">Configuration service handed to the base constructor.</param>
 public AddController(PluginsCollection plugins, CrawlQueue crawlQueue, IConfigurationService config)
     : base(config)
 {
     _crawlQueue = crawlQueue;
     _plugins    = plugins;
     _htmlParser = new HtmlWeb();
 }
 /// <summary>
 /// Forwards <paramref name="crawlQueue"/> to the base class's AddObject under the name "CrawlQueue".
 /// </summary>
 /// <param name="crawlQueue">Object handed to AddObject.</param>
 public void AddToCrawlQueue(CrawlQueue crawlQueue)
 {
     const string targetName = "CrawlQueue";

     base.AddObject(targetName, crawlQueue);
 }
 /// <summary>
 /// Builds a new CrawlQueue with its Id and GroupId taken from the arguments.
 /// </summary>
 /// <param name="id">Value assigned to Id.</param>
 /// <param name="groupId">Value assigned to GroupId.</param>
 /// <returns>The newly constructed CrawlQueue.</returns>
 public static CrawlQueue CreateCrawlQueue(int id, int groupId)
 {
     return new CrawlQueue
     {
         Id = id,
         GroupId = groupId,
     };
 }
Esempio n. 10
0
 /// <summary>
 /// Creates the controller and stores the supplied crawl queue in _crawlQueue.
 /// </summary>
 /// <param name="crawlQueue">Crawl queue kept for later use by this controller.</param>
 public HomeController(CrawlQueue crawlQueue)
 {
     _crawlQueue = crawlQueue;
 }
Esempio n. 11
0
 /// <summary>
 /// Creates the controller: passes <paramref name="config"/> to the base constructor,
 /// stores the crawl queue and instantiates a new HtmlWeb parser.
 /// </summary>
 /// <param name="plugins">Accepted but not stored by this constructor.</param>
 /// <param name="crawlQueue">Crawl queue kept in _crawlQueue.</param>
 /// <param name="config">Configuration provider handed to the base constructor.</param>
 public SubmitController(PluginsCollection plugins, CrawlQueue crawlQueue, IConfigurationProvider config)
     : base(config)
 {
     _crawlQueue = crawlQueue;
     _htmlParser = new HtmlWeb();
 }
Esempio n. 12
0
 /// <summary>
 /// Creates the controller and stores the plugin collection and crawl queue.
 /// </summary>
 /// <param name="plugins">Plugin collection kept in _plugins.</param>
 /// <param name="crawlQueue">Crawl queue kept in _crawlQueue.</param>
 public SearchController(PluginsCollection plugins, CrawlQueue crawlQueue)
 {
     _crawlQueue = crawlQueue;
     _plugins    = plugins;
 }
Esempio n. 13
0
 /// <summary>
 /// Creates the controller: stores the plugin collection and crawl queue and
 /// instantiates a new HtmlWeb parser.
 /// </summary>
 /// <param name="plugins">Plugin collection kept in _plugins.</param>
 /// <param name="crawlQueue">Crawl queue kept in _crawlQueue.</param>
 public AddController(PluginsCollection plugins, CrawlQueue crawlQueue)
 {
     _crawlQueue = crawlQueue;
     _plugins    = plugins;
     _htmlParser = new HtmlWeb();
 }
Esempio n. 14
0
        /// <summary>
        /// Worker entry point. Initialises storage, the queues and tables, the performance
        /// counters, the status entity and the crawler, then loops forever: while StopQueue
        /// yields no message it takes a message from LoadQueue if one is available, otherwise
        /// from CrawlQueue, hands the URL(s) to Crawler.ProcessURL, deletes the handled
        /// message and refreshes the dashboard.
        /// </summary>
        public override void Run()
        {
            Storage = new AzureStorage();

            LoadQueue        = CloudConfiguration.GetLoadingQueue();
            CrawlQueue       = CloudConfiguration.GetCrawlingQueue();
            StopQueue        = CloudConfiguration.GetStopQueue();
            SiteDataTable    = CloudConfiguration.GetSiteDataTable();
            AdminStatusTable = CloudConfiguration.GetAdminStatusTable();
            StateQueue       = CloudConfiguration.GetStateQueue();

            State = "Idle";

            CPUCount = new PerformanceCounter("Processor", "% Processor Time", "_Total");
            MemCount = new PerformanceCounter("Memory", "Available MBytes");

            Status = new AdminStatus(State, (int)CPUCount.NextValue(), (int)MemCount.NextValue());

            string[] robots = { "http://www.cnn.com/robots.txt", "http://www.bleacherreport.com/robots.txt" };
            Crawler = new WebCrawler(robots, Storage);

            // Pause for 10 seconds before polling starts.
            Thread.Sleep(10000);

            string url = "";

            while (true)
            {
                CloudQueueMessage stopMessage = StopQueue.GetMessage();

                while (stopMessage == null)
                {
                    // Get the next message
                    CloudQueueMessage loadMessage = LoadQueue.GetMessage();

                    // NOTE(review): State is forced to "Loading" on every pass, so the state
                    // check guarding the crawl branch below always succeeds — confirm intended.
                    State = "Loading";
                    if (loadMessage != null)
                    {
                        url = loadMessage.AsString;
                        if (url.Contains("robots.txt"))
                        {
                            // A robots message may carry several whitespace-separated links.
                            string[] robotLinks = url.Split(null);
                            foreach (string link in robotLinks)
                            {
                                Crawler.ProcessURL(link);
                            }
                        }
                        else
                        {
                            Crawler.ProcessURL(url);
                        }

                        // Delete the message in both cases; previously only robots messages
                        // were removed, leaving every other load message on the queue to be
                        // delivered and processed again.
                        LoadQueue.DeleteMessage(loadMessage);
                    }
                    else if (State.Equals("Loading") || State.Equals("Crawling"))
                    {
                        CloudQueueMessage crawlMessage = CrawlQueue.GetMessage();
                        // dequeue crawl message
                        if (crawlMessage != null)
                        {
                            State = "Crawling";
                            url   = crawlMessage.AsString;
                            Crawler.ProcessURL(url);
                            CrawlQueue.DeleteMessage(crawlMessage);
                        }
                    }
                    stopMessage = StopQueue.GetMessage();
                    UpdateDashboard(url);
                }
                State = "Idle";
            }
        }