/// <summary>
/// Deletes the backing table and all three work queues if they exist,
/// wiping every piece of persisted crawler state for this component.
/// NOTE(review): deletes are best-effort ("IfExists") and not transactional —
/// a failure partway leaves a partial wipe.
/// </summary>
public void ClearEverything() { Table.DeleteIfExists(); LoadQueue.DeleteIfExists(); CrawlQueue.DeleteIfExists(); StopQueue.DeleteIfExists(); }
/// <summary>
/// Factory helper that builds a <see cref="CrawlQueue"/> initialized with the given keys.
/// </summary>
/// <param name="id">Initial value of the Id property.</param>
/// <param name="groupId">Initial value of the GroupId property.</param>
/// <returns>The newly constructed <see cref="CrawlQueue"/>.</returns>
public static CrawlQueue CreateCrawlQueue(int id, int groupId)
{
    var crawlQueue = new CrawlQueue
    {
        Id = id,
        GroupId = groupId
    };
    return crawlQueue;
}
/// <summary>
/// Refreshes the admin dashboard row with the worker's current state, CPU and
/// memory counter readings, the URL being processed, and the approximate
/// crawl-queue depth, then upserts it into the admin status table.
/// </summary>
/// <param name="url">The URL currently being processed (shown on the dashboard).</param>
private void UpdateDashboard(string url)
{
    // FetchAttributes must run first: ApproximateMessageCount is only
    // populated by that call.
    CrawlQueue.FetchAttributes();
    Status.UpdateStatus(State, (int)CPUCount.NextValue(), (int)MemCount.NextValue(), url, CrawlQueue.ApproximateMessageCount.ToString());
    TableOperation insertOperation = TableOperation.InsertOrReplace(Status);
    // BUG FIX: the original called ExecuteAsync without awaiting the task, so
    // write failures were silently dropped and updates could race one another.
    // This method is synchronous (like the rest of the worker's storage calls),
    // so use the synchronous Execute instead.
    AdminStatusTable.Execute(insertOperation);
}
/// <summary>
/// Deletes every storage resource this component uses — the site-data and
/// admin-status tables plus the load, crawl, stop, and error queues — if they
/// exist, fully resetting persisted crawler state.
/// NOTE(review): deletes are best-effort ("IfExists") and not transactional —
/// a failure partway leaves a partial wipe.
/// </summary>
public void ClearEverything() { SiteDataTable.DeleteIfExists(); LoadQueue.DeleteIfExists(); CrawlQueue.DeleteIfExists(); StopQueue.DeleteIfExists(); AdminStatusTable.DeleteIfExists(); ErrorQueue.DeleteIfExists(); }
/// <summary>
/// Downloads a sitemap (or sitemap-index) XML document from <paramref name="URL"/>
/// and enqueues each qualifying link: nested sitemaps (links ending in "xml")
/// go back to the load queue, page URLs go to the crawl queue. Links are
/// filtered by domain, type, robots rules, and (when a second element is
/// present) recency.
/// </summary>
/// <param name="URL">Address of the sitemap XML to fetch.</param>
private void ProcessXML(string URL)
{
    HttpWebRequest httpRequest = (HttpWebRequest)WebRequest.Create(URL);
    httpRequest.Timeout = 10000; // 10 secs
    httpRequest.UserAgent = "Code Sample Web Client";

    XmlDocument xmlDoc = new XmlDocument(); // Create an XML document object

    // BUG FIX: the original never disposed the response or the reader,
    // leaking a connection per sitemap fetched. Dispose both as soon as the
    // document has been loaded into memory.
    using (HttpWebResponse webResponse = (HttpWebResponse)httpRequest.GetResponse())
    using (var stream = new StreamReader(webResponse.GetResponseStream()))
    {
        xmlDoc.Load(stream);
    }

    // A sitemap index lists <sitemap> entries; a plain sitemap lists <url> entries.
    XmlNodeList elements = xmlDoc.GetElementsByTagName("sitemap");
    if (elements.Count == 0)
    {
        elements = xmlDoc.GetElementsByTagName("url");
    }

    for (int i = 0; i < elements.Count; i++)
    {
        // NOTE(review): assumes the first child is the <loc> element holding
        // the link, and the second (when distinct) is the date — confirm the
        // feeds always emit elements in that order.
        var link = elements[i].ChildNodes[0].InnerText;
        bool correctDate = true;
        if (elements[i].LastChild.InnerText != link)
        {
            var date = elements[i].ChildNodes[1].InnerText;
            correctDate = CheckLinkIsRecent(link, date);
        }

        if (CheckLinkDomain(link) && CheckLinkIsCorrectType(link) && CheckIfAllowed(link) && correctDate)
        {
            CloudQueueMessage linkMessage = new CloudQueueMessage(link);
            if (link.EndsWith("xml"))
            {
                // Nested sitemap: route it back through the loading queue.
                LoadQueue.AddMessage(linkMessage);
            }
            else
            {
                CrawlQueue.AddMessage(linkMessage);
            }
        }
    }
}
/// <summary>
/// Registers the given <see cref="CrawlQueue"/> entity with the data context
/// under the "CrawlQueue" entity-set name so it is inserted on save.
/// </summary>
/// <param name="crawlQueue">The entity to add.</param>
public void AddToCrawlQueue(CrawlQueue crawlQueue)
{
    // Delegate to the base data-context to begin tracking the new object.
    base.AddObject("CrawlQueue", crawlQueue);
}
/// <summary>
/// Constructs the controller with its plugin collection, shared crawl queue,
/// and configuration service; the configuration is forwarded to the base.
/// </summary>
/// <param name="plugins">Plugin collection used by this controller.</param>
/// <param name="crawlQueue">Shared crawl queue.</param>
/// <param name="config">Configuration service passed to the base controller.</param>
public AddController(PluginsCollection plugins, CrawlQueue crawlQueue, IConfigurationService config)
    : base(config)
{
    _crawlQueue = crawlQueue;
    _plugins = plugins;
    // The HTML parser is constructed locally rather than injected.
    _htmlParser = new HtmlWeb();
}
/// <summary>
/// Registers a <see cref="CrawlQueue"/> entity with the underlying data
/// context under the "CrawlQueue" entity-set name so it is persisted on the
/// next save.
/// </summary>
/// <param name="crawlQueue">The entity to add to the context.</param>
public void AddToCrawlQueue(CrawlQueue crawlQueue)
{
    // Hand the entity to the base context for change tracking.
    base.AddObject("CrawlQueue", crawlQueue);
}
/// <summary>
/// Creates a new <see cref="CrawlQueue"/> object with its key properties set.
/// </summary>
/// <param name="id">Initial value of Id.</param>
/// <param name="groupId">Initial value of GroupId.</param>
/// <returns>A <see cref="CrawlQueue"/> carrying the supplied values.</returns>
public static CrawlQueue CreateCrawlQueue(int id, int groupId)
{
    return new CrawlQueue
    {
        Id = id,
        GroupId = groupId
    };
}
/// <summary>
/// Constructs the controller over the shared crawl queue.
/// </summary>
/// <param name="crawlQueue">Queue of URLs awaiting crawling.</param>
public HomeController(CrawlQueue crawlQueue)
{
    // Only dependency: the crawl queue used by the actions.
    _crawlQueue = crawlQueue;
}
/// <summary>
/// Constructs the controller with its crawl queue and configuration provider;
/// the configuration is forwarded to the base controller.
/// </summary>
/// <param name="plugins">Plugin collection (see note below).</param>
/// <param name="crawlQueue">Shared crawl queue.</param>
/// <param name="config">Configuration provider passed to the base controller.</param>
public SubmitController(PluginsCollection plugins, CrawlQueue crawlQueue, IConfigurationProvider config)
    : base(config)
{
    // NOTE(review): the plugins parameter is accepted but never stored here,
    // unlike the sibling controllers — confirm whether _plugins should be
    // assigned or the parameter removed.
    _crawlQueue = crawlQueue;
    _htmlParser = new HtmlWeb();
}
/// <summary>
/// Constructs the search controller with its plugin collection and the
/// shared crawl queue.
/// </summary>
/// <param name="plugins">Plugin collection used by search actions.</param>
/// <param name="crawlQueue">Shared crawl queue.</param>
public SearchController(PluginsCollection plugins, CrawlQueue crawlQueue)
{
    _crawlQueue = crawlQueue;
    _plugins = plugins;
}
/// <summary>
/// Constructs the controller with its plugin collection and the shared
/// crawl queue.
/// </summary>
/// <param name="plugins">Plugin collection used by this controller.</param>
/// <param name="crawlQueue">Shared crawl queue.</param>
public AddController(PluginsCollection plugins, CrawlQueue crawlQueue)
{
    _crawlQueue = crawlQueue;
    _plugins = plugins;
    // The HTML parser is constructed locally rather than injected.
    _htmlParser = new HtmlWeb();
}
/// <summary>
/// Worker-role entry point. Initializes storage, queues, performance
/// counters, and the crawler, then loops forever: while no stop message is
/// pending, it drains the load queue (robots.txt lists and sitemaps) ahead
/// of the crawl queue (individual pages), updating the admin dashboard after
/// each poll. A stop message flips the worker back to "Idle" until the next
/// outer-loop iteration.
/// </summary>
public override void Run()
{
    Storage = new AzureStorage();
    LoadQueue = CloudConfiguration.GetLoadingQueue();
    CrawlQueue = CloudConfiguration.GetCrawlingQueue();
    StopQueue = CloudConfiguration.GetStopQueue();
    SiteDataTable = CloudConfiguration.GetSiteDataTable();
    AdminStatusTable = CloudConfiguration.GetAdminStatusTable();
    StateQueue = CloudConfiguration.GetStateQueue();
    State = "Idle";
    CPUCount = new PerformanceCounter("Processor", "% Processor Time", "_Total");
    MemCount = new PerformanceCounter("Memory", "Available MBytes");
    Status = new AdminStatus(State, (int)CPUCount.NextValue(), (int)MemCount.NextValue());

    // Seed the crawler with the robots.txt files of the permitted sites.
    string[] robots = { "http://www.cnn.com/robots.txt", "http://www.bleacherreport.com/robots.txt" };
    Crawler = new WebCrawler(robots, Storage);

    // Give the storage infrastructure time to come up before polling.
    Thread.Sleep(10000);

    string url = "";
    while (true)
    {
        CloudQueueMessage stopMessage = StopQueue.GetMessage();
        while (stopMessage == null)
        {
            // Loading work (robots / sitemaps) takes priority over crawling.
            CloudQueueMessage loadMessage = LoadQueue.GetMessage();
            State = "Loading";
            if (loadMessage != null)
            {
                url = loadMessage.AsString;
                if (url.Contains("robots.txt"))
                {
                    // A robots message may carry several whitespace-separated links.
                    string[] robotLinks = url.Split(null);
                    foreach (string link in robotLinks)
                    {
                        Crawler.ProcessURL(link);
                    }
                }
                else
                {
                    Crawler.ProcessURL(url);
                }
                // BUG FIX: the original deleted the message only in the
                // robots.txt branch, so ordinary load messages reappeared
                // after their visibility timeout and were reprocessed
                // indefinitely. Delete the dequeued message in both cases.
                LoadQueue.DeleteMessage(loadMessage);
            }
            else if (State.Equals("Loading") || State.Equals("Crawling"))
            {
                CloudQueueMessage crawlMessage = CrawlQueue.GetMessage(); // dequeue crawl message
                if (crawlMessage != null)
                {
                    State = "Crawling";
                    url = crawlMessage.AsString;
                    Crawler.ProcessURL(url);
                    CrawlQueue.DeleteMessage(crawlMessage);
                }
            }
            // NOTE(review): this loop busy-polls when both queues are empty;
            // a short sleep between empty polls would cut storage-transaction
            // costs. Left unchanged to preserve responsiveness behavior.
            stopMessage = StopQueue.GetMessage();
            UpdateDashboard(url);
        }
        // Stop message received: sit idle until the next outer-loop poll.
        State = "Idle";
    }
}