public CrawlerProcessing(int maxConcurrentDownload, int sleepTime, Uri uri, Web.RobotsTxt robots)
{
    CrawlList = new CrawlList(robots);
    client = new HttpClient();
    this.maxConcurrentDownload = maxConcurrentDownload;
    ServicePointManager.DefaultConnectionLimit = maxConcurrentDownload;
    SleepTime = sleepTime;
    BaseUri = uri;
    Robots = robots;
}
public static bo.Web.RobotsTxt ParseRobotsTxt(string robotsTxt)
{
    bo.Web.RobotsTxt robots = new Web.RobotsTxt();

    // Split the file on "User-agent:" and keep only the section that applies
    // to all crawlers ("*"). Sections containing comments ("#") are skipped.
    string[] userAgentSections = Regex.Split(robotsTxt, "User-agent:");
    string wildcardSection = "";
    foreach (string section in userAgentSections)
    {
        if (!section.Contains("#")) // only sections with no comments
        {
            if (section.Trim().StartsWith("*"))
            {
                wildcardSection = section.Trim().Substring(1);
            }
        }
    }

    // Each "Disallow:" entry in the wildcard section becomes a disallowed path.
    string[] disallow = Regex.Split(wildcardSection, "Disallow:");
    foreach (string item in disallow)
    {
        string path = item.Trim();
        if (!string.IsNullOrEmpty(path))
        {
            robots.DisallowedList.Add(path);
        }
    }

    return robots;
}
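// A minimal usage sketch (not part of the crawler itself): fetch robots.txt for a
// site, parse it with ParseRobotsTxt above, and construct CrawlerProcessing.
// CreateForSiteAsync is a hypothetical helper, and the concurrency/sleep values
// are arbitrary example numbers; it assumes System.Threading.Tasks is in scope.
public static async Task<CrawlerProcessing> CreateForSiteAsync(Uri baseUri)
{
    using (HttpClient http = new HttpClient())
    {
        // Download the site's robots.txt as plain text.
        string robotsTxt = await http.GetStringAsync(new Uri(baseUri, "/robots.txt"));

        // Build the disallow list for the "*" user agent.
        bo.Web.RobotsTxt robots = ParseRobotsTxt(robotsTxt);

        // Example values: 5 concurrent downloads, 1000 ms between requests.
        return new CrawlerProcessing(5, 1000, baseUri, robots);
    }
}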