public bool AddURLToFrontQueue(PrettyURL url)
{
    // Check whether the URL has already been seen.
    if (AllURLS.Contains(url.GetPrettyURL.GetHashCode()))
    {
        return false;
    }

    // Place the URL in a randomly chosen front queue and remember its hash.
    int rand = new Random().Next(0, FrontQueues.Count);
    FrontQueues[rand].Enqueue(url);
    AllURLS.Add(url.GetPrettyURL.GetHashCode());

    return true;
}
/// <summary>
/// Parse robots.txt for a specific PrettyURL's domain.
/// Assumes robots.txt is properly formatted.
/// </summary>
/// <param name="url"></param>
/// <param name="robotsContent"></param>
/// <returns>A regex that matches URLs disallowed for this domain.</returns>
private Regex CalcRobotRegexForDomain(PrettyURL url, IEnumerable<string> robotsContent)
{
    // Return the cached regex if it is still fresh.
    if (CachedRegexes.ContainsKey(url.GetDomain) && DateTime.Now - CachedRegexes[url.GetDomain].Item1 < MaxAge)
    {
        return CachedRegexes[url.GetDomain].Item2;
    }

    regexCalcWatch.Restart();

    // No robots.txt content means no restrictions: return a regex that never matches.
    if (robotsContent == null || !robotsContent.Any())
    {
        return new Regex(@"$.");
    }

    // Find what is disallowed.
    var disallow = robotsContent
        .SkipWhile(s => !Regex.IsMatch(s, @"User-Agent: \*", RegexOptions.IgnoreCase)) // Start from "User-Agent: *".
        .TakeWhile(s => !string.IsNullOrWhiteSpace(s))  // Read until a blank line (where allow/disallow hopefully ends).
        .Skip(1)                                        // Skip the user agent line itself.
        .Where(s => s.StartsWith("Disallow"))           // We only need the disallowed entries.
        .Select(s => s.Split(':').Last().Trim());       // Select the disallowed paths.

    // Nothing disallowed: return a regex that never matches.
    if (!disallow.Any())
    {
        return new Regex(@"$.");
    }

    // Build the regex: the domain URL followed by an alternation of the disallowed paths.
    StringBuilder regPattern = new StringBuilder(url + "(" + disallow.First());
    foreach (var s in disallow.Skip(1))
    {
        regPattern.Append('|');
        regPattern.Append(s);
    }
    regPattern.Append(')');

    // Escape literal characters first, then expand the robots.txt wildcard.
    // Doing it in the opposite order would escape the dot introduced by "*" -> ".*".
    regPattern.Replace(".", "\\.").Replace("+", "\\+").Replace("*", ".*");

    CachedRegexes[url.GetDomain] = new Tuple<DateTime, Regex>(DateTime.Now, new Regex(regPattern.ToString()));

    regexCalcWatch.Stop();
    regexCalcTimes[url.GetDomain] = (int)regexCalcWatch.Elapsed.TotalMilliseconds;

    return CachedRegexes[url.GetDomain].Item2;
}
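As a rough illustration of what this produces (the robots.txt content and domain below are made up, and it is assumed that the PrettyURL renders as the plain domain string when concatenated):

// Hypothetical input, not taken from the crawler:
//
//   User-Agent: *
//   Disallow: /private/
//   Disallow: /tmp/*.html
//
// For a url of www.example.dk the builder produces a pattern along the lines of
//
//   www\.example\.dk(/private/|/tmp/.*\.html)
//
// so any URL under /private/ or matching /tmp/*.html on that domain is treated as disallowed.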
public bool IsVisitAllowed(PrettyURL url)
{
    // Refresh the cached regex if we have never seen this domain, or if the entry is too old.
    bool needsRefresh = !CachedRegexes.ContainsKey(url.GetDomain)
                        || DateTime.Now - CachedRegexes[url.GetDomain].Item1 > MaxAge;

    if (needsRefresh)
    {
        var robotContent = DownloadRobotContent(url).Split(new string[] { "\r\n", "\n" }, StringSplitOptions.None);
        var regex = CalcRobotRegexForDomain(url, robotContent);
        CachedRegexes[url.GetDomain] = new Tuple<DateTime, Regex>(DateTime.Now, regex);
    }

    // The URL may be visited if it does not match the domain's disallow regex.
    return !CachedRegexes[url.GetDomain].Item2.IsMatch(url.GetPrettyURL);
}
public void RemoveRobot(PrettyURL url)
{
    CachedRegexes.Remove(url.GetDomain);
}
private string DownloadRobotContent(PrettyURL url)
{
    string content = "";
    robotDownloadWatch.Restart();

    try
    {
        string robot = "http://" + url.GetDomain + "/robots.txt";
        using (var client = new System.Net.WebClient())
        {
            content = client.DownloadString(robot);
        }
    }
    catch (Exception)
    {
        // Could not download robots.txt; treat it as empty (no restrictions).
    }

    robotDownloadWatch.Stop();
    robotDownloadTimes[url.GetDomain] = (int)robotDownloadWatch.Elapsed.TotalMilliseconds;

    return content;
}
public IEnumerable<PrettyURL> ExtractLinksFromHTML(PrettyURL url, string html)
{
    if (html == null)
    {
        return Enumerable.Empty<PrettyURL>();
    }

    // Crude link extraction: split on anchor tags and keep everything up to the closing quote.
    var hrefs = html.Split(new string[] { "<a href=\"" }, StringSplitOptions.RemoveEmptyEntries)
                    .Skip(1)
                    .Where(s => !s.StartsWith("feed") && !s.StartsWith("javascript"));

    var urls = new List<PrettyURL>();

    foreach (var href in hrefs)
    {
        var link = href.Split('\"').First();

        // Resolve relative links against the page URL.
        string fullPath = (link.StartsWith("/") ? url.GetPrettyURL : "") + link;

        if (PrettyURL.IsValidURL(fullPath))
        {
            var pretty = new PrettyURL(fullPath);

            // Only crawl .dk domains.
            if (pretty.GetDomain.EndsWith(".dk"))
            {
                urls.Add(pretty);
            }
        }
    }

    return urls;
}
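A small made-up example of the extraction, assuming PrettyURL accepts the strings as written:

// var page = new PrettyURL("www.example.dk");
// var html = "<p><a href=\"/om\">Om</a> <a href=\"http://other.dk/x\">x</a>"
//          + " <a href=\"http://foreign.com/y\">y</a></p>";
// var links = ExtractLinksFromHTML(page, html);
//
// The relative link resolves to www.example.dk/om, other.dk/x is kept,
// and the .com link is dropped because only .dk domains pass the filter.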
public string DownloadHTML(PrettyURL url)
{
    // Respect robots.txt before downloading anything.
    if (!Robot_IAm.IsVisitAllowed(url))
    {
        return null;
    }

    try
    {
        using (var web = new WebClient())
        {
            return web.DownloadString(url.GetPrettyURL);
        }
    }
    catch (Exception)
    {
        // Tough luck: the page could not be downloaded.
        return null;
    }
}
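To show how the pieces above fit together, here is a minimal crawl-step sketch. It assumes the methods are reachable from one crawler object and uses a hypothetical DequeueNextURL helper for pulling work out of the front queues; neither assumption comes from the code above.

public void CrawlStep()
{
    // Hypothetical helper: pick the next URL from the front queues.
    PrettyURL next = DequeueNextURL();

    // Returns null when robots.txt disallows the visit or the download fails.
    string html = DownloadHTML(next);
    if (html == null)
    {
        return;
    }

    // Feed newly discovered links back into the frontier; duplicates are
    // filtered by the hash set inside AddURLToFrontQueue.
    foreach (var link in ExtractLinksFromHTML(next, html))
    {
        AddURLToFrontQueue(link);
    }
}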