/// <summary>
/// Handles the robot's UriProcessingFinished event: logs crawl progress to the
/// console, archives the response body of server errors (status >= 500) under
/// c:\crawler, extracts title/description/keywords/robots metadata from the
/// content, and persists one SessionScan row (with de-duplicated related URLs)
/// per processed URI.
/// </summary>
/// <param name="sender">The <see cref="Robot"/> that raised the event.</param>
/// <param name="e">Result of processing one URI: content, status, headers, related links.</param>
private static void bot_UriProcessingFinished(object sender, UriProcessingFinishedEventArgs e)
{
    Robot robot = sender as Robot;
    if (robot != null)
        Console.WriteLine("Done: {0} Threads: {1} Processing: {2} To Go: {3}",
            robot.ProcessedCount, robot.ActiveThreadCount, robot.ProcessingCount, robot.NotProcessedCount);

    Console.ForegroundColor = ConsoleColor.Cyan;
    Console.WriteLine(e.Element.RequestedUri.ToString());
    Console.ResetColor();

    // NOTE(review): this handler appears to run on multiple robot threads
    // (ActiveThreadCount above) — this increment and the Errors list below are
    // not synchronized; confirm against Robot and consider Interlocked/a lock.
    UriFoundCount++;

    // Archive the raw body of server-error responses for later inspection.
    if (e.Status >= 500)
    {
        // BUG FIX: the original formatted the BaseUri directly into the path
        // (@"c:\crawler\{0}\{1}.html"); a Uri's string form contains ':' and '/',
        // which are invalid in a Windows path, and the directory was never
        // created — sanitize the segment and ensure the directory exists.
        string directory = String.Format(@"c:\crawler\{0}", SanitizePathSegment(e.Element.BaseUri.ToString()));
        string path = Path.Combine(directory, e.ContentHash + ".html");
        if (!File.Exists(path))
        {
            Directory.CreateDirectory(directory);
            using (StreamWriter writer = File.CreateText(path))
            {
                writer.Write(e.Content);
            }
        }
    }

    string title = null;
    string description = null;
    string keywords = null;
    string robots = null;

    string matchData = e.Content;

    Match titleMatch = TitleExpression.Match(matchData);
    if (titleMatch.Success)
        title = titleMatch.Groups["title"].Value.Trim();

    // Pull description/keywords/robots out of the <meta> tags.
    MatchCollection metaMatches = MetaExpression.Matches(matchData);
    foreach (Match match in metaMatches)
    {
        if (!match.Success)
            continue;

        // BUG FIX: String.Compare(a, b, true) is culture-sensitive (e.g. the
        // Turkish-I problem); meta tag names are ASCII identifiers, so compare
        // ordinally and case-insensitively.
        string metaName = match.Groups["name"].Value;
        if (String.Equals(metaName, "description", StringComparison.OrdinalIgnoreCase))
            description = match.Groups["content"].Value.Trim();
        else if (String.Equals(metaName, "keywords", StringComparison.OrdinalIgnoreCase))
            keywords = match.Groups["content"].Value.Trim();
        else if (String.Equals(metaName, "robots", StringComparison.OrdinalIgnoreCase))
            robots = match.Groups["content"].Value.Trim();
    }

    try
    {
        using (CrawlerDataContext dc = new CrawlerDataContext())
        {
            // Hoisted: the original recomputed the SHA1 of the same requested
            // URI once per related link inside the loop below.
            string urlHash = e.Element.RequestedUri.ToString().ToHashString("SHA1");

            SessionScan scan = new SessionScan
            {
                SessionKey = SessionKey,
                UrlHash = urlHash,
                ContentHash = e.ContentHash,
                ScanDate = DateTime.UtcNow,
                Host = e.Element.RequestedUri.Host,
                Base = e.Element.BaseUri.OriginalString,
                Found = e.Element.FoundUri.OriginalString,
                Url = e.Element.RequestedUri.OriginalString,
                Redirect = e.ResponseHeaders[HttpResponseHeader.Location],
                Method = e.Method,
                Status = e.Status,
                Title = title,
                Description = description,
                Keywords = keywords,
                Robots = ProcessRobots(robots, e).ToString(),
                ContentType = e.ResponseHeaders[HttpResponseHeader.ContentType],
                ContentEncoding = e.ResponseHeaders[HttpResponseHeader.ContentEncoding],
                ContentLength = TryConvertInt64(e.ResponseHeaders[HttpResponseHeader.ContentLength]),
                CacheControl = e.ResponseHeaders[HttpResponseHeader.CacheControl],
                Expires = e.ResponseHeaders[HttpResponseHeader.Expires]
            };

            // De-duplicate related URLs by hash, counting repeat occurrences.
            Dictionary<string, SessionScanRelation> relatedUrls =
                new Dictionary<string, SessionScanRelation>(e.Related.Length);
            foreach (UriElement related in e.Related)
            {
                string relatedHash = related.RequestedUri.ToString().ToHashString("SHA1");
                SessionScanRelation existing;
                if (relatedUrls.TryGetValue(relatedHash, out existing))
                {
                    existing.Count++;
                }
                else
                {
                    relatedUrls.Add(relatedHash, new SessionScanRelation
                    {
                        SessionKey = SessionKey,
                        UrlHash = urlHash,
                        RelatedHash = relatedHash,
                        Related = related.RequestedUri.ToString(),
                        Count = 1
                    });
                }
            }

            // Attach the related URLs to the scan and persist in one submit.
            scan.SessionScanRelations.AddRange(relatedUrls.Values);
            dc.SessionScans.InsertOnSubmit(scan);
            dc.SubmitChanges();
        }
    }
    catch (Exception exc)
    {
        // Record each distinct error message once for the end-of-run summary,
        // and echo it immediately so a failing crawl is visible as it runs.
        if (!Errors.Contains(exc.Message))
            Errors.Add(exc.Message);
        Console.BackgroundColor = ConsoleColor.Red;
        Console.WriteLine(exc.Message);
        Console.ResetColor();
    }
}

/// <summary>
/// Replaces every character that is invalid in a file-system name with '_',
/// so an arbitrary URI string can be used as a directory segment.
/// </summary>
/// <param name="segment">Raw text destined for a path segment.</param>
/// <returns>The sanitized segment.</returns>
private static string SanitizePathSegment(string segment)
{
    char[] invalid = Path.GetInvalidFileNameChars();
    StringBuilder sb = new StringBuilder(segment.Length);
    foreach (char c in segment)
        sb.Append(Array.IndexOf(invalid, c) >= 0 ? '_' : c);
    return sb.ToString();
}
/// <summary>
/// Entry point: prompts for a domain and a thread count, records the session
/// in the database, runs the crawler, then prints any accumulated errors.
/// </summary>
/// <param name="args">Unused command-line arguments.</param>
private static void Main(string[] args)
{
    SessionKey = Guid.NewGuid();
    Errors = new List<string>();
    UriFoundCount = 0;

    // set number of connections allowed to recommend limit
    // <see href="http://support.microsoft.com/kb/821268" />
    ServicePointManager.DefaultConnectionLimit = 12 * Environment.ProcessorCount;

    Console.ForegroundColor = ConsoleColor.Red;
    Console.Write("Enter Domain: ");
    Console.ForegroundColor = ConsoleColor.Green;
    // BUG FIX: ReadLine returns null when stdin is redirected and closed; the
    // original would throw NullReferenceException on StartsWith below.
    string url = (Console.ReadLine() ?? String.Empty).Trim();

    // BUG FIX: the original tested only "http://" with a culture-sensitive
    // StartsWith, so entering "https://example.com" produced
    // "http://https://example.com". Accept either scheme, ordinally.
    if (!url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) &&
        !url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
        url = "http://" + url;

    Console.ForegroundColor = ConsoleColor.Red;
    Console.Write("Enter Number of Processors (" + 12 * Environment.ProcessorCount + " recommended): ");
    Console.ForegroundColor = ConsoleColor.Green;
    string cprocessors = Console.ReadLine();
    int processors;
    // Fall back to a single thread on non-numeric OR non-positive input
    // (the original accepted 0/negative values and handed them to the robot).
    if (!Int32.TryParse(cprocessors, out processors) || processors < 1)
        processors = 1;

    Console.ForegroundColor = ConsoleColor.Cyan;
    Console.WriteLine("Starting scan of {0} with {1} thread{2}", url, processors,
        processors == 1 ? String.Empty : "s");
    Console.ResetColor();

    // Record the session row before the crawl starts.
    using (var dc = new CrawlerDataContext())
    {
        Session session = new Session
        {
            SessionKey = SessionKey,
            ScanDate = DateTime.UtcNow,
            Url = url
        };
        dc.Sessions.InsertOnSubmit(session);
        dc.SubmitChanges();
    }

    Robot bot = new Robot(new Uri(url));
    bot.MaxProcessorsAllowed = processors;
    bot.UriProcessingFinished += bot_UriProcessingFinished;
    bot.UriFound += bot_UriFound;
    // NOTE(review): the code below assumes Scan() blocks until the crawl
    // finishes — confirm against Robot.
    bot.Scan();

    Console.ForegroundColor = ConsoleColor.Green;
    Console.WriteLine("Processing Done");

    if (Errors.Count > 0)
    {
        Console.ForegroundColor = ConsoleColor.Red;
        Console.WriteLine("Errors");
        Console.ResetColor();
        // Alternate red/default rows so adjacent messages are easy to tell apart.
        for (int i = 0; i < Errors.Count; i++)
        {
            if (i % 2 == 0)
                Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine(Errors[i]);
            Console.ResetColor();
        }
    }

    // Keep the console window open until a key is pressed.
    Console.Read();
}