/// <summary>
/// Handles the robot's UriProcessingFinished event: parses the downloaded content as HTML,
/// extracts its text and reports on the console how many times each entry of
/// <c>Keywords</c> occurs in that text (case-insensitive).
/// </summary>
/// <param name="sender">The source of the event.</param>
/// <param name="e">The event data; <c>e.Content</c> is the markup that gets parsed.</param>
private static void bot_UriProcessingFinished(object sender, UriProcessingFinishedEventArgs e)
{
    XmlDocument doc = new XmlDocument();
    doc.PreserveWhitespace = true;
    doc.XmlResolver = null;

    // The reader is only needed while the document is being loaded.
    using (StringReader content = new StringReader(e.Content))
    {
        Sgml.SgmlReader sgmlReader = new Sgml.SgmlReader();
        sgmlReader.DocType = "HTML";
        sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
        sgmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
        sgmlReader.InputStream = content;

        doc.Load(sgmlReader);
    }

    // A document without a root element simply has no text to search.
    string textOnly = doc.DocumentElement == null ? String.Empty : doc.DocumentElement.InnerText;

    try
    {
        foreach (string keyword in Keywords)
        {
            // Escape every word so regex metacharacters in a keyword ("C++", "ASP.NET")
            // are matched literally; only the spaces between words become a flexible
            // whitespace matcher, exactly as before.
            string[] words = keyword.Split(' ');
            for (int i = 0; i < words.Length; i++)
            {
                words[i] = Regex.Escape(words[i]);
            }

            string pattern = "(?'found'" + String.Join("[\\s]*", words) + ")";
            MatchCollection matches = Regex.Matches(textOnly, pattern, RegexOptions.IgnoreCase);

            Console.ForegroundColor = ConsoleColor.Yellow;
            Console.Write("Found ");
            Console.ForegroundColor = ConsoleColor.Cyan;
            Console.Write(keyword);
            Console.ForegroundColor = ConsoleColor.Yellow;
            Console.Write(" in ");
            Console.ForegroundColor = ConsoleColor.Cyan;
            Console.Write(matches.Count);
            Console.ForegroundColor = ConsoleColor.Yellow;
            Console.WriteLine(" different places.");
        }
    }
    finally
    {
        // Do not leave the console tinted for whatever is written next.
        Console.ResetColor();
    }
}
/// <summary>
/// Handles the robot's UriProcessingFinished event for a search-results page: walks the
/// <c>cite</c> nodes of the result list, and for every result whose host contains
/// <c>LookingForDomain</c> records and prints its rank.
/// </summary>
/// <param name="sender">The source of the event.</param>
/// <param name="e">The event data; <c>e.Content</c> is the markup that gets parsed.</param>
private static void bot_UriProcessingFinished(object sender, UriProcessingFinishedEventArgs e)
{
    XmlDocument doc = new XmlDocument();
    doc.PreserveWhitespace = true;
    doc.XmlResolver = null;

    // The reader is only needed while the document is being loaded.
    using (StringReader content = new StringReader(e.Content))
    {
        Sgml.SgmlReader sgmlReader = new Sgml.SgmlReader();
        sgmlReader.DocType = "HTML";
        sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
        sgmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
        sgmlReader.InputStream = content;

        doc.Load(sgmlReader);
    }

    XmlNodeList list = doc.SelectNodes(@"/html/body[@id='gsr']/div[@id='res']/div/ol/li/div/cite");
    int count = 0;

    foreach (XmlNode node in list)
    {
        // Every cite node counts towards the rank, whether or not it matches.
        count++;

        string foundUrl = GoogleUrlResultSize.Replace(node.InnerText, String.Empty).Trim();

        // Only add a scheme when the displayed URL does not already carry one;
        // blindly prefixing would turn "https://host/" into "http://https://host/".
        bool hasScheme = foundUrl.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
            || foundUrl.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
        if (!hasScheme)
        {
            foundUrl = "http://" + foundUrl;
        }

        Uri url;
        if (!Uri.TryCreate(foundUrl, UriKind.Absolute, out url))
        {
            continue;
        }

        // Host names are case-insensitive and not linguistic text, so compare ordinally
        // and ignore case instead of relying on the current culture.
        if (url.Host.IndexOf(LookingForDomain, StringComparison.OrdinalIgnoreCase) < 0)
        {
            continue;
        }

        string result = String.Format("Rank {0} for {1}", count + CurrentlyProcessingStart, url);
        ResultsFound.Add(result);

        Console.ForegroundColor = ConsoleColor.Cyan;
        Console.WriteLine(result);
        Console.ResetColor();
        Console.Beep();
    }
}
/// <summary>
/// Handles the robot's UriProcessingFinished event: prints progress, dumps the body of
/// responses with status 500 or above to disk, extracts the title and the
/// description/keywords/robots meta tags, and stores the scan together with its
/// de-duplicated related URLs through <c>CrawlerDataContext</c>.
/// </summary>
/// <remarks>
/// Failures while storing the scan are recorded in <c>Errors</c> and printed; they are not rethrown.
/// </remarks>
/// <param name="sender">The source of the event; progress counters are printed when it is a <c>Robot</c>.</param>
/// <param name="e">The event data describing the processed URI and its response.</param>
private static void bot_UriProcessingFinished(object sender, UriProcessingFinishedEventArgs e)
{
    Robot robot = sender as Robot;

    if (robot != null)
    {
        Console.WriteLine("Done: {0} Threads: {1} Processing: {2} To Go: {3}", robot.ProcessedCount, robot.ActiveThreadCount, robot.ProcessingCount, robot.NotProcessedCount);
    }

    Console.ForegroundColor = ConsoleColor.Cyan;
    Console.WriteLine(e.Element.RequestedUri.ToString());
    Console.ResetColor();

    UriFoundCount++;

    if (e.Status >= 500)
    {
        // The base URI and hash are used as path segments. A URI's text contains ':' and '/',
        // which are not valid in a Windows file name, so both segments are sanitized first.
        string directory = Path.Combine(@"c:\crawler", MakeSafePathSegment(String.Format("{0}", e.Element.BaseUri)));
        string path = Path.Combine(directory, MakeSafePathSegment(String.Format("{0}.html", e.ContentHash)));

        if (!File.Exists(path))
        {
            // File.CreateText does not create missing directories.
            Directory.CreateDirectory(directory);

            using (StreamWriter writer = File.CreateText(path))
            {
                writer.Write(e.Content);
            }
        }
    }

    string title = null;
    string description = null;
    string keywords = null;
    string robots = null;

    // Regex matching throws on a null input; treat missing content as empty.
    string matchData = e.Content ?? String.Empty;

    Match titleMatch = TitleExpression.Match(matchData);
    if (titleMatch.Success)
    {
        title = titleMatch.Groups["title"].Value.Trim();
    }

    foreach (Match match in MetaExpression.Matches(matchData))
    {
        if (!match.Success)
        {
            continue;
        }

        // Meta names are identifiers, not linguistic text: compare ordinally so the
        // result does not depend on the current culture's casing rules.
        string metaName = match.Groups["name"].Value;
        string metaContent = match.Groups["content"].Value.Trim();

        if (String.Equals(metaName, "description", StringComparison.OrdinalIgnoreCase))
        {
            description = metaContent;
        }
        else if (String.Equals(metaName, "keywords", StringComparison.OrdinalIgnoreCase))
        {
            keywords = metaContent;
        }
        else if (String.Equals(metaName, "robots", StringComparison.OrdinalIgnoreCase))
        {
            robots = metaContent;
        }
    }

    try
    {
        using (CrawlerDataContext dc = new CrawlerDataContext())
        {
            // Same value for the scan and for every relation, so compute it once.
            string urlHash = e.Element.RequestedUri.ToString().ToHashString("SHA1");

            SessionScan scan = new SessionScan
            {
                SessionKey = SessionKey,
                UrlHash = urlHash,
                ContentHash = e.ContentHash,
                ScanDate = DateTime.UtcNow,
                Host = e.Element.RequestedUri.Host,
                Base = e.Element.BaseUri.OriginalString,
                Found = e.Element.FoundUri.OriginalString,
                Url = e.Element.RequestedUri.OriginalString,
                Redirect = e.ResponseHeaders[HttpResponseHeader.Location],
                Method = e.Method,
                Status = e.Status,
                Title = title,
                Description = description,
                Keywords = keywords,
                Robots = ProcessRobots(robots, e).ToString(),
                ContentType = e.ResponseHeaders[HttpResponseHeader.ContentType],
                ContentEncoding = e.ResponseHeaders[HttpResponseHeader.ContentEncoding],
                ContentLength = TryConvertInt64(e.ResponseHeaders[HttpResponseHeader.ContentLength]),
                CacheControl = e.ResponseHeaders[HttpResponseHeader.CacheControl],
                Expires = e.ResponseHeaders[HttpResponseHeader.Expires]
            };

            Dictionary<string, SessionScanRelation> relatedUrls = new Dictionary<string, SessionScanRelation>(e.Related.Length);

            // remove duplicates: one relation per distinct related URL, counting repeats
            foreach (UriElement related in e.Related)
            {
                string relatedUrl = related.RequestedUri.ToString();
                string relatedHash = relatedUrl.ToHashString("SHA1");

                SessionScanRelation relation;
                if (relatedUrls.TryGetValue(relatedHash, out relation))
                {
                    relation.Count++;
                }
                else
                {
                    relatedUrls.Add(relatedHash, new SessionScanRelation
                    {
                        SessionKey = SessionKey,
                        UrlHash = urlHash,
                        RelatedHash = relatedHash,
                        Related = relatedUrl,
                        Count = 1
                    });
                }
            }

            // add all the related urls to the scan
            scan.SessionScanRelations.AddRange(relatedUrls.Values);

            dc.SessionScans.InsertOnSubmit(scan);
            dc.SubmitChanges();
        }
    }
    catch (Exception exc)
    {
        // Remember each distinct message once, but always show it on the console.
        if (!Errors.Contains(exc.Message))
        {
            Errors.Add(exc.Message);
        }

        Console.BackgroundColor = ConsoleColor.Red;
        Console.WriteLine(exc.Message);
        Console.ResetColor();
    }
}

/// <summary>
/// Replaces every character that is not allowed in a file name with an underscore so the
/// value can be used as a single directory or file name.
/// </summary>
/// <param name="value">The raw text; <c>null</c> is treated as empty.</param>
/// <returns>The sanitized text.</returns>
private static string MakeSafePathSegment(string value)
{
    if (String.IsNullOrEmpty(value))
    {
        return String.Empty;
    }

    foreach (char invalid in Path.GetInvalidFileNameChars())
    {
        value = value.Replace(invalid, '_');
    }

    return value;
}
/// <summary>
/// Combines the robots directives found in the page's robots meta tag and in the
/// <c>X-Robots-Tag</c> response header into a single <c>RobotTag</c> flag set.
/// </summary>
/// <param name="metaRobots">The content of the robots meta tag, or <c>null</c> when the page has none.</param>
/// <param name="e">The event data whose response headers are inspected for <c>X-Robots-Tag</c>.</param>
/// <returns>
/// <c>RobotTag.Null</c> when no known directive was found; otherwise the union of the
/// recognized directives with the <c>Null</c> bits cleared.
/// </returns>
private static RobotTag ProcessRobots(string metaRobots, UriProcessingFinishedEventArgs e)
{
    RobotTag robots = RobotTag.Null;

    List<string> robotParts = new List<string>();
    AppendRobotDirectives(robotParts, metaRobots);
    AppendRobotDirectives(robotParts, e.ResponseHeaders["X-Robots-Tag"]);

    if (robotParts.Contains("index"))
        robots |= RobotTag.Index;

    if (robotParts.Contains("noindex"))
        robots |= RobotTag.NoIndex;

    if (robotParts.Contains("follow"))
        robots |= RobotTag.Follow;

    if (robotParts.Contains("nofollow"))
        robots |= RobotTag.NoFollow;

    if (robotParts.Contains("all"))
        robots |= RobotTag.Index | RobotTag.Follow;

    if (robotParts.Contains("none"))
        robots |= RobotTag.NoIndex | RobotTag.NoFollow;

    if (robotParts.Contains("noarchive"))
        robots |= RobotTag.NoArchive;

    if (robotParts.Contains("nosnippet"))
        robots |= RobotTag.NoSnippet;

    // A directive may carry a value after its name ("unavailable_after: <date>"),
    // so match on the prefix rather than on the whole token.
    foreach (string part in robotParts)
    {
        if (part.StartsWith("unavailable_after", StringComparison.Ordinal))
        {
            robots |= RobotTag.UnavailableAfter;
            break;
        }
    }

    // remove null if other robot tags were found
    if (robots != RobotTag.Null)
        robots &= ~RobotTag.Null;

    return robots;
}

/// <summary>
/// Splits a raw robots directive list on ',' and ';', lower-cases it with the invariant
/// culture, trims each entry and appends the entries to <paramref name="target"/>.
/// </summary>
/// <param name="target">The list receiving the normalized directives.</param>
/// <param name="rawDirectives">The raw directive text; <c>null</c> adds nothing.</param>
private static void AppendRobotDirectives(List<string> target, string rawDirectives)
{
    if (rawDirectives == null)
    {
        return;
    }

    // Invariant casing: under some cultures (e.g. Turkish) ToLower() maps 'I' to a
    // dotless 'ı', which would stop "NOINDEX" from matching "noindex".
    string[] parts = rawDirectives.ToLowerInvariant().Split(new char[] { ',', ';' }, StringSplitOptions.RemoveEmptyEntries);

    foreach (string part in parts)
    {
        target.Add(part.Trim());
    }
}
/// <summary>
/// Raises the <see cref="E:Finished"/> event.
/// </summary>
/// <param name="e">The <see cref="UriProcessingFinishedEventArgs"/> instance containing the event data.</param>
protected virtual void OnFinished(UriProcessingFinishedEventArgs e)
{
    // Read the event once: a subscriber removed on another thread between the null
    // check and the call would otherwise cause a NullReferenceException.
    var handler = Finished;

    if (handler != null)
    {
        handler(this, e);
    }
}
/// <summary>
/// Raises the <see cref="E:UriProcessingFinished"/> event.
/// </summary>
/// <param name="e">The <see cref="UriProcessingFinishedEventArgs"/> instance containing the event data.</param>
private void OnUriProcessingFinished(UriProcessingFinishedEventArgs e)
{
    // Read the event once: a subscriber removed on another thread between the null
    // check and the call would otherwise cause a NullReferenceException.
    var handler = UriProcessingFinished;

    if (handler != null)
    {
        handler(this, e);
    }
}
/// <summary>
/// Handles the Finished event of a URI handler: moves the element out of the
/// in-progress collection and, the first time the element is seen as processed,
/// raises the UriProcessingFinished event.
/// </summary>
/// <param name="sender">The source of the event.</param>
/// <param name="e">The <see cref="UriProcessingFinishedEventArgs"/> instance containing the event data.</param>
private void handler_Finished(object sender, UriProcessingFinishedEventArgs e)
{
    lock (_sync)
    {
        // The element is no longer in flight, whatever happens next.
        _processing.Remove(e.Element);

        bool alreadyProcessed = _processed.Contains(e.Element);
        if (alreadyProcessed)
        {
            // Already reported once; do not notify subscribers again.
            return;
        }

        _processed.Add(e.Element);

        // notify of uri finished processing (still under the lock)
        OnUriProcessingFinished(e);
    }
}