/// <summary>
/// Builds a <see cref="RobotsCache"/> for "http://www.yahoo.com" and verifies that
/// its <c>RobotsExclusion</c> collection holds at least one entry afterwards.
/// </summary>
public void TestRobotsCache()
{
    CollectorManager manager = new CollectorManager();
    RobotsCache robots = new RobotsCache(manager, "http://www.yahoo.com");

    // The cache is considered loaded when at least one exclusion entry exists.
    bool hasExclusions = robots.RobotsExclusion.Count > 0;
    Assert.IsTrue(hasExclusions, "Robots not loaded");
}
/// <summary>
/// Initializes the robots cache for a single site.
/// Obtains a client from <c>cm.PRCM</c>, normalizes the site address through
/// <see cref="Uri.AbsoluteUri"/>, and sets the two hash ids used for storage.
/// When <c>TryRetrieveFromCache()</c> returns false, <c>RetrieveRobots()</c> is called
/// and both the serialized exclusions and the current local time are written
/// to their hashes under the normalized site address.
/// </summary>
/// <param name="cm">Manager whose <c>PRCM</c> member supplies the client stored in <c>Redis</c>.</param>
/// <param name="domainSchemeAndServer">
/// Absolute URI string of the site; passed to the <see cref="Uri"/> constructor.
/// </param>
public RobotsCache(CollectorManager cm, string domainSchemeAndServer)
{
    Redis = cm.PRCM.GetClient();
    DomainSchemeAndServer = new Uri(domainSchemeAndServer).AbsoluteUri;
    RobotID = "urn:domain:robots:data";
    RobotLastDateCrawlID = "urn:domain:robots:last-date-crawl";

    if (TryRetrieveFromCache())
    {
        // Cached data was found; nothing to fetch or store.
        return;
    }

    RetrieveRobots();

    Redis.SetEntryInHash(
        RobotID,
        DomainSchemeAndServer,
        RobotsExclusion.JsonSerialize());

    // NOTE(review): DateTime.Now.ToString() is local-time and culture-dependent;
    // confirm the code that reads this value back parses it under the same culture.
    string crawlTimestamp = DateTime.Now.ToString();
    Redis.SetEntryInHash(
        RobotLastDateCrawlID,
        DomainSchemeAndServer,
        crawlTimestamp);
}
/// <summary>
/// Validates <paramref name="link"/> against <paramref name="backlink"/> via
/// <c>LinkParser.Validate</c> and, when the result is usable, constructs a
/// <c>Collector</c> for it with the remaining arguments passed through unchanged.
/// </summary>
/// <returns>
/// The newly constructed <c>Collector</c>, or <c>null</c> when validation yields
/// a null or empty link.
/// </returns>
public static Collector CrawlAndSave(
    CollectorManager cm,
    string link,
    string backlink,
    string text,
    string rel,
    string kind,
    bool crawlChildLinks,
    bool poolChildLinksFound)
{
    string validatedLink = LinkParser.Validate(link, backlink);

    if (string.IsNullOrEmpty(validatedLink))
    {
        // Do nothing, unparsable url
        return null;
    }

    return new Collector(
        cm,
        validatedLink,
        backlink,
        text,
        rel,
        kind,
        crawlChildLinks,
        poolChildLinksFound);
}
/// <summary>
/// Stores the manager and a client obtained from <c>cm.PRCM</c>, records the link and
/// the validated backlink, and builds the <c>LinkInfo</c> for this link.
/// Unless <c>LinkInfo.LinkExcludedInRobots</c> is set, the link is saved via
/// <c>SaveLink()</c> and, when <paramref name="crawlChildLinks"/> is true,
/// <c>CrawlChildLinks()</c> is invoked.
/// </summary>
/// <remarks>
/// NOTE(review): <paramref name="poolChildLinksFound"/> is not read anywhere in this
/// constructor body — confirm whether that is intentional.
/// </remarks>
Collector(
    CollectorManager cm,
    string link,
    string backlink,
    string text,
    string rel,
    string kind,
    bool crawlChildLinks,
    bool poolChildLinksFound)
{
    CM = cm;
    Redis = cm.PRCM.GetClient();
    Link = link;
    CurrentBacklink = LinkParser.Validate(backlink, string.Empty);
    this.LinkInfo = new LinkInfo(cm, Link, CurrentBacklink, text, rel, kind);

    if (this.LinkInfo.LinkExcludedInRobots)
    {
        // Excluded links are neither saved nor crawled.
        return;
    }

    SaveLink();

    if (crawlChildLinks)
    {
        CrawlChildLinks();
    }
}
/// <summary>
/// Runs <c>InitializeComponent()</c> and then assigns a fresh
/// <c>CollectorManager</c> to <c>CM</c>.
/// </summary>
public Form1()
{
    InitializeComponent();

    CM = new CollectorManager();
}
/// <summary>
/// Assigns a new <c>CollectorManager</c> to <c>CM</c> when the test class is constructed.
/// </summary>
public LinkRegistryTest()
{
    CM = new CollectorManager();
}
/// <summary>
/// Runs the single-argument constructor with <paramref name="cm"/>, then calls
/// <c>RetrieveInfo</c> with the link, backlink, text, rel and kind values.
/// </summary>
public LinkInfo(
    CollectorManager cm,
    string link,
    string backlink,
    string text,
    string rel,
    string kind)
    : this(cm)
{
    RetrieveInfo(link, backlink, text, rel, kind);
}
/// <summary>
/// Convenience overload: delegates to the six-argument constructor, supplying
/// empty strings for backlink, text, rel and kind.
/// </summary>
public LinkInfo(CollectorManager cm, string link)
    : this(cm, link, "", "", "", "")
{
}
/// <summary>
/// Base constructor: sets <c>Backlinks</c> to a new empty list and stores
/// <paramref name="cm"/> in <c>CM</c>.
/// </summary>
public LinkInfo(CollectorManager cm)
{
    Backlinks = new List<string>();
    CM = cm;
}
/// <summary>
/// Smoke test: constructs a <c>CollectorManager</c> and calls <c>Start("127.0.0.1")</c>.
/// There are no assertions, so the test only fails if an exception is thrown.
/// </summary>
public void TestCrawlManagerStart()
{
    new CollectorManager().Start("127.0.0.1");
}