/// <summary>
/// Fetch a robots file for a uri, using the copy stored on the site record
/// unless that record says the robots file needs visiting again.
/// </summary>
/// <param name="anyUri">Any URI for which you want the robots file for</param>
/// <returns>
/// The robots file for the site, or null when the site is blocked by policy.
/// Exceptions raised inside the lookup/download/save sequence are logged rather
/// than propagated, in which case whatever was obtained so far (possibly null)
/// is returned.
/// </returns>
public static async Task<RobotsFile> GetFile(Uri anyUri)
{
    RobotsFile robotsFile = null;
    var robotsUri = MakeRobotsUri(anyUri);

    try
    {
        Site site;

        // Borrow a database only for the duration of the lookup. The finally
        // block guarantees it goes back to the pool even when GetSite throws.
        var db = await DatabasePool.GetDatabaseAsync();
        try
        {
            site = await db.GetSite(robotsUri);
        }
        finally
        {
            await DatabasePool.GiveBackToPool(db);
        }

        // A site with no stored record always needs its robots file fetched.
        bool needsVisiting = true;
        if (site != null)
        {
            needsVisiting = site.RobotsNeedsVisiting;
        }
        else
        {
            site = MakeNewSite(anyUri);
        }

        if (!needsVisiting)
        {
            return site.RobotsFile;
        }

        if (site.IsBlocked)
        {
            Utility.LogInfo("Can't get robots file as site is blocked by policy: " + robotsUri);
            return null;
        }

        // robotsFile is assigned before the save so that a failure while
        // persisting still hands the downloaded file back to the caller.
        robotsFile = await DownloadRobots(robotsUri, site.LastRobotsFetched);
        site.LastRobotsFetched = DateTime.UtcNow;
        site.RobotsFile = robotsFile;

        db = await DatabasePool.GetDatabaseAsync();
        try
        {
            await db.SaveSite(site);
        }
        finally
        {
            await DatabasePool.GiveBackToPool(db);
        }
    }
    catch (Exception ex)
    {
        Utility.LogException(ex);
    }

    return robotsFile;
}
/// <summary>
/// Download a robots file and parse it.
/// </summary>
/// <param name="anyUri">URI handed to MakeRobotsUri to obtain the robots file address</param>
/// <param name="lastFetched">When the robots file was last fetched, if known; forwarded to the fetcher</param>
/// <returns>
/// The parsed robots file; a default-constructed RobotsFile when the fetched
/// packet has no response stream; or null when an exception occurs (the
/// exception is logged, not rethrown).
/// </returns>
public static async Task<RobotsFile> DownloadRobots(Uri anyUri, DateTime? lastFetched)
{
    var robotsUri = MakeRobotsUri(anyUri);
    RobotsFile result = null;

    try
    {
        // The host address is resolved here but not used further down; a
        // failure to resolve still aborts the download via the catch below.
        var hostAddress = await Utility.GetHostIPAddress(robotsUri);

        var writerQueue = new BufferBlock<IWebResourceWriter>();

        using (var buffer = new MemoryStream())
        {
            // The fetcher takes its writer from a block, so one is created just
            // for this call. Ideally robots would go through the standard flow.
            using (var writer = new WebDataPacketWriter(buffer))
            {
                await writerQueue.SendAsync(writer);

                var fetcher = new HttpResourceFetcher();
                await fetcher.Fetch(null, robotsUri, null, lastFetched, writerQueue);
            }

            // Rewind so the packet that was just written can be read back.
            buffer.Position = 0;

            using (var reader = new WebDataPacketReader(CreateXmlReader(buffer)))
            using (var responseStream = reader.GetResponseStream())
            {
                result = responseStream == null
                    ? new RobotsFile()
                    : new RobotsFile(robotsUri, responseStream);
            }
        }
    }
    catch (Exception ex)
    {
        Utility.LogInfo("Fetching {0}:", robotsUri);
        Utility.LogException(ex);
    }

    return result;
}