Exemplo n.º 1
0
        /// <summary>
        /// Fetch the robots file that applies to a URI. The stored site record is
        /// consulted first; the file is only downloaded when that record is missing
        /// or flags the robots file as needing a visit.
        /// </summary>
        /// <param name="anyUri">Any URI for which you want the robots file for</param>
        /// <returns>
        /// The robots file, or null when the site is blocked, when the download
        /// returned nothing, or when an exception occurred (exceptions are logged
        /// and not rethrown).
        /// </returns>
        public static async Task <RobotsFile> GetFile(Uri anyUri)
        {
            Site       site          = null;
            RobotsFile robotsFile    = null;
            var        robotsUri     = MakeRobotsUri(anyUri);
            bool       needsVisiting = true;

            try
            {
                var db = await DatabasePool.GetDatabaseAsync();

                try
                {
                    site = await db.GetSite(robotsUri);
                }
                finally
                {
                    // Always hand the connection back, even if the lookup throws;
                    // otherwise each failed lookup leaks one pooled database.
                    await DatabasePool.GiveBackToPool(db);
                }

                if (site != null)
                {
                    needsVisiting = site.RobotsNeedsVisiting;
                }
                else
                {
                    // No stored record: start a fresh one; needsVisiting stays true.
                    site = MakeNewSite(anyUri);
                }

                if (needsVisiting)
                {
                    if (site != null && site.IsBlocked)
                    {
                        Utility.LogInfo("Can't get robots file as site is blocked by policy: " + robotsUri);
                        return(null);
                    }

                    robotsFile = await DownloadRobots(robotsUri, site.LastRobotsFetched);

                    // The fetch time and result are recorded even when the download
                    // returned null, so the stored record reflects this attempt.
                    site.LastRobotsFetched = DateTime.UtcNow;
                    site.RobotsFile        = robotsFile;

                    // The database is not held across the download; re-acquire one
                    // just for the save.
                    db = await DatabasePool.GetDatabaseAsync();

                    try
                    {
                        await db.SaveSite(site);
                    }
                    finally
                    {
                        await DatabasePool.GiveBackToPool(db);
                    }
                }
                else
                {
                    robotsFile = site.RobotsFile;
                }
            }
            catch (Exception ex)
            {
                // Best effort: log and fall through, returning whatever was obtained.
                Utility.LogException(ex);
            }
            return(robotsFile);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Download the robots file for the site a URI belongs to and wrap the
        /// response in a RobotsFile.
        /// </summary>
        /// <param name="anyUri">Any URI on the site; it is passed through MakeRobotsUri before fetching</param>
        /// <param name="lastFetched">When the robots file was last fetched, if known; handed to the fetcher unchanged</param>
        /// <returns>
        /// A RobotsFile built from the response stream, a default-constructed
        /// RobotsFile when the fetched packet has no response stream, or null when
        /// an exception occurred (it is logged and not rethrown).
        /// </returns>
        public static async Task <RobotsFile> DownloadRobots(Uri anyUri, DateTime?lastFetched)
        {
            var        robotsUri = MakeRobotsUri(anyUri);
            RobotsFile result    = null;

            try
            {
                // The address is not used below: the per-IP wait that consumed it is
                // disabled. The lookup is still awaited, so any failure it raises is
                // handled by the catch block before a fetch is attempted.
                var ip = await Utility.GetHostIPAddress(robotsUri);

                /*while (!await FetchoConfiguration.Current.HostCache.WaitToFetch(ip, 60000))
                 *  Utility.LogInfo("IP Congestion {0}", ip);*/

                // The fetcher takes its writer from a buffer block, so a one-item
                // block is created just to hand over the in-memory writer.
                var writerQueue = new BufferBlock <IWebResourceWriter>();

                using (var buffer = new MemoryStream())
                {
                    using (var writer = new WebDataPacketWriter(buffer))
                    {
                        await writerQueue.SendAsync(writer);

                        var fetcher = new HttpResourceFetcher();
                        await fetcher.Fetch(null, robotsUri, null, lastFetched, writerQueue);
                    }

                    // Rewind so the packet just written can be read back from the start.
                    buffer.Position = 0;

                    using (var reader = new WebDataPacketReader(CreateXmlReader(buffer)))
                    using (var responseStream = reader.GetResponseStream())
                    {
                        result = responseStream == null
                            ? new RobotsFile()
                            : new RobotsFile(robotsUri, responseStream);
                    }
                }
            }
            catch (Exception ex)
            {
                Utility.LogInfo("Fetching {0}:", robotsUri);
                Utility.LogException(ex);
            }

            return(result);
        }