Example #1
 /// <summary>
 /// Checks if the Robots Exclusion Standard allows the crawler to visit a URL.
 /// </summary>
 /// <param name="targetUrl">The URL that is to be validated.</param>
 /// <param name="sourceUrl">The <see cref="InternetUrlToCrawl"/> containing the targetUrl.</param>
 /// <param name="robotsMeta">A <see cref="RobotsMetaTagValue"/> flag indicating the
 /// restrictions posed by the robots meta tag contained in the sourceUrl.</param>
 /// <returns>A <see cref="Boolean"/> value indicating whether the crawler is
 /// allowed (false) or disallowed (true) to visit the target URL.</returns>
 /// <remarks>This method is safe for multi-threaded operations; however, only one
 /// thread can perform a check at any given time.
 /// </remarks>
 public bool FilterUrl(string targetUrl, InternetUrlToCrawl sourceUrl, RobotsMetaTagValue robotsMeta)
 {
     bool retVal = false; //assume that it's allowed to crawl the targetUrl
     try
     {
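          //acquire the mutex so that only one thread performs a check at a time (see remarks)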
         mutex.WaitOne();
         //perhaps we should use the hash codes of the hostnames as keys.
         string targetHost = InternetUtils.HostName(targetUrl);
         string sourceHost = InternetUtils.HostName(sourceUrl);
         RobotsTxtEntry robots = null;
         //Do we need to fetch the robots.txt for the source Url?
         if(sourceUrl.FlagFetchRobots)
         {
             //we must fetch the robots.txt from the source url host and update sourceUrl.
             robots = FetchRobots(sourceHost);
             sourceUrl.RobotsDisallowedPaths = ConcatenatePaths(robots.DisallowedPaths);
             sourceUrl.FlagFetchRobots = false; //fetch it only once
              //the indexer adds a new entry for sourceHost or updates the existing one
              robotsTable[sourceHost] = robots;
         }
         else
         {
              //if there is no cached entry for this host, build one from the paths stored in sourceUrl; otherwise re-fetch it when it has expired
             if(!robotsTable.TryGetValue(sourceHost, out robots))
             {
                 robots = new RobotsTxtEntry();
                 robots.DisallowedPaths = SplitPaths(sourceUrl.RobotsDisallowedPaths);
                 robotsTable.Add(sourceHost, robots);
             }
             else
             {
                 if(robots.ExpirationDate<DateTime.Today)
                 {
                     robots = FetchRobots(sourceHost);
                     robotsTable[sourceHost] = robots;
                 }
             }
         }
         if(targetHost != sourceHost)
         {
             //the target url is on a different host, we must get its robots.txt
             if(!robotsTable.TryGetValue(targetHost, out robots))
             {
                 robots = FetchRobots(targetHost);
                 robotsTable.Add(targetHost, robots);
             }
             else
             {
                 if(robots.ExpirationDate<DateTime.Today)
                 {
                     robots = FetchRobots(targetHost);
                     robotsTable[targetHost] = robots;
                 }
             }
         }
         if((robotsMeta & RobotsMetaTagValue.NoFollow)>0)
         {
             //if the meta tag has the NoFollow option set then we cannot crawl targetUrl
             retVal = true;
         }
         else
         {
             robots = robotsTable[targetHost];
              //if DisallowedPaths is null we can crawl targetUrl; otherwise check every disallowed path
             if(robots.DisallowedPaths!=null)
             {
                 for(int i = 0; i < robots.DisallowedPaths.Length; i++)
                 {
                     if(targetUrl.IndexOf(robots.DisallowedPaths[i])!=-1)
                     {
                         //we found a match. It is therefore not allowed to crawl targetUrl
                         retVal = true;
                         break; //stop searching as soon as we have a match
                     }
                 }
             }
         }
     }
     catch(Exception e)
     {
         if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
         {
             globals.FileLog.LogWarning("RobotsFilter failed to filter " + targetUrl + ": " + e.ToString());
         }
     }
     finally
     {
         mutex.ReleaseMutex();
     }
     return retVal;
 }
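
A minimal calling sketch, not taken from the CrawlWave sources: it assumes FilterUrl lives on the RobotsFilter class named in the warning log, that InternetUrlToCrawl can be constructed from a URL string, and that robotsFilter and pendingUrls are objects created elsewhere by the crawler; passing 0 for the meta-tag flags stands for "no restrictions".

//hypothetical caller: decide whether a link discovered on sourceUrl may be scheduled for crawling
InternetUrlToCrawl sourceUrl = new InternetUrlToCrawl("http://www.example.com/index.html"); //assumed constructor
RobotsMetaTagValue robotsMeta = 0; //normally parsed from the source page's robots meta tag; 0 = no restrictions
string link = "http://www.example.com/private/report.html";
//FilterUrl returns false when crawling is allowed and true when it is disallowed
if(!robotsFilter.FilterUrl(link, sourceUrl, robotsMeta))
{
    pendingUrls.Enqueue(link); //hypothetical crawl queue
}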