/// <summary>
/// Checks if the Robots Exclusion Standard allows the crawler to visit a url.
/// </summary>
/// <param name="targetUrl">The url that is to be validated.</param>
/// <param name="sourceUrl">The <see cref="InternetUrlToCrawl"/> containing the targetUrl.</param>
/// <param name="robotsMeta">A <see cref="RobotsMetaTagValue"/> flag indicating the
/// restrictions posed by the robots meta tag contained in the sourceUrl.</param>
/// <returns>A <see cref="Boolean"/> value indicating whether the crawler is
/// allowed (false) or disallowed (true) to visit the target Url.</returns>
/// <remarks>This method is safe for multi-threaded operations. However, only one
/// thread will be able to perform a check at any given time.
/// </remarks>
public bool FilterUrl(string targetUrl, InternetUrlToCrawl sourceUrl, RobotsMetaTagValue robotsMeta)
{
    bool retVal = false; //assume that it's allowed to crawl the targetUrl
    try
    {
        mutex.WaitOne();
        //perhaps we should use the hash code of the hostnames as keys.
        string targetHost = InternetUtils.HostName(targetUrl);
        string sourceHost = InternetUtils.HostName(sourceUrl);
        RobotsTxtEntry robots = null;
        //Do we need to fetch the robots.txt for the source Url?
        if(sourceUrl.FlagFetchRobots)
        {
            //we must fetch the robots.txt from the source url host and update sourceUrl.
            robots = FetchRobots(sourceHost);
            sourceUrl.RobotsDisallowedPaths = ConcatenatePaths(robots.DisallowedPaths);
            sourceUrl.FlagFetchRobots = false; //fetch it only once
            //check if an entry exists in the table; if so update it, otherwise add it
            if(robotsTable.ContainsKey(sourceHost))
            {
                robotsTable[sourceHost] = robots;
            }
            else
            {
                robotsTable.Add(sourceHost, robots);
            }
        }
        else
        {
            //check if an entry exists in the table. If so, check whether it has expired;
            //otherwise build it from the paths stored in the InternetUrlToCrawl.
            if(!robotsTable.TryGetValue(sourceHost, out robots))
            {
                robots = new RobotsTxtEntry();
                robots.DisallowedPaths = SplitPaths(sourceUrl.RobotsDisallowedPaths);
                robotsTable.Add(sourceHost, robots);
            }
            else
            {
                if(robots.ExpirationDate < DateTime.Today)
                {
                    robots = FetchRobots(sourceHost);
                    robotsTable[sourceHost] = robots;
                }
            }
        }
        if(targetHost != sourceHost)
        {
            //the target url is on a different host, so we must get that host's robots.txt as well
            if(!robotsTable.TryGetValue(targetHost, out robots))
            {
                robots = FetchRobots(targetHost);
                robotsTable.Add(targetHost, robots);
            }
            else
            {
                if(robots.ExpirationDate < DateTime.Today)
                {
                    robots = FetchRobots(targetHost);
                    robotsTable[targetHost] = robots;
                }
            }
        }
        if((robotsMeta & RobotsMetaTagValue.NoFollow) > 0)
        {
            //if the meta tag has the NoFollow option set then we cannot crawl targetUrl
            retVal = true;
        }
        else
        {
            robots = robotsTable[targetHost];
            //if DisallowedPaths is null we can crawl targetUrl; otherwise check each disallowed path
            if(robots.DisallowedPaths != null)
            {
                for(int i = 0; i < robots.DisallowedPaths.Length; i++)
                {
                    //use an ordinal comparison: disallowed paths are raw url fragments,
                    //so culture-sensitive matching would be incorrect here
                    if(targetUrl.IndexOf(robots.DisallowedPaths[i], StringComparison.Ordinal) != -1)
                    {
                        //we found a match, so we are not allowed to crawl targetUrl
                        retVal = true;
                        break; //stop searching as soon as we have a match
                    }
                }
            }
        }
    }
    catch(Exception e)
    {
        if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
        {
            globals.FileLog.LogWarning("RobotsFilter failed to filter " + targetUrl + ": " + e.ToString());
        }
    }
    finally
    {
        mutex.ReleaseMutex();
    }
    return retVal;
}
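
// A minimal usage sketch showing how a crawl loop might consult FilterUrl before
// queueing a discovered link. This method and the helpers it calls (ExtractLinks,
// EnqueueForCrawling) are hypothetical illustrations and are not part of this class;
// only the FilterUrl call and its inverted return convention come from the code above.
private void QueueOutgoingLinks(InternetUrlToCrawl sourceUrl, RobotsMetaTagValue robotsMeta)
{
    foreach(string link in ExtractLinks(sourceUrl))
    {
        //FilterUrl returns true when the crawler is DISALLOWED from visiting the link,
        //so only enqueue it when the result is false
        if(!FilterUrl(link, sourceUrl, robotsMeta))
        {
            EnqueueForCrawling(link);
        }
    }
}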
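
// FilterUrl relies on ConcatenatePaths and SplitPaths (defined elsewhere in this
// class) round-tripping the disallowed path list through a single string stored on
// InternetUrlToCrawl. A minimal sketch of such a pair, assuming a space separator
// (spaces cannot occur in raw url paths); the real implementations may differ.
private static string ConcatenatePathsSketch(string[] paths)
{
    //null means no robots.txt restrictions were found; store the empty string
    return (paths == null) ? String.Empty : String.Join(" ", paths);
}

private static string[] SplitPathsSketch(string paths)
{
    return String.IsNullOrEmpty(paths) ? new string[0] : paths.Split(' ');
}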