/// <summary>
/// Loads the cache with the banned host entries stored in the database.
/// </summary>
public void LoadCache()
{
    try
    {
        try
        {
            dbcon.Open();
        }
        catch
        { }
        if (dbcon.State == ConnectionState.Closed)
        {
            // log a message
            return;
        }
        SqlCommand cmd = new SqlCommand("cw_select_robots", dbcon);
        cmd.CommandType = CommandType.StoredProcedure;
        SqlDataAdapter da = new SqlDataAdapter(cmd);
        DataSet ds = new DataSet();
        da.Fill(ds);
        da.Dispose();
        cmd.Dispose();
        dbcon.Close();
        hosts.Clear();
        byte[] key;
        foreach (DataRow dr in ds.Tables[0].Rows)
        {
            key = ((Guid)dr[0]).ToByteArray();
            RobotsTxtEntry entry = new RobotsTxtEntry((DateTime)dr[2], SplitPaths((string)dr[1]));
            hosts.Add(key, entry); // the cache has just been cleared, so no existence check is needed
        }
        ds.Dispose();
        OnCacheUpdated(EventArgs.Empty);
    }
    catch
    {
        if (dbcon.State != ConnectionState.Closed)
        {
            try
            {
                dbcon.Close();
            }
            catch
            { }
        }
        GC.Collect();
    }
}
/// <summary>
/// Works the same way as the indexer, but if the requested entry is not found in the
/// cache it is requested from the database again before returning null.
/// </summary>
/// <param name="hostName">The host name.</param>
public RobotsTxtEntry GetEntry(string hostName)
{
    byte[] key = MD5Hash.md5(hostName);
    RobotsTxtEntry entry = null;
    if (hosts.TryGetValue(key, out entry))
    {
        return entry;
    }
    try
    {
        try
        {
            dbcon.Open();
        }
        catch
        { }
        if (dbcon.State == ConnectionState.Closed)
        {
            // log a message
            return null;
        }
        SqlCommand cmd = new SqlCommand("cw_select_robot", dbcon);
        cmd.CommandType = CommandType.StoredProcedure;
        cmd.Parameters.Add("@host_id", SqlDbType.UniqueIdentifier);
        cmd.Parameters[0].Value = new Guid(key);
        SqlDataAdapter da = new SqlDataAdapter(cmd);
        DataSet ds = new DataSet();
        da.Fill(ds);
        da.Dispose();
        cmd.Dispose();
        dbcon.Close();
        if (ds.Tables[0].Rows.Count == 0)
        {
            return null;
        }
        entry = new RobotsTxtEntry();
        entry.DisallowedPaths = SplitPaths((string)ds.Tables[0].Rows[0][1]);
        entry.ExpirationDate = (DateTime)ds.Tables[0].Rows[0][2];
        hosts.Add(key, entry);
        ds.Dispose();
        OnCacheUpdated(EventArgs.Empty);
        return entry;
    }
    catch
    {
        if (dbcon.State != ConnectionState.Closed)
        {
            try
            {
                dbcon.Close();
            }
            catch
            { }
        }
        GC.Collect();
        return null;
    }
}
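// Illustrative usage sketch (not part of the original class): the cache instance
// name "robotsCache" and the host name are assumptions, shown only to demonstrate
// the lookup flow of GetEntry (in-memory cache first, then the database).
//
//   RobotsTxtEntry entry = robotsCache.GetEntry("www.example.com");
//   if (entry == null)
//   {
//       // neither the in-memory cache nor the database knows this host yet
//   }
//   else
//   {
//       string[] disallowed = entry.DisallowedPaths;
//   }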
/// <summary>
/// Adds a new robots entry to the cache.
/// </summary>
/// <param name="hostNameMD5">The MD5 hash of the hostname to add to the cache.</param>
/// <param name="entry">The <see cref="RobotsTxtEntry"/> related to the host.</param>
public void AddEntry(byte[] hostNameMD5, RobotsTxtEntry entry)
{
    if (hostNameMD5.Length != 16)
    {
        throw new ArgumentException("Only 16 byte keys are accepted.");
    }
    lock (hosts)
    {
        if (!hosts.ContainsKey(hostNameMD5))
        {
            hosts.Add(hostNameMD5, entry);
            OnCacheUpdated(EventArgs.Empty);
        }
    }
}
/// <summary>
/// Adds a new robots entry to the cache.
/// </summary>
/// <param name="hostName">A string containing the hostname to add to the cache.</param>
/// <param name="entry">The <see cref="RobotsTxtEntry"/> related to the host.</param>
public void AddEntry(string hostName, RobotsTxtEntry entry)
{
    byte[] key = MD5Hash.md5(hostName);
    lock (hosts)
    {
        if (!hosts.ContainsKey(key))
        {
            hosts.Add(key, entry);
            OnCacheUpdated(EventArgs.Empty);
        }
    }
}
/// <summary>
/// Checks if the Robots Exclusion Standard allows the crawler to visit a url.
/// </summary>
/// <param name="targetUrl">The url that is to be validated.</param>
/// <param name="sourceUrl">The <see cref="InternetUrlToCrawl"/> containing the targetUrl.</param>
/// <param name="robotsMeta">A <see cref="RobotsMetaTagValue"/> flag indicating the
/// restrictions posed by the robots meta tag contained in the sourceUrl.</param>
/// <returns>A <see cref="Boolean"/> value indicating whether the crawler is
/// allowed (false) or disallowed (true) to visit the target Url.</returns>
/// <remarks>This method is safe for multi-threaded operations. However, only one
/// thread will be able to perform a check at any given time.
/// </remarks>
public bool FilterUrl(string targetUrl, InternetUrlToCrawl sourceUrl, RobotsMetaTagValue robotsMeta)
{
    bool retVal = false; // assume that it's allowed to crawl the targetUrl
    try
    {
        mutex.WaitOne();
        // perhaps we should use the hash code of the hostnames as keys.
        string targetHost = InternetUtils.HostName(targetUrl);
        string sourceHost = InternetUtils.HostName(sourceUrl);
        RobotsTxtEntry robots = null;
        // Do we need to fetch the robots.txt for the source Url?
        if (sourceUrl.FlagFetchRobots)
        {
            // we must fetch the robots.txt from the source url host and update sourceUrl.
            robots = FetchRobots(sourceHost);
            sourceUrl.RobotsDisallowedPaths = ConcatenatePaths(robots.DisallowedPaths);
            sourceUrl.FlagFetchRobots = false; // fetch it only once
            // check if it exists in the table; if so update it, otherwise add it
            if (robotsTable.ContainsKey(sourceHost))
            {
                robotsTable[sourceHost] = robots;
            }
            else
            {
                robotsTable.Add(sourceHost, robots);
            }
        }
        else
        {
            // check if it exists in the table. If so, check if it has expired; otherwise build it from the InternetUrlToCrawl
            if (!robotsTable.TryGetValue(sourceHost, out robots))
            {
                robots = new RobotsTxtEntry();
                robots.DisallowedPaths = SplitPaths(sourceUrl.RobotsDisallowedPaths);
                robotsTable.Add(sourceHost, robots);
            }
            else
            {
                if (robots.ExpirationDate < DateTime.Today)
                {
                    robots = FetchRobots(sourceHost);
                    robotsTable[sourceHost] = robots;
                }
            }
        }
        if (targetHost != sourceHost)
        {
            // the target url is on a different host, we must get its robots.txt
            if (!robotsTable.TryGetValue(targetHost, out robots))
            {
                robots = FetchRobots(targetHost);
                robotsTable.Add(targetHost, robots);
            }
            else
            {
                if (robots.ExpirationDate < DateTime.Today)
                {
                    robots = FetchRobots(targetHost);
                    robotsTable[targetHost] = robots;
                }
            }
        }
        if ((robotsMeta & RobotsMetaTagValue.NoFollow) > 0)
        {
            // if the meta tag has the NoFollow option set then we cannot crawl targetUrl
            retVal = true;
        }
        else
        {
            robots = robotsTable[targetHost];
            // if DisallowedPaths is null then we can crawl targetUrl, otherwise we must check the disallowed paths
            if (robots.DisallowedPaths != null)
            {
                for (int i = 0; i < robots.DisallowedPaths.Length; i++)
                {
                    if (targetUrl.IndexOf(robots.DisallowedPaths[i]) != -1)
                    {
                        // we found a match, so it is not allowed to crawl targetUrl
                        retVal = true;
                        break; // stop searching as soon as we have a match
                    }
                }
            }
        }
    }
    catch (Exception e)
    {
        if (globals.Settings.LogLevel <= CWLogLevel.LogWarning)
        {
            globals.FileLog.LogWarning("RobotsFilter failed to filter " + targetUrl + ": " + e.ToString());
        }
    }
    finally
    {
        mutex.ReleaseMutex();
    }
    return retVal;
}
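// Illustrative usage sketch (not part of the original class): the names "filter",
// "extractedLinks", "sourceUrl", "pageMeta" and "queue" are assumptions; the sketch
// only shows how a crawler loop might consult FilterUrl before scheduling a link,
// given that FilterUrl returns true when the visit is disallowed.
//
//   foreach (string link in extractedLinks)
//   {
//       // sourceUrl is the InternetUrlToCrawl whose page produced the links,
//       // pageMeta is the RobotsMetaTagValue parsed from that page's meta tags.
//       if (!filter.FilterUrl(link, sourceUrl, pageMeta))
//       {
//           // false means the Robots Exclusion Standard allows the visit
//           queue.Enqueue(link);
//       }
//   }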
/// <summary>
/// Downloads a robots.txt from a specified host, parses it and constructs a new
/// <see cref="RobotsTxtEntry"/> object with the entries of the downloaded file.
/// </summary>
/// <param name="hostname">The host for which the robots.txt file is required.</param>
/// <returns>A new <see cref="RobotsTxtEntry"/> object based on the newly downloaded
/// robots.txt file.</returns>
private RobotsTxtEntry FetchRobots(string hostname)
{
    RobotsTxtEntry retVal = new RobotsTxtEntry();
    try
    {
        string robots = DownloadRobots(hostname);
        if (robots != null)
        {
            if (robots == String.Empty)
            {
                retVal.DisallowedPaths = new string[0];
            }
            else
            {
                retVal.DisallowedPaths = ParseRobots(robots);
            }
        }
    }
    catch
    { }
    return retVal;
}
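// DownloadRobots and ParseRobots are defined elsewhere in the class and are not shown
// here. The following is a minimal illustrative sketch of what a disallow-path parser
// could look like; it is NOT the project's actual ParseRobots implementation, and it
// assumes the crawler only honours the "User-agent: *" section of the file.
/// <summary>
/// Hypothetical sketch of a robots.txt parser: collects the Disallow paths that
/// appear under a "User-agent: *" section. Name and behaviour are assumptions.
/// </summary>
private static string[] ParseRobotsSketch(string robotsTxt)
{
    System.Collections.Generic.List<string> paths = new System.Collections.Generic.List<string>();
    bool inGlobalSection = false;
    foreach (string rawLine in robotsTxt.Split('\n'))
    {
        string line = rawLine.Trim();
        if (line.Length == 0 || line.StartsWith("#"))
        {
            continue; // skip blank lines and comments
        }
        if (line.StartsWith("User-agent:", StringComparison.OrdinalIgnoreCase))
        {
            // the rules that follow apply to every crawler only if the agent is "*"
            inGlobalSection = line.Substring(11).Trim() == "*";
        }
        else if (inGlobalSection && line.StartsWith("Disallow:", StringComparison.OrdinalIgnoreCase))
        {
            string path = line.Substring(9).Trim();
            if (path.Length > 0)
            {
                paths.Add(path); // an empty Disallow value means "allow everything"
            }
        }
    }
    return paths.ToArray();
}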