Example 1
 /// <summary>
 /// Loads the cache with the banned host entries stored in the database.
 /// </summary>
 public void LoadCache()
 {
     try
     {
         try
         {
             dbcon.Open();
         }
         catch
         { } //ignore the failure; the connection state is checked right below
         if (dbcon.State == ConnectionState.Closed)
         {
             //log a message
             return;
         }
         SqlCommand cmd = new SqlCommand("cw_select_robots", dbcon);
         cmd.CommandType = CommandType.StoredProcedure;
         SqlDataAdapter da = new SqlDataAdapter(cmd);
         DataSet ds = new DataSet();
         da.Fill(ds);
         da.Dispose();
         cmd.Dispose();
         dbcon.Close();
         hosts.Clear();
         byte[] key;
         foreach (DataRow dr in ds.Tables[0].Rows)
         {
             key = ((Guid)dr[0]).ToByteArray();
             RobotsTxtEntry entry = new RobotsTxtEntry((DateTime)dr[2], SplitPaths((string)dr[1]));
             hosts.Add(key, entry); //keys are unique in the database and the cache was just cleared, so no existence check is needed
         }
         ds.Dispose();
         OnCacheUpdated(EventArgs.Empty);
     }
     catch
     {
         if (dbcon.State != ConnectionState.Closed)
         {
             try
             {
                 dbcon.Close();
             }
             catch
             { }
         }
         GC.Collect();
     }
 }
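LoadCache relies on a SplitPaths helper (also used in Examples 2 and 5, together with its counterpart ConcatenatePaths) that is not part of this listing. A minimal sketch of what these helpers might look like, assuming the disallowed paths are persisted as a single space-delimited string (the delimiter is an assumption; it is not shown in the examples):

 /// <summary>
 /// Plausible sketch of the path helpers referenced above; the delimiter is assumed.
 /// </summary>
 private static string[] SplitPaths(string paths)
 {
     if (String.IsNullOrEmpty(paths))
     {
         return new string[0];
     }
     return paths.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
 }

 private static string ConcatenatePaths(string[] paths)
 {
     if (paths == null || paths.Length == 0)
     {
         return String.Empty;
     }
     return String.Join(" ", paths);
 }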
Example 2
 /// <summary>
 /// Works the same way as the indexer, but if the requested entry is not found in the
 /// cache it is looked up in the database before null is returned.
 /// </summary>
 /// <param name="hostName">The  host name.</param>
 public RobotsTxtEntry GetEntry(string hostName)
 {
     byte[] key = MD5Hash.md5(hostName);
     RobotsTxtEntry entry = null;
     if (hosts.TryGetValue(key, out entry))
     {
         return entry;
     }
     try
     {
         try
         {
             dbcon.Open();
         }
         catch
         { } //ignore the failure; the connection state is checked right below
         if (dbcon.State == ConnectionState.Closed)
         {
             //log a message
             return null;
         }
         SqlCommand cmd = new SqlCommand("cw_select_robot", dbcon);
         cmd.CommandType = CommandType.StoredProcedure;
         cmd.Parameters.Add("@host_id", SqlDbType.UniqueIdentifier);
         cmd.Parameters[0].Value = new Guid(key);
         SqlDataAdapter da = new SqlDataAdapter(cmd);
         DataSet ds = new DataSet();
         da.Fill(ds);
         da.Dispose();
         cmd.Dispose();
         dbcon.Close();
         if (ds.Tables[0].Rows.Count == 0)
         {
             return null;
         }
         entry = new RobotsTxtEntry();
         entry.DisallowedPaths = SplitPaths((string)ds.Tables[0].Rows[0][1]);
         entry.ExpirationDate = (DateTime)ds.Tables[0].Rows[0][2];
         lock (hosts)
         {
             if (!hosts.ContainsKey(key))
             {
                 //guard against a concurrent AddEntry inserting the same key
                 hosts.Add(key, entry);
             }
         }
         ds.Dispose();
         OnCacheUpdated(EventArgs.Empty);
         return entry;
     }
     catch
     {
         if (dbcon.State != ConnectionState.Closed)
         {
             try
             {
                 dbcon.Close();
             }
             catch
             { }
         }
         GC.Collect();
         return null;
     }
 }
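GetEntry computes a fresh byte[] key and looks it up with TryGetValue. This only works if the hosts dictionary was created with an equality comparer that compares array contents, because the default comparer for byte[] keys uses reference equality. A minimal sketch of such a comparer; the class name and the dictionary construction in the trailing comment are assumptions, not part of the original code:

 /// <summary>
 /// Compares 16-byte MD5 keys by content rather than by reference, so that lookups
 /// with freshly computed hashes find existing entries.
 /// </summary>
 private class ByteArrayComparer : IEqualityComparer<byte[]>
 {
     public bool Equals(byte[] x, byte[] y)
     {
         if (x == null || y == null || x.Length != y.Length)
         {
             return x == y; //reference equality covers the null/null case
         }
         for (int i = 0; i < x.Length; i++)
         {
             if (x[i] != y[i])
             {
                 return false;
             }
         }
         return true;
     }

     public int GetHashCode(byte[] obj)
     {
         //the first four bytes of an MD5 hash are already well distributed
         return BitConverter.ToInt32(obj, 0);
     }
 }

 //hypothetical construction of the cache dictionary:
 //private Dictionary<byte[], RobotsTxtEntry> hosts = new Dictionary<byte[], RobotsTxtEntry>(new ByteArrayComparer());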
Example 3
 /// <summary>
 /// Adds a new robots entry to the cache
 /// </summary>
 /// <param name="hostNameMD5">The md5 of the hostname to add to the cache.</param>
 /// <param name="entry">The <see cref="RobotsTxtEntry"/> related to the host.</param>
 public void AddEntry(byte[] hostNameMD5, RobotsTxtEntry entry)
 {
     if (hostNameMD5.Length != 16)
     {
         throw new ArgumentException("Only 16 byte keys are accepted.");
     }
     lock (hosts)
     {
         if (!hosts.ContainsKey(hostNameMD5))
         {
             hosts.Add(hostNameMD5, entry);
             OnCacheUpdated(EventArgs.Empty);
         }
     }
 }
Example 4
 /// <summary>
 /// Adds a new robots entry to the cache
 /// </summary>
 /// <param name="hostName">A string containing the hostname to add to the cache.</param>
 /// <param name="entry">The <see cref="RobotsTxtEntry"/> related to the host.</param>
 public void AddEntry(string hostName, RobotsTxtEntry entry)
 {
     byte[] key = MD5Hash.md5(hostName);
     lock (hosts)
     {
         if (!hosts.ContainsKey(key))
         {
             hosts.Add(key, entry);
             OnCacheUpdated(EventArgs.Empty);
         }
     }
 }
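A brief usage sketch of the two AddEntry overloads; the cache variable and the sample host and paths are hypothetical:

 //hypothetical caller; 'cache' is an instance of the class these examples belong to
 RobotsTxtEntry entry = new RobotsTxtEntry();
 entry.DisallowedPaths = new string[] { "/cgi-bin/", "/private/" };
 entry.ExpirationDate = DateTime.Today.AddDays(7);
 //the string overload hashes the host name itself...
 cache.AddEntry("www.example.org", entry);
 //...while the byte[] overload expects a precomputed 16-byte MD5 key;
 //since the key already exists after the call above, this second call is a no-op
 cache.AddEntry(MD5Hash.md5("www.example.org"), entry);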
Example 5
 /// <summary>
 /// Checks if the Robots Exclusion Standard allows the crawler to visit a url.
 /// </summary>
 /// <param name="targetUrl">The url that is to be validated.</param>
 /// <param name="sourceUrl">The <see cref="InternetUrlToCrawl"/> containing the targetUrl.</param>
 /// <param name="robotsMeta">A <see cref="RobotsMetaTagValue"/> flag indicating the 
 /// restrictions posed by the robots meta tag contained in the sourceUrl.</param>
 /// <returns> A <see cref="Boolean"/> value indicating whether the crawler is 
 /// allowed (false) or disallowed (true) to visit the target Url.</returns>
 /// <remarks>This method is safe for multi-threaded operations. However only one
 /// thread will be able to perform a check at any given time.
 /// </remarks>
 public bool FilterUrl(string targetUrl, InternetUrlToCrawl sourceUrl, RobotsMetaTagValue robotsMeta)
 {
     bool retVal = false; //assume that it's allowed to crawl the targetUrl
     try
     {
         mutex.WaitOne();
         //perhaps we should use the hash code of the hostnames as keys.
         string targetHost = InternetUtils.HostName(targetUrl);
         string sourceHost = InternetUtils.HostName(sourceUrl);
         RobotsTxtEntry robots = null;
         //Do we need to fetch the robots.txt for the source Url?
         if(sourceUrl.FlagFetchRobots)
         {
             //we must fetch the robots.txt from the source url host and update sourceUrl.
             robots = FetchRobots(sourceHost);
             sourceUrl.RobotsDisallowedPaths = ConcatenatePaths(robots.DisallowedPaths);
             sourceUrl.FlagFetchRobots = false; //fetch it only once
             //check if it exists in the Hashtable, if so update it, otherwise add it
             if(robotsTable.ContainsKey(sourceHost))
             {
                 robotsTable[sourceHost] = robots;
             }
             else
             {
                 robotsTable.Add(sourceHost, robots);
             }
         }
         else
         {
             //check if it exists in the table; if not, build it from the InternetUrlToCrawl, otherwise refresh it if it has expired
             if(!robotsTable.TryGetValue(sourceHost, out robots))
             {
                 robots = new RobotsTxtEntry();
                 robots.DisallowedPaths = SplitPaths(sourceUrl.RobotsDisallowedPaths);
                 robotsTable.Add(sourceHost, robots);
             }
             else
             {
                 if(robots.ExpirationDate<DateTime.Today)
                 {
                     robots = FetchRobots(sourceHost);
                     robotsTable[sourceHost] = robots;
                 }
             }
         }
         if(targetHost != sourceHost)
         {
             //the target url is on a different host, we must get its robots.txt
             if(!robotsTable.TryGetValue(targetHost, out robots))
             {
                 robots = FetchRobots(targetHost);
                 robotsTable.Add(targetHost, robots);
             }
             else
             {
                 if(robots.ExpirationDate<DateTime.Today)
                 {
                     robots = FetchRobots(targetHost);
                     robotsTable[targetHost] = robots;
                 }
             }
         }
         if((robotsMeta & RobotsMetaTagValue.NoFollow)>0)
         {
             //if the meta tag has the NoFollow option set then we cannot crawl targetUrl
             retVal = true;
         }
         else
         {
             robots = robotsTable[targetHost];
             //if the DisallowedPaths is null then we can crawl targetUrl, otherwise we must check the disallowed paths
             if(robots.DisallowedPaths!=null)
             {
                 for(int i = 0; i < robots.DisallowedPaths.Length; i++)
                 {
                     if(targetUrl.IndexOf(robots.DisallowedPaths[i])!=-1)
                     {
                         //we found a match. It is therefore not allowed to crawl targetUrl
                         retVal = true;
                         break; //stop searching as soon as we have a match
                     }
                 }
             }
         }
     }
     catch(Exception e)
     {
         if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
         {
             globals.FileLog.LogWarning("RobotsFilter failed to filter " + targetUrl + ": " + e.ToString());
         }
     }
     finally
     {
         mutex.ReleaseMutex();
     }
     return retVal;
 }
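The robots meta check above treats RobotsMetaTagValue as a bit-flag enumeration and tests whether the NoFollow bit is set. A sketch of what such an enumeration might look like; only NoFollow appears in the listing, so the other member names and every numeric value are assumptions:

 [Flags]
 public enum RobotsMetaTagValue
 {
     //member names other than NoFollow, and all numeric values, are assumptions
     Index = 1,
     NoIndex = 2,
     Follow = 4,
     NoFollow = 8
 }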
Example 6
 /// <summary>
 /// Downloads a robots.txt from a specified host, parses it and constructs a new
 /// <see cref="RobotsTxtEntry"/> object with the entries of the downloaded file.
 /// </summary>
 /// <param name="hostname">The host for which the robots.txt file is required.</param>
 /// <returns>A new <see cref="RobotsTxtEntry"/> object based on the newly downloaded
 /// robots.txt file.</returns>
 private RobotsTxtEntry FetchRobots(string hostname)
 {
     RobotsTxtEntry retVal = new RobotsTxtEntry();
     try
     {
         string robots = DownloadRobots(hostname);
         if(robots!=null)
         {
             if(robots == String.Empty)
             {
                 retVal.DisallowedPaths = new string[0];
             }
             else
             {
                 retVal.DisallowedPaths = ParseRobots(robots);
             }
         }
     }
     catch
     {}
     return retVal;
 }
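FetchRobots delegates to DownloadRobots and ParseRobots, neither of which is shown in these examples. A minimal sketch of what ParseRobots might look like, assuming it simply collects every Disallow: value in the file; a real parser would also honour per-User-agent sections and other directives:

 private static string[] ParseRobots(string robots)
 {
     List<string> paths = new List<string>(); //requires System.Collections.Generic
     foreach (string rawLine in robots.Split('\n'))
     {
         string line = rawLine.Trim();
         if (line.StartsWith("Disallow:", StringComparison.OrdinalIgnoreCase))
         {
             string path = line.Substring("Disallow:".Length).Trim();
             if (path.Length > 0 && !paths.Contains(path))
             {
                 paths.Add(path);
             }
         }
     }
     return paths.ToArray();
 }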