/// <summary>
/// Works the same way as the indexer, but if the requested entry is not found in the
/// cache it is also looked up in the database before null is returned.
/// </summary>
/// <param name="hostName">The host name whose robots.txt entry is requested.</param>
/// <returns>The <see cref="RobotsTxtEntry"/> for the host, or null if the host exists
/// neither in the cache nor in the database.</returns>
public RobotsTxtEntry GetEntry(string hostName)
{
    byte[] key = MD5Hash.md5(hostName);
    RobotsTxtEntry entry = null;
    if (hosts.TryGetValue(key, out entry))
    {
        return entry;
    }
    try
    {
        try
        {
            dbcon.Open();
        }
        catch
        {
            //if Open fails the state check below takes care of it
        }
        if (dbcon.State == ConnectionState.Closed)
        {
            //log a message
            return null;
        }
        using (SqlCommand cmd = new SqlCommand("cw_select_robot", dbcon))
        using (DataSet ds = new DataSet())
        {
            cmd.CommandType = CommandType.StoredProcedure;
            cmd.Parameters.Add("@host_id", SqlDbType.UniqueIdentifier);
            cmd.Parameters[0].Value = new Guid(key);
            using (SqlDataAdapter da = new SqlDataAdapter(cmd))
            {
                da.Fill(ds);
            }
            if (ds.Tables[0].Rows.Count == 0)
            {
                return null;
            }
            entry = new RobotsTxtEntry();
            entry.DisallowedPaths = SplitPaths((string)ds.Tables[0].Rows[0][1]);
            entry.ExpirationDate = (DateTime)ds.Tables[0].Rows[0][2];
            //lock the cache while adding, like AddEntry does, to stay thread-safe
            lock (hosts)
            {
                if (!hosts.ContainsKey(key))
                {
                    hosts.Add(key, entry);
                }
            }
            OnCacheUpdated(EventArgs.Empty);
            return entry;
        }
    }
    catch
    {
        return null;
    }
    finally
    {
        //make sure the connection is closed on both the success and the failure path
        if (dbcon.State != ConnectionState.Closed)
        {
            try { dbcon.Close(); } catch { }
        }
    }
}
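// Usage sketch (illustrative, not part of the original source; the variable name
// "robotsCache" is hypothetical and stands for an instance of this cache class):
// the indexer below consults only the in-memory cache, while GetEntry above falls
// through to the database before giving up.
//
//   RobotsTxtEntry cached = robotsCache["www.example.com"];          //cache only
//   RobotsTxtEntry entry = robotsCache.GetEntry("www.example.com");  //cache, then database
//   if (entry == null)
//   {
//       //the host is known neither to the cache nor to the database
//   }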
/// <summary>
/// Gets the <see cref="RobotsTxtEntry"/> cached for a host, or null if the host is
/// not contained in the cache.
/// </summary>
/// <param name="hostName">The host name used as the cache key.</param>
public RobotsTxtEntry this[string hostName]
{
    get
    {
        byte[] key = MD5Hash.md5(hostName);
        RobotsTxtEntry entry = null;
        hosts.TryGetValue(key, out entry);
        return entry;
    }
}
/// <summary>
/// Adds a new robots entry to the cache.
/// </summary>
/// <param name="hostName">A string containing the host name to add to the cache.</param>
/// <param name="entry">The <see cref="RobotsTxtEntry"/> related to the host.</param>
public void AddEntry(string hostName, RobotsTxtEntry entry)
{
    byte[] key = MD5Hash.md5(hostName);
    lock (hosts)
    {
        if (!hosts.ContainsKey(key))
        {
            hosts.Add(key, entry);
            OnCacheUpdated(EventArgs.Empty);
        }
    }
}
/// <summary>
/// Gets the <see cref="RobotsTxtEntry"/> cached for a host, or null if the host is
/// not contained in the cache.
/// </summary>
/// <param name="hostNameMD5">The 16-byte MD5 hash of the host name, used as the cache key.</param>
/// <exception cref="ArgumentException">Thrown if the key is not 16 bytes long.</exception>
public RobotsTxtEntry this[byte[] hostNameMD5]
{
    get
    {
        if (hostNameMD5.Length != 16)
        {
            throw new ArgumentException("Only 16 byte keys are accepted.");
        }
        RobotsTxtEntry entry = null;
        hosts.TryGetValue(hostNameMD5, out entry);
        return entry;
    }
}
/// <summary>
/// Adds a new robots entry to the cache.
/// </summary>
/// <param name="hostNameMD5">The 16-byte MD5 hash of the host name to add to the cache.</param>
/// <param name="entry">The <see cref="RobotsTxtEntry"/> related to the host.</param>
/// <exception cref="ArgumentException">Thrown if the key is not 16 bytes long.</exception>
public void AddEntry(byte[] hostNameMD5, RobotsTxtEntry entry)
{
    if (hostNameMD5.Length != 16)
    {
        throw new ArgumentException("Only 16 byte keys are accepted.");
    }
    lock (hosts)
    {
        if (!hosts.ContainsKey(hostNameMD5))
        {
            hosts.Add(hostNameMD5, entry);
            OnCacheUpdated(EventArgs.Empty);
        }
    }
}
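// A note on the 16-byte keys (an assumption drawn from the surrounding code, not part
// of the original source): MD5Hash.md5 yields a 16-byte digest, which is exactly the
// size of a Guid, so host ids can be stored as uniqueidentifier values in the database
// and converted back losslessly, as LoadCache below does. A minimal sketch:
//
//   byte[] key = MD5Hash.md5("www.example.com");  //16 bytes
//   Guid hostId = new Guid(key);                  //what the @host_id parameter receives
//   byte[] roundTrip = hostId.ToByteArray();      //the same 16 bytes come back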
/// <summary>
/// Loads the cache with the robots.txt entries stored in the database.
/// </summary>
public void LoadCache()
{
    try
    {
        try
        {
            dbcon.Open();
        }
        catch
        {
            //if Open fails the state check below takes care of it
        }
        if (dbcon.State == ConnectionState.Closed)
        {
            //log a message
            return;
        }
        using (SqlCommand cmd = new SqlCommand("cw_select_robots", dbcon))
        using (DataSet ds = new DataSet())
        {
            cmd.CommandType = CommandType.StoredProcedure;
            using (SqlDataAdapter da = new SqlDataAdapter(cmd))
            {
                da.Fill(ds);
            }
            hosts.Clear();
            byte[] key;
            foreach (DataRow dr in ds.Tables[0].Rows)
            {
                key = ((Guid)dr[0]).ToByteArray();
                RobotsTxtEntry entry = new RobotsTxtEntry((DateTime)dr[2], SplitPaths((string)dr[1]));
                //host ids are unique, so no existence check is needed before adding
                hosts.Add(key, entry);
            }
        }
        OnCacheUpdated(EventArgs.Empty);
    }
    catch
    {
        //swallow the error and leave the cache in whatever state it reached
    }
    finally
    {
        if (dbcon.State != ConnectionState.Closed)
        {
            try { dbcon.Close(); } catch { }
        }
    }
}
/// <summary>
/// Downloads a robots.txt file from a specified host, parses it and constructs a new
/// <see cref="RobotsTxtEntry"/> object from the entries of the downloaded file.
/// </summary>
/// <param name="hostname">The host for which the robots.txt file is required.</param>
/// <returns>A new <see cref="RobotsTxtEntry"/> object based on the newly downloaded
/// robots.txt file.</returns>
private RobotsTxtEntry FetchRobots(string hostname)
{
    RobotsTxtEntry retVal = new RobotsTxtEntry();
    try
    {
        string robots = DownloadRobots(hostname);
        if (robots != null)
        {
            if (robots == String.Empty)
            {
                //an empty robots.txt disallows nothing
                retVal.DisallowedPaths = new string[0];
            }
            else
            {
                retVal.DisallowedPaths = ParseRobots(robots);
            }
        }
    }
    catch
    {
        //on failure return an entry with no disallowed paths
    }
    return retVal;
}
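// DownloadRobots and ParseRobots are defined elsewhere in this class. For reference,
// a minimal sketch of the kind of parsing ParseRobots performs might look like the
// following (an assumption based on the Robots Exclusion Standard, not the actual
// implementation; a real parser must also honour User-agent sections and comments):
//
//   private static string[] ParseRobotsSketch(string robots)
//   {
//       ArrayList paths = new ArrayList();
//       foreach (string line in robots.Split('\n'))
//       {
//           string trimmed = line.Trim();
//           if (trimmed.ToLower().StartsWith("disallow:"))
//           {
//               string path = trimmed.Substring("disallow:".Length).Trim();
//               if (path.Length > 0)
//               {
//                   paths.Add(path);
//               }
//           }
//       }
//       return (string[])paths.ToArray(typeof(string));
//   }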
/// <summary>
/// Selects and returns a set of urls that are ready to be crawled.
/// </summary>
/// <param name="ci">The <see cref="ClientInfo"/> of the client requesting urls to crawl.</param>
/// <param name="data">An array of <see cref="InternetUrlToCrawl"/> objects containing the selected urls.</param>
/// <returns>Null if the operation succeeds, or a <see cref="SerializedException"/>
/// encapsulating the error that occurred if the operation fails.</returns>
public SerializedException SelectUrlsToCrawl(ClientInfo ci, ref InternetUrlToCrawl[] data)
{
    SerializedException sx = null;
    try
    {
        if (!ConnectToDatabase())
        {
            throw new CWDBConnectionFailedException();
        }
        //use a transaction to make sure that if something goes wrong the changes
        //to the database are rolled back.
        SqlTransaction transaction = dbcon.BeginTransaction(IsolationLevel.Serializable); //perhaps IsolationLevel.RepeatableRead would suffice
        try
        {
            //first select the urls to crawl
            DataSet ds = new DataSet();
            using (SqlCommand cmd = new SqlCommand("cw_select_urls_to_crawl", dbcon, transaction))
            {
                cmd.CommandType = CommandType.StoredProcedure;
                cmd.CommandTimeout = 120;
                using (SqlDataAdapter da = new SqlDataAdapter(cmd))
                {
                    da.Fill(ds);
                }
            }
            //the stored procedure also removes the selected urls from the queue;
            //now wrap each returned row in an InternetUrlToCrawl object.
            data = new InternetUrlToCrawl[ds.Tables[0].Rows.Count];
            if (data.Length > 0)
            {
                int i = 0;
                foreach (DataRow dr in ds.Tables[0].Rows)
                {
                    try
                    {
                        InternetUrlToCrawl url = new InternetUrlToCrawl((int)dr[0], (string)dr[1]);
                        if (dr[2] != DBNull.Value)
                        {
                            url.CRC = (long)dr[2];
                        }
                        if (dr[3] != DBNull.Value)
                        {
                            url.FlagDomain = (DomainFlagValue)((byte)dr[3]);
                        }
                        if (dr[4] != DBNull.Value)
                        {
                            url.RobotsDisallowedPaths = (string)dr[4];
                        }
                        else
                        {
                            RobotsTxtEntry entry = settings.Robots.GetEntry(InternetUtils.HostName(url));
                            if (entry != null)
                            {
                                url.RobotsDisallowedPaths = ConcatenatePaths(entry.DisallowedPaths);
                            }
                            else
                            {
                                url.FlagFetchRobots = true;
                            }
                        }
                        data[i++] = url;
                    }
                    catch
                    {
                        continue;
                    }
                }
                using (SqlCommand statscmd = new SqlCommand("cw_update_client_statistics", dbcon, transaction))
                {
                    statscmd.CommandType = CommandType.StoredProcedure;
                    statscmd.CommandTimeout = 120;
                    statscmd.Parameters.Add("@client_id", SqlDbType.UniqueIdentifier);
                    statscmd.Parameters.Add("@assigned", SqlDbType.BigInt);
                    statscmd.Parameters.Add("@returned", SqlDbType.BigInt);
                    statscmd.Parameters.Add("@type", SqlDbType.TinyInt);
                    statscmd.Parameters[0].Value = ci.ClientID;
                    statscmd.Parameters[1].Value = data.Length;
                    statscmd.Parameters[2].Value = DBNull.Value;
                    statscmd.Parameters[3].Value = 0;
                    statscmd.ExecuteNonQuery();
                }
            }
            //commit even if no urls were selected, so the transaction is never left open
            transaction.Commit();
        }
        catch (Exception ex)
        {
            transaction.Rollback();
            if (settings.LogLevel <= CWLogLevel.LogWarning)
            {
                settings.Log.LogWarning("SelectUrlsToCrawl failed, Transaction was rolled back: " + ex.ToString());
            }
            throw; //rethrow without resetting the stack trace
        }
        finally
        {
            UpdateClientLastActive(ci);
            LogClientAction(ci, CWClientActions.LogSendUrlsToCrawl);
            if (!DisconnectFromDatabase())
            {
                throw new CWDBConnectionFailedException("Disconnect from database failure.");
            }
        }
    }
    catch (Exception e)
    {
        sx = new SerializedException(e.GetType().ToString(), e.Message, e.ToString());
        if (settings.LogLevel <= CWLogLevel.LogWarning)
        {
            settings.Log.LogWarning("SelectUrlsToCrawl failed: " + e.ToString());
        }
    }
    return sx;
}
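// Caller-side sketch (illustrative, not part of the original source; the "server" and
// "clientInfo" names are hypothetical): a client component would consume the method
// roughly like this.
//
//   InternetUrlToCrawl[] urls = null;
//   SerializedException sx = server.SelectUrlsToCrawl(clientInfo, ref urls);
//   if (sx != null)
//   {
//       //the operation failed and sx describes the error; handle or log it here
//   }
//   else
//   {
//       foreach (InternetUrlToCrawl url in urls)
//       {
//           //dispatch the url to a crawling thread
//       }
//   }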
/// <summary>
/// Checks if the Robots Exclusion Standard allows the crawler to visit a url.
/// </summary>
/// <param name="targetUrl">The url that is to be validated.</param>
/// <param name="sourceUrl">The <see cref="InternetUrlToCrawl"/> containing the targetUrl.</param>
/// <param name="robotsMeta">A <see cref="RobotsMetaTagValue"/> flag indicating the
/// restrictions posed by the robots meta tag contained in the sourceUrl.</param>
/// <returns>A <see cref="Boolean"/> value: true if the crawler is disallowed from
/// visiting the target url and must filter it out, false if it is allowed.</returns>
/// <remarks>This method is safe for multi-threaded operations; however, only one
/// thread will be able to perform a check at any given time.</remarks>
public bool FilterUrl(string targetUrl, InternetUrlToCrawl sourceUrl, RobotsMetaTagValue robotsMeta)
{
    bool retVal = false; //assume that it's allowed to crawl the targetUrl
    try
    {
        mutex.WaitOne();
        //perhaps we should use the hash codes of the host names as keys.
        string targetHost = InternetUtils.HostName(targetUrl);
        string sourceHost = InternetUtils.HostName(sourceUrl);
        RobotsTxtEntry robots = null;
        //Do we need to fetch the robots.txt for the source Url?
        if (sourceUrl.FlagFetchRobots)
        {
            //we must fetch the robots.txt from the source url host and update sourceUrl.
            robots = FetchRobots(sourceHost);
            sourceUrl.RobotsDisallowedPaths = ConcatenatePaths(robots.DisallowedPaths);
            sourceUrl.FlagFetchRobots = false; //fetch it only once
            //if the entry exists in the table update it, otherwise add it
            if (robotsTable.ContainsKey(sourceHost))
            {
                robotsTable[sourceHost] = robots;
            }
            else
            {
                robotsTable.Add(sourceHost, robots);
            }
        }
        else
        {
            //if the entry exists in the table check whether it has expired,
            //otherwise build it from the paths stored in sourceUrl.
            if (!robotsTable.TryGetValue(sourceHost, out robots))
            {
                robots = new RobotsTxtEntry();
                robots.DisallowedPaths = SplitPaths(sourceUrl.RobotsDisallowedPaths);
                robotsTable.Add(sourceHost, robots);
            }
            else
            {
                if (robots.ExpirationDate < DateTime.Today)
                {
                    robots = FetchRobots(sourceHost);
                    robotsTable[sourceHost] = robots;
                }
            }
        }
        if (targetHost != sourceHost)
        {
            //the target url is on a different host, we must get its robots.txt
            if (!robotsTable.TryGetValue(targetHost, out robots))
            {
                robots = FetchRobots(targetHost);
                robotsTable.Add(targetHost, robots);
            }
            else
            {
                if (robots.ExpirationDate < DateTime.Today)
                {
                    robots = FetchRobots(targetHost);
                    robotsTable[targetHost] = robots;
                }
            }
        }
        if ((robotsMeta & RobotsMetaTagValue.NoFollow) > 0)
        {
            //if the meta tag has the NoFollow option set then we cannot crawl targetUrl
            retVal = true;
        }
        else
        {
            robots = robotsTable[targetHost];
            //if DisallowedPaths is null we may crawl targetUrl, otherwise check the paths
            if (robots.DisallowedPaths != null)
            {
                for (int i = 0; i < robots.DisallowedPaths.Length; i++)
                {
                    if (targetUrl.IndexOf(robots.DisallowedPaths[i]) != -1)
                    {
                        //we found a match, so it is not allowed to crawl targetUrl
                        retVal = true;
                        break; //stop searching as soon as we have a match
                    }
                }
            }
        }
    }
    catch (Exception e)
    {
        if (globals.Settings.LogLevel <= CWLogLevel.LogWarning)
        {
            globals.FileLog.LogWarning("RobotsFilter failed to filter " + targetUrl + ": " + e.ToString());
        }
    }
    finally
    {
        mutex.ReleaseMutex();
    }
    return retVal;
}
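// Usage sketch (illustrative, not part of the original source; "filter", "candidateUrl",
// "currentPage" and "metaTagFlags" are hypothetical names): deciding whether a link
// extracted from a crawled page may be followed.
//
//   bool disallowed = filter.FilterUrl(candidateUrl, currentPage, metaTagFlags);
//   if (!disallowed)
//   {
//       //the Robots Exclusion Standard allows the crawler to visit candidateUrl
//   }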
/// <summary>
/// Performs the selection of urls to crawl from the database. This is the method that
/// runs on the pluginThread, so it must be able to handle ThreadAbortException and
/// ThreadInterruptedException.
/// </summary>
private void PerformSelection()
{
    try
    {
        ReportImmediately(CWLoggerEntryType.Info, "CrawlWave Url Selection Plugin thread has started with ID 0x" + Thread.CurrentThread.GetHashCode().ToString("x4"));
        while (!mustStop)
        {
            //Select urls from the database, update the appropriate tables and stats
            try
            {
                try
                {
                    dbcon.Open();
                }
                catch
                {
                    //if Open fails the connection state is checked further below
                }
                SqlTransaction transaction = null;
                if (settings.UseTransactions)
                {
                    transaction = dbcon.BeginTransaction();
                }
                try
                {
                    //check if the threshold criterion is met
                    int count = SelectUrlCount(transaction);
                    AddToReportQueue(CWLoggerEntryType.Info, "UrlSelectionPlugin: Threshold is " + settings.Threshold + " urls, current Url Queue Size is " + count.ToString());
                    if (count < settings.Threshold)
                    {
                        AddToReportQueue(CWLoggerEntryType.Info, "UrlSelectionPlugin: Performing Url Selection, Selection size = " + settings.SelectionSize.ToString());
                        //Refresh the banned hosts cache
                        banned.RefreshCache();
                        //and the robots cache
                        robots.LoadCache();
                        //Select the Urls to process
                        DataSet urls = SelectUrls(transaction);
                        //insert each one of them into the table of urls to be crawled
                        //and mark them as assigned
                        if (urls.Tables[0].Rows.Count > 0)
                        {
                            //create the SqlCommands
                            SqlCommand icmd = new SqlCommand("cw_insert_url_to_crawl", dbcon, transaction);
                            icmd.CommandType = CommandType.StoredProcedure;
                            icmd.CommandTimeout = 60;
                            icmd.Parameters.Add("@url_id", SqlDbType.Int);
                            icmd.Parameters.Add("@url", SqlDbType.NVarChar, 500);
                            icmd.Parameters.Add("@crc", SqlDbType.BigInt);
                            icmd.Parameters.Add("@domain", SqlDbType.TinyInt);
                            icmd.Parameters.Add("@robots", SqlDbType.NVarChar, 1000);
                            icmd.Parameters.Add("@robots_expiration", SqlDbType.SmallDateTime);
                            SqlCommand mcmd = new SqlCommand("cw_mark_url_as_assigned", dbcon, transaction);
                            mcmd.CommandType = CommandType.StoredProcedure;
                            mcmd.CommandTimeout = 60;
                            mcmd.Parameters.Add("@url_id", SqlDbType.Int);
                            foreach (DataRow dr in urls.Tables[0].Rows)
                            {
                                Guid hostID = (Guid)dr[2];
                                byte[] hostIDbytes = hostID.ToByteArray();
                                if (!banned.IsBanned(hostIDbytes))
                                {
                                    icmd.Parameters[0].Value = dr[0];
                                    icmd.Parameters[1].Value = dr[1];
                                    icmd.Parameters[2].Value = dr[3];
                                    icmd.Parameters[3].Value = dr[4];
                                    RobotsTxtEntry entry = robots.GetEntry(hostIDbytes);
                                    if (entry == null)
                                    {
                                        icmd.Parameters[4].Value = DBNull.Value;
                                        icmd.Parameters[5].Value = DBNull.Value;
                                    }
                                    else
                                    {
                                        icmd.Parameters[4].Value = ConcatenatePaths(entry.DisallowedPaths);
                                        icmd.Parameters[5].Value = entry.ExpirationDate;
                                    }
                                    try
                                    {
                                        icmd.ExecuteNonQuery();
                                    }
                                    catch
                                    {
                                        continue;
                                    }
                                }
                                //mark the url as assigned even if its host is banned,
                                //so that it is not selected again
                                mcmd.Parameters[0].Value = (int)dr[0];
                                try
                                {
                                    mcmd.ExecuteNonQuery();
                                }
                                catch
                                {
                                    continue;
                                }
                                //The call to MarkUrlAsAssigned((int)dr[0], transaction) is
                                //inlined above: calling the method once per row would create
                                //and destroy a lot of objects and waste cpu time.
                            }
                            mcmd.Dispose();
                            icmd.Dispose();
                        }
                    }
                    try
                    {
                        if (settings.UseTransactions)
                        {
                            transaction.Commit();
                            transaction.Dispose();
                        }
                    }
                    catch (Exception ext)
                    {
                        events.Enqueue(new EventLoggerEntry(CWLoggerEntryType.Warning, DateTime.Now, ext.ToString()));
                    }
                    if (dbcon.State == ConnectionState.Closed)
                    {
                        events.Enqueue(new EventLoggerEntry(CWLoggerEntryType.Error, DateTime.Now, "The Url Selection Plugin failed to connect to the database. Stopping..."));
                        return;
                    }
                    if (settings.PauseBetweenOperations)
                    {
                        //sleep in one-second steps so that a stop request is honoured quickly
                        int waitSeconds = PauseInSeconds();
                        while (waitSeconds > 0)
                        {
                            Thread.Sleep(1000);
                            if (mustStop)
                            {
                                break;
                            }
                            waitSeconds--;
                        }
                    }
                }
                catch (ThreadInterruptedException)
                {
                    if (settings.UseTransactions)
                    {
                        transaction.Rollback();
                        transaction.Dispose();
                    }
                }
                finally
                {
                    if (dbcon.State != ConnectionState.Closed)
                    {
                        try { dbcon.Close(); } catch { }
                    }
                    Report();
                }
            }
            catch (Exception ex)
            {
                events.Enqueue(new EventLoggerEntry(CWLoggerEntryType.Warning, DateTime.Now, ex.ToString()));
            }
        }
    }
    catch (ThreadAbortException)
    {
        //The thread was asked to abort, which means it must return at once
        return;
    }
    catch (ThreadInterruptedException)
    {
        //The thread has been asked to Join. We have nothing to do but return.
        return;
    }
    finally
    {
        events.Enqueue(new EventLoggerEntry(CWLoggerEntryType.Info, DateTime.Now, Thread.CurrentThread.Name + " has stopped."));
    }
}