Example #1
        /// <summary>
        /// Works like the indexer, but if the requested entry is not found in the
        /// cache it is looked up in the database before null is returned.
        /// </summary>
        /// <param name="hostName">The  host name.</param>
        public RobotsTxtEntry GetEntry(string hostName)
        {
            byte[]         key   = MD5Hash.md5(hostName);
            RobotsTxtEntry entry = null;

            if (hosts.TryGetValue(key, out entry))
            {
                return(entry);
            }
            try
            {
                try
                {
                    dbcon.Open();
                }
                catch
                { }
                if (dbcon.State == ConnectionState.Closed)
                {
                    //log a message
                    return(null);
                }
                SqlCommand cmd = new SqlCommand("cw_select_robot", dbcon);
                cmd.CommandType = CommandType.StoredProcedure;
                cmd.Parameters.Add("@host_id", SqlDbType.UniqueIdentifier);
                cmd.Parameters[0].Value = new Guid(key);
                SqlDataAdapter da = new SqlDataAdapter(cmd);
                DataSet        ds = new DataSet();
                da.Fill(ds);
                da.Dispose();
                cmd.Dispose();
                dbcon.Close();
                if (ds.Tables[0].Rows.Count == 0)
                {
                    return(null);
                }
                entry = new RobotsTxtEntry();
                entry.DisallowedPaths = SplitPaths((string)ds.Tables[0].Rows[0][1]);
                entry.ExpirationDate  = (DateTime)ds.Tables[0].Rows[0][2];
                //synchronize with AddEntry to avoid a duplicate-key race
                lock (hosts)
                {
                    if (!hosts.ContainsKey(key))
                    {
                        hosts.Add(key, entry);
                    }
                }
                ds.Dispose();
                OnCacheUpdated(EventArgs.Empty);
                return(entry);
            }
            catch
            {
                if (dbcon.State != ConnectionState.Closed)
                {
                    try
                    {
                        dbcon.Close();
                    }
                    catch
                    { }
                }
                GC.Collect();
                return(null);
            }
        }
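
SplitPaths and ConcatenatePaths are helpers used throughout these examples but never shown in them. A minimal sketch of what they might look like, assuming the disallowed paths are stored as a single delimiter-separated string (the space delimiter is an assumption):

        //Hypothetical helpers: the real implementations are not part of these
        //examples. Any delimiter that cannot occur in a raw URL path would do.
        private static string[] SplitPaths(string paths)
        {
            if (paths == null || paths.Length == 0)
            {
                return new string[0];
            }
            return paths.Split(' ');
        }

        private static string ConcatenatePaths(string[] paths)
        {
            if (paths == null)
            {
                return String.Empty;
            }
            return String.Join(" ", paths);
        }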
Example #2
 /// <summary>
 /// Indexer property. Gets the cached <see cref="RobotsTxtEntry"/> for a
 /// host name, or null if the host is not in the cache.
 /// </summary>
 public RobotsTxtEntry this[string hostName]
 {
     get
     {
         byte[]         key   = MD5Hash.md5(hostName);
         RobotsTxtEntry entry = null;
         hosts.TryGetValue(key, out entry);
         return(entry);
     }
 }
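
A brief usage sketch, assuming cache is an instance of the class these members belong to: the indexer consults only the in-memory dictionary, while GetEntry from Example #1 falls back to the database on a miss.

 RobotsTxtEntry entry = cache["www.example.com"];   //in-memory lookup only
 if (entry == null)
 {
     entry = cache.GetEntry("www.example.com");     //also tries the database
 }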
Example #3
 /// <summary>
 /// Adds a new robots entry to the cache
 /// </summary>
 /// <param name="hostName">A string containing the hostname to add to the cache.</param>
 /// <param name="entry">The <see cref="RobotsTxtEntry"/> related to the host.</param>
 public void AddEntry(string hostName, RobotsTxtEntry entry)
 {
     byte[] key = MD5Hash.md5(hostName);
     lock (hosts)
     {
         if (!hosts.ContainsKey(key))
         {
             hosts.Add(key, entry);
             OnCacheUpdated(EventArgs.Empty);
         }
     }
 }
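
Note that byte arrays use reference equality by default, so a plain Dictionary<byte[], RobotsTxtEntry> would never match an MD5 key computed later. The hosts dictionary must therefore be built with a custom equality comparer; a minimal sketch of one possibility (the comparer itself is an assumption, since the examples never show how hosts is constructed; requires System.Collections.Generic):

 //Hypothetical comparer for 16 byte MD5 keys.
 class ByteArrayComparer : IEqualityComparer<byte[]>
 {
     public bool Equals(byte[] x, byte[] y)
     {
         if (x == null || y == null || x.Length != y.Length)
         {
             return x == y;
         }
         for (int i = 0; i < x.Length; i++)
         {
             if (x[i] != y[i])
             {
                 return false;
             }
         }
         return true;
     }

     public int GetHashCode(byte[] obj)
     {
         //the first four bytes of an MD5 hash are already well distributed
         return BitConverter.ToInt32(obj, 0);
     }
 }

 //Dictionary<byte[], RobotsTxtEntry> hosts =
 //    new Dictionary<byte[], RobotsTxtEntry>(new ByteArrayComparer());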
Example #4
 /// <summary>
 /// Indexer property. Gets the cached <see cref="RobotsTxtEntry"/> for a
 /// 16 byte MD5 host key, or null if the key is not in the cache.
 /// </summary>
 public RobotsTxtEntry this[byte[] hostNameMD5]
 {
     get
     {
         if (hostNameMD5.Length != 16)
         {
             throw new ArgumentException("Only 16 byte keys are accepted.");
         }
         RobotsTxtEntry entry = null;
         hosts.TryGetValue(hostNameMD5, out entry);
         return(entry);
     }
 }
Example #5
 /// <summary>
 /// Adds a new robots entry to the cache
 /// </summary>
 /// <param name="hostNameMD5">The md5 of the hostname to add to the cache.</param>
 /// <param name="entry">The <see cref="RobotsTxtEntry"/> related to the host.</param>
 public void AddEntry(byte[] hostNameMD5, RobotsTxtEntry entry)
 {
     if (hostNameMD5.Length != 16)
     {
         throw new ArgumentException("Only 16 byte keys are accepted.");
     }
     lock (hosts)
     {
         if (!hosts.ContainsKey(hostNameMD5))
         {
             hosts.Add(hostNameMD5, entry);
             OnCacheUpdated(EventArgs.Empty);
         }
     }
 }
Example #6
 /// <summary>
 /// Loads the cache with the robots.txt entries stored in the database.
 /// </summary>
 public void LoadCache()
 {
     try
     {
         try
         {
             dbcon.Open();
         }
         catch
         { }
         if (dbcon.State == ConnectionState.Closed)
         {
             //log a message
             return;
         }
         SqlCommand cmd = new SqlCommand("cw_select_robots", dbcon);
         cmd.CommandType = CommandType.StoredProcedure;
         SqlDataAdapter da = new SqlDataAdapter(cmd);
         DataSet        ds = new DataSet();
         da.Fill(ds);
         da.Dispose();
         cmd.Dispose();
         dbcon.Close();
         hosts.Clear();
         byte[] key;
         foreach (DataRow dr in ds.Tables[0].Rows)
         {
             key = ((Guid)dr[0]).ToByteArray();
             RobotsTxtEntry entry = new RobotsTxtEntry((DateTime)dr[2], SplitPaths((string)dr[1]));
              hosts.Add(key, entry);                     //the cache was just cleared, so no existence check is needed
         }
         ds.Dispose();
         OnCacheUpdated(EventArgs.Empty);
     }
     catch
     {
         if (dbcon.State != ConnectionState.Closed)
         {
             try
             {
                 dbcon.Close();
             }
             catch
             { }
         }
         GC.Collect();
     }
 }
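
LoadCache derives its keys with Guid.ToByteArray, while GetEntry in Example #1 goes the other way with new Guid(key). The two are consistent: Guid(byte[]) and ToByteArray use the same mixed-endian layout, so the round trip is byte-for-byte stable. A small sketch illustrating this, reusing MD5Hash.md5 from the examples above:

 byte[] key = MD5Hash.md5("www.example.com");   //16 raw MD5 bytes
 Guid   id  = new Guid(key);                    //what GetEntry sends as @host_id
 byte[] roundTrip = id.ToByteArray();           //what LoadCache reads back
 //roundTrip equals key byte for byte, so entries cached by GetEntry and
 //entries loaded by LoadCache end up under the same dictionary key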
Example #7
        /// <summary>
        /// Downloads a robots.txt from a specified host, parses it and constructs a new
        /// <see cref="RobotsTxtEntry"/> object with the entries of the downloaded file.
        /// </summary>
        /// <param name="hostname">The host for which the robots.txt file is required.</param>
        /// <returns>A new <see cref="RobotsTxtEntry"/> object based on the newly downloaded
        /// robots.txt file.</returns>
        private RobotsTxtEntry FetchRobots(string hostname)
        {
            RobotsTxtEntry retVal = new RobotsTxtEntry();

            try
            {
                string robots = DownloadRobots(hostname);
                if (robots != null)
                {
                    if (robots == String.Empty)
                    {
                        retVal.DisallowedPaths = new string[0];
                    }
                    else
                    {
                        retVal.DisallowedPaths = ParseRobots(robots);
                    }
                }
            }
            catch
            {}
            return(retVal);
        }
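
DownloadRobots is not among these examples. FetchRobots distinguishes three outcomes: null (the download failed, so DisallowedPaths stays null), an empty string (an empty robots.txt, so nothing is disallowed) and actual content to parse. A hypothetical implementation honoring that contract (the URL scheme and the 404 handling are assumptions; requires System.Net):

        //Hypothetical sketch; the real DownloadRobots is not shown here.
        private string DownloadRobots(string hostname)
        {
            try
            {
                using (WebClient client = new WebClient())
                {
                    return client.DownloadString("http://" + hostname + "/robots.txt");
                }
            }
            catch (WebException we)
            {
                HttpWebResponse response = we.Response as HttpWebResponse;
                if (response != null && response.StatusCode == HttpStatusCode.NotFound)
                {
                    return String.Empty;               //no robots.txt, nothing is disallowed
                }
                return null;                           //network error, keep the defaults
            }
        }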
Example #8
        /// <summary>
        /// Selects and returns a set of urls that are ready to be crawled.
        /// </summary>
        /// <param name="ci">The <see cref="ClientInfo"/> of the client requesting urls to crawl.</param>
        /// <param name="data">An array of <see cref="InternetUrlToCrawl"/> objects containing the selected urls.</param>
        /// <returns>Null if the operation succeeds, or <see cref="SerializedException"/>
        /// encapsulating the error that occurred if the operation fails.</returns>
        public SerializedException SelectUrlsToCrawl(ClientInfo ci, ref InternetUrlToCrawl[] data)
        {
            SerializedException sx = null;

            try
            {
                if (!ConnectToDatabase())
                {
                    throw new CWDBConnectionFailedException();
                }
                //we must use a transaction to make sure that if something goes wrong the
                //changes to the database will be rolled back.
                SqlTransaction transaction = dbcon.BeginTransaction(IsolationLevel.Serializable);                //perhaps IsolationLevel.RepeatableRead would suffice
                try
                {
                    //first select the urls to crawl
                    SqlCommand cmd = new SqlCommand("cw_select_urls_to_crawl", dbcon, transaction);
                    cmd.CommandType    = CommandType.StoredProcedure;
                    cmd.CommandTimeout = 120;
                    SqlDataAdapter da = new SqlDataAdapter(cmd);
                    DataSet        ds = new DataSet();
                    da.Fill(ds);
                    da.Dispose();
                    cmd.Dispose();
                    //now delete them from the table of urls to crawl
                    data = new InternetUrlToCrawl[ds.Tables[0].Rows.Count];
                    if (data.Length > 0)
                    {
                        int i = 0;
                        foreach (DataRow dr in ds.Tables[0].Rows)
                        {
                            try
                            {
                                InternetUrlToCrawl url = new InternetUrlToCrawl((int)dr[0], (string)dr[1]);
                                if (dr[2] != DBNull.Value)
                                {
                                    url.CRC = (long)dr[2];
                                }
                                if (dr[3] != DBNull.Value)
                                {
                                    url.FlagDomain = (DomainFlagValue)((byte)dr[3]);
                                }
                                if (dr[4] != DBNull.Value)
                                {
                                    url.RobotsDisallowedPaths = (string)dr[4];
                                }
                                else
                                {
                                    RobotsTxtEntry entry = settings.Robots.GetEntry(InternetUtils.HostName(url));
                                    if (entry != null)
                                    {
                                        url.RobotsDisallowedPaths = ConcatenatePaths(entry.DisallowedPaths);
                                    }
                                    else
                                    {
                                        url.FlagFetchRobots = true;
                                    }
                                }
                                data[i++] = url;
                            }
                            catch
                            {
                                continue;
                            }
                        }
                        SqlCommand statscmd = new SqlCommand("cw_update_client_statistics", dbcon, transaction);
                        statscmd.CommandType    = CommandType.StoredProcedure;
                        statscmd.CommandTimeout = 120;
                        statscmd.Parameters.Add("@client_id", SqlDbType.UniqueIdentifier);
                        statscmd.Parameters.Add("@assigned", SqlDbType.BigInt);
                        statscmd.Parameters.Add("@returned", SqlDbType.BigInt);
                        statscmd.Parameters.Add("@type", SqlDbType.TinyInt);
                        statscmd.Parameters[0].Value = ci.ClientID;
                        statscmd.Parameters[1].Value = data.Length;
                        statscmd.Parameters[2].Value = DBNull.Value;
                        statscmd.Parameters[3].Value = 0;
                        statscmd.ExecuteNonQuery();
                        statscmd.Dispose();
                    }
                    //commit even when no urls were selected so that the
                    //serializable transaction is not left open
                    transaction.Commit();
                }
                catch (Exception ex)
                {
                    transaction.Rollback();
                    if (settings.LogLevel <= CWLogLevel.LogWarning)
                    {
                        settings.Log.LogWarning("SelectUrlsToCrawl failed, Transaction was rolled back: " + ex.ToString());
                    }
                    throw;               //rethrow without resetting the stack trace
                }
                finally
                {
                    UpdateClientLastActive(ci);
                    LogClientAction(ci, CWClientActions.LogSendUrlsToCrawl);
                    if (!DisconnectFromDatabase())
                    {
                        throw new CWDBConnectionFailedException("Disconnect from database failure.");
                    }
                }
            }
            catch (Exception e)
            {
                sx = new SerializedException(e.GetType().ToString(), e.Message, e.ToString());
                if (settings.LogLevel <= CWLogLevel.LogWarning)
                {
                    settings.Log.LogWarning("SelectUrlsToCrawl failed: " + e.ToString());
                }
            }
            return(sx);
        }
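
A short caller sketch, assuming server exposes the method above and clientInfo is the caller's ClientInfo:

        InternetUrlToCrawl[] batch = null;
        SerializedException  sx    = server.SelectUrlsToCrawl(clientInfo, ref batch);
        if (sx == null)
        {
            //batch now holds the selected urls, each carrying either its host's
            //robots.txt paths or FlagFetchRobots set to true
        }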
Example #9
        /// <summary>
        /// Checks if the Robots Exclusion Standard allows the crawler to visit a url.
        /// </summary>
        /// <param name="targetUrl">The url that is to be validated.</param>
        /// <param name="sourceUrl">The <see cref="InternetUrlToCrawl"/> containing the targetUrl.</param>
        /// <param name="robotsMeta">A <see cref="RobotsMetaTagValue"/> flag indicating the
        /// restrictions posed by the robots meta tag contained in the sourceUrl.</param>
        /// <returns> A <see cref="Boolean"/> value indicating whether the crawler is
        /// allowed (false) or disallowed (true) to visit the target Url.</returns>
        /// <remarks>This method is safe for multi-threaded operations. However only one
        /// thread will be able to perform a check at any given time.
        /// </remarks>
        public bool FilterUrl(string targetUrl, InternetUrlToCrawl sourceUrl, RobotsMetaTagValue robotsMeta)
        {
            bool retVal = false;             //assume that it's allowed to crawl the targetUrl

            try
            {
                mutex.WaitOne();
                //perhaps we should use the hash codes of the hostnames as keys.
                string         targetHost = InternetUtils.HostName(targetUrl);
                string         sourceHost = InternetUtils.HostName(sourceUrl);
                RobotsTxtEntry robots     = null;
                //Do we need to fetch the robots.txt for the source Url?
                if (sourceUrl.FlagFetchRobots)
                {
                    //we must fetch the robots.txt from the source url host and update sourceUrl.
                    robots = FetchRobots(sourceHost);
                    sourceUrl.RobotsDisallowedPaths = ConcatenatePaths(robots.DisallowedPaths);
                    sourceUrl.FlagFetchRobots       = false;               //fetch it only once
                    //check if it exists in the Hashtable, if so update it, otherwise add it
                    if (robotsTable.ContainsKey(sourceHost))
                    {
                        robotsTable[sourceHost] = robots;
                    }
                    else
                    {
                        robotsTable.Add(sourceHost, robots);
                    }
                }
                else
                {
                    //check if it exists in the Hashtable. If so check if it has expired, else just get it from InternetUrlToCrawl
                    if (!robotsTable.TryGetValue(sourceHost, out robots))
                    {
                        robots = new RobotsTxtEntry();
                        robots.DisallowedPaths = SplitPaths(sourceUrl.RobotsDisallowedPaths);
                        robotsTable.Add(sourceHost, robots);
                    }
                    else
                    {
                        if (robots.ExpirationDate < DateTime.Today)
                        {
                            robots = FetchRobots(sourceHost);
                            robotsTable[sourceHost] = robots;
                        }
                    }
                }
                if (targetHost != sourceHost)
                {
                    //the target url is on a different host, we must get its robots.txt
                    if (!robotsTable.TryGetValue(targetHost, out robots))
                    {
                        robots = FetchRobots(targetHost);
                        robotsTable.Add(targetHost, robots);
                    }
                    else
                    {
                        if (robots.ExpirationDate < DateTime.Today)
                        {
                            robots = FetchRobots(targetHost);
                            robotsTable[targetHost] = robots;
                        }
                    }
                }
                if ((robotsMeta & RobotsMetaTagValue.NoFollow) > 0)
                {
                    //if the meta tag has the NoFollow option set then we cannot crawl targetUrl
                    retVal = true;
                }
                else
                {
                    robots = robotsTable[targetHost];
                    //if the DisallowedPaths is null then we can crawl targetUrl, otherwise we must check the disallowed paths
                    if (robots.DisallowedPaths != null)
                    {
                        for (int i = 0; i < robots.DisallowedPaths.Length; i++)
                        {
                            if (targetUrl.IndexOf(robots.DisallowedPaths[i]) != -1)
                            {
                                //we found a match. It is therefore not allowed to crawl targetUrl
                                retVal = true;
                                break;                                 //stop searching as soon as we have a match
                            }
                        }
                    }
                }
            }
            catch (Exception e)
            {
                if (globals.Settings.LogLevel <= CWLogLevel.LogWarning)
                {
                    globals.FileLog.LogWarning("RobotsFilter failed to filter " + targetUrl + ": " + e.ToString());
                }
            }
            finally
            {
                mutex.ReleaseMutex();
            }
            return(retVal);
        }
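
The bitwise test (robotsMeta & RobotsMetaTagValue.NoFollow) > 0 implies that RobotsMetaTagValue is a [Flags] enum. Its real members are not shown in these examples; a hypothetical shape consistent with that usage:

        //Hypothetical declaration; member names and values are assumptions
        //based only on the bitwise usage in FilterUrl.
        [Flags]
        public enum RobotsMetaTagValue : byte
        {
            Index    = 1,
            NoIndex  = 2,
            Follow   = 4,
            NoFollow = 8
        }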
Example #10
        /// <summary>
        /// Performs the selection of urls to crawl from the database. It is the method that
        /// is running on the pluginThread, so it must be able to handle ThreadAbortException
        /// and ThreadInterruptedException.
        /// </summary>
        private void PerformSelection()
        {
            try
            {
                ReportImmediately(CWLoggerEntryType.Info, "CrawlWave Url Selection Plugin thread has started with ID 0x" + Thread.CurrentThread.GetHashCode().ToString("x4"));
                while (!mustStop)
                {
                    //Select urls from the database, update the appropriate tables and stats
                    try
                    {
                        try
                        {
                            dbcon.Open();
                        }
                        catch
                        {}
                        SqlTransaction transaction = null;
                        if (settings.UseTransactions)
                        {
                            transaction = dbcon.BeginTransaction();
                        }
                        try
                        {
                            //check if the threshold criterion is met
                            int count = SelectUrlCount(transaction);
                            AddToReportQueue(CWLoggerEntryType.Info, "UrlSelectionPlugin: Threshold is " + settings.Threshold + " urls, current Url Queue Size is " + count.ToString());
                            if (count < settings.Threshold)
                            {
                                AddToReportQueue(CWLoggerEntryType.Info, "UrlSelectionPlugin: Performing Url Selection, Selection size = " + settings.SelectionSize.ToString());
                                //Refresh the banned hosts cache
                                banned.RefreshCache();
                                //and the robots cache
                                robots.LoadCache();
                                //Select the Urls to process
                                DataSet urls = SelectUrls(transaction);
                                //insert each one of them to the table of urls to be crawled and
                                //mark them as assigned
                                if (urls.Tables[0].Rows.Count > 0)
                                {
                                    //create the SqlCommands
                                    SqlCommand icmd = new SqlCommand("cw_insert_url_to_crawl", dbcon, transaction);
                                    icmd.CommandType    = CommandType.StoredProcedure;
                                    icmd.CommandTimeout = 60;
                                    icmd.Parameters.Add("@url_id", SqlDbType.Int);
                                    icmd.Parameters.Add("@url", SqlDbType.NVarChar, 500);
                                    icmd.Parameters.Add("@crc", SqlDbType.BigInt);
                                    icmd.Parameters.Add("@domain", SqlDbType.TinyInt);
                                    icmd.Parameters.Add("@robots", SqlDbType.NVarChar, 1000);
                                    icmd.Parameters.Add("@robots_expiration", SqlDbType.SmallDateTime);

                                    SqlCommand mcmd = new SqlCommand("cw_mark_url_as_assigned", dbcon, transaction);
                                    mcmd.CommandType    = CommandType.StoredProcedure;
                                    mcmd.CommandTimeout = 60;
                                    mcmd.Parameters.Add("@url_id", SqlDbType.Int);

                                    Guid hostID = new Guid();
                                    foreach (DataRow dr in urls.Tables[0].Rows)
                                    {
                                        hostID = (Guid)dr[2];
                                        byte [] hostIDbytes = hostID.ToByteArray();
                                        if (!banned.IsBanned(hostIDbytes))
                                        {
                                            icmd.Parameters[0].Value = dr[0];
                                            icmd.Parameters[1].Value = dr[1];
                                            icmd.Parameters[2].Value = dr[3];
                                            icmd.Parameters[3].Value = dr[4];
                                            RobotsTxtEntry entry = robots.GetEntry(hostIDbytes);
                                            if (entry == null)
                                            {
                                                icmd.Parameters[4].Value = DBNull.Value;
                                                icmd.Parameters[5].Value = DBNull.Value;
                                            }
                                            else
                                            {
                                                icmd.Parameters[4].Value = ConcatenatePaths(entry.DisallowedPaths);
                                                icmd.Parameters[5].Value = entry.ExpirationDate;
                                            }
                                            try
                                            {
                                                icmd.ExecuteNonQuery();
                                            }
                                            catch
                                            {
                                                continue;
                                            }
                                        }
                                        mcmd.Parameters[0].Value = (int)dr[0];
                                        try
                                        {
                                            mcmd.ExecuteNonQuery();
                                        }
                                        catch
                                        {
                                            continue;
                                        }
                                        //MarkUrlAsAssigned((int)dr[0], transaction); is inlined
                                        //above: calling the method n times would create and destroy
                                        //a lot of objects and waste cpu time
                                    }
                                    mcmd.Dispose();
                                    icmd.Dispose();
                                }
                            }
                            try
                            {
                                if (settings.UseTransactions)
                                {
                                    transaction.Commit();
                                    transaction.Dispose();
                                }
                            }
                            catch (Exception ext)
                            {
                                events.Enqueue(new EventLoggerEntry(CWLoggerEntryType.Warning, DateTime.Now, ext.ToString()));
                            }
                            if (dbcon.State == ConnectionState.Closed)
                            {
                                events.Enqueue(new EventLoggerEntry(CWLoggerEntryType.Error, DateTime.Now, "The Url Selection Plugin failed to connect to the database. Stopping..."));
                                return;
                            }
                            if (settings.PauseBetweenOperations)
                            {
                                int waitSeconds = PauseInSeconds();
                                while (waitSeconds > 0)
                                {
                                    Thread.Sleep(1000);
                                    if (mustStop)
                                    {
                                        break;
                                    }
                                    waitSeconds--;
                                }
                            }
                        }
                        catch (ThreadInterruptedException)
                        {
                            if (settings.UseTransactions)
                            {
                                transaction.Rollback();
                                transaction.Dispose();
                            }
                        }
                        finally
                        {
                            if (dbcon.State != ConnectionState.Closed)
                            {
                                try
                                {
                                    dbcon.Close();
                                }
                                catch
                                {}
                            }
                            Report();
                        }
                    }
                    catch (Exception ex)
                    {
                        events.Enqueue(new EventLoggerEntry(CWLoggerEntryType.Warning, DateTime.Now, ex.ToString()));
                    }
                }
            }
            catch (ThreadAbortException)
            {
                //The thread was asked to abort, which means it must return at once
                return;
            }
            catch (ThreadInterruptedException)
            {
                //The thread has been asked to Join. We have nothing to do but return.
                return;
            }
            finally
            {
                events.Enqueue(new EventLoggerEntry(CWLoggerEntryType.Info, DateTime.Now, Thread.CurrentThread.Name + " has stopped."));
            }
        }
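
A sketch of how the pluginThread hosting PerformSelection might be started and stopped, consistent with the mustStop flag and the exceptions handled above (everything except mustStop, pluginThread and PerformSelection is an assumption):

        //start the selection loop on a dedicated thread
        pluginThread = new Thread(new ThreadStart(PerformSelection));
        pluginThread.IsBackground = true;
        pluginThread.Name = "UrlSelectionPlugin";
        pluginThread.Start();

        //later, ask it to stop: the flag ends the loop and Interrupt() wakes the
        //thread from Thread.Sleep, raising the ThreadInterruptedException handled above
        mustStop = true;
        pluginThread.Interrupt();
        pluginThread.Join();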