Example #1
0
        /// <summary>
        /// Get the host name associated with the specified host id.
        /// </summary>
        /// <param name="hostID">The host id to look up.</param>
        /// <returns>The name of the host, read from the first column of the
        /// first row returned by the lookup.</returns>
        /// <exception cref="WorkloadException">Thrown when the lookup returns
        /// no row, or when any other error occurs while running it.</exception>
        protected virtual String GetHost(int hostID)
        {
            RepeatableStatement.Results rs = null;

            try
            {
                rs = this.stmtGetHost.ExecuteQuery(hostID);
                if (!rs.DataReader.Read())
                {
                    throw new WorkloadException("Can't find previously created host.");
                }
                return (String)rs.DataReader[0];
            }
            catch (WorkloadException)
            {
                // Already the exception type callers expect; rethrow untouched so
                // the message above is not buried inside a second wrapper.
                throw;
            }
            catch (Exception e)
            {
                throw new WorkloadException(e);
            }
            finally
            {
                // Release the result set on every exit path.
                if (rs != null)
                {
                    rs.Close();
                }
            }
        }
Example #2
0
        /// <summary>
        /// Report whether the workload is empty, using the workload-empty
        /// statement run against the current host id.
        /// </summary>
        /// <returns>True when the query yields no row, or when the first
        /// column of its first row is less than one; false otherwise.</returns>
        public virtual bool WorkloadEmpty()
        {
            RepeatableStatement.Results results = null;

            try
            {
                results = this.stmtWorkloadEmpty.ExecuteQuery(this.currentHostID);

                bool hasRow = results.DataReader.Read();
                if (hasRow)
                {
                    int firstColumn = (int)results.DataReader[0];
                    return firstColumn < 1;
                }

                // No row at all is treated as an empty workload.
                return true;
            }
            catch (Exception ex)
            {
                throw new WorkloadException(ex);
            }
            finally
            {
                if (results != null)
                {
                    results.Close();
                }
            }
        }
Example #3
0
        /// <summary>
        /// Setup the workload so that it can be resumed from where
        /// the last spider left the workload. The current host id is read
        /// from the resume query, the matching host name is looked up, and
        /// the second resume statement is then executed.
        /// </summary>
        /// <exception cref="WorkloadException">Thrown when the resume query
        /// returns no row, when the host lookup fails, or when any other
        /// error occurs while reading the current host.</exception>
        public virtual void Resume()
        {
            RepeatableStatement.Results rs = null;

            try
            {
                rs = this.stmtResume.ExecuteQuery();

                if (!rs.DataReader.Read())
                {
                    throw new WorkloadException(
                              "Can't resume, unable to determine current host.");
                }

                this.currentHostID = (int)rs.DataReader[0];
                this.currentHost   = GetHost(this.currentHostID);
            }
            catch (WorkloadException)
            {
                // Raised above or by GetHost; it is already the right type, so
                // pass it through instead of wrapping it a second time.
                throw;
            }
            catch (Exception e)
            {
                throw new WorkloadException(e);
            }
            finally
            {
                // Release the result set on every exit path.
                if (rs != null)
                {
                    rs.Close();
                }
            }

            // Runs only after the current host was restored successfully.
            this.stmtResume2.Execute();
        }
Example #4
0
 /// <summary>
 /// Look up the source page recorded for the specified URL.
 /// </summary>
 /// <param name="url">The Uri to seek the source for.</param>
 /// <returns>A Uri built from the first column of the first row whose
 /// first column equals the text of <paramref name="url"/>, or null when
 /// no row matches.</returns>
 public virtual Uri GetSource(Uri url)
 {
     RepeatableStatement.Results results = null;
     try
     {
         // Rows are selected by hash, so each candidate is also compared by text.
         results = this.stmtGetSource.ExecuteQuery(ComputeHash(url));
         while (results.DataReader.Read())
         {
             String candidate = (String)results.DataReader[0];
             if (!candidate.Equals(url.ToString()))
             {
                 continue;
             }

             // NOTE(review): the value returned comes from column 0, the same
             // column that was just matched against url -- confirm this is the
             // intended "source" column for the query behind stmtGetSource.
             String matched = (String)results.DataReader[0];
             return new Uri(matched);
         }
         return null;
     }
     catch (UriFormatException ex)
     {
         throw new WorkloadException(ex);
     }
     catch (Exception ex)
     {
         throw new WorkloadException(ex);
     }
     finally
     {
         if (results != null)
         {
             results.Close();
         }
     }
 }
Example #5
0
        /// <summary>
        /// Close the workload: the open work result set (if any), every
        /// statement in the statement list, and then the connection.
        /// </summary>
        public void Close()
        {
            if (this.workResultSet != null)
            {
                try
                {
                    this.workResultSet.Close();
                }
                catch (Exception)
                {
                    // Best effort: a failure here must not stop the rest of the shutdown.
                    spider.Logging.Log(Logger.Level.ERROR,
                                       "Error trying to close workload result set, ignoring...");
                }
                this.workResultSet = null;
            }

            // NOTE(review): hostResultSet (used by NextHost) is not closed here --
            // confirm that closing its statement below is sufficient.
            try
            {
                foreach (RepeatableStatement statement in this.statements)
                {
                    statement.Close();
                }
            }
            finally
            {
                // Release the connection even when a statement fails to close;
                // that failure still propagates to the caller afterwards.
                if (this.connection != null)
                {
                    this.connection.Close();
                }
            }
        }
Example #6
0
        /// <summary>
        /// Fetch the next work unit without waiting. The open work result set
        /// is advanced first; when there is none, or it has no further rows,
        /// the work query is run again for the current host. The selected row
        /// is then marked with the processing status.
        /// </summary>
        /// <returns>The next Uri to process, or null when the fresh query
        /// returns no rows.</returns>
        protected virtual Uri GetWorkInternal()
        {
            if (this.currentHostID == -1)
            {
                throw new WorkloadException(
                          "Attempting to obtain work before adding first URL.");
            }

            try
            {
                // A new query is needed when nothing is open or the open set is exhausted.
                bool needsQuery = (this.workResultSet == null)
                                  || !this.workResultSet.DataReader.Read();

                if (needsQuery)
                {
                    if (this.workResultSet != null)
                    {
                        this.workResultSet.Close();
                    }

                    this.workResultSet = this.stmtGetWork.ExecuteQuery(Status.STATUS_WAITING,
                                                                       this.currentHostID);

                    bool hasWork = this.workResultSet.DataReader.Read();
                    if (!hasWork)
                    {
                        return null;
                    }
                }

                // Column 0 is the id passed to the status update, column 1 the URL text.
                int    workID   = (int)this.workResultSet.DataReader[0];
                String location = (String)this.workResultSet.DataReader[1];

                this.stmtGetWork2.Execute(Status.STATUS_PROCESSING, workID);
                return new Uri(location);
            }
            catch (UriFormatException ex)
            {
                throw new WorkloadException(ex);
            }
            catch (Exception ex)
            {
                throw new WorkloadException(ex);
            }
        }
Example #7
0
        /// <summary>
        /// Advance to the next host. The current host is marked as processed
        /// first; the next row of the host result set (re-queried when absent
        /// or exhausted) then becomes the current host and is given the
        /// processing status. Intended to be called after getWork returns null.
        /// </summary>
        /// <returns>The name of the next host, or null when the fresh query
        /// returns no rows.</returns>
        public virtual String NextHost()
        {
            if (this.currentHostID == -1)
            {
                throw new WorkloadException(
                          "Attempting to obtain host before adding first URL.");
            }

            MarkHostProcessed(this.currentHost);

            try
            {
                // A new query is needed when nothing is open or the open set is exhausted.
                bool needsQuery = (this.hostResultSet == null)
                                  || !this.hostResultSet.DataReader.Read();

                if (needsQuery)
                {
                    if (this.hostResultSet != null)
                    {
                        this.hostResultSet.Close();
                    }

                    this.hostResultSet = this.stmtGetNextHost.ExecuteQuery(Status.STATUS_WAITING);

                    bool hasHost = this.hostResultSet.DataReader.Read();
                    if (!hasHost)
                    {
                        return null;
                    }
                }

                // Column 0 carries the host id, column 1 the host name.
                this.currentHostID = (int)this.hostResultSet.DataReader[0];
                this.currentHost   = (String)this.hostResultSet.DataReader[1];
                this.stmtSetHostStatus.Execute(Status.STATUS_PROCESSING, this.currentHostID);
                spider.Logging.Log(Logger.Level.INFO, "Moving to new host: " + this.currentHost);
                return this.currentHost;
            }
            catch (Exception ex)
            {
                throw new WorkloadException(ex);
            }
        }
Example #8
0
        /// <summary>
        /// Get the id for the specified host name. The current host is
        /// checked first (case-insensitively); otherwise the id is read from
        /// the host-id query.
        /// </summary>
        /// <param name="host">The host to lookup.</param>
        /// <param name="require">Should an exception be thrown if the host is not located.</param>
        /// <returns>The id of the specified host name, or -1 when it is not
        /// found and <paramref name="require"/> is false.</returns>
        /// <exception cref="WorkloadException">Thrown when the host is not
        /// found and <paramref name="require"/> is true.</exception>
        protected virtual int GetHostID(String host, bool require)
        {
            RepeatableStatement.Results rs = null;

            // Is this the current host? Host names are identifiers rather than
            // linguistic text, so compare ordinally instead of with the
            // culture-sensitive rules of the current thread.
            if (this.currentHostID != -1
                && String.Equals(this.currentHost, host, StringComparison.OrdinalIgnoreCase))
            {
                return this.currentHostID;
            }

            // use the database to find it
            try
            {
                rs = this.stmtGetHostID.ExecuteQuery(host);

                if (rs.DataReader.Read())
                {
                    return (int)rs.DataReader[0];
                }
            }
            finally
            {
                // Release the result set on every exit path.
                if (rs != null)
                {
                    rs.Close();
                }
            }

            if (require)
            {
                StringBuilder str = new StringBuilder();
                str.Append("Failed to find previously visited Host,");
                str.Append("Host=\"");
                str.Append(host);
                str.Append("\".");
                throw new WorkloadException(str.ToString());
            }

            return -1;
        }
Example #9
0
        /// <summary>
        /// Find the workload ID recorded for a URL. Rows are fetched by the
        /// URL's hash, and the first row whose second column equals the URL
        /// text supplies the ID.
        /// </summary>
        /// <param name="url"> The URL to look up.</param>
        /// <param name="require">Should an exception be thrown if the workload
        /// is not located.</param>
        /// <returns>The ID of the workload, or -1 when it is not found and
        /// <paramref name="require"/> is false.</returns>
        protected virtual int GetWorkloadID(Uri url, bool require)
        {
            int urlHash = ComputeHash(url);

            RepeatableStatement.Results results = null;
            try
            {
                results = this.stmtGetWorkloadID.ExecuteQuery(urlHash);
                while (results.DataReader.Read())
                {
                    // The query selects by hash, so the URL text is compared as well.
                    bool sameUrl = results.DataReader[1].Equals(url.ToString());
                    if (sameUrl)
                    {
                        return (int)results.DataReader[0];
                    }
                }
            }
            finally
            {
                if (results != null)
                {
                    results.Close();
                }
            }

            if (!require)
            {
                return -1;
            }

            StringBuilder message = new StringBuilder();
            message.Append("Failed to find previously visited URL, hash=\"");
            message.Append(urlHash);
            message.Append("\", URL=\"");
            message.Append(url.ToString());
            message.Append("\".");
            throw new WorkloadException(message.ToString());
        }
        /// <summary>
        /// Fetch the next work unit without waiting. The open work result set
        /// is advanced first; when there is none, or it has no further rows,
        /// the work query is run again for the current host. The selected row
        /// is then marked with the processing status.
        /// </summary>
        /// <returns>The next Uri to process, or null when the fresh query
        /// returns no rows.</returns>
        protected virtual Uri GetWorkInternal()
        {
            if (this.currentHostID == -1)
            {
                throw new WorkloadException(
                    "Attempting to obtain work before adding first URL.");
            }

            try
            {
                // A new query is needed when nothing is open or the open set is exhausted.
                bool needsQuery = (this.workResultSet == null)
                    || !this.workResultSet.DataReader.Read();

                if (needsQuery)
                {
                    if (this.workResultSet != null)
                    {
                        this.workResultSet.Close();
                    }

                    this.workResultSet = this.stmtGetWork.ExecuteQuery(Status.STATUS_WAITING,
                        this.currentHostID);

                    bool hasWork = this.workResultSet.DataReader.Read();
                    if (!hasWork)
                    {
                        return null;
                    }
                }

                // Column 0 is the id passed to the status update, column 1 the URL text.
                int workID = (int)this.workResultSet.DataReader[0];
                String location = (String)this.workResultSet.DataReader[1];

                this.stmtGetWork2.Execute(Status.STATUS_PROCESSING, workID);
                return new Uri(location);
            }
            catch (UriFormatException ex)
            {
                throw new WorkloadException(ex);
            }
            catch (Exception ex)
            {
                throw new WorkloadException(ex);
            }
        }
        /// <summary>
        /// Advance to the next host. The current host is marked as processed
        /// first; the next row of the host result set (re-queried when absent
        /// or exhausted) then becomes the current host and is given the
        /// processing status. Intended to be called after getWork returns null.
        /// </summary>
        /// <returns>The name of the next host, or null when the fresh query
        /// returns no rows.</returns>
        public virtual String NextHost()
        {
            if (this.currentHostID == -1)
            {
                throw new WorkloadException(
                    "Attempting to obtain host before adding first URL.");
            }

            MarkHostProcessed(this.currentHost);

            try
            {
                // A new query is needed when nothing is open or the open set is exhausted.
                bool needsQuery = (this.hostResultSet == null)
                    || !this.hostResultSet.DataReader.Read();

                if (needsQuery)
                {
                    if (this.hostResultSet != null)
                    {
                        this.hostResultSet.Close();
                    }

                    this.hostResultSet = this.stmtGetNextHost.ExecuteQuery(Status.STATUS_WAITING);

                    bool hasHost = this.hostResultSet.DataReader.Read();
                    if (!hasHost)
                    {
                        return null;
                    }
                }

                // Column 0 carries the host id, column 1 the host name.
                this.currentHostID = (int)this.hostResultSet.DataReader[0];
                this.currentHost = (String)this.hostResultSet.DataReader[1];
                this.stmtSetHostStatus.Execute(Status.STATUS_PROCESSING, this.currentHostID);
                spider.Logging.Log(Logger.Level.INFO, "Moving to new host: " + this.currentHost);
                return this.currentHost;
            }
            catch (Exception ex)
            {
                throw new WorkloadException(ex);
            }
        }
        /// <summary>
        /// Close the workload: the open work result set (if any), every
        /// statement in the statement list, and then the connection.
        /// </summary>
        public void Close()
        {
            if (this.workResultSet != null)
            {
                try
                {
                    this.workResultSet.Close();
                }
                catch (Exception)
                {
                    // Best effort: a failure here must not stop the rest of the shutdown.
                    spider.Logging.Log(Logger.Level.ERROR,
                        "Error trying to close workload result set, ignoring...");
                }
                this.workResultSet = null;
            }

            // NOTE(review): hostResultSet (used by NextHost) is not closed here --
            // confirm that closing its statement below is sufficient.
            try
            {
                foreach (RepeatableStatement statement in this.statements)
                {
                    statement.Close();
                }
            }
            finally
            {
                // Release the connection even when a statement fails to close;
                // that failure still propagates to the caller afterwards.
                if (this.connection != null)
                {
                    this.connection.Close();
                }
            }
        }