/// <summary>
/// Get the host name associated with the specified host id.
/// </summary>
/// <param name="hostID">The host id to look up.</param>
/// <returns>The name of the host, read from the first column of the
/// first row returned by the lookup statement.</returns>
/// <exception cref="WorkloadException">Thrown when the lookup returns no
/// row for <paramref name="hostID"/>, or when the query itself fails (the
/// original failure is passed to the WorkloadException constructor).</exception>
protected virtual String GetHost(int hostID)
{
    RepeatableStatement.Results rs = null;

    try
    {
        rs = this.stmtGetHost.ExecuteQuery(hostID);

        if (!rs.DataReader.Read())
        {
            throw new WorkloadException("Can't find previously created host.");
        }

        return (String)rs.DataReader[0];
    }
    catch (WorkloadException)
    {
        // Already the exception type callers expect. Rethrow it untouched so
        // the "not found" message is not buried inside a second wrapper.
        throw;
    }
    catch (Exception e)
    {
        throw new WorkloadException(e);
    }
    finally
    {
        // Release the result set on every exit path.
        if (rs != null)
        {
            rs.Close();
        }
    }
}
/// <summary>
/// Report whether the workload for the current host has run dry.
/// </summary>
/// <returns>True when the query yields no row at all, or when the integer
/// in the first column of its first row is less than 1; false otherwise.</returns>
/// <exception cref="WorkloadException">Wraps any failure raised while
/// running or reading the query.</exception>
public virtual bool WorkloadEmpty()
{
    RepeatableStatement.Results results = null;

    try
    {
        results = this.stmtWorkloadEmpty.ExecuteQuery(this.currentHostID);

        if (results.DataReader.Read())
        {
            // presumably a count of remaining work units — confirm against the SQL
            int remaining = (int)results.DataReader[0];
            return remaining < 1;
        }

        // No row came back, so there is nothing left to do.
        return true;
    }
    catch (Exception ex)
    {
        throw new WorkloadException(ex);
    }
    finally
    {
        if (results != null)
        {
            results.Close();
        }
    }
}
/// <summary>
/// Setup the workload so that it can be resumed from where
/// the last spider left the workload.
/// </summary>
/// <remarks>
/// Reads the current host id from the first column of the resume query,
/// resolves its name through <see cref="GetHost"/>, stores both on this
/// instance, and finally executes the second resume statement.
/// </remarks>
/// <exception cref="WorkloadException">Thrown when the resume query returns
/// no row, when the host name cannot be resolved, or when a query fails.</exception>
public virtual void Resume()
{
    RepeatableStatement.Results rs = null;

    try
    {
        rs = this.stmtResume.ExecuteQuery();

        if (!rs.DataReader.Read())
        {
            throw new WorkloadException(
                "Can't resume, unable to determine current host.");
        }

        this.currentHostID = (int)rs.DataReader[0];
        this.currentHost = GetHost(this.currentHostID);
    }
    catch (WorkloadException)
    {
        // Raised above or by GetHost. Pass it through as-is instead of
        // wrapping a WorkloadException inside another WorkloadException.
        throw;
    }
    catch (Exception e)
    {
        throw new WorkloadException(e);
    }
    finally
    {
        if (rs != null)
        {
            rs.Close();
        }
    }

    // Only reached once the current host has been restored successfully.
    this.stmtResume2.Execute();
}
/// <summary>
/// Look up the source page recorded for the given URL.
/// </summary>
/// <param name="url">The Uri whose source is wanted.</param>
/// <returns>A Uri built from the first column of the first row whose
/// first column equals <paramref name="url"/>'s string form, or null when
/// no row matches.</returns>
/// <exception cref="WorkloadException">Wraps any failure, including a
/// malformed URL string coming back from the query.</exception>
public virtual Uri GetSource(Uri url)
{
    RepeatableStatement.Results results = null;

    try
    {
        // Rows are selected by hash, so each candidate is still compared
        // against the full URL text before it is accepted.
        results = this.stmtGetSource.ExecuteQuery(ComputeHash(url));

        while (results.DataReader.Read())
        {
            String candidate = (String)results.DataReader[0];

            if (!candidate.Equals(url.ToString()))
            {
                continue;
            }

            // NOTE(review): the result is built from column 0, the same
            // column just matched against url, so a hit yields a Uri equal
            // to the argument — confirm whether another column was intended.
            return new Uri(candidate);
        }

        return null;
    }
    catch (Exception ex)
    {
        // UriFormatException is handled here as well; it was wrapped the same way.
        throw new WorkloadException(ex);
    }
    finally
    {
        if (results != null)
        {
            results.Close();
        }
    }
}
/// <summary>
/// Close the workload.
/// </summary>
/// <remarks>
/// Closes the open work and host result sets (best effort: a failure is
/// logged and ignored), then every prepared statement, then the
/// connection. The connection is closed even if closing a statement
/// throws; that exception still propagates to the caller.
/// </remarks>
public void Close()
{
    if (this.workResultSet != null)
    {
        try
        {
            this.workResultSet.Close();
        }
        catch (Exception)
        {
            spider.Logging.Log(Logger.Level.ERROR,
                "Error trying to close workload result set, ignoring...");
        }
        this.workResultSet = null;
    }

    // The host result set is kept open between NextHost calls, so it has
    // to be released here too or it leaks when the workload is closed.
    if (this.hostResultSet != null)
    {
        try
        {
            this.hostResultSet.Close();
        }
        catch (Exception)
        {
            spider.Logging.Log(Logger.Level.ERROR,
                "Error trying to close host result set, ignoring...");
        }
        this.hostResultSet = null;
    }

    try
    {
        foreach (RepeatableStatement statement in this.statements)
        {
            statement.Close();
        }
    }
    finally
    {
        // Always release the connection, even when a statement fails to close.
        if (this.connection != null)
        {
            this.connection.Close();
        }
    }
}
/// <summary>
/// Fetch the next work unit for the current host without blocking.
/// When nothing is waiting, null is returned instead of waiting for work.
/// </summary>
/// <returns>The next Uri to process, or null if a fresh query for the
/// current host returns no rows.</returns>
/// <exception cref="WorkloadException">Thrown when no host has been
/// selected yet (current host id is -1), or wrapping any failure while
/// querying, updating, or parsing the URL.</exception>
protected virtual Uri GetWorkInternal()
{
    if (this.currentHostID == -1)
    {
        throw new WorkloadException(
            "Attempting to obtain work before adding first URL.");
    }

    try
    {
        // A new query is needed when no result set is open yet, or when
        // the open one has no further rows.
        bool needsQuery = (this.workResultSet == null)
            || !this.workResultSet.DataReader.Read();

        if (needsQuery)
        {
            if (this.workResultSet != null)
            {
                this.workResultSet.Close();
            }

            this.workResultSet = this.stmtGetWork.ExecuteQuery(
                Status.STATUS_WAITING, this.currentHostID);

            if (!this.workResultSet.DataReader.Read())
            {
                return null;
            }
        }

        int workloadID = (int)this.workResultSet.DataReader[0];
        String address = (String)this.workResultSet.DataReader[1];

        // Flag the row as being processed before handing it out.
        this.stmtGetWork2.Execute(Status.STATUS_PROCESSING, workloadID);

        return new Uri(address);
    }
    catch (Exception ex)
    {
        // Covers UriFormatException too; it was wrapped identically.
        throw new WorkloadException(ex);
    }
}
/// <summary>
/// Advance to the next host. Intended to be called only once the work
/// for the current host has been exhausted.
/// </summary>
/// <returns>The name of the newly selected host, or null when a fresh
/// query for waiting hosts returns no rows.</returns>
/// <exception cref="WorkloadException">Thrown when no host has been
/// selected yet (current host id is -1), or wrapping any failure while
/// querying or updating.</exception>
public virtual String NextHost()
{
    if (this.currentHostID == -1)
    {
        throw new WorkloadException(
            "Attempting to obtain host before adding first URL.");
    }

    // The host being left behind is marked processed before moving on.
    MarkHostProcessed(this.currentHost);

    try
    {
        // A new query is needed when no result set is open yet, or when
        // the open one has no further rows.
        bool needsQuery = (this.hostResultSet == null)
            || !this.hostResultSet.DataReader.Read();

        if (needsQuery)
        {
            if (this.hostResultSet != null)
            {
                this.hostResultSet.Close();
            }

            this.hostResultSet = this.stmtGetNextHost.ExecuteQuery(
                Status.STATUS_WAITING);

            if (!this.hostResultSet.DataReader.Read())
            {
                return null;
            }
        }

        this.currentHostID = (int)this.hostResultSet.DataReader[0];
        this.currentHost = (String)this.hostResultSet.DataReader[1];

        this.stmtSetHostStatus.Execute(
            Status.STATUS_PROCESSING, this.currentHostID);

        spider.Logging.Log(Logger.Level.INFO,
            "Moving to new host: " + this.currentHost);

        return this.currentHost;
    }
    catch (Exception ex)
    {
        throw new WorkloadException(ex);
    }
}
/// <summary>
/// Get the id for the specified host name.
/// </summary>
/// <param name="host">The host to lookup.</param>
/// <param name="require">Should an exception be thrown if the host is
/// not located.</param>
/// <returns>The id of the specified host name, or -1 when the host is not
/// found and <paramref name="require"/> is false.</returns>
/// <exception cref="WorkloadException">Thrown when the host is not found
/// and <paramref name="require"/> is true. Failures raised by the query
/// itself are not wrapped here and propagate unchanged.</exception>
protected virtual int GetHostID(String host, bool require)
{
    // Fast path: the requested host is the one currently being processed.
    // Host names are identifiers, not linguistic text, so compare them
    // ordinally; a culture-sensitive ignore-case compare can give a
    // different answer depending on the thread's culture.
    if (this.currentHostID != -1
        && String.Equals(this.currentHost, host, StringComparison.OrdinalIgnoreCase))
    {
        return this.currentHostID;
    }

    // Otherwise use the database to find it.
    RepeatableStatement.Results rs = null;
    try
    {
        rs = this.stmtGetHostID.ExecuteQuery(host);

        if (rs.DataReader.Read())
        {
            return (int)rs.DataReader[0];
        }
    }
    finally
    {
        if (rs != null)
        {
            rs.Close();
        }
    }

    if (require)
    {
        throw new WorkloadException(
            "Failed to find previously visited Host,"
            + "Host=\"" + host + "\".");
    }

    return -1;
}
/// <summary>
/// Resolve the workload ID stored for a URL.
/// </summary>
/// <param name="url">The URL to look up.</param>
/// <param name="require">When true, a missing entry raises an exception
/// instead of yielding -1.</param>
/// <returns>The integer in the first column of the first row whose second
/// column equals the URL's string form, or -1 when nothing matches and
/// <paramref name="require"/> is false.</returns>
/// <exception cref="WorkloadException">Thrown when nothing matches and
/// <paramref name="require"/> is true. Failures raised by the query
/// itself are not wrapped here.</exception>
protected virtual int GetWorkloadID(Uri url, bool require)
{
    int hash = 0;
    RepeatableStatement.Results results = null;

    try
    {
        hash = ComputeHash(url);
        results = this.stmtGetWorkloadID.ExecuteQuery(hash);

        // Rows are selected by hash, so each candidate row is confirmed
        // against the full URL text before its ID is returned.
        while (results.DataReader.Read())
        {
            bool sameUrl = results.DataReader[1].Equals(url.ToString());
            if (sameUrl)
            {
                return (int)results.DataReader[0];
            }
        }
    }
    finally
    {
        if (results != null)
        {
            results.Close();
        }
    }

    if (!require)
    {
        return -1;
    }

    StringBuilder message = new StringBuilder();
    message.Append("Failed to find previously visited URL, hash=\"")
        .Append(hash)
        .Append("\", URL=\"")
        .Append(url.ToString())
        .Append("\".");

    throw new WorkloadException(message.ToString());
}
/// <summary>
/// Non-blocking retrieval of the next work unit for the current host.
/// Returns null rather than waiting when there is no work.
/// </summary>
/// <returns>The next Uri to process, or null if a fresh query for the
/// current host returns no rows.</returns>
/// <exception cref="WorkloadException">Thrown when the current host id is
/// still -1, or wrapping any failure while querying, updating, or
/// constructing the Uri.</exception>
protected virtual Uri GetWorkInternal()
{
    if (this.currentHostID == -1)
    {
        throw new WorkloadException(
            "Attempting to obtain work before adding first URL.");
    }

    try
    {
        // Try to advance the open result set first; if there is none, or
        // it has no further rows, run the query again.
        bool exhausted = true;
        if (this.workResultSet != null)
        {
            exhausted = !this.workResultSet.DataReader.Read();
        }

        if (exhausted)
        {
            if (this.workResultSet != null)
            {
                this.workResultSet.Close();
            }

            this.workResultSet = this.stmtGetWork.ExecuteQuery(
                Status.STATUS_WAITING, this.currentHostID);

            bool gotRow = this.workResultSet.DataReader.Read();
            if (!gotRow)
            {
                return null;
            }
        }

        int rowID = (int)this.workResultSet.DataReader[0];
        String rowUrl = (String)this.workResultSet.DataReader[1];

        // Update the row's status before returning it to the caller.
        this.stmtGetWork2.Execute(Status.STATUS_PROCESSING, rowID);

        return new Uri(rowUrl);
    }
    catch (Exception ex)
    {
        // UriFormatException needs no separate handler; it was wrapped the same way.
        throw new WorkloadException(ex);
    }
}
/// <summary>
/// Switch processing to the next waiting host. Meant to be called only
/// after the current host has no more work to hand out.
/// </summary>
/// <returns>The name of the host now being processed, or null when a
/// fresh query for waiting hosts returns no rows.</returns>
/// <exception cref="WorkloadException">Thrown when the current host id is
/// still -1, or wrapping any failure while querying or updating.</exception>
public virtual String NextHost()
{
    if (this.currentHostID == -1)
    {
        throw new WorkloadException(
            "Attempting to obtain host before adding first URL.");
    }

    // Mark the outgoing host as processed before selecting a new one.
    MarkHostProcessed(this.currentHost);

    try
    {
        // Try to advance the open result set first; if there is none, or
        // it has no further rows, run the query again.
        bool exhausted = true;
        if (this.hostResultSet != null)
        {
            exhausted = !this.hostResultSet.DataReader.Read();
        }

        if (exhausted)
        {
            if (this.hostResultSet != null)
            {
                this.hostResultSet.Close();
            }

            this.hostResultSet = this.stmtGetNextHost.ExecuteQuery(
                Status.STATUS_WAITING);

            bool gotRow = this.hostResultSet.DataReader.Read();
            if (!gotRow)
            {
                return null;
            }
        }

        this.currentHostID = (int)this.hostResultSet.DataReader[0];
        this.currentHost = (String)this.hostResultSet.DataReader[1];

        this.stmtSetHostStatus.Execute(
            Status.STATUS_PROCESSING, this.currentHostID);

        spider.Logging.Log(Logger.Level.INFO,
            "Moving to new host: " + this.currentHost);

        return this.currentHost;
    }
    catch (Exception ex)
    {
        throw new WorkloadException(ex);
    }
}
/// <summary>
/// Close the workload.
/// </summary>
/// <remarks>
/// Closes the open work and host result sets (best effort: a failure is
/// logged and ignored), then every prepared statement, then the
/// connection. The connection is closed even if closing a statement
/// throws; that exception still propagates to the caller.
/// </remarks>
public void Close()
{
    if (this.workResultSet != null)
    {
        try
        {
            this.workResultSet.Close();
        }
        catch (Exception)
        {
            spider.Logging.Log(Logger.Level.ERROR,
                "Error trying to close workload result set, ignoring...");
        }
        this.workResultSet = null;
    }

    // The host result set is kept open between NextHost calls, so it has
    // to be released here too or it leaks when the workload is closed.
    if (this.hostResultSet != null)
    {
        try
        {
            this.hostResultSet.Close();
        }
        catch (Exception)
        {
            spider.Logging.Log(Logger.Level.ERROR,
                "Error trying to close host result set, ignoring...");
        }
        this.hostResultSet = null;
    }

    try
    {
        foreach (RepeatableStatement statement in this.statements)
        {
            statement.Close();
        }
    }
    finally
    {
        // Always release the connection, even when a statement fails to close.
        if (this.connection != null)
        {
            this.connection.Close();
        }
    }
}