예제 #1
0
        /// <summary>
        /// Updates the Url table and, when the crawled content has changed, the Url Data table.
        /// </summary>
        /// <remarks>
        /// Executes the <c>cw_update_url</c> stored procedure for every call. If the Url was
        /// redirected, <c>cw_insert_host</c> is attempted first (its failures are ignored).
        /// If <c>data.Updated</c> is set, <c>cw_update_url_data</c> is executed afterwards with
        /// the compressed content. Any exception is reported through
        /// <c>AddToReportQueue</c> as a warning and is not rethrown.
        /// </remarks>
        /// <param name="data">The UrlCrawlData containing the data of the crawled Url.</param>
        /// <param name="transaction">The currently active <see cref="SqlTransaction"/>.</param>
        /// <returns>
        /// The value of the <c>@id</c> output parameter of <c>cw_update_url</c>, or 0 if
        /// something goes wrong.
        /// </returns>
        private int UpdateUrl(UrlCrawlData data, SqlTransaction transaction)
        {
            int retVal = 0;

            try
            {
                //the using blocks guarantee that every command is disposed even if an
                //exception is thrown half way through the update
                using (SqlCommand urlcmd = new SqlCommand("cw_update_url", dbcon, transaction))
                {
                    //build the Sql Command for updating the url table
                    urlcmd.CommandType    = CommandType.StoredProcedure;
                    urlcmd.CommandTimeout = settings.DBActionTimeout;
                    urlcmd.Parameters.Add("@url_id", SqlDbType.Int);
                    urlcmd.Parameters.Add("@url", SqlDbType.NVarChar, 500);
                    urlcmd.Parameters.Add("@url_md5", SqlDbType.UniqueIdentifier);
                    urlcmd.Parameters.Add("@url_host_id", SqlDbType.UniqueIdentifier);
                    urlcmd.Parameters.Add("@url_priority", SqlDbType.TinyInt);
                    urlcmd.Parameters.Add("@crc", SqlDbType.BigInt);
                    urlcmd.Parameters.Add("@flag_domain", SqlDbType.TinyInt);
                    urlcmd.Parameters.Add("@flag_robots", SqlDbType.TinyInt);
                    urlcmd.Parameters.Add("@flag_updated", SqlDbType.TinyInt);
                    urlcmd.Parameters.Add("@last_visited", SqlDbType.SmallDateTime);
                    urlcmd.Parameters.Add("@flag_redirected", SqlDbType.TinyInt);
                    urlcmd.Parameters.Add("@id", SqlDbType.Int);
                    urlcmd.Parameters["@id"].Direction = ParameterDirection.Output;

                    //set the parameters that do not depend on redirection
                    urlcmd.Parameters["@url_id"].Value  = data.ID;
                    urlcmd.Parameters["@url"].Value     = data.Url;
                    urlcmd.Parameters["@url_md5"].Value = new Guid(data.MD5);
                    Uri    uri       = new Uri(data.Url);
                    string host_name = uri.Host;
                    //the host id is built from the MD5 hash of the host name
                    Guid   host_id   = new Guid(MD5Hash.md5(host_name));
                    urlcmd.Parameters["@url_host_id"].Value  = host_id;
                    urlcmd.Parameters["@crc"].Value          = data.CRC;
                    urlcmd.Parameters["@flag_updated"].Value = (data.Updated)?1:0;
                    urlcmd.Parameters["@last_visited"].Value = data.TimeStamp;

                    if (data.Redirected)
                    {
                        //we must first attempt to insert the host, otherwise the urlcmd will fail
                        using (SqlCommand hostcmd = new SqlCommand("cw_insert_host", dbcon, transaction))
                        {
                            hostcmd.CommandType    = CommandType.StoredProcedure;
                            hostcmd.CommandTimeout = settings.DBActionTimeout;
                            hostcmd.Parameters.Add("@host_id", SqlDbType.UniqueIdentifier);
                            hostcmd.Parameters.Add("@host_name", SqlDbType.NVarChar, 100);
                            hostcmd.Parameters["@host_id"].Value   = host_id;
                            hostcmd.Parameters["@host_name"].Value = host_name;
                            try
                            {
                                hostcmd.ExecuteNonQuery();
                            }
                            catch
                            {
                                //deliberately ignored: the host probably exists already and
                                //the url update below must still be attempted
                            }
                        }

                        urlcmd.Parameters["@url_priority"].Value    = (byte)data.RedirectedPriority;
                        urlcmd.Parameters["@flag_domain"].Value     = (byte)data.RedirectedFlagDomain;
                        urlcmd.Parameters["@flag_robots"].Value     = (data.RedirectedFlagRobots)?1:0;
                        urlcmd.Parameters["@flag_redirected"].Value = 1;
                    }
                    else
                    {
                        urlcmd.Parameters["@url_priority"].Value = DBNull.Value;
                        urlcmd.Parameters["@flag_domain"].Value  = (byte)data.UrlToCrawl.FlagDomain;
                        //NOTE(review): RedirectedFlagRobots is read although the Url was not
                        //redirected - kept as in the original, confirm this is intended
                        urlcmd.Parameters["@flag_robots"].Value =
                            (data.FlagFetchRobots && data.RedirectedFlagRobots)?1:0;
                        urlcmd.Parameters["@flag_redirected"].Value = 0;
                    }

                    urlcmd.ExecuteNonQuery();
                    //a non-int (e.g. DBNull) output value throws here and is handled by the
                    //outer catch, which makes the method return 0
                    retVal = (int)urlcmd.Parameters["@id"].Value;
                }

                if (data.Updated)
                {
                    //if necessary build the sql command for updating the url data tables
                    using (SqlCommand urldatacmd = new SqlCommand("cw_update_url_data", dbcon, transaction))
                    {
                        urldatacmd.CommandType    = CommandType.StoredProcedure;
                        urldatacmd.CommandTimeout = settings.DBActionTimeout;
                        urldatacmd.Parameters.Add("@url_id", SqlDbType.Int);
                        urldatacmd.Parameters.Add("@data", SqlDbType.Image);
                        urldatacmd.Parameters.Add("@length", SqlDbType.Int);
                        urldatacmd.Parameters.Add("@original_length", SqlDbType.Int);
                        urldatacmd.Parameters.Add("@http_code", SqlDbType.SmallInt);
                        urldatacmd.Parameters.Add("@retrieval_time", SqlDbType.Int);

                        urldatacmd.Parameters["@url_id"].Value = retVal;
                        //compress the url's data; a null payload is stored like an empty one
                        if (data.Data != null && data.Data.Length != 0)
                        {
                            byte [] compressed = null;
                            string  urldata    = InternetUtils.Base64Decode(data.Data);
                            CompressionUtils.CompressString(ref urldata, out compressed);
                            urldatacmd.Parameters["@data"].Value   = compressed;
                            urldatacmd.Parameters["@length"].Value = compressed.Length;
                            //NOTE(review): this is the length of the Base64 text, not of the
                            //decoded string - kept as in the original, confirm against readers
                            urldatacmd.Parameters["@original_length"].Value = data.Data.Length;
                        }
                        else
                        {
                            urldatacmd.Parameters["@data"].Value            = new byte[0];
                            urldatacmd.Parameters["@length"].Value          = 0;
                            urldatacmd.Parameters["@original_length"].Value = 0;
                        }
                        urldatacmd.Parameters["@http_code"].Value      = (short)data.HttpStatusCode;
                        urldatacmd.Parameters["@retrieval_time"].Value = data.RetrievalTime;
                        urldatacmd.ExecuteNonQuery();
                    }
                }
            }
            catch (Exception e)
            {
                AddToReportQueue(CWLoggerEntryType.Warning, "DBUpdater failed to update a Url in the database: " + e.ToString());
                retVal = 0;
            }
            return(retVal);
        }