示例#1
0
        /// <summary>
        /// Performs the selection of urls to crawl from the database. It is the method that
        /// is running on the pluginThread, so it must be able to handle ThreadAbortException
        /// and ThreadInterruptedException.
        /// </summary>
        /// <remarks>
        /// Selection passes are repeated until mustStop is set or the database connection
        /// cannot be established. Any other failure of a pass is logged as a warning and the
        /// loop carries on with the next pass.
        /// </remarks>
        private void PerformSelection()
        {
            try
            {
                ReportImmediately(CWLoggerEntryType.Info, "CrawlWave Url Selection Plugin thread has started with ID 0x" + Thread.CurrentThread.GetHashCode().ToString("x4"));
                while (!mustStop)
                {
                    try
                    {
                        if (!RunSelectionPass())
                        {
                            //the database is unreachable, there is no point in retrying
                            return;
                        }
                    }
                    catch (Exception ex)
                    {
                        events.Enqueue(new EventLoggerEntry(CWLoggerEntryType.Warning, DateTime.Now, ex.ToString()));
                    }
                }
            }
            catch (ThreadAbortException)
            {
                //The thread was asked to abort, which means it must return at once
                return;
            }
            catch (ThreadInterruptedException)
            {
                //The thread has been asked to Join. We have nothing to do but return.
                return;
            }
            finally
            {
                events.Enqueue(new EventLoggerEntry(CWLoggerEntryType.Info, DateTime.Now, Thread.CurrentThread.Name + " has stopped."));
            }
        }

        /// <summary>
        /// Runs a single selection pass: opens the connection, checks the queue size against
        /// the threshold, queues a new batch of urls if needed, commits and then optionally
        /// pauses. The connection is always closed and Report() is always called on exit.
        /// </summary>
        /// <returns>
        /// False if the database connection is closed and the plugin must stop, true if the
        /// caller may go on with the next pass.
        /// </returns>
        private bool RunSelectionPass()
        {
            //Non-null only while a transaction is still pending; this is what allows the
            //finally block to know whether a rollback is needed.
            SqlTransaction transaction = null;
            try
            {
                Exception openFailure = null;
                try
                {
                    dbcon.Open();
                }
                catch (ThreadInterruptedException)
                {
                    //a stop request must not be mistaken for a connection problem
                    throw;
                }
                catch (Exception exo)
                {
                    //Open also throws if the connection is already open, which is harmless,
                    //so the decision is taken by looking at the connection state below.
                    openFailure = exo;
                }
                if (dbcon.State == ConnectionState.Closed)
                {
                    if (openFailure != null)
                    {
                        events.Enqueue(new EventLoggerEntry(CWLoggerEntryType.Warning, DateTime.Now, openFailure.ToString()));
                    }
                    events.Enqueue(new EventLoggerEntry(CWLoggerEntryType.Error, DateTime.Now, "The Url Selection Plugin failed to connect to the database. Stopping..."));
                    return false;
                }
                if (settings.UseTransactions)
                {
                    transaction = dbcon.BeginTransaction();
                }
                //check if the threshold criteria is met
                int count = SelectUrlCount(transaction);
                AddToReportQueue(CWLoggerEntryType.Info, "UrlSelectionPlugin: Threshold is " + settings.Threshold + " urls, current Url Queue Size is " + count.ToString());
                if (count < settings.Threshold)
                {
                    AddToReportQueue(CWLoggerEntryType.Info, "UrlSelectionPlugin: Performing Url Selection, Selection size = " + settings.SelectionSize.ToString());
                    //Refresh the banned hosts cache
                    banned.RefreshCache();
                    //and the robots cache
                    robots.LoadCache();
                    //Select the Urls to process
                    DataSet urls = SelectUrls(transaction);
                    //insert each one of them to the table of urls to be crawled and
                    //mark them as assigned
                    if (urls.Tables[0].Rows.Count > 0)
                    {
                        QueueSelectedUrlsForCrawling(urls.Tables[0], transaction);
                    }
                }
                if (transaction != null)
                {
                    try
                    {
                        transaction.Commit();
                    }
                    catch (Exception ext)
                    {
                        events.Enqueue(new EventLoggerEntry(CWLoggerEntryType.Warning, DateTime.Now, ext.ToString()));
                    }
                    finally
                    {
                        //The transaction is finished either way. Clearing the reference keeps
                        //the cleanup below from rolling back a completed transaction, which
                        //would throw an InvalidOperationException.
                        transaction.Dispose();
                        transaction = null;
                    }
                }
                if (dbcon.State == ConnectionState.Closed)
                {
                    //the connection was lost while the pass was running
                    events.Enqueue(new EventLoggerEntry(CWLoggerEntryType.Error, DateTime.Now, "The Url Selection Plugin failed to connect to the database. Stopping..."));
                    return false;
                }
                if (settings.PauseBetweenOperations)
                {
                    SleepBetweenSelections();
                }
                return true;
            }
            catch (ThreadInterruptedException)
            {
                //The pass was interrupted. Pending work is rolled back in the finally block
                //and the caller re-checks mustStop before starting another pass.
                return true;
            }
            finally
            {
                if (transaction != null)
                {
                    try
                    {
                        transaction.Rollback();
                    }
                    catch (Exception exr)
                    {
                        //Rollback throws if the connection is broken or the transaction has
                        //already been ended; report it but carry on with the cleanup.
                        events.Enqueue(new EventLoggerEntry(CWLoggerEntryType.Warning, DateTime.Now, exr.ToString()));
                    }
                    finally
                    {
                        transaction.Dispose();
                    }
                }
                if (dbcon.State != ConnectionState.Closed)
                {
                    try
                    {
                        dbcon.Close();
                    }
                    catch (Exception exc)
                    {
                        events.Enqueue(new EventLoggerEntry(CWLoggerEntryType.Warning, DateTime.Now, exc.ToString()));
                    }
                }
                Report();
            }
        }

        /// <summary>
        /// Inserts the selected urls in the table of urls to be crawled and marks them as
        /// assigned. Urls of banned hosts are not inserted but are still marked as assigned.
        /// A row whose insertion fails is skipped and is not marked as assigned.
        /// </summary>
        /// <param name="urls">The table of selected urls. Column 0 holds the url id, 1 the
        /// url, 2 the host id (Guid), 3 the crc and 4 the domain flag.</param>
        /// <param name="transaction">The transaction to enlist the commands in, or null.</param>
        private void QueueSelectedUrlsForCrawling(DataTable urls, SqlTransaction transaction)
        {
            int failures = 0;
            //create the SqlCommands once and reuse them for every row
            using (SqlCommand icmd = new SqlCommand("cw_insert_url_to_crawl", dbcon, transaction))
            using (SqlCommand mcmd = new SqlCommand("cw_mark_url_as_assigned", dbcon, transaction))
            {
                icmd.CommandType    = CommandType.StoredProcedure;
                icmd.CommandTimeout = 60;
                icmd.Parameters.Add("@url_id", SqlDbType.Int);
                icmd.Parameters.Add("@url", SqlDbType.NVarChar, 500);
                icmd.Parameters.Add("@crc", SqlDbType.BigInt);
                icmd.Parameters.Add("@domain", SqlDbType.TinyInt);
                icmd.Parameters.Add("@robots", SqlDbType.NVarChar, 1000);
                icmd.Parameters.Add("@robots_expiration", SqlDbType.SmallDateTime);

                mcmd.CommandType    = CommandType.StoredProcedure;
                mcmd.CommandTimeout = 60;
                mcmd.Parameters.Add("@url_id", SqlDbType.Int);

                foreach (DataRow dr in urls.Rows)
                {
                    byte [] hostIDbytes = ((Guid)dr[2]).ToByteArray();
                    if (!banned.IsBanned(hostIDbytes))
                    {
                        icmd.Parameters[0].Value = dr[0];
                        icmd.Parameters[1].Value = dr[1];
                        icmd.Parameters[2].Value = dr[3];
                        icmd.Parameters[3].Value = dr[4];
                        RobotsTxtEntry entry = robots.GetEntry(hostIDbytes);
                        if (entry == null)
                        {
                            icmd.Parameters[4].Value = DBNull.Value;
                            icmd.Parameters[5].Value = DBNull.Value;
                        }
                        else
                        {
                            icmd.Parameters[4].Value = ConcatenatePaths(entry.DisallowedPaths);
                            icmd.Parameters[5].Value = entry.ExpirationDate;
                        }
                        if (!TryExecuteUrlCommand(icmd))
                        {
                            //the url was not queued, so it must not be marked as assigned
                            failures++;
                            continue;
                        }
                    }
                    mcmd.Parameters[0].Value = (int)dr[0];
                    if (!TryExecuteUrlCommand(mcmd))
                    {
                        failures++;
                    }
                }
            }
            if (failures > 0)
            {
                AddToReportQueue(CWLoggerEntryType.Warning, "UrlSelectionPlugin: " + failures.ToString() + " of the selected urls could not be queued or marked as assigned and were skipped.");
            }
        }

        /// <summary>
        /// Executes a per-url command on a best effort basis.
        /// </summary>
        /// <param name="command">The command to execute.</param>
        /// <returns>True if the command was executed, false if it failed.</returns>
        private static bool TryExecuteUrlCommand(SqlCommand command)
        {
            try
            {
                command.ExecuteNonQuery();
                return true;
            }
            catch (ThreadInterruptedException)
            {
                //a stop request must not be mistaken for a bad row
                throw;
            }
            catch (Exception)
            {
                //a single bad row must not abort the whole batch
                return false;
            }
        }

        /// <summary>
        /// Sleeps for PauseInSeconds() seconds, one second at a time, so that a stop request
        /// is noticed within about a second.
        /// </summary>
        private void SleepBetweenSelections()
        {
            int waitSeconds = PauseInSeconds();
            while (waitSeconds > 0 && !mustStop)
            {
                Thread.Sleep(1000);
                waitSeconds--;
            }
        }
示例#2
0
 /// <summary>
 /// Visits a url and if necessary stores it in the output file and the database
 /// </summary>
 /// <remarks>
 /// No exception leaves this method: failures are reported as warnings through
 /// AddToReportQueue, except for an unreachable host, which is skipped silently.
 /// Every call increments processedDomains exactly once, whatever the outcome, so
 /// that percent can reach 100 when all the urls have been visited.
 /// </remarks>
 /// <param name="url">The url to visit</param>
 private void VisitUrl(string url)
 {
     try
     {
         if (url == null || url.Length == 0)
         {
             throw new ArgumentException("VisitUrl received an empty url!");
         }
         //a malformed url makes the constructor throw a UriFormatException
         Uri uri = new Uri(url);
         if (banned.IsBanned(uri.Host))
         {
             throw new UriFormatException("Banned host encountered");
         }
         //create the Web Request and get the header
         HttpWebRequest pageRequest = (HttpWebRequest)WebRequest.Create(uri);
         pageRequest.UserAgent = UserAgent;
         pageRequest.Timeout   = 10000;             //don't wait more than 10 seconds for a page
         pageRequest.Method    = httpMethod;
         HttpWebResponse pageResponse = null;
         try
         {
             pageResponse = (HttpWebResponse)pageRequest.GetResponse();
         }
         catch (WebException)
         {
             //the host did not answer or returned an error status: nothing to store
             return;
         }
         catch (UriFormatException)
         {
             //nothing to store for this url either
             return;
         }
         try
         {
             //if the request ended up at an address that does not start with the
             //requested url, store that address instead
             string responseUrl = pageResponse.ResponseUri.AbsoluteUri;
             if (!responseUrl.StartsWith(url))
             {
                 url = responseUrl;
             }
         }
         finally
         {
             pageResponse.Close();
         }
         //We must store the Url in the database and if necessary in the output file
         lock (sync)
         {
             InsertUrlInDatabase(url);
         }
         if (settings.CleanUrls)
         {
             lock (outputStream)
             {
                 outputStream.WriteLine(url);
                 outputStream.Flush();
             }
         }
     }
     catch (Exception ex)
     {
         AddToReportQueue(CWLoggerEntryType.Warning, "Initializer Plugin VisitUrl failed: " + ex.ToString());
     }
     finally
     {
         //Counting here covers every exit path, including the early returns above.
         Interlocked.Increment(ref processedDomains);
         if (totalDomains > 0)
         {
             //without the guard an empty work list would raise a DivideByZeroException
             //from this finally block, which would escape the method
             percent = (int)(((long)processedDomains * 100) / totalDomains);
         }
     }
 }