/// <summary>
/// Performs the selection of urls to crawl from the database. It is the method that
/// is running on the pluginThread, so it must be able to handle ThreadAbortException
/// and ThreadInterruptedException.
/// </summary>
/// <remarks>
/// Each pass of the loop opens the connection, optionally starts a transaction, checks the
/// current queue size against the threshold, queues new urls when it is below the threshold,
/// commits, and then optionally pauses. The loop ends when <c>mustStop</c> is set or when
/// the database connection cannot be established.
/// </remarks>
private void PerformSelection()
{
    const string connectionFailedMessage = "The Url Selection Plugin failed to connect to the database. Stopping...";
    try
    {
        ReportImmediately(CWLoggerEntryType.Info, "CrawlWave Url Selection Plugin thread has started with ID 0x" + Thread.CurrentThread.GetHashCode().ToString("x4"));
        while (!mustStop)
        {
            //Select urls from the database, update the appropriate tables and stats
            try
            {
                //Non-null only while a transaction is pending (neither committed nor rolled back)
                SqlTransaction transaction = null;
                try
                {
                    if (dbcon.State == ConnectionState.Closed)
                    {
                        try
                        {
                            dbcon.Open();
                        }
                        catch (ThreadInterruptedException)
                        {
                            //let the interruption handler below deal with it
                            throw;
                        }
                        catch (Exception openEx)
                        {
                            //keep the reason; the state check below decides whether to stop
                            events.Enqueue(new EventLoggerEntry(CWLoggerEntryType.Warning, DateTime.Now, openEx.ToString()));
                        }
                    }
                    //Without a connection nothing below can work. Stop here instead of
                    //failing on every pass of the loop without ever reaching the pause.
                    if (dbcon.State == ConnectionState.Closed)
                    {
                        events.Enqueue(new EventLoggerEntry(CWLoggerEntryType.Error, DateTime.Now, connectionFailedMessage));
                        return;
                    }
                    if (settings.UseTransactions)
                    {
                        transaction = dbcon.BeginTransaction();
                    }

                    //check if the threshold criteria is met
                    int count = SelectUrlCount(transaction);
                    AddToReportQueue(CWLoggerEntryType.Info, "UrlSelectionPlugin: Threshold is " + settings.Threshold + " urls, current Url Queue Size is " + count.ToString());
                    if (count < settings.Threshold)
                    {
                        AddToReportQueue(CWLoggerEntryType.Info, "UrlSelectionPlugin: Performing Url Selection, Selection size = " + settings.SelectionSize.ToString());
                        //Refresh the banned hosts cache
                        banned.RefreshCache();
                        //and the robots cache
                        robots.LoadCache();
                        //Select the Urls to process
                        DataSet urls = SelectUrls(transaction);
                        //insert each one of them to the table of urls to be crawled and
                        //mark them as assigned
                        if (urls.Tables[0].Rows.Count > 0)
                        {
                            InsertUrlsToCrawl(urls.Tables[0], transaction);
                        }
                    }

                    if (transaction != null)
                    {
                        try
                        {
                            transaction.Commit();
                        }
                        catch (Exception ext)
                        {
                            events.Enqueue(new EventLoggerEntry(CWLoggerEntryType.Warning, DateTime.Now, ext.ToString()));
                        }
                        finally
                        {
                            //The transaction is finished either way; forgetting it here keeps
                            //the interruption handler from rolling back a completed transaction.
                            transaction.Dispose();
                            transaction = null;
                        }
                    }

                    //the connection may have been lost while working
                    if (dbcon.State == ConnectionState.Closed)
                    {
                        events.Enqueue(new EventLoggerEntry(CWLoggerEntryType.Error, DateTime.Now, connectionFailedMessage));
                        return;
                    }

                    if (settings.PauseBetweenOperations)
                    {
                        //sleep in one second slices so that a stop request is noticed quickly
                        int waitSeconds = PauseInSeconds();
                        while (waitSeconds > 0)
                        {
                            Thread.Sleep(1000);
                            if (mustStop)
                            {
                                break;
                            }
                            waitSeconds--;
                        }
                    }
                }
                catch (ThreadInterruptedException)
                {
                    //Undo the work only if a transaction is still pending. An interruption
                    //during the pause arrives after the commit, when there is nothing to undo.
                    if (transaction != null)
                    {
                        try
                        {
                            transaction.Rollback();
                        }
                        finally
                        {
                            transaction.Dispose();
                            transaction = null;
                        }
                    }
                }
                finally
                {
                    if (dbcon.State != ConnectionState.Closed)
                    {
                        try
                        {
                            dbcon.Close();
                        }
                        catch
                        {
                            //best effort: there is nothing more to do if closing fails
                        }
                    }
                    //release a transaction object that an error left pending
                    if (transaction != null)
                    {
                        transaction.Dispose();
                    }
                    Report();
                }
            }
            catch (Exception ex)
            {
                events.Enqueue(new EventLoggerEntry(CWLoggerEntryType.Warning, DateTime.Now, ex.ToString()));
            }
        }
    }
    catch (ThreadAbortException)
    {
        //The thread was asked to abort, which means it must return at once
        return;
    }
    catch (ThreadInterruptedException)
    {
        //The thread has been asked to Join. We have nothing to do but return.
        return;
    }
    finally
    {
        events.Enqueue(new EventLoggerEntry(CWLoggerEntryType.Info, DateTime.Now, Thread.CurrentThread.Name + " has stopped."));
    }
}

/// <summary>
/// Inserts the selected urls into the table of urls to be crawled and marks them as assigned.
/// Urls of banned hosts are not inserted but are still marked as assigned.
/// </summary>
/// <param name="selectedUrls">The rows returned by the url selection. Columns are read by
/// position: 0 = url id, 1 = url, 2 = host id (Guid), 3 = crc, 4 = domain.</param>
/// <param name="transaction">The transaction to enlist the commands in, or null.</param>
private void InsertUrlsToCrawl(DataTable selectedUrls, SqlTransaction transaction)
{
    //the commands are created once and reused for every row
    using (SqlCommand icmd = new SqlCommand("cw_insert_url_to_crawl", dbcon, transaction))
    using (SqlCommand mcmd = new SqlCommand("cw_mark_url_as_assigned", dbcon, transaction))
    {
        icmd.CommandType = CommandType.StoredProcedure;
        icmd.CommandTimeout = 60;
        icmd.Parameters.Add("@url_id", SqlDbType.Int);
        icmd.Parameters.Add("@url", SqlDbType.NVarChar, 500);
        icmd.Parameters.Add("@crc", SqlDbType.BigInt);
        icmd.Parameters.Add("@domain", SqlDbType.TinyInt);
        icmd.Parameters.Add("@robots", SqlDbType.NVarChar, 1000);
        icmd.Parameters.Add("@robots_expiration", SqlDbType.SmallDateTime);

        mcmd.CommandType = CommandType.StoredProcedure;
        mcmd.CommandTimeout = 60;
        mcmd.Parameters.Add("@url_id", SqlDbType.Int);

        int failedRows = 0;
        foreach (DataRow dr in selectedUrls.Rows)
        {
            Guid hostID = (Guid)dr[2];
            byte[] hostIDbytes = hostID.ToByteArray();
            if (!banned.IsBanned(hostIDbytes))
            {
                icmd.Parameters[0].Value = dr[0];
                icmd.Parameters[1].Value = dr[1];
                icmd.Parameters[2].Value = dr[3];
                icmd.Parameters[3].Value = dr[4];
                RobotsTxtEntry entry = robots.GetEntry(hostIDbytes);
                if (entry == null)
                {
                    icmd.Parameters[4].Value = DBNull.Value;
                    icmd.Parameters[5].Value = DBNull.Value;
                }
                else
                {
                    icmd.Parameters[4].Value = ConcatenatePaths(entry.DisallowedPaths);
                    icmd.Parameters[5].Value = entry.ExpirationDate;
                }
                try
                {
                    icmd.ExecuteNonQuery();
                }
                catch (ThreadInterruptedException)
                {
                    //an interruption is a stop request, not a row failure
                    throw;
                }
                catch
                {
                    //best effort: a url that could not be queued is not marked as assigned
                    failedRows++;
                    continue;
                }
            }
            mcmd.Parameters[0].Value = (int)dr[0];
            try
            {
                mcmd.ExecuteNonQuery();
            }
            catch (ThreadInterruptedException)
            {
                throw;
            }
            catch
            {
                //best effort: carry on with the remaining urls
                failedRows++;
            }
        }

        if (failedRows > 0)
        {
            AddToReportQueue(CWLoggerEntryType.Warning, "UrlSelectionPlugin: " + failedRows.ToString() + " urls could not be inserted or marked as assigned.");
        }
    }
}
/// <summary>
/// Visits a url and if necessary stores it in the output file and the database.
/// Failures are not propagated to the caller: they are added to the report queue
/// as warnings.
/// </summary>
/// <param name="url">The url to visit</param>
private void VisitUrl(string url)
{
    try
    {
        //a null url is rejected the same way as an empty one
        if (url == null || url.Length == 0)
        {
            throw new ArgumentException("VisitUrl received an empty url!");
        }
        Uri uri = null;
        try
        {
            uri = new Uri(url);
            if (banned.IsBanned(uri.Host))
            {
                //handled like a malformed url: count it and leave
                throw new UriFormatException("Banned host encountered");
            }
        }
        catch (UriFormatException)
        {
            Interlocked.Increment(ref processedDomains);
            //cause the function to exit; a plain rethrow keeps the original stack trace
            throw;
        }
        //create the Web Request and get the header
        HttpWebRequest pageRequest = (HttpWebRequest)HttpWebRequest.Create(uri);
        pageRequest.UserAgent = UserAgent;
        pageRequest.Timeout = 10000; //don't wait more than 10 seconds for a page
        pageRequest.Method = httpMethod;
        HttpWebResponse pageResponse = null;
        try
        {
            pageResponse = (HttpWebResponse)pageRequest.GetResponse();
        }
        catch //either WebException or UriFormatException
        {
            //an unreachable url still counts as processed
            Interlocked.Increment(ref processedDomains);
        }
        if (pageResponse != null)
        {
            try
            {
                //Keep the address that actually answered when it differs from the
                //requested one and does not merely extend it.
                string responseUrl = pageResponse.ResponseUri.AbsoluteUri;
                if (responseUrl != url && !responseUrl.StartsWith(url))
                {
                    url = responseUrl;
                }
            }
            finally
            {
                //the response is released even if reading it fails
                pageResponse.Close();
            }
            //We must store the Url in the database and if necessary in the output file
            lock (sync)
            {
                InsertUrlInDatabase(url);
            }
            if (settings.CleanUrls)
            {
                lock (outputStream)
                {
                    outputStream.WriteLine(url);
                    outputStream.Flush();
                }
            }
            Interlocked.Increment(ref processedDomains);
        }
    }
    catch (Exception ex)
    {
        AddToReportQueue(CWLoggerEntryType.Warning, "Initializer Plugin VisitUrl failed: " + ex.ToString());
    }
    finally
    {
        //An exception thrown from this finally block would escape the method,
        //so the progress is left untouched when the total is not positive.
        if (totalDomains > 0)
        {
            percent = (int)(((long)processedDomains * 100) / totalDomains);
        }
    }
}