/// <summary>
/// Extracts the words found in the contents of a document. Used by DBUpdater when
/// a document is stored in the database, in order to extract the words it contains
/// and add them to the database at the same time.
/// </summary>
/// <param name="data">The <see cref="UrlCrawlData"/> to be processed.</param>
public void ExtractWords(ref UrlCrawlData data)
{
    //First try to extract the words from the document. If something goes wrong just
    //return, otherwise add the words to the cache, remove any old words related to
    //the url with this id from the database and store the new url-words.
    try
    {
        SortedList words = wordExtractor.ExtractWords(data.Data);
        if(words.Count == 0) { return; }
        //add all the words to the database if they don't exist already
        string word = String.Empty;
        short word_count = 0;
        int word_id = -1;
        foreach(DictionaryEntry de in words)
        {
            word = (string)de.Key;
            cache.AddStemmedWord(word);
        }
        //remove all the old words related to this url from the database
        RemoveUrlWords(data.ID);
        //now add relationships between the url and its words
        foreach(DictionaryEntry d in words)
        {
            word = (string)d.Key;
            word_count = (short)d.Value;
            word_id = cache[word];
            AddUrlWord(data.ID, word_id, word_count);
        }
        UpdateUrlDataLastProcess(data.ID);
    }
    catch(Exception e)
    {
        events.Enqueue(new EventLoggerEntry(CWLoggerEntryType.Warning, DateTime.Now, "WordExtractionPlugin failed to extract words from Url with ID " + data.ID.ToString() + ": " + e.ToString()));
    }
}
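//NOTE (illustrative sketch, not part of the original source): ExtractWords relies on the
//word cache exposing AddStemmedWord and a string indexer that returns the word's database ID.
//Only those two members appear above; the Hashtable-backed class below is a hypothetical
//sketch of that assumed contract, for illustration only.
using System.Collections;

public class WordCacheSketch
{
    private readonly Hashtable wordIDs = new Hashtable(); //word -> word_id

    //Inserts the word into the database the first time it is seen and caches its ID.
    public void AddStemmedWord(string word)
    {
        if(!wordIDs.ContainsKey(word))
        {
            wordIDs[word] = InsertWordInDatabase(word); //assumed to return the new ID
        }
    }

    //Returns the database ID of a cached word, or -1 if the word is unknown.
    public int this[string word]
    {
        get { return wordIDs.ContainsKey(word) ? (int)wordIDs[word] : -1; }
    }

    private int InsertWordInDatabase(string word)
    {
        //Placeholder: the real plugin would call a stored procedure here.
        return word.GetHashCode() & 0x7FFFFFFF;
    }
}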
/// <summary>
/// Stores an array of <see cref="UrlCrawlData"/> objects and the <see cref="ClientInfo"/>
/// of the client who returned them in a compressed file on disk.
/// </summary>
/// <param name="info">The <see cref="ClientInfo"/> of the client who returned the data.</param>
/// <param name="data">An array of <see cref="UrlCrawlData"/> objects containing the
/// data returned by the client.</param>
private void SaveXMLFile(ClientInfo info, UrlCrawlData[] data)
{
    UrlCrawlDataFile udf = new UrlCrawlDataFile(info, data);
    string id = Guid.NewGuid().ToString();
    //serialize the object into a memory stream
    MemoryStream ms = new MemoryStream();
    //this may need to use SoapFormatter
    //XmlSerializer xml = new XmlSerializer(typeof(UrlCrawlDataFile));
    SoapFormatter xml = new SoapFormatter();
    xml.Serialize(ms, udf);
    byte[] buffer = ms.ToArray();
    ms.Close();
    string fileName = settings.DataFilesPath + id + ".zip";
    Crc32 crc = new Crc32();
    ZipOutputStream zs = new ZipOutputStream(File.Create(fileName));
    ZipEntry entry = new ZipEntry(id);
    entry.DateTime = DateTime.Now;
    entry.Size = buffer.Length;
    crc.Update(buffer);
    entry.Crc = crc.Value;
    zs.PutNextEntry(entry);
    zs.Write(buffer, 0, buffer.Length);
    zs.Finish();
    zs.Close();
}
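//NOTE (illustrative sketch, not part of the original source): SaveXMLFile only writes these
//archives; the corresponding read path is not shown in this section. A minimal sketch of how
//such a file could be loaded back, assuming the same SharpZipLib and SoapFormatter types used
//above. The helper name LoadDataFile is hypothetical.
//Requires: System.IO, System.Runtime.Serialization.Formatters.Soap, ICSharpCode.SharpZipLib.Zip.
private UrlCrawlDataFile LoadDataFile(string fileName)
{
    using(ZipInputStream zs = new ZipInputStream(File.OpenRead(fileName)))
    {
        ZipEntry entry = zs.GetNextEntry(); //the archive contains exactly one entry
        if(entry == null) { return null; }
        MemoryStream ms = new MemoryStream();
        byte[] buffer = new byte[4096];
        int read;
        while((read = zs.Read(buffer, 0, buffer.Length)) > 0)
        {
            ms.Write(buffer, 0, read);
        }
        ms.Position = 0;
        SoapFormatter formatter = new SoapFormatter();
        return (UrlCrawlDataFile)formatter.Deserialize(ms);
    }
}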
/// <summary>
/// Stores the results that the clients return after crawling a set of Urls.
/// </summary>
/// <param name="ci">The <see cref="ClientInfo"/> of the client returning the data.</param>
/// <param name="data">An array of <see cref="UrlCrawlData"/> objects containing the data of the crawled urls.</param>
/// <returns>Null if the operation succeeds, or a <see cref="SerializedException"/>
/// encapsulating the error that occurred if the operation fails.</returns>
public SerializedException StoreCrawlResults(ClientInfo ci, UrlCrawlData[] data)
{
    SerializedException sx = null;
    try
    {
        if(!ConnectToDatabase())
        {
            throw new CWDBConnectionFailedException();
        }
        try
        {
            //store the new robots.txt files in the database, nothing else needs to
            //be done since the urls will be marked as not assigned when their data
            //is processed by DBUpdater
            if((data != null) && (data.Length > 0))
            {
                SqlCommand cmd = new SqlCommand("cw_update_or_insert_robot", dbcon);
                cmd.CommandType = CommandType.StoredProcedure;
                cmd.Parameters.Add("@host_id", SqlDbType.UniqueIdentifier);
                cmd.Parameters.Add("@disallowed", SqlDbType.NVarChar, 1000);
                foreach(UrlCrawlData urlData in data)
                {
                    if((urlData.FlagFetchRobots) || (urlData.Redirected))
                    {
                        string url = urlData.Url;
                        cmd.Parameters[0].Value = new Guid(MD5Hash.md5(InternetUtils.HostName(url)));
                        cmd.Parameters[1].Value = urlData.RobotsDisallowedPaths;
                        try
                        {
                            cmd.ExecuteNonQuery();
                        }
                        catch
                        {
                            continue;
                        }
                    }
                }
                cmd.Dispose();
                SqlCommand statscmd = new SqlCommand("cw_update_client_statistics", dbcon);
                statscmd.CommandType = CommandType.StoredProcedure;
                statscmd.Parameters.Add("@client_id", SqlDbType.UniqueIdentifier);
                statscmd.Parameters.Add("@assigned", SqlDbType.BigInt);
                statscmd.Parameters.Add("@returned", SqlDbType.BigInt);
                statscmd.Parameters.Add("@type", SqlDbType.TinyInt);
                statscmd.Parameters[0].Value = ci.ClientID;
                statscmd.Parameters[1].Value = DBNull.Value;
                statscmd.Parameters[2].Value = data.Length;
                statscmd.Parameters[3].Value = 1;
                statscmd.ExecuteNonQuery();
                statscmd.Dispose();
            }
        }
        catch(Exception ex)
        {
            if(settings.LogLevel <= CWLogLevel.LogWarning)
            {
                settings.Log.LogWarning("StoreCrawlResults failed: " + ex.ToString());
            }
            throw; //rethrow without resetting the stack trace
        }
        finally
        {
            //save xml file on disk
            try
            {
                SaveXMLFile(ci, data);
            }
            catch(Exception se)
            {
                sx = new SerializedException(se.GetType().ToString(), se.Message, se.ToString());
                if(settings.LogLevel <= CWLogLevel.LogWarning)
                {
                    settings.Log.LogWarning("StoreCrawlResults failed to save XML data on disk: " + se.ToString());
                }
            }
        }
        if(!DisconnectFromDatabase())
        {
            throw new CWDBConnectionFailedException("Disconnect from database failure.");
        }
    }
    catch(Exception e)
    {
        sx = new SerializedException(e.GetType().ToString(), e.Message, e.ToString());
    }
    finally
    {
        UpdateClientLastActive(ci);
        LogClientAction(ci, CWClientActions.LogGetCrawlResults);
    }
    return sx;
}
/// <summary>
/// Crawls a Url and creates a <see cref="UrlCrawlData"/> object that is stored in
/// the internal crawledUrls <see cref="ArrayList"/>. Since it runs in one of the
/// crawling threads that may be interrupted or aborted at any time it must be able
/// to handle ThreadAbortException and ThreadInterruptedException.
/// </summary>
/// <param name="urlToCrawl">A reference to the <see cref="InternetUrlToCrawl"/>
/// object that encapsulates the url that must be crawled.</param>
private void CrawlUrl(ref InternetUrlToCrawl urlToCrawl)
{
    try
    {
        UrlCrawlData urlData = new UrlCrawlData();
        HiResTimer timer = new HiResTimer();
        //create the web request and download the data
        HttpWebRequest pageRequest = null;
        try
        {
            pageRequest = (HttpWebRequest)HttpWebRequest.Create(urlToCrawl.Url);
        }
        catch
        {
            //the url could not be turned into a valid request, so flag it as a bad request
            urlData.HttpStatusCode = HttpStatusCode.BadRequest;
            urlData.Updated = true;
            urlData.UrlToCrawl = urlToCrawl;
            urlData.OutLinks = new InternetUrlToIndex[0];
            urlData.Data = String.Empty;
            UpdateStatistics(HttpStatusCode.BadRequest, 0);
            lock(crawledUrls.SyncRoot) { crawledUrls.Add(urlData); }
            return;
        }
        pageRequest.UserAgent = globals.UserAgent;
        pageRequest.Timeout = ExponentialBackoff.DefaultBackoff; //page timeout = 30 seconds
        HttpWebResponse pageResponse = null;
        try
        {
            timer.Start();
            pageResponse = (HttpWebResponse)pageRequest.GetResponse();
            //the above line might throw either WebException or UriFormatException
        }
        catch(WebException we)
        {
            HttpWebResponse response = (HttpWebResponse)we.Response;
            if(response != null)
            {
                //although an exception occurred we're able to get the Status Code
                urlData.HttpStatusCode = response.StatusCode;
                urlData.Updated = true;
                urlData.UrlToCrawl = urlToCrawl;
                urlData.Data = String.Empty;
                urlData.OutLinks = new InternetUrlToIndex[0];
                UpdateStatistics(response.StatusCode, response.ContentLength);
            }
            else
            {
                //no response at all was received, so treat the url as a bad request
                urlData.HttpStatusCode = HttpStatusCode.BadRequest;
                urlData.Updated = true;
                urlData.UrlToCrawl = urlToCrawl;
                urlData.Data = String.Empty;
                urlData.OutLinks = new InternetUrlToIndex[0];
                UpdateStatistics(HttpStatusCode.BadRequest, 0);
            }
        }
        catch(UriFormatException)
        {
            //this will occur if the url is not valid
            urlData.HttpStatusCode = HttpStatusCode.BadRequest;
            urlData.Updated = false;
            urlData.Data = String.Empty;
            urlData.UrlToCrawl = urlToCrawl;
            urlData.OutLinks = new InternetUrlToIndex[0];
        }
        finally
        {
            timer.Stop();
            urlData.TimeStamp = DateTime.UtcNow;
        }
        if(pageResponse != null)
        {
            //update the fields
            urlData.HttpStatusCode = pageResponse.StatusCode;
            //download and parse the contents of the url
            Stream receiveStream = pageResponse.GetResponseStream();
            StreamReader receivedBytes = new StreamReader(receiveStream, defaultEncoding);
            string contents = String.Empty;
            try
            {
                contents = receivedBytes.ReadToEnd();
            }
            catch
            {
                //it should be response timeout, not request timeout
                urlData.HttpStatusCode = HttpStatusCode.RequestTimeout;
                urlData.Updated = true;
                urlData.RetrievalTime = (int)timer.Duration;
                urlData.Data = String.Empty;
                urlData.OutLinks = new InternetUrlToIndex[0];
                urlData.UrlToCrawl = urlToCrawl;
                try
                {
                    receivedBytes.Close();
                    receiveStream.Close();
                    pageResponse.Close();
                }
                catch {}
                lock(crawledUrls.SyncRoot) { crawledUrls.Add(urlData); }
                UpdateStatistics(HttpStatusCode.RequestTimeout, 0);
                return;
            }
            byte[] buffer = Encoding.ASCII.GetBytes(contents);
            receiveStream.Close();
            receivedBytes.Close();
            UpdateStatistics(pageResponse.StatusCode, contents.Length);
            string redirectUrl = string.Empty;
            if(pageResponse.ResponseUri.AbsoluteUri != urlToCrawl.Url)
            {
                //the request was redirected; compare against the cleaned-up redirect url
                redirectUrl = pageResponse.ResponseUri.AbsoluteUri;
                urlData.RedirectedPriority = CleanupRedirectUrl(ref redirectUrl);
                if(urlToCrawl.Url != redirectUrl) //this check fixed an earlier redirect-handling bug
                {
                    urlData.Redirected = true;
                    urlToCrawl.Url = redirectUrl;
                }
            }
            Parser parser = SelectParser(pageResponse.ContentType);
            pageResponse.Close();
            long CRC = CompressionUtils.BufferCRC(buffer);
            if(CRC != urlToCrawl.CRC)
            {
                urlData.Updated = true;
                urlToCrawl.CRC = CRC;
            }
            if(urlData.Updated)
            {
                urlData.RetrievalTime = (int)timer.Duration;
                //if redirected, calculate robots, domain & priority for redirect url
                if(urlData.Redirected)
                {
                    urlData.RedirectedFlagDomain = domainFilter.FilterUrl(ref redirectUrl);
                    urlData.RedirectedFlagRobots = robotsFilter.FilterUrl(ref redirectUrl, ref urlToCrawl, RobotsMetaTagValue.NoMeta);
                }
                //perform link extraction and content extraction
                ArrayList outlinks = null;
                try
                {
                    if((parser == htmlParser) || (parser == textParser))
                    {
                        urlData.Data = parser.ExtractContent(ref contents, false);
                        if(urlData.Data == null) { urlData.Data = String.Empty; }
                        outlinks = parser.ExtractLinks(ref contents, ref urlToCrawl);
                        if(outlinks == null) { outlinks = new ArrayList(); }
                    }
                    else
                    {
                        urlData.Data = parser.ExtractContent(buffer, false);
                        if(urlData.Data == null) { urlData.Data = String.Empty; }
                        outlinks = parser.ExtractLinks(buffer, ref urlToCrawl);
                        if(outlinks == null) { outlinks = new ArrayList(); }
                    }
                }
                catch
                {
                    if(outlinks == null) { outlinks = new ArrayList(); }
                }
                urlData.OutLinks = new InternetUrlToIndex[outlinks.Count];
                for(int i = 0; i < outlinks.Count; i++)
                {
                    urlData.OutLinks[i] = (InternetUrlToIndex)outlinks[i];
                }
                //finally update the urlData object with the modified UrlToCrawl
                urlData.UrlToCrawl = urlToCrawl;
            }
        }
        //lock and update CrawledUrls
        lock(crawledUrls.SyncRoot) { crawledUrls.Add(urlData); }
    }
    catch(ThreadAbortException tae)
    {
        //The thread has been asked to abort. Log information and return at once
        if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
        {
            globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has been asked to abort: " + tae.Message);
        }
        return;
    }
    catch(ThreadInterruptedException tie)
    {
        //The thread has been asked to join. Log information and return at once
        if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
        {
            globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has been interrupted: " + tie.Message);
        }
        return;
    }
    catch(Exception ex)
    {
        if(!(ex is ThreadAbortException)) //the ThreadAbortException is rethrown automatically
        {
            if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
            {
                globals.FileLog.LogWarning("CrawlUrl running in " + Thread.CurrentThread.Name + " encountered an unexpected exception: " + ex.ToString());
            }
        }
        throw; //PerformCrawling is expected to catch this
    }
    finally
    {
        GC.Collect();
    }
}
/// <summary>
/// SendResultsToServer runs on a dedicated thread. It periodically attempts to send
/// the data produced from the crawling of urls back to the server by communicating
/// with the CrawlWave.Server web service.
/// </summary>
private void SendResultsToServer()
{
    Interlocked.Increment(ref runningThreads);
    if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
    {
        globals.FileLog.LogInfo("Started Results Thread with ID 0x" + Thread.CurrentThread.GetHashCode().ToString("x4"));
    }
    try
    {
        string FileName = String.Empty;
        UrlCrawlData[] urls = new UrlCrawlData[0];
        SoapFormatter serializer = null;
        MemoryStream ms = null;
        Stream ReadStream = null;
        while(!mustStop)
        {
            try
            {
                FileName = String.Empty;
                lock(resultFileNames.SyncRoot)
                {
                    if(resultFileNames.Count > 0) { FileName = (string)resultFileNames.Dequeue(); }
                }
                if(FileName != String.Empty)
                {
                    ReadStream = null;
                    serializer = null;
                    try
                    {
                        ReadStream = File.Open(FileName, FileMode.Open);
                        serializer = new SoapFormatter();
                        urls = (UrlCrawlData[])serializer.Deserialize(ReadStream);
                    }
                    catch(Exception e)
                    {
                        //deserialization failed; instead of re-queueing the file it is deleted
                        globals.FileLog.LogWarning("SendResults: could not deserialize data from " + FileName + ". The file will be deleted. " + e.ToString());
                        try
                        {
                            ReadStream.Close();
                            serializer = null;
                            File.Delete(FileName);
                        }
                        catch {}
                    }
                    finally
                    {
                        if(ReadStream != null)
                        {
                            try
                            {
                                ReadStream.Close();
                                ReadStream = null;
                            }
                            catch {}
                        }
                    }
                    if(urls.Length > 0)
                    {
                        byte[] buffer = null;
                        //TODO: should this be called asynchronously?
                        //proxy.SecureServer.BeginGetCrawlResults(globals.Client_Info, data, new AsyncCallback(SendResultsToServerCallback), proxy.SecureServer);
                        try
                        {
                            ms = new MemoryStream();
                            serializer.Serialize(ms, urls);
                            buffer = ms.ToArray();
                            ms.Close();
                            SerializedException sx = proxy.SecureServer.GetCrawlResultsRaw(globals.Client_Info, buffer);
                            if(sx != null) { sx.ThrowException(); }
                            lock(resultFileNames.SyncRoot)
                            {
                                try
                                {
                                    File.Delete(FileName);
                                }
                                catch
                                {
                                    resultFileNames.Enqueue(FileName);
                                }
                            }
                            OnResultsSent(EventArgs.Empty);
                        }
                        catch(Exception)
                        {
                            lock(resultFileNames.SyncRoot) { resultFileNames.Enqueue(FileName); }
                            if(ms != null) { ms.Close(); }
                            throw;
                        }
                        finally
                        {
                            for(int i = 0; i < urls.Length; i++) { urls[i] = null; }
                            buffer = null;
                            ms = null;
                        }
                    }
                }
            }
            catch(Exception e)
            {
                if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
                {
                    globals.FileLog.LogWarning("Crawler.SendResultsToServer failed: " + e.ToString());
                }
            }
            serializer = null;
            GC.Collect();
            Thread.Sleep(syncBackOff.Next());
        }
    }
    catch(ThreadAbortException tae)
    {
        //The thread has been asked to abort. Log information and return at once
        if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
        {
            globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has been asked to abort: " + tae.Message);
        }
        return;
    }
    catch(ThreadInterruptedException tie)
    {
        //The thread has been asked to join. Log information and return at once
        if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
        {
            globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has been interrupted: " + tie.Message);
        }
        return;
    }
    catch(Exception ex)
    {
        if(!(ex is ThreadAbortException)) //the ThreadAbortException is rethrown automatically
        {
            if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
            {
                globals.FileLog.LogWarning(Thread.CurrentThread.Name + " encountered an unexpected exception: " + ex.ToString());
            }
        }
    }
    finally
    {
        if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
        {
            globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has finished.");
        }
        GC.Collect();
        Interlocked.Decrement(ref runningThreads);
    }
}
/// <summary>
/// Constructs a new instance of the <see cref="UrlCrawlDataFile"/> class with the
/// provided values.
/// </summary>
/// <param name="info">The <see cref="ClientInfo"/> of the client who returned the data.</param>
/// <param name="data">An array of <see cref="UrlCrawlData"/> objects.</param>
public UrlCrawlDataFile(ClientInfo info, UrlCrawlData[] data)
{
    Info = info;
    Data = data;
}
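//NOTE (illustrative sketch, not part of the original source): since SaveXMLFile serializes
//UrlCrawlDataFile with SoapFormatter, the class must be marked as serializable. A minimal
//sketch of the enclosing declaration, assuming plain public fields (the exact member
//declarations are not shown in this section).
using System;

[Serializable]
public class UrlCrawlDataFile
{
    public ClientInfo Info;      //client that produced the data
    public UrlCrawlData[] Data;  //the crawl results being stored

    public UrlCrawlDataFile(ClientInfo info, UrlCrawlData[] data)
    {
        Info = info;
        Data = data;
    }
}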
/// <summary>
/// Accepts the results that a client returns after crawling a set of Urls and passes
/// them to the server engine by calling StoreCrawlResults.
/// </summary>
/// <param name="ci">The <see cref="ClientInfo"/> of the client returning the data.</param>
/// <param name="data">An array of <see cref="UrlCrawlData"/> objects containing the data of the crawled urls.</param>
/// <returns>Null if the operation succeeds, or a <see cref="SerializedException"/> encapsulating any error that occurs.</returns>
public SerializedException GetCrawlResults(ClientInfo ci, UrlCrawlData[] data)
{
    return engine.StoreCrawlResults(ci, data);
}
/// <summary>
/// Crawls a Url and creates a <see cref="UrlCrawlData"/> object that is stored in
/// the internal crawledUrls <see cref="ArrayList"/>. Since it runs in one of the
/// crawling threads that may be interrupted or aborted at any time it must be able
/// to handle ThreadAbortException and ThreadInterruptedException.
/// </summary>
/// <param name="urlToCrawl">A reference to the <see cref="InternetUrlToCrawl"/>
/// object that encapsulates the url that must be crawled.</param>
internal void CrawlUrl(ref InternetUrlToCrawl urlToCrawl)
{
    try
    {
        UrlCrawlData urlData = new UrlCrawlData();
        HiResTimer timer = new HiResTimer();
        //create the web request and download the data
        HttpWebRequest pageRequest = null;
        try
        {
            pageRequest = (HttpWebRequest)HttpWebRequest.Create(urlToCrawl.Url);
        }
        catch
        {
            //the url could not be turned into a valid request, so flag it as a bad request
            urlData.HttpStatusCode = HttpStatusCode.BadRequest;
            urlData.Updated = true;
            urlData.UrlToCrawl = urlToCrawl;
            urlData.OutLinks = new InternetUrlToIndex[0];
            urlData.Data = String.Empty;
            UpdateStatistics(HttpStatusCode.BadRequest, 0);
            lock(crawledUrls.SyncRoot) { crawledUrls.Add(urlData); }
            return;
        }
        pageRequest.UserAgent = globals.UserAgent;
        pageRequest.Timeout = Backoff.DefaultBackoff; //page timeout = 30 seconds
        pageRequest.KeepAlive = false;
        HttpWebResponse pageResponse = null;
        try
        {
            timer.Start();
            pageResponse = (HttpWebResponse)pageRequest.GetResponse();
            //the above line might throw either WebException or UriFormatException
        }
        catch(WebException we)
        {
            HttpWebResponse response = (HttpWebResponse)we.Response;
            if(response != null)
            {
                //although an exception occurred we're able to get the Status Code
                urlData.HttpStatusCode = response.StatusCode;
                urlData.Updated = true;
                urlData.UrlToCrawl = urlToCrawl;
                urlData.Data = String.Empty;
                urlData.OutLinks = new InternetUrlToIndex[0];
                UpdateStatistics(response.StatusCode, response.ContentLength);
                response.Close();
            }
            else
            {
                //no response at all was received, so treat the url as a bad request
                urlData.HttpStatusCode = HttpStatusCode.BadRequest;
                urlData.Updated = true;
                urlData.UrlToCrawl = urlToCrawl;
                urlData.Data = String.Empty;
                urlData.OutLinks = new InternetUrlToIndex[0];
                UpdateStatistics(HttpStatusCode.BadRequest, 0);
            }
        }
        catch(UriFormatException)
        {
            //this will occur if the url is not valid
            urlData.HttpStatusCode = HttpStatusCode.BadRequest;
            urlData.Updated = false;
            urlData.Data = String.Empty;
            urlData.UrlToCrawl = urlToCrawl;
            urlData.OutLinks = new InternetUrlToIndex[0];
        }
        finally
        {
            timer.Stop();
            urlData.TimeStamp = DateTime.UtcNow;
        }
        if(pageResponse != null)
        {
            //update the fields
            urlData.HttpStatusCode = pageResponse.StatusCode;
            //download and parse the contents of the url
            Stream receiveStream = pageResponse.GetResponseStream();
            /*StreamReader receivedBytes = new StreamReader(receiveStream, defaultEncoding);*/
            MemoryStream receivedBytes = new MemoryStream();
            byte[] buffer = new byte[4096];
            int read = 0;
            try
            {
                while((read = receiveStream.Read(buffer, 0, 4096)) > 0)
                {
                    receivedBytes.Write(buffer, 0, read);
                }
            }
            catch
            {
                //it should be response timeout, not request timeout
                urlData.HttpStatusCode = HttpStatusCode.RequestTimeout;
                urlData.Updated = true;
                urlData.RetrievalTime = (int)timer.Duration;
                urlData.Data = String.Empty;
                urlData.OutLinks = new InternetUrlToIndex[0];
                urlData.UrlToCrawl = urlToCrawl;
                try
                {
                    receivedBytes.Close();
                    receiveStream.Close();
                    pageResponse.Close();
                }
                catch {}
                lock(crawledUrls.SyncRoot) { crawledUrls.Add(urlData); }
                UpdateStatistics(HttpStatusCode.RequestTimeout, 0);
                return;
            }
            buffer = receivedBytes.ToArray();
            Parser parser = SelectParser(pageResponse.ContentType);
            string contents = String.Empty;
            if(parser == htmlParser)
            {
                Encoding encoding = null;
                switch(pageResponse.ContentEncoding)
                {
                    case "":
                    case "none":
                        contents = this.defaultEncoding.GetString(buffer, 0, buffer.Length);
                        //re-check the encoding
                        encoding = DetectContentEncoding(ref contents);
                        if(encoding != defaultEncoding)
                        {
                            contents = encoding.GetString(buffer, 0, buffer.Length);
                        }
                        break;
                    case "gzip":
                    case "x-gzip":
                        //first decompress the stream and then re-check the encoding
                        byte[] decompressed_buffer = new byte[0];
                        DecompressGzippedContent(buffer, out decompressed_buffer);
                        contents = this.defaultEncoding.GetString(decompressed_buffer, 0, decompressed_buffer.Length);
                        //re-check the encoding
                        encoding = DetectContentEncoding(ref contents);
                        if(encoding != defaultEncoding)
                        {
                            contents = encoding.GetString(decompressed_buffer, 0, decompressed_buffer.Length);
                        }
                        break;
                    default:
                        try
                        {
                            encoding = Encoding.GetEncoding(pageResponse.ContentEncoding);
                            contents = encoding.GetString(buffer, 0, buffer.Length);
                        }
                        catch //(NotSupportedException)
                        {
                            encoding = defaultEncoding; //the encoding specified is unsupported
                            contents = String.Empty;
                        }
                        break;
                }
            }
            else
            {
                if(parser == textParser)
                {
                    try
                    {
                        contents = this.defaultEncoding.GetString(buffer, 0, buffer.Length);
                    }
                    catch
                    {
                        //something went seriously wrong here. The crawler got a header that says the server is
                        //sending back a plain text document but for some reason we can't get the string contents.
                        contents = String.Empty;
                    }
                }
            }
            receiveStream.Close();
            receivedBytes.Close();
            UpdateStatistics(pageResponse.StatusCode, buffer.Length);
            string redirectUrl = string.Empty;
            if(pageResponse.ResponseUri.AbsoluteUri != urlToCrawl.Url)
            {
                redirectUrl = pageResponse.ResponseUri.AbsoluteUri;
                urlData.RedirectedPriority = htmlParser.CleanUrlParams(ref redirectUrl); //CleanupRedirectUrl(ref redirectUrl);
                if(urlToCrawl.Url != redirectUrl) //this check fixed an earlier redirect-handling bug
                {
                    urlData.Redirected = true;
                    urlToCrawl.Url = redirectUrl;
                }
            }
            pageResponse.Close();
            long CRC = CompressionUtils.BufferCRC(buffer);
            if(CRC != urlToCrawl.CRC)
            {
                urlData.Updated = true;
                urlToCrawl.CRC = CRC;
            }
            if(urlData.Updated)
            {
                urlData.RetrievalTime = (int)timer.Duration;
                //if redirected, calculate robots, domain & priority for redirect url
                if(urlData.Redirected)
                {
                    urlData.RedirectedFlagDomain = domainFilter.FilterUrl(ref redirectUrl);
                    urlData.RedirectedFlagRobots = robotsFilter.FilterUrl(redirectUrl, urlToCrawl, RobotsMetaTagValue.NoMeta);
                }
                //perform link extraction and content extraction
                ArrayList outlinks = null;
                try
                {
                    if((parser == htmlParser) || (parser == textParser))
                    {
                        string clean = parser.ExtractContent(ref contents, false);
                        if(clean.Length > 1048576) { clean = clean.Substring(0, 1048576); }
                        urlData.Data = InternetUtils.Base64Encode(clean);
                        if(urlData.Data == null) { urlData.Data = String.Empty; }
                        outlinks = parser.ExtractLinks(ref contents, ref urlToCrawl);
                        if(outlinks == null) { outlinks = new ArrayList(); }
                    }
                    else
                    {
                        contents = parser.ExtractContent(buffer, false);
                        if(contents.Length > 1048576) { contents = contents.Substring(0, 1048576); }
                        urlData.Data = InternetUtils.Base64Encode(contents);
                        if(urlData.Data == null) { urlData.Data = String.Empty; }
                        if(parser == pdfParser)
                        {
                            outlinks = textParser.ExtractLinks(ref contents, ref urlToCrawl);
                        }
                        else
                        {
                            outlinks = htmlParser.ExtractLinks(ref contents, ref urlToCrawl);
                        }
                        if(outlinks == null) { outlinks = new ArrayList(); }
                    }
                }
                catch
                {
                    if(outlinks == null) { outlinks = new ArrayList(); }
                }
                urlData.OutLinks = new InternetUrlToIndex[outlinks.Count];
                for(int i = 0; i < outlinks.Count; i++)
                {
                    urlData.OutLinks[i] = (InternetUrlToIndex)outlinks[i];
                }
                //finally update the urlData object with the modified UrlToCrawl
                urlData.UrlToCrawl = urlToCrawl;
            }
        }
        //lock and update CrawledUrls
        lock(crawledUrls.SyncRoot) { crawledUrls.Add(urlData); }
    }
    catch(ThreadAbortException tae)
    {
        //The thread has been asked to abort. Log information and return at once
        if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
        {
            globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has been asked to abort: " + tae.Message);
        }
        return;
    }
    catch(ThreadInterruptedException tie)
    {
        //The thread has been asked to join. Log information and return at once
        if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
        {
            globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has been interrupted: " + tie.Message);
        }
        return;
    }
    catch(Exception ex)
    {
        if(!(ex is ThreadAbortException)) //the ThreadAbortException is rethrown automatically
        {
            if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
            {
                globals.FileLog.LogWarning("CrawlUrl running in " + Thread.CurrentThread.Name + " encountered an unexpected exception: " + ex.ToString());
            }
        }
        throw; //PerformCrawling is expected to catch this
    }
    finally
    {
        GC.Collect();
    }
}
/// <summary>
/// Acts as an asynchronous callback for the SendResultsToServer method that sends
/// the crawled urls data to the server by calling GetCrawlResults.
/// </summary>
/// <param name="result">The result of the asynchronous call.</param>
/*private void SendResultsToServerCallback(IAsyncResult result)
{
    try
    {
        CrawlWaveServer server = (CrawlWaveServer)result.AsyncState;
        SerializedException sx = server.EndGetCrawlResults(result);
        if(sx != null) { sx.ThrowException(); }
        lock(resultFileNames.SyncRoot)
        {
            string fileName = (string)resultFileNames.Dequeue();
            try
            {
                File.Delete(fileName);
            }
            catch
            {
                resultFileNames.Enqueue(fileName);
            }
        }
    }
    catch(Exception e)
    {
        if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
        {
            globals.FileLog.LogWarning("Crawler.SendResultsToServerCallback: " + e.ToString());
        }
    }
}*/

/// <summary>
/// Sends the crawl results to the server in synchronous mode, reading one data
/// file at a time.
/// </summary>
private void SendResultsSynchronously()
{
    if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
    {
        globals.FileLog.LogInfo("SendResultsSynchronously is attempting to send results to server.");
    }
    try
    {
        string FileName = String.Empty;
        UrlCrawlData[] urls = new UrlCrawlData[0];
        //SoapFormatter serializer = null;
        BinaryFormatter serializer = null;
        MemoryStream ms = null;
        Stream ReadStream = null;
        lock(resultFileNames.SyncRoot)
        {
            if(resultFileNames.Count > 0) { FileName = (string)resultFileNames.Dequeue(); }
        }
        if(FileName != String.Empty)
        {
            ReadStream = null;
            serializer = null;
            try
            {
                ReadStream = File.Open(FileName, FileMode.Open);
                //serializer = new SoapFormatter();
                serializer = new BinaryFormatter();
                urls = (UrlCrawlData[])serializer.Deserialize(ReadStream);
            }
            catch(Exception e)
            {
                //something went wrong during deserialization
                globals.FileLog.LogWarning("SendResultsSynchronously: could not deserialize data from " + FileName + ". The file will be deleted. " + e.ToString());
                //the file must be deleted
                try
                {
                    ReadStream.Close();
                    serializer = null;
                    File.Delete(FileName);
                }
                catch {}
            }
            finally
            {
                if(ReadStream != null)
                {
                    try
                    {
                        ReadStream.Close();
                        ReadStream = null;
                    }
                    catch {}
                }
            }
            if(urls.Length > 0)
            {
                byte[] buffer = null;
                //TODO: should this be called asynchronously?
                //proxy.SecureServer.BeginGetCrawlResults(globals.Client_Info, data, new AsyncCallback(SendResultsToServerCallback), proxy.SecureServer);
                try
                {
                    ms = new MemoryStream();
                    serializer.Serialize(ms, urls);
                    buffer = ms.ToArray();
                    ms.Close();
                    //proxy.Timeout = 600000;
                    //SerializedException sx = proxy.GetCrawlResultsRaw(globals.Client_Info, buffer);
                    SerializedException sx = proxy.GetCrawlResults(globals.Client_Info, urls);
                    if(sx != null) { sx.ThrowException(); }
                    lock(resultFileNames.SyncRoot)
                    {
                        try
                        {
                            File.Delete(FileName);
                        }
                        catch
                        {
                            resultFileNames.Enqueue(FileName);
                        }
                    }
                    OnResultsSent(EventArgs.Empty);
                }
                catch(Exception)
                {
                    lock(resultFileNames.SyncRoot) { resultFileNames.Enqueue(FileName); }
                    if(ms != null) { ms.Close(); }
                    throw;
                }
                finally
                {
                    //proxy.Timeout = 100000;
                    for(int i = 0; i < urls.Length; i++) { urls[i] = null; }
                    buffer = null;
                    ms = null;
                    serializer = null;
                }
            }
        }
    }
    catch(Exception e)
    {
        if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
        {
            globals.FileLog.LogWarning("Crawler.SendResultsSynchronously failed: " + e.ToString());
        }
    }
    finally
    {
        GC.Collect();
    }
}
/// <summary>
/// Updates the Url and the Url Data tables.
/// </summary>
/// <param name="data">The UrlCrawlData containing the data of the crawled Url.</param>
/// <param name="transaction">The currently active <see cref="SqlTransaction"/>.</param>
/// <returns>The ID of the updated url, or 0 if something goes wrong.</returns>
private int UpdateUrl(UrlCrawlData data, SqlTransaction transaction)
{
    int retVal = 0;
    try
    {
        //build the Sql Command for updating the url table
        SqlCommand urlcmd = new SqlCommand("cw_update_url", dbcon, transaction);
        urlcmd.CommandType = CommandType.StoredProcedure;
        urlcmd.CommandTimeout = settings.DBActionTimeout;
        urlcmd.Parameters.Add("@url_id", SqlDbType.Int);
        urlcmd.Parameters.Add("@url", SqlDbType.NVarChar, 500);
        urlcmd.Parameters.Add("@url_md5", SqlDbType.UniqueIdentifier);
        urlcmd.Parameters.Add("@url_host_id", SqlDbType.UniqueIdentifier);
        urlcmd.Parameters.Add("@url_priority", SqlDbType.TinyInt);
        urlcmd.Parameters.Add("@crc", SqlDbType.BigInt);
        urlcmd.Parameters.Add("@flag_domain", SqlDbType.TinyInt);
        urlcmd.Parameters.Add("@flag_robots", SqlDbType.TinyInt);
        urlcmd.Parameters.Add("@flag_updated", SqlDbType.TinyInt);
        urlcmd.Parameters.Add("@last_visited", SqlDbType.SmallDateTime);
        urlcmd.Parameters.Add("@flag_redirected", SqlDbType.TinyInt);
        urlcmd.Parameters.Add("@id", SqlDbType.Int);
        urlcmd.Parameters["@id"].Direction = ParameterDirection.Output;
        //Build the SQL Command for updating the hosts table
        SqlCommand hostcmd = new SqlCommand("cw_insert_host", dbcon, transaction);
        hostcmd.CommandType = CommandType.StoredProcedure;
        hostcmd.CommandTimeout = settings.DBActionTimeout;
        hostcmd.Parameters.Add("@host_id", SqlDbType.UniqueIdentifier);
        hostcmd.Parameters.Add("@host_name", SqlDbType.NVarChar, 100);
        //set their parameters
        urlcmd.Parameters[0].Value = data.ID;
        urlcmd.Parameters[1].Value = data.Url;
        urlcmd.Parameters[2].Value = new Guid(data.MD5);
        Uri uri = new Uri(data.Url);
        string host_name = uri.Host;
        Guid host_id = new Guid(MD5Hash.md5(host_name));
        urlcmd.Parameters[3].Value = host_id;
        urlcmd.Parameters[5].Value = data.CRC;
        if(data.Redirected)
        {
            //we must first attempt to insert the host, otherwise the urlcmd will fail
            hostcmd.Parameters[0].Value = host_id;
            hostcmd.Parameters[1].Value = host_name;
            try
            {
                hostcmd.ExecuteNonQuery();
            }
            catch
            {
                //it probably exists already
            }
            urlcmd.Parameters[4].Value = (byte)data.RedirectedPriority;
            urlcmd.Parameters[6].Value = (byte)data.RedirectedFlagDomain;
            urlcmd.Parameters[7].Value = (data.RedirectedFlagRobots) ? 1 : 0;
            urlcmd.Parameters[8].Value = (data.Updated) ? 1 : 0;
            urlcmd.Parameters[9].Value = data.TimeStamp;
            urlcmd.Parameters[10].Value = 1;
        }
        else
        {
            urlcmd.Parameters[4].Value = DBNull.Value;
            urlcmd.Parameters[6].Value = (byte)data.UrlToCrawl.FlagDomain;
            if(data.FlagFetchRobots)
            {
                urlcmd.Parameters[7].Value = (data.RedirectedFlagRobots) ? 1 : 0;
            }
            else
            {
                urlcmd.Parameters[7].Value = 0;
            }
            urlcmd.Parameters[8].Value = (data.Updated) ? 1 : 0;
            urlcmd.Parameters[9].Value = data.TimeStamp;
            urlcmd.Parameters[10].Value = 0;
        }
        //make sure the host command is disposed
        hostcmd.Dispose();
        urlcmd.ExecuteNonQuery();
        retVal = (int)urlcmd.Parameters["@id"].Value;
        urlcmd.Dispose();
        if(data.Updated)
        {
            //if necessary build the sql command for updating the url data tables
            SqlCommand urldatacmd = new SqlCommand("cw_update_url_data", dbcon, transaction);
            urldatacmd.CommandType = CommandType.StoredProcedure;
            urldatacmd.CommandTimeout = settings.DBActionTimeout;
            urldatacmd.Parameters.Add("@url_id", SqlDbType.Int);
            urldatacmd.Parameters.Add("@data", SqlDbType.Image);
            urldatacmd.Parameters.Add("@length", SqlDbType.Int);
            urldatacmd.Parameters.Add("@original_length", SqlDbType.Int);
            urldatacmd.Parameters.Add("@http_code", SqlDbType.SmallInt);
            urldatacmd.Parameters.Add("@retrieval_time", SqlDbType.Int);
            urldatacmd.Parameters[0].Value = retVal;
            //compress the url's data
            if(data.Data != String.Empty)
            {
                byte[] compressed = null;
                string urldata = InternetUtils.Base64Decode(data.Data);
                CompressionUtils.CompressString(ref urldata, out compressed);
                urldatacmd.Parameters[1].Value = compressed;
                urldatacmd.Parameters[2].Value = compressed.Length;
                urldatacmd.Parameters[3].Value = data.Data.Length;
            }
            else
            {
                urldatacmd.Parameters[1].Value = new byte[0];
                urldatacmd.Parameters[2].Value = 0;
                urldatacmd.Parameters[3].Value = 0;
            }
            urldatacmd.Parameters[4].Value = (short)data.HttpStatusCode;
            urldatacmd.Parameters[5].Value = data.RetrievalTime;
            urldatacmd.ExecuteNonQuery();
            urldatacmd.Dispose();
        }
    }
    catch(Exception e)
    {
        AddToReportQueue(CWLoggerEntryType.Warning, "DBUpdater failed to update a Url in the database: " + e.ToString());
        retVal = 0;
    }
    return retVal;
}
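//NOTE (illustrative sketch, not part of the original source): the document contents travel
//Base64-encoded from the client (CrawlUrl encodes the extracted text with
//InternetUtils.Base64Encode) and are decoded and compressed here before being stored in the
//@data column. The round trip below uses only helpers that appear in this section; the
//sample string and the local variable names are made up.
string extracted = "<title>Example</title> Example page text";

//Client side (CrawlUrl): cap the extracted content at 1 MB, then Base64-encode it.
if(extracted.Length > 1048576) { extracted = extracted.Substring(0, 1048576); }
string wireData = InternetUtils.Base64Encode(extracted);

//Server side (UpdateUrl): decode, compress and store the compressed bytes,
//recording both the compressed and the original (Base64) lengths.
byte[] compressedData = null;
string decoded = InternetUtils.Base64Decode(wireData);
CompressionUtils.CompressString(ref decoded, out compressedData);
int storedLength = compressedData.Length;  //value passed as @length
int originalLength = wireData.Length;      //value passed as @original_length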
/// <summary>
/// Inserts the links contained in a url into the database and updates the link graph.
/// </summary>
/// <param name="UrlID">The ID of the url.</param>
/// <param name="data">The <see cref="UrlCrawlData"/> of the url.</param>
/// <param name="transaction">The currently active <see cref="SqlTransaction"/>.</param>
private void InsertUrlOutLinks(int UrlID, UrlCrawlData data, SqlTransaction transaction)
{
    try
    {
        //Build the SQL Commands
        SqlCommand hostcmd = new SqlCommand("cw_insert_host", dbcon, transaction);
        hostcmd.CommandType = CommandType.StoredProcedure;
        hostcmd.CommandTimeout = settings.DBActionTimeout;
        hostcmd.Parameters.Add("@host_id", SqlDbType.UniqueIdentifier);
        hostcmd.Parameters.Add("@host_name", SqlDbType.NVarChar, 100);
        SqlCommand urlcmd = new SqlCommand("cw_insert_url", dbcon, transaction);
        urlcmd.CommandType = CommandType.StoredProcedure;
        urlcmd.CommandTimeout = settings.DBActionTimeout;
        urlcmd.Parameters.Add("@url", SqlDbType.NVarChar, 500);
        urlcmd.Parameters.Add("@url_md5", SqlDbType.UniqueIdentifier);
        urlcmd.Parameters.Add("@url_host_id", SqlDbType.UniqueIdentifier);
        urlcmd.Parameters.Add("@url_priority", SqlDbType.TinyInt);
        urlcmd.Parameters.Add("@flag_domain", SqlDbType.TinyInt);
        urlcmd.Parameters.Add("@flag_robots", SqlDbType.TinyInt);
        urlcmd.Parameters.Add("@id", SqlDbType.Int);
        urlcmd.Parameters["@id"].Direction = ParameterDirection.Output;
        SqlCommand linkcmd = new SqlCommand("cw_insert_link_graph", dbcon, transaction);
        linkcmd.CommandType = CommandType.StoredProcedure;
        linkcmd.CommandTimeout = settings.DBActionTimeout;
        linkcmd.Parameters.Add("@from_url_id", SqlDbType.Int);
        linkcmd.Parameters.Add("@to_url_id", SqlDbType.Int);
        int new_id = 0;
        //insert each out link in the database
        foreach(InternetUrlToIndex url in data.OutLinks)
        {
            try
            {
                Uri uri = new Uri(url.Url);
                Guid host_id = new Guid(MD5Hash.md5(uri.Host));
                hostcmd.Parameters[0].Value = host_id;
                hostcmd.Parameters[1].Value = uri.Host;
                hostcmd.ExecuteNonQuery();
                urlcmd.Parameters[0].Value = url.Url;
                urlcmd.Parameters[1].Value = new Guid(url.MD5);
                urlcmd.Parameters[2].Value = host_id;
                urlcmd.Parameters[3].Value = (byte)url.Priority;
                urlcmd.Parameters[4].Value = (byte)url.FlagDomain;
                urlcmd.Parameters[5].Value = (byte)((url.FlagRobots) ? 1 : 0);
                urlcmd.ExecuteNonQuery();
                new_id = (int)urlcmd.Parameters["@id"].Value; //(int)urlcmd.ExecuteScalar();
                linkcmd.Parameters[0].Value = UrlID;
                linkcmd.Parameters[1].Value = new_id;
                linkcmd.ExecuteNonQuery();
            }
            catch(Exception e)
            {
                AddToReportQueue(CWLoggerEntryType.Warning, "DBUpdater plugin failed to insert an edge to the link graph: " + e.ToString());
                continue;
            }
        }
        hostcmd.Dispose(); //dispose the host command along with the others
        urlcmd.Dispose();
        linkcmd.Dispose();
    }
    catch(Exception e)
    {
        AddToReportQueue(CWLoggerEntryType.Warning, "DBUpdater plugin: an unexpected error occurred when inserting the out links of url with ID " + UrlID.ToString() + " to the link graph: " + e.ToString());
        GC.Collect();
    }
}
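//NOTE (illustrative sketch, not part of the original source): UpdateUrl and InsertUrlOutLinks
//both take an open SqlTransaction, so the caller is expected to wrap the per-url work in one
//transaction. A minimal sketch of the assumed calling pattern; the method name
//ProcessUrlCrawlData and the Updated guard around the out-link insertion are assumptions.
private void ProcessUrlCrawlData(UrlCrawlData data)
{
    SqlTransaction transaction = dbcon.BeginTransaction();
    try
    {
        int urlID = UpdateUrl(data, transaction);
        if(urlID == 0)
        {
            throw new Exception("cw_update_url did not return a valid url id.");
        }
        if(data.Updated)
        {
            //assumption: only urls whose content changed carry fresh out-links
            InsertUrlOutLinks(urlID, data, transaction);
        }
        transaction.Commit();
    }
    catch(Exception e)
    {
        transaction.Rollback();
        AddToReportQueue(CWLoggerEntryType.Warning, "DBUpdater failed to process a crawled url: " + e.ToString());
    }
    finally
    {
        transaction.Dispose();
    }
}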