/// <summary>
 /// Extracts the words found in the contents of a document. Used by DBUpdater when
 /// a document is stored in the database, so that the words it contains can be
 /// extracted and added to the database at the same time.
 /// </summary>
 /// <param name="data">The <see cref="UrlCrawlData"/> to be processed.</param>
 public void ExtractWords(ref UrlCrawlData data)
 {
     //First try to extract the words from the document. If something goes wrong just
     //return, otherwise add the words to the cache, remove any old words related to
     //the url with this id from the database and store the new url-words.
     try
     {
         SortedList words = wordExtractor.ExtractWords(data.Data);
         if(words.Count == 0)
         {
             return;
         }
         //add all the words to the database if they don't exist already
         string word = String.Empty;
         short word_count = 0;
         int word_id = -1;
         foreach(DictionaryEntry de in words)
         {
             word = (string)de.Key;
             cache.AddStemmedWord(word);
         }
         //remove all the old words related to this url from the database
         RemoveUrlWords(data.ID);
         //now add relationships between the url and its words
         foreach(DictionaryEntry d in words)
         {
             word = (string)d.Key;
             word_count = (short)d.Value;
             word_id = cache[word];
             AddUrlWord(data.ID, word_id, word_count);
         }
         UpdateUrlDataLastProcess(data.ID);
     }
     catch(Exception e)
     {
         events.Enqueue(new EventLoggerEntry(CWLoggerEntryType.Warning, DateTime.Now, "WordExtractionPlugin failed to extract words from Url with ID " + data.ID.ToString() + ": " + e.ToString()));
     }
 }
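
A minimal usage sketch, not part of the original source: how a host such as DBUpdater might feed stored documents to the plugin. The plugin construction and the crawledData collection are assumptions made for illustration; only the ExtractWords call itself comes from the code above.

 //hypothetical host-side loop; WordExtractionPlugin and crawledData are illustrative names
 WordExtractionPlugin plugin = new WordExtractionPlugin();
 foreach(UrlCrawlData item in crawledData)
 {
     UrlCrawlData data = item; //a foreach variable cannot be passed by ref, so copy the reference
     plugin.ExtractWords(ref data);
 }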
Example #2
 /// <summary>
 /// Stores an array of <see cref="UrlCrawlData"/> objects and the <see cref="ClientInfo"/>
 /// of the client who returned them in a compressed file on disk.
 /// </summary>
 /// <param name="info">The <see cref="ClientInfo"/> of the client who returned the data.</param>
 /// <param name="data">An array of <see cref="UrlCrawlData"/> objects containing the
 /// data returned by the client.</param>
 private void SaveXMLFile(ClientInfo info, UrlCrawlData[] data)
 {
     UrlCrawlDataFile udf = new UrlCrawlDataFile(info, data);
     string id = Guid.NewGuid().ToString();
     //serialize the object into a memory stream
     MemoryStream ms = new MemoryStream();
      //a SoapFormatter is used here; an XmlSerializer(typeof(UrlCrawlDataFile)) could be used instead
     SoapFormatter xml = new SoapFormatter();
     xml.Serialize(ms, udf);
     byte[] buffer = ms.ToArray();
     ms.Close();
     string fileName = settings.DataFilesPath + id + ".zip";
     Crc32 crc = new Crc32();
     ZipOutputStream zs = new ZipOutputStream(File.Create(fileName));
     ZipEntry entry = new ZipEntry(id);
     entry.DateTime = DateTime.Now;
     entry.Size = buffer.Length;
     crc.Update(buffer);
     entry.Crc = crc.Value;
     zs.PutNextEntry(entry);
     zs.Write(buffer, 0, buffer.Length);
     zs.Finish();
     zs.Close();
 }
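
A sketch of the reverse operation, assuming the same SharpZipLib and SoapFormatter types used above: open one of the files written by SaveXMLFile and recover the UrlCrawlDataFile. The LoadXMLFile name is hypothetical and error handling is omitted.

 using ICSharpCode.SharpZipLib.Zip;
 using System.IO;
 using System.Runtime.Serialization.Formatters.Soap;

 private UrlCrawlDataFile LoadXMLFile(string fileName)
 {
     using(ZipInputStream zs = new ZipInputStream(File.OpenRead(fileName)))
     {
         zs.GetNextEntry(); //the archive contains a single entry
         MemoryStream ms = new MemoryStream();
         byte[] buffer = new byte[4096];
         int read = 0;
         while((read = zs.Read(buffer, 0, buffer.Length)) > 0)
         {
             ms.Write(buffer, 0, read);
         }
         ms.Position = 0;
         SoapFormatter xml = new SoapFormatter();
         return (UrlCrawlDataFile)xml.Deserialize(ms);
     }
 }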
Example #3
 /// <summary>
 /// Stores the results that the clients return after crawling a set of Urls.
 /// </summary>
 /// <param name="ci">The <see cref="ClientInfo"/> of the client returning the data.</param>
 /// <param name="data">An array of <see cref="UrlCrawlData"/> objects containing the data of the crawled urls.</param>
 /// <returns>Null if the operation succeeds, or a <see cref="SerializedException"/> 
 /// encapsulating the error that occurred if the operation fails.</returns>
 public SerializedException StoreCrawlResults(ClientInfo ci, UrlCrawlData[] data)
 {
     SerializedException sx = null;
     try
     {
         if (!ConnectToDatabase())
         {
             throw new CWDBConnectionFailedException();
         }
         try
         {
             //store the new robots.txt files in the database, nothing else needs to
             //be done since the urls will be marked as not assigned when their data
             //is processed by DBUpdater
             if ((data != null) && (data.Length > 0))
             {
                 SqlCommand cmd = new SqlCommand("cw_update_or_insert_robot", dbcon);
                 cmd.CommandType = CommandType.StoredProcedure;
                 cmd.Parameters.Add("@host_id", SqlDbType.UniqueIdentifier);
                 cmd.Parameters.Add("@disallowed", SqlDbType.NVarChar, 1000);
                 foreach (UrlCrawlData urlData in data)
                 {
                     if ((urlData.FlagFetchRobots) || (urlData.Redirected))
                     {
                         string url = urlData.Url;
                         cmd.Parameters[0].Value = new Guid(MD5Hash.md5(InternetUtils.HostName(url)));
                         cmd.Parameters[1].Value = urlData.RobotsDisallowedPaths;
                         try
                         {
                             cmd.ExecuteNonQuery();
                         }
                         catch
                         {
                             continue;
                         }
                     }
                 }
                 cmd.Dispose();
                 SqlCommand statscmd = new SqlCommand("cw_update_client_statistics", dbcon);
                 statscmd.CommandType = CommandType.StoredProcedure;
                 statscmd.Parameters.Add("@client_id", SqlDbType.UniqueIdentifier);
                 statscmd.Parameters.Add("@assigned", SqlDbType.BigInt);
                 statscmd.Parameters.Add("@returned", SqlDbType.BigInt);
                 statscmd.Parameters.Add("@type", SqlDbType.TinyInt);
                 statscmd.Parameters[0].Value = ci.ClientID;
                 statscmd.Parameters[1].Value = DBNull.Value;
                 statscmd.Parameters[2].Value = data.Length;
                 statscmd.Parameters[3].Value = 1;
                 statscmd.ExecuteNonQuery();
                 statscmd.Dispose();
             }
         }
         catch (Exception ex)
         {
             if (settings.LogLevel <= CWLogLevel.LogWarning)
             {
                 settings.Log.LogWarning("StoreCrawlResults failed: " + ex.ToString());
             }
              throw; //rethrow, preserving the original stack trace
         }
         finally
         {
             //save xml file on disk
             try
             {
                 SaveXMLFile(ci, data);
             }
             catch (Exception se)
             {
                 sx = new SerializedException(se.GetType().ToString(), se.Message, se.ToString());
                 if (settings.LogLevel <= CWLogLevel.LogWarning)
                 {
                     settings.Log.LogWarning("StoreCrawlResults failed to save XML data on disk: " + se.ToString());
                 }
             }
         }
         if (!DisconnectFromDatabase())
         {
             throw new CWDBConnectionFailedException("Disconnect from database failure.");
         }
     }
     catch (Exception e)
     {
         sx = new SerializedException(e.GetType().ToString(), e.Message, e.ToString());
     }
     finally
     {
         UpdateClientLastActive(ci);
         LogClientAction(ci, CWClientActions.LogGetCrawlResults);
     }
     return sx;
 }
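
A short caller-side sketch, illustrative only: the web service forwards a client's results to this method and the caller unwraps any returned error, just as the GetCrawlResults wrapper further down delegates to it.

 SerializedException sx = engine.StoreCrawlResults(ci, data);
 if(sx != null)
 {
     sx.ThrowException(); //re-create and throw the server-side exception locally
 }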
Example #4
        /// <summary>
        /// Crawls a Url and creates a <see cref="UrlCrawlData"/> object that is stored in
        /// the internal crawledUrls <see cref="ArrayList"/>. Since it runs in one of the
        /// crawling threads that may be interrupted or aborted at any time it must be able
        /// to handle ThreadAbortException and ThreadInterruptedException.
        /// </summary>
        /// <param name="urlToCrawl">A reference to the <see cref="InternetUrlToCrawl"/>
        /// object that encapsulates the url that must be crawled.</param>
        private void CrawlUrl(ref InternetUrlToCrawl urlToCrawl)
        {
            try
            {
                UrlCrawlData urlData = new UrlCrawlData();
                HiResTimer timer = new HiResTimer();

                //create the web request and download the data
                HttpWebRequest pageRequest = null;
                try
                {
                    pageRequest = (HttpWebRequest)HttpWebRequest.Create(urlToCrawl.Url);
                }
                catch
                {
                    urlData.HttpStatusCode=HttpStatusCode.BadRequest;//the request could not be created, so flag the url as a bad request
                    urlData.Updated=true;
                    urlData.UrlToCrawl = urlToCrawl;
                    urlData.OutLinks = new InternetUrlToIndex[0];
                    urlData.Data = String.Empty;
                    UpdateStatistics(HttpStatusCode.BadRequest, 0);
                    lock(crawledUrls.SyncRoot)
                    {
                        crawledUrls.Add(urlData);
                    }
                    return;
                }
                pageRequest.UserAgent = globals.UserAgent;
                pageRequest.Timeout=ExponentialBackoff.DefaultBackoff; //page timeout = 30 seconds
                HttpWebResponse pageResponse=null;
                try
                {
                    timer.Start();
                    pageResponse = (HttpWebResponse)pageRequest.GetResponse();
                    //the above line might throw either WebException or UriFormatException
                }
                catch(WebException we)
                {
                    HttpWebResponse response=(HttpWebResponse)we.Response;
                    if (response!=null)
                    {
                        //although an exception occurred we're able to get the Status Code
                        urlData.HttpStatusCode=response.StatusCode;
                        urlData.Updated=true;
                        urlData.UrlToCrawl = urlToCrawl;
                        urlData.Data = String.Empty;
                        urlData.OutLinks = new InternetUrlToIndex[0];
                        UpdateStatistics(response.StatusCode, response.ContentLength);
                    }
                    else
                    {
                        urlData.HttpStatusCode=HttpStatusCode.BadRequest;//no response is available, so flag the url as a bad request
                        urlData.Updated=true;
                        urlData.UrlToCrawl = urlToCrawl;
                        urlData.Data = String.Empty;
                        urlData.OutLinks = new InternetUrlToIndex[0];
                        UpdateStatistics(HttpStatusCode.BadRequest, 0);
                    }
                }
                catch(UriFormatException)
                {
                    //this will occur if the url is not valid
                    urlData.HttpStatusCode=HttpStatusCode.BadRequest;
                    urlData.Updated=false;
                    urlData.Data = String.Empty;
                    urlData.UrlToCrawl = urlToCrawl;
                    urlData.OutLinks = new InternetUrlToIndex[0];
                }
                finally
                {
                    timer.Stop();
                    urlData.TimeStamp = DateTime.UtcNow;
                }
                if(pageResponse !=null)
                {
                    //update the fields
                    urlData.HttpStatusCode = pageResponse.StatusCode;
                    //download and parse the contents of the url
                    Stream receiveStream=pageResponse.GetResponseStream();
                    StreamReader receivedBytes=new StreamReader(receiveStream,defaultEncoding);
                    string contents = String.Empty;
                    try
                    {
                        contents=receivedBytes.ReadToEnd();
                    }
                    catch
                    {
                        //it should be response timeout not request timeout
                        urlData.HttpStatusCode = HttpStatusCode.RequestTimeout;
                        urlData.Updated = true;
                        urlData.RetrievalTime = (int)timer.Duration;
                        urlData.Data = String.Empty;
                        urlData.OutLinks = new InternetUrlToIndex[0];
                        urlData.UrlToCrawl = urlToCrawl;
                        try
                        {
                            receivedBytes.Close();
                            receiveStream.Close();
                            pageResponse.Close();
                        }
                        catch
                        {}
                        lock(crawledUrls.SyncRoot)
                        {
                            crawledUrls.Add(urlData);
                        }
                        UpdateStatistics(HttpStatusCode.RequestTimeout, 0);
                        return;
                    }
                    byte []buffer=Encoding.ASCII.GetBytes(contents);
                    receiveStream.Close();
                    receivedBytes.Close();
                    UpdateStatistics(pageResponse.StatusCode, contents.Length);
                    string redirectUrl = string.Empty;
                    if (pageResponse.ResponseUri.AbsoluteUri!=urlToCrawl.Url)
                    {//now that was a bloody BUGBUG
                        redirectUrl = pageResponse.ResponseUri.AbsoluteUri;
                        urlData.RedirectedPriority = CleanupRedirectUrl(ref redirectUrl);
                        if(urlToCrawl.Url != redirectUrl)
                        {
                            urlData.Redirected=true;
                            urlToCrawl.Url=redirectUrl;
                        }
                    }
                    Parser parser = SelectParser(pageResponse.ContentType);
                    pageResponse.Close();
                    long CRC = CompressionUtils.BufferCRC(buffer);
                    if(CRC != urlToCrawl.CRC)
                    {
                        urlData.Updated = true;
                        urlToCrawl.CRC = CRC;
                    }
                    if(urlData.Updated)
                    {
                        urlData.RetrievalTime = (int)timer.Duration;
                        //if redirected, calculate robots, domain & priority for redirect url
                        if(urlData.Redirected)
                        {
                            urlData.RedirectedFlagDomain = domainFilter.FilterUrl(ref redirectUrl);
                            urlData.RedirectedFlagRobots = robotsFilter.FilterUrl(ref redirectUrl, ref urlToCrawl, RobotsMetaTagValue.NoMeta);
                        }
                        //perform link extraction and content extraction
                        ArrayList outlinks = null;
                        try
                        {
                            if((parser == htmlParser)||(parser == textParser))
                            {
                                urlData.Data = parser.ExtractContent(ref contents, false);
                                if(urlData.Data == null)
                                {
                                    urlData.Data = String.Empty;
                                }
                                outlinks = parser.ExtractLinks(ref contents, ref urlToCrawl);
                                if(outlinks == null)
                                {
                                    outlinks = new ArrayList();
                                }
                            }
                            else
                            {
                                urlData.Data = parser.ExtractContent(buffer, false);
                                if(urlData.Data == null)
                                {
                                    urlData.Data = String.Empty;
                                }
                                outlinks = parser.ExtractLinks(buffer, ref urlToCrawl);
                                if(outlinks == null)
                                {
                                    outlinks = new ArrayList();
                                }
                            }
                        }
                        catch
                        {
                            if(outlinks == null)
                            {
                                outlinks = new ArrayList();
                            }
                        }
                        urlData.OutLinks = new InternetUrlToIndex[outlinks.Count];
                        for(int i = 0; i< outlinks.Count; i++)
                        {
                            urlData.OutLinks[i] = (InternetUrlToIndex)outlinks[i];
                        }
                        //finally update the urlData object with the modified UrlToCrawl
                        urlData.UrlToCrawl = urlToCrawl;
                    }
                }
                //lock and update CrawledUrls
                lock(crawledUrls.SyncRoot)
                {
                    crawledUrls.Add(urlData);
                }
            }
            catch(ThreadAbortException tae)
            {
                //The thread has been asked to abort. Log information and return at once
                if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
                {
                    globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has been asked to abort: " + tae.Message);
                }
                return;
            }
            catch(ThreadInterruptedException tie)
            {
                //The thread has been asked to join. Log information and return at once
                if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
                {
                    globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has been interrupted: " + tie.Message);
                }
                return;
            }
            catch(Exception ex)
            {
                if(!(ex is ThreadAbortException)) // the ThreadAbortException is rethrown
                {
                    if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
                    {
                        globals.FileLog.LogWarning("CrawlUrl running in " + Thread.CurrentThread.Name + " encountered an unexpected exception: " + ex.ToString());
                    }
                }
                throw; //rethrow preserving the stack trace; PerformCrawling is expected to catch this
            }
            finally
            {
                GC.Collect();
            }
        }
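
Change detection above hinges on comparing a CRC of the downloaded buffer with the value stored for the url. A small sketch of that check, assuming CompressionUtils.BufferCRC is a CRC-32 over the page bytes; SharpZipLib's Crc32 class (already used by SaveXMLFile above) computes the same kind of checksum.

 using ICSharpCode.SharpZipLib.Checksums; //namespace name as in SharpZipLib releases of that era

 //illustrative only - returns a CRC-32 checksum of the page contents
 static long BufferCrc(byte[] buffer)
 {
     Crc32 crc = new Crc32();
     crc.Update(buffer);
     return crc.Value; //a value different from urlToCrawl.CRC means the page has changed
 }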
Example #5
 /// <summary>
 /// SendResultsToServer runs on a dedicated thread. It periodically attempts to send
 /// the data produced from the crawling of urls back to the server. It communicates
 /// with the CrawlWave.Server web service asynchronously.
 /// </summary>
 private void SendResultsToServer()
 {
     Interlocked.Increment(ref runningThreads);
     if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
     {
         globals.FileLog.LogInfo("Started Results Thread with ID 0x" + Thread.CurrentThread.GetHashCode().ToString("x4"));
     }
     try
     {
         string FileName = String.Empty;
         UrlCrawlData [] urls = new UrlCrawlData[0];
         SoapFormatter serializer = null;
         MemoryStream ms = null;
         Stream ReadStream = null;
         while(!mustStop)
         {
             try
             {
                 FileName = String.Empty;
                 lock(resultFileNames.SyncRoot)
                 {
                     if(resultFileNames.Count>0)
                     {
                         FileName = (string)resultFileNames.Dequeue();
                     }
                 }
                 if(FileName != String.Empty)
                 {
                     //urls = new UrlCrawlData[0];
                     ReadStream = null;
                     serializer = null;
                     try
                     {
                         ReadStream=File.Open(FileName, FileMode.Open);
                         serializer = new SoapFormatter();
                         urls = (UrlCrawlData [] /*ArrayList*/)serializer.Deserialize(ReadStream);
                     }
                     catch(Exception e)
                     {
                         /*//something went wrong - put the filename back to the queue so that the
                         //client will attempt to send it to the server later
                         lock(resultFileNames.SyncRoot)
                         {
                             resultFileNames.Enqueue(FileName);
                         }*/
                         globals.FileLog.LogWarning("SendResults: could not deserialize data from " + FileName +". The file will be deleted. " + e.ToString());
                         //the file must be deleted
                         try
                         {
                             ReadStream.Close();
                             serializer = null;
                             File.Delete(FileName);
                         }
                         catch
                         {}
                     }
                     finally
                     {
                         if(ReadStream != null)
                         {
                             try
                             {
                                 ReadStream.Close();
                                 ReadStream = null; //TODO check if this is needed
                             }
                             catch
                             {}
                         }
                     }
                     if(urls.Length /*Count*/>0)
                     {
     //								UrlCrawlData [] data = new UrlCrawlData [urls.Count];
     //								for(int i = 0; i < urls.Count; i++)
     //								{
     //									data [i] = (UrlCrawlData)urls[i];
     //								}
     //								urls.Clear();
                         byte [] buffer = null;
                         //TODO: should this be called asynchronously?
                         //proxy.SecureServer.BeginGetCrawlResults(globals.Client_Info, data, new AsyncCallback(SendResultsToServerCallback), proxy.SecureServer);
                         try
                         {
                             ms = new MemoryStream();
                             serializer.Serialize(ms, urls/*data*/);
                             buffer = ms.ToArray();
                             ms.Close();
                             SerializedException sx = proxy.SecureServer.GetCrawlResultsRaw(globals.Client_Info, buffer);
                             if(sx!=null)
                             {
                                 sx.ThrowException();
                             }
                             lock(resultFileNames.SyncRoot)
                             {
                                 try
                                 {
                                     File.Delete(FileName);
                                 }
                                 catch
                                 {
                                     resultFileNames.Enqueue(FileName);
                                 }
                             }
                             OnResultsSent(EventArgs.Empty);
                         }
                         catch(Exception e)
                         {
                             lock(resultFileNames.SyncRoot)
                             {
                                 resultFileNames.Enqueue(FileName);
                             }
                             if(ms!=null)
                             {
                                 ms.Close();
                             }
                              throw; //rethrow, preserving the original stack trace
                         }
                         finally
                         {
                             //data = null;
                             for(int i = 0; i<urls.Length; i++)
                             {
                                 urls[i] = null;
                             }
                             buffer = null;
                             ms=null;
                         }
                     }
                 }
             }
             catch(Exception e)
             {
                 if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
                 {
                     globals.FileLog.LogWarning("Crawler.SendResultsToServer failed: " + e.ToString());
                 }
             }
             serializer = null;
             GC.Collect();
             Thread.Sleep(syncBackOff.Next());
         }
     }
     catch(ThreadAbortException tae)
     {
         //The thread has been asked to abort. Log information and return at once
         if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
         {
             globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has been asked to abort: " + tae.Message);
         }
         return;
     }
     catch(ThreadInterruptedException tie)
     {
         //The thread has been asked to join. Log information and return at once
         if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
         {
             globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has been interrupted: " + tie.Message);
         }
         return;
     }
     catch(Exception ex)
     {
          if(!(ex is ThreadAbortException)) // the ThreadAbortException is rethrown
         {
             if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
             {
                 globals.FileLog.LogWarning(Thread.CurrentThread.Name + " encountered an unexpected exception: " + ex.ToString());
             }
         }
     }
     finally
     {
         if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
         {
             globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has finished.");
         }
         GC.Collect();
         Interlocked.Decrement(ref runningThreads);
     }
 }
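
The core of the loop above is a simple take, send, or re-queue pattern. Stripped of the serialization details it looks like the sketch below; Send is a hypothetical helper standing in for the deserialize-and-upload steps.

 string fileName = null;
 lock(resultFileNames.SyncRoot)
 {
     if(resultFileNames.Count > 0)
     {
         fileName = (string)resultFileNames.Dequeue();
     }
 }
 if(fileName != null)
 {
     try
     {
         Send(fileName); //hypothetical: deserialize the file and call the web service
         File.Delete(fileName); //success, the data is no longer needed locally
     }
     catch
     {
         lock(resultFileNames.SyncRoot)
         {
             resultFileNames.Enqueue(fileName); //failure, retry on a later pass
         }
     }
 }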
Example #6
 /// <summary>
 /// Constructs a new instance of the <see cref="UrlCrawlDataFile"/> class with the
 /// provided values.
 /// </summary>
 /// <param name="info">The <see cref="ClientInfo"/> of the client who returned the data.</param>
 /// <param name="data">An array of <see cref="UrlCrawlData"/> objects.</param>
 public UrlCrawlDataFile(ClientInfo info, UrlCrawlData [] data)
 {
     Info = info;
     Data = data;
 }
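
A tiny usage sketch, illustrative only: packaging a client's results the way SaveXMLFile does before serializing them to disk. Info and Data are the members assigned by the constructor; ClientID is assumed from its use in StoreCrawlResults.

 UrlCrawlDataFile udf = new UrlCrawlDataFile(info, data);
 Console.WriteLine(udf.Info.ClientID + " returned " + udf.Data.Length + " urls");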
 public SerializedException GetCrawlResults(ClientInfo ci, UrlCrawlData[] data)
 {
     return engine.StoreCrawlResults(ci, data);
 }
Example #8
        /// <summary>
        /// Crawls a Url and creates a <see cref="UrlCrawlData"/> object that is stored in
        /// the internal crawledUrls <see cref="ArrayList"/>. Since it runs in one of the
        /// crawling threads that may be interrupted or aborted at any time it must be able
        /// to handle ThreadAbortException and ThreadInterruptedException.
        /// </summary>
        /// <param name="urlToCrawl">A reference to the <see cref="InternetUrlToCrawl"/>
        /// object that encapsulates the url that must be crawled.</param>
        internal void CrawlUrl(ref InternetUrlToCrawl urlToCrawl)
        {
            try
            {
                UrlCrawlData urlData = new UrlCrawlData();
                HiResTimer timer = new HiResTimer();

                //create the web request and download the data
                HttpWebRequest pageRequest = null;
                try
                {
                    pageRequest = (HttpWebRequest)HttpWebRequest.Create(urlToCrawl.Url);
                }
                catch
                {
                    urlData.HttpStatusCode=HttpStatusCode.BadRequest;//the request could not be created, so flag the url as a bad request
                    urlData.Updated=true;
                    urlData.UrlToCrawl = urlToCrawl;
                    urlData.OutLinks = new InternetUrlToIndex[0];
                    urlData.Data = String.Empty;
                    UpdateStatistics(HttpStatusCode.BadRequest, 0);
                    lock(crawledUrls.SyncRoot)
                    {
                        crawledUrls.Add(urlData);
                    }
                    return;
                }
                pageRequest.UserAgent = globals.UserAgent;
                pageRequest.Timeout=Backoff.DefaultBackoff; //page timeout = 30 seconds
                pageRequest.KeepAlive = false;
                HttpWebResponse pageResponse=null;
                try
                {
                    timer.Start();
                    pageResponse = (HttpWebResponse)pageRequest.GetResponse();
                    //the above line might throw either WebException or UriFormatException
                }
                catch(WebException we)
                {
                    HttpWebResponse response=(HttpWebResponse)we.Response;
                    if (response!=null)
                    {
                        //although an exception occurred we're able to get the Status Code
                        urlData.HttpStatusCode=response.StatusCode;
                        urlData.Updated=true;
                        urlData.UrlToCrawl = urlToCrawl;
                        urlData.Data = String.Empty;
                        urlData.OutLinks = new InternetUrlToIndex[0];
                        UpdateStatistics(response.StatusCode, response.ContentLength);
                        response.Close();
                    }
                    else
                    {
                        urlData.HttpStatusCode=HttpStatusCode.BadRequest;//no response is available, so flag the url as a bad request
                        urlData.Updated=true;
                        urlData.UrlToCrawl = urlToCrawl;
                        urlData.Data = String.Empty;
                        urlData.OutLinks = new InternetUrlToIndex[0];
                        UpdateStatistics(HttpStatusCode.BadRequest, 0);
                    }
                }
                catch(UriFormatException)
                {
                    //this will occur if the url is not valid
                    urlData.HttpStatusCode=HttpStatusCode.BadRequest;
                    urlData.Updated=false;
                    urlData.Data = String.Empty;
                    urlData.UrlToCrawl = urlToCrawl;
                    urlData.OutLinks = new InternetUrlToIndex[0];
                }
                finally
                {
                    timer.Stop();
                    urlData.TimeStamp = DateTime.UtcNow;
                }
                if(pageResponse !=null)
                {
                    //update the fields
                    urlData.HttpStatusCode = pageResponse.StatusCode;
                    //download and parse the contents of the url
                    Stream receiveStream=pageResponse.GetResponseStream();
                    /*StreamReader receivedBytes=new StreamReader(receiveStream,defaultEncoding);*/
                    MemoryStream receivedBytes = new MemoryStream();
                    byte [] buffer = new byte[4096];
                    int read = 0;
                    try
                    {
                        while((read=receiveStream.Read(buffer,0,4096))>0)
                        {
                            receivedBytes.Write(buffer,0,read);
                        }
                    }
                    catch
                    {
                        //it should be response timeout not request timeout
                        urlData.HttpStatusCode = HttpStatusCode.RequestTimeout;
                        urlData.Updated = true;
                        urlData.RetrievalTime = (int)timer.Duration;
                        urlData.Data = String.Empty;
                        urlData.OutLinks = new InternetUrlToIndex[0];
                        urlData.UrlToCrawl = urlToCrawl;
                        try
                        {
                            receivedBytes.Close();
                            receiveStream.Close();
                            pageResponse.Close();
                        }
                        catch
                        {}
                        lock(crawledUrls.SyncRoot)
                        {
                            crawledUrls.Add(urlData);
                        }
                        UpdateStatistics(HttpStatusCode.RequestTimeout, 0);
                        return;
                    }
                    buffer = receivedBytes.ToArray();
                    Parser parser = SelectParser(pageResponse.ContentType);
                    string contents = String.Empty;
                    if(parser == htmlParser)
                    {
                        Encoding encoding = null;
                        switch(pageResponse.ContentEncoding)
                        {
                            case "":
                            case "none":
                                contents = this.defaultEncoding.GetString(buffer, 0, buffer.Length);
                                //re-check the encoding
                                encoding = DetectContentEncoding(ref contents);
                                if(encoding != defaultEncoding)
                                {
                                    contents = encoding.GetString(buffer, 0, buffer.Length);
                                }
                                break;

                            case "gzip":
                            case "x-gzip":
                                //first decompress the stream and then re-check the encoding
                                byte [] decompressed_buffer = new byte [0];
                                DecompressGzippedContent(buffer, out decompressed_buffer);
                                contents = this.defaultEncoding.GetString(decompressed_buffer, 0, decompressed_buffer.Length);
                                //re-check the encoding
                                encoding = DetectContentEncoding(ref contents);
                                if(encoding != defaultEncoding)
                                {
                                    contents = encoding.GetString(decompressed_buffer, 0, decompressed_buffer.Length);
                                }
                                break;

                            default:
                                try
                                {
                                    encoding = Encoding.GetEncoding(pageResponse.ContentEncoding);
                                    contents = encoding.GetString(buffer, 0, buffer.Length);
                                }
                                catch//(NotSupportedException)
                                {
                                    encoding = defaultEncoding;
                                    //the encoding specified is unsupported.
                                    contents = String.Empty;
                                }
                                break;
                        }
                    }
                    else
                    {
                        if(parser == textParser)
                        {
                            try
                            {
                                contents = this.defaultEncoding.GetString(buffer, 0, buffer.Length);
                            }
                            catch
                            {
                                //something went seriously wrong here. The crawler got a header that says the server is
                                //sending back a plain text document but for some reason we can't get the string contents.
                                contents = String.Empty;
                            }
                        }
                    }
                    receiveStream.Close();
                    receivedBytes.Close();
                    UpdateStatistics(pageResponse.StatusCode, buffer.Length);
                    string redirectUrl = string.Empty;
                    if (pageResponse.ResponseUri.AbsoluteUri!=urlToCrawl.Url)
                    {
                        redirectUrl = pageResponse.ResponseUri.AbsoluteUri;
                        urlData.RedirectedPriority = htmlParser.CleanUrlParams(ref redirectUrl);//CleanupRedirectUrl(ref redirectUrl);
                        if(urlToCrawl.Url != redirectUrl)//now that was a bloody BUGBUG
                        {
                            urlData.Redirected=true;
                            urlToCrawl.Url=redirectUrl;
                        }
                    }
                    pageResponse.Close();
                    long CRC = CompressionUtils.BufferCRC(buffer);
                    if(CRC != urlToCrawl.CRC)
                    {
                        urlData.Updated = true;
                        urlToCrawl.CRC = CRC;
                    }
                    if(urlData.Updated)
                    {
                        urlData.RetrievalTime = (int)timer.Duration;
                        //if redirected, calculate robots, domain & priority for redirect url
                        if(urlData.Redirected)
                        {
                            urlData.RedirectedFlagDomain = domainFilter.FilterUrl(ref redirectUrl);
                            urlData.RedirectedFlagRobots = robotsFilter.FilterUrl(redirectUrl, urlToCrawl, RobotsMetaTagValue.NoMeta);
                        }
                        //perform link extraction and content extraction
                        ArrayList outlinks = null;
                        try
                        {
                            if((parser == htmlParser)||(parser == textParser))
                            {
                                string clean = parser.ExtractContent(ref contents, false);
                                if(clean.Length>1048576)
                                {
                                    clean = clean.Substring(0,1048576);
                                }
                                urlData.Data = InternetUtils.Base64Encode(clean);
                                if(urlData.Data == null)
                                {
                                    urlData.Data = String.Empty;
                                }
                                outlinks = parser.ExtractLinks(ref contents, ref urlToCrawl);
                                if(outlinks == null)
                                {
                                    outlinks = new ArrayList();
                                }
                            }
                            else
                            {
                                contents = parser.ExtractContent(buffer, false);
                                if(contents.Length>1048576)
                                {
                                    contents = contents.Substring(0,1048576);
                                }
                                urlData.Data = InternetUtils.Base64Encode(contents);
                                if(urlData.Data == null)
                                {
                                    urlData.Data = String.Empty;
                                }
                                if(parser == pdfParser)
                                {
                                    outlinks = textParser.ExtractLinks(ref contents, ref urlToCrawl);
                                }
                                else
                                {
                                    outlinks = htmlParser.ExtractLinks(ref contents, ref urlToCrawl);
                                }
                                if(outlinks == null)
                                {
                                    outlinks = new ArrayList();
                                }
                            }
                        }
                        catch
                        {
                            if(outlinks == null)
                            {
                                outlinks = new ArrayList();
                            }
                        }
                        urlData.OutLinks = new InternetUrlToIndex[outlinks.Count];
                        for(int i = 0; i< outlinks.Count; i++)
                        {
                            urlData.OutLinks[i] = (InternetUrlToIndex)outlinks[i];
                        }
                        //finally update the urlData object with the modified UrlToCrawl
                        urlData.UrlToCrawl = urlToCrawl;
                    }
                }
                //lock and update CrawledUrls
                lock(crawledUrls.SyncRoot)
                {
                    crawledUrls.Add(urlData);
                }
            }
            catch(ThreadAbortException tae)
            {
                //The thread has been asked to abort. Log information and return at once
                if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
                {
                    globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has been asked to abort: " + tae.Message);
                }
                return;
            }
            catch(ThreadInterruptedException tie)
            {
                //The thread has been asked to join. Log information and return at once
                if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
                {
                    globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has been interrupted: " + tie.Message);
                }
                return;
            }
            catch(Exception ex)
            {
                if(!(ex is ThreadAbortException)) // the ThreadAbortException is rethrown
                {
                    if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
                    {
                        globals.FileLog.LogWarning("CrawlUrl running in " + Thread.CurrentThread.Name + " encountered an unexpected exception: " + ex.ToString());
                    }
                }
                throw; //rethrow preserving the stack trace; PerformCrawling is expected to catch this
            }
            finally
            {
                GC.Collect();
            }
        }
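
The gzip branch above relies on DecompressGzippedContent, whose body is not shown here. A possible implementation sketch, assuming the same SharpZipLib library that SaveXMLFile uses (GZipInputStream from ICSharpCode.SharpZipLib.GZip); the real method may well differ.

 using ICSharpCode.SharpZipLib.GZip;
 using System.IO;

 //illustrative only - decompresses a gzip-encoded response body into a new buffer
 private void DecompressGzipped(byte[] compressed, out byte[] decompressed)
 {
     using(GZipInputStream gz = new GZipInputStream(new MemoryStream(compressed)))
     using(MemoryStream output = new MemoryStream())
     {
         byte[] chunk = new byte[4096];
         int read = 0;
         while((read = gz.Read(chunk, 0, chunk.Length)) > 0)
         {
             output.Write(chunk, 0, read);
         }
         decompressed = output.ToArray();
     }
 }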
Example #9
        /// <summary>
        /// Acts as an asynchronous callback for the SendResultsToServer method that sends
        /// the crawled urls data to the server by calling GetCrawlResults.
        /// </summary>
        /// <param name="result">The result of the asynchronous call.</param>
        /*private void SendResultsToServerCallback(IAsyncResult result)
        {
            try
            {
                CrawlWaveServer server = (CrawlWaveServer)result.AsyncState;
                SerializedException sx = server.EndGetCrawlResults(result);
                if(sx!=null)
                {
                    sx.ThrowException();
                }
                lock(resultFileNames.SyncRoot)
                {
                    string fileName = (string)resultFileNames.Dequeue();
                    try
                    {
                        File.Delete(fileName);
                    }
                    catch
                    {
                        resultFileNames.Enqueue(fileName);
                    }
                }
            }
            catch(Exception e)
            {
                if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
                {
                    globals.FileLog.LogWarning("Crawler.SendResultsToServerCallback: " + e.ToString());
                }
            }
        }*/
        /// <summary>
        /// Sends the crawl results to the server in synchronous mode, reading one data
        /// file at a time.
        /// </summary>
        private void SendResultsSynchronously()
        {
            if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
            {
                globals.FileLog.LogInfo("SendResultsSynchronously is attempting to send results to server.");
            }
            try
            {
                string FileName = String.Empty;
                UrlCrawlData [] urls = new UrlCrawlData[0];
                //SoapFormatter serializer = null;
                BinaryFormatter serializer = null;
                MemoryStream ms = null;
                Stream ReadStream = null;

                lock(resultFileNames.SyncRoot)
                {
                    if(resultFileNames.Count>0)
                    {
                        FileName = (string)resultFileNames.Dequeue();
                    }
                }
                if(FileName != String.Empty)
                {
                    ReadStream = null;
                    serializer = null;
                    try
                    {
                        ReadStream=File.Open(FileName, FileMode.Open);
                        //serializer = new SoapFormatter();
                        serializer = new BinaryFormatter();
                        urls = (UrlCrawlData [] /*ArrayList*/)serializer.Deserialize(ReadStream);
                    }
                    catch(Exception e)
                    {
                        //something went wrong during deserialization
                        globals.FileLog.LogWarning("SendResultsSynchronously: could not deserialize data from " + FileName +". The file will be deleted. " + e.ToString());
                        //the file must be deleted
                        try
                        {
                            ReadStream.Close();
                            serializer = null;
                            File.Delete(FileName);
                        }
                        catch
                        {}
                    }
                    finally
                    {
                        if(ReadStream != null)
                        {
                            try
                            {
                                ReadStream.Close();
                                ReadStream = null; //TODO check if this is needed
                            }
                            catch
                            {}
                        }
                    }
                    if(urls.Length /*Count*/>0)
                    {
                        byte [] buffer = null;
                        //TODO: should this be called asynchronously?
                        //proxy.SecureServer.BeginGetCrawlResults(globals.Client_Info, data, new AsyncCallback(SendResultsToServerCallback), proxy.SecureServer);
                        try
                        {
                            ms = new MemoryStream();
                            serializer.Serialize(ms, urls/*data*/);
                            buffer = ms.ToArray();
                            ms.Close();
                            //proxy.Timeout = 600000;
            //							SerializedException sx = proxy.GetCrawlResultsRaw(globals.Client_Info, buffer);
                            SerializedException sx = proxy.GetCrawlResults(globals.Client_Info, urls);
                            if(sx!=null)
                            {
                                sx.ThrowException();
                            }
                            lock(resultFileNames.SyncRoot)
                            {
                                try
                                {
                                    File.Delete(FileName);
                                }
                                catch
                                {
                                    resultFileNames.Enqueue(FileName);
                                }
                            }
                            OnResultsSent(EventArgs.Empty);
                        }
                        catch(Exception e)
                        {
                            lock(resultFileNames.SyncRoot)
                            {
                                resultFileNames.Enqueue(FileName);
                            }
                            if(ms!=null)
                            {
                                ms.Close();
                            }
                            throw; //rethrow, preserving the original stack trace
                        }
                        finally
                        {
                            //data = null;
                            //proxy.Timeout = 100000;
                            for(int i = 0; i<urls.Length; i++)
                            {
                                urls[i] = null;
                            }
                            buffer = null;
                            ms=null;
                            serializer = null;
                        }
                    }
                }
            }
            catch(Exception e)
            {
                if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
                {
                    globals.FileLog.LogWarning("Crawler.SendResultsSynchronously failed: " + e.ToString());
                }
            }
            finally
            {
                GC.Collect();
            }
        }
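
A companion sketch, not from the original source, of how a result file consumed by SendResultsSynchronously might be produced: the same BinaryFormatter writes the UrlCrawlData array to disk and the file name is queued for the sender. The SaveResultFile name and calling context are assumptions.

 using System.IO;
 using System.Runtime.Serialization.Formatters.Binary;

 private void SaveResultFile(string fileName, UrlCrawlData[] urls)
 {
     using(Stream writeStream = File.Create(fileName))
     {
         BinaryFormatter serializer = new BinaryFormatter();
         serializer.Serialize(writeStream, urls);
     }
     lock(resultFileNames.SyncRoot)
     {
         resultFileNames.Enqueue(fileName); //the sender thread will pick it up later
     }
 }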
        /// <summary>
        /// Updates the Url and the Url Data tables
        /// </summary>
        /// <param name="data">The UrlCrawlData containing the data of the crawled Url.</param>
        /// <param name="transaction">The currently active <see cref="SqlTransaction"/>.</param>
        /// <returns>The ID of the updated url or 0 if something goes wrong.</returns>
        private int UpdateUrl(UrlCrawlData data, SqlTransaction transaction)
        {
            int retVal = 0;
            try
            {
                //build the Sql Command for updating the url table
                SqlCommand urlcmd = new SqlCommand("cw_update_url", dbcon, transaction);
                urlcmd.CommandType = CommandType.StoredProcedure;
                urlcmd.CommandTimeout = settings.DBActionTimeout;
                urlcmd.Parameters.Add("@url_id",SqlDbType.Int);
                urlcmd.Parameters.Add("@url", SqlDbType.NVarChar, 500);
                urlcmd.Parameters.Add("@url_md5", SqlDbType.UniqueIdentifier);
                urlcmd.Parameters.Add("@url_host_id", SqlDbType.UniqueIdentifier);
                urlcmd.Parameters.Add("@url_priority", SqlDbType.TinyInt);
                urlcmd.Parameters.Add("@crc", SqlDbType.BigInt);
                urlcmd.Parameters.Add("@flag_domain", SqlDbType.TinyInt);
                urlcmd.Parameters.Add("@flag_robots", SqlDbType.TinyInt);
                urlcmd.Parameters.Add("@flag_updated", SqlDbType.TinyInt);
                urlcmd.Parameters.Add("@last_visited", SqlDbType.SmallDateTime);
                urlcmd.Parameters.Add("@flag_redirected", SqlDbType.TinyInt);
                urlcmd.Parameters.Add("@id", SqlDbType.Int);
                urlcmd.Parameters["@id"].Direction = ParameterDirection.Output;

                //Build the SQL Command for updating the hosts table
                SqlCommand hostcmd = new SqlCommand("cw_insert_host", dbcon, transaction);
                hostcmd.CommandType = CommandType.StoredProcedure;
                hostcmd.CommandTimeout = settings.DBActionTimeout;
                hostcmd.Parameters.Add("@host_id", SqlDbType.UniqueIdentifier);
                hostcmd.Parameters.Add("@host_name", SqlDbType.NVarChar, 100);

                //set their parameters
                urlcmd.Parameters[0].Value = data.ID;
                urlcmd.Parameters[1].Value = data.Url;
                urlcmd.Parameters[2].Value = new Guid(data.MD5);
                Uri uri = new Uri(data.Url);
                string host_name = uri.Host;
                Guid host_id = new Guid(MD5Hash.md5(host_name));
                urlcmd.Parameters[3].Value = host_id;
                urlcmd.Parameters[5].Value = data.CRC;
                if(data.Redirected)
                {
                    //we must first attempt to insert the host, otherwise the urlcmd will fail
                    hostcmd.Parameters[0].Value = host_id;
                    hostcmd.Parameters[1].Value = host_name;
                    try
                    {
                        hostcmd.ExecuteNonQuery();
                    }
                    catch
                    {
                        //it probably exists already
                    }

                    urlcmd.Parameters[4].Value = (byte)data.RedirectedPriority;
                    urlcmd.Parameters[6].Value = (byte)data.RedirectedFlagDomain;
                    urlcmd.Parameters[7].Value = (data.RedirectedFlagRobots)?1:0;
                    urlcmd.Parameters[8].Value = (data.Updated)?1:0;
                    urlcmd.Parameters[9].Value = data.TimeStamp;
                    urlcmd.Parameters[10].Value = 1;
                }
                else
                {
                    urlcmd.Parameters[4].Value = DBNull.Value;
                    urlcmd.Parameters[6].Value = (byte)data.UrlToCrawl.FlagDomain;
                    if(data.FlagFetchRobots)
                    {
                        urlcmd.Parameters[7].Value = (data.RedirectedFlagRobots)?1:0;
                    }
                    else
                    {
                        urlcmd.Parameters[7].Value = 0;
                    }
                    urlcmd.Parameters[8].Value = (data.Updated)?1:0;
                    urlcmd.Parameters[9].Value = data.TimeStamp;
                    urlcmd.Parameters[10].Value = 0;
                }
                //retVal = data.ID;
                //make sure the host command is disposed
                hostcmd.Dispose();
                urlcmd.ExecuteNonQuery();
                retVal = (int)urlcmd.Parameters["@id"].Value;
                urlcmd.Dispose();

                if(data.Updated)
                {
                    //if necessary build the sql command for updating the url data tables
                    SqlCommand urldatacmd = new SqlCommand("cw_update_url_data", dbcon, transaction);
                    urldatacmd.CommandType = CommandType.StoredProcedure;
                    urldatacmd.CommandTimeout = settings.DBActionTimeout;
                    urldatacmd.Parameters.Add("@url_id", SqlDbType.Int);
                    urldatacmd.Parameters.Add("@data", SqlDbType.Image);
                    urldatacmd.Parameters.Add("@length", SqlDbType.Int);
                    urldatacmd.Parameters.Add("@original_length", SqlDbType.Int);
                    urldatacmd.Parameters.Add("@http_code", SqlDbType.SmallInt);
                    urldatacmd.Parameters.Add("@retrieval_time", SqlDbType.Int);

                    urldatacmd.Parameters[0].Value = retVal;
                    //compress the url's data
                    if(data.Data!= String.Empty)
                    {
                        byte [] compressed = null;
                        string urldata = InternetUtils.Base64Decode(data.Data);
                        CompressionUtils.CompressString(ref urldata, out compressed);
                        urldatacmd.Parameters[1].Value = compressed;
                        urldatacmd.Parameters[2].Value = compressed.Length;
                        urldatacmd.Parameters[3].Value = data.Data.Length;
                    }
                    else
                    {
                        urldatacmd.Parameters[1].Value = new byte[0];
                        urldatacmd.Parameters[2].Value = 0;
                        urldatacmd.Parameters[3].Value = 0;
                    }
                    urldatacmd.Parameters[4].Value = (short)data.HttpStatusCode;
                    urldatacmd.Parameters[5].Value = data.RetrievalTime;
                    urldatacmd.ExecuteNonQuery();
                    urldatacmd.Dispose();
                }
            }
            catch(Exception e)
            {
                AddToReportQueue(CWLoggerEntryType.Warning, "DBUpdater failed to update a Url in the database: " + e.ToString());
                retVal = 0;
            }
            return retVal;
        }
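
The id of the updated row is obtained through an output parameter. In isolation, the pattern used above looks like this (procedure and parameter names exactly as in UpdateUrl):

 SqlCommand cmd = new SqlCommand("cw_update_url", dbcon, transaction);
 cmd.CommandType = CommandType.StoredProcedure;
 cmd.Parameters.Add("@id", SqlDbType.Int);
 cmd.Parameters["@id"].Direction = ParameterDirection.Output;
 //...the remaining input parameters are set as in UpdateUrl...
 cmd.ExecuteNonQuery();
 int newId = (int)cmd.Parameters["@id"].Value; //the id assigned by the stored procedure
 cmd.Dispose();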
        /// <summary>
        /// Inserts the links contained in a url into the database and updates the link graph
        /// </summary>
        /// <param name="UrlID">The ID of the url.</param>
        /// <param name="data">The <see cref="UrlCrawlData"/> of the url.</param>
        /// <param name="transaction">The currently active <see cref="SqlTransaction"/>.</param>
        private void InsertUrlOutLinks(int UrlID, UrlCrawlData data, SqlTransaction transaction)
        {
            try
            {
                //Build the SQL Commands
                SqlCommand hostcmd = new SqlCommand("cw_insert_host", dbcon, transaction);
                hostcmd.CommandType = CommandType.StoredProcedure;
                hostcmd.CommandTimeout = settings.DBActionTimeout;
                hostcmd.Parameters.Add("@host_id", SqlDbType.UniqueIdentifier);
                hostcmd.Parameters.Add("@host_name", SqlDbType.NVarChar, 100);

                SqlCommand urlcmd = new SqlCommand("cw_insert_url", dbcon, transaction);
                urlcmd.CommandType = CommandType.StoredProcedure;
                urlcmd.CommandTimeout = settings.DBActionTimeout;
                urlcmd.Parameters.Add("@url",SqlDbType.NVarChar, 500);
                urlcmd.Parameters.Add("@url_md5", SqlDbType.UniqueIdentifier);
                urlcmd.Parameters.Add("@url_host_id", SqlDbType.UniqueIdentifier);
                urlcmd.Parameters.Add("@url_priority", SqlDbType.TinyInt);
                urlcmd.Parameters.Add("@flag_domain", SqlDbType.TinyInt);
                urlcmd.Parameters.Add("@flag_robots", SqlDbType.TinyInt);
                urlcmd.Parameters.Add("@id", SqlDbType.Int);
                urlcmd.Parameters["@id"].Direction = ParameterDirection.Output;

                SqlCommand linkcmd = new SqlCommand("cw_insert_link_graph", dbcon, transaction);
                linkcmd.CommandType = CommandType.StoredProcedure;
                linkcmd.CommandTimeout = settings.DBActionTimeout;
                linkcmd.Parameters.Add("@from_url_id", SqlDbType.Int);
                linkcmd.Parameters.Add("@to_url_id", SqlDbType.Int);

                int new_id = 0;
                //insert each out link in the database
                foreach(InternetUrlToIndex url in data.OutLinks)
                {
                    try
                    {
                        Uri uri = new Uri(url.Url);
                        Guid host_id = new Guid(MD5Hash.md5(uri.Host));

                        hostcmd.Parameters[0].Value = host_id;
                        hostcmd.Parameters[1].Value = uri.Host;
                        hostcmd.ExecuteNonQuery();

                        urlcmd.Parameters[0].Value = url.Url;
                        urlcmd.Parameters[1].Value = new Guid(url.MD5);
                        urlcmd.Parameters[2].Value = host_id;
                        urlcmd.Parameters[3].Value = (byte)url.Priority;
                        urlcmd.Parameters[4].Value = (byte)url.FlagDomain;
                        urlcmd.Parameters[5].Value = (byte)((url.FlagRobots)?1:0);
                        urlcmd.ExecuteNonQuery();
                        new_id = (int)urlcmd.Parameters["@id"].Value; //(int)urlcmd.ExecuteScalar();

                        linkcmd.Parameters[0].Value = UrlID;
                        linkcmd.Parameters[1].Value = new_id;
                        linkcmd.ExecuteNonQuery();
                    }
                    catch(Exception e)
                    {
                        AddToReportQueue(CWLoggerEntryType.Warning, "DBUpdater plugin failed to insert an edge to the link graph: " + e.ToString());
                        continue;
                    }
                }
                urlcmd.Dispose();
                linkcmd.Dispose();
            }
            catch(Exception e)
            {
                AddToReportQueue(CWLoggerEntryType.Warning, "DBUpdater plugin: an unexpected error occurred when inserting the out links of url with ID " + UrlID.ToString() + " to the link graph: " + e.ToString());
                GC.Collect();
            }
        }
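
Host ids above are built from an MD5 hash of the host name. A self-contained sketch of that derivation, assuming MD5Hash.md5 returns the 16-byte MD5 digest of its argument (an MD5 digest fits exactly into a Guid):

 using System;
 using System.Security.Cryptography;
 using System.Text;

 //illustrative equivalent of new Guid(MD5Hash.md5(uri.Host))
 static Guid HostId(string hostName)
 {
     MD5 md5 = MD5.Create();
     byte[] digest = md5.ComputeHash(Encoding.UTF8.GetBytes(hostName)); //16 bytes
     return new Guid(digest);
 }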