Exemple #1
0
        /// <summary>
        /// Crawls a Url and creates a <see cref="UrlCrawlData"/> object that is stored in
        /// the internal crawledUrls <see cref="ArrayList"/>. Since it runs in one of the
        /// crawling threads that may be interrupted or aborted at any time it must be able
        /// to handle ThreadAbortException and ThreadInterruptedException.
        /// </summary>
        /// <param name="urlToCrawl">A reference to the <see cref="InternetUrlToCrawl"/>
        /// object that encapsulates the url that must be crawled. Its Url is replaced when
        /// the request was redirected and its CRC is replaced when the content changed.</param>
        /// <remarks>
        /// Failures to create the request, to obtain a response or to read the body are not
        /// reported to the caller: they are recorded as a <see cref="UrlCrawlData"/> entry
        /// carrying an error status code. Any other unexpected exception is logged (subject
        /// to the configured log level) and rethrown with its original stack trace.
        /// </remarks>
        private void CrawlUrl(ref InternetUrlToCrawl urlToCrawl)
        {
            try
            {
                UrlCrawlData urlData = new UrlCrawlData();
                HiResTimer timer = new HiResTimer();

                //create the web request and download the data
                HttpWebRequest pageRequest = null;
                try
                {
                    pageRequest = (HttpWebRequest)HttpWebRequest.Create(urlToCrawl.Url);
                }
                catch
                {
                    //the request could not even be created: record the url as a bad request
                    urlData.HttpStatusCode=HttpStatusCode.BadRequest;
                    urlData.Updated=true;
                    urlData.UrlToCrawl = urlToCrawl;
                    urlData.OutLinks = new InternetUrlToIndex[0];
                    urlData.Data = String.Empty;
                    //the failure is counted exactly once in the statistics
                    UpdateStatistics(HttpStatusCode.BadRequest, 0);
                    lock(crawledUrls.SyncRoot)
                    {
                        crawledUrls.Add(urlData);
                    }
                    return;
                }
                pageRequest.UserAgent = globals.UserAgent;
                pageRequest.Timeout=ExponentialBackoff.DefaultBackoff; //page timeout = 30 seconds
                HttpWebResponse pageResponse=null;
                try
                {
                    timer.Start();
                    pageResponse = (HttpWebResponse)pageRequest.GetResponse();
                    //the above line might throw either WebException or UriFormatException
                }
                catch(WebException we)
                {
                    HttpWebResponse response=(HttpWebResponse)we.Response;
                    if (response!=null)
                    {
                        //although an exception occured we're able to get the Status Code
                        urlData.HttpStatusCode=response.StatusCode;
                        urlData.Updated=true;
                        urlData.UrlToCrawl = urlToCrawl;
                        urlData.Data = String.Empty;
                        urlData.OutLinks = new InternetUrlToIndex[0];
                        UpdateStatistics(response.StatusCode, response.ContentLength);
                        //release the connection held by the error response
                        response.Close();
                    }
                    else
                    {
                        //no response at all (e.g. timeout or connection failure)
                        urlData.HttpStatusCode=HttpStatusCode.BadRequest;
                        urlData.Updated=true;
                        urlData.UrlToCrawl = urlToCrawl;
                        urlData.Data = String.Empty;
                        urlData.OutLinks = new InternetUrlToIndex[0];
                        UpdateStatistics(HttpStatusCode.BadRequest, 0);
                    }
                }
                catch(UriFormatException)
                {
                    //this will occur if the url is not valid
                    urlData.HttpStatusCode=HttpStatusCode.BadRequest;
                    urlData.Updated=false;
                    urlData.Data = String.Empty;
                    urlData.UrlToCrawl = urlToCrawl;
                    urlData.OutLinks = new InternetUrlToIndex[0];
                }
                finally
                {
                    timer.Stop();
                    urlData.TimeStamp = DateTime.UtcNow;
                }
                if(pageResponse !=null)
                {
                    try
                    {
                        //update the fields
                        urlData.HttpStatusCode = pageResponse.StatusCode;
                        //download and parse the contents of the url
                        Stream receiveStream=pageResponse.GetResponseStream();
                        StreamReader receivedBytes=new StreamReader(receiveStream,defaultEncoding);
                        string contents = String.Empty;
                        try
                        {
                            contents=receivedBytes.ReadToEnd();
                        }
                        catch
                        {
                            //reading the body failed; it should be response timeout not request timeout
                            urlData.HttpStatusCode = HttpStatusCode.RequestTimeout;
                            urlData.Updated = true;
                            urlData.RetrievalTime = (int)timer.Duration;
                            urlData.Data = String.Empty;
                            urlData.OutLinks = new InternetUrlToIndex[0];
                            urlData.UrlToCrawl = urlToCrawl;
                            try
                            {
                                receivedBytes.Close();
                                receiveStream.Close();
                                pageResponse.Close();
                            }
                            catch
                            {
                                //best effort cleanup: the connection is already broken
                            }
                            lock(crawledUrls.SyncRoot)
                            {
                                crawledUrls.Add(urlData);
                            }
                            UpdateStatistics(HttpStatusCode.RequestTimeout, 0);
                            return;
                        }
                        //NOTE(review): Encoding.ASCII replaces every non-ASCII character, so both the
                        //CRC and the buffer handed to the non-text parsers are computed from lossy
                        //data. Kept as is because previously stored CRC values depend on it.
                        byte []buffer=Encoding.ASCII.GetBytes(contents);
                        receiveStream.Close();
                        receivedBytes.Close();
                        UpdateStatistics(pageResponse.StatusCode, contents.Length);
                        string redirectUrl = string.Empty;
                        if (pageResponse.ResponseUri.AbsoluteUri!=urlToCrawl.Url)
                        {
                            //the response came from a different address: only treat it as a
                            //redirect if the cleaned up url still differs from the requested one
                            redirectUrl = pageResponse.ResponseUri.AbsoluteUri;
                            urlData.RedirectedPriority = CleanupRedirectUrl(ref redirectUrl);
                            if(urlToCrawl.Url != redirectUrl)
                            {
                                urlData.Redirected=true;
                                urlToCrawl.Url=redirectUrl;
                            }
                        }
                        Parser parser = SelectParser(pageResponse.ContentType);
                        //everything needed from the response has been read, release it early
                        pageResponse.Close();
                        long CRC = CompressionUtils.BufferCRC(buffer);
                        if(CRC != urlToCrawl.CRC)
                        {
                            //the content differs from what was seen the last time
                            urlData.Updated = true;
                            urlToCrawl.CRC = CRC;
                        }
                        if(urlData.Updated)
                        {
                            urlData.RetrievalTime = (int)timer.Duration;
                            //if redirected, calculate robots, domain & priority for redirect url
                            if(urlData.Redirected)
                            {
                                urlData.RedirectedFlagDomain = domainFilter.FilterUrl(ref redirectUrl);
                                urlData.RedirectedFlagRobots = robotsFilter.FilterUrl(ref redirectUrl, ref urlToCrawl, RobotsMetaTagValue.NoMeta);
                            }
                            //perform link extraction and content extraction
                            ArrayList outlinks = null;
                            try
                            {
                                if((parser == htmlParser)||(parser == textParser))
                                {
                                    urlData.Data = parser.ExtractContent(ref contents, false);
                                    if(urlData.Data == null)
                                    {
                                        urlData.Data = String.Empty;
                                    }
                                    outlinks = parser.ExtractLinks(ref contents, ref urlToCrawl);
                                    if(outlinks == null)
                                    {
                                        outlinks = new ArrayList();
                                    }
                                }
                                else
                                {
                                    urlData.Data = parser.ExtractContent(buffer, false);
                                    if(urlData.Data == null)
                                    {
                                        urlData.Data = String.Empty;
                                    }
                                    outlinks = parser.ExtractLinks(buffer, ref urlToCrawl);
                                    if(outlinks == null)
                                    {
                                        outlinks = new ArrayList();
                                    }
                                }
                            }
                            catch
                            {
                                //a parser failure must not lose the url: store whatever was
                                //extracted so far and never leave Data or the out links null
                                if(outlinks == null)
                                {
                                    outlinks = new ArrayList();
                                }
                                if(urlData.Data == null)
                                {
                                    urlData.Data = String.Empty;
                                }
                            }
                            urlData.OutLinks = new InternetUrlToIndex[outlinks.Count];
                            for(int i = 0; i< outlinks.Count; i++)
                            {
                                urlData.OutLinks[i] = (InternetUrlToIndex)outlinks[i];
                            }
                            //finally update the urlData object with the modified UrlToCrawl
                            urlData.UrlToCrawl = urlToCrawl;
                        }
                    }
                    finally
                    {
                        //make sure the connection is released even if something above threw;
                        //closing a response that has already been closed is harmless
                        try
                        {
                            pageResponse.Close();
                        }
                        catch
                        {
                            //best effort cleanup
                        }
                    }
                }
                //lock and update CrawledUrls
                lock(crawledUrls.SyncRoot)
                {
                    crawledUrls.Add(urlData);
                }
            }
            catch(ThreadAbortException tae)
            {
                //The thread has been asked to abort. Log information and return at once
                if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
                {
                    globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has been asked to abort: " + tae.Message);
                }
                return;
            }
            catch(ThreadInterruptedException tie)
            {
                //The thread has been asked to join. Log information and return at once
                if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
                {
                    globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has been interrupted: " + tie.Message);
                }
                return;
            }
            catch(Exception ex)
            {
                //ThreadAbortException never reaches this clause, it is handled above
                if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
                {
                    globals.FileLog.LogWarning("CrawlUrl running in " + Thread.CurrentThread.Name + " encountered an unexpected exception: " + ex.ToString());
                }
                //rethrow without resetting the stack trace so the caller sees the real origin
                throw;
            }
            finally
            {
                //NOTE(review): forcing a collection after every crawled url is expensive; it is
                //kept only to preserve the existing memory behaviour - consider removing it.
                GC.Collect();
            }
        }
Exemple #2
0
 /// <summary>
 /// DetectConnectionSpeed attempts to detect the computer's internet connection speed
 /// by measuring how much time it takes to download the contents of a web site.
 /// </summary>
 /// <returns>A <see cref="CWConnectionSpeed"/> value containing the estimated internet
 /// connection speed, or <see cref="CWConnectionSpeed.Unknown"/> when the download
 /// fails, returns no data, or no usable elapsed time could be measured.</returns>
 /// <remarks>This method never throws: every failure results in
 /// <see cref="CWConnectionSpeed.Unknown"/>.</remarks>
 public static CWConnectionSpeed DetectConnectionSpeed()
 {
     CWConnectionSpeed retVal = CWConnectionSpeed.Unknown;
     try
     {
         HiResTimer timer = new HiResTimer();
         byte[] data;
         //the using block disposes the client even if stopping the timer fails
         using (WebClient client = new WebClient())
         {
             timer.Start();
             try
             {
                 data = client.DownloadData("http://www.in.gr/");
             }
             catch
             {
                 //a failed download is reported as an unknown speed, not as an error
                 data = new byte[0];
             }
             finally
             {
                 timer.Stop();
             }
         }
         double elapsed = timer.Duration;
         //a zero (or negative) duration would produce an infinite or meaningless rate
         if ((data.Length > 0) && (elapsed > 0))
         {
             //bits per timer unit, computed in floating point to avoid integer truncation
             //and overflow. Assumes HiResTimer.Duration is in milliseconds, which makes
             //this value kilobits per second - TODO confirm
             double Kbps = (data.Length * 8.0) / elapsed;
             //determine which enumeration value fits best
             if (!(Kbps > 0))
             {
                 //zero, negative or NaN: there is nothing sensible to report
                 retVal = CWConnectionSpeed.Unknown;
             }
             else if (Kbps <= 56)
             {
                 retVal = CWConnectionSpeed.Modem56K;
             }
             else if (Kbps <= 64)
             {
                 retVal = CWConnectionSpeed.ISDN64K;
             }
             else if (Kbps <= 128)
             {
                 retVal = CWConnectionSpeed.ISDN128K;
             }
             else if (Kbps <= 256)
             {
                 retVal = CWConnectionSpeed.DSL256K;
             }
             else if (Kbps <= 512)
             {
                 retVal = CWConnectionSpeed.DSL512K;
             }
             else if (Kbps <= 1024)
             {
                 retVal = CWConnectionSpeed.DSL1M;
             }
             else if (Kbps <= 1536)
             {
                 retVal = CWConnectionSpeed.T1;
             }
             else if (Kbps <= 46080)
             {
                 retVal = CWConnectionSpeed.T3;
             }
             else if (Kbps < 158720)
             {
                 retVal = CWConnectionSpeed.Fiber;
             }
             else
             {
                 retVal = CWConnectionSpeed.ATM;
             }
         }
     }
     catch
     {
         //deliberately best effort: speed detection must never break the caller,
         //so any failure simply leaves the result as Unknown
     }
     return retVal;
 }
Exemple #3
0
        /// <summary>
        /// Crawls a Url and creates a <see cref="UrlCrawlData"/> object that is stored in
        /// the internal crawledUrls <see cref="ArrayList"/>. Since it runs in one of the
        /// crawling threads that may be interrupted or aborted at any time it must be able
        /// to handle ThreadAbortException and ThreadInterruptedException.
        /// </summary>
        /// <param name="urlToCrawl">A reference to the <see cref="InternetUrlToCrawl"/>
        /// object that encapsulates the url that must be crawled. Its Url is replaced when
        /// the request was redirected and its CRC is replaced when the content changed.</param>
        /// <remarks>
        /// Failures to create the request, to obtain a response or to read the body are not
        /// reported to the caller: they are recorded as a <see cref="UrlCrawlData"/> entry
        /// carrying an error status code. The extracted text is truncated to 1048576
        /// characters and stored Base64 encoded. Any other unexpected exception is logged
        /// (subject to the configured log level) and rethrown with its original stack trace.
        /// </remarks>
        internal void CrawlUrl(ref InternetUrlToCrawl urlToCrawl)
        {
            //upper limit (in characters) of the extracted text that is kept for a url
            const int maxContentLength = 1048576;
            try
            {
                UrlCrawlData urlData = new UrlCrawlData();
                HiResTimer timer = new HiResTimer();

                //create the web request and download the data
                HttpWebRequest pageRequest = null;
                try
                {
                    pageRequest = (HttpWebRequest)HttpWebRequest.Create(urlToCrawl.Url);
                }
                catch
                {
                    //the request could not even be created: record the url as a bad request
                    urlData.HttpStatusCode=HttpStatusCode.BadRequest;
                    urlData.Updated=true;
                    urlData.UrlToCrawl = urlToCrawl;
                    urlData.OutLinks = new InternetUrlToIndex[0];
                    urlData.Data = String.Empty;
                    UpdateStatistics(HttpStatusCode.BadRequest, 0);
                    lock(crawledUrls.SyncRoot)
                    {
                        crawledUrls.Add(urlData);
                    }
                    return;
                }
                pageRequest.UserAgent = globals.UserAgent;
                pageRequest.Timeout=Backoff.DefaultBackoff; //page timeout = 30 seconds
                pageRequest.KeepAlive = false;
                HttpWebResponse pageResponse=null;
                try
                {
                    timer.Start();
                    pageResponse = (HttpWebResponse)pageRequest.GetResponse();
                    //the above line might throw either WebException or UriFormatException
                }
                catch(WebException we)
                {
                    HttpWebResponse response=(HttpWebResponse)we.Response;
                    if (response!=null)
                    {
                        //although an exception occured we're able to get the Status Code
                        urlData.HttpStatusCode=response.StatusCode;
                        urlData.Updated=true;
                        urlData.UrlToCrawl = urlToCrawl;
                        urlData.Data = String.Empty;
                        urlData.OutLinks = new InternetUrlToIndex[0];
                        UpdateStatistics(response.StatusCode, response.ContentLength);
                        response.Close();
                    }
                    else
                    {
                        //no response at all (e.g. timeout or connection failure)
                        urlData.HttpStatusCode=HttpStatusCode.BadRequest;
                        urlData.Updated=true;
                        urlData.UrlToCrawl = urlToCrawl;
                        urlData.Data = String.Empty;
                        urlData.OutLinks = new InternetUrlToIndex[0];
                        UpdateStatistics(HttpStatusCode.BadRequest, 0);
                    }
                }
                catch(UriFormatException)
                {
                    //this will occur if the url is not valid
                    urlData.HttpStatusCode=HttpStatusCode.BadRequest;
                    urlData.Updated=false;
                    urlData.Data = String.Empty;
                    urlData.UrlToCrawl = urlToCrawl;
                    urlData.OutLinks = new InternetUrlToIndex[0];
                }
                finally
                {
                    timer.Stop();
                    urlData.TimeStamp = DateTime.UtcNow;
                }
                if(pageResponse !=null)
                {
                    try
                    {
                        //update the fields
                        urlData.HttpStatusCode = pageResponse.StatusCode;
                        //download the raw bytes of the url, decoding happens afterwards
                        Stream receiveStream=pageResponse.GetResponseStream();
                        MemoryStream receivedBytes = new MemoryStream();
                        byte [] buffer = new byte[4096];
                        int read = 0;
                        try
                        {
                            while((read=receiveStream.Read(buffer,0,4096))>0)
                            {
                                receivedBytes.Write(buffer,0,read);
                            }
                        }
                        catch
                        {
                            //reading the body failed; it should be response timeout not request timeout
                            urlData.HttpStatusCode = HttpStatusCode.RequestTimeout;
                            urlData.Updated = true;
                            urlData.RetrievalTime = (int)timer.Duration;
                            urlData.Data = String.Empty;
                            urlData.OutLinks = new InternetUrlToIndex[0];
                            urlData.UrlToCrawl = urlToCrawl;
                            try
                            {
                                receivedBytes.Close();
                                receiveStream.Close();
                                pageResponse.Close();
                            }
                            catch
                            {
                                //best effort cleanup: the connection is already broken
                            }
                            lock(crawledUrls.SyncRoot)
                            {
                                crawledUrls.Add(urlData);
                            }
                            UpdateStatistics(HttpStatusCode.RequestTimeout, 0);
                            return;
                        }
                        //from here on buffer holds the complete body exactly as it was received
                        buffer = receivedBytes.ToArray();
                        Parser parser = SelectParser(pageResponse.ContentType);
                        string contents = String.Empty;
                        if(parser == htmlParser)
                        {
                            Encoding encoding = null;
                            switch(pageResponse.ContentEncoding)
                            {
                                case "":
                                case "none":
                                    contents = this.defaultEncoding.GetString(buffer, 0, buffer.Length);
                                    //re-check the encoding
                                    encoding = DetectContentEncoding(ref contents);
                                    if(encoding != defaultEncoding)
                                    {
                                        contents = encoding.GetString(buffer, 0, buffer.Length);
                                    }
                                    break;

                                case "gzip":
                                case "x-gzip":
                                    //first decompress the stream and then re-check the encoding
                                    byte [] decompressed_buffer = new byte [0];
                                    DecompressGzippedContent(buffer, out decompressed_buffer);
                                    contents = this.defaultEncoding.GetString(decompressed_buffer, 0, decompressed_buffer.Length);
                                    //re-check the encoding
                                    encoding = DetectContentEncoding(ref contents);
                                    if(encoding != defaultEncoding)
                                    {
                                        contents = encoding.GetString(decompressed_buffer, 0, decompressed_buffer.Length);
                                    }
                                    break;

                                default:
                                    //NOTE(review): the Content-Encoding header is treated as a character
                                    //set name here, presumably to cope with servers that misuse the
                                    //header - confirm. Any value that is not a known encoding name
                                    //leaves the contents empty.
                                    try
                                    {
                                        encoding = Encoding.GetEncoding(pageResponse.ContentEncoding);
                                        contents = encoding.GetString(buffer, 0, buffer.Length);
                                    }
                                    catch
                                    {
                                        //the encoding specified is unsupported.
                                        contents = String.Empty;
                                    }
                                    break;
                            }
                        }
                        else
                        {
                            if(parser == textParser)
                            {
                                try
                                {
                                    contents = this.defaultEncoding.GetString(buffer, 0, buffer.Length);
                                }
                                catch
                                {
                                    //something went seriously wrong here. The crawler got a header that says the server is
                                    //sending back a plain text document but for some reason we can't get the string contents.
                                    contents = String.Empty;
                                }
                            }
                        }
                        receiveStream.Close();
                        receivedBytes.Close();
                        UpdateStatistics(pageResponse.StatusCode, buffer.Length);
                        string redirectUrl = string.Empty;
                        if (pageResponse.ResponseUri.AbsoluteUri!=urlToCrawl.Url)
                        {
                            //the response came from a different address: only treat it as a
                            //redirect if the cleaned up url still differs from the requested one
                            redirectUrl = pageResponse.ResponseUri.AbsoluteUri;
                            urlData.RedirectedPriority = htmlParser.CleanUrlParams(ref redirectUrl);
                            if(urlToCrawl.Url != redirectUrl)
                            {
                                urlData.Redirected=true;
                                urlToCrawl.Url=redirectUrl;
                            }
                        }
                        //everything needed from the response has been read, release it early
                        pageResponse.Close();
                        long CRC = CompressionUtils.BufferCRC(buffer);
                        if(CRC != urlToCrawl.CRC)
                        {
                            //the content differs from what was seen the last time
                            urlData.Updated = true;
                            urlToCrawl.CRC = CRC;
                        }
                        if(urlData.Updated)
                        {
                            urlData.RetrievalTime = (int)timer.Duration;
                            //if redirected, calculate robots, domain & priority for redirect url
                            if(urlData.Redirected)
                            {
                                urlData.RedirectedFlagDomain = domainFilter.FilterUrl(ref redirectUrl);
                                urlData.RedirectedFlagRobots = robotsFilter.FilterUrl(redirectUrl, urlToCrawl, RobotsMetaTagValue.NoMeta);
                            }
                            //perform link extraction and content extraction
                            ArrayList outlinks = null;
                            try
                            {
                                if((parser == htmlParser)||(parser == textParser))
                                {
                                    string clean = parser.ExtractContent(ref contents, false);
                                    //guard before touching Length: a parser may return null
                                    if(clean == null)
                                    {
                                        clean = String.Empty;
                                    }
                                    if(clean.Length>maxContentLength)
                                    {
                                        clean = clean.Substring(0,maxContentLength);
                                    }
                                    urlData.Data = InternetUtils.Base64Encode(clean);
                                    if(urlData.Data == null)
                                    {
                                        urlData.Data = String.Empty;
                                    }
                                    outlinks = parser.ExtractLinks(ref contents, ref urlToCrawl);
                                    if(outlinks == null)
                                    {
                                        outlinks = new ArrayList();
                                    }
                                }
                                else
                                {
                                    //binary formats: the parser turns the raw bytes into text first
                                    contents = parser.ExtractContent(buffer, false);
                                    //guard before touching Length: a parser may return null
                                    if(contents == null)
                                    {
                                        contents = String.Empty;
                                    }
                                    if(contents.Length>maxContentLength)
                                    {
                                        contents = contents.Substring(0,maxContentLength);
                                    }
                                    urlData.Data = InternetUtils.Base64Encode(contents);
                                    if(urlData.Data == null)
                                    {
                                        urlData.Data = String.Empty;
                                    }
                                    //links are looked up in the extracted text, not in the raw bytes
                                    if(parser == pdfParser)
                                    {
                                        outlinks = textParser.ExtractLinks(ref contents, ref urlToCrawl);
                                    }
                                    else
                                    {
                                        outlinks = htmlParser.ExtractLinks(ref contents, ref urlToCrawl);
                                    }
                                    if(outlinks == null)
                                    {
                                        outlinks = new ArrayList();
                                    }
                                }
                            }
                            catch
                            {
                                //a parser failure must not lose the url: store whatever was
                                //extracted so far and never leave Data or the out links null
                                if(outlinks == null)
                                {
                                    outlinks = new ArrayList();
                                }
                                if(urlData.Data == null)
                                {
                                    urlData.Data = String.Empty;
                                }
                            }
                            urlData.OutLinks = new InternetUrlToIndex[outlinks.Count];
                            for(int i = 0; i< outlinks.Count; i++)
                            {
                                urlData.OutLinks[i] = (InternetUrlToIndex)outlinks[i];
                            }
                            //finally update the urlData object with the modified UrlToCrawl
                            urlData.UrlToCrawl = urlToCrawl;
                        }
                    }
                    finally
                    {
                        //make sure the connection is released even if something above threw;
                        //closing a response that has already been closed is harmless
                        try
                        {
                            pageResponse.Close();
                        }
                        catch
                        {
                            //best effort cleanup
                        }
                    }
                }
                //lock and update CrawledUrls
                lock(crawledUrls.SyncRoot)
                {
                    crawledUrls.Add(urlData);
                }
            }
            catch(ThreadAbortException tae)
            {
                //The thread has been asked to abort. Log information and return at once
                if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
                {
                    globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has been asked to abort: " + tae.Message);
                }
                return;
            }
            catch(ThreadInterruptedException tie)
            {
                //The thread has been asked to join. Log information and return at once
                if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
                {
                    globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has been interrupted: " + tie.Message);
                }
                return;
            }
            catch(Exception ex)
            {
                //ThreadAbortException never reaches this clause, it is handled above
                if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
                {
                    globals.FileLog.LogWarning("CrawlUrl running in " + Thread.CurrentThread.Name + " encountered an unexpected exception: " + ex.ToString());
                }
                //rethrow without resetting the stack trace so the caller sees the real origin
                throw;
            }
            finally
            {
                //NOTE(review): forcing a collection after every crawled url is expensive; it is
                //kept only to preserve the existing memory behaviour - consider removing it.
                GC.Collect();
            }
        }
Exemple #4
0
        /// <summary>
        /// DetectConnectionSpeed attempts to detect the computer's internet connection speed
        /// by measuring how much time it takes to download the contents of a web site.
        /// </summary>
        /// <returns>A <see cref="CWConnectionSpeed"/> value containing the estimated internet
        /// connection speed, or <see cref="CWConnectionSpeed.Unknown"/> when the download
        /// fails, returns no data, or no usable elapsed time could be measured.</returns>
        /// <remarks>This method never throws: every failure results in
        /// <see cref="CWConnectionSpeed.Unknown"/>.</remarks>
        public static CWConnectionSpeed DetectConnectionSpeed()
        {
            CWConnectionSpeed retVal = CWConnectionSpeed.Unknown;

            try
            {
                HiResTimer timer = new HiResTimer();
                byte[]     data;
                //the using block disposes the client even if stopping the timer fails
                using (WebClient client = new WebClient())
                {
                    timer.Start();
                    try
                    {
                        data = client.DownloadData("http://www.in.gr/");
                    }
                    catch
                    {
                        //a failed download is reported as an unknown speed, not as an error
                        data = new byte[0];
                    }
                    finally
                    {
                        timer.Stop();
                    }
                }
                double elapsed = timer.Duration;
                //a zero (or negative) duration would produce an infinite or meaningless rate
                if ((data.Length > 0) && (elapsed > 0))
                {
                    //bits per timer unit, computed in floating point to avoid integer truncation
                    //and overflow. Assumes HiResTimer.Duration is in milliseconds, which makes
                    //this value kilobits per second - TODO confirm
                    double Kbps = (data.Length * 8.0) / elapsed;
                    //determine which enumeration value fits best
                    if (!(Kbps > 0))
                    {
                        //zero, negative or NaN: there is nothing sensible to report
                        retVal = CWConnectionSpeed.Unknown;
                    }
                    else if (Kbps <= 56)
                    {
                        retVal = CWConnectionSpeed.Modem56K;
                    }
                    else if (Kbps <= 64)
                    {
                        retVal = CWConnectionSpeed.ISDN64K;
                    }
                    else if (Kbps <= 128)
                    {
                        retVal = CWConnectionSpeed.ISDN128K;
                    }
                    else if (Kbps <= 256)
                    {
                        retVal = CWConnectionSpeed.DSL256K;
                    }
                    else if (Kbps <= 512)
                    {
                        retVal = CWConnectionSpeed.DSL512K;
                    }
                    else if (Kbps <= 1024)
                    {
                        retVal = CWConnectionSpeed.DSL1M;
                    }
                    else if (Kbps <= 1536)
                    {
                        retVal = CWConnectionSpeed.T1;
                    }
                    else if (Kbps <= 46080)
                    {
                        retVal = CWConnectionSpeed.T3;
                    }
                    else if (Kbps < 158720)
                    {
                        retVal = CWConnectionSpeed.Fiber;
                    }
                    else
                    {
                        retVal = CWConnectionSpeed.ATM;
                    }
                }
            }
            catch
            {
                //deliberately best effort: speed detection must never break the caller,
                //so any failure simply leaves the result as Unknown
            }
            return(retVal);
        }