Example #1
 /// <summary>
 /// Constructs a new <see cref="InternetUrlToCrawl"/> object from an existing <see cref="InternetUrlToCrawl"/> object.
 /// </summary>
 /// <param name="IUrl">The existing <see cref="InternetUrlToCrawl"/> object</param>
 public InternetUrlToCrawl(InternetUrlToCrawl IUrl)
 {
     ID  = IUrl.ID;
     Url = IUrl.Url;
     //m_UrlMD5=IUrlVal.MD5;
     crc                   = IUrl.CRC;
     flagDomain            = IUrl.FlagDomain;
     flagFetchRobots       = IUrl.FlagFetchRobots;
     robotsDisallowedPaths = IUrl.RobotsDisallowedPaths;
 }
Example #2
 /// <summary>
 /// Returns the host name of an <see cref="InternetUrlToCrawl"/>
 /// </summary>
 /// <param name="url">The <see cref="InternetUrlToCrawl"/> to examine</param>
 /// <returns>A string containing the Url's host name or IP Address.</returns>
 public static string HostName(InternetUrlToCrawl url)
 {
     try
     {
         Uri uri = new Uri(url.Url);
         return(uri.Host);
     }
     catch
     {
         return(String.Empty);
     }
 }
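A usage sketch, not part of the listed sources: it combines the copy constructor from Example #1 with the host-name helper above. It assumes the parameterless InternetUrlToCrawl constructor and the writable Url property seen in the other examples, and that the static HostName method lives on the InternetUtils helper class used elsewhere in this listing.

 //build a url descriptor, copy it and ask for its host part
 InternetUrlToCrawl original = new InternetUrlToCrawl();
 original.Url = "http://www.example.com/index.html"; //hypothetical address
 InternetUrlToCrawl copy = new InternetUrlToCrawl(original);
 //returns "www.example.com", or String.Empty if the url cannot be parsed
 string host = InternetUtils.HostName(copy);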
Example #3
 /// <summary>
 /// Constructs an instance of the <see cref="UrlCrawlData"/> class and initializes
 /// it with the default values.
 /// </summary>
 public UrlCrawlData()
 {
     url                  = new InternetUrlToCrawl();
     updated              = false;
     redirected           = false;
     redirectedFlagRobots = false;
     redirectedFlagDomain = DomainFlagValue.MustVisit;
     redirectedPriority   = 255;
     httpStatusCode       = HttpStatusCode.OK;
     data                 = String.Empty;
     timeStamp            = DateTime.UtcNow;
     retrievalTime        = 0;
     outLinks             = null;//new ArrayList();
 }
Example #4
 /// <summary>
 /// Extracts links from the contents of a document.
 /// </summary>
 /// <param name="content">The contents of the document.</param>
 /// <param name="contentUrl">The url of the document.</param>
 /// <returns>
 /// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
 /// each link found in the content.
 /// </returns>
 /// <remarks>This method <b>ALWAYS</b> returns an empty ArrayList.</remarks>
 public override ArrayList ExtractLinks(byte[] content, ref InternetUrlToCrawl contentUrl)
 {
     ArrayList links=new ArrayList();
     ParserEventArgs e = new ParserEventArgs(contentUrl.Url);
     OnExtractLinksComplete(e);
     return links;
 }
Example #5
 public SerializedException SendUrlsToCrawl(ClientInfo ci, out InternetUrlToCrawl[] data)
 {
     data = null;
     engine.LogClientAction(ci, CWClientActions.LogSendUrlsToCrawl);
     return engine.SelectUrlsToCrawl(ci, ref data);
 }
Example #6
 /// <summary>
 /// Extracts links from the contents of a SWF document.
 /// </summary>
 /// <param name="content">The contents of the SWF document.</param>
 /// <param name="contentUrl">The url of the SWF document.</param>
 /// <returns>
 /// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
 /// each link found in the content.
 /// </returns>
 /// <exception cref="ArgumentNullException">If the input buffer is null or empty.</exception>
 public override ArrayList ExtractLinks(byte[] content, ref InternetUrlToCrawl contentUrl)
 {
     ArrayList links=null;
     if((content==null)||(content.Length==0))
     {
         throw new ArgumentNullException("content", "The input buffer cannot be empty or null.");
     }
     try
     {
         mutex.WaitOne();
         string FileName = globals.AppWorkPath + Guid.NewGuid().ToString();
         string swfFileName = FileName + ".swf";
         string htmFileName = FileName + ".htm";
         FileStream swf = null;
         StreamReader htm = null;
         try
         {
             //store the swf file
             swf = new FileStream(swfFileName,FileMode.Create);
             swf.Write(content, 0, content.Length);
             swf.Close();
             swf = null;
             //convert it to html
             bool success = converter.ConvertSwfFile(swfFileName, htmFileName);
             if(success)
             {
                 htm = new StreamReader(htmFileName, encoding);
                 string html = htm.ReadToEnd();
                 htm.Close();
                 htm = null;
                 links = parser.ExtractLinks(ref html, ref contentUrl);
             }
         }
         catch(Exception ex)
         {
             if(swf!=null)
             {
                 try
                 {
                     swf.Close();
                 }
                 catch
                 {}
             }
             if(htm!=null)
             {
                 try
                 {
                     htm.Close();
                 }
                 catch
                 {}
             }
             if(globals.Settings.LogLevel <= CWLogLevel.LogInfo)
             {
                 globals.FileLog.LogInfo("SwfParser failed to extract links from " + contentUrl.Url + ": " + ex.ToString());
             }
         }
         finally
         {
             File.Delete(swfFileName);
             File.Delete(htmFileName);
         }
     }
     catch(Exception ex)
     {
         if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
         {
             globals.FileLog.LogWarning("SwfParser failed to extract links from " + contentUrl.Url + ": " + ex.Message);
         }
     }
     finally
     {
         GC.Collect();
         mutex.ReleaseMutex();
     }
     ParserEventArgs e = new ParserEventArgs(contentUrl.Url);
     OnExtractLinksComplete(e);
     return links;
 }
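The SwfParser above follows a write / convert / read / clean-up pattern around an external converter: store the binary content in a temporary file, run the converter, read the produced text back and always delete both temporary files. The sketch below isolates that pattern using only System.IO; the converter delegate is a stand-in, since the real SWF-to-HTML converter type is not shown in this listing.

 using System;
 using System.IO;

 static class ConvertViaTempFile
 {
     //Writes the binary content to a temporary source file, runs the supplied
     //converter to produce a text file, reads the result back and always
     //removes both temporary files, mirroring the finally block above.
     public static string Convert(byte[] content, string workPath, Func<string, string, bool> convert)
     {
         string baseName = Path.Combine(workPath, Guid.NewGuid().ToString());
         string srcFile = baseName + ".src";
         string txtFile = baseName + ".txt";
         try
         {
             File.WriteAllBytes(srcFile, content);
             if(!convert(srcFile, txtFile))
             {
                 return String.Empty; //conversion failed, nothing to parse
             }
             return File.ReadAllText(txtFile);
         }
         finally
         {
             //best-effort clean-up of the temporary files
             try { File.Delete(srcFile); } catch {}
             try { File.Delete(txtFile); } catch {}
         }
     }
 }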
Example #7
 /// <summary>
 /// Extracts links from the contents of a SWF document.
 /// </summary>
 /// <param name="content">The contents of the SWF document.</param>
 /// <param name="contentUrl">The url of the PDF document.</param>
 /// <returns>
 /// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
 /// each link found in the content.
 /// </returns>
 /// <exception cref="NotSupportedException">Whenever this method is called.</exception>
 /// <remarks>
 /// Since a SWF document cannot be converted to a string, this method <b>ALWAYS</b>
 /// throws a <see cref="NotSupportedException"/>.
 /// </remarks>
 public override ArrayList ExtractLinks(ref string content, ref InternetUrlToCrawl contentUrl)
 {
     throw new NotSupportedException();
 }
Example #8
 /// <summary>
 /// Returns the host name of an <see cref="InternetUrlToCrawl"/>
 /// </summary>
 /// <param name="url">The <see cref="InternetUrlToCrawl"/> to examine</param>
 /// <returns>A string containing the Url's host name or IP Address.</returns>
 public static string HostName(InternetUrlToCrawl url)
 {
     try
     {
         Uri uri = new Uri(url.Url);
         return uri.Host;
     }
     catch
     {
         return String.Empty;
     }
 }
Example #9
        /// <summary>
        /// Crawls a Url and creates a <see cref="UrlCrawlData"/> object that is stored in
        /// the internal crawledUrls <see cref="ArrayList"/>. Since it runs in one of the
        /// crawling threads that may be interrupted or aborted at any time it must be able
        /// to handle ThreadAbortException and ThreadInterruptedException.
        /// </summary>
        /// <param name="urlToCrawl">A reference to the <see cref="InternetUrlToCrawl"/>
        /// object that encapsulates the url that must be crawled.</param>
        private void CrawlUrl(ref InternetUrlToCrawl urlToCrawl)
        {
            try
            {
                UrlCrawlData urlData = new UrlCrawlData();
                HiResTimer timer = new HiResTimer();

                //create the web request and download the data
                HttpWebRequest pageRequest = null;
                try
                {
                    pageRequest = (HttpWebRequest)HttpWebRequest.Create(urlToCrawl.Url);
                }
                catch
                {
                    urlData.HttpStatusCode=HttpStatusCode.BadRequest;//TODO comment
                    urlData.Updated=true;
                    urlData.UrlToCrawl = urlToCrawl;
                    urlData.OutLinks = new InternetUrlToIndex[0];
                    urlData.Data = String.Empty;
                    UpdateStatistics(HttpStatusCode.BadRequest, 0);
                    lock(crawledUrls.SyncRoot)
                    {
                        crawledUrls.Add(urlData);
                    }
                    return;
                }
                pageRequest.UserAgent = globals.UserAgent;
                pageRequest.Timeout=ExponentialBackoff.DefaultBackoff; //page timeout = 30 seconds
                HttpWebResponse pageResponse=null;
                try
                {
                    timer.Start();
                    pageResponse = (HttpWebResponse)pageRequest.GetResponse();
                    //the above line might throw either WebException or UriFormatException
                }
                catch(WebException we)
                {
                    HttpWebResponse response=(HttpWebResponse)we.Response;
                    if (response!=null)
                    {
                        //although an exception occurred we're able to get the Status Code
                        urlData.HttpStatusCode=response.StatusCode;
                        urlData.Updated=true;
                        urlData.UrlToCrawl = urlToCrawl;
                        urlData.Data = String.Empty;
                        urlData.OutLinks = new InternetUrlToIndex[0];
                        UpdateStatistics(response.StatusCode, response.ContentLength);
                    }
                    else
                    {
                        urlData.HttpStatusCode=HttpStatusCode.BadRequest;//TODO comment
                        urlData.Updated=true;
                        urlData.UrlToCrawl = urlToCrawl;
                        urlData.Data = String.Empty;
                        urlData.OutLinks = new InternetUrlToIndex[0];
                        UpdateStatistics(HttpStatusCode.BadRequest, 0);
                    }
                }
                catch(UriFormatException)
                {
                    //this will occur if the url is not valid
                    urlData.HttpStatusCode=HttpStatusCode.BadRequest;
                    urlData.Updated=false;
                    urlData.Data = String.Empty;
                    urlData.UrlToCrawl = urlToCrawl;
                    urlData.OutLinks = new InternetUrlToIndex[0];
                }
                finally
                {
                    timer.Stop();
                    urlData.TimeStamp = DateTime.UtcNow;
                }
                if(pageResponse !=null)
                {
                    //update the fields
                    urlData.HttpStatusCode = pageResponse.StatusCode;
                    //download and parse the contents of the url
                    Stream receiveStream=pageResponse.GetResponseStream();
                    StreamReader receivedBytes=new StreamReader(receiveStream,defaultEncoding);
                    string contents = String.Empty;
                    try
                    {
                        contents=receivedBytes.ReadToEnd();
                    }
                    catch
                    {
                        //it should be response timeout not request timeout
                        urlData.HttpStatusCode = HttpStatusCode.RequestTimeout;
                        urlData.Updated = true;
                        urlData.RetrievalTime = (int)timer.Duration;
                        urlData.Data = String.Empty;
                        urlData.OutLinks = new InternetUrlToIndex[0];
                        urlData.UrlToCrawl = urlToCrawl;
                        try
                        {
                            receivedBytes.Close();
                            receiveStream.Close();
                            pageResponse.Close();
                        }
                        catch
                        {}
                        lock(crawledUrls.SyncRoot)
                        {
                            crawledUrls.Add(urlData);
                        }
                        UpdateStatistics(HttpStatusCode.RequestTimeout, 0);
                        return;
                    }
                    byte []buffer=Encoding.ASCII.GetBytes(contents);
                    receiveStream.Close();
                    receivedBytes.Close();
                    UpdateStatistics(pageResponse.StatusCode, contents.Length);
                    string redirectUrl = string.Empty;
                    if (pageResponse.ResponseUri.AbsoluteUri!=urlToCrawl.Url)
                    {//now that was a bloody BUGBUG
                        redirectUrl = pageResponse.ResponseUri.AbsoluteUri;
                        urlData.RedirectedPriority = CleanupRedirectUrl(ref redirectUrl);
                        if(urlToCrawl.Url != redirectUrl)
                        {
                            urlData.Redirected=true;
                            urlToCrawl.Url=redirectUrl;
                        }
                    }
                    Parser parser = SelectParser(pageResponse.ContentType);
                    pageResponse.Close();
                    long CRC = CompressionUtils.BufferCRC(buffer);
                    if(CRC != urlToCrawl.CRC)
                    {
                        urlData.Updated = true;
                        urlToCrawl.CRC = CRC;
                    }
                    if(urlData.Updated)
                    {
                        urlData.RetrievalTime = (int)timer.Duration;
                        //if redirected, calculate robots, domain & priority for redirect url
                        if(urlData.Redirected)
                        {
                            urlData.RedirectedFlagDomain = domainFilter.FilterUrl(ref redirectUrl);
                            urlData.RedirectedFlagRobots = robotsFilter.FilterUrl(ref redirectUrl, ref urlToCrawl, RobotsMetaTagValue.NoMeta);
                        }
                        //perform link extraction and content extraction
                        ArrayList outlinks = null;
                        try
                        {
                            if((parser == htmlParser)||(parser == textParser))
                            {
                                urlData.Data = parser.ExtractContent(ref contents, false);
                                if(urlData.Data == null)
                                {
                                    urlData.Data = String.Empty;
                                }
                                outlinks = parser.ExtractLinks(ref contents, ref urlToCrawl);
                                if(outlinks == null)
                                {
                                    outlinks = new ArrayList();
                                }
                            }
                            else
                            {
                                urlData.Data = parser.ExtractContent(buffer, false);
                                if(urlData.Data == null)
                                {
                                    urlData.Data = String.Empty;
                                }
                                outlinks = parser.ExtractLinks(buffer, ref urlToCrawl);
                                if(outlinks == null)
                                {
                                    outlinks = new ArrayList();
                                }
                            }
                        }
                        catch
                        {
                            if(outlinks == null)
                            {
                                outlinks = new ArrayList();
                            }
                        }
                        urlData.OutLinks = new InternetUrlToIndex[outlinks.Count];
                        for(int i = 0; i< outlinks.Count; i++)
                        {
                            urlData.OutLinks[i] = (InternetUrlToIndex)outlinks[i];
                        }
                        //finally update the urlData object with the modified UrlToCrawl
                        urlData.UrlToCrawl = urlToCrawl;
                    }
                }
                //lock and update CrawledUrls
                lock(crawledUrls.SyncRoot)
                {
                    crawledUrls.Add(urlData);
                }
            }
            catch(ThreadAbortException tae)
            {
                //The thread has been asked to abort. Log information and return at once
                if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
                {
                    globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has been asked to abort: " + tae.Message);
                }
                return;
            }
            catch(ThreadInterruptedException tie)
            {
                //The thread has been asked to join. Log information and return at once
                if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
                {
                    globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has been interrupted: " + tie.Message);
                }
                return;
            }
            catch(Exception ex)
            {
                if(!(ex is ThreadAbortException)) // the ThreadAbortException is rethrown
                {
                    if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
                    {
                        globals.FileLog.LogWarning("CrawlUrl running in " + Thread.CurrentThread.Name + " encountered an unexpected exception: " + ex.ToString());
                    }
                }
                throw; //PerformCrawling should catch this one ?????
            }
            finally
            {
                GC.Collect();
            }
        }
Example #10
 /// <summary>
 /// Extracts links from the contents of a PDF document.
 /// </summary>
 /// <param name="content">The contents of the PDF document.</param>
 /// <param name="contentUrl">The url of the PDF document.</param>
 /// <returns>
 /// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
 /// each link found in the content.
 /// </returns>
 /// <exception cref="NotSupportedException">Whenever this method is called.</exception>
 /// <remarks>
 /// Since a PDF document cannot be converted to a string, this method <b>ALWAYS</b>
 /// throws a <see cref="NotSupportedException"/>.
 /// </remarks>
 public override ArrayList ExtractLinks(ref string content, ref InternetUrlToCrawl contentUrl)
 {
     throw new NotSupportedException("The ExtractLinks method of the PdfParser cannot accept a string as input.");
 }
Example #11
 /// <summary>
 /// Performs the extraction of links from a text document. It can extract simple
 /// links that are separated from the rest of the text using spaces or line breaks
 /// or any other delimiters. The results are returned as an <see cref="ArrayList"/>
 /// of <see cref="InternetUrlToIndex"/> objects.
 /// </summary>
 /// <remarks>
 /// Besides the parsing and extraction of Urls, ExtractLinks also performs other 
 /// tasks as well, such as:<br/>
 /// <list type="bullet">
 ///   <item>
 ///     <description>Filtering of urls to resources of unsupported content-type, e.g. css, images, etc.</description>
 ///   </item>
 ///   <item>
 ///     <description>Filtering of multimple links to the same url and to the document itself.</description>
 ///   </item>
 ///   <item>
 ///     <description>Filtering of session id variables in dynamic Urls and limiting
 ///     of the number of GET variables in dynamic Urls.</description>
 ///   </item>
 ///   <item>
 ///     <description>Flagging of Urls according to their country domain.</description>
 ///   </item>
 /// </list>
 /// <b>Update History</b>
 /// <list type="table">
 ///   <listheader>
 ///		<term>Date</term>
 ///		<description>Description</description>
 ///   </listheader>
 ///   <item>
 ///     <term>15/09/04</term>
 ///     <description>First release. A lot more needs to be done.</description>
 ///   </item>
 /// </list>
 /// </remarks>
 /// <param name="content">The text that must be parsed for links. IIt is passed as
 /// an array of bytes containing the text contents in UTF8 binary format, in order
 /// to reduce memory consumption.</param>
 /// <param name="contentUrl">The Url from which the content comes.</param>
 /// <returns>
 /// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
 /// each link found in the content.
 /// </returns>
 public override ArrayList ExtractLinks(byte[] content, ref InternetUrlToCrawl contentUrl)
 {
     ArrayList retVal = null;
     try
     {
         mutex.WaitOne();
         string html = Encoding.UTF8.GetString(content);
         retVal = ExtractLinks(ref html, ref contentUrl);
     }
     catch
     {}
     finally
     {
         mutex.ReleaseMutex();
     }
     return retVal;
 }
Example #12
        /// <summary>
        /// Performs the extraction of links from a text document. It can extract simple
        /// links that are separated from the rest of the text using spaces or line breaks
        /// or any other delimiters. The results are returned as an <see cref="ArrayList"/>
        /// of <see cref="InternetUrlToIndex"/> objects.
        /// </summary>
        /// <remarks>
        /// Besides the parsing and extraction of Urls, ExtractLinks also performs other 
        /// tasks as well, such as:<br/>
        /// <list type="bullet">
        ///   <item>
        ///     <description>Filtering of urls to resources of unsupported content-type, e.g. css, images, etc.</description>
        ///   </item>
        ///   <item>
        ///     <description>Filtering of multimple links to the same url and to the document itself.</description>
        ///   </item>
        ///   <item>
        ///     <description>Filtering of session id variables in dynamic Urls and limiting
        ///     of the number of GET variables in dynamic Urls.</description>
        ///   </item>
        ///   <item>
        ///     <description>Flagging of Urls according to their country domain.</description>
        ///   </item>
        /// </list>
        /// <b>Update History</b>
        /// <list type="table">
        ///   <listheader>
        ///		<term>Date</term>
        ///		<description>Description</description>
        ///   </listheader>
        ///   <item>
        ///     <term>15/09/04</term>
        ///     <description>First release. A lot more needs to be done.</description>
        ///   </item>
        /// </list>
        /// </remarks>
        /// <param name="content">The text that must be parsed for links. It is passed by
        /// reference in order to reduce memory consumption.</param>
        /// <param name="contentUrl">The Url from which the content comes.</param>
        /// <returns>
        /// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
        /// each link found in the content.
        /// </returns>
        public override ArrayList ExtractLinks(ref string content, ref InternetUrlToCrawl contentUrl)
        {
            ArrayList links = new ArrayList();
            // It is important to notice that if the FlagFetchRobots of the contentUrl is
            // true then the TextParser must remember this value because during the Robots
            // Filtering it will become false so as not to download the robots.txt file
            // every time a Url must be filtered.
            //bool FlagFetchRobots = contentUrl.FlagFetchRobots;
            try
            {
                //make sure only one thread will parse contents at a time.
                //mutex.WaitOne();
                if(contentUrl.FlagDomain!=DomainFlagValue.MustVisit)
                {
                    contentUrl.FlagDomain = ExtractDomainFlag(ref content);

                    if (contentUrl.FlagDomain != DomainFlagValue.MustVisit)
                        if (InternetUtils.HostName(contentUrl).Contains("ebay.com"))
                            contentUrl.FlagDomain = DomainFlagValue.MustVisit;
                }
                //perform the hyperlink matching
                MatchCollection matches = hrefRegex.Matches(content);

                if(matches.Count>0)
                {
                    string documentUrl = contentUrl.Url;
                    string baseUrl = BaseUrl(ref documentUrl);
                    byte priority = 0;

                    foreach(Match m in matches)
                    {
                        try
                        {
                            string url = m.Value.Trim();
                            url = NormalizeUrl(ref url, ref baseUrl);
                            priority = CleanUrlParams(ref url);
                            if(FilterUrl(ref url, ref documentUrl))
                            {
                                InternetUrlToIndex iurl = new InternetUrlToIndex(url);
                                iurl.Priority = priority;
                                iurl.FlagDomain = domainFilter.FilterUrl(ref url);
                                //[mod 24/2/05] No robots.txt checking is performed for non-greek urls
                                if(iurl.FlagDomain == DomainFlagValue.MustVisit)
                                {
                                    iurl.FlagRobots = robotsFilter.FilterUrl(url, contentUrl, RobotsMetaTagValue.NoMeta);
                                }
                                else
                                {
                                    iurl.FlagRobots = false;
                                }
                                if(!links.Contains(iurl))
                                {
                                    links.Add(iurl);
                                }
                            }
                        }
                        catch
                        {
                            if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
                            {
                                globals.FileLog.LogInfo("TextParser failed to parse " + m.Value);
                            }
                            continue;
                        }
                    }
                }
            }
            catch(Exception ex)
            {
                if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
                {
                    globals.FileLog.LogWarning(ex.Message);
                }
            }
            finally
            {
                //mutex.ReleaseMutex();
            }
            //contentUrl.FlagFetchRobots = FlagFetchRobots;
            ParserEventArgs e = new ParserEventArgs(contentUrl.Url);
            OnExtractLinksComplete(e);
            links.TrimToSize();
            return links;
        }
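As a rough, self-contained illustration of the same core steps (regular-expression matching followed by filtering of self-links and duplicates), the sketch below uses a simplistic absolute-url pattern in place of the parser's hrefRegex, NormalizeUrl, CleanUrlParams and filter helpers, which are not shown in this listing.

 using System.Collections.Generic;
 using System.Text.RegularExpressions;

 static class SimpleLinkExtractor
 {
     //very rough stand-in for hrefRegex: absolute http/https urls only
     private static readonly Regex UrlRegex =
         new Regex(@"https?://[^\s""'<>]+", RegexOptions.IgnoreCase | RegexOptions.Compiled);

     //returns each distinct url found in the text, skipping links back to the document itself
     public static List<string> ExtractLinks(string content, string documentUrl)
     {
         List<string> links = new List<string>();
         foreach(Match m in UrlRegex.Matches(content))
         {
             string url = m.Value.Trim();
             if(url == documentUrl || links.Contains(url))
             {
                 continue; //filter self-links and duplicates, as the real parser does
             }
             links.Add(url);
         }
         return links;
     }
 }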
Example #13
 /// <summary>
 /// Checks if the Robots Exclusion Standard allows the crawler to visit a url.
 /// </summary>
 /// <param name="targetUrl">The url that is to be validated.</param>
 /// <param name="sourceUrl">The <see cref="InternetUrlToCrawl"/> containing the targetUrl.</param>
 /// <param name="robotsMeta">A <see cref="RobotsMetaTagValue"/> flag indicating the 
 /// restrictions posed by the robots meta tag contained in the sourceUrl.</param>
 /// <returns> A <see cref="Boolean"/> value indicating whether the crawler is 
 /// allowed (false) or disallowed (true) to visit the target Url.</returns>
 /// <remarks>This method is safe for multi-threaded operations. However only one
 /// thread will be able to perform a check at any given time.
 /// </remarks>
 public bool FilterUrl(string targetUrl, InternetUrlToCrawl sourceUrl, RobotsMetaTagValue robotsMeta)
 {
     bool retVal = false; //assume that it's allowed to crawl the targetUrl
     try
     {
         mutex.WaitOne();
         //perhaps we should use the hash code of the hostnames as keys.
         string targetHost = InternetUtils.HostName(targetUrl);
         string sourceHost = InternetUtils.HostName(sourceUrl);
         RobotsTxtEntry robots = null;
         //Do we need to fetch the robots.txt for the source Url?
         if(sourceUrl.FlagFetchRobots)
         {
             //we must fetch the robots.txt from the source url host and update sourceUrl.
             robots = FetchRobots(sourceHost);
             sourceUrl.RobotsDisallowedPaths = ConcatenatePaths(robots.DisallowedPaths);
             sourceUrl.FlagFetchRobots = false; //fetch it only once
             //check if it exists in the Hashtable, if so update it, otherwise add it
             if(robotsTable.ContainsKey(sourceHost))
             {
                 robotsTable[sourceHost] = robots;
             }
             else
             {
                 robotsTable.Add(sourceHost, robots);
             }
         }
         else
         {
             //check if it exists in the Hashtable. If so check if it has expired, else just get it from InternetUrlToCrawl
             if(!robotsTable.TryGetValue(sourceHost, out robots))
             {
                 robots = new RobotsTxtEntry();
                 robots.DisallowedPaths = SplitPaths(sourceUrl.RobotsDisallowedPaths);
                 robotsTable.Add(sourceHost, robots);
             }
             else
             {
                 if(robots.ExpirationDate<DateTime.Today)
                 {
                     robots = FetchRobots(sourceHost);
                     robotsTable[sourceHost] = robots;
                 }
             }
         }
         if(targetHost != sourceHost)
         {
             //the target url is on a different host, we must get its robots.txt
             if(!robotsTable.TryGetValue(targetHost, out robots))
             {
                 robots = FetchRobots(targetHost);
                 robotsTable.Add(targetHost, robots);
             }
             else
             {
                 if(robots.ExpirationDate<DateTime.Today)
                 {
                     robots = FetchRobots(targetHost);
                     robotsTable[targetHost] = robots;
                 }
             }
         }
         if((robotsMeta & RobotsMetaTagValue.NoFollow)>0)
         {
             //if the meta tag has the NoFollow option set then we cannot crawl targetUrl
             retVal = true;
         }
         else
         {
             robots = robotsTable[targetHost];
             //if the DisallowedPaths is null then we can crawl targetUrl, otherwise we must check the disallowed paths
             if(robots.DisallowedPaths!=null)
             {
                 for(int i = 0; i < robots.DisallowedPaths.Length; i++)
                 {
                     if(targetUrl.IndexOf(robots.DisallowedPaths[i])!=-1)
                     {
                         //we found a match. It is therefore not allowed to crawl targetUrl
                         retVal = true;
                         break; //stop searching as soon as we have a match
                     }
                 }
             }
         }
     }
     catch(Exception e)
     {
         if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
         {
             globals.FileLog.LogWarning("RobotsFilter failed to filter " + targetUrl + ": " + e.ToString());
         }
     }
     finally
     {
         mutex.ReleaseMutex();
     }
     return retVal;
 }
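The decision at the end of the method boils down to a substring test of the target url against each disallowed path. A stand-alone version of just that test is sketched below; the robots.txt fetching and per-host caching shown above are omitted.

 static class RobotsPathCheck
 {
     //Returns true if the target url matches any disallowed path, i.e. the crawler
     //is NOT allowed to visit it - the same convention FilterUrl uses above.
     public static bool IsDisallowed(string targetUrl, string[] disallowedPaths)
     {
         if(disallowedPaths == null)
         {
             return false; //no robots.txt restrictions recorded for this host
         }
         for(int i = 0; i < disallowedPaths.Length; i++)
         {
             if(targetUrl.IndexOf(disallowedPaths[i]) != -1)
             {
                 return true; //stop searching as soon as there is a match
             }
         }
         return false;
     }
 }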
Example #14
 /// <summary>
 /// Constructs an instance of the <see cref="UrlCrawlData"/> class and initializes
 /// it with the default values.
 /// </summary>
 public UrlCrawlData()
 {
     url=new InternetUrlToCrawl();
     updated=false;
     redirected=false;
     redirectedFlagRobots = false;
     redirectedFlagDomain = DomainFlagValue.MustVisit;
     redirectedPriority = 255;
     httpStatusCode=HttpStatusCode.OK;
     data=String.Empty;
     timeStamp=DateTime.UtcNow;
     retrievalTime=0;
     outLinks=null;//new ArrayList();
 }
Example #15
 /// <summary>
 /// Extracts the hypertext references (links) contained in a document.
 /// </summary>
 /// <param name="content">
 /// The content of the document that will be parsed for links.
 /// </param>
 /// <param name="contentUrl">
 /// An <see cref="InternetUrlToCrawl"/> object encapsulating the Uri address of the
 /// document to be parsed for links and its associated robots.txt file.
 /// </param>
 /// <returns>
 /// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects encapsulating
 /// the links contained in the parsed document.
 /// </returns>
 public abstract ArrayList ExtractLinks(ref string content, ref InternetUrlToCrawl contentUrl);
Example #16
 /// <summary>
 /// Extracts the hypertext references (links) contained in a document.
 /// </summary>
 /// <param name="content">
 /// An array of bytes holding the content of the document that will be parsed for links.
 /// </param>
 /// <param name="contentUrl">
 /// An <see cref="InternetUrlToCrawl"/> object encapsulating the Uri address of the
 /// document to be parsed for links and its associated robots.txt file.
 /// </param>
 /// <returns>
 /// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects encapsulating
 /// the links contained in the parsed document.
 /// </returns>
 public abstract ArrayList ExtractLinks(byte[] content, ref InternetUrlToCrawl contentUrl);
Example #17
 /// <summary>
 /// Constructs a new <see cref="InternetUrlToCrawl"/> object from an existing <see cref="InternetUrlToCrawl"/> object.
 /// </summary>
 /// <param name="IUrl">The existing <see cref="InternetUrlToCrawl"/> object</param>
 public InternetUrlToCrawl(InternetUrlToCrawl IUrl)
 {
     ID = IUrl.ID;
     Url = IUrl.Url;
     //m_UrlMD5=IUrlVal.MD5;
     crc = IUrl.CRC;
     flagDomain = IUrl.FlagDomain;
     flagFetchRobots = IUrl.FlagFetchRobots;
     robotsDisallowedPaths = IUrl.RobotsDisallowedPaths;
 }
Example #18
 /// <summary>
 /// Extracts links from the contents of a PDF document.
 /// </summary>
 /// <param name="content">The contents of the PDF document.</param>
 /// <param name="contentUrl">The url of the PDF document.</param>
 /// <returns>
 /// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
 /// each link found in the content.
 /// </returns>
 /// <exception cref="ArgumentNullException">If the input buffer is null or empty.</exception>
 public override ArrayList ExtractLinks(byte[] content, ref InternetUrlToCrawl contentUrl)
 {
     ArrayList links=null;
     if((content==null)||(content.Length==0))
     {
         throw new ArgumentNullException("content", "The input buffer cannot be empty or null.");
     }
     try
     {
         mutex.WaitOne();
         string FileName = globals.AppWorkPath + Guid.NewGuid().ToString();
         string pdfFileName = FileName + ".pdf";
         string txtFileName = FileName + ".txt";
         FileStream pdf = null;
         StreamReader txt = null;
         try
         {
             //store the pdf file
             pdf = new FileStream(pdfFileName,FileMode.Create);
             pdf.Write(content, 0, content.Length);
             pdf.Close();
             pdf = null;
             bool success = false;
             //convert it to text
             try
             {
                 converter.loadFile(pdfFileName);
                 converter.convertToTextFile(1, converter.numPages, txtFileName);
                 success = true;
             }
             catch
             {
                 success = false;
             }
             finally
             {
                 converter.closeFile();
             }
             if(success)
             {
                 txt = new StreamReader(txtFileName, encoding);
                 string text = txt.ReadToEnd();
                 txt.Close();
                 txt = null;
                 links = parser.ExtractLinks(ref text, ref contentUrl);
             }
             else
             {
                 txt = null;
             }
         }
         catch(Exception ex)
         {
             if(pdf!=null)
             {
                 try
                 {
                     pdf.Close();
                 }
                 catch
                 {}
             }
             if(txt!=null)
             {
                 try
                 {
                     txt.Close();
                 }
                 catch
                 {}
             }
             if(globals.Settings.LogLevel <= CWLogLevel.LogInfo)
             {
                 globals.FileLog.LogWarning("PdfParser failed to extract links from " + contentUrl.Url + ": " + ex.ToString());
             }
         }
         finally
         {
             File.Delete(pdfFileName);
             File.Delete(txtFileName);
         }
     }
     catch
     {
         if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
         {
             globals.FileLog.LogWarning("PdfParser failed to extract links from " + contentUrl.Url);
         }
     }
     finally
     {
         GC.Collect();
         mutex.ReleaseMutex();
     }
     ParserEventArgs e = new ParserEventArgs(contentUrl.Url);
     OnExtractLinksComplete(e);
     return links;
 }
Example #19
        /// <summary>
        /// Crawls a Url and creates a <see cref="UrlCrawlData"/> object that is stored in
        /// the internal crawledUrls <see cref="ArrayList"/>. Since it runs in one of the
        /// crawling threads that may be interrupted or aborted at any time it must be able
        /// to handle ThreadAbortException and ThreadInterruptedException.
        /// </summary>
        /// <param name="urlToCrawl">A reference to the <see cref="InternetUrlToCrawl"/>
        /// object that encapsulates the url that must be crawled.</param>
        internal void CrawlUrl(ref InternetUrlToCrawl urlToCrawl)
        {
            try
            {
                UrlCrawlData urlData = new UrlCrawlData();
                HiResTimer timer = new HiResTimer();

                //create the web request and download the data
                HttpWebRequest pageRequest = null;
                try
                {
                    pageRequest = (HttpWebRequest)HttpWebRequest.Create(urlToCrawl.Url);
                }
                catch
                {
                    urlData.HttpStatusCode=HttpStatusCode.BadRequest;//TODO comment
                    urlData.Updated=true;
                    urlData.UrlToCrawl = urlToCrawl;
                    urlData.OutLinks = new InternetUrlToIndex[0];
                    urlData.Data = String.Empty;
                    UpdateStatistics(HttpStatusCode.BadRequest, 0);
                    lock(crawledUrls.SyncRoot)
                    {
                        crawledUrls.Add(urlData);
                    }
                    return;
                }
                pageRequest.UserAgent = globals.UserAgent;
                pageRequest.Timeout=Backoff.DefaultBackoff; //page timeout = 30 seconds
                pageRequest.KeepAlive = false;
                HttpWebResponse pageResponse=null;
                try
                {
                    timer.Start();
                    pageResponse = (HttpWebResponse)pageRequest.GetResponse();
                    //the above line might throw either WebException or UriFormatException
                }
                catch(WebException we)
                {
                    HttpWebResponse response=(HttpWebResponse)we.Response;
                    if (response!=null)
                    {
                        //although an exception occurred we're able to get the Status Code
                        urlData.HttpStatusCode=response.StatusCode;
                        urlData.Updated=true;
                        urlData.UrlToCrawl = urlToCrawl;
                        urlData.Data = String.Empty;
                        urlData.OutLinks = new InternetUrlToIndex[0];
                        UpdateStatistics(response.StatusCode, response.ContentLength);
                        response.Close();
                    }
                    else
                    {
                        urlData.HttpStatusCode=HttpStatusCode.BadRequest;//TODO comment
                        urlData.Updated=true;
                        urlData.UrlToCrawl = urlToCrawl;
                        urlData.Data = String.Empty;
                        urlData.OutLinks = new InternetUrlToIndex[0];
                        UpdateStatistics(HttpStatusCode.BadRequest, 0);
                    }
                }
                catch(UriFormatException)
                {
                    //this will occur if the url is not valid
                    urlData.HttpStatusCode=HttpStatusCode.BadRequest;
                    urlData.Updated=false;
                    urlData.Data = String.Empty;
                    urlData.UrlToCrawl = urlToCrawl;
                    urlData.OutLinks = new InternetUrlToIndex[0];
                }
                finally
                {
                    timer.Stop();
                    urlData.TimeStamp = DateTime.UtcNow;
                }
                if(pageResponse !=null)
                {
                    //update the fields
                    urlData.HttpStatusCode = pageResponse.StatusCode;
                    //download and parse the contents of the url
                    Stream receiveStream=pageResponse.GetResponseStream();
                    /*StreamReader receivedBytes=new StreamReader(receiveStream,defaultEncoding);*/
                    MemoryStream receivedBytes = new MemoryStream();
                    byte [] buffer = new byte[4096];
                    int read = 0;
                    try
                    {
                        while((read=receiveStream.Read(buffer,0,4096))>0)
                        {
                            receivedBytes.Write(buffer,0,read);
                        }
                    }
                    catch
                    {
                        //it should be response timeout not request timeout
                        urlData.HttpStatusCode = HttpStatusCode.RequestTimeout;
                        urlData.Updated = true;
                        urlData.RetrievalTime = (int)timer.Duration;
                        urlData.Data = String.Empty;
                        urlData.OutLinks = new InternetUrlToIndex[0];
                        urlData.UrlToCrawl = urlToCrawl;
                        try
                        {
                            receivedBytes.Close();
                            receiveStream.Close();
                            pageResponse.Close();
                        }
                        catch
                        {}
                        lock(crawledUrls.SyncRoot)
                        {
                            crawledUrls.Add(urlData);
                        }
                        UpdateStatistics(HttpStatusCode.RequestTimeout, 0);
                        return;
                    }
                    buffer = receivedBytes.ToArray();
                    Parser parser = SelectParser(pageResponse.ContentType);
                    string contents = String.Empty;
                    if(parser == htmlParser)
                    {
                        Encoding encoding = null;
                        switch(pageResponse.ContentEncoding)
                        {
                            case "":
                            case "none":
                                contents = this.defaultEncoding.GetString(buffer, 0, buffer.Length);
                                //re-check the encoding
                                encoding = DetectContentEncoding(ref contents);
                                if(encoding != defaultEncoding)
                                {
                                    contents = encoding.GetString(buffer, 0, buffer.Length);
                                }
                                break;

                            case "gzip":
                            case "x-gzip":
                                //first decompress the stream and then re-check the encoding
                                byte [] decompressed_buffer = new byte [0];
                                DecompressGzippedContent(buffer, out decompressed_buffer);
                                contents = this.defaultEncoding.GetString(decompressed_buffer, 0, decompressed_buffer.Length);
                                //re-check the encoding
                                encoding = DetectContentEncoding(ref contents);
                                if(encoding != defaultEncoding)
                                {
                                    contents = encoding.GetString(decompressed_buffer, 0, decompressed_buffer.Length);
                                }
                                break;

                            default:
                                try
                                {
                                    encoding = Encoding.GetEncoding(pageResponse.ContentEncoding);
                                    contents = encoding.GetString(buffer, 0, buffer.Length);
                                }
                                catch//(NotSupportedException)
                                {
                                    encoding = defaultEncoding;
                                    //the encoding specified is unsupported.
                                    contents = String.Empty;
                                }
                                break;
                        }
                    }
                    else
                    {
                        if(parser == textParser)
                        {
                            try
                            {
                                contents = this.defaultEncoding.GetString(buffer, 0, buffer.Length);
                            }
                            catch
                            {
                                //something went seriously wrong here. The crawler got a header that says the server is
                                //sending back a plain text document but for some reason we can't get the string contents.
                                contents = String.Empty;
                            }
                        }
                    }
                    receiveStream.Close();
                    receivedBytes.Close();
                    UpdateStatistics(pageResponse.StatusCode, buffer.Length);
                    string redirectUrl = string.Empty;
                    if (pageResponse.ResponseUri.AbsoluteUri!=urlToCrawl.Url)
                    {
                        redirectUrl = pageResponse.ResponseUri.AbsoluteUri;
                        urlData.RedirectedPriority = htmlParser.CleanUrlParams(ref redirectUrl);//CleanupRedirectUrl(ref redirectUrl);
                        if(urlToCrawl.Url != redirectUrl)//now that was a bloody BUGBUG
                        {
                            urlData.Redirected=true;
                            urlToCrawl.Url=redirectUrl;
                        }
                    }
                    pageResponse.Close();
                    long CRC = CompressionUtils.BufferCRC(buffer);
                    if(CRC != urlToCrawl.CRC)
                    {
                        urlData.Updated = true;
                        urlToCrawl.CRC = CRC;
                    }
                    if(urlData.Updated)
                    {
                        urlData.RetrievalTime = (int)timer.Duration;
                        //if redirected, calculate robots, domain & priority for redirect url
                        if(urlData.Redirected)
                        {
                            urlData.RedirectedFlagDomain = domainFilter.FilterUrl(ref redirectUrl);
                            urlData.RedirectedFlagRobots = robotsFilter.FilterUrl(redirectUrl, urlToCrawl, RobotsMetaTagValue.NoMeta);
                        }
                        //perform link extraction and content extraction
                        ArrayList outlinks = null;
                        try
                        {
                            if((parser == htmlParser)||(parser == textParser))
                            {
                                string clean = parser.ExtractContent(ref contents, false);
                                if(clean.Length>1048576)
                                {
                                    clean = clean.Substring(0,1048576);
                                }
                                urlData.Data = InternetUtils.Base64Encode(clean);
                                if(urlData.Data == null)
                                {
                                    urlData.Data = String.Empty;
                                }
                                outlinks = parser.ExtractLinks(ref contents, ref urlToCrawl);
                                if(outlinks == null)
                                {
                                    outlinks = new ArrayList();
                                }
                            }
                            else
                            {
                                contents = parser.ExtractContent(buffer, false);
                                if(contents.Length>1048576)
                                {
                                    contents = contents.Substring(0,1048576);
                                }
                                urlData.Data = InternetUtils.Base64Encode(contents);
                                if(urlData.Data == null)
                                {
                                    urlData.Data = String.Empty;
                                }
                                if(parser == pdfParser)
                                {
                                    outlinks = textParser.ExtractLinks(ref contents, ref urlToCrawl);
                                }
                                else
                                {
                                    outlinks = htmlParser.ExtractLinks(ref contents, ref urlToCrawl);
                                }
                                if(outlinks == null)
                                {
                                    outlinks = new ArrayList();
                                }
                            }
                        }
                        catch
                        {
                            if(outlinks == null)
                            {
                                outlinks = new ArrayList();
                            }
                        }
                        urlData.OutLinks = new InternetUrlToIndex[outlinks.Count];
                        for(int i = 0; i< outlinks.Count; i++)
                        {
                            urlData.OutLinks[i] = (InternetUrlToIndex)outlinks[i];
                        }
                        //finally update the urlData object with the modified UrlToCrawl
                        urlData.UrlToCrawl = urlToCrawl;
                    }
                }
                //lock and update CrawledUrls
                lock(crawledUrls.SyncRoot)
                {
                    crawledUrls.Add(urlData);
                }
            }
            catch(ThreadAbortException tae)
            {
                //The thread has been asked to abort. Log information and return at once
                if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
                {
                    globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has been asked to abort: " + tae.Message);
                }
                return;
            }
            catch(ThreadInterruptedException tie)
            {
                //The thread has been asked to join. Log information and return at once
                if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
                {
                    globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has been interrupted: " + tie.Message);
                }
                return;
            }
            catch(Exception ex)
            {
                if(!(ex is ThreadAbortException)) // the ThreadAbortException is rethrown
                {
                    if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
                    {
                        globals.FileLog.LogWarning("CrawlUrl running in " + Thread.CurrentThread.Name + " encountered an unexpected exception: " + ex.ToString());
                    }
                }
                throw; //PerformCrawling should catch this one ?????
            }
            finally
            {
                GC.Collect();
            }
        }
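In both versions of CrawlUrl the Updated flag is driven by a checksum comparison: link and content extraction only run when the CRC of the downloaded buffer differs from the value stored with the url. The sketch below illustrates the same change-detection idea, using an MD5 hash from System.Security.Cryptography as a stand-in for CrawlWave's CompressionUtils.BufferCRC.

 using System;
 using System.Security.Cryptography;

 static class ChangeDetection
 {
     //computes a short fingerprint of the downloaded buffer
     public static string Fingerprint(byte[] buffer)
     {
         using(MD5 md5 = MD5.Create())
         {
             return Convert.ToBase64String(md5.ComputeHash(buffer));
         }
     }

     //returns true (and updates the stored fingerprint) when the content has changed
     public static bool HasChanged(byte[] buffer, ref string storedFingerprint)
     {
         string current = Fingerprint(buffer);
         if(current == storedFingerprint)
         {
             return false; //unchanged: skip link and content extraction
         }
         storedFingerprint = current;
         return true;
     }
 }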
Example #20
 /// <summary>
 /// Selects and returns a set of urls that are ready to be crawled.
 /// </summary>
 /// <param name="ci">The <see cref="ClientInfo"/> of the client requesting urls to crawl.</param>
 /// <param name="data">An array of <see cref="InternetUrlToCrawl"/> objects containing the selected urls.</param>
 /// <returns>Null if the operation succeeds, or <see cref="SerializedException"/> 
 /// encapsulating the error that occured if the operation fails.</returns>
 public SerializedException SelectUrlsToCrawl(ClientInfo ci, ref InternetUrlToCrawl[] data)
 {
     SerializedException sx = null;
     try
     {
         if (!ConnectToDatabase())
         {
             throw new CWDBConnectionFailedException();
         }
         //we must use a transaction to make sure that if something goes wrong the
         //changes to the database will be rolled back.
         SqlTransaction transaction = dbcon.BeginTransaction(IsolationLevel.Serializable);//perhaps | repeatableread
         try
         {
             //first select the urls to crawl
             SqlCommand cmd = new SqlCommand("cw_select_urls_to_crawl", dbcon, transaction);
             cmd.CommandType = CommandType.StoredProcedure;
             cmd.CommandTimeout = 120;
             SqlDataAdapter da = new SqlDataAdapter(cmd);
             DataSet ds = new DataSet();
             da.Fill(ds);
             da.Dispose();
             cmd.Dispose();
             //now delete them from the table of urls to crawl
             data = new InternetUrlToCrawl[ds.Tables[0].Rows.Count];
             if (data.Length > 0)
             {
                 int i = 0;
                 foreach (DataRow dr in ds.Tables[0].Rows)
                 {
                     try
                     {
                         InternetUrlToCrawl url = new InternetUrlToCrawl((int)dr[0], (string)dr[1]);
                         if (dr[2] != DBNull.Value)
                         {
                             url.CRC = (long)dr[2];
                         }
                         if (dr[3] != DBNull.Value)
                         {
                             url.FlagDomain = (DomainFlagValue)((byte)dr[3]);
                         }
                         if (dr[4] != DBNull.Value)
                         {
                             url.RobotsDisallowedPaths = (string)dr[4];
                         }
                         else
                         {
                             RobotsTxtEntry entry = settings.Robots.GetEntry(InternetUtils.HostName(url));
                             if (entry != null)
                             {
                                 url.RobotsDisallowedPaths = ConcatenatePaths(entry.DisallowedPaths);
                             }
                             else
                             {
                                 url.FlagFetchRobots = true;
                             }
                         }
                         data[i++] = url;
                     }
                     catch
                     {
                         continue;
                     }
                 }
                 SqlCommand statscmd = new SqlCommand("cw_update_client_statistics", dbcon, transaction);
                 statscmd.CommandType = CommandType.StoredProcedure;
                 statscmd.CommandTimeout = 120;
                 statscmd.Parameters.Add("@client_id", SqlDbType.UniqueIdentifier);
                 statscmd.Parameters.Add("@assigned", SqlDbType.BigInt);
                 statscmd.Parameters.Add("@returned", SqlDbType.BigInt);
                 statscmd.Parameters.Add("@type", SqlDbType.TinyInt);
                 statscmd.Parameters[0].Value = ci.ClientID;
                 statscmd.Parameters[1].Value = data.Length;
                 statscmd.Parameters[2].Value = DBNull.Value;
                 statscmd.Parameters[3].Value = 0;
                 statscmd.ExecuteNonQuery();
                 statscmd.Dispose();
                 transaction.Commit();
             }
         }
         catch (Exception ex)
         {
             transaction.Rollback();
             if (settings.LogLevel <= CWLogLevel.LogWarning)
             {
                 settings.Log.LogWarning("SelectUrlsToCrawl failed, Transaction was rolled back: " + ex.ToString());
             }
              throw;
         }
         finally
         {
             UpdateClientLastActive(ci);
             LogClientAction(ci, CWClientActions.LogSendUrlsToCrawl);
             if (!DisconnectFromDatabase())
             {
                 throw new CWDBConnectionFailedException("Disconnect from database failure.");
             }
         }
     }
     catch (Exception e)
     {
         sx = new SerializedException(e.GetType().ToString(), e.Message, e.ToString());
         if (settings.LogLevel <= CWLogLevel.LogWarning)
         {
             settings.Log.LogWarning("SelectUrlsToCrawl failed: " + e.ToString());
         }
     }
     return sx;
 }
 /// <summary>
 /// Checks if at least 30 seconds have passed since the last request to a given host
 /// was made, in order not to slam it with simultaneous or frequent requests.
 /// </summary>
 /// <param name="targetUrl">
 /// A <see cref="InternetUrlToCrawl"/> that is served by a host we wish to check.
 /// </param>
 /// <returns>
 /// An integer containing the number of milliseconds a crawler thread must wait
 /// before visiting this host.
 /// </returns>
 public int FilterUrl(ref InternetUrlToCrawl targetUrl)
 {
     string hostName = InternetUtils.HostName(targetUrl);
     return FilterHost(ref hostName);
 }
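FilterHost itself is not part of this listing. Under that caveat, the class below sketches one straightforward way to implement the behaviour the summary describes: remember the time of the last request to each host and tell the caller how many milliseconds it still has to wait before the 30-second window has elapsed.

 using System;
 using System.Collections.Generic;

 class HostPolitenessFilter
 {
     private const int MinIntervalMillis = 30000; //at least 30 seconds between requests to a host
     private readonly Dictionary<string, DateTime> lastVisit = new Dictionary<string, DateTime>();
     private readonly object sync = new object();

     //returns the number of milliseconds to wait before this host may be visited again
     public int FilterHost(string hostName)
     {
         lock(sync)
         {
             DateTime now = DateTime.UtcNow;
             DateTime previous;
             if(lastVisit.TryGetValue(hostName, out previous))
             {
                 int elapsed = (int)(now - previous).TotalMilliseconds;
                 if(elapsed < MinIntervalMillis)
                 {
                     return MinIntervalMillis - elapsed; //caller must wait this long
                 }
             }
             lastVisit[hostName] = now; //record the (imminent) visit
             return 0; //safe to visit immediately
         }
     }
 }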