Example #1
        /// <summary>
        /// Extracts links from the contents of a document.
        /// </summary>
        /// <param name="content">The contents of the document.</param>
        /// <param name="contentUrl">The url of the document.</param>
        /// <returns>
        /// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
        /// each link found in the content.
        /// </returns>
        /// <remarks>This method <b>ALWAYS</b> returns an empty ArrayList.</remarks>
        public override ArrayList ExtractLinks(byte[] content, ref InternetUrlToCrawl contentUrl)
        {
            ArrayList       links = new ArrayList();
            ParserEventArgs e     = new ParserEventArgs(contentUrl.Url);

            OnExtractLinksComplete(e);
            return(links);
        }
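        A minimal caller-side sketch of the no-op override above; the NullParser class name, the InternetUrlToCrawl constructor arguments and the example address are assumptions used only for illustration:

        // Hypothetical usage sketch: the override raises the completion event and
        // always hands back an empty list, whatever content is supplied.
        NullParser parser = new NullParser();
        InternetUrlToCrawl url = new InternetUrlToCrawl(1, "http://www.example.com/");
        ArrayList links = parser.ExtractLinks(new byte[0], ref url);
        Console.WriteLine(links.Count);   // always prints 0 for this parser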
Example #2
        /// <summary>
        /// Performs the extraction of links from a text document. It can extract simple
        /// links that are separated from the rest of the text by spaces, line breaks
        /// or any other delimiters. The results are returned as an <see cref="ArrayList"/>
        /// of <see cref="InternetUrlToIndex"/> objects.
        /// </summary>
        /// <remarks>
        /// Besides parsing and extracting Urls, ExtractLinks also performs other tasks,
        /// such as:<br/>
        /// <list type="bullet">
        ///   <item>
        ///     <description>Filtering of urls to resources of unsupported content-type, e.g. css, images, etc.</description>
        ///   </item>
        ///   <item>
        ///     <description>Filtering of multiple links to the same url and to the document itself.</description>
        ///   </item>
        ///   <item>
        ///     <description>Filtering of session id variables in dynamic Urls and limiting
        ///     of the number of GET variables in dynamic Urls.</description>
        ///   </item>
        ///   <item>
        ///     <description>Flagging of Urls according to their country domain.</description>
        ///   </item>
        /// </list>
        /// <b>Update History</b>
        /// <list type="table">
        ///   <listheader>
        ///     <term>Date</term>
        ///     <description>Description</description>
        ///   </listheader>
        ///   <item>
        ///     <term>15/09/04</term>
        ///     <description>First release. A lot more needs to be done.</description>
        ///   </item>
        /// </list>
        /// </remarks>
        /// <param name="content">The text that must be parsed for links. It is passed as
        /// an array of bytes containing the text contents in UTF8 binary format, in order
        /// to reduce memory consumption.</param>
        /// <param name="contentUrl">The Url from which the content comes.</param>
        /// <returns>
        /// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
        /// each link found in the content.
        /// </returns>
        public override ArrayList ExtractLinks(byte[] content, ref InternetUrlToCrawl contentUrl)
        {
            ArrayList retVal = null;

            try
            {
                mutex.WaitOne();
                string html = Encoding.UTF8.GetString(content);
                retVal = ExtractLinks(ref html, ref contentUrl);
            }
            catch
            {}
            finally
            {
                mutex.ReleaseMutex();
            }
            return(retVal);
        }
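        A hedged usage sketch for the byte[] overload above; textParser, the InternetUrlToCrawl constructor arguments, the example address and the Url property of InternetUrlToIndex are assumptions, not part of the documented surface:

        // Encode a small document as UTF-8 bytes, the format this overload expects,
        // and let the parser decode it and delegate to the string-based overload.
        byte[] buffer = Encoding.UTF8.GetBytes("See http://www.example.com/news.html for details.");
        InternetUrlToCrawl pageUrl = new InternetUrlToCrawl(1, "http://www.example.com/");
        ArrayList found = textParser.ExtractLinks(buffer, ref pageUrl);
        if (found != null) // a null result means the parsing attempt failed and was swallowed
        {
            foreach (InternetUrlToIndex link in found)
            {
                Console.WriteLine(link.Url);
            }
        }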
Example #3
        /// <summary>
        /// Checks if at least 30 seconds have passed since the last request to a given host
        /// was made, in order not to hammer it with simultaneous or overly frequent requests.
        /// </summary>
        /// <param name="targetUrl">
        /// A <see cref="InternetUrlToCrawl"/> that is served by a host we wish to check.
        /// </param>
        /// <returns>
        /// An integer containing the number of milliseconds a crawler thread must wait
        /// before visiting this host.
        /// </returns>
        public int FilterUrl(ref InternetUrlToCrawl targetUrl)
        {
            string hostName = InternetUtils.HostName(targetUrl);

            return(FilterHost(ref hostName));
        }
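        A short politeness sketch built around the return value above; hostRequestFilter and the surrounding crawler loop are hypothetical:

        // Ask the filter how long to wait before hitting this host again, and sleep it out.
        int waitMilliseconds = hostRequestFilter.FilterUrl(ref targetUrl);
        if (waitMilliseconds > 0)
        {
            Thread.Sleep(waitMilliseconds); // requires the System.Threading namespace
        }
        // ...the crawler thread can now download targetUrl without hammering the host...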
Example #4
        /// <summary>
        /// Selects and returns a set of urls that are ready to be crawled.
        /// </summary>
        /// <param name="ci">The <see cref="ClientInfo"/> of the client requesting urls to crawl.</param>
        /// <param name="data">An array of <see cref="InternetUrlToCrawl"/> objects containing the selected urls.</param>
        /// <returns>Null if the operation succeeds, or a <see cref="SerializedException"/>
        /// encapsulating the error that occurred if the operation fails.</returns>
        public SerializedException SelectUrlsToCrawl(ClientInfo ci, ref InternetUrlToCrawl[] data)
        {
            SerializedException sx = null;

            try
            {
                if (!ConnectToDatabase())
                {
                    throw new CWDBConnectionFailedException();
                }
                //we must use a transaction to make sure that if something goes wrong the
                //changes to the database will be rolled back.
                SqlTransaction transaction = dbcon.BeginTransaction(IsolationLevel.Serializable);                //perhaps | repeatableread
                try
                {
                    //first select the urls to crawl
                    SqlCommand cmd = new SqlCommand("cw_select_urls_to_crawl", dbcon, transaction);
                    cmd.CommandType    = CommandType.StoredProcedure;
                    cmd.CommandTimeout = 120;
                    SqlDataAdapter da = new SqlDataAdapter(cmd);
                    DataSet        ds = new DataSet();
                    da.Fill(ds);
                    da.Dispose();
                    cmd.Dispose();
                    //now delete them from the table of urls to crawl
                    data = new InternetUrlToCrawl[ds.Tables[0].Rows.Count];
                    if (data.Length > 0)
                    {
                        int i = 0;
                        foreach (DataRow dr in ds.Tables[0].Rows)
                        {
                            try
                            {
                                InternetUrlToCrawl url = new InternetUrlToCrawl((int)dr[0], (string)dr[1]);
                                if (dr[2] != DBNull.Value)
                                {
                                    url.CRC = (long)dr[2];
                                }
                                if (dr[3] != DBNull.Value)
                                {
                                    url.FlagDomain = (DomainFlagValue)((byte)dr[3]);
                                }
                                if (dr[4] != DBNull.Value)
                                {
                                    url.RobotsDisallowedPaths = (string)dr[4];
                                }
                                else
                                {
                                    RobotsTxtEntry entry = settings.Robots.GetEntry(InternetUtils.HostName(url));
                                    if (entry != null)
                                    {
                                        url.RobotsDisallowedPaths = ConcatenatePaths(entry.DisallowedPaths);
                                    }
                                    else
                                    {
                                        url.FlagFetchRobots = true;
                                    }
                                }
                                data[i++] = url;
                            }
                            catch
                            {
                                continue;
                            }
                        }
                        SqlCommand statscmd = new SqlCommand("cw_update_client_statistics", dbcon, transaction);
                        statscmd.CommandType    = CommandType.StoredProcedure;
                        statscmd.CommandTimeout = 120;
                        statscmd.Parameters.Add("@client_id", SqlDbType.UniqueIdentifier);
                        statscmd.Parameters.Add("@assigned", SqlDbType.BigInt);
                        statscmd.Parameters.Add("@returned", SqlDbType.BigInt);
                        statscmd.Parameters.Add("@type", SqlDbType.TinyInt);
                        statscmd.Parameters[0].Value = ci.ClientID;
                        statscmd.Parameters[1].Value = data.Length;
                        statscmd.Parameters[2].Value = DBNull.Value;
                        statscmd.Parameters[3].Value = 0;
                        statscmd.ExecuteNonQuery();
                        statscmd.Dispose();
                        transaction.Commit();
                    }
                }
                catch (Exception ex)
                {
                    transaction.Rollback();
                    if (settings.LogLevel <= CWLogLevel.LogWarning)
                    {
                        settings.Log.LogWarning("SelectUrlsToCrawl failed, Transaction was rolled back: " + ex.ToString());
                    }
                    throw;
                }
                finally
                {
                    UpdateClientLastActive(ci);
                    LogClientAction(ci, CWClientActions.LogSendUrlsToCrawl);
                    if (!DisconnectFromDatabase())
                    {
                        throw new CWDBConnectionFailedException("Disconnect from database failure.");
                    }
                }
            }
            catch (Exception e)
            {
                sx = new SerializedException(e.GetType().ToString(), e.Message, e.ToString());
                if (settings.LogLevel <= CWLogLevel.LogWarning)
                {
                    settings.Log.LogWarning("SelectUrlsToCrawl failed: " + e.ToString());
                }
            }
            return(sx);
        }
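        A caller-side sketch for SelectUrlsToCrawl; serverEngine, clientInfo and the use of ToString() on the returned SerializedException are assumptions for illustration:

        // Request a batch of urls for this client and inspect the outcome: null means
        // success, otherwise the SerializedException carries the server-side error.
        InternetUrlToCrawl[] batch = null;
        SerializedException error = serverEngine.SelectUrlsToCrawl(clientInfo, ref batch);
        if (error == null)
        {
            Console.WriteLine(batch.Length + " urls assigned for crawling.");
        }
        else
        {
            Console.WriteLine("SelectUrlsToCrawl failed: " + error.ToString());
        }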
Example #5
 /// <summary>
 /// Extracts the hypertext references (links) contained in a document.
 /// </summary>
 /// <param name="content">
 /// An array of bytes holding the content of the document that will be parsed for links.
 /// </param>
 /// <param name="contentUrl">
 /// An <see cref="InternetUrlToCrawl"/> object encapsulating the Uri address of the
 /// document to be parsed for links and its associated robots.txt file.
 /// </param>
 /// <returns>
 /// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects encapsulating
 /// the links contained in the parsed document.
 /// </returns>
 public abstract ArrayList ExtractLinks(byte[] content, ref InternetUrlToCrawl contentUrl);
Example #6
 /// <summary>
 /// Extracts the hypertext references (links) contained in a document.
 /// </summary>
 /// <param name="content">
 /// The content of the document that will be parsed for links.
 /// </param>
 /// <param name="contentUrl">
 /// An <see cref="InternetUrlToCrawl"/> object encapsulating the Uri address of the
 /// document to be parsed for links and its associated robots.txt file.
 /// </param>
 /// <returns>
 /// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects encapsulating
 /// the links contained in the parsed document.
 /// </returns>
 public abstract ArrayList ExtractLinks(ref string content, ref InternetUrlToCrawl contentUrl);
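 The two abstract overloads above define the contract every parser must satisfy. A minimal sketch of a concrete subclass follows; the Parser base class name and the PlainTextParser body are assumptions, while the real implementations are the text, SWF and PDF parsers shown in the other examples:

 // Minimal sketch of a parser implementing both abstract overloads (hypothetical class).
 public class PlainTextParser : Parser
 {
     public override ArrayList ExtractLinks(byte[] content, ref InternetUrlToCrawl contentUrl)
     {
         // Decode the UTF-8 buffer and delegate to the string-based overload.
         string text = Encoding.UTF8.GetString(content);
         return ExtractLinks(ref text, ref contentUrl);
     }

     public override ArrayList ExtractLinks(ref string content, ref InternetUrlToCrawl contentUrl)
     {
         ArrayList links = new ArrayList();
         // ...scan content for urls and add one InternetUrlToIndex per link found...
         return links;
     }
 }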
Example #7
        /// <summary>
        /// Extracts links from the contents of a SWF document.
        /// </summary>
        /// <param name="content">The contents of the SWF document.</param>
        /// <param name="contentUrl">The url of the SWF document.</param>
        /// <returns>
        /// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
        /// each link found in the content.
        /// </returns>
        /// <exception cref="ArgumentNullException">If the input buffer is null or empty.</exception>
        public override ArrayList ExtractLinks(byte[] content, ref InternetUrlToCrawl contentUrl)
        {
            ArrayList links = null;

            if ((content == null) || (content.Length == 0))
            {
                throw new ArgumentNullException("content", "The input buffer cannot be empty or null.");
            }
            try
            {
                mutex.WaitOne();
                string       FileName    = globals.AppWorkPath + Guid.NewGuid().ToString();
                string       swfFileName = FileName + ".swf";
                string       htmFileName = FileName + ".htm";
                FileStream   swf         = null;
                StreamReader htm         = null;
                try
                {
                    //store the swf file
                    swf = new FileStream(swfFileName, FileMode.Create);
                    swf.Write(content, 0, content.Length);
                    swf.Close();
                    swf = null;
                    //convert it to html
                    bool success = converter.ConvertSwfFile(swfFileName, htmFileName);
                    if (success)
                    {
                        htm = new StreamReader(htmFileName, encoding);
                        string html = htm.ReadToEnd();
                        htm.Close();
                        htm   = null;
                        links = parser.ExtractLinks(ref html, ref contentUrl);
                    }
                }
                catch (Exception ex)
                {
                    if (swf != null)
                    {
                        try
                        {
                            swf.Close();
                        }
                        catch
                        {}
                    }
                    if (htm != null)
                    {
                        try
                        {
                            htm.Close();
                        }
                        catch
                        {}
                    }
                    if (globals.Settings.LogLevel <= CWLogLevel.LogInfo)
                    {
                        globals.FileLog.LogInfo("SwfParser failed to extract links from " + contentUrl.Url + ": " + ex.ToString());
                    }
                }
                finally
                {
                    File.Delete(swfFileName);
                    File.Delete(htmFileName);
                }
            }
            catch (Exception ex)
            {
                if (globals.Settings.LogLevel <= CWLogLevel.LogWarning)
                {
                    globals.FileLog.LogWarning("SwfParser failed to extract links from " + contentUrl.Url + ": " + ex.Message);
                }
            }
            finally
            {
                GC.Collect();
                mutex.ReleaseMutex();
            }
            ParserEventArgs e = new ParserEventArgs(contentUrl.Url);

            OnExtractLinksComplete(e);
            return(links);
        }
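        A hedged caller sketch for the SWF parser above; swfParser, swfBytes and swfUrl are illustrative names. The guard mirrors the ArgumentNullException the method documents for an empty buffer:

        // Only hand non-empty buffers to the parser; a null or empty buffer throws.
        if (swfBytes != null && swfBytes.Length > 0)
        {
            ArrayList swfLinks = swfParser.ExtractLinks(swfBytes, ref swfUrl);
            // A null result means the swf-to-html conversion or the parsing failed.
            if (swfLinks != null)
            {
                Console.WriteLine(swfLinks.Count + " links extracted from " + swfUrl.Url);
            }
        }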
Example #8
 /// <summary>
 /// Extracts links from the contents of a SWF document.
 /// </summary>
 /// <param name="content">The contents of the SWF document.</param>
 /// <param name="contentUrl">The url of the PDF document.</param>
 /// <returns>
 /// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
 /// each link found in the content.
 /// </returns>
 /// <exception cref="NotSupportedException">Whenever this method is called.</exception>
 /// <remarks>
 /// Since a SWF document can not be converted to a string this method <b>ALWAYS</b>
 /// throws a <see cref="NotSupportedException"/>.
 /// </remarks>
 public override ArrayList ExtractLinks(ref string content, ref InternetUrlToCrawl contentUrl)
 {
     throw new NotSupportedException();
 }
Example #9
        /// <summary>
        /// Checks if the Robots Exclusion Standard allows the crawler to visit a url.
        /// </summary>
        /// <param name="targetUrl">The url that is to be validated.</param>
        /// <param name="sourceUrl">The <see cref="InternetUrlToCrawl"/> containing the targetUrl.</param>
        /// <param name="robotsMeta">A <see cref="RobotsMetaTagValue"/> flag indicating the
        /// restrictions posed by the robots meta tag contained in the sourceUrl.</param>
        /// <returns> A <see cref="Boolean"/> value indicating whether the crawler is
        /// allowed (false) or disallowed (true) to visit the target Url.</returns>
        /// <remarks>This method is safe for multi-threaded operations. However only one
        /// thread will be able to perform a check at any given time.
        /// </remarks>
        public bool FilterUrl(string targetUrl, InternetUrlToCrawl sourceUrl, RobotsMetaTagValue robotsMeta)
        {
            bool retVal = false;             //assume that it's allowed to crawl the targetUrl

            try
            {
                mutex.WaitOne();
                //perhaps we should use the hash code of the hostnames as keys.
                string         targetHost = InternetUtils.HostName(targetUrl);
                string         sourceHost = InternetUtils.HostName(sourceUrl);
                RobotsTxtEntry robots     = null;
                //Do we need to fetch the robots.txt for the source Url?
                if (sourceUrl.FlagFetchRobots)
                {
                    //we must fetch the robots.txt from the source url host and update sourceUrl.
                    robots = FetchRobots(sourceHost);
                    sourceUrl.RobotsDisallowedPaths = ConcatenatePaths(robots.DisallowedPaths);
                    sourceUrl.FlagFetchRobots       = false;               //fetch it only once
                    //check if it exists in the Hashtable, if so update it, otherwise add it
                    if (robotsTable.ContainsKey(sourceHost))
                    {
                        robotsTable[sourceHost] = robots;
                    }
                    else
                    {
                        robotsTable.Add(sourceHost, robots);
                    }
                }
                else
                {
                    //check if it exists in the Hashtable. If so check if it has expired, else just get it from InternetUrlToCrawl
                    if (!robotsTable.TryGetValue(sourceHost, out robots))
                    {
                        robots = new RobotsTxtEntry();
                        robots.DisallowedPaths = SplitPaths(sourceUrl.RobotsDisallowedPaths);
                        robotsTable.Add(sourceHost, robots);
                    }
                    else
                    {
                        if (robots.ExpirationDate < DateTime.Today)
                        {
                            robots = FetchRobots(sourceHost);
                            robotsTable[sourceHost] = robots;
                        }
                    }
                }
                if (targetHost != sourceHost)
                {
                    //the target url is on a different host, we must get its robots.txt
                    if (!robotsTable.TryGetValue(targetHost, out robots))
                    {
                        robots = FetchRobots(targetHost);
                        robotsTable.Add(targetHost, robots);
                    }
                    else
                    {
                        if (robots.ExpirationDate < DateTime.Today)
                        {
                            robots = FetchRobots(targetHost);
                            robotsTable[targetHost] = robots;
                        }
                    }
                }
                if ((robotsMeta & RobotsMetaTagValue.NoFollow) > 0)
                {
                    //if the meta tag has the NoFollow option set then we cannot crawl targetUrl
                    retVal = true;
                }
                else
                {
                    robots = robotsTable[targetHost];
                    //if the DisallowedPaths is null then we can crawl targetUrl, otherwise we must check the disallowed paths
                    if (robots.DisallowedPaths != null)
                    {
                        for (int i = 0; i < robots.DisallowedPaths.Length; i++)
                        {
                            if (targetUrl.IndexOf(robots.DisallowedPaths[i]) != -1)
                            {
                                //we found a match. It is therefore not allowed to crawl targetUrl
                                retVal = true;
                                break;                                 //stop searching as soon as we have a match
                            }
                        }
                    }
                }
            }
            catch (Exception e)
            {
                if (globals.Settings.LogLevel <= CWLogLevel.LogWarning)
                {
                    globals.FileLog.LogWarning("RobotsFilter failed to filter " + targetUrl + ": " + e.ToString());
                }
            }
            finally
            {
                mutex.ReleaseMutex();
            }
            return(retVal);
        }
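        A short sketch showing how a crawler might use the filter above; robotsFilter, candidateUrl, sourceUrl and the QueueForCrawling helper are hypothetical:

        // FilterUrl returns true when the target is disallowed, so only queue it on false.
        bool disallowed = robotsFilter.FilterUrl(candidateUrl, sourceUrl, RobotsMetaTagValue.NoMeta);
        if (!disallowed)
        {
            QueueForCrawling(candidateUrl); // hypothetical helper that schedules the url
        }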
Example #10
        /// <summary>
        /// Extracts links from the contents of a PDF document.
        /// </summary>
        /// <param name="content">The contents of the PDF document.</param>
        /// <param name="contentUrl">The url of the PDF document.</param>
        /// <returns>
        /// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
        /// each link found in the content.
        /// </returns>
        /// <exception cref="ArgumentNullException">If the input buffer is null or empty.</exception>
        public override ArrayList ExtractLinks(byte[] content, ref InternetUrlToCrawl contentUrl)
        {
            ArrayList links = null;

            if ((content == null) || (content.Length == 0))
            {
                throw new ArgumentNullException("content", "The input buffer cannot be empty or null.");
            }
            try
            {
                mutex.WaitOne();
                string       FileName    = globals.AppWorkPath + Guid.NewGuid().ToString();
                string       pdfFileName = FileName + ".pdf";
                string       txtFileName = FileName + ".txt";
                FileStream   pdf         = null;
                StreamReader txt         = null;
                try
                {
                    //store the pdf file
                    pdf = new FileStream(pdfFileName, FileMode.Create);
                    pdf.Write(content, 0, content.Length);
                    pdf.Close();
                    pdf = null;
                    //convert it to text
                    int success = converter.ConvertPdf2Text(pdfFileName, txtFileName);
                    if (success == 0)
                    {
                        txt = new StreamReader(txtFileName, encoding);
                        string text = txt.ReadToEnd();
                        txt.Close();
                        txt   = null;
                        links = parser.ExtractLinks(ref text, ref contentUrl);
                    }
                    //if the conversion fails there is no text file to read, so no cleanup is needed here
                }
                catch (Exception ex)
                {
                    if (pdf != null)
                    {
                        try
                        {
                            pdf.Close();
                        }
                        catch
                        {}
                    }
                    if (txt != null)
                    {
                        try
                        {
                            txt.Close();
                        }
                        catch
                        {}
                    }
                    if (globals.Settings.LogLevel <= CWLogLevel.LogInfo)
                    {
                        globals.FileLog.LogWarning("PdfParser failed to extract links from " + contentUrl.Url + ": " + ex.ToString());
                    }
                }
                finally
                {
                    File.Delete(pdfFileName);
                    File.Delete(txtFileName);
                }
            }
            catch
            {
                if (globals.Settings.LogLevel <= CWLogLevel.LogWarning)
                {
                    globals.FileLog.LogWarning("PdfParser failed to extract links from " + contentUrl.Url);
                }
            }
            finally
            {
                GC.Collect();
                mutex.ReleaseMutex();
            }
            ParserEventArgs e = new ParserEventArgs(contentUrl.Url);

            OnExtractLinksComplete(e);
            return(links);
        }
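        The pdf-to-text round trip above (write the buffer to a temporary .pdf, convert it, read the text back) can also be written with using blocks so the streams are closed even on failure; a sketch under the assumption that converter, parser, globals and encoding behave as in the example:

        // Sketch only: same pdf -> text -> links flow with using blocks handling the cleanup.
        string baseName = globals.AppWorkPath + Guid.NewGuid().ToString();
        string pdfPath  = baseName + ".pdf";
        string txtPath  = baseName + ".txt";
        try
        {
            using (FileStream pdfStream = new FileStream(pdfPath, FileMode.Create))
            {
                pdfStream.Write(content, 0, content.Length);
            }
            if (converter.ConvertPdf2Text(pdfPath, txtPath) == 0)
            {
                using (StreamReader reader = new StreamReader(txtPath, encoding))
                {
                    string text = reader.ReadToEnd();
                    links = parser.ExtractLinks(ref text, ref contentUrl);
                }
            }
        }
        finally
        {
            File.Delete(pdfPath);
            File.Delete(txtPath);
        }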
Example #11
 /// <summary>
 /// Extracts links from the contents of a PDF document.
 /// </summary>
 /// <param name="content">The contents of the PDF document.</param>
 /// <param name="contentUrl">The url of the PDF document.</param>
 /// <returns>
 /// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
 /// each link found in the content.
 /// </returns>
 /// <exception cref="NotSupportedException">Whenever this method is called.</exception>
 /// <remarks>
 /// Since a PDF document can not be converted to a string this method <b>ALWAYS</b>
 /// throws a <see cref="NotSupportedException"/>.
 /// </remarks>
 public override ArrayList ExtractLinks(ref string content, ref InternetUrlToCrawl contentUrl)
 {
     throw new NotSupportedException("The ExtractLinks method of the PdfParser cannot accept a string as input.");
 }
Example #12
        /// <summary>
        /// Performs the extraction of links from a text document. It can extract simple
        /// links that are separated from the rest of the text by spaces, line breaks
        /// or any other delimiters. The results are returned as an <see cref="ArrayList"/>
        /// of <see cref="InternetUrlToIndex"/> objects.
        /// </summary>
        /// <remarks>
        /// Besides parsing and extracting Urls, ExtractLinks also performs other tasks,
        /// such as:<br/>
        /// <list type="bullet">
        ///   <item>
        ///     <description>Filtering of urls to resources of unsupported content-type, e.g. css, images, etc.</description>
        ///   </item>
        ///   <item>
        ///     <description>Filtering of multiple links to the same url and to the document itself.</description>
        ///   </item>
        ///   <item>
        ///     <description>Filtering of session id variables in dynamic Urls and limiting
        ///     of the number of GET variables in dynamic Urls.</description>
        ///   </item>
        ///   <item>
        ///     <description>Flagging of Urls according to their country domain.</description>
        ///   </item>
        /// </list>
        /// <b>Update History</b>
        /// <list type="table">
        ///   <listheader>
        ///     <term>Date</term>
        ///     <description>Description</description>
        ///   </listheader>
        ///   <item>
        ///     <term>15/09/04</term>
        ///     <description>First release. A lot more needs to be done.</description>
        ///   </item>
        /// </list>
        /// </remarks>
        /// <param name="content">The text that must be parsed for links. It is passed by
        /// reference in order to reduce memory consumption.</param>
        /// <param name="contentUrl">The Url from which the content comes.</param>
        /// <returns>
        /// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
        /// each link found in the content.
        /// </returns>
        public override ArrayList ExtractLinks(ref string content, ref InternetUrlToCrawl contentUrl)
        {
            ArrayList links = new ArrayList();

            // It is important to note that if the FlagFetchRobots of the contentUrl is
            // true, the TextParser must remember this value, because during the Robots
            // Filtering it will become false so that the robots.txt file is not downloaded
            // every time a Url must be filtered.
            //bool FlagFetchRobots = contentUrl.FlagFetchRobots;
            try
            {
                //make sure only one thread will parse contents at a time.
                //mutex.WaitOne();
                if (contentUrl.FlagDomain != DomainFlagValue.MustVisit)
                {
                    contentUrl.FlagDomain = ExtractDomainFlag(ref content);

                    if (contentUrl.FlagDomain != DomainFlagValue.MustVisit)
                    {
                        if (InternetUtils.HostName(contentUrl).Contains("ebay.com"))
                        {
                            contentUrl.FlagDomain = DomainFlagValue.MustVisit;
                        }
                    }
                }
                //perform the hyperlink matching
                MatchCollection matches = hrefRegex.Matches(content);

                if (matches.Count > 0)
                {
                    string documentUrl = contentUrl.Url;
                    string baseUrl     = BaseUrl(ref documentUrl);
                    byte   priority    = 0;

                    foreach (Match m in matches)
                    {
                        try
                        {
                            string url = m.Value.Trim();
                            url      = NormalizeUrl(ref url, ref baseUrl);
                            priority = CleanUrlParams(ref url);
                            if (FilterUrl(ref url, ref documentUrl))
                            {
                                InternetUrlToIndex iurl = new InternetUrlToIndex(url);
                                iurl.Priority   = priority;
                                iurl.FlagDomain = domainFilter.FilterUrl(ref url);
                                //[mod 24/2/05] No robots.txt checking is performed for non-greek urls
                                if (iurl.FlagDomain == DomainFlagValue.MustVisit)
                                {
                                    iurl.FlagRobots = robotsFilter.FilterUrl(url, contentUrl, RobotsMetaTagValue.NoMeta);
                                }
                                else
                                {
                                    iurl.FlagRobots = false;
                                }
                                if (!links.Contains(iurl))
                                {
                                    links.Add(iurl);
                                }
                            }
                        }
                        catch
                        {
                            if (globals.Settings.LogLevel <= CWLogLevel.LogInfo)
                            {
                                globals.FileLog.LogInfo("TextParser failed to parse " + m.Value);
                            }
                            continue;
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                if (globals.Settings.LogLevel <= CWLogLevel.LogWarning)
                {
                    globals.FileLog.LogWarning(ex.Message);
                }
            }
            finally
            {
                //mutex.ReleaseMutex();
            }
            //contentUrl.FlagFetchRobots = FlagFetchRobots;
            ParserEventArgs e = new ParserEventArgs(contentUrl.Url);

            OnExtractLinksComplete(e);
            links.TrimToSize();
            return(links);
        }
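        A hedged usage sketch for the string overload above, highlighting the priority and domain/robots flags it assigns; textParser, the InternetUrlToCrawl constructor arguments and the public properties of InternetUrlToIndex are assumptions based on the other examples:

        // Parse a small in-memory document and inspect what the filters decided per link.
        string page = "Read http://www.example.com/news.html and http://www.example.com/faq.html";
        InternetUrlToCrawl pageUrl = new InternetUrlToCrawl(1, "http://www.example.com/");
        ArrayList results = textParser.ExtractLinks(ref page, ref pageUrl);
        foreach (InternetUrlToIndex link in results)
        {
            Console.WriteLine(link.Url + " priority=" + link.Priority + " robots=" + link.FlagRobots);
        }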