Example #1
        /// <summary>
        /// Extracts links from a text document. It can extract simple links that are
        /// separated from the rest of the text by spaces, line breaks or any other
        /// delimiters. The results are returned as an <see cref="ArrayList"/>
        /// of <see cref="InternetUrlToIndex"/> objects.
        /// </summary>
        /// <remarks>
        /// Besides parsing and extracting Urls, ExtractLinks performs other tasks as
        /// well, such as:<br/>
        /// <list type="bullet">
        ///   <item>
        ///     <description>Filtering of Urls pointing to resources of unsupported content types, e.g. CSS, images, etc.</description>
        ///   </item>
        ///   <item>
        ///     <description>Filtering of multiple links to the same Url and of links to the document itself.</description>
        ///   </item>
        ///   <item>
        ///     <description>Filtering of session id variables in dynamic Urls and limiting
        ///     of the number of GET variables they may contain.</description>
        ///   </item>
        ///   <item>
        ///     <description>Flagging of Urls according to their country domain.</description>
        ///   </item>
        /// </list>
        /// <b>Update History</b>
        /// <list type="table">
        ///   <listheader>
        ///     <term>Date</term>
        ///     <description>Description</description>
        ///   </listheader>
        ///   <item>
        ///     <term>15/09/04</term>
        ///     <description>First release. A lot more needs to be done.</description>
        ///   </item>
        /// </list>
        /// </remarks>
        /// <param name="content">The text that must be parsed for links. It is passed by
        /// reference in order to reduce memory consumption.</param>
        /// <param name="contentUrl">The Url from which the content comes.</param>
        /// <returns>
        /// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
        /// each link found in the content.
        /// </returns>
        public override ArrayList ExtractLinks(ref string content, ref InternetUrlToCrawl contentUrl)
        {
            ArrayList links = new ArrayList();

            // Note: if the contentUrl's FlagFetchRobots is true, the TextParser must
            // remember this value, because during Robots Filtering it will be set to
            // false so that the robots.txt file is not downloaded every time a Url
            // must be filtered.
            //bool FlagFetchRobots = contentUrl.FlagFetchRobots;
            try
            {
                //make sure only one thread will parse contents at a time.
                //mutex.WaitOne();
                if (contentUrl.FlagDomain != DomainFlagValue.MustVisit)
                {
                    contentUrl.FlagDomain = ExtractDomainFlag(ref content);

                    if (contentUrl.FlagDomain != DomainFlagValue.MustVisit)
                    {
                        if (InternetUtils.HostName(contentUrl).Contains("ebay.com"))
                        {
                            contentUrl.FlagDomain = DomainFlagValue.MustVisit;
                        }
                    }
                }
                //perform the hyperlink matching
                MatchCollection matches = hrefRegex.Matches(content);

                if (matches.Count > 0)
                {
                    string documentUrl = contentUrl.Url;
                    string baseUrl     = BaseUrl(ref documentUrl);
                    byte   priority    = 0;

                    foreach (Match m in matches)
                    {
                        try
                        {
                            string url = m.Value.Trim();
                            url      = NormalizeUrl(ref url, ref baseUrl);
                            priority = CleanUrlParams(ref url);
                            if (FilterUrl(ref url, ref documentUrl))
                            {
                                InternetUrlToIndex iurl = new InternetUrlToIndex(url);
                                iurl.Priority   = priority;
                                iurl.FlagDomain = domainFilter.FilterUrl(ref url);
                                //[mod 24/2/05] No robots.txt checking is performed for non-Greek Urls
                                if (iurl.FlagDomain == DomainFlagValue.MustVisit)
                                {
                                    iurl.FlagRobots = robotsFilter.FilterUrl(url, contentUrl, RobotsMetaTagValue.NoMeta);
                                }
                                else
                                {
                                    iurl.FlagRobots = false;
                                }
                                if (!links.Contains(iurl))
                                {
                                    links.Add(iurl);
                                }
                            }
                        }
                        catch
                        {
                            if (globals.Settings.LogLevel == CWLogLevel.LogInfo)
                            {
                                globals.FileLog.LogInfo("TextParser failed to parse " + m.Value);
                            }
                            continue;
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                if (globals.Settings.LogLevel <= CWLogLevel.LogWarning)
                {
                    globals.FileLog.LogWarning(ex.Message);
                }
            }
            finally
            {
                //mutex.ReleaseMutex();
            }
            //contentUrl.FlagFetchRobots = FlagFetchRobots;
            ParserEventArgs e = new ParserEventArgs(contentUrl.Url);

            OnExtractLinksComplete(e);
            links.TrimToSize();
            return(links);
        }
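
The ref parameters exist to avoid copying large strings, so a caller simply hands over its buffers and collects the resulting ArrayList. A minimal usage sketch follows; the TextParser and InternetUrlToCrawl constructors shown here are assumptions, since they are not part of this excerpt.

using System;
using System.Collections;

class ExtractLinksDemo
{
    static void Main()
    {
        // The page body and its source Url; both are passed by reference
        // because ExtractLinks takes ref parameters to reduce memory use.
        string content = "<a href=\"http://www.example.com/a.html\">a</a>";
        InternetUrlToCrawl contentUrl = new InternetUrlToCrawl("http://www.example.com/"); // assumed constructor

        TextParser parser = new TextParser(); // assumed constructor
        ArrayList links = parser.ExtractLinks(ref content, ref contentUrl);

        foreach (InternetUrlToIndex link in links)
        {
            Console.WriteLine("{0} (priority {1})", link.Url, link.Priority);
        }
    }
}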
Example #2
        /// <summary>
        /// Checks whether at least 30 seconds have passed since the last request to a
        /// given host was made, in order not to slam it with simultaneous or frequent
        /// requests.
        /// </summary>
        /// <param name="targetUrl">
        /// An <see cref="InternetUrlToIndex"/> that is served by a host we wish to check.
        /// </param>
        /// <returns>
        /// An integer containing the number of milliseconds a crawler thread must wait
        /// before visiting this host.
        /// </returns>
        public int FilterUrl(ref InternetUrlToIndex targetUrl)
        {
            string hostName = InternetUtils.HostName(targetUrl);

            return(FilterHost(ref hostName));
        }
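
FilterHost itself is not part of this excerpt. Below is a minimal sketch of the kind of per-host politeness table its summary implies; the dictionary, the lock and the 30-second constant are illustrative assumptions, not CrawlWave's actual implementation.

using System;
using System.Collections.Generic;

class PolitenessFilter
{
    private const int MinIntervalMs = 30000; // the 30-second rule from the summary
    private readonly Dictionary<string, DateTime> lastVisit = new Dictionary<string, DateTime>();
    private readonly object sync = new object();

    // Returns the number of milliseconds a crawler thread must wait before
    // visiting the given host, or 0 if the host may be visited immediately.
    public int FilterHost(ref string hostName)
    {
        lock (sync) // the table is shared by all crawler threads
        {
            DateTime now = DateTime.UtcNow;
            DateTime previous;
            if (lastVisit.TryGetValue(hostName, out previous))
            {
                int elapsed = (int)(now - previous).TotalMilliseconds;
                if (elapsed < MinIntervalMs)
                {
                    // Too soon: tell the caller how long to back off. The
                    // timestamp is left untouched; the caller is expected to
                    // sleep and call again before visiting.
                    return MinIntervalMs - elapsed;
                }
            }
            lastVisit[hostName] = now; // record the visit we are about to allow
            return 0;
        }
    }
}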
Example #3
        /// <summary>
        /// Inserts a new Url in the system's database.
        /// </summary>
        /// <returns>True if the operation succeeds, false otherwise.</returns>
        private bool InsertUrl()
        {
            bool retVal = false;

            try
            {
                //Perform some extra validation and processing of the Url
                string             url      = txtUrl.Text;
                byte               priority = 0;
                InternetUrlToIndex toInsert = null;
                priority            = ProcessUrl(ref url);
                txtUrl.Text         = url;
                toInsert            = new InternetUrlToIndex(url);
                toInsert.FlagDomain = DomainFlagValue.Unknown;
                toInsert.FlagRobots = false;
                toInsert.Priority   = priority;
                //Connect to the database
                SqlConnection dbcon = null;
                try
                {
                    dbcon = new SqlConnection(globals.ProvideConnectionString());
                    dbcon.Open();
                }
                catch (Exception e)
                {
                    if (dbcon != null)
                    {
                        dbcon.Dispose();
                        dbcon = null;
                    }
                    throw; // rethrow, preserving the original stack trace
                }
                //perform the insertion
                SqlCommand hostcmd = new SqlCommand("cw_insert_host", dbcon);
                hostcmd.CommandType = CommandType.StoredProcedure;
                hostcmd.Parameters.Add("@host_id", SqlDbType.UniqueIdentifier);
                hostcmd.Parameters.Add("@host_name", SqlDbType.NVarChar, 100);

                string hostname = InternetUtils.HostName(toInsert);
                Guid   host_id  = new Guid(MD5Hash.md5(hostname));
                hostcmd.Parameters[0].Value = host_id;
                hostcmd.Parameters[1].Value = hostname;
                hostcmd.ExecuteNonQuery();
                hostcmd.Dispose();

                SqlCommand urlcmd = new SqlCommand("cw_insert_url", dbcon);
                urlcmd.CommandType = CommandType.StoredProcedure;
                urlcmd.Parameters.Add("@url", SqlDbType.NVarChar, 500);
                urlcmd.Parameters.Add("@url_md5", SqlDbType.UniqueIdentifier);
                urlcmd.Parameters.Add("@url_host_id", SqlDbType.UniqueIdentifier);
                urlcmd.Parameters.Add("@url_priority", SqlDbType.TinyInt);
                urlcmd.Parameters.Add("@flag_domain", SqlDbType.TinyInt);
                urlcmd.Parameters.Add("@flag_robots", SqlDbType.TinyInt);
                urlcmd.Parameters.Add("@id", SqlDbType.Int);
                urlcmd.Parameters["@id"].Direction = ParameterDirection.Output;
                urlcmd.Parameters[0].Value         = toInsert.Url;
                urlcmd.Parameters[1].Value         = new Guid(toInsert.MD5);
                urlcmd.Parameters[2].Value         = host_id;
                urlcmd.Parameters[3].Value         = toInsert.Priority;
                urlcmd.Parameters[4].Value         = (byte)toInsert.FlagDomain;
                urlcmd.Parameters[5].Value         = (byte)((toInsert.FlagRobots)?1:0);
                urlcmd.ExecuteNonQuery();
                urlcmd.Dispose();
                dbcon.Close();
                retVal = true;
            }
            catch (Exception e)
            {
                globals.Log.LogError("CrawlWave.ServerManager failed to insert the provided Url in the database: " + e.ToString());
                MessageBox.Show(this.Text + " failed to insert the provided Url in the database:\n" + e.Message);
                GC.Collect();
            }
            return(retVal);
        }
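
The manual try/Dispose bookkeeping above can be condensed. The following sketch performs the same two stored-procedure calls with using blocks, which guarantee disposal even when an exception is thrown; the cw_insert_host and cw_insert_url procedures and the MD5-to-Guid convention are taken from the example above, while the wrapper class, method shape and connection-string parameter are assumptions.

using System;
using System.Data;
using System.Data.SqlClient;

class UrlInserter
{
    // Sketch only: inserts the host record, then the url record that
    // references it, disposing all ADO.NET resources deterministically.
    static void InsertHostAndUrl(string connectionString, InternetUrlToIndex toInsert)
    {
        string hostname = InternetUtils.HostName(toInsert);
        Guid hostId = new Guid(MD5Hash.md5(hostname));

        using (SqlConnection dbcon = new SqlConnection(connectionString))
        {
            dbcon.Open();

            // Insert (or look up) the host first, so the url row can
            // reference it by id.
            using (SqlCommand hostcmd = new SqlCommand("cw_insert_host", dbcon))
            {
                hostcmd.CommandType = CommandType.StoredProcedure;
                hostcmd.Parameters.Add("@host_id", SqlDbType.UniqueIdentifier).Value = hostId;
                hostcmd.Parameters.Add("@host_name", SqlDbType.NVarChar, 100).Value = hostname;
                hostcmd.ExecuteNonQuery();
            }

            using (SqlCommand urlcmd = new SqlCommand("cw_insert_url", dbcon))
            {
                urlcmd.CommandType = CommandType.StoredProcedure;
                urlcmd.Parameters.Add("@url", SqlDbType.NVarChar, 500).Value = toInsert.Url;
                urlcmd.Parameters.Add("@url_md5", SqlDbType.UniqueIdentifier).Value = new Guid(toInsert.MD5);
                urlcmd.Parameters.Add("@url_host_id", SqlDbType.UniqueIdentifier).Value = hostId;
                urlcmd.Parameters.Add("@url_priority", SqlDbType.TinyInt).Value = toInsert.Priority;
                urlcmd.Parameters.Add("@flag_domain", SqlDbType.TinyInt).Value = (byte)toInsert.FlagDomain;
                urlcmd.Parameters.Add("@flag_robots", SqlDbType.TinyInt).Value = (byte)(toInsert.FlagRobots ? 1 : 0);
                urlcmd.Parameters.Add("@id", SqlDbType.Int).Direction = ParameterDirection.Output;
                urlcmd.ExecuteNonQuery();
            }
        } // disposing the connection also closes it
    }
}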