示例#1
0
 /// <summary>
 /// Copy constructor: creates a new <see cref="InternetUrlToIndex"/> holding the
 /// same values as an existing <see cref="InternetUrlToIndex"/> object.
 /// </summary>
 /// <param name="IUrl">The existing <see cref="InternetUrlToIndex"/> object to copy.</param>
 public InternetUrlToIndex(InternetUrlToIndex IUrl)
 {
     //The MD5 hash is deliberately not copied (kept disabled below).
     //m_UrlMD5=IUrlVal.MD5;
     priority = IUrl.Priority;
     flagDomain = IUrl.FlagDomain;
     flagRobots = IUrl.FlagRobots;
     Url = IUrl.Url;
     ID = IUrl.ID;
 }
示例#2
0
 /// <summary>
 /// Returns the host name of an <see cref="InternetUrlToIndex"/>
 /// </summary>
 /// <param name="url">The <see cref="InternetUrlToIndex"/> to examine</param>
 /// <returns>A string containing the Url's host name or IP Address.</returns>
 public static string HostName(InternetUrlToIndex url)
 {
     string host;
     try
     {
         //Let the Uri class do the parsing; an invalid Url throws.
         host = new Uri(url.Url).Host;
     }
     catch
     {
         //Malformed Urls yield an empty host rather than an exception.
         host = String.Empty;
     }
     return host;
 }
示例#3
0
        /// <summary>
        /// Inserts a new Url in the system's database.
        /// </summary>
        /// <remarks>
        /// The Url is read from the txtUrl text box, normalized by ProcessUrl and
        /// wrapped in an <see cref="InternetUrlToIndex"/>. Its host is stored via the
        /// cw_insert_host stored procedure and the Url itself via cw_insert_url.
        /// Any failure is logged and reported to the user.
        /// </remarks>
        /// <returns>True if the operation succeeds, false otherwise.</returns>
        private bool InsertUrl()
        {
            bool retVal = false;
            try
            {
                //Perform some extra validation and processing of the Url
                string url = txtUrl.Text;
                byte priority = ProcessUrl(ref url);
                txtUrl.Text = url;
                InternetUrlToIndex toInsert = new InternetUrlToIndex(url);
                toInsert.FlagDomain = DomainFlagValue.Unknown;
                toInsert.FlagRobots = false;
                toInsert.Priority = priority;
                //Connect to the database. The using statements guarantee the
                //connection and both commands are disposed even when an exception
                //occurs (the original leaked hostcmd always and the connection on
                //any failure after Open, and rethrew with "throw e" which resets
                //the stack trace).
                using(SqlConnection dbcon = new SqlConnection(globals.ProvideConnectionString()))
                {
                    dbcon.Open();
                    string hostname = InternetUtils.HostName(toInsert);
                    //The host's ID is the MD5 hash of its name, stored as a Guid.
                    Guid host_id = new Guid(MD5Hash.md5(hostname));
                    //Insert the host record first so the Url row can reference it.
                    using(SqlCommand hostcmd = new SqlCommand("cw_insert_host", dbcon))
                    {
                        hostcmd.CommandType = CommandType.StoredProcedure;
                        hostcmd.Parameters.Add("@host_id", SqlDbType.UniqueIdentifier);
                        hostcmd.Parameters.Add("@host_name", SqlDbType.NVarChar, 100);
                        hostcmd.Parameters[0].Value = host_id;
                        hostcmd.Parameters[1].Value = hostname;
                        hostcmd.ExecuteNonQuery();
                    }
                    //Insert the Url itself.
                    using(SqlCommand urlcmd = new SqlCommand("cw_insert_url", dbcon))
                    {
                        urlcmd.CommandType = CommandType.StoredProcedure;
                        urlcmd.Parameters.Add("@url", SqlDbType.NVarChar, 500);
                        urlcmd.Parameters.Add("@url_md5", SqlDbType.UniqueIdentifier);
                        urlcmd.Parameters.Add("@url_host_id", SqlDbType.UniqueIdentifier);
                        urlcmd.Parameters.Add("@url_priority", SqlDbType.TinyInt);
                        urlcmd.Parameters.Add("@flag_domain", SqlDbType.TinyInt);
                        urlcmd.Parameters.Add("@flag_robots", SqlDbType.TinyInt);
                        urlcmd.Parameters.Add("@id", SqlDbType.Int);
                        urlcmd.Parameters["@id"].Direction = ParameterDirection.Output;
                        urlcmd.Parameters[0].Value = toInsert.Url;
                        urlcmd.Parameters[1].Value = new Guid(toInsert.MD5);
                        urlcmd.Parameters[2].Value = host_id;
                        urlcmd.Parameters[3].Value = toInsert.Priority;
                        urlcmd.Parameters[4].Value = (byte)toInsert.FlagDomain;
                        urlcmd.Parameters[5].Value = (byte)((toInsert.FlagRobots)?1:0);
                        urlcmd.ExecuteNonQuery();
                    }
                }
                retVal = true;
            }
            catch(Exception e)
            {
                //Message fixed: the original text referred to the "Banned Hosts"
                //table, a copy-paste leftover — this method inserts a Url.
                globals.Log.LogError("CrawlWave.ServerManager failed to insert a Url in the system's database: " + e.ToString());
                MessageBox.Show(this.Text + " failed to insert the provided Url in the system's database:\n" + e.Message);
                //Removed the GC.Collect() call: forcing a full collection on an
                //error path hurts performance and does nothing for recovery.
            }
            return retVal;
        }
示例#4
0
 /// <summary>
 /// Checks if at least 30 seconds have passed since the last request to a given
 /// host was made, in order not to hammer it with simultaneous or frequent requests.
 /// </summary>
 /// <param name="targetUrl">
 /// A <see cref="InternetUrlToIndex"/> that is served by a host we wish to check.
 /// </param>
 /// <returns>
 /// An integer containing the number of milliseconds a crawler thread must wait
 /// before visiting this host.
 /// </returns>
 public int FilterUrl(ref InternetUrlToIndex targetUrl)
 {
     //Delegate to the host-level filter using the Url's host name.
     string host = InternetUtils.HostName(targetUrl);
     return FilterHost(ref host);
 }
示例#5
0
 /// <summary>
 /// Constructs a new <see cref="InternetUrlToIndex"/> object that duplicates the
 /// state of an existing <see cref="InternetUrlToIndex"/> object.
 /// </summary>
 /// <param name="IUrl">The existing <see cref="InternetUrlToIndex"/> object</param>
 public InternetUrlToIndex(InternetUrlToIndex IUrl)
 {
     ID = IUrl.ID;
     Url = IUrl.Url;
     priority = IUrl.Priority;
     flagDomain = IUrl.FlagDomain;
     flagRobots = IUrl.FlagRobots;
     //The MD5 hash is intentionally left uncopied:
     //m_UrlMD5=IUrlVal.MD5;
 }
示例#6
0
        /// <summary>
        /// Performs the extraction of links from a text document. It can extract simple
        /// links that are separated from the rest of the text using spaces or line breaks
        /// or any other delimiters. The results are returned as an <see cref="ArrayList"/>
        /// of <see cref="InternetUrlToIndex"/> objects.
        /// </summary>
        /// <remarks>
        /// Besides the parsing and extraction of Urls, ExtractLinks also performs other 
        /// tasks as well, such as:<br/>
        /// <list type="bullet">
        ///   <item>
        ///     <description>Filtering of urls to resources of unsupported content-type, e.g. css, images, etc.</description>
        ///   </item>
        ///   <item>
        ///     <description>Filtering of multiple links to the same url and to the document itself.</description>
        ///   </item>
        ///   <item>
        ///     <description>Filtering of session id variables in dynamic Urls and limiting
        ///     of the number of GET variables in dynamic Urls.</description>
        ///   </item>
        ///   <item>
        ///     <description>Flagging of Urls according to their country domain.</description>
        ///   </item>
        /// </list>
        /// <b>Update History</b>
        /// <list type="table">
        ///   <listheader>
        ///		<term>Date</term>
        ///		<description>Description</description>
        ///   </listheader>
        ///   <item>
        ///     <term>15/09/04</term>
        ///     <description>First release. A lot more needs to be done.</description>
        ///   </item>
        /// </list>
        /// </remarks>
        /// <param name="content">The text that must be parsed for links. It is passed by
        /// reference in order to reduce memory consumption.</param>
        /// <param name="contentUrl">The Url from which the content comes.</param>
        /// <returns>
        /// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
        /// each link found in the content.
        /// </returns>
        public override ArrayList ExtractLinks(ref string content, ref InternetUrlToCrawl contentUrl)
        {
            ArrayList links = new ArrayList();
            // It is important to notice that if the FlagFetchRobots of the contentUrl is
            // true then the TextParser must remember this value because during the Robots
            // Filtering it will become false so as not to download the robots.txt file
            // every time a Url must be filtered.
            //bool FlagFetchRobots = contentUrl.FlagFetchRobots;
            try
            {
                //make sure only one thread will parse contents at a time.
                //mutex.WaitOne();
                // Only re-evaluate the domain flag if the Url is not already marked
                // as MustVisit; ExtractDomainFlag inspects the document's content.
                if(contentUrl.FlagDomain!=DomainFlagValue.MustVisit)
                {
                    contentUrl.FlagDomain = ExtractDomainFlag(ref content);

                    // NOTE(review): hard-coded special case that forces any ebay.com
                    // host to MustVisit regardless of the extracted flag — confirm
                    // this exception is still intended.
                    if (contentUrl.FlagDomain != DomainFlagValue.MustVisit)
                        if (InternetUtils.HostName(contentUrl).Contains("ebay.com"))
                            contentUrl.FlagDomain = DomainFlagValue.MustVisit;
                }
                //perform the hyperlink matching
                MatchCollection matches = hrefRegex.Matches(content);

                if(matches.Count>0)
                {
                    // The document's own Url and base Url are needed to resolve
                    // relative links and to filter self-references.
                    string documentUrl = contentUrl.Url;
                    string baseUrl = BaseUrl(ref documentUrl);
                    byte priority = 0;

                    foreach(Match m in matches)
                    {
                        try
                        {
                            // Normalize each candidate link against the base Url and
                            // strip/limit its GET parameters before filtering.
                            string url = m.Value.Trim();
                            url = NormalizeUrl(ref url, ref baseUrl);
                            priority = CleanUrlParams(ref url);
                            if(FilterUrl(ref url, ref documentUrl))
                            {
                                InternetUrlToIndex iurl = new InternetUrlToIndex(url);
                                iurl.Priority = priority;
                                iurl.FlagDomain = domainFilter.FilterUrl(ref url);
                                //[mod 24/2/05] No robots.txt checking is performed for non-greek urls
                                if(iurl.FlagDomain == DomainFlagValue.MustVisit)
                                {
                                    iurl.FlagRobots = robotsFilter.FilterUrl(url, contentUrl, RobotsMetaTagValue.NoMeta);
                                }
                                else
                                {
                                    iurl.FlagRobots = false;
                                }
                                // Deduplicate: a Url already collected is not added again.
                                if(!links.Contains(iurl))
                                {
                                    links.Add(iurl);
                                }
                            }
                        }
                        catch
                        {
                            // A single malformed link must not abort the whole
                            // extraction; log it (at info level) and keep going.
                            if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
                            {
                                globals.FileLog.LogInfo("TextParser failed to parse " + m.Value);
                            }
                            continue;
                        }
                    }
                }
            }
            catch(Exception ex)
            {
                // Any unexpected failure is logged; the links gathered so far are
                // still returned to the caller.
                if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
                {
                    globals.FileLog.LogWarning(ex.Message);
                }
            }
            finally
            {
                //mutex.ReleaseMutex();
            }
            //contentUrl.FlagFetchRobots = FlagFetchRobots;
            // Notify listeners that link extraction for this Url has completed.
            ParserEventArgs e = new ParserEventArgs(contentUrl.Url);
            OnExtractLinksComplete(e);
            links.TrimToSize();
            return links;
        }
示例#7
0
 /// <summary>
 /// Returns the host name of an <see cref="InternetUrlToIndex"/>
 /// </summary>
 /// <param name="url">The <see cref="InternetUrlToIndex"/> to examine</param>
 /// <returns>A string containing the Url's host name or IP Address.</returns>
 public static string HostName(InternetUrlToIndex url)
 {
     try
     {
         //Uri parsing extracts the host (or IP address) portion of the Url.
         Uri parsed = new Uri(url.Url);
         return parsed.Host;
     }
     catch
     {
         //An unparsable Url maps to an empty host name.
         return String.Empty;
     }
 }