/// <summary>
/// Performs the extraction of links from a text document. It can extract simple
/// links that are separated from the rest of the text using spaces or line breaks
/// or any other delimiters. The results are returned as an <see cref="ArrayList"/>
/// of <see cref="InternetUrlToIndex"/> objects.
/// </summary>
/// <remarks>
/// Besides the parsing and extraction of Urls, ExtractLinks also performs other
/// tasks as well, such as:<br/>
/// <list type="bullet">
/// <item>
/// <description>Filtering of urls to resources of unsupported content-type, e.g. css, images, etc.</description>
/// </item>
/// <item>
/// <description>Filtering of multiple links to the same url and to the document itself.</description>
/// </item>
/// <item>
/// <description>Filtering of session id variables in dynamic Urls and limiting
/// of the number of GET variables in dynamic Urls.</description>
/// </item>
/// <item>
/// <description>Flagging of Urls according to their country domain.</description>
/// </item>
/// </list>
/// <b>Update History</b>
/// <list type="table">
/// <listheader>
/// <term>Date</term>
/// <description>Description</description>
/// </listheader>
/// <item>
/// <term>15/09/04</term>
/// <description>First release. A lot more needs to be done.</description>
/// </item>
/// </list>
/// </remarks>
/// <param name="content">The text that must be parsed for links. It is passed by
/// reference in order to reduce memory consumption.</param>
/// <param name="contentUrl">The Url from which the content comes.</param>
/// <returns>
/// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
/// each link found in the content.
/// </returns>
public override ArrayList ExtractLinks(ref string content, ref InternetUrlToCrawl contentUrl)
{
	ArrayList links = new ArrayList();
	// It is important to notice that if the FlagFetchRobots of the contentUrl is
	// true then the TextParser must remember this value because during the Robots
	// Filtering it will become false so as not to download the robots.txt file
	// every time a Url must be filtered.
	//bool FlagFetchRobots = contentUrl.FlagFetchRobots;
	try
	{
		//make sure only one thread will parse contents at a time.
		//mutex.WaitOne();
		// Classify the document's domain unless it is already flagged MustVisit.
		if (contentUrl.FlagDomain != DomainFlagValue.MustVisit)
		{
			contentUrl.FlagDomain = ExtractDomainFlag(ref content);
			if (contentUrl.FlagDomain != DomainFlagValue.MustVisit)
			{
				// Special-case: ebay.com hosts are always promoted to MustVisit,
				// overriding whatever ExtractDomainFlag decided.
				if (InternetUtils.HostName(contentUrl).Contains("ebay.com"))
				{
					contentUrl.FlagDomain = DomainFlagValue.MustVisit;
				}
			}
		}
		//perform the hyperlink matching
		MatchCollection matches = hrefRegex.Matches(content);
		if (matches.Count > 0)
		{
			string documentUrl = contentUrl.Url;
			string baseUrl = BaseUrl(ref documentUrl);
			byte priority = 0;
			// Each regex match is normalized, cleaned and filtered independently;
			// a failure on one match must not abort the processing of the rest.
			foreach (Match m in matches)
			{
				try
				{
					string url = m.Value.Trim();
					url = NormalizeUrl(ref url, ref baseUrl);
					priority = CleanUrlParams(ref url);
					// FilterUrl returns true when the url is acceptable (e.g. not a
					// duplicate of the document itself, supported content-type).
					if (FilterUrl(ref url, ref documentUrl))
					{
						InternetUrlToIndex iurl = new InternetUrlToIndex(url);
						iurl.Priority = priority;
						iurl.FlagDomain = domainFilter.FilterUrl(ref url);
						//[mod 24/2/05] No robots.txt checking is performed for non-greek urls
						if (iurl.FlagDomain == DomainFlagValue.MustVisit)
						{
							iurl.FlagRobots = robotsFilter.FilterUrl(url, contentUrl, RobotsMetaTagValue.NoMeta);
						}
						else
						{
							iurl.FlagRobots = false;
						}
						// Skip duplicates: multiple links to the same url are stored once.
						if (!links.Contains(iurl))
						{
							links.Add(iurl);
						}
					}
				}
				catch
				{
					// Best-effort per match: log (at LogInfo level only) and move on.
					if (globals.Settings.LogLevel == CWLogLevel.LogInfo)
					{
						globals.FileLog.LogInfo("TextParser failed to parse " + m.Value);
					}
					continue;
				}
			}
		}
	}
	catch (Exception ex)
	{
		// A failure outside the per-match loop aborts extraction; whatever links
		// were collected so far are still returned below.
		if (globals.Settings.LogLevel <= CWLogLevel.LogWarning)
		{
			globals.FileLog.LogWarning(ex.Message);
		}
	}
	finally
	{
		//mutex.ReleaseMutex();
	}
	//contentUrl.FlagFetchRobots = FlagFetchRobots;
	// Notify listeners that link extraction for this url has completed.
	ParserEventArgs e = new ParserEventArgs(contentUrl.Url);
	OnExtractLinksComplete(e);
	links.TrimToSize();
	return(links);
}
/// <summary>
/// Checks if at least 30 seconds have passed since the last request to a given host
/// was made, in order not to hammer it with simultaneous or frequent requests.
/// </summary>
/// <param name="targetUrl">
/// A <see cref="InternetUrlToIndex"/> that is served by a host we wish to check.
/// </param>
/// <returns>
/// An integer containing the number of milliseconds a crawler thread must wait
/// before visiting this host.
/// </returns>
public int FilterUrl(ref InternetUrlToIndex targetUrl)
{
	// Resolve the url to its host and delegate the politeness check to FilterHost.
	string host = InternetUtils.HostName(targetUrl);
	int waitMilliseconds = FilterHost(ref host);
	return waitMilliseconds;
}
/// <summary>
/// Inserts a new Url in the system's database: the Url is validated and processed,
/// its host record is inserted first (cw_insert_host) and then the Url itself
/// (cw_insert_url).
/// </summary>
/// <returns>True if the operation succeeds, false otherwise.</returns>
private bool InsertUrl()
{
	bool retVal = false;
	try
	{
		//Perform some extra validation and processing of the Url
		string url = txtUrl.Text;
		byte priority = ProcessUrl(ref url);
		txtUrl.Text = url; // reflect the normalized form back to the UI
		InternetUrlToIndex toInsert = new InternetUrlToIndex(url);
		toInsert.FlagDomain = DomainFlagValue.Unknown;
		toInsert.FlagRobots = false;
		toInsert.Priority = priority;
		//Connect to the database. The using blocks guarantee that the connection
		//and commands are disposed on every path (the original leaked them when
		//ExecuteNonQuery threw, and hostcmd was never disposed at all).
		using (SqlConnection dbcon = new SqlConnection(globals.ProvideConnectionString()))
		{
			dbcon.Open();
			//Insert the host record first so the url row can reference it.
			string hostname = InternetUtils.HostName(toInsert);
			Guid host_id = new Guid(MD5Hash.md5(hostname));
			using (SqlCommand hostcmd = new SqlCommand("cw_insert_host", dbcon))
			{
				hostcmd.CommandType = CommandType.StoredProcedure;
				hostcmd.Parameters.Add("@host_id", SqlDbType.UniqueIdentifier);
				hostcmd.Parameters.Add("@host_name", SqlDbType.NVarChar, 100);
				hostcmd.Parameters[0].Value = host_id;
				hostcmd.Parameters[1].Value = hostname;
				hostcmd.ExecuteNonQuery();
			}
			//Insert the url itself.
			using (SqlCommand urlcmd = new SqlCommand("cw_insert_url", dbcon))
			{
				urlcmd.CommandType = CommandType.StoredProcedure;
				urlcmd.Parameters.Add("@url", SqlDbType.NVarChar, 500);
				urlcmd.Parameters.Add("@url_md5", SqlDbType.UniqueIdentifier);
				urlcmd.Parameters.Add("@url_host_id", SqlDbType.UniqueIdentifier);
				urlcmd.Parameters.Add("@url_priority", SqlDbType.TinyInt);
				urlcmd.Parameters.Add("@flag_domain", SqlDbType.TinyInt);
				urlcmd.Parameters.Add("@flag_robots", SqlDbType.TinyInt);
				urlcmd.Parameters.Add("@id", SqlDbType.Int);
				urlcmd.Parameters["@id"].Direction = ParameterDirection.Output;
				urlcmd.Parameters[0].Value = toInsert.Url;
				urlcmd.Parameters[1].Value = new Guid(toInsert.MD5);
				urlcmd.Parameters[2].Value = host_id;
				urlcmd.Parameters[3].Value = toInsert.Priority;
				urlcmd.Parameters[4].Value = (byte)toInsert.FlagDomain;
				urlcmd.Parameters[5].Value = (byte)((toInsert.FlagRobots)?1:0);
				urlcmd.ExecuteNonQuery();
			}
		}
		retVal = true;
	}
	catch (Exception e)
	{
		//Fixed message: the original text wrongly referred to the Banned Hosts
		//table; this method inserts a Url. (Also removed a GC.Collect() call that
		//served no purpose here.)
		globals.Log.LogError("CrawlWave.ServerManager failed to insert the provided Url in the system's database: " + e.ToString());
		MessageBox.Show(this.Text + " failed to insert the provided Url in the system's database:\n" + e.Message);
	}
	return(retVal);
}