/// <summary>
/// Extracts links from the contents of a document.
/// </summary>
/// <param name="content">The contents of the document.</param>
/// <param name="contentUrl">The url of the document.</param>
/// <returns>
/// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
/// each link found in the content.
/// </returns>
/// <remarks>This method <b>ALWAYS</b> returns an empty ArrayList.</remarks>
public override ArrayList ExtractLinks(byte[] content, ref InternetUrlToCrawl contentUrl)
{
    ArrayList links = new ArrayList();
    ParserEventArgs e = new ParserEventArgs(contentUrl.Url);
    OnExtractLinksComplete(e);
    return links;
}
/// <summary>
/// Performs the extraction of links from a text document. It can extract simple
/// links that are separated from the rest of the text by spaces, line breaks or
/// any other delimiters. The results are returned as an <see cref="ArrayList"/>
/// of <see cref="InternetUrlToIndex"/> objects.
/// </summary>
/// <remarks>
/// Besides the parsing and extraction of Urls, ExtractLinks also performs other
/// tasks, such as:<br/>
/// <list type="bullet">
///   <item>
///     <description>Filtering of urls to resources of unsupported content-type, e.g. css, images, etc.</description>
///   </item>
///   <item>
///     <description>Filtering of multiple links to the same url and of links to the document itself.</description>
///   </item>
///   <item>
///     <description>Filtering of session id variables in dynamic Urls and limiting
///     of the number of GET variables in dynamic Urls.</description>
///   </item>
///   <item>
///     <description>Flagging of Urls according to their country domain.</description>
///   </item>
/// </list>
/// <b>Update History</b>
/// <list type="table">
///   <listheader>
///     <term>Date</term>
///     <description>Description</description>
///   </listheader>
///   <item>
///     <term>15/09/04</term>
///     <description>First release. A lot more needs to be done.</description>
///   </item>
/// </list>
/// </remarks>
/// <param name="content">The text that must be parsed for links. It is passed as
/// an array of bytes containing the text contents in UTF8 binary format, in order
/// to reduce memory consumption.</param>
/// <param name="contentUrl">The Url from which the content comes.</param>
/// <returns>
/// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
/// each link found in the content.
/// </returns>
public override ArrayList ExtractLinks(byte[] content, ref InternetUrlToCrawl contentUrl)
{
    ArrayList retVal = null;
    try
    {
        mutex.WaitOne();
        string html = Encoding.UTF8.GetString(content);
        retVal = ExtractLinks(ref html, ref contentUrl);
    }
    catch
    {
        //ignore parsing failures; retVal remains null
    }
    finally
    {
        mutex.ReleaseMutex();
    }
    return retVal;
}
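// Usage sketch (illustrative only): how a caller might feed raw UTF8 bytes to the
// overload above and enumerate the extracted links. The "parser" instance, the
// (id, url) constructor arguments and the sample page are assumptions made for
// demonstration, not part of this class's contract.
public static void ByteOverloadUsageSketch(TextParser parser)
{
    InternetUrlToCrawl pageUrl = new InternetUrlToCrawl(1, "http://www.example.com/");
    byte[] content = Encoding.UTF8.GetBytes("See http://www.example.com/about.htm for details.");
    ArrayList links = parser.ExtractLinks(content, ref pageUrl);
    if (links != null) //the overload above returns null if parsing fails
    {
        foreach (InternetUrlToIndex link in links)
        {
            Console.WriteLine(link.Url);
        }
    }
}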
/// <summary>
/// Checks if at least 30 seconds have passed since the last request to a given host
/// was made, in order not to hammer it with simultaneous or frequent requests.
/// </summary>
/// <param name="targetUrl">
/// A <see cref="InternetUrlToCrawl"/> that is served by a host we wish to check.
/// </param>
/// <returns>
/// An integer containing the number of milliseconds a crawler thread must wait
/// before visiting this host.
/// </returns>
public int FilterUrl(ref InternetUrlToCrawl targetUrl)
{
    string hostName = InternetUtils.HostName(targetUrl);
    return FilterHost(ref hostName);
}
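// Politeness sketch (illustrative only): a crawler thread could combine the method
// above with Thread.Sleep to honor the 30-second per-host interval. The filter type
// name "HostRequestFilter" is an assumption; substitute the class that actually
// declares FilterUrl(ref InternetUrlToCrawl).
public static void PoliteVisitSketch(HostRequestFilter filter, ref InternetUrlToCrawl targetUrl)
{
    int delay = filter.FilterUrl(ref targetUrl);
    if (delay > 0)
    {
        Thread.Sleep(delay); //wait until the host may be visited again
    }
    //...now it is safe to issue the HTTP request for targetUrl...
}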
/// <summary>
/// Selects and returns a set of urls that are ready to be crawled.
/// </summary>
/// <param name="ci">The <see cref="ClientInfo"/> of the client requesting urls to crawl.</param>
/// <param name="data">An array of <see cref="InternetUrlToCrawl"/> objects containing the selected urls.</param>
/// <returns>Null if the operation succeeds, or a <see cref="SerializedException"/>
/// encapsulating the error that occurred if the operation fails.</returns>
public SerializedException SelectUrlsToCrawl(ClientInfo ci, ref InternetUrlToCrawl[] data)
{
    SerializedException sx = null;
    try
    {
        if (!ConnectToDatabase())
        {
            throw new CWDBConnectionFailedException();
        }
        //we must use a transaction to make sure that if something goes wrong the
        //changes to the database will be rolled back.
        SqlTransaction transaction = dbcon.BeginTransaction(IsolationLevel.Serializable); //perhaps RepeatableRead would suffice
        try
        {
            //first select the urls to crawl
            SqlCommand cmd = new SqlCommand("cw_select_urls_to_crawl", dbcon, transaction);
            cmd.CommandType = CommandType.StoredProcedure;
            cmd.CommandTimeout = 120;
            SqlDataAdapter da = new SqlDataAdapter(cmd);
            DataSet ds = new DataSet();
            da.Fill(ds);
            da.Dispose();
            cmd.Dispose();
            //now delete them from the table of urls to crawl
            data = new InternetUrlToCrawl[ds.Tables[0].Rows.Count];
            if (data.Length > 0)
            {
                int i = 0;
                foreach (DataRow dr in ds.Tables[0].Rows)
                {
                    try
                    {
                        InternetUrlToCrawl url = new InternetUrlToCrawl((int)dr[0], (string)dr[1]);
                        if (dr[2] != DBNull.Value)
                        {
                            url.CRC = (long)dr[2];
                        }
                        if (dr[3] != DBNull.Value)
                        {
                            url.FlagDomain = (DomainFlagValue)((byte)dr[3]);
                        }
                        if (dr[4] != DBNull.Value)
                        {
                            url.RobotsDisallowedPaths = (string)dr[4];
                        }
                        else
                        {
                            RobotsTxtEntry entry = settings.Robots.GetEntry(InternetUtils.HostName(url));
                            if (entry != null)
                            {
                                url.RobotsDisallowedPaths = ConcatenatePaths(entry.DisallowedPaths);
                            }
                            else
                            {
                                url.FlagFetchRobots = true;
                            }
                        }
                        data[i++] = url;
                    }
                    catch
                    {
                        continue;
                    }
                }
                SqlCommand statscmd = new SqlCommand("cw_update_client_statistics", dbcon, transaction);
                statscmd.CommandType = CommandType.StoredProcedure;
                statscmd.CommandTimeout = 120;
                statscmd.Parameters.Add("@client_id", SqlDbType.UniqueIdentifier);
                statscmd.Parameters.Add("@assigned", SqlDbType.BigInt);
                statscmd.Parameters.Add("@returned", SqlDbType.BigInt);
                statscmd.Parameters.Add("@type", SqlDbType.TinyInt);
                statscmd.Parameters[0].Value = ci.ClientID;
                statscmd.Parameters[1].Value = data.Length;
                statscmd.Parameters[2].Value = DBNull.Value;
                statscmd.Parameters[3].Value = 0;
                statscmd.ExecuteNonQuery();
                statscmd.Dispose();
                transaction.Commit();
            }
        }
        catch (Exception ex)
        {
            transaction.Rollback();
            if (settings.LogLevel <= CWLogLevel.LogWarning)
            {
                settings.Log.LogWarning("SelectUrlsToCrawl failed, transaction was rolled back: " + ex.ToString());
            }
            throw; //rethrow without resetting the stack trace
        }
        finally
        {
            UpdateClientLastActive(ci);
            LogClientAction(ci, CWClientActions.LogSendUrlsToCrawl);
            if (!DisconnectFromDatabase())
            {
                throw new CWDBConnectionFailedException("Disconnect from database failure.");
            }
        }
    }
    catch (Exception e)
    {
        sx = new SerializedException(e.GetType().ToString(), e.Message, e.ToString());
        if (settings.LogLevel <= CWLogLevel.LogWarning)
        {
            settings.Log.LogWarning("SelectUrlsToCrawl failed: " + e.ToString());
        }
    }
    return sx;
}
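// Client-side sketch (illustrative only): requesting a batch of urls and checking the
// returned SerializedException. The "engine" instance and the ServerEngine type name
// are assumptions; the error details are written out via ToString() since the exact
// members of SerializedException are not shown here.
public static InternetUrlToCrawl[] RequestBatchSketch(ServerEngine engine, ClientInfo ci)
{
    InternetUrlToCrawl[] batch = null;
    SerializedException sx = engine.SelectUrlsToCrawl(ci, ref batch);
    if (sx != null)
    {
        //the operation failed on the server; sx encapsulates the original error
        Console.WriteLine("SelectUrlsToCrawl failed: " + sx.ToString());
        return new InternetUrlToCrawl[0];
    }
    return batch;
}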
/// <summary>
/// Extracts the hypertext references (links) contained in a document.
/// </summary>
/// <param name="content">
/// An array of bytes holding the content of the document that will be parsed for links.
/// </param>
/// <param name="contentUrl">
/// An <see cref="InternetUrlToCrawl"/> object encapsulating the Uri address of the
/// document to be parsed for links and its associated robots.txt file.
/// </param>
/// <returns>
/// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects encapsulating
/// the links contained in the parsed document.
/// </returns>
public abstract ArrayList ExtractLinks(byte[] content, ref InternetUrlToCrawl contentUrl);
/// <summary>
/// Extracts the hypertext references (links) contained in a document.
/// </summary>
/// <param name="content">
/// The content of the document that will be parsed for links.
/// </param>
/// <param name="contentUrl">
/// An <see cref="InternetUrlToCrawl"/> object encapsulating the Uri address of the
/// document to be parsed for links and its associated robots.txt file.
/// </param>
/// <returns>
/// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects encapsulating
/// the links contained in the parsed document.
/// </returns>
public abstract ArrayList ExtractLinks(ref string content, ref InternetUrlToCrawl contentUrl);
/// <summary>
/// Extracts links from the contents of a SWF document.
/// </summary>
/// <param name="content">The contents of the SWF document.</param>
/// <param name="contentUrl">The url of the SWF document.</param>
/// <returns>
/// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
/// each link found in the content.
/// </returns>
/// <exception cref="ArgumentNullException">If the input buffer is null or empty.</exception>
public override ArrayList ExtractLinks(byte[] content, ref InternetUrlToCrawl contentUrl)
{
    ArrayList links = null;
    if ((content == null) || (content.Length == 0))
    {
        throw new ArgumentNullException("content", "The input buffer cannot be empty or null.");
    }
    try
    {
        mutex.WaitOne();
        string fileName = globals.AppWorkPath + Guid.NewGuid().ToString();
        string swfFileName = fileName + ".swf";
        string htmFileName = fileName + ".htm";
        FileStream swf = null;
        StreamReader htm = null;
        try
        {
            //store the swf file
            swf = new FileStream(swfFileName, FileMode.Create);
            swf.Write(content, 0, content.Length);
            swf.Close();
            swf = null;
            //convert it to html
            bool success = converter.ConvertSwfFile(swfFileName, htmFileName);
            if (success)
            {
                htm = new StreamReader(htmFileName, encoding);
                string html = htm.ReadToEnd();
                htm.Close();
                htm = null;
                links = parser.ExtractLinks(ref html, ref contentUrl);
            }
        }
        catch (Exception ex)
        {
            if (swf != null)
            {
                try { swf.Close(); } catch {}
            }
            if (htm != null)
            {
                try { htm.Close(); } catch {}
            }
            if (globals.Settings.LogLevel <= CWLogLevel.LogInfo)
            {
                globals.FileLog.LogInfo("SwfParser failed to extract links from " + contentUrl.Url + ": " + ex.ToString());
            }
        }
        finally
        {
            File.Delete(swfFileName);
            File.Delete(htmFileName);
        }
    }
    catch (Exception ex)
    {
        if (globals.Settings.LogLevel <= CWLogLevel.LogWarning)
        {
            globals.FileLog.LogWarning("SwfParser failed to extract links from " + contentUrl.Url + ": " + ex.Message);
        }
    }
    finally
    {
        GC.Collect();
        mutex.ReleaseMutex();
    }
    ParserEventArgs e = new ParserEventArgs(contentUrl.Url);
    OnExtractLinksComplete(e);
    return links;
}
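// Completion-event sketch (illustrative only): the parsers signal the end of extraction
// through OnExtractLinksComplete. The event name, the delegate type and the
// ParserEventArgs.Url member used below are assumptions inferred from the On... raiser
// above; adjust them to the actual declarations.
public static void AttachCompletionHandlerSketch(SwfParser parser)
{
    parser.ExtractLinksComplete += new ParserEventHandler(HandleExtractLinksComplete);
}

private static void HandleExtractLinksComplete(object sender, ParserEventArgs e)
{
    Console.WriteLine("Finished extracting links from " + e.Url);
}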
/// <summary>
/// Extracts links from the contents of a SWF document.
/// </summary>
/// <param name="content">The contents of the SWF document.</param>
/// <param name="contentUrl">The url of the SWF document.</param>
/// <returns>
/// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
/// each link found in the content.
/// </returns>
/// <exception cref="NotSupportedException">Whenever this method is called.</exception>
/// <remarks>
/// Since a SWF document cannot be converted to a string this method <b>ALWAYS</b>
/// throws a <see cref="NotSupportedException"/>.
/// </remarks>
public override ArrayList ExtractLinks(ref string content, ref InternetUrlToCrawl contentUrl)
{
    throw new NotSupportedException("The ExtractLinks method of the SwfParser cannot accept a string as input.");
}
/// <summary>
/// Checks if the Robots Exclusion Standard allows the crawler to visit a url.
/// </summary>
/// <param name="targetUrl">The url that is to be validated.</param>
/// <param name="sourceUrl">The <see cref="InternetUrlToCrawl"/> containing the targetUrl.</param>
/// <param name="robotsMeta">A <see cref="RobotsMetaTagValue"/> flag indicating the
/// restrictions posed by the robots meta tag contained in the sourceUrl.</param>
/// <returns>A <see cref="Boolean"/> value indicating whether the crawler is
/// allowed (false) or disallowed (true) to visit the target Url.</returns>
/// <remarks>This method is safe for multi-threaded operations. However, only one
/// thread will be able to perform a check at any given time.
/// </remarks>
public bool FilterUrl(string targetUrl, InternetUrlToCrawl sourceUrl, RobotsMetaTagValue robotsMeta)
{
    bool retVal = false; //assume that it's allowed to crawl the targetUrl
    try
    {
        mutex.WaitOne();
        //perhaps we should use the hash code of the hostnames as keys.
        string targetHost = InternetUtils.HostName(targetUrl);
        string sourceHost = InternetUtils.HostName(sourceUrl);
        RobotsTxtEntry robots = null;
        //Do we need to fetch the robots.txt for the source Url?
        if (sourceUrl.FlagFetchRobots)
        {
            //we must fetch the robots.txt from the source url host and update sourceUrl.
            robots = FetchRobots(sourceHost);
            sourceUrl.RobotsDisallowedPaths = ConcatenatePaths(robots.DisallowedPaths);
            sourceUrl.FlagFetchRobots = false; //fetch it only once
            //check if it exists in the Hashtable, if so update it, otherwise add it
            if (robotsTable.ContainsKey(sourceHost))
            {
                robotsTable[sourceHost] = robots;
            }
            else
            {
                robotsTable.Add(sourceHost, robots);
            }
        }
        else
        {
            //check if it exists in the Hashtable. If so check if it has expired,
            //otherwise just build it from the InternetUrlToCrawl
            if (!robotsTable.TryGetValue(sourceHost, out robots))
            {
                robots = new RobotsTxtEntry();
                robots.DisallowedPaths = SplitPaths(sourceUrl.RobotsDisallowedPaths);
                robotsTable.Add(sourceHost, robots);
            }
            else
            {
                if (robots.ExpirationDate < DateTime.Today)
                {
                    robots = FetchRobots(sourceHost);
                    robotsTable[sourceHost] = robots;
                }
            }
        }
        if (targetHost != sourceHost)
        {
            //the target url is on a different host, we must get its robots.txt
            if (!robotsTable.TryGetValue(targetHost, out robots))
            {
                robots = FetchRobots(targetHost);
                robotsTable.Add(targetHost, robots);
            }
            else
            {
                if (robots.ExpirationDate < DateTime.Today)
                {
                    robots = FetchRobots(targetHost);
                    robotsTable[targetHost] = robots;
                }
            }
        }
        if ((robotsMeta & RobotsMetaTagValue.NoFollow) > 0)
        {
            //if the meta tag has the NoFollow option set then we cannot crawl targetUrl
            retVal = true;
        }
        else
        {
            robots = robotsTable[targetHost];
            //if DisallowedPaths is null we can crawl targetUrl, otherwise we must check the disallowed paths
            if (robots.DisallowedPaths != null)
            {
                for (int i = 0; i < robots.DisallowedPaths.Length; i++)
                {
                    if (targetUrl.IndexOf(robots.DisallowedPaths[i]) != -1)
                    {
                        //we found a match, so it is not allowed to crawl targetUrl
                        retVal = true;
                        break; //stop searching as soon as we have a match
                    }
                }
            }
        }
    }
    catch (Exception e)
    {
        if (globals.Settings.LogLevel <= CWLogLevel.LogWarning)
        {
            globals.FileLog.LogWarning("RobotsFilter failed to filter " + targetUrl + ": " + e.ToString());
        }
    }
    finally
    {
        mutex.ReleaseMutex();
    }
    return retVal;
}
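// Robots-filtering sketch (illustrative only): deciding whether a candidate link may
// be crawled. The filter instance, the source url and the candidate url are assumptions
// made for demonstration; RobotsMetaTagValue.NoMeta signals that the source document
// carried no robots meta tag.
public static bool IsDisallowedSketch(RobotsFilter robotsFilter, InternetUrlToCrawl sourceUrl)
{
    string candidate = "http://www.example.com/private/page.htm";
    //FilterUrl returns true when the Robots Exclusion Standard disallows the visit
    return robotsFilter.FilterUrl(candidate, sourceUrl, RobotsMetaTagValue.NoMeta);
}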
/// <summary>
/// Extracts links from the contents of a PDF document.
/// </summary>
/// <param name="content">The contents of the PDF document.</param>
/// <param name="contentUrl">The url of the PDF document.</param>
/// <returns>
/// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
/// each link found in the content.
/// </returns>
/// <exception cref="ArgumentNullException">If the input buffer is null or empty.</exception>
public override ArrayList ExtractLinks(byte[] content, ref InternetUrlToCrawl contentUrl)
{
    ArrayList links = null;
    if ((content == null) || (content.Length == 0))
    {
        throw new ArgumentNullException("content", "The input buffer cannot be empty or null.");
    }
    try
    {
        mutex.WaitOne();
        string fileName = globals.AppWorkPath + Guid.NewGuid().ToString();
        string pdfFileName = fileName + ".pdf";
        string txtFileName = fileName + ".txt";
        FileStream pdf = null;
        StreamReader txt = null;
        try
        {
            //store the pdf file
            pdf = new FileStream(pdfFileName, FileMode.Create);
            pdf.Write(content, 0, content.Length);
            pdf.Close();
            pdf = null;
            //convert it to text
            int success = converter.ConvertPdf2Text(pdfFileName, txtFileName);
            if (success == 0)
            {
                txt = new StreamReader(txtFileName, encoding);
                string text = txt.ReadToEnd();
                txt.Close();
                txt = null;
                links = parser.ExtractLinks(ref text, ref contentUrl);
            }
            //if the conversion failed there is no reader to close; txt is still null
        }
        catch (Exception ex)
        {
            if (pdf != null)
            {
                try { pdf.Close(); } catch {}
            }
            if (txt != null)
            {
                try { txt.Close(); } catch {}
            }
            if (globals.Settings.LogLevel <= CWLogLevel.LogInfo)
            {
                globals.FileLog.LogInfo("PdfParser failed to extract links from " + contentUrl.Url + ": " + ex.ToString());
            }
        }
        finally
        {
            File.Delete(pdfFileName);
            File.Delete(txtFileName);
        }
    }
    catch
    {
        if (globals.Settings.LogLevel <= CWLogLevel.LogWarning)
        {
            globals.FileLog.LogWarning("PdfParser failed to extract links from " + contentUrl.Url);
        }
    }
    finally
    {
        GC.Collect();
        mutex.ReleaseMutex();
    }
    ParserEventArgs e = new ParserEventArgs(contentUrl.Url);
    OnExtractLinksComplete(e);
    return links;
}
/// <summary>
/// Extracts links from the contents of a PDF document.
/// </summary>
/// <param name="content">The contents of the PDF document.</param>
/// <param name="contentUrl">The url of the PDF document.</param>
/// <returns>
/// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
/// each link found in the content.
/// </returns>
/// <exception cref="NotSupportedException">Whenever this method is called.</exception>
/// <remarks>
/// Since a PDF document cannot be converted to a string this method <b>ALWAYS</b>
/// throws a <see cref="NotSupportedException"/>.
/// </remarks>
public override ArrayList ExtractLinks(ref string content, ref InternetUrlToCrawl contentUrl)
{
    throw new NotSupportedException("The ExtractLinks method of the PdfParser cannot accept a string as input.");
}
/// <summary>
/// Performs the extraction of links from a text document. It can extract simple
/// links that are separated from the rest of the text by spaces, line breaks or
/// any other delimiters. The results are returned as an <see cref="ArrayList"/>
/// of <see cref="InternetUrlToIndex"/> objects.
/// </summary>
/// <remarks>
/// Besides the parsing and extraction of Urls, ExtractLinks also performs other
/// tasks, such as:<br/>
/// <list type="bullet">
///   <item>
///     <description>Filtering of urls to resources of unsupported content-type, e.g. css, images, etc.</description>
///   </item>
///   <item>
///     <description>Filtering of multiple links to the same url and of links to the document itself.</description>
///   </item>
///   <item>
///     <description>Filtering of session id variables in dynamic Urls and limiting
///     of the number of GET variables in dynamic Urls.</description>
///   </item>
///   <item>
///     <description>Flagging of Urls according to their country domain.</description>
///   </item>
/// </list>
/// <b>Update History</b>
/// <list type="table">
///   <listheader>
///     <term>Date</term>
///     <description>Description</description>
///   </listheader>
///   <item>
///     <term>15/09/04</term>
///     <description>First release. A lot more needs to be done.</description>
///   </item>
/// </list>
/// </remarks>
/// <param name="content">The text that must be parsed for links. It is passed by
/// reference in order to reduce memory consumption.</param>
/// <param name="contentUrl">The Url from which the content comes.</param>
/// <returns>
/// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
/// each link found in the content.
/// </returns>
public override ArrayList ExtractLinks(ref string content, ref InternetUrlToCrawl contentUrl)
{
    ArrayList links = new ArrayList();
    // It is important to notice that if the FlagFetchRobots of the contentUrl is
    // true then the TextParser must remember this value, because during the Robots
    // Filtering it will become false so as not to download the robots.txt file
    // every time a Url must be filtered.
    //bool FlagFetchRobots = contentUrl.FlagFetchRobots;
    try
    {
        //make sure only one thread will parse contents at a time.
        //mutex.WaitOne();
        if (contentUrl.FlagDomain != DomainFlagValue.MustVisit)
        {
            contentUrl.FlagDomain = ExtractDomainFlag(ref content);
            if (contentUrl.FlagDomain != DomainFlagValue.MustVisit)
            {
                if (InternetUtils.HostName(contentUrl).Contains("ebay.com"))
                {
                    contentUrl.FlagDomain = DomainFlagValue.MustVisit;
                }
            }
        }
        //perform the hyperlink matching
        MatchCollection matches = hrefRegex.Matches(content);
        if (matches.Count > 0)
        {
            string documentUrl = contentUrl.Url;
            string baseUrl = BaseUrl(ref documentUrl);
            byte priority = 0;
            foreach (Match m in matches)
            {
                try
                {
                    string url = m.Value.Trim();
                    url = NormalizeUrl(ref url, ref baseUrl);
                    priority = CleanUrlParams(ref url);
                    if (FilterUrl(ref url, ref documentUrl))
                    {
                        InternetUrlToIndex iurl = new InternetUrlToIndex(url);
                        iurl.Priority = priority;
                        iurl.FlagDomain = domainFilter.FilterUrl(ref url);
                        //[mod 24/2/05] No robots.txt checking is performed for non-greek urls
                        if (iurl.FlagDomain == DomainFlagValue.MustVisit)
                        {
                            iurl.FlagRobots = robotsFilter.FilterUrl(url, contentUrl, RobotsMetaTagValue.NoMeta);
                        }
                        else
                        {
                            iurl.FlagRobots = false;
                        }
                        if (!links.Contains(iurl))
                        {
                            links.Add(iurl);
                        }
                    }
                }
                catch
                {
                    if (globals.Settings.LogLevel <= CWLogLevel.LogInfo)
                    {
                        globals.FileLog.LogInfo("TextParser failed to parse " + m.Value);
                    }
                    continue;
                }
            }
        }
    }
    catch (Exception ex)
    {
        if (globals.Settings.LogLevel <= CWLogLevel.LogWarning)
        {
            globals.FileLog.LogWarning(ex.Message);
        }
    }
    finally
    {
        //mutex.ReleaseMutex();
    }
    //contentUrl.FlagFetchRobots = FlagFetchRobots;
    ParserEventArgs e = new ParserEventArgs(contentUrl.Url);
    OnExtractLinksComplete(e);
    links.TrimToSize();
    return links;
}
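// End-to-end sketch (illustrative only): running the string overload above against a
// small in-memory document. The parser instance, the (id, url) constructor arguments
// and the sample urls are assumptions made for demonstration.
public static void StringOverloadUsageSketch(TextParser parser)
{
    InternetUrlToCrawl pageUrl = new InternetUrlToCrawl(2, "http://www.example.com/index.htm");
    string text = "Mirror: http://www.example.org/index.htm and http://www.example.net/";
    ArrayList links = parser.ExtractLinks(ref text, ref pageUrl);
    Console.WriteLine(links.Count + " unique links extracted, filtered and flagged");
}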