//public static void GetLinks(string strString, string strRootUrl,string urlPrefix, ref ArrayList documents,string pattern,string regular)
//{
//    // Remove comments and JavaScript and fix links
//    strString = HTMLStringHelper.RemoveComments(strString);
//    strString = HTMLStringHelper.RemoveScripts(strString);
//    // Set root url
//    string rootUrl = string.Empty;
//    if (strRootUrl != null)
//        rootUrl = strRootUrl.Trim();
//    if ((rootUrl.Length > 0) && !rootUrl.EndsWith("/"))
//        rootUrl += "/";
//    // Extract HREF targets
//    Regex regex = new Regex("href=[\"']?(?<group>[\\w\\d._=&?/;#-]+)[\"']?", RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Multiline);
//    // get all the matches depending upon the regular expression
//    MatchCollection mcl = regex.Matches(strString);
//    string strUrl = string.Empty;
//    foreach (Match ml in mcl)
//    {
//        if (ml.Groups.Count > 1)
//        {
//            strUrl = ml.Groups[1].Value.Replace("&", "&");
//            // Get fully qualified url (best guess)
//            if (!strUrl.StartsWith("http://"))
//            {
//                try
//                {
//                    UriBuilder uriBuilder = new UriBuilder(rootUrl);
//                    uriBuilder.Path = strUrl;
//                    strUrl = uriBuilder.Uri.ToString();
//                }
//                catch (Exception)
//                {
//                    strUrl = strUrl.Replace("../", string.Empty);
//                    if (strUrl.StartsWith("..http"))
//                        strUrl = strUrl.Substring(2);
//                    if (!strUrl.StartsWith("http"))
//                        strUrl = urlPrefix + strUrl;
//                }
//            }
//            // Add url to document list if not already present
//            if (strUrl.IndexOf(pattern) != -1)
//            {
//                if (regular != null && regular.Length > 0)
//                {
//                    if (ValidateURL(strUrl, regular))
//                        if (!documents.Contains(strUrl))
//                            documents.Add(strUrl);
//                }
//                else
//                    if (!documents.Contains(strUrl))
//                        documents.Add(strUrl);
//            }
//        }
//    }
//}

#region Overload GetLinks

/// <summary>
/// Matches <c>href = value</c> attributes, with or without surrounding quotes.
/// The attribute value is captured in the group named "group".
/// Built once and shared, because constructing a compiled regex on every call is expensive.
/// </summary>
// Earlier patterns, kept for reference:
//   "href=[\"']?(?<group>[\\w\\d._=&?/;#-]+)[\"']?"
//   "href\\s*=\\s*(?<s>['\"])(?<group>[^>]+?)\\k<s>"
private static readonly Regex GetLinksHrefRegex = new Regex(
    "href\\s*=\\s*(?<s>['\"]?)(?<group>[^ >]+)\\k<s>",
    RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Multiline);

/// <summary>
/// Collects the href targets found in an HTML string into <paramref name="documents"/>,
/// optionally filtered by a substring and by <c>ValidateURL</c>.
/// </summary>
/// <param name="strString">The HTML text to scan. Null or empty text yields no links.</param>
/// <param name="strRootUrl">Root url used to qualify relative links (may be null).</param>
/// <param name="urlPrefix">Prefix prepended to a link when it cannot be qualified against the root url.</param>
/// <param name="documents">Receives the link strings; each link is added at most once. A null list is replaced by a new one.</param>
/// <param name="pattern">When non-empty, only links containing this substring are kept. Null or empty keeps every link.</param>
/// <param name="regular">When non-empty (and <paramref name="pattern"/> matched), the link must also pass <c>ValidateURL</c>.</param>
public static void GetLinks(string strString, string strRootUrl, string urlPrefix, ref ArrayList documents, string pattern, string regular)
{
    if (documents == null)
    {
        documents = new ArrayList();
    }

    // Regex.Matches rejects null input, and there is nothing to extract from empty text.
    if (string.IsNullOrEmpty(strString))
    {
        return;
    }

    // Remove comments and JavaScript so links inside them are not picked up
    strString = HTMLStringHelper.RemoveComments(strString);
    strString = HTMLStringHelper.RemoveScripts(strString);
    if (string.IsNullOrEmpty(strString))
    {
        return;
    }

    // Normalise the root url so that it ends with a slash
    string rootUrl = strRootUrl == null ? string.Empty : strRootUrl.Trim();
    if (rootUrl.Length > 0 && !rootUrl.EndsWith("/", StringComparison.Ordinal))
    {
        rootUrl += "/";
    }

    bool hasPattern = !string.IsNullOrEmpty(pattern);
    bool hasRegular = !string.IsNullOrEmpty(regular);

    foreach (Match ml in GetLinksHrefRegex.Matches(strString))
    {
        Group target = ml.Groups["group"];
        if (!target.Success)
        {
            continue;
        }

        // Decode HTML-encoded ampersands and drop stray double quotes left by unbalanced quoting
        string strUrl = target.Value.Replace("&amp;", "&").Replace("\"", "");

        // Get fully qualified url (best guess). Both http and https links are already
        // absolute; scheme names are compared without regard to case.
        bool isAbsolute = strUrl.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
            || strUrl.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
        if (!isAbsolute)
        {
            try
            {
                UriBuilder uriBuilder = new UriBuilder(rootUrl);
                uriBuilder.Path = strUrl;
                strUrl = uriBuilder.Uri.ToString();
            }
            catch (Exception)
            {
                // Deliberate best effort: when the root url cannot be used (for example it
                // is empty), strip parent-directory segments and fall back to the prefix.
                strUrl = strUrl.Replace("../", string.Empty);
                if (strUrl.StartsWith("..http", StringComparison.Ordinal))
                {
                    strUrl = strUrl.Substring(2);
                }
                if (!strUrl.StartsWith("http", StringComparison.Ordinal))
                {
                    strUrl = urlPrefix + strUrl;
                }
            }
        }

        // Decide whether the link passes the optional filters
        bool accepted;
        if (!hasPattern)
        {
            accepted = true;
        }
        else if (strUrl.IndexOf(pattern, StringComparison.Ordinal) < 0)
        {
            accepted = false;
        }
        else
        {
            accepted = !hasRegular || ValidateURL(strUrl, regular);
        }

        // Add url to document list if not already present
        if (accepted && !documents.Contains(strUrl))
        {
            documents.Add(strUrl);
        }
    }
}
/// <summary>
/// Retrieves the collection of HTML links in a string.
/// Targets of <c>href="..."</c> attributes go to <paramref name="documents"/> (links containing
/// "mailto:" are skipped); targets of <c>src="..."</c> attributes go to <paramref name="images"/>.
/// Single quotes in the text are converted to double quotes before scanning, and each link is
/// added to its list at most once.
/// </summary>
/// <param name="strString">The string.</param>
/// <param name="strRootUrl">Root url (may be null).</param>
/// <param name="documents">Collection of document link strings. A null list is replaced by a new one.</param>
/// <param name="images">Collection of image link strings. A null list is replaced by a new one.</param>
public static void GetLinks(string strString, string strRootUrl, ref ArrayList documents, ref ArrayList images)
{
    if (documents == null)
    {
        documents = new ArrayList();
    }
    if (images == null)
    {
        images = new ArrayList();
    }

    // Remove comments and JavaScript and fix links
    strString = HTMLStringHelper.RemoveComments(strString);
    strString = HTMLStringHelper.RemoveScripts(strString);
    HTMLStringHelper parser = new HTMLStringHelper(strString);
    parser.ReplaceEvery("\'", "\"");

    // Set root url
    string rootUrl = strRootUrl == null ? "" : strRootUrl.Trim();
    if (rootUrl.Length > 0 && !rootUrl.EndsWith("/", StringComparison.Ordinal))
    {
        rootUrl += "/";
    }

    // Extract HREF targets
    string strUrl = "";
    parser.ResetPosition();
    while (parser.SkipToEndOfNoCase("href=\""))
    {
        if (!parser.ExtractTo("\"", ref strUrl))
        {
            continue;
        }

        strUrl = strUrl.Trim();

        // Skip empty targets and e-mail links (scheme compared without regard to case)
        if (strUrl.Length == 0 || strUrl.IndexOf("mailto:", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            continue;
        }

        strUrl = QualifyLinkUrl(strUrl, rootUrl);

        // Add url to document list if not already present
        if (!documents.Contains(strUrl))
        {
            documents.Add(strUrl);
        }
    }

    // Extract SRC targets
    parser.ResetPosition();
    while (parser.SkipToEndOfNoCase("src=\""))
    {
        if (!parser.ExtractTo("\"", ref strUrl))
        {
            continue;
        }

        strUrl = strUrl.Trim();
        if (strUrl.Length == 0)
        {
            continue;
        }

        strUrl = QualifyLinkUrl(strUrl, rootUrl);

        // Add url to images list if not already present
        if (!images.Contains(strUrl))
        {
            images.Add(strUrl);
        }
    }
}

/// <summary>
/// Returns a fully qualified url for a link (best guess).
/// Links that already start with http://, https:// or ftp:// (any letter case) are returned
/// unchanged. Anything else is placed in the path of a <see cref="UriBuilder"/> created from
/// <paramref name="rootUrl"/>; if that fails, "http://" is prepended to the link.
/// </summary>
/// <param name="url">The trimmed, non-empty link text.</param>
/// <param name="rootUrl">The normalised root url (may be empty).</param>
/// <returns>The qualified url string.</returns>
private static string QualifyLinkUrl(string url, string rootUrl)
{
    bool isAbsolute = url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        || url.StartsWith("https://", StringComparison.OrdinalIgnoreCase)
        || url.StartsWith("ftp://", StringComparison.OrdinalIgnoreCase);
    if (isAbsolute)
    {
        return url;
    }

    try
    {
        UriBuilder uriBuilder = new UriBuilder(rootUrl);
        uriBuilder.Path = url;
        return uriBuilder.Uri.ToString();
    }
    catch (Exception)
    {
        // Deliberate best effort: the root url is unusable (for example empty),
        // so treat the link text as a host-relative address.
        return "http://" + url;
    }
}