/// <summary>
/// Removes all HTML comments from a string.
/// </summary>
/// <param name="strString">The string.</param>
/// <returns>
/// Comment-free version of the string. If the parser reports that a comment
/// opener has no matching terminator, the original string is returned unchanged.
/// </returns>
public static string RemoveComments(string strString)
{
    // Collect the surviving segments in a builder; appending to a string inside
    // the loop would re-copy the whole result for every comment encountered.
    System.Text.StringBuilder commentFree = new System.Text.StringBuilder();
    string strSegment = "";
    HTMLStringHelper parser = new HTMLStringHelper(strString);

    // Each successful ExtractTo yields the segment preceding the next comment
    // opener; SkipToEndOf then moves the parser past the comment terminator.
    while (parser.ExtractTo("<!--", ref strSegment))
    {
        commentFree.Append(strSegment);
        if (!parser.SkipToEndOf("-->"))
        {
            // No terminator could be skipped to: give up and hand the input
            // back untouched rather than returning a truncated string.
            return strString;
        }
    }

    // Keep whatever remains after the last comment.
    parser.ExtractToEnd(ref strSegment);
    commentFree.Append(strSegment);
    return commentFree.ToString();
}
/// <summary>
/// Retrieves the collection of HTML links in a string.
/// </summary>
/// <remarks>
/// Comments and scripts are removed first and single quotes are replaced by
/// double quotes, after which every <c>href="..."</c> target is collected into
/// <paramref name="documents"/> and every <c>src="..."</c> target into
/// <paramref name="images"/>. Targets containing <c>mailto:</c> are not treated
/// as documents. Targets that do not start with <c>http://</c>, <c>https://</c>
/// or <c>ftp://</c> are qualified against the root url on a best-guess basis.
/// Duplicate urls are not added twice.
/// </remarks>
/// <param name="strString">The string.</param>
/// <param name="strRootUrl">Root url (may be null).</param>
/// <param name="documents">Collection of document link strings; created if null.</param>
/// <param name="images">Collection of image link strings; created if null.</param>
public static void GetLinks(string strString, string strRootUrl, ref ArrayList documents, ref ArrayList images)
{
    // The collections are passed by reference, so a caller handing in null
    // gets a fresh list back instead of a NullReferenceException on first Add.
    if (documents == null)
    {
        documents = new ArrayList();
    }
    if (images == null)
    {
        images = new ArrayList();
    }

    // Nothing to scan.
    if (string.IsNullOrEmpty(strString))
    {
        return;
    }

    // Remove comments and JavaScript and fix links
    strString = HTMLStringHelper.RemoveComments(strString);
    strString = HTMLStringHelper.RemoveScripts(strString);
    HTMLStringHelper parser = new HTMLStringHelper(strString);

    // Normalize quoting so that only the double-quoted attribute form has to be matched below.
    parser.ReplaceEvery("\'", "\"");

    // Set root url (trimmed, with a trailing slash when non-empty)
    string rootUrl = "";
    if (strRootUrl != null)
    {
        rootUrl = strRootUrl.Trim();
    }
    if ((rootUrl.Length > 0) && !rootUrl.EndsWith("/", StringComparison.Ordinal))
    {
        rootUrl += "/";
    }

    // Extract HREF targets
    string strUrl = "";
    parser.ResetPosition();
    while (parser.SkipToEndOfNoCase("href=\""))
    {
        if (!parser.ExtractTo("\"", ref strUrl))
        {
            continue;
        }

        strUrl = strUrl.Trim();
        if (strUrl.Length == 0)
        {
            continue;
        }

        // E-mail links are not documents. The attribute name is matched without
        // regard to case, so the scheme is too.
        if (strUrl.IndexOf("mailto:", StringComparison.OrdinalIgnoreCase) != -1)
        {
            continue;
        }

        // Add url to document list if not already present
        strUrl = QualifyLinkUrl(strUrl, rootUrl);
        if (!documents.Contains(strUrl))
        {
            documents.Add(strUrl);
        }
    }

    // Extract SRC targets
    parser.ResetPosition();
    while (parser.SkipToEndOfNoCase("src=\""))
    {
        if (!parser.ExtractTo("\"", ref strUrl))
        {
            continue;
        }

        strUrl = strUrl.Trim();
        if (strUrl.Length == 0)
        {
            continue;
        }

        // Add url to images list if not already present
        strUrl = QualifyLinkUrl(strUrl, rootUrl);
        if (!images.Contains(strUrl))
        {
            images.Add(strUrl);
        }
    }
}

/// <summary>
/// Turns a link target into a fully qualified url (best guess).
/// </summary>
/// <param name="strUrl">Trimmed, non-empty link target.</param>
/// <param name="rootUrl">Root url to qualify against (may be empty).</param>
/// <returns>The target itself if it is already absolute, otherwise the qualified url.</returns>
private static string QualifyLinkUrl(string strUrl, string rootUrl)
{
    // An absolute url must be left alone; treating it as a path would graft it
    // onto the root url. Ordinal, case-insensitive comparison keeps the check
    // independent of the current culture and of how the scheme was typed.
    if (strUrl.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        || strUrl.StartsWith("https://", StringComparison.OrdinalIgnoreCase)
        || strUrl.StartsWith("ftp://", StringComparison.OrdinalIgnoreCase))
    {
        return strUrl;
    }

    try
    {
        UriBuilder uriBuilder = new UriBuilder(rootUrl);
        uriBuilder.Path = strUrl;
        return uriBuilder.Uri.ToString();
    }
    catch (Exception)
    {
        // Deliberate best-effort fallback: when the root url cannot be used to
        // build a uri (for example because it is empty), assume the target is
        // a host-relative address and prefix the default scheme.
        return "http://" + strUrl;
    }
}