/////////////////
// Static methods

/// <summary>
/// Retrieves the collection of HTML links in a string.
/// </summary>
/// <remarks>
/// The input is first passed through <c>removeComments</c> and <c>removeScripts</c>, and single
/// quotes are replaced by double quotes, so only double-quoted <c>href</c> and <c>src</c>
/// attribute values are extracted. Targets that do not start with <c>http://</c>,
/// <c>https://</c> or <c>ftp://</c> (compared case-insensitively) are resolved against
/// <paramref name="strRootUrl"/>; when no usable root url is available, "http://" is prepended
/// as a best guess. HREF targets containing "mailto:" are skipped. Each url is added to its
/// list only once.
/// </remarks>
/// <param name="strString">The string.</param>
/// <param name="strRootUrl">Root url (may be null).</param>
/// <param name="documents">Collection of document link strings (HREF targets are appended).</param>
/// <param name="images">Collection of image link strings (SRC targets are appended).</param>
public static void getLinks (string strString, string strRootUrl, ref ArrayList documents, ref ArrayList images)
{
    // Remove comments and JavaScript and fix links
    strString = HTMLParser.removeComments(strString);
    strString = HTMLParser.removeScripts(strString);
    HTMLParser parser = new HTMLParser(strString);
    parser.replaceEvery("\'", "\"");

    // Set root url; a non-empty root always ends with "/" so that relative
    // links resolve below it rather than replacing its last path segment.
    string rootUrl = (strRootUrl != null) ? strRootUrl.Trim() : "";
    if ((rootUrl.Length > 0) && !rootUrl.EndsWith("/", StringComparison.Ordinal))
    {
        rootUrl += "/";
    }

    // Parse the root url once instead of once per link. UriBuilder is used
    // because it also accepts scheme-less roots such as "www.example.com/"
    // (it assumes http in that case).
    Uri baseUri = null;
    if (rootUrl.Length > 0)
    {
        try
        {
            baseUri = new UriBuilder(rootUrl).Uri;
        }
        catch (UriFormatException)
        {
            // Unusable root url: fall back to the "http://" best guess below.
            baseUri = null;
        }
    }

    // One pass per attribute: HREF targets go to documents, SRC targets to images.
    string[] attributeMarkers = new string[] { "href=\"", "src=\"" };
    ArrayList[] targetLists = new ArrayList[] { documents, images };

    string strUrl = "";
    for (int pass = 0; pass < attributeMarkers.Length; pass++)
    {
        bool isHrefPass = (pass == 0);
        ArrayList targetList = targetLists[pass];

        parser.resetPosition();
        while (parser.skipToEndOfNoCase(attributeMarkers[pass]))
        {
            if (!parser.extractTo("\"", ref strUrl))
            {
                continue;
            }

            strUrl = strUrl.Trim();
            if (strUrl.Length == 0)
            {
                continue;
            }

            // Mail links are not documents (only filtered for HREF targets).
            if (isHrefPass && (strUrl.IndexOf("mailto:", StringComparison.OrdinalIgnoreCase) != -1))
            {
                continue;
            }

            // Get fully qualified url (best guess). URI schemes are
            // case-insensitive, and https links must not be treated as relative.
            bool isAbsolute =
                strUrl.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                strUrl.StartsWith("https://", StringComparison.OrdinalIgnoreCase) ||
                strUrl.StartsWith("ftp://", StringComparison.OrdinalIgnoreCase);
            if (!isAbsolute)
            {
                Uri resolvedUri;
                if ((baseUri != null) && Uri.TryCreate(baseUri, strUrl, out resolvedUri))
                {
                    // Standard relative resolution keeps the root's path and
                    // leaves query strings and fragments intact.
                    strUrl = resolvedUri.ToString();
                }
                else
                {
                    strUrl = "http://" + strUrl;
                }
            }

            // Add url to the target list if not already present
            if (!targetList.Contains(strUrl))
            {
                targetList.Add(strUrl);
            }
        }
    }
}
/////////////////
// Static methods

/// <summary>
/// Retrieves the collection of HTML links in a string.
/// </summary>
/// <remarks>
/// The input is first passed through <c>removeComments</c> and <c>removeScripts</c>, and single
/// quotes are replaced by double quotes, so only double-quoted <c>href</c> and <c>src</c>
/// attribute values are extracted. Targets that do not start with <c>http://</c>,
/// <c>https://</c> or <c>ftp://</c> (compared case-insensitively) are resolved against
/// <paramref name="strRootUrl"/>; when no usable root url is available, "http://" is prepended
/// as a best guess. HREF targets containing "mailto:" are skipped. Each url is added to its
/// list only once.
/// </remarks>
/// <param name="strString">The string.</param>
/// <param name="strRootUrl">Root url (may be null).</param>
/// <param name="documents">Collection of document link strings (HREF targets are appended).</param>
/// <param name="images">Collection of image link strings (SRC targets are appended).</param>
public static void getLinks (string strString, string strRootUrl, ref ArrayList documents, ref ArrayList images)
{
    // Remove comments and JavaScript and fix links
    strString = HTMLParser.removeComments(strString);
    strString = HTMLParser.removeScripts(strString);
    HTMLParser parser = new HTMLParser(strString);
    parser.replaceEvery("\'", "\"");

    // Set root url; a non-empty root always ends with "/" so that relative
    // links resolve below it rather than replacing its last path segment.
    string rootUrl = (strRootUrl != null) ? strRootUrl.Trim() : "";
    if ((rootUrl.Length > 0) && !rootUrl.EndsWith("/", StringComparison.Ordinal))
    {
        rootUrl += "/";
    }

    // Parse the root url once instead of once per link. UriBuilder is used
    // because it also accepts scheme-less roots such as "www.example.com/"
    // (it assumes http in that case).
    Uri baseUri = null;
    if (rootUrl.Length > 0)
    {
        try
        {
            baseUri = new UriBuilder(rootUrl).Uri;
        }
        catch (UriFormatException)
        {
            // Unusable root url: fall back to the "http://" best guess below.
            baseUri = null;
        }
    }

    // One pass per attribute: HREF targets go to documents, SRC targets to images.
    string[] attributeMarkers = new string[] { "href=\"", "src=\"" };
    ArrayList[] targetLists = new ArrayList[] { documents, images };

    string strUrl = "";
    for (int pass = 0; pass < attributeMarkers.Length; pass++)
    {
        bool isHrefPass = (pass == 0);
        ArrayList targetList = targetLists[pass];

        parser.resetPosition();
        while (parser.skipToEndOfNoCase(attributeMarkers[pass]))
        {
            if (!parser.extractTo("\"", ref strUrl))
            {
                continue;
            }

            strUrl = strUrl.Trim();
            if (strUrl.Length == 0)
            {
                continue;
            }

            // Mail links are not documents (only filtered for HREF targets).
            if (isHrefPass && (strUrl.IndexOf("mailto:", StringComparison.OrdinalIgnoreCase) != -1))
            {
                continue;
            }

            // Get fully qualified url (best guess). URI schemes are
            // case-insensitive, and https links must not be treated as relative.
            bool isAbsolute =
                strUrl.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                strUrl.StartsWith("https://", StringComparison.OrdinalIgnoreCase) ||
                strUrl.StartsWith("ftp://", StringComparison.OrdinalIgnoreCase);
            if (!isAbsolute)
            {
                Uri resolvedUri;
                if ((baseUri != null) && Uri.TryCreate(baseUri, strUrl, out resolvedUri))
                {
                    // Standard relative resolution keeps the root's path and
                    // leaves query strings and fragments intact.
                    strUrl = resolvedUri.ToString();
                }
                else
                {
                    strUrl = "http://" + strUrl;
                }
            }

            // Add url to the target list if not already present
            if (!targetList.Contains(strUrl))
            {
                targetList.Add(strUrl);
            }
        }
    }
}
/// <summary>
/// Returns a version of a string without any HTML tags.
/// </summary>
/// <remarks>
/// A fixed set of named character entities and the numeric apostrophe entity are decoded,
/// line-break and paragraph tags are turned into spaces, every remaining tag is stripped,
/// double spaces are reduced and the result is trimmed.
/// </remarks>
/// <param name="strString">The string (null is treated as an empty string).</param>
/// <returns>Version of string without HTML tags.</returns>
public static string removeHtml (string strString)
{
    if (strString == null)
    {
        return "";
    }

    HTMLParser parser = new HTMLParser(strString);

    // Case-sensitive entity replacements, applied in this fixed order (an
    // ordered table rather than a Hashtable, whose enumeration order is
    // unspecified).
    // NOTE(review): the accented-letter entities are decoded to the matching
    // characters; confirm no caller relies on them being dropped instead.
    string[,] entities = new string[,]
    {
        { "&nbsp;",   " "      },
        { "&aring;",  "\u00E5" },
        { "&auml;",   "\u00E4" },
        { "&eacute;", "\u00E9" },
        { "&iacute;", "\u00ED" },
        { "&igrave;", "\u00EC" },
        { "&ograve;", "\u00F2" },
        { "&ouml;",   "\u00F6" },
        { "&quot;",   "\""     },
        { "&szlig;",  "\u00DF" }
    };
    for (int i = 0; i < entities.GetLength(0); i++)
    {
        parser.replaceEveryExact(entities[i, 0], entities[i, 1]);
    }

    // Sequential replacements: strip leading zeros from numeric entities so
    // that e.g. "&#039;" becomes "&#39;", then decode the apostrophe.
    parser.replaceEveryExact("&#0", "&#");
    parser.replaceEveryExact("&#39;", "'");

    // Decode the ampersand last so an escaped entity such as "&amp;nbsp;"
    // yields the literal text "&nbsp;" rather than being decoded twice.
    parser.replaceEveryExact("&amp;", "&");

    // Insert a space before every closing tag so that text on either side of
    // a tag does not fuse once tags are stripped. Done via a temporary
    // marker; presumably a direct "</" -> " </" replacement would re-match
    // its own output (verify against replaceEveryExact).
    parser.replaceEveryExact("</", " <~/");
    parser.replaceEveryExact("<~/", "</");

    // Case-insensitive replacements: line breaks and paragraphs become
    // spaces. These run unconditionally; a case-sensitive pre-check against
    // the input would skip upper-case tags such as <BR>.
    string[] spacingTags = new string[] { "<br>", "<br />", "<br/>", "<p>", "<p />", "<p/>" };
    foreach (string tag in spacingTags)
    {
        parser.replaceEvery(tag, " ");
    }

    // Remove all tags
    string html = parser.Content;
    System.Text.StringBuilder clean = new System.Text.StringBuilder(html.Length);
    int nIndex = 0;
    int nStartTag;
    while ((nStartTag = html.IndexOf('<', nIndex)) != -1)
    {
        // Copy the text up to the start of the tag
        clean.Append(html, nIndex, nStartTag - nIndex);
        nIndex = nStartTag + 1;

        // Skip over the tag; an unterminated tag drops only the "<" itself
        // and keeps the text that follows it.
        int nEndTag = html.IndexOf('>', nIndex);
        if (nEndTag == -1)
        {
            break;
        }
        nIndex = nEndTag + 1;
    }

    // Gather remaining text
    if (nIndex < html.Length)
    {
        clean.Append(html, nIndex, html.Length - nIndex);
    }

    // Finally, reduce spaces
    parser.Content = clean.ToString();
    parser.replaceEveryExact("  ", " ");

    // Return the de-HTMLized string
    return parser.Content.Trim();
}
/// <summary>
/// Returns a version of a string without any HTML tags.
/// </summary>
/// <remarks>
/// A fixed set of named character entities and the numeric apostrophe entity are decoded,
/// line-break and paragraph tags are turned into spaces, every remaining tag is stripped,
/// double spaces are reduced and the result is trimmed.
/// </remarks>
/// <param name="strString">The string (null is treated as an empty string).</param>
/// <returns>Version of string without HTML tags.</returns>
public static string removeHtml (string strString)
{
    if (strString == null)
    {
        return "";
    }

    HTMLParser parser = new HTMLParser(strString);

    // Case-sensitive entity replacements, applied in this fixed order (an
    // ordered table rather than a Hashtable, whose enumeration order is
    // unspecified).
    // NOTE(review): the accented-letter entities are decoded to the matching
    // characters; confirm no caller relies on them being dropped instead.
    string[,] entities = new string[,]
    {
        { "&nbsp;",   " "      },
        { "&aring;",  "\u00E5" },
        { "&auml;",   "\u00E4" },
        { "&eacute;", "\u00E9" },
        { "&iacute;", "\u00ED" },
        { "&igrave;", "\u00EC" },
        { "&ograve;", "\u00F2" },
        { "&ouml;",   "\u00F6" },
        { "&quot;",   "\""     },
        { "&szlig;",  "\u00DF" }
    };
    for (int i = 0; i < entities.GetLength(0); i++)
    {
        parser.replaceEveryExact(entities[i, 0], entities[i, 1]);
    }

    // Sequential replacements: strip leading zeros from numeric entities so
    // that e.g. "&#039;" becomes "&#39;", then decode the apostrophe.
    parser.replaceEveryExact("&#0", "&#");
    parser.replaceEveryExact("&#39;", "'");

    // Decode the ampersand last so an escaped entity such as "&amp;nbsp;"
    // yields the literal text "&nbsp;" rather than being decoded twice.
    parser.replaceEveryExact("&amp;", "&");

    // Insert a space before every closing tag so that text on either side of
    // a tag does not fuse once tags are stripped. Done via a temporary
    // marker; presumably a direct "</" -> " </" replacement would re-match
    // its own output (verify against replaceEveryExact).
    parser.replaceEveryExact("</", " <~/");
    parser.replaceEveryExact("<~/", "</");

    // Case-insensitive replacements: line breaks and paragraphs become
    // spaces. These run unconditionally; a case-sensitive pre-check against
    // the input would skip upper-case tags such as <BR>.
    string[] spacingTags = new string[] { "<br>", "<br />", "<br/>", "<p>", "<p />", "<p/>" };
    foreach (string tag in spacingTags)
    {
        parser.replaceEvery(tag, " ");
    }

    // Remove all tags
    string html = parser.Content;
    System.Text.StringBuilder clean = new System.Text.StringBuilder(html.Length);
    int nIndex = 0;
    int nStartTag;
    while ((nStartTag = html.IndexOf('<', nIndex)) != -1)
    {
        // Copy the text up to the start of the tag
        clean.Append(html, nIndex, nStartTag - nIndex);
        nIndex = nStartTag + 1;

        // Skip over the tag; an unterminated tag drops only the "<" itself
        // and keeps the text that follows it.
        int nEndTag = html.IndexOf('>', nIndex);
        if (nEndTag == -1)
        {
            break;
        }
        nIndex = nEndTag + 1;
    }

    // Gather remaining text
    if (nIndex < html.Length)
    {
        clean.Append(html, nIndex, html.Length - nIndex);
    }

    // Finally, reduce spaces
    parser.Content = clean.ToString();
    parser.replaceEveryExact("  ", " ");

    // Return the de-HTMLized string
    return parser.Content.Trim();
}