/// <summary>
/// Removes all scripts from a string.
/// </summary>
/// <remarks>
/// Text is copied up to each "&lt;script" marker, then the parser is advanced past the
/// following "&lt;/script&gt;" marker (both located via the parser's NoCase helpers).
/// If a script block is never closed, everything from its opening marker to the end of
/// the string is discarded, so the result never contains script content.
/// </remarks>
/// <param name="strString">The string.</param>
/// <returns>Version of string without any scripts.</returns>
public static string removeScripts(string strString)
{
    // Accumulate the script-free pieces in a builder: concatenating strings in the
    // loop is quadratic for pages with many script blocks.
    System.Text.StringBuilder scriptFree = new System.Text.StringBuilder();
    string strSegment = "";
    HTMLParser parser = new HTMLParser(strString);

    while (parser.extractToNoCase("<script", ref strSegment))
    {
        // strSegment holds the text preceding the script opening marker.
        scriptFree.Append(strSegment);

        if (!parser.skipToEndOfNoCase("</script>"))
        {
            // Unterminated script: the remainder of the input belongs to the script,
            // so return only what was collected before it. (Previously the original,
            // script-laden input was returned here.)
            return scriptFree.ToString();
        }
    }

    // No further scripts: keep whatever text is left.
    parser.extractToEnd(ref strSegment);
    scriptFree.Append(strSegment);
    return scriptFree.ToString();
}
// Changed - IMDB changed HTML code
/// <summary>
/// Downloads the page at <paramref name="strURL"/> and adds the actor(s) found on it
/// to <c>_elements</c>.
/// </summary>
/// <remarks>
/// If the page title is anything other than "imdb name search", the page is treated as
/// a single actor page and one entry is added using the URI reported by <c>GetPage</c>.
/// Otherwise the page is scanned for the "Exact Matches" and then "Popular Names"
/// sections and one entry is added per section occurrence. Any exception is logged and
/// swallowed.
/// </remarks>
/// <param name="strURL">URL of the IMDB name lookup to fetch.</param>
private void FindIMDBActor(string strURL)
{
    try
    {
        string absoluteUri;
        // NOTE(review): an earlier remark here claimed UTF-8 has problems with special
        // country characters and that IMDB's default encoding is used, yet "utf-8" is
        // what is actually requested - confirm which encoding is intended.
        string strBody = GetPage(strURL, "utf-8", out absoluteUri);
        string value = string.Empty;
        HTMLParser parser = new HTMLParser(strBody);

        bool titleFound = parser.skipToEndOf("<title>") && parser.extractTo("</title>", ref value);

        // Ordinal, case-insensitive comparison: a culture-sensitive ToLower() turns
        // "I" into a dotless i under e.g. tr-TR and would never match the literal.
        if (titleFound && !string.Equals(value, "imdb name search", StringComparison.OrdinalIgnoreCase))
        {
            // Not the search result page, so the title itself is the actor name.
            value = new HTMLUtil().ConvertHTMLToAnsi(value);
            value = Util.Utils.RemoveParenthesis(value).Trim();
            IMDBUrl oneUrl = new IMDBUrl(absoluteUri, value, "IMDB");
            _elements.Add(oneUrl);
            return;
        }

        AddIMDBActorsFromSection(parser, "Exact Matches");

        // Maybe more actors with the similar name
        AddIMDBActorsFromSection(parser, "Popular Names");
    }
    catch (Exception ex)
    {
        Log.Error("exception for imdb lookup of {0} err:{1} stack:{2}", strURL, ex.Message, ex.StackTrace);
    }
}

/// <summary>
/// Rewinds <paramref name="parser"/> and, for every occurrence of
/// <paramref name="sectionMarker"/>, adds the next "/name/" link that follows it to
/// <c>_elements</c>.
/// </summary>
/// <param name="parser">Parser holding the search result page.</param>
/// <param name="sectionMarker">Section heading text to look for.</param>
private void AddIMDBActorsFromSection(HTMLParser parser, string sectionMarker)
{
    parser.resetPosition();
    while (parser.skipToEndOfNoCase(sectionMarker))
    {
        string url = string.Empty;
        string name = string.Empty;

        //<a href="/name/nm0000246/" onclick="set_args('nm0000246', 1)">Bruce Willis</a>
        if (parser.skipToStartOf("href=\"/name/"))
        {
            // Relative link target, e.g. /name/nm0000246/
            parser.skipToEndOf("href=\"");
            parser.extractTo("\"", ref url);

            // The displayed name is the text of the anchor following the "<br><a" marker.
            parser.skipToEndOf("<br><a");
            parser.skipToEndOf(">");
            parser.extractTo("</a>", ref name);

            name = new HTMLUtil().ConvertHTMLToAnsi(name);
            name = Util.Utils.RemoveParenthesis(name).Trim();
            IMDBUrl newUrl = new IMDBUrl("http://akas.imdb.com" + url, name, "IMDB");
            _elements.Add(newUrl);
        }
        else
        {
            parser.skipToEndOfNoCase("</a>");
        }
    }
}
/////////////////
// Static methods

/// <summary>
/// Retrieves the collection of HTML links in a string.
/// </summary>
/// <remarks>
/// Comments and scripts are stripped first and single quotes are replaced by double
/// quotes, so only <c>href="..."</c> and <c>src="..."</c> attributes are examined.
/// Targets that do not start with http://, https:// or ftp:// are combined with the
/// root url on a best-guess basis. Targets containing "mailto:" are not added to
/// <paramref name="documents"/>.
/// </remarks>
/// <param name="strString">The string.</param>
/// <param name="strRootUrl">Root url (may be null).</param>
/// <param name="documents">Collection of document link strings.</param>
/// <param name="images">Collection of image link strings.</param>
public static void getLinks(string strString, string strRootUrl,
                            ref ArrayList documents, ref ArrayList images)
{
    // Remove comments and JavaScript and fix links
    strString = HTMLParser.removeComments(strString);
    strString = HTMLParser.removeScripts(strString);
    HTMLParser parser = new HTMLParser(strString);
    parser.replaceEvery("\'", "\"");

    // Set root url (always ends with a slash when non-empty)
    string rootUrl = "";
    if (strRootUrl != null)
    {
        rootUrl = strRootUrl.Trim();
    }
    if (rootUrl.Length > 0 && !rootUrl.EndsWith("/", StringComparison.Ordinal))
    {
        rootUrl += "/";
    }

    // Extract HREF targets
    string strUrl = "";
    parser.resetPosition();
    while (parser.skipToEndOfNoCase("href=\""))
    {
        if (!parser.extractTo("\"", ref strUrl))
        {
            continue;
        }

        strUrl = strUrl.Trim();
        if (strUrl.Length == 0)
        {
            continue;
        }

        // E-mail links are not documents.
        if (strUrl.IndexOf("mailto:", StringComparison.OrdinalIgnoreCase) != -1)
        {
            continue;
        }

        strUrl = qualifyLinkUrl(rootUrl, strUrl);

        // Add url to document list if not already present
        if (!documents.Contains(strUrl))
        {
            documents.Add(strUrl);
        }
    }

    // Extract SRC targets
    parser.resetPosition();
    while (parser.skipToEndOfNoCase("src=\""))
    {
        if (!parser.extractTo("\"", ref strUrl))
        {
            continue;
        }

        strUrl = strUrl.Trim();
        if (strUrl.Length == 0)
        {
            continue;
        }

        strUrl = qualifyLinkUrl(rootUrl, strUrl);

        // Add url to images list if not already present
        if (!images.Contains(strUrl))
        {
            images.Add(strUrl);
        }
    }
}

/// <summary>
/// Returns a fully qualified url (best guess) for a link target.
/// </summary>
/// <param name="rootUrl">Root url used for targets without a recognised scheme.</param>
/// <param name="url">Trimmed, non-empty link target.</param>
/// <returns>
/// <paramref name="url"/> unchanged when it already starts with http://, https:// or
/// ftp://; otherwise the root url with its path replaced by <paramref name="url"/>, or
/// "http://" + <paramref name="url"/> when the root url cannot be used.
/// </returns>
private static string qualifyLinkUrl(string rootUrl, string url)
{
    // URI schemes are case-insensitive, and https must be recognised too; otherwise
    // secure links get pushed through UriBuilder.Path and come out mangled.
    if (url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
        url.StartsWith("https://", StringComparison.OrdinalIgnoreCase) ||
        url.StartsWith("ftp://", StringComparison.OrdinalIgnoreCase))
    {
        return url;
    }

    try
    {
        UriBuilder uriBuilder = new UriBuilder(rootUrl);
        uriBuilder.Path = url;
        return uriBuilder.Uri.ToString();
    }
    catch (Exception)
    {
        // Root url missing or unusable: fall back to treating the target as a host.
        return "http://" + url;
    }
}
/// <summary>
/// Removes all scripts from a string.
/// </summary>
/// <remarks>
/// Text is copied up to each "&lt;script" marker, then the parser is advanced past the
/// following "&lt;/script&gt;" marker (both located via the parser's NoCase helpers).
/// If a script block is never closed, everything from its opening marker to the end of
/// the string is discarded, so the result never contains script content.
/// </remarks>
/// <param name="strString">The string.</param>
/// <returns>Version of string without any scripts.</returns>
public static string removeScripts(string strString)
{
    // A builder avoids the quadratic cost of repeated string concatenation when the
    // input contains many script blocks.
    System.Text.StringBuilder cleaned = new System.Text.StringBuilder();
    string strSegment = "";
    HTMLParser parser = new HTMLParser(strString);

    while (parser.extractToNoCase("<script", ref strSegment))
    {
        // Keep the text that precedes the script opening marker.
        cleaned.Append(strSegment);

        if (!parser.skipToEndOfNoCase("</script>"))
        {
            // The script is never closed, so the rest of the input is script content.
            // Return the text gathered so far instead of the unmodified input, which
            // would still contain every script.
            return cleaned.ToString();
        }
    }

    // Append the trailing text after the last script (or the whole input if none).
    parser.extractToEnd(ref strSegment);
    cleaned.Append(strSegment);
    return cleaned.ToString();
}
/////////////////
// Static methods

/// <summary>
/// Retrieves the collection of HTML links in a string.
/// </summary>
/// <remarks>
/// Comments and scripts are stripped first and single quotes are replaced by double
/// quotes, so only <c>href="..."</c> and <c>src="..."</c> attributes are examined.
/// Targets that do not start with http://, https:// or ftp:// are combined with the
/// root url on a best-guess basis. Targets containing "mailto:" are not added to
/// <paramref name="documents"/>.
/// </remarks>
/// <param name="strString">The string.</param>
/// <param name="strRootUrl">Root url (may be null).</param>
/// <param name="documents">Collection of document link strings.</param>
/// <param name="images">Collection of image link strings.</param>
public static void getLinks(string strString, string strRootUrl,
                            ref ArrayList documents, ref ArrayList images)
{
    // Remove comments and JavaScript and fix links
    strString = HTMLParser.removeComments(strString);
    strString = HTMLParser.removeScripts(strString);
    HTMLParser parser = new HTMLParser(strString);
    parser.replaceEvery("\'", "\"");

    // Set root url (always ends with a slash when non-empty)
    string rootUrl = "";
    if (strRootUrl != null)
    {
        rootUrl = strRootUrl.Trim();
    }
    if (rootUrl.Length > 0 && !rootUrl.EndsWith("/", StringComparison.Ordinal))
    {
        rootUrl += "/";
    }

    string strUrl = "";

    // Extract HREF targets
    parser.resetPosition();
    while (parser.skipToEndOfNoCase("href=\""))
    {
        if (parser.extractTo("\"", ref strUrl))
        {
            strUrl = strUrl.Trim();

            // Skip empty targets and e-mail links; everything else is a document.
            bool isMailLink = strUrl.IndexOf("mailto:", StringComparison.OrdinalIgnoreCase) != -1;
            if (strUrl.Length > 0 && !isMailLink)
            {
                strUrl = toAbsoluteLinkUrl(rootUrl, strUrl);

                // Add url to document list if not already present
                if (!documents.Contains(strUrl))
                {
                    documents.Add(strUrl);
                }
            }
        }
    }

    // Extract SRC targets
    parser.resetPosition();
    while (parser.skipToEndOfNoCase("src=\""))
    {
        if (parser.extractTo("\"", ref strUrl))
        {
            strUrl = strUrl.Trim();
            if (strUrl.Length > 0)
            {
                strUrl = toAbsoluteLinkUrl(rootUrl, strUrl);

                // Add url to images list if not already present
                if (!images.Contains(strUrl))
                {
                    images.Add(strUrl);
                }
            }
        }
    }
}

/// <summary>
/// Returns a fully qualified url (best guess) for a link target.
/// </summary>
/// <param name="rootUrl">Root url used for targets without a recognised scheme.</param>
/// <param name="url">Trimmed, non-empty link target.</param>
/// <returns>
/// <paramref name="url"/> unchanged when it already starts with http://, https:// or
/// ftp://; otherwise the root url with its path replaced by <paramref name="url"/>, or
/// "http://" + <paramref name="url"/> when the root url cannot be used.
/// </returns>
private static string toAbsoluteLinkUrl(string rootUrl, string url)
{
    // Schemes are compared ordinally and case-insensitively, and https is accepted:
    // previously secure links were treated as relative paths and came out mangled.
    bool alreadyAbsolute =
        url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
        url.StartsWith("https://", StringComparison.OrdinalIgnoreCase) ||
        url.StartsWith("ftp://", StringComparison.OrdinalIgnoreCase);
    if (alreadyAbsolute)
    {
        return url;
    }

    try
    {
        UriBuilder uriBuilder = new UriBuilder(rootUrl);
        uriBuilder.Path = url;
        return uriBuilder.Uri.ToString();
    }
    catch (Exception)
    {
        // Root url missing or unusable: fall back to treating the target as a host.
        return "http://" + url;
    }
}