/// <summary>
        /// Removes all scripts from a string.
        /// </summary>
        /// <remarks>
        /// The text is scanned with an <c>HTMLParser</c>: everything extracted in front of
        /// each "&lt;script" marker is kept, and the parser is then advanced past the matching
        /// "&lt;/script&gt;" marker. If a script section is never closed, the text collected
        /// up to that point is returned and the unterminated remainder is dropped, so that
        /// script sections removed earlier are not handed back to the caller.
        /// </remarks>
        /// <param name="strString">The string.</param>
        /// <returns>Version of string without any scripts.</returns>
        public static string removeScripts
            (string strString)
        {
            // Get script-free version of content
            string     strStringSansScripts = "";
            string     strSegment           = "";
            HTMLParser parser = new HTMLParser(strString);

            while (parser.extractToNoCase("<script", ref strSegment))
            {
                // Keep the text that was found in front of the script section
                strStringSansScripts += strSegment;
                if (!parser.skipToEndOfNoCase("</script>"))
                {
                    // Unterminated script section: return what has been collected so far.
                    // Returning the original input here would leave every script in place.
                    return(strStringSansScripts);
                }
            }

            // No further script sections: keep whatever text is left
            parser.extractToEnd(ref strSegment);
            strStringSansScripts += strSegment;
            return(strStringSansScripts);
        }
示例#2
0
 // Changed - IMDB changed HTML code
 /// <summary>
 /// Looks up actors on the IMDB page at <paramref name="strURL"/> and appends the
 /// results to <c>_elements</c>.
 /// </summary>
 /// <remarks>
 /// If the page title is anything other than "imdb name search", the page is treated as
 /// a single hit and one entry is added for it. Otherwise the "Exact Matches" and
 /// "Popular Names" sections of the page are scanned for actor links.
 /// Any exception is logged through <c>Log.Error</c> and not rethrown.
 /// </remarks>
 /// <param name="strURL">Address of the IMDB page to download.</param>
 private void FindIMDBActor(string strURL)
 {
   try
   {
     string absoluteUri;
     // NOTE(review): an earlier comment here said UTF-8 has problems with special country
     // chars and that the default IMDB encoding is used, yet "utf-8" is what is requested.
     // Confirm which encoding is intended.
     string strBody = GetPage(strURL, "utf-8", out absoluteUri);
     string value = string.Empty;
     HTMLParser parser = new HTMLParser(strBody);
     // Ordinal, case-insensitive comparison: lower-casing with the current culture
     // breaks the match in locales where "I" does not lower-case to "i" (e.g. Turkish).
     if ((parser.skipToEndOf("<title>")) &&
         (parser.extractTo("</title>", ref value)) &&
         !string.Equals(value, "imdb name search", StringComparison.OrdinalIgnoreCase))
     {
       // Not the search result page: register this page itself as the only hit
       value = new HTMLUtil().ConvertHTMLToAnsi(value);
       value = Util.Utils.RemoveParenthesis(value).Trim();
       IMDBUrl oneUrl = new IMDBUrl(absoluteUri, value, "IMDB");
       _elements.Add(oneUrl);
       return;
     }
     parser.resetPosition();
     AddIMDBActorMatches(parser, "Exact Matches");

     // Maybe more actors with the similar name
     parser.resetPosition();
     AddIMDBActorMatches(parser, "Popular Names");
   }
   catch (Exception ex)
   {
     Log.Error("exception for imdb lookup of {0} err:{1} stack:{2}", strURL, ex.Message, ex.StackTrace);
   }
 }

 /// <summary>
 /// Adds one entry to <c>_elements</c> for each "/name/" link found after an occurrence
 /// of <paramref name="sectionMarker"/>, continuing until the marker is no longer found.
 /// </summary>
 /// <param name="parser">Parser holding the downloaded page; scanning starts at its current position.</param>
 /// <param name="sectionMarker">Section heading text to search for (matched without regard to case).</param>
 private void AddIMDBActorMatches(HTMLParser parser, string sectionMarker)
 {
   while (parser.skipToEndOfNoCase(sectionMarker))
   {
     string url = string.Empty;
     string name = string.Empty;
     //<a href="/name/nm0000246/" onclick="set_args('nm0000246', 1)">Bruce Willis</a>
     if (parser.skipToStartOf("href=\"/name/"))
     {
       // Link target first, then the link text of the anchor that follows "<br><a"
       parser.skipToEndOf("href=\"");
       parser.extractTo("\"", ref url);
       parser.skipToEndOf("<br><a");
       parser.skipToEndOf(">");
       parser.extractTo("</a>", ref name);
       name = new HTMLUtil().ConvertHTMLToAnsi(name);
       name = Util.Utils.RemoveParenthesis(name).Trim();
       IMDBUrl newUrl = new IMDBUrl("http://akas.imdb.com" + url, name, "IMDB");
       _elements.Add(newUrl);
     }
     else
     {
       parser.skipToEndOfNoCase("</a>");
     }
   }
 }
示例#3
0
    /////////////////
    // Static methods

    /// <summary>
    /// Retrieves the collection of HTML links in a string.
    /// </summary>
    /// <remarks>
    /// Comments and scripts are removed and single quotes are replaced by double quotes
    /// before scanning. Every non-empty <c>href="..."</c> value that does not contain
    /// "mailto:" is added to <paramref name="documents"/>; every non-empty
    /// <c>src="..."</c> value is added to <paramref name="images"/>. Values already in
    /// the target list are not added again. Links that do not start with an http, https
    /// or ftp scheme are qualified against the root url on a best-guess basis.
    /// </remarks>
    /// <param name="strString">The string.</param>
    /// <param name="strRootUrl">Root url (may be null).</param>
    /// <param name="documents">Collection of document link strings.</param>
    /// <param name="images">Collection of image link strings.</param>
    public static void getLinks
      (string strString,
       string strRootUrl,
       ref ArrayList documents,
       ref ArrayList images)
    {
      // Remove comments and JavaScript and fix links
      strString = HTMLParser.removeComments(strString);
      strString = HTMLParser.removeScripts(strString);
      HTMLParser parser = new HTMLParser(strString);
      parser.replaceEvery("\'", "\"");

      // Set root url (trimmed, with a trailing slash when not empty)
      string rootUrl = "";
      if (strRootUrl != null)
        rootUrl = strRootUrl.Trim();
      if ((rootUrl.Length > 0) && !rootUrl.EndsWith("/", StringComparison.Ordinal))
        rootUrl += "/";

      // Extract HREF targets
      string strUrl = "";
      parser.resetPosition();
      while (parser.skipToEndOfNoCase("href=\""))
      {
        if (!parser.extractTo("\"", ref strUrl))
          continue;

        strUrl = strUrl.Trim();
        if (strUrl.Length == 0)
          continue;

        // Mail links are not documents; scheme names are case-insensitive
        if (strUrl.IndexOf("mailto:", StringComparison.OrdinalIgnoreCase) != -1)
          continue;

        strUrl = qualifyLinkUrl(strUrl, rootUrl);

        // Add url to document list if not already present
        if (!documents.Contains(strUrl))
          documents.Add(strUrl);
      }

      // Extract SRC targets
      parser.resetPosition();
      while (parser.skipToEndOfNoCase("src=\""))
      {
        if (!parser.extractTo("\"", ref strUrl))
          continue;

        strUrl = strUrl.Trim();
        if (strUrl.Length == 0)
          continue;

        strUrl = qualifyLinkUrl(strUrl, rootUrl);

        // Add url to images list if not already present
        if (!images.Contains(strUrl))
          images.Add(strUrl);
      }
    }

    /// <summary>
    /// Returns a fully qualified version of a link (best guess).
    /// </summary>
    /// <param name="strUrl">Trimmed, non-empty link text.</param>
    /// <param name="rootUrl">Root url to qualify against (may be empty).</param>
    /// <returns>
    /// The link unchanged when it already starts with an http, https or ftp scheme;
    /// otherwise the root url with its path replaced by the link, or the link prefixed
    /// with "http://" when that cannot be built.
    /// </returns>
    private static string qualifyLinkUrl(string strUrl, string rootUrl)
    {
      // Already absolute. "https://" and upper-case schemes must be accepted here as
      // well, otherwise such links would be pushed into the path of the root url.
      if (strUrl.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
          strUrl.StartsWith("https://", StringComparison.OrdinalIgnoreCase) ||
          strUrl.StartsWith("ftp://", StringComparison.OrdinalIgnoreCase))
        return strUrl;

      try
      {
        UriBuilder uriBuilder = new UriBuilder(rootUrl);
        uriBuilder.Path = strUrl;
        return uriBuilder.Uri.ToString();
      }
      catch (Exception)
      {
        // Deliberate best effort: no usable root url, so assume a host-relative http link
        return "http://" + strUrl;
      }
    }
示例#4
0
    /// <summary>
    /// Removes all scripts from a string.
    /// </summary>
    /// <remarks>
    /// The text is scanned with an <c>HTMLParser</c>: everything extracted in front of
    /// each "&lt;script" marker is kept, and the parser is then advanced past the matching
    /// "&lt;/script&gt;" marker. If a script section is never closed, the text collected
    /// up to that point is returned and the unterminated remainder is dropped, so that
    /// script sections removed earlier are not handed back to the caller.
    /// </remarks>
    /// <param name="strString">The string.</param>
    /// <returns>Version of string without any scripts.</returns>
    public static string removeScripts
      (string strString)
    {
      // Get script-free version of content
      string strStringSansScripts = "";
      string strSegment = "";
      HTMLParser parser = new HTMLParser(strString);

      while (parser.extractToNoCase("<script", ref strSegment))
      {
        // Keep the text that was found in front of the script section
        strStringSansScripts += strSegment;
        if (!parser.skipToEndOfNoCase("</script>"))
        {
          // Unterminated script section: return what has been collected so far.
          // Returning the original input here would leave every script in place.
          return strStringSansScripts;
        }
      }

      // No further script sections: keep whatever text is left
      parser.extractToEnd(ref strSegment);
      strStringSansScripts += strSegment;
      return (strStringSansScripts);
    }
        /////////////////
        // Static methods

        /// <summary>
        /// Retrieves the collection of HTML links in a string.
        /// </summary>
        /// <remarks>
        /// Comments and scripts are removed and single quotes are replaced by double quotes
        /// before scanning. Every non-empty <c>href="..."</c> value that does not contain
        /// "mailto:" is added to <paramref name="documents"/>; every non-empty
        /// <c>src="..."</c> value is added to <paramref name="images"/>. Values already in
        /// the target list are not added again. Links that do not start with an http, https
        /// or ftp scheme are qualified against the root url on a best-guess basis.
        /// </remarks>
        /// <param name="strString">The string.</param>
        /// <param name="strRootUrl">Root url (may be null).</param>
        /// <param name="documents">Collection of document link strings.</param>
        /// <param name="images">Collection of image link strings.</param>
        public static void getLinks
            (string strString,
            string strRootUrl,
            ref ArrayList documents,
            ref ArrayList images)
        {
            // Remove comments and JavaScript and fix links
            strString = HTMLParser.removeComments(strString);
            strString = HTMLParser.removeScripts(strString);
            HTMLParser parser = new HTMLParser(strString);

            parser.replaceEvery("\'", "\"");

            // Set root url (trimmed, with a trailing slash when not empty)
            string rootUrl = "";

            if (strRootUrl != null)
            {
                rootUrl = strRootUrl.Trim();
            }
            if ((rootUrl.Length > 0) && !rootUrl.EndsWith("/", StringComparison.Ordinal))
            {
                rootUrl += "/";
            }

            // Extract HREF targets
            string strUrl = "";

            parser.resetPosition();
            while (parser.skipToEndOfNoCase("href=\""))
            {
                if (!parser.extractTo("\"", ref strUrl))
                {
                    continue;
                }

                strUrl = strUrl.Trim();
                if (strUrl.Length == 0)
                {
                    continue;
                }

                // Mail links are not documents; scheme names are case-insensitive
                if (strUrl.IndexOf("mailto:", StringComparison.OrdinalIgnoreCase) != -1)
                {
                    continue;
                }

                strUrl = qualifyLinkUrl(strUrl, rootUrl);

                // Add url to document list if not already present
                if (!documents.Contains(strUrl))
                {
                    documents.Add(strUrl);
                }
            }

            // Extract SRC targets
            parser.resetPosition();
            while (parser.skipToEndOfNoCase("src=\""))
            {
                if (!parser.extractTo("\"", ref strUrl))
                {
                    continue;
                }

                strUrl = strUrl.Trim();
                if (strUrl.Length == 0)
                {
                    continue;
                }

                strUrl = qualifyLinkUrl(strUrl, rootUrl);

                // Add url to images list if not already present
                if (!images.Contains(strUrl))
                {
                    images.Add(strUrl);
                }
            }
        }

        /// <summary>
        /// Returns a fully qualified version of a link (best guess).
        /// </summary>
        /// <param name="strUrl">Trimmed, non-empty link text.</param>
        /// <param name="rootUrl">Root url to qualify against (may be empty).</param>
        /// <returns>
        /// The link unchanged when it already starts with an http, https or ftp scheme;
        /// otherwise the root url with its path replaced by the link, or the link prefixed
        /// with "http://" when that cannot be built.
        /// </returns>
        private static string qualifyLinkUrl(string strUrl, string rootUrl)
        {
            // Already absolute. "https://" and upper-case schemes must be accepted here as
            // well, otherwise such links would be pushed into the path of the root url.
            if (strUrl.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                strUrl.StartsWith("https://", StringComparison.OrdinalIgnoreCase) ||
                strUrl.StartsWith("ftp://", StringComparison.OrdinalIgnoreCase))
            {
                return strUrl;
            }

            try
            {
                UriBuilder uriBuilder = new UriBuilder(rootUrl);
                uriBuilder.Path = strUrl;
                return uriBuilder.Uri.ToString();
            }
            catch (Exception)
            {
                // Deliberate best effort: no usable root url, so assume a host-relative http link
                return "http://" + strUrl;
            }
        }