Exemplo n.º 1
0
    /////////////////
    // Static methods

    /// <summary>
    /// Retrieves the collection of HTML links in a string.
    /// </summary>
    /// <remarks>
    /// Comments and scripts are stripped and every single quote is turned into a
    /// double quote before scanning, so only double-quoted HREF and SRC attribute
    /// values are collected. Links are appended to the supplied collections;
    /// a link that is already present is not added a second time. HREF targets
    /// containing "mailto:" (in any letter case) are ignored.
    /// </remarks>
    /// <param name="strString">The string.</param>
    /// <param name="strRootUrl">Root url (may be null).</param>
    /// <param name="documents">Collection of document link strings (created if null).</param>
    /// <param name="images">Collection of image link strings (created if null).</param>
    public static void getLinks
      (string strString,
       string strRootUrl,
       ref ArrayList documents,
       ref ArrayList images)
    {
      // Both collections are passed by reference, so a missing one can simply
      // be created instead of failing on the first Contains() call
      if (documents == null)
        documents = new ArrayList();
      if (images == null)
        images = new ArrayList();

      // Remove comments and JavaScript and fix links
      strString = HTMLParser.removeComments(strString);
      strString = HTMLParser.removeScripts(strString);
      HTMLParser parser = new HTMLParser(strString);
      parser.replaceEvery("\'", "\"");

      // Set root url (trimmed, and terminated with a slash when non-empty)
      string rootUrl = "";
      if (strRootUrl != null)
        rootUrl = strRootUrl.Trim();
      if ((rootUrl.Length > 0) && !rootUrl.EndsWith("/", StringComparison.Ordinal))
        rootUrl += "/";

      // Extract HREF targets
      string strUrl = "";
      parser.resetPosition();
      while (parser.skipToEndOfNoCase("href=\""))
      {
        if (!parser.extractTo("\"", ref strUrl))
          continue;
        strUrl = strUrl.Trim();
        if (strUrl.Length == 0)
          continue;

        // Mail links are not documents; match the scheme in any letter case
        if (strUrl.IndexOf("mailto:", StringComparison.OrdinalIgnoreCase) != -1)
          continue;

        // Add url to document list if not already present
        strUrl = qualifyUrl(strUrl, rootUrl);
        if (!documents.Contains(strUrl))
          documents.Add(strUrl);
      }

      // Extract SRC targets
      parser.resetPosition();
      while (parser.skipToEndOfNoCase("src=\""))
      {
        if (!parser.extractTo("\"", ref strUrl))
          continue;
        strUrl = strUrl.Trim();
        if (strUrl.Length == 0)
          continue;

        // Add url to images list if not already present
        strUrl = qualifyUrl(strUrl, rootUrl);
        if (!images.Contains(strUrl))
          images.Add(strUrl);
      }
    }

    /// <summary>
    /// Turns a link into a fully qualified url (best guess).
    /// </summary>
    /// <param name="strUrl">The trimmed, non-empty link.</param>
    /// <param name="rootUrl">Root url, or an empty string when there is none.</param>
    /// <returns>
    /// The link itself when it already starts with http://, https:// or ftp://
    /// (in any letter case); otherwise the root url with the link as its path,
    /// or "http://" followed by the link when no valid url can be built.
    /// </returns>
    private static string qualifyUrl
      (string strUrl,
       string rootUrl)
    {
      // Already absolute: hand it back untouched. Ordinal comparison keeps the
      // result independent of the current culture.
      if (strUrl.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
          strUrl.StartsWith("https://", StringComparison.OrdinalIgnoreCase) ||
          strUrl.StartsWith("ftp://", StringComparison.OrdinalIgnoreCase))
        return strUrl;

      try
      {
        // NOTE(review): the whole link becomes the path of the root url, so the
        // root's own path is replaced and a query string or fragment in the link
        // may end up escaped - confirm whether callers rely on this
        UriBuilder uriBuilder = new UriBuilder(rootUrl);
        uriBuilder.Path = strUrl;
        return uriBuilder.Uri.ToString();
      }
      catch (UriFormatException)
      {
        // Root url missing or not parseable: assume the link is a bare host/path
        return "http://" + strUrl;
      }
    }
Exemplo n.º 2
0
        /////////////////
        // Static methods

        /// <summary>
        /// Retrieves the collection of HTML links in a string.
        /// </summary>
        /// <remarks>
        /// Comments and scripts are stripped and every single quote is turned into a
        /// double quote before scanning, so only double-quoted HREF and SRC attribute
        /// values are collected. Links are appended to the supplied collections;
        /// a link that is already present is not added a second time. HREF targets
        /// containing "mailto:" (in any letter case) are ignored.
        /// </remarks>
        /// <param name="strString">The string.</param>
        /// <param name="strRootUrl">Root url (may be null).</param>
        /// <param name="documents">Collection of document link strings (created if null).</param>
        /// <param name="images">Collection of image link strings (created if null).</param>
        public static void getLinks
            (string strString,
            string strRootUrl,
            ref ArrayList documents,
            ref ArrayList images)
        {
            // Both collections are passed by reference, so a missing one can simply
            // be created instead of failing on the first Contains() call
            if (documents == null)
            {
                documents = new ArrayList();
            }
            if (images == null)
            {
                images = new ArrayList();
            }

            // Remove comments and JavaScript and fix links
            strString = HTMLParser.removeComments(strString);
            strString = HTMLParser.removeScripts(strString);
            HTMLParser parser = new HTMLParser(strString);

            parser.replaceEvery("\'", "\"");

            // Set root url (trimmed, and terminated with a slash when non-empty)
            string rootUrl = "";

            if (strRootUrl != null)
            {
                rootUrl = strRootUrl.Trim();
            }
            if ((rootUrl.Length > 0) && !rootUrl.EndsWith("/", StringComparison.Ordinal))
            {
                rootUrl += "/";
            }

            // Extract HREF targets
            string strUrl = "";

            parser.resetPosition();
            while (parser.skipToEndOfNoCase("href=\""))
            {
                if (!parser.extractTo("\"", ref strUrl))
                {
                    continue;
                }
                strUrl = strUrl.Trim();
                if (strUrl.Length == 0)
                {
                    continue;
                }

                // Mail links are not documents; match the scheme in any letter case
                if (strUrl.IndexOf("mailto:", StringComparison.OrdinalIgnoreCase) != -1)
                {
                    continue;
                }

                // Add url to document list if not already present
                strUrl = qualifyUrl(strUrl, rootUrl);
                if (!documents.Contains(strUrl))
                {
                    documents.Add(strUrl);
                }
            }

            // Extract SRC targets
            parser.resetPosition();
            while (parser.skipToEndOfNoCase("src=\""))
            {
                if (!parser.extractTo("\"", ref strUrl))
                {
                    continue;
                }
                strUrl = strUrl.Trim();
                if (strUrl.Length == 0)
                {
                    continue;
                }

                // Add url to images list if not already present
                strUrl = qualifyUrl(strUrl, rootUrl);
                if (!images.Contains(strUrl))
                {
                    images.Add(strUrl);
                }
            }
        }

        /// <summary>
        /// Turns a link into a fully qualified url (best guess).
        /// </summary>
        /// <param name="strUrl">The trimmed, non-empty link.</param>
        /// <param name="rootUrl">Root url, or an empty string when there is none.</param>
        /// <returns>
        /// The link itself when it already starts with http://, https:// or ftp://
        /// (in any letter case); otherwise the root url with the link as its path,
        /// or "http://" followed by the link when no valid url can be built.
        /// </returns>
        private static string qualifyUrl
            (string strUrl,
            string rootUrl)
        {
            // Already absolute: hand it back untouched. Ordinal comparison keeps the
            // result independent of the current culture.
            if (strUrl.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                strUrl.StartsWith("https://", StringComparison.OrdinalIgnoreCase) ||
                strUrl.StartsWith("ftp://", StringComparison.OrdinalIgnoreCase))
            {
                return(strUrl);
            }

            try
            {
                // NOTE(review): the whole link becomes the path of the root url, so
                // the root's own path is replaced and a query string or fragment in
                // the link may end up escaped - confirm whether callers rely on this
                UriBuilder uriBuilder = new UriBuilder(rootUrl);
                uriBuilder.Path = strUrl;
                return(uriBuilder.Uri.ToString());
            }
            catch (UriFormatException)
            {
                // Root url missing or not parseable: assume the link is a bare host/path
                return("http://" + strUrl);
            }
        }
Exemplo n.º 3
0
    /// <summary>
    /// Returns a version of a string without any HTML tags.
    /// </summary>
    /// <remarks>
    /// A small set of character entities is decoded, line-break and paragraph
    /// tags (in any letter case) are turned into spaces, a space is inserted
    /// before every closing tag, all remaining tags are dropped, runs of spaces
    /// are reduced and the result is trimmed.
    /// </remarks>
    /// <param name="strString">The string (null is treated as empty).</param>
    /// <returns>Version of string without HTML tags.</returns>
    public static string removeHtml
      (string strString)
    {
      // Nothing to clean
      if ((strString == null) || (strString.Length == 0))
        return "";

      // Case-sensitive entity replacements as { entity, text } pairs. A fixed
      // array (rather than a hash table) keeps the order deterministic, and the
      // accented letters are written as escapes so that each entity becomes the
      // character it denotes instead of being deleted.
      string[,] entities =
      {
        { "&nbsp;", " " },
        { "&aring;", "\u00E5" },   // a with ring above
        { "&auml;", "\u00E4" },    // a with diaeresis
        { "&eacute;", "\u00E9" },  // e with acute
        { "&iacute;", "\u00ED" },  // i with acute
        { "&igrave;", "\u00EC" },  // i with grave
        { "&ograve;", "\u00F2" },  // o with grave
        { "&ouml;", "\u00F6" },    // o with diaeresis
        { "&quot;", "\"" },
        { "&szlig;", "\u00DF" }    // sharp s
      };
      HTMLParser parser = new HTMLParser(strString);
      for (int i = 0; i < entities.GetLength(0); i++)
        parser.replaceEveryExact(entities[i, 0], entities[i, 1]);

      // Do some sequential replacements: drop leading zeros from numeric
      // entities, then decode the apostrophe
      parser.replaceEveryExact("&#0", "&#");
      parser.replaceEveryExact("&#39;", "'");

      // Decode ampersands last, so that an escaped entity such as "&amp;quot;"
      // ends up as the literal text "&quot;" rather than being decoded twice
      parser.replaceEveryExact("&amp;", "&");

      // Put a space in front of every closing tag. The temporary "<~/" marker
      // presumably keeps the replacement from matching its own output - confirm
      // against replaceEveryExact.
      parser.replaceEveryExact("</", " <~/");
      parser.replaceEveryExact("<~/", "</");

      // Case-insensitive replacements. No case-sensitive pre-check here: it
      // would stop upper-case tags such as <BR> from ever being replaced.
      string[] breakTags = { "<br>", "<br />", "<br/>", "<p>", "<p />", "<p/>" };
      foreach (string tag in breakTags)
        parser.replaceEvery(tag, " ");
      strString = parser.Content;

      // Remove all tags
      System.Text.StringBuilder clean = new System.Text.StringBuilder(strString.Length);
      int nIndex = 0;
      int nStartTag;
      while ((nStartTag = strString.IndexOf('<', nIndex)) != -1)
      {
        // Extract to start of tag
        clean.Append(strString, nIndex, nStartTag - nIndex);
        nIndex = nStartTag + 1;

        // Skip over tag; an unterminated tag loses only its '<' and the text
        // after it is kept by the step below
        int nEndTag = strString.IndexOf('>', nIndex);
        if (nEndTag == -1)
          break;
        nIndex = nEndTag + 1;
      }

      // Gather remaining text
      if (nIndex < strString.Length)
        clean.Append(strString, nIndex, strString.Length - nIndex);

      // Finally, reduce spaces
      parser.Content = clean.ToString();
      parser.replaceEveryExact("  ", " ");

      // Return the de-HTMLized string
      return parser.Content.Trim();
    }
Exemplo n.º 4
0
        /// <summary>
        /// Returns a version of a string without any HTML tags.
        /// </summary>
        /// <remarks>
        /// A small set of character entities is decoded, line-break and paragraph
        /// tags (in any letter case) are turned into spaces, a space is inserted
        /// before every closing tag, all remaining tags are dropped, runs of spaces
        /// are reduced and the result is trimmed.
        /// </remarks>
        /// <param name="strString">The string (null is treated as empty).</param>
        /// <returns>Version of string without HTML tags.</returns>
        public static string removeHtml
            (string strString)
        {
            // Nothing to clean
            if ((strString == null) || (strString.Length == 0))
            {
                return("");
            }

            // Case-sensitive entity replacements as { entity, text } pairs. A fixed
            // array (rather than a hash table) keeps the order deterministic, and
            // the accented letters are written as escapes so that each entity
            // becomes the character it denotes instead of being deleted.
            string[,] entities =
            {
                { "&nbsp;",   " "      },
                { "&aring;",  "\u00E5" },  // a with ring above
                { "&auml;",   "\u00E4" },  // a with diaeresis
                { "&eacute;", "\u00E9" },  // e with acute
                { "&iacute;", "\u00ED" },  // i with acute
                { "&igrave;", "\u00EC" },  // i with grave
                { "&ograve;", "\u00F2" },  // o with grave
                { "&ouml;",   "\u00F6" },  // o with diaeresis
                { "&quot;",   "\""     },
                { "&szlig;",  "\u00DF" }   // sharp s
            };
            HTMLParser parser = new HTMLParser(strString);

            for (int i = 0; i < entities.GetLength(0); i++)
            {
                parser.replaceEveryExact(entities[i, 0], entities[i, 1]);
            }

            // Do some sequential replacements: drop leading zeros from numeric
            // entities, then decode the apostrophe
            parser.replaceEveryExact("&#0", "&#");
            parser.replaceEveryExact("&#39;", "'");

            // Decode ampersands last, so that an escaped entity such as
            // "&amp;quot;" ends up as the literal text "&quot;" rather than being
            // decoded twice
            parser.replaceEveryExact("&amp;", "&");

            // Put a space in front of every closing tag. The temporary "<~/"
            // marker presumably keeps the replacement from matching its own
            // output - confirm against replaceEveryExact.
            parser.replaceEveryExact("</", " <~/");
            parser.replaceEveryExact("<~/", "</");

            // Case-insensitive replacements. No case-sensitive pre-check here: it
            // would stop upper-case tags such as <BR> from ever being replaced.
            string[] breakTags = { "<br>", "<br />", "<br/>", "<p>", "<p />", "<p/>" };

            foreach (string tag in breakTags)
            {
                parser.replaceEvery(tag, " ");
            }
            strString = parser.Content;

            // Remove all tags
            System.Text.StringBuilder clean = new System.Text.StringBuilder(strString.Length);
            int nIndex = 0;
            int nStartTag;

            while ((nStartTag = strString.IndexOf('<', nIndex)) != -1)
            {
                // Extract to start of tag
                clean.Append(strString, nIndex, nStartTag - nIndex);
                nIndex = nStartTag + 1;

                // Skip over tag; an unterminated tag loses only its '<' and the
                // text after it is kept by the step below
                int nEndTag = strString.IndexOf('>', nIndex);
                if (nEndTag == -1)
                {
                    break;
                }
                nIndex = nEndTag + 1;
            }

            // Gather remaining text
            if (nIndex < strString.Length)
            {
                clean.Append(strString, nIndex, strString.Length - nIndex);
            }

            // Finally, reduce spaces
            parser.Content = clean.ToString();
            parser.replaceEveryExact("  ", " ");
            strString = parser.Content.Trim();

            // Return the de-HTMLized string
            return(strString);
        }