Ejemplo n.º 1
0
        /// <summary>
        /// Removes all HTML comments (<c>&lt;!-- ... --&gt;</c>) from a string.
        /// </summary>
        /// <param name="strString">The string.</param>
        /// <returns>
        /// Comment-free version of the string. If a comment is opened but its
        /// closing <c>--&gt;</c> cannot be found, the original string is returned
        /// unchanged.
        /// </returns>
        public static string RemoveComments(string strString)
        {
            // Accumulate the kept segments in a StringBuilder; concatenating
            // strings in the loop re-copies the whole result for every comment.
            System.Text.StringBuilder commentFree = new System.Text.StringBuilder();
            string strSegment = "";
            HTMLStringHelper parser = new HTMLStringHelper(strString);

            while (parser.ExtractTo("<!--", ref strSegment))
            {
                commentFree.Append(strSegment);
                if (!parser.SkipToEndOf("-->"))
                {
                    // Unterminated comment: give up and hand back the input untouched.
                    return strString;
                }
            }

            // No further comment openers: keep whatever text is left.
            parser.ExtractToEnd(ref strSegment);
            commentFree.Append(strSegment);
            return commentFree.ToString();
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Removes all <c>&lt;script ...&gt; ... &lt;/script&gt;</c> sections from a string.
        /// The opening and closing markers are located with the parser's
        /// "NoCase" search helpers.
        /// </summary>
        /// <param name="strString">The string.</param>
        /// <returns>
        /// Version of the string without any scripts. If a script is opened but
        /// its closing <c>&lt;/script&gt;</c> cannot be found, the original string
        /// is returned unchanged.
        /// </returns>
        public static string RemoveScripts(string strString)
        {
            // Accumulate the kept segments in a StringBuilder; concatenating
            // strings in the loop re-copies the whole result for every script.
            System.Text.StringBuilder scriptFree = new System.Text.StringBuilder();
            string strSegment = "";
            HTMLStringHelper parser = new HTMLStringHelper(strString);

            while (parser.ExtractToNoCase("<script", ref strSegment))
            {
                scriptFree.Append(strSegment);
                if (!parser.SkipToEndOfNoCase("</script>"))
                {
                    // Unterminated script: return the input untouched. (The
                    // previous code also stored the partial result into the
                    // local parser here, which was discarded immediately.)
                    return strString;
                }
            }

            // No further script openers: keep whatever text is left.
            parser.ExtractToEnd(ref strSegment);
            scriptFree.Append(strSegment);
            return scriptFree.ToString();
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Returns a version of a string without any HTML tags.
        /// </summary>
        /// <remarks>
        /// Steps, in order: replace a fixed set of character entities, insert a
        /// space before every closing tag, turn <c>&lt;br&gt;</c> and
        /// <c>&lt;p&gt;</c> into spaces, drop everything between <c>&lt;</c> and
        /// the next <c>&gt;</c>, then reduce double spaces and trim the result.
        /// </remarks>
        /// <param name="strString">The string.</param>
        /// <returns>Version of string without HTML tags.</returns>
        public static string RemoveHtml(string strString)
        {
            HTMLStringHelper parser = new HTMLStringHelper(strString);

            // Case-sensitive entity replacements, applied in a fixed order.
            // (A Hashtable was used before; its enumeration order is unspecified,
            // so the outcome could depend on which entity happened to go first.)
            // NOTE(review): the accented-letter entities map to empty strings,
            // which drops the letter altogether - confirm this is intended and
            // not the result of an encoding loss in this file.
            string[,] entityReplacements =
            {
                { "&nbsp;",   " "  },
                { "&aring;",  ""   },
                { "&auml;",   ""   },
                { "&eacute;", ""   },
                { "&iacute;", ""   },
                { "&igrave;", ""   },
                { "&ograve;", ""   },
                { "&ouml;",   ""   },
                { "&quot;",   "\"" },
                { "&szlig;",  ""   }
            };

            for (int i = 0; i < entityReplacements.GetLength(0); i++)
            {
                parser.ReplaceEveryExact(entityReplacements[i, 0], entityReplacements[i, 1]);
            }

            // Numeric apostrophe: strip one leading zero first so "&#039;" is
            // also recognised by the "&#39;" replacement.
            parser.ReplaceEveryExact("&#0", "&#");
            parser.ReplaceEveryExact("&#39;", "'");

            // Decode "&amp;" last so that escaped text such as "&amp;nbsp;"
            // ends up as the literal "&nbsp;" instead of being decoded twice.
            parser.ReplaceEveryExact("&amp;", "&");

            // Net effect of this pair: a space is inserted in front of every
            // closing tag so adjacent elements do not run together once the
            // tags are removed. "<~/" is only a temporary marker.
            parser.ReplaceEveryExact("</", " <~/");
            parser.ReplaceEveryExact("<~/", "</");

            // Case-insensitive replacements. These are no longer guarded by a
            // case-sensitive search of the original input, which used to skip
            // upper-case tags such as <BR> and <P>.
            parser.ReplaceEvery("<br>", " ");
            parser.ReplaceEvery("<p>", " ");

            // Remove all tags: copy the text outside each '<' ... '>' pair.
            string tagged = parser.Content;
            System.Text.StringBuilder clean = new System.Text.StringBuilder(tagged.Length);
            int nIndex = 0;
            int nStartTag;

            while ((nStartTag = tagged.IndexOf('<', nIndex)) != -1)
            {
                // Extract to start of tag
                clean.Append(tagged, nIndex, nStartTag - nIndex);
                nIndex = nStartTag + 1;

                // Skip over tag
                int nEndTag = tagged.IndexOf('>', nIndex);
                if (nEndTag == -1)
                {
                    // Unterminated tag: only the '<' itself is dropped; the
                    // text after it is kept by the "remaining text" step.
                    break;
                }
                nIndex = nEndTag + 1;
            }

            // Gather remaining text
            if (nIndex < tagged.Length)
            {
                clean.Append(tagged, nIndex, tagged.Length - nIndex);
            }

            // Finally, reduce spaces
            parser.Content = clean.ToString();
            parser.ReplaceEveryExact("  ", " ");

            // Return the de-HTMLized string
            return parser.Content.Trim();
        }
Ejemplo n.º 4
0
        //public static void GetLinks(string strString, string strRootUrl,string urlPrefix, ref ArrayList documents,string pattern,string regular)
        //{
        //    // Remove comments and JavaScript and fix links
        //    strString = HTMLStringHelper.RemoveComments(strString);
        //    strString = HTMLStringHelper.RemoveScripts(strString);


        //    // Set root url
        //    string rootUrl = string.Empty;
        //    if (strRootUrl != null)
        //        rootUrl = strRootUrl.Trim();
        //    if ((rootUrl.Length > 0) && !rootUrl.EndsWith("/"))
        //        rootUrl += "/";


        //    // Extract HREF targets
        //    Regex regex = new Regex("href=[\"']?(?<group>[\\w\\d._=&?/;#-]+)[\"']?", RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Multiline);
        //    // get all the matches depending upon the regular expression
        //    MatchCollection mcl = regex.Matches(strString);
        //    string strUrl = string.Empty;
        //    foreach (Match ml in mcl)
        //    {
        //        if (ml.Groups.Count > 1)
        //        {
        //            strUrl = ml.Groups[1].Value.Replace("&amp;", "&");

        //            // Get fully qualified url (best guess)
        //            if (!strUrl.StartsWith("http://"))
        //            {
        //                try
        //                {
        //                    UriBuilder uriBuilder = new UriBuilder(rootUrl);
        //                    uriBuilder.Path = strUrl;
        //                    strUrl = uriBuilder.Uri.ToString();
        //                }
        //                catch (Exception)
        //                {
        //                    strUrl = strUrl.Replace("../", string.Empty);
        //                    if (strUrl.StartsWith("..http"))
        //                        strUrl = strUrl.Substring(2);
        //                    if (!strUrl.StartsWith("http"))
        //                        strUrl = urlPrefix + strUrl;
        //                }
        //            }

        //            // Add url to document list if not already present
        //            if (strUrl.IndexOf(pattern) != -1)
        //            {
        //                if (regular != null && regular.Length > 0)
        //                {
        //                    if (ValidateURL(strUrl, regular))
        //                        if (!documents.Contains(strUrl))
        //                            documents.Add(strUrl);
        //                }
        //                else
        //                    if (!documents.Contains(strUrl))
        //                        documents.Add(strUrl);
        //            }
        //        }
        //    }
        //}



        #region Overload GetLinks
        // Matches href attribute values, optionally wrapped in single or double
        // quotes. Built once: constructing a Compiled regex on every call is costly.
        private static readonly Regex HrefAttributeRegex = new Regex("href\\s*=\\s*(?<s>['\"]?)(?<group>[^ >]+)\\k<s>", RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Multiline);

        /// <summary>
        /// Collects the HREF targets found in a string into <paramref name="documents"/>,
        /// optionally filtered by a substring and a validation expression.
        /// </summary>
        /// <param name="strString">The HTML text; comments and scripts are removed before scanning.</param>
        /// <param name="strRootUrl">Root url used to qualify non-absolute links (may be null).</param>
        /// <param name="urlPrefix">Prefix used as a fallback when qualifying against the root url fails.</param>
        /// <param name="documents">List that receives each distinct url; must not be null.</param>
        /// <param name="pattern">
        /// Substring a url must contain to be collected. When null or empty,
        /// every url is collected and <paramref name="regular"/> is ignored.
        /// </param>
        /// <param name="regular">
        /// Optional expression handed to <c>ValidateURL</c>; only consulted when
        /// <paramref name="pattern"/> is non-empty (may be null or empty).
        /// </param>
        public static void GetLinks(string strString, string strRootUrl, string urlPrefix, ref ArrayList documents, string pattern, string regular)
        {
            // Remove comments and JavaScript before looking for links
            strString = HTMLStringHelper.RemoveComments(strString);
            strString = HTMLStringHelper.RemoveScripts(strString);

            // Set root url (trimmed, with a trailing slash when non-empty)
            string rootUrl = (strRootUrl == null) ? string.Empty : strRootUrl.Trim();
            if ((rootUrl.Length > 0) && !rootUrl.EndsWith("/", StringComparison.Ordinal))
            {
                rootUrl += "/";
            }

            // A null pattern is treated like an empty one instead of throwing.
            bool hasPattern = !string.IsNullOrEmpty(pattern);
            bool hasRegular = !string.IsNullOrEmpty(regular);

            foreach (Match ml in HrefAttributeRegex.Matches(strString))
            {
                string strUrl = ml.Groups["group"].Value.Replace("&amp;", "&").Replace("\"", "");

                // Get fully qualified url (best guess). https links are absolute
                // too and must not be pushed through UriBuilder.Path.
                bool isAbsolute = strUrl.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                                  || strUrl.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
                if (!isAbsolute)
                {
                    try
                    {
                        UriBuilder uriBuilder = new UriBuilder(rootUrl);
                        uriBuilder.Path = strUrl;
                        strUrl          = uriBuilder.Uri.ToString();
                    }
                    catch (Exception)
                    {
                        // Deliberate best effort: when the root url cannot be
                        // used, fall back to simple string fix-ups.
                        strUrl = strUrl.Replace("../", string.Empty);
                        if (strUrl.StartsWith("..http", StringComparison.Ordinal))
                        {
                            strUrl = strUrl.Substring(2);
                        }
                        if (!strUrl.StartsWith("http", StringComparison.Ordinal))
                        {
                            strUrl = urlPrefix + strUrl;
                        }
                    }
                }

                // Apply the optional filters
                if (hasPattern)
                {
                    if (strUrl.IndexOf(pattern, StringComparison.Ordinal) == -1)
                    {
                        continue;
                    }
                    if (hasRegular && !ValidateURL(strUrl, regular))
                    {
                        continue;
                    }
                }

                // Add url to document list if not already present
                if (!documents.Contains(strUrl))
                {
                    documents.Add(strUrl);
                }
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Retrieves the collection of HTML links in a string: HREF targets go
        /// to <paramref name="documents"/>, SRC targets go to <paramref name="images"/>.
        /// </summary>
        /// <remarks>
        /// Comments and scripts are removed first. Targets that do not start
        /// with <c>http://</c>, <c>https://</c> or <c>ftp://</c> are qualified
        /// against the root url on a best-effort basis. HREF targets containing
        /// <c>mailto:</c> are skipped. Each list receives a given url only once.
        /// </remarks>
        /// <param name="strString">The string.</param>
        /// <param name="strRootUrl">Root url (may be null).</param>
        /// <param name="documents">Collection of document link strings.</param>
        /// <param name="images">Collection of image link strings.</param>
        public static void GetLinks(string strString, string strRootUrl, ref ArrayList documents, ref ArrayList images)
        {
            // Remove comments and JavaScript and fix links
            strString = HTMLStringHelper.RemoveComments(strString);
            strString = HTMLStringHelper.RemoveScripts(strString);
            HTMLStringHelper parser = new HTMLStringHelper(strString);

            // Turn single quotes into double quotes so that the href="/src="
            // markers below also find single-quoted attribute values.
            parser.ReplaceEvery("\'", "\"");

            // Set root url (trimmed, with a trailing slash when non-empty)
            string rootUrl = (strRootUrl == null) ? "" : strRootUrl.Trim();
            if ((rootUrl.Length > 0) && !rootUrl.EndsWith("/", StringComparison.Ordinal))
            {
                rootUrl += "/";
            }

            // Pass 0 extracts HREF targets into documents, pass 1 extracts SRC
            // targets into images; the two passes differ only in the marker,
            // the destination list and the mailto filter.
            string strUrl = "";

            for (int pass = 0; pass < 2; pass++)
            {
                bool      isHrefPass = (pass == 0);
                string    marker     = isHrefPass ? "href=\"" : "src=\"";
                ArrayList targets    = isHrefPass ? documents : images;

                parser.ResetPosition();
                while (parser.SkipToEndOfNoCase(marker))
                {
                    if (!parser.ExtractTo("\"", ref strUrl))
                    {
                        continue;
                    }

                    strUrl = strUrl.Trim();
                    if (strUrl.Length == 0)
                    {
                        continue;
                    }

                    // Mail links are not documents
                    if (isHrefPass && strUrl.IndexOf("mailto:", StringComparison.OrdinalIgnoreCase) != -1)
                    {
                        continue;
                    }

                    // Get fully qualified url (best guess). https links are
                    // absolute too and must not be pushed through UriBuilder.Path.
                    bool isAbsolute = strUrl.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                                      || strUrl.StartsWith("https://", StringComparison.OrdinalIgnoreCase)
                                      || strUrl.StartsWith("ftp://", StringComparison.OrdinalIgnoreCase);
                    if (!isAbsolute)
                    {
                        try
                        {
                            UriBuilder uriBuilder = new UriBuilder(rootUrl);
                            uriBuilder.Path = strUrl;
                            strUrl          = uriBuilder.Uri.ToString();
                        }
                        catch (Exception)
                        {
                            // Deliberate best effort when the root url cannot be used.
                            strUrl = "http://" + strUrl;
                        }
                    }

                    // Add url to the destination list if not already present
                    if (!targets.Contains(strUrl))
                    {
                        targets.Add(strUrl);
                    }
                }
            }
        }