Пример #1
0
 /// <summary>
 /// Scans an HTML string for link targets. Every non-empty <c>href="..."</c> value that does not
 /// contain <c>mailto:</c> is added to <paramref name="documents"/>, and every non-empty
 /// <c>src="..."</c> value is added to <paramref name="images"/>. Values already present in the
 /// target list are not added again.
 /// </summary>
 /// <param name="strString">HTML text to scan. It is first passed through removeComments and removeScripts.</param>
 /// <param name="strRootUrl">Base URL used for values without a recognised scheme; may be null.</param>
 /// <param name="documents">List that receives the href targets.</param>
 /// <param name="images">List that receives the src targets.</param>
 public static void getLinks(string strString, string strRootUrl, ref ArrayList documents, ref ArrayList images)
 {
     strString = removeComments(strString);
     strString = removeScripts(strString);

     StringParser parser = new StringParser(strString);
     // Normalise single-quoted attributes so only the double-quoted form has to be searched for.
     parser.replaceEvery("'", "\"");

     // Base URL, trimmed and terminated with a slash when one was supplied.
     string uri = "";
     if (strRootUrl != null)
     {
         uri = strRootUrl.Trim();
     }
     if ((uri.Length > 0) && !uri.EndsWith("/", StringComparison.Ordinal))
     {
         uri = uri + "/";
     }

     string strExtract = "";

     // Pass 1: href attributes -> documents.
     parser.resetPosition();
     while (parser.skipToEndOfNoCase("href=\""))
     {
         if (!parser.extractTo("\"", ref strExtract))
         {
             continue;
         }
         strExtract = strExtract.Trim();
         if ((strExtract.Length == 0) || (strExtract.IndexOf("mailto:", StringComparison.Ordinal) != -1))
         {
             continue;
         }
         strExtract = resolveLink(uri, strExtract);
         if (!documents.Contains(strExtract))
         {
             documents.Add(strExtract);
         }
     }

     // Pass 2: src attributes -> images (no mailto filtering here).
     parser.resetPosition();
     while (parser.skipToEndOfNoCase("src=\""))
     {
         if (!parser.extractTo("\"", ref strExtract))
         {
             continue;
         }
         strExtract = strExtract.Trim();
         if (strExtract.Length == 0)
         {
             continue;
         }
         strExtract = resolveLink(uri, strExtract);
         if (!images.Contains(strExtract))
         {
             images.Add(strExtract);
         }
     }
 }

 // Returns true when the link starts with a scheme that is stored unchanged (http, https or ftp).
 // URI schemes are case-insensitive, so the comparison ignores case.
 private static bool isAbsoluteLink(string link)
 {
     return link.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
         || link.StartsWith("https://", StringComparison.OrdinalIgnoreCase)
         || link.StartsWith("ftp://", StringComparison.OrdinalIgnoreCase);
 }

 // Returns the link unchanged when it is absolute; otherwise uses it as the path of the base URL.
 // If the base URL cannot be turned into a URI (for example when it is empty), falls back to
 // prefixing the link with "http://".
 private static string resolveLink(string uri, string link)
 {
     if (isAbsoluteLink(link))
     {
         return link;
     }
     try
     {
         UriBuilder builder = new UriBuilder(uri);
         builder.Path = link;
         return builder.Uri.ToString();
     }
     catch (Exception)
     {
         // Deliberate best-effort fallback: keep the link rather than dropping it.
         return "http://" + link;
     }
 }
Пример #2
0
        /// <summary>
        /// Reduces an HTML string to plain text: replaces a fixed set of character entities,
        /// removes everything between angle brackets, collapses double spaces via the parser
        /// and trims the result.
        /// </summary>
        /// <param name="strString">HTML text to clean.</param>
        /// <returns>The text with entities replaced and tags removed, trimmed.</returns>
        public static string removeHtml(string strString)
        {
            // Entity -> replacement, applied in this fixed order. "&amp;" comes last so that
            // text such as "&amp;quot;" ends up as the literal "&quot;" instead of being
            // decoded a second time.
            string[,] entities =
            {
                { "&nbsp;",   " "  },
                { "&aring;",  ""   },
                { "&auml;",   ""   },
                { "&eacute;", ""   },
                { "&iacute;", ""   },
                { "&igrave;", ""   },
                { "&ograve;", ""   },
                { "&ouml;",   ""   },
                { "&szlig;",  ""   },
                { "&quot;",   "\"" },
                { "&amp;",    "&"  }
            };

            StringParser parser = new StringParser(strString);

            for (int i = 0; i < entities.GetLength(0); i++)
            {
                // Only invoke the parser for entities that occur in the input at all.
                if (strString.IndexOf(entities[i, 0], StringComparison.Ordinal) != -1)
                {
                    parser.replaceEveryExact(entities[i, 0], entities[i, 1]);
                }
            }

            parser.replaceEveryExact("&#0", "&#");
            parser.replaceEveryExact("&#39;", "'");

            // Put a space in front of every closing tag so the text on either side of it does
            // not run together once tags are stripped. "<~/" is a temporary marker.
            parser.replaceEveryExact("</", " <~/");
            parser.replaceEveryExact("<~/", "</");

            // NOTE(review): the presence check is case-sensitive and runs against the original
            // input; whether replaceEvery itself ignores case is not visible here - confirm.
            string[] spacedTags = { "<br>", "<p>" };
            foreach (string tag in spacedTags)
            {
                if (strString.IndexOf(tag, StringComparison.Ordinal) != -1)
                {
                    parser.replaceEvery(tag, " ");
                }
            }

            strString = parser.Content;

            // Copy everything outside of <...> into the buffer.
            System.Text.StringBuilder text = new System.Text.StringBuilder(strString.Length);
            int startIndex = 0;
            int tagStart;

            while ((tagStart = strString.IndexOf('<', startIndex)) != -1)
            {
                text.Append(strString, startIndex, tagStart - startIndex);
                startIndex = tagStart + 1;
                int tagEnd = strString.IndexOf('>', startIndex);
                if (tagEnd == -1)
                {
                    // Unterminated tag: drop the '<' and keep the remainder as text.
                    break;
                }
                startIndex = tagEnd + 1;
            }
            if (startIndex < strString.Length)
            {
                text.Append(strString, startIndex, strString.Length - startIndex);
            }

            parser.Content = text.ToString();
            parser.replaceEveryExact("  ", " ");
            return parser.Content.Trim();
        }
Пример #3
0
 /// <summary>
 /// Reduces an HTML string to plain text: replaces a fixed set of character entities,
 /// removes everything between angle brackets, collapses double spaces via the parser
 /// and trims the result.
 /// </summary>
 /// <param name="strString">HTML text to clean.</param>
 /// <returns>The text with entities replaced and tags removed, trimmed.</returns>
 public static string removeHtml(string strString)
 {
     // Entity -> replacement pairs, applied in this fixed order (a Hashtable would enumerate
     // them in an unspecified order). "&amp;" comes last so that text such as "&amp;quot;"
     // ends up as the literal "&quot;" instead of being decoded a second time.
     string[,] entities =
     {
         { "&nbsp;", " " },
         { "&aring;", "" },
         { "&auml;", "" },
         { "&eacute;", "" },
         { "&iacute;", "" },
         { "&igrave;", "" },
         { "&ograve;", "" },
         { "&ouml;", "" },
         { "&szlig;", "" },
         { "&quot;", "\"" },
         { "&amp;", "&" }
     };

     StringParser parser = new StringParser(strString);
     for (int i = 0; i < entities.GetLength(0); i++)
     {
         // Only invoke the parser for entities that occur in the input at all.
         if (strString.IndexOf(entities[i, 0], StringComparison.Ordinal) != -1)
         {
             parser.replaceEveryExact(entities[i, 0], entities[i, 1]);
         }
     }

     parser.replaceEveryExact("&#0", "&#");
     parser.replaceEveryExact("&#39;", "'");

     // Put a space in front of every closing tag so the text on either side of it does not
     // run together once tags are stripped. "<~/" is a temporary marker.
     parser.replaceEveryExact("</", " <~/");
     parser.replaceEveryExact("<~/", "</");

     // NOTE(review): the presence check is case-sensitive and runs against the original input;
     // whether replaceEvery itself ignores case is not visible here - confirm.
     string[] spacedTags = { "<br>", "<p>" };
     foreach (string tag in spacedTags)
     {
         if (strString.IndexOf(tag, StringComparison.Ordinal) != -1)
         {
             parser.replaceEvery(tag, " ");
         }
     }

     strString = parser.Content;

     // Copy everything outside of <...> into the buffer.
     System.Text.StringBuilder text = new System.Text.StringBuilder(strString.Length);
     int startIndex = 0;
     int tagStart;
     while ((tagStart = strString.IndexOf('<', startIndex)) != -1)
     {
         text.Append(strString, startIndex, tagStart - startIndex);
         startIndex = tagStart + 1;
         int tagEnd = strString.IndexOf('>', startIndex);
         if (tagEnd == -1)
         {
             // Unterminated tag: drop the '<' and keep the remainder as text.
             break;
         }
         startIndex = tagEnd + 1;
     }
     if (startIndex < strString.Length)
     {
         text.Append(strString, startIndex, strString.Length - startIndex);
     }

     parser.Content = text.ToString();
     parser.replaceEveryExact("  ", " ");
     return parser.Content.Trim();
 }
Пример #4
0
        /// <summary>
        /// Scans an HTML string for link targets. Every non-empty <c>href="..."</c> value that
        /// does not contain <c>mailto:</c> is added to <paramref name="documents"/>, and every
        /// non-empty <c>src="..."</c> value is added to <paramref name="images"/>. Values already
        /// present in the target list are not added again.
        /// </summary>
        /// <param name="strString">HTML text to scan. It is first passed through removeComments and removeScripts.</param>
        /// <param name="strRootUrl">Base URL used for values without a recognised scheme; may be null.</param>
        /// <param name="documents">List that receives the href targets.</param>
        /// <param name="images">List that receives the src targets.</param>
        public static void getLinks(string strString, string strRootUrl, ref ArrayList documents, ref ArrayList images)
        {
            strString = removeComments(strString);
            strString = removeScripts(strString);
            StringParser parser = new StringParser(strString);

            // Normalise single-quoted attributes so only the double-quoted form has to be searched for.
            parser.replaceEvery("'", "\"");

            // Base URL, trimmed and terminated with a slash when one was supplied.
            string uri = "";

            if (strRootUrl != null)
            {
                uri = strRootUrl.Trim();
            }
            if ((uri.Length > 0) && !uri.EndsWith("/", StringComparison.Ordinal))
            {
                uri = uri + "/";
            }
            string strExtract = "";

            // Pass 1: href attributes -> documents.
            parser.resetPosition();
            while (parser.skipToEndOfNoCase("href=\""))
            {
                if (parser.extractTo("\"", ref strExtract))
                {
                    strExtract = strExtract.Trim();
                    if ((strExtract.Length > 0) && (strExtract.IndexOf("mailto:", StringComparison.Ordinal) == -1))
                    {
                        // URI schemes are case-insensitive; https links are absolute as well
                        // and must not be rewritten as a path under the base URL.
                        bool isAbsolute = strExtract.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                                          || strExtract.StartsWith("https://", StringComparison.OrdinalIgnoreCase)
                                          || strExtract.StartsWith("ftp://", StringComparison.OrdinalIgnoreCase);
                        if (!isAbsolute)
                        {
                            try
                            {
                                UriBuilder builder = new UriBuilder(uri);
                                builder.Path = strExtract;
                                strExtract   = builder.Uri.ToString();
                            }
                            catch (Exception)
                            {
                                // Base URL unusable (for example empty): best-effort fallback.
                                strExtract = "http://" + strExtract;
                            }
                        }
                        if (!documents.Contains(strExtract))
                        {
                            documents.Add(strExtract);
                        }
                    }
                }
            }

            // Pass 2: src attributes -> images (no mailto filtering here).
            parser.resetPosition();
            while (parser.skipToEndOfNoCase("src=\""))
            {
                if (parser.extractTo("\"", ref strExtract))
                {
                    strExtract = strExtract.Trim();
                    if (strExtract.Length > 0)
                    {
                        bool isAbsolute = strExtract.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                                          || strExtract.StartsWith("https://", StringComparison.OrdinalIgnoreCase)
                                          || strExtract.StartsWith("ftp://", StringComparison.OrdinalIgnoreCase);
                        if (!isAbsolute)
                        {
                            try
                            {
                                UriBuilder builder2 = new UriBuilder(uri);
                                builder2.Path = strExtract;
                                strExtract    = builder2.Uri.ToString();
                            }
                            catch (Exception)
                            {
                                // Base URL unusable (for example empty): best-effort fallback.
                                strExtract = "http://" + strExtract;
                            }
                        }
                        if (!images.Contains(strExtract))
                        {
                            images.Add(strExtract);
                        }
                    }
                }
            }
        }