Пример #1
0
        /// <summary>
        /// Builds a copy of <paramref name="strString"/> without its
        /// <c>&lt;!-- ... --&gt;</c> sections.
        /// </summary>
        /// <param name="strString">The markup to process.</param>
        /// <returns>
        /// The text collected around the comment markers, or the unmodified input
        /// when an opening marker is found but <c>skipToEndOf("--&gt;")</c> fails.
        /// </returns>
        public static string removeComments(string strString)
        {
            StringParser scanner = new StringParser(strString);
            string       kept    = "";
            string       piece   = "";

            for (;;)
            {
                // Presumably fills 'piece' with the text preceding the next "<!--";
                // a false result means no further opener was found.
                if (!scanner.extractTo("<!--", ref piece))
                {
                    break;
                }

                kept += piece;

                // No closing marker: give up and hand back the input untouched.
                if (!scanner.skipToEndOf("-->"))
                {
                    return strString;
                }
            }

            // Append whatever the parser yields as the remainder.
            scanner.extractToEnd(ref piece);
            return kept + piece;
        }
Пример #2
0
 /// <summary>
 /// Collects the values of <c>href="..."</c> and <c>src="..."</c> attributes found in
 /// <paramref name="strString"/> and adds them, without duplicates, to
 /// <paramref name="documents"/> and <paramref name="images"/> respectively.
 /// </summary>
 /// <param name="strString">Markup to scan; it is first passed through removeComments and removeScripts.</param>
 /// <param name="strRootUrl">Root URL used to build absolute links; may be null.</param>
 /// <param name="documents">List receiving the href targets (values containing "mailto:" are skipped).</param>
 /// <param name="images">List receiving the src targets.</param>
 public static void getLinks(string strString, string strRootUrl, ref ArrayList documents, ref ArrayList images)
 {
     strString = removeComments(strString);
     strString = removeScripts(strString);
     StringParser parser = new StringParser(strString);

     // Turn single quotes into double quotes so the href="/src=" searches below
     // also match single-quoted attributes.
     parser.replaceEvery("'", "\"");

     string uri = "";
     if (strRootUrl != null)
     {
         uri = strRootUrl.Trim();
     }
     if ((uri.Length > 0) && !uri.EndsWith("/", StringComparison.Ordinal))
     {
         uri = uri + "/";
     }

     string strExtract = "";

     // Pass 1: document links.
     parser.resetPosition();
     while (parser.skipToEndOfNoCase("href=\""))
     {
         if (parser.extractTo("\"", ref strExtract))
         {
             strExtract = strExtract.Trim();
             // Ordinal, case-insensitive: "MAILTO:" is the same scheme as "mailto:".
             if ((strExtract.Length > 0) &&
                 (strExtract.IndexOf("mailto:", StringComparison.OrdinalIgnoreCase) == -1))
             {
                 strExtract = resolveAgainstRoot(uri, strExtract);
                 if (!documents.Contains(strExtract))
                 {
                     documents.Add(strExtract);
                 }
             }
         }
     }

     // Pass 2: image/script sources.
     parser.resetPosition();
     while (parser.skipToEndOfNoCase("src=\""))
     {
         if (parser.extractTo("\"", ref strExtract))
         {
             strExtract = strExtract.Trim();
             if (strExtract.Length > 0)
             {
                 strExtract = resolveAgainstRoot(uri, strExtract);
                 if (!images.Contains(strExtract))
                 {
                     images.Add(strExtract);
                 }
             }
         }
     }
 }

 /// <summary>
 /// Returns <paramref name="link"/> unchanged when it already starts with an
 /// http, https or ftp scheme; otherwise builds an absolute URL by using the link
 /// as the path of <paramref name="rootUrl"/>.
 /// </summary>
 private static string resolveAgainstRoot(string rootUrl, string link)
 {
     // URI schemes are case-insensitive, and https links are absolute too;
     // previously they fell through and were mangled into the root URL's path.
     if (link.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
         link.StartsWith("https://", StringComparison.OrdinalIgnoreCase) ||
         link.StartsWith("ftp://", StringComparison.OrdinalIgnoreCase))
     {
         return link;
     }

     try
     {
         UriBuilder builder = new UriBuilder(rootUrl);
         builder.Path = link;
         return builder.Uri.ToString();
     }
     catch (Exception)
     {
         // Deliberate best-effort fallback kept from the original: when the root
         // URL (e.g. empty) or the link cannot form a valid URI, assume a host-style link.
         return "http://" + link;
     }
 }
Пример #3
0
 /// <summary>
 /// Removes <c>&lt;!-- ... --&gt;</c> sections from <paramref name="strString"/>.
 /// </summary>
 /// <param name="strString">The markup to filter.</param>
 /// <returns>
 /// The concatenation of the pieces extracted outside comment markers. If an
 /// opening marker is found but the closing <c>--&gt;</c> cannot be skipped to,
 /// the original string is returned as-is.
 /// </returns>
 public static string removeComments(string strString)
 {
     string chunk = "";
     string result = "";
     StringParser source = new StringParser(strString);

     bool openerFound = source.extractTo("<!--", ref chunk);
     while (openerFound)
     {
         result = string.Concat(result, chunk);

         bool closerFound = source.skipToEndOf("-->");
         if (!closerFound)
         {
             // Unterminated comment: leave the input untouched.
             return strString;
         }

         openerFound = source.extractTo("<!--", ref chunk);
     }

     // Pick up the remainder after the last comment (or the whole text if none).
     source.extractToEnd(ref chunk);
     return string.Concat(result, chunk);
 }
Пример #4
0
        /// <summary>
        /// Scans <paramref name="strString"/> for <c>href="..."</c> and <c>src="..."</c>
        /// attribute values and appends each distinct value to
        /// <paramref name="documents"/> (href) or <paramref name="images"/> (src).
        /// </summary>
        /// <param name="strString">Markup to scan; comments and scripts are stripped first via removeComments/removeScripts.</param>
        /// <param name="strRootUrl">Root URL that non-absolute links are attached to; may be null.</param>
        /// <param name="documents">Receives href targets, excluding values that contain "mailto:".</param>
        /// <param name="images">Receives src targets.</param>
        public static void getLinks(string strString, string strRootUrl, ref ArrayList documents, ref ArrayList images)
        {
            strString = removeComments(strString);
            strString = removeScripts(strString);
            StringParser parser = new StringParser(strString);

            // Single quotes become double quotes so one search pattern per attribute is enough.
            parser.replaceEvery("'", "\"");

            string uri = "";
            if (strRootUrl != null)
            {
                uri = strRootUrl.Trim();
            }
            if ((uri.Length > 0) && !uri.EndsWith("/", StringComparison.Ordinal))
            {
                uri = uri + "/";
            }

            collectAttributeUrls(parser, "href=\"", uri, true, documents);
            collectAttributeUrls(parser, "src=\"", uri, false, images);
        }

        /// <summary>
        /// Restarts <paramref name="parser"/> and adds every distinct, non-empty value that
        /// follows <paramref name="attributeOpener"/> to <paramref name="target"/>,
        /// converting non-absolute values with <paramref name="rootUrl"/>.
        /// </summary>
        private static void collectAttributeUrls(StringParser parser, string attributeOpener, string rootUrl, bool skipMailto, ArrayList target)
        {
            string strExtract = "";

            parser.resetPosition();
            while (parser.skipToEndOfNoCase(attributeOpener))
            {
                if (!parser.extractTo("\"", ref strExtract))
                {
                    continue;
                }

                strExtract = strExtract.Trim();
                if (strExtract.Length == 0)
                {
                    continue;
                }

                // Ordinal, case-insensitive match: "MAILTO:" is the same scheme as "mailto:".
                if (skipMailto && (strExtract.IndexOf("mailto:", StringComparison.OrdinalIgnoreCase) != -1))
                {
                    continue;
                }

                if (!hasAbsoluteScheme(strExtract))
                {
                    try
                    {
                        UriBuilder builder = new UriBuilder(rootUrl);
                        builder.Path = strExtract;
                        strExtract   = builder.Uri.ToString();
                    }
                    catch (Exception)
                    {
                        // Best-effort fallback kept from the original: if the root URL
                        // (e.g. empty) or the link cannot form a URI, treat the link as host-style.
                        strExtract = "http://" + strExtract;
                    }
                }

                if (!target.Contains(strExtract))
                {
                    target.Add(strExtract);
                }
            }
        }

        /// <summary>
        /// Tells whether <paramref name="link"/> already begins with an http, https or ftp scheme.
        /// </summary>
        private static bool hasAbsoluteScheme(string link)
        {
            // Schemes are case-insensitive; https was previously missed, which caused
            // such links to be rewritten as a path under the root URL.
            return link.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                || link.StartsWith("https://", StringComparison.OrdinalIgnoreCase)
                || link.StartsWith("ftp://", StringComparison.OrdinalIgnoreCase);
        }
Пример #5
0
        /// <summary>
        /// Parses the fetched content and stores the scraped translation in
        /// this.Translation.
        /// </summary>
        /// <remarks>
        /// The translation is the text between the <c>result_box</c> span's
        /// <c>onmouseout</c> attribute and the following <c>&lt;/span&gt;</c>, passed
        /// through <c>StringParser.removeHtml</c>. It is then cut down to the range from
        /// its first letter/digit to its last letter, digit or non-quote punctuation
        /// character, and all double quotes are removed. When nothing is scraped,
        /// this.Translation is left as an empty string.
        /// </remarks>
        protected override void parseContent()
        {
            // Initialize the scraper
            this.Translation = string.Empty;
            string strContent = this.Content;
            StringParser parser = new StringParser (strContent);

            // Scrape the translation
            string strTranslation = string.Empty;
            if (parser.skipToEndOf ("<span id=result_box")) {
                if (parser.skipToEndOf ("onmouseout=\"this.style.backgroundColor='#fff'\">")) {
                    if (parser.extractTo ("</span>", ref strTranslation)) {
                        strTranslation = StringParser.removeHtml (strTranslation);
                    }
                }
            }

            // Nothing scraped (markers missing or empty result): keep the empty
            // Translation. Without this guard the Substring (0, 1) call below throws
            // ArgumentOutOfRangeException on an empty string.
            if (string.IsNullOrEmpty (strTranslation)) {
                return;
            }

            #region Fix up the translation
            // Index of the first letter or digit; stays 0 when there is none.
            int startClean = 0;
            for (int i = 0; i < strTranslation.Length; i++) {
                if (Char.IsLetterOrDigit (strTranslation[i])) {
                    startClean = i;
                    break;
                }
            }

            // Index of the last letter, digit or punctuation mark other than a
            // double quote. Index 0 need not be inspected: 0 is already the default.
            int endClean = 0;
            for (int i = strTranslation.Length - 1; i > 0; i--) {
                char ch = strTranslation[i];
                if (Char.IsLetterOrDigit (ch) ||
                    (Char.IsPunctuation (ch) && (ch != '\"'))) {
                    endClean = i;
                    break;
                }
            }

            this.Translation = strTranslation.Substring (startClean, endClean - startClean + 1).Replace ("\"", "");
            #endregion
        }