예제 #1
0
파일: StringParser.cs 프로젝트: netonjm/OCR
 /// <summary>
 /// Returns the given text with every section from <c>&lt;script</c> up to and including
 /// <c>&lt;/script&gt;</c> left out.
 /// </summary>
 /// <param name="strString">Markup to filter.</param>
 /// <returns>
 /// The text found outside script sections. If a <c>&lt;script</c> opener is found but no
 /// <c>&lt;/script&gt;</c> follows it, <paramref name="strString"/> is returned unchanged.
 /// </returns>
 public static string removeScripts(string strString)
 {
     // A builder keeps the accumulation linear even for pages with many script sections.
     System.Text.StringBuilder kept = new System.Text.StringBuilder();
     string strExtract = "";
     StringParser parser = new StringParser(strString);
     while (parser.extractToNoCase("<script", ref strExtract))
     {
         kept.Append(strExtract);
         if (!parser.skipToEndOfNoCase("</script>"))
         {
             // Unterminated script section: give up and hand the input back untouched.
             return strString;
         }
     }
     // No further opener: whatever is left belongs to the result.
     parser.extractToEnd(ref strExtract);
     kept.Append(strExtract);
     return kept.ToString();
 }
예제 #2
0
파일: StringParser.cs 프로젝트: netonjm/OCR
 /// <summary>
 /// Collects the values of <c>href="..."</c> and <c>src="..."</c> attributes found in a piece of markup.
 /// </summary>
 /// <param name="strString">Markup to scan. Comments and script sections are removed before scanning.</param>
 /// <param name="strRootUrl">URL that relative links are resolved against; may be null or empty.</param>
 /// <param name="documents">
 /// Receives each distinct href value (values containing "mailto:" are skipped). Created when null.
 /// </param>
 /// <param name="images">Receives each distinct src value. Created when null.</param>
 public static void getLinks(string strString, string strRootUrl, ref ArrayList documents, ref ArrayList images)
 {
     if (documents == null)
     {
         documents = new ArrayList();
     }
     if (images == null)
     {
         images = new ArrayList();
     }

     strString = removeComments(strString);
     strString = removeScripts(strString);
     StringParser parser = new StringParser(strString);
     // Turn single quotes into double quotes so only the double-quoted attribute form has to be scanned.
     parser.replaceEvery("'", "\"");

     string uri = "";
     if (strRootUrl != null)
     {
         uri = strRootUrl.Trim();
     }
     if ((uri.Length > 0) && !uri.EndsWith("/"))
     {
         uri = uri + "/";
     }

     parser.resetPosition();
     collectAttributeUrls(parser, "href=\"", uri, true, documents);
     parser.resetPosition();
     collectAttributeUrls(parser, "src=\"", uri, false, images);
 }

 // Scans forward from the parser's current position and adds every distinct, non-empty value
 // of the given attribute opener (e.g. href=") to the target list as an absolute URL.
 private static void collectAttributeUrls(StringParser parser, string strAttributeOpener, string strBaseUrl, bool skipMailto, ArrayList target)
 {
     string strExtract = "";
     while (parser.skipToEndOfNoCase(strAttributeOpener))
     {
         if (!parser.extractTo("\"", ref strExtract))
         {
             continue;
         }
         strExtract = strExtract.Trim();
         if (strExtract.Length == 0)
         {
             continue;
         }
         if (skipMailto && (strExtract.IndexOf("mailto:", StringComparison.Ordinal) != -1))
         {
             continue;
         }
         strExtract = toAbsoluteUrl(strExtract, strBaseUrl);
         if (!target.Contains(strExtract))
         {
             target.Add(strExtract);
         }
     }
 }

 // Returns the link unchanged when it already carries a scheme, otherwise resolves it against the base URL.
 private static string toAbsoluteUrl(string strLink, string strBaseUrl)
 {
     // Anything shaped like "<scheme>://..." is already absolute (http, https, ftp, ...).
     int schemeEnd = strLink.IndexOf("://", StringComparison.Ordinal);
     if ((schemeEnd > 0) && Uri.CheckSchemeName(strLink.Substring(0, schemeEnd)))
     {
         return strLink;
     }

     if (strBaseUrl.Length > 0)
     {
         // A root given without a scheme is treated as http.
         string strBase = strBaseUrl;
         if (strBase.IndexOf("://", StringComparison.Ordinal) == -1)
         {
             strBase = "http://" + strBase;
         }

         // Standard relative-reference resolution keeps the root's directory and leaves
         // query strings and fragments of the link intact.
         Uri baseUri;
         Uri resolved;
         if (Uri.TryCreate(strBase, UriKind.Absolute, out baseUri) && Uri.TryCreate(baseUri, strLink, out resolved))
         {
             return resolved.ToString();
         }
     }

     // No usable root URL: fall back to treating the link as a host-relative http address.
     return "http://" + strLink;
 }
예제 #3
0
파일: StringParser.cs 프로젝트: netonjm/OCR
 /// <summary>
 /// Reduces a piece of HTML to plain text: replaces a fixed set of character entities,
 /// drops everything between <c>&lt;</c> and <c>&gt;</c>, and trims the result.
 /// </summary>
 /// <param name="strString">HTML fragment to convert.</param>
 /// <returns>The trimmed text that remains after entity replacement and tag removal.</returns>
 public static string removeHtml(string strString)
 {
     // Entity -> replacement, applied in this fixed order. "&amp;" is last so that text such as
     // "&amp;nbsp;" is decoded exactly once. Entities mapped to "" are simply dropped.
     string[,] entities =
     {
         { "&nbsp;", " " },
         { "&aring;", "" },
         { "&auml;", "" },
         { "&eacute;", "" },
         { "&iacute;", "" },
         { "&igrave;", "" },
         { "&ograve;", "" },
         { "&ouml;", "" },
         { "&quot;", "\"" },
         { "&szlig;", "" },
         { "&amp;", "&" }
     };
     // Tags that are turned into a space before the generic tag removal below.
     string[] spacedTags = { "<br>", "<p>" };

     StringParser parser = new StringParser(strString);
     for (int i = 0; i < entities.GetLength(0); i++)
     {
         // The presence check runs against the original input, not the parser's current content.
         if (strString.IndexOf(entities[i, 0]) != -1)
         {
             parser.replaceEveryExact(entities[i, 0], entities[i, 1]);
         }
     }

     // "&#0" -> "&#" removes one leading zero, so "&#039;" is caught by the "&#39;" rule.
     parser.replaceEveryExact("&#0", "&#");
     parser.replaceEveryExact("&#39;", "'");
     // Net effect of the next two calls: a space is inserted before every "</".
     // NOTE(review): the "<~/" intermediate presumably keeps the replacement from re-matching
     // its own output; confirm against StringParser.replaceEveryExact.
     parser.replaceEveryExact("</", " <~/");
     parser.replaceEveryExact("<~/", "</");

     foreach (string tag in spacedTags)
     {
         // NOTE(review): this check is case-sensitive and uses the original input; confirm
         // whether replaceEvery is meant to match other casings such as "<BR>".
         if (strString.IndexOf(tag) != -1)
         {
             parser.replaceEvery(tag, " ");
         }
     }

     // Copy everything that lies outside "<...>" pairs.
     strString = parser.Content;
     System.Text.StringBuilder text = new System.Text.StringBuilder(strString.Length);
     int startIndex = 0;
     int tagStart;
     while ((tagStart = strString.IndexOf('<', startIndex)) != -1)
     {
         text.Append(strString, startIndex, tagStart - startIndex);
         startIndex = tagStart + 1;
         int tagEnd = strString.IndexOf('>', startIndex);
         if (tagEnd == -1)
         {
             // Unclosed "<": only the bracket itself is dropped, the tail is kept below.
             break;
         }
         startIndex = tagEnd + 1;
     }
     if (startIndex < strString.Length)
     {
         text.Append(strString, startIndex, strString.Length - startIndex);
     }

     parser.Content = text.ToString();
     parser.replaceEveryExact("  ", " ");
     return parser.Content.Trim();
 }
예제 #4
0
파일: StringParser.cs 프로젝트: netonjm/OCR
 /// <summary>
 /// Returns the given text without its <c>&lt;!-- ... --&gt;</c> sections.
 /// </summary>
 /// <param name="strString">Markup to filter.</param>
 /// <returns>
 /// The text found outside comment sections. If a <c>&lt;!--</c> opener is found but no
 /// <c>--&gt;</c> follows it, <paramref name="strString"/> is returned unchanged.
 /// </returns>
 public static string removeComments(string strString)
 {
     StringParser scanner = new StringParser(strString);
     string chunk = "";
     string result = "";
     for (;;)
     {
         if (!scanner.extractTo("<!--", ref chunk))
         {
             // No further comment opener: keep whatever is left and finish.
             scanner.extractToEnd(ref chunk);
             return result + chunk;
         }

         result += chunk;
         if (!scanner.skipToEndOf("-->"))
         {
             // Unterminated comment: hand the input back untouched.
             return strString;
         }
     }
 }
예제 #5
0
        /// <summary>
        /// Converts an HTML fragment to plain text by replacing a fixed list of character
        /// entities, removing every <c>&lt;...&gt;</c> span and trimming what is left.
        /// </summary>
        /// <param name="strString">HTML fragment to convert.</param>
        /// <returns>The trimmed text that remains after entity replacement and tag removal.</returns>
        public static string removeHtml(string strString)
        {
            // Parallel arrays give a fixed replacement order. "&amp;" comes last so an escaped
            // ampersand cannot turn into the start of another entity that is then decoded again.
            string[] entityNames =
            {
                "&nbsp;", "&aring;", "&auml;", "&eacute;", "&iacute;", "&igrave;",
                "&ograve;", "&ouml;", "&quot;", "&szlig;", "&amp;"
            };
            string[] entityValues =
            {
                " ", "", "", "", "", "",
                "", "", "\"", "", "&"
            };

            StringParser parser = new StringParser(strString);

            for (int i = 0; i < entityNames.Length; i++)
            {
                // Presence is tested on the original input, not on the parser's current content.
                if (strString.IndexOf(entityNames[i]) != -1)
                {
                    parser.replaceEveryExact(entityNames[i], entityValues[i]);
                }
            }

            // Drop one leading zero from numeric references so "&#039;" matches the next rule.
            parser.replaceEveryExact("&#0", "&#");
            parser.replaceEveryExact("&#39;", "'");
            // Together these two calls put a space in front of every "</".
            // NOTE(review): the "<~/" detour presumably avoids re-matching the replacement; confirm.
            parser.replaceEveryExact("</", " <~/");
            parser.replaceEveryExact("<~/", "</");

            // Line-break style tags become a space before the generic tag removal.
            // NOTE(review): the guards are case-sensitive on the original input; confirm intent.
            if (strString.IndexOf("<br>") != -1)
            {
                parser.replaceEvery("<br>", " ");
            }
            if (strString.IndexOf("<p>") != -1)
            {
                parser.replaceEvery("<p>", " ");
            }

            parser.Content = stripTags(parser.Content);
            parser.replaceEveryExact("  ", " ");
            return parser.Content.Trim();
        }

        // Returns the text with every "<...>" span removed; an unclosed "<" loses only the bracket.
        private static string stripTags(string markup)
        {
            System.Text.StringBuilder plain = new System.Text.StringBuilder(markup.Length);
            int position = 0;

            while (position < markup.Length)
            {
                int open = markup.IndexOf('<', position);
                if (open == -1)
                {
                    break;
                }
                plain.Append(markup, position, open - position);
                position = open + 1;

                int close = markup.IndexOf('>', position);
                if (close == -1)
                {
                    break;
                }
                position = close + 1;
            }

            // Whatever follows the last processed tag (or unclosed bracket) is plain text.
            if (position < markup.Length)
            {
                plain.Append(markup, position, markup.Length - position);
            }
            return plain.ToString();
        }
예제 #6
0
        /// <summary>
        /// Extracts link targets from markup: <c>href="..."</c> values go to
        /// <paramref name="documents"/>, <c>src="..."</c> values go to <paramref name="images"/>.
        /// </summary>
        /// <param name="strString">Markup to scan. Comments and script sections are removed first.</param>
        /// <param name="strRootUrl">URL that relative links are resolved against; may be null or empty.</param>
        /// <param name="documents">
        /// List receiving each distinct href value; values containing "mailto:" are skipped. Created when null.
        /// </param>
        /// <param name="images">List receiving each distinct src value. Created when null.</param>
        public static void getLinks(string strString, string strRootUrl, ref ArrayList documents, ref ArrayList images)
        {
            if (documents == null)
            {
                documents = new ArrayList();
            }
            if (images == null)
            {
                images = new ArrayList();
            }

            strString = removeComments(strString);
            strString = removeScripts(strString);
            StringParser parser = new StringParser(strString);

            // Single quotes become double quotes so only one attribute form has to be scanned.
            parser.replaceEvery("'", "\"");
            string uri = "";

            if (strRootUrl != null)
            {
                uri = strRootUrl.Trim();
            }
            if ((uri.Length > 0) && !uri.EndsWith("/"))
            {
                uri = uri + "/";
            }
            string strExtract = "";

            parser.resetPosition();
            while (parser.skipToEndOfNoCase("href=\""))
            {
                if (parser.extractTo("\"", ref strExtract))
                {
                    strExtract = strExtract.Trim();
                    if ((strExtract.Length > 0) && (strExtract.IndexOf("mailto:", StringComparison.Ordinal) == -1))
                    {
                        strExtract = resolveLink(uri, strExtract);
                        if (!documents.Contains(strExtract))
                        {
                            documents.Add(strExtract);
                        }
                    }
                }
            }
            parser.resetPosition();
            while (parser.skipToEndOfNoCase("src=\""))
            {
                if (parser.extractTo("\"", ref strExtract))
                {
                    strExtract = strExtract.Trim();
                    if (strExtract.Length > 0)
                    {
                        strExtract = resolveLink(uri, strExtract);
                        if (!images.Contains(strExtract))
                        {
                            images.Add(strExtract);
                        }
                    }
                }
            }
        }

        // Makes a link absolute: links that already carry a "<scheme>://" prefix are returned as-is,
        // everything else is resolved against the root URL (or prefixed with "http://" if that fails).
        private static string resolveLink(string strRoot, string strLink)
        {
            int schemeEnd = strLink.IndexOf("://", StringComparison.Ordinal);
            bool hasScheme = (schemeEnd > 0) && Uri.CheckSchemeName(strLink.Substring(0, schemeEnd));
            if (hasScheme)
            {
                // Covers http, https, ftp and any other well-formed scheme.
                return strLink;
            }

            if (strRoot.Length > 0)
            {
                // A root without a scheme is treated as http.
                string strBase = strRoot;
                if (strBase.IndexOf("://", StringComparison.Ordinal) == -1)
                {
                    strBase = "http://" + strBase;
                }

                // Relative-reference resolution keeps the root's directory and preserves
                // the link's query string and fragment.
                Uri rootUri;
                if (Uri.TryCreate(strBase, UriKind.Absolute, out rootUri))
                {
                    Uri absolute;
                    if (Uri.TryCreate(rootUri, strLink, out absolute))
                    {
                        return absolute.ToString();
                    }
                }
            }

            // No usable root URL: fall back to an http address built from the link alone.
            return "http://" + strLink;
        }
예제 #7
0
        /// <summary>
        /// Parses the fetched content: takes the text of the <c>result_box</c> span, removes its
        /// markup and stores the cleaned-up text in <c>Translation</c>. <c>Translation</c> stays
        /// empty when the expected markup is not found.
        /// </summary>
        protected override void parseContent()
        {
            // Initialize the scraper
            this.Translation = string.Empty;
            string strContent = this.Content;
            StringParser parser = new StringParser (strContent);

            // Scrape the translation
            string strTranslation = string.Empty;
            if (parser.skipToEndOf ("<span id=result_box")) {
                if (parser.skipToEndOf ("onmouseout=\"this.style.backgroundColor='#fff'\">")) {
                    if (parser.extractTo ("</span>", ref strTranslation)) {
                        strTranslation = StringParser.removeHtml (strTranslation);
                    }
                }
            }

            // Nothing scraped: Substring (0, 1) on an empty string would throw, so stop here.
            if (string.IsNullOrEmpty (strTranslation)) {
                return;
            }

            #region Fix up the translation
            // First letter or digit marks the start of the text to keep (index 0 if none is found).
            int startClean = 0;
            for (int i = 0; i < strTranslation.Length; i++) {
                if (Char.IsLetterOrDigit (strTranslation[i])) {
                    startClean = i;
                    break;
                }
            }

            // Last letter, digit or non-quote punctuation marks the end of the text to keep.
            // Index 0 is not examined because it is already the default.
            int endClean = 0;
            for (int i = strTranslation.Length - 1; i > 0; i--) {
                char ch = strTranslation[i];
                if (Char.IsLetterOrDigit (ch) ||
                    (Char.IsPunctuation (ch) && (ch != '\"'))) {
                    endClean = i;
                    break;
                }
            }

            // Remaining double quotes inside the kept range are removed as well.
            this.Translation = strTranslation.Substring (startClean, endClean - startClean + 1).Replace ("\"", "");
            #endregion
        }