/// <summary>
/// Removes every block that starts at <c>&lt;script</c> and ends at <c>&lt;/script&gt;</c>
/// from the given text.
/// </summary>
/// <param name="strString">Markup to clean.</param>
/// <returns>
/// The text with all script blocks dropped. If a <c>&lt;script</c> opener is found without a
/// following <c>&lt;/script&gt;</c>, the input is returned completely unchanged.
/// </returns>
public static string removeScripts(string strString)
{
    // A builder avoids re-copying the accumulated text for every script block found.
    System.Text.StringBuilder kept = new System.Text.StringBuilder();
    string segment = "";
    StringParser parser = new StringParser(strString);

    // Both markers are matched through the parser's "NoCase" helpers
    // (presumably case-insensitive matching -- confirm against StringParser).
    while (parser.extractToNoCase("<script", ref segment))
    {
        kept.Append(segment);

        if (!parser.skipToEndOfNoCase("</script>"))
        {
            // Unterminated script block: abandon the partial result and hand back
            // the original text rather than a truncated document.
            return strString;
        }
    }

    // Whatever follows the last script block is kept as-is.
    parser.extractToEnd(ref segment);
    kept.Append(segment);
    return kept.ToString();
}
/// <summary>
/// Collects link targets from a piece of HTML: the values of <c>href="..."</c> attributes are
/// added to <paramref name="documents"/> and the values of <c>src="..."</c> attributes are
/// added to <paramref name="images"/>. Comments and script blocks are removed before scanning.
/// </summary>
/// <param name="strString">HTML to scan.</param>
/// <param name="strRootUrl">
/// URL used to qualify links that do not start with <c>http://</c>, <c>https://</c> or
/// <c>ftp://</c>. May be null or empty, in which case such links are prefixed with
/// <c>http://</c> instead.
/// </param>
/// <param name="documents">Receives href targets not already present; created if null.</param>
/// <param name="images">Receives src targets not already present; created if null.</param>
public static void getLinks(string strString, string strRootUrl, ref ArrayList documents, ref ArrayList images)
{
    // The lists are ref parameters, so a missing list can be supplied here instead of crashing.
    if (documents == null)
    {
        documents = new ArrayList();
    }
    if (images == null)
    {
        images = new ArrayList();
    }

    strString = removeComments(strString);
    strString = removeScripts(strString);

    StringParser parser = new StringParser(strString);
    // Turn single quotes into double quotes so only the double-quoted attribute form is matched.
    parser.replaceEvery("'", "\"");

    string rootUrl = (strRootUrl != null) ? strRootUrl.Trim() : "";
    if ((rootUrl.Length > 0) && !rootUrl.EndsWith("/", StringComparison.Ordinal))
    {
        rootUrl += "/";
    }

    // One pass per attribute; the two passes differ only in marker, target list and mailto filter.
    string[] attributeMarkers = { "href=\"", "src=\"" };
    ArrayList[] targets = { documents, images };

    for (int pass = 0; pass < attributeMarkers.Length; pass++)
    {
        ArrayList target = targets[pass];
        bool skipMailto = (pass == 0); // only href values are filtered for mail links
        string link = "";

        parser.resetPosition();
        while (parser.skipToEndOfNoCase(attributeMarkers[pass]))
        {
            if (!parser.extractTo("\"", ref link))
            {
                continue;
            }

            link = link.Trim();
            if (link.Length == 0)
            {
                continue;
            }
            if (skipMailto && (link.IndexOf("mailto:", StringComparison.OrdinalIgnoreCase) != -1))
            {
                continue;
            }

            // Schemes are case-insensitive, and https links are absolute too; without this
            // they would be treated as a path and glued onto the root host.
            bool isAbsolute =
                link.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                link.StartsWith("https://", StringComparison.OrdinalIgnoreCase) ||
                link.StartsWith("ftp://", StringComparison.OrdinalIgnoreCase);

            if (!isAbsolute)
            {
                try
                {
                    // NOTE(review): assigning Path replaces any path carried by the root URL and
                    // appears to escape '?' and '#' in the link -- confirm this is intended.
                    UriBuilder builder = new UriBuilder(rootUrl);
                    builder.Path = link;
                    link = builder.Uri.ToString();
                }
                catch (UriFormatException)
                {
                    // No usable root URL (e.g. empty): fall back to treating the link as a host.
                    link = "http://" + link;
                }
            }

            if (!target.Contains(link))
            {
                target.Add(link);
            }
        }
    }
}
/// <summary>
/// Reduces a fragment of HTML to plain text: decodes a small fixed set of character
/// entities, turns <c>&lt;br&gt;</c> and <c>&lt;p&gt;</c> into spaces, strips everything
/// between <c>&lt;</c> and <c>&gt;</c>, and trims the result.
/// </summary>
/// <param name="strString">HTML fragment to convert.</param>
/// <returns>The trimmed text with tags removed.</returns>
public static string removeHtml(string strString)
{
    // Entity -> replacement, applied in this fixed order so the result does not depend on
    // hash-table enumeration order. The search strings must be the *encoded* entity names.
    // Accented letters are mapped to the empty string, i.e. they are dropped.
    // "&amp;" comes last so that text such as "&amp;nbsp;" is not decoded twice by this table.
    string[,] entities =
    {
        { "&nbsp;", " " },
        { "&aring;", "" },
        { "&auml;", "" },
        { "&eacute;", "" },
        { "&iacute;", "" },
        { "&igrave;", "" },
        { "&ograve;", "" },
        { "&ouml;", "" },
        { "&quot;", "\"" },
        { "&szlig;", "" },
        { "&amp;", "&" },
    };

    StringParser parser = new StringParser(strString);

    for (int row = 0; row < entities.GetLength(0); row++)
    {
        // Cheap presence check on the input before asking the parser to rewrite its content.
        if (strString.IndexOf(entities[row, 0], StringComparison.Ordinal) != -1)
        {
            parser.replaceEveryExact(entities[row, 0], entities[row, 1]);
        }
    }

    // Drop a leading zero from numeric references ("&#039;" -> "&#39;"), then decode the apostrophe.
    parser.replaceEveryExact("&#0", "&#");
    parser.replaceEveryExact("&#39;", "'");

    // Put a space in front of every closing tag so adjacent elements do not run together.
    // The temporary "<~/" marker keeps the inserted text from matching the search string again.
    parser.replaceEveryExact("</", " <~/");
    parser.replaceEveryExact("<~/", "</");

    // Line-breaking tags become spaces. The guard ignores case so that it cannot suppress
    // replaceEvery for upper-case tags (replaceEvery is presumably case-insensitive -- confirm).
    string[] breakingTags = { "<br>", "<p>" };
    foreach (string tag in breakingTags)
    {
        if (strString.IndexOf(tag, StringComparison.OrdinalIgnoreCase) != -1)
        {
            parser.replaceEvery(tag, " ");
        }
    }

    // Strip tags: copy the text outside each "<" ... ">" pair.
    string markup = parser.Content;
    System.Text.StringBuilder text = new System.Text.StringBuilder(markup.Length);
    int position = 0;
    int open;
    while ((open = markup.IndexOf('<', position)) != -1)
    {
        text.Append(markup, position, open - position);
        position = open + 1;

        int close = markup.IndexOf('>', position);
        if (close == -1)
        {
            // Unterminated tag: the "<" itself is dropped, the remainder is kept below.
            break;
        }
        position = close + 1;
    }
    if (position < markup.Length)
    {
        text.Append(markup, position, markup.Length - position);
    }

    // Collapse doubled spaces left behind by the replacements above.
    // NOTE(review): whether replaceEveryExact repeats until no match remains is not visible here.
    parser.Content = text.ToString();
    parser.replaceEveryExact("  ", " ");
    return parser.Content.Trim();
}
/// <summary>
/// Removes every span that starts at <c>&lt;!--</c> and ends at <c>--&gt;</c> from the text.
/// </summary>
/// <param name="strString">Markup to clean.</param>
/// <returns>
/// The text without comment spans. If a <c>&lt;!--</c> opener has no following
/// <c>--&gt;</c>, the input is returned unchanged.
/// </returns>
public static string removeComments(string strString)
{
    StringParser scanner = new StringParser(strString);
    string kept = "";
    string segment = "";

    for (;;)
    {
        // Pull out the text that precedes the next comment opener, if there is one.
        if (!scanner.extractTo("<!--", ref segment))
        {
            break;
        }
        kept += segment;

        if (scanner.skipToEndOf("-->"))
        {
            continue;
        }

        // Opener without a terminator: hand back the original text untouched.
        return strString;
    }

    // Append whatever follows the last comment.
    scanner.extractToEnd(ref segment);
    return kept + segment;
}
/// <summary>
/// Reduces a fragment of HTML to plain text: decodes a small fixed set of character
/// entities, turns <c>&lt;br&gt;</c> and <c>&lt;p&gt;</c> into spaces, strips everything
/// between <c>&lt;</c> and <c>&gt;</c>, and trims the result.
/// </summary>
/// <param name="strString">HTML fragment to convert.</param>
/// <returns>The trimmed text with tags removed.</returns>
public static string removeHtml(string strString)
{
    // Entity -> replacement, applied in this fixed order so the result does not depend on
    // hash-table enumeration order. The search strings must be the *encoded* entity names.
    // Accented letters are mapped to the empty string, i.e. they are dropped.
    // "&amp;" comes last so that text such as "&amp;nbsp;" is not decoded twice by this table.
    string[,] entities =
    {
        { "&nbsp;", " " },
        { "&aring;", "" },
        { "&auml;", "" },
        { "&eacute;", "" },
        { "&iacute;", "" },
        { "&igrave;", "" },
        { "&ograve;", "" },
        { "&ouml;", "" },
        { "&quot;", "\"" },
        { "&szlig;", "" },
        { "&amp;", "&" },
    };

    StringParser parser = new StringParser(strString);

    for (int row = 0; row < entities.GetLength(0); row++)
    {
        // Cheap presence check on the input before asking the parser to rewrite its content.
        if (strString.IndexOf(entities[row, 0], StringComparison.Ordinal) != -1)
        {
            parser.replaceEveryExact(entities[row, 0], entities[row, 1]);
        }
    }

    // Drop a leading zero from numeric references ("&#039;" -> "&#39;"), then decode the apostrophe.
    parser.replaceEveryExact("&#0", "&#");
    parser.replaceEveryExact("&#39;", "'");

    // Put a space in front of every closing tag so adjacent elements do not run together.
    // The temporary "<~/" marker keeps the inserted text from matching the search string again.
    parser.replaceEveryExact("</", " <~/");
    parser.replaceEveryExact("<~/", "</");

    // Line-breaking tags become spaces. The guard ignores case so that it cannot suppress
    // replaceEvery for upper-case tags (replaceEvery is presumably case-insensitive -- confirm).
    string[] breakingTags = { "<br>", "<p>" };
    foreach (string tag in breakingTags)
    {
        if (strString.IndexOf(tag, StringComparison.OrdinalIgnoreCase) != -1)
        {
            parser.replaceEvery(tag, " ");
        }
    }

    // Strip tags: copy the text outside each "<" ... ">" pair.
    string markup = parser.Content;
    System.Text.StringBuilder text = new System.Text.StringBuilder(markup.Length);
    int position = 0;
    int open;
    while ((open = markup.IndexOf('<', position)) != -1)
    {
        text.Append(markup, position, open - position);
        position = open + 1;

        int close = markup.IndexOf('>', position);
        if (close == -1)
        {
            // Unterminated tag: the "<" itself is dropped, the remainder is kept below.
            break;
        }
        position = close + 1;
    }
    if (position < markup.Length)
    {
        text.Append(markup, position, markup.Length - position);
    }

    // Collapse doubled spaces left behind by the replacements above.
    // NOTE(review): whether replaceEveryExact repeats until no match remains is not visible here.
    parser.Content = text.ToString();
    parser.replaceEveryExact("  ", " ");
    return parser.Content.Trim();
}
/// <summary>
/// Collects link targets from a piece of HTML: the values of <c>href="..."</c> attributes are
/// added to <paramref name="documents"/> and the values of <c>src="..."</c> attributes are
/// added to <paramref name="images"/>. Comments and script blocks are removed before scanning.
/// </summary>
/// <param name="strString">HTML to scan.</param>
/// <param name="strRootUrl">
/// URL used to qualify links that do not start with <c>http://</c>, <c>https://</c> or
/// <c>ftp://</c>. May be null or empty, in which case such links are prefixed with
/// <c>http://</c> instead.
/// </param>
/// <param name="documents">Receives href targets not already present; created if null.</param>
/// <param name="images">Receives src targets not already present; created if null.</param>
public static void getLinks(string strString, string strRootUrl, ref ArrayList documents, ref ArrayList images)
{
    // The lists are ref parameters, so a missing list can be supplied here instead of crashing.
    if (documents == null)
    {
        documents = new ArrayList();
    }
    if (images == null)
    {
        images = new ArrayList();
    }

    strString = removeComments(strString);
    strString = removeScripts(strString);

    StringParser parser = new StringParser(strString);
    // Turn single quotes into double quotes so only the double-quoted attribute form is matched.
    parser.replaceEvery("'", "\"");

    string rootUrl = (strRootUrl != null) ? strRootUrl.Trim() : "";
    if ((rootUrl.Length > 0) && !rootUrl.EndsWith("/", StringComparison.Ordinal))
    {
        rootUrl += "/";
    }

    // One pass per attribute; the two passes differ only in marker, target list and mailto filter.
    string[] attributeMarkers = { "href=\"", "src=\"" };
    ArrayList[] targets = { documents, images };

    for (int pass = 0; pass < attributeMarkers.Length; pass++)
    {
        ArrayList target = targets[pass];
        bool skipMailto = (pass == 0); // only href values are filtered for mail links
        string link = "";

        parser.resetPosition();
        while (parser.skipToEndOfNoCase(attributeMarkers[pass]))
        {
            if (!parser.extractTo("\"", ref link))
            {
                continue;
            }

            link = link.Trim();
            if (link.Length == 0)
            {
                continue;
            }
            if (skipMailto && (link.IndexOf("mailto:", StringComparison.OrdinalIgnoreCase) != -1))
            {
                continue;
            }

            // Schemes are case-insensitive, and https links are absolute too; without this
            // they would be treated as a path and glued onto the root host.
            bool isAbsolute =
                link.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                link.StartsWith("https://", StringComparison.OrdinalIgnoreCase) ||
                link.StartsWith("ftp://", StringComparison.OrdinalIgnoreCase);

            if (!isAbsolute)
            {
                try
                {
                    // NOTE(review): assigning Path replaces any path carried by the root URL and
                    // appears to escape '?' and '#' in the link -- confirm this is intended.
                    UriBuilder builder = new UriBuilder(rootUrl);
                    builder.Path = link;
                    link = builder.Uri.ToString();
                }
                catch (UriFormatException)
                {
                    // No usable root URL (e.g. empty): fall back to treating the link as a host.
                    link = "http://" + link;
                }
            }

            if (!target.Contains(link))
            {
                target.Add(link);
            }
        }
    }
}
/// <summary>
/// Parses the fetched content: extracts the text of the <c>result_box</c> span from
/// <c>Content</c>, strips its HTML, trims leading/trailing noise and stores the result in
/// <c>Translation</c>. <c>Translation</c> is left empty when nothing could be extracted.
/// </summary>
protected override void parseContent()
{
    // Initialize the scraper
    this.Translation = string.Empty;
    StringParser parser = new StringParser(this.Content);

    // Scrape the translation: the text sits between the end of the span's opening tag
    // (identified by its onmouseout handler) and the closing </span>.
    string strTranslation = string.Empty;
    if (parser.skipToEndOf("<span id=result_box")
        && parser.skipToEndOf("onmouseout=\"this.style.backgroundColor='#fff'\">")
        && parser.extractTo("</span>", ref strTranslation))
    {
        strTranslation = StringParser.removeHtml(strTranslation);
    }

    // Nothing scraped: taking Substring(0, 1) of an empty string would throw, so stop here
    // and leave Translation empty.
    if (string.IsNullOrEmpty(strTranslation))
    {
        return;
    }

    // Fix up the translation.
    // The clean text starts at the first letter or digit (or at 0 if there is none)...
    int startClean = 0;
    for (int i = 0; i < strTranslation.Length; i++)
    {
        if (Char.IsLetterOrDigit(strTranslation[i]))
        {
            startClean = i;
            break;
        }
    }

    // ...and ends at the last letter, digit or punctuation mark other than a double quote.
    // Index 0 is not examined; it is the default end when nothing later qualifies.
    int endClean = 0;
    for (int i = strTranslation.Length - 1; i > 0; i--)
    {
        char ch = strTranslation[i];
        if (Char.IsLetterOrDigit(ch) || (Char.IsPunctuation(ch) && (ch != '\"')))
        {
            endClean = i;
            break;
        }
    }

    // Any double quotes left inside the kept range are removed as well.
    this.Translation = strTranslation
        .Substring(startClean, endClean - startClean + 1)
        .Replace("\"", "");
}