/// <summary>
/// Parses the fetched content: scrapes the translation out of
/// <c>Content</c> and stores the cleaned-up text in <c>Translation</c>.
/// <c>Translation</c> is left empty when nothing could be scraped.
/// </summary>
protected override void parseContent()
{
    // Initialize the scraper
    this.Translation = string.Empty;
    StringParser parser = new StringParser(this.Content);

    // Scrape the translation: position after the result span's opening
    // markup, then take everything up to the closing tag.
    string strTranslation = string.Empty;
    if (parser.skipToEndOf("<span id=result_box")
        && parser.skipToEndOf("onmouseout=\"this.style.backgroundColor='#fff'\">")
        && parser.extractTo("</span>", ref strTranslation))
    {
        strTranslation = StringParser.removeHtml(strTranslation);
    }

    // Nothing scraped: the Substring call below would ask for one
    // character of an empty string and throw ArgumentOutOfRangeException.
    if (string.IsNullOrEmpty(strTranslation))
    {
        return;
    }

    #region Fix up the translation
    // Start of the clean range: first letter or digit (index 0 if none).
    int startClean = 0;
    for (int i = 0; i < strTranslation.Length; i++)
    {
        if (Char.IsLetterOrDigit(strTranslation[i]))
        {
            startClean = i;
            break;
        }
    }

    // End of the clean range: last letter, digit or punctuation mark other
    // than a double quote. Index 0 is not inspected because it is already
    // the default value of endClean.
    int endClean = 0;
    for (int i = strTranslation.Length - 1; i > 0; i--)
    {
        char ch = strTranslation[i];
        if (Char.IsLetterOrDigit(ch) || (Char.IsPunctuation(ch) && (ch != '\"')))
        {
            endClean = i;
            break;
        }
    }

    // Keep the clean range and drop any double quotes left inside it.
    this.Translation = strTranslation
        .Substring(startClean, endClean - startClean + 1)
        .Replace("\"", "");
    #endregion
}
/// <summary>
/// Collects the URLs referenced by the markup in <paramref name="strString"/>:
/// values of <c>href="..."</c> attributes are added to
/// <paramref name="documents"/> and values of <c>src="..."</c> attributes
/// to <paramref name="images"/>. Comments and scripts are removed before
/// scanning, and a URL is only added if the list does not already contain it.
/// </summary>
/// <param name="strString">Markup to scan.</param>
/// <param name="strRootUrl">
/// Base URL used for values that do not start with http://, https:// or
/// ftp://. May be null.
/// </param>
/// <param name="documents">List receiving the href values (mailto: links are skipped).</param>
/// <param name="images">List receiving the src values.</param>
public static void getLinks(string strString, string strRootUrl, ref ArrayList documents, ref ArrayList images)
{
    strString = StringParser.removeComments(strString);
    strString = StringParser.removeScripts(strString);

    StringParser parser = new StringParser(strString);
    // Turn single quotes into double quotes so that only the
    // double-quoted attribute markers below have to be searched for.
    parser.replaceEvery("'", "\"");

    // Normalise the root URL: trimmed and, when non-empty, ending in "/".
    string rootUrl = (strRootUrl == null) ? "" : strRootUrl.Trim();
    if (rootUrl.Length > 0 && !rootUrl.EndsWith("/", StringComparison.Ordinal))
    {
        rootUrl += "/";
    }

    string url = "";

    // Pass 1: href attributes -> documents.
    parser.resetPosition();
    while (parser.skipToEndOfNoCase("href=\""))
    {
        if (!parser.extractTo("\"", ref url))
        {
            continue;
        }
        url = url.Trim();
        if (url.Length == 0 || url.IndexOf("mailto:", StringComparison.OrdinalIgnoreCase) != -1)
        {
            continue;
        }
        url = makeAbsoluteUrl(rootUrl, url);
        if (!documents.Contains(url))
        {
            documents.Add(url);
        }
    }

    // Pass 2: src attributes -> images.
    parser.resetPosition();
    while (parser.skipToEndOfNoCase("src=\""))
    {
        if (!parser.extractTo("\"", ref url))
        {
            continue;
        }
        url = url.Trim();
        if (url.Length == 0)
        {
            continue;
        }
        url = makeAbsoluteUrl(rootUrl, url);
        if (!images.Contains(url))
        {
            images.Add(url);
        }
    }
}

/// <summary>
/// Returns <paramref name="url"/> unchanged when it already starts with
/// http://, https:// or ftp:// (compared case-insensitively); otherwise
/// combines it with <paramref name="rootUrl"/> by using it as the path of
/// the root URL.
/// </summary>
private static string makeAbsoluteUrl(string rootUrl, string url)
{
    if (url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        || url.StartsWith("https://", StringComparison.OrdinalIgnoreCase)
        || url.StartsWith("ftp://", StringComparison.OrdinalIgnoreCase))
    {
        return url;
    }

    try
    {
        UriBuilder builder = new UriBuilder(rootUrl);
        builder.Path = url;
        return builder.Uri.ToString();
    }
    catch (Exception)
    {
        // Deliberate best-effort fallback: when the root URL cannot be
        // used to build a URI, treat the value as a host-relative address.
        return "http://" + url;
    }
}
/// <summary>
/// Converts an HTML fragment to plain text: decodes a fixed set of
/// character entities, replaces line-break and paragraph tags with a
/// space, removes every remaining tag, collapses double spaces and trims
/// the result.
/// </summary>
/// <param name="strString">HTML fragment to clean. May be null or empty.</param>
/// <returns>The plain text; an empty string for null or empty input.</returns>
public static string removeHtml(string strString)
{
    if (string.IsNullOrEmpty(strString))
    {
        return string.Empty;
    }

    StringParser parser = new StringParser(strString);

    // Named entities and their replacements, applied in this fixed order.
    // The accented-letter entities are dropped (replaced with nothing).
    // "&amp;" comes last so that text such as "&amp;quot;" is decoded once
    // to "&quot;" instead of twice to a quote character.
    string[,] entities =
    {
        { "&nbsp;", " " },
        { "&aring;", "" },
        { "&auml;", "" },
        { "&eacute;", "" },
        { "&iacute;", "" },
        { "&igrave;", "" },
        { "&ograve;", "" },
        { "&ouml;", "" },
        { "&quot;", "\"" },
        { "&szlig;", "" },
        { "&amp;", "&" }
    };
    for (int i = 0; i < entities.GetLength(0); i++)
    {
        parser.replaceEveryExact(entities[i, 0], entities[i, 1]);
    }

    // Sequential replacements; their order matters.
    // Drop a leading zero of numeric references so "&#039;" becomes "&#39;" ...
    parser.replaceEveryExact("&#0", "&#");
    // ... which is then decoded to an apostrophe.
    parser.replaceEveryExact("&#39;", "'");
    // Put a space in front of every closing tag so that words separated
    // only by tags do not run together once the tags are removed. Done in
    // two steps through the temporary marker "<~/" because the replacement
    // text would otherwise contain the search text itself.
    parser.replaceEveryExact("</", " <~/");
    parser.replaceEveryExact("<~/", "</");

    // Tags that stand for whitespace. The call is made unconditionally:
    // a guard that tests the untouched input with a case-sensitive search
    // would skip upper-case tags if replaceEvery matches case-insensitively
    // (NOTE(review): presumed from the replaceEvery/replaceEveryExact
    // naming - confirm against StringParser).
    string[] spacingTags = { "<br>", "<p>" };
    foreach (string tag in spacingTags)
    {
        parser.replaceEvery(tag, " ");
    }

    // Remove every remaining tag: copy the text between tags and skip
    // everything from '<' up to and including the next '>'.
    strString = parser.Content;
    System.Text.StringBuilder plain = new System.Text.StringBuilder(strString.Length);
    int pos = 0;
    int open;
    while ((open = strString.IndexOf('<', pos)) != -1)
    {
        plain.Append(strString, pos, open - pos);
        pos = open + 1;
        int close = strString.IndexOf('>', pos);
        if (close == -1)
        {
            // Unterminated tag: only the '<' itself is dropped, the text
            // after it is kept by the tail copy below.
            break;
        }
        pos = close + 1;
    }
    if (pos < strString.Length)
    {
        plain.Append(strString, pos, strString.Length - pos);
    }

    // Collapse double spaces into single ones and trim.
    parser.Content = plain.ToString();
    parser.replaceEveryExact("  ", " ");
    return parser.Content.Trim();
}
/// <summary>
/// Returns <paramref name="strString"/> with every script block removed,
/// i.e. everything from a <c>&lt;script</c> marker through the following
/// <c>&lt;/script&gt;</c> marker (both searched with the parser's
/// "NoCase" methods).
/// </summary>
/// <param name="strString">Markup to clean.</param>
/// <returns>
/// The markup without script blocks. If a script block is opened but its
/// closing marker cannot be found, the input is returned unchanged.
/// </returns>
public static string removeScripts(string strString)
{
    System.Text.StringBuilder kept = new System.Text.StringBuilder();
    string segment = "";
    StringParser parser = new StringParser(strString);

    // Each successful extraction yields the text in front of the next
    // script block; keep it, then skip past the end of that block.
    while (parser.extractToNoCase("<script", ref segment))
    {
        kept.Append(segment);
        if (!parser.skipToEndOfNoCase("</script>"))
        {
            // No end marker found: give up and hand back the original text.
            return strString;
        }
    }

    // Whatever follows the last script block.
    parser.extractToEnd(ref segment);
    kept.Append(segment);
    return kept.ToString();
}
/// <summary>
/// Returns <paramref name="strString"/> with every HTML comment removed,
/// i.e. everything from a <c>&lt;!--</c> marker through the following
/// <c>--&gt;</c> marker.
/// </summary>
/// <param name="strString">Markup to clean.</param>
/// <returns>
/// The markup without comments. If a comment is opened but its end marker
/// cannot be found, the input is returned unchanged.
/// </returns>
public static string removeComments(string strString)
{
    string kept = "";
    string chunk = "";
    StringParser scanner = new StringParser(strString);

    for (;;)
    {
        // Text in front of the next comment opener, if there is one.
        if (!scanner.extractTo("<!--", ref chunk))
        {
            break;
        }
        kept = kept + chunk;

        // Move past the end of the comment and look for the next one.
        if (scanner.skipToEndOf("-->"))
        {
            continue;
        }

        // No end marker found: hand back the original text.
        return strString;
    }

    // Append whatever follows the last comment.
    scanner.extractToEnd(ref chunk);
    return string.Concat(kept, chunk);
}