/// <summary>
/// "Best" method: loads the page at the URL in <c>txtURL</c> with
/// HtmlAgilityPack's <c>HtmlWeb</c>, removes unwanted elements from the
/// parsed document, strips the remaining markup and passes the resulting
/// text to <c>CleanText</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button4_Click(object sender, EventArgs e)
{
    // Nothing to do without a URL; a whitespace-only value would only make the load fail.
    if (string.IsNullOrWhiteSpace(txtURL.Text))
    {
        return;
    }

    var url = txtURL.Text;
    var web = new HtmlWeb(); // HtmlAgilityPack (1.6.12) loader
    var doc = web.Load(url);

    // Elements to drop before extracting text.
    // NOTE(review): the original comment listed "head" here, but the XPath selects
    // "header" (plus "Template" and "core") - confirm which one is intended.
    var nodes = doc.DocumentNode.SelectNodes("//script|//style|//meta|//header|//title|//footer|//Template|//core");

    // SelectNodes returns null (not an empty collection) when nothing matches.
    if (nodes != null)
    {
        foreach (var node in nodes)
        {
            node.ParentNode.RemoveChild(node);
        }
    }

    Regex rRemScript = new Regex(@"<script[^>]*>[\s\S]*?</script>");

    // HtmlRemoval is a helper class defined elsewhere in the project.
    string TXT = HtmlRemoval.StripTagsRegex(doc.DocumentNode.OuterHtml);

    // NOTE(review): this runs after tag stripping, so if StripTagsRegex removes every
    // tag there is nothing left for this pattern to match - confirm the intended order.
    TXT = rRemScript.Replace(TXT, " ").Trim();

    CleanText(TXT);
}
/// <summary>
/// "Better" method: downloads the page at the URL in <c>txtURL</c> with
/// <see cref="WebClient"/>, parses it with HtmlAgilityPack, removes unwanted
/// elements, strips the remaining markup and passes the resulting text to
/// <c>CleanText</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button2_Click(object sender, EventArgs e)
{
    // Check the input first so no WebClient is created when there is nothing to fetch.
    if (string.IsNullOrWhiteSpace(txtURL.Text))
    {
        return;
    }

    string htmlCode;
    using (WebClient client = new WebClient())
    {
        htmlCode = client.DownloadString(txtURL.Text);
    }

    // HtmlAgilityPack (1.6.12) document; the type name is fully qualified as in the original.
    HtmlAgilityPack.HtmlDocument doc1 = new HtmlAgilityPack.HtmlDocument();
    doc1.LoadHtml(htmlCode);

    // Remove the elements (nodes): script, style, meta, head, title, footer.
    var nodes = doc1.DocumentNode.SelectNodes("//script|//style|//meta|//head|//title|//footer");

    // SelectNodes returns null (not an empty collection) when nothing matches.
    if (nodes != null)
    {
        foreach (var node in nodes)
        {
            node.ParentNode.RemoveChild(node);
        }
    }

    // Drop any script blocks still present in the serialized markup, then strip the tags.
    Regex rRemScript = new Regex(@"<script[^>]*>[\s\S]*?</script>");
    string TXT = rRemScript.Replace(doc1.DocumentNode.OuterHtml, "");
    TXT = HtmlRemoval.StripTagsRegex(TXT);

    CleanText(TXT);
}
/// <summary>
/// "Bad" method: fetches the page at the URL in <c>txtURL</c> with
/// <see cref="HttpWebRequest"/>, strips markup from the raw response text
/// using regular expressions only, and passes the result to <c>CleanText</c>.
/// Does nothing when the URL is blank or the response status is not 200 OK.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
public void button1_Click(object sender, EventArgs e)
{
    if (string.IsNullOrWhiteSpace(txtURL.Text))
    {
        return;
    }

    string urlAddress = txtURL.Text;
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(urlAddress);

    string data;

    // using blocks release the response and its stream on every path,
    // including a non-OK status and exceptions thrown while reading.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        if (response.StatusCode != HttpStatusCode.OK)
        {
            return;
        }

        // Use the reader's default encoding when the server reports no character set;
        // Encoding.GetEncoding would throw on an empty name.
        string charset = response.CharacterSet;
        Encoding encoding = string.IsNullOrEmpty(charset) ? null : Encoding.GetEncoding(charset);

        using (Stream receiveStream = response.GetResponseStream())
        using (StreamReader readStream = encoding == null
            ? new StreamReader(receiveStream)
            : new StreamReader(receiveStream, encoding))
        {
            data = readStream.ReadToEnd();
        }
    }

    // HtmlRemoval is a helper class defined elsewhere in the project.
    data = HtmlRemoval.StripTagsRegex(data);

    // NOTE(review): this runs after tag stripping, so if StripTagsRegex removes every
    // tag there is nothing left for this pattern to match - confirm the intended order.
    Regex rRemScript = new Regex(@"<script[^>]*>[\s\S]*?</script>");
    data = rRemScript.Replace(data, "");

    CleanText(data);
}