/// <summary>
/// Downloads the page at <paramref name="url"/>, parses it as HTML and publishes the
/// inner text of every text node under <c>body</c> that does not contain a line feed.
/// </summary>
/// <remarks>
/// Results are written to the <c>Text</c> and <c>Count</c> members rather than returned:
/// <c>Text</c> receives the collected strings and <c>Count</c> their number.
/// When the document has no matching text nodes, <c>Text</c> is an empty array and
/// <c>Count</c> is 0.
/// </remarks>
/// <param name="url">Address of the page; it is URI-unescaped before the request is made.</param>
/// <returns>A task that completes once <c>Text</c> and <c>Count</c> have been updated.</returns>
public static async Task GetText(string url)
{
    var link = System.Uri.UnescapeDataString(url);

    // Dispose the client once the payload has been read so its handler is not leaked.
    byte[] response;
    using (HttpClient http = new HttpClient())
    {
        response = await http.GetByteArrayAsync(link);
    }

    // Decode the whole payload: the previous "Length - 1" dropped the final byte and
    // threw on an empty response.
    string source = Encoding.UTF8.GetString(response, 0, response.Length);

    // NOTE(review): entities are decoded before parsing, so an escaped "&lt;" becomes
    // real markup for the parser. Kept as-is to preserve the existing output; confirm
    // whether decoding should happen per text node instead.
    source = WebUtility.HtmlDecode(source);

    HtmlDocument document = new HtmlDocument();
    document.LoadHtml(source);
    document = HtmlAgilityPackServices.RemoveScripts(document);

    // SelectNodes yields null (not an empty collection) when nothing matches.
    var textNodes = document.DocumentNode.SelectNodes("//body//text()");

    // No fixed-size scratch buffer: pages with more than 2000 text nodes used to
    // overflow the temporary array.
    string[] lines = textNodes == null
        ? new string[0]
        : textNodes.Select(node => node.InnerText)
                   .Where(line => !line.Contains("\n"))
                   .ToArray();

    Text = lines;
    Count = lines.Length;
}
/// <summary>
/// Loads the document for <paramref name="url"/> through
/// <c>HtmlAgilityPackServices.InitAsync</c>, runs it through the
/// <c>RemoveStyleAttributes</c>, <c>RemoveScripts</c> and <c>RemoveAllAttributes</c>
/// helpers in that order, and returns the resulting inner HTML of the document node.
/// </summary>
/// <param name="url">Address handed to <c>HtmlAgilityPackServices.InitAsync</c>.</param>
/// <returns>
/// The processed inner HTML, or <c>null</c> when any step throws. Failures are reported
/// to the debug output instead of being propagated.
/// </returns>
public static async Task <string> formatHtml(string url)
{
    try
    {
        HtmlDocument document = await HtmlAgilityPackServices.InitAsync(url);
        document = HtmlAgilityPackServices.RemoveStyleAttributes(document);
        document = HtmlAgilityPackServices.RemoveScripts(document);
        document = HtmlAgilityPackServices.RemoveAllAttributes(document);

        return document.DocumentNode.InnerHtml;
    }
    catch (Exception e)
    {
        // Best-effort contract: callers receive null on failure, but the exception is
        // surfaced in the debug output rather than silently discarded.
        System.Diagnostics.Debug.WriteLine(e);
    }

    return null;
}