Exemplo n.º 1
0
        // One client reused by every GetText call; creating a new HttpClient per request
        // leaks sockets because the instances were never disposed.
        private static readonly HttpClient SharedHttpClient = new HttpClient();

        /// <summary>
        /// Downloads the page at <paramref name="url"/>, removes its scripts and stores every
        /// body text fragment that contains no line feed in <c>Text</c>, with the number of
        /// stored fragments in <c>Count</c>.
        /// </summary>
        /// <param name="url">Page address; percent-escapes are unescaped before the request is sent.</param>
        /// <returns>A task that completes once <c>Text</c> and <c>Count</c> have been updated.</returns>
        public static async Task GetText(string url)
        {
            var link     = System.Uri.UnescapeDataString(url);
            var response = await SharedHttpClient.GetByteArrayAsync(link);

            // Decode the complete payload. The previous "Length - 1" dropped the last byte
            // and threw on an empty response.
            String source = Encoding.UTF8.GetString(response);

            // NOTE(review): entities are decoded before parsing, so escaped markup such as
            // "&lt;b&gt;" becomes real tags. Kept as-is to preserve existing output; confirm intent.
            source = WebUtility.HtmlDecode(source);

            HtmlDocument document = new HtmlDocument();
            document.LoadHtml(source);
            document = HtmlAgilityPackServices.RemoveScripts(document);

            // SelectNodes yields null (not an empty collection) when nothing matches.
            var nodes = document.DocumentNode.SelectNodes("//body//text()");
            if (nodes == null)
            {
                Text  = new string[0];
                Count = 0;
                return;
            }

            // Keep only fragments without a line feed; no fixed-size scratch buffer, so pages
            // with more than 2000 text nodes no longer overflow.
            string[] lines = nodes.Select(node => node.InnerText)
                                  .Where(line => !line.Contains("\n"))
                                  .ToArray();

            Text  = lines;
            Count = lines.Length;
        }
        /// <summary>
        /// Loads the document for <paramref name="url"/> through
        /// <c>HtmlAgilityPackServices.InitAsync</c>, passes it through the
        /// <c>RemoveStyleAttributes</c>, <c>RemoveScripts</c> and <c>RemoveAllAttributes</c>
        /// helpers in that order, and returns the inner HTML of the resulting document node.
        /// </summary>
        /// <param name="url">Address handed to <c>HtmlAgilityPackServices.InitAsync</c>.</param>
        /// <returns>
        /// The inner HTML of the processed document, or <c>null</c> when any step throws.
        /// </returns>
        public static async Task <string> formatHtml(string url)
        {
            try
            {
                HtmlDocument document = await HtmlAgilityPackServices.InitAsync(url);

                document = HtmlAgilityPackServices.RemoveStyleAttributes(document);
                document = HtmlAgilityPackServices.RemoveScripts(document);
                document = HtmlAgilityPackServices.RemoveAllAttributes(document);

                return document.DocumentNode.InnerHtml;
            }
            catch (Exception e)
            {
                // Callers receive null on failure (existing contract), but the exception is
                // at least reported instead of being copied into an unused local and dropped.
                System.Diagnostics.Debug.WriteLine(e);
                return null;
            }
        }