Пример #1
0
        // Single shared client: HttpClient is meant to be created once and reused. The previous
        // code created a new, never-disposed instance on every call.
        private static readonly HttpClient SharedHttpClient = new HttpClient();

        /// <summary>
        /// Downloads the page at <paramref name="url"/>, removes its scripts and publishes the
        /// text nodes found under <c>body</c> through the static <c>Text</c> / <c>Count</c> members.
        /// Text nodes whose content contains a line feed are skipped.
        /// </summary>
        /// <param name="url">
        /// Address of the page; it is passed through <c>Uri.UnescapeDataString</c> before the request.
        /// </param>
        /// <returns>A task that completes once <c>Text</c> and <c>Count</c> have been assigned.</returns>
        /// <remarks>
        /// The result is stored in static members, so overlapping calls overwrite each other's output.
        /// </remarks>
        public static async Task GetText(string url)
        {
            var    link     = System.Uri.UnescapeDataString(url);
            byte[] response = await SharedHttpClient.GetByteArrayAsync(link);

            // Decode the complete payload. The earlier "Length - 1" byte count silently dropped the
            // last byte of every page and threw on an empty response body.
            string source = Encoding.UTF8.GetString(response);

            // NOTE(review): entities are decoded before the markup is parsed, so escaped markup in
            // the page (e.g. "&lt;b&gt;") is parsed as real tags - confirm this is intended.
            source = WebUtility.HtmlDecode(source);

            HtmlDocument document = new HtmlDocument();
            document.LoadHtml(source);
            document = HtmlAgilityPackServices.RemoveScripts(document);

            // SelectNodes yields null (not an empty collection) when nothing matches.
            var textNodes = document.DocumentNode.SelectNodes("//body//text()");
            if (textNodes == null)
            {
                Text  = new string[0];
                Count = 0;
                return;
            }

            // Collect straight into an exactly-sized array instead of a fixed 2000-slot scratch
            // buffer, which overflowed on pages with more than 2000 matching text nodes.
            string[] lines = textNodes
                             .Select(node => node.InnerText)
                             .Where(line => !line.Contains("\n"))
                             .ToArray();

            Text  = lines;
            Count = lines.Length;
        }
Пример #2
0
        /// <summary>
        /// Loads the page referenced by <c>oldModel.Link</c> and, for every entry of
        /// <c>oldModel.Text</c> that occurs in the page's HTML, writes to the console the inner text
        /// of each element matched by the XPath <c>//*[contains(text(), entry)]</c>.
        /// </summary>
        /// <param name="oldModel">Model supplying the page link and the text fragments to look up.</param>
        /// <returns>
        /// Always <c>null</c>: the method currently only prints its findings and does not build a result model.
        /// </returns>
        public async Task <CXacDinhDongDLNDViewModel> KetNoiThanhCumDuLieuAsync(CXacDinhDongDLNDViewModel oldModel)
        {
            HtmlDocument doc = await HtmlAgilityPackServices.InitAsync(oldModel.Link);

            // The document is not modified below, so serialise it once instead of once per fragment.
            string html = doc.DocumentNode.InnerHtml;

            foreach (string fragment in oldModel.Text)
            {
                // A null fragment would make Contains throw; an empty one matches every element.
                if (string.IsNullOrEmpty(fragment) || !html.Contains(fragment))
                {
                    continue;
                }

                // The fragment is quoted as a proper XPath literal: splicing it between single
                // quotes produced an invalid expression whenever the text contained an apostrophe.
                var matches = doc.DocumentNode.SelectNodes("//*[contains(text()," + ToXPathLiteral(fragment) + ")]");

                // SelectNodes yields null when nothing matches, e.g. when the fragment was only
                // found inside an attribute value or spans several nodes.
                if (matches == null)
                {
                    continue;
                }

                foreach (var node in matches)
                {
                    Console.WriteLine(node.InnerText);
                }
            }
            return(null);
        }

        // Quotes a string as an XPath 1.0 literal; XPath has no escape character, so a value
        // containing both quote kinds is assembled with concat().
        private static string ToXPathLiteral(string value)
        {
            if (value.IndexOf('\'') < 0)
            {
                return "'" + value + "'";
            }
            if (value.IndexOf('"') < 0)
            {
                return "\"" + value + "\"";
            }

            // Split on apostrophes and re-join the pieces with a double-quoted apostrophe literal.
            string[] pieces = value.Split('\'');
            return "concat('" + string.Join("', \"'\", '", pieces) + "')";
        }
        /// <summary>
        /// Loads the page at <paramref name="url"/> and returns its HTML after the style attributes,
        /// scripts and all remaining attributes have been removed by the
        /// <c>HtmlAgilityPackServices</c> helpers.
        /// </summary>
        /// <param name="url">Address of the page to load.</param>
        /// <returns>
        /// The inner HTML of the cleaned document, or <c>null</c> when loading or cleaning fails.
        /// </returns>
        public static async Task <string> formatHtml(string url)
        {
            try
            {
                HtmlDocument document = await HtmlAgilityPackServices.InitAsync(url);

                document = HtmlAgilityPackServices.RemoveStyleAttributes(document);
                document = HtmlAgilityPackServices.RemoveScripts(document);
                document = HtmlAgilityPackServices.RemoveAllAttributes(document);

                // TODO: stripping HTML comment nodes was drafted here at some point but never enabled.
                return document.DocumentNode.InnerHtml;
            }
            catch (Exception e)
            {
                // Callers rely on null as the failure signal, so keep that contract, but record the
                // error instead of discarding it in an unused local.
                System.Diagnostics.Trace.TraceError("formatHtml failed for '{0}': {1}", url, e);
                return null;
            }
        }
Пример #4
0
        /// <summary>
        /// Extracts the text of the page referenced by <c>Search.Link</c> via
        /// <c>ParserExtension.GetText</c> and wraps it, together with the search title and link,
        /// in a <c>ParserViewModel</c>.
        /// </summary>
        /// <param name="Search">Search result supplying the title and the link of the page to parse.</param>
        /// <returns>The populated view model, or <c>null</c> when the page cannot be parsed.</returns>
        /// <remarks>
        /// The text is read back from the static <c>ParserExtension.Text</c> member after the call
        /// to <c>GetText</c>.
        /// NOTE(review): if that member is shared between overlapping calls they can observe each
        /// other's result - confirm against ParserExtension.
        /// </remarks>
        public static async Task <ParserViewModel> getParserAsync(SearchViewModel Search)
        {
            try
            {
                // The page used to be downloaded and parsed here as well, only to fill locals that
                // nothing read; the unguarded SelectNodes(".//div").ToList() on that copy threw on
                // pages without a div and turned them into a null result. GetText performs the one
                // fetch that is actually used.
                await ParserExtension.GetText(Search.Link);

                string[] text = ParserExtension.Text;
                return new ParserViewModel(Search.Title, Search.Link, text);
            }
            catch (Exception e)
            {
                // Callers rely on null as the failure signal, so keep that contract, but record the
                // error instead of discarding it in an unused local.
                System.Diagnostics.Trace.TraceError("getParserAsync failed: {0}", e);
                return null;
            }
        }