/// <summary>
/// Downloads the page at <paramref name="url"/>, removes its scripts, and stores the
/// text fragments found under the body element in <c>Text</c>, with their number in <c>Count</c>.
/// </summary>
/// <remarks>
/// Text nodes whose content contains a line-feed character are skipped.
/// When the page has no matching text nodes, <c>Text</c> becomes an empty array and <c>Count</c> is 0.
/// </remarks>
/// <param name="url">Address of the page; it is URI-unescaped before the request is sent.</param>
public static async Task GetText(string url)
{
    var link = System.Uri.UnescapeDataString(url);

    byte[] response;
    using (HttpClient http = new HttpClient())
    {
        response = await http.GetByteArrayAsync(link);
    }

    // Decode the complete payload. The previous length of "response.Length - 1" dropped the
    // final byte (possibly cutting a multi-byte character) and threw on an empty response.
    String source = Encoding.UTF8.GetString(response, 0, response.Length);

    // NOTE(review): entities are decoded before the markup is parsed, so escaped markup in the
    // page text is parsed as real tags. Kept as-is to preserve output; confirm this is intended.
    source = WebUtility.HtmlDecode(source);

    HtmlDocument document = new HtmlDocument();
    document.LoadHtml(source);
    document = HtmlAgilityPackServices.RemoveScripts(document);

    // SelectNodes yields null (not an empty collection) when nothing matches.
    var textNodes = document.DocumentNode.SelectNodes("//body//text()");

    string[] lines = textNodes == null
        ? new string[0]
        : textNodes.Select(node => node.InnerText)
                   .Where(line => !line.Contains("\n"))
                   .ToArray();

    // Sized from the actual result, so pages with more than 2000 fragments no longer overflow.
    Count = lines.Length;
    Text = lines;
}
/// <summary>
/// Loads the page referenced by <c>oldModel.Link</c> and, for every entry of
/// <c>oldModel.Text</c> that occurs in the page's HTML, writes to the console the inner text
/// of each element whose text contains that entry.
/// </summary>
/// <param name="oldModel">Model supplying the page link and the text entries to look for.</param>
/// <returns>Always <c>null</c>; the method currently only produces console output.</returns>
public async Task<CXacDinhDongDLNDViewModel> KetNoiThanhCumDuLieuAsync(CXacDinhDongDLNDViewModel oldModel)
{
    HtmlDocument doc = await HtmlAgilityPackServices.InitAsync(oldModel.Link);

    // The document is not modified below, so its serialized HTML is read once.
    string pageHtml = doc.DocumentNode.InnerHtml;

    foreach (string fragment in oldModel.Text)
    {
        if (!pageHtml.Contains(fragment))
        {
            continue;
        }

        string query = "//*[contains(text()," + ToXPathLiteral(fragment) + ")]";
        var matches = doc.DocumentNode.SelectNodes(query);

        // SelectNodes yields null when no element matches, e.g. when the fragment only
        // occurs inside an attribute or spans several elements.
        if (matches == null)
        {
            continue;
        }

        foreach (var match in matches)
        {
            Console.WriteLine(match.InnerText);
        }
    }

    return null;
}

/// <summary>
/// Quotes <paramref name="value"/> as an XPath 1.0 string literal, using <c>concat()</c>
/// when the value contains both single and double quotes.
/// </summary>
private static string ToXPathLiteral(string value)
{
    if (!value.Contains("'"))
    {
        return "'" + value + "'";
    }

    if (!value.Contains("\""))
    {
        return "\"" + value + "\"";
    }

    // XPath 1.0 has no escape character: stitch single-quoted pieces together with
    // double-quoted apostrophes in between.
    string[] pieces = value.Split('\'').Select(piece => "'" + piece + "'").ToArray();
    return "concat(" + string.Join(", \"'\", ", pieces) + ")";
}
/// <summary>
/// Loads the page at <paramref name="url"/>, passes it through the style-attribute, script and
/// all-attribute removal helpers, and returns the resulting inner HTML of the document.
/// </summary>
/// <param name="url">Address of the page to load.</param>
/// <returns>The cleaned HTML, or <c>null</c> when loading or cleaning throws.</returns>
public static async Task<string> formatHtml(string url)
{
    try
    {
        HtmlDocument document = await HtmlAgilityPackServices.InitAsync(url);
        document = HtmlAgilityPackServices.RemoveStyleAttributes(document);
        document = HtmlAgilityPackServices.RemoveScripts(document);
        document = HtmlAgilityPackServices.RemoveAllAttributes(document);

        return document.DocumentNode.InnerHtml;
    }
    catch (Exception e)
    {
        // Callers still receive null on failure, but the cause is now recorded
        // instead of being discarded.
        System.Diagnostics.Trace.TraceError("formatHtml failed for '{0}': {1}", url, e);
        return null;
    }
}
/// <summary>
/// Extracts the text of the page referenced by <paramref name="Search"/> and wraps it,
/// together with the search result's title and link, in a <c>ParserViewModel</c>.
/// </summary>
/// <remarks>
/// The earlier version also downloaded the page a second time and selected all of its
/// <c>div</c> elements without using them. That made the method return <c>null</c> for any
/// page without a <c>div</c>. The unused work, and the commented-out heading-hierarchy
/// experiment that went with it, have been removed.
/// NOTE(review): the text is handed over through the static <c>ParserExtension.Text</c>
/// member, so overlapping calls may overwrite each other's result; verify against callers.
/// </remarks>
/// <param name="Search">Search result supplying the title and the link of the page to parse.</param>
/// <returns>The populated view model, or <c>null</c> when extraction throws.</returns>
public static async Task<ParserViewModel> getParserAsync(SearchViewModel Search)
{
    try
    {
        await ParserExtension.GetText(Search.Link);
        string[] text = ParserExtension.Text;

        return new ParserViewModel(Search.Title, Search.Link, text);
    }
    catch (Exception e)
    {
        // Callers still receive null on failure, but the cause is now recorded
        // instead of being discarded.
        System.Diagnostics.Trace.TraceError("getParserAsync failed: {0}", e);
        return null;
    }
}