/// <summary>
/// Downloads the page at <paramref name="url"/> and parses it into an HTML document.
/// The request is sent with a fixed desktop Firefox user-agent string.
/// </summary>
/// <param name="url">Address of the page to load.</param>
/// <returns>The document produced by <c>HtmlWeb.Load</c> for the given address.</returns>
public HtmlDocument ReadLink(string url)
{
    HtmlAgilityPack.HtmlWeb browser = new HtmlAgilityPack.HtmlWeb
    {
        UserAgent = "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0"
    };

    return browser.Load(url);
}
/// <summary>
/// Post-processes the HTML file at <c>OutputDirecroy</c>: strips bullets, rebuilds lists,
/// replaces every image source with the value returned by <c>MakeImageSrcData</c>
/// (per the original comment, a base64 data URI), saves the result back to the same
/// path and opens it into <c>CreateddFile</c>.
/// </summary>
/// <remarks>
/// The method returns nothing; the opened stream is exposed through <c>CreateddFile</c>.
/// </remarks>
private void CheckAndConvertImage()
{
    HtmlWeb hw = new HtmlWeb();
    HtmlAgilityPack.HtmlDocument htmlDoc = hw.Load(OutputDirecroy);

    if (htmlDoc.DocumentNode != null)
    {
        // clearing bullets
        htmlDoc.DocumentNode.InnerHtml = ClearFromBullets(htmlDoc.DocumentNode.InnerHtml);

        // create normal lists
        htmlDoc = CreateLists(htmlDoc);

        // images to base64; SelectNodes yields null (not an empty list) when nothing matches,
        // so evaluate the XPath once and reuse the result.
        HtmlAgilityPack.HtmlNodeCollection images = htmlDoc.DocumentNode.SelectNodes("//img");
        if (images != null)
        {
            foreach (HtmlNode image in images)
            {
                HtmlAgilityPack.HtmlAttribute srcAttribute = image.Attributes["src"];
                if (srcAttribute == null)
                {
                    // An <img> without src has nothing to convert; skipping avoids a NullReferenceException.
                    continue;
                }

                String currSrc = srcAttribute.Value.Replace("%20", " ");
                if (currSrc.Contains("file:"))
                {
                    // Full path: drops the first 8 characters (the length of "file:///").
                    currSrc = currSrc.Substring(8);
                }
                else
                {
                    // Partial path: resolve against the conversion output route.
                    currSrc = String.Format("{1}{0}", currSrc, globalRoutes.OutputConvertRoute);
                }

                srcAttribute.Value = MakeImageSrcData(currSrc);
            }
        }
    }

    if (htmlDoc.DocumentNode != null)
    {
        // Re-parse the inner HTML into a fresh document before saving.
        var t = new HtmlDocument();
        t.LoadHtml(htmlDoc.DocumentNode.InnerHtml);
        t.Save(OutputDirecroy);
    }
    else
    {
        htmlDoc.Save(OutputDirecroy);
    }

    WhaitFileFree(OutputDirecroy);
    CreateddFile = new FileStream(OutputDirecroy, FileMode.Open);
}
/// <summary>
/// Places the generated quote block at the top of the message being edited.
/// TODO: each subsequent quote should be placed below the previous one.
/// </summary>
/// <param name="item">HTML markup of the quote block to insert.</param>
/// <param name="body">HTML of the message the quote is inserted into.</param>
/// <returns>The outer HTML of the whole document after the quote has been prepended.</returns>
private string QuoteInsert(string item, string body)
{
    HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(body);

    // SelectSingleNode returns null when the markup has no <body> element (e.g. an HTML
    // fragment); fall back to the document root instead of failing on a null reference.
    HtmlNode container = doc.DocumentNode.SelectSingleNode("//body") ?? doc.DocumentNode;

    HtmlNode newNode = HtmlNode.CreateNode(item);

    // Prepend the empty spacer paragraph first so that the quote, prepended second,
    // ends up above it.
    container.PrependChild(HtmlNode.CreateNode("<p class=MsoNormal><o:p></o:p></p>"));
    container.PrependChild(newNode);

    return doc.DocumentNode.OuterHtml;
}
/// <summary>
/// Extracts the first table from the page currently shown in <c>webBrowser1</c> and parses it.
/// </summary>
/// <remarks>
/// The slice runs from the first "&lt;TABLE" to the first "&lt;/TABLE&gt;" that follows it,
/// so a nested table ends the slice at the inner closing tag. Tag matching ignores case.
/// </remarks>
/// <returns>
/// A document holding the table markup, or an empty document when the browser has no
/// body or the body contains no table.
/// </returns>
private HtmlDocument parsujHtml()
{
    const string openTag = "<TABLE";
    const string closeTag = "</TABLE>";

    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();

    if (webBrowser1.Document == null || webBrowser1.Document.Body == null)
    {
        return doc;
    }

    string html = webBrowser1.Document.Body.InnerHtml;
    if (string.IsNullOrEmpty(html))
    {
        return doc;
    }

    // Ordinal, case-insensitive search: tag casing depends on the rendering engine and
    // must not depend on the current culture.
    int start = html.IndexOf(openTag, System.StringComparison.OrdinalIgnoreCase);
    if (start < 0)
    {
        return doc;
    }

    html = html.Substring(start);

    int end = html.IndexOf(closeTag, System.StringComparison.OrdinalIgnoreCase);
    if (end >= 0)
    {
        // Substring(0, n) is valid even when the closing tag is the very end of the string,
        // unlike Remove(n), which rejects n == Length on .NET Framework.
        html = html.Substring(0, end + closeTag.Length);
    }

    doc.LoadHtml(html);
    return doc;
}
/// <summary>
/// Writes the supplied bytes to a temporary HTML file, cleans the markup, extracts every
/// embedded base64 image into a separate .bmp file and saves the resulting HTML.
/// </summary>
/// <param name="input">Path the processed HTML is saved to.</param>
/// <param name="fileStream">Raw bytes of the HTML file to process.</param>
/// <param name="inputName">File name (without extension) used for the temporary copy.</param>
/// <param name="inputExtension">Extension used for the temporary copy.</param>
private void ParseHtmlAndCreateInages(String input, Byte[] fileStream, String inputName, String inputExtension)
{
    const String base64Marker = "base64,";

    String inputTemp = String.Format("{2}{0}.{1}", inputName, inputExtension, globalRoutes.DiffTempRoute);

    // 'using' guarantees the handle is released even when Write throws.
    using (FileStream wFile = new FileStream(inputTemp, FileMode.Create))
    {
        wFile.Write(fileStream, 0, fileStream.Length);
    }

    WhaitFileFree(inputTemp);

    HtmlWeb hw = new HtmlWeb();
    HtmlAgilityPack.HtmlDocument htmlDoc = hw.Load(inputTemp);

    if (htmlDoc.DocumentNode != null)
    {
        htmlDoc.DocumentNode.InnerHtml = ClearFromGarbage(htmlDoc.DocumentNode.InnerHtml);

        // SelectNodes yields null when nothing matches; evaluate the XPath only once.
        HtmlAgilityPack.HtmlNodeCollection images = htmlDoc.DocumentNode.SelectNodes("//img");
        if (images != null)
        {
            foreach (HtmlNode image in images)
            {
                HtmlAgilityPack.HtmlAttribute srcAttribute = image.Attributes["src"];
                if (srcAttribute == null)
                {
                    continue;
                }

                String currSrc = srcAttribute.Value;

                // Locate the payload after the "base64," marker instead of assuming a fixed
                // 22-character prefix: that length only fits "data:image/png;base64," and
                // corrupts the payload for other MIME types such as image/jpeg.
                int markerIndex = currSrc.IndexOf(base64Marker, StringComparison.OrdinalIgnoreCase);
                if (markerIndex < 0)
                {
                    continue;
                }

                String imgName = Path.GetRandomFileName();
                String newImageName = globalRoutes.InputConvertRoute + imgName + ".bmp";
                SaveByteArrayAsImage(newImageName, currSrc.Substring(markerIndex + base64Marker.Length));

                // The document now references the extracted file by its bare name.
                srcAttribute.Value = imgName + ".bmp";
            }
        }
    }

    if (htmlDoc.DocumentNode != null)
    {
        // Re-parse the inner HTML into a fresh document before saving.
        var t = new HtmlDocument();
        t.LoadHtml(htmlDoc.DocumentNode.InnerHtml);
        t.Save(input);
    }
    else
    {
        htmlDoc.Save(input);
    }
}
/// <summary>
/// Loads the page whose URL is typed into <c>txtNhap</c> and fills the song title,
/// artist and media link text boxes from it.
/// </summary>
/// <param name="sender">Control that raised the click.</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    // URL of the web page whose HTML content should be fetched.
    string _url = txtNhap.Text;

    // HtmlWeb can be thought of as a virtual browser.
    HtmlAgilityPack.HtmlWeb htmlWeb = new HtmlAgilityPack.HtmlWeb();

    // The user-agent string tells the server which browser is making the request;
    // useful for choosing between the mobile and desktop versions of a page.
    // List of user-agent strings: http://www.useragentstring.com/pages/useragentstring.php
    htmlWeb.UserAgent = "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0";

    try
    {
        // Document holding the downloaded HTML.
        HtmlAgilityPack.HtmlDocument htmlDoc = htmlWeb.Load(_url);

        // Heading expected in the form "<song>-<artist>".
        HtmlNode titleNode = htmlDoc.DocumentNode.SelectSingleNode("//h1[@class='viewtitle']");
        if (titleNode == null)
        {
            // SelectSingleNode returns null when the page has no such heading; report that
            // explicitly rather than surfacing a NullReferenceException message.
            MessageBox.Show("The page does not contain a title element (h1.viewtitle).");
            return;
        }

        string[] titleString = titleNode.InnerHtml.Split('-');
        txtBaiHat.Text = titleString[0];
        txtCaSi.Text = titleString.Length > 1 ? titleString[1] : string.Empty;

        // SelectNodes returns null, not an empty collection, when nothing matches.
        var nodes = htmlDoc.DocumentNode.SelectNodes("//script[@type='text/javascript' and contains(.,'$(document).ready')]");
        if (nodes == null)
        {
            MessageBox.Show("The page does not contain the expected player script.");
            return;
        }

        foreach (HtmlNode node in nodes)
        {
            // The link is read positionally: 4th '{'-separated chunk, then the 4th
            // '"'-separated token inside it. Scripts with a different shape are skipped.
            string[] chuoi = node.InnerText.Split('{');
            if (chuoi.Length <= 3)
            {
                continue;
            }

            string[] ch = chuoi[3].Split('"');
            if (ch.Length <= 3)
            {
                continue;
            }

            txtLinkNhac.Text = ch[3];
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.Message);
    }
}
/// <summary>
/// Reads the authorization code from the page shown in <c>webBrowser1</c> and, when it
/// contains "TOP-", closes the dialog with <c>DialogResult.OK</c>.
/// </summary>
/// <param name="sender">Control that raised the click.</param>
/// <param name="e">Event data (unused).</param>
private void btnOk_Click(object sender, EventArgs e)
{
    try
    {
        HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(webBrowser1.DocumentText);

        HtmlAgilityPack.HtmlNodeCollection codeInputs =
            doc.DocumentNode.SelectNodes("/html/body/div[@class='content']/div[2]/div[@class='copy-code']/input");
        if (codeInputs == null)
        {
            // SelectNodes returns null when the page has no code field yet; leave the
            // dialog open instead of failing on a null reference.
            return;
        }

        foreach (HtmlNode value in codeInputs)
        {
            // The code is read from the second attribute of the <input> by position;
            // elements with fewer attributes are skipped.
            if (value.Attributes.Count > 1)
            {
                AuthrizeCode = value.Attributes[1].Value;
            }
        }

        if (!string.IsNullOrEmpty(AuthrizeCode) && AuthrizeCode.IndexOf("TOP-", StringComparison.Ordinal) >= 0)
        {
            this.DialogResult = DialogResult.OK;
            this.Close();
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.Message);
    }
}
/// <summary>
/// Loads <paramref name="url360"/>, locates the node addressed by <paramref name="xpath"/>
/// and appends a line describing it to <c>textBox1</c>. While the node cannot be found the
/// page is requested again through <c>SetHttpWebRequest</c> until a response contains it.
/// </summary>
/// <param name="url360">Address of the page to inspect.</param>
/// <param name="xpath">XPath of the node holding the value of interest.</param>
/// <param name="title">Text placed before the source name in the log line.</param>
/// <param name="text">Text placed after the source name in the log line.</param>
/// <param name="state">Selects how the located node is turned into the logged value.</param>
/// <remarks>
/// NOTE(review): the retry loop has no upper bound; it only ends once a response
/// contains the node.
/// </remarks>
public void GetManager(string url360, string xpath, string title, string text, State state)
{
    try
    {
        HtmlWeb webClient = new HtmlWeb();
        webClient.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36";
        HtmlDocument doc1 = webClient.Load(url360);

        // Remember which document finally contained the node so the Entry branch can
        // count matches in it.
        HtmlAgilityPack.HtmlDocument matchedDocument = doc1;

        // Example xpath: //*[@id="pl_user_feedList"]/div[1]/div[2]/p[2]/span[2]/a
        HtmlNode node = doc1.DocumentNode.SelectSingleNode(xpath);

        while (node == null)
        {
            Proxy ip = new Proxy();
            try
            {
                HttpWebRequest httpRequest = (HttpWebRequest)HttpWebRequest.Create(url360);
                // presumably applies the proxy held in 'ip' to the request — confirm in SetHttpWebRequest
                httpRequest = SetHttpWebRequest(httpRequest, ref ip);

                using (HttpWebResponse rs = (HttpWebResponse)httpRequest.GetResponse())
                using (System.IO.StreamReader sr = new StreamReader(rs.GetResponseStream(), System.Text.Encoding.GetEncoding("utf-8")))
                {
                    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
                    doc.Load(sr);
                    node = doc.DocumentNode.SelectSingleNode(xpath);

                    if (node == null)
                    {
                        // Node still missing: report this attempt and prepare the next one.
                        ChangeState(ip);
                        Gettip();
                    }
                    else
                    {
                        matchedDocument = doc;
                        name = "启用代理IP";
                    }
                }
            }
            catch (Exception ex)
            {
                ChangeState(ip);
                Gettip();
                this.Invoke(new EventHandler(delegate { textBox1.AppendText(num + title + name + text + ":" + ex.Message + "\r\n"); }));
            }
        }

        // All control updates go through Invoke, including label1, which was previously
        // assigned directly while every other UI write in this method was marshalled.
        if (state == State.Quantity)
        {
            this.Invoke(new EventHandler(delegate
            {
                label1.Text = num.ToString();
                textBox1.AppendText(num + title + name + text + ":" + GetNumber(node.InnerHtml) + "\r\n");
            }));
        }
        else if (state == State.Fan)
        {
            this.Invoke(new EventHandler(delegate
            {
                label1.Text = num.ToString();
                textBox1.AppendText(num + title + name + text + ":" + PrintNumber(node.InnerText) + "\r\n");
            }));
        }
        else if (state == State.Entry)
        {
            // The node collection was never populated before, so this branch always failed
            // on a null reference.
            // NOTE(review): counting every node matched by xpath is the assumed intent — confirm.
            HtmlNodeCollection htmlNodes = matchedDocument.DocumentNode.SelectNodes(xpath);
            int entryCount = htmlNodes == null ? 0 : htmlNodes.Count;

            this.Invoke(new EventHandler(delegate
            {
                label1.Text = num.ToString();
                textBox1.AppendText(num + title + name + text + ":" + entryCount + "\r\n");
            }));
        }
        else
        {
            this.Invoke(new EventHandler(delegate { textBox1.AppendText("信息有问题\r\n"); }));
        }
    }
    catch (Exception ex)
    {
        this.Invoke(new EventHandler(delegate { textBox1.AppendText(ex.Message + "\r\n"); }));
    }
}
/// <summary>
/// Runs when the embedded browser finishes loading a document. Shows the final URL,
/// optionally tags scrape-able elements with sequential ids (in the live browser DOM via
/// TabIndex and in a parsed copy via the "arachnode_scraper_id" attribute), rebuilds the
/// element tree view and refreshes the highlighted source view.
/// </summary>
/// <param name="sender">The browser control raising the event.</param>
/// <param name="e">Carries the URL of the completed document.</param>
private void wbBrowser_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
    try
    {
        tbAbsoluteUri.Text = e.Url.AbsoluteUri;

        if (_assignScraperIDs)
        {
            _assignScraperIDs = false;

            // Pass 1: live browser elements get a click handler and a sequential TabIndex.
            short nextScraperId = 1;
            int browserMatches = 0;

            foreach (HtmlElement element in wbBrowser.Document.All)
            {
                if (!IsScraperTargetTag(element.TagName))
                {
                    continue;
                }

                browserMatches++;

                if (_clickHandlers.Contains(element))
                {
                    // Already hooked earlier: only reset its index.
                    element.TabIndex = 0;
                }
                else
                {
                    element.Click += new HtmlElementEventHandler(htmlElement_Click);
                    element.TabIndex = nextScraperId++;
                    _clickHandlers.Add(element);
                }
            }

            // Pass 2: the same numbering is applied to a freshly parsed copy of the source.
            nextScraperId = 1;
            int parsedMatches = 0;

            _htmlDocument = new HtmlAgilityPack.HtmlDocument();
            _htmlDocument.LoadHtml(wbBrowser.DocumentText);

            foreach (HtmlNode parsedNode in _htmlDocument.DocumentNode.DescendantsAndSelf())
            {
                if (IsScraperTargetTag(parsedNode.Name))
                {
                    parsedMatches++;
                    parsedNode.Attributes.Add("arachnode_scraper_id", (nextScraperId++).ToString());
                }
            }

            if (browserMatches != parsedMatches)
            {
                // No action is taken when the two passes disagree.
            }
        }

        /**/

        tvBrowser.Nodes.Clear();
        _treeNodes.Clear();

        // inefficient: every element scans all previously created nodes for its parent.
        foreach (HtmlElement element in wbBrowser.Document.All)
        {
            TreeNode elementNode = new TreeNode(element.TagName);
            elementNode.ToolTipText = ComposeTreeNodeToolTip(element);
            elementNode.Tag = element;

            HtmlElement parentElement = element.Parent;
            if (parentElement == null)
            {
                // Root element: attach directly to the tree view.
                tvBrowser.Nodes.Add(elementNode);
                _treeNodes.Add(elementNode);
                continue;
            }

            // Find the first already-created node whose element is this element's parent.
            TreeNode parentNode = null;
            foreach (TreeNode candidate in _treeNodes)
            {
                if (parentElement == (HtmlElement)candidate.Tag)
                {
                    parentNode = candidate;
                    break;
                }
            }

            // Elements whose parent has no node are left out of the tree.
            if (parentNode != null)
            {
                parentNode.Nodes.Add(elementNode);
                _treeNodes.Add(elementNode);
            }
        }

        /**/

        rtbViewSource.Text = wbBrowser.DocumentText;
        HighlightRTF(rtbViewSource);
    }
    catch (Exception exception)
    {
        MessageBox.Show(exception.Message + Environment.NewLine + exception.StackTrace, _formText);
    }
}

/// <summary>
/// Tells whether a tag (compared case-insensitively) is one of the element kinds that
/// receive a scraper id: links, headings h1-h6, table parts, div and span.
/// </summary>
private static bool IsScraperTargetTag(string tagName)
{
    switch (tagName.ToLowerInvariant())
    {
        case "a":
        case "h1":
        case "h2":
        case "h3":
        case "h4":
        case "h5":
        case "h6":
        case "table":
        case "th":
        case "tr":
        case "td":
        case "div":
        case "span":
            return true;
        default:
            return false;
    }
}

/// <summary>
/// Builds the tooltip for an element's tree node: the first non-empty of InnerHtml,
/// InnerText, OuterHtml and OuterText, trimmed and cut to 250 characters, followed by a
/// separator and the text extracted from the full value, the whole cut to 500 characters.
/// </summary>
private static string ComposeTreeNodeToolTip(HtmlElement element)
{
    string toolTip = element.InnerHtml;

    if (string.IsNullOrEmpty(toolTip))
    {
        toolTip = element.InnerText;
    }

    if (string.IsNullOrEmpty(toolTip))
    {
        toolTip = element.OuterHtml;
    }

    if (string.IsNullOrEmpty(toolTip))
    {
        toolTip = element.OuterText;
    }

    if (string.IsNullOrEmpty(toolTip))
    {
        return toolTip;
    }

    toolTip = toolTip.Trim();
    if (string.IsNullOrEmpty(toolTip))
    {
        // Whitespace-only content: nothing further to append.
        return toolTip;
    }

    // Text extraction works on the full trimmed value, not the shortened preview.
    string fullText = toolTip;

    if (toolTip.Length > 250)
    {
        toolTip = toolTip.Substring(0, 250) + "...";
    }

    toolTip += Environment.NewLine + "------------" + Environment.NewLine + UserDefinedFunctions.ExtractText(fullText).Value;

    if (toolTip.Length > 500)
    {
        toolTip = toolTip.Substring(0, 500) + "...";
    }

    return toolTip;
}
/// <summary>
/// Finds the first image in <paramref name="input"/>, shows it in a captcha dialog and
/// passes the request string completed with the entered captcha to <c>Captcha</c>.
/// </summary>
/// <param name="input">HTML of the page that contains the captcha image.</param>
/// <remarks>
/// When anything fails (for example the page has no image) the page is dumped to
/// "log.html" and <c>Captcha</c> is still called, with an empty string.
/// </remarks>
private void GetCaptchaImage(string input)
{
    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(input);

    string captcha = "";
    try
    {
        HtmlNode bodyNode = doc.DocumentNode.SelectSingleNode("//img");

        // A form shown with ShowDialog is not disposed when it closes, so dispose it
        // explicitly once the entered value has been read.
        using (CaptchaForm captchaForm = new CaptchaForm(url + bodyNode.Attributes["src"].Value))
        {
            captchaForm.ShowDialog();
            captcha = GetRequest(index, textBoxRequest.Text) + "&captcha=" + captchaForm.Captcha + "&submit=Отправить";
        }
    }
    catch
    {
        // Keep the offending page for inspection.
        File.WriteAllText("log.html", input, Encoding.Default);
    }

    Captcha(captcha);
}
/// <summary>
/// Parses one results page: on the first call derives the page count from the result
/// summary, then appends every result title with its author line to "log.txt".
/// When the page cannot be parsed it is dumped to "log.html" instead.
/// </summary>
/// <param name="input">HTML of the results page.</param>
private void ParseHtmlDocument(string input)
{
    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(input);

    bool parsed;
    try
    {
        parsed = TryLogSearchResults(doc);
    }
    catch
    {
        // Anything unexpected (I/O failure, malformed markup) is treated as an unparsable page.
        parsed = false;
    }

    if (!parsed)
    {
        // Keep the offending page for inspection.
        File.WriteAllText("log.html", input, Encoding.Default);
    }
}

/// <summary>
/// Updates <c>count</c> when it is still zero and appends the results found in
/// <paramref name="doc"/> to "log.txt".
/// </summary>
/// <returns><c>false</c> when the expected summary or result nodes are missing.</returns>
private bool TryLogSearchResults(HtmlAgilityPack.HtmlDocument doc)
{
    if (count == 0)
    {
        HtmlNode summaryNode = doc.DocumentNode.SelectSingleNode("//div[@id='gs_ab_md']");
        if (summaryNode == null)
        {
            return false;
        }

        // Collect the digits that precede the first '('; the scan is bounded by the text
        // length so a summary without '(' no longer runs past the end of the string.
        string summary = summaryNode.InnerText;
        StringBuilder digits = new StringBuilder();
        for (int i = 0; i < summary.Length && summary[i] != '('; i++)
        {
            if (summary[i] >= '0' && summary[i] <= '9')
            {
                digits.Append(summary[i]);
            }
        }

        int total;
        if (!int.TryParse(digits.ToString(), out total))
        {
            return false;
        }

        // Integer division by 10 turns the result total into the stored count.
        count = total / 10;
    }

    // SelectNodes returns null, not an empty collection, when nothing matches.
    HtmlNodeCollection titleNodes = doc.DocumentNode.SelectNodes("//h3[@class='gs_rt']");
    HtmlNodeCollection authorNodes = doc.DocumentNode.SelectNodes("//div[@class='gs_a']");
    if (titleNodes == null)
    {
        return false;
    }

    // Build the whole batch first and write it once instead of reopening the file per entry.
    StringBuilder entries = new StringBuilder();
    for (int i = 0; i < titleNodes.Count; i++)
    {
        // A title without a matching author line is logged with an empty author rather
        // than aborting the batch on an out-of-range index.
        string author = (authorNodes != null && i < authorNodes.Count) ? authorNodes[i].InnerText : string.Empty;
        entries.Append(titleNodes[i].InnerText).Append("\n").Append(author).Append("\n\n");
    }

    if (entries.Length > 0)
    {
        File.AppendAllText("log.txt", entries.ToString());
    }

    return true;
}