示例#1
0
 /// <summary>
 /// Downloads the page at <paramref name="url"/> and parses it with HtmlAgilityPack.
 /// </summary>
 /// <param name="url">Address of the page to load.</param>
 /// <returns>The document produced by <c>HtmlWeb.Load</c> for that address.</returns>
 public HtmlDocument ReadLink(string url)
 {
     // The request is sent with a desktop Firefox user-agent string.
     var browser = new HtmlAgilityPack.HtmlWeb
     {
         UserAgent = "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0"
     };

     return browser.Load(url);
 }
示例#2
0
        /// <summary>
        /// Post-processes the HTML file at <c>OutputDirecroy</c>: strips bullets via
        /// <c>ClearFromBullets</c>, rebuilds lists via <c>CreateLists</c>, replaces the
        /// <c>src</c> of every image with the value returned by <c>MakeImageSrcData</c>,
        /// saves the result back to the same path and opens it into <c>CreateddFile</c>.
        /// </summary>
        /// <remarks>
        /// The method returns nothing; the opened stream is exposed through <c>CreateddFile</c>.
        /// </remarks>
        private void CheckAndConvertImage()
        {
            HtmlWeb hw = new HtmlWeb();

            HtmlAgilityPack.HtmlDocument htmlDoc = hw.Load(OutputDirecroy);
            if (htmlDoc.DocumentNode != null)
            {
                // clearing bullets
                htmlDoc.DocumentNode.InnerHtml = ClearFromBullets(htmlDoc.DocumentNode.InnerHtml);

                // create normal lists
                htmlDoc = CreateLists(htmlDoc);

                // images to base64; SelectNodes returns null when nothing matches,
                // so the query is evaluated once and the result is checked.
                var images = htmlDoc.DocumentNode.SelectNodes("//img");
                if (images != null)
                {
                    foreach (HtmlNode image in images)
                    {
                        // An <img> without a src attribute has nothing to convert.
                        var srcAttribute = image.Attributes["src"];
                        if (srcAttribute == null)
                        {
                            continue;
                        }

                        String currSrc = srcAttribute.Value.Replace("%20", " ");
                        if (currSrc.Contains("file:"))
                        {
                            // Full path: drop the first 8 characters
                            // (presumably the "file:///" prefix - TODO confirm).
                            currSrc = currSrc.Substring(8);
                        }
                        else
                        {
                            // Relative path: prefix it with the output conversion route.
                            currSrc = String.Format("{1}{0}", currSrc, globalRoutes.OutputConvertRoute);
                        }

                        srcAttribute.Value = MakeImageSrcData(currSrc);
                    }
                }
            }

            if (htmlDoc.DocumentNode != null)
            {
                // Re-parse the inner HTML into a fresh document before saving.
                var t = new HtmlDocument();
                t.LoadHtml(htmlDoc.DocumentNode.InnerHtml);
                t.Save(OutputDirecroy);
            }
            else
            {
                htmlDoc.Save(OutputDirecroy);
            }

            // Wait until the saved file is no longer locked before opening it.
            WhaitFileFree(OutputDirecroy);

            CreateddFile = new FileStream(OutputDirecroy, FileMode.Open);
        }
示例#3
0
        /// <summary>
        /// Places the generated quote block at the top of the message being edited.
        /// TODO: each subsequent quote should be placed below the previous one.
        /// </summary>
        /// <param name="item">HTML markup of the quote block to insert.</param>
        /// <param name="body">HTML of the message the quote is inserted into.</param>
        /// <returns>The outer HTML of the message document after the insertion.</returns>
        private string QuoteInsert(string item, string body)
        {
            HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(body);

            // SelectSingleNode returns null when the markup has no <body> element
            // (e.g. an HTML fragment); fall back to the document root instead of crashing.
            HtmlNode container = doc.DocumentNode.SelectSingleNode("//body") ?? doc.DocumentNode;

            HtmlNode quoteNode = HtmlNode.CreateNode(item);

            // Prepend the empty paragraph first so that the quote, prepended second,
            // ends up as the very first child followed by the paragraph.
            container.PrependChild(HtmlNode.CreateNode("<p class=MsoNormal><o:p></o:p></p>"));
            container.PrependChild(quoteNode);

            return doc.DocumentNode.OuterHtml;
        }
示例#4
0
        /// <summary>
        /// Extracts the first table from the body of the page shown in <c>webBrowser1</c>
        /// and parses it into an HtmlAgilityPack document.
        /// </summary>
        /// <returns>
        /// A document loaded from the markup starting at the first <c>&lt;table</c> tag and
        /// ending with the first closing <c>&lt;/table&gt;</c> tag after it. When no closing
        /// tag is found, everything from the opening tag onwards is parsed.
        /// </returns>
        /// <exception cref="System.ArgumentOutOfRangeException">
        /// The body contains no <c>&lt;table</c> tag (unchanged from the original behaviour).
        /// </exception>
        private HtmlDocument parsujHtml()
        {
            const string closingTag = "</TABLE>";

            var html = webBrowser1.Document.Body.InnerHtml;

            // Tag case in InnerHtml is not guaranteed, so match it case-insensitively
            // and without culture rules.
            var index = html.IndexOf("<TABLE", System.StringComparison.OrdinalIgnoreCase);
            html = html.Substring(index);

            index = html.IndexOf(closingTag, System.StringComparison.OrdinalIgnoreCase);
            if (index >= 0)
            {
                // Substring(0, n) is valid when n equals the string length, whereas
                // Remove(n) throws on .NET Framework in that case (table at the very end).
                html = html.Substring(0, index + closingTag.Length);
            }
            // else: no closing tag - keep the remainder instead of truncating it to 7 characters.

            HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(html);
            return doc;
        }
示例#5
0
        /// <summary>
        /// Writes the supplied bytes to a temporary HTML file, cleans the markup with
        /// <c>ClearFromGarbage</c>, stores every embedded base64 image as a <c>.bmp</c> file
        /// and saves the resulting document to <paramref name="input"/>.
        /// </summary>
        /// <param name="input">Path the processed HTML document is saved to.</param>
        /// <param name="fileStream">Raw bytes of the HTML file to process.</param>
        /// <param name="inputName">File name (without extension) used for the temporary file.</param>
        /// <param name="inputExtension">Extension used for the temporary file.</param>
        private void ParseHtmlAndCreateInages(String input, Byte[] fileStream, String inputName, String inputExtension)
        {
            const String base64Marker = "base64,";

            String inputTemp = String.Format("{2}{0}.{1}", inputName, inputExtension, globalRoutes.DiffTempRoute);

            // The using block releases the file handle even when the write fails.
            using (FileStream wFile = new FileStream(inputTemp, FileMode.Create))
            {
                wFile.Write(fileStream, 0, fileStream.Length);
            }

            WhaitFileFree(inputTemp);

            HtmlWeb hw = new HtmlWeb();

            HtmlAgilityPack.HtmlDocument htmlDoc = hw.Load(inputTemp);
            if (htmlDoc.DocumentNode != null)
            {
                htmlDoc.DocumentNode.InnerHtml = ClearFromGarbage(htmlDoc.DocumentNode.InnerHtml);

                // SelectNodes returns null when nothing matches; query once and check.
                var images = htmlDoc.DocumentNode.SelectNodes("//img");
                if (images != null)
                {
                    foreach (HtmlNode image in images)
                    {
                        // An <img> without a src attribute carries no embedded data.
                        var srcAttribute = image.Attributes["src"];
                        if (srcAttribute == null)
                        {
                            continue;
                        }

                        String currSrc = srcAttribute.Value;
                        if (currSrc.Contains("base64"))
                        {
                            // The payload starts right after "base64,". A fixed offset of 22
                            // only fits "data:image/png;base64," and corrupts other MIME types;
                            // it is kept solely as the fallback when the marker is absent.
                            int markerIndex = currSrc.IndexOf(base64Marker, System.StringComparison.Ordinal);
                            String payload = markerIndex >= 0
                                ? currSrc.Substring(markerIndex + base64Marker.Length)
                                : currSrc.Substring(22);

                            String imgName      = Path.GetRandomFileName();
                            String newImageName = globalRoutes.InputConvertRoute + imgName + ".bmp";
                            SaveByteArrayAsImage(newImageName, payload);

                            // Point the image at the extracted file (name only, no directory).
                            srcAttribute.Value = imgName + ".bmp";
                        }
                    }
                }
            }

            if (htmlDoc.DocumentNode != null)
            {
                // Re-parse the inner HTML into a fresh document before saving.
                var t = new HtmlDocument();
                t.LoadHtml(htmlDoc.DocumentNode.InnerHtml);
                t.Save(input);
            }
            else
            {
                htmlDoc.Save(input);
            }
        }
示例#6
0
        /// <summary>
        /// Downloads the page whose URL is entered in <c>txtNhap</c>, splits the
        /// <c>h1.viewtitle</c> heading on '-' into <c>txtBaiHat</c> and <c>txtCaSi</c>, and
        /// extracts a link from the page's <c>$(document).ready</c> script into
        /// <c>txtLinkNhac</c>. Any exception is shown in a message box.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            // URL of the page whose HTML content is to be fetched.
            string _url = txtNhap.Text;

            // Create an HtmlWeb object; it can be thought of as a virtual browser.
            HtmlAgilityPack.HtmlWeb htmlWeb = new HtmlAgilityPack.HtmlWeb();
            // Set the user-agent string so the web server knows which browser is "visiting".
            // Useful when the mobile or the desktop version of the HTML is wanted.
            // List of user-agent strings: http://www.useragentstring.com/pages/useragentstring.php
            htmlWeb.UserAgent = "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0";

            try
            {
                // Document holding the downloaded HTML. The original code re-parsed its
                // InnerHtml into a second document that was never read; that dead work is gone.
                HtmlAgilityPack.HtmlDocument htmlDoc = htmlWeb.Load(_url);

                // The heading has the form "<first part>-<second part>".
                string   titleNodes  = htmlDoc.DocumentNode.SelectSingleNode("//h1[@class='viewtitle']").InnerHtml;
                string[] titleString = titleNodes.Split('-');
                txtBaiHat.Text = titleString[0];
                txtCaSi.Text   = titleString[1];

                var nodes = htmlDoc.DocumentNode.SelectNodes("//script[@type='text/javascript' and contains(.,'$(document).ready')]");
                foreach (HtmlNode node in nodes)
                {
                    // The link is the 4th '"'-separated token of the 4th '{'-separated
                    // segment of the script text.
                    var      title = node.InnerText;
                    string[] chuoi = title.Split('{');
                    string[] ch    = chuoi[3].Split('"');
                    txtLinkNhac.Text = ch[3];
                }
            }
            catch (Exception ex)
            {
                // Missing nodes or an unexpected page layout surface here as exceptions.
                MessageBox.Show(ex.Message);
            }
        }
示例#7
0
        /// <summary>
        /// Reads the authorization code from the page currently shown in <c>webBrowser1</c>
        /// and, when it contains "TOP-", closes the dialog with <c>DialogResult.OK</c>.
        /// Any exception is reported in a message box.
        /// </summary>
        private void btnOk_Click(object sender, EventArgs e)
        {
            const string codeInputXPath = "/html/body/div[@class='content']/div[2]/div[@class='copy-code']/input";

            try
            {
                HtmlDocument page = new HtmlAgilityPack.HtmlDocument();
                page.LoadHtml(webBrowser1.DocumentText);

                // The code is taken from the second attribute of each matching input;
                // the last match wins.
                foreach (HtmlNode inputNode in page.DocumentNode.SelectNodes(codeInputXPath))
                {
                    AuthrizeCode = inputNode.Attributes[1].Value;
                }

                // Leave the dialog open unless a code containing "TOP-" was found.
                if (string.IsNullOrEmpty(AuthrizeCode) || AuthrizeCode.IndexOf("TOP-") < 0)
                {
                    return;
                }

                this.DialogResult = DialogResult.OK;
                this.Close();
            }
            catch (Exception failure)
            {
                MessageBox.Show(failure.Message);
            }
        }
示例#8
0
        /// <summary>
        /// Loads <paramref name="url360"/>, looks up <paramref name="xpath"/> and appends a
        /// result line to <c>textBox1</c>. While the node is not found, the page is requested
        /// again through <c>SetHttpWebRequest</c> with a <c>Proxy</c>, marking each failing
        /// proxy via <c>ChangeState</c> and calling <c>Gettip</c> before the next attempt.
        /// </summary>
        /// <param name="url360">Address of the page to query.</param>
        /// <param name="xpath">XPath expression selecting the node(s) of interest.</param>
        /// <param name="title">Text placed before <c>name</c> in the output line.</param>
        /// <param name="text">Text placed after <c>name</c> in the output line.</param>
        /// <param name="state">
        /// Selects what is reported: <c>Quantity</c> uses <c>GetNumber</c> on the node's inner
        /// HTML, <c>Fan</c> uses <c>PrintNumber</c> on its inner text, <c>Entry</c> reports the
        /// number of nodes matching <paramref name="xpath"/>.
        /// </param>
        public void GetManager(string url360, string xpath, string title, string text, State state)
        {
            try
            {
                HtmlWeb webClient = new HtmlWeb();
                webClient.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36";
                HtmlDocument doc1 = webClient.Load(url360);

                // Document in which the node was eventually found; needed for the Entry count.
                HtmlAgilityPack.HtmlDocument matchedDoc = doc1;
                HtmlNode node = doc1.DocumentNode.SelectSingleNode(xpath);

                // Direct request did not yield the node: retry through proxies until it does.
                while (node == null)
                {
                    Proxy ip = new Proxy();
                    try
                    {
                        HttpWebRequest httpRequest = (HttpWebRequest)HttpWebRequest.Create(url360);

                        httpRequest = SetHttpWebRequest(httpRequest, ref ip);

                        using (HttpWebResponse rs = (HttpWebResponse)httpRequest.GetResponse())
                        {
                            using (System.IO.StreamReader sr = new StreamReader(rs.GetResponseStream(), System.Text.Encoding.GetEncoding("utf-8")))
                            {
                                HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
                                doc.Load(sr);
                                node = doc.DocumentNode.SelectSingleNode(xpath);
                                if (node == null)
                                {
                                    // This proxy did not deliver the expected page.
                                    ChangeState(ip);
                                    Gettip();
                                    continue;
                                }
                                else
                                {
                                    matchedDoc = doc;
                                    name = "启用代理IP";
                                    break;
                                }
                            }
                        }
                    }
                    catch (Exception ex)
                    {
                        ChangeState(ip);
                        Gettip();
                        this.Invoke(new EventHandler(delegate
                        {
                            textBox1.AppendText(num + title + name + text + ":" + ex.Message + "\r\n");
                        }));
                        continue;
                    }
                }

                // All control updates go through Invoke, including label1, which the
                // original touched directly while marshalling every other UI update.
                if (state == State.Quantity)
                {
                    this.Invoke(new EventHandler(delegate
                    {
                        label1.Text = num.ToString();
                        textBox1.AppendText(num + title + name + text + ":" + GetNumber(node.InnerHtml) + "\r\n");
                    }));
                }
                else if (state == State.Fan)
                {
                    this.Invoke(new EventHandler(delegate
                    {
                        label1.Text = num.ToString();
                        textBox1.AppendText(num + title + name + text + ":" + PrintNumber(node.InnerText) + "\r\n");
                    }));
                }
                else if (state == State.Entry)
                {
                    // The node collection was never populated in the original, so this
                    // branch always failed with a NullReferenceException. Count the nodes
                    // matching the XPath in the document that produced the match.
                    // NOTE(review): assumes "Entry" is meant to report that match count - confirm.
                    HtmlNodeCollection htmlNodes = matchedDoc.DocumentNode.SelectNodes(xpath);
                    int entryCount = htmlNodes == null ? 0 : htmlNodes.Count;

                    this.Invoke(new EventHandler(delegate
                    {
                        label1.Text = num.ToString();
                        textBox1.AppendText(num + title + name + text + ":" + entryCount + "\r\n");
                    }));
                }
                else
                {
                    this.Invoke(new EventHandler(delegate
                    {
                        textBox1.AppendText("信息有问题\r\n");
                    }));
                }
            }
            catch (Exception ex)
            {
                this.Invoke(new EventHandler(delegate
                {
                    textBox1.AppendText(ex.Message + "\r\n");
                }));
            }
        }
示例#9
0
        /// <summary>
        /// Handles completion of a page load in <c>wbBrowser</c>: shows the loaded URI,
        /// optionally tags elements with scraper ids, rebuilds the element tree in
        /// <c>tvBrowser</c> and refreshes the source view in <c>rtbViewSource</c>.
        /// Any exception is shown in a message box together with its stack trace.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data; its <c>Url</c> is displayed in <c>tbAbsoluteUri</c>.</param>
        private void wbBrowser_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            try
            {
                tbAbsoluteUri.Text = e.Url.AbsoluteUri;

                if (_assignScraperIDs)
                {
                    // One-shot: the flag is cleared so ids are assigned only for this load.
                    _assignScraperIDs = false;

                    // NOTE(review): the id counter is a short; pages with more than
                    // short.MaxValue matching elements would presumably wrap - confirm.
                    short scraperIdForDivs            = 1;
                    int   numberOfDivsInHtmlDocument1 = 0;

                    // Pass 1: walk the live browser DOM and number the selectable elements.
                    foreach (HtmlElement htmlElement in wbBrowser.Document.All)
                    {
                        switch (htmlElement.TagName.ToLowerInvariant())
                        {
                        case "a":
                        case "h1":
                        case "h2":
                        case "h3":
                        case "h4":
                        case "h5":
                        case "h6":
                        case "table":
                        case "th":
                        case "tr":
                        case "td":
                        case "div":
                        case "span":
                            //default:
                            numberOfDivsInHtmlDocument1++;

                            // Attach the click handler only once per element; the running
                            // id is stored in the element's TabIndex.
                            if (!_clickHandlers.Contains(htmlElement))
                            {
                                htmlElement.Click   += new HtmlElementEventHandler(htmlElement_Click);
                                htmlElement.TabIndex = scraperIdForDivs++;

                                _clickHandlers.Add(htmlElement);
                            }
                            else
                            {
                                // Already-registered elements get TabIndex 0 and do not
                                // consume an id.
                                htmlElement.TabIndex = 0;
                            }
                            break;
                        }
                    }

                    scraperIdForDivs = 1;
                    int numberOfDivsInHtmlDocument2 = 0;

                    // Pass 2: parse the same page source with HtmlAgilityPack and give the
                    // same set of tags an "arachnode_scraper_id" attribute, numbered from 1.
                    _htmlDocument = new HtmlAgilityPack.HtmlDocument();
                    _htmlDocument.LoadHtml(wbBrowser.DocumentText);

                    foreach (HtmlNode htmlNode in _htmlDocument.DocumentNode.DescendantsAndSelf())
                    {
                        switch (htmlNode.Name.ToLowerInvariant())
                        {
                        case "a":
                        case "h1":
                        case "h2":
                        case "h3":
                        case "h4":
                        case "h5":
                        case "h6":
                        case "table":
                        case "th":
                        case "tr":
                        case "td":
                        case "div":
                        case "span":
                            //default:
                            numberOfDivsInHtmlDocument2++;
                            htmlNode.Attributes.Add("arachnode_scraper_id", (scraperIdForDivs++).ToString());

                            break;
                        }
                    }

                    // The two element counts are compared but a mismatch is currently ignored.
                    if (numberOfDivsInHtmlDocument1 != numberOfDivsInHtmlDocument2)
                    {
                    }
                }

                // Rebuild the tree view of the document's elements from scratch.

                tvBrowser.Nodes.Clear();
                _treeNodes.Clear();

                // Inefficient: every element scans _treeNodes linearly to find its parent.
                foreach (HtmlElement htmlElement in wbBrowser.Document.All)
                {
                    TreeNode treeNode = new TreeNode(htmlElement.TagName);

                    // Tooltip source: the first non-empty of InnerHtml, InnerText,
                    // OuterHtml, OuterText.
                    treeNode.ToolTipText = htmlElement.InnerHtml;
                    if (string.IsNullOrEmpty(treeNode.ToolTipText))
                    {
                        treeNode.ToolTipText = htmlElement.InnerText;
                    }
                    if (string.IsNullOrEmpty(treeNode.ToolTipText))
                    {
                        treeNode.ToolTipText = htmlElement.OuterHtml;
                    }
                    if (string.IsNullOrEmpty(treeNode.ToolTipText))
                    {
                        treeNode.ToolTipText = htmlElement.OuterText;
                    }
                    if (!string.IsNullOrEmpty(treeNode.ToolTipText))
                    {
                        treeNode.ToolTipText = treeNode.ToolTipText.Trim();
                    }

                    // Keep the trimmed, untruncated text for ExtractText below.
                    string toolTipText = treeNode.ToolTipText;

                    // Cap the markup part at 250 characters.
                    if (!string.IsNullOrEmpty(treeNode.ToolTipText) && treeNode.ToolTipText.Length > 250)
                    {
                        treeNode.ToolTipText = treeNode.ToolTipText.Substring(0, 250) + "...";
                    }

                    // Append a separator followed by the result of ExtractText on the full text.
                    if (!string.IsNullOrEmpty(treeNode.ToolTipText))
                    {
                        treeNode.ToolTipText += Environment.NewLine + "------------" + Environment.NewLine + UserDefinedFunctions.ExtractText(toolTipText).Value;
                    }

                    // Cap the combined tooltip at 500 characters.
                    if (!string.IsNullOrEmpty(treeNode.ToolTipText) && treeNode.ToolTipText.Length > 500)
                    {
                        treeNode.ToolTipText = treeNode.ToolTipText.Substring(0, 500) + "...";
                    }
                    treeNode.Tag = htmlElement;

                    if (htmlElement.Parent == null)
                    {
                        // Elements without a parent become top-level tree nodes.
                        tvBrowser.Nodes.Add(treeNode);
                        _treeNodes.Add(treeNode);
                    }
                    else
                    {
                        // Attach to the first already-created node whose element is this
                        // element's parent; if none is found the node is not added at all.
                        foreach (TreeNode treeNode2 in _treeNodes)
                        {
                            if (((HtmlElement)treeNode.Tag).Parent == (HtmlElement)treeNode2.Tag)
                            {
                                treeNode2.Nodes.Add(treeNode);
                                _treeNodes.Add(treeNode);

                                break;
                            }
                        }
                    }
                }

                // Refresh the raw source view and re-apply highlighting.

                rtbViewSource.Text = wbBrowser.DocumentText;
                HighlightRTF(rtbViewSource);
            }
            catch (Exception exception)
            {
                MessageBox.Show(exception.Message + Environment.NewLine + exception.StackTrace, _formText);
            }
        }
 /// <summary>
 /// Finds the first image in <paramref name="input"/>, shows it in a <c>CaptchaForm</c>
 /// and passes the resulting request string to <c>Captcha</c>.
 /// </summary>
 /// <param name="input">HTML of the page that contains the captcha image.</param>
 /// <remarks>
 /// On any failure the HTML is dumped to "log.html" and <c>Captcha</c> is still called,
 /// with an empty string.
 /// </remarks>
 private void GetCaptchaImage(string input)
 {
     var page = new HtmlAgilityPack.HtmlDocument();
     page.LoadHtml(input);

     string captchaQuery = "";
     try
     {
         HtmlNode imageNode = page.DocumentNode.SelectSingleNode("//img");
         var imageAddress = url + imageNode.Attributes["src"].Value;

         // Let the user solve the captcha in a modal dialog.
         CaptchaForm dialog = new CaptchaForm(imageAddress);
         dialog.ShowDialog();

         var request = GetRequest(index, textBoxRequest.Text);
         captchaQuery = request + "&captcha=" + dialog.Captcha + "&submit=Отправить";
     }
     catch
     {
         // Best effort: keep the offending page for later inspection.
         File.WriteAllText("log.html", input, Encoding.Default);
     }

     Captcha(captchaQuery);
 }
 /// <summary>
 /// Parses a result page: on the first call (while <c>count</c> is 0) derives <c>count</c>
 /// from the digits in the <c>gs_ab_md</c> block, then appends every title/author pair
 /// found on the page to "log.txt".
 /// </summary>
 /// <param name="input">HTML of the result page.</param>
 /// <remarks>On any failure the HTML is dumped to "log.html" instead.</remarks>
 private void ParseHtmlDocument(string input)
 {
     var page = new HtmlAgilityPack.HtmlDocument();
     page.LoadHtml(input);

     try
     {
         if (count == 0)
         {
             string summary = page.DocumentNode.SelectSingleNode("//div[@id='gs_ab_md']").InnerText;

             // Collect every digit that appears before the first '('.
             var digits = new StringBuilder();
             for (int pos = 0; summary[pos] != '('; pos++)
             {
                 char current = summary[pos];
                 if (Char.IsDigit(current))
                 {
                     digits.Append(current);
                 }
             }

             // Integer division by 10 (presumably results per page - TODO confirm).
             count = int.Parse(digits.ToString()) / 10;
         }

         HtmlNodeCollection titleNodes  = page.DocumentNode.SelectNodes("//h3[@class='gs_rt']");
         HtmlNodeCollection authorNodes = page.DocumentNode.SelectNodes("//div[@class='gs_a']");

         int total = titleNodes.Count;
         for (int row = 0; row < total; row++)
         {
             // Both values are read before anything is written, so a missing author
             // aborts the row without leaving a partial entry in the log.
             string titleText  = titleNodes[row].InnerText;
             string authorText = authorNodes[row].InnerText;
             File.AppendAllText("log.txt", titleText + "\n" + authorText + "\n\n");
         }
     }
     catch
     {
         // Best effort: keep the offending page for later inspection.
         File.WriteAllText("log.html", input, Encoding.Default);
     }
 }