Пример #1
0
        /// <summary>
        /// Demo entry point: downloads a blog page and runs each kind of NSoup element lookup on it
        /// (by tag name, id, class, attribute name, attribute value and CSS selector).
        /// The lookup results are not used any further.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            string html;

            // WebClient is IDisposable; release it as soon as the page text has been downloaded.
            using (WebClient client = new WebClient())
            {
                client.Encoding = System.Text.Encoding.UTF8;
                html = client.DownloadString("https://blog.csdn.net/czjnoe/article/details/106600070");
            }

            NSoup.Nodes.Document doc = NSoup.NSoupClient.Parse(html);

            // Look up nodes by tag name.
            NSoup.Select.Elements metaElements = doc.GetElementsByTag("meta");
            foreach (var item in metaElements)
            {
                // Placeholder: handle each <meta> element here.
            }

            // Look up a single node by id.
            NSoup.Nodes.Element headIdElement = doc.GetElementById("head");

            // Look up nodes by class name.
            var fmClassElements = doc.GetElementsByClass("fm").ToList();
            foreach (var item in fmClassElements)
            {
                // Placeholder: handle each element carrying the "fm" class here.
            }

            // Look up nodes that have a given attribute, whatever its value.
            List<Element> attributeNameElements = doc.GetElementsByAttribute("class").ToList();

            // Look up nodes by attribute name and value.
            List<Element> attributeValueElements = doc.GetElementsByAttributeValue("id", "su").ToList();

            // Look up nodes with a jQuery/CSS-style selector.
            var selectedElements = doc.Select("#head").ToList();
        }
Пример #2
0
        /// <summary>
        /// Determines if the input document is valid, against the whitelist. It is considered valid if all the tags and attributes 
        /// in the input HTML are allowed by the whitelist.
        /// </summary>
        /// <remarks>
        /// This method can be used as a validator for user input forms. An invalid document will still be cleaned successfully 
        /// using the <see cref="Clean(Document)"/> method. If using as a validator, it is recommended to still clean the document 
        /// to ensure enforced attributes are set correctly, and that the output is tidied.
        /// A document without a body (for example a frameset document) is reported as invalid instead of failing,
        /// because none of its content can be copied into a cleaned document.
        /// </remarks>
        /// <param name="dirtyDocument">document to test</param>
        /// <returns>true if no tags or attributes need to be removed; false if they do</returns>
        /// <exception cref="ArgumentNullException"><paramref name="dirtyDocument"/> is null.</exception>
        public bool IsValid(Document dirtyDocument)
        {
            if (dirtyDocument == null)
            {
                throw new ArgumentNullException("dirtyDocument");
            }

            // Frameset documents have no body; Clean(Document) guards against the same case.
            // Fail closed here rather than handing a null source to CopySafeNodes.
            if (dirtyDocument.Body == null)
            {
                return false;
            }

            Document clean = Document.CreateShell(dirtyDocument.BaseUri);
            int numDiscarded = CopySafeNodes(dirtyDocument.Body, clean.Body);
            return numDiscarded == 0;
        }
Пример #3
0
		/// <summary>
		/// Parameterless constructor; used for Node.Clone().
		/// </summary>
		protected Document()
		{
		}

		/// <summary>
		/// Builds an empty but structurally complete document that further elements can be added to.
		/// </summary>
		/// <param name="baseUri">Base URI assigned to the new document; must not be null.</param>
		/// <returns>A document containing an html element with head and body children.</returns>
		/// <exception cref="ArgumentNullException"><paramref name="baseUri"/> is null.</exception>
		public static Document CreateShell(string baseUri)
		{
			if (baseUri == null)
			{
				throw new ArgumentNullException("baseUri");
			}

			Document shell = new Document(baseUri);

			// html is the root; head is appended before body.
			Element root = shell.AppendElement("html");
			root.AppendElement("head");
			root.AppendElement("body");
			return shell;
		}
Пример #4
0
        /// <summary>
        /// Builds a new, clean document from the given dirty document, keeping only what the whitelist allows.
        /// The dirty document itself is left untouched, and only the content of its <code>body</code> is considered.
        /// </summary>
        /// <param name="dirtyDocument">Untrusted document to clean.</param>
        /// <returns>The cleaned document.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="dirtyDocument"/> is null.</exception>
        public Document Clean(Document dirtyDocument)
        {
            if (dirtyDocument == null)
            {
                throw new ArgumentNullException("dirtyDocument");
            }

            Document clean = Document.CreateShell(dirtyDocument.BaseUri);

            // Frameset documents have no body; the clean document then simply keeps its empty body.
            if (dirtyDocument.Body == null)
            {
                return clean;
            }

            CopySafeNodes(dirtyDocument.Body, clean.Body);
            return clean;
        }
Пример #5
0
        /// <summary>
        /// Downloads the table published at the hard-coded mca.gov.cn URL and returns its rows as a lookup.
        /// </summary>
        /// <returns>
        /// A dictionary keyed by the text of each matching row's second cell, with the text of the third cell
        /// as the value (presumably address code to region name; confirm against the page layout).
        /// Rows with fewer than three cells are skipped, and for a repeated key the first row wins.
        /// </returns>
        private IDictionary<string, string> GetAddressCode()
        {
            string url = "http://www.mca.gov.cn/article/sj/xzqh/2020/2020/202003301019.html";
            HttpClientHelper httpClient = new HttpClientHelper();

            // NOTE(review): this blocks on an async call; kept because the method's signature is synchronous.
            string html = httpClient.GetAsync(url).GetAwaiter().GetResult();

            NSoup.Nodes.Document doc = NSoup.NSoupClient.Parse(html);
            IDictionary<string, string> codes = new Dictionary<string, string>();

            foreach (Element row in doc.Select("tr[height=19]"))
            {
                var cells = row.Select("td");

                // Spacer or malformed rows do not have the cells read below; indexing them would throw.
                if (cells.Count < 3)
                {
                    continue;
                }

                string code = cells[1].Text();

                // Dictionary.Add throws on a duplicate key; keep the first occurrence instead of aborting.
                if (codes.ContainsKey(code))
                {
                    continue;
                }

                codes.Add(code, cells[2].Text());
            }

            return codes;
        }
Пример #6
0
        /// <summary>
        /// Parse error list handed to InitialiseParse; null when not tracking errors.
        /// </summary>
        protected ParseErrorList _errors; // null when not tracking errors

        /// <summary>
        /// Sets up the parser state for a new parse run: creates the output document, the character reader
        /// over <paramref name="input"/>, the tokeniser and an empty element stack.
        /// </summary>
        /// <param name="input">Text to parse; must not be null.</param>
        /// <param name="baseUri">Base URI for the new document; must not be null.</param>
        /// <param name="errors">Error list passed on to the tokeniser; null when not tracking errors.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="input"/> or <paramref name="baseUri"/> is null.
        /// </exception>
        protected virtual void InitialiseParse(string input, string baseUri, ParseErrorList errors)
        {
            // The single-string ArgumentNullException constructor takes the parameter NAME, not a message,
            // so use the (paramName, message) overload to report both correctly.
            if (input == null)
            {
                throw new ArgumentNullException("input", "String input must not be null");
            }
            if (baseUri == null)
            {
                throw new ArgumentNullException("baseUri", "BaseURI must not be null");
            }

            _doc = new Document(baseUri);
            _reader = new CharacterReader(input);
            _errors = errors;
            _tokeniser = new Tokeniser(_reader, errors);
            _stack = new DescendableLinkedList<Element>();
            this._baseUri = baseUri;
        }
Пример #7
0
        /// <summary>
        /// Queries http://www.ip138.com/ips138.asp for the given IP address and extracts the result text
        /// from the returned HTML page.
        /// </summary>
        /// <param name="Ip">IP address to look up.</param>
        /// <returns>
        /// null when <paramref name="Ip"/> is null or blank; otherwise the text of the first list item in the
        /// third row of the page's first table, without its first six characters (presumably a fixed label
        /// prefix; confirm against the page), or "未知" when the request or the extraction fails.
        /// </returns>
        public string GetIpAddress(string Ip)
        {
            // Check for null before trimming so that a null argument does not throw.
            if (Ip == null || Ip.Trim().Length == 0)
            {
                return null;
            }

            try
            {
                // Escape the caller-supplied value so it cannot alter the query string.
                string requestUrl = "http://www.ip138.com/ips138.asp?ip=" + Uri.EscapeDataString(Ip.Trim()) + "&action=2";
                HttpWebRequest myReq = (HttpWebRequest)HttpWebRequest.Create(requestUrl);
                myReq.Timeout = 3000;

                string html;

                // Dispose response, stream and reader even when reading fails.
                using (HttpWebResponse response = (HttpWebResponse)myReq.GetResponse())
                using (Stream responseStream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
                {
                    html = reader.ReadToEnd();
                }

                NSoup.Nodes.Document doc = NSoup.NSoupClient.Parse(html);
                Element table = doc.Body.GetElementsByTag("table")[0];
                string itemText = table.GetElementsByTag("tr")[2]
                                       .GetElementsByTag("td")[0]
                                       .GetElementsByTag("ul")[0]
                                       .GetElementsByTag("li")[0]
                                       .Text();

                return itemText.Substring(6);
            }
            catch (Exception)
            {
                // Deliberate best effort: any network or page-layout failure yields the "unknown" marker.
                return "未知";
            }
        }
Пример #8
0
 /// <summary>
 /// Returns the text of the first td that directly follows a th containing <paramref name="key"/>.
 /// </summary>
 /// <param name="key">Text to look for inside the th element.</param>
 /// <param name="doc">Document to search.</param>
 /// <returns>Text of the first matching td element.</returns>
 private static string ihVal(string key, Document doc)
 {
     string selector = string.Format("th:contains({0}) + td", key);
     var valueCell = doc.Select(selector).First;
     return valueCell.Text();
 }
Пример #9
0
        /// <summary>
        /// Downloads every entry listed on the page whose URL is entered in textBox1.
        /// Each li under the element with id "vlink_1" links to a sub-page; the media address is decoded
        /// from that sub-page's FonHen_JieMa('...') call and the file is saved below the folder entered in
        /// textBox2, in a sub-folder named after the page title. Progress is written to richTextBox1.
        /// Entries whose sub-page has no decodable address are reported and skipped.
        /// </summary>
        private void download2()
        {
            // Page address entered by the user.
            string url = textBox1.Text;

            string html = getHtml(url);

            NSoup.Nodes.Document d = NSoup.NSoupClient.Parse(html);

            // Title: text of the h1 inside the first element with class "tit".
            String title = d.GetElementsByClass("tit").First.GetElementsByTag("h1").Text;

            // Fall back to the current UTC tick count when the page has no title.
            title = (title != null && !title.Equals("")) ? title : DateTime.Now.ToUniversalTime().Ticks + "";
            NSoup.Nodes.Element   el = d.GetElementById("vlink_1");
            NSoup.Select.Elements es = el.GetElementsByTag("li");

            richTextBox1.Text = "";

            foreach (var e in es)
            {
                string subHtml  = getHtml("http://www.ting56.com" + e.GetElementsByTag("a").Attr("href"));
                string fileName = e.Text();

                // The sub-page carries the encoded address as '*'-separated decimal character codes.
                Match  mc    = Regex.Match(subHtml, "FonHen_JieMa\\('([0-9,*]*)'\\)");
                string miwen = mc.Groups[1].Value;

                System.Text.StringBuilder decoded = new System.Text.StringBuilder();
                foreach (string code in miwen.Split('*'))
                {
                    if (!code.Equals(""))
                    {
                        decoded.Append((char)int.Parse(code));
                    }
                }

                // Decoded form is '&'-separated: part 0 is the download address, part 2 the file extension.
                string[] parts = decoded.ToString().Split('&');
                if (parts.Length < 3)
                {
                    // No usable address on this sub-page: report it ("跳过" = "skipped") and carry on
                    // with the remaining entries instead of aborting the whole run.
                    richTextBox1.AppendText("跳过 " + fileName + "\n\r");
                    continue;
                }

                string downUrl     = parts[0];
                string downFileExt = parts[2];

                string path      = textBox2.Text + "\\" + title + "\\";
                string localFile = path + fileName + downFileExt;

                richTextBox1.AppendText("开始下载 " + fileName + "\n\r" + downUrl + "\n\r");

                // Downloads run one after another on the calling thread.
                HttpDownload(downUrl, localFile);
            }
        }
Пример #10
0
		/// <summary>
		/// Stores the given syntax in these output settings.
		/// </summary>
		/// <param name="syntax">Syntax value to store.</param>
		/// <returns>This instance, so that calls can be chained.</returns>
		public OutputSettings SetSyntax(Document.Syntax syntax)
		{
			_syntax = syntax;

			return this;
		}
Пример #11
0
        /// <summary>
        /// Runs StartWindows on an STA thread, then fetches the page at URL and records its HTTP status
        /// code (HttpStatusCode), title text (PageTitle), links with non-empty text (LinkNameAndLink) and
        /// the number of such links (_noOfLinks). The parsed page is stored in doc.
        /// </summary>
        /// <returns>Always null.</returns>
        private PageTitleFixture GoToWebSite()
        {
            // Run StartWindows on a dedicated STA thread and wait for it to finish before fetching.
            var thread = new Thread(new ThreadStart(StartWindows));
            thread.SetApartmentState(ApartmentState.STA);
            thread.Start();
            thread.Join();

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(URL);
            request.MaximumAutomaticRedirections = 4;
            request.MaximumResponseHeadersLength = 4;
            // Set credentials to use for this request.
            request.Credentials = CredentialCache.DefaultCredentials;

            // Dispose the response, its stream and the reader even when reading or parsing throws.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                Console.WriteLine("Content length is {0}", response.ContentLength);
                Console.WriteLine("Content type is {0}", response.ContentType);

                using (Stream receiveStream = response.GetResponseStream())
                using (StreamReader readStream = new StreamReader(receiveStream, Encoding.UTF8))
                {
                    Console.WriteLine("Response stream received.");

                    string responseValue = readStream.ReadToEnd();
                    doc = NSoup.NSoupClient.ParseBodyFragment(responseValue);
                }

                // Read the status while the response is still open; its properties are not guaranteed
                // to be readable once the response has been closed. The cast yields the numeric code.
                HttpStatusCode = ((int)response.StatusCode).ToString();
                Console.WriteLine(response.StatusCode);
            }

            Elements titleElements = doc.GetElementsByTag("Title");
            PageTitle = titleElements.Text;

            Elements links = doc.GetElementsByTag("a");
            int count = 0;

            foreach (var link in links)
            {
                // Use the anchor as it is. TagName(string) appears to be the tag-name setter, so calling
                // it with "a href" renamed every anchor inside the stored document.
                string linkText = link.Text();
                if (linkText.Equals(""))
                {
                    continue;
                }

                String absHref = link.Attr("abs:href");
                Console.WriteLine("Link ={0}", absHref);

                // The running index keeps keys unique when several links share the same text.
                LinkNameAndLink.Add(count + "_" + linkText, absHref);
                count++;
            }

            _noOfLinks = count.ToString();

            return null;
        }