/// <summary>
/// Builds the root node for the site at <paramref name="siteUrl"/> and hands it to <c>Visit</c>.
/// </summary>
/// <param name="siteUrl">
/// Address of the site. A null or empty value is replaced with
/// <c>https://www.dotnetperls.com/</c>; a trailing slash is appended when missing.
/// </param>
/// <returns>The root node that was passed to <c>Visit</c>.</returns>
public override SPCoder.Utils.Nodes.BaseNode GetSPStructure(string siteUrl)
{
    // Fall back to the default site, then normalise to a trailing slash.
    siteUrl = string.IsNullOrEmpty(siteUrl) ? "https://www.dotnetperls.com/" : siteUrl;
    if (siteUrl.EndsWith("/") == false)
    {
        siteUrl = siteUrl + "/";
    }

    base.Endpoint = siteUrl;

    // Ask for compressed responses, let the framework decompress them,
    // and give every request its own cookie container.
    HtmlAgilityPack.HtmlWeb.PreRequestHandler enableCompression = request =>
    {
        request.Headers[HttpRequestHeader.AcceptEncoding] = "gzip, deflate";
        request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
        request.CookieContainer = new System.Net.CookieContainer();
        return true;
    };

    // 'page' is declared outside this method.
    page.PreRequest += enableCompression;

    BaseNode root = new PageNode();
    root.NodeConnector = this;
    root.IconPath = "dnp.png";
    root.Title = RootNodeTitle;
    root.LoadedData = true;

    // Visit is defined elsewhere; presumably it populates the tree below 'root' - verify.
    Visit(page, links, siteUrl, "", root, root);

    return root;
}
/// <summary>
/// Loads the page at <paramref name="siteUrl"/> into <c>Document</c> and builds the node tree for it.
/// </summary>
/// <param name="siteUrl">Address of the page; passed unchanged to <c>HtmlWeb.Load</c> and <c>HtmlWeb.Get</c>.</param>
/// <returns>
/// The root node, titled with <c>RootNodeTitle</c> plus the response host, after
/// <c>doPageNodes</c> has been called with the document's root HTML node.
/// </returns>
public override SPCoder.Utils.Nodes.BaseNode GetSPStructure(string siteUrl)
{
    BaseNode root = new PageNode();
    var web = new HtmlWeb();

    // Ask for compressed responses, let the framework decompress them,
    // and give every request its own cookie container.
    HtmlAgilityPack.HtmlWeb.PreRequestHandler enableCompression = request =>
    {
        request.Headers[HttpRequestHeader.AcceptEncoding] = "gzip, deflate";
        request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
        request.CookieContainer = new System.Net.CookieContainer();
        return true;
    };
    web.PreRequest += enableCompression;

    Document = web.Load(siteUrl);

    // NOTE(review): Load above and Get here both target siteUrl - confirm the second call is needed.
    web.Get(siteUrl, "/");

    root.Title = RootNodeTitle + web.ResponseUri.Host;
    root.IconPath = "html.png";

    // NOTE(review): this node is configured but not referenced again in this method;
    // confirm whether the PageNode constructor or the ParentNode setter attaches it to the tree.
    BaseNode documentNode = new PageNode(Document.DocumentNode);
    documentNode.RootNode = root;
    documentNode.ParentNode = root;
    documentNode.Title = Document.DocumentNode.Name;
    documentNode.SPObject = Document;

    doPageNodes(Document.DocumentNode, root, root);

    return root;
}
/// <summary>
/// Loads the listing page at <paramref name="url"/> and converts each result table into a <c>Job</c>.
/// </summary>
/// <param name="url">Address of the listing page to load.</param>
/// <param name="sourceType">Site the page belongs to; only <c>SourceType.ZLZP</c> is handled.</param>
/// <returns>
/// The jobs that were found. The list is empty for any other <paramref name="sourceType"/>
/// or when no result table matches.
/// </returns>
public static List<Job> GetJobs(string url, SourceType sourceType)
{
    var web = new HtmlWeb { OverrideEncoding = Encoding.GetEncoding("UTF-8") };
    var results = new List<Job>();

    if (sourceType != SourceType.ZLZP)
    {
        return results;
    }

    // Ask for compressed responses, let the framework decompress them,
    // and give every request its own cookie container.
    HtmlWeb.PreRequestHandler enableCompression = request =>
    {
        request.Headers[HttpRequestHeader.AcceptEncoding] = "gzip, deflate";
        request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
        request.CookieContainer = new System.Net.CookieContainer();
        return true;
    };
    web.PreRequest += enableCompression;

    HtmlDocument listing = web.Load(url);
    var resultTables = listing.DocumentNode.SelectNodes("//*[@id='newlist_list_content_table']/table[@class='newlist']");
    if (resultTables == null)
    {
        return results;
    }

    // Cells of the first row, by CSS class, and the Job member each one feeds:
    //   zwmc -> Name and Link (anchor inside the cell)
    //   gzdd -> City
    //   gxsj -> Date
    //   gsmc -> Company (first anchor)
    //   zwyx -> Wages
    foreach (var resultTable in resultTables)
    {
        var entry = new Job();

        var titleLink = resultTable.SelectSingleNode("tr[1]/td[@class='zwmc']/div/a");
        entry.Name = titleLink?.InnerText;
        if (entry.Name != null)
        {
            entry.Link = titleLink?.Attributes["href"]?.Value;
            entry.City = resultTable.SelectSingleNode("tr[1]/td[@class='gzdd']")?.InnerText;
            entry.Date = resultTable.SelectSingleNode("tr[1]/td[@class='gxsj']")?.InnerText;
            entry.Company = resultTable.SelectSingleNode("tr[1]/td[@class='gsmc']/a[1]")?.InnerText;
            entry.Wages = resultTable.SelectSingleNode("tr[1]/td[@class='zwyx']")?.InnerText;
            entry.Source = "智联招聘";
            results.Add(entry);
        }
    }

    return results;
}
/// <summary>
/// Loads the detail page at <paramref name="url"/> and returns the sections extracted
/// for the given site, serialized with <c>ToJson()</c>.
/// </summary>
/// <param name="url">Address of the detail page to load.</param>
/// <param name="type">Site the page belongs to; selects the encoding and the XPath expressions used.</param>
/// <returns>
/// The result of <c>ToJson()</c> on the assembled HTML fragment. A section whose XPath
/// matches nothing is left out; an unhandled <paramref name="type"/> yields the
/// serialized empty string.
/// </returns>
public string GetUrlInfo(string url, DataType type)
{
    var ulS = string.Empty;
    HtmlAgilityPack.HtmlDocument response;

    switch (type)
    {
        case DataType.智联招聘:
            // Workaround for the error "'gzip' is not a supported encoding name":
            // http://www.cnblogs.com/soundcode/p/3785152.html
            HtmlAgilityPack.HtmlWeb.PreRequestHandler handler = delegate(HttpWebRequest request)
            {
                request.Headers[HttpRequestHeader.AcceptEncoding] = "gzip, deflate";
                request.AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip;
                request.CookieContainer = new System.Net.CookieContainer();
                return true;
            };

            // htmlWeb is not local to this method. Remove before adding so repeated
            // calls keep a single copy of the handler instead of stacking one per call.
            htmlWeb.PreRequest -= handler;
            htmlWeb.PreRequest += handler;

            htmlWeb.OverrideEncoding = Encoding.GetEncoding("UTF-8");
            response = htmlWeb.Load(url);

            var fuli = response.DocumentNode.SelectNodes("/html/body/div[3]/div[1]/div[1]/div");
            var jiben = response.DocumentNode.SelectNodes("/html/body/div[4]/div[1]/ul");
            var miaoshu = response.DocumentNode.SelectNodes("/html/body/div[4]/div[1]/div[1]/div/div[1]");

            // This site also drops sections whose text is blank.
            ulS += FormatFirstNodeSection("<h3>福利诱惑:</h3>", fuli, true);
            ulS += FormatFirstNodeSection("<h3>基本信息:</h3>", jiben, true);
            ulS += FormatFirstNodeSection("<h3>职位描述:</h3>", miaoshu, true);
            break;

        case DataType.猎聘网:
            htmlWeb.OverrideEncoding = Encoding.GetEncoding("UTF-8");
            response = htmlWeb.Load(url);

            // Each section is looked up under one container id, then under the other.
            // Basic information
            var jbinfo = response.DocumentNode.SelectNodes("//*[@id='job-view-enterprise']/div[1]/div[1]/div[1]/div[3]/div")
                         ?? response.DocumentNode.SelectNodes("//*[@id='job-hunter']/div[1]/div[1]/div[1]/div[3]/div");
            // Job description
            var selectNodes = response.DocumentNode.SelectNodes("//*[@id='job-hunter']/div[1]/div[1]/div[1]/div[4]")
                              ?? response.DocumentNode.SelectNodes("//*[@id='job-view-enterprise']/div[1]/div[1]/div[1]/div[4]");
            // Job requirements
            var ganwei = response.DocumentNode.SelectNodes("//*[@id='job-hunter']/div[1]/div[1]/div[1]/div[5]/div")
                         ?? response.DocumentNode.SelectNodes("//*[@id='job-view-enterprise']/div[1]/div[1]/div[1]/div[5]/div");

            ulS = FormatFirstNodeSection("<h3>基本信息:</h3>", jbinfo, false)
                  + FormatFirstNodeSection("<h3>职位描述:</h3>", selectNodes, false)
                  + FormatFirstNodeSection("<h3>岗位要求:</h3>", ganwei, false);
            break;

        case DataType.前程无忧:
            htmlWeb.OverrideEncoding = Encoding.GetEncoding("GBK");
            response = htmlWeb.Load(url);

            ulS = FormatFirstNodeSection(
                      "<h3>基本信息:</h3>",
                      response.DocumentNode.SelectNodes("/html/body/div[3]/div/div[2]/table[1]/tr[3]/td[1]"),
                      false)
                  + FormatFirstNodeSection(
                      "<h3>职位描述:</h3>",
                      response.DocumentNode.SelectNodes("/html/body/div[3]/div/div[2]/div[1]/div[2]/div/table"),
                      false);
            break;

        case DataType.拉勾网:
            htmlWeb.OverrideEncoding = Encoding.GetEncoding("UTF-8");
            response = htmlWeb.Load(url);

            ulS = FormatFirstNodeSection(
                      "<h3>基本信息:</h3>",
                      response.DocumentNode.SelectNodes("//*[@id='container']/div[1]/div[1]/dl/dd[1]"),
                      false)
                  + FormatFirstNodeSection(
                      "<h3>职位描述:</h3>",
                      response.DocumentNode.SelectNodes("//*[@id='container']/div[1]/div[1]/dl/dd[2]"),
                      false);
            break;
    }

    return ulS.ToJson();
}

/// <summary>
/// Returns <paramref name="header"/> followed by the inner text of the first node in
/// <paramref name="nodes"/>, or an empty string when that node is unavailable.
/// </summary>
/// <param name="header">Markup placed in front of the node text.</param>
/// <param name="nodes">Result of an XPath query; null when the query matched nothing.</param>
/// <param name="skipBlank">When true, a node whose trimmed text is empty also yields an empty string.</param>
/// <returns>The formatted section, or an empty string.</returns>
private static string FormatFirstNodeSection(string header, HtmlAgilityPack.HtmlNodeCollection nodes, bool skipBlank)
{
    // SelectNodes reports "no match" as null rather than as an empty collection.
    if (nodes == null || nodes.Count < 1)
    {
        return string.Empty;
    }

    var text = nodes[0].InnerText;
    if (skipBlank && string.IsNullOrEmpty(text.Trim()))
    {
        return string.Empty;
    }

    return header + text;
}