/// <summary>
/// Builds the search query string from the work address, keyword and salary members,
/// downloads the first result page and every following page, passes each page to
/// <c>GetJobInfoFromPage</c>, and finally invokes <c>GetJobEnd(null, null)</c> when it is non-null.
/// </summary>
/// <remarks>
/// NOTE(review): this method overwrites <c>workAddress</c>, <c>keyWord</c> and <c>url</c> with their
/// encoded / extended values, so a second call on the same instance would encode and append again.
/// Confirm the instance is single-use.
/// </remarks>
/// <exception cref="Exception">
/// Thrown for any failure; carries the original message and the original exception as InnerException.
/// </exception>
public void GetJobInfoList()
{
    try
    {
        StringBuilder condition = new StringBuilder();

        // Work address is always sent.
        workAddress = HttpUtility.UrlEncode(workAddress, Encoding.UTF8);
        condition.Append("jl=").Append(workAddress);

        // Keyword is optional.
        if (!string.IsNullOrEmpty(keyWord))
        {
            keyWord = HttpUtility.UrlEncode(keyWord, Encoding.UTF8);
            condition.Append("&kw=").Append(keyWord);
        }

        condition.Append("&sm=1");

        // NOTE(review): upperSalary is sent as "sf" and lowerSalary as "st" - confirm this mapping
        // against the site's query parameters.
        if (!string.IsNullOrEmpty(upperSalary))
        {
            condition.Append("&sf=").Append(upperSalary);
        }
        if (!string.IsNullOrEmpty(lowerSalary))
        {
            condition.Append("&st=").Append(lowerSalary);
        }

        url = url + condition.ToString();

        // First page: parse it and read the total page count from it.
        string html = GetHtmlCode.GetByget(url, "utf-8");
        GetJobInfoFromPage(html);

        // The page count is the run of digits that follows the enter2Page(this,event, handler text.
        Regex pageCountRegex = new Regex("(?<=onkeypress=\"zlapply.searchjob.enter2Page\\(this,event,)\\d+");
        int pageCount;
        if (!int.TryParse(pageCountRegex.Match(html).Value, out pageCount))
        {
            // No page count found: only the first page is processed.
            pageCount = 0;
        }

        for (int page = 2; page <= pageCount; page++)
        {
            html = GetHtmlCode.GetByget(url + "&p=" + page, "utf-8");
            GetJobInfoFromPage(html);
        }

        if (GetJobEnd != null)
        {
            GetJobEnd(null, null);
        }
    }
    catch (Exception exMsg)
    {
        // Keep the thrown type and message callers already see, but preserve the original
        // exception (type and stack trace) as InnerException instead of discarding it.
        throw new Exception(exMsg.Message, exMsg);
    }
}
/// <summary>
/// Builds the search query string from <c>workAddressId</c> and the optional keyword, downloads the
/// first result page and every following page, passes each page to <c>GetJobInfoFromPage</c>,
/// and finally invokes <c>GetJobEnd(null, null)</c> when it is non-null.
/// </summary>
/// <remarks>
/// NOTE(review): this method overwrites <c>keyWord</c> and <c>url</c> with their encoded / extended
/// values, so a second call on the same instance would encode and append again. Confirm single-use.
/// </remarks>
/// <exception cref="Exception">
/// Thrown for any failure; carries the original message and the original exception as InnerException.
/// </exception>
public void GetJobInfoList()
{
    try
    {
        StringBuilder condition = new StringBuilder();
        condition.Append("jobarea=").Append(workAddressId);

        // Keyword is optional; it is URL-encoded as gb2312, matching the page encoding used below.
        if (!string.IsNullOrEmpty(keyWord))
        {
            keyWord = System.Web.HttpUtility.UrlEncode(keyWord, Encoding.GetEncoding("gb2312"));
            condition.Append("&keyword=").Append(keyWord);
        }

        condition.Append("&keywordtype=2");
        url = url + condition.ToString();

        // First page: parse it and derive the number of pages from it.
        string html = GetHtmlCode.GetByget(url, "gb2312");
        GetJobInfoFromPage(html);

        // The jobid_count value is rounded up to whole pages of 30.
        // TryParse instead of Parse: when the field is missing (no match) only the first page is
        // processed, rather than failing with a FormatException after that page was already handled.
        Regex jobCountRegex = new Regex("(?<=name=\"jobid_count\"\\s*?value=\")\\d+(?=\">)");
        int jobCount;
        int pageCount = int.TryParse(jobCountRegex.Match(html).Value, out jobCount)
            ? (jobCount + 29) / 30
            : 0;

        for (int page = 2; page <= pageCount; page++)
        {
            html = GetHtmlCode.GetByget(url + "&curr_page=" + page, "gb2312");
            GetJobInfoFromPage(html);
        }

        if (GetJobEnd != null)
        {
            GetJobEnd(null, null);
        }
    }
    catch (Exception exMsg)
    {
        // Keep the thrown type and message callers already see, but preserve the original
        // exception (type and stack trace) as InnerException instead of discarding it.
        throw new Exception(exMsg.Message, exMsg);
    }
}
/// <summary>
/// Builds the search query string from <c>workAddressId</c> and the optional keyword, then downloads
/// up to 100 result pages and passes each one to <c>GetJobInfoFromPage</c>. Paging stops at the
/// first page whose download comes back null or empty.
/// </summary>
/// <remarks>
/// NOTE(review): this method overwrites <c>keyWord</c> with its URL-encoded value, so a second call
/// on the same instance would encode it again. Confirm single-use.
/// </remarks>
/// <exception cref="Exception">
/// Thrown for any failure; carries the original message and the original exception as InnerException.
/// </exception>
public void GetJobInfoList()
{
    try
    {
        StringBuilder condition = new StringBuilder();
        condition.AppendFormat("dqs={0}", workAddressId); // region
        condition.Append("&searchField=3");               // industry

        if (!string.IsNullOrEmpty(keyWord))
        {
            keyWord = HttpUtility.UrlEncode(keyWord, Encoding.UTF8);
            condition.Append("&key=").Append(keyWord);    // job-title keyword
        }

        condition.Append("&pubTime=30");                  // publication time

        // The query does not change between pages, so build it once.
        string firstPageUrl = url + condition.ToString();

        for (int page = 0; page < 100; page++)
        {
            // The first request carries no curPage parameter; later ones pass the loop index.
            string pageUrl = page > 0 ? firstPageUrl + "&curPage=" + page : firstPageUrl;

            string html = GetHtmlCode.GetByget(pageUrl, "utf-8");
            if (string.IsNullOrEmpty(html))
            {
                break;
            }

            GetJobInfoFromPage(html);
        }
    }
    catch (Exception exMsg)
    {
        // Keep the thrown type and message callers already see, but preserve the original
        // exception (type and stack trace) as InnerException instead of discarding it.
        throw new Exception(exMsg.Message, exMsg);
    }
}
/// <summary>
/// Downloads one job detail page (decoded as gb2312), strips it down to a whitespace-free body,
/// extracts the job fields with regular expressions into a <c>JobInfo</c>, and passes the raw page
/// text together with that object to <c>GetJobEnd</c> when it is non-null.
/// </summary>
/// <param name="url">Address of the job detail page; also stored in <c>JobInfo.Url</c>.</param>
/// <remarks>
/// Returns without invoking <c>GetJobEnd</c> when the download is empty, or when <c>mustKey</c> is
/// set and the filtered body does not contain it. A field whose pattern does not match is set to "".
/// </remarks>
/// <exception cref="Exception">
/// Thrown for any failure; carries the original message and the original exception as InnerException.
/// </exception>
private void GetJobInfoFromUrl(string url)
{
    try
    {
        JobInfo info = new JobInfo();

        string pageStr = GetHtmlCode.GetByget(url, "gb2312");
        if (string.IsNullOrEmpty(pageStr))
        {
            return;
        }

        // Drop line breaks so the patterns below can match across source lines.
        pageStr = pageStr.Replace("\r\n", "");

        // Keep only the body element.
        string body = string.Empty;
        Match bodyMatch = Regex.Match(pageStr, @"(?is)<body.*?</body>");
        if (bodyMatch.Success)
        {
            body = bodyMatch.Value.Replace("<tr >", "<tr>");
        }

        // Apply the configured pattern/replacement pairs (styles, scripts and other unrelated tags).
        foreach (var filter in Filters)
        {
            body = Regex.Replace(body, filter[0], filter[1]);
        }

        // Skip pages that do not contain the mandatory keyword.
        if (!string.IsNullOrEmpty(mustKey) && !body.Contains(mustKey))
        {
            return;
        }

        // From here on the body contains no whitespace at all, which is why the patterns
        // below are written without spaces (e.g. "<tdclass=").
        body = Regex.Replace(body, "\\s", "");

        info.Url = url;

        // Job title: the cell is tried with colspan=2 first, then colspan=3.
        string position = Regex.Match(body, "<tdclass=\"sr_bt\"colspan=\"2\">(.*?)</td>").Value;
        if (string.IsNullOrEmpty(position))
        {
            position = Regex.Match(body, "<tdclass=\"sr_bt\"colspan=\"3\">(.*?)</td>").Value;
        }
        info.Position = ExtractInner(position, ">", "</", false);

        // Company name.
        string company = Regex.Match(body, ".html\">(.*?)</a>").Value;
        info.Company = ExtractInner(company, ">", "</a>", false);

        // Work location.
        string address = Regex.Match(body, "工作地点:</td><tdclass=\"txt_2\">(.*?)</td>").Value;
        info.Address = ExtractInner(address, "\">", "</td>", true);

        // Company nature, falling back to the company industry line.
        // The value used to be cut out with a hard-coded offset of 26 and the pattern required a
        // whitespace separator, which can never occur in the whitespace-stripped body. The only
        // separator that can survive the stripping is the literal &nbsp; entity, so any number of
        // those is accepted and the value is taken from the capture group instead of a fixed offset.
        // NOTE(review): confirm against a live page that the separator is indeed &nbsp;.
        Match natureMatch = Regex.Match(body, "公司性质:</strong>(?:&nbsp;)*(.*?)<br><br><strong>");
        if (!natureMatch.Success)
        {
            natureMatch = Regex.Match(body, "公司行业:</strong>(?:&nbsp;)*(.*?)<br><br><strong>");
        }
        string nature = natureMatch.Groups[1].Value; // "" when neither pattern matched
        int firstBreak = nature.IndexOf("<br>", StringComparison.Ordinal);
        info.Nature = firstBreak >= 0 ? nature.Substring(0, firstBreak) : nature;

        // Company size (same separator handling as above).
        info.Scale = Regex.Match(body, "公司规模:</strong>(?:&nbsp;)*(.*?)</td>").Groups[1].Value;

        // Work experience.
        string experience = Regex.Match(body, "工作年限:</td><tdclass=\"txt_2\">(.*?)</td>").Value;
        info.Experience = ExtractInner(experience, "\">", "</td>", true);

        // Education; the label may carry &nbsp; padding between its two characters.
        string education = Regex.Match(body, "学(?:&nbsp;)*历:</td><tdclass=\"txt_2\">(.*?)</td>").Value;
        info.Education = ExtractInner(education, "\">", "</td>", true);

        // Salary range.
        string salary = Regex.Match(body, "薪水范围:</td><tdclass=\"txt_2\">(.*?)</td>").Value;
        info.Salary = ExtractInner(salary, "\">", "</td>", true);

        // Publication date.
        string time = Regex.Match(body, "发布日期:</td><tdclass=\"txt_2\">(.*?)</td>").Value;
        info.Time = ExtractInner(time, "\">", "</td>", true);

        if (GetJobEnd != null)
        {
            GetJobEnd(pageStr, info);
        }
    }
    catch (Exception exMsg)
    {
        // Keep the thrown type and message callers already see, but preserve the original
        // exception (type and stack trace) as InnerException instead of discarding it.
        throw new Exception(exMsg.Message, exMsg);
    }
}

/// <summary>
/// Returns the text of <paramref name="matched"/> that lies after the first occurrence of
/// <paramref name="startToken"/> and before <paramref name="endToken"/>.
/// </summary>
/// <param name="matched">Matched markup fragment; null or empty yields "".</param>
/// <param name="startToken">Token whose first occurrence precedes the wanted text.</param>
/// <param name="endToken">Token that terminates the wanted text.</param>
/// <param name="useLastEnd">True to use the last occurrence of the end token, false for the first.</param>
/// <returns>The enclosed text, or "" when the tokens are missing or out of order.</returns>
private static string ExtractInner(string matched, string startToken, string endToken, bool useLastEnd)
{
    if (string.IsNullOrEmpty(matched))
    {
        return "";
    }

    int startIndex = matched.IndexOf(startToken, StringComparison.Ordinal);
    int endIndex = useLastEnd
        ? matched.LastIndexOf(endToken, StringComparison.Ordinal)
        : matched.IndexOf(endToken, StringComparison.Ordinal);
    if (startIndex < 0)
    {
        return "";
    }

    int contentStart = startIndex + startToken.Length;
    // Guard against an end token that precedes the start token instead of letting Substring throw.
    return endIndex >= contentStart ? matched.Substring(contentStart, endIndex - contentStart) : "";
}
/// <summary>
/// Downloads one job detail page (decoded as utf-8), strips it down to a whitespace-free body,
/// extracts the job fields with regular expressions into a <c>JobInfo</c>, and passes the raw page
/// text together with that object to <c>GetJobEnd</c> when it is non-null.
/// </summary>
/// <param name="url">Address of the job detail page; also stored in <c>JobInfo.Url</c>.</param>
/// <remarks>
/// Returns without invoking <c>GetJobEnd</c> when the download is empty, or when <c>mustKey</c> is
/// set and the filtered body does not contain it. A field whose pattern does not match is set to "".
/// </remarks>
/// <exception cref="Exception">
/// Thrown for any failure; carries the original message and the original exception as InnerException.
/// </exception>
private void GetJobInfoFromUrl(string url)
{
    try
    {
        JobInfo info = new JobInfo();

        string pageStr = GetHtmlCode.GetByget(url, "utf-8");
        if (string.IsNullOrEmpty(pageStr))
        {
            return;
        }

        // Drop line breaks so the patterns below can match across source lines.
        pageStr = pageStr.Replace("\r\n", "");

        // Keep only the body element.
        string body = string.Empty;
        Match bodyMatch = Regex.Match(pageStr, @"(?is)<body.*?</body>");
        if (bodyMatch.Success)
        {
            body = bodyMatch.Value.Replace("<tr >", "<tr>");
        }

        // Apply the configured pattern/replacement pairs (styles, scripts and other unrelated tags).
        foreach (var filter in Filters)
        {
            body = Regex.Replace(body, filter[0], filter[1]);
        }

        // Skip pages that do not contain the mandatory keyword.
        if (!string.IsNullOrEmpty(mustKey) && !body.Contains(mustKey))
        {
            return;
        }

        // From here on the body contains no whitespace at all, which is why the patterns
        // below are written without spaces (e.g. "<h1title=").
        body = Regex.Replace(body, "\\s", "");

        info.Url = url;

        // Job title.
        string position = Regex.Match(body, "<h1title=([\\s\\S]+?)>(.*?)</h1>").Value;
        info.Position = SliceBetween(position, ">", "</", false);

        // Company name.
        string company = Regex.Match(body, "</h1><h3>(.*?)</h3>").Value;
        info.Company = SliceBetween(company, "<h3>", "</h3>", false);

        // Work location.
        string address = Regex.Match(body, "<divclass=\"resumeclearfix\"><span>(.*?)</span>").Value;
        info.Address = SliceBetween(address, "<span>", "</", false);

        // Company nature, with a second page layout as fallback.
        string nature = Regex.Match(body, "<li><span>企业性质:</span>(.*?)</li>").Value;
        info.Nature = SliceBetween(nature, "</span>", "</li>", false);
        if (string.IsNullOrEmpty(info.Nature))
        {
            string natureAlt = Regex.Match(body, "<br><span>性质:</span>(.*?)<br>").Value;
            info.Nature = SliceBetween(natureAlt, "</span>", "<br>", true);
        }

        // Company size, with a second page layout as fallback.
        string scale = Regex.Match(body, "<li><span>企业规模:</span>(.*?)</li>").Value;
        info.Scale = SliceBetween(scale, "</span>", "</li>", false);
        if (string.IsNullOrEmpty(info.Scale))
        {
            string scaleAlt = Regex.Match(body, "<br><span>规模:</span>(.*?)<br>").Value;
            // Only Scale is assigned here: the earlier chained assignment to info.Nature
            // overwrote the company nature extracted above with the size text.
            info.Scale = SliceBetween(scaleAlt, "</span>", "<br>", true);
        }

        // Work experience.
        string experience = Regex.Match(body, "<spanclass=\"noborder\">(.*?)</span>").Value;
        info.Experience = SliceBetween(experience, ">", "</", false);

        // Minimum education.
        string education = Regex.Match(body, "</span><span>(.*?)</span><spanclass=\"noborder\">").Value;
        info.Education = SliceBetween(education, "<span>", "</span><spanclass=", false);

        // Monthly salary.
        string salary = Regex.Match(body, "<pclass=\"job-main-title\">(.*?)<").Value;
        info.Salary = SliceBetween(salary, ">", "<", true);

        // Publication time.
        string time = Regex.Match(body, "<pclass=\"release-time\">发布时间:<em>(.*?)</em></p>").Value;
        info.Time = SliceBetween(time, "<em>", "</em>", false);

        if (GetJobEnd != null)
        {
            GetJobEnd(pageStr, info);
        }
    }
    catch (Exception exMsg)
    {
        // Keep the thrown type and message callers already see, but preserve the original
        // exception (type and stack trace) as InnerException instead of discarding it.
        throw new Exception(exMsg.Message, exMsg);
    }
}

/// <summary>
/// Returns the text of <paramref name="matched"/> that lies after the first occurrence of
/// <paramref name="startToken"/> and before <paramref name="endToken"/>.
/// </summary>
/// <param name="matched">Matched markup fragment; null or empty yields "".</param>
/// <param name="startToken">Token whose first occurrence precedes the wanted text.</param>
/// <param name="endToken">Token that terminates the wanted text.</param>
/// <param name="useLastEnd">True to use the last occurrence of the end token, false for the first.</param>
/// <returns>The enclosed text, or "" when the tokens are missing or out of order.</returns>
private static string SliceBetween(string matched, string startToken, string endToken, bool useLastEnd)
{
    if (string.IsNullOrEmpty(matched))
    {
        return "";
    }

    int startIndex = matched.IndexOf(startToken, StringComparison.Ordinal);
    int endIndex = useLastEnd
        ? matched.LastIndexOf(endToken, StringComparison.Ordinal)
        : matched.IndexOf(endToken, StringComparison.Ordinal);
    if (startIndex < 0)
    {
        return "";
    }

    int contentStart = startIndex + startToken.Length;
    // Guard against an end token that precedes the start token instead of letting Substring throw.
    return endIndex >= contentStart ? matched.Substring(contentStart, endIndex - contentStart) : "";
}