/// <summary>
/// Reads practice records from a UTF-8 text file that holds one record per line
/// with tab-separated columns.
/// </summary>
/// <param name="file">Path of the file to read.</param>
/// <returns>
/// One <see cref="PracticeInfo"/> per non-empty line, in file order. Each record is
/// populated by <see cref="SetPracticeInfoByString"/>.
/// </returns>
public List<PracticeInfo> getPracticeInfoFromFile(string file)
{
    string data;

    // The using block closes the reader even when ReadToEnd throws.
    using (StreamReader sr = new StreamReader(file, Encoding.UTF8))
    {
        data = sr.ReadToEnd();
    }

    List<PracticeInfo> list = new List<PracticeInfo>();

    // Split into lines, then split each line into its columns.
    foreach (string rawLine in data.Split(new char[] { '\n' }))
    {
        // Files with CRLF line endings leave a '\r' behind after splitting on '\n';
        // strip it so it does not end up inside the last column.
        string line = rawLine.TrimEnd('\r');

        // Skip empty lines instead of unconditionally dropping the last split piece:
        // this ignores the empty piece after a trailing newline, but still keeps the
        // final record when the file does not end with a newline.
        if (line.Length == 0)
        {
            continue;
        }

        PracticeInfo tempPrac = new PracticeInfo();
        SetPracticeInfoByString(tempPrac, line.Split(new char[] { '\t' }));
        list.Add(tempPrac);
    }

    return list;
}
/// <summary>
/// Copies the columns of one already-split record into a <see cref="PracticeInfo"/>.
/// </summary>
/// <param name="prf">Object whose members are overwritten.</param>
/// <param name="contentlist">Column values of the record; indexes 0 through 6 are read.</param>
public void SetPracticeInfoByString(PracticeInfo prf, String[] contentlist)
{
    // Position of each value inside a record.
    const int NameColumn = 0;
    const int RequestColumn = 1;
    const int DutyColumn = 2;
    const int LocationColumn = 3;
    const int CompanyNameColumn = 4;
    const int CompanyInfoColumn = 5;
    const int CompanyWebColumn = 6;

    // Assigned in ascending column order.
    prf.name = contentlist[NameColumn];
    prf.request = contentlist[RequestColumn];
    prf.duty = contentlist[DutyColumn];
    prf.location = contentlist[LocationColumn];
    prf.compname = contentlist[CompanyNameColumn];
    prf.compinfo = contentlist[CompanyInfoColumn];
    prf.compweb = contentlist[CompanyWebColumn];
}
// Matches the text found between a '>' and the next '<'.
// Built once and reused instead of being re-created for every posting.
private static readonly Regex TagInnerTextRegex = new Regex("(?<=>).*?(?=<)", RegexOptions.None);

/// <summary>
/// Returns the concatenation of every piece of text that sits between a '&gt;' and the
/// following '&lt;' in <paramref name="html"/>.
/// </summary>
/// <param name="html">HTML fragment to strip.</param>
/// <returns>The text pieces joined together, with newlines and tabs removed.</returns>
private static string ExtractTagInnerText(string html)
{
    // Newlines are removed because '.' in the pattern does not match '\n'.
    // Tabs are removed because the original comments say tabs interfere with
    // splitting the data on output.
    string flattened = html.Replace("\n", "").Replace("\t", "");

    StringBuilder text = new StringBuilder();
    foreach (Match found in TagInnerTextRegex.Matches(flattened))
    {
        text.Append(found.Value);
    }
    return text.ToString();
}

/// <summary>
/// Downloads each posting page and extracts its details into a list.
/// </summary>
/// <param name="ID">Job ids; one page is fetched per entry.</param>
/// <param name="ComID">Company ids; <c>ComID[i]</c> is paired with <c>ID[i]</c> to build the URL.</param>
/// <returns>One <see cref="PracticeInfo"/> per entry of <paramref name="ID"/>, in the same order.</returns>
public List<PracticeInfo> getPracticeInfo(List<int> ID, List<int> ComID)
{
    const string ListEnd = "</dl>";
    const string ParagraphEnd = "</p>";

    List<PracticeInfo> practiceInfo = new List<PracticeInfo>();
    for (int i = 0; i < ID.Count; i++)
    {
        PracticeInfo tempInfo = new PracticeInfo();
        string url = "https://www.nowcoder.com/recommend-intern/" + ComID[i] + "?jobId=" + ID[i];
        string content = GetContent(url);

        // Clear the HTML space placeholder: both the "&nbsp;" entity and the literal
        // non-breaking space character become a plain space.
        content = content.Replace("&nbsp;", " ").Replace("\u00A0", " ");

        // The page is consumed front to back: every step cuts "content" down to start
        // at (or near) the next marker, so the order of the steps matters.
        // All searches are ordinal because the markers are fixed markup, not linguistic text.
        // NOTE(review): the numeric offsets presumably skip from the marker to the start
        // of the wanted text in the page markup - verify against the current page layout.

        // Job title.
        content = content.Substring(content.IndexOf("rec-job", StringComparison.Ordinal) + 14);
        tempInfo.name = Insurance(content.Substring(0, content.IndexOf("<", StringComparison.Ordinal)));

        // Job duties: everything from the heading through the closing </dl>.
        content = content.Substring(content.IndexOf("岗位职责", StringComparison.Ordinal));
        string dutycontent = content.Substring(0, content.IndexOf(ListEnd, StringComparison.Ordinal) + ListEnd.Length);
        tempInfo.duty = Insurance(ExtractTagInnerText(dutycontent));

        // Job requirements: same shape as the duties section. The slice ends exactly
        // after </dl>; taking one extra character added nothing to the extracted text
        // and threw when </dl> was the last thing in the content.
        content = content.Substring(content.IndexOf("岗位要求", StringComparison.Ordinal));
        string reqcontent = content.Substring(0, content.IndexOf(ListEnd, StringComparison.Ordinal) + ListEnd.Length);
        tempInfo.request = Insurance(ExtractTagInnerText(reqcontent));

        // Company name.
        content = content.Substring(content.IndexOf("teacher-name", StringComparison.Ordinal) + 14);
        tempInfo.compname = Insurance(content.Substring(0, content.IndexOf("<", StringComparison.Ordinal)));

        // Location.
        content = content.Substring(content.IndexOf("com-lbs", StringComparison.Ordinal) + 9);
        tempInfo.location = Insurance(content.Substring(0, content.IndexOf("<", StringComparison.Ordinal)));

        // Company profile: starts 12 characters before the marker and runs through </p>.
        content = content.Substring(content.IndexOf("com-detail", StringComparison.Ordinal) - 12);
        string detailcontent = content.Substring(0, content.IndexOf(ParagraphEnd, StringComparison.Ordinal) + ParagraphEnd.Length);
        tempInfo.compinfo = Insurance(ExtractTagInnerText(detailcontent));

        // Company website: from the next "http" up to the following double quote.
        content = content.Substring(content.IndexOf("http", StringComparison.Ordinal));
        tempInfo.compweb = Insurance(content.Substring(0, content.IndexOf("\"", StringComparison.Ordinal)));

        practiceInfo.Add(tempInfo);
    }
    return practiceInfo;
}