/// <summary>
/// Sets <c>IsLastPage</c> on the parse result by inspecting the paging navigation in the HTML.
/// </summary>
/// <param name="parsedHtml">The parsed HTML document to search for paging elements.</param>
/// <param name="parseResult">The result whose <c>IsLastPage</c> property is updated.</param>
private void ParseIsLastPage(HtmlDocument parsedHtml, JobsParseResult parseResult)
{
    // Assume this is the last page unless proven otherwise, so that a consumer
    // looping until the last page can never loop forever.
    parseResult.IsLastPage = true;

    var pageSpans = parsedHtml.DocumentNode.SelectNodes("//span[@class='Lst-NavPage']");
    if (pageSpans == null)
    {
        return;
    }

    var passedCurrentPage = false;
    foreach (var pageSpan in pageSpans)
    {
        var isLinked = pageSpan.SelectSingleNode("./a") != null;
        if (!isLinked)
        {
            // The entry without a link is treated as the page being displayed.
            passedCurrentPage = true;
        }
        else if (passedCurrentPage)
        {
            // A linked page follows the current one, so there are more results to fetch.
            parseResult.IsLastPage = false;
            return;
        }
    }
}
/// <summary>
/// Reads one job from each results-table row and appends it to <paramref name="jobs"/>.
/// </summary>
/// <remarks>
/// Rows are located through the link inside the cell with <c>headers='th1'</c>. The job id is taken
/// from the <c>nPostingTargetId</c> query string parameter of that link, and the remaining fields
/// come from the sibling cells <c>th2</c> (organisation), <c>th3</c> (location), <c>th4</c> (salary)
/// and <c>th5</c> (closing date, parsed using the en-GB culture).
/// </remarks>
/// <param name="parsedHtml">The parsed HTML document containing the results table.</param>
/// <param name="jobs">The result to which each parsed job is added.</param>
/// <returns>A task that completes once every row has been processed.</returns>
private async Task ParseJobs(HtmlDocument parsedHtml, JobsParseResult jobs)
{
    var titleLinks = parsedHtml.DocumentNode.SelectNodes("//td[@headers='th1']/a");
    if (titleLinks == null)
    {
        return;
    }

    // The href is resolved against a placeholder base only so that its query string can be read.
    var baseUri = new Uri("http://example.org");

    foreach (var titleLink in titleLinks)
    {
        var job = new Job();

        // The link sits inside a cell, so its grandparent is the node holding the other cells.
        var row = titleLink.ParentNode.ParentNode;

        var href = HttpUtility.HtmlDecode(titleLink.Attributes["href"].Value);
        var queryString = HttpUtility.ParseQueryString(new Uri(baseUri, href).Query);
        job.Id = Int32.Parse(queryString["nPostingTargetId"], CultureInfo.InvariantCulture);
        job.JobTitle = HttpUtility.HtmlDecode(titleLink.InnerText);

        var organisationCell = row.SelectSingleNode("./td[@headers='th2']");
        job.Organisation = HttpUtility.HtmlDecode(organisationCell?.InnerText?.Trim());

        var locationCell = row.SelectSingleNode("./td[@headers='th3']");
        job.Locations.Add(HttpUtility.HtmlDecode(locationCell?.InnerText?.Trim()));

        var salaryCell = row.SelectSingleNode("./td[@headers='th4']");
        var salaryText = HttpUtility.HtmlDecode(salaryCell?.InnerText?.Trim());
        job.Salary = await _salaryParser.ParseSalary(salaryText);
        job.Salary.SearchRange = job.Salary.SalaryRange;

        var closingDateCell = row.SelectSingleNode("./td[@headers='th5']");
        var closingDateText = closingDateCell?.InnerText?.Trim();
        job.ClosingDate = DateTime.Parse(closingDateText, new CultureInfo("en-GB"));

        jobs.Jobs.Add(job);
    }
}
/// <summary>
/// Parses jobs from the HTML stream.
/// </summary>
/// <param name="htmlStream">The HTML stream.</param>
/// <returns>
/// A result containing the jobs found in the HTML and a flag saying whether it was the last page of results.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="htmlStream"/> is <c>null</c>.</exception>
public JobsParseResult Parse(Stream htmlStream)
{
    if (htmlStream == null)
    {
        throw new ArgumentNullException(nameof(htmlStream));
    }

    var parsedHtml = new HtmlDocument();
    parsedHtml.Load(htmlStream);

    var parseResult = new JobsParseResult();
    ParseIsLastPage(parsedHtml, parseResult);

    // ParseJobs is asynchronous. Discarding its task would let this method return before the
    // jobs have been added and would leave any parsing exception unobserved, so wait for it.
    // Task.Run starts it on the thread pool so its continuations do not need the caller's
    // synchronization context, which this thread is blocking; GetAwaiter().GetResult()
    // rethrows the original exception rather than wrapping it in an AggregateException.
    Task.Run(() => ParseJobs(parsedHtml, parseResult)).GetAwaiter().GetResult();

    return parseResult;
}
/// <summary>
/// Parses jobs from an XML stream.
/// </summary>
/// <param name="stream">The stream containing the XML document, with <c>job</c> elements inside a <c>jobs</c> element under the root.</param>
/// <returns>A result containing one parsed job for each <c>job</c> element.</returns>
public async Task<JobsParseResult> Parse(Stream stream)
{
    await EnsureLookupValues();

    var document = XDocument.Load(stream);
    var result = new JobsParseResult();

    foreach (var jobElement in document.Root.Element("jobs").Elements("job"))
    {
        var job = await ParseJob(jobElement);
        result.Jobs.Add(job);
    }

    return result;
}