/// <summary>
/// Builds a <see cref="Job"/> from one job element of the XML feed.
/// </summary>
/// <param name="jobXml">The XML element holding the job's data.</param>
/// <returns>The parsed job.</returns>
private async Task<Job> ParseJob(XElement jobXml)
{
    // Dates are read positionally (characters 0-3 year, 5-6 month, 8-9 day);
    // anything after the tenth character is ignored.
    var openDate = jobXml.Element("open_date").Value;
    var expiryDate = jobXml.Element("expiry_date").Value;

    var job = new Job()
    {
        Id = Int32.Parse(jobXml.Element("job_id").Value.Trim(), CultureInfo.InvariantCulture),
        // HtmlDecode returns null when the element is missing, so Trim has to be null-conditional too.
        Reference = HttpUtility.HtmlDecode(jobXml.Element("reference")?.Value)?.Trim(),
        NumberOfPositions = Int32.Parse(jobXml.Element("no_of_positions").Value, CultureInfo.InvariantCulture),
        JobTitle = HttpUtility.HtmlDecode(jobXml.Element("job_title").Value).Trim(),
        Department = HttpUtility.HtmlDecode(jobXml.Element("business_unit").Value).Trim().Replace("ESCC: ", string.Empty),
        Organisation = HttpUtility.HtmlDecode(jobXml.Element("region").Value).Trim(),
        DatePublished = new DateTime(
            Int32.Parse(openDate.Substring(0, 4), CultureInfo.InvariantCulture),
            Int32.Parse(openDate.Substring(5, 2), CultureInfo.InvariantCulture),
            Int32.Parse(openDate.Substring(8, 2), CultureInfo.InvariantCulture)),
        ClosingDate = new DateTime(
            Int32.Parse(expiryDate.Substring(0, 4), CultureInfo.InvariantCulture),
            Int32.Parse(expiryDate.Substring(5, 2), CultureInfo.InvariantCulture),
            Int32.Parse(expiryDate.Substring(8, 2), CultureInfo.InvariantCulture))
    };

    if (String.Equals(job.Department, "PARTNERSHIP", StringComparison.OrdinalIgnoreCase))
    {
        // For partnership jobs the organisation is expected in trailing brackets on the job title,
        // eg "Job title (Organisation)". Move it out of the title when it is there.
        var orgStarts = job.JobTitle.LastIndexOf('(');
        if (orgStarts >= 0 && job.JobTitle.EndsWith(")", StringComparison.Ordinal))
        {
            job.Organisation = job.JobTitle.Substring(orgStarts + 1).TrimEnd(')');
            job.JobTitle = job.JobTitle.Substring(0, orgStarts).TrimEnd();
        }
        else
        {
            job.Organisation = job.Department;
        }

        job.Department = string.Empty;
    }

    job.Locations.Add(HttpUtility.HtmlDecode(jobXml.Element("location_city").Value).Trim());

    var logo = jobXml.Element("media")?.Element("logo")?.Element("url")?.Value;
    if (!String.IsNullOrEmpty(logo))
    {
        // Ignore width and height from TribePad as they can be wrong!
        job.LogoUrl = new Uri(logo);
    }

    // The advert body is the concatenation of three optional feed fields.
    var advertHtml = new StringBuilder();
    advertHtml.Append(HttpUtility.HtmlDecode(jobXml.Element("package_description")?.Value));
    advertHtml.Append(HttpUtility.HtmlDecode(jobXml.Element("summary_external")?.Value));
    advertHtml.Append(HttpUtility.HtmlDecode(jobXml.Element("main_responsibilities")?.Value));
    job.AdvertHtml = new HtmlString(advertHtml.ToString());

    job.AdditionalInformationHtml = new HtmlString(HttpUtility.HtmlDecode(jobXml.Element("ideal_candidate")?.Value));
    job.EqualOpportunitiesHtml = new HtmlString(HttpUtility.HtmlDecode(jobXml.Element("about_company")?.Value));

    // category_name is optional; without the null-conditional a missing element throws.
    job.JobType = HttpUtility.HtmlDecode(jobXml.Element("category_name")?.Value)?.Replace(" & ", " and ");

    if (jobXml.Element("no_apply")?.Value == "0")
    {
        // _applyUrl is used as a format string with the job id as its only argument.
        job.ApplyUrl = new Uri(String.Format(CultureInfo.InvariantCulture, _applyUrl.ToString(), job.Id), UriKind.RelativeOrAbsolute);
    }

    // Serialise the element once and hand the same text to both parsers.
    var rawXml = jobXml.ToString();
    job.Salary = await _salaryParser.ParseSalary(rawXml);
    job.WorkPattern = await _workPatternParser.ParseWorkPattern(rawXml);

    var contractTypeId = jobXml.Element("job_type")?.Value;
    if (!String.IsNullOrEmpty(contractTypeId))
    {
        // The feed supplies a lookup id; translate it to display text when the lookup list is available.
        job.ContractType = _contractTypes?.SingleOrDefault(x => x.LookupValueId == contractTypeId)?.Text;
    }

    return job;
}
/// <summary>
/// Builds a <see cref="Job"/> from one job element of the XML feed.
/// </summary>
/// <param name="jobXml">The XML element holding the job's data.</param>
/// <returns>The parsed job.</returns>
private async Task<Job> ParseJob(XElement jobXml)
{
    // Dates are read positionally (characters 0-3 year, 5-6 month, 8-9 day);
    // anything after the tenth character is ignored.
    var openDate = jobXml.Element("open_date").Value;

    var job = new Job()
    {
        Id = Int32.Parse(jobXml.Element("job_id").Value.Trim(), CultureInfo.InvariantCulture),
        // HtmlDecode returns null when the element is missing, so Trim has to be null-conditional too.
        Reference = HttpUtility.HtmlDecode(jobXml.Element("reference")?.Value)?.Trim(),
        NumberOfPositions = Int32.Parse(jobXml.Element("no_of_positions").Value, CultureInfo.InvariantCulture),
        JobTitle = HttpUtility.HtmlDecode(jobXml.Element("job_title").Value).Trim(),
        Department = HttpUtility.HtmlDecode(jobXml.Element("business_unit").Value).Trim().Replace("ESCC: ", string.Empty),
        Organisation = HttpUtility.HtmlDecode(jobXml.Element("region").Value).Trim(),
        DatePublished = new DateTime(
            Int32.Parse(openDate.Substring(0, 4), CultureInfo.InvariantCulture),
            Int32.Parse(openDate.Substring(5, 2), CultureInfo.InvariantCulture),
            Int32.Parse(openDate.Substring(8, 2), CultureInfo.InvariantCulture))
    };

    // Jobs flagged evergreen="1" have no closing date, so expiry_date is only read for the others.
    var noClosingDate = jobXml.Element("evergreen")?.Value == "1";
    if (!noClosingDate)
    {
        var expiryDate = jobXml.Element("expiry_date").Value;
        job.ClosingDate = new DateTime(
            Int32.Parse(expiryDate.Substring(0, 4), CultureInfo.InvariantCulture),
            Int32.Parse(expiryDate.Substring(5, 2), CultureInfo.InvariantCulture),
            Int32.Parse(expiryDate.Substring(8, 2), CultureInfo.InvariantCulture));
    }

    var canApplyForThisJob = true;
    switch (job.Department.ToUpperInvariant())
    {
        case "PARTNERSHIP":
            canApplyForThisJob = false;
            if (!ParseAndMoveOrganisationFromJobTitle(job))
            {
                job.Organisation = job.Department;
            }
            job.Department = string.Empty;
            break;

        case "ESCC SCHOOLS":
        case "ESCC ACADEMIES":
            if (!ParseAndMoveOrganisationFromJobTitle(job))
            {
                // If it's a school job but the school name is not in the job title, it's unknown
                job.Organisation = string.Empty;
            }
            job.Department = string.Empty;
            canApplyForThisJob = false;
            break;

        case "CHILDREN SERVICES":
            job.Department = "Children's Services";
            break;
    }

    // Serialise the element once; the location, salary and work pattern parsers all take the raw XML text.
    var rawXml = jobXml.ToString();

    var locations = _locationParser.ParseLocations(rawXml);
    if (locations != null)
    {
        foreach (var location in locations)
        {
            job.Locations.Add(location);
        }
    }

    var logo = jobXml.Element("media")?.Element("logo")?.Element("url")?.Value;
    if (!String.IsNullOrEmpty(logo))
    {
        // Ignore width and height from TribePad as they can be wrong!
        job.LogoUrl = new Uri(logo);
    }

    // The advert body is the concatenation of three optional feed fields, followed by any attached files.
    var advertHtml = new StringBuilder();
    advertHtml.Append(HttpUtility.HtmlDecode(jobXml.Element("package_description")?.Value));
    advertHtml.Append(HttpUtility.HtmlDecode(jobXml.Element("summary_external")?.Value));
    advertHtml.Append(HttpUtility.HtmlDecode(jobXml.Element("main_responsibilities")?.Value));

    // Materialise once to avoid re-querying the XML, and skip the section entirely when
    // <files> is present but empty so no empty "Documents" heading is emitted.
    var files = jobXml.Element("files")?.Elements("file").ToList();
    if (files != null && files.Count > 0)
    {
        if (files.Count == 1)
        {
            advertHtml.Append("<h2>Documents</h2><p>").AppendLinkToFile(files[0], job.Id).Append("</p>");
        }
        else
        {
            advertHtml.Append("<h2>Documents</h2><ul>");
            foreach (var file in files)
            {
                advertHtml.Append("<li>").AppendLinkToFile(file, job.Id).Append("</li>");
            }
            advertHtml.Append("</ul>");
        }
    }
    job.AdvertHtml = new HtmlString(advertHtml.ToString());

    job.AdditionalInformationHtml = new HtmlString(HttpUtility.HtmlDecode(jobXml.Element("ideal_candidate")?.Value));
    job.EqualOpportunitiesHtml = new HtmlString(HttpUtility.HtmlDecode(jobXml.Element("about_company")?.Value));
    job.JobType = HttpUtility.HtmlDecode(jobXml.Element("category_name")?.Value)?.Replace(" & ", " and ");

    if (jobXml.Element("no_apply")?.Value == "0" && canApplyForThisJob)
    {
        // _applyUrl is used as a format string with the job id as its only argument.
        job.ApplyUrl = new Uri(String.Format(CultureInfo.InvariantCulture, _applyUrl.ToString(), job.Id), UriKind.RelativeOrAbsolute);
    }

    job.Salary = await _salaryParser.ParseSalary(rawXml).ConfigureAwait(false);
    job.WorkPattern = await _workPatternParser.ParseWorkPattern(rawXml).ConfigureAwait(false);

    var contractTypeId = jobXml.Element("job_type")?.Value;
    if (!String.IsNullOrEmpty(contractTypeId))
    {
        // The feed supplies a lookup id; translate it to display text when the lookup list is available.
        job.ContractType = _contractTypes?.SingleOrDefault(x => x.LookupValueId == contractTypeId)?.Text;
    }

    return job;
}
/// <summary>
/// Parses a job from the HTML of a TalentLink job page.
/// </summary>
/// <param name="sourceData">The source HTML for the job.</param>
/// <param name="jobId">The id of the job, attached to any error that is reported.</param>
/// <returns>
/// The parsed job, or <c>null</c> if the HTML has parse errors, has no document node,
/// or an exception is thrown while parsing (the exception is reported, not rethrown).
/// </returns>
public async Task<Job> ParseJob(string sourceData, string jobId)
{
    try
    {
        HtmlAgilityPack.HtmlDocument htmlDocument = new HtmlAgilityPack.HtmlDocument();
        htmlDocument.OptionFixNestedTags = true;
        htmlDocument.LoadHtml(sourceData);

        // ParseErrors holds any errors found by LoadHtml; report each one and give up on this job
        if (htmlDocument.ParseErrors != null && htmlDocument.ParseErrors.Any())
        {
            foreach (var error in htmlDocument.ParseErrors)
            {
                var exception = new HttpParseException("Unable to parse job HTML from TalentLink");
                exception.Data.Add("Job ID", jobId);
                exception.Data.Add("Type of error", error.Code);
                exception.Data.Add("Reason", error.Reason);
                exception.Data.Add("Line", error.Line);
                exception.Data.Add("Position on line", error.LinePosition);
                exception.Data.Add("Source HTML", error.SourceText);
                exception.ToExceptionless().Submit();
            }
            return null;
        }

        if (htmlDocument.DocumentNode == null)
        {
            return null;
        }

        var job = new Job();

        job.JobTitle = ParseValueFromElementById(htmlDocument, "h3", "JDText-Title");
        job.Reference = ParseValueFromElementById(htmlDocument, "span", "JDText-Param2");
        if (!String.IsNullOrEmpty(job.Reference))
        {
            // Strip the reference when it is repeated in brackets after the title
            job.JobTitle = job.JobTitle.Replace(" (" + job.Reference + ")", String.Empty);
        }

        job.Locations.Add(ParseValueFromElementById(htmlDocument, "span", "JDText-Param3"));
        job.Organisation = ParseValueFromElementById(htmlDocument, "span", "JDText-Param4");
        job.Department = ParseValueFromElementById(htmlDocument, "span", "JDText-Param5");
        job.ContractType = ParseValueFromElementById(htmlDocument, "span", "JDText-Param6");
        job.JobType = ParseValueFromElementById(htmlDocument, "span", "JDText-Param7");
        job.Salary = await _salaryParser.ParseSalary(htmlDocument.DocumentNode.OuterHtml);

        // Only record the closing date when it actually parses. On failure TryParse sets its
        // out value to DateTime.MinValue, which would otherwise be stored as a bogus date.
        DateTime closingDate;
        if (DateTime.TryParse(ParseValueFromElementById(htmlDocument, "span", "JDText-Param9"), new CultureInfo("en-GB"), DateTimeStyles.AssumeLocal, out closingDate))
        {
            job.ClosingDate = closingDate;
        }
        job.DatePublished = DateTime.UtcNow;

        // These formatters modify htmlDocument in place, so they must run before the body fields are read below
        var agilityPackFormatters = new IHtmlAgilityPackHtmlFormatter[]
        {
            new RemoveUnwantedAttributesFormatter(new string[] { "style" }),
            new ReplaceElementNameFormatter("h5", "h2"),
            new RemoveUnwantedNodesFormatter(new[] { "u" }),
            new RemoveElementByNameAndContentFormatter("h2", "Job Details"),
            new RemoveElementsWithNoContentFormatter(new[] { "strong", "p" }),
            new TruncateLongLinksFormatter(new HtmlLinkFormatter()),
            new EmbeddedYouTubeVideosFormatter()
        };
        foreach (var formatter in agilityPackFormatters)
        {
            formatter.FormatHtml(htmlDocument);
        }

        // JDText-Field4 is additional info (ie small print) and JDText-Field6 is information for redeployees (ie more small print)
        var additionalInfo = ParseValueFromElementById(htmlDocument, "span", "JDText-Field4") + Environment.NewLine + ParseValueFromElementById(htmlDocument, "span", "JDText-Field6");
        // The joined string always contains the newline, so test for whitespace rather than emptiness
        if (!String.IsNullOrWhiteSpace(additionalInfo))
        {
            additionalInfo = ApplyStringFormatters(additionalInfo);
            job.AdditionalInformationHtml = new HtmlString(additionalInfo);
        }

        var equalOpportunities = ParseValueFromElementById(htmlDocument, "span", "JDText-Field5");
        if (!String.IsNullOrEmpty(equalOpportunities))
        {
            equalOpportunities = ApplyStringFormatters(equalOpportunities);
            job.EqualOpportunitiesHtml = new HtmlString(equalOpportunities);
        }

        var parsedHtml = ParseValueFromElementById(htmlDocument, "div", "JD-Field1") + Environment.NewLine +
                         ParseValueFromElementById(htmlDocument, "div", "JD-Field2") + Environment.NewLine +
                         ParseValueFromElementById(htmlDocument, "div", "JD-Documents");
        parsedHtml = ApplyStringFormatters(parsedHtml);

        // Remove text from the advert body that repeats values already captured as separate fields
        if (job.ClosingDate.HasValue)
        {
            parsedHtml = new RemoveDuplicateTextFormatter("Closing date: " + job.ClosingDate.Value.ToBritishDate()).FormatHtml(parsedHtml);
            parsedHtml = new RemoveDuplicateTextFormatter("Closing date: " + job.ClosingDate.Value.ToBritishDateWithDay()).FormatHtml(parsedHtml);
        }
        parsedHtml = new RemoveDuplicateTextFormatter("Salary: " + job.Salary.SalaryRange).FormatHtml(parsedHtml);
        parsedHtml = new RemoveDuplicateTextFormatter("Salary: " + job.Salary.SalaryRange?.Replace(" to ", " - ")).FormatHtml(parsedHtml);
        parsedHtml = new RemoveDuplicateTextFormatter("Contract type: " + job.ContractType).FormatHtml(parsedHtml);
        job.AdvertHtml = new HtmlString(parsedHtml);

        job.WorkPattern = await _workPatternParser.ParseWorkPattern(parsedHtml);

        // A link without an href should leave ApplyUrl unset rather than throw and lose the whole job
        var applyLink = htmlDocument.DocumentNode.SelectSingleNode("//div[@id='JD-ActApplyDirect']/a");
        var applyHref = applyLink?.Attributes["href"]?.Value;
        if (!String.IsNullOrEmpty(applyHref))
        {
            job.ApplyUrl = new Uri(HttpUtility.HtmlDecode(applyHref), UriKind.RelativeOrAbsolute);
        }

        return job;
    }
    catch (Exception exception)
    {
        // Use the indexer rather than Add so an existing "Job ID" entry cannot throw from inside the handler
        exception.Data["Job ID"] = jobId;
        exception.ToExceptionless().Submit();
        return null;
    }
}