/// <summary>
/// Collects the URLs of the job ads that seek.com.au lists for the title of the given analysis.
/// Result pages are fetched one after another until a page no longer contains a
/// "page-next" link, or until a page cannot be loaded within the configured number of retries.
/// </summary>
/// <param name="jobAnalysis">Analysis whose <c>Title</c> is used as the search keywords.</param>
/// <returns>
/// Absolute job URLs found on the visited result pages. If a page keeps failing to load,
/// the URLs collected from the earlier pages are returned.
/// </returns>
public List<string> SearchJob(JobAnalysis jobAnalysis)
{
    _logger.Information($"Search Job: {jobAnalysis.Title}");

    List<string> urls = new List<string>();
    string keyWords = jobAnalysis.Title;
    int index = 1;
    bool hasNext = false;

    do
    {
        _logger.Information($" Search Job: {jobAnalysis.Title} - Page {index}");

        var url = $"https://www.seek.com.au/{keyWords.Replace(" ", "-")}-jobs?page={index}";
        HtmlDocument document = null;
        List<HtmlNode> urlNodes = null;
        Exception lastError = null;

        // The attempt counter lives outside the retry loop so that it actually counts,
        // and the loop is bounded so a permanently failing page cannot spin forever.
        int chromeRetry = 0;
        do
        {
            chromeRetry++;
            try
            {
                document = WebExtensions.LoadPage(url);
                urlNodes = document.DocumentNode.DescendantsAndSelf()
                    .Where(n => string.Equals(n.Name, "a", StringComparison.OrdinalIgnoreCase)
                                && n.GetAttributeValue("data-automation", "") == "jobTitle")
                    .ToList();
            }
            catch (Exception ex)
            {
                lastError = ex;
                _logger.Error(ex, $"failed to get job list elements from url {url}. {chromeRetry} of {_searchOptions.MaxRetry} attempts.");
            }
        } while (urlNodes == null && chromeRetry < _searchOptions.MaxRetry);

        if (urlNodes == null)
        {
            // urlNodes can only still be null if every attempt threw, so lastError is set here.
            _logger.Error(lastError, $"giving up on job list url {url} after {chromeRetry} attempts. Returning the {urls.Count} urls collected so far.");
            break;
        }

        foreach (var urlNode in urlNodes)
        {
            var href = urlNode.GetAttributeValue("href", null);
            if (href != null)
            {
                urls.Add($"https://www.seek.com.au{href}");
            }
        }

        // Another result page exists when the current page has a "page-next" anchor.
        hasNext = document.DocumentNode.DescendantsAndSelf()
            .Any(n => string.Equals(n.Name, "a", StringComparison.OrdinalIgnoreCase)
                      && n.GetAttributeValue("data-automation", null) == "page-next");

        index++;
    } while (hasNext);

    return urls;
}
/// <summary>
/// Makes sure the job behind <paramref name="url"/> is present in <paramref name="jobs"/>.
/// The job key is the first capture group of <c>jobLink</c> matched against the url.
/// If the database already holds that job with a non-empty description, the stored job is used;
/// otherwise the job page is loaded and parsed, and the resulting job is put into
/// <paramref name="jobs"/> and upserted through <paramref name="client"/>.
/// Failures are logged and retried up to <c>_searchOptions.MaxRetry</c> times; they are not rethrown.
/// </summary>
/// <param name="url">Url of the job page.</param>
/// <param name="jobAnalysisEntry">Not used by this method.</param>
/// <param name="client">Database used to look up and to store the job.</param>
/// <param name="jobs">Dictionary of jobs by key; the found or downloaded job is stored here.</param>
public async Task DownloadJob(string url, JobAnalysisEntry jobAnalysisEntry, IArangoDatabase client, Dictionary<string, Job> jobs)
{
    var match = jobLink.Match(url);
    // NOTE(review): when the url does not match, the key is an empty string and the method
    // still proceeds with it — confirm whether such urls can reach this method.
    var key = match.Groups[1].Value;

    // If the job has been saved in the database, don't download it again.
    bool exists = false;
    {
        bool success = false;
        int retry = 0;
        while (!success && retry < _searchOptions.MaxRetry)
        {
            try
            {
                retry++;
                var jobsFound = await client.Query<Job>().Filter(j => j._key == key).ToListAsync();
                var first = jobsFound.FirstOrDefault();
                if (first != null && !string.IsNullOrEmpty(first.Description))
                {
                    // Indexer instead of Add: Add throws when the key is already present,
                    // which used to be logged as a database failure and forced a re-download.
                    jobs[first._key] = first;
                    exists = true;
                }
                success = true;
            }
            catch (Exception ex)
            {
                _logger.Error(ex, $"failed to access job({key}) from arango. {retry} of {_searchOptions.MaxRetry} attempts.");
            }
        }
    }

    if (exists)
    {
        return;
    }

    {
        bool success = false;
        int retry = 0;
        while (!success && retry < _searchOptions.MaxRetry)
        {
            try
            {
                retry++;
                HtmlNode jobDescription = null;
                HtmlNode jobTitle = null;
                HtmlNode workType = null;
                HtmlNode publishDate = null;
                HtmlNode location = null;
                HtmlNode salary = null;
                HtmlNode city = null;

                int chromeRetry = 0;
                do
                {
                    try
                    {
                        chromeRetry++;
                        HtmlDocument document = WebExtensions.LoadPage(url);

                        HtmlNode jobBox = document.DocumentNode.DescendantsAndSelf()
                            .FirstOrDefault(n => string.Equals(n.Name, "div", StringComparison.OrdinalIgnoreCase)
                                                 && n.GetAttributeValue("data-automation", null) == "jobDescription");
                        if (jobBox == null)
                        {
                            // Reported through the catch below so the attempt is logged and retried.
                            throw new InvalidOperationException($"no jobDescription container found in page {url}.");
                        }

                        jobDescription = jobBox.DescendantsAndSelf()
                            .FirstOrDefault(n => string.Equals(n.Name, "div", StringComparison.OrdinalIgnoreCase)
                                                 && n.HasClass("templatetext"));
                        jobTitle = jobBox.DescendantsAndSelf()
                            .FirstOrDefault(n => n.HasClass("jobtitle"));

                        HtmlNode infoHeader = document.DocumentNode.DescendantsAndSelf()
                            .FirstOrDefault(n => string.Equals(n.Name, "section", StringComparison.OrdinalIgnoreCase)
                                                 && n.GetAttributeValue("aria-labelledby", null) == "jobInfoHeader");

                        // The info header only feeds optional fields, so a page without it is
                        // still usable instead of failing with a NullReferenceException.
                        if (infoHeader != null)
                        {
                            publishDate = infoHeader.DescendantsAndSelf()
                                .FirstOrDefault(n => string.Equals(n.Name, "dd", StringComparison.OrdinalIgnoreCase)
                                                     && n.GetAttributeValue("data-automation", null) == "job-detail-date");
                            workType = infoHeader.DescendantsAndSelf()
                                .FirstOrDefault(n => string.Equals(n.Name, "dd", StringComparison.OrdinalIgnoreCase)
                                                     && n.GetAttributeValue("data-automation", null) == "job-detail-work-type");

                            var dataTitles = infoHeader.DescendantsAndSelf()
                                .Where(n => string.Equals(n.Name, "dt", StringComparison.OrdinalIgnoreCase))
                                .ToList();
                            var dataDetails = infoHeader.DescendantsAndSelf()
                                .Where(n => string.Equals(n.Name, "dd", StringComparison.OrdinalIgnoreCase))
                                .ToList();

                            // Titles (dt) and details (dd) are paired by position.
                            int pairCount = Math.Min(dataTitles.Count, dataDetails.Count);
                            for (int i = 0; i < pairCount; i++)
                            {
                                string innerText = dataTitles[i].InnerText;
                                if (innerText.Contains("Location"))
                                {
                                    location = dataDetails[i];
                                    city = location.DescendantsAndSelf()
                                        .FirstOrDefault(n => string.Equals(n.Name, "strong", StringComparison.OrdinalIgnoreCase));
                                }
                                else if (innerText.Contains("Salary"))
                                {
                                    salary = dataDetails[i];
                                }
                            }
                        }
                    }
                    catch (Exception ex)
                    {
                        _logger.Error(ex, $"failed to get jobDescription element from url {url}. {chromeRetry} of {_searchOptions.MaxRetry} attempts.");
                    }
                } while (jobDescription == null && chromeRetry < _searchOptions.MaxRetry);

                if (jobDescription == null)
                {
                    // Same retry path as before (outer catch), but with a meaningful error
                    // instead of a NullReferenceException from dereferencing jobDescription.
                    throw new InvalidOperationException($"no job description found in page {url} after {chromeRetry} attempts.");
                }

                Job job = new Job()
                {
                    _key = key,
                    Description = jobDescription.InnerText.HtmlDecode(),
                    TimeStamp = DateTime.Now
                };

                if (jobTitle != null)
                {
                    job.Title = jobTitle.InnerText.HtmlDecode();
                }

                if (location != null)
                {
                    job.Location = location.InnerText.HtmlDecode();
                    if (city != null)
                    {
                        job.City = city.InnerText.HtmlDecode();
                    }
                }

                if (salary != null)
                {
                    job.RawSalary = salary.InnerText.HtmlDecode();
                }

                if (workType != null)
                {
                    job.WorkType = workType.InnerText.HtmlDecode();
                }

                if (publishDate != null)
                {
                    job.RawPublishDate = publishDate.InnerText.HtmlDecode();
                    DateTime publish;
                    if (DateTime.TryParse(job.RawPublishDate, out publish))
                    {
                        job.PublishDate = publish;
                    }
                }

                // Add or replace in a single lookup.
                jobs[job._key] = job;

                // Save to arango db.
                // NOTE(review): if UpsertIgnoreNull returns a Task it should be awaited — confirm.
                client.UpsertIgnoreNull(job);
                success = true;
            }
            catch (Exception ex)
            {
                _logger.Error(ex, $"failed to get job details from url {url}. {retry} of {_searchOptions.MaxRetry} attempts.");
            }
        }
    }
}