/// <summary>
/// Collects the URLs of all job postings on seek.com.au matching the
/// analysis title, paging through results until no "next page" link exists.
/// </summary>
/// <param name="jobAnalysis">Analysis whose <c>Title</c> supplies the search keywords.</param>
/// <returns>Absolute URLs of every job found across all result pages.</returns>
public List<string> SearchJob(JobAnalysis jobAnalysis)
{
    _logger.Information($"Search Job: {jobAnalysis.Title}");

    List<string> urls = new List<string>();
    string keyWords = jobAnalysis.Title;
    HtmlDocument document = null;

    // True when the current page contains a "next page" anchor.
    // BUG FIX: guard against document being null (possible when every load
    // attempt for the first page failed), which previously could only
    // manifest as a NullReferenceException.
    Func<bool> hasNext = new Func<bool>(() =>
        document != null
        && document.DocumentNode.DescendantsAndSelf()
            .Any(n => n.Name.ToLower() == "a"
                      && n.GetAttributeValue("data-automation", null) == "page-next"));

    int index = 1;
    do
    {
        // get total number of jobs
        _logger.Information($" Search Job: {jobAnalysis.Title} - Page {index}");

        List<HtmlNode> urlNodes = null;
        // BUG FIX: the retry counter used to be declared (and reset to 0)
        // inside the retry loop, so it never advanced past 1 and the loop
        // could spin forever on a persistently failing page. It now lives
        // outside the loop and the configured MaxRetry limit is enforced.
        int chromeRetry = 0;
        var url = $"https://www.seek.com.au/{keyWords.Replace(" ", "-")}-jobs?page={index}";
        do
        {
            try
            {
                chromeRetry++;
                document = WebExtensions.LoadPage(url);
                urlNodes = document.DocumentNode.DescendantsAndSelf()
                    .Where(n => n.Name.ToLower() == "a"
                                && n.GetAttributeValue("data-automation", "") == "jobTitle")
                    .ToList();
            }
            catch (Exception ex)
            {
                _logger.Error(ex, $"failed to get job list elements from url {url}. {chromeRetry} of {_searchOptions.MaxRetry} attempts.");
                if (chromeRetry >= _searchOptions.MaxRetry)
                {
                    // Give up on this page rather than retrying forever; an
                    // empty node list lets the surrounding loops terminate.
                    urlNodes = new List<HtmlNode>();
                }
            }
        } while (urlNodes == null);

        foreach (var urlNode in urlNodes)
        {
            var href = urlNode.GetAttributeValue("href", null);
            if (href != null)
            {
                urls.Add($@"https://www.seek.com.au{href}");
            }
        }

        index++;
    } while (hasNext());

    return (urls);
}
/// <summary>
/// Populates keyword, city, and work-type frequency statistics on the
/// analysis entry from the set of downloaded jobs.
/// </summary>
/// <param name="jobAnalysis">Source analysis supplying the keyword list.</param>
/// <param name="jobAnalysisEntry">Entry whose statistics dictionaries are (re)built.</param>
/// <param name="jobs">Downloaded jobs, keyed by job id.</param>
public void AnalyzeJobs(JobAnalysis jobAnalysis, JobAnalysisEntry jobAnalysisEntry, Dictionary<string, Job> jobs)
{
    _logger.Information($"Analyzing Jobs for {jobAnalysis.Title} Entry: {jobAnalysisEntry._key}");

    if (jobAnalysis.Keywords != null)
    {
        // De-duplicate keywords with the project's comparer (presumably
        // case-insensitive -- confirm against the StringComparer implementation).
        HashSet<string> words = new HashSet<string>(new StringComparer());
        foreach (string keyword in jobAnalysis.Keywords)
        {
            words.Add(keyword);
        }

        jobAnalysisEntry.KeywordStatistics = new Dictionary<string, int>();
        foreach (string keyword in words)
        {
            // BUG FIX: removed the redundant words.Add(keyword) that mutated
            // the set while it was being enumerated.
            // BUG FIX: Regex.Escape prevents keywords containing regex
            // metacharacters (e.g. "C++", "C#") from throwing
            // ArgumentException or matching incorrectly.
            jobAnalysisEntry.KeywordStatistics.Add(keyword, jobs.Values.Count(j =>
                Regex.IsMatch(j.Description,
                    $@"(^|\W){Regex.Escape(keyword)}(\W|$)",
                    RegexOptions.IgnoreCase)));
        }
    }

    // city
    jobAnalysisEntry.CityStatistics = new Dictionary<string, int>();
    foreach (var job in jobs.Values)
    {
        // Single lookup instead of ContainsKey + indexer (double lookup).
        jobAnalysisEntry.CityStatistics.TryGetValue(job.City, out int cityCount);
        jobAnalysisEntry.CityStatistics[job.City] = cityCount + 1;
    }

    // worktype
    jobAnalysisEntry.WorkTypeStatistics = new Dictionary<string, int>();
    foreach (var job in jobs.Values)
    {
        jobAnalysisEntry.WorkTypeStatistics.TryGetValue(job.WorkType, out int workTypeCount);
        jobAnalysisEntry.WorkTypeStatistics[job.WorkType] = workTypeCount + 1;
    }
}
/// <summary>
/// Runs one complete search cycle: finds all matching job URLs, downloads
/// each posting, computes statistics, and persists the entry together with
/// its graph edges (entry -> analysis, entry -> each job).
/// </summary>
/// <param name="jobAnalysis">Analysis definition being searched.</param>
/// <param name="client">ArangoDB client used for all upserts.</param>
/// <param name="timeStamp">Stamp combined with the analysis key to form the entry key.</param>
public async Task Search(JobAnalysis jobAnalysis, IArangoDatabase client, string timeStamp)
{
    var urls = SearchJob(jobAnalysis);
    var jobs = new Dictionary<string, Job>();

    var jobAnalysisEntry = new JobAnalysisEntry()
    {
        _key = $"{jobAnalysis._key}__{timeStamp}",
        AnalysisTime = DateTime.Now,
        KeywordStatistics = new Dictionary<string, int>(),
        TotalJobs = urls.Count,
    };

    // Download every posting; DownloadJob also records the entry edge.
    var urlIndex = 0;
    foreach (var url in urls)
    {
        urlIndex++;
        _logger.Information($"Download Job Url ({urlIndex} of {urls.Count}): {url}");
        await DownloadJob(url, jobAnalysisEntry, client, jobs);
    }

    AnalyzeJobs(jobAnalysis, jobAnalysisEntry, jobs);

    // Persist the entry, then wire it into the graph.
    client.UpsertIgnoreNull(jobAnalysisEntry);
    client.UpsertEdge<EntryOf, JobAnalysisEntry, JobAnalysis>(jobAnalysisEntry, jobAnalysis);

    // add edges to jobs;
    foreach (var job in jobs.Values)
    {
        client.UpsertEdge<JobAnalysisOf, JobAnalysisEntry, Job>(jobAnalysisEntry, job);
    }
}