Esempio n. 1
0
        /// <summary>
        /// Collects the absolute seek.com.au URLs of the jobs listed for the title of
        /// <paramref name="jobAnalysis"/>, walking the result pages until a page no longer
        /// contains a "page-next" link.
        /// </summary>
        /// <param name="jobAnalysis">Analysis whose <c>Title</c> is used as the search keywords.</param>
        /// <returns>
        /// The job URLs found. If a result page cannot be loaded and parsed within
        /// <c>_searchOptions.MaxRetry</c> attempts, paging stops and the URLs gathered so far are returned.
        /// </returns>
        public List <string> SearchJob(JobAnalysis jobAnalysis)
        {
            _logger.Information($"Search Job: {jobAnalysis.Title}");

            List <string> urls = new List <string>();

            // Spaces become dashes (the site's slug format); everything else is escaped so that
            // characters such as '#', '?' or '&' in a title (e.g. "C# developer") stay inside the
            // path instead of cutting the URL short and dropping the page parameter.
            string slug = Uri.EscapeDataString(jobAnalysis.Title.Replace(" ", "-"));

            int  index   = 1;
            bool hasNext = false;

            do
            {
                _logger.Information($"  Search Job: {jobAnalysis.Title} - Page {index}");

                var url = $"https://www.seek.com.au/{slug}-jobs?page={index}";

                HtmlDocument    document    = null;
                List <HtmlNode> urlNodes    = null;
                Exception       lastError   = null;
                int             chromeRetry = 0; // declared outside the retry loop so it actually counts attempts

                // Load the result page, retrying a bounded number of times (always at least once).
                do
                {
                    chromeRetry++;
                    try
                    {
                        document = WebExtensions.LoadPage(url);
                        urlNodes = document.DocumentNode.DescendantsAndSelf()
                                   .Where(n =>
                                          string.Equals(n.Name, "a", StringComparison.OrdinalIgnoreCase) &&
                                          n.GetAttributeValue("data-automation", "") == "jobTitle")
                                   .ToList();
                    }
                    catch (Exception ex)
                    {
                        lastError = ex;
                        _logger.Error(ex, $"failed to get job list elements from url {url}. {chromeRetry} of {_searchOptions.MaxRetry} attempts.");
                    }
                } while (urlNodes == null && chromeRetry < _searchOptions.MaxRetry);

                if (urlNodes == null)
                {
                    // Every attempt failed: stop paging rather than retrying the same page forever.
                    _logger.Error(lastError, $"giving up on url {url} after {chromeRetry} attempts. returning the {urls.Count} urls found so far.");
                    break;
                }

                foreach (var urlNode in urlNodes)
                {
                    var href = urlNode.GetAttributeValue("href", null);
                    if (href != null)
                    {
                        urls.Add($@"https://www.seek.com.au{href}");
                    }
                }

                // urlNodes is only set after document was loaded in the same attempt, so document is non-null here.
                hasNext = document.DocumentNode.DescendantsAndSelf()
                          .Any(n =>
                               string.Equals(n.Name, "a", StringComparison.OrdinalIgnoreCase) &&
                               n.GetAttributeValue("data-automation", null) == "page-next");

                index++;
            } while (hasNext);

            return urls;
        }
Esempio n. 2
0
        /// <summary>
        /// Makes sure the job behind <paramref name="url"/> is present in <paramref name="jobs"/>.
        /// The job is first looked up in the database by the key extracted from the URL; if no stored
        /// job with a non-empty description exists, the page is loaded and parsed, the resulting job is
        /// put into <paramref name="jobs"/> and upserted through <paramref name="client"/>.
        /// </summary>
        /// <param name="url">Job page URL; the job key is the first capture group of <c>jobLink</c>.</param>
        /// <param name="jobAnalysisEntry">Not used by this method.</param>
        /// <param name="client">Database used for the lookup and the upsert.</param>
        /// <param name="jobs">Cache keyed by job key; an existing entry for the same key is overwritten.</param>
        /// <returns>A task that completes when the lookup and, if needed, the download have finished.</returns>
        /// <remarks>
        /// Failures are logged and retried up to <c>_searchOptions.MaxRetry</c> times; they are never
        /// rethrown, so the method can complete without having added anything to <paramref name="jobs"/>.
        /// </remarks>
        public async Task DownloadJob(string url, JobAnalysisEntry jobAnalysisEntry, IArangoDatabase client, Dictionary <string, Job> jobs)
        {
            // NOTE(review): when jobLink does not match, key is the empty string and the method still
            // proceeds with it - confirm whether such urls should be skipped instead.
            var match = jobLink.Match(url);
            var key   = match.Groups[1].Value;

            // if job has been saved in database, don't do it again.
            bool exists = false;

            bool lookupDone    = false;
            int  lookupAttempt = 0;
            while (!lookupDone && lookupAttempt < _searchOptions.MaxRetry)
            {
                try
                {
                    lookupAttempt++;
                    var jobsFound = await client.Query <Job>().Filter(j => j._key == key).ToListAsync();

                    var first = jobsFound.FirstOrDefault();
                    if (first != null && !string.IsNullOrEmpty(first.Description))
                    {
                        // Indexer instead of Add: Add throws when the key is already cached, which
                        // used to be swallowed below and reported as a database failure.
                        jobs[first._key] = first;
                        exists = true;
                    }
                    lookupDone = true;
                }
                catch (Exception ex)
                {
                    _logger.Error(ex, $"failed to access job({key}) from arango. {lookupAttempt} of {_searchOptions.MaxRetry} attempts.");
                }
            }

            if (exists)
            {
                return;
            }

            bool success = false;
            int  retry   = 0;

            while (!success && retry < _searchOptions.MaxRetry)
            {
                try
                {
                    retry++;

                    HtmlNode jobDescription = null;
                    HtmlNode jobTitle       = null;
                    HtmlNode workType       = null;
                    HtmlNode publishDate    = null;
                    HtmlNode location       = null;
                    HtmlNode salary         = null;
                    HtmlNode city           = null;
                    int      chromeRetry    = 0;

                    // Load and parse the page until a description element is found or the attempts run out.
                    do
                    {
                        try
                        {
                            chromeRetry++;

                            HtmlDocument document = WebExtensions.LoadPage(url);

                            HtmlNode jobBox = document.DocumentNode.DescendantsAndSelf()
                                              .FirstOrDefault(n =>
                                                              string.Equals(n.Name, "div", StringComparison.OrdinalIgnoreCase) &&
                                                              n.GetAttributeValue("data-automation", null) == "jobDescription");
                            if (jobBox == null)
                            {
                                // Same outcome as before (logged below, page reloaded), but with a
                                // message that names the problem instead of a NullReferenceException.
                                throw new InvalidOperationException("the page has no div with data-automation=\"jobDescription\".");
                            }

                            jobDescription = jobBox.DescendantsAndSelf()
                                             .FirstOrDefault(n =>
                                                             string.Equals(n.Name, "div", StringComparison.OrdinalIgnoreCase) &&
                                                             n.HasClass("templatetext"));

                            jobTitle = jobBox.DescendantsAndSelf()
                                       .FirstOrDefault(n => n.HasClass("jobtitle"));

                            HtmlNode infoHeader = document.DocumentNode.DescendantsAndSelf()
                                                  .FirstOrDefault(n =>
                                                                  string.Equals(n.Name, "section", StringComparison.OrdinalIgnoreCase) &&
                                                                  n.GetAttributeValue("aria-labelledby", null) == "jobInfoHeader");

                            // The info header only feeds optional fields, so its absence is not an error.
                            if (infoHeader != null)
                            {
                                publishDate = infoHeader.DescendantsAndSelf()
                                              .FirstOrDefault(n =>
                                                              string.Equals(n.Name, "dd", StringComparison.OrdinalIgnoreCase) &&
                                                              n.GetAttributeValue("data-automation", null) == "job-detail-date");

                                workType = infoHeader.DescendantsAndSelf()
                                           .FirstOrDefault(n =>
                                                           string.Equals(n.Name, "dd", StringComparison.OrdinalIgnoreCase) &&
                                                           n.GetAttributeValue("data-automation", null) == "job-detail-work-type");

                                var dataTitles = infoHeader.DescendantsAndSelf()
                                                 .Where(n => string.Equals(n.Name, "dt", StringComparison.OrdinalIgnoreCase)).ToList();

                                var dataDetails = infoHeader.DescendantsAndSelf()
                                                  .Where(n => string.Equals(n.Name, "dd", StringComparison.OrdinalIgnoreCase)).ToList();

                                // dt/dd elements are paired by position; a surplus on either side is ignored.
                                int pairCount = Math.Min(dataTitles.Count, dataDetails.Count);
                                for (int i = 0; i < pairCount; i++)
                                {
                                    string innerText = dataTitles[i].InnerText;
                                    if (innerText.Contains("Location"))
                                    {
                                        location = dataDetails[i];
                                        city     = location.DescendantsAndSelf()
                                                   .FirstOrDefault(n => string.Equals(n.Name, "strong", StringComparison.OrdinalIgnoreCase));
                                    }
                                    else if (innerText.Contains("Salary"))
                                    {
                                        salary = dataDetails[i];
                                    }
                                }
                            }
                        }
                        catch (Exception ex)
                        {
                            _logger.Error(ex, $"failed to get jobDescription element from url {url}. {chromeRetry} of {_searchOptions.MaxRetry} attempts.");
                        }
                    } while (jobDescription == null && chromeRetry < _searchOptions.MaxRetry);

                    if (jobDescription == null)
                    {
                        // Handled by the catch below, which logs it and moves on to the next outer attempt.
                        throw new InvalidOperationException($"no job description element found after {chromeRetry} page loads.");
                    }

                    Job job = new Job()
                    {
                        _key        = key,
                        Description = jobDescription.InnerText.HtmlDecode(),
                        TimeStamp   = DateTime.Now
                    };

                    if (jobTitle != null)
                    {
                        job.Title = jobTitle.InnerText.HtmlDecode();
                    }

                    if (location != null)
                    {
                        job.Location = location.InnerText.HtmlDecode();
                        if (city != null)
                        {
                            job.City = city.InnerText.HtmlDecode();
                        }
                    }

                    if (salary != null)
                    {
                        job.RawSalary = salary.InnerText.HtmlDecode();
                    }

                    if (workType != null)
                    {
                        job.WorkType = workType.InnerText.HtmlDecode();
                    }

                    if (publishDate != null)
                    {
                        job.RawPublishDate = publishDate.InnerText.HtmlDecode();
                        DateTime publish;
                        // PublishDate is only set when the raw text parses; the raw text is kept either way.
                        if (DateTime.TryParse(job.RawPublishDate, out publish))
                        {
                            job.PublishDate = publish;
                        }
                    }

                    // Add or overwrite in a single lookup.
                    jobs[job._key] = job;

                    // save to arango db
                    // NOTE(review): the result of UpsertIgnoreNull is neither awaited nor inspected -
                    // confirm it is a synchronous call.
                    client.UpsertIgnoreNull(job);

                    success = true;
                }
                catch (Exception ex)
                {
                    _logger.Error(ex, $"failed to get job details from url {url}. {retry} of {_searchOptions.MaxRetry} attempts.");
                }
            }
        }