Ejemplo n.º 1
0
        /// <summary>
        /// Computes keyword, city and work-type counts over <paramref name="jobs"/> and stores them
        /// on <paramref name="jobAnalysisEntry"/>.
        /// </summary>
        /// <param name="jobAnalysis">Analysis definition; its <c>Title</c> is logged and its <c>Keywords</c> (may be null) drive the keyword statistics.</param>
        /// <param name="jobAnalysisEntry">Entry whose <c>KeywordStatistics</c>, <c>CityStatistics</c> and <c>WorkTypeStatistics</c> are (re)assigned.</param>
        /// <param name="jobs">Jobs to analyze; only the dictionary values are read.</param>
        public void AnalyzeJobs(JobAnalysis jobAnalysis, JobAnalysisEntry jobAnalysisEntry, Dictionary <string, Job> jobs)
        {
            _logger.Information($"Analyzing Jobs for {jobAnalysis.Title} Entry: {jobAnalysisEntry._key}");

            // KeywordStatistics is only replaced when keywords are configured; otherwise it is left untouched.
            if (jobAnalysis.Keywords != null)
            {
                // De-duplicate the configured keywords using the project's comparer.
                HashSet <string> words = new HashSet <string>(new StringComparer());

                foreach (string keyword in jobAnalysis.Keywords)
                {
                    // A null keyword cannot be a dictionary key and an empty one matches nothing meaningful.
                    if (string.IsNullOrEmpty(keyword))
                    {
                        continue;
                    }

                    words.Add(keyword);
                }

                jobAnalysisEntry.KeywordStatistics = new Dictionary <string, int>();

                foreach (string keyword in words)
                {
                    // The keyword is matched literally: without Regex.Escape a keyword such as "C++"
                    // is an invalid pattern and ".NET" would match any character followed by "NET".
                    // The pattern is built once per keyword rather than once per job.
                    Regex wholeWord = new Regex($@"(^|\W){Regex.Escape(keyword)}(\W|$)", RegexOptions.IgnoreCase);

                    // Jobs without a description cannot match and would make the regex call throw.
                    int matches = jobs.Values.Count(j => j.Description != null && wholeWord.IsMatch(j.Description));

                    jobAnalysisEntry.KeywordStatistics.Add(keyword, matches);
                }
            }

            // city
            jobAnalysisEntry.CityStatistics = CountOccurrences(jobs.Values.Select(j => j.City));

            // worktype
            jobAnalysisEntry.WorkTypeStatistics = CountOccurrences(jobs.Values.Select(j => j.WorkType));
        }

        // Counts how many times each non-null value occurs; null values are skipped because
        // they cannot be used as dictionary keys.
        private static Dictionary <string, int> CountOccurrences(IEnumerable <string> values)
        {
            Dictionary <string, int> counts = new Dictionary <string, int>();

            foreach (string value in values)
            {
                if (value == null)
                {
                    continue;
                }

                // TryGetValue leaves 'current' at 0 for unseen values, so a single lookup covers both cases.
                int current;
                counts.TryGetValue(value, out current);
                counts[value] = current + 1;
            }

            return counts;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Runs one analysis pass: obtains the job urls from <c>SearchJob</c>, downloads every job,
        /// analyzes the collected jobs and upserts the resulting entry together with its edges.
        /// </summary>
        /// <param name="jobAnalysis">Analysis definition passed to <c>SearchJob</c> and <c>AnalyzeJobs</c>.</param>
        /// <param name="client">Database handle used for the upserts and forwarded to <c>DownloadJob</c>.</param>
        /// <param name="timeStamp">Suffix appended to the analysis key to form the entry key.</param>
        public async Task Search(JobAnalysis jobAnalysis, IArangoDatabase client, string timeStamp)
        {
            var jobUrls       = SearchJob(jobAnalysis);
            var collectedJobs = new Dictionary <string, Job>();

            // The entry key is "<analysis key>__<timeStamp>".
            var entry = new JobAnalysisEntry();
            entry._key              = $"{jobAnalysis._key}__{timeStamp}";
            entry.AnalysisTime      = DateTime.Now;
            entry.KeywordStatistics = new Dictionary <string, int>();
            entry.TotalJobs         = jobUrls.Count;

            // Download the jobs one after another; 'position' is 1-based and only used for the progress log.
            int position = 0;
            foreach (var jobUrl in jobUrls)
            {
                position += 1;
                _logger.Information($"Download Job Url ({position} of {jobUrls.Count}): {jobUrl}");
                await DownloadJob(jobUrl, entry, client, collectedJobs);
            }

            AnalyzeJobs(jobAnalysis, entry, collectedJobs);

            // Persist the entry, then link it to its analysis.
            client.UpsertIgnoreNull(entry);
            client.UpsertEdge <EntryOf, JobAnalysisEntry, JobAnalysis>(entry, jobAnalysis);

            // Link the entry to every collected job.
            foreach (var pair in collectedJobs)
            {
                client.UpsertEdge <JobAnalysisOf, JobAnalysisEntry, Job>(entry, pair.Value);
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Makes sure the job behind <paramref name="url"/> is present in <paramref name="jobs"/>:
        /// first by looking it up in the database, and, when no stored job with a non-empty
        /// description exists, by loading the page, extracting the job fields and upserting the job.
        /// Failures are logged and retried up to <c>_searchOptions.MaxRetry</c> times; they are never rethrown.
        /// </summary>
        /// <param name="url">Job page url; the job key is capture group 1 of <c>jobLink</c> applied to it.</param>
        /// <param name="jobAnalysisEntry">Not used by this method.</param>
        /// <param name="client">Database handle used for the lookup and the upsert.</param>
        /// <param name="jobs">Receives the job, keyed by its <c>_key</c>; an existing entry with the same key is replaced.</param>
        public async Task DownloadJob(string url, JobAnalysisEntry jobAnalysisEntry, IArangoDatabase client, Dictionary <string, Job> jobs)
        {
            // NOTE(review): when jobLink does not match, Groups[1].Value is "" and the job is
            // looked up / stored under an empty key -- confirm whether such urls can occur.
            var match = jobLink.Match(url);
            var key   = match.Groups[1].Value;

            // if job has been saved in database, don't do it again.
            bool exists = false;

            {
                bool success = false;
                int  retry   = 0;
                while (!success && retry < _searchOptions.MaxRetry)
                {
                    try
                    {
                        retry++;
                        var jobsFound = await client.Query <Job>().Filter(j => j._key == key).ToListAsync();

                        var first = jobsFound.FirstOrDefault();
                        if (first != null && !string.IsNullOrEmpty(first.Description))
                        {
                            // Indexer assignment instead of Add: a key that is already in 'jobs' made Add
                            // throw, which was logged as a database failure, retried, and finally caused
                            // an unnecessary re-download of an already cached job.
                            jobs[first._key] = first;
                            exists = true;
                        }
                        success = true;
                    }
                    catch (Exception ex)
                    {
                        _logger.Error(ex, $"failed to access job({key}) from arango. {retry} of {_searchOptions.MaxRetry} attempts.");
                    }
                }
            }

            if (!exists)
            {
                bool success = false;
                int  retry   = 0;

                while (!success && retry < _searchOptions.MaxRetry)
                {
                    try
                    {
                        retry++;

                        HtmlNode jobDescription = null;
                        HtmlNode jobTitle       = null;
                        HtmlNode workType       = null;
                        HtmlNode publishDate    = null;
                        HtmlNode location       = null;
                        HtmlNode salary         = null;
                        HtmlNode city           = null;
                        int      chromeRetry    = 0;

                        // Inner retry: reload the page until the description element is found.
                        // A missing container (jobBox / infoHeader) surfaces as an exception that is
                        // logged below; nodes already extracted before the exception are kept.
                        do
                        {
                            try
                            {
                                chromeRetry++;

                                HtmlDocument document = WebExtensions.LoadPage(url);

                                // Tag names are compared ordinally (case-insensitive) so the result
                                // does not depend on the current culture.
                                HtmlNode jobBox = document.DocumentNode.DescendantsAndSelf()
                                                  .FirstOrDefault(n => string.Equals(n.Name, "div", StringComparison.OrdinalIgnoreCase) &&
                                                                  n.GetAttributeValue("data-automation", null) == "jobDescription");

                                jobDescription = jobBox.DescendantsAndSelf()
                                                 .FirstOrDefault(n => string.Equals(n.Name, "div", StringComparison.OrdinalIgnoreCase) && n.HasClass("templatetext"));

                                jobTitle = jobBox.DescendantsAndSelf()
                                           .FirstOrDefault(n => n.HasClass("jobtitle"));

                                HtmlNode infoHeader = document.DocumentNode.DescendantsAndSelf()
                                                      .FirstOrDefault(n => string.Equals(n.Name, "section", StringComparison.OrdinalIgnoreCase) &&
                                                                      n.GetAttributeValue("aria-labelledby", null) == "jobInfoHeader");

                                publishDate = infoHeader.DescendantsAndSelf()
                                              .FirstOrDefault(n => string.Equals(n.Name, "dd", StringComparison.OrdinalIgnoreCase) &&
                                                              n.GetAttributeValue("data-automation", null) == "job-detail-date");

                                workType = infoHeader.DescendantsAndSelf()
                                           .FirstOrDefault(n => string.Equals(n.Name, "dd", StringComparison.OrdinalIgnoreCase) &&
                                                           n.GetAttributeValue("data-automation", null) == "job-detail-work-type");

                                var dataTitles = infoHeader.DescendantsAndSelf()
                                                 .Where(n => string.Equals(n.Name, "dt", StringComparison.OrdinalIgnoreCase)).ToList();

                                var dataDetails = infoHeader.DescendantsAndSelf()
                                                  .Where(n => string.Equals(n.Name, "dd", StringComparison.OrdinalIgnoreCase)).ToList();

                                // Titles (dt) and details (dd) are paired by position.
                                int pairCount = Math.Min(dataTitles.Count, dataDetails.Count);
                                for (int i = 0; i < pairCount; i++)
                                {
                                    string innerText = dataTitles[i].InnerText;
                                    if (innerText.Contains("Location"))
                                    {
                                        location = dataDetails[i];
                                        city     = location.DescendantsAndSelf()
                                                   .FirstOrDefault(n => string.Equals(n.Name, "strong", StringComparison.OrdinalIgnoreCase));
                                    }
                                    else if (innerText.Contains("Salary"))
                                    {
                                        salary = dataDetails[i];
                                    }
                                }
                            }
                            catch (Exception ex)
                            {
                                _logger.Error(ex, $"failed to get jobDescription element from url {url}. {chromeRetry} of {_searchOptions.MaxRetry} attempts.");
                            }
                        } while (jobDescription == null && chromeRetry < _searchOptions.MaxRetry);

                        if (jobDescription == null)
                        {
                            // Previously this surfaced as a NullReferenceException on jobDescription.InnerText.
                            // Throwing explicitly keeps the same flow (logged by the catch below, then the
                            // outer loop retries) with a meaningful message.
                            throw new InvalidOperationException($"job description element not found at url {url}.");
                        }

                        Job job = new Job()
                        {
                            _key        = key,
                            Description = jobDescription.InnerText.HtmlDecode(),
                            TimeStamp   = DateTime.Now
                        };

                        // Every field other than the description is optional and left unset when its node was not found.
                        if (jobTitle != null)
                        {
                            job.Title = jobTitle.InnerText.HtmlDecode();
                        }

                        if (location != null)
                        {
                            job.Location = location.InnerText.HtmlDecode();
                            if (city != null)
                            {
                                job.City = city.InnerText.HtmlDecode();
                            }
                        }

                        if (salary != null)
                        {
                            job.RawSalary = salary.InnerText.HtmlDecode();
                        }

                        if (workType != null)
                        {
                            job.WorkType = workType.InnerText.HtmlDecode();
                        }

                        if (publishDate != null)
                        {
                            job.RawPublishDate = publishDate.InnerText.HtmlDecode();

                            // PublishDate is only set when the raw text parses as a date.
                            DateTime publish;
                            if (DateTime.TryParse(job.RawPublishDate, out publish))
                            {
                                job.PublishDate = publish;
                            }
                        }

                        // Add or replace in a single lookup.
                        jobs[job._key] = job;

                        // save to arango db
                        client.UpsertIgnoreNull(job);

                        success = true;
                    }
                    catch (Exception ex)
                    {
                        _logger.Error(ex, $"failed to get job details from url {url}. {retry} of {_searchOptions.MaxRetry} attempts.");
                    }
                }
            }
        }