/// <summary>
/// Extract data from webpage. Starts at the scan job's URL, follows links whose text contains
/// "http" and the start URL's host name, and visits pages up to four levels deep. Word and
/// word-pair counts are accumulated across all visited pages and progress is persisted after
/// every successfully processed page.
/// </summary>
/// <param name="scanJob">Scan job details</param>
/// <returns>Task</returns>
/// <exception cref="ArgumentNullException"><paramref name="scanJob"/> is null.</exception>
public async Task Extract(ScanJob scanJob)
{
    if (scanJob == null)
    {
        throw new ArgumentNullException(nameof(scanJob));
    }

    // Deepest level that is still fetched. Links found on pages at this level are recorded
    // (they show up in the "yet to process" count) but are never visited.
    const int MaxDepth = 4;

    Uri uri = new Uri(scanJob.Url);
    string host = uri.Host;

    var words = new Dictionary<string, int>();
    var wordPairs = new Dictionary<string, int>();

    string startUrl = scanJob.Url.ToLower().Trim();
    var urls = new List<ExtractionStatus>()
    {
        new ExtractionStatus() { Url = startUrl, Depth = 1, Picked = false }
    };

    // Case-insensitive set of every URL already queued. The stored URL keeps its original
    // casing, so the duplicate check must ignore case rather than compare against a
    // lowercased copy (which never matched mixed-case entries and re-queued them forever).
    var knownUrls = new HashSet<string>(StringComparer.OrdinalIgnoreCase) { startUrl };

    scanJob.JobStatus = Common.ScanJobStatus.Inprogress;

    // Select with the same predicate that decides whether work remains, so a URL beyond
    // MaxDepth can never be picked.
    ExtractionStatus url;
    while ((url = urls.FirstOrDefault(c => c.Depth <= MaxDepth && !c.Picked)) != null)
    {
        // Mark the URL as handled before doing any work: if fetching or parsing throws,
        // the URL must not be selected again, otherwise the loop retries it endlessly.
        url.Picked = true;

        try
        {
            // NOTE(review): the second argument is always true for a URL selected above;
            // the expression is kept as in the original call — confirm what GetText uses it for.
            var text = await GetText(url.Url, url.Depth <= MaxDepth);
            await GetWords(text.Item1, words, wordPairs);

            foreach (var u in text.Item2)
            {
                var candidate = u.Trim();
                bool looksLikeHttp = candidate.IndexOf("http", StringComparison.OrdinalIgnoreCase) >= 0;
                bool mentionsHost = candidate.IndexOf(host, StringComparison.OrdinalIgnoreCase) >= 0;

                // HashSet.Add returns false for an already-known URL, so it doubles as the
                // duplicate check; it is evaluated last so filtered-out links are not remembered.
                if (looksLikeHttp && mentionsHost && knownUrls.Add(candidate))
                {
                    urls.Add(new ExtractionStatus() { Url = candidate, Depth = url.Depth + 1, Picked = false });
                }
            }

            // Persist running totals and a progress message after each page.
            var jobs = _mapper.Map<ScanJobEntity>(scanJob);
            UpdateCount(jobs, words, wordPairs);
            jobs.Message = $"{urls.Count(c => c.Picked)} url and dependent urls processed, {urls.Count(c => !c.Picked)} yet to process. ";
            _storageManager.UpdateScanJobs(jobs);
        }
        catch (Exception ex)
        {
            // Best effort: a failing page is logged and skipped so the rest of the crawl continues.
            // The URL is passed as a template argument rather than interpolated, so braces in
            // a URL cannot be misread as message-template placeholders.
            _logger.LogError(ex, "Some error in extracting content got url {Url}", url.Url);
        }
    }

    scanJob.JobStatus = Common.ScanJobStatus.Completed;
    scanJob.Message = "Extraction done";
    _storageManager.UpdateScanJobs(_mapper.Map<ScanJobEntity>(scanJob));
}
/// <summary>
/// Entry point invoked by the background worker to process a scan job.
/// Hands the job to the data extractor and completes when extraction completes.
/// </summary>
/// <param name="scanJob">Scan job as input</param>
/// <returns>task</returns>
async Task Process(ScanJob scanJob) => await _dataExtractor.Extract(scanJob);