Ejemplo n.º 1
0
        /// <summary>
        /// Crawls pages starting from the scan job's URL, accumulating word and word-pair
        /// counts from each page and persisting progress after every page.
        /// </summary>
        /// <remarks>
        /// The crawl starts at depth 1 with the job's URL. Links returned by <c>GetText</c> are
        /// queued at the parent's depth + 1 when they contain "http" and the seed URL's host
        /// (case-insensitive substring checks) and have not been queued before. Only entries
        /// with a depth of at most 4 are fetched. A page that fails is logged and skipped; it is
        /// never retried, so a permanently failing URL cannot stall the crawl.
        /// </remarks>
        /// <param name="scanJob">Scan job details. Its <c>Url</c> is the crawl seed; its
        /// <c>JobStatus</c> and <c>Message</c> are updated by this method.</param>
        /// <returns>A task that completes once the crawl is finished and the final job state
        /// has been handed to the storage manager.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="scanJob"/> is <c>null</c>.</exception>
        public async Task Extract(ScanJob scanJob)
        {
            if (scanJob == null)
            {
                throw new ArgumentNullException(nameof(scanJob));
            }

            // Deepest level (seed = 1) whose pages are still fetched.
            const int MaxDepth = 4;

            Uri uri       = new Uri(scanJob.Url);
            string host   = uri.Host;
            var words     = new Dictionary<string, int>();
            var wordPairs = new Dictionary<string, int>();

            // Keep the URL's original casing for fetching (paths can be case-sensitive);
            // case-insensitivity is handled by the de-duplication set instead.
            string seedUrl = scanJob.Url.Trim();
            var urls = new List<ExtractionStatus>
            {
                new ExtractionStatus { Url = seedUrl, Depth = 1, Picked = false }
            };

            // Every URL ever queued, so each link is added at most once regardless of casing.
            var knownUrls = new HashSet<string>(StringComparer.OrdinalIgnoreCase) { seedUrl };

            scanJob.JobStatus = Common.ScanJobStatus.Inprogress;

            ExtractionStatus url;
            while ((url = urls.FirstOrDefault(c => c.Depth <= MaxDepth && !c.Picked)) != null)
            {
                // Mark the entry before doing any work: if fetching or parsing throws, the
                // loop must still advance instead of selecting the same URL again forever.
                url.Picked = true;

                try
                {
                    // NOTE(review): the second argument is presumably a "collect links" flag;
                    // confirm against GetText.
                    var text = await GetText(url.Url, url.Depth <= MaxDepth);

                    await GetWords(text.Item1, words, wordPairs);

                    foreach (var link in text.Item2)
                    {
                        if (string.IsNullOrWhiteSpace(link))
                        {
                            continue;
                        }

                        var candidate = link.Trim();
                        bool isHttp   = candidate.IndexOf("http", StringComparison.OrdinalIgnoreCase) >= 0;
                        bool sameHost = candidate.IndexOf(host, StringComparison.OrdinalIgnoreCase) >= 0;

                        // HashSet.Add returns false for an already-known URL, so this both
                        // tests and records membership in one step.
                        if (isHttp && sameHost && knownUrls.Add(candidate))
                        {
                            urls.Add(new ExtractionStatus
                            {
                                Url = candidate, Depth = url.Depth + 1, Picked = false
                            });
                        }
                    }

                    // Persist the running totals and a progress message after each page.
                    var jobs = _mapper.Map<ScanJobEntity>(scanJob);
                    UpdateCount(jobs, words, wordPairs);
                    jobs.Message = $"{urls.Count(c => c.Picked)} url and dependent urls processed, {urls.Count(c => !c.Picked)} yet to process. ";
                    _storageManager.UpdateScanJobs(jobs);
                }
                catch (Exception ex)
                {
                    // Best-effort crawl: one bad page is logged and skipped, not fatal.
                    _logger.LogError(ex, "Some error in extracting content got url {Url}", url.Url);
                }
            }

            scanJob.JobStatus = Common.ScanJobStatus.Completed;
            scanJob.Message   = "Extraction done";
            _storageManager.UpdateScanJobs(_mapper.Map<ScanJobEntity>(scanJob));
        }
Ejemplo n.º 2
0
 /// <summary>
 /// Entry point the background worker calls to process a scan job; the work itself is
 /// delegated to the data extractor.
 /// </summary>
 /// <param name="scanJob">The scan job to process.</param>
 /// <returns>A task that completes when the extractor has finished with the job.</returns>
 async Task Process(ScanJob scanJob) => await _dataExtractor.Extract(scanJob);