/// <summary>
/// Hands a scraping result to the persister registered for its result object type
/// and waits for the persist operation to finish.
/// </summary>
/// <param name="scrapingResult">
/// Result to persist. May be null: some scrapers only start other scrapers and return nothing.
/// </param>
/// <returns>A task that completes once the persister has finished (or failed and been logged).</returns>
/// <remarks>
/// Failures raised by the persister are logged and not rethrown.
/// </remarks>
private async Task SaveScrapingResult(ScrapingResult scrapingResult)
{
    if (scrapingResult == null)
    {
        // Some scrapers are only used to start other scrapers. Those don't return results.
        return;
    }

    _logger.LogInformation($"Saving scraping results for {scrapingResult.Url}");

    // Find appropriate persister
    var persister = _persisterFactory.GetPersister(scrapingResult.ResultObjectType);
    if (persister == null)
    {
        // No persister for this type of scraped object
        _logger.LogWarning($"No persister found for object {scrapingResult.ResultObjectType}");
        return;
    }

    // Use reflection to invoke method with generic parameter
    var method = persister.GetType().GetMethod("Persist");
    if (method == null)
    {
        // Without this guard a persister lacking a Persist method would surface as a NullReferenceException.
        _logger.LogError($"Persister {persister.GetType()} has no Persist method; URL: {scrapingResult.Url}");
        return;
    }

    try
    {
        // The task has to be awaited: inspecting Task.Exception right after Invoke only
        // sees failures of tasks that already completed, so errors of a still-running
        // persist operation would never be observed.
        var persisterTaskResult = method.Invoke(persister, new[] { scrapingResult.ResultObject }) as Task;
        if (persisterTaskResult != null)
        {
            await persisterTaskResult;
        }
    }
    catch (System.Exception ex)
    {
        // A synchronous throw inside Persist arrives wrapped by the reflection layer; log the real cause.
        var cause = (ex as System.Reflection.TargetInvocationException)?.InnerException ?? ex;
        _logger.LogError($"Error occured while persisting URL: {scrapingResult.Url}\n{cause}");
    }
}
/// <summary>
/// Returns the scraping result for the requested site.
/// </summary>
/// <param name="siteName">Site address read from the request body.</param>
/// <returns>
/// An Ok result wrapping the outcome of <c>CheckForUpdates</c> when <paramref name="siteName"/>
/// is exactly "http://techbitsolution.com"; otherwise an Ok result wrapping null.
/// </returns>
public async Task<ActionResult<ScrapingResult>> GetScrapedData([FromBody] string siteName)
{
    // Only this one site is served; the comparison is exact and case-sensitive.
    if (siteName == "http://techbitsolution.com")
    {
        ScrapingResult updates = await CheckForUpdates(siteName, "Web-Scraper updates");
        return Ok(updates);
    }

    return Ok(null);
}
/// <summary>
/// Runs the user-profile scrape for one page and packages the outcome.
/// </summary>
/// <param name="sc">Search criteria forwarded to the scraper.</param>
/// <param name="pageNo">Page number forwarded to the scraper.</param>
/// <param name="scraper">Scraper whose <c>StartUserProfilesScraping</c> is called.</param>
/// <returns>
/// A new <c>ScrapingResult</c> holding the scraped profiles and the scraper's
/// "more records" flag.
/// </returns>
private ScrapingResult ScrapingTaskOperation(SearchCriteria sc, int pageNo, LIScraper scraper)
{
    var outcome = new ScrapingResult();

    // The scraper reports through the out argument whether further records remain.
    outcome.UserProfiles = scraper.StartUserProfilesScraping(sc, pageNo, out bool hasMoreRecords);
    outcome.AreThereMoreRecords = hasMoreRecords;

    return outcome;
}
/// <summary>
/// Asynchronously scrapes one page by delegating to <c>StartScraingTask</c>.
/// </summary>
/// <param name="sc">Search criteria passed through unchanged.</param>
/// <param name="pageNo">Page number passed through unchanged.</param>
/// <param name="scraper">Scraper instance passed through unchanged.</param>
/// <returns>The result produced by <c>StartScraingTask</c>.</returns>
public async Task<ScrapingResult> StartScraping(SearchCriteria sc, int pageNo, LIScraper scraper)
{
    return await StartScraingTask(sc, pageNo, scraper);
}
/// <summary>
/// Stores a scraping result by passing it to <c>_scarpingResult.InsertOne</c>.
/// </summary>
/// <param name="scapingData">The result to store.</param>
/// <returns>The same <paramref name="scapingData"/> instance that was passed in.</returns>
public ScrapingResult Create(ScrapingResult scapingData)
{
    _scarpingResult.InsertOne(scapingData);

    // Hand the caller back the instance it supplied.
    return scapingData;
}
/// <summary>
/// Loads the page at <paramref name="url"/> with AngleSharp and collects its title,
/// meta description/keywords and anchor links.
/// </summary>
/// <param name="url">Address of the page to open; also stored as the result's <c>SiteName</c>.</param>
/// <param name="results">Not read by this method; kept so the signature stays unchanged for callers.</param>
/// <returns>
/// A <c>ScrapingResult</c> whose <c>SocialMediaLinks</c> holds the distinct links pointing at the
/// known social-media hosts and whose <c>Hyperlinks</c> holds the distinct remaining links that
/// contain "https://techbitsolution.com/services/". <c>Title</c> is left unset when the page has
/// no title element.
/// </returns>
private async Task<ScrapingResult> GetPageData(string url, List<dynamic> results)
{
    // Only hyperlinks containing this text are kept in the final result.
    const string servicesLinkMarker = "https://techbitsolution.com/services/";

    // Authorities (host, plus port when non-default) that classify a link as social media.
    string[] socialMediaHosts = { "www.facebook.com", "www.youtube.com", "twitter.com", "www.linkedin.com" };

    ScrapingResult result = new ScrapingResult();
    result.Hyperlinks = new List<string>();
    result.SocialMediaLinks = new List<string>();

    var config = Configuration.Default.WithDefaultLoader();
    var context = BrowsingContext.New(config);
    var document = await context.OpenAsync(url);

    result.SiteName = url;

    // Indexing [0] unconditionally throws on pages that have no <title>.
    var titleElements = document.GetElementsByTagName("title");
    if (titleElements.Length > 0)
    {
        result.Title = titleElements[0].InnerHtml;
    }

    foreach (var ele in getAllElements(document.Head))
    {
        if (!string.Equals(ele.LocalName, "meta", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        var meta = (AngleSharp.Html.Dom.IHtmlMetaElement)ele;
        if (meta.Name == "description")
        {
            result.MetaDescription = meta.Content;
        }
        else if (meta.Name == "keywords")
        {
            result.MetaKeywords = meta.Content;
        }
    }

    foreach (var ele in getAllElements(document.Body))
    {
        if (!string.Equals(ele.LocalName, "a", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        string href = ((AngleSharp.Html.Dom.IHtmlAnchorElement)ele).Href;

        // Anchors whose href is empty or not an absolute URI cannot be classified by host;
        // skip them instead of letting the Uri constructor throw and abort the whole page.
        Uri linkUri;
        if (!Uri.TryCreate(href, UriKind.Absolute, out linkUri))
        {
            continue;
        }

        if (socialMediaHosts.Contains(linkUri.Authority))
        {
            result.SocialMediaLinks.Add(href);
        }
        else
        {
            result.Hyperlinks.Add(href);
        }
    }

    result.SocialMediaLinks = result.SocialMediaLinks.Distinct().ToList();
    result.Hyperlinks = result.Hyperlinks
        .Where(link => link.Contains(servicesLinkMarker))
        .Distinct()
        .ToList();

    return result;
}