/// <summary>
/// Records every <c>img</c> element of the scraped page that has no <c>alt</c>
/// attribute, or an empty one, in <c>ImgsWithNoAlt</c>.
/// </summary>
/// <param name="response">
/// Scrape result; its parsed document is searched and its URL is stored with each finding.
/// </param>
/// <remarks>
/// Findings are keyed by the element's outer HTML, so identical markup is stored only once,
/// together with the URL of the first page it was seen on.
/// The scan runs synchronously on the calling thread, like the other helpers in this class:
/// the previous fire-and-forget <c>Task.Run</c> was never awaited, mutated the dictionary
/// from a pool thread and hid any exception it threw.
/// </remarks>
public void ImgsHelper(ScraperEngineResponse response)
{
    var results = response.Doc.DocumentNode.SelectNodes("//img[not(@alt)] | //img[@alt='']");
    if (results == null)
    {
        // Nothing matched: no state changed, so no refresh is requested.
        return;
    }

    foreach (var result in results)
    {
        var markup = result.OuterHtml;
        if (markup == null)
        {
            continue;
        }

        if (!ImgsWithNoAlt.ContainsKey(markup))
        {
            ImgsWithNoAlt.Add(markup, response.Url);
        }
    }

    // One notification per page, after all findings have been recorded.
    StateHasChangedDelegate?.Invoke();
}
/// <summary>
/// Classifies every <c>meta</c> element named <c>description</c> on the scraped page by the
/// length of its <c>content</c> attribute and files it into the matching result list.
/// </summary>
/// <param name="response">
/// Scrape result; its parsed document is searched and its URL is stored with each finding.
/// </param>
/// <remarks>
/// Buckets: missing (attribute absent or empty), too long (more than 160 characters),
/// too short (50 characters or fewer), otherwise good. Every finding is also appended to
/// <c>AllDescriptions</c>, and <c>StateHasChangedDelegate</c> is raised after each one.
/// An absent attribute is stored with the text <c>"none"</c>; a tag whose content is
/// literally "none" is now classified by its length instead of being reported as missing.
/// </remarks>
public void DescriptionsHelper(ScraperEngineResponse response)
{
    const int TooLongAbove = 160;
    const int TooShortUpTo = 50;

    var results = response.Doc.DocumentNode.SelectNodes("//meta[@name='description']");
    if (results == null)
    {
        // NOTE(review): a page without any description tag is not recorded at all - confirm
        // whether it should be reported as "Missing description".
        return;
    }

    foreach (var result in results)
    {
        // A null default distinguishes "attribute absent" from any real attribute value.
        string content = result.GetAttributeValue("content", (string)null);

        TitleDescCheck description;
        if (string.IsNullOrEmpty(content))
        {
            description = new TitleDescCheck("Missing description", content ?? "none", response.Url);
            EmptyDescriptions.Add(description);
        }
        else if (content.Length > TooLongAbove)
        {
            description = new TitleDescCheck("Too long", content, response.Url);
            LongDescriptions.Add(description);
        }
        else if (content.Length <= TooShortUpTo)
        {
            description = new TitleDescCheck("Too short", content, response.Url);
            ShortDescriptions.Add(description);
        }
        else
        {
            description = new TitleDescCheck("Good", content, response.Url);
            HealthyDescriptions.Add(description);
        }

        AllDescriptions.Add(description);
        StateHasChangedDelegate?.Invoke();
    }
}
/// <summary>
/// Classifies every <c>title</c> element on the scraped page by the length of its text and
/// files it into the matching result list.
/// </summary>
/// <param name="response">
/// Scrape result; its parsed document is searched and its URL is stored with each finding.
/// </param>
/// <remarks>
/// Buckets: missing (no text), too long (60 characters or more), too short (40 characters
/// or fewer), otherwise good. Every finding is also appended to <c>AllTitles</c>, and
/// <c>StateHasChangedDelegate</c> is raised after each one.
/// The text is trimmed before it is measured and stored, so indentation and line breaks
/// around the title no longer count towards its length, and a whitespace-only title is
/// reported as missing rather than too short.
/// </remarks>
public void TitlesHelper(ScraperEngineResponse response)
{
    const int TooLongFrom = 60;
    const int TooShortUpTo = 40;

    var results = response.Doc.DocumentNode.SelectNodes("//title");
    if (results == null)
    {
        // NOTE(review): a page without any title element is not recorded at all - confirm
        // whether it should be reported as "Missing title".
        return;
    }

    foreach (var result in results)
    {
        var text = (result.InnerText ?? string.Empty).Trim();

        TitleDescCheck title;
        if (text.Length == 0)
        {
            title = new TitleDescCheck("Missing title", text, response.Url);
            EmptyTitles.Add(title);
        }
        else if (text.Length >= TooLongFrom)
        {
            title = new TitleDescCheck("Too long", text, response.Url);
            LongTitles.Add(title);
        }
        else if (text.Length <= TooShortUpTo)
        {
            title = new TitleDescCheck("Too short", text, response.Url);
            ShortTitles.Add(title);
        }
        else
        {
            title = new TitleDescCheck("Good", text, response.Url);
            HealthyTitles.Add(title);
        }

        AllTitles.Add(title);
        StateHasChangedDelegate?.Invoke();
    }
}
/// <summary>
/// Extracts the adverts listed inside the element with id <c>tads</c> and appends one
/// <c>Advert</c> per entry to <c>Reklame</c>.
/// </summary>
/// <param name="response">Scrape result whose parsed document is searched.</param>
/// <remarks>
/// For each <c>li</c> with class <c>ads-ad</c> the displayed address (the <c>cite</c> element
/// with class <c>UdQCqe</c>) is parsed into a host name. The advert is created with
/// <c>"https://" + host</c>, the host without a leading <c>www.</c>, the HTML-decoded
/// heading text and the text of the <c>ads-creative</c> element.
/// <c>StateHasChangedDelegate</c> is raised after each advert that is added.
/// An entry without a <c>cite</c> element, or whose address cannot be parsed, is skipped
/// instead of aborting the remaining entries; a missing heading or description becomes
/// an empty string.
/// </remarks>
private void GiveAds(ScraperEngineResponse response)
{
    var doorstep = response.Doc.GetElementbyId("tads");
    if (doorstep == null)
    {
        return;
    }

    // The leading '.' keeps the search inside the "tads" element; a bare "//" would be
    // evaluated against the whole document even though it is called on this node.
    var topads = doorstep.SelectNodes(".//li[@class='ads-ad']");
    if (topads == null)
    {
        return;
    }

    foreach (var adresult in topads)
    {
        var citeText = adresult.SelectSingleNode(".//cite[@class='UdQCqe']")?.InnerText;
        if (!TryGetAdHost(citeText, out var host))
        {
            continue;
        }

        var link = "https://" + host;

        // Only a leading "www." is removed; any other host is kept whole.
        var bareDomain = host.StartsWith("www.", StringComparison.Ordinal)
            ? host.Substring(4)
            : host;

        var subject = WebUtility.HtmlDecode(adresult.SelectSingleNode(".//h3")?.InnerText ?? "");
        var desc = adresult.SelectSingleNode(".//div[@class='ads-creative']")?.InnerText ?? "";

        Reklame.Add(new Advert(link, bareDomain, subject, desc));
        StateHasChangedDelegate?.Invoke();
    }
}

// Parses the address shown on an advert into its host name; returns false when it is missing or malformed.
private static bool TryGetAdHost(string citeText, out string host)
{
    host = null;
    if (string.IsNullOrWhiteSpace(citeText))
    {
        return false;
    }

    var candidate = citeText.Trim();
    var hasScheme = candidate.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        || candidate.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
    if (!hasScheme)
    {
        // Uri needs a scheme to parse an absolute address.
        candidate = "http://" + candidate;
    }

    if (!Uri.TryCreate(candidate, UriKind.Absolute, out var uri) || string.IsNullOrEmpty(uri.Host))
    {
        return false;
    }

    host = uri.Host;
    return true;
}