/// <summary>
/// Reads "CharityNavigator.json" from the working directory and merges each
/// charity's current rating into <paramref name="sites"/>.
/// </summary>
/// <param name="sites">Known sites; matched by <c>Domain</c> and mutated in place.</param>
/// <returns>The same <paramref name="sites"/> list instance, after merging.</returns>
/// <remarks>
/// Entries are skipped when they have no usable <c>currentRating.rating</c> or no
/// <c>websiteURL</c> string. For a known domain the CharityNavigator/CharityRating
/// source is updated (or added); for an unknown domain a new NonProfit site is appended.
/// </remarks>
public static List<OrganizationSite> StartImport(List<OrganizationSite> sites)
{
    string charityFile = File.ReadAllText("CharityNavigator.json");
    var json = JArray.Parse(charityFile);

    foreach (var charity in json)
    {
        // A JSON null is a non-null JToken, so a plain "!= null" test is not enough:
        // require an object with a non-null "rating" before reading the value.
        var currentRating = charity["currentRating"];
        if (currentRating == null || currentRating.Type != JTokenType.Object)
        {
            continue;
        }

        var ratingToken = currentRating["rating"];
        if (ratingToken == null || ratingToken.Type == JTokenType.Null)
        {
            continue;
        }

        var websiteToken = charity["websiteURL"];
        if (websiteToken == null)
        {
            continue;
        }

        var websiteUrl = websiteToken.Value<string>();
        if (websiteUrl == null)
        {
            continue;
        }

        int rating = ratingToken.Value<int>();
        var domain = ExtractDomainNameFromURL(websiteUrl);

        // SingleOrDefault keeps the original "throw on duplicate domains" behaviour
        // while scanning the list only once.
        var site = sites.SingleOrDefault(s => s.Domain.Equals(domain));
        if (site == null)
        {
            site = new OrganizationSite();
            site.Name = charity["charityName"].Value<string>();
            site.OrganizationType = OrgType.NonProfit;
            site.Domain = domain;

            var newSiteSource = new Source();
            newSiteSource.ClaimType = SourceClaimType.CharityRating;
            newSiteSource.ClaimValue = rating;
            newSiteSource.Organization = SourceOrganization.CharityNavigator;
            newSiteSource.URL = charity["orgID"].Value<string>();
            site.Sources.Add(newSiteSource);

            sites.Add(site);
            Console.WriteLine("Adding site " + site.Name);
            continue;
        }

        var existingSource = site.Sources.SingleOrDefault(
            s => s.Organization == SourceOrganization.CharityNavigator
                 && s.ClaimType == SourceClaimType.CharityRating);

        if (existingSource != null)
        {
            existingSource.ClaimValue = rating;
            Console.WriteLine("Updating Source for " + site.Name);
        }
        else
        {
            var source = new Source();
            source.ClaimType = SourceClaimType.CharityRating;
            source.ClaimValue = rating;
            source.Organization = SourceOrganization.CharityNavigator;
            source.URL = charity["orgID"].Value<string>();
            site.Sources.Add(source);
            Console.WriteLine("Added Source for " + site.Name);
        }
    }

    return sites;
}
/// <summary>
/// Downloads one AllSides source page ("https://www.allsides.com" + <paramref name="url"/>)
/// and records its bias rating against the matching entry in <c>_sites</c>.
/// </summary>
/// <param name="url">Site-relative path of the AllSides source page.</param>
/// <returns>
/// The updated existing site, or a newly built site (not added to <c>_sites</c> here);
/// <c>null</c> for two hard-coded paths, when the page has no ".dynamic-grid &gt; a" link,
/// or when <c>IgnoreUrl</c> rejects the domain.
/// </returns>
private static OrganizationSite LoadSite(string url)
{
    if (url == "/news-source/test-source" || url == "/news-source/yahoo-360-media-bias")
    {
        return null;
    }

    var doc = new HtmlAgilityPack.HtmlDocument();
    var request = (HttpWebRequest)WebRequest.Create("https://www.allsides.com" + url);
    string html;
    using (var response = (HttpWebResponse)request.GetResponse())
    using (var sr = new StreamReader(response.GetResponseStream()))
    {
        html = sr.ReadToEnd();
    }
    doc.LoadHtml(html);

    var siteInfo = doc.QuerySelectorAll(".dynamic-grid > a").FirstOrDefault();
    if (siteInfo == null)
    {
        return null;
    }

    string domain;
    if (url == "/news-source/suspend-belief-podcast")
    {
        //Override for weird format
        domain = "suspendbeliefpodcast.com";
    }
    else
    {
        domain = ExtractDomainNameFromURL(siteInfo.Attributes["href"].Value);
    }

    if (IgnoreUrl(domain))
    {
        return null;
    }

    var site = _sites.SingleOrDefault(s => s.Domain.Equals(domain));
    if (site != null)
    {
        var existingSource = site.Sources.SingleOrDefault(
            s => s.Organization == SourceOrganization.AllSides && s.ClaimType == SourceClaimType.Bias);

        if (existingSource != null)
        {
            existingSource.ClaimValue = (int)GetBias(doc.QuerySelector(".rating-area a").InnerHtml.ToLower());
        }
        else
        {
            var source = new Source();
            source.Organization = SourceOrganization.AllSides;
            source.URL = "https://www.allsides.com" + url;
            source.ClaimType = SourceClaimType.Bias;
            source.ClaimValue = (int)GetBias(doc.QuerySelector(".rating-area a").InnerHtml.ToLower());
            site.Sources.Add(source);
        }

        Console.WriteLine("Updated " + site.Name);
        return site;
    }

    site = new OrganizationSite();
    site.Name = HttpUtility.HtmlDecode(doc.QuerySelector(".latest_news_source h1").InnerText);
    // Use the domain computed above so the hard-coded override also applies to new
    // sites; re-extracting from the href would store a value the lookup never matches.
    site.Domain = domain;

    if (doc.QuerySelector(".latest_news_source p").InnerText == "Think Tank / Policy Group")
    {
        site.OrganizationType = OrgType.ThinkTank;
    }
    else
    {
        site.OrganizationType = OrgType.NewsMedia;
    }

    var wikiLink = doc.QuerySelector("a[title=Wikipedia]");
    if (wikiLink != null && wikiLink.Attributes["href"].Value.Contains("wikipedia"))
    {
        var wikiUrl = wikiLink.Attributes["href"].Value;
        site.Description = GetWikipediaDescription(wikiUrl);
        site.Wikipedia = wikiUrl;
    }

    var biasSource = new Source();
    biasSource.Organization = SourceOrganization.AllSides;
    biasSource.URL = "https://www.allsides.com" + url;
    biasSource.ClaimType = SourceClaimType.Bias;
    biasSource.ClaimValue = (int)GetBias(doc.QuerySelector(".rating-area a").InnerHtml.ToLower());
    site.Sources.Add(biasSource);

    Console.WriteLine("Loaded " + site.Name);
    return site;
}
/// <summary>
/// Downloads the MBFC page at <paramref name="url"/>, derives a veracity/org-type
/// classification from its "factual reporting:" and "reasoning:" paragraphs, and
/// records MBFC sources against the matching entry in <c>_sites</c>.
/// </summary>
/// <param name="url">Absolute URL of the page; also stored as the source URL.</param>
/// <returns>
/// The updated existing site, or a newly built site (not added to <c>_sites</c> here);
/// <c>null</c> when no domain link is found, the domain is ignored, or any exception
/// occurs (the message is written to the console).
/// </returns>
private static OrganizationSite LoadSite(string url)
{
    try
    {
        var doc = new HtmlAgilityPack.HtmlDocument();
        var request = WebRequest.Create(url);
        string html;
        using (var response = (HttpWebResponse)request.GetResponse())
        using (var sr = new StreamReader(response.GetResponseStream()))
        {
            html = sr.ReadToEnd();
        }
        doc.LoadHtml(html);

        var siteName = HttpUtility.HtmlDecode(doc.QuerySelector("h1.page-title").InnerHtml).Trim();
        var domainLink = getDomain(doc, siteName);
        if (domainLink == null)
        {
            Console.WriteLine(siteName + ": Domain not found.");
            return null;
        }

        var domain = ExtractDomainNameFromURL(domainLink.Attributes["href"].Value);
        if (IgnoreUrl(domain))
        {
            return null;
        }

        //Start by defaulting based on fact reporting
        OrgType orgType = OrgType.ExtremelyUnreliable;
        var factRating = doc.QuerySelectorAll("div.entry-content p")
            .Where(s => s.InnerText.Trim().ToLower().StartsWith("factual reporting:"))
            .FirstOrDefault();
        if (factRating == null)
        {
            // Fall back to the alternate container selector.
            factRating = doc.QuerySelectorAll("div.entry p")
                .Where(s => s.InnerText.Trim().ToLower().StartsWith("factual reporting:"))
                .FirstOrDefault();
        }
        if (factRating != null && factRating.InnerText.ToLower().Contains("very low"))
        {
            orgType = OrgType.Fake;
        }

        //Examine the reasoning, and adjust based on listed answers.
        var biases = doc.QuerySelectorAll("div.entry-content p")
            .Where(s => s.InnerText.Trim().ToLower().StartsWith("reasoning:"))
            .FirstOrDefault();
        if (biases != null)
        {
            var reasoning = biases.InnerText.ToLower();
            // Order matters: "some fake news" also contains "fake news", so the softer
            // verdict must be applied second; the hate-group labels override both.
            if (reasoning.Contains("fake news"))
            {
                orgType = OrgType.Fake;
            }
            if (reasoning.Contains("some fake news"))
            {
                orgType = OrgType.ExtremelyUnreliable;
            }
            if (reasoning.Contains("hate group") || reasoning.Contains("anti-lgbt") || reasoning.Contains("anti-islam"))
            {
                orgType = OrgType.HateGroup;
            }
        }

        var site = _sites.SingleOrDefault(s => s.Domain.Equals(domain));
        if (site != null)
        {
            // Existing MBFC veracity claims are always dropped and rebuilt below, so no
            // "already present" check is needed before re-adding.
            site.Sources.RemoveAll(s => s.Organization == SourceOrganization.MBFC && s.ClaimType == SourceClaimType.Veracity);
            site.OrganizationType = orgType;

            if (orgType == OrgType.Fake || orgType == OrgType.ExtremelyUnreliable)
            {
                var source = new Source();
                source.Organization = SourceOrganization.MBFC;
                source.URL = url;
                source.ClaimType = SourceClaimType.Veracity;
                source.ClaimValue = (int)orgType;
                site.Sources.Add(source);
                Console.WriteLine("Added Veracity Source for " + site.Name);
            }

            if (orgType == OrgType.HateGroup
                && !site.Sources.Any(s => s.Organization == SourceOrganization.MBFC && s.ClaimType == SourceClaimType.OrgType))
            {
                var source = new Source();
                source.Organization = SourceOrganization.MBFC;
                source.URL = url;
                source.ClaimType = SourceClaimType.OrgType;
                source.ClaimValue = (int)OrgType.HateGroup;
                site.Sources.Add(source);
                Console.WriteLine("Added Org Type Source for " + site.Name);
            }

            var biasSource = site.Sources.SingleOrDefault(
                s => s.Organization == SourceOrganization.MBFC && s.ClaimType == SourceClaimType.Bias);
            var bias = GetBias(doc);
            if (biasSource != null)
            {
                // An existing bias claim is overwritten even when the new value is Unknown.
                biasSource.ClaimValue = (int)bias;
                Console.WriteLine("Updating Bias Source for " + site.Name);
            }
            else if (bias != Bias.Unknown)
            {
                var source = new Source();
                source.Organization = SourceOrganization.MBFC;
                source.URL = url;
                source.ClaimType = SourceClaimType.Bias;
                source.ClaimValue = (int)bias;
                site.Sources.Add(source);
                Console.WriteLine("Added Bias Source for " + site.Name);
            }

            return site;
        }

        site = new OrganizationSite();
        site.Name = siteName;
        site.Domain = domain;
        site.OrganizationType = orgType;

        // Evaluate the deferred "notes:" query once instead of once per use.
        var note = doc.QuerySelectorAll("div.entry-content p")
            .Where(s => s.InnerText.Trim().ToLower().StartsWith("notes:"))
            .FirstOrDefault();
        if (note != null)
        {
            // Anchors without an href yield a null attribute; skip them rather than
            // letting a NullReferenceException discard the whole site.
            var wikiLink = note.QuerySelectorAll("a")
                .Where(a => a.Attributes["href"] != null && a.Attributes["href"].Value.Contains("wikipedia"))
                .FirstOrDefault();
            if (wikiLink != null)
            {
                site.Wikipedia = wikiLink.Attributes["href"].Value;
            }
        }

        var newSiteBias = GetBias(doc);
        if (newSiteBias != Bias.Unknown)
        {
            var source = new Source();
            source.Organization = SourceOrganization.MBFC;
            source.URL = url;
            source.ClaimType = SourceClaimType.Bias;
            source.ClaimValue = (int)newSiteBias;
            site.Sources.Add(source);
        }

        if (orgType == OrgType.Fake || orgType == OrgType.ExtremelyUnreliable)
        {
            var source = new Source();
            source.Organization = SourceOrganization.MBFC;
            source.URL = url;
            source.ClaimType = SourceClaimType.Veracity;
            source.ClaimValue = (int)orgType;
            site.Sources.Add(source);
        }
        else if (orgType == OrgType.HateGroup)
        {
            var source = new Source();
            source.Organization = SourceOrganization.MBFC;
            source.URL = url;
            source.ClaimType = SourceClaimType.OrgType;
            source.ClaimValue = (int)OrgType.HateGroup;
            site.Sources.Add(source);
        }

        Console.WriteLine("Loaded " + site.Name);
        return site;
    }
    catch (Exception ex)
    {
        // Best-effort scrape: report the failure and let the caller skip this page.
        Console.WriteLine(ex.Message);
        return null;
    }
}
/// <summary>
/// Walks the paginated listing starting at <paramref name="currentPage"/> and records a
/// RealOrSatire veracity source for every listed article that is not in the
/// "category-real" class.
/// </summary>
/// <param name="url">Format string for the listing URL; <c>{0}</c> receives the page number.</param>
/// <param name="currentPage">Page number to start from.</param>
/// <remarks>
/// Paging continues for as long as the fetched page contains a "div.nav-previous" element.
/// </remarks>
public static void Import(string url, int currentPage)
{
    bool hasFurtherPage;
    do
    {
        var page = new HtmlAgilityPack.HtmlDocument();
        var pageRequest = (HttpWebRequest)WebRequest.Create(string.Format(url, currentPage));
        pageRequest.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0";
        var pageResponse = (HttpWebResponse)pageRequest.GetResponse();

        string markup;
        using (var reader = new StreamReader(pageResponse.GetResponseStream()))
        {
            markup = reader.ReadToEnd();
        }
        page.LoadHtml(markup);

        foreach (var article in page.DocumentNode.QuerySelectorAll("#primary article:not(.category-real)"))
        {
            var domain = ExtractDomainNameFromURL(article.QuerySelector("h2.entry-title").InnerText);
            if (IgnoreUrl(domain))
            {
                continue;
            }

            // Everything listed is treated as satire unless tagged as clickbait.
            var claimedType = article.Attributes["class"].Value.Contains("category-clickbait")
                ? OrgType.ClickBait
                : OrgType.Satire;

            var known = _sites.SingleOrDefault(s => s.Domain.Equals(domain));
            if (known != null)
            {
                bool alreadyListed = known.Sources.Any(
                    s => s.Organization == SourceOrganization.RealOrSatire && s.ClaimType == SourceClaimType.Veracity);
                if (!alreadyListed)
                {
                    var addedSource = new Source();
                    addedSource.Organization = SourceOrganization.RealOrSatire;
                    addedSource.URL = article.QuerySelector("h2.entry-title a").Attributes["href"].Value;
                    addedSource.ClaimType = SourceClaimType.Veracity;
                    addedSource.ClaimValue = (int)claimedType;
                    known.Sources.Add(addedSource);
                }
                continue;
            }

            var created = new OrganizationSite();
            created.Name = domain;
            created.Domain = domain;
            created.OrganizationType = claimedType;

            var firstSource = new Source();
            firstSource.Organization = SourceOrganization.RealOrSatire;
            firstSource.URL = article.QuerySelector("h2.entry-title a").Attributes["href"].Value;
            firstSource.ClaimType = SourceClaimType.Veracity;
            firstSource.ClaimValue = (int)claimedType;
            created.Sources.Add(firstSource);

            if (!_sites.Contains(created))
            {
                _sites.Add(created);
            }
        }

        hasFurtherPage = page.QuerySelector("div.nav-previous") != null;
        currentPage++;
    }
    while (hasFurtherPage);
}
/// <summary>
/// Downloads the MBFC page at <paramref name="url"/> and records its bias rating
/// against the matching entry in <c>_sites</c>.
/// </summary>
/// <param name="url">Absolute URL of the page; also stored as the source URL.</param>
/// <returns>
/// The updated existing site, or a newly built site (not added to <c>_sites</c> here);
/// <c>null</c> when no domain link is found, the domain is ignored, or any exception
/// occurs (the message is written to the console).
/// </returns>
private static OrganizationSite LoadSite(string url)
{
    try
    {
        var doc = new HtmlAgilityPack.HtmlDocument();
        var request = WebRequest.Create(url);
        string html;
        using (var response = (HttpWebResponse)request.GetResponse())
        using (var sr = new StreamReader(response.GetResponseStream()))
        {
            html = sr.ReadToEnd();
        }
        doc.LoadHtml(html);

        var siteName = HttpUtility.HtmlDecode(doc.QuerySelector("h1.page-title").InnerHtml).Trim();
        var domainLink = getDomain(doc, siteName);
        if (domainLink == null)
        {
            Console.WriteLine(siteName + ": Domain not found.");
            return null;
        }

        var domain = ExtractDomainNameFromURL(domainLink.Attributes["href"].Value);
        if (IgnoreUrl(domain))
        {
            return null;
        }

        var site = _sites.SingleOrDefault(s => s.Domain.Equals(domain));
        if (site != null)
        {
            var biasSource = site.Sources.SingleOrDefault(
                s => s.Organization == SourceOrganization.MBFC && s.ClaimType == SourceClaimType.Bias);

            if (biasSource != null)
            {
                biasSource.ClaimValue = (int)GetBias(doc);
                Console.WriteLine("Updating Source for " + site.Name);
            }
            else
            {
                var source = new Source();
                source.Organization = SourceOrganization.MBFC;
                source.URL = url;
                source.ClaimType = SourceClaimType.Bias;
                source.ClaimValue = (int)GetBias(doc);
                site.Sources.Add(source);
                Console.WriteLine("Added Source for " + site.Name);
            }

            return site;
        }

        site = new OrganizationSite();
        site.Name = siteName;
        site.Domain = domain;

        // Evaluate the deferred "notes:" query once instead of once per use.
        var note = doc.QuerySelectorAll("div.entry-content p")
            .Where(s => s.InnerText.Trim().ToLower().StartsWith("notes:"))
            .FirstOrDefault();
        if (note != null)
        {
            site.OrganizationType = GetOrgType(note);

            // Anchors without an href yield a null attribute; skip them rather than
            // letting a NullReferenceException discard the whole site.
            var wikiLink = note.QuerySelectorAll("a")
                .Where(a => a.Attributes["href"] != null && a.Attributes["href"].Value.Contains("wikipedia"))
                .FirstOrDefault();
            if (wikiLink != null)
            {
                site.Wikipedia = wikiLink.Attributes["href"].Value;
            }
        }

        // A bias source is always recorded for a new site, whatever GetBias returns.
        var newSource = new Source();
        newSource.Organization = SourceOrganization.MBFC;
        newSource.URL = url;
        newSource.ClaimType = SourceClaimType.Bias;
        newSource.ClaimValue = (int)GetBias(doc);
        site.Sources.Add(newSource);

        Console.WriteLine("Loaded " + site.Name);
        return site;
    }
    catch (Exception ex)
    {
        // Best-effort scrape: report the failure and let the caller skip this page.
        Console.WriteLine(ex.Message);
        return null;
    }
}
/// <summary>
/// Downloads the OpenSources "sources.json" list and merges each entry's
/// classification into <paramref name="sites"/>.
/// </summary>
/// <param name="sites">Known sites; matched by <c>Domain</c> and mutated in place.</param>
/// <returns>The same <paramref name="sites"/> list instance, after merging.</returns>
/// <remarks>
/// Entries classified as <c>OrgType.Other</c> are skipped. Hate-group entries are
/// recorded as an OrgType claim, everything else as a Veracity claim. An existing
/// claim of the matching type is updated; otherwise one is added.
/// </remarks>
public static List<OrganizationSite> StartImport(List<OrganizationSite> sites)
{
    var request = (HttpWebRequest)WebRequest.Create("https://raw.githubusercontent.com/BigMcLargeHuge/opensources/master/sources/sources.json");
    request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0";

    string jsonfile;
    using (var response = (HttpWebResponse)request.GetResponse())
    using (var sr = new StreamReader(response.GetResponseStream()))
    {
        jsonfile = sr.ReadToEnd();
    }

    var json = JObject.Parse(jsonfile);
    foreach (var jobj in json.Children<JProperty>())
    {
        var domain = ExtractDomainNameFromURL(jobj.Name);
        var orgType = GetOrgType(jobj.Value<JToken>().ElementAt(0), 0);
        if (orgType == OrgType.Other)
        {
            continue;
        }

        var claimType = orgType == OrgType.HateGroup
            ? SourceClaimType.OrgType
            : SourceClaimType.Veracity;

        var site = sites.SingleOrDefault(s => s.Domain.Equals(domain));
        if (site == null)
        {
            site = new OrganizationSite();
            site.Name = domain;
            site.OrganizationType = orgType;
            site.Domain = domain;

            var firstSource = new Source();
            firstSource.ClaimType = claimType;
            firstSource.ClaimValue = (int)orgType;
            firstSource.Organization = SourceOrganization.OpenSources;
            firstSource.URL = "http://www.opensources.co/";
            site.Sources.Add(firstSource);
            Console.WriteLine("Added Source for " + site.Name);

            sites.Add(site);
            Console.WriteLine("Adding site " + site.Name);
            continue;
        }

        // Never soften a classification that is already one of the severe types.
        if (site.OrganizationType != OrgType.HateGroup
            && site.OrganizationType != OrgType.Fake
            && site.OrganizationType != OrgType.ExtremelyUnreliable)
        {
            site.OrganizationType = orgType;
        }

        // Look up by the claim type that matches this entry. Keying on "any OpenSources
        // source" used to throw when only the other claim type was present (non-hate
        // entry) or silently record nothing (hate-group entry).
        var existing = site.Sources.SingleOrDefault(
            s => s.Organization == SourceOrganization.OpenSources && s.ClaimType == claimType);

        if (existing != null)
        {
            existing.ClaimValue = (int)orgType;
            Console.WriteLine("Updating Source for " + site.Name);
        }
        else
        {
            var source = new Source();
            source.ClaimType = claimType;
            source.ClaimValue = (int)orgType;
            source.Organization = SourceOrganization.OpenSources;
            source.URL = "http://www.opensources.co/";
            site.Sources.Add(source);
            Console.WriteLine("Added Source for " + site.Name);
        }
    }

    return sites;
}
/// <summary>
/// Scrapes the site list at "http://www.fakenewscodex.com/" and records a
/// FakeNewsCodex veracity source for every listed domain in <c>_sites</c>.
/// </summary>
/// <remarks>
/// The row's CSS class decides the classification: "badge--fake" maps to Fake,
/// "badge--satire" to Satire, anything else to ExtremelyUnreliable. Rows whose
/// domain is rejected by <c>IgnoreUrl</c> are skipped.
/// </remarks>
public static void Import()
{
    var doc = new HtmlAgilityPack.HtmlDocument();
    var request = (HttpWebRequest)WebRequest.Create("http://www.fakenewscodex.com/");
    string html;
    using (var response = (HttpWebResponse)request.GetResponse())
    using (var sr = new StreamReader(response.GetResponseStream()))
    {
        html = sr.ReadToEnd();
    }
    doc.LoadHtml(html);

    var rows = doc.DocumentNode.QuerySelectorAll("ul.list li");
    foreach (var row in rows)
    {
        var domain = ExtractDomainNameFromURL(row.QuerySelector("div.c-sites-list__url").InnerText);
        if (IgnoreUrl(domain))
        {
            continue;
        }

        var site = _sites.SingleOrDefault(s => s.Domain.Equals(domain));
        bool isNewSite = site == null;

        if (isNewSite)
        {
            site = new OrganizationSite();
            site.Name = row.QuerySelector("a.c-sites-list__link").InnerText.Trim();
            site.Domain = domain;
        }

        // Map the row's badge class to a classification once; every use below shares it.
        var badgeClasses = row.Attributes["class"].Value;
        OrgType listedType;
        if (badgeClasses.Contains("badge--fake"))
        {
            listedType = OrgType.Fake;
        }
        else if (badgeClasses.Contains("badge--satire"))
        {
            listedType = OrgType.Satire;
        }
        else
        {
            listedType = OrgType.ExtremelyUnreliable;
        }

        // Fake takes precedence over satire when opinions conflict: a satire row must
        // leave an already-Fake site untouched instead of downgrading it.
        bool satireOverFake = !isNewSite
            && listedType == OrgType.Satire
            && site.OrganizationType == OrgType.Fake;
        if (!satireOverFake)
        {
            site.OrganizationType = listedType;
        }

        Source source = null;
        if (!isNewSite)
        {
            source = site.Sources.SingleOrDefault(
                s => s.Organization == SourceOrganization.FakeNewsCodex && s.ClaimType == SourceClaimType.Veracity);
        }

        if (source != null)
        {
            source.ClaimValue = (int)listedType;
            Console.WriteLine("Updating Source for " + site.Name);
        }
        else
        {
            source = new Source();
            source.Organization = SourceOrganization.FakeNewsCodex;

            var readMore = row.QuerySelector("a.read-more");
            if (readMore != null)
            {
                source.URL = readMore.Attributes["href"].Value;
            }
            else
            {
                //Try alternate method to get url
                source.URL = row.QuerySelector("div.c-sites-list__image-container a").Attributes["href"].Value;
            }

            source.ClaimType = SourceClaimType.Veracity;
            source.ClaimValue = (int)listedType;
            site.Sources.Add(source);

            if (!isNewSite)
            {
                Console.WriteLine("Adding Source for " + site.Name);
            }
        }

        if (isNewSite)
        {
            if (!_sites.Contains(site))
            {
                _sites.Add(site);
            }
            Console.WriteLine("Adding Entry for " + site.Name);
        }
    }
}
/// <summary>
/// Downloads the station directory JSON at <paramref name="url"/> and records a
/// TVNewsCheck OrgType source (value <c>OrgType.NewsMedia</c>) for each station's
/// website domain in <c>_sites</c>.
/// </summary>
/// <param name="url">URL returning a JSON object with a "stations" collection.</param>
/// <remarks>
/// Stations with a missing or empty "website", an empty extracted domain, or a domain
/// rejected by <c>IgnoreUrl</c> are skipped. Unknown domains are appended to
/// <c>_sites</c> as new NewsMedia sites named after the station's "call_sign".
/// </remarks>
public static void Import(string url)
{
    var request = (HttpWebRequest)WebRequest.Create(url);
    request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0";

    string retval;
    using (var response = (HttpWebResponse)request.GetResponse())
    using (var sr = new StreamReader(response.GetResponseStream()))
    {
        retval = sr.ReadToEnd().Trim();
    }

    var json = JObject.Parse(retval);
    var stations = json.SelectToken("stations");
    foreach (var station in stations)
    {
        // A station without a "website" property is skipped like one with an empty
        // value, instead of throwing and aborting the whole import.
        var websiteToken = station.SelectToken("website");
        if (websiteToken == null)
        {
            continue;
        }

        var siteUrl = websiteToken.Value<string>();
        if (String.IsNullOrEmpty(siteUrl))
        {
            continue;
        }

        var domain = ExtractDomainNameFromURL(siteUrl);
        if (String.IsNullOrEmpty(domain) || IgnoreUrl(domain))
        {
            continue;
        }

        var site = _sites.SingleOrDefault(s => s.Domain.Equals(domain));
        if (site == null)
        {
            site = new OrganizationSite();
            site.Name = station.SelectToken("call_sign").Value<string>();
            site.Domain = domain;
            site.OrganizationType = OrgType.NewsMedia;

            var firstSource = new Source();
            firstSource.Organization = SourceOrganization.TVNewsCheck;
            firstSource.URL = "https://tvnewscheck.com/tv-station-directory/#/station/" + station.SelectToken("id").Value<string>();
            firstSource.ClaimType = SourceClaimType.OrgType;
            firstSource.ClaimValue = (int)OrgType.NewsMedia;
            site.Sources.Add(firstSource);

            Console.WriteLine("Loaded " + site.Name);
            _sites.Add(site);
            continue;
        }

        var existing = site.Sources.SingleOrDefault(s => s.Organization == SourceOrganization.TVNewsCheck);
        if (existing == null)
        {
            var source = new Source();
            source.Organization = SourceOrganization.TVNewsCheck;
            source.URL = "https://tvnewscheck.com/tv-station-directory/#/station/" + station.SelectToken("id").Value<string>();
            source.ClaimType = SourceClaimType.OrgType;
            source.ClaimValue = (int)OrgType.NewsMedia;
            site.Sources.Add(source);
            Console.WriteLine("Added Source for " + site.Name);
        }
        else
        {
            // Only the link is refreshed; the claim type and value are left as they were.
            existing.URL = "https://tvnewscheck.com/tv-station-directory/#/station/" + station.SelectToken("id").Value<string>();
            Console.WriteLine("Updated Source for " + site.Name);
        }
    }
}