/// <summary>
/// Downloads http://www.ishadowsocks.org/ and appends one <see cref="ServerConfig"/>
/// to <paramref name="serverList"/> for every server card found under the "#free" section.
/// </summary>
/// <param name="serverList">Destination list; parsed servers are appended to it.</param>
/// <remarks>
/// Each card is expected to contain at least four h4 elements of the form "label:value"
/// (address, port, password, method, in that order). Cards that do not match are
/// reported on the console and skipped instead of aborting the whole scrape.
/// </remarks>
private static void GetIshadowsocksServers(List<ServerConfig> serverList)
{
    var browser = new ScrapingBrowser();
    // Set UseDefaultCookiesParser to false if a website returns an invalid cookie format.
    // browser.UseDefaultCookiesParser = false;

    Console.WriteLine("Open website http://www.ishadowsocks.org/");
    var homePage = browser.NavigateToPage(new Uri("http://www.ishadowsocks.org/"));

    // Diagnostic only: the server cards are located by the CSS selector below, so a miss
    // here is reported but does not stop the scrape.
    var freeSection = homePage.Find("Section", By.Id("free")).FirstOrDefault();
    if (freeSection == null)
    {
        Console.WriteLine("Can't find the Free section.");
    }

    var serverNodes = homePage.Html.CssSelect("#free >div.container > div.row > div.col-sm-4");
    Console.WriteLine("Read Servers from HTML");
    Console.WriteLine("Parse the server html");

    // Returns everything after the FIRST ':' so values that themselves contain ':'
    // (e.g. a password) are not truncated; null when there is no ':' at all.
    Func<string, string> valueAfterColon = text =>
    {
        var parts = text.Split(new[] { ':' }, 2);
        return parts.Length == 2 ? parts[1] : null;
    };

    foreach (var serverNode in serverNodes)
    {
        var h4nodes = serverNode.ChildNodes.Where(n => n.Name.Contains("h4")).ToList();
        if (h4nodes.Count < 4)
        {
            Console.WriteLine("Skip a server block with fewer than four h4 entries.");
            continue;
        }

        var address = valueAfterColon(h4nodes[0].InnerText);
        var portText = valueAfterColon(h4nodes[1].InnerText);
        var password = valueAfterColon(h4nodes[2].InnerText);
        var method = valueAfterColon(h4nodes[3].InnerText);

        int port;
        if (address == null || portText == null || password == null || method == null
            || !int.TryParse(portText, out port))
        {
            Console.WriteLine("Skip a server block that is not in the expected \"label:value\" form.");
            continue;
        }

        serverList.Add(new ServerConfig()
        {
            server = address,
            server_port = port,
            password = password,
            method = method,
            // The remark is the server address, as in the original implementation.
            remarks = address,
        });
    }
}
/// <summary>
/// Form load handler: posts the city ("il") query to the DASK address-code endpoint,
/// stores the deserialized answer in <c>_iller</c> and fills <c>comboBox1</c> with the
/// returned city names, selecting the first entry.
/// </summary>
private async void MainForm_Load(object sender, EventArgs e)
{
    var endpoint = new Uri("http://adreskodu.dask.gov.tr/site-element/control/load.ashx");
    const string cityQuery = "fZwxqvPOpAViHEOXXVuXBaTd+2018072821lryxuVH5Y45L6Yb8Wo05CB46f1vrJzTCd1vDE8EiSLYLbaFSn0O0MhabkTx4t3A+Q==&t=il&u=0";

    // Set UseDefaultCookiesParser to false if a website returns an invalid cookie format.
    var browser = new ScrapingBrowser { Encoding = Encoding.UTF8 };

    WebPage response = await browser.NavigateToPageAsync(endpoint, HttpVerb.Post, cityQuery);

    // The endpoint answers with JSON; the raw markup of the "page" is the payload.
    string payload = response.Html.OuterHtml;
    Debug.WriteLine(payload);
    _iller = JsonConvert.DeserializeObject<ReturnData>(payload);

    comboBox1.Items.Clear();
    foreach (var il in _iller.yt)
    {
        comboBox1.Items.Add(il.text);
    }

    comboBox1.SelectedIndex = 0;
}
/// <summary>
/// Selection handler for <c>comboBox7</c>: looks up the selected row in <c>_daireler</c>,
/// posts an "adr" query for its code and shows the returned address text plus the code
/// in <c>richTextBox1</c>.
/// </summary>
private async void comboBox7_SelectedIndexChanged(object sender, EventArgs e)
{
    // Entry 0 is skipped (the rows start at combo index 1) and -1 means "nothing selected";
    // the original "== 0" test let -1 through and then indexed the row list with -2.
    var index = comboBox7.SelectedIndex;
    if (index <= 0)
    {
        return;
    }

    index = index - 1;
    if (_daireler == null || index >= _daireler.Count)
    {
        return;
    }

    // The code is the row's id attribute without its first character.
    var rowId = _daireler[index].GetAttributeValue("id", "");
    if (rowId.Length < 2)
    {
        // No usable id on the row: nothing to query.
        return;
    }

    var dairekodu = rowId.Substring(1);
    var postData = $"fZwxqvPOpAViHEOXXVuXBaTd+2018072821lryxuVH5Y45L6Yb8Wo05CB46f1vrJzTCd1vDE8EiSLYLbaFSn0O0MhabkTx4t3A+Q==&t=adr&u={dairekodu}";

    ScrapingBrowser browser = new ScrapingBrowser();
    browser.Encoding = Encoding.UTF8;
    WebPage homePage = await browser.NavigateToPageAsync(new Uri("http://adreskodu.dask.gov.tr/site-element/control/load.ashx"), HttpVerb.Post, postData);

    var adres = homePage.Html.InnerText;
    adres += "\r\nAdres Kodu:" + dairekodu;
    richTextBox1.Text = adres;
}
/// <summary>
/// Selection handler for <c>comboBox6</c>: posts an "ick" query for the selected entry of
/// <c>_binalar</c>, stores the returned table rows in <c>_daireler</c> and rebuilds
/// <c>comboBox7</c> with a placeholder followed by one entry per row.
/// </summary>
private async void comboBox6_SelectedIndexChanged(object sender, EventArgs e)
{
    // Entry 0 is skipped (the rows start at combo index 1) and -1 means "nothing selected";
    // the original "== 0" test let -1 through and then indexed the row list with -2.
    var index = comboBox6.SelectedIndex;
    if (index <= 0 || _binalar == null)
    {
        return;
    }

    index = index - 1;
    var binakodu = _binalar[index].GetAttributeValue("id", "").Substring(1);
    var postData = $"fZwxqvPOpAViHEOXXVuXBaTd+2018072821lryxuVH5Y45L6Yb8Wo05CB46f1vrJzTCd1vDE8EiSLYLbaFSn0O0MhabkTx4t3A+Q==&t=ick&u={binakodu}&term=";

    ScrapingBrowser browser = new ScrapingBrowser();
    browser.Encoding = Encoding.UTF8;
    WebPage homePage = await browser.NavigateToPageAsync(new Uri("http://adreskodu.dask.gov.tr/site-element/control/load.ashx"), HttpVerb.Post, postData);

    // SelectNodes returns null (not an empty collection) when nothing matches.
    _daireler = homePage.Html.SelectNodes("//tbody/tr");

    comboBox7.Items.Clear();
    comboBox7.Items.Add("SEÇİNİZ");
    if (_daireler != null)
    {
        foreach (var daire in _daireler)
        {
            var hucreler = daire.SelectNodes("td");

            // Always add exactly one item per row so combo index N maps to row N-1;
            // a row without two cells falls back to its whole text.
            var daireIsmi = (hucreler != null && hucreler.Count >= 2)
                ? hucreler[0].InnerText + "-" + hucreler[1].InnerText
                : daire.InnerText;

            // NOTE(review): the original replaced " " with " ", which is a no-op as written;
            // it presumably targeted non-breaking spaces, so both the entity and the
            // character are normalised here. Confirm against the live markup.
            daireIsmi = daireIsmi.Replace("&nbsp;", " ").Replace('\u00A0', ' ');
            comboBox7.Items.Add(daireIsmi);

            var rowId = daire.GetAttributeValue("id", "");
            Debug.WriteLine("Adres Kodu: " + (rowId.Length > 0 ? rowId.Substring(1) : rowId));
        }
    }

    comboBox7.SelectedIndex = 0;
}
/// <summary>
/// Selection handler for <c>comboBox4</c>: posts an "sf" query for the selected entry of
/// <c>_mahalleler</c>, stores the returned table rows in <c>_caddeler</c>, rebuilds
/// <c>comboBox5</c>, remembers the selected code/name in <c>_mahalleKodu</c>/<c>_mahalle</c>
/// and enables the start button.
/// </summary>
private async void comboBox4_SelectedIndexChanged(object sender, EventArgs e)
{
    // Entry 0 is ignored and -1 means "nothing selected"; the original "== 0" test let -1
    // through and then indexed the list with it.
    var index = comboBox4.SelectedIndex;
    if (index <= 0)
    {
        return;
    }

    var mahallekodu = _mahalleler.yt[index].value;
    var postData = $"fZwxqvPOpAViHEOXXVuXBaTd+2018072821lryxuVH5Y45L6Yb8Wo05CB46f1vrJzTCd1vDE8EiSLYLbaFSn0O0MhabkTx4t3A+Q==&t=sf&u={mahallekodu}&term=";

    ScrapingBrowser browser = new ScrapingBrowser();
    browser.Encoding = Encoding.UTF8;
    WebPage homePage = await browser.NavigateToPageAsync(new Uri("http://adreskodu.dask.gov.tr/site-element/control/load.ashx"), HttpVerb.Post, postData);

    // SelectNodes returns null (not an empty collection) when nothing matches.
    _caddeler = homePage.Html.SelectNodes("//tbody/tr");

    comboBox5.Items.Clear();
    comboBox5.Items.Add("SEÇİNİZ");
    if (_caddeler != null)
    {
        foreach (var cadde in _caddeler)
        {
            var hucreler = cadde.SelectNodes("td");

            // Always add exactly one item per row so combo positions stay aligned with
            // _caddeler; a row without two cells falls back to its whole text.
            var caddeIsmi = (hucreler != null && hucreler.Count >= 2)
                ? hucreler[0].InnerText + "-" + hucreler[1].InnerText
                : cadde.InnerText;

            // NOTE(review): the original replaced " " with " ", which is a no-op as written;
            // it presumably targeted non-breaking spaces, so both the entity and the
            // character are normalised here. Confirm against the live markup.
            caddeIsmi = caddeIsmi.Replace("&nbsp;", " ").Replace('\u00A0', ' ');
            comboBox5.Items.Add(caddeIsmi);
        }
    }

    comboBox5.SelectedIndex = 0;

    _mahalleKodu = mahallekodu;
    _mahalle = _mahalleler.yt[index].text;
    buttonStart.Enabled = true;
    buttonStop.Enabled = false;
}
/// <summary>
/// Selection handler for <c>comboBox3</c>: posts an "mh" query for the selected entry of
/// <c>_koyler</c>, stores the deserialized answer in <c>_mahalleler</c>, refills
/// <c>comboBox4</c>, remembers the selected code/name in <c>_koyKodu</c>/<c>_koy</c> and
/// disables both buttons.
/// </summary>
private async void comboBox3_SelectedIndexChanged(object sender, EventArgs e)
{
    // Entry 0 is ignored and -1 means "nothing selected"; the original "== 0" test let -1
    // through and then indexed the list with it.
    var index = comboBox3.SelectedIndex;
    if (index <= 0)
    {
        return;
    }

    var koykodu = _koyler.yt[index].value;
    var postData = "fZwxqvPOpAViHEOXXVuXBaTd+2018072821lryxuVH5Y45L6Yb8Wo05CB46f1vrJzTCd1vDE8EiSLYLbaFSn0O0MhabkTx4t3A+Q==&t=mh&u=";

    ScrapingBrowser browser = new ScrapingBrowser();
    browser.Encoding = Encoding.UTF8;
    WebPage homePage = await browser.NavigateToPageAsync(new Uri("http://adreskodu.dask.gov.tr/site-element/control/load.ashx"), HttpVerb.Post, postData + koykodu);

    _mahalleler = JsonConvert.DeserializeObject<ReturnData>(homePage.Html.OuterHtml);

    comboBox4.Items.Clear();
    // The deserializer yields null for an empty/"null" payload; treat that as "no entries".
    if (_mahalleler != null && _mahalleler.yt != null)
    {
        foreach (var mahalle in _mahalleler.yt)
        {
            comboBox4.Items.Add(mahalle.text);
        }
    }

    // Selecting index 0 on an empty combo box throws ArgumentOutOfRangeException.
    if (comboBox4.Items.Count > 0)
    {
        comboBox4.SelectedIndex = 0;
    }

    _koyKodu = koykodu;
    _koy = _koyler.yt[index].text;
    buttonStart.Enabled = false;
    buttonStop.Enabled = false;
}
/// <summary>
/// Creates the service with a fresh <c>ScrapingBrowser</c> and the fixed list of
/// phrases stored in <c>ignoreList</c>.
/// </summary>
public GoogleMatchService()
{
    browser = new ScrapingBrowser();

    var phrases = new List<string>();
    phrases.Add("how are you");
    phrases.Add("what is that");
    phrases.Add("how is it going");
    ignoreList = phrases;
}
/// <summary>
/// Downloads the page at <c>NodeUrlHelper.NcdcReportUrl</c> with a new browser instance.
/// </summary>
/// <returns>The <c>Html</c> node of the downloaded page.</returns>
public HtmlNode GetNcdcHtmlData()
{
    var browser = new ScrapingBrowser();
    var reportUri = new Uri(NodeUrlHelper.NcdcReportUrl);
    return browser.NavigateToPage(reportUri).Html;
}
/// <summary>
/// Creates the parser with a UTF-8 browser that does not ignore cookies and an empty
/// news item list.
/// </summary>
public IgromaniaNewsParser()
{
    web = new ScrapingBrowser
    {
        IgnoreCookies = false,
        Encoding = System.Text.Encoding.UTF8
    };

    newsItems = new List<NewsItemInfo>();
}
/// <summary>
/// Downloads <paramref name="url"/> with a new browser instance.
/// </summary>
/// <param name="url">Absolute address of the page to load.</param>
/// <returns>The <c>Html</c> node of the downloaded page.</returns>
public static HtmlNode GetHtml(string url)
{
    var browser = new ScrapingBrowser();
    var target = new Uri(url);
    return browser.NavigateToPage(target).Html;
}
/// <summary>
/// Loads the page at <paramref name="url"/> through a throw-away browser.
/// </summary>
/// <param name="url">Absolute address of the page to load.</param>
/// <returns>The <c>Html</c> node of the loaded page.</returns>
private static HtmlNode GetHtml(string url)
{
    ScrapingBrowser scraper = new ScrapingBrowser();
    WebPage loaded = scraper.NavigateToPage(new Uri(url));
    HtmlNode root = loaded.Html;
    return root;
}
/// <summary>
/// Reads the gurufocus.com summary page of <paramref name="ticker"/> and extracts the
/// number before the '/' from the first table cell under the "ratios" div whose text
/// looks like "digit/digit".
/// </summary>
/// <param name="ticker">Ticker symbol inserted into the summary URL.</param>
/// <returns>
/// The parsed number, or 0 when the page cannot be loaded, the div or cell is missing,
/// or the text is not an integer.
/// </returns>
public int GetValuationRank(string ticker)
{
    var browser = new ScrapingBrowser();

    WebPage summaryPage;
    try
    {
        summaryPage = browser.NavigateToPage(new Uri($"https://www.gurufocus.com/stock/{ticker}/summary"));
    }
    catch
    {
        // Any navigation failure is reported as "no rank".
        return 0;
    }

    var ratiosDiv = summaryPage.Html
        .CssSelect("div")
        .FirstOrDefault(div => div.Attributes["id"]?.Value == "ratios");
    if (ratiosDiv == null)
    {
        return 0;
    }

    var rankPattern = new Regex(@"\d/\d");
    var rankCell = ratiosDiv
        .CssSelect("td")
        .FirstOrDefault(td => rankPattern.IsMatch(td.InnerText));
    if (rankCell == null)
    {
        return 0;
    }

    // TryParse leaves 0 in the out variable when the text is not a number.
    int rank;
    int.TryParse(rankCell.InnerText.Split('/').FirstOrDefault(), out rank);
    return rank;
}
/// <summary>
/// Downloads the ordered creature list from the Tibia fandom wiki and collects one name
/// per child node of every ordered list whose markup contains a "/wiki/..." link.
/// </summary>
/// <returns>The collected names with "%27" and "%C3%B1" decoded to "'" and "ñ".</returns>
private string[] GetWikiMonsters()
{
    const string monsterListUrl = "https://tibia.fandom.com/wiki/List_of_Creatures_(Ordered)";

    var browser = new ScrapingBrowser { Encoding = Encoding.UTF8 };
    WebPage listPage = browser.NavigateToPage(new Uri(monsterListUrl));
    var orderedLists = listPage.Html.CssSelect("ol");

    // Link targets are percent-encoded: %27 is the apostrophe and %C3%B1 is "ñ".
    // NOTE(review): the character class opens with "[[", so a literal '[' is also accepted;
    // kept as-is to preserve matching behaviour.
    var linkPattern = new Regex("/wiki/(?<name>[[a-zA-Z.()_%27%C3%B1-]+)");

    var names = new List<string>();
    foreach (var entry in orderedLists.SelectMany(list => list.ChildNodes))
    {
        var matches = linkPattern.Matches(entry.InnerHtml);
        if (matches.Count == 0)
        {
            continue;
        }

        var encodedName = matches.FindNamedGroupValue("name");
        names.Add(encodedName.Replace("%27", "'").Replace("%C3%B1", "ñ"));
    }

    return names.ToArray();
}
/// <summary>
/// Loads <c>SCRAPING_PATH</c> and returns the text of the last anchor that is a direct
/// child of a paragraph ("p&gt;a").
/// </summary>
/// <returns>The inner text of the last matching anchor.</returns>
private async Task<string> ScrapePage()
{
    var browser = new ScrapingBrowser();
    WebPage page = await browser.NavigateToPageAsync(new Uri(SCRAPING_PATH));

    var paragraphLinks = page.Html.CssSelect("p>a");
    return paragraphLinks.Last().InnerText;
}
/// <summary>
/// Downloads the JSON array at <paramref name="url"/>, collects the "AnswerLink" value of
/// every element and, when at least one was found, stores the list as JSON in the
/// "Answers" column of <paramref name="r"/> and hands it on to <c>getuser</c>.
/// </summary>
/// <param name="name">Passed through to <c>getuser</c>.</param>
/// <param name="url">Address of the JSON document to download.</param>
/// <param name="dt_sa">Passed through to <c>getuser</c>.</param>
/// <param name="r">Row whose "Answers" column receives the serialized link list.</param>
void getanslink(string name, string url, DataTable dt_sa, DataRow r)
{
    var browser = new ScrapingBrowser();
    browser.UserAgent = FakeUserAgents.Chrome;
    string payload = browser.DownloadString(new Uri(url));

    // The payload is JSON, so it is parsed as JSON only; the original also loaded it into
    // an HtmlDocument whose result was never used.
    JsonData data = JsonMapper.ToObject(payload);

    var answerLinks = new List<String>();
    for (int i = 0; i < data.Count; i++)
    {
        answerLinks.Add(data[i]["AnswerLink"].ToString());
    }

    if (answerLinks.Count == 0)
    {
        return;
    }

    r["Answers"] = JsonConvert.SerializeObject(answerLinks);
    getuser(name, answerLinks, dt_sa, r);
}
/// <summary>
/// Bot command: loads the divisiontracker.com uplay profile of <paramref name="nickname"/>,
/// pairs every stat name with its value and replies with an embed showing the playtime
/// and rogue-players-killed stats.
/// </summary>
/// <param name="ctx">Command context used for the typing indicator and the reply.</param>
/// <param name="nickname">Player name; the remaining text of the command.</param>
public async Task Short(CommandContext ctx, [RemainingText] string nickname)
{
    const string MissingValue = "\nN/A\n";

    await ctx.TriggerTypingAsync();

    ScrapingBrowser browser = new ScrapingBrowser();
    // Set UseDefaultCookiesParser to false if a website returns an invalid cookie format.
    // browser.UseDefaultCookiesParser = false;

    // The nickname is user input: escape it so characters such as '?' or '#' cannot
    // change the meaning of the URL.
    var profileUri = new Uri("http://divisiontracker.com/profile/uplay/" + Uri.EscapeDataString(nickname ?? string.Empty));

    // Await the download instead of blocking inside an async command.
    WebPage homePage = await browser.NavigateToPageAsync(profileUri);

    List<HtmlNode> resultNames = homePage.Html.CssSelect("div.stats-stat>div.name").ToList();
    List<HtmlNode> resultValues = homePage.Html.CssSelect("div.stats-stat>div.value").ToList();

    // Pair names and values positionally; stop at the shorter list, and let a repeated
    // stat name overwrite the earlier entry instead of throwing.
    var namesAndValues = new Dictionary<string, string>();
    int pairCount = Math.Min(resultNames.Count, resultValues.Count);
    for (int i = 0; i < pairCount; i++)
    {
        namesAndValues[resultNames[i].InnerText] = resultValues[i].InnerText;
    }

    string playtime;
    if (!namesAndValues.TryGetValue("\nPlaytime \n", out playtime))
    {
        playtime = MissingValue;
    }

    string rogueKills;
    if (!namesAndValues.TryGetValue("\nRogue Players Killed \n", out rogueKills))
    {
        rogueKills = MissingValue;
    }

    DiscordEmbedBuilder builder = new DiscordEmbedBuilder();
    builder.WithTitle($"{ctx.Client.CurrentUser.Username}\n{ctx.Command.Name}\n{nickname}")
        .WithDescription($"PLAYTIME: {playtime}ROGUE PLAYERS KILLED: \n{rogueKills}")
        .WithColor(DiscordColor.Orange);

    await ctx.RespondAsync("", false, builder.Build());
}
/// <summary>
/// Entry point: loads the index page of one manga, asks the <c>Kuku</c> seeker for its
/// chapter indexes and starts one un-awaited download per index, then waits for Enter.
/// </summary>
static void Main(string[] args)
{
    // Needed so code-page encodings such as GBK can be resolved by name.
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
    Console.WriteLine("Hello World!");

    var manga = new Manga();
    manga.Name = "擅长捉弄的高木同学";
    manga.Charset = "GBK";
    manga.BaseUrl = "http://comic.ikkdm.com";
    manga.IndexUrl = "http://comic.ikkdm.com/comiclist/2112/index.htm";

    // Set UseDefaultCookiesParser to false if a website returns an invalid cookie format.
    var browser = new ScrapingBrowser { Encoding = Encoding.GetEncoding(manga.Charset) };
    WebPage indexPage = browser.NavigateToPage(new Uri(manga.IndexUrl));

    IMangaSeeker seeker = new Kuku(manga);
    var chapterIndexes = seeker.GetIndexes(indexPage.Html);

    int position = 0;
    foreach (var chapterIndex in chapterIndexes)
    {
        // Fire and forget: downloads run while the console waits below.
        _ = DownloadIndexAsync(chapterIndex, position++, manga, seeker);
    }

    Console.ReadLine();
}
/// <summary>
/// Loads the racingpost.com time-ordered results page for <paramref name="date"/> and
/// queues every race on it that is not already listed in <paramref name="raceIds"/> as a
/// <c>ScrapeRace</c> row, creating a missing <c>RPCourse</c> on the way.
/// </summary>
/// <param name="date">Day whose results page is read.</param>
/// <param name="raceIds">Ids of races that are already known and must be skipped.</param>
/// <param name="db">Database context receiving the new rows; saved after each insert.</param>
public static void DownloadRaceList(DateTime date, List<int?> raceIds, RacingPostRacesEntities db)
{
    // Format the date with the invariant culture: the URL needs Gregorian ASCII digits
    // regardless of the machine's regional settings.
    var url = string.Format(
        System.Globalization.CultureInfo.InvariantCulture,
        @"https://www.racingpost.com/results/{0:yyyy-MM-dd}/time-order",
        date);

    var web = new HtmlWeb();
    var doc = web.Load(url);
    var nodes = doc.QuerySelectorAll("div .rp-timeView__raceInfo").ToList();

    foreach (var item in nodes)
    {
        var courseUrl = baseUrl + item.ChildNodes[1].ChildNodes[1].Attributes["href"].Value;
        var courseId = Helper.GetIdfromUrl(courseUrl, "https://www.racingpost.com/profile/course/");

        // Entries whose race element carries no link are skipped.
        var raceLink = item.ChildNodes[3].ChildNodes[1];
        if (!raceLink.Attributes.Any(a => a.Name == "href"))
        {
            continue;
        }

        var raceUrl = baseUrl + raceLink.Attributes["href"].Value;

        // The race id is the last path segment of the race URL.
        int? raceId = Convert.ToInt32(raceUrl.Split('/').LastOrDefault());
        if (raceIds.Any(known => known == raceId))
        {
            continue;
        }

        // Make sure the course exists both in the database and in the AllCourses cache.
        var course = AllCourses.Where(known => known.Id == courseId).FirstOrDefault();
        if (course == null)
        {
            RPCourse created = new RPCourse
            {
                Id = courseId,
                Name = courseUrl.Split('/').LastOrDefault().ToUpper()
            };
            db.RPCourses.Add(created);
            db.SaveChanges();
            AllCourses.Add(created);
            course = created;
        }

        string country = course.Country;

        // Save the URL to be scraped later.
        var scrapeRace = new ScrapeRace
        {
            Link = raceUrl,
            RaceId = raceId,
            RaceDate = date,
            Scraped = false,
            Required = true,
            CourseUrl = courseUrl,
            // Courses without a country are stored as "GB".
            Country = string.IsNullOrEmpty(country) ? "GB" : country
        };
        db.ScrapeRaces.Add(scrapeRace);
        db.SaveChanges();
    }
}
/// <summary>
/// Sample scrape of a local site: reads the navbar title, collects the first cell of each
/// row of "#PersonTable", then fills in and posts the form "dataForm".
/// </summary>
static void Main(string[] args)
{
    // Browser setup; it exposes many more settings than the two used here.
    var browser = new ScrapingBrowser
    {
        AllowAutoRedirect = true,
        AllowMetaRedirect = true
    };

    // Load the home page.
    WebPage homePage = browser.NavigateToPage(new Uri("http://localhost:51621/"));

    // First piece of data: the page title shown in the navbar.
    string pageTitle = homePage.Html.CssSelect(".navbar-brand").First().InnerText;

    // Second piece of data: the first column of the person table.
    var names = new List<String>();
    HtmlNode personTable = homePage.Html.CssSelect("#PersonTable").First();
    foreach (var tableRow in personTable.SelectNodes("tbody/tr"))
    {
        foreach (var firstCell in tableRow.SelectNodes("td[1]"))
        {
            names.Add(firstCell.InnerText);
        }
    }

    // Locate the form, assign its fields and send it back with POST.
    PageWebForm dataForm = homePage.FindFormById("dataForm");
    dataForm["UserName"] = "******";
    dataForm["Gender"] = "M";
    dataForm.Method = HttpVerb.Post;
    WebPage resultsPage = dataForm.Submit();
}
/// <summary>
/// Scrapes the threads with ids in [<paramref name="newestId"/> - <paramref name="range"/>,
/// <paramref name="newestId"/>) by splitting the id range into one slice per processor and
/// running each slice on its own task. Threads with a title and at least one post are
/// stored as an <c>Advert</c>.
/// </summary>
/// <param name="newestId">Exclusive upper bound of the thread ids to visit.</param>
/// <param name="range">Number of thread ids to visit below <paramref name="newestId"/>.</param>
/// <param name="baseUrl">Thread page address; the id is appended as "?t={id}".</param>
static void Scrape(int newestId, int range, string baseUrl = "https://vozforums.com/showthread.php")
{
    var degreeOfParallelism = Environment.ProcessorCount;
    var tasks = new Task[degreeOfParallelism];
    int firstId = newestId - range;

    for (int taskNumber = 0; taskNumber < degreeOfParallelism; taskNumber++)
    {
        // Copy the loop variable: the lambda must see this iteration's value.
        int taskNumberCopy = taskNumber;
        tasks[taskNumber] = Task.Factory.StartNew(() =>
        {
            // One browser per task; it exposes many more settings than the two used here.
            var browser = new ScrapingBrowser
            {
                AllowAutoRedirect = true,
                AllowMetaRedirect = true
            };

            // Slice [min, max) of the id range handled by this task.
            var max = firstId + (newestId - firstId) * (taskNumberCopy + 1) / degreeOfParallelism;
            var min = firstId + (newestId - firstId) * taskNumberCopy / degreeOfParallelism;
            logger.InfoFormat("max-min:{0}-{1}", max, min);

            for (int i = min; i < max; i++)
            {
                var url = $"{baseUrl}?t={i}";
                WebPage page = browser.NavigateToPage(new Uri(url));
                RemoveComment(page.Html);

                // The thread title is the text of the last ".navbar" element, when present.
                string title = "";
                var navbars = page.Html.CssSelect(".navbar");
                if (navbars != null && navbars.Any())
                {
                    title = navbars.Last().InnerText.Trim();
                    logger.Warn(title);
                }

                var posts = new List<Post>();
                foreach (var rawPost in page.Html.CssSelect(".voz-post-message"))
                {
                    posts.Add(new Post(rawPost.InnerHtml));
                }

                if (!string.IsNullOrEmpty(title) && posts.Any())
                {
                    // Store the URL that was actually scraped; the original recorded one
                    // hard-coded thread address for every advert.
                    repository.Create<Advert>(new Advert()
                    {
                        Title = title,
                        URL = url,
                        Posts = posts
                    });
                }
            }
        });
    }

    Task.WaitAll(tasks);
}
/// <summary>
/// Fills <paramref name="guild"/> from the guild's realmeye.com page: name, the three
/// description lines, member count, character count, fame and "most active on".
/// </summary>
/// <param name="guildName">Guild name; spaces are replaced with "%20" for the URL.</param>
/// <param name="guild">Object that receives the scraped values.</param>
/// <returns>
/// True when the page was read to the end; false when anything outside the description
/// block failed, in which case <c>guild.Name</c> is set to "Private".
/// </returns>
public static bool GetGuildSummary(string guildName, Guild guild)
{
    guildName = guildName.Replace(" ", "%20");
    result = false;

    var browser = new ScrapingBrowser
    {
        AllowAutoRedirect = true,
        AllowMetaRedirect = true
    };

    try
    {
        WebPage guildPage = browser.NavigateToPage(new Uri("https://www.realmeye.com/guild/" + guildName));
        guild.Name = guildPage.Html.CssSelect(".entity-name").First().InnerText;

        // The description is optional: any failure while reading it marks all three
        // lines as "Private" and the rest of the page is still processed.
        try
        {
            HtmlNode firstLine = guildPage.Html.CssSelect("#d").First().FirstChild;
            guild.Desc1 = firstLine.InnerText;
            guild.Desc2 = firstLine.NextSibling.InnerText;
            guild.Desc3 = firstLine.NextSibling.NextSibling.InnerText;
        }
        catch
        {
            guild.Desc1 = "Private";
            guild.Desc2 = "Private";
            guild.Desc3 = "Private";
        }

        // Each summary row is "label cell, value cell"; the value is the label's sibling.
        HtmlNode summary = guildPage.Html.CssSelect(".summary").First();
        foreach (var summaryRow in summary.SelectNodes("tr"))
        {
            foreach (var label in summaryRow.SelectNodes("td[1]"))
            {
                switch (label.InnerText)
                {
                    case "Members":
                        guild.MemberCount = label.NextSibling.InnerText;
                        break;
                    case "Characters":
                        guild.Chars = label.NextSibling.InnerText;
                        break;
                    case "Fame":
                        guild.Fame = label.NextSibling.InnerText;
                        break;
                    case "Most active on":
                        guild.MostActiveOn = label.NextSibling.InnerText;
                        break;
                }
            }
        }

        result = true;
    }
    catch (Exception)
    {
        guild.Name = "Private";
    }

    return result;
}
/// <summary>
/// Walks twelve pages of the Chester Devs member list on meetup.com, then visits every
/// member profile and stores the person, their location, interests and meetups (always
/// including "Chester Devs") together with the relations between them.
/// </summary>
static void Main(string[] args)
{
    var db = new Neo();
    var browser = new ScrapingBrowser();

    // Collect the profile links; each list page shows 20 members.
    const string memberListUrl = "http://www.meetup.com/Chester-Devs/members/?offset={0}&desc=1&sort=social_sort";
    var memberLinks = new List<HtmlNode>();
    for (int pageIndex = 0; pageIndex < 12; pageIndex++)
    {
        int offset = pageIndex * 20;
        WebPage listPage = browser.NavigateToPage(new Uri(String.Format(memberListUrl, offset.ToString())));
        memberLinks.AddRange(listPage.Html.CssSelect("a.memName"));
    }

    List<Person> people = memberLinks
        .Select(link => new Person() { Name = link.InnerText, LinkToProfile = link.Attributes["href"].Value })
        .ToList();

    foreach (Person person in people)
    {
        WebPage profile = browser.NavigateToPage(new Uri(person.LinkToProfile));
        db.Create<Person>(person);

        var location = new Location();
        location.Name = profile.Html.CssSelect("span.locality").Select(node => node.InnerText).First();
        db.Create<Location>(location);
        db.RelatePersonToLocation(person.UniqueId, location.Name);

        List<Interest> interests = profile.Html
            .CssSelect("ul#memberTopicList > li.D_group > div > a.topic-widget")
            .Select(node => new Interest() { Name = node.InnerText })
            .ToList();
        // Two passes: create every interest first, then relate them to the person.
        foreach (Interest interest in interests)
        {
            db.Create<Interest>(interest);
        }
        foreach (Interest interest in interests)
        {
            db.RelatePersonToInterest(person.UniqueId, interest.Name);
        }

        List<Meetup> meetups = profile.Html
            .CssSelect("div.D_name")
            .Select(node => new Meetup() { Name = node.InnerText })
            .ToList();
        // Every scraped member belongs to Chester Devs even if the profile does not list it.
        if (!meetups.Any(group => group.Name == "Chester Devs"))
        {
            meetups.Add(new Meetup() { Name = "Chester Devs" });
        }
        // Two passes again: create all meetups, then relate them.
        foreach (Meetup meetup in meetups)
        {
            db.Create<Meetup>(meetup);
        }
        foreach (Meetup meetup in meetups)
        {
            db.RelatePersonToMeetup(person.UniqueId, meetup.Name);
        }
    }
}
/// <summary>
/// Runs the scraping steps of every field of <paramref name="sourceTemplate"/> once
/// against <paramref name="scraperPointer"/>. When a field's last step is
/// <c>TravelToChildTemplate</c>, the target URI is read from the latest record and the
/// child template is scraped via <c>ScrapeFromTemplate</c>.
/// </summary>
/// <param name="scrapingBrowser">Browser whose <c>Referer</c> supplies the site root.</param>
/// <param name="scrapingEvent">Event holding the records being filled.</param>
/// <param name="scraperPointer">HTML node the field instructions are applied to.</param>
/// <param name="sourceTemplate">Template whose fields are processed.</param>
/// <param name="allTemplates">All known templates; used to resolve child templates.</param>
private static void SinglePassScrapeTemplate(ScrapingBrowser scrapingBrowser, ScrapeEvent scrapingEvent, HtmlNode scraperPointer, TemplateInstructions sourceTemplate, System.Collections.ObjectModel.ObservableCollection<TemplateInstructions> allTemplates)
{
    // Site root = everything before the first '/' that follows the scheme separator.
    var absoluteUri = scrapingBrowser.Referer.AbsoluteUri;
    var rootUri = absoluteUri.Substring(0, absoluteUri.IndexOf('/', 8));

    foreach (var templateField in sourceTemplate.TemplateFields)
    {
        if (!templateField.OrderedScrapingSteps.Any())
        {
            continue;
        }

        try
        {
            PerformScrapingForTemplateField(scraperPointer, templateField, scrapingEvent, rootUri);
        }
        catch (Exception)
        {
            // A field that cannot be scraped is reported and otherwise ignored.
            Console.WriteLine($"\t\"{templateField.Name}\" field missing from entry.");
        }

        // Perform a navigation action if the last step of the field's instructions dictates it.
        var lastStep = templateField.OrderedScrapingSteps.Last();
        if (lastStep.ActionType != ScrapingActionType.TravelToChildTemplate)
        {
            continue;
        }

        var jsonSerializer = new JavaScriptSerializer();
        var travelToChildTemplateParams = jsonSerializer.Deserialize<TravelToChildTemplateParams>(lastStep.Parameters);

        // The child template's instructions are needed for the nested scrape.
        var childTemplateId = travelToChildTemplateParams.Child;
        var childTemplate = allTemplates.First(t => t.TemplateId == childTemplateId && !t.IsTopLevel);

        // NOTE(review): a non-numeric Source leaves targetFieldId at 0 and the lookup below
        // then uses key "0"; the parse result was ignored in the original as well.
        int targetFieldId;
        int.TryParse(travelToChildTemplateParams.Source, out targetFieldId);
        var fieldKey = targetFieldId.ToString();

        var latestRecord = scrapingEvent.Records.Last();
        string targetUri;
        if (travelToChildTemplateParams.IsFromTemporaryField)
        {
            // The target URI was saved as a temporary field.
            targetUri = latestRecord.TemporaryFieldIdToValueDictionary[fieldKey];
        }
        else
        {
            // The target URI is a standard, non-temporary field.
            targetUri = latestRecord.TargetFieldIdToValueDictionary[fieldKey];
        }

        // Set the record lock so that the same record is used by the child scrape.
        scrapingEvent.LockOnCurrentRecord = true;
        ScrapeFromTemplate(scrapingBrowser, scrapingEvent, rootUri + targetUri, childTemplate, allTemplates);
    }
}
/// <summary>
/// Scrapes every "div.post" on <paramref name="url"/> on a thread-pool thread: reads
/// title, teaser, date and link, follows the link for the full text and registers the
/// article through <c>CreateArticle</c>.
/// </summary>
/// <param name="url">Address of the listing page.</param>
/// <returns>A task producing the articles created from the page.</returns>
public override Task<List<Article>> GetArticles(string url)
{
    _log.Info("DvCvitMediteranaScraper scraping " + url);
    return Task.Run(() =>
    {
        var articles = new List<Article>();
        var browser = new ScrapingBrowser
        {
            AllowAutoRedirect = true,
            AllowMetaRedirect = true,
            Encoding = Encoding.UTF8
        };

        var homePage = browser.NavigateToPage(new Uri(url));
        foreach (var pageArticle in homePage.Html.CssSelect("div.post"))
        {
            var titleLink = pageArticle.CssSelect("h2 > a").Single();

            // NOTE(review): the original literals were damaged (a bare quote and a space
            // replaced by a space); they presumably were the "&quot;" and "&nbsp;"
            // entities, which are restored here. Confirm against the live markup.
            var title = titleLink.InnerText
                .Replace("&nbsp;", " ")
                .Replace('\u00A0', ' ')
                .Replace("&quot;", "\"");
            var shortText = pageArticle.CssSelect("div.post-text").Single().InnerText;
            var time = pageArticle.CssSelect("div.post-text > p").First().InnerText;

            // Use the scraped date only when the text starts with a digit and parses;
            // otherwise keep "now". The emptiness check must come before reading the
            // first character, and a bad date must not abort the whole run.
            var date = DateTime.Now;
            var trimmedTime = time.Trim();
            DateTime parsedDate;
            if (trimmedTime.Length > 0 && char.IsDigit(trimmedTime[0]) && DateTime.TryParse(time, out parsedDate))
            {
                date = parsedDate;
            }

            var link = titleLink.Attributes["href"].Value;
            var text = "";
            try
            {
                var detailsPage = browser.NavigateToPage(new Uri(link));
                var detailPosts = detailsPage.Html.CssSelect("div.post");
                text = detailPosts
                    .Aggregate(text, (current, detailPost) => current + (detailPost.InnerText + Environment.NewLine))
                    .Replace("&nbsp;", " ")
                    .Replace('\u00A0', ' ');
            }
            catch (Exception ex)
            {
                // The details page is optional; the link is used as the text below.
                _log.Error("DvCvitMediteranaScraper error: " + ex.Message + Environment.NewLine + ex.StackTrace);
            }

            if (string.IsNullOrWhiteSpace(text))
            {
                text = link;
            }

            _log.Info("DvCvitMediteranaScraper Creating " + title);
            CreateArticle(text, articles, title, shortText, link, ArticleType.DvCvitMediterana, date);
        }

        return articles;
    });
}
/// <summary>
/// Downloads <paramref name="url"/> and returns the text of all h1, then all h2, then all
/// h3 elements of the page.
/// </summary>
/// <param name="url">Address of the page to inspect.</param>
/// <returns>One <c>HeadingDto</c> per heading, grouped by heading level.</returns>
public List<HeadingDto> GetTextofHeadingTags(string url)
{
    var browser = new ScrapingBrowser
    {
        AllowAutoRedirect = true,
        AllowMetaRedirect = true
    };

    // Re-parse the page markup into a standalone document before querying it.
    string markup = browser.NavigateToPage(new Uri(url)).Html.InnerHtml;
    var document = new HtmlDocument();
    document.LoadHtml(markup);

    var headings = new List<HeadingDto>();
    foreach (string tag in new[] { "h1", "h2", "h3" })
    {
        // SelectNodes yields null when the page has no element of this level.
        var matches = document.DocumentNode.SelectNodes("//" + tag);
        if (matches == null)
        {
            continue;
        }

        foreach (HtmlNode heading in matches)
        {
            headings.Add(new HeadingDto() { Content = heading.InnerText, HeadingType = tag });
        }
    }

    return headings;
}
/// <summary>
/// Creates the parser with a browser that does not ignore cookies and decodes pages with
/// code page 1251, plus an empty news item list.
/// </summary>
public KinoNewsParser()
{
    // Code-page encodings are only resolvable once the provider is registered.
    var codePages = System.Text.CodePagesEncodingProvider.Instance;
    System.Text.Encoding.RegisterProvider(codePages);

    web = new ScrapingBrowser
    {
        IgnoreCookies = false,
        Encoding = System.Text.CodePagesEncodingProvider.Instance.GetEncoding(1251)
    };

    newsItems = new List<NewsItemInfo>();
}
/// <summary>
/// Downloads <c>initurl</c> as a string into <c>result</c> and then calls <c>doHtml</c>.
/// </summary>
public void ScrapingCrawler()
{
    var startUri = new Uri(initurl);
    var downloader = new ScrapingBrowser();

    result = downloader.DownloadString(startUri);

    doHtml();
}
/// <summary>
/// Loads <c>url</c> with a new browser (kept in <c>browser</c>) and assembles one product
/// per matching product wrapper element.
/// </summary>
/// <returns>The products built by <c>Producto.assemble</c>.</returns>
public override Producto[] getProductos()
{
    browser = new ScrapingBrowser();

    WebPage catalogPage = browser.NavigateToPage(new Uri(url));
    HtmlNode[] productNodes = catalogPage.Html
        .CssSelect("li.products__item div.products__wrap.clearfix")
        .ToArray();

    return Producto.assemble(productNodes, getNombre, getPrecio);
}
/// <summary>
/// Navigates to <paramref name="url"/> with the given data and verb, passes the browser and
/// the returned markup to <paramref name="action"/>, and returns the markup.
/// </summary>
/// <param name="scrapingBrowser">Browser performing the request.</param>
/// <param name="url">Absolute address to request.</param>
/// <param name="data">Values sent with the request.</param>
/// <param name="action">Callback invoked with the browser and the response markup.</param>
/// <param name="httpVerb">HTTP verb to use; GET by default.</param>
/// <returns>The markup returned by the request.</returns>
public static string NavigateTo(this ScrapingBrowser scrapingBrowser, string url, NameValueCollection data, Action<ScrapingBrowser, string> action, HttpVerb httpVerb = HttpVerb.Get)
{
    var target = new Uri(url);
    string markup = scrapingBrowser.NavigateTo(target, httpVerb, data);

    action(scrapingBrowser, markup);

    return markup;
}
/// <summary>
/// Returns the browser stored in <c>_browser</c>, creating it on first use.
/// </summary>
/// <returns>The browser instance held in <c>_browser</c>.</returns>
private static ScrapingBrowser BrowserInstance()
{
    // Create on demand; later calls reuse the stored instance.
    return _browser ?? (_browser = new ScrapingBrowser());
}
/// <summary>
/// Scrapes every result page of a fixed autotrader.co.uk search and appends one CSV row
/// per listing to ".\cars.csv".
/// </summary>
/// <remarks>
/// The page count is taken from the fourth space-separated word of the pagination text.
/// Featured/"also like" entries and listings with incomplete data are skipped.
/// </remarks>
static void Main(string[] args)
{
    const string searchUrl = "https://www.autotrader.co.uk/car-search?sort=price-desc&radius=1500&postcode=e148dw&onesearchad=Used&onesearchad=Nearly%20New&onesearchad=New&make=NISSAN&model=JUKE&aggregatedTrim=Nismo%20RS&page=";
    const string csvPath = @".\cars.csv";

    var b = new ScrapingBrowser();
    WebPage PageResult = b.NavigateToPage(new Uri(searchUrl + 1));

    // SelectNodes returns null when nothing matches.
    var nodes = PageResult.Html.SelectNodes("//*[@id=\"main-content\"]/div[1]/header/nav/ul/li[3]");
    if (nodes == null)
    {
        Console.WriteLine("Pagination element not found; nothing to scrape.");
        return;
    }

    string count = nodes.First().InnerText;
    int total = int.Parse(count.Split(' ')[3]);

    // Only write the header when the file is new or empty; the file is opened for append.
    bool writeHeader = !File.Exists(csvPath) || new FileInfo(csvPath).Length == 0;

    using (var csv = File.AppendText(csvPath))
    {
        int id = 0;
        if (writeHeader)
        {
            csv.WriteLine("id,description,price,mileage,year,age,transmission,size,power,fuel");
        }

        // Process page 1 (already loaded), then load and process each following page.
        // The original loop fetched the last page but never processed it.
        for (int page = 1; page <= total; page++)
        {
            if (page > 1)
            {
                PageResult = b.NavigateToPage(new Uri(searchUrl + page));
            }

            var results = PageResult.Html.SelectNodes("//html/body/main/section[2]/div[1]/ul/li");
            if (results == null)
            {
                continue;
            }

            foreach (var result in results)
            {
                // NOTE(review): "Featrured" looks like a typo for "Featured"; kept as-is
                // because the site's actual text cannot be confirmed from here.
                if (result.InnerText.StartsWith("\nFeatrured"))
                {
                    continue;
                }

                var infoNodes = result.SelectNodes("article/section[1]/div");
                if (infoNodes == null)
                {
                    continue;
                }

                var also = result.SelectNodes("span");
                if (also != null && also.First().InnerText.Contains("also like"))
                {
                    continue;
                }

                var info = infoNodes.First();
                var priceNodes = result.SelectNodes("article/section[2]/a/div");
                var titleNodes = info.SelectNodes("h2/a");
                var subInfo = info.SelectNodes("ul/li");
                if (priceNodes == null || titleNodes == null || subInfo == null || subInfo.Count < 7)
                {
                    // Incomplete listing: skip it rather than abort the whole export.
                    continue;
                }

                var yearText = subInfo[0].InnerText;
                int yearNumber;
                if (yearText.Length < 4 || !int.TryParse(yearText.Substring(0, 4), out yearNumber))
                {
                    continue;
                }

                // NOTE(review): '?' is presumably a mis-encoded currency sign; kept as-is.
                var price = priceNodes.First().InnerText.Trim('?').Replace(",", "");
                string description = titleNodes.First().InnerText;
                string mileage = subInfo[2].InnerText.Replace(" miles", "").Replace(",", "");
                string year = yearText.Substring(0, 4);
                int age = DateTime.Now.Year - yearNumber;
                string size = subInfo[3].InnerText.Replace("L", "");
                string power = subInfo[4].InnerText.Replace("bhp", "");
                string transmission = subInfo[5].InnerText;
                string fuel = subInfo[6].InnerText;

                csv.WriteLine($"{id},{description},{price},{mileage},{year},{age},{transmission},{size},{power},{fuel}");
                id++;
            }
        }
    }
}
/// <summary>
/// Downloads <paramref name="url"/> as a string and parses it into an <c>HtmlDocument</c>.
/// </summary>
/// <param name="url">Absolute address of the page to download.</param>
/// <param name="encoding">
/// Encoding used to decode the response; when null the browser's own setting is kept.
/// </param>
/// <returns>The parsed document.</returns>
public static HtmlDocument GetHtmlDocument(string url, Encoding encoding = null)
{
    var uri = new Uri(url);
    var b = new ScrapingBrowser();

    // Apply the caller's encoding; the original assigned an unrelated name ("fromEncoding")
    // instead of the parameter it had just null-checked.
    if (encoding != null)
    {
        b.Encoding = encoding;
    }

    var htmlStr = b.DownloadString(uri);
    var doc = new HtmlDocument();
    doc.LoadHtml(htmlStr);
    return doc;
}
/// <summary>
/// Scrapes the courier company list (code and name) from kuaidi100.com, prints it, then
/// runs one sample tracking query and waits for Enter.
/// </summary>
static void Main(string[] args)
{
    var browser = new ScrapingBrowser();

    string homeMarkup = browser.DownloadString(new Uri("http://www.kuaidi100.com/"));
    var homeDocument = new HtmlAgilityPack.HtmlDocument();
    homeDocument.LoadHtml(homeMarkup);
    var companyLinks = homeDocument.DocumentNode.CssSelect("#companyList dl a");

    // Collect the courier companies that kuaidi100 offers for querying.
    IList<KuaiDiModel> companies = new List<KuaiDiModel>();
    foreach (var companyLink in companyLinks)
    {
        string code = companyLink.Attributes["data-code"].Value;

        // The name is the markup of the last span inside the link (empty when none).
        string name = "";
        foreach (var span in companyLink.CssSelect("span"))
        {
            name = span.InnerHtml;
        }

        var company = new KuaiDiModel();
        company.Code = code;
        company.Name = name;

        // Only add the company when its code is not in the list yet.
        if (!companies.Any(known => known.Code == code))
        {
            companies.Add(company);
        }
    }

    foreach (var company in companies)
    {
        Console.WriteLine(company.Code + "|" + company.Name);
    }

    // Test query: "type" is the courier company code, "postid" is the waybill number.
    var queryUri = new Uri("http://www.kuaidi100.com/query?type=yunda&postid=1900491153505");
    string queryJson = browser.DownloadString(queryUri);
    KuaiDiLog trackingLog = Util.ParseFromJson<KuaiDiLog>(queryJson);

    Console.ReadLine();
}
/// <summary>
/// Determines how many catalog pages a blog has, starting from <paramref name="url"/>.
/// </summary>
/// <param name="url">Address of a catalog page.</param>
/// <returns>
/// The number found in a "div.pager" element ("共N页"), or the last page number of the
/// "div#pager" links. When the last link is not a number, the highest numbered link is
/// followed and the count is resolved from that page. Returns 1 when no pager
/// information is found.
/// </returns>
public async Task<int> ExtractCatalogPageCount(string url)
{
    int pageCount = 1;

    var browser1 = new ScrapingBrowser();
    browser1.Encoding = Encoding.UTF8;
    var doc = new HtmlDocument();
    doc.LoadHtml(browser1.DownloadString(new Uri(url)));
    var html = doc.DocumentNode;

    // Materialise inside the task so the selector actually runs there.
    var pagers = await Task.Run(() => html.CssSelect("div.pager").ToList()).ConfigureAwait(false);
    if (pagers.Count > 0)
    {
        var mp = Regex.Match(pagers[0].InnerText, @"共(\d+)页");
        if (mp.Success)
        {
            pageCount = int.Parse(mp.Groups[1].Value);
        }

        return pageCount;
    }

    var pagerLinks = html.CssSelect("div#pager>a").Select(t => t.InnerText).ToList();
    if (pagerLinks.Count == 0)
    {
        // No pager at all: a single page.
        return pageCount;
    }

    int lastLinkNumber;
    if (int.TryParse(pagerLinks[pagerLinks.Count - 1], out lastLinkNumber))
    {
        return lastLinkNumber;
    }

    // The last link is not a number: search backwards for the highest numbered link.
    // The original relied on Last(predicate) writing to an out variable, whose final value
    // depends on the order in which the runtime happens to evaluate the predicate.
    int lastNumbered = 0;
    bool found = false;
    for (int i = pagerLinks.Count - 2; i >= 0 && !found; i--)
    {
        found = int.TryParse(pagerLinks[i], out lastNumbered);
    }

    if (!found)
    {
        return 1;
    }

    // Open that page and resolve the real page count from there.
    return await ExtractCatalogPageCount(string.Format(CatalogsUrlTemplate, _blogName, lastNumbered));
}
/// <summary>
/// Downloads news.baidu.com, selects every anchor that has an href and passes them to
/// <c>OutputLinks</c>. Each progress report appends its text to <c>Content</c> and updates
/// <c>ProgressValue</c>.
/// </summary>
private async void OnGetUrls()
{
    var cnblog = new CnblogProcess();
    var browser = new ScrapingBrowser();
    var htmlDocument = new HtmlDocument();

    string markup = browser.DownloadString(new Uri("http://news.baidu.com"));
    htmlDocument.LoadHtml(markup);

    // Lazy query: it is re-evaluated every time it is enumerated or counted.
    var links = htmlDocument.DocumentNode
        .Descendants("a")
        .Where(anchor => anchor.Attributes.Contains("href"));

    Content = " ";

    int reported = 0;
    var progress = new Progress<DownloadStringTaskAsyncExProgress>();
    progress.ProgressChanged += (s, e) =>
    {
        Content += e.Text + Environment.NewLine;

        // Fraction of links reported before this one.
        int alreadyReported = reported++;
        ProgressValue = (double)alreadyReported / links.Count();
    };

    await OutputLinks(links, progress).ConfigureAwait(false);
}
/// <summary>
/// Loads the players of the result table under <paramref name="rootNode"/> and keeps
/// following the "next" link of the search results until a page has no result table, no
/// navigation content or no "next" link.
/// </summary>
/// <param name="browser">Browser used to load the following pages.</param>
/// <param name="rootNode">Root node of the first result page.</param>
/// <param name="players">Bag that receives the loaded players.</param>
private static void LoadAPageOfPlayers(ScrapingBrowser browser, HtmlNode rootNode, ConcurrentBag<Player> players)
{
    HtmlNode currentRoot = rootNode;
    while (true)
    {
        var resultBody = currentRoot.CssSelect("#result > tbody").SingleOrDefault();
        if (resultBody == null)
        {
            return;
        }

        // The first child node is skipped; the remaining ones are handed to LoadPlayers.
        LoadPlayers(players, resultBody.ChildNodes.Skip(1).ToList());

        var searchResults = resultBody.OwnerDocument.DocumentNode.CssSelect("#searchResults").Single();
        if (!searchResults.ChildNodes.Any())
        {
            return;
        }

        var navigationRow = searchResults.ChildNodes[1];
        var nextButton = navigationRow.ChildNodes.SingleOrDefault(node => node.InnerText.Trim() == "next");
        if (nextButton == null)
        {
            return;
        }

        // The href is HTML-encoded in the markup; decode it before building the address.
        var href = nextButton.Attributes.Single(attribute => attribute.Name == "href").Value;
        var nextUri = new Uri(BaseUrl + HttpUtility.HtmlDecode(href));
        currentRoot = browser.NavigateToPage(nextUri).Html;
    }
}
/// <summary>
/// Downloads <c>connectionURL</c> and reads the first table inside the div with id
/// <paramref name="divId"/>: the header cell texts first, then the cell texts of every
/// body row.
/// </summary>
/// <param name="divId">Id of the div that contains the table.</param>
/// <returns>
/// The rows read so far; any error while reading the table is swallowed, so the list may
/// be empty or partial.
/// </returns>
public List<string[]> ReadHtmlTable(string divId)
{
    string markup = new ScrapingBrowser().DownloadString(new Uri(connectionURL));

    var rows = new List<string[]>();
    var document = new HtmlDocument();
    document.LoadHtml(markup);

    try
    {
        HtmlNode tableNode = document.DocumentNode
            .CssSelect("div#" + divId)
            .CssSelect("Table")
            .First();

        string[] headerTexts = tableNode.CssSelect("th").Select(header => header.InnerText).ToArray();
        rows.Add(headerTexts);

        tableNode
            .CssSelect("tbody")
            .CssSelect("tr")
            .Each((row, i) => rows.Add(row.CssSelect("td").Select(cell => cell.InnerText).ToArray()));
    }
    catch
    {
        // Best effort: a missing or malformed table yields whatever was collected.
    }

    return rows;
}
/// <summary>
/// Downloads <c>URL</c>, parses it as HTML and stores the parsed document's inner markup in
/// <c>Content</c>.
/// </summary>
private void OnLoadUrl()
{
    var target = new Uri(URL);
    string markup = new ScrapingBrowser().DownloadString(target);

    var document = new HtmlDocument();
    document.LoadHtml(markup);

    Content = document.DocumentNode.InnerHtml;
}
/// <summary>
/// Creates the processor and assigns a new <c>ScrapingBrowser</c> to <c>_scrapyBrowser</c>.
/// </summary>
public BlogProcessBase()
{
    var scrapingBrowser = new ScrapingBrowser();
    _scrapyBrowser = scrapingBrowser;
}
/// <summary>
/// Downloads the page at <c>connectionURL</c> and turns the rows of the first
/// <c>table.ipos</c> element into a list of dictionaries, one per date column.
/// </summary>
/// <returns>
/// One dictionary per <c>td.dkbluert</c> cell (key <c>"End Date"</c>), each extended
/// with one entry per labelled row. Numeric cells are parsed as en-US currency and
/// multiplied by 1000. The result is empty when the table is missing, and may be
/// partial when parsing fails midway (the failure is traced, not rethrown).
/// </returns>
public IEnumerable<Dictionary<string, object>> ReadNASDAQFRHtmlTable()
{
    var browser = new ScrapingBrowser();
    var html = browser.DownloadString(new Uri(connectionURL));
    var document = new HtmlDocument();
    var table = new List<Dictionary<string, object>>();
    document.LoadHtml(html);

    try
    {
        var ipoTable = document.DocumentNode.CssSelect("table.ipos").FirstOrDefault();
        if (ipoTable == null)
        {
            return table;
        }

        // Created once instead of once per parsed cell.
        var usCulture = new System.Globalization.CultureInfo("en-US");

        foreach (var rowHtml in ipoTable.CssSelect("tr"))
        {
            var dateHtml = rowHtml.CssSelect("td.dkbluert").Where(d => d.InnerText != " ");
            var dateKeyCell = rowHtml.CssSelect("td.dkbluelt").FirstOrDefault(d => d.InnerText != " ");
            string key = dateKeyCell != null ? dateKeyCell.InnerText : "ReportDate";
            if (key == "Quarter:")
            {
                continue;
            }

            // Every date cell starts a new report entry.
            foreach (var colDataHtml in dateHtml)
            {
                var newReport = new Dictionary<string, object>();
                newReport["End Date"] = colDataHtml.InnerText;
                table.Add(newReport);
            }

            var numHtml = rowHtml.CssSelect("td.fundnum").Where(d => d.InnerText != " ");

            // The row label comes from td.body1 when present, otherwise from td.indent.
            var itemCell = rowHtml.CssSelect("td.body1").FirstOrDefault(d => d.InnerText != " ")
                           ?? rowHtml.CssSelect("td.indent").FirstOrDefault(d => d.InnerText != " ");
            if (itemCell == null)
            {
                continue;
            }

            string item = itemCell.InnerText;

            // Make sure every report has the key, even if the row has fewer numbers.
            foreach (var report in table)
            {
                report[item] = null;
            }

            // Numbers are matched to reports by position. More numbers than reports
            // still raises an out-of-range error that ends parsing, as before.
            int i = 0;
            foreach (var colHtml in numHtml)
            {
                string number = colHtml.InnerText.Replace("\n", "").Replace("\t", "").Replace("\r", "");
                table[i][item] = Decimal.Parse(number, System.Globalization.NumberStyles.Currency, usCulture) * 1000;
                i++;
            }
        }
    }
    catch (Exception ex)
    {
        // Stay best-effort for callers, but leave a trace instead of hiding the failure.
        System.Diagnostics.Trace.TraceError("ReadNASDAQFRHtmlTable failed: " + ex);
    }

    return table;
}
/// <summary>
/// Downloads the page at <paramref name="url"/>, strips script, style and comment
/// nodes, and builds one <c>Catalog</c> per <c>div.day</c> block. When the page has
/// no <c>div.day</c> blocks, the main content container is used instead.
/// </summary>
/// <param name="url">Absolute address of the page to read.</param>
/// <returns>The catalogs found on the page, each holding its articles.</returns>
/// <remarks>
/// The download is synchronous and the method contains no <c>await</c>; the
/// <c>async</c> modifier is kept so failures still surface through the returned task.
/// </remarks>
private async Task<List<Catalog>> ExtractCatalogsFromWebPage(string url)
{
    List<Catalog> resultCatalogs = new List<Catalog>();
    var uri = new Uri(url);
    var browser1 = new ScrapingBrowser();
    browser1.Encoding = Encoding.UTF8;
    var html1 = browser1.DownloadString(uri);
    var doc = new HtmlDocument();
    doc.LoadHtml(html1);
    var html = doc.DocumentNode;

    foreach (var script in doc.DocumentNode.Descendants("script").ToArray())
    {
        script.Remove();
    }

    foreach (var style in doc.DocumentNode.Descendants("style").ToArray())
    {
        style.Remove();
    }

    // SelectNodes returns null (not an empty collection) when nothing matches, so a
    // page without comments previously failed with a NullReferenceException.
    var commentNodes = doc.DocumentNode.SelectNodes("//comment()");
    if (commentNodes != null)
    {
        foreach (var comment in commentNodes.ToArray())
        {
            comment.Remove();
        }
    }

    var days = html.CssSelect("div.day");
    if (!days.Any())
    {
        days = html.CssSelect("div#container >div#wrapper >div#content");
    }

    foreach (var day in days)
    {
        var catalog = new Catalog();
        catalog.IsChecked = true;
        var title = day.CssSelect("div.dayTitle").FirstOrDefault();

        if (title != null)
        {
            // Day block with a title: articles are listed as div.postTitle.
            catalog.Title = title.InnerText.ClearNotWords();
            foreach (var postTitle in day.CssSelect("div.postTitle"))
            {
                var article = new Article();
                article.Title = postTitle.InnerText.ClearNotWords();
                var articleTitleEl = postTitle.CssSelect("a.postTitle2");
                article.URL = articleTitleEl.First().Attributes["href"].Value;
                catalog.Articles.Add(article);
            }
        }
        else
        {
            // No day title: use a generated catalog name and the div.post layout.
            catalog.Title = "CataLog" + DateTime.Now.ToShortTimeString();
            foreach (var post in day.CssSelect("div.post"))
            {
                var article = new Article();
                var titleLink = post.CssSelect("a.PostTitle").First();
                article.Title = titleLink.InnerText.ClearNotWords();
                article.URL = titleLink.Attributes["href"].Value;
                catalog.Articles.Add(article);
            }
        }

        resultCatalogs.Add(catalog);
    }

    return resultCatalogs;
}
/// <summary>
/// Opens the webscraper.io e-commerce test site, follows the first
/// <c>.category-link</c> and prints the HTML-decoded text of every <c>.price</c>
/// element on that page, each followed by a separator line.
/// </summary>
private static void example3()
{
    var scraper = new ScrapingBrowser();
    var siteRoot = "http://webscraper.io";

    WebPage landingPage = scraper.NavigateToPage(new Uri(siteRoot + "/test-sites/e-commerce/allinone"));
    var firstCategory = landingPage.Html.CssSelect(".category-link").First();
    var categoryHref = firstCategory.GetAttributeValue("href");

    WebPage categoryPage = scraper.NavigateToPage(new Uri(siteRoot + "/" + categoryHref));

    foreach (var priceNode in categoryPage.Html.CssSelect(".price"))
    {
        var priceText = WebUtility.HtmlDecode(priceNode.InnerText.Trim());
        Console.WriteLine(priceText);
        Console.WriteLine("--------------------------------------------------------------------------");
    }
}
/// <summary>
/// Loads the current players for every last-name initial from 'A' to 'Z' in
/// parallel and returns everything that was collected.
/// </summary>
/// <returns>The players gathered across all initials, in no particular order.</returns>
private static IEnumerable<Player> GetPlayersFromWeb()
{
    var sharedBrowser = new ScrapingBrowser();
    var pageLoader = new HtmlWeb();
    var collected = new ConcurrentBag<Player>();

    // The upper bound of Parallel.For is exclusive, hence 'Z' + 1.
    Parallel.For('A', 'Z' + 1, code =>
    {
        var initial = Convert.ToChar(code);
        Console.Write("\rLoading players whose last names begins with '{0}'.", initial);

        var searchPath = String.Format("/players/search?category=lastName&filter={0}&playerType=current", initial);
        var firstPage = pageLoader.Load(BaseUrl + searchPath);
        LoadAPageOfPlayers(sharedBrowser, firstPage.DocumentNode, collected);
    });

    return collected;
}
/// <summary>
/// Navigates to <c>lolUrl</c>, extracts the table node via <c>ParsePage</c> and
/// parses that table's child nodes into matches.
/// </summary>
/// <returns>The matches produced by <c>TableParser.Parse</c>.</returns>
public static Match[] GetFractionalOdds()
{
    var scraper = new ScrapingBrowser();
    WebPage oddsPage = scraper.NavigateToPage(new Uri(lolUrl));
    HtmlNode oddsTable = ParsePage(oddsPage.Html);

    return new TableParser(oddsTable.ChildNodes).Parse();
}
/// <summary>
/// Submits the query "scrapysharp" through the Bing search form using GET and
/// asserts that the results page contains at least one result link.
/// </summary>
public void TestUsingForm()
{
    var scraper = new ScrapingBrowser();
    // Set UseDefaultCookiesParser to false if a website returns an invalid cookie format:
    // scraper.UseDefaultCookiesParser = false;

    var bingHome = scraper.NavigateToPage(new Uri("http://www.bing.com/"));

    var searchForm = bingHome.FindFormById("sb_form");
    searchForm["q"] = "scrapysharp";
    searchForm.Method = HttpVerb.Get;

    var resultsPage = searchForm.Submit();
    var resultLinks = resultsPage.Html.CssSelect("div.b_content h2 a").ToArray();

    Assert.True(resultLinks.Any());
}
/// <summary>
/// Entry point: reads every <c>CourtWebsite</c> document from the "CourtCruft"
/// database, scrapes each site, and when the selected HTML differs from the stored
/// version saves a diff together with the old and new HTML. Failures for one site
/// are logged and do not stop the remaining sites from being processed.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    Log.Logger = new LoggerConfiguration()
        .ReadFrom.AppSettings()
        .CreateLogger();
    var log = Log.Logger.ForContext<Crufty.CruftyConsole>();

    // get list of all courts to process
    var client = new MongoClient(ConfigurationManager.AppSettings["MongoServer"]);
    var database = client.GetDatabase("CourtCruft");
    var collection = database.GetCollection<CourtWebsite>("CourtWebsites");

    // Block on the task directly: GetResult() rethrows the original exception
    // instead of wrapping it in an AggregateException like Wait() does.
    List<CourtWebsite> documents = collection.Find(new BsonDocument()).ToListAsync().GetAwaiter().GetResult();

    // loop over list of courts and scrape them.
    foreach (var courtWebsite in documents)
    {
        try
        {
            // wrap in error handling so we catch 404s and "network down"
            ScrapingBrowser browser = new ScrapingBrowser();
            WebPage homePage = browser.NavigateToPage(new Uri(courtWebsite.Url));

            // if no xpath filter, just grab the whole page
            string currentPage;
            if (String.IsNullOrEmpty(courtWebsite.SelectionXPathString))
            {
                currentPage = homePage.Html.InnerHtml;
            }
            else
            {
                // SelectSingleNode returns null when the XPath matches nothing; treat
                // that like an empty selection so the dedicated error below is logged
                // instead of a NullReferenceException.
                var selectedNode = homePage.Html.SelectSingleNode(courtWebsite.SelectionXPathString);
                currentPage = selectedNode == null ? null : selectedNode.InnerHtml;
            }

            if (String.IsNullOrWhiteSpace(currentPage))
            {
                log.Error("Unable to find {XPath} in {Url}", courtWebsite.SelectionXPathString, courtWebsite.Url);
                continue;
            }

            if (courtWebsite.NewPageHtml != currentPage)
            {
                HtmlDiff diffHelper = new HtmlDiff(courtWebsite.NewPageHtml, currentPage);

                // Inline styles highlight insertions and deletions in the stored diff.
                // The "del" rule was missing its closing brace; it is now terminated.
                courtWebsite.DiffedHtml = diffHelper.Build().Insert(0, "<style>ins {background-color: #cfc;text-decoration: none;} del { color: #999; background-color:#FEC8C8;}</style>");
                courtWebsite.OldPageHtml = courtWebsite.NewPageHtml;
                courtWebsite.NewPageHtml = currentPage;
                courtWebsite.LastChangedDateTime = DateTime.Now;
                courtWebsite.Checked = false;
                log.Information("Changes found for {CourtName}", courtWebsite.CourtName);
            }

            courtWebsite.LastRunDateTime = DateTime.Now;
            var filter = Builders<CourtWebsite>.Filter.Eq(s => s.Id, courtWebsite.Id);
            collection.ReplaceOneAsync(filter, courtWebsite).GetAwaiter().GetResult();
            log.Verbose("Successfully scraped site {CourtName}", courtWebsite.CourtName);
        }
        catch (WebException exception)
        {
            log.Error(exception, "Web Exception for {CourtName} (check the url) {ExceptionMessage} {url}", courtWebsite.CourtName, exception.Message, courtWebsite.Url);
        }
        catch (Exception exception)
        {
            log.With("CourtWebsite", courtWebsite)
                .Error(exception, "General Exception for {CourtName} {ExceptionMessage}", courtWebsite.CourtName, exception.Message);
        }
    }
}