/// <summary>
/// Loads the course's own page and copies onto the course the details read from it
/// (admission stats, duration, admission test, written work, subject requirements),
/// then turns the page's UCAS code field into the resulting list of courses.
/// </summary>
/// <param name="course">Course to scrape; its <c>CourseUrl</c> is the page that is loaded</param>
/// <returns>A list of courses</returns>
public static List<Course> AddData(this Course course)
{
    var document = new HtmlWeb().Load(course.CourseUrl).DocumentNode;

    // A field's value is read from the sibling that follows the label's parent node.
    var ucasLabel = document.SelectSingleNode("//strong[text()='UCAS code']");
    string ucasCode = ucasLabel.ParentNode.NextSibling.InnerText;

    // Admission stats are optional: only attach them when the helper returned something.
    var admissionStats = AdmissionStats(document);
    if (admissionStats != null)
    {
        course.Admission = new Admission(admissionStats);
    }

    course.Duration.AddRange(DurationStats(document));
    course.AdmissionTest = AdmissionTest(document);

    // The literal text "None" is stored as null.
    var writtenWorkLabel = document.SelectSingleNode("//strong[contains(text(), 'Written work')]");
    var writtenWorkText = writtenWorkLabel.ParentNode.NextSibling.InnerText;
    if (writtenWorkText == "None")
    {
        course.WrittenWork = null;
    }
    else
    {
        course.WrittenWork = writtenWorkText;
    }

    course.ALevelCourse.AddRange(SubjectRequirements(document));

    // Case 1: a plain four-character code - the page describes exactly this course.
    if (ucasCode.Length == 4)
    {
        course.UcasCode = ucasCode;
        return new List<Course> { course };
    }

    // Case 2: brackets indicate two courses sharing one page.
    if (ucasCode.Contains('('))
    {
        return Fetcher.TwoCodes(course, ucasCode);
    }

    // Case 3: a hyperlink in the field points at a table on another page.
    var tableLink = ucasLabel.ParentNode.NextSibling.SelectSingleNode("a");
    if (tableLink != null)
    {
        return Fetcher.HyperlinkTable(course, tableLink.Attributes["href"].Value);
    }

    // Case 4: no link, so the table is read from this same page.
    return Fetcher.Table(course, Fetcher.FindTable(document));
}
/// <summary>
/// Loads a fixed page and prints, twice, the text of the node two siblings after the
/// "가격점수제외금액(A)" table header: once located by an exact text match and once
/// by a substring match.
/// </summary>
/// <param name="args">Command-line arguments (not used)</param>
static void Main(string[] args)
{
    const string noticeUrl = "https://ebiz.khnp.co.kr/getTmpView.do?type=spbidnoti&loadId=fff84246-b957-47fc-a249-e1d023131890";
    var root = new HtmlWeb().Load(noticeUrl).DocumentNode;

    // Header whose text is exactly the label.
    var exactHeader = root.SelectSingleNode("//th[text()='가격점수제외금액(A)']");
    string exactMatchValue = exactHeader.NextSibling.NextSibling.InnerText;

    // Header that has a text node containing the label.
    var containsHeader = root.SelectSingleNode("//th[text()[contains(., '가격점수제외금액(A)')]]");
    string containsMatchValue = containsHeader.NextSibling.NextSibling.InnerText;

    Console.WriteLine(exactMatchValue);
    Console.WriteLine(containsMatchValue);
}
/// <summary>
/// Adds extra data to a college: campus code, offered courses, student numbers and facilities,
/// all read from the page at the college's <c>CollegeUrl</c>.
/// </summary>
/// <param name="college">College to add data to</param>
public static void AddData(this College college)
{
    // The tabs were previously fetched as "{CollegeUrl}#content-tab--N". A URL fragment is
    // resolved client-side and is not part of the HTTP request, so those loads returned the
    // same document as the plain URL. Download it once and query it for every section.
    var page = new HtmlWeb().Load(college.CollegeUrl).DocumentNode;

    /* CAMPUS CODE */
    // The code is the last space-separated token of the heading text.
    var campusCodeSelection = page.SelectSingleNode("//h2[contains(text(), 'UCAS campus code:')]");
    college.CampusCode = campusCodeSelection.InnerText.Split(' ').Last();

    /* OFFERED COURSES */
    var courseList = page.SelectSingleNode("//div[@id='content-tab--3']/ul");
    college.Courses = new List<string>();

    //temp hardcoded solution, planning to fix
    if (college.CampusCode == "P")
    {
        college.Courses.Add("Theology and Religion");
        college.Courses.Add("Philosophy and Theology");
    }
    else
    {
        // SelectNodes yields null (not an empty collection) when nothing matches, so a
        // missing or empty list leaves Courses empty instead of throwing.
        var courseLinks = courseList?.SelectNodes("li/a");
        if (courseLinks != null)
        {
            foreach (var a in courseLinks)
            {
                college.Courses.Add(a.InnerText);
            }
        }
    }

    /* NUMBER OF STUDENTS */
    // The node after the heading holds two "<br>"-separated parts: the first is read as the
    // undergraduate count and the second as the graduate count.
    var studentNumSelection = page.SelectSingleNode("//h3[contains(text(), 'Student numbers')]").NextSibling;
    var textSections = studentNumSelection.InnerHtml.Split("<br>");

    // Matches 1 or more digits (for both regexes)
    college.UndergradStudents = Convert.ToInt32(Regex.Match(textSections[0], @"\d{1,}").Value);
    college.GradStudents = Convert.ToInt32(Regex.Match(textSections[1], @"\d{1,}").Value);
    college.TotalStudents = college.UndergradStudents + college.GradStudents;

    /* FACILITIES */
    // The facilities table is a child of the element that contains the heading.
    var facilities = page.SelectSingleNode("//h2[text()='College facilities']").ParentNode.SelectSingleNode("table/tbody");
    college.Facilities = new CollegeFacilities(facilities);
}
/// <summary>
/// Get a list of type T from a url, by parsing the expected table format: every
/// <c>p/a</c> link found in the children of the first row of the element with class
/// <c>table-reduced</c> becomes one item.
/// </summary>
/// <typeparam name="T">
/// Type for list to use; it must have a constructor accepting (name, url) as two strings,
/// because instances are created through <see cref="Activator.CreateInstance(Type, object[])"/>.
/// </typeparam>
/// <param name="url">Url to parse</param>
/// <returns>A list of data parsed from url</returns>
public static List<T> GetList<T>(string url)
{
    List<T> outputList = new List<T>();

    var page = new HtmlWeb().Load(url).DocumentNode;
    var tableNode = page.SelectSingleNode("//*[@class='table-reduced']");
    var trNode = tableNode.SelectSingleNode("tbody/tr");

    foreach (var td in trNode.ChildNodes)
    {
        // ChildNodes also contains text and other non-cell nodes, and SelectNodes returns
        // null rather than an empty collection when nothing matches. Skip those children
        // instead of failing on a null enumeration.
        var links = td.SelectNodes("p/a");
        if (links == null)
        {
            continue;
        }

        foreach (var link in links)
        {
            string name = link.InnerText;
            // hrefs on the page are protocol-relative ("//host/path"), hence the prefix.
            string linkUrl = "https:" + link.Attributes["href"].Value;

            //create an instance of T with given parameters - hacky workaround, but only solution i can find
            T data = (T)Activator.CreateInstance(typeof(T), name, linkUrl);
            outputList.Add(data);
        }
    }

    return outputList;
}
/// <summary>
/// Downloads a deck list from a supported deck site, chosen by a substring of the url.
/// </summary>
/// <param name="url">Address of the deck page</param>
/// <returns>
/// The deck text for legends-decks.com or eternalwarcry.com urls; <c>null</c> for any other url.
/// </returns>
public static string DownloadDeck(string url)
{
    bool isLegendsDecks = url.Contains("legends-decks.com");
    if (isLegendsDecks)
    {
        var document = new HtmlWeb().Load(url).DocumentNode;
        var deckNode = document.SelectSingleNode("//div[@id='deckModal']//div[@class='well_full']");

        // The deck is stored as HTML with <br> separators; convert them to line breaks.
        string deckText = deckNode.InnerHtml.Replace("<br>", "\r\n");
        return deckText.Trim();
    }

    if (!url.Contains("eternalwarcry.com"))
    {
        return null;
    }

    // eternalwarcry: the page's first heading is the deck name, the export textarea is the card list.
    var warcryDocument = new HtmlWeb().Load(url).DocumentNode;
    var deckName = warcryDocument.SelectSingleNode("//h1").InnerText.Trim();
    var cardList = warcryDocument.SelectSingleNode("//textarea[@id='export-deck-text']").InnerHtml.Trim();
    return $"### {deckName} ###\r\n{cardList}";
}
/// <summary>
/// Builds the courses described by the UCAS code tables found on another page.
/// </summary>
/// <param name="course">Course to split</param>
/// <param name="url">Protocol-relative url of the other page ("https:" is prepended)</param>
/// <returns>The courses produced by <c>Table</c> for each table read, in page order</returns>
public static List<Course> HyperlinkTable(Course course, string url)
{
    var document = new HtmlWeb().Load("https:" + url).DocumentNode;

    // The tables live in the div that follows the "UCAS codes" heading.
    var codesContainer = document.SelectSingleNode("//h3[contains(text(), 'UCAS codes')]/following-sibling::div");

    // Only the first three tables are read.
    // NOTE(review): the reason for the limit of three is not visible here - confirm.
    var perTableCourses = codesContainer
        .SelectNodes("table")
        .Take(3)
        .Select(table => Table(course, table));

    var result = new List<Course>();
    foreach (var courses in perTableCourses)
    {
        result.AddRange(courses);
    }

    return result;
}
/// <summary>
/// Downloads a deck list from a supported deck site, chosen by a substring of the url.
/// </summary>
/// <param name="url">Address of the deck page</param>
/// <returns>
/// The deck text for legends-decks.com, eternalwarcry.com or teslegends.pro urls;
/// <c>null</c> for any other url.
/// </returns>
public static string DownloadDeck(string url)
{
    if (url.Contains("legends-decks.com"))
    {
        var page = new HtmlWeb().Load(url).DocumentNode;
        // The deck is stored as HTML with <br> separators; convert them to line breaks.
        return page.SelectSingleNode("//div[@id='deckModal']//div[@class='well_full']").InnerHtml.Replace("<br>", "\r\n").Trim();
    }

    if (url.Contains("eternalwarcry.com"))
    {
        var page = new HtmlWeb().Load(url).DocumentNode;
        var name = page.SelectSingleNode("//h1").InnerText.Trim();
        var cards = page.SelectSingleNode("//textarea[@id='export-deck-text']").InnerHtml.Trim();
        return $"### {name} ###\r\n{cards}";
    }

    if (url.Contains("teslegends.pro"))
    {
        // The slug is the last path segment of the url, ignoring a trailing slash.
        var slug = url.Substring(url.TrimEnd('/').LastIndexOf('/')).Trim('/');

        // WebClient is IDisposable; release it as soon as the export request completes.
        using (var client = new WebClient())
        {
            var response = client.UploadValues(
                "https://teslegends.pro/dc/do.php",
                "POST",
                new NameValueCollection { { "exportdeck", slug } });
            return Encoding.UTF8.GetString(response);
        }
    }

    return null;
}
/// <summary>
/// Resets <c>_html</c>, re-reads the last stored href for both sources, then for each source
/// loads its page, rewrites site-relative links to absolute ones, passes the items through
/// <c>GetLastestCars</c> / <c>UpdateLastStoredHref</c> and appends any returned items to
/// <c>_html</c>. Any exception raised while scraping is logged, not propagated.
/// </summary>
private void Update()
{
    _html = "";
    _lastStoredHrefBCPEA = ReadLastCarHref(_filenameBCPEA);
    _lastStoredHrefNap = ReadLastCarHref(_filenameNap);

    try
    {
        // ---- Nap source: items are the divs whose (trimmed) class is exactly "aoh-item" ----
        var napRoot = new HtmlWeb().Load(_urlNap).DocumentNode;
        var classedDivs = napRoot.SelectNodes("//div[@class]");
        var napItems = classedDivs
            .Where(div => div.GetAttributeValue("class", "").Trim() == "aoh-item")
            .ToList();

        // Make site-relative links and image sources absolute so they work outside the site.
        foreach (var div in napItems)
        {
            div.InnerHtml = div.InnerHtml
                .Replace("href=\"/targ/", "href=\"" + _websiteNapLink + "/targ/")
                .Replace("src=\"/", "src=\"" + _websiteNapLink + "/");
        }

        var newNapCars = GetLastestCars(napItems, _lastStoredHrefNap);
        UpdateLastStoredHref(newNapCars, _filenameNap, ref _lastStoredHrefNap);

        if (newNapCars.Any())
        {
            string napHeader = "Original link: <a href=\"" + _urlNap + "\">" + _urlNap + "</a><hr>";
            string napBody = string.Join("<hr>", newNapCars.Select(car => car.InnerHtml));
            _html += napHeader + napBody + "<br/>";
            Logger.Write("lastestCarsNap: " + newNapCars.Count);
        }

        // ---- BCPEA source: items are the <li> children of the first <ul> whose class is "results_list" ----
        var bcpeaRoot = new HtmlWeb().Load(_urlBcpea).DocumentNode;
        var body = bcpeaRoot.SelectSingleNode("//body");
        var classedLists = body.SelectNodes("//ul[@class]");
        var resultsList = classedLists.First(
            ul => ul.Attributes.AttributesWithName("class").First().Value == "results_list");
        var bcpeaItems = resultsList.SelectNodes("li").ToList();

        // Same link fix-up as above, for this site's relative links.
        foreach (var li in bcpeaItems)
        {
            li.InnerHtml = li.InnerHtml.Replace("href=\"/bg/auto/", "href=\"" + _websiteBCPEALink + "/bg/auto/");
        }

        var newBcpeaCars = GetLastestCars(bcpeaItems, _lastStoredHrefBCPEA);
        UpdateLastStoredHref(newBcpeaCars, _filenameBCPEA, ref _lastStoredHrefBCPEA);

        if (newBcpeaCars.Any())
        {
            string bcpeaHeader = "Original link: <a href=\"" + _urlBcpea + "\">" + _urlBcpea + "</a><hr>";
            string bcpeaBody = string.Join("<hr>", newBcpeaCars.Select(car => car.InnerHtml));
            _html += bcpeaHeader + bcpeaBody;
            Logger.Write("lastestCars: " + newBcpeaCars.Count);
        }
    }
    catch (Exception ex)
    {
        Logger.Write("Error in Update(): ", ex);
    }
}
/// <summary>
/// Reads the first table of the Wikipedia "Norrington Table" article into a list of tuples.
/// </summary>
/// <returns>
/// One tuple per table row after the first: the link text of the first cell, followed by the
/// numeric values of the second and third cells.
/// </returns>
public static List<(string, double, double)> GetNorrington()
{
    // The page uses '.' as the decimal separator. Parsing with the invariant culture keeps the
    // result independent of the machine's regional settings (the current culture may expect ',').
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    List<(string, double, double)> outputList = new List<(string, double, double)>();

    var page = new HtmlWeb().Load(@"https://en.wikipedia.org/wiki/Norrington_Table").DocumentNode;
    var tableNode = page.SelectSingleNode("//table/tbody");

    // The first row is skipped (presumably the header row - confirm against the page).
    foreach (var tr in tableNode.SelectNodes("tr").Skip(1))
    {
        outputList.Add((
            tr.SelectSingleNode("td[1]/a").InnerText,
            Convert.ToDouble(tr.SelectSingleNode("td[2]").InnerText, invariant),
            Convert.ToDouble(tr.SelectSingleNode("td[3]").InnerText, invariant)));
    }

    return outputList;
}
/// <summary>
/// Parses the goods listed on catalogue pages <paramref name="StartPage"/> through
/// <paramref name="FinishPage"/> (inclusive) of the site at <paramref name="url"/> and adds one
/// <c>MarketItem</c> per product link to this collection. A failure while parsing a single
/// product is recorded in that item's <c>Error</c> and does not stop the run.
/// </summary>
/// <param name="url">Base address of the site, without a trailing slash</param>
/// <param name="StartPage">First catalogue page number to parse</param>
/// <param name="FinishPage">Last catalogue page number to parse</param>
/// <exception cref="Exception">
/// A page could not be loaded or contained no product links; the original exception is
/// available as <see cref="Exception.InnerException"/>.
/// </exception>
public void ParseFrom(string url, int StartPage, int FinishPage)
{
    try
    {
        // NOTE(review): the result of this first request is never read; it is kept because it
        // still fails early for an unreachable address - confirm whether that is intended.
        var doc = new HtmlWeb().Load(url + "/goods.php");
        // int pageCount = MarketItems.PageCount(url); // pagination check

        for (int page = StartPage; page <= FinishPage; page++)
        {
            doc = new HtmlWeb().Load(url + "/goods.php?cid=5&page=" + page); // current page

            // SelectNodes returns null (not an empty collection) when nothing matches, so the
            // "no goods" condition has to be detected before any LINQ operator touches it.
            var linkNodes = doc.DocumentNode.SelectNodes("//div[@class='ernr']/ul/li/div/h3/a");
            if (linkNodes == null || linkNodes.Count == 0)
            {
                throw new Exception("Товары не найдены!\nВозможно, сайт поменял верстку. Требуется обновить ПО!");
            }

            // Links to every product on the page, materialized once.
            var pageGoods = linkNodes
                .Select(a => url + "/" + a.ChildAttributes("href").FirstOrDefault().Value)
                .ToList();

            foreach (var good in pageGoods)
            {
                MarketItem item = new MarketItem(); // filled in below

                try
                {
                    // Product markup; everything is extracted from this node.
                    var goodNode = new HtmlWeb().Load(good).DocumentNode.SelectSingleNode("//div[@class='cps']");

                    var options = MarketItems.ConvertList(goodNode.SelectNodes(".//div[@class='tabmen']/ul/li").Select(a => a.InnerText).ToList());
                    var quantities = ConvertList(goodNode.SelectNodes(".//div[@id='tabconten']/ul/li").Select(a => a.InnerText.Replace("双", "")).ToList());

                    // Next free model number: one past the current maximum, or 1 for an empty collection.
                    item.Model = Count != 0 ? this.Max(a => a.Model) + 1 : 1;

                    // Provisional name; it feeds the SEO url and the manufacturer lookup below.
                    item.Name = goodNode.SelectSingleNode(".//h6").InnerText + " " + item.Model;
                    item.Description = "<p><br></p>";
                    item.SEO_url = item.Name.Replace(" ", "-");
                    item.Out_stock_status = "";
                    item.Option_type = "radio";
                    item.Price = ""; // entered manually by the user
                    item.Main_image = url + "/" + goodNode.SelectSingleNode(".//img").ChildAttributes("src").FirstOrDefault().Value;
                    /*TODO*/
                    item.Manufacturer = Manuf(item.Name);
                    item.Option = MarketItems.Option(options);

                    // Final name: product kind by manufacturer, then a gender label chosen by
                    // whether the smallest option value is at most 36.
                    string resName = item.Manufacturer.Contains("Timber") ? "Ботинки" : "Кроссовки";
                    resName += options.Min() <= 36 ? " Женские " : " Мужские ";
                    item.Name = resName + goodNode.SelectSingleNode(".//h6").InnerText + " " + item.Model;
                    item.Meta_title = item.Name;

                    // Options and quantities are paired by index; the first pair goes on the item itself.
                    item.Quantity = quantities.First().ToString();
                    item.Option_value = options.First().ToString();

                    if (options.Count != quantities.Count)
                    {
                        throw new Exception();
                    }

                    // Every further pair becomes a child item sharing the model and name.
                    for (int i = 1; i < options.Count; i++)
                    {
                        MarketItem child = new MarketItem();
                        child.Model = item.Model;
                        child.Name = item.Name;
                        child.Description = "";
                        child.Meta_title = "";
                        child.SEO_url = "";
                        child.Option = item.Option;
                        child.Option_type = item.Option_type;
                        child.Main_image = "";
                        child.Quantity = quantities[i].ToString();
                        child.Option_value = options[i].ToString();
                        item.ChildrenItems.Add(child);
                    }
                }
                catch (Exception exception)
                {
                    // Keep going: the item is still added, carrying the reason it is incomplete.
                    item.Error = exception.Message;
                }

                Add(item); // add the item once its page has been parsed
            }
        }
    }
    catch (Exception e)
    {
        // Same exception type and message as before, now preserving the cause for diagnostics.
        throw new Exception("Произошла ошибка при попытке получить страницу по адресу: " + url + "\n\t\t" + e.Message + "\nПожалуйста, проверьте правильность ввода ресурса!", e);
    }
}