コード例 #1
0
        /// <summary>
        /// Downloads the course's own page and copies onto the course the details that the first
        /// pass does not provide: admission statistics, durations, admissions test, written work
        /// and subject requirements. The UCAS code text found on the page then decides how the
        /// result list is built.
        /// </summary>
        /// <param name="course">Course to enrich; the page at its <c>CourseUrl</c> is loaded and the instance is modified in place</param>
        /// <returns>
        /// A one-element list containing <paramref name="course"/> when the UCAS code text is exactly
        /// four characters long; otherwise the list produced by the matching <c>Fetcher</c> helper
        /// </returns>
        public static List <Course> AddData(this Course course)
        {
            var document = new HtmlWeb().Load(course.CourseUrl).DocumentNode;

            // The code text is read from the node that follows the parent of the "UCAS code" label.
            var    codeLabel = document.SelectSingleNode("//strong[text()='UCAS code']");
            string codeText  = codeLabel.ParentNode.NextSibling.InnerText;

            // Admission statistics are only attached when the helper returned something.
            var admissionStats = AdmissionStats(document);
            if (admissionStats != null)
            {
                course.Admission = new Admission(admissionStats);
            }

            course.Duration.AddRange(DurationStats(document));
            course.AdmissionTest = AdmissionTest(document);

            // Written work: the literal text "None" is stored as null, anything else verbatim.
            var    workLabel  = document.SelectSingleNode("//strong[contains(text(), 'Written work')]");
            var    workText   = workLabel.ParentNode.NextSibling.InnerText;
            string storedWork = workText;
            if (workText == "None")
            {
                storedWork = null;
            }
            course.WrittenWork = storedWork;

            course.ALevelCourse.AddRange(SubjectRequirements(document));

            // Case 1: a single four-character code belongs to this course alone.
            if (codeText.Length == 4)
            {
                course.UcasCode = codeText;
                return new List <Course> { course };
            }

            // Case 2: brackets in the code text indicate two courses on one page.
            if (codeText.Contains('('))
            {
                return Fetcher.TwoCodes(course, codeText);
            }

            // Case 3: a hyperlink inside the code node points at a table on another page.
            var tableLink = codeLabel.ParentNode.NextSibling.SelectSingleNode("a");
            if (tableLink != null)
            {
                return Fetcher.HyperlinkTable(course, tableLink.Attributes["href"].Value);
            }

            // Case 4: no link, so the table is read from this same page.
            return Fetcher.Table(course, Fetcher.FindTable(document));
        }
コード例 #2
0
        /// <summary>
        /// Loads a fixed bid-notice page and prints the text of the second sibling after the
        /// header cell labelled "가격점수제외금액(A)", located twice: once with an exact text
        /// match and once with a contains() match.
        /// </summary>
        /// <param name="args">Command-line arguments (not used)</param>
        static void Main(string[] args)
        {
            const string noticeUrl = "https://ebiz.khnp.co.kr/getTmpView.do?type=spbidnoti&loadId=fff84246-b957-47fc-a249-e1d023131890";

            var root = new HtmlWeb().Load(noticeUrl).DocumentNode;

            // Header cell whose text is exactly the label.
            var    exactHeader = root.SelectSingleNode("//th[text()='가격점수제외금액(A)']");
            string exactValue  = exactHeader.NextSibling.NextSibling.InnerText;

            // Header cell with a text node that merely contains the label.
            var    looseHeader = root.SelectSingleNode("//th[text()[contains(., '가격점수제외금액(A)')]]");
            string looseValue  = looseHeader.NextSibling.NextSibling.InnerText;

            // Both lookups are finished before anything is written.
            Console.WriteLine(exactValue);
            Console.WriteLine(looseValue);
        }
コード例 #3
0
        /// <summary>
        /// Downloads the college's page and fills in its campus code, offered courses,
        /// student numbers and facilities.
        /// </summary>
        /// <param name="college">College to add data to; the page at its <c>CollegeUrl</c> is loaded and the instance is modified in place</param>
        public static void AddData(this College college)
        {
            // One download is enough: the "#content-tab--N" suffixes previously appended for the
            // course and facility tabs are URL fragments, which are not sent to the server, so
            // each of those requests returned this same document.
            var page = new HtmlWeb().Load(college.CollegeUrl).DocumentNode;

            /* CAMPUS CODE */
            // The code is the last space-separated token of the "UCAS campus code:" heading.
            var campusCodeSelection = page.SelectSingleNode("//h2[contains(text(), 'UCAS campus code:')]");

            college.CampusCode = campusCodeSelection.InnerText.Split(' ').Last();

            /* OFFERED COURSES */
            college.Courses = new List <string>();

            //temp hardcoded solution, planning to fix
            if (college.CampusCode == "P")
            {
                college.Courses.Add("Theology and Religion");
                college.Courses.Add("Philosophy and Theology");
            }
            else
            {
                // SelectSingleNode/SelectNodes return null when nothing matches, so a page
                // without a course list leaves Courses empty instead of throwing.
                var courseLinks = page.SelectSingleNode("//div[@id='content-tab--3']/ul")?.SelectNodes("li/a");

                if (courseLinks != null)
                {
                    foreach (var a in courseLinks)
                    {
                        college.Courses.Add(a.InnerText);
                    }
                }
            }

            /* NUMBER OF STUDENTS */
            // The node after the "Student numbers" heading holds two "<br>"-separated parts:
            // the first is read as the undergraduate count, the second as the graduate count.
            var studentNumSelection = page.SelectSingleNode("//h3[contains(text(), 'Student numbers')]").NextSibling;
            var textSections        = studentNumSelection.InnerHtml.Split("<br>");

            //Matches 1 or more digits (for both regexes)
            college.UndergradStudents = Convert.ToInt32(Regex.Match(textSections[0], @"\d{1,}").Value);
            college.GradStudents      = Convert.ToInt32(Regex.Match(textSections[1], @"\d{1,}").Value);

            college.TotalStudents = college.UndergradStudents + college.GradStudents;

            /* FACILITIES */
            var facilities = page.SelectSingleNode("//h2[text()='College facilities']").ParentNode.SelectSingleNode("table/tbody");

            college.Facilities = new CollegeFacilities(facilities);
        }
コード例 #4
0
        /// <summary>
        /// Get a list of type T from a url, by parsing the expected table format:
        /// every <c>p/a</c> link inside the children of the first body row of the
        /// element with class <c>table-reduced</c> becomes one item.
        /// </summary>
        /// <typeparam name="T">Type for list to use; must have a constructor taking (string name, string url)</typeparam>
        /// <param name="url">Url to parse</param>
        /// <returns>A list of data parsed from url</returns>
        public static List <T> GetList <T>(string url)
        {
            List <T> outputList = new List <T>();

            var page      = new HtmlWeb().Load(url).DocumentNode;
            var tableNode = page.SelectSingleNode("//*[@class='table-reduced']");

            var trNode = tableNode.SelectSingleNode("tbody/tr");

            foreach (var td in trNode.ChildNodes)
            {
                // ChildNodes also yields whitespace text nodes, and SelectNodes returns null
                // (not an empty collection) when nothing matches, so skip children without links.
                var links = td.SelectNodes("p/a");

                if (links == null)
                {
                    continue;
                }

                foreach (var link in links)
                {
                    string name    = link.InnerText;
                    // Link targets are prefixed with the scheme; presumably they are protocol-relative ("//host/...") on the page.
                    string linkUrl = "https:" + link.Attributes["href"].Value;

                    //create an instance of T with given parameters - hacky workaround, but only solution i can find
                    T data = (T)Activator.CreateInstance(typeof(T), name, linkUrl);

                    outputList.Add(data);
                }
            }

            return(outputList);
        }
コード例 #5
0
 /// <summary>
 /// Downloads a deck list as text from one of the supported deck sites.
 /// </summary>
 /// <param name="url">Address of the deck page; the site is recognised by a substring of the address</param>
 /// <returns>The deck text, or null when the address belongs to no supported site</returns>
 public static string DownloadDeck(string url)
 {
     // legends-decks.com: the deck text is the markup of the modal's "well_full" div,
     // with "<br>" turned into line breaks.
     if (url.Contains("legends-decks.com"))
     {
         var deckNode = new HtmlWeb().Load(url).DocumentNode
                        .SelectSingleNode("//div[@id='deckModal']//div[@class='well_full']");
         string deckText = deckNode.InnerHtml.Replace("<br>", "\r\n");
         return deckText.Trim();
     }

     // Anything that is not eternalwarcry.com either is unsupported.
     if (!url.Contains("eternalwarcry.com"))
     {
         return null;
     }

     // eternalwarcry.com: the first h1 is used as the title line above the export textarea's content.
     var document = new HtmlWeb().Load(url).DocumentNode;
     var name     = document.SelectSingleNode("//h1").InnerText.Trim();
     var cards    = document.SelectSingleNode("//textarea[@id='export-deck-text']").InnerHtml.Trim();
     return $"### {name} ###\r\n{cards}";
 }
コード例 #6
0
        /// <summary>
        /// Builds the courses described by the UCAS code tables on a linked page: the first
        /// three tables directly inside the div that follows the "UCAS codes" heading are each
        /// passed to <c>Table</c>, and the results are concatenated in page order.
        /// </summary>
        /// <param name="course">Course to split</param>
        /// <param name="url">Address of the linked page; "https:" is prepended before loading</param>
        /// <returns>The courses produced from those tables</returns>
        public static List <Course> HyperlinkTable(Course course, string url)
        {
            var linkedPage     = new HtmlWeb().Load($"https:{url}").DocumentNode;
            var codesContainer = linkedPage.SelectSingleNode("//h3[contains(text(), 'UCAS codes')]/following-sibling::div");

            return codesContainer.SelectNodes("table")
                                 .Take(3)
                                 .SelectMany(table => Table(course, table))
                                 .ToList();
        }
コード例 #7
0
 /// <summary>
 /// Downloads a deck list as text from one of the supported deck sites.
 /// </summary>
 /// <param name="url">Address of the deck page; the site is recognised by a substring of the address</param>
 /// <returns>The deck text, or null when the address belongs to no supported site</returns>
 public static string DownloadDeck(string url)
 {
     if (url.Contains("legends-decks.com"))
     {
         // The deck text is the markup of the modal's "well_full" div, with "<br>" turned into line breaks.
         var page = new HtmlWeb().Load(url).DocumentNode;
         return(page.SelectSingleNode("//div[@id='deckModal']//div[@class='well_full']").InnerHtml.Replace("<br>", "\r\n").Trim());
     }
     if (url.Contains("eternalwarcry.com"))
     {
         // The first h1 is used as the title line above the export textarea's content.
         var page  = new HtmlWeb().Load(url).DocumentNode;
         var name  = page.SelectSingleNode("//h1").InnerText.Trim();
         var cards = page.SelectSingleNode("//textarea[@id='export-deck-text']").InnerHtml.Trim();
         return($"### {name} ###\r\n{cards}");
     }
     if (url.Contains("teslegends.pro"))
     {
         // The slug is the last path segment of the address, ignoring a trailing slash.
         var slug = url.Substring(url.TrimEnd('/').LastIndexOf('/')).Trim('/');

         // WebClient is IDisposable; dispose it once the export has been downloaded.
         using (var client = new WebClient())
         {
             var response = client.UploadValues("https://teslegends.pro/dc/do.php", "POST", new NameValueCollection {
                 { "exportdeck", slug }
             });
             return(Encoding.UTF8.GetString(response));
         }
     }
     return(null);
 }
コード例 #8
0
ファイル: MyContextApp.cs プロジェクト: mariyan87/ScrapeApp
        /// <summary>
        /// Rebuilds <c>_html</c> from the two listing pages. For each site the listing nodes are
        /// collected, their site-relative links are made absolute, and the nodes returned by
        /// <c>GetLastestCars</c> for the last stored href are appended to <c>_html</c> under an
        /// "Original link" header. Any exception is logged and not rethrown.
        /// </summary>
        private void Update()
        {
            _html = "";

            _lastStoredHrefBCPEA = ReadLastCarHref(_filenameBCPEA);
            _lastStoredHrefNap   = ReadLastCarHref(_filenameNap);

            try
            {
                // ---- NAP: listings are the divs whose trimmed class is "aoh-item" ----
                var napRoot  = new HtmlWeb().Load(_urlNap).DocumentNode;
                var napItems = napRoot.SelectNodes("//div[@class]")
                               .Where(candidate => candidate.GetAttributeValue("class", "").Trim() == "aoh-item")
                               .ToList();

                foreach (var napItem in napItems)
                {
                    // Prefix site-relative links and image sources with the site address.
                    napItem.InnerHtml = napItem.InnerHtml
                                        .Replace("href=\"/targ/", "href=\"" + _websiteNapLink + "/targ/")
                                        .Replace("src=\"/", "src=\"" + _websiteNapLink + "/");
                }

                var freshNapCars = GetLastestCars(napItems, _lastStoredHrefNap);
                UpdateLastStoredHref(freshNapCars, _filenameNap, ref _lastStoredHrefNap);

                if (freshNapCars.Any())
                {
                    string napHeader = "Original link: <a href=\"" + _urlNap + "\">" + _urlNap + "</a><hr>";
                    string napBody   = string.Join("<hr>", freshNapCars.Select(s => s.InnerHtml));
                    _html += napHeader + napBody + "<br/>";
                    Logger.Write("lastestCarsNap: " + freshNapCars.Count);
                }

                // ---- BCPEA: listings are the li children of the first ul whose class is "results_list" ----
                var bcpeaRoot    = new HtmlWeb().Load(_urlBcpea).DocumentNode;
                var bcpeaBody    = bcpeaRoot.SelectSingleNode("//body");
                var classedLists = bcpeaBody.SelectNodes("//ul[@class]");
                var resultsList  = classedLists.First(ul => ul.Attributes.AttributesWithName("class").First().Value == "results_list");
                var bcpeaItems   = resultsList.SelectNodes("li").ToList();

                foreach (var bcpeaItem in bcpeaItems)
                {
                    // Prefix site-relative listing links with the site address.
                    bcpeaItem.InnerHtml = bcpeaItem.InnerHtml.Replace("href=\"/bg/auto/", "href=\"" + _websiteBCPEALink + "/bg/auto/");
                }

                var freshBcpeaCars = GetLastestCars(bcpeaItems, _lastStoredHrefBCPEA);
                UpdateLastStoredHref(freshBcpeaCars, _filenameBCPEA, ref _lastStoredHrefBCPEA);

                if (freshBcpeaCars.Any())
                {
                    string bcpeaHeader = "Original link: <a href=\"" + _urlBcpea + "\">" + _urlBcpea + "</a><hr>";
                    string bcpeaBodyHtml = string.Join("<hr>", freshBcpeaCars.Select(s => s.InnerHtml));
                    _html += bcpeaHeader + bcpeaBodyHtml;
                    Logger.Write("lastestCars: " + freshBcpeaCars.Count);
                }
            }
            catch (Exception ex)
            {
                // A failure on either site abandons the rest of the update.
                Logger.Write("Error in Update(): ", ex);
            }
        }
コード例 #9
0
        /// <summary>
        /// Reads the first table body found on the Wikipedia "Norrington Table" article and
        /// returns one tuple per row, skipping the first row.
        /// </summary>
        /// <returns>
        /// For each row: the link text of the first cell, followed by the second and third
        /// cells parsed as doubles
        /// </returns>
        public static List <(string, double, double)> GetNorrington()
        {
            // The cell text is machine-readable data from an English-language page, so parse it with
            // the invariant culture; the current culture would misread "72.5" on comma-decimal systems.
            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            List <(string, double, double)> outputList = new List <(string, double, double)>();

            var page      = new HtmlWeb().Load(@"https://en.wikipedia.org/wiki/Norrington_Table").DocumentNode;
            var tableNode = page.SelectSingleNode("//table/tbody");

            // Skip(1) drops the first row (presumably the header row).
            foreach (var tr in tableNode.SelectNodes("tr").Skip(1))
            {
                outputList.Add((
                                   tr.SelectSingleNode("td[1]/a").InnerText,
                                   Convert.ToDouble(tr.SelectSingleNode("td[2]").InnerText, invariant),
                                   Convert.ToDouble(tr.SelectSingleNode("td[3]").InnerText, invariant)));
            }

            return(outputList);
        }
コード例 #10
0
        /// <summary>
        /// Scrapes the catalogue pages <paramref name="StartPage"/>..<paramref name="FinishPage"/>
        /// of a shop, opens every product linked from them and adds one <c>MarketItem</c> per
        /// product to this collection. A product that fails to parse is still added, with the
        /// failure text stored in its <c>Error</c> property.
        /// </summary>
        /// <param name="url">Base address of the shop; paths are appended as <c>url + "/..."</c></param>
        /// <param name="StartPage">First catalogue page to read (inclusive)</param>
        /// <param name="FinishPage">Last catalogue page to read (inclusive)</param>
        /// <exception cref="Exception">
        /// A page could not be loaded or contained no product links; the original failure is
        /// available as <c>InnerException</c>
        /// </exception>
        public void ParseFrom(string url, int StartPage, int FinishPage)
        {
            try
            {
                // The result is not used; the request is kept so that an unreachable address
                // still fails here even when the page range is empty.
                new HtmlWeb().Load(url + "/goods.php");

                // walk the pagination
                for (int page = StartPage; page <= FinishPage; page++)
                {
                    var listing = new HtmlWeb().Load(url + "/goods.php?cid=5&page=" + page); // current page

                    // SelectNodes returns null (not an empty collection) when nothing matches,
                    // so test for null to reach the intended "no goods found" error.
                    var goodAnchors = listing.DocumentNode.SelectNodes("//div[@class='ernr']/ul/li/div/h3/a");
                    if (goodAnchors == null || goodAnchors.Count == 0)
                    {
                        throw new Exception("Товары не найдены!\nВозможно, сайт поменял верстку. Требуется обновить ПО!");
                    }

                    // links to every product on the page, materialised once
                    var pageGoods = goodAnchors
                                    .Select(a => url + "/" + a.ChildAttributes("href").FirstOrDefault().Value)
                                    .ToList();

                    foreach (var good in pageGoods)
                    {
                        MarketItem item = new MarketItem(); // the product being filled in
                        try
                        {
                            // product markup; everything below is extracted from it
                            var goodNode   = new HtmlWeb().Load(good).DocumentNode.SelectSingleNode("//div[@class='cps']");
                            var options    = MarketItems.ConvertList(goodNode.SelectNodes(".//div[@class='tabmen']/ul/li").Select(a => a.InnerText).ToList());
                            var quantities = ConvertList(goodNode.SelectNodes(".//div[@id='tabconten']/ul/li").Select(a => a.InnerText.Replace("双", "")).ToList());
                            var heading    = goodNode.SelectSingleNode(".//h6").InnerText;

                            // next free model number in this collection
                            item.Model = Count != 0 ? this.Max(a => a.Model) + 1 : 1;

                            // provisional name: it feeds SEO_url and the manufacturer lookup below
                            item.Name             = heading + " " + item.Model;
                            item.Description      = "<p><br></p>";
                            item.SEO_url          = item.Name.Replace(" ", "-");
                            item.Out_stock_status = "";
                            item.Option_type      = "radio";
                            item.Price            = ""; // entered manually by the user
                            item.Main_image       = url + "/" + goodNode.SelectSingleNode(".//img").ChildAttributes("src").FirstOrDefault().Value;
                            /*TODO*/
                            item.Manufacturer = Manuf(item.Name);
                            item.Option       = MarketItems.Option(options);

                            // final name: product kind + gender prefix, then heading and model
                            string resName = item.Manufacturer.Contains("Timber") ? "Ботинки" : "Кроссовки";
                            resName        += options.Min() <= 36 ? " Женские " : " Мужские ";
                            item.Name       = resName + heading + " " + item.Model;
                            item.Meta_title = item.Name;

                            // option/quantity pairs: the first pair lives on the item itself
                            item.Quantity     = quantities.First().ToString();
                            item.Option_value = options.First().ToString();

                            if (options.Count != quantities.Count)
                            {
                                throw new Exception();
                            }

                            // every further pair becomes a child item
                            for (int i = 1; i < options.Count; i++)
                            {
                                MarketItem child = new MarketItem();
                                child.Model        = item.Model;
                                child.Name         = item.Name;
                                child.Description  = "";
                                child.Meta_title   = "";
                                child.SEO_url      = "";
                                child.Option       = item.Option;
                                child.Option_type  = item.Option_type;
                                child.Main_image   = "";
                                child.Quantity     = quantities[i].ToString();
                                child.Option_value = options[i].ToString();
                                item.ChildrenItems.Add(child);
                            }
                        }
                        catch (Exception exception)
                        {
                            // keep the partially filled item and record why parsing stopped
                            item.Error = exception.Message;
                        }
                        Add(item); // add the item once its page has been parsed
                    }
                }
            }
            catch (Exception e)
            {
                // Same message as before; the original exception is now preserved as InnerException.
                throw new Exception("Произошла ошибка при попытке получить страницу по адресу: " + url + "\n\t\t" + e.Message + "\nПожалуйста, проверьте правильность ввода ресурса!", e);
            }
        }