// Parses a Google Play developer page: extracts the developer name, the list of
// app names, and the list of app IDs derived from the app links.
// Throws when the link format does not match either known prefix (page layout changed).
public DeveloperData(HtmlDocument document, string url)
{
    this.url = url;
    HtmlNode root = document.DocumentNode;
    name = root.SelectSingleNode("//h1[@class='cluster-heading']").InnerText.Trim();
    HtmlNodeCollection nodes = root.SelectNodes("//div[@class='card-list']//a[@class='title']");
    appsNames = nodes.Select(el => el.InnerText.Trim()).ToList();

    // BUG FIX: anchors without an href yielded null entries that crashed StartsWith below;
    // filter them out instead.
    List<string> shortenUrls = nodes.Select(el => el.GetAttributeValue("href", null))
                                    .Where(el => el != null)
                                    .ToList();

    // Links are either site-relative or absolute; all of them must share one prefix.
    const string relativePrefix = "/store/apps/details?id=";
    const string absolutePrefix = "https://play.google.com/store/apps/details?id=";
    if (shortenUrls.All(el => el.StartsWith(relativePrefix)))
    {
        // Substring is safer than Replace: it cannot corrupt an ID that happens
        // to contain the prefix text somewhere other than the start.
        appsIDs = shortenUrls.Select(el => el.Substring(relativePrefix.Length)).ToList();
    }
    else if (shortenUrls.All(el => el.StartsWith(absolutePrefix)))
    {
        appsIDs = shortenUrls.Select(el => el.Substring(absolutePrefix.Length)).ToList();
    }
    else
    {
        throw new Exception("incorrect developer apps IDs");
    }
}
/// <summary>
/// Extracts a list of strings from the stored HTML node using the rules in
/// <paramref name="model"/>. Returns null when the model is unusable or no node matches.
/// </summary>
public List<string> ParseList(XpathExtractModel model)
{
    if (model == null || string.IsNullOrEmpty(model.XpathRule))
    {
        return null;
    }

    HtmlNodeCollection matches = this.htmlNode?.SelectNodes(model.XpathRule);
    if (matches == null || matches.Count <= 0)
    {
        return null;
    }

    // Attribute extraction takes precedence over text/html extraction.
    bool wantsAttributes = model.XpathEndAttributes != null && model.XpathEndAttributes.Count > 0;
    if (wantsAttributes)
    {
        return matches
            .Select(node => node.Attributes
                .Where(attr => model.XpathEndAttributes.Contains(attr.Name))
                .Select(attr => attr.Value?.Trim())
                .FirstOrDefault())
            .Where(value => !string.IsNullOrEmpty(value))
            .ToList();
    }

    // Text mode returns InnerText; anything else (including Html) returns InnerHtml.
    if (model.ExtractType == ExtractType.Text)
    {
        return matches
            .Select(node => node.InnerText.Trim())
            .Where(value => !string.IsNullOrEmpty(value))
            .ToList();
    }

    return matches
        .Select(node => node.InnerHtml.Trim())
        .Where(value => !string.IsNullOrEmpty(value))
        .ToList();
}
/// <summary>
/// Scrapes the cricbuzz live-scores page and returns one entry per match,
/// pairing the match title with its commentary-JSON API link.
/// Returns null on any network/parsing failure (best-effort by design).
/// </summary>
public static List<clsCricketMatches> GetLiveMatches()
{
    List<clsCricketMatches> objCricketMatches = new List<clsCricketMatches>();
    try
    {
        HtmlWeb web = new HtmlWeb();
        HtmlDocument doc = web.Load("https://www.cricbuzz.com/cricket-match/live-scores");
        HtmlNodeCollection matchNodes = doc.DocumentNode.SelectNodes("//div[contains(@class, 'cb-col cb-col-100 cb-lv-main')]");

        // The anchor href's third path segment is the match number that feeds
        // the commentary JSON endpoint.
        List<string> hrefs = matchNodes
            .Select(n => n.SelectSingleNode(".//a").Attributes["href"].Value)
            .ToList();
        List<string> apiLinks = hrefs
            .Select(h => h.Split('/'))
            .Select(parts => "https://www.cricbuzz.com/match-api/" + parts[2] + "/commentary.json")
            .ToList();
        List<string> titles = matchNodes
            .Select(n => n.SelectSingleNode(".//a").Attributes["title"].Value)
            .ToList();

        objCricketMatches = titles
            .Zip(apiLinks, (matchName, matchLink) => new clsCricketMatches
            {
                objMatchName = matchName,
                objMatchHyperLink = matchLink
            })
            .ToList();
        return objCricketMatches;
    }
    catch (Exception)
    {
        // Any failure yields null rather than propagating the exception.
        return null;
    }
}
// Collects all videos for a category: resolves an "all episodes" page when one
// is linked, falls back to a single-video layout, then follows numbered pages.
List <VideoInfo> getVideos(Category category)
{
    List <VideoInfo> videos = new List <VideoInfo>();
    string url = (category as RssLink).Url;
    var document = GetWebData <HtmlDocument>(url).DocumentNode;
    HtmlNodeCollection videoNodes = null;
    // Check for an 'all episodes' link; if present, parse that page instead.
    var allEpisodes = document.SelectSingleNode(@"//a[starts-with(@href, '/iplayer/episodes/')]");
    if (allEpisodes != null)
    {
        document = GetWebData <HtmlDocument>(BASE_URL + allEpisodes.GetAttributeValue("href", "")).DocumentNode;
        videoNodes = document.SelectNodes(@"//div[contains(@class, 'content-item')]");
    }
    if (videoNodes == null)
    {
        // Either no 'all episodes' link existed, or that page had no items —
        // query the current document for content items.
        videoNodes = document.SelectNodes(@"//div[contains(@class, 'content-item')]");
    }
    // Single-video page layout (no content-item list at all).
    if (videoNodes == null)
    {
        var videoNode = document.SelectSingleNode(@"//div[@id='main']");
        if (videoNode != null)
        {
            VideoInfo video = createSingleVideo(videoNode, url);
            if (video != null)
            {
                videos.Add(video);
            }
        }
        return (videos);
    }
    videos.AddRange(videoNodes.Select(v => createVideo(v, category.Name)).Where(v => v != null));
    // Walk the remaining pages, stopping early when a page yields no items.
    // NOTE(review): pagination uses the ORIGINAL category URL even when the
    // 'all episodes' page was substituted above — confirm this is intended.
    int pageCount = getPageCount(document);
    int currentPage = 1;
    while (currentPage < pageCount)
    {
        currentPage++;
        document = GetWebData <HtmlDocument>(url + "?page=" + currentPage).DocumentNode;
        videoNodes = document.SelectNodes(@"//div[contains(@class, 'content-item')]");
        if (videoNodes == null)
        {
            break;
        }
        videos.AddRange(videoNodes.Select(v => createVideo(v, category.Name)).Where(v => v != null));
    }
    return (videos);
}
// Builds a ClassDay for the given weekday from the timetable cells.
// nodes: cells whose InnerText is the discipline abbreviation and whose "title"
// attribute holds the full name; schedules: time-slot keys parallel to nodes.
private static ClassDay ParseToClassDay(HtmlNodeCollection nodes, string day, List<string> schedules)
{
    // BUG FIX: removed leftover debug code that printed garbage to the console
    // whenever day == "Sexta".

    // The abbreviated discipline name comes from the cell text.
    var nickNames = nodes.Select(x => x.InnerText.Trim()).ToList();
    // The site shows only the abbreviation; the full name lives in the "title" attribute.
    var names = nodes.Select(a => a.GetAttributeValue("title", "")).ToList();
    // Strip the code the site prepends to the name, e.g. "#1047 - name".
    Regex reg = new Regex("^[^a-zA-Z]+");
    names = names.Select(s => reg.Replace(s, String.Empty)).ToList();

    var disciplines = new List<TimeTableDiscipline>();
    for (int i = 0; i < nickNames.Count; i++)
    {
        disciplines.Add(new TimeTableDiscipline()
        {
            NickName = nickNames[i],
            Name = names[i]
        });
    }

    // The dictionary keeps each discipline associated with its time slot,
    // matching the position shown on the site.
    var dayDisciplines = new Dictionary<string, TimeTableDiscipline>();
    for (int i = 0; i < disciplines.Count; i++)
    {
        dayDisciplines.Add(schedules[i], disciplines[i]);
    }

    return new ClassDay()
    {
        DayOfWeek = day,
        FirstClass = GetValue(dayDisciplines, "1"),
        SecondClass = GetValue(dayDisciplines, "2"),
        ThirdClass = GetValue(dayDisciplines, "3"),
        FourthClass = GetValue(dayDisciplines, "4"),
    };
}
// Extracts the page numbers listed AFTER the current page in the search-result
// pager, records unseen ones in the shared _pageNumbers set, and returns the
// newly discovered numbers.
private static List<int> GetNewSearchPageNumbers(HtmlDocument doc)
{
    List<int> result = new List<int>();
    HtmlNodeCollection hnc = doc.DocumentNode.SelectNodes("//table[@class='searchresultpaging'][1]/tr/td[2]/a");
    HtmlNode hn = doc.DocumentNode.SelectSingleNode("//table[@class='searchresultpaging'][1]/tr/td[2]/a/p/b");
    int currentPageNumber = Convert.ToInt32(hn.InnerText);
    // Pages listed before the current one have already been visited; skip them.
    bool seenCurrent = false;
    foreach (int pageNumber in hnc.Select(n => Convert.ToInt32(n.InnerText)))
    {
        if (pageNumber == currentPageNumber)
        {
            seenCurrent = true;
            continue;
        }
        if (!seenCurrent)
        {
            continue;
        }
        // BUG FIX: the original checked Contains OUTSIDE the lock, so two threads
        // could both pass the check and add the same page twice. The test-and-add
        // now happens atomically inside the lock.
        lock (_pageNumbers)
        {
            if (!_pageNumbers.Contains(pageNumber))
            {
                _pageNumbers.Add(pageNumber);
                result.Add(pageNumber);
            }
        }
    }
    return result;
}
// Parses the list of torrent posts from a freshly downloaded page and raises
// OnPostReceived with the result; reports a status-bar error when the expected
// table is missing.
private void Page_onPageDownload(object sender, HTMLPageEventArgs e)
{
    // Second cell of every row after the header holds the post data;
    // SelectNodes yields null when the node is absent.
    HtmlNodeCollection htmlNodes = e.Page.DocumentNode.SelectNodes(@"//div[@id=""index""]//tr[position()>1]/td[2]");
    // Post links are parsed without the top-level domain, so prepend the site root.
    string rutorMainUrl = MainFunc.rutorWorkURL.Replace(@"/soft", "");
    if (htmlNodes == null)
    {
        Program.statusBarGlobal.Message = "Ошибка на этапе парсинга страницы";
        return;
    }
    List<TrackersListItem> postLst = htmlNodes
        .Select((node, index) => new TrackersListItem
        {
            // HtmlDecode turns HTML escape sequences back into readable text.
            Name = HttpUtility.HtmlDecode(node.LastChild.InnerText),
            Href = rutorMainUrl + node.LastChild.GetAttributeValue("href", null),
            Index = index,
            Magnet = node.ChildNodes[1].GetAttributeValue("href", null),
        })
        .ToList();
    RutorListEventArgs eventArgs = new RutorListEventArgs(postLst);
    // Equivalent to: if (OnPostReceived != null) OnPostReceived(this, eventArgs);
    OnPostReceived?.Invoke(this, eventArgs);
}
/// <summary>
/// Runs a search for <paramref name="query"/> on the given result page and parses
/// each result entry into a <see cref="WebPage"/> (title, link, snippet).
/// Returns an empty list when the request fails or no result nodes are found.
/// </summary>
public async Task<IList<WebPage>> SearchAsync(string query, int page)
{
    HttpResponseMessage response = await _client.GetAsync(SearchUri(query, page));
    if (response.IsSuccessStatusCode)
    {
        string data = await response.Content.ReadAsStringAsync();
        var doc = new HtmlDocument();
        doc.LoadHtml(data);
        HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("//div[@class='g']");
        if (nodes != null)
        {
            return nodes.Select(node =>
            {
                var titleNode = node.Descendants("h3").Where(x => x.Attributes.Contains("class"))
                                .FirstOrDefault(x => x.Attributes["class"].Value.Contains("LC20lb"));
                var linkNode = node.Descendants("a")
                               .FirstOrDefault();
                var snippetNode = node.Descendants("span").Where(x => x.Attributes.Contains("class"))
                                  .FirstOrDefault(x => x.Attributes["class"].Value.Contains("st"));
                var title = HttpUtility.HtmlDecode(titleNode?.InnerText);
                // BUG FIX: an <a> without an href used to throw NullReferenceException;
                // the extra ?. makes a missing attribute yield a null link instead.
                var link = HttpUtility.HtmlDecode(linkNode?.Attributes["href"]?.Value);
                var snippet = HttpUtility.HtmlDecode(snippetNode?.InnerText);
                return new WebPage(query, title, link, snippet, searchTag);
            })
            .ToList();
        }
    }
    return new List<WebPage>();
}
// Entry point: reads the target site URL from config.json, downloads its HTML,
// and logs every "titlelnk" anchor as "href innerText".
static void Main(string[] args)
{
    // Console logger.
    var loggerFactory = new LoggerFactory().AddConsole();
    var logger = loggerFactory.CreateLogger(typeof(Program));

    // Configuration: config.json resolved relative to the working directory.
    var config = new ConfigurationBuilder()
        .SetBasePath(Directory.GetCurrentDirectory())
        .AddJsonFile("config.json")
        .Build();
    var site = config.GetSection("Site").Value;

    var html = HTTPUtil.GetHtml(site);
    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);

    // XPath: every <a class='titlelnk'> directly under an <h3>.
    HtmlNodeCollection categoryNodes = doc.DocumentNode.SelectNodes("//h3/a[@class='titlelnk']");
    IEnumerable<string> category = categoryNodes
        .Select(x => x.Attributes["href"].Value + " " + x.InnerText)
        .ToList();
    foreach (var item in category)
    {
        logger.LogInformation(item);
    }
    Console.ReadLine();
}
// Builds the rate table for a Taiwanese bank from the scraped rate rows.
// NOTE: the XPath expressions start with "//", so each SelectNodes searches the
// WHOLE document rather than the current row; indexing by the row's position
// (item.index) compensates for that. Verify before "fixing" to a relative ".//" path.
public static DataMeta TaiwanBK_GetRate(this HtmlNodeCollection Node, EnumBank EnumBank)
{
    DataMeta Table = new DataMeta();
    // (Removed an unused local RateData that the original allocated and never used.)
    Table.CreateDate = MyTimeZone.Today;
    Table.Expire = MyTimeZone.Today.AddMinutes(Expire);
    Table.Key = EnumBank;
    foreach (var item in Node.Select((value, index) => new { index, value }))
    {
        Table.Data.Add(new RateData()
        {
            // Currency label: strip line breaks, spaces and parentheses.
            // NOTE(review): the original had two identical-looking Replace(" ", "")
            // calls — one may have targeted a full-width space; confirm against the
            // live page before changing.
            Currencty = item.value.SelectNodes(@"//div[@class=""hidden-phone print_show""]")[item.index].InnerText
                        .Replace("\r\n", "")
                        .Replace(" ", "")
                        .Replace(" ", "")
                        .Replace("(", "")
                        .Replace(")", "")
                        .TrimStart(' ').TrimEnd(' ').Replace("\r\n", ""),
            CashBuying = item.value.SelectNodes(@"//td[@data-table=""本行現金買入""]")[item.index].InnerText,
            CashSelling = item.value.SelectNodes(@"//td[@data-table=""本行現金賣出""]")[item.index].InnerText,
            SpotBuying = item.value.SelectNodes(@"//td[@data-table=""本行即期買入""]")[item.index].InnerText,
            SpotSelling = item.value.SelectNodes(@"//td[@data-table=""本行即期賣出""]")[item.index].InnerText
        });
    }
    return Table;
}
/// <summary>
/// Returns the href of every stylesheet link in the document; empty when none exist.
/// </summary>
public IEnumerable<string> GetStylesheets()
{
    HtmlNodeCollection collection = this.document.DocumentNode.SelectNodes("//link[@rel='stylesheet']");
    if (collection == null)
    {
        // SelectNodes returns null (not an empty collection) when nothing matches.
        return new List<string>();
    }
    return collection.Select(node => node.GetAttributeValue("href", string.Empty));
}
/// <summary>
/// Returns the src of every external script in the document; empty when none exist.
/// </summary>
public IEnumerable<string> GetScripts()
{
    HtmlNodeCollection collection = this.document.DocumentNode.SelectNodes("//script[@src]");
    if (collection == null)
    {
        // SelectNodes returns null (not an empty collection) when nothing matches.
        return new List<string>();
    }
    return collection.Select(node => node.GetAttributeValue("src", string.Empty));
}
// Crawls every result page of the current category:
// 1. determine the total page count from the pager
// 2. fetch each page and parse its course entries
// 3. accumulate the results (persistence is currently commented out)
private void GetPageCourseData()
{
    category.Url = $"https://ke.qq.com{category.Url}";
    string strHtml = HttpHelper.DownloadUrl(category.Url);
    HtmlDocument document = new HtmlDocument();
    document.LoadHtml(strHtml);

    // Page count = highest number among the pager buttons; default 1 when no pager exists.
    string pagePath = "/html/body/section[1]/div/div[@class='sort-page']/a[@class='page-btn']";
    HtmlNodeCollection pageNodes = document.DocumentNode.SelectNodes(pagePath);
    int pageCount = pageNodes == null
        ? 1
        : pageNodes.Select(a => int.Parse(a.InnerText)).Max();

    List<CourseEntity> courseList = new List<CourseEntity>();
    for (int pageIndex = 1; pageIndex <= pageCount; pageIndex++)
    {
        Console.WriteLine($"******************************当前是第{pageIndex}页数据************************************");
        string pageIndexUrl = $"{category.Url}&page={pageIndex}";
        List<CourseEntity> courseEntities = GetPageIndeData(pageIndexUrl);
        courseList.AddRange(courseEntities);
    }
    //courseRepository.SaveList(courseList);
}
// Downloads the page from the URL text box, extracts its <title> elements,
// strips characters unsafe for file names, and shows the result in the title box.
private void GetTitle()
{
    string pageContent = m_wd.GetPageByHttpWebRequest(this.textBoxUrl.Text, Encoding.UTF8);
    HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument
    {
        OptionAddDebuggingAttributes = false,
        OptionAutoCloseOnEnd = true,
        OptionFixNestedTags = true,
        OptionReadEncoding = true
    };
    htmlDoc.LoadHtml(pageContent);

    string title = "";
    HtmlNodeCollection titleNodes = htmlDoc.DocumentNode.SelectNodes("//title");
    if (!Equals(titleNodes, null))
    {
        // Multiple <title> elements are joined with ';'.
        title = string.Join(";", titleNodes.Select(n => n.InnerText).ToArray()).Trim();
    }
    title = title.Replace("博客园", "");
    // Remove characters that are illegal or awkward in file names.
    title = Regex.Replace(title, @"[|/\;:*?<>&#-]", "");
    title = Regex.Replace(title, "[\"]", "");
    this.textBoxTitle.Text = title.TrimEnd();
}
// Handles a finished page download: parses the torrent post list and raises
// ListReceived; reports a status-bar error when the expected table is missing.
private void Downloader_FinishDownload(object sender, DownloaderHtmlPageArgs e)
{
    // Second cell of every row after the header; SelectNodes yields null when absent.
    HtmlNodeCollection rows = e.Page.DocumentNode.SelectNodes(@"//div[@id=""index""]//tr[position()>1]/td[2]");
    // Post links are parsed without the top-level domain, so prepend the site root.
    string rutorMainUrl = UriWork.OriginalString.Replace(@"/soft", "");
    if (rows == null)
    {
        Program.statusBarGlobal.Message = "Ошибка на этапе парсинга страницы";
        return;
    }
    List<ItemList> postsList = rows
        .Select((row, index) => new ItemList
        {
            // HtmlDecode turns HTML escape sequences back into readable text.
            Name = HttpUtility.HtmlDecode(row.LastChild.InnerText),
            Href = rutorMainUrl + row.LastChild.GetAttributeValue("href", null),
            Index = index,
            Magnet = row.ChildNodes[1].GetAttributeValue("href", null),
        })
        .ToList();
    // Equivalent to: if (ListReceived != null) ListReceived(this, args);
    ListReceived?.Invoke(this, new ItemListArgs(postsList));
}
/// <summary>
/// Fetches a list of <see cref="NewsItem"/> from the given stream which should point to http://heise.de
/// </summary>
/// <param name="documentStream">Stream to fetch <see cref="NewsItem"/> from</param>
/// <returns>List of <see cref="NewsItem"/>; empty when the page has no articles</returns>
public override IEnumerable<NewsItem> GetNewsItemsFromStream(Stream documentStream)
{
    if (documentStream == null)
    {
        // nameof keeps the parameter name refactor-safe.
        throw new ArgumentNullException(nameof(documentStream));
    }
    HtmlDocument doc = new HtmlDocument();
    doc.Load(documentStream, Encoding.Default);
    HtmlNodeCollection allArticleContainers = doc.DocumentNode.SelectNodes("/descendant::article");
    // BUG FIX: SelectNodes returns null when no <article> exists, which used to
    // throw NullReferenceException on the Select below.
    if (allArticleContainers == null)
    {
        return Enumerable.Empty<NewsItem>();
    }
    List<HtmlNode> articleLinks = allArticleContainers.Select(d => d.SelectSingleNode("a")).ToList();
    // Some articles have no direct <a> child; remove those null entries.
    articleLinks.RemoveAll(item => item == null);
    IEnumerable<NewsItem> items = articleLinks.Select(article => new NewsItem(
        article.GetAttributeValue("title", null),
        article.GetAttributeValue("href", null),
        // Some articles don't contain a paragraph; pass null for those.
        (article.SelectSingleNode("div/p") != null) ? article.SelectSingleNode("div/p").InnerHtml : null
    ));
    return items;
}
/// <summary>
/// Checks whether a list-extraction XPath pattern can plausibly apply to a page
/// (only rules out obviously impossible cases). Copied from ListStrategy along
/// with the overload it delegates to.
/// </summary>
/// <param name="Url">Page URL; used for site-specific score thresholds.</param>
/// <param name="HTML">Raw page HTML.</param>
/// <param name="XPath">Pattern holding the item-root and title XPaths.</param>
/// <returns>true when the pattern might match this page.</returns>
public bool ValidateListXPath(string Url, string HTML, XpathPattern XPath)
{
    // Get a safe root node (some pages omit <html> and start straight at <head>).
    HtmlNode rootNode = HtmlUtility.getSafeHtmlRootNode(HTML, true, true);
    if (rootNode == null)
    {
        return (false);
    }
    HtmlNodeCollection rootNodes = rootNode.SelectNodes(XPath.ItemRootXPath);
    if (rootNodes == null)
    {
        return (false);
    }
    var TitleNode = rootNodes.Select(f => f.SelectSingleNode(XPath.TitleXPath)).Where(f => f != null);
    if (TitleNode == null || TitleNode.Count() == 0 || (TitleNode.Count() == 1 && TitleNode.FirstOrDefault() == null))
    {
        return (false);
    }
    // The first extracted element can still be null in some cases; filter again.
    TitleNode = TitleNode.Where(f => f != null);
    List <HtmlNode> TitleNodes = TitleNode.Where(a => !string.IsNullOrEmpty(a.InnerText)).ToList();
    double Score = ScoreforListTitle(TitleNodes);
    // tieba.baidu.com pages get a looser acceptance threshold (score > 100);
    // the final check delegates to the node-based overload of this method.
    return ((Score > Threshold.LeastTitleScore || (Url.Contains("tieba.baidu.com") && Score > 100)) && ValidateListXPath(Url, rootNode, XPath));
}
// Detects WordPress-built sites: first via a regex over the raw content,
// then by looking for any anchor that links to wordpress.org.
protected override ProcessorResult ProcessPage(CrawlContext crawlContext, CrawledPage crawledPage)
{
    ProcessorResult result = new ProcessorResult { UniqueAttributeId = 222 };

    Match regexResult = wordPressPattern.Match(crawledPage.RawContent);
    if (regexResult.Success)
    {
        result.Attributes.Add("siteBuilder", "BlogWordPress");
        result.IsAHit = true;
        return result;
    }

    // SelectNodes returns null when no anchors exist; substitute an empty collection.
    HtmlNodeCollection anchors = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//a[@href]")
                                 ?? new HtmlNodeCollection(null);
    bool linksToWordPress = anchors
        .Select(node => node.GetAttributeValue("href", ""))
        .Any(href => href.Contains("wordpress.org"));
    if (linksToWordPress)
    {
        result.Attributes.Add("siteBuilder", "BlogWordPress");
        result.IsAHit = true;
    }
    return result;
}
// Loads the site index and yields the href of every node matched by pageUrlPath.
// Returns an empty sequence when nothing matches (the original dereferenced the
// null SelectNodes result and threw NullReferenceException).
private IEnumerable<string> GetPageUrls()
{
    HtmlDocument doc = _webClient.Load(siteIndex);
    HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes(pageUrlPath);
    if (nodes == null)
    {
        return Enumerable.Empty<string>();
    }
    return nodes.Select(node => node.Attributes["href"].Value);
}
// Crawls every result page of the current category and exports the parsed
// courses to an Excel file under CrawlerFile\.
// NOTE(review): the loop stops after page 3 ("if (i == 3) break") — this looks
// like a leftover testing limit; confirm whether it should be removed.
public void CrawlerAll()
{
    try
    {
        if (string.IsNullOrWhiteSpace(category.Url))
        {
            Console.WriteLine($"分类的链接为空{category.Name}{category.CategoryLevel}");
        }
        else
        {
            string html = HttpHelper.DownLoad(category.Url, Encoding.UTF8);
            // Parse the first page.
            HtmlDocument document = new HtmlDocument();
            document.LoadHtml(html);
            // Page count = largest number among the pager buttons.
            // NOTE(review): pageNodes is not null-checked; a page without a pager
            // throws here and is swallowed by the blanket catch below.
            string pageXPath = "/html/body/section[1]/div/div[6]/a[@class='page-btn']";
            HtmlNodeCollection pageNodes = document.DocumentNode.SelectNodes(pageXPath);
            int pageCount = pageNodes.Select(a => Convert.ToInt32(a.InnerText)).OrderByDescending(a => a).FirstOrDefault();
            List <CourseEntity> courses = new List <CourseEntity>();
            for (int i = 1; i <= pageCount; i++)
            {
                string pageUrl = $"{category.Url}&page={i}";
                html = HttpHelper.DownLoad(pageUrl, Encoding.UTF8);
                // Parse this result page.
                HtmlDocument pageDocument = new HtmlDocument();
                pageDocument.LoadHtml(html);
                string liXPath = "/html/body/section[1]/div/div[4]/ul/li";
                HtmlNodeCollection pageLiNodeList = pageDocument.DocumentNode.SelectNodes(liXPath);
                foreach (var liNode in pageLiNodeList)
                {
                    var course = GetOneCourse(liNode);
                    courses.Add(course);
                }
                if (i == 3)
                {
                    break;
                }
            }
            // Export everything collected so far to a timestamped Excel file.
            string direct = $"{System.AppDomain.CurrentDomain.BaseDirectory}CrawlerFile\\";;
            string sheetName = "course";
            string Name = $"{sheetName}{DateTime.Now.ToString("yyyyMMddHHmmss")}.xls";
            if (!Directory.Exists(direct))
            {
                Directory.CreateDirectory(direct);
            }
            ExcelHelper eh = new ExcelHelper(direct + Name);
            DataTable dtCourses = ListToDataTable.ToDataTable <CourseEntity>(courses);
            eh.DataTableToExcel(dtCourses, sheetName, true);
        }
    }
    catch (Exception)
    {
        // Best-effort: any failure is reported to the console and swallowed.
        Console.WriteLine("Crawler抓取异常");
    }
}
/// <summary>
/// Concatenates the inner text of every node in the collection and trims the result.
/// Returns null when the collection itself is null.
/// </summary>
public static string TextAll(this HtmlNodeCollection htmlNodeCollection)
{
    if (htmlNodeCollection is null)
    {
        return null;
    }
    // BUG FIX: the original joined the node objects themselves (Select(e => e)),
    // which concatenates each node's ToString() representation rather than its
    // text content — contradicting the method's name. Join the InnerText instead.
    return string.Join("", htmlNodeCollection.Select(e => e.InnerText)).Trim();
}
// Downloads and parses one search-result page, storing the extracted links.
// When the navigation menu or the result table is missing (likely throttling or
// a captcha wall), it pauses 5–10 s and retries by calling itself.
// NOTE(review): the retry recursion is unbounded — a permanently blocked page
// would recurse until stack overflow; also .Result blocks the calling thread.
protected void parsePage(SearchPage page)
{
    if (isHaveParsed(page))
    {
        return;
    }
    HttpClient client;
    HtmlDocument document = new HtmlDocument();
    string body = default(string);
    bool hasNavigationMenu;
    using (client = getNewHttpClient())
    {
        // Synchronous wait on the async download (see NOTE above).
        body = client.GetStringAsync(URL + page.CURRENT_PAGE).Result;
    }
    if (string.IsNullOrWhiteSpace(body))
    {
        throw new Exception("Empty body");
    }
    document.LoadHtml(body);
    hasNavigationMenu = checkNavigationMenu(document);
    if (!hasNavigationMenu)
    {
        // No navigation menu — wait and retry the same page.
        Tools._pause(5000, 10000);
        parsePage(page);
        return;
    }
    // The page may have been parsed elsewhere while we were downloading.
    if (isHaveParsed(page))
    {
        return;
    }
    updateLastPagePosition(document);
    HtmlNodeCollection nodes = document.DocumentNode.SelectNodes(Node.TABLE_SEARCH_RESULT);
    if (nodes == null)
    {
        // No data in body (maybe a captcha page) — wait and retry.
        Tools._pause(5000, 10000);
        parsePage(page);
        return;
    }
    // Keep only real hrefs; "javascript:void(0)" entries are navigation stubs.
    links = nodes
            .Select(a => a.GetAttributeValue("href", String.Empty))
            .Where(link => !string.IsNullOrWhiteSpace(link) && link != "javascript:void(0)")
            .ToList();
}
// Searches Google for the term, finds the 1-based positions (within the top 100
// results) whose entry HTML mentions the given url, persists the search, and
// returns the comma-separated position list ("" when no result nodes were found).
public async Task<string> GetGoogleResultsAsync(string searchTerm, string url)
{
    Search search = new Search
    {
        Query = searchTerm,
        Date = DateTime.Now
    };
    Result result = new Result()
    {
        UrlAnalyzed = url,
    };
    int maxResultPosition = 100;
    searchTerm = searchTerm.Replace(' ', '+');
    // Reduce e.g. "www.example.com" to the bare site name "example".
    if (url.Contains("www"))
    {
        url = url.Replace("www.", "").Split('.')[0];
    }
    string searchUrl = $"http://www.google.co.uk/search?num={maxResultPosition}&q={searchTerm}";
    // need to process to get the real URL of the question.
    HtmlDocument response = new HtmlWeb().Load(searchUrl);
    HtmlNodeCollection nodes = response.DocumentNode.SelectNodes("//div[contains(@class, 'BNeawe UPmit AP7Wnd')]");
    if (nodes == null)
    {
        return "";
    }
    // 1-based positions of result entries whose HTML mentions the url.
    IEnumerable<int> indexes = nodes
        .Select((node, i) => new { Position = i + 1, node.InnerHtml })
        .Where(x => x.InnerHtml.Contains(url))
        .Select(x => x.Position);
    string stringResult = string.Join(",", indexes);
    result.Position = stringResult;
    search.Result = result;
    _context.Searches.Add(search);
    _context.SaveChanges();
    return stringResult;
}
// Returns the article URLs listed on a PTT board index page.
// Returns an empty list (instead of throwing) when no title links are present.
public static List<string> GetListUrls(string pageUrl)
{
    var web = new HtmlWeb();
    HtmlDocument doc = web.Load(pageUrl);
    HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("//div[@class='r-list-container action-bar-margin bbs-screen']/div[@class='r-ent']/div[@class='title']/a");
    // BUG FIX: SelectNodes returns null when nothing matches (e.g. a page where
    // every post was deleted), which used to cause a NullReferenceException.
    if (nodes == null)
    {
        return new List<string>();
    }
    return nodes.Select(n => n.Attributes["href"].Value).ToList();
}
// Returns the text of every paragraph inside the article's main content,
// or null when there are none.
public static List<string> GetArticleText(HtmlNode parentNode)
{
    HtmlNode mainContent = GetMainContent(parentNode);
    // BUG FIX: "//p" is an absolute XPath and selects every <p> in the WHOLE
    // document even when called on a sub-node, defeating the purpose of
    // GetMainContent; ".//p" restricts the search to the main-content subtree.
    HtmlNodeCollection paragraphs = mainContent.SelectNodes(".//p");
    if (paragraphs == null)
    {
        return null;
    }
    return paragraphs.Select(n => n.InnerText).ToList();
}
/// <summary>
/// Downloads the news page and maps each "post text" div to a News item via ParseNode.
/// </summary>
public async Task<IEnumerable<News>> GetNewsAsync()
{
    var markup = await GetHtmlAsync(NewsUrl);
    var doc = new HtmlDocument();
    doc.LoadHtml(markup);
    var pageBody = doc.DocumentNode.SelectSingleNode("//body");
    var posts = pageBody.SelectNodes("//div[@class='post text']");
    return posts.Select(ParseNode);
}
// Maps each city node to a CityData holding its display name and absolute
// lun.ua URL. Returns null when the input collection is null.
private IEnumerable<CityData> CreateCityData(HtmlNodeCollection cityHtml)
{
    if (cityHtml == null)
    {
        return null;
    }
    return cityHtml.Select(node => new CityData()
    {
        Name = _htmlParser.ParseText(node),
        Url = _pageHandler.CreateLunUaUrl(_htmlParser.ParseHref(node))
    });
}
// Maps each node to an ApartComplexesGroupData holding the city name and its
// absolute dom.ria URL. Returns null when the input collection is null.
private IEnumerable<ApartComplexesGroupData> CreateApartComplexData(HtmlNodeCollection cityHtml)
{
    if (cityHtml == null)
    {
        return null;
    }
    return cityHtml.Select(node => new ApartComplexesGroupData()
    {
        CityName = _htmlParser.ParseText(node),
        Url = _pageHandler.CreateDomRiaUrl(_htmlParser.ParseHref(node))
    });
}
// Reads the odd value from the node's VALUE sub-path; returns 0 when the node
// is absent or its text is not a valid decimal.
protected virtual decimal GetValue(HtmlNode oddNode)
{
    decimal parsedValue = 0;
    HtmlNodeCollection valueNodes = oddNode.SelectNodes(OddXPaths.VALUE);
    if (valueNodes != null)
    {
        // A non-null SelectNodes result always contains at least one node.
        string rawValue = valueNodes[0].InnerText;
        decimal.TryParse(rawValue, out parsedValue);
    }
    return parsedValue;
}
// Returns the value of the named attribute on the FIRST node of the collection,
// or string.Empty when that node lacks the attribute. Later nodes are never
// consulted: FirstOrDefault stops at the first projected element even when it
// is null — this quirk is preserved deliberately.
private string SafeGetAttributeValue(HtmlNodeCollection collection, string name)
{
    var firstNodeAttr = collection
        .Select(n => n.Attributes.FirstOrDefault(a => a.Name == name))
        .FirstOrDefault();
    return firstNodeAttr == null ? string.Empty : firstNodeAttr.Value;
}