/// <summary>
/// Partitions a node collection by position into two lazily evaluated sequences.
/// </summary>
/// <param name="nodes">Nodes to partition.</param>
/// <returns>
/// A tuple whose first item enumerates the nodes at even indexes (0, 2, 4, ...)
/// and whose second item enumerates the nodes at odd indexes (1, 3, 5, ...).
/// </returns>
private (IEnumerable<HtmlNode>, IEnumerable<HtmlNode>) splitList(HtmlNodeCollection nodes)
{
    // Both filters are deferred: nothing is enumerated until the caller iterates.
    IEnumerable<HtmlNode> evenPositioned = nodes.Where((node, position) => position % 2 == 0);
    IEnumerable<HtmlNode> oddPositioned = nodes.Where((node, position) => position % 2 != 0);

    return (evenPositioned, oddPositioned);
}
/// <summary>
/// Downloads the page at <c>Constants.plzenskyDvur</c>, parses it into one
/// <c>DayMenu</c> per day block and hands the result to <c>SaveRestaurant</c>.
/// </summary>
private void LoadPlzenskyDvur()
{
    HtmlNode root = Utils.GetHtmlDoc(Constants.plzenskyDvur).DocumentNode;
    HtmlNode weekNode = root.SelectSingleNode("//div[@class='listek']/div[@class='tyden']");
    HtmlNodeCollection priceCells = root.SelectNodes("//div[@class='listek']/div[@class='tyden_ceny']//td");

    // Builds one day's menu from its heading node and its body node.
    DayMenu ParseDay(HtmlNode heading, HtmlNode body)
    {
        // The date is the second space-separated token of the heading text.
        string[] headingTokens = heading.InnerText.Split(' ');
        DateTime date = Utils.ParseDateTime(headingTokens[1]);

        HtmlNodeCollection paragraphs = body.SelectNodes("./p");
        string soup = paragraphs[0].InnerText;

        // Dishes are the even-indexed paragraphs after the first one. Each dish is
        // paired positionally with the shared price cells; the price is the part
        // after the '-' in the cell text.
        IEnumerable<HtmlNode> dishParagraphs =
            paragraphs.Where((_, index) => index > 0 && index % 2 == 0);

        Food[] foods = dishParagraphs
            .Zip(priceCells, (HtmlNode food, HtmlNode price) =>
            {
                string dishName = HtmlEntity.DeEntitize(food.InnerText.Trim());
                string priceText = price.InnerText.Split('-')[1].Trim();
                return new Food(dishName, Utils.ParsePrice(priceText, ' '));
            })
            .ToArray();

        return new DayMenu(date, soup, foods);
    }

    HtmlNodeCollection dayHeadings = weekNode.SelectNodes("./p[@class='title']");
    HtmlNodeCollection dayBodies = weekNode.SelectNodes("./div[@class='text']");

    // The first heading/body pair is dropped before the days are paired up.
    // NOTE(review): presumably a non-day intro block - confirm against the page.
    dayHeadings.Remove(0);
    dayBodies.Remove(0);

    DayMenu[] dayMenus = dayHeadings.Zip(dayBodies, ParseDay).ToArray();

    SaveRestaurant(GetRestaurantName(root), dayMenus, Restaurants.PlzenskyDvur);
}
/// <summary>
/// Returns the label nodes related to the nodes matched by an XPath expression.
/// </summary>
/// <param name="root">Root node from where the XPath expression will be evaluated.</param>
/// <param name="xpath">XPath expression.</param>
/// <returns>
/// Dictionary where the key is the main field node and the value is the array of
/// <c>label</c> nodes whose <c>for</c> attribute equals the field's <c>id</c>.
/// The value is <c>null</c> when the field has no <c>id</c> or the document has no
/// labels at all. The dictionary is empty when <paramref name="root"/> is
/// <c>null</c> or the expression matches nothing.
/// </returns>
public Dictionary<HtmlNode, HtmlNode[]> GetRelatedLabels(HtmlNode root, string xpath)
{
    var results = new Dictionary<HtmlNode, HtmlNode[]>();
    if (root == null)
    {
        return results;
    }

    // SelectNodes returns null (not an empty collection) when nothing matches.
    HtmlNodeCollection fields = root.SelectNodes(xpath);
    if (fields == null)
    {
        return results;
    }

    // The label list does not depend on the field, so it is queried at most once,
    // and only when the first field with an id is encountered.
    HtmlNodeCollection labels = null;
    bool labelsLoaded = false;

    foreach (HtmlNode node in fields)
    {
        string id = node.GetAttributeValue("id", "");
        if (string.IsNullOrEmpty(id))
        {
            results.Add(node, null);
            continue;
        }

        if (!labelsLoaded)
        {
            labels = this.HtmlDoc.DocumentNode.SelectNodes("//label");
            labelsLoaded = true;
        }

        results.Add(
            node,
            labels == null
                ? null
                : labels.Where(x => x.GetAttributeValue("for", "").Equals(id)).ToArray());
    }

    return results;
}
/// <summary>
/// Records the XPath of every entry in <c>documentNodes</c> that has a parent node
/// and non-blank inner text, appending each to <c>parsedLines</c>.
/// </summary>
private void GhettoParse()
{
    // Background: https://stackoverflow.com/questions/37320624/htmlagilitypack-how-to-extract-html-between-some-tag
    // XPath queries that were tried for slicing the document by <h2> sections:
    //   "//node()[preceding-sibling::h2 or self::h2][following-sibling::h2 or self::h2]"
    //       grabs all h2 and the nodes in between, but not the content of the last h2 ("Notes:")
    //   "//node()[preceding-sibling::h2][following-sibling::h2]"
    //       skips the first h2 ("setup summary") and the last h2 ("Notes:")
    //   "//node()[preceding-sibling::h2]"
    //       skips the first h2 ("setup summary") but keeps the last h2 ("Notes:")
    //
    // Planned interpretation of each recorded line (not implemented yet):
    //   h2   -> found area: identify it, get its id, find the Template id
    //   text -> found property
    //   u    -> found value
    foreach (var node in documentNodes)
    {
        if (!(node.ParentNode is HtmlNode))
        {
            continue;
        }

        if (string.IsNullOrEmpty(node.InnerText.Trim()))
        {
            continue;
        }

        parsedLines.Add(node.XPath);
    }
}
//Updated July 15th
/// <summary>
/// Downloads the show list page, and inserts one <c>TvShow</c> document per
/// anchor into the configured Mongo show collection. Anchors whose inner HTML
/// contains any of the comma-separated <c>show_innerhtml_excl</c> fragments,
/// or that carry no attribute, are skipped. Any failure is written to the log
/// file instead of being propagated.
/// </summary>
public static void WriteShows()
{
    try
    {
        HtmlNodeCollection collection = DownloadMgmtUtils.GetHtmlNodeCollection(
            ConfigurationManager.AppSettings["show_url"] + ConfigurationManager.AppSettings["show_url_list_postfix"],
            "//a");

        // The helper can yield null (RefreshEpisodes guards against it too);
        // report that explicitly rather than through a NullReferenceException.
        if (collection == null)
        {
            Log.AppendToLog("Error : FATAL Write Show Issue : no links were returned for the show list",
                ConfigurationManager.AppSettings["log_file"]);
            return;
        }

        MongoCollection mongoCollection = MongoUtils.GetMongoCollection
        (
            @"mongodb://" + ConfigurationManager.AppSettings["mongoHost"] + @"/",
            ConfigurationManager.AppSettings["port"],
            ConfigurationManager.AppSettings["db"],
            ConfigurationManager.AppSettings["show_collection"]
        );

        // Split the exclusion list once instead of once per link.
        string[] excludedFragments = ConfigurationManager.AppSettings["show_innerhtml_excl"].Split(',');

        foreach (HtmlNode link in collection)
        {
            if (excludedFragments.Any(s => link.InnerHtml.Contains(s)))
            {
                continue;
            }

            // The first attribute is used as the show path; an anchor without any
            // attribute used to abort the whole import with an out-of-range error.
            if (link.Attributes.Count == 0 || link.Attributes[0].Value == null)
            {
                continue;
            }

            mongoCollection.Insert(new TvShow()
            {
                Name = link.InnerHtml,
                Path = link.Attributes[0].Value
            });
        }
    }
    catch (Exception e)
    {
        Log.AppendToLog("Error : FATAL Write Show Issue : " + e, ConfigurationManager.AppSettings["log_file"]);
    }
}
/// <summary>
/// Loads a mailing-list index page and collects the target of every
/// "[ Thread ]" link found directly inside a table cell.
/// </summary>
/// <param name="mailingListURL">URL of the index page; also used as the prefix of every returned URL.</param>
/// <returns>
/// The full URLs (<paramref name="mailingListURL"/> + link href) in document order;
/// empty when the page has no matching table cells. Each URL is also written to the console.
/// </returns>
private List<string> FetchMailingListMonths(string mailingListURL)
{
    List<string> urls = new List<string>();
    HtmlWeb web = new HtmlWeb();
    HtmlDocument document = web.Load(mailingListURL);

    // SelectNodes returns null when nothing matches.
    HtmlNodeCollection cells = document.DocumentNode.SelectNodes("//table/tr/td");
    if (cells == null)
    {
        return urls;
    }

    foreach (HtmlNode cell in cells)
    {
        HtmlNode threadNode = cell.ChildNodes.FirstOrDefault(x => x.Name == "a" && x.InnerText == "[ Thread ]");
        if (threadNode == null)
        {
            continue;
        }

        // A thread anchor without an href carries no target; skip it.
        string href = threadNode.GetAttributeValue("href", null);
        if (href == null)
        {
            continue;
        }

        // construct full url
        string fullUrl = mailingListURL + href;
        Console.WriteLine(fullUrl);
        urls.Add(fullUrl);
    }

    return urls;
}
/// <summary>
/// Searches ISIN records by the item's Korean name and, among the rows whose
/// fourth cell reads "예탁증서" (Korean for "depositary receipt"), looks for the
/// one whose ISIN resolves to the item's ticker.
/// </summary>
/// <param name="item">Equity to update; on success its <c>ISIN</c> is set and its <c>Type</c> becomes "KDR".</param>
/// <returns><c>true</c> when a matching record was found and the item was updated; otherwise <c>false</c>.</returns>
public static bool UpdateKDRISINReport(KoreaEquityInfo item)
{
    HtmlNodeCollection records = SearchISIN(item.KoreaName, false, false);
    if (records == null)
    {
        return false;
    }

    foreach (HtmlNode record in records)
    {
        // Rows without a fourth or second cell (e.g. header or spacer rows)
        // are skipped instead of crashing the scan.
        HtmlNode typeCell = record.SelectSingleNode(".//td[4]");
        if (typeCell == null || !typeCell.InnerText.Trim().Equals("예탁증서"))
        {
            continue;
        }

        HtmlNode isinCell = record.SelectSingleNode(".//td[2]");
        if (isinCell == null)
        {
            continue;
        }

        string isin = isinCell.InnerText.Trim();
        string ticker = GetTickerByISIN(isin, 2);
        if (item.Ticker.Equals(ticker))
        {
            item.ISIN = isin;
            item.Type = "KDR";
            return true;
        }
    }

    return false;
}
/// <summary>
/// Lazily yields the offer URLs found inside the listing container
/// (<c>div#opbox-listing--base</c>) of the current page.
/// </summary>
/// <returns>
/// The <c>href</c> of every anchor in the container whose value contains both
/// "/oferta/" and "http", in document order.
/// </returns>
/// <exception cref="OfferListControllerException">
/// The listing container is missing. Because this is an iterator, the exception
/// surfaces when enumeration starts, not when the method is called.
/// </exception>
public IEnumerable<string> GetAllOfferLinks()
{
    string pageSource = CurrentSiteSource(pageId);
    HtmlDocument document = new HtmlDocument();
    document.LoadHtml(pageSource);

    HtmlNode offerContainer = document.DocumentNode.SelectSingleNode("//div[@id=\"opbox-listing--base\"]");
    if (offerContainer == null)
    {
        throw new OfferListControllerException("Allegro layout has been changed", this);
    }

    // ".//a" is relative to the container. A leading "//" is an absolute path and
    // would select every anchor of the document, whichever node it is called on.
    HtmlNodeCollection anchors = offerContainer.SelectNodes(".//a");

    // SelectNodes returns null when the container holds no anchors.
    List<HtmlNode> offerLinksNode = anchors == null
        ? new List<HtmlNode>()
        : anchors.Where(x => x.HasAttributes).ToList();
    logger.Info("offerl links: " + offerLinksNode.Count);

    foreach (HtmlNode offerLink in offerLinksNode)
    {
        string href = offerLink.GetAttributeValue("href", null);
        if (href != null && href.Contains("/oferta/") && href.Contains("http"))
        {
            yield return href;
        }
    }
}
/// <summary>
/// Verifies that every placeholder-capable field of the document declares a
/// <c>placeholder</c> attribute. The fields checked are all <c>input</c> elements
/// except those of type radio, checkbox, reset or submit, plus all <c>textarea</c> elements.
/// </summary>
/// <returns>
/// The list of problems found (empty when none). An exception raised during
/// the check is reported as an "EXCEPTION: ..." entry rather than thrown.
/// </returns>
private List<string> CheckPlaceholders()
{
    var errors = new List<string>();

    try
    {
        HtmlNode documentNode = this.HtmlDoc.DocumentNode;

        HtmlNodeCollection inputNodes = documentNode.SelectNodes("//input");
        if (inputNodes == null)
        {
            errors.Add("Unable to find any placeholder.");
            return errors;
        }

        // Input types for which a placeholder is not expected.
        string[] typesWithoutPlaceholder = { "radio", "checkbox", "reset", "submit" };

        var fields = new List<HtmlNode>();
        foreach (HtmlNode input in inputNodes)
        {
            if (!typesWithoutPlaceholder.Contains(input.GetAttributeValue("type", "")))
            {
                fields.Add(input);
            }
        }

        HtmlNodeCollection textareas = documentNode.SelectNodes("//textarea");
        if (textareas != null)
        {
            fields.AddRange(textareas);
        }

        bool someFieldLacksPlaceholder =
            fields.Any(field => !field.Attributes.Any(attribute => attribute.Name == "placeholder"));
        if (someFieldLacksPlaceholder)
        {
            errors.Add("Some fields does not have any defined placeholder.");
        }
    }
    catch (Exception e)
    {
        errors.Add(string.Format("EXCEPTION: {0}", e.Message));
    }

    return errors;
}
/// <summary>
/// Validates the <c>select</c> fields of the document: at least one must exist
/// (its labels are then checked through <c>CheckLabels</c>), there must be at
/// least three <c>option</c> elements overall, and exactly one of them must
/// carry the <c>selected</c> attribute.
/// </summary>
/// <returns>
/// The list of problems found (empty when none). An exception raised during
/// the check is reported as an "EXCEPTION: ..." entry rather than thrown.
/// </returns>
private List<string> CheckSelectFields()
{
    var errors = new List<string>();

    try
    {
        HtmlNodeCollection selects = this.HtmlDoc.DocumentNode.SelectNodes("//select");
        bool hasSelect = selects != null && selects.Count >= 1;
        if (hasSelect)
        {
            errors.AddRange(CheckLabels(selects.ToList(), "select"));
        }
        else
        {
            errors.Add("Does not contains enough select fields.");
        }

        HtmlNodeCollection options = this.HtmlDoc.DocumentNode.SelectNodes("//select/option");
        if (options == null || options.Count < 3)
        {
            errors.Add("The select field does not contains enough options.");
        }
        else
        {
            // Count the options flagged as the default choice.
            int defaultOptions = 0;
            foreach (HtmlNode option in options)
            {
                if (option.Attributes.Any(attribute => attribute.Name == "selected"))
                {
                    defaultOptions++;
                }
            }

            if (defaultOptions != 1)
            {
                errors.Add("The select field does not have a single default option.");
            }
        }
    }
    catch (Exception e)
    {
        errors.Add(string.Format("EXCEPTION: {0}", e.Message));
    }

    return errors;
}
/// <summary>
/// Walks every configured category page ("category*url" entries of
/// <c>docUrls</c>), finds its PDF links and passes each one dated on or after
/// <c>dtStartFrom</c> to <c>ExtractADoc</c>.
/// </summary>
/// <remarks>
/// When the page has a "Council Packet" element, only PDF links inside its
/// enclosing table are used; otherwise the links of the document-center block
/// are used. Links that wrap an image are ignored. The meeting date is read
/// from the link text, either as M/d/yyyy or as eight digits (MMddyyyy).
/// </remarks>
public void DownloadCouncilPdfFiles()
{
    var docs = this.LoadDocumentsDoneSQL();
    var queries = this.LoadQueriesDoneSQL();
    Regex dateReg = new Regex("(([0-9]{1,2}\\/[0-9]{1,2}\\/[0-9]{4})|((0|1)[0-9]{1}[0-9]{2}[0-9]{4}))");

    // WebClient is disposable; release it once all categories are processed.
    using (WebClient c = new WebClient())
    {
        foreach (string url in this.docUrls)
        {
            string[] urlParts = url.Split('*');
            string category = urlParts[0];
            string categoryUrl = urlParts[1];

            string html = this.GetHtml(categoryUrl, string.Empty);
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(html);

            HtmlNode councilPacketNode = doc.DocumentNode.SelectSingleNode("//*[text()='Council Packet']");
            HtmlNodeCollection fileNodes = null;

            if (councilPacketNode != null)
            {
                // The packet label may not sit inside a table; then there is nothing to scan.
                HtmlNode packetTable = councilPacketNode.Ancestors().FirstOrDefault(t => t.OriginalName == "table");
                if (packetTable != null)
                {
                    fileNodes = packetTable.SelectNodes(".//div[@id='file_name']//a[contains(@href,'.pdf')]");
                }
            }
            else
            {
                fileNodes = doc.DocumentNode.SelectNodes("//div[@id='RZdocument_center']//a[contains(@href,'.pdf')]");
            }

            if (fileNodes == null)
            {
                continue;
            }

            foreach (HtmlNode fileNode in fileNodes.Where(t => t.SelectSingleNode("./img") == null))
            {
                string fileUrl = fileNode.Attributes["href"].Value;
                fileUrl = !fileUrl.StartsWith("http") ? this.cityEntity.CityUrl + fileUrl : fileUrl;

                Match dateMatch = dateReg.Match(fileNode.InnerText);
                string meetingDateText = dateMatch.ToString();
                Console.WriteLine("DEBUG: {0}", fileUrl);
                Console.WriteLine("DEBUG: {0}", fileNode.OuterHtml);
                Console.WriteLine("DEBUG: meeting date - {0}...", meetingDateText);

                // Without a recognizable date the document cannot be placed in time.
                if (!dateMatch.Success)
                {
                    Console.WriteLine("No meeting date found, skip...");
                    continue;
                }

                // Decide the format by the presence of '/' rather than by length:
                // a slashed date such as "1/5/2021" is also eight characters long.
                DateTime meetingDate = meetingDateText.IndexOf('/') < 0
                    ? DateTime.ParseExact(meetingDateText, "MMddyyyy", null)
                    : DateTime.Parse(meetingDateText);

                if (meetingDate < this.dtStartFrom)
                {
                    Console.WriteLine("Early, skip...");
                    continue;
                }

                this.ExtractADoc(c, fileUrl, category, "pdf", meetingDate, ref docs, ref queries);
            }
        }
    }
}
/// <summary>
/// Walks every configured category page ("category*url" entries of
/// <c>docUrls</c>) and, for each year from <c>dtStartFrom</c> to the current
/// year, passes the table entries whose link starts with that year and whose
/// meeting date is not before <c>dtStartFrom</c> to <c>ExtractADoc</c>.
/// </summary>
/// <remarks>
/// The meeting date is built from the cell text before the first '(' plus the
/// year taken from the link. Document URLs are resolved against the directory
/// of the category URL.
/// </remarks>
public void DownloadCouncilPdfFiles()
{
    List<Documents> docs = this.LoadDocumentsDoneSQL();
    List<QueryResult> queries = this.LoadQueriesDoneSQL();
    HtmlWeb web = new HtmlWeb();

    // WebClient is disposable; release it once all categories are processed.
    using (WebClient c = new WebClient())
    {
        foreach (string url in this.docUrls)
        {
            string[] urlParts = url.Split('*');
            string category = urlParts[0];
            string categoryUrl = urlParts[1];

            // Everything up to and including the last '/'. Replacing the last path
            // segment textually would throw for a URL ending in '/' (empty segment)
            // and would also strip earlier occurrences of the same segment.
            string baseUrl = categoryUrl.Substring(0, categoryUrl.LastIndexOf('/') + 1);

            HtmlDocument doc = web.Load(categoryUrl);
            HtmlNodeCollection docNodes = doc.DocumentNode.SelectNodes("//div[@class='center_body_text center_scroller']//table//tr/td");
            if (docNodes == null)
            {
                continue;
            }

            for (int i = this.dtStartFrom.Year; i <= DateTime.Now.Year; i++)
            {
                string yearPrefix = i.ToString();

                foreach (HtmlNode entryNode in docNodes)
                {
                    // Use the same href-bearing anchor for filtering and for the URL,
                    // so an earlier anchor without href cannot cause a null dereference.
                    HtmlNode entryUrlNode = entryNode.SelectSingleNode("./a[@href]");
                    if (entryUrlNode == null ||
                        !entryUrlNode.Attributes["href"].Value.StartsWith(yearPrefix, StringComparison.Ordinal))
                    {
                        continue;
                    }

                    string meetingDateText = string.Format(
                        "{0}, {1}",
                        HttpUtility.HtmlDecode(entryNode.InnerText.Replace("\n", string.Empty).Split('(').FirstOrDefault()),
                        i);
#if debug
                    try
                    {
                        DateTime.Parse(meetingDateText);
                        Console.WriteLine("No problem...");
                        continue;
                    }
                    catch
                    {
                        Console.WriteLine("Not match: {0} on {1}...", meetingDateText, categoryUrl);
                        continue;
                    }
#endif
                    DateTime meetingDate = DateTime.Parse(meetingDateText);
                    if (meetingDate < this.dtStartFrom)
                    {
                        Console.WriteLine("Too early, skip...");
                        continue;
                    }

                    string docUrl = string.Format("{0}{1}", baseUrl, entryUrlNode.Attributes["href"].Value);
                    this.ExtractADoc(c, docUrl, category, "pdf", meetingDate, ref docs, ref queries);
                }
            }
        }
    }
}
// Returns a list of HtmlNodes representing every <a> tag linking to a Module.
// Used to determine all modules the user has access to and populate their content.
// The list is empty when the page contains no links at all.
public static List<HtmlNode> GetModuleLinks(string pageSource)
{
    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(pageSource);

    // SelectNodes returns null (not an empty collection) when nothing matches.
    HtmlNodeCollection allLinks = doc.DocumentNode.SelectNodes("//a[@href]");
    if (allLinks == null)
    {
        return new List<HtmlNode>();
    }

    // Module links have "type=Course" in the href tag
    return allLinks
        .Where(item => item.GetAttributeValue("href", string.Empty).Contains("type=Course"))
        .ToList();
}
/// <summary>
/// Fetches a page and returns the URL of its preview image.
/// </summary>
/// <param name="url">Page URL. Mobile Twitter and Tumblr image URLs are rewritten before fetching.</param>
/// <returns>
/// For Reddit gallery pages: the first <c>img</c> source starting with
/// "https://preview.redd.it/", or <c>null</c> when there is none.
/// Otherwise: the <c>content</c> of the first <c>og:image</c> / <c>twitter:image</c>
/// meta tag, preferring .jpg, .jpg?play, .jpeg and .png over anything else,
/// with a trailing "?play" removed; an empty string when no such tag exists.
/// </returns>
public static string GetOg(string url)
{
    //Handle edge cases
    if (url.Contains("mobile.twitter.com"))
    {
        url = url.Replace("mobile.", String.Empty);
    }

    if (url.Contains("tumblr.com/image/"))
    {
        url = url.Replace("/image/", "/post/");
    }

    string html = FetchHtml(url);
    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);

    if (url.Contains("reddit.com/gallery/"))
    {
        // SelectNodes returns null when the page has no <img> at all.
        HtmlNodeCollection images = doc.DocumentNode.SelectNodes("//img");
        if (images == null)
        {
            return null;
        }

        return images
            .Select(x => x.Attributes["src"]?.Value)
            .FirstOrDefault(src => src != null && src.StartsWith("https://preview.redd.it/", StringComparison.Ordinal));
    }

    HtmlNodeCollection list = doc.DocumentNode.SelectNodes("//meta");
    if (list == null)
    {
        return string.Empty;
    }

    string resultUrl = "";
    try
    {
        // Tags without a content attribute carry no URL and are ignored, so one
        // malformed tag no longer hides a valid one that follows it.
        List<string> candidates = list
            .Where(x => x.Attributes["property"]?.Value == "og:image" || x.Attributes["name"]?.Value == "twitter:image")
            .Select(x => x.Attributes["content"]?.Value)
            .Where(content => content != null)
            .ToList();

        if (candidates.Count > 0)
        {
            //Prefer any format vs gif
            string preferred = candidates.FirstOrDefault(content =>
                content.EndsWith(".jpg", StringComparison.Ordinal) ||
                content.EndsWith(".jpg?play", StringComparison.Ordinal) || //Gifv
                content.EndsWith(".jpeg", StringComparison.Ordinal) ||
                content.EndsWith(".png", StringComparison.Ordinal));

            resultUrl = (preferred ?? candidates[0]).TrimEnd("?play");
        }
    }
    catch (Exception)
    {
        // Best effort: any unexpected failure yields the empty result.
    }

    return resultUrl;
}
/// <summary>
/// Converts the batting or bowling career-summary HTML table into a <c>DataTable</c>.
/// </summary>
/// <param name="i_objHtmlTableClassNodes">Candidate nodes; the first one whose text contains the requested summary caption is used.</param>
/// <param name="i_DataTableRequest">Which summary to extract (Batting or Bowling).</param>
/// <returns>
/// A table whose columns are the <c>th</c> texts of the first row and whose rows
/// are the trimmed <c>td</c> texts of the remaining rows; <c>null</c> when the
/// summary cannot be found or any step fails.
/// </returns>
private static DataTable GetPlayerCarrierTable(HtmlNodeCollection i_objHtmlTableClassNodes, DatableSelection i_DataTableRequest)
{
    try
    {
        // Caption identifying the requested summary block; stays null for any other selection.
        string caption = null;
        if (i_DataTableRequest == DatableSelection.Batting)
        {
            caption = "Batting Career Summary";
        }
        else if (i_DataTableRequest == DatableSelection.Bowling)
        {
            caption = "Bowling Career Summary";
        }

        HtmlNode summaryTable = null;
        if (caption != null)
        {
            summaryTable = i_objHtmlTableClassNodes
                .Where(x => x.InnerText.Contains(caption))
                .First()
                .SelectNodes(".//table")
                .First();
        }

        DataTable result = new DataTable();

        // Creating columns from the header cells of the first row.
        // (A null summaryTable fails here and is reported as null by the catch below.)
        HtmlNode headerRow = summaryTable.SelectNodes(".//tr").First();
        foreach (HtmlNode headerCell in headerRow.SelectNodes(".//th"))
        {
            result.Columns.Add(headerCell.InnerText);
        }

        // Adding one data row per remaining table row.
        foreach (HtmlNode tableRow in summaryTable.SelectNodes(".//tr").Skip(1))
        {
            string[] cells = tableRow.Elements("td").Select(td => td.InnerText.Trim()).ToArray();
            result.Rows.Add(cells);
        }

        return result;
    }
    catch (Exception)
    {
        return null;
    }
}
/// <summary>
/// Downloads the Favola lunch page (windows-1250 encoded) and renders today's
/// menu as an HTML fragment.
/// </summary>
/// <returns>
/// "&lt;b&gt;Favola:&lt;/b&gt;" followed by one <c>&lt;li&gt;</c> line per menu paragraph
/// (paragraphs containing "***" are left out), or an empty string when no
/// navigation entry for today's date, or its menu block, is found.
/// </returns>
public static string GetFavolaMenu()
{
    var doc = new HtmlDocument
    {
        OptionFixNestedTags = true,
        OptionDefaultStreamEncoding = Encoding.GetEncoding("windows-1250"),
        OptionAutoCloseOnEnd = true
    };

    string htmlString;
    using (var client = new WebClient())
    {
        client.Encoding = Encoding.GetEncoding("windows-1250");
        htmlString = client.DownloadString("http://www.kaval-group.si/FAVOLA,,ponudba/kosila");
    }

    doc.LoadHtml(htmlString);

    HtmlNodeCollection results = doc.DocumentNode.SelectNodes(string.Format("//*[contains(@class,'{0}')]", "childNaviLiElement"));
    if (results == null)
    {
        return "";
    }

    // Take "today" once so all comparisons agree even around midnight.
    // The pattern accepts day and month with or without a leading zero
    // (d.M.yyyy, dd.M.yyyy, d.MM.yyyy, dd.MM.yyyy). The digit look-arounds keep
    // e.g. "5.3.2024" from matching inside "15.3.2024".
    DateTime today = DateTime.Now;
    var todayPattern = new System.Text.RegularExpressions.Regex(string.Format(
        System.Globalization.CultureInfo.InvariantCulture,
        @"(?<!\d)0?{0}\.0?{1}\.{2}(?!\d)",
        today.Day, today.Month, today.Year));

    foreach (HtmlNode item in results)
    {
        HtmlNode trigger = item.FirstChild;
        if (trigger == null || trigger.Attributes["onclick"] == null)
        {
            continue;
        }

        if (!todayPattern.IsMatch(item.InnerText))
        {
            continue;
        }

        // The suffix after the last '-' of the trigger's class names the menu block to show.
        string triggerClass = trigger.GetAttributeValue("class", null);
        if (triggerClass == null)
        {
            continue;
        }

        string className = string.Format("show show-{0}", triggerClass.Split('-').LastOrDefault());
        HtmlNode activeMenu = doc.DocumentNode.SelectSingleNode(string.Format("//*[contains(@class,'{0}')]", className));
        if (activeMenu == null)
        {
            continue;
        }

        var sb = new StringBuilder("<b>Favola:</b>" + Environment.NewLine + Environment.NewLine);
        foreach (HtmlNode childNode in activeMenu.ChildNodes.Where(x => x.Name == "p"))
        {
            if (!childNode.InnerText.Contains("***"))
            {
                sb.AppendLine(string.Format("<li>{0}</li>", childNode.InnerText));
            }
        }

        return sb.ToString();
    }

    return "";
}
/// <summary>
/// Projects the nodes of a collection to <c>OutlineNode</c> items.
/// </summary>
/// <param name="collection">Source nodes.</param>
/// <param name="exceptEmptyTags">When <c>true</c>, nodes whose cleaned inner text is empty are left out.</param>
/// <param name="disablePosition">When <c>true</c>, every item gets position -1; otherwise its zero-based index among the emitted items.</param>
/// <returns>A lazily produced sequence with the tag name and cleaned inner text of each retained node.</returns>
public static IEnumerable<OutlineNode> ToOutlineNodes(this HtmlNodeCollection collection, bool exceptEmptyTags = false, bool disablePosition = true)
{
    // Work on a snapshot and walk it once. Indexing a deferred filter with
    // Count()/ElementAt(i) re-ran the filter for every element.
    List<HtmlNode> snapshot = collection.ToList();

    int position = 0;
    foreach (HtmlNode node in snapshot)
    {
        string text = Clear(node.InnerText);
        if (exceptEmptyTags && text == string.Empty)
        {
            continue;
        }

        yield return new OutlineNode
        {
            Position = disablePosition ? -1 : position,
            TagName = node.Name,
            InnerText = text
        };
        position++;
    }
}
/// <summary>
/// Prepares the description: strips images and layout filler from the HTML.
/// </summary>
/// <param name="desc">HTML of the description.</param>
/// <param name="poster">Path or URL of the poster image; only its file name is used to recognise the poster among the images.</param>
/// <returns>
/// The description HTML without images that are wrapped in links, without
/// empty <c>div</c> pairs, <c>hr</c> and <c>br</c> tags, and with consecutive
/// line breaks collapsed into one.
/// </returns>
/// <remarks>
/// After the linked images are removed: if exactly two images remain, only the
/// one whose <c>src</c> contains the poster file name is removed; otherwise all
/// remaining images are removed.
/// </remarks>
string FormatDescriptionRutor(string desc, string poster)
{
    HtmlDocument htmlDocument = new HtmlDocument();
    htmlDocument.LoadHtml(desc);
    HtmlNode htmlNode = htmlDocument.DocumentNode;

    // Images wrapped in a link (presumably screenshots - confirm).
    HtmlNodeCollection nodesScrenshots = htmlNode.SelectNodes(@"//img[parent::a]");
    if (nodesScrenshots != null)
    {
        foreach (var item in nodesScrenshots)
        {
            item.Remove();
        }
    }

    // SelectNodes returns null when no image is left; there is nothing to do then.
    HtmlNodeCollection nodesImgs = htmlNode.SelectNodes(@"//img");
    if (nodesImgs != null)
    {
        if (nodesImgs.Count == 2)
        {
            // Without a poster file name no image can be identified as the poster.
            string posterFile = Path.GetFileName(poster);
            if (!string.IsNullOrEmpty(posterFile))
            {
                var item = nodesImgs
                    .Where(el => el.GetAttributeValue("src", string.Empty).Contains(posterFile))
                    .SingleOrDefault();
                item?.Remove();
            }
        }
        else
        {
            foreach (var item in nodesImgs)
            {
                item.Remove();
            }
        }
    }

    string description = htmlNode.OuterHtml.Replace("<div></div>", "");
    description = description.Replace("<hr>", "");
    description = description.Replace("<br>", "");

    // Collapse runs of blank lines.
    while (description.Contains(Environment.NewLine + Environment.NewLine))
    {
        description = description.Replace(Environment.NewLine + Environment.NewLine, Environment.NewLine);
    }

    return description;
}
/// <summary>
/// Loads every page of <c>docUrls</c>, and for each year from
/// <c>dtStartFrom</c> to the current year passes the PDF links whose markup
/// mentions that year, and whose date is not before <c>dtStartFrom</c>, to
/// <c>ExtractADoc</c>.
/// </summary>
/// <remarks>
/// The meeting date is read from the link URL ("Month d, yyyy"); a link without
/// a parsable date is treated as too early and skipped. URLs containing "PC"
/// or "Plan" are filed under "Planning", all others under "City Council".
/// </remarks>
public void DownloadCouncilPdfFiles()
{
    List<Documents> docs = this.LoadDocumentsDoneSQL();
    List<QueryResult> queries = this.LoadQueriesDoneSQL();
    HtmlWeb web = new HtmlWeb();
    Regex dateReg = new Regex("[a-zA-Z]+[\\s]{0,1}[0-9]{1,2},[\\s]{0,1}[0-9]{4}");

    // WebClient is disposable; release it once all pages are processed.
    using (WebClient c = new WebClient())
    {
        foreach (string url in this.docUrls)
        {
            HtmlDocument meetingHomeDoc = web.Load(url);

            // SelectNodes returns null when the page has no PDF link. The null check
            // belongs here; the result of Where() is never null.
            HtmlNodeCollection fileNodes = meetingHomeDoc.DocumentNode.SelectNodes("//a[contains(@href,'.pdf')]");
            if (fileNodes == null)
            {
                continue;
            }

            for (int year = this.dtStartFrom.Year; year <= DateTime.Now.Year; year++)
            {
                string yearText = year.ToString();

                foreach (HtmlNode fileNode in fileNodes.Where(t => t.OuterHtml.Contains(yearText)))
                {
                    string nodeUrl = fileNode.Attributes["href"].Value;
                    nodeUrl = !nodeUrl.StartsWith("http") ? this.cityEntity.CityUrl + nodeUrl : nodeUrl;

                    // TryParse leaves DateTime.MinValue on failure (no match, or text
                    // that only looks like a date), which the check below skips.
                    DateTime meetingDate;
                    string meetingDateText = dateReg.Match(nodeUrl).ToString();
                    if (!DateTime.TryParse(meetingDateText, out meetingDate))
                    {
                        meetingDate = DateTime.MinValue;
                    }

                    if (meetingDate < this.dtStartFrom)
                    {
                        Console.WriteLine("Too early, skip...");
                        continue;
                    }

                    string category = nodeUrl.Contains("PC") || nodeUrl.Contains("Plan") ? "Planning" : "City Council";
                    this.ExtractADoc(c, nodeUrl, category, "pdf", meetingDate, ref docs, ref queries);
                }
            }
        }
    }
}
//Updated July 30th - Combined with WriteEpisodes
/// <summary>
/// Re-reads the page of every stored show (except those whose name contains one
/// of the comma-separated <c>show_excl</c> fragments), feeds each episode link
/// to <c>UpdateShowFromHtml</c> and saves the show back. Any failure is written
/// to the log file instead of being propagated.
/// </summary>
public static void RefreshEpisodes()
{
    try
    {
        MongoCollection mongoCollection = MongoUtils.GetMongoCollection
        (
            @"mongodb://" + ConfigurationManager.AppSettings["mongoHost"] + @"/",
            ConfigurationManager.AppSettings["port"],
            ConfigurationManager.AppSettings["db"],
            ConfigurationManager.AppSettings["show_collection"]
        );

        var storedShows = mongoCollection.FindAllAs<TvShow>().ToList<TvShow>();
        foreach (var show in storedShows)
        {
            bool showExcluded = ConfigurationManager.AppSettings["show_excl"]
                .Split(',')
                .Any(s => show.Name.Contains(s));
            if (showExcluded)
            {
                continue;
            }

            HtmlNodeCollection collection = DownloadMgmtUtils.GetHtmlNodeCollection(
                ConfigurationManager.AppSettings["show_url"] + show.Path,
                "//a");

            if (collection != null)
            {
                foreach (var link in collection)
                {
                    // An episode link: not excluded by text, has attributes, its first
                    // attribute is not a "tvnews" link and its href contains "/ep".
                    bool linkExcluded = ConfigurationManager.AppSettings["episode_innerhtml_excl"]
                        .Split(',')
                        .Any(s => link.InnerHtml.Contains(s));
                    if (linkExcluded)
                    {
                        continue;
                    }

                    if (link.Attributes.Count <= 0)
                    {
                        continue;
                    }

                    if (link.Attributes[0].Value.Contains("tvnews"))
                    {
                        continue;
                    }

                    if (!link.Attributes[HrefIndex(link)].Value.Contains("/ep"))
                    {
                        continue;
                    }

                    UpdateShowFromHtml(show, link.InnerHtml, collection, collection.IndexOf(link));
                }
            }

            mongoCollection.Save(show);
        }
    }
    catch (Exception e)
    {
        Log.AppendToLog("Error : FATAL Refresh Episodes Issue : " + e, ConfigurationManager.AppSettings["log_file"]);
    }
}
/// <summary>
/// Static helper method that tears apart the provided HtmlDocument to construct
/// a collection of WtaPlayer objects.
/// </summary>
/// <param name="wtaPlayerHtmlDocument">The HtmlDocument that contains the WTA player data.</param>
/// <returns>
/// An IEnumerable of type <see cref="WtaPlayer"/>, evaluated lazily; empty when the
/// document contains no <c>tbody/tr</c> rows.
/// </returns>
private static IEnumerable<WtaPlayer> GetPlayersFromHtmlDocument(HtmlDocument wtaPlayerHtmlDocument)
{
    // Work with the returned HTML data - first step is to identify the table rows (only a single table on the page at
    // the time of producing this code sample).
    HtmlNodeCollection tbodyRowNodes = wtaPlayerHtmlDocument.DocumentNode.SelectNodes("//tbody/tr");

    // SelectNodes returns null (not an empty collection) when nothing matches.
    if (tbodyRowNodes == null)
    {
        return Enumerable.Empty<WtaPlayer>();
    }

    // We only want every other 'tr', as these are the only ones that contain data, also keeping in mind that
    // spacer 'tr' elements exist (without a child count of 14, so also remove these elements). At most 100 rows are taken.
    IEnumerable<HtmlNode> everyOtherNode = tbodyRowNodes
        .Where((node, index) => index % 2 == 0 && node?.ChildNodes?.Count == 14)
        .Take(100);

    // Setup a regex to clean up the rank information
    Regex rankCleanerRegex = new Regex(@"<[^>]+>| ");

    // Construct and return WtaPlayer objects based on some very (fixed, agreed!) ripping of text from td elements:
    // child 0 supplies the first integer argument (rank), child 3 the text argument, child 6 the second integer argument.
    return everyOtherNode.Select(node => new WtaPlayer(
        int.Parse(rankCleanerRegex.Replace(node.ChildNodes[0].InnerText, string.Empty).Trim()),
        node.ChildNodes[3].InnerText.Trim(),
        int.Parse(node.ChildNodes[6].InnerText.Trim())));
}
/// <summary>
/// Handles pagination: reads the page count from the "last page" link and
/// enqueues the URLs of pages 2..N that are not already in <c>filter</c>.
/// </summary>
/// <param name="url">URL of the current listing page; only checked for being non-empty.</param>
/// <param name="aNodes">Anchor nodes of the current page.</param>
private void GetNextPage(string url, HtmlNodeCollection aNodes)
{
    if (string.IsNullOrEmpty(url) || aNodes == null)
    {
        return;
    }

    // "末页" is the caption of the "last page" link.
    var lastPageLink = aNodes.FirstOrDefault(c => c.InnerText.Contains("末页"));
    if (lastPageLink == null)
    {
        return;
    }

    // Example target: https://xm.focus.cn/loupan/p38/?saleStatus=6
    var hrefAttr = lastPageLink.Attributes["href"];
    if (hrefAttr == null)
    {
        return;
    }

    var lastPageUrl = hrefAttr.Value;
    var pageCountText = Toolslib.Str.Sub(lastPageUrl, "loupan/p", "/");

    int pageCount;
    if (!int.TryParse(pageCountText, out pageCount))
    {
        Console.WriteLine("无法获取页数");
        return;
    }

    // The first page is not enqueued here:
    // UrlQueue.Instance.EnQueue(new UrlInfo(url) { Depth = 1 });

    // Path segment of the last page; swapped for each page number in turn.
    var lastPageSegment = string.Format("/loupan/p{0}", pageCountText);
    for (var page = 2; page <= pageCount; page++)
    {
        // Base listing URL looks like https://xm.focus.cn/loupan/?saleStatus=6
        var pageSegment = string.Format("/loupan/p{0}", page);
        var pageUrl = lastPageUrl.Replace(lastPageSegment, pageSegment);
        if (filter.Contains(pageUrl))
        {
            continue;
        }

        UrlQueue.Instance.EnQueue(new UrlInfo(pageUrl) { Depth = 1 });
    }
}
/// <summary>
/// Verifies that the document contains at least one <c>input</c> of type "reset".
/// </summary>
/// <returns>
/// The list of problems found (empty when none). An exception raised during
/// the check is reported as an "EXCEPTION: ..." entry rather than thrown.
/// </returns>
private List<string> CheckReset()
{
    var errors = new List<string>();

    try
    {
        //TODO: also button is alowed
        HtmlNodeCollection inputs = this.HtmlDoc.DocumentNode.SelectNodes("//input");

        bool hasResetInput = inputs != null
            && inputs.Any(x => x.GetAttributeValue("type", "").Equals("reset"));
        if (!hasResetInput)
        {
            errors.Add("Does not contains any reset button.");
        }
    }
    catch (Exception e)
    {
        errors.Add(string.Format("EXCEPTION: {0}", e.Message));
    }

    return errors;
}
/// <summary>
/// Selects candidate nodes with an XPath query, filters them with the optional
/// validation callback and returns the result.
/// </summary>
/// <param name="root">Node the query is evaluated from.</param>
/// <param name="XQuery">XPath expression selecting the candidates.</param>
/// <param name="ValidateFunction">Optional predicate; when given, only nodes it accepts are returned.</param>
/// <returns>
/// The accepted nodes in query order, or <c>null</c> when <paramref name="root"/>
/// is null, the query is empty, or the query matches nothing.
/// </returns>
public static List<HtmlNode> FilterNodes(HtmlNode root, string XQuery, ValidateNode ValidateFunction = null)
{
    if (root == null || string.IsNullOrEmpty(XQuery))
    {
        return null;
    }

    HtmlNodeCollection candidates = root.SelectNodes(XQuery);
    if (candidates == null)
    {
        return null;
    }

    var accepted = new List<HtmlNode>();
    foreach (HtmlNode candidate in candidates)
    {
        // Without a validator every candidate is accepted.
        if (ValidateFunction == null || ValidateFunction(candidate))
        {
            accepted.Add(candidate);
        }
    }

    return accepted;
}
/// <summary>
/// Extracts the first two numbers of the pager summary text, i.e. the text node
/// containing "共" inside the first centered table cell spanning 12 (or,
/// failing that, 13) columns.
/// </summary>
/// <param name="doc">Document to inspect; may be null.</param>
/// <returns>
/// A two-element array with the first and second number of the summary text in
/// order of appearance, or <c>null</c> when the document is null, the summary
/// cell is missing, or it does not hold two numbers that fit an <c>int</c>.
/// </returns>
public static int[] GetLinkPages(HtmlDocument doc)
{
    if (doc == null)
    {
        return null;
    }

    HtmlNodeCollection categoryList = doc.DocumentNode.SelectNodes("//td[@colspan=\"12\"][@align=\"center\"][1]/text()");
    if (categoryList == null)
    {
        categoryList = doc.DocumentNode.SelectNodes("//td[@colspan=\"13\"][@align=\"center\"][1]/text()");
    }

    if (categoryList == null)
    {
        return null;
    }

    HtmlNode summary = categoryList.FirstOrDefault(x => x.InnerText.Contains("共"));
    if (summary == null)
    {
        return null;
    }

    // Turn every run of non-digits into a separator and keep the digit runs.
    string result = System.Text.RegularExpressions.Regex.Replace(summary.InnerText, @"[^0-9]+", ",");
    string[] args = result.Split(',').Where(x => !string.IsNullOrEmpty(x)).ToArray();

    // Fewer than two numbers, or numbers too large for an int, mean the summary
    // is not in the expected shape; report "not found" instead of throwing.
    if (args.Length < 2)
    {
        return null;
    }

    int first;
    int second;
    if (!int.TryParse(args[0], out first) || !int.TryParse(args[1], out second))
    {
        return null;
    }

    return new[] { first, second };
}
/// <summary>
/// Validates the <c>input</c> fields of a given type: there must be at least
/// <paramref name="min"/> of them and, for radio and checkbox fields, they must
/// all share one name and exactly one must carry the <c>checked</c> attribute.
/// The labels of the fields are checked through <c>CheckLabels</c>.
/// </summary>
/// <param name="type">Value of the <c>type</c> attribute to look for.</param>
/// <param name="min">Minimum number of fields of that type.</param>
/// <returns>
/// The list of problems found (empty when none). An exception raised during
/// the check is reported as an "EXCEPTION: ..." entry rather than thrown.
/// </returns>
private List<string> CheckInputFields(string type, int min)
{
    var errors = new List<string>();

    try
    {
        HtmlNodeCollection allInputs = this.HtmlDoc.DocumentNode.SelectNodes("//input");
        if (allInputs == null)
        {
            errors.Add(string.Format("Does not contains any {0} fields.", type));
            return errors;
        }

        //TODO: get the nodes using XPath... I can't get the correct one, maybe a bug? //input[@type='text']
        //TODO: solved in Css3Validator (method CheckCssProperty)
        List<HtmlNode> filtered = new List<HtmlNode>();
        foreach (HtmlNode input in allInputs)
        {
            if (input.GetAttributeValue("type", "").Equals(type))
            {
                filtered.Add(input);
            }
        }

        if (filtered.Count < min)
        {
            errors.Add(string.Format("Does not contains enough {0} fields.", type));
        }
        else if (type == "radio" || type == "checkbox")
        {
            int distinctNames = filtered
                .Select(x => x.GetAttributeValue("name", ""))
                .Distinct()
                .Count();
            if (distinctNames > 1)
            {
                errors.Add(string.Format("The {0} fields does not share the same name.", type));
            }

            int checkedFields = filtered.Count(x => x.Attributes.Any(y => y.Name == "checked"));
            if (checkedFields != 1)
            {
                errors.Add(string.Format("The {0} fields does not have a single default value.", type));
            }
        }

        // Labels are checked whatever the outcome of the count checks above.
        errors.AddRange(CheckLabels(filtered, type));
    }
    catch (Exception e)
    {
        errors.Add(string.Format("EXCEPTION: {0}", e.Message));
    }

    return errors;
}
/// <summary>
/// We use XPath expressions to extract all available links from the article, making sure that the links
/// are taken only from the main content section where the article is present.
/// </summary>
/// <returns>
/// One <c>Link</c> per distinct <c>/wiki/...</c> href found inside the main content, excluding the
/// File:, Template: and Special: namespaces (compared case-insensitively). Empty when the main
/// content is missing or holds no such links.
/// </returns>
public List<Link> GetLinks()
{
    HtmlNode mainContent = WikipediaUtility.GetMainContent(website);
    if (mainContent == null)
    {
        return new List<Link>();
    }

    // ".//a" keeps the search inside mainContent. A leading "//" is an absolute path that
    // scans the whole document, whichever node it is evaluated from.
    HtmlNodeCollection links = mainContent.SelectNodes(".//a[starts-with(@href,'/wiki/')]");

    // SelectNodes returns null (not an empty collection) when nothing matches.
    if (links == null)
    {
        return new List<Link>();
    }

    string[] excludedPrefixes = { "/wiki/file:", "/wiki/template:", "/wiki/special:" };

    return links
        .Select(n => n.GetAttributeValue("href", null))
        .Where(href => href != null)
        .Distinct()
        // Ordinal, case-insensitive comparison: lower-casing with the current culture
        // breaks for e.g. Turkish, where "FILE:" does not lower-case to "file:".
        .Where(href => !excludedPrefixes.Any(prefix => href.StartsWith(prefix, System.StringComparison.OrdinalIgnoreCase)))
        .Select(href => new Link() { URL = href })
        .ToList();
}
/// <summary>
/// Loads a month index page and collects the message links listed in the second
/// <c>ul</c> that is a direct child of <c>body</c>; nested lists are delegated
/// to <c>RecursiveFetchMontlyURL</c>.
/// </summary>
/// <param name="monthURL">URL of the month page; its directory part is used as the prefix of every returned URL.</param>
/// <returns>
/// The full URLs; empty when the page has no body or fewer than two top-level
/// lists. Each URL is also written to the console.
/// </returns>
public List<string> FetchMontlyURLs(string monthURL)
{
    List<string> urls = new List<string>();
    HtmlWeb web = new HtmlWeb();
    HtmlDocument document = web.Load(monthURL);

    HtmlNode body = document.DocumentNode.SelectSingleNode("//body");
    if (body == null)
    {
        return urls;
    }

    List<HtmlNode> allUlList = body.ChildNodes.Where(x => x.Name == "ul").ToList();
    if (allUlList.Count > 1)
    {
        HtmlNode urlsULNode = allUlList[1];

        // fetching ul
        foreach (HtmlNode liItem in urlsULNode.ChildNodes.Where(x => x.Name == "li").ToList())
        {
            // The link is expected to be the first child of the item; an item whose
            // first child is missing or has no href contributes no URL of its own.
            HtmlNode firstChild = liItem.FirstChild;
            string href = firstChild == null ? null : firstChild.GetAttributeValue("href", null);
            if (href != null)
            {
                urls.Add(href);
            }

            if (liItem.ChildNodes.Any(x => x.Name == "ul"))
            {
                RecursiveFetchMontlyURL(liItem, urls);
            }
        }
    }

    // construct full urls: everything of monthURL up to and including the last '/'
    string baseUrl = monthURL.Substring(0, monthURL.LastIndexOf('/') + 1);
    for (int i = 0; i < urls.Count; i++)
    {
        urls[i] = baseUrl + urls[i];
        Console.WriteLine(urls[i]);
    }

    return urls;
}
/// <summary>
/// Downloads a Migu playlist page and extracts its songs.
/// </summary>
/// <param name="id">Playlist id, inserted into the playlist URL.</param>
/// <returns>
/// One <c>SongInfo</c> per song row that has a non-empty <c>data-cid</c>
/// attribute; empty when the page contains no song rows. Missing name, artist
/// or album parts yield null / empty values, and an album id that cannot be
/// read yields 0.
/// </returns>
public static SongInfo[] GetPlaylist(long id)
{
    string html = HttpHelper.HttpGet($"http://music.migu.cn/v3/music/playlist/{id}", headers: DefaultHeaders);
    try
    {
        HtmlDocument document = new HtmlDocument();
        document.LoadHtml(html);
        HtmlNode root = document.DocumentNode;

        // SelectNodes returns null (not an empty collection) when nothing matches.
        HtmlNodeCollection list = root.SelectNodes("//div[@class='row J_CopySong']");
        if (list == null)
        {
            return Array.Empty<SongInfo>();
        }

        return list
            .Where(p => p.Attributes.Any(q => q.Name == "data-cid" && !string.IsNullOrEmpty(q.Value)))
            .Select(p =>
            {
                HtmlNode singers = p.SelectSingleNode("./div[contains(@class,'song-singers')]");
                HtmlNode albumLink = p.SelectSingleNode("./div[contains(@class,'song-belongs')]")?.SelectSingleNode("./a");

                // The album id is the last path segment of the album link; anything
                // that is missing or not a number falls back to 0.
                string albumIdText = albumLink?.GetAttributeValue("href", string.Empty).Split('/').Last();
                int albumId;
                int.TryParse(albumIdText, out albumId);

                return new SongInfo
                {
                    CopyrightId = p.Attributes["data-cid"].Value,
                    Name = p.SelectSingleNode(".//a[contains(@class,'song-name-txt')]")?.InnerText,
                    Artist = string.Join(",", singers?.SelectNodes("./a")?.Select(q => q.InnerText) ?? Array.Empty<string>()),
                    Album = albumLink?.InnerText,
                    AlbumId = albumId
                };
            })
            .ToArray();
    }
    // NOTE(review): nothing in the try block appears to parse JSON, so this
    // handler looks unreachable - confirm before removing.
    catch (JsonReaderException)
    {
        throw new NotImplementedException("意外的服务器返回");
    }
}
/// <summary>
/// Downloads the page at <c>Constants.udrevakaUrl</c>, parses each day item of
/// the special menu into a <c>DayMenu</c> and hands the result to <c>SaveRestaurant</c>.
/// </summary>
private void LoadUDrevaka()
{
    // Builds one day's menu from its list item.
    DayMenu ParseDay(HtmlNode day)
    {
        string dateStr = day.SelectSingleNode("./div[@class='menu-day']").InnerText;
        DateTime date = Utils.ParseDateTime(dateStr);

        HtmlNodeCollection rows = day.SelectNodes("./div[@class='row']");

        // The first row holds the soup when its text starts with "Polévka:".
        // StartsWith copes with texts shorter than the prefix, which a
        // length-limited IndexOf does not.
        const string soupPrefix = "Polévka:";
        string firstRowText = rows[0].SelectSingleNode("./div").InnerText;
        bool hasSoup = firstRowText.StartsWith(soupPrefix, StringComparison.Ordinal);

        // The soup name follows the prefix and the one separator character after it.
        string soup = null;
        if (hasSoup)
        {
            int nameStart = soupPrefix.Length + 1;
            soup = firstRowText.Length > nameStart ? firstRowText.Substring(nameStart) : string.Empty;
        }

        // When the first row is the soup, the foods start at the second row.
        int firstFoodIndex = hasSoup ? 1 : 0;

        Food[] foods = rows
            .Skip(firstFoodIndex)
            .Select(row => new
            {
                Name = row.SelectSingleNode("./div[@class='col-sm-10 col-xs-9']"),
                Price = row.SelectSingleNode("./div[@class='col-sm-2 col-xs-3 special-menu-price']")
            })
            // Check if row is really food and not soup or note
            .Where(cells => cells.Name != null && cells.Price != null)
            .Select(cells => new Food(
                HtmlEntity.DeEntitize(Utils.RemoveLeadingNumbers(cells.Name.InnerText)),
                Utils.ParsePrice(cells.Price.InnerText)))
            .ToArray();

        return new DayMenu(date, soup, foods);
    }

    HtmlNode doc = Utils.GetHtmlDoc(Constants.udrevakaUrl).DocumentNode;
    HtmlNode menu = doc.SelectSingleNode("//ul[@class='special-menu pb-xlg']");
    HtmlNodeCollection days = menu.SelectNodes("./li[@class='item-day']");
    DayMenu[] dayMenus = days.Select(ParseDay).ToArray();

    string restaurantName = GetRestaurantName(doc);
    SaveRestaurant(restaurantName, dayMenus, Restaurants.UDrevaka);
}