/// <summary>
/// Reads one row into a <see cref="MatchInfos"/>. <c>Nom</c>, <c>Adresse</c> and <c>Ville</c> come from
/// the first three text nodes under the second child of the first <c>div</c> descendant;
/// <c>Arbitres</c> is filled from the first <c>ul</c> of the third direct <c>div</c> child, when present.
/// </summary>
/// <param name="node">Row node to read.</param>
/// <returns>The populated <see cref="MatchInfos"/>.</returns>
protected static MatchInfos ParseLine(HtmlNode node)
{
    // Spelled as an escape: a literal non-breaking space in source looks exactly like a plain space and
    // is easily normalised away by editors, which would silently turn the Replace into a no-op.
    const char nonBreakingSpace = '\u00A0';

    var infos = new MatchInfos();

    var addressTexts = node.Descendants("div").ElementAt(0)
        .ChildNodes.ElementAt(1)
        .Descendants("#text")
        .ToList();
    infos.Nom = addressTexts[0].InnerText.Trim().Replace(nonBreakingSpace, ' ');
    infos.Adresse = addressTexts[1].InnerText.Trim().Replace(nonBreakingSpace, ' ');
    infos.Ville = addressTexts[2].InnerText.Trim().Replace(nonBreakingSpace, ' ');

    // Filter the direct <div> children once instead of once for the count and again for the lookup.
    var childDivs = node.ChildNodes.Where(item => item.Name == "div").ToList();
    if (childDivs.Count > 2)
    {
        var refereeItems = childDivs[2]
            .ChildNodes.ElementAt(1)
            .Descendants("ul").ElementAt(0)
            .Descendants("li");
        foreach (var refereeItem in refereeItems)
        {
            infos.Arbitres.Add(HtmlEntity.DeEntitize(refereeItem.InnerText.Trim()));
        }
    }

    return infos;
}
/// <summary>
/// Populates the article properties (image, title, link, date, author, author link, text) from an
/// article node.
/// </summary>
/// <param name="articleNode">
/// Article node. The first <c>img</c>, the first <c>h2</c> with its first <c>a</c>, the spans whose class
/// contains <c>date</c> and <c>author</c>, and the first <c>p</c> are all dereferenced without null checks.
/// </param>
public void ParseMainArticle(HtmlNode articleNode)
{
    HtmlNode image = articleNode.Descendants("img").FirstOrDefault();
    ArticleImage = image.GetAttributeValue("src", string.Empty);

    // The headline link supplies both the title text and the article URL.
    HtmlNode headlineLink = articleNode.Descendants("h2").FirstOrDefault()
        .Descendants("a").FirstOrDefault();
    Title = WebUtility.HtmlDecode(headlineLink.InnerText);
    ArticleLink = headlineLink.GetAttributeValue("href", string.Empty);

    HtmlNode dateSpan = articleNode.Descendants("span")
        .FirstOrDefault(span => span.GetAttributeValue("class", string.Empty).Contains("date"));
    Date = dateSpan.InnerHtml;

    // Likewise the author link supplies the author name and the profile URL.
    HtmlNode authorLink = articleNode.Descendants("span")
        .FirstOrDefault(span => span.GetAttributeValue("class", string.Empty).Contains("author"))
        .Descendants("a").FirstOrDefault();
    Author = WebUtility.HtmlDecode(authorLink.InnerText);
    AuthorLink = authorLink.GetAttributeValue("href", string.Empty);

    HtmlNode firstParagraph = articleNode.Descendants("p").FirstOrDefault();
    ArticleText = WebUtility.HtmlDecode(firstParagraph.InnerText);
}
/// <summary>
/// Populates <c>Status</c>, <c>Icon</c>, <c>Title</c>, <c>Sender</c> and <c>Date</c> from the <c>td</c>
/// cells of a row, matching each cell by an exact <c>class</c> attribute value.
/// </summary>
/// <param name="rowNode">
/// Row node. Each of the five cells (and an <c>img</c> inside the status and icon cells) is dereferenced
/// without a null check.
/// </param>
public void Parse(HtmlNode rowNode)
{
    HtmlNode statusCell = rowNode.Descendants("td")
        .FirstOrDefault(td => td.GetAttributeValue("class", string.Empty).Equals("status"));
    Status = statusCell.Descendants("img").FirstOrDefault().GetAttributeValue("src", string.Empty);

    HtmlNode iconCell = rowNode.Descendants("td")
        .FirstOrDefault(td => td.GetAttributeValue("class", string.Empty).Equals("icon"));
    Icon = iconCell.Descendants("img").FirstOrDefault().GetAttributeValue("src", string.Empty);

    HtmlNode titleCell = rowNode.Descendants("td")
        .FirstOrDefault(td => td.GetAttributeValue("class", string.Empty).Equals("title"));
    Title = titleCell.InnerText;

    HtmlNode senderCell = rowNode.Descendants("td")
        .FirstOrDefault(td => td.GetAttributeValue("class", string.Empty).Equals("sender"));
    Sender = senderCell.InnerText;

    HtmlNode dateCell = rowNode.Descendants("td")
        .FirstOrDefault(td => td.GetAttributeValue("class", string.Empty).Equals("date"));
    Date = dateCell.InnerText;
}
/// <summary>
/// Builds a <see cref="ForumUserEntity"/> from a post node: user name, join date, optional avatar
/// title and image, and the numeric id taken from the <c>userinfo</c> cell's class attribute.
/// </summary>
/// <param name="postNode">
/// Post node. The <c>author</c>, <c>registered</c> and <c>userinfo</c> elements are required and are
/// dereferenced without null checks; the avatar title and image are optional.
/// </param>
/// <returns>The populated user.</returns>
public static ForumUserEntity FromPost(HtmlNode postNode)
{
    var user = new ForumUserEntity
    {
        Username = WebUtility.HtmlDecode(
            postNode.Descendants("dt")
                .FirstOrDefault(node => node.GetAttributeValue("class", string.Empty).Contains("author"))
                .InnerHtml),
        DateJoined = DateTime.Parse(
            postNode.Descendants("dd")
                .FirstOrDefault(node => node.GetAttributeValue("class", string.Empty).Contains("registered"))
                .InnerHtml)
    };

    var avatarTitle = postNode.Descendants("dd")
        .FirstOrDefault(node => node.GetAttributeValue("class", string.Empty).Equals("title"));

    // The image lookup previously dereferenced FirstOrDefault() directly, so a post without any
    // "title" block threw before the null checks below could run.
    var avatarContainer = postNode.Descendants("dd")
        .FirstOrDefault(node => node.GetAttributeValue("class", string.Empty).Contains("title"));
    var avatarImage = avatarContainer == null
        ? null
        : avatarContainer.Descendants("img").FirstOrDefault();

    if (avatarTitle != null)
    {
        user.AvatarTitle = WebUtility.HtmlDecode(avatarTitle.InnerText).WithoutNewLines().Trim();
    }

    if (avatarImage != null)
    {
        user.AvatarLink = FixPostHtmlImage(avatarImage.OuterHtml);
    }

    // The id is the second '-'-separated segment of the userinfo cell's class attribute.
    user.Id = Convert.ToInt64(
        postNode.DescendantsAndSelf("td")
            .FirstOrDefault(node => node.GetAttributeValue("class", string.Empty).Contains("userinfo"))
            .GetAttributeValue("class", string.Empty)
            .Split('-')[1]);

    return user;
}
/// <summary>
/// Extracts date, home team, away team and score text from a match row.
/// </summary>
/// <param name="MatchListItem">
/// Row node containing <c>td</c> cells whose class contains <c>date</c>, <c>team-a</c>, <c>team-b</c>
/// and <c>score-time</c>; each cell is dereferenced without a null check.
/// </param>
/// <returns>
/// A tuple of (date, home team, away team, score). The date is rewritten from day/month/yy to
/// month/day/20yy.
/// </returns>
public Tuple<string, string, string, string> GetMatchData(HtmlAgilityPack.HtmlNode MatchListItem)
{
    string date = MatchListItem.Descendants("td")
        .FirstOrDefault(node => node.GetAttributeValue("class", "").Contains("date"))
        .InnerText.Trim();

    int firstSlash = date.IndexOf('/');
    int lastSlash = date.LastIndexOf('/');
    string day = date.Substring(0, firstSlash);

    // Take everything between the two slashes so a one-digit month no longer swallows the second
    // slash; fall back to the fixed two characters when the text has a single slash.
    string month = lastSlash > firstSlash
        ? date.Substring(firstSlash + 1, lastSlash - firstSlash - 1)
        : date.Substring(firstSlash + 1, 2);
    string year = date.Substring(lastSlash + 1, 2);
    date = month + "/" + day + "/" + "20" + year;

    string homeTeam = MatchListItem.Descendants("td")
        .FirstOrDefault(node => node.GetAttributeValue("class", "").Contains("team-a"))
        .InnerText.Trim();
    string awayTeam = MatchListItem.Descendants("td")
        .FirstOrDefault(node => node.GetAttributeValue("class", "").Contains("team-b"))
        .InnerText.Trim();
    string score = MatchListItem.Descendants("td")
        .FirstOrDefault(node => node.GetAttributeValue("class", "").Contains("score-time"))
        .InnerText.Trim();

    return new Tuple<string, string, string, string>(date, homeTeam, awayTeam, score);
}
/// <summary>
/// Builds an <see cref="Activity"/> from a node: the name comes from the first <c>div</c> with class
/// <c>action_prompt</c>, the note from the first <c>li</c> with class <c>stream_note</c>, and the sets
/// from every other <c>li</c>.
/// </summary>
/// <param name="node">Node describing one activity.</param>
/// <param name="index">Value stored as the activity's <c>Sequence</c>.</param>
/// <returns>The populated activity.</returns>
/// <exception cref="InvalidDataException">No <c>action_prompt</c> div was found.</exception>
private static Activity ExtractActivity(HtmlNode node, int index)
{
    // Spelled as an escape: a literal non-breaking space in source looks exactly like a plain space and
    // is easily normalised away by editors, which would silently turn the Replace into a no-op.
    const char nonBreakingSpace = '\u00A0';

    var name = node.Descendants("div")
        .Where(div => div.GetAttributeValue("class", null) == "action_prompt")
        .Select(div => HtmlEntity.DeEntitize(div.InnerText).Trim().Replace(nonBreakingSpace, ' '))
        .FirstOrDefault();

    if (name == null)
    {
        throw new InvalidDataException("Unable to find activity name");
    }

    var note = node.Descendants("li")
        .Where(li => li.GetAttributeValue("class", null) == "stream_note")
        .Select(li => HtmlEntity.DeEntitize(li.InnerText).Trim())
        .FirstOrDefault();

    var sets = node.Descendants("li")
        .Where(li => li.GetAttributeValue("class", null) != "stream_note")
        .Select(ExtractSet)
        .ToList();

    return new Activity
    {
        Sequence = index,
        Name = name,
        Note = note,
        Sets = sets
    };
}
/// <summary>
/// Builds a <see cref="Team"/> (with division, coach and arena) from a roster header block.
/// </summary>
/// <param name="div">
/// Block containing an <c>h1</c> heading, at least six <c>a</c> links and at least two <c>strong</c>
/// elements. All offsets below are fixed positions into that markup.
/// </param>
/// <returns>The populated team.</returns>
public Team GetTeamInfo(HtmlNode div)
{
    HtmlNode[] anchors = div.Descendants("a").ToArray();

    // Team name: starts after the first space of the heading; the length is derived from where
    // "Roster" begins.
    string heading = div.Descendants("h1").First().InnerHtml;
    int nameStart = heading.IndexOf(" ") + 1;
    int nameLength = heading.IndexOf("Roster") - 9;
    string teamName = heading.Substring(nameStart, nameLength);

    // Division: text following the fourth link, minus its first character and an 11-character tail.
    string divisionText = anchors[3].NextSibling.InnerHtml;
    string divisionName = divisionText.Substring(1, divisionText.Length - 12);

    Team team = new Team(teamName, new Division(divisionName));
    team.Coach = new Coach(anchors[5].InnerHtml, team);

    // Arena name and attendance follow the last two <strong> labels.
    HtmlNode[] labels = div.Descendants("strong").ToArray();

    string arenaText = labels[labels.Length - 2].NextSibling.InnerHtml;
    string arenaName = arenaText.Substring(1, arenaText.IndexOf("&") - 2);

    string attendanceText = labels[labels.Length - 1].NextSibling.InnerHtml;
    string attendanceDigits = attendanceText
        .Substring(1, attendanceText.IndexOf("(") - 2)
        .Replace(",", string.Empty);

    team.Arena = new Arena(arenaName, int.Parse(attendanceDigits));
    return team;
}
/// <summary>
/// Converts one result node into an <see cref="OverviewResult"/>: URL from the single <c>a</c>'s
/// <c>href</c>, type from the single <c>h4</c>, name from the single <c>h2</c>.
/// </summary>
/// <param name="node">Node describing one result.</param>
/// <returns>
/// The result; a property is <c>null</c> when its element (or the <c>href</c> attribute) is absent.
/// <c>SingleOrDefault</c> throws when more than one such element exists.
/// </returns>
public OverviewResult ConvertSingleResult(HtmlNode node)
{
    HtmlNode link = node.Descendants("a").SingleOrDefault();
    HtmlNode typeHeading = node.Descendants("h4").SingleOrDefault();
    HtmlNode nameHeading = node.Descendants("h2").SingleOrDefault();

    return new OverviewResult
    {
        Url = link?.Attributes["href"]?.Value,
        Type = typeHeading?.InnerText,
        Name = nameHeading?.InnerText
    };
}
/// <summary>
/// Lets the user pick an HTML file, loads it, and writes the text of every cell of the first table's
/// direct rows to the console (after stripping <c>script</c> and <c>style</c> elements from the table).
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
private void buttonParseHtml_Click(object sender, EventArgs e)
{
    string fileName;

    // The dialog is disposable; dispose it once the selection has been read.
    using (OpenFileDialog openFileDialog = new OpenFileDialog())
    {
        openFileDialog.Filter = "HTML File (*.html;)|*.html";
        openFileDialog.Multiselect = false;

        if (openFileDialog.ShowDialog() != DialogResult.OK
            || String.IsNullOrEmpty(openFileDialog.FileName))
        {
            return;
        }

        fileName = openFileDialog.FileName;
    }

    string strHtml;
    using (StreamReader reader = new StreamReader(fileName, Encoding.UTF8))
    {
        strHtml = reader.ReadToEnd();
    }

    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(strHtml); // load the HTML

    foreach (var err in doc.ParseErrors)
    {
        Console.WriteLine(err.Code);
    }

    HtmlAgilityPack.HtmlNode rootNode = doc.DocumentNode; // root node of the document

    HtmlAgilityPack.HtmlNode node = rootNode.SelectSingleNode(@"//table"); // first table
    if (node == null)
    {
        // No table in the document: nothing to print.
        return;
    }

    foreach (var script in node.Descendants("script").ToArray())
    {
        script.Remove();
    }

    foreach (var style in node.Descendants("style").ToArray())
    {
        style.Remove();
    }

    // From here on the table holds markup only. SelectNodes yields null when nothing matches,
    // so both lookups are checked before iterating.
    var trNodes = node.SelectNodes("tr");
    if (trNodes == null)
    {
        return;
    }

    foreach (var trNode in trNodes) // rows
    {
        var tdNodes = trNode.SelectNodes("td");
        if (tdNodes == null)
        {
            continue;
        }

        for (int i = 0; i < tdNodes.Count; i++) // columns
        {
            Console.WriteLine(tdNodes[i].InnerText);
        }
    }
}
/// <summary>
/// Populates <c>Tag</c>, <c>Title</c> and <c>Id</c> from a thread node.
/// </summary>
/// <param name="popularThreadsNode">
/// Node whose first <c>img</c> provides the tag image and whose first <c>a</c> provides the title and
/// an <c>href</c> with the id after the first <c>=</c>. Both elements are dereferenced without null checks.
/// </param>
public void ParseThread(HtmlNode popularThreadsNode)
{
    HtmlNode tagImage = popularThreadsNode.Descendants("img").FirstOrDefault();
    Tag = tagImage.GetAttributeValue("src", string.Empty);

    HtmlNode threadLink = popularThreadsNode.Descendants("a").FirstOrDefault();
    Title = WebUtility.HtmlDecode(threadLink.InnerText);

    string[] hrefParts = threadLink.GetAttributeValue("href", string.Empty).Split('=');
    Id = Convert.ToInt64(hrefParts[1]);
}
/// <summary>
/// Finds the table enclosing the last cell whose markup contains "2011-12 Roster". Header cells
/// (<c>th</c>) are searched first; data cells (<c>td</c>) are used only when no header cell matches.
/// </summary>
/// <param name="document">Node to search.</param>
/// <returns>The nearest <c>table</c> ancestor of the last matching cell.</returns>
private HtmlNode FindRosterTable(HtmlNode document)
{
    const string rosterCaption = "2011-12 Roster";

    var matches = document.Descendants("th")
        .Where(cell => cell.InnerHtml.IndexOf(rosterCaption) != -1)
        .ToList();

    if (matches.Count == 0)
    {
        matches = document.Descendants("td")
            .Where(cell => cell.InnerHtml.IndexOf(rosterCaption) != -1)
            .ToList();
    }

    HtmlNode lastMatch = matches.Last();
    return lastMatch.Ancestors("table").First();
}
/// <summary>
/// Parses a forum post in a thread: user, post date, index, id, body HTML, seen flag and whether the
/// post belongs to the current user.
/// </summary>
/// <param name="postNode">The post HTML node.</param>
public void Parse(HtmlNode postNode)
{
    User = ForumUserEntity.FromPost(postNode);

    HtmlNode postDateNode = postNode.Descendants()
        .FirstOrDefault(node => node.GetAttributeValue("class", string.Empty).Equals("postdate"));
    string postDateString = postDateNode == null ? string.Empty : postDateNode.InnerText;
    if (postDateString != null)
    {
        PostDate = postDateString.WithoutNewLines().Trim();
    }

    PostIndex = ParseInt(postNode.GetAttributeValue("data-idx", string.Empty));

    // The id attribute looks like "post123" or "#post123"; strip both markers. Stripping '#' is a
    // no-op when it is absent, so one branch covers both forms.
    var postId = postNode.GetAttributeValue("id", string.Empty);
    if (!string.IsNullOrEmpty(postId) && (postId.Contains("#") || postId.Contains("post")))
    {
        PostId = Int64.Parse(postId.Replace("post", string.Empty).Replace("#", string.Empty));
    }
    else
    {
        PostId = 0;
    }

    var postBodyNode = postNode.Descendants("td")
        .FirstOrDefault(node => node.GetAttributeValue("class", string.Empty).Equals("postbody"));
    this.FixQuotes(postBodyNode);
    PostHtml = postBodyNode.InnerHtml;

    HtmlNode profileLinksNode = postNode.Descendants("td")
        .FirstOrDefault(node => node.GetAttributeValue("class", string.Empty).Equals("postlinks"));

    HtmlNode postRow = postNode.Descendants("tr").FirstOrDefault();
    if (postRow != null)
    {
        HasSeen = postRow.GetAttributeValue("class", string.Empty).Contains("seen");
    }

    // A post without a "postlinks" cell has no Edit button, so it is not the current user's post;
    // previously this case threw a NullReferenceException.
    User.IsCurrentUserPost = profileLinksNode != null
        && profileLinksNode.Descendants("img")
            .FirstOrDefault(node => node.GetAttributeValue("alt", string.Empty).Equals("Edit")) != null;
}
/// <summary>
/// Populates <c>Id</c>, <c>ImageUrl</c> and <c>Title</c> from an icon node. Any failure is swallowed
/// on purpose, leaving the properties assigned so far.
/// </summary>
/// <param name="node">Node containing an <c>input</c> (id in <c>value</c>) and an <c>img</c>.</param>
public void Parse(HtmlNode node)
{
    try
    {
        HtmlNode idInput = node.Descendants("input").First();
        Id = Convert.ToInt32(idInput.GetAttributeValue("value", string.Empty));

        HtmlNode image = node.Descendants("img").First();
        ImageUrl = image.GetAttributeValue("src", string.Empty);
        Title = image.GetAttributeValue("alt", string.Empty);
    }
    catch (Exception)
    {
        // If, for some reason, it fails to get an icon, ignore the error.
        // The list view won't show it.
    }
}
/// <summary>
/// Searches below <paramref name="baseNode"/> using the text box content, either as an XPath
/// expression or as a tag name depending on the check box, and lists every hit in the results tab.
/// </summary>
/// <param name="baseNode">Node the search starts from.</param>
private void SearchFromNode(HtmlNode baseNode)
{
    var matches = Enumerable.Empty<HtmlNode>();

    if (!_html.DocumentNode.HasChildNodes)
    {
        ParseHtml();
    }

    if (chkXPath.IsChecked == true)
    {
        matches = baseNode.SelectNodes(txtSearchTag.Text);
    }
    else
    {
        matches = baseNode.Descendants(txtSearchTag.Text);
    }

    // Nothing matched: leave the current results untouched.
    if (matches == null)
    {
        return;
    }

    listResults.Items.Clear();

    foreach (var match in matches)
    {
        var tree = new NodeTreeView { BaseNode = match };
        var item = new ListBoxItem();
        var panel = new StackPanel();

        var header = new Label
        {
            Content = string.Format("id:{0} name:{1} children{2}", match.Id, match.Name, match.ChildNodes.Count),
            FontWeight = FontWeights.Bold
        };
        panel.Children.Add(header);
        panel.Children.Add(tree);

        item.Content = panel;
        listResults.Items.Add(item);
    }

    tabControl1.SelectedItem = tabSearchResults;
}
/// <summary>
/// Lazily yields the text of every link whose <c>href</c> starts with the Feedbooks item URL prefix.
/// </summary>
/// <param name="container">Node to search.</param>
/// <returns>The inner text of each matching link, in document order.</returns>
private static IEnumerable<string> ExtractTitles(HtmlNode container)
{
    // Example: <a href="http://fr.feedbooks.com/item/316137/les-les-de-l-espace" itemprop="url">Les Îles de l'espace</a>
    return container.Descendants("a")
        .Where(link => link.GetAttributeValue("href", "").StartsWith("http://fr.feedbooks.com/item/"))
        .Select(link => link.InnerText);
}
/// <summary>
/// Builds a <see cref="VideoInfo"/> for every <c>article</c> below <paramref name="node"/>: URL from
/// the first link (resolved against http://www.oppetarkiv.se/ when relative), title, thumbnail and
/// air date.
/// </summary>
/// <param name="node">Node containing the <c>article</c> elements.</param>
/// <returns>The videos found; articles without a usable link are skipped.</returns>
private List<VideoInfo> GetOppetArkivVideoList(HtmlAgilityPack.HtmlNode node)
{
    List<VideoInfo> videoList = new List<VideoInfo>();

    foreach (var article in node.Descendants("article"))
    {
        VideoInfo video = new VideoInfo();
        video.VideoUrl = article.Descendants("a").Select(a => a.GetAttributeValue("href", "")).FirstOrDefault();

        Uri result;
        if (!Uri.TryCreate(video.VideoUrl, UriKind.Absolute, out result))
        {
            Uri.TryCreate(new Uri("http://www.oppetarkiv.se/"), video.VideoUrl, out result);
        }

        // Both attempts can fail (for instance when the article has no link at all), leaving result
        // null; calling ToString() on it used to abort the whole listing.
        if (result == null)
        {
            continue;
        }

        video.VideoUrl = result.ToString();
        if (string.IsNullOrEmpty(video.VideoUrl))
        {
            continue;
        }

        string rawTitle = article.Descendants("a").Select(a => a.GetAttributeValue("title", "")).FirstOrDefault() ?? "";
        video.Title = HttpUtility.HtmlDecode(rawTitle.Trim().Replace('\n', ' '));

        var thumbNode = article.SelectSingleNode(".//noscript/img");
        video.Thumb = thumbNode != null ? thumbNode.GetAttributeValue("src", "") : "";
        if (video.Thumb.StartsWith("//"))
        {
            // Protocol-relative URL: give it an explicit scheme.
            video.Thumb = "http:" + video.Thumb;
        }

        video.Airdate = article.Descendants("time").Select(t => t.GetAttributeValue("datetime", "")).FirstOrDefault();
        if (!string.IsNullOrEmpty(video.Airdate))
        {
            video.Airdate = DateTime.Parse(video.Airdate).ToString("d", OnlineVideoSettings.Instance.Locale);
        }

        videoList.Add(video);
    }

    return videoList;
}
/// <summary>
/// Collects club group links from every <c>a</c> below <paramref name="RootNode"/>.
/// </summary>
/// <param name="RootNode">Node to scan for links.</param>
/// <param name="loadedEventArgs">Receives the exception in <c>Error</c> when loading fails.</param>
/// <returns>
/// The value of <c>IsLoaded</c>: set to <c>true</c> once at least one link loads, and to <c>false</c>
/// when an exception occurs.
/// </returns>
public bool LoadFromHtml(HtmlNode RootNode, DataLoadedEventArgs loadedEventArgs)
{
    try
    {
        foreach (HtmlNode anchor in RootNode.Descendants("a"))
        {
            MitbbsClubGroupLink candidate = new MitbbsClubGroupLink();
            candidate.ParentUrl = Url;

            if (!candidate.LoadFromHtml(anchor))
            {
                continue;
            }

            ClubGroupLinks.Add(candidate);
            IsLoaded = true;
        }
    }
    catch (Exception error)
    {
        IsLoaded = false;
        loadedEventArgs.Error = error;
    }

    return IsLoaded;
}
/// <summary>
/// Builds a rap sheet entry from a table row with (at least) six cells: punishment type, date,
/// punished user, reason, requester and approver.
/// </summary>
/// <param name="rapSheetNode">
/// Row node. Each user cell must contain a link whose <c>href</c> holds the user id as its fourth
/// <c>=</c>-separated segment; nothing is null-checked.
/// </param>
/// <returns>The populated entry.</returns>
public static ForumUserRapSheetEntity FromRapSheet(HtmlNode rapSheetNode)
{
    var entry = new ForumUserRapSheetEntity();
    List<HtmlNode> cells = rapSheetNode.Descendants("td").ToList();

    entry.PunishmentType = cells[0].Descendants("b").FirstOrDefault().InnerText;
    entry.Date = cells[1].InnerText;

    HtmlNode punishedLink = cells[2].Descendants("a").FirstOrDefault();
    entry.HorribleJerk = punishedLink.InnerText;
    entry.HorribleJerkId = Convert.ToInt64(
        punishedLink.GetAttributeValue("href", string.Empty).Split('=')[3]);

    entry.PunishmentReason = cells[3].InnerText;

    HtmlNode requesterLink = cells[4].Descendants("a").FirstOrDefault();
    entry.RequestedBy = requesterLink.InnerText;
    entry.RequestedById = Convert.ToInt64(
        requesterLink.GetAttributeValue("href", string.Empty).Split('=')[3]);

    HtmlNode approverLink = cells[5].Descendants("a").FirstOrDefault();
    entry.ApprovedBy = approverLink.InnerText;
    entry.ApprovedById = Convert.ToInt64(
        approverLink.GetAttributeValue("href", string.Empty).Split('=')[3]);

    return entry;
}
/// <summary>
/// Sets <c>IsBookmarked</c> from the thread's bookmark button: the thread is bookmarked when the
/// button's class contains "unbookmark". The flag is left untouched when no button is found.
/// </summary>
/// <param name="page">Metadata to update.</param>
/// <param name="node">Node to search for the bookmark button.</param>
/// <returns>The same <paramref name="page"/> instance.</returns>
private static ThreadPageMetadata ParseIsThreadBookmarked(this ThreadPageMetadata page, HtmlNode node)
{
    // <img src="http://fi.somethingawful.com/images/buttons/button-unbookmark.png"
    //      alt="Unbookmark"
    //      class="thread_bookmark unbookmark"
    //      title="Unbookmark thread">
    var bookmarkButton = node.Descendants("img")
        .FirstOrDefault(img => img.GetAttributeValue("class", string.Empty).Contains("thread_bookmark"));

    if (bookmarkButton == null)
    {
        return page;
    }

    string cssClass = bookmarkButton.GetAttributeValue("class", string.Empty);
    page.IsBookmarked = cssClass.ToLower().Contains("unbookmark");
    return page;
}
/// <summary>
/// Returns the single descendant with the given tag name whose whitespace-separated <c>class</c>
/// attribute contains <paramref name="className"/>.
/// NOTE: only matches ONE class name; matching several would need another method (TODO).
/// </summary>
/// <param name="tagName">Element name to match.</param>
/// <param name="className">Class name that must appear in the element's class list.</param>
/// <param name="node">Node whose descendants are searched.</param>
/// <returns>The matching node, or <c>null</c> when none matches; throws when several match.</returns>
public HtmlNode GetSingleNodeByClassName(string tagName, string className, HtmlNode node)
{
    //TODO: fire a warning if any single methods would have produced more than one result, could mean the layout has changed
    return node.Descendants().SingleOrDefault(candidate =>
        candidate.Name == tagName
        && candidate.Attributes.Contains("class")
        && candidate.Attributes["class"].Value.Split().Contains(className));
}
/// <summary>
/// Finds the first descendant element with the given name that has an attribute with the given name
/// whose lower-cased value contains the lower-cased <paramref name="attributeValue"/>.
/// </summary>
/// <param name="root">Node whose descendants are searched.</param>
/// <param name="elementName">Element name (lower-cased before comparing).</param>
/// <param name="attributeName">Attribute name (lower-cased before comparing).</param>
/// <param name="attributeValue">Text the attribute value must contain (case-insensitive).</param>
/// <returns>The first matching element, or <c>null</c> (after logging to the console) when none matches.</returns>
public HtmlNode GetElementWithAttribute(HtmlNode root, string elementName, string attributeName, string attributeValue)
{
    attributeName = attributeName.ToLower();
    attributeValue = attributeValue.ToLower();
    elementName = elementName.ToLower();

    foreach (var candidate in root.Descendants())
    {
        if (candidate.Name != elementName)
        {
            continue;
        }

        foreach (var attribute in candidate.Attributes)
        {
            if (attribute.Name == attributeName && attribute.Value.ToLower().Contains(attributeValue))
            {
                return candidate;
            }
        }
    }

    Console.WriteLine("{0} for {1}={2} is null", elementName, attributeName, attributeValue);
    return null;
}
/// <summary>
/// Lazily yields the text of every link with class <c>gray</c> whose <c>href</c> starts with the
/// contributor search path.
/// </summary>
/// <param name="container">Node to search.</param>
/// <returns>The inner text of each matching link, in document order.</returns>
private static IEnumerable<string> ExtractAuthors(HtmlNode container)
{
    // Example: <a href="/store/top?contributor=Isaac+Marion&lang=fr" class="gray" title="Isaac Marion">Isaac Marion</a>
    return container.Descendants("a")
        .Where(link => link.GetAttributeValue("class", "") == "gray"
                       && link.GetAttributeValue("href", "").StartsWith("/store/top?contributor="))
        .Select(link => link.InnerText);
}
/// <summary>
/// Lazily yields the cleaned value of each cell in a row: <c>th</c> cells for row 0, <c>td</c> cells
/// for every other row.
/// </summary>
/// <param name="htmlRow">Row node.</param>
/// <param name="rowIndex">Zero-based row index; 0 selects header cells.</param>
/// <returns>The cell values passed through <c>CleanColumnValue</c>.</returns>
private static IEnumerable<string> ParseHtmlColumns(HtmlNode htmlRow, int rowIndex)
{
    string cellTag;
    if (rowIndex == 0)
    {
        cellTag = "th";
    }
    else
    {
        cellTag = "td";
    }

    return htmlRow.Descendants(cellTag).Select(CleanColumnValue);
}
/// <summary>
/// Creates an issue from the cover cell of an issues table and the cell that follows it.
/// </summary>
/// <param name="year">Value stored as <c>Year</c>.</param>
/// <param name="tableCellCover">
/// Cover cell: its first <c>img</c> provides the cover image URL, its <c>a</c> child the issue URL,
/// and its next sibling's <c>strong</c> child the title.
/// </param>
public MsdnIssue(int year, HtmlNode tableCellCover)
{
    HtmlNode infoCell = tableCellCover.NextSibling;

    Year = year;
    Title = infoCell.Element("strong").InnerText;

    string coverSource = tableCellCover.Descendants("img").First().GetAttributeValue("src", "");
    CoverImageUrl = new Uri(coverSource);

    string issueHref = tableCellCover.Element("a").GetAttributeValue("href", "");
    _url = new Uri(issueHref);
}
/// <summary>
/// Lazily yields the player names found in <c>li/span[@class='player-name']</c> under every
/// <c>ul</c> of the item's single <c>div</c>.
/// </summary>
/// <param name="item">Node that must contain exactly one <c>div</c> descendant.</param>
/// <returns>The inner text of each player-name span; empty when no list has any.</returns>
private IEnumerable<string> ParseExpectedTeam(HtmlNode item)
{
    var playerNodes = item
        .Descendants("div").Single()
        .Descendants("ul")
        // SelectNodes yields null rather than an empty collection when nothing matches, which made
        // SelectMany throw for any list without player-name spans.
        .SelectMany(list => (IEnumerable<HtmlNode>)list.SelectNodes("li/span[@class='player-name']")
                            ?? Enumerable.Empty<HtmlNode>());

    return playerNodes.Select(node => node.InnerText);
}
/// <summary>
/// Returns the single descendant with the given tag whose named attribute exists and contains
/// <paramref name="value"/>.
/// </summary>
/// <param name="html">Node whose descendants are searched.</param>
/// <param name="tag">Element name to match.</param>
/// <param name="attribute">Attribute that must be present.</param>
/// <param name="value">Text the attribute value must contain.</param>
/// <returns>The matching node, or <c>null</c> when none matches; throws when several match.</returns>
public static HtmlNode GetSingleNodeTag(HtmlNode html, string tag, string attribute, string value)
{
    return html.Descendants(tag)
        .Where(candidate => candidate.Attributes.Contains(attribute)
                            && candidate.Attributes[attribute].Value.Contains(value))
        .SingleOrDefault();
}
/// <summary>
/// Locates the first form named <c>postform</c> or <c>form1</c> and captures its submit URL, form
/// elements, verification image and the known field values. Does nothing when no such form exists.
/// </summary>
/// <param name="documentNode">Document node to search.</param>
protected override void ExtractForm(HtmlNode documentNode)
{
    HtmlNode postFormNode = documentNode.Descendants("form")
        .Where(form => form.Attributes.Contains("name")
                       && (form.Attributes["name"].Value == "postform"
                           || form.Attributes["name"].Value == "form1"))
        .FirstOrDefault();

    if (postFormNode == null)
    {
        return;
    }

    HtmlNode postFormParent = postFormNode.ParentNode;
    SendPageUrl = HtmlUtilities.GetAbsoluteUrl(EditPageUrl, postFormNode.Attributes["action"].Value);
    _web.FormElements = new FormElementCollection(postFormParent);

    // Only the first image next to the form is used as the verification image.
    HtmlNode firstImage = postFormParent.Descendants("img").FirstOrDefault();
    if (firstImage != null)
    {
        VerifyImageUrl = HtmlUtilities.GetAbsoluteUrl(EditPageUrl, firstImage.Attributes["src"].Value);
    }

    if (_web.FormElements.ContainsKey("title"))
    {
        PostTitle = _web.FormElements["title"];
    }

    if (_web.FormElements.ContainsKey("text"))
    {
        PostBody = _web.FormElements["text"];
    }

    if (_web.FormElements.ContainsKey("validcode"))
    {
        VerifyCode = _web.FormElements["validcode"];
    }

    if (!_web.FormElements.ContainsKey("userid"))
    {
        Recipient = null;
    }
    else
    {
        Recipient = _web.FormElements["userid"];
    }

    if (!_web.FormElements.ContainsKey("attachname"))
    {
        UploadFileUrl = null;
    }
    else
    {
        UploadFileUrl = HtmlUtilities.GetAbsoluteUrl(EditPageUrl, "bbsupload.php");
    }

    IsEditPageLoaded = true;
}
/// <summary>
/// Downloads the post referenced by <c>clip</c>, analyses its links (duplicates, duplicate and
/// missing source numbers, broken DeviantArt links, proposed alternatives) and reports the outcome
/// through <c>ReportProgress</c> as a <c>ClipResult</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
private void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)
{
    if (!clip.Contains("equestriadaily.com"))
    {
        backgroundWorker1.ReportProgress(100, new ClipResult() { status = "error", message = "No valid Equestria Daily link found on clipboard" });
        return;
    }

    backgroundWorker1.ReportProgress(10);
    HtmlWeb website = new HtmlWeb();
    HtmlAgilityPack.HtmlDocument document = website.Load(clip);
    backgroundWorker1.ReportProgress(25);

    HtmlAgilityPack.HtmlNode titleNode = document.DocumentNode.SelectSingleNode("//*[@id=\"Blog1\"]/div[1]/ul/li/div[2]/div[1]/h3/a");
    HtmlAgilityPack.HtmlNode node = document.DocumentNode.SelectSingleNode("//*[@id=\"Blog1\"]/div[1]/ul/li/div[2]/div[2]");

    // Either lookup yields null when the page layout differs; report that instead of crashing.
    if (titleNode == null || node == null)
    {
        backgroundWorker1.ReportProgress(100, new ClipResult() { status = "error", message = "Could not find the post title or body on the page" });
        return;
    }

    // Materialise once: the query below was re-run (constructing every ParsedLink again) by each of
    // the analyses and again by whoever consumed the reported result.
    IEnumerable<ParsedLink> parsedLinks = node.Descendants("a")
        .Select(l => new ParsedLink(l))
        .Where(v => v.Valid)
        .ToList();
    backgroundWorker1.ReportProgress(50);

    IEnumerable<string> linkDupes = parsedLinks
        .GroupBy(l => l.Url)
        .Where(g => g.Count() > 1)
        .Select(ig => string.Join(" and ", ig.Select(v => v.InnerText)))
        .ToList();

    IEnumerable<int> numeratedSources = parsedLinks.Select(l => l.Source).ToList();
    IEnumerable<int> sourceDupes = numeratedSources
        .GroupBy(l => l)
        .Where(g => g.Count() > 1)
        .Select(d => d.Key)
        .Where(n => n != -1)
        .ToList();
    backgroundWorker1.ReportProgress(75);

    // Max() throws on an empty sequence and Range() throws on a negative count (every link
    // unnumbered gives -1), so clamp to zero in both cases.
    int max = numeratedSources.Any() ? numeratedSources.Max() : 0;
    if (max < 0)
    {
        max = 0;
    }

    IEnumerable<int> sourceMissing = Enumerable.Range(1, max).Except(numeratedSources).ToList();
    IEnumerable<ParsedLink> brokenDAlinks = parsedLinks.Where(l => l.DeviantArt && !l.JValid).ToList();

    // string.Join yields an empty string when there are no links, where Aggregate threw.
    string jLinks = string.Join(
        System.Environment.NewLine,
        parsedLinks.Where(l => l.JValid).Select(j => j.Url).SelectMany(l => l));

    backgroundWorker1.ReportProgress(100, new ClipResult()
    {
        status = "success",
        ParsedLinks = parsedLinks,
        LinkDupes = linkDupes,
        SourceDupes = sourceDupes,
        SourceMissing = sourceMissing,
        JLinksCount = parsedLinks.Where(l => l.JValid).Count(),
        JLinks = jLinks,
        Title = titleNode.InnerText,
        BrokenDeviantArtLinks = brokenDAlinks,
        AlternativeLinks = parsedLinks.Where(l => l.AlternativeProposed).ToList()
    });
}
/// <summary>
/// Writes the XAML for a top-level HTML node into <paramref name="xamlString"/>, then continues with
/// the node that <c>processNode</c> hands back, the first image below the node and, unless
/// <paramref name="isTop"/> is set, the node's next sibling.
/// </summary>
/// <param name="xamlString">Builder receiving the generated markup.</param>
/// <param name="node">Node to process.</param>
/// <param name="isTop">When <c>true</c>, the node's next sibling is not processed.</param>
private static void processTopNode(StringBuilder xamlString, HtmlNode node, bool isTop = false)
{
    HtmlNode nextNode = null;

    // Nodes without any visible text skip straight to the image and sibling handling below.
    if (!string.IsNullOrWhiteSpace(node.InnerText))
    {
        // When the first child is itself a top-level node, delegate to it entirely.
        if (testTop(node.FirstChild))
        {
            processTopNode(xamlString, node.FirstChild);
            return;
        }

        // blockquote, ul and p are written as-is; anything else is wrapped in the "p" tag.
        if (node.Name.Equals("blockquote", StringComparison.CurrentCultureIgnoreCase) ||
            node.Name.Equals("ul", StringComparison.CurrentCultureIgnoreCase) ||
            node.Name.Equals("p", StringComparison.CurrentCultureIgnoreCase))
        {
            nextNode = processNode(xamlString, node, true);
        }
        else
        {
            writeBeginningTag(xamlString, tags["p"]);
            nextNode = processNode(xamlString, node, true);
            writeEndTag(xamlString, tags["p"]);
        }
    }

    // processNode may return a node at which processing should resume.
    if (nextNode != null)
    {
        processTopNode(xamlString, nextNode);
    }

    // The first image below the node is emitted in its own "p" block.
    var imgNode = node.Descendants("img").FirstOrDefault();
    if (imgNode != null && imgNode.Name.Equals("img", StringComparison.CurrentCultureIgnoreCase))
    {
        writeBeginningTag(xamlString, tags["p"]);
        parseImage(xamlString, imgNode);
        writeEndTag(xamlString, tags["p"]);
    }

    // Continue with the next sibling: recurse when it is a top-level node, otherwise wrap it in "p".
    if (!isTop && node.NextSibling != null)
    {
        if (testTop(node.NextSibling))
            processTopNode(xamlString, node.NextSibling);
        else
        {
            writeBeginningTag(xamlString, tags["p"]);
            nextNode = processNode(xamlString, node.NextSibling);
            writeEndTag(xamlString, tags["p"]);
            if (nextNode != null)
                processTopNode(xamlString, nextNode);
        }
    }

    //if (node.Name.Equals("img", StringComparison.CurrentCultureIgnoreCase))
    //{
    //    writeBeginningTag(xamlString, tags["p"]);
    //    parseImage(xamlString, node);
    //    writeEndTag(xamlString, tags["p"]);
    //}
}
/// <summary>
/// Returns the text of the first <c>div</c> whose class equals <c>SMILEY_TEXT_ATTRIBUTE_VALUE</c>.
/// </summary>
/// <param name="parent">Node to search.</param>
/// <returns>The div's inner text, or an empty string when no such div exists.</returns>
private string GetText(HtmlNode parent)
{
    var textDiv = parent.Descendants("div")
        .FirstOrDefault(div => div.GetAttributeValue("class", "").Equals(SMILEY_TEXT_ATTRIBUTE_VALUE));

    return textDiv == null ? string.Empty : textDiv.InnerText;
}
/// <summary>
/// Removes every descendant of <paramref name="parentNode"/> whose lower-cased tag name is listed in
/// <c>config.RemoveTags</c>. Does nothing when either argument or the tag list is missing or empty.
/// </summary>
/// <param name="config">Configuration holding the tag names to remove.</param>
/// <param name="parentNode">Node whose descendants are filtered.</param>
private void RemoveUnwantedTags(ConfigSection config, HtmlAgilityPack.HtmlNode parentNode)
{
    if (parentNode == null || config == null || config.RemoveTags == null || config.RemoveTags.Count <= 0)
    {
        return;
    }

    // Snapshot the matches first: removing nodes while walking the live tree is not safe.
    var unwanted = parentNode.Descendants()
        .Where(candidate => config.RemoveTags.Contains(candidate.Name.ToLowerInvariant()))
        .ToList();

    foreach (var tagNode in unwanted)
    {
        tagNode.Remove();
    }
}
/// <summary>
/// Collects every descendant whose <c>class</c> attribute equals <paramref name="className"/> exactly.
/// </summary>
/// <param name="hd">Node whose descendants are searched.</param>
/// <param name="className">Exact class attribute value to match (e.g. "ListTableCell").</param>
/// <returns>The matching nodes, or <c>null</c> when there are none.</returns>
public HtmlAgilityPack.HtmlNode[] fillNode(HtmlAgilityPack.HtmlNode hd, string className)
{
    HtmlAgilityPack.HtmlNode[] matches = hd.Descendants()
        .Where(candidate => candidate.Attributes.Contains("class")
                            && candidate.Attributes["class"].Value == className)
        .ToArray();

    if (matches.Length >= 1)
    {
        return matches;
    }

    return null;
}
/// <summary>
/// Renders a table as comma-separated lines: one line with every <c>th</c> of the table, then one
/// line per <c>tr</c> of each <c>tbody</c>, then one per <c>tr</c> of each <c>tfoot</c>.
/// Values are written verbatim (no quoting or escaping).
/// </summary>
/// <param name="table">Table node; <c>null</c> yields an empty string.</param>
/// <returns>
/// The CSV text. A header or row without any cells is skipped; previously it caused an
/// <see cref="ArgumentOutOfRangeException"/> when the trailing comma was trimmed from an empty line.
/// </returns>
public string GetTableCSVData(HtmlNode table)
{
    if (table == null)
    {
        return String.Empty;
    }

    StringBuilder csvBuilder = new StringBuilder();

    AppendCsvLine(csvBuilder, table, "th");

    foreach (string sectionName in new[] { "tbody", "tfoot" })
    {
        foreach (HtmlNode section in table.Descendants(sectionName))
        {
            foreach (HtmlNode tr in section.Descendants("tr"))
            {
                AppendCsvLine(csvBuilder, tr, "td");
            }
        }
    }

    return csvBuilder.ToString();
}

/// <summary>
/// Appends one comma-separated line holding the text of every <paramref name="cellName"/> descendant
/// of <paramref name="parent"/>; appends nothing when there are no such cells.
/// </summary>
private static void AppendCsvLine(StringBuilder csvBuilder, HtmlNode parent, string cellName)
{
    StringBuilder lineBuilder = new StringBuilder();
    bool hasCells = false;

    foreach (HtmlNode cell in parent.Descendants(cellName))
    {
        if (hasCells)
        {
            lineBuilder.Append(',');
        }

        lineBuilder.Append(cell.InnerText);
        hasCells = true;
    }

    if (hasCells)
    {
        csvBuilder.AppendLine(lineBuilder.ToString());
    }
}
/// <summary>
/// Sets <c>Value</c> to the text of the first <c>div</c> whose class equals
/// <c>SMILEY_TEXT_ATTRIBUTE_VALUE</c>, or to an empty string when no such div exists.
/// </summary>
/// <param name="data">Metadata to update.</param>
/// <param name="parent">Node to search.</param>
/// <returns>The same <paramref name="data"/> instance.</returns>
private static TagMetadata ParseValue(this TagMetadata data, HtmlNode parent)
{
    var textDiv = parent.Descendants("div")
        .FirstOrDefault(div => div.GetAttributeValue("class", "").Equals(SMILEY_TEXT_ATTRIBUTE_VALUE));

    if (textDiv == null)
    {
        data.Value = string.Empty;
    }
    else
    {
        data.Value = textDiv.InnerText;
    }

    return data;
}
/// <summary>
/// Converts every row but the first of the <c>query_result_main</c> block into a
/// <see cref="NovelInfo"/>.
/// </summary>
/// <param name="node">Node containing a <c>div</c> with id <c>query_result_main</c>; the div is
/// dereferenced without a null check.</param>
/// <returns>One entry per <c>tr</c> after the first, in document order.</returns>
public List<NovelInfo> ParseNode(HtmlNode node)
{
    var resultBlock = node.Descendants("div").FirstOrDefault(div => div.HasId("query_result_main"));
    var allRows = resultBlock.Descendants("tr").ToArray();

    // Row 0 is skipped (presumably the header row — confirm against the page).
    return allRows
        .Skip(1)
        .Select(row => GetNovelInfoFromTableRow(row))
        .ToList();
}
/// <summary>
/// Get the game image.
/// </summary>
/// <param name="htmlNode">Html node to look for the information.</param>
/// <returns>
/// The <c>src</c> of the first <c>img</c> whose <c>src</c> contains "playstation.net/trophy", or an
/// empty string when there is none.
/// </returns>
private string GetImage(HtmlNode htmlNode)
{
    // GetAttributeValue tolerates images without a src attribute; indexing Attributes["src"]
    // directly threw as soon as such an image preceded the wanted one.
    var foundNode = htmlNode.Descendants("img")
        .FirstOrDefault(n => n.GetAttributeValue("src", string.Empty).Contains("playstation.net/trophy"));

    if (foundNode == null)
    {
        return string.Empty;
    }

    return foundNode.GetAttributeValue("src", string.Empty);
}
/// <summary>
/// Builds a <see cref="VideoInfo"/> for every <c>section</c> with class <c>tv</c> that has a direct
/// <c>a</c> child: URL, thumbnail, title, length, description and (when <c>EnableAOSearch</c> is set)
/// the keyword links.
/// </summary>
/// <param name="node">Node containing the sections.</param>
/// <returns>The videos found, in document order.</returns>
private List<VideoInfo> VideosForCategory(HtmlAgilityPack.HtmlNode node)
{
    List<VideoInfo> videoList = new List<VideoInfo>();

    var sections = node.Descendants("section").Where(s => s.GetAttributeValue("class", "") == "tv");
    foreach (var section in sections)
    {
        var a = section.SelectSingleNode("a");
        if (a == null)
        {
            continue;
        }

        VideoInfo video = new VideoInfo();
        video.VideoUrl = baseUrl + a.GetAttributeValue("href", "");

        // FirstOrDefault rather than First: First() throws on a link without an image, so the
        // "!= null" test that followed it could never take effect.
        var img = a.Descendants("img").FirstOrDefault();
        if (img != null)
        {
            video.Thumb = img.GetAttributeValue("data-src", "");
            video.Title = img.GetAttributeValue("alt", "");
        }

        var dd = a.Descendants("dd").FirstOrDefault();
        if (dd != null)
        {
            video.Length = dd.InnerText;
        }

        var descP = section.Descendants("p")
            .FirstOrDefault(p => p.GetAttributeValue("class", "") == "ellipsis-lastline");
        if (descP != null)
        {
            video.Description = descP.InnerText;
        }

        if (EnableAOSearch)
        {
            SerializableDictionary<string, string> other = new SerializableDictionary<string, string>();
            var ul = section.SelectSingleNode("div[@class = 'details']/ul[@class = 'keywords']");
            if (ul != null)
            {
                foreach (HtmlNode keyA in ul.Descendants("a"))
                {
                    other.Add(keyA.GetAttributeValue("data-keyword", ""), keyA.GetAttributeValue("href", ""));
                }
            }

            video.Other = other;
        }

        videoList.Add(video);
    }

    return videoList;
}
/// <summary>
/// Parses the page, reads the total page count into <c>totalPages</c> and reports whether it is
/// positive. Any failure resets <c>totalPages</c> to 0 and yields <c>false</c>.
/// </summary>
/// <param name="documentText">HTML of the page.</param>
/// <returns><c>true</c> when a page count greater than zero was read.</returns>
Boolean PageHasData(string documentText)
{
    try
    {
        HtmlAgilityPack.HtmlDocument htmlDocument = new HtmlAgilityPack.HtmlDocument();
        htmlDocument.LoadHtml(documentText);

        HtmlAgilityPack.HtmlNode mainBlock = htmlDocument.DocumentNode.SelectSingleNode("//*[@id='regmain']");
        HtmlAgilityPack.HtmlNode pagerBlock = mainBlock.SelectSingleNode("//*[@id='regPage']/div/div/div");

        // The page count is read from the fifth descendant from the end of the pager block
        // (presumably the link following the "of" label — confirm against the page markup).
        var pagerNodes = pagerBlock.Descendants().ToArray();
        totalPages = Int32.Parse(pagerNodes[pagerNodes.Length - 5].InnerText);

        return totalPages > 0;
    }
    catch (Exception)
    {
        totalPages = 0;
        return false;
    }
}
/// <summary>
/// Builds one <see cref="Section"/> per <c>div</c> with class <c>program__cell</c>: the name comes
/// from the cell's first nested <c>div</c>, the area name from its last.
/// </summary>
/// <param name="sectionRow">Row node containing the section cells.</param>
/// <returns>
/// A result wrapping the sections, or an error result carrying the exception message when parsing fails.
/// </returns>
public static Result<List<Section>> ParseSections(HtmlAgilityPack.HtmlNode sectionRow)
{
    try
    {
        var parsedSections = new List<Section>();

        foreach (var cell in sectionRow.Descendants("div"))
        {
            var classAttribute = cell.Attributes["class"];
            if (classAttribute == null || classAttribute.Value != "program__cell")
            {
                continue;
            }

            var firstDivHtml = cell.Descendants("div").FirstOrDefault().InnerHtml;
            var lastDivHtml = cell.Descendants("div").LastOrDefault().InnerHtml;
            parsedSections.Add(new Section { Name = firstDivHtml, AreaName = lastDivHtml });
        }

        return new Result<List<Section>>(parsedSections);
    }
    catch (Exception ex)
    {
        return new Result<List<Section>>(null, true, ex.Message);
    }
}
/// <summary>
/// Lazily yields the descendants of <paramref name="parent"/> whose node type is <c>Element</c>.
/// </summary>
/// <param name="parent">Node whose descendants are enumerated.</param>
/// <returns>The element descendants, in the order <c>Descendants()</c> produces them.</returns>
public static IEnumerable<HAP.HtmlNode> DescendantElements(this HAP.HtmlNode parent)
{
    return from descendant in parent.Descendants()
           where descendant.NodeType == HAP.HtmlNodeType.Element
           select descendant;
}
/// <summary>
/// Downloads a product page and extracts cover URL, title, price and description into a
/// <see cref="BookModels"/>. Every field is optional: whatever is missing from the page is left unset.
/// </summary>
/// <param name="url">Page URL; also stored as <c>AmazonUrl</c>.</param>
/// <returns>The book with the fields that could be found.</returns>
public static BookModels GetBookDetails(string url)
{
    var book = new BookModels();
    book.AmazonUrl = url;

    var webGet = new HtmlWeb();
    var htmlDoc = webGet.Load(url);

    // NOTE(review): set after Load, so presumably has no effect on the already-parsed document — confirm.
    htmlDoc.OptionFixNestedTags = true;

    if (htmlDoc.DocumentNode == null)
    {
        return book;
    }

    HtmlAgilityPack.HtmlNode bodyNode = htmlDoc.DocumentNode.SelectSingleNode("//body");
    if (bodyNode == null)
    {
        return book;
    }

    var image = bodyNode.SelectSingleNode("//img[@id='main-image']")
                ?? bodyNode.SelectSingleNode("//img[@id='imgBlkFront']");
    if (image != null)
    {
        string source = image.GetAttributeValue("src", string.Empty);
        if (!string.IsNullOrEmpty(source))
        {
            // Drop the size/variant suffix that starts at "._"; keep the URL unchanged when it has
            // none (Substring with index -1 used to throw here).
            int suffixStart = source.IndexOf("._");
            book.CoverUrl = suffixStart >= 0 ? source.Substring(0, suffixStart) + ".jpg" : source;
        }
    }

    book.Title = bodyNode.Descendants("span")
        .Where(x => x.Id == "btAsinTitle")
        .Select(s => s.InnerText)
        .FirstOrDefault();
    if (string.IsNullOrEmpty(book.Title))
    {
        book.Title = bodyNode.Descendants("h1")
            .Where(x => x.Id == "title")
            .Select(s => s.InnerText)
            .FirstOrDefault();
    }

    var price = bodyNode.SelectSingleNode("//b[@class='priceLarge']");
    if (price != null)
    {
        string priceText = price.InnerText.Trim().Replace("$", string.Empty).Replace("\n", string.Empty);

        // The page prints prices with '.' as decimal separator; parse with the invariant culture so
        // the result does not depend on the machine's regional settings, and skip unparsable text.
        decimal parsedPrice;
        if (decimal.TryParse(
                priceText,
                System.Globalization.NumberStyles.Number,
                System.Globalization.CultureInfo.InvariantCulture,
                out parsedPrice))
        {
            book.Price = parsedPrice;
        }
    }

    // The description block is optional like everything else on the page.
    var descriptionNode = bodyNode.SelectSingleNode("//div[@id='postBodyPS']");
    if (descriptionNode != null)
    {
        book.Description = descriptionNode.InnerText;
    }

    return book;
}
/// <summary>
/// Parses one programme cell into a <c>Speach</c>.
/// </summary>
/// <remarks>
/// Three shapes of cell are recognised:
/// a <c>program__message</c> div with empty text (placeholder with the thesis "Нет спикера"),
/// a <c>program__message</c> div with text (the text becomes the thesis), and
/// a regular talk with a link, speakers and a <c>program__topic</c> div.
/// </remarks>
/// <param name="speackerCell">Cell node to parse.</param>
/// <param name="sectionId">Not used by this method.</param>
/// <param name="sections">Section whose <c>AreaName</c> and <c>Name</c> are copied to the result.</param>
/// <param name="dateTime">Stored as <c>SpeachStartTime</c>.</param>
/// <param name="baseUrl">Prefix prepended to the talk link and to each speaker image path.</param>
/// <returns>
/// A result wrapping the parsed <c>Speach</c>. When any expected node or attribute of a regular
/// talk is missing (including the link), a result built from <c>(null, true, ex.Message)</c>
/// is returned.
/// </returns>
public static Result<Speach> ParseSpeache(HtmlAgilityPack.HtmlNode speackerCell, int sectionId, Section sections, DateTime dateTime, string baseUrl)
{
    const string messageClass = "program__message";

    var messageDivs = speackerCell.Descendants("div")
        .Where(x => ClassContains(x, messageClass))
        .ToList();

    // A message div without text marks a slot that has no speaker.
    if (messageDivs.Any(x => string.IsNullOrEmpty(x.InnerText)))
    {
        return new Result<Speach>(new Speach
        {
            AreaName = sections.AreaName,
            AreaType = sections.Name,
            SpeachTesis = "Нет спикера",
            SpeachStartTime = dateTime
        });
    }

    // Past the check above every remaining message div has text; the first one is the thesis.
    var messageDiv = messageDivs.FirstOrDefault();
    if (messageDiv != null)
    {
        return new Result<Speach>(new Speach
        {
            AreaName = sections.AreaName,
            AreaType = sections.Name,
            SpeachTesis = messageDiv.InnerText,
            SpeachStartTime = dateTime
        });
    }

    var speach = new Speach();
    try
    {
        // Talk link. Kept inside the try so a missing anchor is reported through the
        // error Result like every other missing node, instead of escaping as an exception.
        var lectionRef = speackerCell.Descendants("a").FirstOrDefault(x => x.Attributes["href"] != null);
        speach.SpeachUrl = baseUrl + lectionRef.Attributes["href"].Value;

        // Materialise once: the lazy query would otherwise be re-run for every Count/ElementAt.
        var speakerNodes = speackerCell.Descendants("div")
            .Where(x => x.Attributes["class"] != null && x.Attributes["class"].Value == "program__speaker")
            .ToList();

        var speakersArray = new Speaker[speakerNodes.Count];
        for (int i = 0; i < speakerNodes.Count; i++)
        {
            var speakerNode = speakerNodes[i];
            var speaker = new Speaker();

            speaker.Name = FirstDivWithClass(speakerNode, "program__speaker-name").InnerText.Replace("\n ", "");

            // The image path is embedded in the inline style: background-image: url('...').
            var icon = FirstDivWithClass(speakerNode, "program__speaker-image").Attributes["style"].Value;
            icon = icon.Replace("background-image: url('", "");
            speaker.FaceImageSource = baseUrl + icon.Replace("')", "");

            speaker.Company = FirstDivWithClass(speakerNode, "program__speaker-company").InnerText.Replace("\n ", "");

            speakersArray[i] = speaker;
        }

        speach.SpeachTesis = FirstDivWithClass(speackerCell, "program__topic").InnerText.Replace("\n ", "");
        speach.Speakers = speakersArray;
        speach.AreaName = sections.AreaName;
        speach.AreaType = sections.Name;
        speach.SpeachStartTime = dateTime;
    }
    catch (Exception ex)
    {
        return new Result<Speach>(null, true, ex.Message);
    }

    return new Result<Speach>(speach);
}

// Returns true when the node has a class attribute whose value contains the given fragment.
private static bool ClassContains(HtmlAgilityPack.HtmlNode node, string fragment)
{
    var classAttribute = node.Attributes["class"];
    return classAttribute != null && classAttribute.Value.Contains(fragment);
}

// Returns the first descendant div whose class attribute contains the fragment, or null.
private static HtmlAgilityPack.HtmlNode FirstDivWithClass(HtmlAgilityPack.HtmlNode parent, string fragment)
{
    return parent.Descendants("div").FirstOrDefault(x => ClassContains(x, fragment));
}
// Reads the scraped HTML files found in dataFilePath, each containing projected points for
// players in the league, and writes a tab-separated text file with one line per player:
// week, player name, player id, projected points, page index and position.
//
// The scraper does not always download every page, so the number of player lines per week
// is tallied afterwards. When the non-empty weeks do not all have the same count, an alert
// email listing the per-week counts is sent. Failures on a single table row are written to
// the error log and skipped; any other failure is logged and emailed.
private static void Main(string[] args)
{
    const int trackedWeeks = 13;

    using (StreamWriter errorLogger = new StreamWriter(
               "C:\\Users\\SomeFolder\\Logs\\" + DateTime.Now.ToString("MM_dd_yy_hh_mm_ss") + ".txt", false))
    {
        try
        {
            string[] files = Directory.GetFiles(dataFilePath);
            HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
            htmlDoc.OptionFixNestedTags = true;

            // results keeps the output order; seenLines makes duplicate detection O(1).
            List<string> results = new List<string>();
            HashSet<string> seenLines = new HashSet<string>();

            // Process each file; order does not matter.
            foreach (string fileName in files)
            {
                // Week and page index are encoded in the file name: the week sits between the
                // first two underscores, the page index between the last underscore and the
                // first period. Only the file name is inspected so that underscores or periods
                // in the directory part of the path cannot shift the indices.
                string baseName = Path.GetFileName(fileName);
                int firstUnderscore = baseName.IndexOf('_');
                int secondUnderscore = baseName.IndexOf('_', firstUnderscore + 1);
                string week = baseName.Substring(firstUnderscore + 1, secondUnderscore - firstUnderscore - 1);
                int lastUnderscore = baseName.LastIndexOf('_');
                int firstPeriod = baseName.IndexOf('.');
                string playerPage = baseName.Substring(lastUnderscore + 1, firstPeriod - lastUnderscore - 1);

                htmlDoc.Load(fileName);
                if (htmlDoc.DocumentNode == null)
                {
                    continue;
                }

                HtmlAgilityPack.HtmlNode bodyNode = htmlDoc.DocumentNode.SelectSingleNode("//body");
                if (bodyNode == null)
                {
                    continue;
                }

                foreach (HtmlNode playerTable in bodyNode.Descendants("tbody").ToList())
                {
                    foreach (HtmlNode playerRow in playerTable.Descendants("tr").ToList())
                    {
                        try
                        {
                            string line = FormatPlayerRow(playerRow, week, playerPage);
                            if (seenLines.Add(line))
                            {
                                results.Add(line);
                            }
                        }
                        catch (Exception e)
                        {
                            // Rows that do not have the expected player layout are logged and skipped.
                            errorLogger.WriteLine(e.Message + "\r\n" + e.StackTrace);
                        }
                    }
                }
            }

            // Labels "1".."13" are matched exactly against the week text of each line.
            string[] weekLabels = new string[trackedWeeks];
            for (int w = 0; w < trackedWeeks; w++)
            {
                weekLabels[w] = (w + 1).ToString(System.Globalization.CultureInfo.InvariantCulture);
            }
            int[] weekCounts = new int[trackedWeeks];

            using (StreamWriter sw = new StreamWriter(
                       "C:\\Users\\SomeFolder\\Data\\" + DateTime.Now.ToString("MM_dd_yy") + ".txt", false))
            {
                // Six columns, matching the six fields of every data line.
                sw.WriteLine("Week\tPlayerName\tPlayerId\tPoints\tPageIndex\tPosition");
                foreach (string line in results)
                {
                    string lineWeek = line.Split('\t')[0];
                    int weekIndex = Array.IndexOf(weekLabels, lineWeek);
                    if (weekIndex >= 0)
                    {
                        weekCounts[weekIndex]++;
                    }
                    sw.WriteLine(line);
                }
            }

            for (int w = 0; w < trackedWeeks; w++)
            {
                Console.WriteLine("Week {0} player count: {1}", w + 1, weekCounts[w]);
            }

            // Ignore weeks with no player rows; the remaining weeks should all have the same count.
            List<int> uniqueCounts = weekCounts.Where(count => count != 0).Distinct().ToList();

            // Send an alert email showing the per-week counts when they disagree.
            if (uniqueCounts.Count != 1)
            {
                string breakdown = string.Join(
                    string.Empty,
                    weekCounts.Select((count, index) => string.Format("{0}: {1}\r\n", index + 1, count)).ToArray());
                Emailer.SendEmail("Warning: incomplete weekly scoring data detected",
                    "Weekly players found:\r\n" + breakdown);
            }
        }
        catch (Exception e)
        {
            errorLogger.WriteLine(e.Message + "\r\n" + e.StackTrace);
            Emailer.SendEmail("Warning Error occurred during HTML Parser", e.Message + "\r\n" + e.StackTrace);
        }
    }
}

// Builds the tab-separated output line (week, name, id, points, page index, position) for one
// table row. Throws when the row lacks the expected cells or sub-nodes; the caller logs that.
private static string FormatPlayerRow(HtmlNode playerRow, string week, string playerPage)
{
    List<HtmlNode> rowCells = playerRow.Descendants("td").ToList();
    HtmlNode playerNameNode = rowCells[1];
    HtmlNode projectedPointsNode = rowCells[5];

    HtmlNode playerIdSubNode = playerNameNode.Descendants("a")
        .FirstOrDefault(a => a.Attributes.Contains("data-ys-playerid"));
    string playerId = playerIdSubNode.Attributes["data-ys-playerid"].Value;

    // GetAttributeValue tolerates elements without a class attribute, so an unrelated
    // class-less span or anchor ahead of the match no longer aborts the whole row.
    HtmlNode playerPosSubNode = playerNameNode.Descendants("span")
        .FirstOrDefault(s => s.GetAttributeValue("class", string.Empty) == "Fz-xxs");
    string positionHtml = playerPosSubNode.InnerHtml;
    // The position is whatever follows the first '-' in the span.
    string playerPos = positionHtml.Substring(positionHtml.IndexOf('-') + 1);

    HtmlNode playerNameSubNode = playerNameNode.Descendants("a")
        .FirstOrDefault(a => a.GetAttributeValue("class", string.Empty) == "Nowrap name F-link");
    string playerName = playerNameSubNode.InnerHtml;
    string projectedPoints = projectedPointsNode.InnerText;

    return string.Format("{3}\t{0}\t{1}\t{2}\t{4}\t{5}",
        playerName, playerId, projectedPoints, week, playerPage, playerPos);
}
/// <summary>
/// Enumerates the descendants of <paramref name="node"/> whose <c>class</c> attribute
/// contains <paramref name="className"/> as one of its whitespace-separated tokens.
/// </summary>
/// <param name="node">Node whose descendants are searched.</param>
/// <param name="className">Class token to look for; compared case-sensitively.</param>
/// <returns>A lazily evaluated sequence of matching nodes.</returns>
public static IEnumerable<HtmlNode> GetElementsByClassName(this HtmlNode node, string className)
{
    // Class tokens in HTML may be separated by any ASCII whitespace (space, tab, LF, CR, FF),
    // not only by U+0020, so split on all of them.
    var separators = new[] { ' ', '\t', '\n', '\r', '\f' };

    return node.Descendants()
        .Where(n => n.GetAttributeValue("class", "").Split(separators).Contains(className));
}
/// <summary>
/// Enumerates the descendants of <paramref name="node"/> whose <c>name</c> attribute equals
/// <paramref name="name"/>. A descendant without a <c>name</c> attribute is treated as
/// having the empty string as its name.
/// </summary>
/// <param name="node">Node whose descendants are searched.</param>
/// <param name="name">Attribute value to match exactly.</param>
/// <returns>A lazily evaluated sequence of matching nodes.</returns>
public static IEnumerable<HtmlNode> GetElementsByName(this HtmlNode node, string name)
{
    return from candidate in node.Descendants()
           where candidate.GetAttributeValue("name", "") == name
           select candidate;
}
/// <summary>
/// Finds the first descendant of <paramref name="node"/> with the given tag name whose
/// <c>id</c> attribute equals <paramref name="id"/>.
/// </summary>
/// <param name="node">Node whose descendants are searched.</param>
/// <param name="tagName">Tag name passed to <c>Descendants</c>.</param>
/// <param name="id">Value the <c>id</c> attribute must equal; a missing attribute counts as "".</param>
/// <returns>The first matching node, or <c>null</c> when there is none.</returns>
public static HtmlNode GetElementById(this HtmlNode node, string tagName, string id)
{
    foreach (var candidate in node.Descendants(tagName))
    {
        if (candidate.GetAttributeValue("id", "") == id)
        {
            return candidate;
        }
    }

    return null;
}
/// <summary>
/// Returns the inner text of a descendant element selected by tag name and position.
/// </summary>
/// <param name="instance">Parent DOM element whose descendants are searched.</param>
/// <param name="tagName">Name of the descendant element to look for.</param>
/// <param name="tagOrder">1-based position of the wanted element among the matching descendants.</param>
/// <returns>The <c>InnerText</c> of the selected element.</returns>
public static string GetTagValue(this ap.HtmlNode instance, string tagName, int tagOrder)
{
    var matches = new List<ap.HtmlNode>();
    foreach (var descendant in instance.Descendants())
    {
        if (descendant.Name == tagName)
        {
            matches.Add(descendant);
        }
    }

    // tagOrder counts from 1, the list from 0.
    var selected = matches[tagOrder - 1];
    return selected.InnerText;
}