public SelectNodes ( string xpath ) : HtmlAgilityPack.HtmlNodeCollection | ||
xpath | string | The XPath expression. |
Returns | HtmlAgilityPack.HtmlNodeCollection |
/// <summary>
/// Parses one result section into a <see cref="EuroMillionsResult"/>.
/// </summary>
/// <param name="section">Node containing the draw-date link and the ball / lucky-star cells.</param>
/// <returns>
/// A result built from the draw date, the literal 0, the main balls and the lucky stars.
/// A missing ball or star group yields an empty list.
/// </returns>
/// <exception cref="FormatException">
/// The date link is missing, or the date / a ball number cannot be parsed.
/// </exception>
private EuroMillionsResult ParseResultSection(HtmlNode section)
{
    var dateNode = section.SelectSingleNode(".//div[@class = 'floatLeft']/a");
    if (dateNode == null)
    {
        throw new FormatException("Result section does not contain a draw date link.");
    }

    // ParseExact does not tolerate surrounding whitespace, so trim the link text first.
    var date = DateTime.ParseExact(dateNode.InnerText.Trim(), "dd/MM/yyyy", CultureInfo.InvariantCulture);

    // SelectNodes returns null (not an empty collection) when nothing matches.
    var ballNodes = section.SelectNodes(".//td[@class = 'euro-ball-s']");
    var starNodes = section.SelectNodes(".//td[@class = 'euro-lucky-star-s']");

    var balls = ballNodes == null
        ? new List<int>()
        : ballNodes.Select(x => Convert.ToInt32(x.InnerText, CultureInfo.InvariantCulture)).ToList();
    var bonusBalls = starNodes == null
        ? new List<int>()
        : starNodes.Select(x => Convert.ToInt32(x.InnerText, CultureInfo.InvariantCulture)).ToList();

    return new EuroMillionsResult(date, 0, balls, bonusBalls);
}
/// <summary>
/// Applies <paramref name="value"/> to a form control node.
/// </summary>
/// <param name="n">A select or input node; any other node (or null) is ignored.</param>
/// <param name="value">Value to select, check or assign.</param>
/// <returns>True when the node was a select or input element, otherwise false.</returns>
public bool SetValue(HtmlNode n, string value)
{
    if (n == null)
    {
        return false;
    }

    if (n.Name == "select")
    {
        // SelectNodes returns null when the select has no option children.
        var options = n.SelectNodes("option");
        if (options != null)
        {
            foreach (HtmlNode o in options)
            {
                // NOTE(review): a non-matching option keeps an empty "selected" attribute rather than
                // having it removed; confirm that whatever consumes the markup treats "" as "not selected".
                o.SetAttributeValue("selected", o.GetAttributeValue("value", "").Equals(value) ? "selected" : "");
            }
        }

        return true;
    }

    if (n.Name == "input")
    {
        switch (n.GetAttributeValue("type", ""))
        {
            case "radio":
                // A radio button keeps its own value; only its checked state changes.
                // (Previously the value was overwritten unconditionally after the switch,
                // which made every radio in a group carry the same value.)
                n.SetAttributeValue("checked", n.GetAttributeValue("value", "").Equals(value) ? "checked" : "");
                break;
            default:
                n.SetAttributeValue("value", value);
                break;
        }

        return true;
    }

    return false;
}
/// <summary>
/// Collects channel items from every article element below <paramref name="node"/>:
/// playable articles first, then folder articles.
/// </summary>
/// <param name="node">Root to search; null yields an empty list.</param>
/// <param name="abroadOnly">Passed through to ParsePlayableArticle.</param>
/// <returns>All non-null items produced by the two article parsers.</returns>
internal static List<ChannelItemInfo> ParseNode(HtmlNode node, bool abroadOnly)
{
    var result = new List<ChannelItemInfo>();
    if (node == null)
    {
        return result;
    }

    // An article is "playable" when its class list mentions either marker; everything else is a folder.
    const string playableTest = "contains(@class, 'playJsInfo-Core') or contains(@class, 'slick_item')";

    var playableNodes = node.SelectNodes(".//article[" + playableTest + "]");
    if (playableNodes != null)
    {
        foreach (var playableNode in playableNodes)
        {
            var item = ParsePlayableArticle(playableNode, abroadOnly);
            if (item != null)
            {
                result.Add(item);
            }
        }
    }

    var folderNodes = node.SelectNodes(".//article[not(" + playableTest + ")]");
    if (folderNodes != null)
    {
        foreach (var folderNode in folderNodes)
        {
            var item = ParseFolderArticle(folderNode);
            if (item != null)
            {
                result.Add(item);
            }
        }
    }

    return result;
}
/// <summary>
/// Posts a search request, parses the returned HTML and writes one "name—count—price"
/// entry per matching result block to the response.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Page_Load(object sender, EventArgs e)
{
    string heads =
        @"Accept: application/json, text/javascript, */* q=0.01 " +
        @"Accept-Encoding: gzip, deflate " +
        @"Accept-Language: zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2 " +
        @"Connection: keep-alive " +
        @"Cookie: s_ViewType=10; _lxsdk_cuid=1729cd29d3dc8-04d80d1c3b31398-4c302c7d-144000-1729cd29d3ec8; _lxsdk=1729cd29d3dc8-04d80d1c3b31398-4c302c7d-144000-1729cd29d3ec8; _hc.v=6c48a318-c117-5df7-478a-f0f694f1570e.1591768948; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1591768950,1591788446; _lxsdk_s=1729dfc18eb-4f6-3ef-94c%7C%7C19; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1591788446 " +
        @"Host: catdot.dianping.com " +
        @"Referer: http:/www.dianping.com/search…/0_%E8%8B%B1%AF%AD%E5%9F%B9%AE " +
        @"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0";
    string url = @"http://www.dianping.com/search/keyword/1/0_%E8%8B%B1%AF%AD%E5%9F%B9%AE";

    ClassHttpRequestClient s = new ClassHttpRequestClient(true);
    string content = "";
    string response = s.httpPost(url, heads, content, Encoding.UTF8);
    if (string.IsNullOrEmpty(response))
    {
        return;
    }

    // The document must be loaded with the response before it can be queried;
    // an unloaded document never matches anything.
    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(response);

    HtmlAgilityPack.HtmlNodeCollection collection = doc.DocumentNode.SelectNodes("//div[@class=\"txt\"]");
    if (collection == null)
    {
        // SelectNodes returns null when nothing matches.
        return;
    }

    StringBuilder sb = new StringBuilder();
    foreach (HtmlAgilityPack.HtmlNode item in collection)
    {
        HtmlAgilityPack.HtmlNode divtit = item.SelectSingleNode("div[@class=\"txt\"]");
        HtmlAgilityPack.HtmlNode divcomment = item.SelectSingleNode("div[@class=\"comment\"]");
        if (divtit == null || divcomment == null)
        {
            continue;
        }

        HtmlAgilityPack.HtmlNode aname = divtit.SelectSingleNode("a[1]");
        HtmlAgilityPack.HtmlNode anum = divcomment.SelectSingleNode("a[1]");
        HtmlAgilityPack.HtmlNode aprice = divcomment.SelectSingleNode("a[2]");
        if (aname == null || anum == null || aprice == null)
        {
            // Skip blocks that do not have the expected three links.
            continue;
        }

        sb.Append(string.Format("{0}—{1}—{2}", aname.InnerText, anum.InnerText, aprice.InnerText));
    }

    Response.Write(sb);
}
/// <summary>
/// Builds a movie list from the nodes matched by <paramref name="xPath"/>.
/// </summary>
/// <param name="htmlNode">Node the XPath expressions are evaluated from.</param>
/// <param name="xPath">XPath selecting one node per movie.</param>
/// <returns>The parsed movies, or null when the XPath matches nothing.</returns>
private List<Movie> ParseMovieListHtml(HtmlNode htmlNode, string xPath)
{
    // The former catch { throw e; } only reset the stack trace, so exceptions now propagate untouched.
    var hnc = htmlNode.SelectNodes(xPath);
    if (hnc == null || hnc.Count < 1)
    {
        // SelectNodes returns null when nothing matches; treat that like an empty result.
        return null;
    }

    // Cover images are matched positionally against the movie nodes.
    var hnc2 = htmlNode.SelectNodes("//div/a/img");
    var movies = new List<Movie>();
    for (int i = 0; i < hnc.Count; i++)
    {
        var node = hnc[i];
        var movie = new Movie();

        // Evaluate the description XPath once per movie instead of once per field.
        var descriptionDivs = node.SelectNodes("div[@class='fm-movie-desc']/div");

        movie.Grade = node.SelectSingleNode("div[@class='fm-movie-desc']/div/span[@class='fm-rating']").InnerText.Replace("\n", "").RemoveSpace().Trim();

        // Only set the image when a cover with at least one attribute exists at this position.
        if (hnc2 != null && i < hnc2.Count && hnc2[i].Attributes.Count > 0)
        {
            movie.Image = hnc2[i].Attributes[0].Value.Replace("-poster100", "").Trim();
        }

        movie.Name = descriptionDivs[0].InnerText.Replace("\n", "").RemoveSpace().Trim();
        movie.Director = descriptionDivs[2].InnerText.Replace("\n", "").RemoveSpace().Trim();
        movie.Story = descriptionDivs[3].InnerText.Replace("\n", "").RemoveSpace().Trim();
        movie.Actor = descriptionDivs[4].InnerText.Replace("\n", "").RemoveSpace().Trim();
        movie.DetailUrl = node.SelectSingleNode("div[@class='fm-movie-cover']/a").Attributes["href"].Value.Trim();
        movies.Add(movie);
    }

    return movies;
}
/// <summary>
/// Fills url, title and imageUrl from the first link, title span and image found below <paramref name="node"/>.
/// </summary>
/// <param name="node">Node describing one video entry.</param>
public YoutubeVideoEntry(HtmlNode node)
{
    var links = node.SelectNodes(".//a[@href]");
    if (links != null)
    {
        var href = links.FirstOrDefault().Attributes["href"].Value;
        var ampersandAt = href.IndexOf("&");

        // The url is only assigned when the href contains an ampersand after its first character;
        // everything from the ampersand on is dropped.
        if (ampersandAt > 0)
        {
            url = "http://www.youtube.com" + href.Remove(ampersandAt);
        }
    }

    var titleSpans = node.SelectNodes(".//span[contains(@class, 'video-title')]");
    if (titleSpans != null)
    {
        title = titleSpans.FirstOrDefault().InnerText;
    }

    if (!String.IsNullOrEmpty(title))
    {
        title = title.Trim();
    }

    var images = node.SelectNodes(".//img[@src]");
    if (images != null)
    {
        imageUrl = "http:" + images.FirstOrDefault().Attributes["src"].Value;
    }
}
/// <summary>
/// Returns the text of the first strong or b element inside an entry-content div
/// and removes that element from the document.
/// </summary>
/// <param name="node">Node the (document-wide) XPath is evaluated from.</param>
/// <returns>The element's inner text, or an empty string when no such element exists.</returns>
protected override string retrieveTitle(HtmlNode node)
{
    const string entryContent = "//div[contains(@class, 'entry-content')]";

    var heading = node.SelectNodes(entryContent + "//strong" + "|" + entryContent + "//b")?.First();
    string title = heading?.InnerText ?? "";
    heading?.Remove();
    return title;
}
/// <summary>
/// Evaluates every attribute of the current xpathSingle rule against <paramref name="node"/>
/// and records one (attribute id, value) pair per attribute.
/// </summary>
/// <param name="node">Node the attribute XPaths are evaluated from.</param>
/// <returns>
/// A new empty list when <paramref name="node"/> is null; otherwise <c>res</c> after one pair
/// has been appended for each rule attribute. A pair's value stays null when its XPath matches nothing.
/// </returns>
private List<KeyValuePair<string, object>> run(HtmlNode node)
{
    Factory.Instance.iInfo(string.Format("Running xpathSingle id : {0}", rule.id));
    if (node == null)
        return new List<KeyValuePair<string, object>>();
    // Get all attributes by type and save to List<KeyValuePair<string, object>>
    foreach (Db.xpathSingleAttributes attr in rule.attributes)
    {
        object val = null;
        // nodeCollection: the raw SelectNodes result (null when nothing matches).
        if (attr.getType == Db.xpathSingleAttributesGetType.nodeCollection)
            val = node.SelectNodes(attr.xpath);
        // count: number of matches as a string; left null when nothing matches.
        else if (attr.getType == Db.xpathSingleAttributesGetType.count)
        {
            HtmlNodeCollection c = node.SelectNodes(attr.xpath);
            if (c != null)
                val = c.Count.ToString();
        }
        // Every other type works on the first matching node only.
        else
        {
            string val2 = string.Empty;
            HtmlNode n = node.SelectSingleNode(attr.xpath);
            if (n != null)
            {
                if (attr.getType == Db.xpathSingleAttributesGetType.singleNode)
                    val = n;
                else
                {
                    if (attr.getType == Db.xpathSingleAttributesGetType.text)
                        val2 = n.InnerText.Trim();
                    if (attr.getType == Db.xpathSingleAttributesGetType.html)
                        val2 = n.InnerHtml.Trim();
                    if (attr.getType == Db.xpathSingleAttributesGetType.attribute)
                    {
                        // A missing attribute leaves val2 as the empty string.
                        if (n.Attributes[attr.attributeName] != null)
                            val2 = n.Attributes[attr.attributeName].Value;
                    }
                    val = postProcessResult(val2, attr);
                    // Log the extracted value for everything except html, node collections and single nodes.
                    if(attr.getType != Db.xpathSingleAttributesGetType.html && attr.getType != Db.xpathSingleAttributesGetType.nodeCollection && attr.getType != Db.xpathSingleAttributesGetType.singleNode)
                        Factory.Instance.iInfo(string.Format("{0} = {1}",attr.id,val));
                }
            }
        }
        res.Add(new KeyValuePair<string, object>(attr.id, val));
    }
    return res;
}
/// <summary>
/// Returns the dish nodes below <paramref name="dishTypeNode"/>, extended with any li nodes
/// found in the markup held by the recommended-dishes script element.
/// </summary>
/// <param name="dishTypeNode">Node the dish XPath and the script XPath are evaluated from.</param>
/// <returns>
/// An empty collection when no dish node matches; otherwise the matched collection,
/// with the script's li nodes appended when there are any.
/// </returns>
public HtmlNodeCollection GetDishInfoList(HtmlNode dishTypeNode)
{
    var site = new BaseCollectionSite(PageUrl);

    var dishes = dishTypeNode.SelectNodes(DishesPath());
    if (dishes == null || dishes.Count <= 0)
    {
        return new HtmlNodeCollection(null);
    }

    var script = dishTypeNode.SelectSingleNode(@"./../../../../..//div[@class='rec-dishes tab-item active']/div[@class='pic-list J_toggle']/ul/script");
    if (script == null || string.IsNullOrWhiteSpace(script.InnerText))
    {
        return dishes;
    }

    // The script body is handed to BaseHtmlNodeCollection, whose result is searched for li nodes.
    var parsedScript = site.BaseHtmlNodeCollection(script.InnerText);
    if (parsedScript == null)
    {
        return dishes;
    }

    var extraDishes = parsedScript.SelectNodes(".//li");
    if (extraDishes == null)
    {
        return dishes;
    }

    foreach (var extraDish in extraDishes)
    {
        dishes.Add(extraDish);
    }

    return dishes;
}
/// <summary>
/// Like SelectNodes, but never returns null.
/// </summary>
/// <param name="node">Node to query; must not be null.</param>
/// <param name="xpath">XPath expression to evaluate.</param>
/// <returns>The matched nodes, or a new empty collection owned by <paramref name="node"/>.</returns>
public static H.HtmlNodeCollection SelectNodesOrEmpty(this H.HtmlNode node, string xpath)
{
    Requires.NonNull(node, nameof(node));

    var matches = node.SelectNodes(xpath);
    if (matches != null)
    {
        return matches;
    }

    return new H.HtmlNodeCollection(node);
}
/// <summary>
/// Stores one poll value for every server of a region that matches a known category.
/// </summary>
/// <param name="region">Region node; its h2 child supplies the region title.</param>
private void ParseRegionElement(HtmlNode region)
{
    var titleNode = region.SelectSingleNode("h2");
    var servers = region.SelectNodes(".//div[@class=\"server\" or @class=\"server alt\"]");
    if (titleNode == null || servers == null)
    {
        // No title or no servers (SelectNodes returns null when nothing matches): nothing to record.
        return;
    }

    var regionTitle = titleNode.InnerText;
    foreach (var server in servers)
    {
        var nameNode = server.SelectSingleNode(".//div[@class=\"server-name\"]");
        if (nameNode == null)
        {
            continue;
        }

        var serverName = nameNode.InnerText.Trim();

        // The region is compared case-insensitively, the server category case-sensitively.
        var possibleCategoryMatch = Categories.FirstOrDefault(p => string.Compare(p.Region, regionTitle, true) == 0 && string.Compare(p.ServerCategory, serverName) == 0);
        if (possibleCategoryMatch == null)
        {
            continue;
        }

        var pollCategoryValue = new PollCategoryValue();
        pollCategoryValue.CategoryID = possibleCategoryMatch.PollCategoryID;
        pollCategoryValue.Status = PollStatusType.Unknown;
        pollCategoryValue.CreatedTime = DateTime.Now;

        var childDivs = server.SelectNodes("div");
        if (childDivs != null)
        {
            foreach (var div in childDivs)
            {
                // The status stays Unknown unless a child div carries a status icon.
                if (div.OuterHtml.Contains("status-icon"))
                {
                    pollCategoryValue.Status = div.OuterHtml.Contains("status-icon up") ? PollStatusType.Up : PollStatusType.Down;
                }
            }
        }

        DB.InsertPollCategoryValue(pollCategoryValue);
    }
}
/// <summary>
/// Builds one form element per option node found below <paramref name="htmlNode"/>.
/// </summary>
/// <param name="htmlNode">Node to search for option descendants.</param>
/// <returns>The option elements in document order; empty when there are none.</returns>
public List<FormElement> GetOptions(HtmlNode htmlNode)
{
    List<FormElement> options = new List<FormElement>();
    HtmlNodeCollection nodeTags = htmlNode.SelectNodes(@".//option");
    if (nodeTags == null)
    {
        return options;
    }

    foreach (HtmlNode node in nodeTags)
    {
        // The display text is read from the node that follows the option.
        // Fall back to the option itself when it is the last node of its parent.
        HtmlNode labelNode = node.NextSibling ?? node;

        FormElement el = new FormElement();
        el.Id = node.GetAttributeValue("id", "");
        el.Type = "option";
        el.Name = labelNode.InnerText;
        el.Value = node.GetAttributeValue("value", "");
        el.Checked = node.Attributes["selected"] != null;
        options.Add(el);
    }

    return options;
}
/// <summary>
/// Adds one sub-category to <paramref name="parentCat"/> per article below <paramref name="node"/>,
/// plus a next-page category when a "More shows" link is present.
/// </summary>
/// <param name="node">Node to search for articles and the "More shows" link.</param>
/// <param name="parentCat">Category that receives the sub-categories.</param>
/// <returns>The number of sub-categories on <paramref name="parentCat"/> afterwards.</returns>
private int AddSubcats(HtmlNode node, RssLink parentCat)
{
    // SelectNodes returns null when the page has no articles.
    var subs = node.SelectNodes(".//article");
    if (subs != null)
    {
        foreach (var sub in subs)
        {
            var titleLink = sub.SelectSingleNode(".//a[@title]");
            var hrefLink = sub.SelectSingleNode(".//a[@href]");
            if (titleLink == null || hrefLink == null)
            {
                // An article without a titled link or a target cannot become a category.
                continue;
            }

            RssLink subcat = new RssLink() { ParentCategory = parentCat };
            subcat.Name = HttpUtility.HtmlDecode(titleLink.Attributes["title"].Value.Trim());
            subcat.Url = FormatDecodeAbsolutifyUrl(parentCat.Url, hrefLink.Attributes["href"].Value, null, UrlDecoding.None);
            subcat.Thumb = getThumb(sub.SelectSingleNode(".//picture/img"));
            parentCat.SubCategories.Add(subcat);
        }
    }

    // The next page is surfaced as a NextPageCategory entry; nextPageAvailable itself is reset to false.
    var np = node.SelectSingleNode(".//a[@href and text()='More shows']");
    nextPageAvailable = false;
    if (np != null)
    {
        string url = CreateUrl(parentCat.Url, np.Attributes["href"].Value);
        var npCat = new NextPageCategory() { Url = url, ParentCategory = parentCat };
        parentCat.SubCategories.Add(npCat);
    }

    parentCat.SubCategoriesDiscovered = true;
    return parentCat.SubCategories.Count;
}
/// <summary>
/// Maps one table row onto a new <see cref="Nhl_Games_Rtss"/> record.
/// </summary>
/// <param name="row">Row whose direct td children are read (indexes 0 through 12).</param>
/// <param name="nhlSeasonType">Season type stored on the record unchanged.</param>
/// <returns>The populated record.</returns>
private static Nhl_Games_Rtss MapHtmlRowToModel(HtmlNode row, NhlSeasonType nhlSeasonType)
{
    HtmlNodeCollection cells = row.SelectNodes(@"./td");

    Nhl_Games_Rtss game = new Nhl_Games_Rtss();
    game.NhlSeasonType = nhlSeasonType;

    // Apostrophes in the date cell are replaced with slashes before parsing.
    string dateText = cells[0].InnerText.Replace("'", "/");
    game.Date = Convert.ToDateTime(dateText);
    game.Year = NhlModelHelper.GetSeason(game.Date).Item2;

    game.GameNumber = Convert.ToInt32(cells[1].InnerText);
    game.Visitor = cells[2].InnerText;
    game.Home = cells[3].InnerText;

    // Cells 4-12 each carry one report link.
    game.RosterLink = NhlGamesRtss.ParseLinkFromTd(cells[4]);
    game.GameLink = NhlGamesRtss.ParseLinkFromTd(cells[5]);
    game.EventsLink = NhlGamesRtss.ParseLinkFromTd(cells[6]);
    game.FaceOffsLink = NhlGamesRtss.ParseLinkFromTd(cells[7]);
    game.PlayByPlayLink = NhlGamesRtss.ParseLinkFromTd(cells[8]);
    game.ShotsLink = NhlGamesRtss.ParseLinkFromTd(cells[9]);
    game.HomeToiLink = NhlGamesRtss.ParseLinkFromTd(cells[10]);
    game.VistorToiLink = NhlGamesRtss.ParseLinkFromTd(cells[11]);
    game.ShootoutLink = NhlGamesRtss.ParseLinkFromTd(cells[12]);

    return game;
}
/// <summary>
/// Adds a new package to <paramref name="app"/> and fills its title and prices from <paramref name="packageNode"/>.
/// </summary>
/// <param name="app">App that receives the new package.</param>
/// <param name="packageNode">Node describing exactly one package.</param>
private static void AddPackage(SteamApp app, HtmlNode packageNode)
{
    var package = app.AddNewPackage();

    // All lookups are relative (".//"): a leading "//" searches the whole document and would
    // return the first package's data no matter which package node is passed in.
    var packageTitleNode = packageNode.SelectSingleNode($".//{PackageTitle}");
    if (packageTitleNode != null)
    {
        package.Title = packageTitleNode.InnerHtml.Replace("Buy ", "").Trim();
    }

    var priceNodes = packageNode.SelectNodes($".//div[@class='{PackagePriceXPath}']");
    if (priceNodes != null && priceNodes.Count > 0)
    {
        // A plain price block means the package is not discounted.
        package.CurrentPrice = ParseNodeWithCurrencyToDecimal(priceNodes[0]);
        package.OriginalPrice = package.CurrentPrice;
    }
    else
    {
        var originalPriceNode = packageNode.SelectSingleNode($".//div[@class='{PackageOriginalPriceXPath}']");
        package.OriginalPrice = ParseNodeWithCurrencyToDecimal(originalPriceNode);

        var discountPriceNode = packageNode.SelectSingleNode($".//div[@class='{PackageDiscountPriceXPath}']");
        package.CurrentPrice = ParseNodeWithCurrencyToDecimal(discountPriceNode);
    }
}
/// <summary>
/// Reads one flight per data row of the deals result table.
/// </summary>
/// <param name="documentNode">Node the (document-wide) table XPath is evaluated from.</param>
/// <returns>The parsed flights; empty when the table has no data rows.</returns>
public IList<Flight> ParseFlights(HtmlNode documentNode)
{
    var rows = documentNode.SelectNodes("//table[@class='resultTable dealsResults']/tbody//tr[position()>1]");
    var flights = new List<Flight>();
    if (rows == null)
    {
        return flights;
    }

    foreach (var row in rows)
    {
        if (!row.HasChildNodes)
        {
            continue;
        }

        // Cells are addressed by child-node index (1, 3, 5, 7, 9 and 13).
        var departureAirport = row.ChildNodes[1].InnerText;
        var destination = row.ChildNodes[3].InnerText;
        var departureDate = row.ChildNodes[5].SelectSingleNode("ul/li").InnerText;
        var returnDate = row.ChildNodes[5].SelectSingleNode("ul/li[position()>1]").InnerText;
        var departFlightTime = row.ChildNodes[7].SelectSingleNode("ul/li/ul/li").InnerText;
        var returnFlightTime = row.ChildNodes[7].SelectSingleNode("ul/li[position()>1]/ul/li").InnerText;
        var noOfNights = row.ChildNodes[9].InnerText;
        var departureAirportCode = row.ChildNodes[13].SelectSingleNode("fieldset/input[@id='depAP']").GetAttributeValue("value", "N/a");
        var arrivalAirportCode = row.ChildNodes[13].SelectSingleNode("fieldset/input[@id='retAP']").GetAttributeValue("value", "N/a");

        // The seat count is only read when the seats cell has more than two child nodes; otherwise "0".
        string seats;
        if (row.SelectSingleNode("td[@class='seatsLeft']").ChildNodes.Count > 2)
        {
            seats = row.SelectSingleNode("td[@class='seatsLeft']/div").InnerText;
        }
        else
        {
            seats = "0";
        }

        // NOTE(review): ArrivalDate is built from the departure date/time and DepartureDate from the
        // return date/time. Confirm this mapping is intended.
        flights.Add(new Flight
        {
            DepartureAirport = new Airport { Code = departureAirportCode, Name = departureAirport },
            ArrivalAirport = new Airport { Code = arrivalAirportCode, Name = destination },
            ArrivalDate = (departureDate + " " + departFlightTime + ":00").ToFormattedDateString(),
            SeatsLeft = seats.ToInt32(),
            DepartureDate = (returnDate + " " + returnFlightTime + ":00").ToFormattedDateString(),
            NoOfNights = noOfNights.ToInt32()
        });
    }

    return flights;
}
/// <summary>
/// Adds the entries found in the first table cell of <paramref name="node"/> to <paramref name="menu"/>.
/// </summary>
/// <param name="menu">Menu whose entry list is extended.</param>
/// <param name="node">Node that contains the table cells.</param>
public static void Classification(Menu menu, agi.HtmlNode node)
{
    agi.HtmlNodeCollection divide_td = node.SelectNodes(".//td");
    if (divide_td == null || divide_td.Count == 0)
    {
        return;
    }

    agi.HtmlNode firstCell = divide_td[0];
    agi.HtmlNodeCollection check_div = firstCell.SelectNodes(".//div");
    if (check_div == null)
    {
        return;
    }

    // SelectNodes returns null when the cell has no br elements, so count those as zero.
    agi.HtmlNodeCollection check_br = firstCell.SelectNodes(".//br");
    int count = check_br == null ? 0 : check_br.Count;

    if (count > 2)
    {
        // Turn every line break into a div boundary, then read the resulting divs.
        firstCell.InnerHtml = firstCell.InnerHtml.Replace("<br>", "</div><div>");
        agi.HtmlNodeCollection tmp = firstCell.SelectNodes(".//div");
        if (tmp == null)
        {
            return;
        }

        for (int i = 0; i < tmp.Count; i++)
        {
            menu.menu.Add(tmp[i].InnerText);
        }
    }
    else
    {
        // One entry per line break, but never read past the divs that actually exist.
        for (int i = 0; i < count && i < check_div.Count; i++)
        {
            menu.menu.Add(check_div[i].InnerText);
        }
    }
}
/// <summary>
/// Extracts the (numeric value, text) pairs of every select element with the given name.
/// </summary>
/// <param name="html">Node the (document-wide) XPath is evaluated from.</param>
/// <param name="nameSelect">Value of the select element's name attribute.</param>
/// <returns>One item per option that has a non-empty value attribute; empty when no select matches.</returns>
/// <exception cref="FormatException">An option value is not a valid integer.</exception>
private static List<Item> extractSelect(HtmlNode html, string nameSelect)
{
    List<Item> country = new List<Item>();
    var criterioBusqueda = "//select[@name='" + nameSelect + "']";

    // SelectNodes returns null when no select with that name exists.
    var selects = html.SelectNodes(criterioBusqueda);
    if (selects == null)
    {
        return country;
    }

    foreach (HtmlNode item in selects)
    {
        string[] valueText = item.InnerText.Split('\n');
        List<HtmlNode> values = item.Elements("option").ToList();
        for (int i = 0; i < values.Count; i++)
        {
            var aux = values[i].GetAttributeValue("value", "");
            if (aux.Equals(""))
            {
                continue;
            }

            int parse = int.Parse(aux);

            // In the sample page the select's text starts with an empty line, so option i maps to
            // line i + 1. That may not hold for every page, hence the bounds check.
            string value = i + 1 < valueText.Length ? valueText[i + 1] : string.Empty;
            country.Add(new Item(parse, value));
        }
    }

    return country;
}
/// <summary>
/// Joins the text of the child nodes of every paragraph in the article body with single spaces.
/// </summary>
/// <param name="docNode">Node the (document-wide) XPath is evaluated from.</param>
/// <returns>The joined text, or an empty string when the article body has no paragraphs.</returns>
private static string GetBody(HtmlNode docNode)
{
    var paragraphs = docNode.SelectNodes("//div[@class='delfi-article-body']//p");
    if (paragraphs == null)
    {
        // SelectNodes returns null when nothing matches.
        return String.Empty;
    }

    return String.Join(" ", paragraphs.Elements().Select(e => e.InnerText));
}
/// <summary>
/// Searches below <paramref name="baseNode"/> using the search box text, either as an XPath
/// expression or as a tag name, and lists the matches on the results tab.
/// </summary>
/// <param name="baseNode">Node the search starts from.</param>
private void SearchFromNode(HtmlNode baseNode)
{
    var matches = Enumerable.Empty<HtmlNode>();

    if (!_html.DocumentNode.HasChildNodes)
    {
        ParseHtml();
    }

    if (chkXPath.IsChecked == true)
    {
        matches = baseNode.SelectNodes(txtSearchTag.Text);
    }
    else
    {
        matches = baseNode.Descendants(txtSearchTag.Text);
    }

    // An XPath search without matches yields null; the previous results are then left in place.
    if (matches == null)
    {
        return;
    }

    listResults.Items.Clear();
    foreach (var match in matches)
    {
        var tree = new NodeTreeView();
        tree.BaseNode = match;

        var listItem = new ListBoxItem();
        var panel = new StackPanel();

        var header = new Label();
        header.Content = string.Format("id:{0} name:{1} children{2}", match.Id, match.Name, match.ChildNodes.Count);
        header.FontWeight = FontWeights.Bold;

        panel.Children.Add(header);
        panel.Children.Add(tree);
        listItem.Content = panel;
        listResults.Items.Add(listItem);
    }

    tabControl1.SelectedItem = tabSearchResults;
}
/// <summary>
/// Replaces every macro node of the skeleton with a text node holding a new property's
/// template reference, and registers one macro definition per replaced node on the page type.
/// </summary>
/// <param name="skeleton">Skeleton node the macro XPaths are evaluated from.</param>
/// <param name="pageType">Page type supplying the macro XPaths and receiving the definitions.</param>
private void IdentifyMacros(HtmlNode skeleton, PageType pageType)
{
    var menuProperties = new List<PropertyDTO>();
    var doc = skeleton.OwnerDocument;
    var propertyFactory = factory.PropertyFactory;

    // SelectNodes returns null when an XPath matches nothing, and SelectMany throws on a null
    // inner sequence, so substitute an empty sequence for such XPaths.
    var macroNodes = pageType.MacroXpaths
        .SelectMany(xpath => (IEnumerable<HtmlNode>)skeleton.SelectNodes(xpath) ?? Enumerable.Empty<HtmlNode>())
        .Where(n => n != null && n.ParentNode != null);

    foreach (var menuNode in macroNodes)
    {
        var property = propertyFactory.GetNew();
        menuProperties.Add(property);

        var propertyNode = doc.CreateTextNode(property.TemplateReference);
        menuNode.ParentNode.ReplaceChild(propertyNode, menuNode);
    }

    var macros = menuProperties.Select(p => new Definition
    {
        Number = p.Number,
        Name = p.Name,
        TemplateReference = p.TemplateReference,
        IsMacro = true
    });
    pageType.Definitions.AddRange(macros);
}
/// <summary>
/// Builds the basic product data from a product node's attributes and nested elements.
/// </summary>
/// <param name="productNode">Node carrying the DdPiD, pbcatid, qty, iq and inb attributes.</param>
/// <returns>The populated product data.</returns>
private ProductBasicData BuildProductBasicData(HtmlNode productNode)
{
    log.DebugFormat("[BuildProductBasicData] OuterHtml= {0}.", productNode.OuterHtml);

    var product = new ProductBasicData();

    // Barcode and product id are both read from the DdPiD attribute.
    // (Per the original author: it should become a real barcode; the product id is its last digits.)
    string productCode = productNode.Attributes.First(c => c.OriginalName == "DdPiD").Value;
    product.Barcode = productCode;
    product.ProductId = productCode;

    product.pbcatid = productNode.Attributes.First(c => c.OriginalName == "pbcatid").Value;
    product.qty = productNode.Attributes.First(c => c.OriginalName == "qty").Value;
    product.iq = productNode.Attributes.First(c => c.OriginalName == "iq").Value;
    product.inb = productNode.Attributes.First(c => c.OriginalName == "inb").Value;

    var imageNode = productNode.SelectNodes("child::*/child::div/child::img").First();
    product.ImageSource = imageNode.Attributes.First(c => c.Name == "src").Value;

    var priceNode = productNode.SelectSingleNode("child::*/child::div/child::div/child::div/child::span");
    product.EffectivePrice = priceNode.InnerText;

    // Description and product name are both read from the same link text.
    var linkNode = productNode.SelectSingleNode("child::*/child::div/child::div/child::a");
    string linkText = HttpUtility.HtmlDecode(linkNode.InnerText);
    product.Description = linkText;
    product.ProductName = linkText;

    log.DebugFormat("[BuildProductBasicData] fetched product={0}.", product.ToString());
    return product;
}
/// <summary>
/// Maps one table row onto a new <see cref="Nhl_Players_Rtss_Skater"/> record.
/// </summary>
/// <param name="row">Row whose direct td children are read (indexes 1 through 17).</param>
/// <param name="nhlSeasonType">Season type stored on the record unchanged.</param>
/// <param name="year">Year stored on the record unchanged.</param>
/// <returns>The populated record; Number is always 0.</returns>
private static Nhl_Players_Rtss_Skater MapHtmlRowToModel(HtmlNode row, NhlSeasonType nhlSeasonType, int year)
{
    HtmlNodeCollection cells = row.SelectNodes(@"./td");

    Nhl_Players_Rtss_Skater skater = new Nhl_Players_Rtss_Skater();
    skater.NhlSeasonType = nhlSeasonType;
    skater.Year = year;

    // Cell 0 is not read; the number is fixed at 0.
    skater.Number = 0;
    skater.Name = cells[1].InnerText;
    skater.Team = cells[2].InnerText;
    skater.Position = cells[3].InnerText;

    skater.GamesPlayed = ConvertStringToInt(cells[4].InnerText);
    skater.Hits = ConvertStringToInt(cells[5].InnerText);
    skater.BlockedShots = ConvertStringToInt(cells[6].InnerText);
    skater.MissedShots = ConvertStringToInt(cells[7].InnerText);
    skater.Giveaways = ConvertStringToInt(cells[8].InnerText);
    skater.Takeaways = ConvertStringToInt(cells[9].InnerText);

    skater.FaceoffsWon = ConvertStringToInt(cells[10].InnerText);
    skater.FaceoffsLost = ConvertStringToInt(cells[11].InnerText);
    skater.FaceoffsTaken = ConvertStringToInt(cells[12].InnerText);
    skater.FaceoffWinPercentage = Convert.ToDouble(cells[13].InnerText);
    skater.PercentageOfTeamFaceoffsTaken = Convert.ToDouble(cells[14].InnerText);

    skater.Shots = ConvertStringToInt(cells[15].InnerText);
    skater.Goals = ConvertStringToInt(cells[16].InnerText);
    skater.ShootingPercentage = Convert.ToDouble(cells[17].InnerText);

    return skater;
}
/// <summary>
/// Resolves the sub-achievements listed below <paramref name="subAchievementNode"/> and subtracts
/// their points from <paramref name="achievement"/>.
/// </summary>
/// <param name="achievement">Parent achievement whose Points are reduced per sub-achievement.</param>
/// <param name="subAchievementNode">Node whose direct li children describe the sub-achievements.</param>
/// <returns>The resolved sub-achievements in document order; empty when there are none.</returns>
private IList<Achievement> ParseSubAchievements(Achievement achievement, HtmlNode subAchievementNode)
{
    IList<Achievement> result = new List<Achievement>();

    HtmlNodeCollection items = subAchievementNode.SelectNodes("./li");
    if (items == null)
    {
        return result;
    }

    foreach (HtmlNode item in items)
    {
        // The achievement id is taken from the tooltip handler; items without one are skipped.
        var tooltip = item.Attributes["onmousemove"];
        if (tooltip == null)
        {
            continue;
        }

        Match match = parseTooltip.Match(tooltip.Value);
        if (!match.Success)
        {
            continue;
        }

        // An id that does not parse leaves blizzardId at 0.
        int blizzardId = 0;
        int.TryParse(match.Groups["achievementid"].Value, out blizzardId);

        Achievement sub = _service.Find(blizzardId);
        if (sub == null)
        {
            // Unknown achievement: build it from the list item and save it.
            sub = new Achievement() { BlizzardID = blizzardId };
            sub.Name = GetValueAsString(item, ".//h3");
            sub.Description = GetValueAsString(item, ".//div[@class='color-tooltip-yellow']");
            sub.Points = GetValueAsInt32(item, ".//span[@class='points border-3']");
            _service.Save(sub);
        }

        achievement.Points -= sub.Points;
        result.Add(sub);
    }

    return result;
}
/// <summary>
/// Runs the collection rule: selects the nodes matched by the rule's XPath and evaluates the
/// nested xpathSingle rule against each of them.
/// </summary>
/// <param name="node">Node the rule's XPath is evaluated from; null leaves <c>res</c> unchanged.</param>
/// <returns><c>res</c>, with one result list appended per matched node when a nested rule exists.</returns>
public List<List<KeyValuePair<string, object>>> run(HtmlNode node)
{
    Factory.Instance.iInfo(string.Format("Running xpathCollection id : {0}", rule.id));

    // This check used to sit after node.SelectNodes, where it could never prevent the null dereference.
    if (node == null)
    {
        return res;
    }

    HtmlNodeCollection nodes = new HtmlNodeCollection(node);
    HtmlNodeCollection n2 = node.SelectNodes(rule.xpath);
    if (n2 != null)
    {
        foreach (HtmlNode n in n2)
        {
            nodes.Add(n);
        }
    }

    //run
    foreach (HtmlNode n in nodes)
    {
        List<KeyValuePair<string, object>> last_val = null;
        if (rule.xpathSingle != null)
        {
            XPathSingle xs = new XPathSingle(rule.xpathSingle, last_val);
            last_val = (List<KeyValuePair<string, object>>)xs.Run(n);
            res.Add(last_val);
        }
    }

    return res;
}
/// <summary>
/// Maps one table row onto a new <see cref="Nhl_Players_Bio_Goalie"/> record.
/// </summary>
/// <param name="row">Row whose direct td children are read (indexes 0 through 20).</param>
/// <param name="nhlSeasonType">Season type stored on the record unchanged.</param>
/// <param name="year">Year stored on the record unchanged.</param>
/// <returns>The populated record; Position is always "G".</returns>
private static Nhl_Players_Bio_Goalie MapHtmlRowToModel(HtmlNode row, NhlSeasonType nhlSeasonType, int year)
{
    HtmlNodeCollection cells = row.SelectNodes(@"./td");

    Nhl_Players_Bio_Goalie goalie = new Nhl_Players_Bio_Goalie();
    goalie.NhlSeasonType = nhlSeasonType;
    goalie.Year = year;

    goalie.Number = ConvertStringToInt(cells[0].InnerText);
    goalie.Name = cells[1].InnerText;
    goalie.Team = cells[2].InnerText;
    goalie.Position = "G";

    // Apostrophes in the birth-date cell are replaced with slashes before parsing.
    string birthDateText = cells[3].InnerText.Replace("'", "/");
    goalie.DateOfBirth = Convert.ToDateTime(birthDateText);
    goalie.BirthCity = cells[4].InnerText;
    goalie.StateOrProvince = cells[5].InnerText;
    goalie.BirthCountry = cells[6].InnerText;

    goalie.HeightInches = ConvertStringToInt(cells[7].InnerText);
    goalie.WeightLbs = ConvertStringToInt(cells[8].InnerText);
    goalie.Catches = cells[9].InnerText;
    goalie.Rookie = cells[10].InnerText;

    goalie.DraftYear = ConvertStringToInt(cells[11].InnerText);
    goalie.DraftRound = ConvertStringToInt(cells[12].InnerText);
    goalie.DraftOverall = ConvertStringToInt(cells[13].InnerText);

    goalie.GamesPlayed = ConvertStringToInt(cells[14].InnerText);
    goalie.Wins = ConvertStringToInt(cells[15].InnerText);
    goalie.Losses = ConvertStringToInt(cells[16].InnerText);
    goalie.OTSOLosses = ConvertStringToInt(cells[17].InnerText);
    goalie.GAA = Convert.ToDouble(cells[18].InnerText);
    goalie.SavePercentage = Convert.ToDouble(cells[19].InnerText);
    goalie.Shutouts = ConvertStringToInt(cells[20].InnerText);

    return goalie;
}
/// <summary>
/// Reads a translation table: the first row that has cells names the languages (one per column),
/// and every later row holds a key in column 0 followed by one value per language column.
/// </summary>
/// <param name="loadLanguages">Receives column index to language name.</param>
/// <param name="loadEntries">Receives column index to a Hashtable of key to text.</param>
/// <param name="node">Node that contains the table rows.</param>
void ParseHTMLTable(Hashtable loadLanguages, Hashtable loadEntries, HtmlAgilityPack.HtmlNode node)
{
    // SelectNodes returns null when the node contains no rows.
    HtmlAgilityPack.HtmlNodeCollection trNodes = node.SelectNodes(".//tr");
    if (trNodes == null)
    {
        return;
    }

    int row = -1;
    foreach (HtmlAgilityPack.HtmlNode trNode in trNodes)
    {
        // Evaluate the cell XPath once per row; rows without td cells are not counted.
        HtmlAgilityPack.HtmlNodeCollection tdNodes = trNode.SelectNodes(".//td");
        if (tdNodes == null)
        {
            continue;
        }

        row++;
        int i = -1;
        string key = "";
        foreach (HtmlAgilityPack.HtmlNode tdNode in tdNodes)
        {
            i++;
            if (row == 0)
            {
                //Language header
                if (i == 0)
                {
                    continue; //Ignore this top-left empty cell
                }

                loadLanguages[i] = tdNode.InnerText;
                if (!loadEntries.ContainsKey(i))
                {
                    loadEntries[i] = new Hashtable();
                }
            }
            else
            {
                //Data rows
                if (i == 0)
                {
                    key = tdNode.InnerText;
                    continue;
                }

                if (key == "")
                {
                    continue; //Skip entries with empty keys (the other values can be used as labels)
                }

                Hashtable hTable = (Hashtable)loadEntries[i];
                if (hTable == null)
                {
                    // Data column without a matching language column in the header row: nothing to fill.
                    continue;
                }

                string content = tdNode.InnerText;
                if (hTable.ContainsKey(key))
                {
                    Debug.LogError("ERROR: Double key [" + key + "]");
                    unresolvedErrors++;
                }

                hTable[key] = content;
            }
        }
    }
}
/// <summary>
/// Derives the item price from the price node's text and the title attribute of its first span child.
/// </summary>
/// <param name="itemPriceNode">Node holding the price text and at least one span child with a title attribute.</param>
/// <returns>The result of parsePrice applied to the output of parsePriceRecursion.</returns>
private static string getItemPrice(HtmlNode itemPriceNode)
{
    HtmlNodeCollection priceSpans = itemPriceNode.SelectNodes("span");

    // NOTE(review): as displayed, both Replace arguments look like a plain space; the first may
    // originally have been a non-breaking space. Verify against the repository copy.
    string rawPrice = itemPriceNode.InnerText.Replace(" ", " ").TrimStart();

    string spanTitle = priceSpans[0].Attributes["title"].Value;
    return parsePrice(parsePriceRecursion(spanTitle, rawPrice));
}
/// <summary>
/// Returns the inner HTML of the news-story article after the Taboola block and scripts have been removed.
/// </summary>
/// <param name="node">Node the (document-wide) XPaths are evaluated from.</param>
/// <returns>The article's inner HTML, or an empty string when the page has no such article.</returns>
public override string ReadContent(HtmlNode node)
{
    // SelectNodes returns null when the page has no Taboola block; there is nothing to remove then.
    var adNodes = node.SelectNodes("//*[@id='taboola-bottom-main-column']");
    if (adNodes != null)
    {
        RemoveTags(adNodes);
    }

    var content = node.SelectSingleNode("//*[@id='news-story']/article");
    if (content == null)
    {
        return string.Empty;
    }

    RemoveScripts(content);
    return content.InnerHtml;
}
/// <summary>
/// Lets the user pick an HTML file, then prints the parse error codes and the text of every
/// cell of the first table's direct rows to the console.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void buttonParseHtml_Click(object sender, EventArgs e)
{
    string fileName;
    using (OpenFileDialog openFileDialog = new OpenFileDialog())
    {
        openFileDialog.Filter = "HTML File (*.html;)|*.html";
        openFileDialog.Multiselect = false;
        if (openFileDialog.ShowDialog() != DialogResult.OK || String.IsNullOrEmpty(openFileDialog.FileName))
        {
            return;
        }

        fileName = openFileDialog.FileName;
    }

    string strHtml;
    using (StreamReader reader = new StreamReader(fileName, Encoding.UTF8))
    {
        strHtml = reader.ReadToEnd();
    }

    // Load the HTML.
    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(strHtml);
    foreach (var err in doc.ParseErrors)
    {
        Console.WriteLine(err.Code);
    }

    // Fetch the first table, starting from the document root.
    HtmlAgilityPack.HtmlNode rootNode = doc.DocumentNode;
    HtmlAgilityPack.HtmlNode node = rootNode.SelectSingleNode(@"//table");
    if (node == null)
    {
        return;
    }

    // Strip scripts and styles so only the bare table remains.
    foreach (var script in node.Descendants("script").ToArray())
    {
        script.Remove();
    }

    foreach (var style in node.Descendants("style").ToArray())
    {
        style.Remove();
    }

    // Only rows that are direct children of the table are visited.
    var trNodes = node.SelectNodes("tr");
    if (trNodes == null)
    {
        return;
    }

    // Walk the rows.
    foreach (var trnod in trNodes)
    {
        var tdNodes = trnod.SelectNodes("td");
        if (tdNodes == null)
        {
            // Row without td cells (for example a header row made of th cells).
            continue;
        }

        // Walk the columns.
        for (int i = 0; i < tdNodes.Count; i++)
        {
            Console.WriteLine(tdNodes[i].InnerText);
        }
    }
}
/// <summary>
/// Parses a remark node into a <see cref="Remark"/>: discovery, reward item, related quests
/// and accepting cities.
/// </summary>
/// <param name="remarkNode">Node holding the remark markup.</param>
/// <returns>The populated remark; parts without matching markup keep their defaults.</returns>
public static Remark ParseRemark(HtmlNode remarkNode)
{
    var remark = new Remark();

    // Discovery
    var discoveryNode = remarkNode.SelectSingleNode("a[@title!='']");
    if (discoveryNode != null)
    {
        // The level and type nodes sit two and four siblings before the discovery link.
        var levelNode = discoveryNode.PreviousSibling.PreviousSibling;
        var typeNode = levelNode.PreviousSibling.PreviousSibling;

        var typeName = typeRegex.Match(typeNode.Attributes["src"].Value).Groups["type"].Value;
        remark.DiscoveryType = Enum.Parse(typeof(DisType), typeName).ToString();
        remark.DiscoveryLevel = Int32.Parse(levelNode.InnerText.Substring(0, 1));
        remark.DiscoveryExp = Int32.Parse(discoveryNode.Attributes["title"].Value.Remove(0, 5));
        remark.Discovery = discoveryNode.InnerText;
    }

    // Reward item
    var awardNode = remarkNode.SelectSingleNode("span[@style='color:#804000;']");
    if (awardNode != null)
    {
        remark.AwardItem = awardNode.InnerText;
    }

    // Related quests
    var relativeNodes = remarkNode.SelectNodes("descendant::a[@style='color:#C000C0;' or @style='color:DarkBlue;']");
    if (relativeNodes != null)
    {
        foreach (HtmlNode relativeNode in relativeNodes)
        {
            string linkText = relativeNode.InnerText;

            IList<int> questList;
            IList<string> foundNameList = null;
            if (linkText.StartsWith("前:"))
            {
                foundNameList = remark.PreFoundName;
                questList = remark.PreQuestID;
            }
            else
            {
                questList = remark.FollowQuestID;
            }

            var match = questRegex.Match(relativeNode.Attributes["href"].Value);
            if (linkText.StartsWith("前:港口-"))
            {
                // Links with this prefix contribute a name (prefix stripped) instead of a quest id.
                foundNameList.Add(linkText.Replace("前:港口-", ""));
            }
            else
            {
                questList.Add(Int32.Parse(match.Groups["id"].Value));
            }
        }
    }

    // Accepting cities
    var cityNodes = remarkNode.SelectNodes("descendant::a[@class='MisCity']");
    if (cityNodes != null)
    {
        foreach (var cityNode in cityNodes)
        {
            string cityName = cityNode.InnerText;

            // These four link texts are not added to the city list.
            if (cityName == "南美开拓港" || cityName == "东南亚开拓港" || cityName == "掠夺地图" || cityName == "沉船资讯")
            {
                continue;
            }

            remark.FromCityList.Add(cityName);
        }
    }

    return remark;
}
/// <summary>
/// Returns the cleaned inner HTML of the first div whose class attribute contains "body".
/// </summary>
/// <param name="node">Node the (document-wide) XPath is evaluated from.</param>
/// <returns>The cleaned HTML, or an empty string when no such div exists.</returns>
public override string ReadContent(HtmlNode node)
{
    // SelectNodes returns null when the document has no div at all.
    var divs = node.SelectNodes("//div");
    if (divs == null)
    {
        return string.Empty;
    }

    var content = divs.FirstOrDefault(d => d.Attributes.Contains("class") && d.Attributes["class"].Value.Contains("body"));
    if (content == null)
    {
        return string.Empty;
    }

    RemoveScripts(content);
    return CleanHtml(content.InnerHtml);
}
/// <summary>
/// Concatenates the text of every direct tr child of <paramref name="table"/>, each followed by ";".
/// </summary>
/// <param name="table">Table node whose direct rows are read.</param>
/// <returns>The joined row texts, or an empty string when the table has no direct rows.</returns>
public static string getTable(HtmlNode table)
{
    // SelectNodes returns null when nothing matches.
    var rows = table.SelectNodes("tr");
    if (rows == null)
    {
        return "";
    }

    // A builder avoids re-copying the accumulated text for every row.
    var data = new System.Text.StringBuilder();
    foreach (HtmlNode row in rows)
    {
        data.Append(row.InnerText).Append(';');
    }

    return data.ToString();
}
/// <summary>
/// Evaluates this finder's XPath below <paramref name="xmlElement"/>.
/// </summary>
/// <param name="xmlElement">Node the XPath is evaluated from.</param>
/// <returns>The matched nodes, or a new empty list when nothing matches.</returns>
public IEnumerable<HtmlNode> FindWithin(HtmlNode xmlElement)
{
    return (IEnumerable<HtmlNode>)xmlElement.SelectNodes(xpath) ?? new List<HtmlNode>();
}
/// <summary>
/// Reads the order depth from a result table: every row after the first maps the number in its
/// first child node to the number in its second child node.
/// </summary>
/// <param name="table">Table node whose direct rows are read.</param>
/// <returns>The key/value pairs; empty when the table has no rows beyond the header.</returns>
Dictionary<decimal, decimal> ParseOrderDepthFromResult(HtmlNode table)
{
    // Skip the header row. SelectNodes returns null when no data rows exist.
    var rows = table.SelectNodes("tr[position() > 1]");
    if (rows == null)
    {
        return new Dictionary<decimal, decimal>();
    }

    return rows.ToDictionary(
        tr => Decimal.Parse(tr.ChildNodes[0].InnerText),
        tr => Decimal.Parse(tr.ChildNodes[1].InnerText)
    );
}
/// <summary>
/// Selects nodes by XPath, either throwing or returning an empty collection when nothing matches.
/// </summary>
/// <param name="parent">Node the XPath is evaluated from.</param>
/// <param name="xPath">XPath expression to evaluate.</param>
/// <param name="doThrow">When true, a query without matches throws instead of returning an empty collection.</param>
/// <returns>The matched nodes, or a new empty collection owned by <paramref name="parent"/>.</returns>
/// <exception cref="NewsParserException">Nothing matched and <paramref name="doThrow"/> is true.</exception>
public static HAP.HtmlNodeCollection SelectCollection(this HAP.HtmlNode parent, string xPath, bool doThrow)
{
    var matches = parent.SelectNodes(xPath);
    if (matches != null)
    {
        return matches;
    }

    if (doThrow)
    {
        throw new NewsParserException(new { xPath });
    }

    return new HAP.HtmlNodeCollection(parent);
}
/// <summary>
/// Translates a command row by passing the text of its first three td children
/// (command, target, value) to the three-argument Translate overload.
/// </summary>
/// <param name="commandNode">Row node with at least three direct td children.</param>
/// <returns>The result of the three-argument Translate overload.</returns>
public string Translate (HtmlNode commandNode)
{
    HtmlNodeCollection cells = commandNode.SelectNodes ("td");
    return Translate (cells[0].InnerText, cells[1].InnerText, cells[2].InnerText);
}
/// <summary>
/// Creates a form element and collects its input and select descendants, each numbered by
/// its position among the matches of its own kind.
/// </summary>
/// <param name="Node">Form node to wrap.</param>
/// <param name="NodeIndex">Index passed through to the base constructor.</param>
public FormElement(HtmlAgilityPack.HtmlNode Node, int NodeIndex) : base(Node, NodeIndex)
{
    HtmlNodeCollection inputs = Node.SelectNodes(".//input");
    if (inputs != null)
    {
        int inputIndex = 0;
        foreach (var input in inputs)
        {
            InputElements.Add(new InputElement(input, inputIndex));
            inputIndex++;
        }
    }

    HtmlNodeCollection selects = Node.SelectNodes(".//select");
    if (selects != null)
    {
        int selectIndex = 0;
        foreach (var select in selects)
        {
            SelectElements.Add(new Element(select, selectIndex));
            selectIndex++;
        }
    }
}
/// <summary>
/// Downloads the Naver front page and adds the text of every keyword span in the ah_l list to the result list.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void BwLoad_DoWork(object sender, DoWorkEventArgs e)
{
    HtmlAgilityPack.HtmlWeb web = new HtmlWeb();
    HtmlAgilityPack.HtmlDocument htmlDoc = web.Load("https://www.naver.com");

    HtmlAgilityPack.HtmlNode bodyNode = htmlDoc.DocumentNode.SelectSingleNode("//ul[@class='ah_l']");
    if (bodyNode == null)
    {
        // The list is not on the page; nothing to collect.
        return;
    }

    // SelectNodes returns null when the list contains no keyword spans.
    HtmlAgilityPack.HtmlNodeCollection keywordNodes = bodyNode.SelectNodes(".//span[@class='ah_k']");
    if (keywordNodes == null)
    {
        return;
    }

    foreach (HtmlNode n in keywordNodes)
    {
        this.result.Add(n.InnerText);
    }
}
/// <summary>
/// Returns the text of the h4 element whose class and id are both "video_top_more",
/// with carriage returns, line feeds and tabs removed.
/// </summary>
/// <param name="documentNode">Node the (document-wide) XPath is evaluated from.</param>
/// <returns>The description, or an empty string when no such element exists.</returns>
/// <exception cref="InvalidOperationException">More than one element matches.</exception>
private static string GetDescription(HtmlAgilityPack.HtmlNode documentNode)
{
    // SelectNodes returns null when the document has no h4 at all.
    var headings = documentNode.SelectNodes("//h4");
    if (headings == null)
    {
        return string.Empty;
    }

    var node = headings.SingleOrDefault(
        s => s.Attributes["class"] != null && s.Attributes["class"].Value.Equals("video_top_more") &&
             s.Attributes["id"] != null && s.Attributes["id"].Value.Equals("video_top_more"));
    if (node == null)
    {
        return string.Empty;
    }

    return node.InnerText.Replace("\r", "").Replace("\n", "").Replace("\t", "");
}
/// <summary>
/// Reads the canonical URL from the single <c>meta</c> element that carries
/// both <c>property="og:url"</c> and <c>itemprop="url"</c>.
/// </summary>
/// <param name="documentNode">Node the <c>//meta</c> query is evaluated on.</param>
/// <returns>
/// The element's <c>content</c> attribute value, or an empty string when the
/// element or its <c>content</c> attribute is absent.
/// </returns>
/// <exception cref="InvalidOperationException">More than one meta element matches.</exception>
private static string GetLinkOrigin(HtmlAgilityPack.HtmlNode documentNode)
{
    // SelectNodes yields null when the document has no meta elements.
    var metaNodes = documentNode.SelectNodes("//meta");
    if (metaNodes == null)
    {
        return string.Empty;
    }

    var node = metaNodes.SingleOrDefault(
        a => a.Attributes["property"] != null && a.Attributes["property"].Value.Equals("og:url") &&
             a.Attributes["itemprop"] != null && a.Attributes["itemprop"].Value.Equals("url"));
    if (node == null)
    {
        return string.Empty;
    }

    // The matching element is not guaranteed to carry a content attribute.
    var content = node.Attributes["content"];
    return content != null ? content.Value : string.Empty;
}
/// <summary>
/// Builds a <c>VideoInfo</c> for every <c>article</c> below
/// <paramref name="node"/> and records whether a "More" link (next page) is present.
/// </summary>
/// <param name="node">Container node that holds the articles and the paging link.</param>
/// <param name="parentUrl">URL used to resolve relative links.</param>
/// <returns>The videos found; empty when the node contains no articles.</returns>
/// <remarks>
/// Side effects: sets <c>nextPageAvailable</c> and, when a paging link exists,
/// <c>nextPageUrl</c>.
/// </remarks>
private List<VideoInfo> GetVids(HtmlNode node, string parentUrl)
{
    List<VideoInfo> videos = new List<VideoInfo>();

    // SelectNodes yields null when there is no article; paging detection
    // below must still run in that case.
    var vids = node.SelectNodes(".//article");
    if (vids != null)
    {
        foreach (var vid in vids)
        {
            VideoInfo video = new VideoInfo();

            // Two article layouts exist: one with an h2 heading (teaser) and one without.
            var heading = vid.SelectSingleNode(".//h2[contains(@class,'h3')]");
            if (heading == null)
            {
                video.Title = HttpUtility.HtmlDecode(vid.SelectSingleNode(".//a[@title]").Attributes["title"].Value.Trim());
                video.VideoUrl = FormatDecodeAbsolutifyUrl(parentUrl, vid.SelectSingleNode(".//a[@href]").Attributes["href"].Value, null, UrlDecoding.None);
            }
            else
            {
                video.Title = heading.InnerText.Trim();
                video.VideoUrl = FormatDecodeAbsolutifyUrl(parentUrl, vid.SelectSingleNode(".//a[@class='teaser__link' and @href]").Attributes["href"].Value, null, UrlDecoding.None);

                // Prefer the description paragraph; fall back to the subtitle.
                // Leave the description unset when neither is present.
                var descriptionNode = vid.SelectSingleNode(".//p[contains(@class,'teaser__description')]");
                if (descriptionNode == null)
                {
                    descriptionNode = vid.SelectSingleNode(".//h3[contains(@class,'teaser__subtitle')]");
                }
                if (descriptionNode != null)
                {
                    video.Description = descriptionNode.InnerText.Trim();
                }
            }

            var moNode = vid.SelectSingleNode(".//span[@data-month]");
            var daNode = vid.SelectSingleNode(".//span[@data-date]");
            if (moNode != null && daNode != null)
            {
                video.Airdate = moNode.InnerText.Trim() + ' ' + daNode.InnerText.Trim();
            }

            video.Thumb = getThumb(vid.SelectSingleNode(".//picture/img"));
            videos.Add(video);
        }
    }

    var np = node.SelectSingleNode(".//a[@href and contains(text(),'More ')]");
    nextPageAvailable = np != null;
    if (np != null)
    {
        nextPageUrl = CreateUrl(parentUrl, np.Attributes["href"].Value);
    }

    return videos;
}
/// <summary>
/// Measures the length of the text collected from the siblings of the current
/// node and of each of its ancestors, walking upwards until a boundary node is reached.
/// </summary>
/// <param name="settings">
/// Optional settings; a string-typed <c>_startingXPath</c> entry replaces the
/// boundary with the first node it selects beneath the grandparent.
/// </param>
/// <param name="nodeNavigator">Navigator whose current node is the starting point.</param>
/// <param name="logicalParents">Logical ancestor chain; at least two entries are needed for any text to be collected.</param>
/// <returns>
/// The boxed length of the trimmed, space-joined sibling text; 0 when
/// <c>_startingXPath</c> selects nothing.
/// </returns>
public object Transform(Dictionary<string, object> settings, HtmlNodeNavigator nodeNavigator, List<HtmlAgilityPack.HtmlNode> logicalParents)
{
    var collected = new StringBuilder();
    var reachedBoundary = false;
    var cursor = nodeNavigator?.CurrentNode;

    if (logicalParents != null && logicalParents.Count >= 2)
    {
        // The immediate parent is the list itself; the boundary is the
        // parent of that list, i.e. the grandparent.
        HtmlAgilityPack.HtmlNode boundary = logicalParents[logicalParents.Count - 2];

        if (settings != null
            && settings.ContainsKey("_startingXPath")
            && ((JValue)settings["_startingXPath"]).Type == JTokenType.String)
        {
            var startingXPath = ((JValue)settings["_startingXPath"]).ToObject<string>();
            var candidates = boundary.SelectNodes(startingXPath);
            if (candidates == null || candidates.Count == 0)
            {
                return 0;
            }

            boundary = candidates[0];
        }

        for (; cursor != null && cursor != boundary && !reachedBoundary; cursor = cursor.ParentNode)
        {
            var siblingText = this.GetTextFromSiblings(cursor, boundary, ref reachedBoundary);
            if (string.IsNullOrEmpty(siblingText))
            {
                continue;
            }

            collected.Append(siblingText);
            collected.Append(" ");
        }
    }

    return collected.ToString().Trim().Length;
}
/// <summary>
/// Looks for the "Visa fler" button among the <c>svtoa-button</c> links and,
/// when found, stores its decoded target in <c>nextPageUrl</c> and sets
/// <c>HasNextPage</c>. Both members are reset first.
/// </summary>
/// <param name="node">Node the button query is evaluated on.</param>
private void GetNextPageVideosUrl(HtmlAgilityPack.HtmlNode node)
{
    HasNextPage = false;
    nextPageUrl = "";

    var buttons = node.SelectNodes("//a[contains(@class, 'svtoa-button')]");
    if (buttons == null)
    {
        return;
    }

    var moreButton = buttons.FirstOrDefault(button => (button.InnerText ?? "").Contains("Visa fler"));
    if (moreButton == null)
    {
        return;
    }

    nextPageUrl = moreButton.GetAttributeValue("href", "");
    nextPageUrl = HttpUtility.UrlDecode(nextPageUrl);
    // Some URLs arrive HTML-encoded as well.
    nextPageUrl = HttpUtility.HtmlDecode(nextPageUrl);
    HasNextPage = true;
}
/// <summary>
/// Collects the tag names of a video page: the inner text of every
/// <c>a.eachTag_video</c> inside the first <c>div.tag_video</c>.
/// </summary>
/// <param name="documentNode">Node the <c>//div</c> query is evaluated on.</param>
/// <returns>The tag texts in document order; empty when the tag container or its links are missing.</returns>
private static List<string> GetTags(HtmlAgilityPack.HtmlNode documentNode)
{
    List<string> tags = new List<string>();

    // SelectNodes yields null when nothing matches, so both queries are
    // checked before being handed to LINQ.
    var divs = documentNode.SelectNodes("//div");
    if (divs == null)
    {
        return tags;
    }

    var tagContainer = divs.FirstOrDefault(
        c => c.Attributes["class"] != null && c.Attributes["class"].Value.Equals("tag_video"));
    if (tagContainer == null)
    {
        return tags;
    }

    var links = tagContainer.SelectNodes(".//a");
    if (links == null)
    {
        return tags;
    }

    tags.AddRange(
        links.Where(a => a.Attributes["class"] != null && a.Attributes["class"].Value.Equals("eachTag_video"))
             .Select(a => a.InnerText));

    return tags;
}
/// <summary>
/// Appends one entry per table row to <paramref name="loadData"/>; each entry
/// is a <c>List&lt;object&gt;</c> holding the inner text of the row's
/// <c>td</c> descendants. Rows without any <c>td</c> (e.g. header rows) are skipped.
/// </summary>
/// <param name="loadData">List that receives the row lists.</param>
/// <param name="node">Node whose <c>tr</c> descendants are read.</param>
void ParseHTMLTable(ref List<object> loadData, HtmlAgilityPack.HtmlNode node)
{
    // SelectNodes yields null when nothing matches.
    var rows = node.SelectNodes(".//tr");
    if (rows == null)
    {
        return;
    }

    foreach (HtmlAgilityPack.HtmlNode trNode in rows)
    {
        // Query the cells once and reuse the result.
        var cells = trNode.SelectNodes(".//td");
        if (cells == null)
        {
            continue;
        }

        List<object> rowData = new List<object>();
        foreach (HtmlAgilityPack.HtmlNode tdNode in cells)
        {
            rowData.Add(tdNode.InnerText);
        }

        loadData.Add(rowData);
    }
}
/// <summary>
/// Returns the text of a psalm node. When that text contains the literal
/// "span", the inner HTML of every descendant <c>span</c> without a
/// <c>style</c> attribute value is appended to it.
/// </summary>
/// <param name="psalmText">The psalm node; may be null.</param>
/// <returns>The assembled text, or an empty string for a null node.</returns>
private static string ParsePsalm(HtmlAgilityPack.HtmlNode psalmText)
{
    if (psalmText == null)
    {
        return string.Empty;
    }

    var resT = psalmText.InnerText;

    // NOTE(review): this tests the rendered text, not the markup, for the
    // word "span" — confirm whether InnerHtml was intended.
    if (!psalmText.InnerText.Contains("span"))
    {
        return resT;
    }

    // The text can mention "span" without any span element being present,
    // in which case SelectNodes yields null.
    var spans = psalmText.SelectNodes(".//span");
    if (spans == null)
    {
        return resT;
    }

    foreach (HtmlNode span in spans)
    {
        string attributeValue = span.GetAttributeValue("style", "");
        if (string.IsNullOrEmpty(attributeValue))
        {
            resT += span.InnerHtml;
        }
    }

    return resT;
}
/// <summary>
/// Null-safe variant of <c>SelectNodes</c> for a compiled XPath expression.
/// </summary>
/// <param name="node">The node the expression is evaluated on.</param>
/// <param name="xpath">The compiled XPath expression.</param>
/// <returns>
/// The nodes matching <paramref name="xpath"/>, or an empty
/// <c>HtmlNodeCollection</c> owned by <paramref name="node"/> when nothing matched.
/// </returns>
public static HtmlNodeCollection SelectSafeNodes(this HtmlNode node, XPathExpression xpath)
{
    HtmlNodeCollection matches = node.SelectNodes(xpath);
    if (matches == null)
    {
        matches = new HtmlNodeCollection(node);
    }

    return matches;
}
/// <summary>
/// Parses the schedule table out of the page and returns it as a <c>DataTable</c>.
/// </summary>
/// <param name="content">The loaded HTML document.</param>
/// <param name="istech">Selects which outer row (3 when true, 2 when false) holds the schedule table.</param>
/// <returns>
/// A table with the coach and car-type columns, one column per header cell
/// found, and the coach-number column. It contains no data rows when the
/// schedule table or its rows are missing from the page.
/// </returns>
private DataTable GetDataTable(Html.HtmlDocument content, bool istech)
{
    DataTable dt = new DataTable();
    Html.HtmlDocument document = content;
    Html.HtmlNode table = document.DocumentNode.SelectSingleNode("/html/body/table/tr[2]/td/table/tr/td[2]/table[2]/tr[" + (istech ? "3" : "2") + "]/td/div/div/table");

    // Columns: coach name and car type come first.
    dt.Columns.Add(new DataColumn("教练员"));
    dt.Columns.Add(new DataColumn("车型"));

    // One column per header cell. SelectNodes yields null when nothing
    // matches, and the table itself may be missing.
    Html.HtmlNodeCollection trTime = table == null ? null : table.SelectNodes("./tr[1]/th[1]/font/b/th");
    if (trTime != null)
    {
        foreach (Html.HtmlNode headerCell in trTime)
        {
            dt.Columns.Add(new DataColumn(headerCell.InnerText));
        }
    }

    dt.Columns.Add(new DataColumn("教练号"));

    // Data area: every row after the header row.
    Html.HtmlNodeCollection trs = table == null ? null : table.SelectNodes("./tr[position()>1]");
    if (trs == null)
    {
        return dt;
    }

    foreach (Html.HtmlNode tr in trs)
    {
        Html.HtmlNodeCollection tds = tr.SelectNodes("./td");
        if (tds == null)
        {
            // Row without cells: nothing to record.
            continue;
        }

        DataRow dr = dt.NewRow();
        int index = 0;
        foreach (Html.HtmlNode td in tds)
        {
            if (index == 0)
            {
                // The first cell carries two values (name and car), so it
                // fills two columns and the index advances twice.
                string name = td.SelectSingleNode("./font/span/child::text()[position()=1]").InnerText;
                string car = td.SelectSingleNode("./font/span/span").InnerText;
                dr[index] = name;
                index++;
                dr[index] = car;
            }
            else if (td.InnerText == "可预约")
            {
                dr[index] = "yunyue";
            }
            else if (istech && index == 2)
            {
                dr[index] = td.SelectSingleNode("./font/span").InnerText;
            }
            else
            {
                dr[index] = td.InnerText;
            }

            index++;
        }

        dt.Rows.Add(dr);
    }

    return dt;
}
/// <summary>
/// Parses the meter ("counter") page into a <c>ViewCounterModel</c>: the
/// property rows of the <c>jqTabsDevices</c> section and the readings cells of
/// the <c>jqTabsSingleMeterIndications</c> section.
/// </summary>
/// <param name="data">Raw HTML of the page.</param>
/// <param name="progressController">Not used by this method.</param>
/// <returns>
/// The populated model; an empty model when the document has at most one
/// top-level node; null when a required section is missing or has fewer
/// rows/cells than the parser reads.
/// </returns>
public static ViewCounterModel ViewCounter(string data, ProgressDialogController progressController)
{
    // Row 25 is the highest property row read; 15 readings cells are consumed.
    const int RequiredInfoRows = 26;
    const int RequiredIndicationCells = 15;

    ViewCounterModel result = new ViewCounterModel();

    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(data);

    // Either part being absent makes the document unusable ("||", not "&&":
    // the second operand would dereference a null DocumentNode).
    if (doc.DocumentNode == null || doc.DocumentNode.ChildNodes == null)
    {
        return null;
    }

    if (doc.DocumentNode.ChildNodes.Count <= 1)
    {
        return result;
    }

    #region Communication channel / meter information section

    // SelectSingleNode returns null for a missing section, which is what the
    // null checks below rely on.
    HtmlAgilityPack.HtmlNode jqTabsDevices = doc.DocumentNode.SelectSingleNode("//div[@id='jqTabsDevices']");
    if (jqTabsDevices == null)
    {
        return null;
    }

    HtmlNodeCollection info = jqTabsDevices.SelectNodes("div[2]/table/tbody/tr");
    if (info == null || info.Count < RequiredInfoRows)
    {
        return null;
    }

    result.AccountPoint = ReadCounterInfoCell(info, 0);                // metering point name
    result.CounterType = ReadCounterInfoCell(info, 1);                 // meter type
    result.CounterNumber = ReadCounterInfoCell(info, 2).Trim();        // serial number
    result.CounterNetworkAddress = ReadCounterInfoCell(info, 3);       // network address
    // Row 4 is not read.
    result.Ktt = ReadCounterInfoCell(info, 5);                         // transformation ratio
    result.CounterManufacturer = ReadCounterInfoCell(info, 6).Trim();  // manufacturer
    result.AccountType = ReadCounterInfoCell(info, 7).Trim();          // accounting type
    result.AbonentFullName = ReadCounterInfoCell(info, 8).Trim();      // subscriber full name
    result.AbonentName = ReadCounterInfoCell(info, 9).Trim();          // subscriber name
    result.AbonentShortName = ReadCounterInfoCell(info, 10).Trim();    // subscriber short name
    result.Substation = ReadCounterInfoCell(info, 11);                 // substation
    result.ObjectName = ReadCounterInfoCell(info, 12);                 // object name
    result.AccountPointName = ReadCounterInfoCell(info, 13);           // metering point title
    result.TP = ReadCounterInfoCell(info, 14);                         // transformer substation number
    result.ObjectAddress = ReadCounterInfoCell(info, 15);              // object address
    result.ObjectState = ReadCounterInfoCell(info, 16);                // object locality
    result.AbonentAddress = ReadCounterInfoCell(info, 17);             // subscriber address
    result.Fider = ReadCounterInfoCell(info, 18);                      // feeder
    result.DogNumber = ReadCounterInfoCell(info, 19);                  // contract number
    result.AmperParentPointId = ReadCounterInfoCell(info, 20).Trim();  // parent personal account
    result.Departament = ReadCounterInfoCell(info, 21);                // district electrical network
    result.AmperCounterNumber = ReadCounterInfoCell(info, 22);         // serial number from the billing system
    result.AmperPointId = ReadCounterInfoCell(info, 23).Trim();        // personal account
    result.Status = ReadCounterInfoCell(info, 24);                     // current status

    // Last session: stays at the TryParse default when the text is not a date.
    DateTime date;
    DateTime.TryParse(
        ReadCounterInfoCell(info, 25).Trim(),
        System.Globalization.CultureInfo.CreateSpecificCulture("en-US"),
        System.Globalization.DateTimeStyles.None,
        out date);
    result.LastSessionDate = date;

    #endregion

    #region Readings section

    HtmlAgilityPack.HtmlNode jqTabsSingleMeterIndications = doc.DocumentNode.SelectSingleNode("//div[@id='jqTabsSingleMeterIndications']");
    if (jqTabsSingleMeterIndications == null)
    {
        return null;
    }

    info = jqTabsSingleMeterIndications.SelectNodes("table/tbody/tr/td");
    if (info == null || info.Count < RequiredIndicationCells)
    {
        return null;
    }

    result.IndicationViewItem = ParseCounterIndicationCells(info);

    #endregion

    return result;
}

/// <summary>Returns the text of the second cell of the given row, or an empty string when the row has no second cell.</summary>
private static string ReadCounterInfoCell(HtmlNodeCollection rows, int rowIndex)
{
    HtmlNode cell = rows[rowIndex].SelectSingleNode("td[2]");
    return cell == null ? string.Empty : cell.InnerText;
}

/// <summary>Maps the first fifteen readings cells, in order, onto a new <c>IndicationViewItem</c>.</summary>
private static IndicationViewItem ParseCounterIndicationCells(HtmlNodeCollection cells)
{
    IndicationViewItem ivi = new IndicationViewItem();
    ivi.PreviousIndications = new Indications();
    ivi.NextIndications = new Indications();

    int next = 0;

    ivi.AccountingPoint = cells[next++].InnerText;                               // point
    ivi.CounterType = cells[next++].InnerText;                                   // type

    ivi.PreviousIndications.Tarriff0 = GetIndication(cells[next++].InnerText);   // previous readings T0
    ivi.PreviousIndications.Tarriff1 = GetIndication(cells[next++].InnerText);   // previous readings T1
    ivi.PreviousIndications.Tarriff2 = GetIndication(cells[next++].InnerText);   // previous readings T2
    ivi.PreviousIndications.Tarriff3 = GetIndication(cells[next++].InnerText);   // previous readings T3
    ivi.PreviousIndications.Tarriff4 = GetIndication(cells[next++].InnerText);   // previous readings T4
    ivi.PreviousIndications.DataReliability = cells[next++].InnerText;           // previous readings reliability

    ivi.NextIndications.Tarriff0 = GetIndication(cells[next++].InnerText);       // current readings T0
    ivi.NextIndications.Tarriff1 = GetIndication(cells[next++].InnerText);       // current readings T1
    ivi.NextIndications.Tarriff2 = GetIndication(cells[next++].InnerText);       // current readings T2
    ivi.NextIndications.Tarriff3 = GetIndication(cells[next++].InnerText);       // current readings T3
    ivi.NextIndications.Tarriff4 = GetIndication(cells[next++].InnerText);       // current readings T4
    ivi.NextIndications.DataReliability = cells[next++].InnerText;               // current readings reliability

    ivi.Difference = GetIndication(cells[next++].InnerText);                     // difference

    return ivi;
}
/// <summary>
/// Parses the device (modem) page into a <c>ViewDeviceModel</c>: session
/// details from <c>jqTabsDevices</c>, one readings item per row of
/// <c>jqTabsBalances</c>, and the per-month quality tables.
/// </summary>
/// <param name="data">Raw HTML of the page.</param>
/// <param name="progressController">Not used by this method.</param>
/// <returns>
/// The populated model; an empty model when the document has at most one
/// top-level node; null when a required section is missing or any exception
/// occurs while parsing (the exception is logged when a logger is set).
/// </returns>
public static ViewDeviceModel ViewDevice(string data, ProgressDialogController progressController)
{
    ViewDeviceModel result = new ViewDeviceModel();
    try
    {
        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(data);

        // Either part being absent makes the document unusable ("||", not
        // "&&": the second operand would dereference a null DocumentNode).
        if (doc.DocumentNode == null || doc.DocumentNode.ChildNodes == null)
        {
            return null;
        }

        if (doc.DocumentNode.ChildNodes.Count > 1)
        {
            #region Communication channel section

            // SelectSingleNode returns null for a missing section, which is
            // what the null checks below rely on.
            HtmlAgilityPack.HtmlNode jqTabsDevices = doc.DocumentNode.SelectSingleNode("//div[@id='jqTabsDevices']");
            if (jqTabsDevices == null)
            {
                return null;
            }

            HtmlNodeCollection sessionInformation = jqTabsDevices.SelectNodes("div[2]/div/table/tbody/tr");
            if (sessionInformation == null)
            {
                return null;
            }

            result.Session = new SessionInformation();
            result.Session.ModemManufacturer = ReadDeviceSessionCell(sessionInformation, 2);   // modem manufacturer
            result.Session.Model = ReadDeviceSessionCell(sessionInformation, 3);               // device model
            result.Session.Description = ReadDeviceSessionCell(sessionInformation, 4);         // description
            result.Session.CurrentStatus = ReadDeviceSessionCell(sessionInformation, 6);       // status

            // Last session: stays at the TryParse default when the text is not a date.
            DateTime date;
            DateTime.TryParse(ReadDeviceSessionCell(sessionInformation, 7), out date);
            result.Session.LastSessionDate = date;

            #endregion

            #region Readings section

            HtmlAgilityPack.HtmlNode jqTabsBalances = doc.DocumentNode.SelectSingleNode("//div[@id='jqTabsBalances']");
            if (jqTabsBalances == null)
            {
                return null;
            }

            HtmlNodeCollection counters = jqTabsBalances.SelectNodes("table/tbody/tr");
            if (counters == null)
            {
                return null;
            }

            result.CountersIndications = new List<IndicationViewItem>();
            for (int i = 0; i < counters.Count; i++)
            {
                // A malformed row throws here and is handled by the catch below.
                HtmlNodeCollection hnc = counters[i].SelectNodes("td");
                result.CountersIndications.Add(ParseDeviceIndicationRow(hnc));
            }

            #endregion

            #region Readings quality

            if (result.QualityIndications == null)
            {
                result.QualityIndications = new List<QualityIndications>();
            }

            HtmlNodeCollection indicationsQualityMonths = doc.DocumentNode.SelectNodes("//table[contains(@class,'tableQualityIndications')]");
            if (indicationsQualityMonths != null)
            {
                foreach (HtmlNode monthTable in indicationsQualityMonths)
                {
                    QualityIndications qi = new QualityIndications();

                    // A missing period header falls back to "???".
                    HtmlNode periodHeader = monthTable.SelectSingleNode("thead/tr[1]/th[2]");
                    qi.Period = periodHeader == null ? "???" : periodHeader.InnerText;
                    qi.PointsData = ParseMonthQualityIndications(monthTable.SelectNodes("tbody/tr"));
                    result.QualityIndications.Add(qi);
                }
            }

            #endregion
        }
    }
    catch (Exception ex)
    {
        // TODO: add logging
        _logger?.Error(ex);
        return null;
    }

    return result;
}

/// <summary>Returns the text of the second cell of the given session row; throws when the row or cell is missing.</summary>
private static string ReadDeviceSessionCell(HtmlNodeCollection rows, int rowIndex)
{
    return rows[rowIndex].SelectNodes("td[2]").Single().InnerText;
}

/// <summary>Maps the first fifteen cells of one readings row, in order, onto a new <c>IndicationViewItem</c>.</summary>
private static IndicationViewItem ParseDeviceIndicationRow(HtmlNodeCollection cells)
{
    IndicationViewItem ivi = new IndicationViewItem();
    ivi.PreviousIndications = new Indications();
    ivi.NextIndications = new Indications();

    int next = 0;

    ivi.AccountingPoint = cells[next++].InnerText;                               // point
    ivi.CounterType = cells[next++].InnerText;                                   // type

    ivi.PreviousIndications.Tarriff0 = GetIndication(cells[next++].InnerText);   // previous readings T0
    ivi.PreviousIndications.Tarriff1 = GetIndication(cells[next++].InnerText);   // previous readings T1
    ivi.PreviousIndications.Tarriff2 = GetIndication(cells[next++].InnerText);   // previous readings T2
    ivi.PreviousIndications.Tarriff3 = GetIndication(cells[next++].InnerText);   // previous readings T3
    ivi.PreviousIndications.Tarriff4 = GetIndication(cells[next++].InnerText);   // previous readings T4
    ivi.PreviousIndications.DataReliability = cells[next++].InnerText;           // previous readings reliability

    ivi.NextIndications.Tarriff0 = GetIndication(cells[next++].InnerText);       // current readings T0
    ivi.NextIndications.Tarriff1 = GetIndication(cells[next++].InnerText);       // current readings T1
    ivi.NextIndications.Tarriff2 = GetIndication(cells[next++].InnerText);       // current readings T2
    ivi.NextIndications.Tarriff3 = GetIndication(cells[next++].InnerText);       // current readings T3
    ivi.NextIndications.Tarriff4 = GetIndication(cells[next++].InnerText);       // current readings T4
    ivi.NextIndications.DataReliability = cells[next++].InnerText;               // current readings reliability

    ivi.Difference = GetIndication(cells[next++].InnerText);                     // difference

    return ivi;
}
/// <summary>
/// Crawls listing pages 0–99 of the adayroi.com grocery category and appends
/// one " $ "-separated line per product (brand id, category id, product item
/// id, merchant id, name, price, image URL) to <c>F:\DATA.txt</c>.
/// </summary>
/// <remarks>
/// A page without a product listing is reported on the console and skipped;
/// products lacking an expected element are skipped. Any other exception
/// ends the crawl: its message is printed and the method waits for Enter.
/// </remarks>
static void eventCrawler()
{
    try
    {
        // URL of the listing; the page number is appended per request.
        const string _url = "https://www.adayroi.com/thuc-pham-r591?p=";

        // One client is enough for all pages.
        HtmlAgilityPack.HtmlWeb htmlWeb = new HtmlAgilityPack.HtmlWeb();
        htmlWeb.UserAgent = "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0";

        for (int i = 0; i < 100; i++)
        {
            HtmlAgilityPack.HtmlDocument htmlDoc = htmlWeb.Load(_url + i);

            HtmlAgilityPack.HtmlNode _nodThreads = htmlDoc.DocumentNode.SelectSingleNode(@"//div[@class='row body-list-item']");
            HtmlAgilityPack.HtmlNodeCollection nodChuDe = _nodThreads == null ? null : _nodThreads.SelectNodes(@"div");
            if (nodChuDe == null)
            {
                // No product listing on this page.
                Console.WriteLine("Trang " + i + " EMPTY");
                continue;
            }

            // Accumulates the output lines for this page.
            StringBuilder ketqua = new StringBuilder();

            foreach (var n in nodChuDe)
            {
                // The product wrapper carries the data-* identifiers.
                var product = n.SelectSingleNode("div");
                var imageNode = n.SelectSingleNode("div/div[1]/span/a/img");
                var nameNode = n.SelectSingleNode("div/div[2]/div/h4");
                var priceNode = n.SelectSingleNode("div/div[2]/div/div/span");
                if (product == null || imageNode == null || nameNode == null || priceNode == null)
                {
                    // Not a product tile (or the layout differs); skip it.
                    continue;
                }

                string brand_id = product.GetAttributeValue("data-brand-id", "").Trim();
                string category_id = product.GetAttributeValue("data-category-id", "").Trim();
                string product_item_id = product.GetAttributeValue("data-product-item-id", "").Trim();
                string merchant_id = product.GetAttributeValue("data-merchant-id", "").Trim();
                string urlImage = imageNode.GetAttributeValue("data-other-src", "").Trim();
                string name = HttpUtility.HtmlDecode(nameNode.InnerText.Trim());
                string gia = priceNode.InnerText.Trim();

                // A tile without an image URL ends processing of this page.
                if (string.IsNullOrEmpty(urlImage))
                {
                    break;
                }

                ketqua.Append(brand_id + " $ " + category_id + " $ " + product_item_id + " $ " + merchant_id + " $ " + name + " $ " + gia + " $ " + urlImage + "\n");
            }

            using (FileStream fs = new FileStream(@"F:\DATA.txt", FileMode.Append, FileAccess.Write))
            using (StreamWriter w = new StreamWriter(fs, Encoding.UTF8))
            {
                w.WriteLine(HttpUtility.HtmlDecode(ketqua.ToString()) + "\n");
                Console.WriteLine("Trang " + i + " OK");
            }
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.Message);
        Console.ReadLine();
    }
}
/// <summary>
/// Search Shelfari page for possible series info, returning the next title in the series without downloading any other pages.
/// Also records page count / estimated reading time and review/reader counts on <c>curBook</c>.
/// TODO: Un-yuckify all the return paths without nesting a ton of ifs
/// </summary>
/// <param name="searchHtmlDoc">Book's Shelfari page, pre-downloaded</param>
/// <returns>
/// The title of the following book in the first standard series found, or an
/// empty string when the page lacks the edition module, series info, or a
/// "followed by" entry.
/// </returns>
private string GetNextInSeriesTitle(HtmlAgilityPack.HtmlDocument searchHtmlDoc)
{
    // Estimated reading time and page count from Shelfari.
    HtmlAgilityPack.HtmlNode pageNode = searchHtmlDoc.DocumentNode.SelectSingleNode("//div[@id='WikiModule_FirstEdition']");
    if (pageNode == null)
    {
        return "";
    }

    HtmlAgilityPack.HtmlNode node1 = pageNode.SelectSingleNode(".//div/div");
    if (node1 == null)
    {
        return "";
    }

    // Parse page count and multiply by average reading time.
    // The number is matched as digits with optional ",ddd" groups in a single
    // alternative; with an unanchored "\d+ | \d+,\d+" alternation the first
    // branch wins and "1,234" is read as "1".
    Match match1 = Regex.Match(node1.InnerText, @"Page Count: (\d+(?:,\d+)*)");
    if (match1.Success)
    {
        int pageCount = int.Parse(match1.Groups[1].Value, NumberStyles.AllowThousands, CultureInfo.InvariantCulture);
        TimeSpan span = TimeSpan.FromMinutes(pageCount * 1.2890625);

        // TimeSpan.Hours wraps at 24; use the whole-hour total instead.
        int totalHours = (int)span.TotalHours;

        main.Log(String.Format("Typical time to read: {0} hours and {1} minutes ({2} pages)"
                               , totalHours, span.Minutes, match1.Groups[1].Value));
        curBook.pagesInBook = match1.Groups[1].Value;
        curBook.readingHours = totalHours.ToString();
        curBook.readingMinutes = span.Minutes.ToString();
    }

    // Highlighted-passage figures: review and reader counts from Shelfari serve as dummy info for now.
    HtmlAgilityPack.HtmlNode members = searchHtmlDoc.DocumentNode.SelectSingleNode("//ul[@class='tabs_n tn1']");
    int highlights = 0;
    if (members != null)
    {
        Match match3 = Regex.Match(members.InnerText, @"Reviews \(((\d+)|(\d+,\d+))\)");
        if (match3.Success)
        {
            curBook.popularPassages = match3.Groups[1].Value.ToString();
        }

        match3 = Regex.Match(members.InnerText, @"Readers \(((\d+)|(\d+,\d+))\)");
        if (match3.Success)
        {
            curBook.popularHighlights = match3.Groups[1].Value.ToString();
            highlights = int.Parse(match3.Groups[1].Value, NumberStyles.AllowThousands, CultureInfo.InvariantCulture);
        }

        string textPassages = curBook.popularPassages == "1"
            ? String.Format("{0} passage has ", curBook.popularPassages)
            : String.Format("{0} passages have ", curBook.popularPassages);
        string textHighlights = curBook.popularHighlights == "1"
            ? String.Format("{0} time", curBook.popularHighlights)
            : String.Format("{0} times", curBook.popularHighlights);

        main.Log(String.Format("Popular Highlights: {0}been highlighted {1}"
                               , textPassages, textHighlights));
    }

    // If no "highlighted passages" found from Shelfari, add to log
    if (highlights == 0)
    {
        main.Log("Popular Highlights: No highlighted passages have been found for this book");
        curBook.popularPassages = "";
        curBook.popularHighlights = "";
    }

    // Check if book series is available and displayed in Series & Lists on Shelfari page.
    HtmlAgilityPack.HtmlNode seriesNode = searchHtmlDoc.DocumentNode.SelectSingleNode("//div[@id='WikiModule_Series']/div");
    if (seriesNode == null)
    {
        return "";
    }

    // SelectNodes yields null when the series block has no nested div.
    var seriesTypes = seriesNode.SelectNodes(".//div");
    if (seriesTypes == null)
    {
        return "";
    }

    // If multiple Series found, find and use standard series.
    foreach (HtmlAgilityPack.HtmlNode seriesType in seriesTypes)
    {
        if (!seriesType.InnerText.Contains("(standard series)", StringComparison.OrdinalIgnoreCase)
            || seriesType.InnerText.Contains("(Reading Order)", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        Match match = Regex.Match(seriesType.InnerText, @"This is book (\d+) of (\d+)");
        if (!match.Success)
        {
            continue;
        }

        // The series name is the text of the entry's link; skip entries without one.
        HtmlAgilityPack.HtmlNode seriesLink = seriesType.ChildNodes["a"];
        if (seriesLink == null)
        {
            continue;
        }

        curBook.seriesName = seriesLink.InnerText.Trim();
        main.Log("About the series: " + seriesType.InnerText.Replace(". (standard series)", ""));
        curBook.seriesPosition = match.Groups[1].Value;
        curBook.totalInSeries = match.Groups[2].Value;

        HtmlAgilityPack.HtmlNode seriesInfo = seriesNode.SelectSingleNode(".//p");

        // NOTE(review): both the "preceded by" and "followed by" URLs are taken
        // from the first <a> of the paragraph — confirm this is the intended
        // link when both neighbours are listed.
        HtmlAgilityPack.HtmlNode seriesInfoLink = seriesInfo == null ? null : seriesInfo.ChildNodes["a"];

        // Parse preceding book
        if (seriesInfo != null && seriesInfo.InnerText.Contains("Preceded by ", StringComparison.OrdinalIgnoreCase))
        {
            match = Regex.Match(seriesInfo.InnerText, @"Preceded by (.*),", RegexOptions.IgnoreCase);
            if (!match.Success)
            {
                match = Regex.Match(seriesInfo.InnerText, @"Preceded by (.*)\.", RegexOptions.IgnoreCase);
            }
            if (match.Success)
            {
                previousTitle = match.Groups[1].Value;
            }

            main.Log("Preceded by: " + previousTitle);

            // Grab Shelfari Kindle edition link for this book
            if (seriesInfoLink != null)
            {
                previousShelfariUrl = seriesInfoLink.GetAttributeValue("href", "") + "/editions?binding=Kindle";
            }
        }

        // Check if book is the last in the series
        if (!curBook.seriesPosition.Equals(curBook.totalInSeries))
        {
            // Parse following book
            if (seriesInfo != null && seriesInfo.InnerText.Contains("followed by ", StringComparison.OrdinalIgnoreCase))
            {
                match = Regex.Match(seriesInfo.InnerText, @"followed by (.*)\.", RegexOptions.IgnoreCase);
                if (match.Success)
                {
                    main.Log("Followed by: " + match.Groups[1].Value);

                    // Grab Shelfari Kindle edition link for this book
                    if (seriesInfoLink != null)
                    {
                        nextShelfariUrl = seriesInfoLink.GetAttributeValue("href", "") + "/editions?binding=Kindle";
                    }

                    return match.Groups[1].Value;
                }
            }
        }

        // Stop after the first standard series is found. A popup to pick
        // which standard series to use may be added in future (already
        // started); not sure if worthwhile though.
        // eg: http://www.shelfari.com/books/37598923
        break;
    }

    return "";
}
/// <summary>
/// Extracts the value(s) for one configuration section from
/// <paramref name="parentNode"/>.
/// </summary>
/// <param name="name">Name of the item being extracted (not used in the body).</param>
/// <param name="config">Section describing XPath rules, children and transformations.</param>
/// <param name="parentNode">Node the XPath rules are evaluated on.</param>
/// <param name="logicalParents">Logical ancestor chain handed down to children and transformations.</param>
/// <returns>
/// A <c>JArray</c> when <c>ForceArray</c> is set or several values were found;
/// otherwise the single value, or an empty <c>JObject</c> when nothing was found.
/// </returns>
private object Extract(string name, ConfigSection config, HtmlAgilityPack.HtmlNode parentNode, List<HtmlAgilityPack.HtmlNode> logicalParents)
{
    this.RemoveUnwantedTags(config, parentNode);

    var results = new JArray();
    bool hasRules = config.XPathRules != null && config.XPathRules.Count > 0;

    if (!hasRules)
    {
        // No rules of its own: the section is only a container for its children.
        var only = new JObject();
        this.ExtractChildren(config: config, parentNode: parentNode, container: only, logicalParents: logicalParents);
        results.Add(only);
    }
    else
    {
        foreach (var xpath in config.XPathRules)
        {
            // TODO: Add try catch Exception
            var matches = parentNode.SelectNodes(xpath);
            if (matches == null || matches.Count == 0)
            {
                continue;
            }

            // Children see the current parent appended to a copy of the chain.
            var extendedParents = logicalParents.GetRange(0, logicalParents.Count);
            extendedParents.Add(parentNode);

            foreach (var match in matches)
            {
                if (config.Children != null && config.Children.Count > 0)
                {
                    var child = new JObject();
                    this.ExtractChildren(config: config, parentNode: match, container: child, logicalParents: extendedParents);
                    results.Add(child);
                    continue;
                }

                if (config.Transformations != null && config.Transformations.Count > 0)
                {
                    var transformed = this.RunTransformations(config.Transformations, match, extendedParents);
                    if (transformed != null)
                    {
                        results.Add(transformed);
                    }
                    continue;
                }

                // Leaf item: take its de-entitized text.
                if (match.InnerText != null)
                {
                    results.Add(HtmlEntity.DeEntitize(match.InnerText).Trim());
                }
            }
        }
    }

    if (config.ForceArray)
    {
        return results;
    }

    switch (results.Count)
    {
        case 0:
            return new JObject();
        case 1:
            return results.First;
        default:
            return results;
    }
}
/// <summary>
/// On Enter: searches Google for the entered serial, shows the right-hand
/// info panel (when present) in the browser control, and records the serial
/// with its title and image in <c>list</c>, <c>listView1</c> and
/// <c>listBox2</c>. A serial that is already listed has its "(n)" counter incremented.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Key event; the key press is suppressed after handling Enter.</param>
private void textBox1_KeyDown(object sender, KeyEventArgs e)
{
    if (e.KeyCode != Keys.Enter)
    {
        return;
    }

    String serial = textBox1.Text.Length > 0 ? textBox1.Text : "INVALID_SERIAL";

    try
    {
        webBrowser1.AllowNavigation = true;
        string movieTitle = "UNKNOWN";
        string imgSrc = "UNKNOWN";

        // Escape the serial so spaces, '&', '#', etc. do not corrupt the query string.
        string searchUrl = "https://www.google.com/search?q=" + System.Uri.EscapeDataString(serial);

        HtmlWeb web = new HtmlWeb();
        document = web.Load(searchUrl);
        HtmlAgilityPack.HtmlNode bodyNode = document.DocumentNode.SelectSingleNode("//td[@id='rhs_block']");
        richTextBox1.Text = document.DocumentNode.InnerHtml;

        // A missing panel is treated like an empty one.
        if (bodyNode != null && bodyNode.InnerText.Length > 1)
        {
            webBrowser1.DocumentText = bodyNode.InnerHtml;

            // Use the first image that has a src attribute, if any.
            var images = bodyNode.SelectNodes(".//img");
            if (images != null)
            {
                foreach (var image in images)
                {
                    var src = image.GetAttributeValue("src", null);
                    if (src != null)
                    {
                        imgSrc = src;
                        break;
                    }
                }
            }

            textBox2.Enabled = false;
            normalQuery = true;

            // Without a title element the title box is re-enabled.
            HtmlAgilityPack.HtmlNode titleNode = bodyNode.SelectSingleNode("//div[@class='_B5d']");
            if (titleNode != null)
            {
                movieTitle = titleNode.InnerText;
            }
            else
            {
                textBox2.Enabled = true;
            }
        }
        else
        {
            normalQuery = false;
            textBox2.Enabled = true;
            webBrowser1.Navigate(searchUrl);
        }

        // Find the first existing entry whose label contains the serial.
        int existing = -1;
        for (int i = 0; i < list.Count; ++i)
        {
            if (list[i][0].ToString().Contains(serial))
            {
                existing = i;
                break;
            }
        }

        if (existing < 0)
        {
            list.Add(new String[] { serial + " (1)", movieTitle, imgSrc });
            listView1.Items.Add(serial + " (1)");
            listBox2.Items.Add(movieTitle);
        }
        else
        {
            // Labels look like "<serial> (n)". The counter sits in the LAST
            // pair of parentheses, so a '(' inside the serial is harmless.
            string label = list[existing][0].ToString();
            int open = label.LastIndexOf('(');
            int count = Int32.Parse(label.Substring(open + 1, label.Length - open - 2));
            list[existing][0] = label.Substring(0, open) + "(" + (count + 1) + ")";

            // Rebuild both views from the list.
            listView1.Items.Clear();
            listBox2.Items.Clear();
            foreach (String[] entry in list)
            {
                listView1.Items.Add(entry[0]);
                listBox2.Items.Add(entry[1]);
            }
        }
    }
    catch (Exception)
    {
        // NOTE(review): this text is overwritten by the reset just below —
        // confirm whether the message was meant to stay visible.
        textBox1.Text = "INVALID SERIAL";
    }

    textBox1.Text = "";
    e.SuppressKeyPress = true;
}
/// <summary>
/// Parses a wiki pattern page: stores its title, categories, outgoing links
/// and relation types on <c>patternObject</c>, then writes the object as
/// JSON to the file named by <c>Pattern.GetFileName</c>.
/// </summary>
/// <param name="response">The crawled page; its <c>#content</c> element is parsed.</param>
public override void Parse(Response response)
{
    // Load the #content of the page into a new HtmlAgilityPack document.
    HtmlDocument ContentDocument = new HtmlDocument();
    ContentDocument.LoadHtml(response.Css("#content").First().OuterHtml);
    HtmlAgilityPack.HtmlNode BodyNode = ContentDocument.DocumentNode;

    patternObject.Title = BodyNode.SelectSingleNode("//*[@id=\"firstHeading\"]").InnerHtml;
    HtmlAgilityPack.HtmlNode ContentNode = BodyNode.SelectSingleNode("//*[@id=\"mw-content-text\"]");

    // Remove the table of contents and all comments to save space and later
    // client-side processing time.
    HtmlAgilityPack.HtmlNode TocNode = ContentNode.SelectSingleNode("//*[@id=\"toc\"]");
    if (TocNode != null)
    {
        TocNode.Remove();
    }

    // SelectNodes yields null when the page has no comments.
    var CommentNodes = ContentNode.SelectNodes("//comment()");
    if (CommentNodes != null)
    {
        foreach (var comment in CommentNodes)
        {
            comment.Remove();
        }
    }

    ContentNode.PrependChild(BodyNode.SelectSingleNode("//*[@id=\"firstHeading\"]"));

    // Set the patternObject's title (again, now that comments are gone).
    patternObject.Title = ContentNode.SelectSingleNode("//*[@id=\"firstHeading\"]").InnerHtml;

    // SelectNodes yields null when the page has no links.
    var Links = ContentNode.SelectNodes("//a/@href");
    if (Links != null)
    {
        foreach (var link in Links)
        {
            string Href = link.Attributes["href"].Value;

            // Skip if this is a redlink (page doesn't exist).
            if (Href.Contains("redlink=1"))
            {
                continue;
            }

            // Skip if this links to this page.
            if (Href.Split('#').First() == response.FinalUrl)
            {
                continue;
            }

            // If any of the link's ancestor nodes is the "category links" part of the page...
            if (link.Ancestors().Any(ancestor => ancestor.Id == "catlinks"))
            {
                // ...and it is not the "Categories" special page, record the category.
                if (link.InnerText != "Categories")
                {
                    patternObject.Categories.Add(link.InnerText);
                }
            }
            else
            {
                // Assume it is a normal text-body link; make sure it is known.
                patternObject.CreateOrGetPatternLink(link.InnerText);
            }

            // Add relation info if this link sits under the "Relations" heading.
            // Each heading is looked up once and reused.
            HtmlAgilityPack.HtmlNode SectionHeadingNode = GetNodeReleventPageHeading(link, "h2");
            if (SectionHeadingNode == null || SectionHeadingNode.InnerText != "Relations")
            {
                continue;
            }

            // The h3 heading names the relation type; without it there is nothing to record.
            HtmlAgilityPack.HtmlNode RelationHeadingNode = GetNodeReleventPageHeading(link, "h3");
            if (RelationHeadingNode == null)
            {
                continue;
            }

            String RelationName = RelationHeadingNode.InnerText;

            // An h4 located after the h3 is a "with x" sub-category of the
            // relation (used in the "Can Instantiate" section).
            HtmlAgilityPack.HtmlNode SubHeadingNode = GetNodeReleventPageHeading(link, "h4");
            if (SubHeadingNode != null && RelationHeadingNode.InnerStartIndex < SubHeadingNode.InnerStartIndex)
            {
                RelationName = RelationHeadingNode.InnerText + " " + SubHeadingNode.InnerText;
            }

            // Add the relevant relation to this link.
            patternObject.CreateOrGetPatternLink(link.InnerText).Type.Add(RelationName);
        }
    }

    // Get a cleaned copy of the #content HTML for the JSON data.
    patternObject.Content = ProcessPageContentToString(ContentNode);
    string Json = JsonConvert.SerializeObject(patternObject);
    File.WriteAllText(Pattern.GetFileName(patternObject.Title), Json);
}
/// <summary>
/// Runs <paramref name="xpath"/> against <paramref name="node"/> and returns the matches
/// as a list, substituting an empty list when <c>SelectNodes</c> yields null.
/// </summary>
/// <param name="node">The node to query.</param>
/// <param name="xpath">The XPath expression to evaluate.</param>
/// <returns>A list of the matching nodes; never null.</returns>
public static IList<HtmlNode> SelectNodesAsList(this HtmlNode node, string xpath)
{
    var matches = node.SelectNodes(xpath);
    if (matches == null)
    {
        return new List<HtmlNode>(0);
    }

    return matches.ToList();
}
/// <summary>
/// Parses the HTML file at <paramref name="file"/> into a list of entries.
/// Each <c>h3</c> element is treated as a month heading, and each <c>h2</c> found under
/// <c>following-sibling::div[1]/ul/li</c> of that heading becomes one entry.
/// </summary>
/// <param name="file">Path of the HTML file to parse.</param>
/// <returns>
/// The parsed entries. The list is empty when the file does not exist, the document has
/// parse errors, it has no body, or it has no month headings; each of these conditions
/// except a missing body is logged.
/// </returns>
public List<Entry> parseFile(string file)
{
    List<Entry> result = new List<Entry>();

    if (!File.Exists(file))
    {
        log.Error(String.Format("File \"{0}\" does not exist.", file));
        return result;
    }

    log.InfoFormat("Parsing file : {0}", file);
    HtmlDocument htmlDoc = new HtmlDocument();
    htmlDoc.Load(file);
    if (htmlDoc.DocumentNode == null)
    {
        return result;
    }

    HtmlAgilityPack.HtmlNode bodyNode = htmlDoc.DocumentNode.SelectSingleNode("//body");

    // Log every parse error, remembering whether there was any so the error
    // sequence is only enumerated once.
    bool hasParseErrors = false;
    foreach (HtmlParseError error in htmlDoc.ParseErrors)
    {
        hasParseErrors = true;
        log.Error(String.Format("Error \nCode: {0}\nLine: {1}\nPosition : {2}\nReason : {3}\nSource Text : {4} ", error.Code, error.Line, error.LinePosition, error.Reason, error.SourceText));
    }

    if (hasParseErrors || bodyNode == null)
    {
        return result;
    }

    int year = extractYear(bodyNode);

    // SelectNodes returns null (not an empty collection) when nothing matches.
    HtmlNodeCollection monthsNodeCollection = bodyNode.SelectNodes("//h3");
    if (monthsNodeCollection == null)
    {
        log.Error(String.Format("Could not find any month headings in file \"{0}\".", file));
        return result;
    }

    foreach (HtmlNode monthNode in monthsNodeCollection)
    {
        Month month = extractMonth(monthNode);
        HtmlNodeCollection entryNodeCollection = monthNode.SelectNodes("following-sibling::div[1]/ul/li/h2");

        if (entryNodeCollection == null)
        {
            log.Error(String.Format("Could not find any entries for month. Html: {0}", monthNode.InnerHtml));
            continue;
        }

        foreach (HtmlNode entryNode in entryNodeCollection)
        {
            DateTime entryDate = extractEntryDate(entryNode);
            String entryTitle = extractEntryTitle(entryNode);

            // Year and month come from the page and month headings; only the day
            // is taken from the entry's own date.
            DateStamp entryKey = new DateStamp() { Year = year, Month = month, Day = entryDate.Day };
            String entryText = extractEntryText(entryNode);
            List<Image> entryImages = extractEntryImages(entryNode);

            result.Add(new Entry() { Key = entryKey, Title = entryTitle, Text = entryText, Images = entryImages });
        }
    }

    return result;
}
/// <summary>
/// Crawls list pages of <c>gallUrl</c> from <c>initPage</c> up to <c>endPage</c>, or until
/// the last row of a page is older than <c>initDate</c>. For every post row dated within
/// [<c>initDate</c>, <c>endDate</c>] it accumulates per-user totals (post count, reply
/// count, <c>gall_count</c>, <c>gall_recommend</c>). It then fills and sorts
/// <c>userList</c> by post count, raises <c>CrawlingEnded</c> and saves the result under
/// a <c>temp-data</c> folder in the current directory.
/// </summary>
/// <remarks>
/// Raises <c>newPageHappened</c> after each processed page and <c>ErrorOccured</c> (with
/// the page HTML) when a page cannot be processed.
/// </remarks>
public void Crawler()
{
    // Timestamps in the page use a fixed machine format, so they are parsed with the
    // invariant culture rather than whatever culture the current thread has.
    const string timestampFormat = "yyyy-MM-dd HH:mm:ss";
    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

    int previousPageGallNum = 1000000000;
    Console.WriteLine(initDate.ToString() + endDate.ToString());
    string url = gallUrl + "&page=";

    //Dictionary value => count, replyNum, gallCount, gallRecommend
    Dictionary<UserInfo, int[]> userDic = new Dictionary<UserInfo, int[]>();
    int currentPage = this.initPage;

    // Dispose the WebClient once crawling is finished.
    using (var client = new WebClient())
    {
        client.Encoding = System.Text.Encoding.UTF8;

        while (true)
        {
            string text;
            try
            {
                text = client.DownloadString(url + currentPage.ToString());
                if (string.IsNullOrEmpty(text))
                {
                    continue;
                }
            }
            catch
            {
                // NOTE(review): a failed download retries the same page immediately and
                // without limit; consider a bounded retry with a delay.
                continue;
            }

            hap.HtmlDocument textHap = new hap.HtmlDocument();
            textHap.LoadHtml(text);
            hap.HtmlNodeCollection nicks = textHap.DocumentNode.SelectNodes("//tr[@class='ub-content us-post']");

            try
            {
                foreach (hap.HtmlNode nick in nicks)
                {
                    int gallNum, replyNum, gallCount, gallRecommend;
                    DateTime gallDate;
                    string subject;

                    gallNum = GetOnlyInt(nick.SelectSingleNode("./td[@class='gall_num']").InnerText);
                    gallDate = DateTime.ParseExact(nick.SelectSingleNode("./td[@class='gall_date']").Attributes["title"].Value, timestampFormat, invariant);
                    Console.WriteLine(gallNum.ToString() + " " + gallDate.ToString());

                    // Skip rows whose number is not below the last row of the previous page.
                    if (gallNum >= previousPageGallNum)
                    {
                        Console.WriteLine(previousPageGallNum.ToString() + " " + gallNum.ToString());
                        Console.WriteLine("번호 에러");
                        continue;
                    }

                    // Skip rows outside the requested date window.
                    if (DateTime.Compare(gallDate, initDate) < 0 || DateTime.Compare(gallDate, endDate) > 0)
                    {
                        Console.WriteLine("날짜 에러");
                        continue;
                    }

                    hap.HtmlNode user = nick.SelectSingleNode("./td[@class='gall_writer ub-writer']");
                    UserInfo tempUserInfo = new UserInfo(user.Attributes["data-nick"].Value);

                    // An empty data-uid means the writer is identified by data-ip instead.
                    if (user.Attributes["data-uid"].Value == "")
                    {
                        tempUserInfo.setFluidNick(user.Attributes["data-ip"].Value);
                    }
                    else
                    {
                        tempUserInfo.setFixedNick(user.Attributes["data-uid"].Value);
                    }

                    //replyNum and subject are in <td class='gall_tit ub-word'></td>
                    hap.HtmlNode subjectNode = nick.SelectSingleNode("./td[2]");
                    try
                    {
                        // When the second cell is the "gall_subject" column, the title cell is the third.
                        if (subjectNode.Attributes["class"].Value == "gall_subject")
                        {
                            subjectNode = nick.SelectSingleNode("./td[3]");
                        }

                        subject = subjectNode.SelectSingleNode("./a[1]").InnerText;

                        // A second anchor in the cell is the reply-count box.
                        if (subjectNode.SelectNodes("./a").Count == 2)
                        {
                            replyNum = GetOnlyInt(subjectNode.SelectSingleNode("./a[@class='reply_numbox']/span").InnerText);
                        }
                        else
                        {
                            replyNum = 0;
                        }
                    }
                    catch
                    {
                        // Best effort: a row without a readable subject still counts as a post.
                        subject = "NullSubjectException";
                        replyNum = 0;
                    }

                    gallCount = GetOnlyInt(nick.SelectSingleNode("./td[@class='gall_count']").InnerText);
                    gallRecommend = GetOnlyInt(nick.SelectSingleNode("./td[@class='gall_recommend']").InnerText);

                    //Dictionary value => count, replyNum, gallCount, gallRecommend
                    int[] totals;
                    if (userDic.TryGetValue(tempUserInfo, out totals))
                    {
                        totals[0] += 1;
                        totals[1] += replyNum;
                        totals[2] += gallCount;
                        totals[3] += gallRecommend;
                    }
                    else
                    {
                        userDic.Add(tempUserInfo, new int[] { 1, replyNum, gallCount, gallRecommend });
                    }

                    UserData tempUserData = new UserData(tempUserInfo);
                    tempUserData.DataInput(gallNum, replyNum, gallCount, gallRecommend, gallDate, subject);
                }
            }
            catch
            {
                if (ErrorOccured != null)
                {
                    ErrorOccured(text, null);
                }

                // Apply the same page bound as the success path; otherwise a run of
                // unparseable pages past endPage would never terminate.
                if (currentPage >= endPage)
                {
                    break;
                }

                currentPage++;
                continue;
            }

            // The last row of the page sets the number bound for the next page and
            // decides whether the crawl has gone back past initDate.
            hap.HtmlNode lastRow = nicks[nicks.Count - 1];
            previousPageGallNum = GetOnlyInt(lastRow.SelectSingleNode("./td[@class='gall_num']").InnerText);
            DateTime currentDate = DateTime.ParseExact(lastRow.SelectSingleNode("./td[@class='gall_date']").Attributes["title"].Value, timestampFormat, invariant);

            if (currentPage >= endPage || DateTime.Compare(currentDate, initDate) < 0)
            {
                break;
            }

            // Progress payload: message, date of the page's last row, pages done so far.
            System.Collections.ArrayList arr = new System.Collections.ArrayList();
            string str = currentPage.ToString() + " 페이지, 날짜: " + currentDate.ToString();
            arr.Add(str);
            arr.Add(currentDate);
            arr.Add(currentPage - initPage);
            if (newPageHappened != null)
            {
                newPageHappened(arr, null);
            }

            currentPage++;
        }
    }

    //Dictionary value => count, replyNum, gallCount, gallRecommend
    foreach (KeyValuePair<UserInfo, int[]> user in userDic)
    {
        UserInfo tempUser = user.Key;
        tempUser.count = user.Value[0];
        tempUser.replyNum = user.Value[1];
        tempUser.gallCount = user.Value[2];
        tempUser.gallRecommend = user.Value[3];
        UserRank tempUserRank = new UserRank(tempUser, user.Value[0], user.Value[1], user.Value[2], user.Value[3]);
        userList.Add(tempUserRank);
    }

    // Rank users by post count, highest first.
    var sorted = from userRank in userList orderby userRank.count descending select userRank;
    userList = sorted.ToList<UserRank>();
    if (CrawlingEnded != null)
    {
        CrawlingEnded(userList, null);
    }

    string tempDataDir = Directory.GetCurrentDirectory() + "\\temp-data\\";
    Directory.CreateDirectory(tempDataDir);
    string filename = tempDataDir + gallId + DateTime.Now.ToString("_yyyy-MM-dd_HH-mm-ss");
    SaveResult(filename);
}
/// <summary>
/// Evaluates <c>_LabelData.XPath</c> against the node returned by
/// <c>GetTempParentNode()</c>.
/// </summary>
/// <returns>
/// Whatever <c>SelectNodes</c> returns for that XPath
/// (NOTE(review): HtmlAgilityPack is understood to return null when nothing matches — confirm callers handle it).
/// </returns>
protected HtmlAgilityPack.HtmlNodeCollection GetNodeCollection()
{
    return GetTempParentNode().SelectNodes(_LabelData.XPath);
}