public SelectSingleNode ( string xpath ) : |
||
xpath | string | The XPath expression. May not be null. |
return |
/// <summary>
/// Builds a <see cref="Segment"/> from a single program-list entry.
/// </summary>
/// <param name="segmentDiv">Entry container with the image-area and text-area child divs.</param>
/// <param name="day">Day description stored on the segment and parsed into its date.</param>
/// <returns>The populated segment, including parsed duration and date.</returns>
public Segment GetSegment(HtmlNode segmentDiv, string day)
{
    var imageArea = segmentDiv.SelectSingleNode("./div[@class='programListContentImgArea']");
    var textArea = segmentDiv.SelectSingleNode("./div[@class='programListContentTextArea scheduleColumnTextArea']");

    var titleLink = textArea.GetChildById("hypProgramTxt");
    var authorLabel = textArea.GetChildById("lblAuthor");
    var thumbnail = imageArea.SelectSingleNode(".//img");

    // TODO: Move url building (below) into UrlBuilder class
    var title = titleLink.DecodeHtml();
    var detailUrl = configurationManager.BookTvDomain + titleLink.GetAttributeValue("href", "");
    var airTime = textArea.GetChildById("lblAirTime").DecodeHtml();
    var length = textArea.GetChildById("lblLength").DecodeHtml();

    // The author label is optional; an absent label yields an empty author.
    string author;
    if (authorLabel == null)
    {
        author = "";
    }
    else
    {
        author = authorLabel.DecodeHtml();
    }

    var imageUrl = configurationManager.BookTvDomain + thumbnail.Attributes["src"].Value;

    var result = new Segment
    {
        DayDescription = day,
        Title = title,
        SegmentDetailUrl = detailUrl,
        Time = airTime,
        Duration = length,
        Author = author,
        ImageUrl = imageUrl,
    };

    result.DurationInMinutes = durationParser.GetDurationInMinutes(result.Duration);
    result.Date = dayParser.Parse(result.DayDescription);
    return result;
}
/// <summary>
/// Reads the basic product fields from a product node's attributes and child elements.
/// </summary>
/// <param name="productNode">Node carrying the DdPiD, pbcatid, qty, iq and inb attributes.</param>
/// <returns>The populated <see cref="ProductBasicData"/>.</returns>
private ProductBasicData BuildProductBasicData(HtmlNode productNode)
{
    log.DebugFormat("[BuildProductBasicData] OuterHtml= {0}.", productNode.OuterHtml);

    var attributes = productNode.Attributes;
    var product = new ProductBasicData();

    // The barcode should eventually be a real barcode; for now both fields carry
    // the product id (the last digits of the barcode are the product id).
    var productId = attributes.First(a => a.OriginalName == "DdPiD").Value;
    product.Barcode = productId;
    product.ProductId = productId;

    product.pbcatid = attributes.First(a => a.OriginalName == "pbcatid").Value;
    product.qty = attributes.First(a => a.OriginalName == "qty").Value;
    product.iq = attributes.First(a => a.OriginalName == "iq").Value;
    product.inb = attributes.First(a => a.OriginalName == "inb").Value;

    var image = productNode.SelectNodes("child::*/child::div/child::img").First();
    product.ImageSource = image.Attributes.First(a => a.Name == "src").Value;

    var priceSpan = productNode.SelectSingleNode("child::*/child::div/child::div/child::div/child::span");
    product.EffectivePrice = priceSpan.InnerText;

    // Description and product name are both taken from the same anchor text.
    var anchorText = HttpUtility.HtmlDecode(
        productNode.SelectSingleNode("child::*/child::div/child::div/child::a").InnerText);
    product.Description = anchorText;
    product.ProductName = anchorText;

    log.DebugFormat("[BuildProductBasicData] fetched product={0}.", product.ToString());
    return product;
}
/// <summary>
/// Adds a new package to <paramref name="app"/> and fills its title and prices
/// from <paramref name="packageNode"/>.
/// </summary>
/// <param name="app">App that receives the new package.</param>
/// <param name="packageNode">Node of the package being parsed.</param>
/// <remarks>
/// All XPath queries are relative to <paramref name="packageNode"/> (".//").
/// A leading "//" would search the entire document, so every package would be
/// populated from the first package found on the page.
/// </remarks>
private static void AddPackage(SteamApp app, HtmlNode packageNode)
{
    var package = app.AddNewPackage();

    var packageTitleNode = packageNode.SelectSingleNode($".//{PackageTitle}");
    package.Title = packageTitleNode.InnerHtml.Replace("Buy ", "").Trim();

    // SelectNodes returns null (not an empty collection) when nothing matches.
    var priceNodes = packageNode.SelectNodes($".//div[@class='{PackagePriceXPath}']");
    if (priceNodes != null)
    {
        // No discount: the current price is also the original price.
        package.CurrentPrice = ParseNodeWithCurrencyToDecimal(priceNodes[0]);
        package.OriginalPrice = package.CurrentPrice;
        return;
    }

    // Discounted package: original and discounted prices live in separate divs.
    var originalPriceNode = packageNode.SelectSingleNode($".//div[@class='{PackageOriginalPriceXPath}']");
    package.OriginalPrice = ParseNodeWithCurrencyToDecimal(originalPriceNode);

    var discountPriceNode = packageNode.SelectSingleNode($".//div[@class='{PackageDiscountPriceXPath}']");
    package.CurrentPrice = ParseNodeWithCurrencyToDecimal(discountPriceNode);
}
/// <summary>
/// Parses a trade from its HTML node: id (the trade url), owner name,
/// description and the offered / requested items.
/// </summary>
/// <param name="code">Node of the trade being parsed.</param>
/// <remarks>
/// Descendant lookups use ".//" so they stay inside <paramref name="code"/>;
/// a leading "//" would search the whole document and return the first trade's data.
/// Missing elements produce empty values or empty lists instead of a NullReferenceException.
/// </remarks>
public Trade(HtmlNode code)
{
    htmlCode = code;

    // fetch the trade ID (well, actually it's the url)
    HtmlNode idNode = htmlCode.SelectSingleNode(".//a[contains(@href,'/trade/')]");
    id = (idNode != null) ? idNode.GetAttributeValue("href", "") : "";

    // fetch the name, donator type is used to find it
    HtmlNode node = htmlCode.SelectSingleNode(".//span[@class='regular' or starts-with(@class, 'donator')]");
    name = (node != null) ? node.InnerText : null;

    // the trade's description
    HtmlNode descriptionNode = htmlCode.SelectSingleNode(".//div[@class='notes expandable show']");
    description = (descriptionNode != null) ? descriptionNode.InnerText : "";

    // here we construct the item objects
    offer = new List<Item>();
    request = new List<Item>();

    HtmlNode itemOfferBundleHtml = htmlCode.SelectSingleNode("(div[@class='four-column'])[1]");
    HtmlNode itemRequestBundleHtml = htmlCode.SelectSingleNode("(div[@class='four-column'])[2]");

    // SelectNodes returns null when nothing matches, so guard before iterating.
    HtmlNodeCollection itemOfferHtml = (itemOfferBundleHtml != null)
        ? itemOfferBundleHtml.SelectNodes("div[starts-with(@class, 'item')]")
        : null;
    HtmlNodeCollection itemRequestHtml = (itemRequestBundleHtml != null)
        ? itemRequestBundleHtml.SelectNodes("div[starts-with(@class, 'item')]")
        : null;

    if (itemOfferHtml != null)
    {
        foreach (HtmlNode itemNode in itemOfferHtml)
        {
            offer.Add(new Item(itemNode));
        }
    }

    if (itemRequestHtml != null)
    {
        foreach (HtmlNode itemNode in itemRequestHtml)
        {
            request.Add(new Item(itemNode));
        }
    }
}
/// <summary>
/// Maps one row of a match list to a <see cref="Match"/>.
/// </summary>
/// <param name="root">Node the configured paths are evaluated against.</param>
/// <param name="matchPath">Path templates and attribute names for each match field.</param>
/// <param name="currentCount">List index combined into every path template.</param>
/// <returns>The populated match; hero and skill bracket are set only when their nodes exist.</returns>
internal Match MapHtmlNodeToMatch(HtmlNode root, MatchPath matchPath, int currentCount)
{
    Match result = new Match();

    HtmlNode idNode = root.SelectSingleNode(mainController.CombinePathWithListCount(matchPath.Id, currentCount));
    string rawId = HtmlEntity.DeEntitize(idNode.Attributes[matchPath.IdAttribute].Value);
    result.Id = rawId.Replace(matchPath.IdReplace, "");

    // Hero is optional.
    HtmlNode heroNode = root.SelectSingleNode(mainController.CombinePathWithListCount(matchPath.Hero, currentCount));
    if (heroNode != null)
    {
        string heroAttribute = heroNode.Attributes[HtmlAttributes.Hero.Attribute.Value].Value;
        string heroReference = heroAttribute.Replace(HtmlAttributes.Hero.Replace.Value, "");
        result.Hero = mainController.HeroController.GetHero(heroReference);
    }

    HtmlNode resultNode = root.SelectSingleNode(mainController.CombinePathWithListCount(matchPath.Result, currentCount));
    result.Result = mainController.MapStringToEnum<Results>(resultNode.InnerText);

    HtmlNode timeNode = root.SelectSingleNode(mainController.CombinePathWithListCount(matchPath.TimeAgo, currentCount));
    result.TimeAgo = DateTime.Parse(timeNode.Attributes[MainController.HTML_ATTRIBUTE_DATETIME].Value);

    HtmlNode typeNode = root.SelectSingleNode(mainController.CombinePathWithListCount(matchPath.Type, currentCount));
    result.Type = mainController.MapStringToEnum<Types>(typeNode.InnerText);

    HtmlNode modeNode = root.SelectSingleNode(mainController.CombinePathWithListCount(matchPath.Mode, currentCount));
    result.Mode = mainController.MapStringToEnum<Modes>(modeNode.InnerText);

    // Skill bracket is optional.
    HtmlNode skillBracketNode = root.SelectSingleNode(mainController.CombinePathWithListCount(matchPath.Skillbracket, currentCount));
    if (skillBracketNode != null)
    {
        string bracketText = HtmlEntity.DeEntitize(skillBracketNode.InnerText);
        result.Skillbracket = mainController.MapStringToEnum<Skillbrackets>(bracketText);
    }

    HtmlNode durationNode = root.SelectSingleNode(mainController.CombinePathWithListCount(matchPath.Duration, currentCount));
    result.Duration = mainController.ConvertStringToTimeSpan(durationNode.InnerText);

    HtmlNode kdaNode = root.SelectSingleNode(mainController.CombinePathWithListCount(matchPath.Kda, currentCount));
    result.Kda = mainController.ConvertStringToKda(kdaNode.InnerText);

    return result;
}
/// <summary>
/// Parses one headline div into an <see cref="ArticleInfo"/>.
/// </summary>
/// <param name="articleDiv">Div containing the "h3/a" article link and the headline date.</param>
/// <returns>The parsed article info.</returns>
/// <exception cref="CommonParsingException">
/// The link points to a video article, or the article id in the url is 0.
/// </exception>
private static ArticleInfo ParseArticleInfoDiv(HtmlNode articleDiv)
{
    var articleLink = articleDiv.SelectSingleNode("h3/a");
    var headlineDate = articleDiv.SelectSingleNode("div[@class='headline-date']");
    var commentCounter = articleDiv.SelectSingleNode("h3/a[@class='commentCount']");

    var info = new ArticleInfo();
    info.Url = articleLink.Attributes["href"].Value;

    if (info.Url.Contains(@"/video/"))
    {
        throw new CommonParsingException("Delfi TV article");
    }

    info.Id.ExternalId = info.Url.GetQueryParameterValueFromUrl("id");
    info.Title = articleLink.InnerText;
    info.DatePublished = DelfiWordyDateParser.Parse(headlineDate.InnerText);
    info.DateScraped = DateTime.UtcNow.AddHours(2);
    info.Id.Portal = Portal.Delfi;

    // The counter is rendered as "(N)"; no counter means no comments.
    if (commentCounter == null)
    {
        info.CommentCount = 0;
    }
    else
    {
        info.CommentCount = Convert.ToInt32(commentCounter.InnerText.TrimStart('(').TrimEnd(')'));
    }

    var numericId = Convert.ToInt32(info.Url.GetQueryParameterValueFromUrl("id"));
    if (numericId == 0)
    {
        throw new CommonParsingException("Article id not found");
    }

    return info;
}
/// <summary>
/// Returns the variable name found in the row's "name" cell, suffixed with "?"
/// when the description cell contains an "optional" span.
/// </summary>
/// <param name="row">Table row describing one variable.</param>
private string VariableNameResolver(HtmlNode row)
{
    var optionalMarker = row.SelectSingleNode(@".//td[contains(@class,'description')]/span[@class='optional']");
    var name = row.SelectSingleNode(@".//td[@class=""name"" ]").InnerText;

    if (optionalMarker != null)
    {
        return name + "?";
    }

    return name + "";
}
/// <summary>
/// Concatenates the gold, silver and bronze amounts found under <paramref name="node"/>
/// into one digit string, zero-padding single-digit silver (when gold is present)
/// and single-digit or empty bronze.
/// </summary>
/// <param name="node">Node containing the mgold / msilver / mbronze spans.</param>
/// <returns>
/// The combined money string, or an empty string when <paramref name="node"/> is null
/// or any of the three spans is missing.
/// </returns>
private static string getMoney(HtmlNode node)
{
    // Explicit null checks replace the former catch of NullReferenceException;
    // the observable result (empty string) is the same.
    if (node == null)
    {
        return "";
    }

    HtmlNode goldNode = node.SelectSingleNode("descendant::span[@class='mgold']");
    HtmlNode silverNode = node.SelectSingleNode("descendant::span[@class='msilver']");
    HtmlNode bronzeNode = node.SelectSingleNode("descendant::span[@class='mbronze']");
    if (goldNode == null || silverNode == null || bronzeNode == null)
    {
        return "";
    }

    string gold = HtmlEntity.DeEntitize(goldNode.InnerText).TrimStart();
    string silver = HtmlEntity.DeEntitize(silverNode.InnerText).TrimStart();
    string bronze = HtmlEntity.DeEntitize(bronzeNode.InnerText).TrimStart();

    string money = "";
    if (gold != "")
    {
        money += gold;
    }

    if (silver != "")
    {
        if (silver.Length > 1 || gold == "")
        {
            money += silver;
        }
        else
        {
            // Single-digit silver following gold is padded to two digits.
            money += "0" + silver;
        }
    }

    // Bronze is always emitted; anything shorter than two characters is padded.
    money += (bronze.Length > 1) ? bronze : "0" + bronze;
    return money;
}
/// <summary>
/// Reads the matches-played, win-rate and KDA texts of one most-played-hero row
/// and wraps them in a <see cref="Stat"/>.
/// </summary>
/// <param name="root">Node the formatted paths are evaluated against.</param>
/// <param name="currentCount">Row index substituted into each path template.</param>
/// <returns>
/// A stat built from matches played (thousands commas removed), win rate,
/// and KDA with '.' replaced by ','.
/// </returns>
internal Stat MapHtmlNode(HtmlNode root, int currentCount)
{
    string matchesPath = string.Format(PlayerPath.MostPlayedHeroes.MatchesPlayed.Value, currentCount);
    string matchesPlayed = root.SelectSingleNode(matchesPath).InnerText;

    string winRatePath = string.Format(PlayerPath.MostPlayedHeroes.Winrate.Value, currentCount);
    string winRate = root.SelectSingleNode(winRatePath).InnerText;

    string kdaPath = string.Format(PlayerPath.MostPlayedHeroes.Kda.Value, currentCount);
    string kdaRatio = root.SelectSingleNode(kdaPath).InnerText;

    string matchesWithoutSeparators = matchesPlayed.Replace(",", "");
    string kdaWithComma = kdaRatio.Replace(".", ",");
    return new Stat(matchesWithoutSeparators, winRate, kdaWithComma);
}
/// <summary>
/// Parses a remark cell into a <see cref="Remark"/>: discovery, award item,
/// related quests and the cities where the quest can be accepted.
/// </summary>
/// <param name="remarkNode">Node containing the remark markup.</param>
/// <returns>The populated remark; sections whose markup is absent are left untouched.</returns>
public static Remark ParseRemark(HtmlNode remarkNode)
{
    var remark = new Remark();

    // Discovery
    var discoveryNode = remarkNode.SelectSingleNode("a[@title!='']");
    if (discoveryNode != null)
    {
        // Level and type sit two siblings back from each other, before the discovery link.
        var levelNode = discoveryNode.PreviousSibling.PreviousSibling;
        var typeNode = levelNode.PreviousSibling.PreviousSibling;

        var typeName = typeRegex.Match(typeNode.Attributes["src"].Value).Groups["type"].Value;
        remark.DiscoveryType = Enum.Parse(typeof(DisType), typeName).ToString();
        remark.DiscoveryLevel = Int32.Parse(levelNode.InnerText.Substring(0, 1));
        remark.DiscoveryExp = Int32.Parse(discoveryNode.Attributes["title"].Value.Remove(0, 5));
        remark.Discovery = discoveryNode.InnerText;
    }

    // Award item
    var awardNode = remarkNode.SelectSingleNode("span[@style='color:#804000;']");
    if (awardNode != null)
    {
        remark.AwardItem = awardNode.InnerText;
    }

    // Related quests
    var relativeNodes = remarkNode.SelectNodes("descendant::a[@style='color:#C000C0;' or @style='color:DarkBlue;']");
    if (relativeNodes != null)
    {
        foreach (HtmlNode relativeNode in relativeNodes)
        {
            IList<string> foundNameList = null;
            IList<int> questList = null;

            // Links whose text starts with the prerequisite prefix go to the "pre" lists,
            // everything else is a follow-up quest.
            if (relativeNode.InnerText.StartsWith("前:"))
            {
                foundNameList = remark.PreFoundName;
                questList = remark.PreQuestID;
            }
            else
            {
                questList = remark.FollowQuestID;
            }

            var match = questRegex.Match(relativeNode.Attributes["href"].Value);

            if (relativeNode.InnerText.StartsWith("前:港口-"))
            {
                // Prerequisite port discovery: store the name without its prefix.
                foundNameList.Add(relativeNode.InnerText.Replace("前:港口-", ""));
            }
            else
            {
                questList.Add(Int32.Parse(match.Groups["id"].Value));
            }
        }
    }

    // Accepting cities
    var cityNodes = remarkNode.SelectNodes("descendant::a[@class='MisCity']");
    if (cityNodes != null)
    {
        foreach (var cityNode in cityNodes)
        {
            // These labels are not added to the city list.
            bool isExcluded = cityNode.InnerText == "南美开拓港"
                || cityNode.InnerText == "东南亚开拓港"
                || cityNode.InnerText == "掠夺地图"
                || cityNode.InnerText == "沉船资讯";
            if (isExcluded)
            {
                continue;
            }

            remark.FromCityList.Add(cityNode.InnerText);
        }
    }

    return remark;
}
/// <summary>
/// Resolves the type of the variable described by <paramref name="row"/>.
/// </summary>
/// <param name="row">Table row with a "type" cell and a description cell.</param>
/// <returns>
/// "any" when the type cell has no param-type spans; "Cesium.{name}Options" when the
/// only type is "Object" and the description yields properties (an interface is
/// written when a writer is available); otherwise the distinct types joined with "|".
/// </returns>
private string VariableTypeResolver(HtmlNode row)
{
    var typeCell = row.SelectSingleNode(@".//td[@class=""type"" ]");

    // EntityCollection contains(entity) has no type information.
    var typeSpans = typeCell.SelectNodes(@".//span[@class='param-type']");
    if (typeSpans == null)
    {
        return "any";
    }

    var types = typeSpans.Select(Program.TypeReader).ToArray().Distinct();

    bool isSingleObject = !types.Skip(1).Any() && types.First() == "Object";
    if (isSingleObject)
    {
        var descriptionCell = row.SelectSingleNode(@".//td[contains(@class,'description')]");
        var props = GetSignatureTypes(descriptionCell);
        if (props.Keys.Any())
        {
            // The options type is named after the owning member rather than a hash of its keys.
            var type = this._name + "Options";
            var dependencies = new List<string>();
            var writer = Program.GetWriter(type, _source);

            if (writer != null)
            {
                props = props.ToDictionary(k => k.Key, v => Program.extractDependencies(dependencies, v.Value));
                Program.WriteDependencies(type, dependencies, writer, null, null, _source);

                writer.WriteLine($"interface {type}");
                writer.WriteLine("{");
                foreach (var prop in props)
                {
                    writer.WriteLine($"\t{prop.Key}: {prop.Value};");
                }
                writer.WriteLine("}");
                writer.WriteLine($"export = {type}");
            }

            return "Cesium." + type;
        }
    }

    return string.Join("|", types);
}
/// <summary>
/// Extracts the author: the trimmed text before the first '-' in the
/// "div/div[2]/div/div/span" element.
/// </summary>
/// <param name="node">Result node to read from.</param>
/// <returns>The author, or an empty string when the span is absent.</returns>
protected override string ExtractAuthor(HtmlNode node)
{
    var authorSpan = node.SelectSingleNode("div/div[2]/div/div/span");
    if (authorSpan == null)
    {
        return string.Empty;
    }

    var parts = authorSpan.InnerText.Split('-');
    return parts[0].Trim();
}
/// <summary>
/// Returns the description text from the hover-box paragraph, falling back to the
/// "list-item-main1" div (used for clips).
/// </summary>
/// <param name="node">Item node to search.</param>
/// <returns>The description text, or an empty string when neither element exists.</returns>
private string getDescription(HtmlNode node)
{
    var descriptionNode = node.SelectSingleNode(".//div[@class='hoverbox-details']/p")
        ?? node.SelectSingleNode(".//div[@class='list-item-main1']"); // clips

    return descriptionNode == null ? String.Empty : descriptionNode.InnerText;
}
/// <summary>
/// Returns the trimmed author name from the itemprop="author" strong element,
/// falling back to the strong element inside the "rtl-info" div.
/// </summary>
/// <param name="docNode">Node to search.</param>
/// <returns>The author name, or null when neither element exists.</returns>
private string GetAuthorName(HtmlNode docNode)
{
    var author = docNode.SelectSingleNode(".//strong[@itemprop='author']")
        ?? docNode.SelectSingleNode(".//div[@class='rtl-info']/strong");

    if (author == null)
    {
        return null;
    }

    return author.InnerText.Trim();
}
/// <summary>
/// Extracts the snippet text from the "div/div[2]" element.
/// </summary>
/// <param name="node">Result node to read from.</param>
/// <returns>The snippet text, or an empty string when the element is absent.</returns>
protected override string ExtractSnippet(HtmlNode node)
{
    var snippetNode = node.SelectSingleNode("div/div[2]");
    return snippetNode == null ? string.Empty : snippetNode.InnerText;
}
/// <summary>
/// Returns the description text from the expanding-card paragraph (used for seasons),
/// falling back to the "list-item-main1" div (used for clips).
/// </summary>
/// <param name="node">Item node to search.</param>
/// <returns>The description text, or an empty string when neither element exists.</returns>
private string getDescription(HtmlNode node)
{
    var descriptionNode = node.SelectSingleNode(".//p[@class='expanding-card__description']") // season
        ?? node.SelectSingleNode(".//div[@class='list-item-main1']"); // clips

    return descriptionNode == null ? String.Empty : descriptionNode.InnerText;
}
/// <summary>
/// Extracts the title text from the "div/div[1]/h3/a" link.
/// </summary>
/// <param name="node">Result node to read from.</param>
/// <returns>The title, or an empty string when the link is absent.</returns>
protected override string ExtractTitle(HtmlNode node)
{
    var titleLink = node.SelectSingleNode("div/div[1]/h3/a");
    return titleLink == null ? string.Empty : titleLink.InnerText;
}
/// <summary>
/// Returns the article body HTML with scripts and the "yog-ycb" div removed.
/// </summary>
/// <param name="node">Node to search; the XPath queries are document-absolute.</param>
/// <returns>
/// The inner HTML of the "mediaarticlebody" div (or of the itemprop="articleBody" div),
/// or null when neither container exists.
/// </returns>
public override string ReadContent(HtmlNode node)
{
    var content = node.SelectSingleNode("//div[@id='mediaarticlebody']")
        ?? node.SelectSingleNode("//div[@itemprop='articleBody']");

    // Without this guard a page with neither container ended in a NullReferenceException.
    if (content == null)
    {
        return null;
    }

    RemoveScripts(content);
    RemoveTags(GetNodeByXpathAndClass(node, "//div", "yog-ycb"));
    return content.InnerHtml;
}
/// <summary>
/// Fills the user, stamp and sub-item fields from a log entry node.
/// </summary>
/// <param name="node">Log entry node containing the "log-content" div.</param>
public virtual void Parser(HtmlNode node)
{
    // The user is the last link of the log body.
    var userLink = node.SelectSingleNode("div[@class='log-content']/div[@class='log-body']/a[last()]");
    userURL = userLink.Attributes["href"].Value;
    userName = userLink.InnerText;

    var stampLink = node.SelectSingleNode("div[@class='log-content']/div[@class='log-details log-target log-target-stamp']/div[@class='log-target-info']/a");
    stampName = stampLink.InnerText;
    stampURL = stampLink.Attributes["href"].Value;

    // The thumbnail url is carried in "data-src", not "src".
    var stampImage = node.SelectSingleNode("div[@class='log-content']/div[@class='log-details log-target log-target-stamp']/div[@class='log-target-thumbnail']/a/img");
    stampImageURL = stampImage.Attributes["data-src"].Value;

    longago = new NicorepoItemSubLongago(node);
    nicoru = new NicorepoItemSubNicoru(node);
}
/// <summary>
/// Returns the cleaned article body HTML with scripts and the "poll_module" div removed.
/// </summary>
/// <param name="node">Node to search; the XPath queries are document-absolute.</param>
/// <returns>
/// The cleaned inner HTML of the "article-body" div (or of the "slide-description" div),
/// or null when neither container exists.
/// </returns>
public override string ReadContent(HtmlNode node)
{
    var content = node.SelectSingleNode("//div[@class='article-body']")
        ?? node.SelectSingleNode("//div[@class='slide-description']");

    // Without this guard a page with neither container ended in a NullReferenceException.
    if (content == null)
    {
        return null;
    }

    RemoveScripts(content);
    RemoveTags(GetNodeByXpathAndClass(node, "//div", "poll_module"));
    return CleanHtml(content.InnerHtml);
}
/// <summary>
/// Reads the article author from the rel="author" link, falling back to the
/// itemprop="author" meta tag.
/// </summary>
/// <param name="node">Node to search; the XPath queries are document-absolute.</param>
/// <returns>
/// The link text, or the meta tag's "content" attribute, or an empty string when
/// neither is present.
/// </returns>
public override string ReadAuthor(HtmlNode node)
{
    var authorLink = node.SelectSingleNode("//a[@rel='author']");
    if (authorLink != null)
    {
        return authorLink.InnerText;
    }

    // The meta fallback was previously looked up but its value never returned.
    // A meta element has no text; its value is in the "content" attribute.
    var authorMeta = node.SelectSingleNode("//meta[@itemprop='author']");
    if (authorMeta != null)
    {
        return authorMeta.GetAttributeValue("content", string.Empty);
    }

    return string.Empty;
}
/// <summary>
/// Creates a usage entry from a node: text from the first strong text node,
/// word type from the first em text node, plus the extracted synonyms.
/// </summary>
/// <param name="node">Usage node to read.</param>
/// <returns>The populated usage.</returns>
private static IUsage CreateUsageNode(HtmlNode node)
{
    var wordTypeText = node.SelectSingleNode( ".//em/text()" );
    var usageText = node.SelectSingleNode( ".//strong/text()" );

    var text = usageText.InnerText;
    var type = GetWordType( wordTypeText );
    var synonyms = ExtractSynonyms( node );

    return new Usage
    {
        Text = text,
        Type = type,
        Synonyms = synonyms
    };
}
/// <summary>
/// Loads <paramref name="html"/>, collects the table XPaths of the document and
/// records the "id" attribute of the nodes addressed by maxKey and secondKey.
/// </summary>
/// <param name="html">HTML text to parse.</param>
/// <param name="skip">Value stored in the skip field before the tables are collected.</param>
public HtmlAgilityPackTableXpath(string html, int skip)
{
    this.skip = skip;
    htmlTableAttDic = new Dictionary<string, int>();

    var document = new HtmlAgilityPack.HtmlDocument();
    document.LoadHtml(html);
    rootDomNode = document.DocumentNode;

    GetHtmlAllTableXpath(rootDomNode);

    var maxNode = rootDomNode.SelectSingleNode(maxKey);
    maxValue = maxNode.GetAttributeValue("id", "");

    var secondNode = rootDomNode.SelectSingleNode(secondKey);
    secondValue = secondNode.GetAttributeValue("id", "");
}
/// <summary>
/// Parses a departure row: the timetabled time (first child of the time element)
/// and, when present, the platform (third child of the platform element).
/// </summary>
/// <param name="departure">Departure row node.</param>
public Departure(HtmlNode departure)
{
    var timeNode = departure.SelectSingleNode(TimetabledDepartureXPath);
    TimetabledDeparture = new DepartureTime(timeNode.ChildNodes[0].InnerText);

    // Platform information is optional.
    var platformNode = departure.SelectSingleNode(PlatformXPath);
    if (platformNode == null)
    {
        return;
    }

    Platform = platformNode.ChildNodes[2].InnerText;
}
/// <summary>
/// Returns the cleaned article body HTML with scripts and div tags removed.
/// </summary>
/// <param name="node">Node to search; the XPath queries are document-absolute.</param>
/// <returns>
/// The cleaned inner HTML of the "bg-opaque pad-16 article" div (or of the first
/// article element), or null when neither container exists.
/// </returns>
public override string ReadContent(HtmlNode node)
{
    var content = node.SelectSingleNode("//div[@class='bg-opaque pad-16 article']")
        ?? node.SelectSingleNode("//article");

    // Without this guard a page with neither container ended in a NullReferenceException.
    if (content == null)
    {
        return null;
    }

    RemoveScripts(content);
    RemoveTags(content, "//div");
    return CleanHtml(content.InnerHtml);
}
/// <summary>
/// Parses one torrent row: title, friendly name, size and download url.
/// </summary>
/// <param name="node">Table row node of the torrent.</param>
/// <param name="client">Browser client stored for later use.</param>
/// <remarks>
/// Size is multiplied by 1024 when the size cell contains "GB"; other units are stored as-is.
/// </remarks>
public TorrentLeechEntry(HtmlNode node, BrowserClient client)
{
    _client = client;

    Title = node.SelectSingleNode(".//span[@class='title']/a").InnerText;
    Friendly = Title.TorrentName();

    var size = node.SelectSingleNode(".//td[5]").InnerText;

    // Capture the fractional part as well ("1.45 GB" used to be read as 1), and
    // parse with the invariant culture so '.' is always the decimal separator.
    double number = double.Parse(
        size.RegexMatch(@"\d+(?:\.\d+)?").Value,
        System.Globalization.CultureInfo.InvariantCulture);
    if (size.Contains("GB"))
    {
        number *= 1024;
    }
    Size = number;

    Torrent = "http://torrentleech.org" + node.SelectSingleNode(".//td[@class='quickdownload']/a").Attributes["href"].Value;
}
/// <summary>
/// Returns the text of the "delfi-author-name" div, falling back to the
/// "delfi-source-name" div.
/// </summary>
/// <param name="docNode">Node to search; the XPath queries are document-absolute.</param>
/// <returns>The author (or source) name, or null when neither div exists.</returns>
private static string GetAuthorName(HtmlNode docNode)
{
    var node = docNode.SelectSingleNode("//div[@class='delfi-author-name']")
        ?? docNode.SelectSingleNode("//div[@class='delfi-source-name']");

    return node == null ? null : node.InnerText;
}
/// <summary>
/// Returns the cleaned article body HTML with scripts, the story-top container
/// and the "parentWrapper" div removed.
/// </summary>
/// <param name="node">Node to search; the XPath queries are document-absolute.</param>
/// <returns>
/// The cleaned inner HTML of the "fs-article" div (or of the "fsn_v3_main" div),
/// or null when neither container exists.
/// </returns>
public override string ReadContent(HtmlNode node)
{
    var article = node.SelectSingleNode("//div[@class='fs-article']")
        ?? node.SelectSingleNode("//div[@id='fsn_v3_main']");

    if (article != null)
    {
        RemoveScripts(article);
        RemoveTags(article, "//div[@id='story-top-container']");
        RemoveTags(article, "//div[@class='parentWrapper']");
        return CleanHtml(article.InnerHtml);
    }

    return null;
}
/// <summary>
/// Converts one forum message view into a BSON document with id, body, answer flag,
/// creation date, author (name, registration date) and, when present, the title.
/// </summary>
/// <param name="messageViewNode">Container node of a single message.</param>
/// <returns>The message document.</returns>
/// <remarks>
/// Required nodes (id, content, subject, author, user name) are validated through
/// ThrowExceptionIfNull; the body, title and solution marker are optional.
/// </remarks>
private BsonDocument ParseMessageViewNode(HtmlNode messageViewNode)
{
    var mainContentNode = messageViewNode.SelectSingleNode("div//div[contains(@class,'lia-quilt-column-main-right')]");
    var messageAuthorNode = messageViewNode.SelectSingleNode("div//div[contains(@class, 'lia-message-author')]");
    var idNode = messageViewNode.SelectSingleNode("div//div[@data-message-id]");

    ThrowExceptionIfNull(idNode, "can not find the id node");
    ThrowExceptionIfNull(mainContentNode, "can not find the content node");
    // The author node is dereferenced below, so it is validated like the other required nodes.
    ThrowExceptionIfNull(messageAuthorNode, "can not find the author node");

    var subjectNode = mainContentNode.SelectSingleNode("div//div[contains(@class, 'lia-message-subject')]");
    ThrowExceptionIfNull(subjectNode, "can not find the subject node");

    var titleNode = subjectNode.SelectSingleNode("h1");
    var solutionNode = subjectNode.SelectSingleNode("span[contains(@class, 'solution')]");
    var messagePostDateNode = mainContentNode.SelectSingleNode("div//span[contains(@class,'lia-message-posted-on')]");
    var bodyNode = mainContentNode.SelectSingleNode("div//div[contains(@class, 'lia-message-body-content')]");

    var userNameNode = messageAuthorNode.SelectSingleNode("div//a[contains(@class, 'lia-user-name-link')]/span");
    ThrowExceptionIfNull(userNameNode, "can not find the user name node");
    var userRegisterDateNode = messageAuthorNode.SelectSingleNode("div//span[contains(@class, 'DateTime')]");

    var author = new BsonDocument
    {
        { "name", userNameNode.InnerText },
        { "registeredOn", ParseDateTimeNode(userRegisterDateNode) }
    };

    var message = new BsonDocument
    {
        { "id", idNode.Attributes["data-message-id"].Value },
        { "body", bodyNode == null ? string.Empty : bodyNode.InnerHtml },
        { "isAnswer", solutionNode != null },
        { "createdOn", ParseDateTimeNode(messagePostDateNode) },
        { "author", author }
    };

    if (titleNode != null)
    {
        message.Add("title", titleNode.InnerText);
    }

    return message;
}
/// <summary>
/// Parses one torrent row: title, numeric id (first number in the title link's href),
/// friendly name, size and RSS download url.
/// </summary>
/// <param name="node">Table row node of the torrent.</param>
/// <param name="client">Browser client stored for later use.</param>
/// <remarks>
/// Size is multiplied by 1024 when the size cell contains "GB"; other units are stored as-is.
/// </remarks>
public TorrentLeechEntry(HtmlNode node, BrowserClient client)
{
    _client = client;

    var titleNode = node.SelectSingleNode(".//span[@class='title']/a");
    Title = titleNode.InnerText;
    Id = int.Parse(titleNode.Attributes["href"].Value.RegexMatch(@"\d+").Value);
    Friendly = Title.TorrentName();

    var size = node.SelectSingleNode(".//td[5]").InnerText;

    // Capture the fractional part as well ("1.45 GB" used to be read as 1), and
    // parse with the invariant culture so '.' is always the decimal separator.
    double number = double.Parse(
        size.RegexMatch(@"\d+(?:\.\d+)?").Value,
        System.Globalization.CultureInfo.InvariantCulture);
    if (size.Contains("GB"))
    {
        number *= 1024;
    }
    Size = number;

    // Whitespace in the title is replaced by dots to form the file-name segment of the url.
    Torrent = "http://www.torrentleech.org/rss/download/{0}/ed2597d8977cde9da218/{1}".Template(Id, Title.RegexReplace(@"\s", "."));
}
/// <summary>
/// Adds one sub-category per article element under <paramref name="node"/> to
/// <paramref name="parentCat"/>, plus a next-page category when a "More shows" link exists.
/// </summary>
/// <param name="node">Node containing the article elements.</param>
/// <param name="parentCat">Category that receives the sub-categories.</param>
/// <returns>The number of sub-categories of <paramref name="parentCat"/> after the call.</returns>
private int AddSubcats(HtmlNode node, RssLink parentCat)
{
    // SelectNodes returns null (not an empty collection) when nothing matches.
    var subs = node.SelectNodes(".//article");
    if (subs != null)
    {
        foreach (var sub in subs)
        {
            RssLink subcat = new RssLink() { ParentCategory = parentCat };
            subcat.Name = HttpUtility.HtmlDecode(sub.SelectSingleNode(".//a[@title]").Attributes["title"].Value.Trim());
            subcat.Url = FormatDecodeAbsolutifyUrl(parentCat.Url, sub.SelectSingleNode(".//a[@href]").Attributes["href"].Value, null, UrlDecoding.None);
            subcat.Thumb = getThumb(sub.SelectSingleNode(".//picture/img"));
            parentCat.SubCategories.Add(subcat);
        }
    }

    // Paging is represented by a NextPageCategory entry, so the flag is always cleared here.
    var np = node.SelectSingleNode(".//a[@href and text()='More shows']");
    nextPageAvailable = false;
    if (np != null)
    {
        string url = CreateUrl(parentCat.Url, np.Attributes["href"].Value);
        var npCat = new NextPageCategory() { Url = url, ParentCategory = parentCat };
        parentCat.SubCategories.Add(npCat);
    }

    parentCat.SubCategoriesDiscovered = true;
    return parentCat.SubCategories.Count;
}
/// <summary>
/// Parses one calendar event div and appends the resulting <see cref="NewsEvent"/>
/// to <paramref name="list1"/>.
/// </summary>
/// <param name="weekDays">Date of each calendar column.</param>
/// <param name="columnIndex">Zero-based column the div belongs to.</param>
/// <param name="list1">List receiving the parsed event.</param>
/// <param name="div">Event div; its link text is "COUNTRY:Name" or just a name.</param>
/// <exception cref="NewsParserException">The div has no text nodes at all.</exception>
/// <remarks>
/// The event is silently skipped when every text node is blank. A name without a
/// country prefix is assigned the country "ALL".
/// </remarks>
private static void ParseEventDiv(List<DateTimeOffset> weekDays, int columnIndex, List<NewsEvent> list1, HAP.HtmlNode div)
{
    var name = div.SelectNode("a").InnerText.Trim();

    // Split "COUNTRY:Name" at the first colon. A name with no colon used to make
    // Substring(country.Length + 1) throw ArgumentOutOfRangeException.
    string country = "ALL";
    int colonIndex = name.IndexOf(':');
    if (colonIndex >= 0)
    {
        var prefix = name.Substring(0, colonIndex);
        if (!string.IsNullOrWhiteSpace(prefix))
        {
            country = prefix;
            name = name.Substring(colonIndex + 1);
        }
    }

    var dates = (from node in div.ChildNodes
                 where node.NodeType == HAP.HtmlNodeType.Text
                 select node.InnerText.Decode()).ToArray();
    if (!dates.Any())
    {
        throw new NewsParserException("No text nodes found in column " + (columnIndex + 1) + " for " + name);
    }

    var date = dates.FirstOrDefault(s => !string.IsNullOrWhiteSpace(s));
    if (date == null)
    {
        return;
    }

    // A "Star" marker means high importance, "djStar" medium, neither low.
    var level = div.SelectSingleNode(starXPath("Star")) != null
        ? NewsEventLevel.H
        : div.SelectSingleNode(starXPath("djStar")) != null
            ? NewsEventLevel.M
            : NewsEventLevel.L;

    var n = new NewsEvent()
    {
        Country = country,
        Name = name,
        Time = weekDays[columnIndex].Add(ParseEventDate(date).TimeOfDay),
        Type = name.Contains("Speaks") ? NewsEventType.Speech : NewsEventType.Report,
        Level = level
    };
    list1.Add(n);
}
/// <summary>
/// Lets the user pick an HTML file, parses it and prints the text of every td cell
/// of the first table to the console (parse error codes are printed first).
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void buttonParseHtml_Click(object sender, EventArgs e)
{
    string fileName;
    using (OpenFileDialog openFileDialog = new OpenFileDialog())
    {
        openFileDialog.Filter = "HTML File (*.html;)|*.html";
        openFileDialog.Multiselect = false;
        if (openFileDialog.ShowDialog() != DialogResult.OK || String.IsNullOrEmpty(openFileDialog.FileName))
        {
            return;
        }
        fileName = openFileDialog.FileName;
    }

    string strHtml;
    using (StreamReader reader = new StreamReader(fileName, Encoding.UTF8))
    {
        strHtml = reader.ReadToEnd();
    }

    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(strHtml); // load the html
    foreach (var err in doc.ParseErrors)
    {
        Console.WriteLine(err.Code);
    }

    HtmlAgilityPack.HtmlNode rootNode = doc.DocumentNode; // root node of the document
#if true
    string xpath = @"//table";
    HtmlAgilityPack.HtmlNode node = rootNode.SelectSingleNode(xpath); // first table
    if (node == null)
    {
        // Document without a table: nothing to print.
        return;
    }

    // Strip scripts and styles so only the plain table remains.
    foreach (var script in node.Descendants("script").ToArray())
    {
        script.Remove();
    }
    foreach (var style in node.Descendants("style").ToArray())
    {
        style.Remove();
    }

    // SelectNodes returns null when nothing matches (no direct tr children,
    // or a header row that has only th cells).
    var trNodes = node.SelectNodes("tr");
    if (trNodes == null)
    {
        return;
    }

    foreach (var trnod in trNodes) // rows
    {
        var tdNodes = trnod.SelectNodes("td");
        if (tdNodes == null)
        {
            continue;
        }

        for (int i = 0; i < tdNodes.Count; i++) // columns
        {
            Console.WriteLine(tdNodes[i].InnerText);
        }
    }
#endif
}
/// <summary>
/// Selects the first node matching <paramref name="xPath"/> and fails loudly when
/// there is none.
/// </summary>
/// <param name="parent">Node the expression is evaluated against.</param>
/// <param name="xPath">XPath expression.</param>
/// <returns>The matching node; never null.</returns>
/// <exception cref="NewsParserException">No node matches the expression.</exception>
public static HAP.HtmlNode SelectNode(this HAP.HtmlNode parent, string xPath)
{
    var match = parent.SelectSingleNode(xPath);
    if (match != null)
    {
        return match;
    }

    throw new NewsParserException(new { xPath });
}
/// <summary>
/// Checks whether the server returned its "site busy" error page.
/// </summary>
/// <param name="doucmentNode">Document node of the response.</param>
/// <returns>
/// true when the span at /html/body/strong/span exists and its text contains
/// the busy message; otherwise false.
/// </returns>
public static bool TestServiceError(Html.HtmlNode doucmentNode)
{
    var messageNode = doucmentNode.SelectSingleNode("/html[1]/body[1]/strong[1]/span[1]");
    if (messageNode == null)
    {
        return false;
    }

    return messageNode.InnerText.IndexOf("网站忙") != -1;
}
/// <summary>
/// Builds an article from the title, post and date nodes addressed by
/// <paramref name="expressions"/>.
/// </summary>
/// <param name="rootNode">Node the XPath expressions are evaluated against.</param>
/// <param name="expressions">XPath expressions for title, post and date.</param>
/// <returns>
/// The article, or null (after writing a message to the console) when any of the
/// three nodes is missing.
/// </returns>
public DAL.Entities.Article ParseArticle(HtmlAgilityPack.HtmlNode rootNode, ScrapingExpressions expressions)
{
    var titleNode = rootNode.SelectSingleNode(expressions.TitleXPath);
    var postNode = rootNode.SelectSingleNode(expressions.PostXPath);
    var dateNode = rootNode.SelectSingleNode(expressions.DateXPath);

    bool allFound = titleNode != null && postNode != null && dateNode != null;
    if (!allFound)
    {
        Console.WriteLine("Failed to parse document");
        return null;
    }

    var article = new DAL.Entities.Article()
    {
        Contents = postNode.InnerText,
        Title = titleNode.InnerText,
        CreationDate = Program.ParseDateTime(dateNode.InnerText),
    };
    return article;
}
/// <summary>
/// Collects link, title, date, content and image information of one list entry
/// into a BSON document and posts the image url to <paramref name="imageTargetBlock"/>.
/// </summary>
/// <param name="node">List entry node.</param>
/// <param name="checker">Predicate on the entry's link; a true result aborts the entry.</param>
/// <param name="imageTargetBlock">Block receiving the absolute image url.</param>
/// <returns>
/// The document (also written to the console), or null when <paramref name="checker"/>
/// returns true for the link.
/// </returns>
public static BsonDocument AnalizeGeneralListInformation(HtmlAgilityPack.HtmlNode node, Checker checker, BufferBlock<string> imageTargetBlock)
{
    var document = new BsonDocument();

    var titleLink = node.SelectSingleNode(".//div[@class='h16']/a");
    if (titleLink != null)
    {
        var href = titleLink.Attributes["href"].Value;
        if (checker(href))
        {
            return null;
        }

        document.Add("link", href);
        document.Add("title", titleLink.InnerText);
    }

    var dateDiv = node.SelectSingleNode(".//div[@class='date']");
    if (dateDiv != null)
    {
        document.Add("date", dateDiv.InnerText);
    }

    var contentDiv = node.SelectSingleNode(".//div[@class='p']");
    if (contentDiv != null)
    {
        document.Add("content", contentDiv.InnerText);
    }

    var image = node.SelectSingleNode(".//img");
    if (image != null)
    {
        var source = image.Attributes["src"].Value;
        document.Add("img", GetSubUrl(source));
        document.Add("compressImg", WebImageSaver.Instance.GetComressImageName(GetSubUrl(source)));
        imageTargetBlock.Post(GetIhChina.MainPage + source);
    }

    Console.WriteLine(document.ToString());
    return document;
}
/// <summary>
/// Reads the rating from the "content" attribute of the node addressed by ratingXPath.
/// </summary>
/// <param name="node">Node the rating XPath is evaluated against.</param>
/// <returns>
/// null when the rating node is absent; otherwise the parsed value, which is 0
/// when the attribute text is not a valid float.
/// </returns>
protected override float? getRating(HtmlAgilityPack.HtmlNode node)
{
    var ratingNode = node.SelectSingleNode(ratingXPath);
    if (ratingNode == null)
    {
        return null;
    }

    // TryParse leaves 0 in the out variable on failure, which is what gets returned.
    float parsed;
    float.TryParse(ratingNode.Attributes["content"].Value, out parsed);
    return parsed;
}
/// <summary>
/// Builds one <see cref="VideoInfo"/> per article element under <paramref name="node"/>
/// and records whether a "More ..." next-page link exists.
/// </summary>
/// <param name="node">Node containing the article elements.</param>
/// <param name="parentUrl">Base url used to make video and next-page links absolute.</param>
/// <returns>The videos found; empty when there are no article elements.</returns>
private List<VideoInfo> GetVids(HtmlNode node, string parentUrl)
{
    List<VideoInfo> videos = new List<VideoInfo>();

    // SelectNodes returns null (not an empty collection) when nothing matches.
    var vids = node.SelectNodes(".//article");
    if (vids != null)
    {
        foreach (var vid in vids)
        {
            VideoInfo video = new VideoInfo();

            var heading = vid.SelectSingleNode(".//h2[contains(@class,'h3')]");
            if (heading == null)
            {
                // Plain layout: title and url come from the titled / linked anchors.
                video.Title = HttpUtility.HtmlDecode(vid.SelectSingleNode(".//a[@title]").Attributes["title"].Value.Trim());
                video.VideoUrl = FormatDecodeAbsolutifyUrl(parentUrl, vid.SelectSingleNode(".//a[@href]").Attributes["href"].Value, null, UrlDecoding.None);
            }
            else
            {
                // Teaser layout: heading text, teaser link, and a description or subtitle.
                video.Title = heading.InnerText.Trim();
                video.VideoUrl = FormatDecodeAbsolutifyUrl(parentUrl, vid.SelectSingleNode(".//a[@class='teaser__link' and @href]").Attributes["href"].Value, null, UrlDecoding.None);

                var descriptionNode = vid.SelectSingleNode(".//p[contains(@class,'teaser__description')]")
                    ?? vid.SelectSingleNode(".//h3[contains(@class,'teaser__subtitle')]");
                // Neither element present: leave the description unset instead of throwing.
                if (descriptionNode != null)
                {
                    video.Description = descriptionNode.InnerText.Trim();
                }
            }

            var moNode = vid.SelectSingleNode(".//span[@data-month]");
            var daNode = vid.SelectSingleNode(".//span[@data-date]");
            if (moNode != null && daNode != null)
            {
                video.Airdate = moNode.InnerText.Trim() + ' ' + daNode.InnerText.Trim();
            }

            video.Thumb = getThumb(vid.SelectSingleNode(".//picture/img"));
            videos.Add(video);
        }
    }

    var np = node.SelectSingleNode(".//a[@href and contains(text(),'More ')]");
    nextPageAvailable = false;
    if (np != null)
    {
        nextPageAvailable = true;
        nextPageUrl = CreateUrl(parentUrl, np.Attributes["href"].Value);
    }

    return videos;
}
/// <summary>
/// Searches the Amazon Kindle store for a book and returns the first result when
/// its text contains the title.
/// </summary>
/// <param name="title">Book title; anything from the first " (" onwards is ignored.</param>
/// <param name="author">Author name; spaces before initials such as " R." are removed for the query.</param>
/// <returns>
/// A <see cref="BookInfo"/> with the found title, the original author, the ASIN and the
/// product url, or null when the first result is missing or does not contain the title.
/// </returns>
public static BookInfo AmazonSearchBook(string title, string author)
{
    // Compact every " X." initial in a single pass. The former loop restarted from the
    // original author string on each match, so only the last distinct initial was compacted.
    string authorTrim = Regex.Replace(author, @"( [A-Z]\.)", m => m.Value.Trim());

    int parenIndex = title.IndexOf(" (");
    if (parenIndex >= 0)
    {
        title = title.Substring(0, parenIndex);
    }

    string searchUrl = @"http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Ddigital-text&field-keywords="
        + Uri.EscapeDataString(title + " " + authorTrim + " kindle edition");

    HAP.HtmlDocument searchDoc = new HAP.HtmlDocument();
    searchDoc.LoadHtml(HttpDownloader.GetPageHtml(searchUrl));
    HAP.HtmlNode node = searchDoc.DocumentNode.SelectSingleNode("//li[@id='result_0']");

    // At least attempt to verify it might be the same book (title compared ignoring case).
    if (node == null || node.InnerText.IndexOf(title, StringComparison.OrdinalIgnoreCase) < 0)
    {
        return null;
    }

    string foundASIN = node.GetAttributeValue("data-asin", "");
    node = node.SelectSingleNode(".//div/div/div/div[@class='a-fixed-left-grid-col a-col-right']/div/a");
    if (node == null)
    {
        return null;
    }

    BookInfo result = new BookInfo(node.InnerText, author, foundASIN);
    result.amazonUrl = node.GetAttributeValue("href", ""); // Grab the true link for good measure
    return result;
}
/// <summary>
/// Extracts the number of the last review page from the pagination list.
/// </summary>
/// <param name="html">Node to search; the XPath query is document-absolute.</param>
/// <returns>
/// The page number captured from the selected page-button link's href, or 0 when the
/// link is missing or the number cannot be parsed.
/// </returns>
internal static int GetReviewLastPageNumber(HtmlAgilityPack.HtmlNode html)
{
    /*
     * Expected markup:
     *
     * <ul class="a-pagination">
     *   <li class="a-disabled">
     *     <span class="a-declarative" data-action="reviews:page-action" data-reviews:page-action="{"allowLinkDefault":"1"}">
     *       ←<span class="a-letter-space"></span><span class="a-letter-space"></span>Previous
     *     </span>
     *   </li>
     *   <li class="a-selected page-button" data-reftag="cm_cr_pr_btm_link">
     *     <a href="/We-Yevgeny-Zamyatin/product-reviews/0140185852/ref=cm_cr_pr_btm_link_1?ie=UTF8&pageSize=50&sortBy=recent">
     *       1
     *     </a>
     *   </li>
     *   ...
     *   <li class="page-button" data-reftag="cm_cr_pr_btm_link">
     *     <a href="/We-Yevgeny-Zamyatin/product-reviews/0140185852/ref=cm_cr_pr_btm_link_5?ie=UTF8&pageNumber=5&pageSize=50&sortBy=recent">
     *       5
     *     </a>
     *   </li>
     *   <li class="a-last">
     *     <a href="/We-Yevgeny-Zamyatin/product-reviews/0140185852/ref=cm_cr_pr_paging_btm_2?ie=UTF8&pageNumber=2&pageSize=50&sortBy=recent">
     *       Next<span class="a-letter-space"></span><span class="a-letter-space"></span>→
     *     </a>
     *   </li>
     * </ul>
     */
    var lastPageLink = html.SelectSingleNode("//ul/li[@class='page-button' and position() = (last()-1)]/a");
    if (lastPageLink == null)
    {
        return 0;
    }

    int pageNumber = 0;
    Match urlMatch = pageNumberFromUrl.Match(lastPageLink.GetAttributeValue("href", ""));
    if (urlMatch.Groups.Count > 1)
    {
        // TryParse leaves 0 on failure, matching the "not found" result.
        int.TryParse(urlMatch.Groups[1].Value, out pageNumber);
    }

    return pageNumber;
}
/// <summary>
/// Returns the relative uri of a news item.
/// </summary>
/// <param name="item">News item node.</param>
/// <returns>
/// The "href" of the item's direct link child (empty when the attribute is missing),
/// or null after logging to the console when the markup is malformed.
/// </returns>
public string ParseNewsItem(HtmlAgilityPack.HtmlNode item)
{
    // To visualize, news items have this structure:
    //   <p>today, 12:30</p>
    //   <a href="/relative-uri">
    //     <span>title</span>
    //   </a>
    try
    {
        return item.SelectSingleNode("a").GetAttributeValue("href", "");
    }
    catch (NullReferenceException ex)
    {
        // Something is wrong with the markup; log it and report no uri.
        Console.WriteLine("Error parsing item {0}, malformed markup: {1}, {2}", item.XPath, ex.Message, ex.StackTrace);
        return null;
    }
}
/// <summary>
/// Parses the page, stores the page count in totalPages and reports whether it is positive.
/// </summary>
/// <param name="documentText">HTML of the page.</param>
/// <returns>
/// true when the parsed page count is greater than zero; false otherwise. Any
/// failure while parsing resets totalPages to 0 and returns false.
/// </returns>
Boolean PageHasData(string documentText)
{
    try
    {
        var document = new HtmlAgilityPack.HtmlDocument();
        document.LoadHtml(documentText);

        HtmlAgilityPack.HtmlNode mainNode = document.DocumentNode.SelectSingleNode("//*[@id='regmain']");
        HtmlAgilityPack.HtmlNode pagerNode = mainNode.SelectSingleNode("//*[@id='regPage']/div/div/div");

        // The page count is the text of the fifth descendant from the end of the pager.
        var pagerDescendants = pagerNode.Descendants().ToArray();
        var countNode = pagerDescendants[pagerDescendants.Length - 5];
        totalPages = Int32.Parse(countNode.InnerText);

        return totalPages > 0;
    }
    catch (Exception)
    {
        totalPages = 0;
        return false;
    }
}
//static int nLevels = 64;
//static Size testPadding = new Size(32, 32);
//static double hitThreshold = 0;
//static int groupThreshold = 2;
//static double scaleStep = 1.05;
//static bool useMeanShiftGrouping = false;

/// <summary>
/// Loads every .bmp/.jpg sample in a folder together with the shape annotations stored
/// in its companion "&lt;image name&gt;_data.xml" file.
/// </summary>
/// <param name="folder">Folder that contains the images and their annotation files.</param>
/// <returns>One sample per image that has an annotation file.</returns>
/// <remarks>
/// Images without an annotation file are skipped and their paths are written to
/// "missing_xml.txt" in the current working directory (the file is rewritten on every call).
/// Paths keep their original casing so the lookup also works on case-sensitive file systems.
/// </remarks>
public static List<PSM4TxSample> LoadSamples(string folder)
{
    List<PSM4TxSample> samples = new List<PSM4TxSample>();
    List<string> missingXmlSamples = new List<string>();

    foreach (string imgFile in Directory.EnumerateFiles(folder))
    {
        string extension = Path.GetExtension(imgFile);
        bool isImage = string.Equals(extension, ".bmp", System.StringComparison.OrdinalIgnoreCase)
                       || string.Equals(extension, ".jpg", System.StringComparison.OrdinalIgnoreCase);
        if (!isImage)
        {
            continue;
        }

        // Swap only the trailing extension; a ".bmp"/".jpg" elsewhere in the path must stay untouched.
        string xmlFile = imgFile.Substring(0, imgFile.Length - extension.Length) + "_data.xml";
        if (!File.Exists(xmlFile))
        {
            missingXmlSamples.Add(imgFile);
            continue;
        }

        PSM4TxSample sample = new PSM4TxSample(imgFile);

        // XPath reminder:
        //   .// means descendants, which includes children of children (and so forth).
        //   ./  means direct children.
        //   A path starting with / is relative to the document root; start it with ./
        //   to make it relative to the current node.
        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.Load(xmlFile);

        // SelectNodes returns null (not an empty collection) when nothing matches.
        HtmlAgilityPack.HtmlNodeCollection shapeNodes = doc.DocumentNode.SelectNodes("//shape");
        if (shapeNodes != null)
        {
            foreach (HtmlAgilityPack.HtmlNode shapeNode in shapeNodes)
            {
                ApplyShape(sample, shapeNode);
            }
        }

        samples.Add(sample);
    }

    File.WriteAllLines("missing_xml.txt", missingXmlSamples);
    return samples;
}

/// <summary>
/// Copies the extent of one annotated shape into the sample, chosen by the shape's label text.
/// Shapes that lack a label or an extent are ignored.
/// </summary>
private static void ApplyShape(PSM4TxSample sample, HtmlAgilityPack.HtmlNode shapeNode)
{
    HtmlAgilityPack.HtmlNode blockTextNode = shapeNode.SelectSingleNode(".//blocktext");
    HtmlAgilityPack.HtmlNode textNode = blockTextNode == null ? null : blockTextNode.SelectSingleNode("./text");
    HtmlAgilityPack.HtmlNode dataNode = shapeNode.SelectSingleNode(".//data");
    HtmlAgilityPack.HtmlNode extentNode = dataNode == null ? null : dataNode.SelectSingleNode("./extent");
    if (textNode == null || extentNode == null)
    {
        return;
    }

    string x = extentNode.GetAttributeValue("X", "");
    string y = extentNode.GetAttributeValue("Y", "");
    string width = extentNode.GetAttributeValue("Width", "");
    string height = extentNode.GetAttributeValue("Height", "");

    string label = textNode.InnerText;
    if (string.Equals(label, "isolator", System.StringComparison.OrdinalIgnoreCase))
    {
        sample.SetIsolator(x, y, width, height);    // e.g. (787.18896484375, 1370.0)
    }
    else if (string.Equals(label, "arrayblock", System.StringComparison.OrdinalIgnoreCase))
    {
        sample.SetArrayBlock(x, y, width, height);  // e.g. (1052.86828613281, 1201.8359375)
    }
    else if (string.Equals(label, "aperture", System.StringComparison.OrdinalIgnoreCase))
    {
        sample.SetAperture(x, y, width, height);    // e.g. (209.156982421875, 1885.03271484375)
    }
}
/// <summary>
/// Walks the user table of a listing page, downloads the detail page of every user row
/// and hands the parsed fields to <c>StoreUserData</c>.
/// </summary>
/// <param name="documentText">Raw HTML of the listing page.</param>
/// <remarks>
/// Extraction is best effort: a row that cannot be downloaded or parsed is reported through
/// <c>System.Diagnostics.Trace</c> and skipped, and the remaining rows are still processed.
/// </remarks>
void ExtractUsers(string documentText)
{
    if (string.IsNullOrEmpty(documentText))
    {
        return;
    }

    HtmlAgilityPack.HtmlDocument htmlDocument = new HtmlAgilityPack.HtmlDocument();
    htmlDocument.LoadHtml(documentText);

    HtmlAgilityPack.HtmlNode userTable = htmlDocument.DocumentNode.SelectSingleNode("//*[@id='regPage']/div/table");
    if (userTable == null)
    {
        return;
    }

    HtmlAgilityPack.HtmlNodeCollection userRows = userTable.SelectNodes("./tr");
    if (userRows == null)
    {
        return;
    }

    using (WebClient client = new WebClient())
    {
        // XPath positions are 1-based and the first row is skipped, so start at tr[2].
        for (int k = 2; k <= userRows.Count; k++)
        {
            try
            {
                ExtractAndStoreUser(client, userTable, k);
            }
            catch (Exception ex)
            {
                System.Diagnostics.Trace.TraceWarning("ExtractUsers: skipping table row {0}: {1}", k, ex.Message);
            }
        }
    }
}

/// <summary>
/// Downloads the detail page linked from one table row, parses the user fields and stores them.
/// </summary>
private void ExtractAndStoreUser(WebClient client, HtmlAgilityPack.HtmlNode userTable, int rowIndex)
{
    HtmlAgilityPack.HtmlNode moreBtn = RequireNode(userTable, "./tr[" + rowIndex + "]/td[6]/a");
    string onclick = moreBtn.GetAttributeValue("onclick", null);
    if (onclick == null)
    {
        throw new InvalidOperationException("The detail link has no onclick attribute.");
    }

    // The relative address is the text between the first and the second 'b' of the onclick value.
    // NOTE(review): presumably the handler embeds a path starting with "b..." — confirm against the live markup.
    string[] onclickParts = onclick.Split('b');
    if (onclickParts.Length < 2)
    {
        throw new InvalidOperationException("The onclick attribute does not contain a detail address.");
    }

    string detailHtml = client.DownloadString("http://bloodhelpers.com/b" + onclickParts[1]);
    HtmlAgilityPack.HtmlDocument userDocument = new HtmlAgilityPack.HtmlDocument();
    userDocument.LoadHtml(detailHtml);
    HtmlAgilityPack.HtmlNode detailTable = RequireNode(userDocument.DocumentNode, "//*[@id='search']/form/table");

    // Each detail row holds the value in its second cell.
    string name = RequireNode(detailTable, "./tr[1]/td[2]").InnerText;
    string email = ReadImageReference(RequireNode(detailTable, "./tr[2]/td[2]"));
    string bloodGroup = RequireNode(detailTable, "./tr[3]/td[2]").InnerText;
    string gender = RequireNode(detailTable, "./tr[4]/td[2]").InnerText;
    int age = Int32.Parse(RequireNode(detailTable, "./tr[5]/td[2]").InnerText.Replace("Years", ""));
    string city = RequireNode(detailTable, "./tr[6]/td[2]").InnerText;
    string mobileNumber = ReadImageReference(RequireNode(detailTable, "./tr[7]/td[2]"));
    string landLineNum = ReadImageReference(RequireNode(detailTable, "./tr[8]/td[2]"));
    string lastDonationDate = RequireNode(detailTable, "./tr[9]/td[2]").InnerText;

    StoreUserData(name, email, bloodGroup, gender, age, city, mobileNumber, landLineNum, lastDonationDate);
}

/// <summary>
/// Selects a node that must exist; throws a descriptive exception instead of yielding null.
/// </summary>
private static HtmlAgilityPack.HtmlNode RequireNode(HtmlAgilityPack.HtmlNode parent, string xpath)
{
    HtmlAgilityPack.HtmlNode node = parent.SelectSingleNode(xpath);
    if (node == null)
    {
        throw new InvalidOperationException("Expected node not found: " + xpath);
    }
    return node;
}

/// <summary>
/// Extracts the second and third '='-separated pieces of a cell's inner HTML (re-joined with '=')
/// and strips the word "border" from the result.
/// </summary>
/// <remarks>
/// Used for the e-mail and phone cells. The original author noted that the referenced PNG
/// image should be converted to JPG; that conversion is not implemented here.
/// </remarks>
private static string ReadImageReference(HtmlAgilityPack.HtmlNode cell)
{
    string[] parts = cell.InnerHtml.Split('=');
    if (parts.Length < 3)
    {
        throw new InvalidOperationException("The cell does not contain the expected image markup.");
    }
    return (parts[1] + "=" + parts[2]).Replace("border", "");
}
// start_conversion: downloads the page whose address is in url.Text, scrapes title, start
// date, schedule, sub-titles and flags from it and appends formatted episode lines to
// result_context.Text.
//
// Steps, in the order the code performs them:
//  1. Title: text of the first //h1; when it contains "» " only the first space-separated
//     token is kept.
//  2. result_ani_date: InnerHtml of the first cell of the 'tidHeader' table, cut at '月'.
//  3. Season start: rows of table.data are scanned from tr[1] upwards; the scan stops at the
//     first of rows 3, 4 or 5 whose first cell contains '~'. There is no other exit
//     condition. The part before '~' is split on '-': element 0 is the year, element 1 the
//     month, and months 4 / 7 / 10 map to quarter 2 / 3 / 4 (anything else stays 1).
//  4. ch_: text of every td.ch cell that is not a single space.
//  5. Schedule dates: for every schedule row the inner foreach iterates the CHARACTERS of the
//     row's td count rendered as a string; Convert.ToInt32(char) yields the character code,
//     so 53, 52 and 50 stand for the digits '5', '4' and '2'. ch_num is incremented once per
//     character, not once per row.
//  6. Sub-titles: td.subtitle cells starting with "^" or whose first token contains "#"
//     provide the episode number and the sub-title text; numsss records the running cell
//     index of every accepted cell.
//  7. Flags: 'flag nobr' cells, counted from index numsss[0] on, are stripped of marker
//     characters and wrapped as "[...] ".
//  8. Output: depending on all_radio / now_radio (otherwise "future"), a line
//     "(<year>Q<quarter>) <title> - 第NN話 ..." is appended per episode; episodes already
//     present in the text box, or whose ch_ entry contains the concatenated checked
//     'brocas' items, are skipped.
//  9. When the "current" mode produced output, the text box is reduced to the second-to-last
//     line. If the text box ends up empty, a message box (Korean for "There are no
//     extraction results.") is shown.
//
// NOTE(review): ch_, now_date, flag_ and the subtitle_* lists are all indexed with the same
// i — this assumes they are aligned one-to-one; confirm against real pages.
// NOTE(review): the WebClient is never disposed, and every SelectSingleNode/SelectNodes
// result is dereferenced without a null check.
public void start_conversion() { WebClient wc = new WebClient(); wc.Encoding = Encoding.UTF8; string html = wc.DownloadString(url.Text); agi.HtmlDocument doc = new agi.HtmlDocument(); doc.LoadHtml(html); agi.HtmlNode id_main = doc.GetElementbyId("main"); string title = id_main.SelectSingleNode("//h1").InnerText; //title string re_title = ""; String[] result_title; if (title.Contains("» ")) { result_title = title.Split(' '); /*for (int r = 0; r < result_title.Length; r++) * { * if (result_title[r] != "»") * { * if (r != 0) re_title += " "; * //title = result_title[r]; * re_title += result_title[r]; * } else * { * break; * } * }*/ re_title += result_title[0]; } else { re_title = title; } //result_context.Text = title + "\r\n"; int now_num = 1; int data_i = 1; String[] now_split = new String[] { }; var result_date = ""; //extract the actual date var ani_date = doc.DocumentNode.SelectSingleNode("//div[@class='tidHeader']//table/tr/td[1]").InnerHtml; String result_ani_date = ani_date.ToString().Split('月')[0]; //result_context.Text = ani_date.ToString().Split('月')[0] + "\r\n"; for (;;) { result_date = doc.DocumentNode.SelectSingleNode("//table[@class='data']//tr[" + data_i + "]/td[1]").InnerText; //date quarter if (data_i == 3) { if (result_date.Contains('~')) { now_split = doc.DocumentNode.SelectSingleNode("//table[@class='data']//tr[" + data_i + "]/td[1]").InnerText.Split('~'); break; } } if (data_i == 4) { if (result_date.Contains('~')) { now_split = doc.DocumentNode.SelectSingleNode("//table[@class='data']//tr[" + data_i + "]/td[1]").InnerText.Split('~'); break; } } if (data_i == 5) { if (result_date.Contains('~')) { now_split = doc.DocumentNode.SelectSingleNode("//table[@class='data']//tr[" + data_i + "]/td[1]").InnerText.Split('~'); break; } } data_i++; } String[] now_split2 = now_split[0].Split('-'); if (Convert.ToInt32(now_split2[1]) == 4) { now_num = 2; } if (Convert.ToInt32(now_split2[1]) == 7) { now_num = 3; } if (Convert.ToInt32(now_split2[1]) == 10) { now_num = 4; } String nowyear = 
now_split2[0]; ArrayList ch_ = new ArrayList(); foreach (var cell in doc.DocumentNode.SelectNodes("//td[@class='ch']")) { if (cell.InnerText != " ") { ch_.Add(cell.InnerText); } } ArrayList now_date = new ArrayList(); int ch_num = 1; var team_date = ""; foreach (var cell in doc.DocumentNode.SelectNodes("//table[@class='schedule']/tbody/tr")) { foreach (var cell2 in cell.SelectNodes(".//td").Count.ToString()) { //result_context.Text += Convert.ToInt32(cell2); if (Convert.ToInt32(cell2) == 53) { if (cell.SelectSingleNode(".//td[3]").InnerText != " ") { //result_context.Text += doc.DocumentNode.SelectSingleNode("//table[@class='schedule']/tbody/tr[" + ch_num + "]/td[1]").InnerText + "\r\n"; team_date = doc.DocumentNode.SelectSingleNode("//table[@class='schedule']/tbody/tr[" + ch_num + "]/td[1]").InnerText.Split(' ')[0]; now_date.Add(doc.DocumentNode.SelectSingleNode("//table[@class='schedule']/tbody/tr[" + ch_num + "]/td[1]").InnerText.Split(' ')[0]); } } else if (Convert.ToInt32(cell2) == 52) { if (cell.SelectSingleNode(".//td[2]").InnerText != " ") { //result_context.Text += team_date + "\r\n"; now_date.Add(team_date); } } else if (Convert.ToInt32(cell2) == 50) { team_date = doc.DocumentNode.SelectSingleNode("//table[@class='schedule']/tbody/tr[" + ch_num + "]/td[1]").InnerText.Split(' ')[0] + "\r\n"; } ch_num++; } } ArrayList subtitle_all = new ArrayList(); ArrayList subtitle_num = new ArrayList(); ArrayList subtitle_content = new ArrayList(); ArrayList numsss = new ArrayList(); int numssd = 0; foreach (var cell in doc.DocumentNode.SelectNodes("//td[@class='subtitle']")) { if (cell.InnerText != " ") { //MessageBox.Show(cell.InnerText.Substring(0, 1)); if (cell.InnerText.Substring(0, 1) == "^") { subtitle_all.Add(cell.InnerText); subtitle_num.Add(cell.InnerText.Split('#')[1].Split(' ')[0]); if (1 < cell.InnerText.Split(' ').Length) { string cells_text = ""; for (int c = 1; c < cell.InnerText.Split(' ').Length; c++) { if (c != 1) { cells_text += " "; } cells_text += 
cell.InnerText.Split(' ')[c].Replace("!", "!").Replace("?", "?"); } subtitle_content.Add(cells_text); } else { subtitle_content.Add(cell.InnerText.Split(' ')[1]); } numsss.Add(numssd); numssd++; } else if (cell.InnerText.Split(' ')[0].Contains("#")) { //result_context.Text += cell.InnerText + "\r\n"; subtitle_all.Add(cell.InnerText); subtitle_num.Add(cell.InnerText.Split(' ')[0].Replace("#", "")); if (1 < cell.InnerText.Split(' ').Length) { string cells_text = ""; for (int c = 1; c < cell.InnerText.Split(' ').Length; c++) { if (c != 1) { cells_text += " "; } cells_text += cell.InnerText.Split(' ')[c].Replace("!", "!").Replace("?", "?"); } subtitle_content.Add(cells_text); } else { subtitle_content.Add(cell.InnerText.Split(' ')[1]); } numsss.Add(numssd); numssd++; } else { numssd++; continue; } } } int notget = 0; ArrayList flag_ = new ArrayList(); foreach (var cell in doc.DocumentNode.SelectNodes("//td[@class='flag nobr']")) { if ((int)numsss[0] <= (int)notget) { //result_context.Text += cell.InnerText.Replace("!", "").Replace("●", "").Replace("再", "").Replace("Update", "").Replace(" ", "").Trim() + "\r\n"; flag_.Add("[" + cell.InnerText.Replace("!", "").Replace("●", "").Replace("注", "").Replace("再", "").Replace("Update", "").Replace(" ", "").Trim() + "] "); } notget++; } ArrayList date_sort = new ArrayList(); ArrayList date_text_sort = new ArrayList(); int now_info = 0; for (int i = 0; i < subtitle_all.Count; i++) { Application.DoEvents(); //MessageBox.Show(subtitle_num[i].ToString()); int nums = Convert.ToInt32(subtitle_num[i]); String resert_num = ""; if (nums < 10) { resert_num = "0" + Convert.ToString(subtitle_num[i]); } else { resert_num = Convert.ToString(subtitle_num[i]); } string clb_string = ""; for (int x = 0; x <= brocas.CheckedItems.Count - 1; x++) { clb_string = clb_string + brocas.CheckedItems[x].ToString(); } String flag = ""; if (Convert.ToString(flag_[i]) == "[] ") { flag = ""; } else { flag = Convert.ToString(flag_[i]); } if 
(result_context.Text.Contains("第" + resert_num + "話 ") || ch_[i].ToString().Contains(clb_string)) { continue; } else { if (all_radio.Checked) { //all string sub_content = ""; if (subtitle_content[i].ToString() != "") { sub_content = "〔 " + subtitle_content[i].ToString() + " 〕"; } result_context.Text += "(" + nowyear + "Q" + now_num + ") " + re_title + " - 第" + resert_num + "話 " + flag + sub_content; result_context.Text += clb_string + " 1280x720 x264 \r\n"; } else if (now_radio.Checked) { //current DateTime time_1 = new DateTime(Convert.ToInt32(result_ani_date.Split('年')[0]), Convert.ToInt32(result_ani_date.Split('年')[1]), Convert.ToInt32(now_date[i]), 0, 0, 0); //result_context.Text += now_date_text.Text.Split('-')[0]+ now_date_text.Text.Split('-')[1]+ now_date_text.Text.Split('-')[2]; int year = Convert.ToInt32(DateTime.Now.ToString("yyyy")); int month = Convert.ToInt32(DateTime.Now.ToString("MM")); int day = Convert.ToInt32(DateTime.Now.ToString("dd")); DateTime time_2 = new DateTime(year, month, day, 0, 0, 0); if (DateTime.Compare(time_1, time_2) <= 0) { now_info = 1; string sub_content = ""; if (subtitle_content[i].ToString() != "") { sub_content = "〔 " + subtitle_content[i].ToString() + " 〕"; } //date_sort.Add(Convert.ToInt32(DateTime.Now.ToString("dd")) - Convert.ToInt32(now_date[i])+ "(" + nowyear + "Q" + now_num + ") " + re_title + " - 第" + resert_num + "話 " + flag + sub_content + clb_string + " 1280x720 x264"); result_context.Text += "(" + nowyear + "Q" + now_num + ") " + re_title + " - 第" + resert_num + "話 " + flag + " " + sub_content; result_context.Text += clb_string + " 1280x720 x264 \r\n"; } } else //future { DateTime time_1 = new DateTime(Convert.ToInt32(result_ani_date.Split('年')[0]), Convert.ToInt32(result_ani_date.Split('年')[1]), Convert.ToInt32(now_date[i]), 0, 0, 0); //result_context.Text += now_date_text.Text.Split('-')[0]+ now_date_text.Text.Split('-')[1]+ now_date_text.Text.Split('-')[2]; int year = Convert.ToInt32(DateTime.Now.ToString("yyyy")); int 
month = Convert.ToInt32(DateTime.Now.ToString("MM")); int day = Convert.ToInt32(DateTime.Now.ToString("dd")); DateTime time_2 = new DateTime(year, month, day, 0, 0, 0); if (DateTime.Compare(time_1, time_2) > 0) { string sub_content = ""; if (subtitle_content[i].ToString() != "") { sub_content = "〔 " + subtitle_content[i].ToString() + " 〕"; } result_context.Text += "(" + nowyear + "Q" + now_num + ") " + re_title + " - 第" + resert_num + "話 " + flag + sub_content; result_context.Text += clb_string + " 1280x720 x264 \r\n"; } } } } if (now_info == 1) { string[] tempArray = result_context.Lines; result_context.Text = tempArray[result_context.Lines.Length - 2]; now_info = 0; } if (result_context.Text == "") { MessageBox.Show("추출결과가 없습니다."); } //result_context.Text += subtitle_.Count; }
// start_conversion_time: clears result_context, downloads the page whose address is in
// url.Text and appends one formatted line per episode whose broadcast time is not in the
// future.
//
// Steps, in the order the code performs them:
//  1. Title: text of the first //h1. When it contains "» ", the title becomes the text
//     before '年' cut at its last space (if '年' is present) and is then replaced by the
//     trimmed text before '&' (if '&' is present); otherwise the raw text is used.
//  2. result_ani_date is read from the 'section basic' table but is not used again in this
//     method; the node lookup still has to succeed.
//  3. Season start: rows of table.data are scanned from tr[1] upwards; the scan stops at the
//     first of rows 3, 4 or 5 whose first cell contains '~' (no other exit condition).
//     Months 4 / 7 / 10 map to quarter 2 / 3 / 4, anything else stays 1.
//  4. Columns of table#ProgList: td[6] -> flag_ (marker characters removed, wrapped as
//     " [...]"), td[5] -> ch_ (text without the peNotice / peComment div text),
//     td[2] -> day_date (text before '(') and time_date (text after ')', spaces removed),
//     td[4] -> episode number.
//  5. Per td[4] cell: the minute part is cut at '↑' or '↓'; hour values of 24 or more are
//     reduced by 24; entries later than the current time, entries with an empty number and
//     entries already present in the text box are skipped.
//  6. Lines are collected in a Dictionary keyed by the episode number; Add throws for a
//     duplicate key and the empty catch swallows it, so the first line per number wins.
//  7. The collected lines are appended to result_context in dictionary enumeration order.
//
// NOTE(review): ch_ only receives cells whose text is not a single space, while flag_,
// day_date and time_date receive every row — indexing all of them with cell_num assumes no
// row is ever skipped; confirm.
// NOTE(review): hours >= 24 are wrapped without moving the date to the next day — confirm
// this is intended.
// NOTE(review): the WebClient is never disposed, and every SelectSingleNode/SelectNodes
// result is dereferenced without a null check.
public void start_conversion_time() { result_context.Text = ""; WebClient wc = new WebClient(); wc.Encoding = Encoding.UTF8; string html = wc.DownloadString(url.Text); agi.HtmlDocument doc = new agi.HtmlDocument(); doc.LoadHtml(html); agi.HtmlNode id_main = doc.GetElementbyId("main"); //extract the title string title = id_main.SelectSingleNode("//h1").InnerText; //title string re_title = ""; String result_title; //MessageBox.Show(title.Split('年')[0].ToString()); if (title.Contains("» ")) { result_title = ""; if (title.Contains("年")) { result_title = title.Split('年')[0].Substring(0, title.Split('年')[0].LastIndexOf(' ')).ToString(); } if (title.Contains("&")) { result_title = title.Split('&')[0].ToString().Trim(); } /*for (int r = 0; r < result_title.Length; r++) * { * if (result_title[r] != "»") * { * if (r != 0) re_title += " "; * //title = result_title[r]; * re_title += result_title[0]; * } * else * { * break; * } * }*/ re_title = result_title; } else { re_title = title; } //re_title========================================== //extract the date quarter int now_num = 1; int data_i = 1; String[] now_split = new String[] { }; var result_date = ""; //extract the actual date var ani_date = doc.DocumentNode.SelectSingleNode("//table[@class='section basic']//tr/td[1]").InnerHtml; String result_ani_date = ani_date.ToString().Split('月')[0]; //result_context.Text = ani_date.ToString().Split('月')[0] + "\r\n"; for (;;) { result_date = doc.DocumentNode.SelectSingleNode("//table[@class='data']//tr[" + data_i + "]/td[1]").InnerText; //date quarter if (data_i == 3) { if (result_date.Contains('~')) { now_split = doc.DocumentNode.SelectSingleNode("//table[@class='data']//tr[" + data_i + "]/td[1]").InnerText.Split('~'); break; } } if (data_i == 4) { if (result_date.Contains('~')) { now_split = doc.DocumentNode.SelectSingleNode("//table[@class='data']//tr[" + data_i + "]/td[1]").InnerText.Split('~'); break; } } if (data_i == 5) { if (result_date.Contains('~')) { now_split = 
doc.DocumentNode.SelectSingleNode("//table[@class='data']//tr[" + data_i + "]/td[1]").InnerText.Split('~'); break; } } data_i++; } String[] now_split2 = now_split[0].Split('-'); if (Convert.ToInt32(now_split2[1]) == 4) { now_num = 2; } if (Convert.ToInt32(now_split2[1]) == 7) { now_num = 3; } if (Convert.ToInt32(now_split2[1]) == 10) { now_num = 4; } String nowyear = now_split2[0]; //nowyear+now_num============================== //extract flag_ ArrayList flag_ = new ArrayList(); foreach (var cell in doc.DocumentNode.SelectNodes("//table[@id='ProgList']/tr/td[6]")) { //MessageBox.Show(cell.InnerText); //result_context.Text += cell.InnerText.Replace("!", "").Replace("●", "").Replace("再", "").Replace("Update", "").Replace(" ", "").Trim() + "\r\n"; flag_.Add(" [" + cell.InnerText.Replace("!", "").Replace("●", "").Replace("注", "").Replace("再", "").Replace("Update", "").Replace(" ", "").Trim() + "]"); } //flag_============================ //extract ch_ ============================== ArrayList ch_ = new ArrayList(); foreach (var cell in doc.DocumentNode.SelectNodes("//table[@id='ProgList']/tr/td[5]")) { if (cell.InnerText != " ") { if (cell.SelectSingleNode(".//div[@class='peNotice']") != null) { if (cell.SelectSingleNode(".//div[@class='peNotice']").InnerText != "") { ch_.Add(cell.InnerText.Replace(cell.SelectSingleNode(".//div[@class='peNotice']").InnerText, "").Replace("!", "!").Replace("?", "?")); } else { ch_.Add(cell.InnerText.Replace("!", "!").Replace("?", "?")); } } else if (cell.SelectSingleNode(".//div[@class='peComment']") != null) { if (cell.SelectSingleNode(".//div[@class='peComment']").InnerText != "") { ch_.Add(cell.InnerText.Replace(cell.SelectSingleNode(".//div[@class='peComment']").InnerText, "").Replace("!", "!").Replace("?", "?")); } else { ch_.Add(cell.InnerText.Replace("!", "!").Replace("?", "?")); } } else { ch_.Add(cell.InnerText.Replace("!", "!").Replace("?", "?")); } } } //ch_ ======================================== //date============== ArrayList day_date = 
new ArrayList(); ArrayList time_date = new ArrayList(); foreach (var cell in doc.DocumentNode.SelectNodes("//table[@id='ProgList']/tr/td[2]")) { day_date.Add(cell.InnerText.Split('(')[0]); time_date.Add(cell.InnerText.Split(')')[1].Replace(" ", "").Replace(" ", "")); } //t_date================= string clb_string = ""; for (int x = 0; x <= brocas.CheckedItems.Count - 1; x++) { clb_string = clb_string + brocas.CheckedItems[x].ToString(); } int cell_num = 0; //ArrayList Rs_Array_LIst = new ArrayList(); Dictionary <int, string> Rs_Array_LIst = new Dictionary <int, string>(); foreach (var cell in doc.DocumentNode.SelectNodes("//table[@id='ProgList']/tr/td[4]")) { System.Windows.Forms.Application.DoEvents(); string numbers = cell.InnerText; //MessageBox.Show(time_date[cell_num].ToString()); string minute_date = ""; if (time_date[cell_num].ToString().Split(':')[1].Contains('↑')) { minute_date = time_date[cell_num].ToString().Split(':')[1].Split('↑')[0]; } else if (time_date[cell_num].ToString().Split(':')[1].Contains('↓')) { minute_date = time_date[cell_num].ToString().Split(':')[1].Split('↓')[0]; } else { minute_date = time_date[cell_num].ToString().Split(':')[1]; } int hours = 0; if (Convert.ToInt32(time_date[cell_num].ToString().Split(':')[0]) < 24) { hours = Convert.ToInt32(time_date[cell_num].ToString().Split(':')[0]); } else { hours = Convert.ToInt32(time_date[cell_num].ToString().Split(':')[0]) - 24; } string flag = ""; if (flag_[cell_num].ToString() == " []") { flag = " "; } else { flag = flag_[cell_num].ToString() + " "; } //MessageBox.Show(Convert.ToInt32(day_date[cell_num].ToString().Split('-')[0])+" "+ Convert.ToInt32(day_date[cell_num].ToString().Split('-')[1])+" "+ Convert.ToInt32(day_date[cell_num].ToString().Split('-')[2])+" "+ Convert.ToInt32(time_date[cell_num].ToString().Split(':')[0])+" "+ Convert.ToInt32(minute_date)); DateTime time_1 = new DateTime(Convert.ToInt32(day_date[cell_num].ToString().Split('-')[0]), 
Convert.ToInt32(day_date[cell_num].ToString().Split('-')[1]), Convert.ToInt32(day_date[cell_num].ToString().Split('-')[2]), hours, Convert.ToInt32(minute_date), 0); int year = Convert.ToInt32(DateTime.Now.ToString("yyyy")); int month = Convert.ToInt32(DateTime.Now.ToString("MM")); int day = Convert.ToInt32(DateTime.Now.ToString("dd")); int hour = Convert.ToInt32(DateTime.Now.ToString("HH")); int minute = Convert.ToInt32(DateTime.Now.ToString("mm")); DateTime time_2 = new DateTime(year, month, day, hour, minute, 0); int result = DateTime.Compare(time_1, time_2); if (result > 0) { cell_num++; continue; } if (numbers == "") { cell_num++; continue; } if (Convert.ToInt32(numbers) < 10) { numbers = "0" + cell.InnerText; } if (result_context.Text.Contains(" - 第" + numbers + "話 " + ch_[cell_num])) { cell_num++; continue; } string ch_str = "〔 " + ch_[cell_num] + " 〕 "; if (ch_str == "〔 〕 ") { ch_str = ""; } try { Rs_Array_LIst.Add(Convert.ToInt32(numbers), "(" + nowyear + "Q" + now_num + ") " + re_title + " - 第" + numbers + "話" + flag + "" + ch_str + clb_string + " 1280x720 x264" + "\r\n"); } catch { } //result_context.AppendText("(" + nowyear + "Q" + now_num + ") " + re_title + " - 第" + numbers + "話 "+ ch_str + clb_string+" 1280x720 x264"+ "\r\n"); cell_num++; } //Rs_Array_LIst.Keys.Sort(); foreach (String re_line in Rs_Array_LIst.Values) { Application.DoEvents(); result_context.AppendText(re_line); } }
/// <summary>
/// Downloads a product page and scrapes cover image, title, price and description from it.
/// </summary>
/// <param name="url">Address of the product page; it is also stored in <c>AmazonUrl</c>.</param>
/// <returns>
/// A <c>BookModels</c> instance. Values whose markup is missing or cannot be parsed are
/// simply not assigned; the method no longer throws for incomplete pages.
/// </returns>
public static BookModels GetBookDetails(string url)
{
    var book = new BookModels();
    book.AmazonUrl = url;

    var webGet = new HtmlWeb();
    var htmlDoc = webGet.Load(url);
    // NOTE(review): this option is assigned after the document has already been loaded, so it
    // presumably does not influence the parse above — confirm and move it if it is needed.
    htmlDoc.OptionFixNestedTags = true;

    // htmlDoc.ParseErrors holds any errors from the load; they are currently not acted upon.

    if (htmlDoc.DocumentNode == null)
    {
        return book;
    }

    HtmlAgilityPack.HtmlNode bodyNode = htmlDoc.DocumentNode.SelectSingleNode("//body");
    if (bodyNode == null)
    {
        return book;
    }

    // Cover: two known image ids, tried in order.
    var image = bodyNode.SelectSingleNode("//img[@id='main-image']")
                ?? bodyNode.SelectSingleNode("//img[@id='imgBlkFront']");
    if (image != null)
    {
        string src = image.GetAttributeValue("src", null);
        if (src != null)
        {
            // Everything from the first "._" onwards is replaced by ".jpg";
            // a source without that marker is kept unchanged.
            int markerIndex = src.IndexOf("._", StringComparison.Ordinal);
            book.CoverUrl = markerIndex >= 0 ? src.Substring(0, markerIndex) + ".jpg" : src;
        }
    }

    // Title: span#btAsinTitle first, h1#title as fallback.
    book.Title = bodyNode.Descendants("span")
                         .Where(x => x.Id == "btAsinTitle")
                         .Select(s => s.InnerText)
                         .FirstOrDefault();
    if (string.IsNullOrEmpty(book.Title))
    {
        book.Title = bodyNode.Descendants("h1")
                             .Where(x => x.Id == "title")
                             .Select(s => s.InnerText)
                             .FirstOrDefault();
    }

    var price = bodyNode.SelectSingleNode("//b[@class='priceLarge']");
    if (price != null)
    {
        string priceText = price.InnerText.Trim().Replace("$", string.Empty).Replace("\n", string.Empty);
        decimal parsedPrice;
        // The page prints prices with '.' as decimal separator, so parse culture-independently.
        if (decimal.TryParse(priceText, System.Globalization.NumberStyles.Number,
                             System.Globalization.CultureInfo.InvariantCulture, out parsedPrice))
        {
            book.Price = parsedPrice;
        }
    }

    var descriptionNode = bodyNode.SelectSingleNode("//div[@id='postBodyPS']");
    if (descriptionNode != null)
    {
        book.Description = descriptionNode.InnerText;
    }

    return book;
}
/// <summary>
/// Search Shelfari for series info, scrape series page, and return next title in series.
/// </summary>
/// <param name="searchHtmlDoc">
/// Book's Shelfari page, pre-downloaded. When series info is found, the series page is
/// loaded into this same document instance, replacing the book page.
/// </param>
/// <returns>Title of the next book in the series, or an empty string when none can be determined.</returns>
private string GetNextInSeriesTitle2(HtmlAgilityPack.HtmlDocument searchHtmlDoc)
{
    // Check if the book's Shelfari page contains series info.
    HtmlAgilityPack.HtmlNode seriesNode = searchHtmlDoc.DocumentNode.SelectSingleNode("//span[@class='series']");
    if (seriesNode == null)
    {
        return "";
    }

    // Series name and book number; the number is the last space-separated token.
    string series = seriesNode.InnerText.Trim();
    int currentSeriesIndex;
    Int32.TryParse(series.Substring(series.LastIndexOf(" ") + 1), out currentSeriesIndex);

    // Series page address.
    HtmlAgilityPack.HtmlNode seriesLink = seriesNode.SelectSingleNode("//span[@class='series']/a[@href]");
    if (seriesLink == null)
    {
        return "";
    }
    string seriesURL = seriesLink.GetAttributeValue("href", "");
    string seriesShort = seriesNode.FirstChild != null ? seriesNode.FirstChild.InnerText.Trim() : "";

    // The address is passed as is: it is data, not a format string.
    searchHtmlDoc.LoadHtml(HttpDownloader.GetPageHtml(seriesURL));

    // Number of books in the series: first number in the heading text.
    HtmlAgilityPack.HtmlNode countNode = searchHtmlDoc.DocumentNode.SelectSingleNode("//h2[@class='f_m']");
    if (countNode == null || countNode.FirstChild == null)
    {
        return "";
    }
    int currentSeriesCount = 0;
    Match countMatch = Regex.Match(countNode.FirstChild.InnerText.Trim(), @"\d+");
    if (countMatch.Success)
    {
        Int32.TryParse(countMatch.Value, out currentSeriesCount);
    }

    // Nothing to look up when this is the last book.
    if (currentSeriesIndex >= currentSeriesCount)
    {
        return "";
    }

    main.Log(String.Format("This is book {0} of {1} in the {2} Series...",
        currentSeriesIndex, currentSeriesCount, seriesShort));

    // SelectNodes returns null (not an empty collection) when nothing matches.
    HtmlAgilityPack.HtmlNodeCollection seriesItems = searchHtmlDoc.DocumentNode.SelectNodes(".//ol/li");
    if (seriesItems == null)
    {
        return "";
    }

    // NOTE(review): this is a substring test, so looking for "1" would also accept a label
    // such as "10" — confirm the label format before tightening it.
    string nextIndexText = (currentSeriesIndex + 1).ToString();
    foreach (HtmlAgilityPack.HtmlNode seriesItem in seriesItems)
    {
        HtmlAgilityPack.HtmlNode label = seriesItem.SelectSingleNode(".//div/span[@class='series bold']");
        if (label == null || !label.InnerText.Contains(nextIndexText))
        {
            continue;
        }

        HtmlAgilityPack.HtmlNode titleLink = seriesItem.SelectSingleNode(".//h3/a");
        if (titleLink == null)
        {
            continue;
        }

        string nextTitle = titleLink.InnerText.Trim();
        main.Log(String.Format("The next book in this series is {0}!", nextTitle));
        return nextTitle;
    }

    return "";
}
/// <summary>
/// Search Shelfari page for possible series info, returning the next title in the series
/// without downloading any other pages. Also records reading-time and popularity
/// statistics on <c>curBook</c> along the way.
/// </summary>
/// <param name="searchHtmlDoc">Book's Shelfari page, pre-downloaded</param>
/// <returns>
/// Title of the following book, or an empty string when the page has no first-edition
/// module, no standard series, or no "followed by" information.
/// </returns>
private string GetNextInSeriesTitle(HtmlAgilityPack.HtmlDocument searchHtmlDoc)
{
    // Without the first-edition module nothing else is processed.
    HtmlAgilityPack.HtmlNode pageNode = searchHtmlDoc.DocumentNode.SelectSingleNode("//div[@id='WikiModule_FirstEdition']");
    if (pageNode == null)
    {
        return "";
    }
    HtmlAgilityPack.HtmlNode editionNode = pageNode.SelectSingleNode(".//div/div");
    if (editionNode == null)
    {
        return "";
    }

    LogReadingTime(editionNode.InnerText);
    LogPopularHighlights(searchHtmlDoc);

    // Check if book series is available and displayed in Series & Lists on Shelfari page.
    HtmlAgilityPack.HtmlNode seriesNode = searchHtmlDoc.DocumentNode.SelectSingleNode("//div[@id='WikiModule_Series']/div");
    if (seriesNode == null)
    {
        return "";
    }

    // SelectNodes returns null (not an empty collection) when nothing matches.
    HtmlAgilityPack.HtmlNodeCollection seriesTypes = seriesNode.SelectNodes(".//div");
    if (seriesTypes == null)
    {
        return "";
    }

    // If multiple series are found, use the first standard series that states a position.
    foreach (HtmlAgilityPack.HtmlNode seriesType in seriesTypes)
    {
        string seriesText = seriesType.InnerText;
        if (!seriesText.Contains("(standard series)", StringComparison.OrdinalIgnoreCase)
            || seriesText.Contains("(Reading Order)", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        Match position = Regex.Match(seriesText, @"This is book (\d+) of (\d+)");
        if (!position.Success)
        {
            continue;
        }

        // Stop after the first standard series.
        // e.g. http://www.shelfari.com/books/37598923 lists more than one.
        return ReadStandardSeries(seriesNode, seriesType, position);
    }

    return "";
}

/// <summary>
/// Parses "Page Count: N", logs the estimated reading time and stores both on <c>curBook</c>.
/// </summary>
private void LogReadingTime(string editionText)
{
    // Accept thousands separators ("1,234"); a plain \d+ alternative listed first would
    // capture only the digits in front of the comma.
    Match pageCount = Regex.Match(editionText, @"Page Count: (\d+(?:,\d+)*)");
    if (!pageCount.Success)
    {
        return;
    }

    string pages = pageCount.Groups[1].Value;
    // 1.2890625 minutes per page is the average reading time used by this estimate.
    double minutes = int.Parse(pages, NumberStyles.AllowThousands, CultureInfo.InvariantCulture) * 1.2890625;
    TimeSpan span = TimeSpan.FromMinutes(minutes);
    // TotalHours, so estimates of a day or more are not truncated to their hour-of-day part.
    int hours = (int)span.TotalHours;

    main.Log(String.Format("Typical time to read: {0} hours and {1} minutes ({2} pages)",
        hours, span.Minutes, pages));
    curBook.pagesInBook = pages;
    curBook.readingHours = hours.ToString();
    curBook.readingMinutes = span.Minutes.ToString();
}

/// <summary>
/// Reads the "Reviews (n)" / "Readers (n)" counters from the tab list and stores them as the
/// book's popular passages / highlights (dummy info for now, per the original author).
/// </summary>
private void LogPopularHighlights(HtmlAgilityPack.HtmlDocument searchHtmlDoc)
{
    HtmlAgilityPack.HtmlNode members = searchHtmlDoc.DocumentNode.SelectSingleNode("//ul[@class='tabs_n tn1']");
    int highlights = 0;
    if (members != null)
    {
        Match reviews = Regex.Match(members.InnerText, @"Reviews \(((\d+)|(\d+,\d+))\)");
        if (reviews.Success)
        {
            curBook.popularPassages = reviews.Groups[1].Value;
        }

        Match readers = Regex.Match(members.InnerText, @"Readers \(((\d+)|(\d+,\d+))\)");
        if (readers.Success)
        {
            curBook.popularHighlights = readers.Groups[1].Value;
            highlights = int.Parse(readers.Groups[1].Value, NumberStyles.AllowThousands, CultureInfo.InvariantCulture);
        }

        string textPassages = curBook.popularPassages == "1"
            ? String.Format("{0} passage has ", curBook.popularPassages)
            : String.Format("{0} passages have ", curBook.popularPassages);
        string textHighlights = curBook.popularHighlights == "1"
            ? String.Format("{0} time", curBook.popularHighlights)
            : String.Format("{0} times", curBook.popularHighlights);
        main.Log(String.Format("Popular Highlights: {0}been highlighted {1}", textPassages, textHighlights));
    }

    // If no "highlighted passages" were found, add that to the log and clear both values.
    if (highlights == 0)
    {
        main.Log("Popular Highlights: No highlighted passages have been found for this book");
        curBook.popularPassages = "";
        curBook.popularHighlights = "";
    }
}

/// <summary>
/// Records name and position of a standard series plus the preceding book, and returns the
/// title of the following book (empty when this is the last book or none is listed).
/// </summary>
private string ReadStandardSeries(HtmlAgilityPack.HtmlNode seriesNode, HtmlAgilityPack.HtmlNode seriesType, Match position)
{
    HtmlAgilityPack.HtmlNode seriesLink = seriesType.ChildNodes["a"];
    if (seriesLink != null)
    {
        curBook.seriesName = seriesLink.InnerText.Trim();
    }
    main.Log("About the series: " + seriesType.InnerText.Replace(". (standard series)", ""));
    curBook.seriesPosition = position.Groups[1].Value;
    curBook.totalInSeries = position.Groups[2].Value;

    HtmlAgilityPack.HtmlNode seriesInfo = seriesNode.SelectSingleNode(".//p");
    if (seriesInfo == null)
    {
        return "";
    }

    string infoText = seriesInfo.InnerText;
    // NOTE(review): the first link of the paragraph is used for both the preceding and the
    // following book; when both are listed, the following book presumably has its own
    // (later) link — confirm against the markup.
    HtmlAgilityPack.HtmlNode infoLink = seriesInfo.ChildNodes["a"];

    // Parse preceding book: the title ends at a comma or, failing that, at a full stop.
    if (infoText.Contains("Preceded by ", StringComparison.OrdinalIgnoreCase))
    {
        Match preceded = Regex.Match(infoText, @"Preceded by (.*),", RegexOptions.IgnoreCase);
        if (!preceded.Success)
        {
            preceded = Regex.Match(infoText, @"Preceded by (.*)\.", RegexOptions.IgnoreCase);
        }
        if (preceded.Success)
        {
            previousTitle = preceded.Groups[1].Value;
        }
        main.Log("Preceded by: " + previousTitle);

        // Grab Shelfari Kindle edition link for this book.
        if (infoLink != null)
        {
            previousShelfariUrl = infoLink.GetAttributeValue("href", "") + "/editions?binding=Kindle";
        }
    }

    // The last book of a series has no successor.
    if (curBook.seriesPosition.Equals(curBook.totalInSeries))
    {
        return "";
    }

    // Parse following book.
    if (!infoText.Contains("followed by ", StringComparison.OrdinalIgnoreCase))
    {
        return "";
    }
    Match followed = Regex.Match(infoText, @"followed by (.*)\.", RegexOptions.IgnoreCase);
    if (!followed.Success)
    {
        return "";
    }

    main.Log("Followed by: " + followed.Groups[1].Value);
    // Grab Shelfari Kindle edition link for this book.
    if (infoLink != null)
    {
        nextShelfariUrl = infoLink.GetAttributeValue("href", "") + "/editions?binding=Kindle";
    }
    return followed.Groups[1].Value;
}
/// <summary>
/// Handles key presses in the serial text box. When Enter is pressed, the entered
/// serial is searched on Google, a title and image are extracted from the
/// right-hand results block when one is present, and the serial is recorded
/// (with an occurrence counter) in <c>list</c>, <c>listView1</c> and <c>listBox2</c>.
/// </summary>
/// <param name="sender">The control that raised the event; not used.</param>
/// <param name="e">Key event data. The key press is suppressed after Enter is handled.</param>
private void textBox1_KeyDown(object sender, KeyEventArgs e)
{
    if (e.KeyCode != Keys.Enter)
    {
        return;
    }

    String serial = textBox1.Text.Length > 0 ? textBox1.Text : "INVALID_SERIAL";

    try
    {
        webBrowser1.AllowNavigation = true;
        string movieTitle = "UNKNOWN";
        string imgSrc = "UNKNOWN";

        // Escape the serial so characters such as '&', '#' or '+' stay inside the query value.
        string searchUrl = "https://www.google.com/search?q=" + Uri.EscapeDataString(serial);

        HtmlWeb web = new HtmlWeb();
        document = web.Load(searchUrl);
        HtmlAgilityPack.HtmlNode bodyNode = document.DocumentNode.SelectSingleNode("//td[@id='rhs_block']");
        richTextBox1.Text = document.DocumentNode.InnerHtml;

        // SelectSingleNode yields null when the page has no right-hand block; treat that
        // the same as an empty block instead of failing with a NullReferenceException.
        if (bodyNode != null && bodyNode.InnerText.Length > 1)
        {
            webBrowser1.DocumentText = bodyNode.InnerHtml;

            // SelectNodes yields null (not an empty collection) when nothing matches.
            var images = bodyNode.SelectNodes(".//img");
            if (images != null)
            {
                foreach (var image in images)
                {
                    var src = image.GetAttributeValue("src", null);
                    if (src != null)
                    {
                        // Only the first image that carries a src is used.
                        imgSrc = src;
                        break;
                    }
                }
            }

            textBox2.Enabled = false;
            normalQuery = true;

            // NOTE(review): the leading "//" makes this XPath search the whole document,
            // not just bodyNode — confirm whether ".//" was intended.
            HtmlAgilityPack.HtmlNode titleNode = bodyNode.SelectSingleNode("//div[@class='_B5d']");
            if (titleNode != null)
            {
                movieTitle = titleNode.InnerText;
            }
            else
            {
                // No title found: let the user type one in.
                textBox2.Enabled = true;
            }
        }
        else
        {
            normalQuery = false;
            textBox2.Enabled = true;
            webBrowser1.Navigate(searchUrl);
        }

        // Find the first existing entry whose label contains this serial.
        // NOTE(review): this is a substring match, so serial "12" also matches "123 (1)".
        int existingIndex = -1;
        for (int i = 0; i < list.Count; ++i)
        {
            if (list[i][0].ToString().Contains(serial))
            {
                existingIndex = i;
                break;
            }
        }

        if (existingIndex < 0)
        {
            // First occurrence of this serial.
            list.Add(new String[] { serial + " (1)", movieTitle, imgSrc });
            listView1.Items.Add(serial + " (1)");
            listBox2.Items.Add(movieTitle);
        }
        else
        {
            // Labels look like "<serial> (<count>)": bump the number between the parentheses.
            string label = list[existingIndex][0].ToString();
            int openParen = label.IndexOf('(');
            string counterPart = label.Substring(openParen);
            int count = Int32.Parse(counterPart.Substring(1, counterPart.Length - 2));
            list[existingIndex][0] = label.Substring(0, openParen) + "(" + (count + 1) + ")";

            // Rebuild both list controls so they reflect the updated label.
            listView1.Items.Clear();
            listBox2.Items.Clear();
            foreach (String[] entry in list)
            {
                listView1.Items.Add(entry[0]);
                listBox2.Items.Add(entry[1]);
            }
        }
    }
    catch (Exception)
    {
        // Best effort: any failure (network, parsing) simply leaves the lists unchanged.
        textBox1.Text = "INVALID SERIAL";
    }

    // The box is always cleared for the next entry, which also overwrites the marker above.
    textBox1.Text = "";
    e.SuppressKeyPress = true;
}
/// <summary>
/// Crawls the gallery list pages starting at <c>initPage</c>, aggregates per-user
/// post statistics for posts dated between <c>initDate</c> and <c>endDate</c>,
/// publishes the sorted ranking through <c>CrawlingEnded</c> and saves it with
/// <c>SaveResult</c>.
/// </summary>
/// <remarks>
/// Crawling stops once <c>endPage</c> has been handled or the last post of a page is
/// older than <c>initDate</c>. A page that cannot be parsed is reported through
/// <c>ErrorOccured</c> and skipped. A page that cannot be downloaded is retried
/// immediately and without limit.
/// </remarks>
public void Crawler()
{
    const string GallDateFormat = "yyyy-MM-dd HH:mm:ss";
    // The timestamp is machine-formatted, so parse it independently of the UI culture.
    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

    // Sentinel larger than any real post number, so nothing on the first page is rejected.
    int previousPageGallNum = 1000000000;
    Console.WriteLine(initDate.ToString() + endDate.ToString());
    string url = gallUrl + "&page=";

    //Dictionary value => count, replyNum, gallCount, gallRecommend
    Dictionary<UserInfo, int[]> userDic = new Dictionary<UserInfo, int[]>();
    int currentPage = this.initPage;

    // WebClient is IDisposable; release it once crawling is finished.
    using (var client = new WebClient())
    {
        client.Encoding = System.Text.Encoding.UTF8;

        while (true)
        {
            string text;
            try
            {
                text = client.DownloadString(url + currentPage.ToString());
                if (string.IsNullOrEmpty(text))
                {
                    continue;
                }
            }
            catch
            {
                // NOTE(review): retries the same page forever with no delay — consider a retry limit.
                continue;
            }

            hap.HtmlDocument textHap = new hap.HtmlDocument();
            textHap.LoadHtml(text);
            // SelectNodes yields null when the page has no post rows; the foreach below then
            // throws and the page is handled by the catch block.
            hap.HtmlNodeCollection nicks = textHap.DocumentNode.SelectNodes("//tr[@class='ub-content us-post']");

            try
            {
                foreach (hap.HtmlNode nick in nicks)
                {
                    int replyNum;
                    string subject;

                    int gallNum = GetOnlyInt(nick.SelectSingleNode("./td[@class='gall_num']").InnerText);
                    DateTime gallDate = DateTime.ParseExact(
                        nick.SelectSingleNode("./td[@class='gall_date']").Attributes["title"].Value,
                        GallDateFormat,
                        invariant);
                    Console.WriteLine(gallNum.ToString() + " " + gallDate.ToString());

                    // Skip rows whose number is not below the last number of the previous page.
                    if (gallNum >= previousPageGallNum)
                    {
                        Console.WriteLine(previousPageGallNum.ToString() + " " + gallNum.ToString());
                        Console.WriteLine("번호 에러");
                        continue;
                    }

                    // Skip rows outside the requested date window.
                    if (DateTime.Compare(gallDate, initDate) < 0 || DateTime.Compare(gallDate, endDate) > 0)
                    {
                        Console.WriteLine("날짜 에러");
                        continue;
                    }

                    hap.HtmlNode user = nick.SelectSingleNode("./td[@class='gall_writer ub-writer']");
                    UserInfo tempUserInfo = new UserInfo(user.Attributes["data-nick"].Value);
                    if (user.Attributes["data-uid"].Value == "")
                    {
                        // No uid on the row: the writer is identified by the data-ip value.
                        tempUserInfo.setFluidNick(user.Attributes["data-ip"].Value);
                    }
                    else
                    {
                        tempUserInfo.setFixedNick(user.Attributes["data-uid"].Value);
                    }

                    // The title cell is td[2], or td[3] when td[2] is a "gall_subject" cell.
                    hap.HtmlNode subjectNode = nick.SelectSingleNode("./td[2]");
                    try
                    {
                        if (subjectNode.Attributes["class"].Value == "gall_subject")
                        {
                            subjectNode = nick.SelectSingleNode("./td[3]");
                        }
                        subject = subjectNode.SelectSingleNode("./a[1]").InnerText;
                        // A second anchor in the cell is the reply-count box.
                        if (subjectNode.SelectNodes("./a").Count == 2)
                        {
                            replyNum = GetOnlyInt(subjectNode.SelectSingleNode("./a[@class='reply_numbox']/span").InnerText);
                        }
                        else
                        {
                            replyNum = 0;
                        }
                    }
                    catch
                    {
                        // Title cell did not have the expected shape; record a placeholder.
                        subject = "NullSubjectException";
                        replyNum = 0;
                    }

                    int gallCount = GetOnlyInt(nick.SelectSingleNode("./td[@class='gall_count']").InnerText);
                    int gallRecommend = GetOnlyInt(nick.SelectSingleNode("./td[@class='gall_recommend']").InnerText);

                    //Dictionary value => count, replyNum, gallCount, gallRecommend
                    int[] totals;
                    if (userDic.TryGetValue(tempUserInfo, out totals))
                    {
                        totals[0] += 1;
                        totals[1] += replyNum;
                        totals[2] += gallCount;
                        totals[3] += gallRecommend;
                    }
                    else
                    {
                        userDic.Add(tempUserInfo, new int[] { 1, replyNum, gallCount, gallRecommend });
                    }

                    UserData tempUserData = new UserData(tempUserInfo);
                    tempUserData.DataInput(gallNum, replyNum, gallCount, gallRecommend, gallDate, subject);
                }
            }
            catch
            {
                if (ErrorOccured != null)
                {
                    ErrorOccured(text, null);
                }

                // A page that fails to parse must still respect the endPage bound; otherwise
                // a run of bad or empty pages would be crawled without end.
                if (currentPage >= endPage)
                {
                    break;
                }
                currentPage++;
                continue;
            }

            // The last row of the page drives de-duplication and the stop condition.
            hap.HtmlNode lastRow = nicks[nicks.Count - 1];
            previousPageGallNum = GetOnlyInt(lastRow.SelectSingleNode("./td[@class='gall_num']").InnerText);
            DateTime currentDate = DateTime.ParseExact(
                lastRow.SelectSingleNode("./td[@class='gall_date']").Attributes["title"].Value,
                GallDateFormat,
                invariant);

            if (currentPage >= endPage || DateTime.Compare(currentDate, initDate) < 0)
            {
                break;
            }

            // Progress payload: [description, date of last row, pages done so far].
            System.Collections.ArrayList arr = new System.Collections.ArrayList();
            arr.Add(currentPage.ToString() + " 페이지, 날짜: " + currentDate.ToString());
            arr.Add(currentDate);
            arr.Add(currentPage - initPage);
            if (newPageHappened != null)
            {
                newPageHappened(arr, null);
            }
            currentPage++;
        }
    }

    //Dictionary value => count, replyNum, gallCount, gallRecommend
    foreach (KeyValuePair<UserInfo, int[]> user in userDic)
    {
        UserInfo tempUser = user.Key;
        tempUser.count = user.Value[0];
        tempUser.replyNum = user.Value[1];
        tempUser.gallCount = user.Value[2];
        tempUser.gallRecommend = user.Value[3];
        userList.Add(new UserRank(tempUser, user.Value[0], user.Value[1], user.Value[2], user.Value[3]));
    }

    // Rank users by post count, highest first.
    userList = userList.OrderByDescending(rank => rank.count).ToList();
    if (CrawlingEnded != null)
    {
        CrawlingEnded(userList, null);
    }

    string tempDataDir = System.IO.Path.Combine(Directory.GetCurrentDirectory(), "temp-data");
    Directory.CreateDirectory(tempDataDir);
    string filename = System.IO.Path.Combine(tempDataDir, gallId + DateTime.Now.ToString("_yyyy-MM-dd_HH-mm-ss"));
    SaveResult(filename);
}
/// <summary>
/// Downloads the issue page of a periodical from xxdy.qikan.com and extracts the
/// URL of its cover image.
/// </summary>
/// <param name="owner">Owner window; not used by this method.</param>
/// <param name="strISSN">ISSN of the periodical, sent as the <c>issn</c> query parameter.</param>
/// <param name="strYear">Publication year, sent as the <c>year</c> query parameter.</param>
/// <param name="strIssueNo">Issue number, sent as the <c>periodNum</c> query parameter.</param>
/// <param name="cookie">Cookie container used for the request; created when null.</param>
/// <param name="strImageUrl">Receives the <c>src</c> of the cover image (empty when the attribute is absent or on failure).</param>
/// <param name="strError">Receives a description of the failure when -1 is returned.</param>
/// <returns>1 when the cover image element was found; -1 on any error.</returns>
public static int GetCoverImageUrl(
    IWin32Window owner,
    string strISSN,
    string strYear,
    string strIssueNo,
    ref CookieContainer cookie,
    out string strImageUrl,
    out string strError)
{
    strError = "";
    strImageUrl = "";

    // Escape each value so it cannot break out of its query parameter.
    string strUrl = "http://xxdy.qikan.com/MagInfo.aspx?issn=" + Uri.EscapeDataString(strISSN ?? "")
        + "&year=" + Uri.EscapeDataString(strYear ?? "")
        + "&periodNum=" + Uri.EscapeDataString(strIssueNo ?? "");

    if (cookie == null)
    {
        cookie = new CookieContainer();
    }

    // NOTE(review): if WebClientEx derives from WebClient it is IDisposable and should be disposed — confirm.
    WebClientEx webClient = new WebClientEx(cookie);

    // Request headers mimic a desktop browser (Edge 14 on Windows 10).
    webClient.Headers.Add("Accept", "text/html, application/xhtml+xml, image/jxr, */*");
    // NOTE(review): the response is decoded as plain UTF-8 below; this assumes WebClientEx
    // transparently decompresses gzip/deflate responses — confirm.
    webClient.Headers.Add("Accept-Encoding", "gzip, deflate");
    webClient.Headers.Add("Accept-Language", "zh-Hans-CN, zh-Hans; q=0.8, en-US; q=0.5, en; q=0.3");
    webClient.Headers.Add("Host", "xxdy.qikan.com");
    webClient.Headers.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393");

    try
    {
        byte[] byteArray = webClient.DownloadData(new Uri(strUrl));
        string strContent = Encoding.UTF8.GetString(byteArray);

        HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
        htmlDoc.OptionFixNestedTags = true;
        htmlDoc.LoadHtml(strContent);

        if (htmlDoc.DocumentNode == null)
        {
            strError = "htmlDoc.DocumentNode == null";
            return -1;
        }

        /*
         * Expected page fragment:
         *
         * <div class="cover1">
         *   <h1>Cover</h1>
         *   <div class="cover1_box">
         *     <a href="../../MagInfo.aspx?issn=1674-3121&year=2013&periodNum=7">
         *       <img src="http://img.qikan.com.cn/qkimages/xxdy/xxdy201307-l.jpg" width="190" height="270" border="0" alt="..." />
         *     </a>
         *     ...
         */
        HtmlAgilityPack.HtmlNode cover1_box = htmlDoc.DocumentNode.SelectSingleNode("//div[@class='cover1_box']");
        if (cover1_box == null)
        {
            strError = "cover1_box 没有找到";
            return -1;
        }

        // The image sits one level down, inside the anchor.
        HtmlNode img = cover1_box.SelectSingleNode("*/img");
        if (img == null)
        {
            // Report the missing element explicitly instead of a generic NullReferenceException message.
            strError = "cover1_box 下的 img 没有找到";
            return -1;
        }

        strImageUrl = img.GetAttributeValue("src", "");
        return 1;
    }
    catch (Exception ex)
    {
        // Network and parsing failures are reported through strError rather than thrown.
        strError = "异常: " + ex.Message;
        return -1;
    }
}
/// <summary>
/// Parses a wiki page response into <c>patternObject</c>: title, categories, outgoing
/// links with their relation types, and cleaned content. The result is serialized
/// to JSON and written to the file named by <c>Pattern.GetFileName</c>.
/// </summary>
/// <param name="response">The crawled page; its <c>#content</c> element is the parsing root.</param>
public override void Parse(Response response)
{
    // Load only the #content element of the page into a fresh HtmlAgilityPack document.
    HtmlDocument ContentDocument = new HtmlDocument();
    ContentDocument.LoadHtml(response.Css("#content").First().OuterHtml);
    HtmlAgilityPack.HtmlNode BodyNode = ContentDocument.DocumentNode;

    patternObject.Title = BodyNode.SelectSingleNode("//*[@id=\"firstHeading\"]").InnerHtml;
    HtmlAgilityPack.HtmlNode ContentNode = BodyNode.SelectSingleNode("//*[@id=\"mw-content-text\"]");

    // Drop the table of contents to save space and client-side processing time.
    HtmlAgilityPack.HtmlNode tocNode = ContentNode.SelectSingleNode("//*[@id=\"toc\"]");
    if (tocNode != null)
    {
        tocNode.Remove();
    }

    // Drop HTML comments. SelectNodes yields null (not an empty collection) when nothing matches.
    var commentNodes = ContentNode.SelectNodes("//comment()");
    if (commentNodes != null)
    {
        foreach (var commentNode in commentNodes)
        {
            commentNode.Remove();
        }
    }

    // Put the page heading at the top of the content node, then take the title from there.
    ContentNode.PrependChild(BodyNode.SelectSingleNode("//*[@id=\"firstHeading\"]"));
    patternObject.Title = ContentNode.SelectSingleNode("//*[@id=\"firstHeading\"]").InnerHtml;

    var links = ContentNode.SelectNodes("//a/@href");
    if (links != null)
    {
        foreach (var link in links)
        {
            string href = link.Attributes["href"].Value;

            //skip if this is a redlink (page doesn't exist).
            if (href.Contains("redlink=1"))
            {
                continue;
            }

            //skip if this links to this page (ignoring any #fragment)
            if (href.Split('#').First() == response.FinalUrl)
            {
                continue;
            }

            if (link.Ancestors().Any(ancestor => ancestor.Id == "catlinks"))
            {
                // Inside the "category links" box: every link except the special
                // "Categories" page link names a category.
                if (link.InnerText != "Categories")
                {
                    patternObject.Categories.Add(link.InnerText);
                }
            }
            else
            {
                // Normal text-body link: register it if it is not known yet.
                patternObject.CreateOrGetPatternLink(link.InnerText);
            }

            // Links under the "Relations" h2 section additionally carry a relation type.
            HtmlAgilityPack.HtmlNode sectionHeading = GetNodeReleventPageHeading(link, "h2");
            if (sectionHeading != null && sectionHeading.InnerText == "Relations")
            {
                // NOTE(review): a "Relations" link with no h3 heading would throw here — confirm that cannot occur.
                HtmlAgilityPack.HtmlNode RelationHeadingNode = GetNodeReleventPageHeading(link, "h3");
                String RelationName = RelationHeadingNode.InnerText;

                // When the relevant h4 starts after the h3, it is a "with x" sub-category of
                // the relation (used by the "Can Instantiate" section): append it to the name.
                HtmlAgilityPack.HtmlNode subHeading = GetNodeReleventPageHeading(link, "h4");
                if (subHeading != null && RelationHeadingNode.InnerStartIndex < subHeading.InnerStartIndex)
                {
                    RelationName = RelationHeadingNode.InnerText + " " + subHeading.InnerText;
                }

                patternObject.CreateOrGetPatternLink(link.InnerText).Type.Add(RelationName);
            }
        }
    }

    //get a cleaned copy of the #content HTML for giving in the JSON data
    patternObject.Content = ProcessPageContentToString(ContentNode);

    string Json = JsonConvert.SerializeObject(patternObject);
    File.WriteAllText(Pattern.GetFileName(patternObject.Title), Json);
}
/// <summary>
/// Looks up the node for this label by evaluating <c>_LabelData.XPath</c> against
/// the node returned by <c>GetTempParentNode()</c>.
/// </summary>
/// <returns>
/// Whatever <c>SelectSingleNode</c> yields for that XPath on the temporary parent node.
/// </returns>
protected HtmlAgilityPack.HtmlNode GetCurrentNode()
{
    return GetTempParentNode().SelectSingleNode(_LabelData.XPath);
}