/// <summary>
/// Collects the thumbnail nodes (the <c>div</c> children of the element with id <c>gdt</c>) from
/// <paramref name="url"/> and from every following page, appending them to <paramref name="rlist"/>.
/// </summary>
/// <param name="rlist">List that receives the matched nodes, in page order.</param>
/// <param name="url">Address of the first page to read.</param>
/// <remarks>
/// Paging stops when the last pager cell (<c>td</c> under the element with class <c>ptb</c>) contains
/// only "&gt;", when no pager is present, or when the last cell carries no link to follow.
/// </remarks>
private void GetImageList(List<HtmlNode> rlist, string url)
{
    string pageUrl = url;
    do
    {
        string html = helper.GetHTML(pageUrl);
        HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
        document.LoadHtml(html);
        HtmlNode rootNode = document.DocumentNode;

        // SelectNodes returns null (not an empty collection) when nothing matches.
        HtmlNodeCollection images = rootNode.SelectNodes("//*[@id=\"gdt\"]/div");
        if (images != null)
        {
            rlist.AddRange(images);
        }

        HtmlNodeCollection pagerCells = rootNode.SelectNodes("//*[@class=\"ptb\"]/*/td");
        if (pagerCells == null || pagerCells.Count == 0)
        {
            return;
        }

        HtmlNode lastCell = pagerCells[pagerCells.Count - 1];
        if (lastCell.InnerHtml == ">")
        {
            // A bare ">" (no link inside) marks the final page.
            return;
        }

        // The first child of the last cell is expected to be the "next page" anchor.
        HtmlNode nextLink = lastCell.FirstChild;
        pageUrl = nextLink == null ? null : nextLink.GetAttributeValue("href", string.Empty);
    }
    while (!string.IsNullOrEmpty(pageUrl));
}
/// <summary>
/// Builds a <c>MatchName</c> from the first and last "live-today-member-name" blocks of the page.
/// </summary>
/// <param name="doc">Parsed page containing the team name blocks.</param>
/// <returns>
/// The two team names with any trailing " (...)" suffix removed, or <c>null</c> when anything goes
/// wrong while reading or splitting the names.
/// </returns>
MatchName GetFullMatchName(HtmlDocument doc)
{
    try
    {
        HtmlNodeCollection teamNodes = doc.DocumentNode.SelectNodes("//div[@class='live-today-member-name nowrap ']");

        // Each team block is re-parsed as its own fragment so "//span" only sees that block.
        HtmlDocument firstFragment = new HtmlDocument();
        HtmlDocument secondFragment = new HtmlDocument();
        firstFragment.LoadHtml(teamNodes.First().InnerHtml);
        secondFragment.LoadHtml(teamNodes.Last().InnerHtml);

        string firstName = firstFragment.DocumentNode.SelectNodes("//span").First().InnerText;
        string secondName = secondFragment.DocumentNode.SelectNodes("//span").First().InnerText;

        string[] separators = { " vs ", " @ ", " - " };
        string combined = firstName + " vs " + secondName;
        string[] parts = combined.Split(separators, StringSplitOptions.RemoveEmptyEntries);

        // Drop a trailing " (...)" annotation from each of the first two parts.
        for (int side = 0; side < 2; side++)
        {
            if (parts[side].Contains("("))
            {
                parts[side] = parts[side].Split(new string[] { " (" }, StringSplitOptions.RemoveEmptyEntries)[0];
            }
        }

        return new MatchName(parts[0], parts[1]);
    }
    catch
    {
        // Any failure (missing nodes, fewer than two parts, ...) is reported as "no name".
        return null;
    }
}
/// <summary>
/// Records one player's row of a tournament result table: the pairwise results against every
/// later-positioned player, plus the player's tournament points taken from the last cell.
/// </summary>
/// <param name="context">Database context used for lookups and saved at the end.</param>
/// <param name="tournament">Tournament the results belong to.</param>
/// <param name="results">Cells of the player's table row.</param>
/// <param name="position">1-based position of the player in <c>players</c>.</param>
private void ProcessResult(SquashContext context, Tournament tournament, HtmlNodeCollection results, int position)
{
    var currentPlayer = GetPlayer(context, players.ElementAt(position - 1).Name);

    // Cell index = opponent index in players + 2; only cells after the player's own are read.
    for (int cellIndex = position + 2; cellIndex < players.Count + 2; cellIndex++)
    {
        var opponent = GetPlayer(context, players.ElementAt(cellIndex - 2).Name);
        var score = results.ElementAt(cellIndex).InnerHtml.Trim().Split(" ");
        AddResults(context, currentPlayer, opponent, score, tournament);
    }

    // The last cell holds the points; an unparseable value is stored as null.
    string pointsText = results.Last().InnerText.Trim().Replace(" ", "");
    double rating;
    bool hasPoints = double.TryParse(pointsText, out rating);

    currentPlayer.TournamentResults.Add(new PlayerTournamentResult
    {
        Position = position,
        Points = hasPoints ? rating : (double?)null,
        Tournament = tournament
    });

    context.SaveChanges();
}
/// <summary>
/// Reads the page count from the pager nodes selected by <paramref name="xpath"/>.
/// </summary>
/// <param name="xpath">XPath selecting the pager entries of the document at <c>this.url</c>.</param>
/// <returns>
/// The number in the last pager entry; if that entry is not numeric, the number in the entry
/// <c>SKIP_NEXT_BUTTON_INDEX</c> positions from the end; 0 when neither can be read.
/// </returns>
/// <exception cref="NodeNotFoundException">The XPath matched no nodes.</exception>
public int GetPageCount(string xpath)
{
    // The method is synchronous by contract, so the document task is awaited by blocking.
    HtmlDocument htmlDocument = this.htmlService.GetHtmlDocument(this.url).Result;

    HtmlNodeCollection nodes = htmlDocument.DocumentNode.SelectNodes(xpath);
    if (nodes == null)
    {
        throw new NodeNotFoundException("The XPath does not return results.");
    }

    int page;
    if (Int32.TryParse(nodes.Last().InnerText, out page))
    {
        return page;
    }

    // The last entry is not a number, so fall back to the entry before it. A collection shorter
    // than the offset would otherwise index out of range; treat that as "no page count" (0).
    int fallbackIndex = nodes.Count - SKIP_NEXT_BUTTON_INDEX;
    if (fallbackIndex < 0 || fallbackIndex >= nodes.Count)
    {
        return 0;
    }

    // TryParse leaves 0 in page when the fallback entry is not numeric either.
    Int32.TryParse(nodes[fallbackIndex].InnerText, out page);
    return page;
}
/// <summary>
/// Clears <c>Library.Games</c> and reloads it by walking the profile's badge pages.
/// </summary>
/// <remarks>
/// The total page count is read once, from the last "pagelink" anchor of the first page. Loading
/// stops early when <c>ProcessBadgesOnPage</c> returns <c>false</c>.
/// </remarks>
public async Task LoadGames()
{
    Library.Games.Clear();

    string badgesUrl = $"{Url}/badges";
    int totalPages = 1;

    for (int pageNumber = 1; ; pageNumber++)
    {
        string body = await UserWebClient.GetHttp($"{badgesUrl}?p={pageNumber}");
        HtmlDocument badgePage = new HtmlDocument();
        badgePage.LoadHtml(body);

        if (!ProcessBadgesOnPage(badgePage))
        {
            break;
        }

        if (pageNumber == 1)
        {
            // The last pager link ends in "...=<page count>"; when there is no pager the parse
            // fails and totalPages becomes 0, which ends the loop after this page.
            HtmlNodeCollection pageLinks = badgePage.DocumentNode.SelectNodes("//a[@class='pagelink']");
            string lastHref = pageLinks?.Last().Attributes["href"]?.Value;
            string lastPageText = lastHref?.Split('=').Last();
            int.TryParse(lastPageText, out totalPages);
        }

        if (pageNumber >= totalPages)
        {
            break;
        }
    }
}
/// <summary>
/// Rebuilds <c>GamesToFarmSolo</c> and <c>GamesToFarmMulti</c> by checking every badge page of
/// the bot's profile.
/// </summary>
/// <returns>
/// <c>false</c> when the web bot is not initialized or the first badge page cannot be downloaded;
/// otherwise <c>true</c>.
/// </returns>
internal async Task<bool> RefreshGamesToFarm()
{
    if (!Initialized)
    {
        return false;
    }

    // Number of badge pages; stays 1 when the page has no pager.
    byte Pagecount = 1;

    await _bot.RefreshSessionIfNeeded().ConfigureAwait(false);

    Uri url = new Uri(SteamCommunityURL + "/profiles/" + steamID.ConvertToUInt64() + "/badges");

    // Download the first badge page, retrying up to WebClient.MaxRetries times.
    HtmlDocument doc = null;
    for (int attempt = 0; doc == null && attempt < WebClient.MaxRetries; attempt++)
    {
        doc = await webClient.GetDocument(url).ConfigureAwait(false);
    }

    if (doc == null)
    {
        _bot.Log("Cant get badge page", LogType.Error);
        return false;
    }

    // The text of the last pager link is the page count.
    HtmlNodeCollection pageLinks = doc.DocumentNode.SelectNodes("//a[@class='pagelink']");
    if (pageLinks != null && !byte.TryParse(pageLinks.Last().InnerText, out Pagecount))
    {
        Pagecount = 1;
    }

    // Both lists are rebuilt from scratch on every refresh.
    GamesToFarmMulti.Clear();
    GamesToFarmSolo.Clear();

    List<Task> tasks = new List<Task>(Pagecount);

    // An int counter is required here: with a byte counter and Pagecount == 255 the increment
    // wraps to 0 and the loop never ends.
    for (int pageNumber = 1; pageNumber <= Pagecount; pageNumber++)
    {
        HtmlDocument page = await webClient.GetDocument(new Uri(url, "?p=" + pageNumber)).ConfigureAwait(false);
        tasks.Add(CheckPage(page));
    }

    await Task.WhenAll(tasks).ConfigureAwait(false);

    if (!GamesToFarmSolo.Any() && !GamesToFarmMulti.Any())
    {
        _bot.Log("Have nothing to farm", LogType.Info);
    }
    else
    {
        _bot.Log($"We have {GamesToFarmSolo.Count} to farm solo and {GamesToFarmMulti.Count} to farm together", LogType.Info);
    }

    return true;
}
/// <summary>
/// Returns the chapter number found in the link of the last "subtitle" entry of the index box.
/// </summary>
/// <param name="doc">Parsed table-of-contents page.</param>
/// <returns>The number of the last listed chapter, or 0 when the page lists no chapters.</returns>
/// <exception cref="FormatException">The last entry does not contain a chapter link with a numeric chapter segment.</exception>
public static int GetTotalChapters(HtmlDocument doc)
{
    const string pattern = "(href=\"/)(?<series>.+)/(?<num>.+)/\">(?<title>.+)(?=</a>)";

    // SelectNodes returns null (not an empty collection) when nothing matches.
    HtmlNodeCollection chapterNodes = doc.DocumentNode.SelectNodes("//div[@class='index_box']/dl/dd[@class='subtitle']");
    if (chapterNodes == null || chapterNodes.Count == 0)
    {
        return 0;
    }

    Match m = Regex.Match(chapterNodes[chapterNodes.Count - 1].OuterHtml, pattern);
    if (!m.Success)
    {
        throw new FormatException("The last chapter entry does not contain a recognizable chapter link.");
    }

    return Convert.ToInt32(m.Groups["num"].Value);
}
/// <summary>
/// Reads title/value pairs from the rows of an info box, adds them to
/// <paramref name="vehicleAttributes"/> and echoes each pair to the console in dark green.
/// </summary>
/// <param name="vehicleAttributes">Dictionary that receives one entry per row, keyed by the row title.</param>
/// <param name="rows">Info box rows; each row's first <c>td</c> must hold exactly one <c>b</c> element (the title).</param>
public void GetAttributesFromInfoBox(Dictionary<string, string> vehicleAttributes, HtmlNodeCollection rows)
{
    foreach (HtmlNode infoRow in rows)
    {
        HtmlNodeCollection rowCells = infoRow.SelectNodes("td");

        // Title: the single bold element of the first cell. Value: the text of the last cell.
        HtmlNode titleElement = rowCells.First().SelectNodes("b").Single();
        string attributeName = titleElement.InnerText.Trim();
        string attributeValue = rowCells.Last().InnerText.Trim();

        vehicleAttributes.Add(attributeName, attributeValue);
        _consoleManager.WriteLineInColour(ConsoleColor.DarkGreen, $"{attributeName}: {attributeValue}");
    }
}
/// <summary>
/// Returns the total number of chapters listed on a table-of-contents page.
/// </summary>
/// <param name="doc">Parsed table-of-contents page.</param>
/// <param name="details">Supplies the site type that decides how the page is read.</param>
/// <returns>
/// For Syousetsu, the chapter number in the link of the last "subtitle" entry; for any other site
/// type, the count of "widget-toc-episode" items. 0 when the page lists no chapters.
/// </returns>
/// <exception cref="FormatException">Syousetsu only: the last entry has no chapter link with a numeric chapter segment.</exception>
public static int GetTotalChapters(HtmlDocument doc, Constants details)
{
    if (details.Site() == Constants.SiteType.Syousetsu)
    {
        const string pattern = "(href=\"/)(?<series>.+)/(?<num>.+)/\">(?<title>.+)(?=</a>)";

        // SelectNodes returns null (not an empty collection) when nothing matches.
        HtmlNodeCollection subtitleNodes = doc.DocumentNode.SelectNodes("//div[@class='index_box']/dl/dd[@class='subtitle']");
        if (subtitleNodes == null || subtitleNodes.Count == 0)
        {
            return 0;
        }

        Match m = Regex.Match(subtitleNodes[subtitleNodes.Count - 1].OuterHtml, pattern);
        if (!m.Success)
        {
            throw new FormatException("The last chapter entry does not contain a recognizable chapter link.");
        }

        return Convert.ToInt32(m.Groups["num"].Value);
    }

    // Kakuyomu: every episode is one list item, so the count is the total.
    HtmlNodeCollection episodeNodes = doc.DocumentNode.SelectNodes("//div[@class='widget-toc-main']/ol/li[@class='widget-toc-episode']");
    return episodeNodes == null ? 0 : episodeNodes.Count;
}
/// <summary>
/// Loads the HTML file at <c>htmlPath</c>, recognizes level-1 and level-2 titles according to
/// <c>method</c>, extracts the content under each title (re-attaching referenced footnotes and
/// rewriting image paths), writes the resulting rows through <c>SQLUtils</c> and reports the
/// outcome in <c>retInfo</c>.
/// Appendices must be changed manually in Word to level-1 or level-2 heading format, following
/// the table-of-contents requirements, before conversion.
/// </summary>
/// <param name="rootConvention">Parent row under which the level-1 title rows are created.</param>
/// <returns>
/// The <c>retInfo</c> object, filled with the recognized titles, contents, image result and,
/// when the database step fails, an error description.
/// </returns>
/// <remarks>
/// Nodes are identified by their source line (<c>HtmlNode.Line</c>) throughout.
/// NOTE(review): several calls below read <c>Replace(" ", " ")</c>, which as written replaces a
/// space with a space; the first argument was presumably a non-breaking space or "&amp;nbsp;"
/// originally - confirm against the original file encoding.
/// </remarks>
public ReturnInfo ReadHtml(ConventionRow rootConvention)
{
    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.Load(htmlPath);
    HtmlNode htmlRootNode = doc.DocumentNode;
    HtmlNodeCollection title1Nodes_init;
    HtmlNodeCollection title2Nodes_init;
    List<string> str_contentList = new List<string>();
    List<string> str_titleList = new List<string>();
    List<string> str_title1List = new List<string>();
    List<string> str_title2List = new List<string>();
    HtmlNodeCollection contentNodes = new HtmlNodeCollection(htmlRootNode.Clone());
    Dictionary<int, string> dic_title1Content = new Dictionary<int, string>();
    HtmlNodeCollection titleNodes = new HtmlNodeCollection(htmlRootNode.Clone());
    HtmlNodeCollection title1Nodes = new HtmlNodeCollection(htmlRootNode.Clone());
    HtmlNodeCollection title2Nodes = new HtmlNodeCollection(htmlRootNode.Clone());
    HtmlNodeCollection ftNoteRefnodes = new HtmlNodeCollection(htmlRootNode.Clone());
    string htmlTxt = htmlRootNode.InnerHtml;

    // Title recognition in the body text.
    // (The abandoned option "level-1 title recognized by bold text" has been removed.)

    #region Option 1: Word file produced from a PDF converted to images - titles taken from the class attribute of p nodes
    if (method == ReadMethod.TITLE_CLASS)
    {
        title1Nodes_init = htmlRootNode.SelectNodes(@"//p[@class=1]");
        title2Nodes_init = htmlRootNode.SelectNodes(@"//p[@class=2]");

        // SelectNodes returns null when nothing matches; an absent title level is simply empty.
        if (title1Nodes_init != null)
        {
            for (int i = 0; i < title1Nodes_init.Count; i++)
            {
                if (title1Nodes_init[i].InnerText.Replace(" ", "").Trim() != string.Empty)
                {
                    str_title1List.Add(title1Nodes_init[i].InnerText.Trim().Replace(" ", " ").Replace("\r\n", ""));
                    title1Nodes.Add(title1Nodes_init[i]);
                }
            }
        }

        if (title2Nodes_init != null)
        {
            for (int i = 0; i < title2Nodes_init.Count; i++)
            {
                if (title2Nodes_init[i].InnerText.Replace(" ", "").Trim() != string.Empty)
                {
                    str_title2List.Add(title2Nodes_init[i].InnerText.Trim().Replace(" ", " ").Replace("\r\n", ""));
                    title2Nodes.Add(title2Nodes_init[i]);
                }
            }
        }
    }
    #endregion

    #region Option 2: titles recognized by the style attribute of span tags
    else if (method == ReadMethod.TITLE_SPANSTYLE)
    {
        HtmlNodeCollection title1Nodes_tmp = new HtmlNodeCollection(htmlRootNode.Clone());

        #region Extract level-1 title nodes into title1Nodes and their text into str_title1List
        title1Nodes_init = htmlRootNode.SelectNodes(@"//p");
        if (title1Nodes_init != null)
        {
            for (int i = 0; i < title1Nodes_init.Count; i++)
            {
                string str_style = title1Nodes_init[i].InnerHtml.Replace("\r\n", "");
                bool condition = str_style.Contains(title1_select);

                if (RecogOptions.title1_has_zitizihao)
                {
                    // title1_select holds "<font size>;<font family>"; both parts must appear.
                    string str_style_zihao = title1_select.Substring(0, title1_select.IndexOf(';'));
                    string str_style_ziti = title1_select.Substring(title1_select.IndexOf(';') + 1);
                    condition = str_style.Contains(str_style_zihao) && str_style.Contains(str_style_ziti);
                }

                if (condition)
                {
                    // title1_child selects which element the paragraph must contain: 0 = p, 1 = b, 2 = a.
                    foreach (var match in title1Nodes_init[i].DescendantsAndSelf())
                    {
                        if (RecogOptions.title1_child == 0 && match.Name == "p")
                        {
                            title1Nodes_tmp.Add(title1Nodes_init[i]);
                            break;
                        }
                        if (RecogOptions.title1_child == 1 && match.Name == "b")
                        {
                            title1Nodes_tmp.Add(title1Nodes_init[i]);
                            break;
                        }
                        if (RecogOptions.title1_child == 2 && match.Name == "a")
                        {
                            title1Nodes_tmp.Add(title1Nodes_init[i]);
                            break;
                        }
                    }
                }
            }

            // Keep non-blank candidates, skipping one on the same source line as its predecessor.
            for (int i = 0; i < title1Nodes_tmp.Count; i++)
            {
                if (title1Nodes_tmp[i].InnerText.Replace(" ", "").Trim() != string.Empty
                    && (i == 0 || title1Nodes_tmp[i].Line != title1Nodes_tmp[i - 1].Line))
                {
                    str_title1List.Add(title1Nodes_tmp[i].InnerText.Trim().Replace(" ", " ").Replace("\r\n", ""));
                    title1Nodes.Add(title1Nodes_tmp[i]);
                }
            }
        }
        #endregion

        #region Extract level-2 title nodes into title2Nodes and their text into str_title2List
        HtmlNodeCollection tempNodes = new HtmlNodeCollection(htmlRootNode.Clone());

        if (RecogOptions.title2RecogMethod == 1)
        {
            // Method 1: a paragraph is a level-2 title when its text matches the title pattern.
            title2Nodes_init = htmlRootNode.SelectNodes(@"//p");
            if (title2Nodes_init != null)
            {
                Regex title2Regex = new Regex(Patterns.title2_x_dot_x_XXX, RegexOptions.Multiline);
                for (int i = 0; i < title2Nodes_init.Count; i++)
                {
                    string str_tmp = title2Nodes_init[i].InnerText.Replace(" ", " ");
                    MatchCollection title2Matches = title2Regex.Matches(str_tmp);
                    if (title2Matches.Count > 0)
                    {
                        string tmp = title2Matches[0].Value;

                        // In some documents text such as "1 XXX" is not a level-2 title; this
                        // filter has to be adjusted by hand for those documents (for example by
                        // requiring the text to start with a specific character).
                        if (!tmp.Contains("。"))
                        {
                            foreach (var match in title2Nodes_init[i].DescendantsAndSelf())
                            {
                                if (RecogOptions.title2_child == 0 && match.Name == "p")
                                {
                                    tempNodes.Add(title2Nodes_init[i]);
                                    break;
                                }
                                if (RecogOptions.title2_child == 1 && match.Name == "b")
                                {
                                    tempNodes.Add(title2Nodes_init[i]);
                                    break;
                                }
                                if (RecogOptions.title2_child == 2 && match.Name == "a")
                                {
                                    tempNodes.Add(title2Nodes_init[i]);
                                    break;
                                }
                            }
                        }
                    }
                }
            }
        }

        if (RecogOptions.title2RecogMethod == 0)
        {
            // Method 0: a span whose style matches title2_select marks its enclosing paragraph.
            title2Nodes_init = htmlRootNode.SelectNodes(@"//span[@style]");
            if (title2Nodes_init != null)
            {
                for (int i = 0; i < title2Nodes_init.Count; i++)
                {
                    string str_style = title2Nodes_init[i].Attributes["style"].Value.Replace("\r\n", "");
                    bool condition = str_style.Contains(title2_select);

                    if (RecogOptions.title2_has_zitizihao)
                    {
                        // title2_select holds "<font size>;<font family>"; both parts must appear.
                        string str_style_zihao = title2_select.Substring(0, title2_select.IndexOf(';'));
                        string str_style_ziti = title2_select.Substring(title2_select.IndexOf(';') + 1);
                        condition = str_style.Contains(str_style_zihao) && str_style.Contains(str_style_ziti);
                    }

                    if (condition)
                    {
                        if ((RecogOptions.title2_child == 0)
                            || (RecogOptions.title2_child == 1 && title2Nodes_init[i].ParentNode.Name == "b")
                            || (RecogOptions.title2_child == 2 && title2Nodes_init[i].ParentNode.Name == "a"))
                        {
                            foreach (var match in title2Nodes_init[i].AncestorsAndSelf())
                            {
                                if (match.Name == "p")
                                {
                                    string tmp = match.InnerText.Replace(" ", "").Replace("\r\n", "").Trim();
                                    if (tmp.Length > 1)
                                    {
                                        // In some documents text such as "1 XXX" is not a level-2
                                        // title; this filter has to be adjusted by hand for those.
                                        if (!tmp.Contains("。"))
                                        {
                                            tempNodes.Add(match);
                                        }
                                    }
                                    break;
                                }
                            }
                        }
                    }
                }
            }
        }

        // Keep non-blank candidates, skipping one on the same source line as its predecessor.
        for (int i = 0; i < tempNodes.Count; i++)
        {
            if (tempNodes[i].InnerText.Replace(" ", "").Trim() != String.Empty
                && (i == 0 || tempNodes[i].Line != tempNodes[i - 1].Line))
            {
                title2Nodes.Add(tempNodes[i]);
                string tmp = tempNodes[i].InnerText.Replace("\r\n", "").Replace(" ", " ");
                str_title2List.Add(tmp.Trim());
            }
        }
        #endregion
    }
    #endregion

    // (Option 3, title recognition by h1/h2/h3 tags, was disabled and has been removed.)

    #region Build titleNodes (level-1 and level-2 titles in document order) and str_titleList
    foreach (var match in title1Nodes)
    {
        titleNodes.Add(match);
    }
    foreach (var match in title2Nodes)
    {
        titleNodes.Add(match);
    }

    // In-place exchange sort by source line.
    for (int i = 0; i < titleNodes.Count; i++)
    {
        for (int j = i; j < titleNodes.Count; j++)
        {
            if (titleNodes[i].Line > titleNodes[j].Line)
            {
                var temp = titleNodes[i];
                titleNodes[i] = titleNodes[j];
                titleNodes[j] = temp;
            }
        }
    }

    for (int i = 0; i < titleNodes.Count; i++)
    {
        string tmp = titleNodes[i].InnerText.Replace(" ", " ").Replace("\r\n", "");
        str_titleList.Add(tmp.Trim());
    }
    #endregion

    try
    {
        #region Find the footnote divs (div elements with an id attribute) and store them in ftNoteRefnodes
        foreach (var match in htmlRootNode.Descendants())
        {
            if (match.Name == "div" && match.HasAttributes)
            {
                string tmp = match.GetAttributeValue("id", "notfound");
                if (tmp != "notfound")
                {
                    ftNoteRefnodes.Add(match);
                }
            }
        }
        #endregion

        #region Remove the footnotes and the html/body tags from htmlTxt
        for (int i = 0; i < ftNoteRefnodes.Count; i++)
        {
            htmlTxt = htmlTxt.Replace(ftNoteRefnodes[i].OuterHtml, "");
        }
        htmlTxt = htmlTxt.Replace("</body>", "").Replace("</html>", "").Replace("<body>", "").Replace("<html>", "");
        #endregion

        #region Rewrite image paths
        Regex reg = new Regex(Patterns.imageSrc);
        MatchCollection matches = reg.Matches(htmlTxt);
        if (matches.Count == 0)
        {
            retInfo.picResult = "无匹配图片";
        }
        else
        {
            htmlTxt = reg.Replace(htmlTxt, "${1}" + imageFilePath + "${2}");
            retInfo.picResult = "识别到图片数目:" + matches.Count.ToString();
        }
        #endregion

        #region Extract body text directly under level-1 titles: dic_title1Content_tmp (without footnotes), dic_title1Content (with footnotes)
        Dictionary<int, string> dic_title1Content_tmp = new Dictionary<int, string>();

        // A level-1 title immediately followed by the next level-1 title (no level-2 title in
        // between) owns the text between the two.
        for (int i = 0; i < titleNodes.Count; i++)
        {
            for (int j = 0; j < title1Nodes.Count - 1; j++)
            {
                if (titleNodes[i].Line == title1Nodes[j].Line)
                {
                    if (i < titleNodes.Count - 1 && titleNodes[i + 1].Line == title1Nodes[j + 1].Line)
                    {
                        int start = htmlTxt.IndexOf(title1Nodes[j].OuterHtml);
                        int end = htmlTxt.IndexOf(title1Nodes[j + 1].OuterHtml, start + 1);
                        if (start != -1 && end > start)
                        {
                            dic_title1Content_tmp.Add(j, htmlTxt.Substring(start, end - start));
                            break;
                        }
                        else
                        {
                            throw new InvalidOperationException("title1 content提取出错");
                        }
                    }
                }
            }
        }

        // When the very last title is a level-1 title, it owns everything to the end of the text.
        for (int i = 0; i < title1Nodes.Count; i++)
        {
            if (titleNodes.Last().Line == title1Nodes[i].Line)
            {
                int start = htmlTxt.IndexOf(title1Nodes.Last().OuterHtml);
                if (start != -1)
                {
                    dic_title1Content_tmp.Add(title1Nodes.Count - 1, htmlTxt.Substring(start));
                    break;
                }
                else
                {
                    throw new InvalidOperationException("title1 last content提取出错");
                }
            }
        }

        // Append every footnote div that the extracted text links to.
        foreach (var pair in dic_title1Content_tmp)
        {
            string v = pair.Value;
            foreach (var ftnref in ftNoteRefnodes)
            {
                if (pair.Value.Contains("href=\"#_" + ftnref.Attributes["id"].Value + "\""))
                {
                    v = v + ftnref.OuterHtml;
                }
            }
            dic_title1Content.Add(pair.Key, v);
        }
        #endregion

        #region Update htmlTxt: remove the level-1 titles and the body text directly under them
        for (int i = 0; i < title1Nodes.Count; i++)
        {
            if (dic_title1Content_tmp.Count != 0) // some level-1 titles have body text directly under them
            {
                foreach (var pair in dic_title1Content_tmp)
                {
                    htmlTxt = htmlTxt.Replace(pair.Value, "");
                    if (i != pair.Key)
                    {
                        htmlTxt = htmlTxt.Replace(title1Nodes[i].OuterHtml, "");
                    }
                }
            }
            else // no level-1 title has body text directly under it
            {
                htmlTxt = htmlTxt.Replace(title1Nodes[i].OuterHtml, "");
            }
        }
        #endregion

        #region Extract the HTML under each level-2 title: nodes into contentNodes, text into str_contentList
        int index_PartStart = 0, index_PartEnd = 0;
        for (int i = 0; i < title2Nodes.Count; i++)
        {
            HtmlAgilityPack.HtmlDocument contentNodeDoc = new HtmlAgilityPack.HtmlDocument();
            string str_content;

            if (i < title2Nodes.Count - 1)
            {
                // Section = from this title up to the next level-2 title.
                index_PartStart = htmlTxt.IndexOf(title2Nodes[i].OuterHtml, index_PartStart + 1);
                index_PartEnd = htmlTxt.IndexOf(title2Nodes[i + 1].OuterHtml, index_PartStart + 1);
                if (index_PartStart != -1 && index_PartEnd > index_PartStart)
                {
                    str_content = htmlTxt.Substring(index_PartStart, index_PartEnd - index_PartStart);
                }
                else
                {
                    throw new InvalidOperationException("提取出错");
                }
            }
            else
            {
                // Last section runs to the end of the text.
                index_PartStart = htmlTxt.IndexOf(title2Nodes[title2Nodes.Count - 1].OuterHtml, index_PartStart + 1);
                if (index_PartStart != -1)
                {
                    str_content = htmlTxt.Substring(index_PartStart);
                }
                else
                {
                    throw new InvalidOperationException("提取出错");
                }
            }

            // Append every footnote div that the section links to.
            foreach (var ftnref in ftNoteRefnodes)
            {
                if (str_content.Contains("href=\"#_" + ftnref.Attributes["id"].Value + "\""))
                {
                    str_content = str_content + ftnref.OuterHtml;
                }
            }

            contentNodeDoc.LoadHtml(str_content);
            contentNodes.Add(contentNodeDoc.DocumentNode);
            str_contentList.Add(contentNodes[i].OuterHtml);

            // Each section is also dumped to a file for manual inspection.
            System.IO.File.WriteAllText(@"../../../htmlRcgTest/" + i + @".html", str_contentList[i]);
        }
        #endregion

        // Breakpoint location: inspect str_contentList / str_titleList / str_title1List /
        // str_title2List / dic_title1Content in the locals window and check that
        // 1. the counts are correct;
        // 2. the contents are correct and nothing is missing (the text under each level-2 title
        //    can be viewed in the files written to "../../../htmlRcgTest/" + i + ".html").
    }
    catch (Exception err)
    {
        Console.WriteLine(err.Message);
    }

    #region Write the level-1 and level-2 titles and their contents to the database
    try
    {
        SQLUtils sqlUtils = SQLUtils.getInstance();
        sqlUtils.makeConnect();

        for (int i = 0; i < title1Nodes.Count; i++)
        {
            // A level-1 title with body text directly under it becomes a content row;
            // otherwise it becomes a category row for the level-2 titles below it.
            string directContent;
            ConventionRow tempRow1 = dic_title1Content.TryGetValue(i, out directContent)
                ? new ConventionRow(rootConvention, str_title1List[i], i + 1, ConventionOptions.CATEGORY.IS_CONTENT, directContent)
                : new ConventionRow(rootConvention, str_title1List[i], i + 1, ConventionOptions.CATEGORY.IS_CATEGORY);
            sqlUtils.writeRow_local(tempRow1);
            retInfo.title1Guids.Add(tempRow1.Guid);

            // k numbers the level-2 rows within this level-1 title.
            for (int j = 0, k = 0; j < title2Nodes.Count; j++)
            {
                bool afterThisTitle = title2Nodes[j].Line > title1Nodes[i].Line;
                bool beforeNextTitle = i >= title1Nodes.Count - 1 || title2Nodes[j].Line < title1Nodes[i + 1].Line;
                if (afterThisTitle && beforeNextTitle)
                {
                    ConventionRow tempRow2 = new ConventionRow(tempRow1, str_title2List[j], ++k, ConventionOptions.CATEGORY.IS_CONTENT, str_contentList[j]);
                    sqlUtils.writeRow_local(tempRow2);
                }
            }
        }

        retInfo.title1s = str_title1List;
        retInfo.title2s = str_title2List;
        retInfo.title2Contents = str_contentList;
        retInfo.titles = str_titleList;
        retInfo.title1ContentsNum = dic_title1Content.Count;
    }
    catch (Exception err)
    {
        Console.WriteLine(err.Message);
        retInfo.errorInfo = "录入失败。错误原因:" + err.Message;
    }
    return retInfo;
    #endregion
}
/// <summary>
/// Scrapes the kill statistics of <paramref name="world"/>, posts a spawn for every monitored
/// creature that was killed (or killed a player) in the last day, and then stores the new
/// last-day totals and scrape time on the world.
/// </summary>
/// <param name="world">World to scrape; its last-day totals and scrape time are updated on success.</param>
/// <remarks>
/// Every failure is written to the console and ends the scrape early without updating the world.
/// </remarks>
private static async Task Scrape(World world)
{
    int statsadded = 0;

    HtmlNodeCollection htmlNodeCollectionFromWorld = await GetHtmlNodeCollectionFromWorld(world.Name);
    if (htmlNodeCollectionFromWorld == null)
    {
        Console.WriteLine($"{DateTime.UtcNow} UTC: {world.Name} - Error reading stats");
        return;
    }

    if (!IsThereAnyNewStats(world, htmlNodeCollectionFromWorld))
    {
        Console.WriteLine($"{DateTime.UtcNow} UTC: {world.Name} - No new stats found");
        return;
    }

    List<KillStats> killStats = HtmlNodeCollectionToKillStats(htmlNodeCollectionFromWorld);
    if (killStats == null)
    {
        Console.WriteLine($"{DateTime.UtcNow} UTC: {world.Name} - Error parsing stats");
        return;
    }

    await AddMissingCreatures(killStats);

    List<Creature> creatures = await RestService.GetCreaturesAsync();
    if (creatures == null)
    {
        Console.WriteLine($"{DateTime.UtcNow} UTC: {world.Name} - Error getting creatures");
        return;
    }

    // Spawn window: 02:00 UTC yesterday to 02:00 UTC today. Taken from a single clock reading so
    // that year, month and day cannot come from two different dates when midnight passes mid-scrape.
    DateTime spawnWindowEndUtc = DateTime.UtcNow.Date.AddHours(2);
    DateTime spawnWindowStartUtc = spawnWindowEndUtc.AddDays(-1);

    foreach (KillStats killStatsTableRow in killStats)
    {
        Creature creature = creatures.Single(x => x.Name == killStatsTableRow.Name);
        if (!creature.Monitored)
        {
            continue;
        }

        bool hadActivity = killStatsTableRow.LastDayKills > 0 || killStatsTableRow.LastDayDeaths > 0;
        if (!hadActivity)
        {
            continue;
        }

        List<Location> locations = await RestService.GetCreatureLocations(creature);
        if (locations == null)
        {
            Console.WriteLine($"{DateTime.UtcNow} UTC: {world.Name} - Error getting locations for {creature.Name}");
            return;
        }

        // One spawn per kill only when the creature has several locations; otherwise a single spawn.
        int amountOfKills = 1;
        if (killStatsTableRow.LastDayKills > 0 && locations.Count > 1)
        {
            amountOfKills = killStatsTableRow.LastDayKills;
        }

        for (int i = 0; i < amountOfKills; i++)
        {
            Spawn spawn = new Spawn
            {
                CreatureId = creature.Id,
                WorldId = world.Id,
                TimeMinUtc = spawnWindowStartUtc,
                TimeMaxUtc = spawnWindowEndUtc
            };
            statsadded++;

            if (await RestService.PostSpawnAsync(spawn) == false)
            {
                Console.WriteLine($"{DateTime.UtcNow} UTC: Error posting spawn: {creature.Name} on {world.Name}");
                return;
            }
        }
    }

    // The last row carries the totals: child 1 is stored as deaths, child 2 as kills.
    HtmlNode totalsRow = htmlNodeCollectionFromWorld.Last();
    world.LastDayDeaths = int.Parse(totalsRow.ChildNodes[1].InnerText.Replace(" ", ""));
    world.LastDayKills = int.Parse(totalsRow.ChildNodes[2].InnerText.Replace(" ", ""));
    world.LastScrapeTime = DateTime.UtcNow;

    if (await RestService.PutWorldAsync(world))
    {
        Console.WriteLine($"{DateTime.UtcNow} UTC: {world.Name}: {statsadded} kills added");
    }
}
/// <summary>
/// Tells whether the totals in the last row of the scraped table differ from the totals stored
/// on <paramref name="world"/>.
/// </summary>
/// <param name="world">World holding the previously stored last-day deaths and kills.</param>
/// <param name="htmlNodeCollection">Scraped table rows; the last row carries the totals.</param>
/// <returns><c>true</c> when either the deaths or the kills total has changed.</returns>
private static bool IsThereAnyNewStats(World world, HtmlNodeCollection htmlNodeCollection)
{
    // Child 1 is compared with the stored deaths first; child 2 (kills) is only read if deaths match.
    string scrapedDeaths = htmlNodeCollection.Last().ChildNodes[1].InnerText.Replace(" ", "").Trim();
    if (scrapedDeaths != world.LastDayDeaths.ToString())
    {
        return true;
    }

    string scrapedKills = htmlNodeCollection.Last().ChildNodes[2].InnerText.Replace(" ", "").Trim();
    return scrapedKills != world.LastDayKills.ToString();
}
/// <summary>
/// Downloads the listing page at <paramref name="urlToScrape"/>, collects the profile links found
/// under the element with id <c>homepage_right</c>, then fetches each profile and writes its
/// details to <c>Profiles/&lt;name&gt;/profile.txt</c> below <c>AppPath</c>.
/// </summary>
/// <param name="urlToScrape">Address of the listing page.</param>
/// <returns>Always <c>true</c>; failures are reported through <c>LogError</c>, not the return value.</returns>
/// <remarks>
/// Processing stops at the first profile that fails. An existing directory for a profile is
/// deleted and recreated, so the scraped name is validated before it is used as a directory name.
/// </remarks>
private bool ScrapeTopEscortBabes(string urlToScrape)
{
    // Flattens node text to a single trimmed line.
    string CleanText(string raw)
    {
        return raw.Replace("\n", "").Replace("\r", "").Trim().Replace(" ", " ");
    }

    // Turns a scraped name into a safe single directory name. The directory is recursively
    // deleted below, so an empty name or a name made only of dots (".", "..") is rejected
    // instead of resolving to the Profiles folder or its parent.
    string ToSafeDirectoryName(string rawName)
    {
        string cleaned = rawName;
        foreach (char invalid in Path.GetInvalidFileNameChars())
        {
            cleaned = cleaned.Replace(invalid, '_');
        }
        cleaned = cleaned.Trim();
        if (cleaned.Length == 0 || cleaned.Trim('.').Length == 0)
        {
            throw new InvalidOperationException("Profile name cannot be used as a directory name: '" + rawName + "'");
        }
        return cleaned;
    }

    try
    {
        List<string> profileUrls = new List<string>();
        using WebClient client = new WebClient();
        client.Headers.Add("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0");
        client.Headers.Add(HttpRequestHeader.Cookie, "plus18=1");
        string htmlCode = client.DownloadString(urlToScrape);
        var doc = new HtmlDocument();
        doc.LoadHtml(htmlCode);

        // GetElementbyId and SelectNodes both return null when nothing matches.
        HtmlNode parentNode = doc.GetElementbyId("homepage_right");
        HtmlNodeCollection itemParentsNodes = parentNode?.SelectNodes(".//div[contains(@class, 'items')]");
        int itemParentCount = itemParentsNodes?.Count ?? 0;
        if (itemParentCount != 2)
        {
            LogError($"Invalid parent item count, expected 2 got {itemParentCount}");
            return true;
        }

        // Collect the profile links from both item blocks.
        foreach (HtmlNode mainNode in itemParentsNodes)
        {
            HtmlNodeCollection profilesNodes = mainNode.SelectNodes(".//li");
            if (profilesNodes == null)
            {
                continue;
            }

            foreach (HtmlNode singleProfileNode in profilesNodes)
            {
                // List items without a usable link are skipped.
                HtmlNode hrefNode = singleProfileNode.SelectSingleNode(".//a[@href]");
                string hrefValue = hrefNode?.GetAttributeValue("href", string.Empty);
                if (string.IsNullOrWhiteSpace(hrefValue))
                {
                    continue;
                }
                profileUrls.Add(hrefValue);
            }
        }

        // Runs of two or more spaces separate a label from its value in the flattened text.
        Regex multipleSpaces = new Regex("[ ]{2,}", RegexOptions.None);

        foreach (string url in profileUrls)
        {
            try
            {
                string finalInfo = "";
                htmlCode = browser.Get(url, true, Properties.Resources.TopEscortBabes);
                Log(url);
                finalInfo += url + newLine;
                doc = new HtmlDocument();
                doc.LoadHtml(htmlCode);

                HtmlNode mainParentNode = doc.GetElementbyId("homepage");
                HtmlNode headerNode = mainParentNode.SelectSingleNode(".//div[@class='profile-cover']");
                HtmlNode titleNode = headerNode.SelectSingleNode(".//h2[@class='header-title']");
                string profileName = CleanText(titleNode.InnerText);
                finalInfo += "Name:" + profileName + newLine;

                // Affiliation is optional: any failure while reading it leaves the line out.
                try
                {
                    HtmlNode afiliationNode = doc.GetElementbyId("accord-agency");
                    if (afiliationNode == null)
                    {
                        finalInfo += "Affiliation:Independent" + newLine;
                    }
                    else
                    {
                        HtmlNode afiliationBodyNode = afiliationNode.SelectSingleNode(".//div");
                        HtmlNode afiliationNameNode = afiliationBodyNode.SelectSingleNode(".//h4");
                        if (afiliationNameNode == null)
                        {
                            finalInfo += "Affiliation:Independent" + newLine;
                        }
                        else
                        {
                            finalInfo += "Affiliation:" + CleanText(afiliationNameNode.InnerText) + newLine;
                        }
                    }
                }
                catch
                {
                    // Deliberately best-effort.
                }

                // Personal data: one "label:value" line per item.
                HtmlNode personalNode = doc.GetElementbyId("accord-personal-data");
                HtmlNode detailsNode = personalNode.SelectSingleNode(".//div[contains(@class, 'detail-block-body')]");
                HtmlNodeCollection detailLineNode = detailsNode.SelectNodes(".//div[@class='personal-data-item']");
                foreach (HtmlNode lineNode in detailLineNode)
                {
                    string infoLine = CleanText(lineNode.InnerText);
                    infoLine = multipleSpaces.Replace(infoLine, ":");
                    infoLine = infoLine.Replace(":•:", ",");
                    finalInfo += infoLine + newLine;
                }

                // Prices are optional: any failure while reading them is ignored.
                try
                {
                    HtmlNode priceNode = doc.GetElementbyId("prices");
                    HtmlNodeCollection priceListNode = priceNode.SelectNodes(".//div[@class='price-item']");
                    if (priceListNode != null)
                    {
                        finalInfo += "Prices:";
                        foreach (HtmlNode lineNode in priceListNode)
                        {
                            string infoLine = CleanText(lineNode.InnerText);
                            infoLine = multipleSpaces.Replace(infoLine, ":");
                            infoLine = "-" + infoLine.Replace(" Price:", "-").Replace(":", " ") + "-";
                            finalInfo += infoLine;
                            if (lineNode != priceListNode.Last())
                            {
                                finalInfo += " | ";
                            }
                        }
                        finalInfo += newLine;
                    }
                }
                catch
                {
                    // Deliberately best-effort.
                }

                finalInfo = finalInfo.Trim();

                string profilesPath = Path.Combine(AppPath, "Profiles");
                if (!Directory.Exists(profilesPath))
                {
                    Directory.CreateDirectory(profilesPath);
                }

                // Start each profile from an empty directory.
                string profilePath = Path.Combine(profilesPath, ToSafeDirectoryName(profileName));
                if (Directory.Exists(profilePath))
                {
                    Directory.Delete(profilePath, true);
                }
                Directory.CreateDirectory(profilePath);

                string profileTxtPath = Path.Combine(profilePath, "profile.txt");
                using (StreamWriter sw = new StreamWriter(profileTxtPath))
                {
                    sw.WriteLine(finalInfo);
                }
                Log(profilePath);
            }
            catch (ThreadAbortException)
            {
                break;
            }
            catch (Exception ex)
            {
                // The first failing profile ends the run.
                LogError("Url invalid: " + url + ". Error: " + ex.ToString());
                break;
            }
        }
    }
    catch (ThreadAbortException)
    {
        // Nothing to do: the scrape was stopped.
    }
    catch (Exception ex)
    {
        LogError(ex.ToString());
    }
    finally
    {
        // NOTE(review): forced collection kept from the original; the reason is not visible here.
        GC.Collect();
        GC.WaitForPendingFinalizers();
    }
    return true;
}
/// <summary>
/// Scrapes a listing page: collects the profile links found under the single
/// <c>list-items</c> container, then for every profile writes a <c>profile.txt</c>
/// summary under <c>AppPath/Profiles/&lt;name&gt;</c> and downloads the main image
/// (into <c>AppPath/Images</c>) and the gallery images (into the profile folder).
/// </summary>
/// <param name="urlToScrape">Absolute URL of the listing page; its host is used to build the profile URLs.</param>
/// <returns>Always <c>true</c>; failures are reported through <c>LogError</c>.</returns>
private bool ScrapeEuroGirlsEscort(string urlToScrape)
{
    try
    {
        // Set before the first request so the listing download uses the same protocol as the image downloads.
        ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls12;

        using WebClient client = new WebClient();
        client.Headers.Add("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0");
        client.Headers.Add(HttpRequestHeader.Cookie, "over18=1");

        string htmlCode = client.DownloadString(urlToScrape);
        var doc = new HtmlDocument();
        doc.LoadHtml(htmlCode);

        // SelectNodes returns null (not an empty collection) when nothing matches.
        HtmlNodeCollection parentItem = doc.DocumentNode.SelectNodes("//div[contains(@class, 'list-items')]");
        int parentCount = parentItem == null ? 0 : parentItem.Count;
        if (parentCount != 1)
        {
            LogError($"Invalid parent item count, expected 1 got {parentCount}");
            return(true);
        }

        // ".//div" also returns nested divs, so the same anchor is reached through several
        // ancestors; the set keeps each profile from being scraped more than once.
        string host = new Uri(urlToScrape).Host;
        List <string> profileUrls = new List <string>();
        HashSet <string> seenUrls = new HashSet <string>();
        HtmlNodeCollection childDivs = parentItem[0].SelectNodes(".//div");
        if (childDivs != null)
        {
            foreach (HtmlNode childDiv in childDivs)
            {
                HtmlNodeCollection profileURLNodes = childDiv.SelectNodes(".//a[@href]");
                if (profileURLNodes == null)
                {
                    continue;
                }

                foreach (HtmlNode profileNode in profileURLNodes)
                {
                    string profileUrl = "https://" + host + profileNode.GetAttributeValue("href", string.Empty);
                    if (seenUrls.Add(profileUrl))
                    {
                        profileUrls.Add(profileUrl);
                    }
                }
            }
        }

        foreach (string url in profileUrls)
        {
            try
            {
                string finalInfo = url + newLine;
                htmlCode = browser.Get(url, true, Properties.Resources.EuroGirlsEscort);
                doc = new HtmlDocument();
                doc.LoadHtml(htmlCode);

                // The container, description and heading are mandatory: if one is missing the
                // resulting exception is logged below as an invalid URL.
                HtmlNode parentProfile = doc.GetElementbyId("main-content");
                HtmlNode descriptionNode = parentProfile.SelectSingleNode(".//div[@class='description']");
                HtmlNode nameNode = descriptionNode.SelectSingleNode(".//h1");
                string nameStr = ReplaceNonBreakingSpaces(nameNode.InnerText.Replace("\n", "").Replace("\r", "").Trim());
                if (nameStr.Contains(","))
                {
                    string[] nameParts = nameStr.Split(',');
                    finalInfo += "Name:" + nameParts[0] + newLine;
                    finalInfo += "Affiliation:" + nameParts[1] + newLine;
                }
                else
                {
                    finalInfo += "Name:" + nameStr + newLine;
                }

                // Main image; DownloadFile does not create the target folder itself.
                HtmlNode profileParent = parentProfile.SelectSingleNode(".//a[contains(@class, 'js-gallery')]");
                string imageURL = profileParent.GetAttributeValue("href", string.Empty);
                string imagesPath = Path.Combine(AppPath, "Images");
                Directory.CreateDirectory(imagesPath);
                client.DownloadFile(new Uri(imageURL), Path.Combine(imagesPath, imageURL.Split('/').Last()));

                HtmlNode infoNodes = parentProfile.SelectSingleNode(".//div[@class='params']");
                HtmlNodeCollection singleInfoNodes = infoNodes == null ? null : infoNodes.SelectNodes(".//div");
                if (singleInfoNodes != null)
                {
                    foreach (HtmlNode lineNode in singleInfoNodes)
                    {
                        string lineInfo = ReplaceNonBreakingSpaces(lineNode.InnerText.Replace("\n", "").Replace("\r", "").Trim());
                        finalInfo += lineInfo + newLine;
                    }
                }

                HtmlNode phoneNode = parentProfile.SelectSingleNode(".//a[contains(@class, 'js-phone')]");
                if (phoneNode != null)
                {
                    finalInfo += "Phone number:" + ReplaceNonBreakingSpaces(phoneNode.InnerText) + newLine;
                }

                // Rates are optional. The label is only written once the rows are known to exist,
                // so a missing table can no longer leave a dangling "Rates:" without a newline.
                HtmlNode ratesNodes = parentProfile.SelectSingleNode(".//div[@class='rates']");
                HtmlNode ratesTableNodes = ratesNodes == null ? null : ratesNodes.SelectSingleNode(".//tbody");
                HtmlNodeCollection ratesLines = ratesTableNodes == null ? null : ratesTableNodes.SelectNodes(".//tr");
                if (ratesLines != null)
                {
                    List <string> rates = new List <string>();
                    foreach (HtmlNode rateLine in ratesLines)
                    {
                        string rate = ReplaceNonBreakingSpaces(rateLine.InnerText).Replace("\n", "-").Replace("\r", "-").Trim();
                        rates.Add(rate.Replace("--", "-").Replace("--", "-"));
                    }

                    finalInfo += "Rates:" + string.Join(" | ", rates) + newLine;
                }

                // Services are optional as well; rows are read from the whole block, as before.
                HtmlNode servicesNodes = parentProfile.SelectSingleNode(".//div[@class='services']");
                HtmlNodeCollection servicesLines = servicesNodes == null ? null : servicesNodes.SelectNodes(".//tr");
                if (servicesLines != null)
                {
                    List <string> services = new List <string>();
                    foreach (HtmlNode serviceLine in servicesLines)
                    {
                        string service = ReplaceNonBreakingSpaces(serviceLine.InnerText).Replace("\n", "-").Replace("\r", "-").Trim();
                        service = service.Replace("--", "-").Replace("--", "-").Replace("--", "-");
                        if (service == "-Services-Included-Extra-")
                        {
                            // Table header row, not a service.
                            continue;
                        }

                        services.Add(service);
                    }

                    // Joining afterwards avoids a trailing separator when the skipped header is the last row.
                    finalInfo += "Services:" + string.Join(" | ", services) + newLine;
                }

                finalInfo = finalInfo.Trim();

                string profilesPath = Path.Combine(AppPath, "Profiles");
                Directory.CreateDirectory(profilesPath);

                // The scraped name becomes a folder name, so characters the file system rejects are replaced.
                string safeName = string.Join("_", nameStr.Split(Path.GetInvalidFileNameChars()));
                string ProfilePath = Path.Combine(profilesPath, safeName);
                if (Directory.Exists(ProfilePath))
                {
                    Directory.Delete(ProfilePath, true);
                }
                Directory.CreateDirectory(ProfilePath);

                string profileTxtPath = Path.Combine(ProfilePath, "profile.txt");
                using (StreamWriter sw = new StreamWriter(profileTxtPath))
                {
                    sw.WriteLine(finalInfo);
                }

                HtmlNode imgNode = doc.GetElementbyId("js-gallery");
                HtmlNodeCollection imgsNodes = imgNode == null ? null : imgNode.SelectNodes(".//a[@class='js-gallery']");
                if (imgsNodes != null)
                {
                    int i = 0;
                    foreach (HtmlNode imgElm in imgsNodes)
                    {
                        string imgURL = imgElm.GetAttributeValue("href", string.Empty);
                        client.DownloadFile(imgURL, Path.Combine(ProfilePath, $"{i}.jpg"));
                        i++;
                    }
                }
            }
            catch (ThreadAbortException)
            {
                break;
            }
            catch (Exception ex)
            {
                // The first failing profile stops the run (unchanged behaviour).
                LogError("Url invalid: " + url + ". Error: " + ex.ToString());
                break;
            }
        }
    }
    catch (ThreadAbortException)
    {
        //nothing, stopped
    }
    catch (Exception ex)
    {
        LogError(ex.ToString());
    }
    finally
    {
        GC.Collect();
        GC.WaitForPendingFinalizers();
    }
    return(true);
}

/// <summary>
/// Replaces non-breaking spaces, in both the <c>&amp;nbsp;</c> entity form and the
/// U+00A0 character form, with ordinary spaces.
/// </summary>
private static string ReplaceNonBreakingSpaces(string text)
{
    return text.Replace("&nbsp;", " ").Replace('\u00A0', ' ');
}
/// <summary>
/// Downloads the images of the gallery at <c>dInfo.URL</c> into a folder named after the
/// gallery title, reporting status, title and progress through <c>m_SyncContext</c>.
/// Stops at the first failing image and leaves <c>dInfo.Current</c> at the next image to fetch.
/// </summary>
/// <param name="obj">Not used by this method.</param>
public void Run(object obj)
{
    if (IsCancel)
    {
        return;
    }

    try
    {
        #region Run
        // Set status
        m_SyncContext.Post(SetStatus, "running");

        html = helper.GetHTML(dInfo.URL);
        HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
        document.LoadHtml(html);
        HtmlNode rootNode = document.DocumentNode;

        // Prefer the "gj" title and fall back to "gn" when it is missing or empty.
        HtmlNode tt = rootNode.SelectSingleNode("//*[@id=\"gn\"]");
        HtmlNode titleNode = rootNode.SelectSingleNode("//*[@id=\"gj\"]");
        string primaryTitle = titleNode == null ? string.Empty : titleNode.InnerText;
        string title = string.IsNullOrEmpty(primaryTitle) ? tt.InnerText : primaryTitle;
        title = title.Replace(":", "").Replace("?", "").Replace("*", "");

        SavePath = filePath + "\\" + RelpaceFileName(title);
        if (!Directory.Exists(SavePath))
        {
            Directory.CreateDirectory(SavePath);
        }

        // Set title
        m_SyncContext.Post(SetTitle, title);
        dInfo.Title = primaryTitle;

        // Collect the thumbnail nodes of every gallery page. Pages are walked iteratively and
        // the end of pagination is detected by the last pager cell having no link, rather than
        // by comparing its markup to ">" (InnerHtml is entity-encoded, so that never matched).
        List <HtmlNode> imglist = new List <HtmlNode>();
        HashSet <string> visitedPages = new HashSet <string>();
        HtmlNode pageRoot = rootNode;
        while (true)
        {
            imglist.AddRange(pageRoot.SelectNodes("//*[@id=\"gdt\"]/div"));

            HtmlNodeCollection pageList = pageRoot.SelectNodes("//*[@class=\"ptb\"]/*/td");
            HtmlNode last = pageList == null ? null : pageList.LastOrDefault();
            HtmlNode nextLink = last == null ? null : last.SelectSingleNode("./a[@href]");
            if (nextLink == null)
            {
                break;
            }

            string purl = nextLink.Attributes["href"].Value;
            if (!visitedPages.Add(purl))
            {
                // Same page offered twice: stop instead of looping forever.
                break;
            }

            HtmlAgilityPack.HtmlDocument nextDocument = new HtmlAgilityPack.HtmlDocument();
            nextDocument.LoadHtml(helper.GetHTML(purl));
            pageRoot = nextDocument.DocumentNode;
        }

        dInfo.CNT = imglist.Count;

        // A fresh download starts with Current == 0; when resuming, a non-zero Current is kept.
        if (dInfo.Current == 0)
        {
            dInfo.Current = 1;
        }

        int i = 1;
        foreach (HtmlNode node in imglist)
        {
            try
            {
                #region image
                // Set status
                m_SyncContext.Post(SetProgress, dInfo.Current + "/" + dInfo.CNT);

                // Only "gdtm" nodes are thumbnails; images before Current were already downloaded.
                if (node.Attributes["class"].Value != "gdtm" || i < dInfo.Current)
                {
                    i++;
                    continue;
                }

                // Thumbnail link path: //*[@id="gdt"]/div[1]/div/a
                string imgUrl = node.ChildNodes[0].FirstChild.Attributes[0].Value;
                string DetailHTML = helper.GetHTML(imgUrl);
                HtmlAgilityPack.HtmlDocument imgDoc = new HtmlAgilityPack.HtmlDocument();
                imgDoc.LoadHtml(DetailHTML);
                string iurl = imgDoc.DocumentNode.SelectSingleNode("//*[@id=\"img\"]").Attributes["src"].Value;

                // The last four characters of the URL are used as the file extension.
                string hz = iurl.Substring(iurl.Length - 4, 4);
                string targetFile = SavePath + "\\" + i.ToString("000") + hz;

                helper.DownLoadImage(iurl, targetFile);
                FileInfo info = new FileInfo(targetFile);
                // NOTE(review): 28658 bytes is presumably the size of the "limit reached"
                // placeholder image - confirm.
                if (info.Length == 28658)
                {
                    m_SyncContext.Post(SetStatus, "Limit Error");
                    info.Delete();
                    return;
                }

                i++;
                dInfo.Current = i;
                #endregion
            }
            catch (Exception e)
            {
                Log(dInfo.URL, e.Message);
                m_SyncContext.Post(SetStatus, "Error");
                return;
            }
        }

        // Set status
        m_SyncContext.Post(SetStatus, "finished");
        #endregion
    }
    catch (Exception e)
    {
        // Set status
        m_SyncContext.Post(SetStatus, "Error");
        Log(dInfo.URL, e.Message);
        Log(dInfo.URL, html);
    }
}
/// <summary>
/// Parses <c>HtmlBody</c> and fills <c>ContactName</c>, <c>CompanyName</c>, <c>Body</c>,
/// <c>FeedbackId</c> and <c>SentToOriginal</c>. Any exception raised while parsing is
/// recorded in <c>Errors["HtmlBody"]</c> instead of being propagated.
/// </summary>
public void SetProperties()
{
    try
    {
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(this.HtmlBody);

        // The message is a fixed layout of nested tables: the 4th innermost table carries
        // the contact and company cells, the 6th carries the message body.
        HtmlNodeCollection nodesParent = doc.DocumentNode.SelectNodes("//body/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr/td/table");
        HtmlNodeCollection nodes = nodesParent[3].SelectNodes(".//tbody/tr/td");
        this.ContactName = Utilities.WebExtension.DecodeHtml(nodes[1].InnerText.Trim());
        this.CompanyName = Utilities.WebExtension.DecodeHtml(nodes[2].InnerText.Trim());

        HtmlNode node = nodesParent[5].SelectSingleNode(".//tbody/tr/td");
        this.Body = Utilities.WebExtension.DecodeHtml(node.InnerText.Trim());

        // The last <i> under <body> holds a JSON payload whose quotes are entity-encoded.
        nodesParent = doc.DocumentNode.SelectNodes("//body/i");
        if (nodesParent != null)
        {
            node = nodesParent.Last <HtmlNode>();
            String feedbackid = node == null
                ? String.Empty
                : node.InnerText.Trim().Replace("==", "").Replace("&quot;", "\"");
            JObject json = (JObject)JsonConvert.DeserializeObject(feedbackid);
            if (json != null)
            {
                this.FeedbackId = json["feedbackid"].ToString().Trim();
            }
        }

        // The last <img> under <body> has the original recipient in its query string.
        nodes = doc.DocumentNode.SelectNodes("//body/img");
        if (nodes != null)
        {
            String src = nodes.Last <HtmlNode>().GetAttributeValue("src", "").Replace("&amp;", "&");
            String[] urlParts = src.Split('?');
            if (urlParts.Length > 1)
            {
                foreach (String item in urlParts[1].Split('&'))
                {
                    String[] pair = item.Split('=');
                    if (pair.Length != 2)
                    {
                        continue;
                    }

                    switch (pair[0])
                    {
                    case "to":
                        this.SentToOriginal = pair[1];
                        break;

                    // Recognised but currently unused keys.
                    case "crm_mtn_tracelog_log_id":
                    case "crm_mtn_tracelog_task_id":
                    case "from":
                    case "from_sys":
                    case "biz_type":
                    case "template":
                        break;
                    }
                }
            }
        }
    }
    catch (System.Exception ex)
    {
        this.Errors["HtmlBody"] = ex.Message;
    }
}
/*
 * private void ParseMatch(ChromiumWebBrowser browser)
 * {
 *     if (!browser.IsBrowserInitialized || browser.IsLoading)
 *         return;
 *
 *     var task = browser.GetSourceAsync();
 *     task.Wait(2000);
 *     if (!task.IsCompleted) return;
 *     string html = task.Result;
 *
 *     HtmlDocument doc = new HtmlDocument();
 *     doc.LoadHtml(html);
 *
 *     ParseMatchPageHtml(doc, browser.Address);
 * }*/

/// <summary>
/// Extracts the bets offered on a match page and merges them into <c>BetList</c>:
/// a bet equal to an existing entry updates that entry's odds, any other bet is added.
/// Handles 1X2, double chance, totals, Asian and score handicaps, and draw-no-bet markets.
/// </summary>
/// <param name="doc">Parsed HTML of the match page.</param>
/// <param name="url">Address of the match page, stored on every created bet.</param>
public override void ParseMatchPageHtml(HtmlDocument doc, string url)
{
    MatchName matchName = GetFullMatchName(doc);
    if (matchName == null)
    {
        return;
    }

    Sport sport = GetSport(doc);
    if (sport == Sport.NotSupported)
    {
        return;
    }

    string BetUrl = url;
    HtmlNodeCollection maindocument = doc.DocumentNode.SelectNodes("//li[@class='groupedListItem']");
    if (maindocument == null)
    {
        return;
    }

    foreach (var node in maindocument)
    {
        try
        {
            // Each market is re-parsed as its own document so the XPath queries stay local to it.
            HtmlDocument document = new HtmlDocument();
            document.LoadHtml(node.InnerHtml);
            string Way = document.DocumentNode.SelectNodes("//table[@class]").First().Attributes["class"].Value;
            string maintype = document.DocumentNode.SelectNodes("//h3").First().InnerText;
            HtmlNodeCollection betsNodes = document.DocumentNode.SelectNodes("//a[@id]");
            if (betsNodes == null)
            {
                continue;
            }

            Team team = GetTeam(maintype);
            Time time = GetTime(maintype); // TODO: implement the time

            foreach (var node2 in betsNodes)
            {
                // Reset per bet so an unrecognised outcome cannot re-apply the previous bet.
                Bet result = null;

                string value = node2.InnerHtml;
                if (!value.Contains("class=\"oddHolder\""))
                {
                    continue;
                }

                HtmlDocument document2 = new HtmlDocument();
                document2.LoadHtml(value);
                HtmlNodeCollection test = document2.DocumentNode.SelectNodes("//div");
                if (test == null)
                {
                    continue;
                }

                // NOTE(review): this reads the id of the market item (node), not of the bet
                // link (node2) - confirm that is intended.
                HtmlAttribute idAttribude = node.Attributes["id"];
                if (idAttribude == null)
                {
                    continue;
                }

                JavaSelectCode = "(function() { var element = document.evaluate( '" + node2.XPath + "' ,document, null, XPathResult.ANY_TYPE, null ).singleNodeValue; element.click(); })();";
                string type = test.First().InnerText;
                string coeff = test.Last().InnerText;

                // Odds use '.' as the decimal separator; parse them independently of the machine culture.
                double Probability;
                if (!TryParseDecimalInvariant(coeff, out Probability))
                {
                    continue;
                }

                if (maintype.Contains("1X2"))
                {
                    if (Way == "list cell2")
                    {
                        if (type == "1")
                        {
                            result = new ResultBet(ResultBetType.P1, time, Probability, matchName, BetUrl, JavaSelectCode, sport, Maker);
                        }
                        if (type == "2")
                        {
                            result = new ResultBet(ResultBetType.P2, time, Probability, matchName, BetUrl, JavaSelectCode, sport, Maker);
                        }
                    }
                    if (Way == "list cell3")
                    {
                        if (type == "1")
                        {
                            result = new ResultBet(ResultBetType.First, time, Probability, matchName, BetUrl, JavaSelectCode, sport, Maker);
                        }
                        if (type == "x" || type == "X")
                        {
                            result = new ResultBet(ResultBetType.Draw, time, Probability, matchName, BetUrl, JavaSelectCode, sport, Maker);
                        }
                        if (type == "2")
                        {
                            result = new ResultBet(ResultBetType.Second, time, Probability, matchName, BetUrl, JavaSelectCode, sport, Maker);
                        }
                    }
                }

                // Open question: use full time or regular time here? (Inc All OT)
                if (maintype.Contains("Double Chance") || maintype.Contains("Double chance"))
                {
                    if (type == "1x" || type == "1X")
                    {
                        result = new ResultBet(ResultBetType.FirstOrDraw, time, Probability, matchName, BetUrl, JavaSelectCode, sport, Maker);
                    }
                    if (type == "12")
                    {
                        result = new ResultBet(ResultBetType.FirstOrSecond, time, Probability, matchName, BetUrl, JavaSelectCode, sport, Maker);
                    }
                    if (type == "x2" || type == "X2")
                    {
                        result = new ResultBet(ResultBetType.SecondOrDraw, time, Probability, matchName, BetUrl, JavaSelectCode, sport, Maker);
                    }
                }

                // importantly!!!!
                // You have to see it.
                if ((maintype.Contains("Total") || maintype.Contains("total")) && (!maintype.Contains("aggregated") && !maintype.Contains("Totals")))
                {
                    // Basketball totals are only taken from markets that include overtime.
                    // TODO: verify the exact spelling (letter case) of "including overtime".
                    if (sport == Sport.Basketball && (!maintype.Contains("including overtime") && !maintype.Contains("Including overtime") && !maintype.Contains("including Overtime")))
                    {
                        continue;
                    }

                    double param;
                    if (type.Contains("Under"))
                    {
                        string[] parts = type.Split(new string[] { "Under (", ")" }, StringSplitOptions.RemoveEmptyEntries);
                        if (parts.Length > 0 && TryParseDecimalInvariant(parts[0], out param))
                        {
                            result = new TotalBet(TotalBetType.Under, param, time, team, Probability, matchName, BetUrl, JavaSelectCode, sport, Maker);
                        }
                    }
                    if (type.Contains("Over"))
                    {
                        // Guarded like "Under": a malformed line no longer aborts the whole market.
                        string[] parts = type.Split(new string[] { "Over (", ")" }, StringSplitOptions.RemoveEmptyEntries);
                        if (parts.Length > 0 && TryParseDecimalInvariant(parts[0], out param))
                        {
                            result = new TotalBet(TotalBetType.Over, param, time, team, Probability, matchName, BetUrl, JavaSelectCode, sport, Maker);
                        }
                    }
                }

                if ((maintype.Contains("Handicap") || maintype.Contains("handicap")) && maintype.Contains("Asian"))
                {
                    // TODO: verify the exact spelling (letter case) of "including overtime".
                    // NOTE(review): the first literal appears to contain Cyrillic letters - confirm.
                    if (sport == Sport.Basketball && (!maintype.Contains("inc. ОТ") && !maintype.Contains("Inc. OT") && !maintype.Contains("INC. OT") && !maintype.Contains("including overtime")))
                    {
                        continue;
                    }

                    // The outcome looks like "<team> (<handicap>)".
                    string first_or_second_team = type.Split(new string[] { " (" }, StringSplitOptions.RemoveEmptyEntries)[0];
                    if (first_or_second_team == "1" || first_or_second_team == "2")
                    {
                        string[] parts = type.Split(new string[] { "(", ")" }, StringSplitOptions.RemoveEmptyEntries);
                        double param;
                        if (parts.Length > 1 && TryParseDecimalInvariant(parts[1], out param))
                        {
                            HandicapBetType side = first_or_second_team == "1" ? HandicapBetType.F1 : HandicapBetType.F2;
                            result = new HandicapBet(side, param, time, Probability, matchName, BetUrl, JavaSelectCode, sport, Maker);
                        }
                    }
                }
                else if (maintype.Contains("Handicap") && !maintype.Contains("Asian"))
                {
                    // TODO: when a match is available, check how "INC.OT" is written here.
                    // The outcome looks like "<team> (<a>:<b>)": a head start expressed as a score.
                    string first_or_second_team = type.Split(new string[] { " (" }, StringSplitOptions.RemoveEmptyEntries)[0];
                    if (first_or_second_team == "1" || first_or_second_team == "2")
                    {
                        string[] parts = type.Split(new string[] { "(", ")" }, StringSplitOptions.RemoveEmptyEntries);
                        string[] score = parts.Length > 1
                            ? parts[1].Split(new string[] { ":" }, StringSplitOptions.RemoveEmptyEntries)
                            : new string[0];
                        int first_number;
                        int second_number;
                        if (score.Length < 2
                            || !int.TryParse(score[0], System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out first_number)
                            || !int.TryParse(score[1], System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out second_number))
                        {
                            continue;
                        }

                        // The team's own head start gives +n - 0.5, the opponent's gives -n - 0.5;
                        // when both are non-zero the second number wins, as before.
                        double param = 0;
                        if (first_or_second_team == "1")
                        {
                            if (first_number != 0)
                            {
                                param = first_number - 0.5;
                            }
                            if (second_number != 0)
                            {
                                param = (-1) * (second_number) - 0.5;
                            }
                            result = new HandicapBet(HandicapBetType.F1, param, time, Probability, matchName, BetUrl, JavaSelectCode, sport, Maker);
                        }
                        else
                        {
                            if (first_number != 0)
                            {
                                param = (-1) * first_number - 0.5;
                            }
                            if (second_number != 0)
                            {
                                param = second_number - 0.5;
                            }
                            result = new HandicapBet(HandicapBetType.F2, param, time, Probability, matchName, BetUrl, JavaSelectCode, sport, Maker);
                        }
                    }
                }
                else if (maintype.Contains("Draw No Bet"))
                {
                    if (sport == Sport.Basketball && (!maintype.Contains("inc. ОТ") && !maintype.Contains("Inc. OT") && !maintype.Contains("INC. OT") && !maintype.Contains("including overtime")))
                    {
                        continue;
                    }

                    // Draw-no-bet is recorded as a handicap of 0.
                    double param = 0;
                    if (type.Contains("1"))
                    {
                        result = new HandicapBet(HandicapBetType.F1, param, time, Probability, matchName, BetUrl, JavaSelectCode, sport, Maker);
                    }
                    else if (type.Contains("2"))
                    {
                        result = new HandicapBet(HandicapBetType.F2, param, time, Probability, matchName, BetUrl, JavaSelectCode, sport, Maker);
                    }
                }

                if (result != null)
                {
                    int index = BetList.IndexOf(result);
                    if (index != -1)
                    {
                        BetList[index].ChangeOdds(result.Odds);
                    }
                    else
                    {
                        BetList.Add(result);
                    }
                }
            }
        }
        catch (Exception e)
        {
            Console.Write(e.Message);
        }
    }

    System.Threading.Thread.Sleep(50);
}

/// <summary>
/// Parses a decimal number written with either '.' or ',' as the decimal separator,
/// independently of the current thread culture.
/// </summary>
private static bool TryParseDecimalInvariant(string text, out double value)
{
    value = 0;
    if (text == null)
    {
        return false;
    }

    return double.TryParse(
        text.Trim().Replace(',', '.'),
        System.Globalization.NumberStyles.Float,
        System.Globalization.CultureInfo.InvariantCulture,
        out value);
}
/// <summary>
/// For every champion/role pair, downloads the build page, reads the starter and
/// full-build items from the "items" element and writes an item-set JSON file under
/// <c>path\&lt;champion&gt;\Recommended\</c>. A failure for one pair is printed to the
/// console and the next pair is processed.
/// </summary>
/// <param name="champ_list">Champions, each with the roles to generate a file for.</param>
private static void GetBuilds(List <Champion> champ_list)
{
    // Item ids written into the "Consumables" block of every file, in this order.
    string[] consumableIds = { "2003", "2004", "2055", "2031", "2032", "2033", "2138", "2140", "2139" };

    // Writes one item entry; every entry except the last of its block ends with a comma.
    void AppendItem(StringBuilder target, string id, bool isLast)
    {
        target.AppendLine("\t\t\t\t{");
        target.AppendLine("\t\t\t\t\t\"id\": \"" + id + "\",");
        target.AppendLine("\t\t\t\t\t\"count\": 1");
        target.AppendLine(isLast ? "\t\t\t\t}" : "\t\t\t\t},");
    }

    for (int i = 0; i < champ_list.Count; i++)
    {
        for (int j = 0; j < champ_list[i].Roles.Count; j++)
        {
            var championName = champ_list[i].Name;
            var role = champ_list[i].Roles[j];

            string url = "http://koreanbuilds.net/champion/" + championName.Replace(" & ", "%26") + "/" + role + "/" + patch + "/enc/NA";
            Console.WriteLine(url);

            try
            {
                // Download the page.
                string html;
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream stream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(stream))
                {
                    html = reader.ReadToEnd();
                }

                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(html);

                // First child div of "items" holds the full build, the second the starter items.
                HtmlNodeCollection itemGroups = doc.GetElementbyId("items").SelectNodes("div");
                HtmlNodeCollection full_items = itemGroups[0].SelectNodes("div");
                HtmlNodeCollection starter_items = itemGroups[1].SelectNodes("div");
                HtmlNode finalFullItem = full_items.Last();
                HtmlNode finalStarterItem = starter_items.Last();

                StringBuilder json = new StringBuilder();
                json.AppendLine("{");
                json.AppendLine("\t\"map\": \"any\",");
                json.AppendLine("\t\"blocks\": [");

                // Block 1: starter items.
                json.AppendLine("\t\t{");
                json.AppendLine("\t\t\t\"items\": [");
                foreach (HtmlNode item in starter_items)
                {
                    string imgHtml = item.SelectNodes("img").First().OuterHtml;
                    AppendItem(json, GetItemId(imgHtml), item == finalStarterItem);
                }
                json.AppendLine("\t\t\t],");
                json.AppendLine("\t\t\t\"type\": \"Starter items\"");
                json.AppendLine("\t\t},");

                // Block 2: full build (ids 3040 and 3042 are written as 3003 and 3004).
                json.AppendLine("\t\t{");
                json.AppendLine("\t\t\t\"items\": [");
                foreach (HtmlNode item in full_items)
                {
                    string imgHtml = item.SelectNodes("img").First().OuterHtml;
                    string id = GetItemId(imgHtml).Replace("3040", "3003").Replace("3042", "3004");
                    AppendItem(json, id, item == finalFullItem);
                }
                json.AppendLine("\t\t\t],");
                json.AppendLine("\t\t\t\"type\": \"Full build\"");
                json.AppendLine("\t\t},");

                // Block 3: fixed list of consumables.
                json.AppendLine("\t\t{");
                json.AppendLine("\t\t\t\"items\": [");
                for (int k = 0; k < consumableIds.Length; k++)
                {
                    AppendItem(json, consumableIds[k], k == consumableIds.Length - 1);
                }
                json.AppendLine("\t\t\t],");
                json.AppendLine("\t\t\t\"type\": \"Consumables\"");
                json.AppendLine("\t\t}");

                // Item-set metadata.
                json.AppendLine("\t],");
                json.AppendLine("\t\"title\": \"KRB " + role + "\",");
                json.AppendLine("\t\"priority\": false,");
                json.AppendLine("\t\"mode\": \"any\",");
                json.AppendLine("\t\"type\": \"custom\",");
                json.AppendLine("\t\"sortrank\": 1,");
                json.AppendLine("\t\"champion\": \"" + championName + "\"");
                json.AppendLine("}");

                // The folder name is the champion name without spaces, apostrophes and dots,
                // with "Wukong" mapped to "MonkeyKing".
                string folderName = championName.Replace(" ", "").Replace("'", "").Replace(".", "").Replace("Wukong", "MonkeyKing");
                string recommendedDir = path + "\\" + folderName + "\\Recommended\\";
                if (!Directory.Exists(recommendedDir))
                {
                    Directory.CreateDirectory(recommendedDir);
                }

                File.WriteAllText(recommendedDir + championName + "_" + role + ".json", json.ToString());
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex);
            }
        }
    }
}
/// <summary>
/// Rebuilds <c>listThreads</c> from the rows of the
/// <c>threadbits_forum_&lt;boxId&gt;</c> table body of <c>doc</c>. Rows that do not have
/// the expected five-cell layout are skipped; if the table body is missing the list
/// is left empty.
/// </summary>
private void GetListThreads()
{
    listThreads = new List <Model.Thread> ();

    HtmlNode forumBody = doc.DocumentNode.SelectSingleNode("//tbody[@id='threadbits_forum_" + boxId + "']");
    HtmlNodeCollection listThreadNode = forumBody == null ? null : forumBody.SelectNodes("./tr");
    if (listThreadNode == null)
    {
        return;
    }

    foreach (HtmlNode nodeSingleThread in listThreadNode)
    {
        // A thread row has 5 <td> cells, optionally with an extra "alt2" cell in second position.
        HtmlNodeCollection nodesTd = nodeSingleThread.SelectNodes("./td");
        if (nodesTd == null || nodesTd.Count < 2)
        {
            continue;
        }

        if (nodesTd[1].GetAttributeValue("class", string.Empty) == "alt2")
        {
            nodesTd.RemoveAt(1);
        }

        if (nodesTd.Count != 5)
        {
            continue;
        }

        // 1st cell: its id attribute is a 20-character prefix followed by the thread id.
        string rawId = nodesTd[0].GetAttributeValue("id", string.Empty);
        if (rawId.Length < 20)
        {
            continue;
        }

        // 2nd cell: first div holds the title links, second div the creating user.
        HtmlNode node_second_td = nodesTd[1];
        HtmlNodeCollection nodeDiv = node_second_td.SelectNodes("./div");
        if (nodeDiv == null || nodeDiv.Count < 2)
        {
            continue;
        }

        Model.Thread thread = new Model.Thread();
        thread.id = rawId.Remove(0, 20);

        HtmlNodeCollection nodesInFirstDiv = nodeDiv[0].SelectNodes("./a");
        if (nodesInFirstDiv != null)
        {
            foreach (HtmlNode a in nodesInFirstDiv)
            {
                thread.title += a.InnerText + " ";
            }
        }
        thread.title = HtmlEntity.DeEntitize(thread.title);
        thread.creatingUser = HtmlEntity.DeEntitize(nodeDiv[1].InnerText.Trim());

        // 3rd cell: last post info; every whitespace character is turned into a plain space.
        thread.lastPost = HtmlEntity.DeEntitize(nodesTd[2].InnerText.Trim());
        thread.lastPost = string.Join(" ", thread.lastPost.Split());

        // 4th cell: replies, 5th cell: views.
        thread.replies = nodesTd[3].InnerText;
        thread.views = nodesTd[4].InnerText;

        // Link to the first unread post, when present.
        HtmlNode newPostNode = node_second_td.SelectSingleNode(".//a[@id='thread_gotonew_" + thread.id + "']");
        if (newPostNode != null)
        {
            thread.newPost = HtmlEntity.DeEntitize(newPostNode.GetAttributeValue("href", string.Empty));
            thread.title = "[NEW] " + thread.title;
        }
        else
        {
            thread.newPost = "";
        }

        // Last page: second '&'-separated part of the last pager link's href.
        thread.lastPage = "";
        HtmlNode span = node_second_td.SelectSingleNode(".//span[@class='smallfont']");
        HtmlNodeCollection pages = span == null ? null : span.SelectNodes(".//a");
        if (pages != null)
        {
            string lastHref = HtmlEntity.DeEntitize(pages.Last().GetAttributeValue("href", string.Empty));
            string[] hrefParts = lastHref.Split(new char[] { '&' });
            if (hrefParts.Length > 1)
            {
                thread.lastPage = hrefParts[1];
            }
        }

        listThreads.Add(thread);
    }
}