/// <summary>
/// Extracts the price of the requested material from the parsed site DOM.
/// </summary>
/// <param name="type">Material whose price is wanted; it selects which matched element is read.</param>
/// <param name="siteInfo">Parsed page, queried with <c>ApmexConst.APMEX_LiST_QUERY</c>.</param>
/// <returns>
/// The parsed price, or 0 when the material is not one of the four handled values,
/// the expected element is missing, or its text is not a number.
/// </returns>
private decimal GetPriceFromInfo(MaterialTypes type, CsQuery.CQ siteInfo)
{
    // Position of each material inside the list of matched elements.
    int index;
    switch (type)
    {
        case MaterialTypes.Gold:
            index = 0;
            break;
        case MaterialTypes.Silver:
            index = 1;
            break;
        case MaterialTypes.Platinum:
            index = 2;
            break;
        case MaterialTypes.Palladium:
            index = 3;
            break;
        default:
            return 0m;
    }

    var data = siteInfo[ApmexConst.APMEX_LiST_QUERY].ToList();

    // The page layout may change; do not index past what was actually matched.
    if (index >= data.Count)
    {
        return 0m;
    }

    var firstChild = data[index].FirstChild;
    if (firstChild == null)
    {
        return 0m;
    }

    string dollarStr = firstChild.ToString().Replace("$", "");

    // Convert to decimal. The text uses '.' / ',' separators, so parse it
    // culture-independently instead of with the machine's current culture.
    decimal.TryParse(
        dollarStr,
        System.Globalization.NumberStyles.Number,
        System.Globalization.CultureInfo.InvariantCulture,
        out decimal result);

    return result;
}
/// <summary>
/// Reads the HTML file at <c>FileName</c> and turns every anchor that has an
/// <c>href</c> attribute into a <c>Bookmark</c>.
/// </summary>
/// <returns>The bookmarks in document order; empty when no such anchors exist.</returns>
private List<Bookmark> ImportFromHtml()
{
    // CsQuery was chosen for HTML parsing because the alternatives do not quite fit:
    // AngleSharp requires a newer .NET Framework version, and HtmlAgilityPack
    // has bugs and is no longer maintained.
    // https://habr.com/en/post/273807/#AngleSharp
    // https://ru.stackoverflow.com/questions/420354/%D0%9A%D0%B0%D0%BA-%D1%80%D0%B0%D1%81%D0%BF%D0%B0%D1%80%D1%81%D0%B8%D1%82%D1%8C-html-%D0%B2-net
    var document = CsQuery.CQ.Create(File.ReadAllText(FileName));
    var result = new List<Bookmark>();

    foreach (CsQuery.IDomObject anchor in document.Find("a"))
    {
        if (!anchor.HasAttribute("href"))
        {
            continue;
        }

        Bookmark entry = new Bookmark();
        entry.URL = anchor.GetAttribute("href");

        // Non-English text comes out as character codes, so it has to be decoded.
        entry.Name = System.Net.WebUtility.HtmlDecode(anchor.InnerText);

        result.Add(entry);
    }

    return result;
}
/// <summary>
/// Requests <paramref name="addr"/> with the stored cookies, parses the returned HTML and
/// prints either the logged-in user name and page title or "ERROR LOGIN".
/// </summary>
/// <param name="addr">Address passed to <c>CreateWebRequest</c>.</param>
/// <returns>Always <c>true</c> when no exception is thrown.</returns>
public bool VisitTo(string addr)
{
    // NOTE(review): the result is never set to false, not even when the profile
    // block is missing - confirm whether callers expect false on a failed login.
    bool result = true;

    HttpWebRequest clientVisit = CreateWebRequest(addr, Method.GET);
    clientVisit.CookieContainer.Add(Cookies); // add cookies to container

    // Dispose the response together with its stream and reader so the
    // connection is released even if parsing throws.
    using (WebResponse requestVisit = clientVisit.GetResponse())
    using (Stream streamGetData = requestVisit.GetResponseStream())
    using (StreamReader reader = new StreamReader(streamGetData))
    {
        CsQuery.CQ DOM = CsQuery.CQ.Create(reader); // parse html

        // find profile block
        CsQuery.IDomObject profile = DOM.Find("div").Where(e => e.ClassName == "logininfo").FirstOrDefault();

        // find name; the title text is Russian for "View profile"
        CsQuery.IDomElement profileName = profile?.ChildElements.Where(e => e.Attributes["title"] == "Просмотр профиля").FirstOrDefault();

        if (profileName != null)
        {
            Console.WriteLine($"User: { profileName.FirstChild }");
            Console.WriteLine($"Visit: `{ DOM.Find("title").Text() }`"); // page title
        }
        else
        {
            Console.WriteLine("ERROR LOGIN");
        }
    }

    return result;
}
/// <summary>
/// Rewrites the <c>src</c> of every <c>img</c> in the given HTML that is not already an
/// absolute http(s) URL, a <c>data:image/</c> URI or a <c>file://</c> path, by prefixing
/// it with the blog site root.
/// </summary>
/// <param name="sHtmlText">HTML fragment to process.</param>
/// <returns>The HTML text with the affected <c>src</c> values prefixed.</returns>
public static string GetHtml(string sHtmlText)
{
    const string siteRoot = "http://blog2.cnool.net";

    CsQuery.CQ cq = sHtmlText;

    // Each distinct src is rewritten once; handling a repeated value again
    // would prefix the already-prefixed text a second time.
    var handled = new System.Collections.Generic.HashSet<string>();

    foreach (var img in cq["img"])
    {
        var src = img.GetAttribute("src");

        // A missing/empty src has nothing to rewrite (an empty pattern would
        // match at every position of the text).
        if (string.IsNullOrEmpty(src) || !handled.Add(src))
        {
            continue;
        }

        if (src.Contains("file://")
            || src.StartsWith("data:image/", StringComparison.Ordinal)
            || src.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
            || src.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // Match the value literally (URLs contain regex metacharacters such as
        // '.', '?' and '+') and only where it is the value of a src attribute,
        // so the same text inside another, longer URL is left alone.
        var pattern = @"(?<=\b(?i:src)\s*=\s*[""']?\s*)"
                      + Regex.Escape(src)
                      + @"(?=[""'\s>]|$)";

        // A MatchEvaluator keeps '$' in the URL from being read as a substitution.
        sHtmlText = Regex.Replace(sHtmlText, pattern, m => siteRoot + m.Value);
    }

    return sHtmlText;
}
/// <summary>
/// Returns the inner text of the <c>p</c> descendant of <paramref name="node"/> when one
/// exists, otherwise the inner text of <paramref name="node"/> itself.
/// </summary>
/// <param name="node">Selection to read the text from.</param>
/// <returns>The untrimmed inner text of the chosen element.</returns>
/// <remarks><c>Single()</c> throws unless the chosen selection holds exactly one element.</remarks>
protected override string GetNumberText(CsQuery.CQ node)
{
    var paragraphs = node.Find("p");
    var source = paragraphs.Any() ? paragraphs : node;
    return source.Single().InnerText;
}
/// <summary>
/// Downloads the page at <paramref name="url"/>, extracts its text with CsQuery and
/// counts the whitespace-separated words in it.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>Number of non-empty words; 0 for a page without text.</returns>
static int CountWordsOnUrl(string url)
{
    string html;
    using (var webClient = new WebClient())
    {
        html = webClient.DownloadString(url);
    }

    var text = new CsQuery.CQ(html).Text();

    // A null separator array splits on every whitespace character (spaces, tabs,
    // newlines); dropping empty entries keeps runs of whitespace from counting as words.
    var words = text.Split((char[])null, System.StringSplitOptions.RemoveEmptyEntries);
    return words.Length;
}
/// <summary>
/// Creates an instance whose members are all set to empty, non-null placeholders.
/// </summary>
public PageCrawlCompletedArgs()
{
    Url = string.Empty;
    PageContent = new PageContent();
    CQDocument = new CsQuery.CQ();
    WebException = new WebException();
}
/// <summary>
/// Builds an SSR configuration from the table rows of the local "2.html" file, writes it
/// as "ssr.json", copies that file into <c>SSR_EXE_PATH</c> and passes the copy to
/// <c>write_gui_config</c>.
/// </summary>
/// <remarks>
/// Rows that do not have the expected cells or whose numbers cannot be parsed are
/// skipped instead of aborting the whole export.
/// </remarks>
private void ToSSRJson()
{
    var sFile = getFilePath("2.html");
    CsQuery.CQ dom = System.IO.File.ReadAllText(sFile);

    var oSSR = new SSR();
    oSSR.configs = new List<FreeSSR.Server>();

    foreach (var row in dom["tr"])
    {
        CsQuery.CQ rowDom = row.InnerHTML;
        var tds = rowDom["td"].ToList();

        // Five cells are read below: score, server, port and two method/password cells.
        if (tds.Count < 5)
        {
            continue;
        }

        // The value after '/' is the telecom-line (T) score; higher is better.
        var scoreParts = (tds[0].InnerText ?? string.Empty).Split('/');
        int score;
        if (scoreParts.Length < 2 || !int.TryParse(scoreParts[1], out score) || score <= 8)
        {
            continue;
        }

        int port;
        if (!int.TryParse(tds[2].InnerText, out port))
        {
            continue;
        }

        var server = tds[1].InnerText;
        if (string.IsNullOrWhiteSpace(server))
        {
            continue;
        }

        var p1 = tds[3].InnerText;
        var p2 = tds[4].InnerText;
        if (p1 == null)
        {
            continue;
        }

        var config = new FreeSSR.Server();
        config.server = server;
        config.server_port = port;

        // The two cells appear in either order; a known cipher name marks the method cell.
        if (p1 == "rc4-md5" || p1 == "chacha20" || p1.StartsWith("aes-"))
        {
            config.method = p1;
            config.password = p2;
        }
        else
        {
            config.method = p2;
            config.password = p1;
        }

        config.id = Guid.NewGuid().ToString("N");
        oSSR.configs.Add(config);
    }

    var jsonPath = getFilePath("ssr.json");
    var json = Newtonsoft.Json.JsonConvert.SerializeObject(oSSR);
    System.IO.File.WriteAllText(jsonPath, json);

    var sOutFile = $"{SSR_EXE_PATH}\\ssr.json";
    if (System.IO.File.Exists(sOutFile))
    {
        System.IO.File.Delete(sOutFile);
    }

    System.IO.File.Copy(jsonPath, sOutFile);
    write_gui_config(sOutFile);
}
/// <summary>
/// Returns the trimmed inner text of <paramref name="node"/>; when that is empty, the
/// trimmed inner text of its single <c>p</c> descendant is returned instead.
/// </summary>
/// <param name="node">Selection expected to hold exactly one element.</param>
/// <returns>The trimmed text taken from the node or from its paragraph.</returns>
protected override String GetNumberText(CsQuery.CQ node)
{
    var ownText = node.Single().InnerText.Trim();

    return ownText != String.Empty
        ? ownText
        : node.Find("p").Single().InnerText.Trim();
}
/// <summary>
/// Converts the given CsQuery DOM elements into <c>ChatMessage</c> objects carrying
/// the author and the content of each chat message.
/// </summary>
/// <param name="messages">DOM elements that contain the chat messages.</param>
/// <param name="isOutMessage">
/// <c>true</c> when the messages were sent by the user (outgoing), <c>false</c> when
/// they were received from other users (incoming).
/// </param>
/// <returns>
/// One entry per element for which text or an image URL was found; elements
/// without either are left out.
/// </returns>
private List<ChatMessage> FormatMessages(CsQuery.CQ messages, bool isOutMessage)
{
    var formatted = new List<ChatMessage>();

    foreach (var element in messages)
    {
        var message = element.Cq();

        // Outgoing messages are always authored by the user; otherwise read the author from the DOM.
        string author;
        if (isOutMessage)
        {
            author = "Me";
        }
        else
        {
            author = message.Find(".message-author .text-clickable").Text().Trim();
        }

        // Prefer the text content; when there is none, assume the message
        // holds a photo and fall back to its URL.
        // TODO: understand how to use the string to extract the image
        // TODO: add support for videos as well
        var content = message.Find(".selectable-text").Text().Trim();
        if (string.IsNullOrEmpty(content))
        {
            content = message.Find(".image-thumb > img").Attr("src");
        }

        // TODO: handle this properly; for now messages with neither text
        // nor an image URL are simply skipped.
        if (string.IsNullOrEmpty(content))
        {
            continue;
        }

        formatted.Add(new ChatMessage
        {
            Author = author,
            Content = content
        });
    }

    // resolve author names for incoming messages
    if (!isOutMessage)
    {
        ResolveAuthors(formatted);
    }

    return formatted;
}
/// <summary>
/// Downloads the page at <paramref name="url"/> and collects the links that stay on the
/// same host: root-relative links (prefixed with the host) and links for which
/// <c>HasSameHost</c> returns true.
/// </summary>
/// <param name="url">Address of the page to scan.</param>
/// <returns>The distinct links found; an empty list when anything fails.</returns>
private IEnumerable<string> GetAllLinksForPage(string url)
{
    try
    {
        string root = GetHost(url);

        string html;
        using (var client = new System.Net.WebClient())
        {
            html = client.DownloadString(url);
        }

        CsQuery.CQ data = html;

        // Materialize once: the list is filtered twice below.
        var links = data["a"]
            .Select(x => x["href"])
            .Where(x => !String.IsNullOrWhiteSpace(x))
            .ToList();

        var relativeLinks = links
            .Where(x => x.StartsWith("/", StringComparison.Ordinal))
            .Select(x => root + x);
        var sameHostLinks = links.Where(x => HasSameHost(root, x));

        // Evaluate inside the try block so that failures while filtering are
        // covered by the best-effort fallback below instead of surfacing later
        // when the caller enumerates the result.
        return relativeLinks.Union(sameHostLinks).ToList();
    }
    catch
    {
        // Best effort: any failure yields no links.
        return new List<string>(0);
    }
}
/// <summary>
/// Fills the config list of <paramref name="oSSR"/> from the table rows of the local
/// "2.html" file.
/// </summary>
/// <param name="oSSR">Target whose <c>configs</c> list receives one entry per accepted row.</param>
/// <remarks>
/// Rows that lack the expected cells or whose numbers cannot be parsed are skipped.
/// </remarks>
private void ConfigFromHtml(SSR oSSR)
{
    var sFile = getFilePath("2.html");
    CsQuery.CQ dom = System.IO.File.ReadAllText(sFile);

    // Without a target list there is nowhere to put the parsed servers.
    if (oSSR == null || oSSR.configs == null)
    {
        return;
    }

    foreach (var row in dom["tr"])
    {
        CsQuery.CQ rowDom = row.InnerHTML;
        var tds = rowDom["td"].ToList();

        // Five cells are read below: score, server, port and two method/password cells.
        if (tds.Count < 5)
        {
            continue;
        }

        // The value after '/' is the telecom-line (T) score; higher is better.
        var scoreParts = (tds[0].InnerText ?? string.Empty).Split('/');
        int score;
        if (scoreParts.Length < 2 || !int.TryParse(scoreParts[1], out score) || score <= 8)
        {
            continue;
        }

        int port;
        if (!int.TryParse(tds[2].InnerText, out port))
        {
            continue;
        }

        var server = tds[1].InnerText;
        if (string.IsNullOrWhiteSpace(server))
        {
            continue;
        }

        var p1 = tds[3].InnerText;
        var p2 = tds[4].InnerText;
        if (p1 == null)
        {
            continue;
        }

        var config = new Server();
        config.server = server;
        config.server_port = port;

        // The two cells appear in either order; a known cipher name marks the method cell.
        if (p1 == "rc4-md5" || p1 == "chacha20" || p1.StartsWith("aes-"))
        {
            config.method = p1;
            config.password = p2;
        }
        else
        {
            config.method = p2;
            config.password = p1;
        }

        config.id = Guid.NewGuid().ToString("N");
        oSSR.configs.Add(config);
    }
}
/// <summary>
/// Creates the instance from either inline markup or a template file.
/// </summary>
/// <param name="templateFilePath">
/// Treated as markup when it is longer than 255 characters or contains '&lt;';
/// otherwise treated as the path of a file whose content is loaded. Null or empty
/// leaves the DOM empty.
/// </param>
public DryHtml(string templateFilePath = "")
{
    if (string.IsNullOrEmpty(templateFilePath))
    {
        // Nothing supplied (including an explicit null): start with an empty DOM.
        _dom = new CsQuery.CQ();
    }
    else if (templateFilePath.Length > 255 || templateFilePath.Contains('<'))
    {
        // Too long for a path, or contains a tag opener: the argument is the markup itself.
        _dom = templateFilePath;
    }
    else
    {
        var file = System.IO.File.ReadAllText(templateFilePath);
        _dom = file;
    }
}
/// <summary>
/// Builds the description text from the node's <c>h3</c> caption and, when present, the
/// child nodes of its <c>.au-accordion__target</c> element.
/// </summary>
/// <param name="descriptionNode">Selection that contains the caption and optional details.</param>
/// <returns>
/// The wrapped caption alone, or the wrapped caption followed by a new line and the
/// parsed detail nodes joined by new lines.
/// </returns>
protected override String GetDescription(CsQuery.CQ descriptionNode)
{
    // Fall back to the span inside the heading when the heading's own text is blank.
    var heading = descriptionNode.Find("h3").Single().InnerText.Trim();
    if (heading == String.Empty)
    {
        heading = descriptionNode.Find("h3 span").Single().InnerText.Trim();
    }

    heading = Common.WrapText(heading);

    var details = descriptionNode.Find(".au-accordion__target");
    if (!details.Any())
    {
        return heading;
    }

    var detailLines = details.Single().ChildNodes.Select(child => Common.ParseNode(child));
    var detailText = String.Join(Environment.NewLine, detailLines);

    return String.Format("{0}{1}{2}", heading, Environment.NewLine, detailText);
}
/// <summary>
/// Downloads the page at <paramref name="url"/>, walks the tables whose <c>title</c>
/// attribute occurs in the known category text and turns their recent rows into
/// <c>Model.News</c> entries.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>
/// The news entries dated no earlier than yesterday; an empty list when any exception
/// occurs (the exception is reported through <c>ShowNotify</c>).
/// </returns>
internal static List<Model.News> SendRequestToHP(string url)
{
    try
    {
        // Get website content.
        string rssContent;
        using (var wc = new WebClient())
        {
            ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls | SecurityProtocolType.Tls11 | SecurityProtocolType.Tls12;
            rssContent = wc.DownloadString(url);
        }

        // Table titles are matched as substrings of this text.
        var categories = @"3COM Security Bulletins 3rd Party Software Security Bulletins HP General SW Security Bulletins HP Hardware and Firmware Security Bulletins HP MPE/iX Security Bulletins Multi-Platform Software Security Bulletins HP NonStop Servers Security Bulletins HP OpenVMS Security Bulletins ProCurve Security Bulletins HP Storage SW Security Bulletins HP Tru64 UNIX Security Bulletins HP-UX UNIX Security Bulletins";

        CsQuery.CQ dom = rssContent;
        var tables = dom["table"].Has("tr");
        var news = new List<Model.News>();

        foreach (var table in tables)
        {
            var tableTitle = table.Attributes["title"] ?? "none";
            if (!categories.Contains(tableTitle))
            {
                continue;
            }

            var bodies = table.ChildNodes.Where(cn => cn.NodeName.Contains("TBODY")).ToList();
            if (bodies.Count == 0)
            {
                continue;
            }

            foreach (var tbItems in bodies[0].ChildNodes)
            {
                // Keep only the child nodes that themselves have content.
                var cells = tbItems.ChildNodes?.Where(cn => cn?.FirstChild != null).ToList();
                if (cells == null)
                {
                    continue;
                }

                var dateText = System.Text.RegularExpressions.Regex
                    .Match(cells[0].InnerHTML, @"\d{4}\/\d{1,2}\/\d{1,2}")
                    .Value;
                DateTime date = DateTime.Parse(dateText);

                // Only entries from yesterday onwards are of interest.
                if (date < DateTime.Today.AddDays(-1))
                {
                    continue;
                }

                var identifier = cells[1].InnerHTML;
                var anchor = (CsQuery.Implementation.HtmlAnchorElement)cells[3].ChildNodes[0];
                var anchorHtml = anchor.InnerHTML;

                news.Add(new Model.News()
                {
                    Identifier = identifier,
                    PublishDate = date,
                    Title = anchorHtml,
                    Description = anchorHtml,
                    Url = "http://support.hpe.com" + anchor.Href,
                    Supplier = Model.NewsSupplier.HP,
                });
            }
        }

        if (news.Count > 0)
        {
            Console.WriteLine($"[+] HP: {news.Count}");
        }

        return news;
    }
    catch (Exception exception)
    {
        ShowNotify(exception.Message, exception.StackTrace, "HP");
        return new List<Model.News>();
    }
}
/// <summary>
/// Collects the <c>src</c> attribute of every <c>img</c> below <paramref name="node"/>,
/// leaving out images matching <c>.pdd-inline-sign__icon</c> or <c>p span img</c>.
/// </summary>
/// <param name="node">Selection to search for images.</param>
/// <returns>The <c>src</c> values of the remaining images, in selection order.</returns>
private String[] GetUrls(CsQuery.CQ node)
{
    var allImages = node.Find("img");
    var wantedImages = allImages
        .Not(".pdd-inline-sign__icon")
        .Not("p span img");

    return wantedImages
        .Select(image => image.GetAttribute("src"))
        .ToArray();
}
/// <summary>
/// Implemented by derived classes to produce the description text from the given
/// CsQuery selection.
/// </summary>
/// <param name="descriptionNode">Selection the description is read from.</param>
/// <returns>The description text; its exact format is up to the implementation.</returns>
protected abstract String GetDescription(CsQuery.CQ descriptionNode);
/// <summary>
/// Implemented by derived classes to extract the number text from the given
/// CsQuery selection.
/// </summary>
/// <param name="node">Selection the text is read from.</param>
/// <returns>The extracted text; whether it is trimmed is up to the implementation.</returns>
protected abstract String GetNumberText(CsQuery.CQ node);
/// <summary>
/// Builds the description from every descendant of <paramref name="descriptionNode"/>
/// whose <c>style</c> attribute equals the fixed Arial / 14px / black style string.
/// </summary>
/// <param name="descriptionNode">Selection to search.</param>
/// <returns>The parsed matching nodes joined by new lines.</returns>
protected override string GetDescription(CsQuery.CQ descriptionNode)
{
    // Elements are identified purely by their exact inline style.
    const string styledSelector = @"[style=""font-family: Arial, Helvetica, sans-serif; font-size: 14px; color: black;""]";

    var styledNodes = descriptionNode.Find(styledSelector);
    var parsedParts = styledNodes.Select(element => Common.ParseNode(element));

    return String.Join(Environment.NewLine, parsedParts);
}
/// <summary>
/// Selects the header part of the APMEX page by applying
/// <c>ApmexConst.APMEX_TITLE_QUERY</c> to the given DOM.
/// </summary>
/// <param name="siteDom">Parsed APMEX page.</param>
/// <returns>The selection produced by the title query.</returns>
private CsQuery.CQ GetExtractTitleFromApmex(CsQuery.CQ siteDom)
    => siteDom[ApmexConst.APMEX_TITLE_QUERY];
/// <summary>
/// Creates the instance from a template with the model applied to it.
/// </summary>
/// <param name="template">Template text on which <c>Replace</c> is invoked with the model.</param>
/// <param name="model">Object handed to <c>Replace</c>.</param>
public DryHtml(string template, object model)
{
    // The result of Replace is the only value the DOM ever needs; no
    // placeholder instance is created first.
    _dom = template.Replace(model);
}
/// <summary>
/// Lists the URLs of the images in the given HTML. Absolute http(s) URLs are returned
/// unchanged; every other <c>src</c> is prefixed with the blog site root.
/// </summary>
/// <param name="sHtmlText">HTML fragment to scan.</param>
/// <returns>
/// One URL per <c>img</c> that has a usable <c>src</c>, in document order. Images with a
/// missing/empty <c>src</c> or a <c>data:image/</c> URI are left out, so the array
/// holds no null entries.
/// </returns>
public static string[] GetHtmlImageUrlList(string sHtmlText)
{
    const string siteRoot = "http://blog2.cnool.net";

    // Sample of the markup this has to cope with (attributes may be malformed):
    // <img alt="查看更多精彩图片" onload="var image=new Image();image.src=this.src;if(image.width>0 _fcksavedurl=" border="0" src="http://photo1.hexun.com/p/2006/0425/18694/b_8FB7A2DC2E0BA1D9.jpg" />
    CsQuery.CQ cq = sHtmlText;

    // A list instead of a pre-sized array: skipped images leave no empty slots.
    var urls = new System.Collections.Generic.List<string>();

    foreach (var img in cq["img"])
    {
        var src = img.GetAttribute("src");

        // Inline data URIs and images without a src carry no URL to report.
        if (string.IsNullOrEmpty(src) || src.StartsWith("data:image/", StringComparison.Ordinal))
        {
            continue;
        }

        bool isAbsolute = src.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                          || src.StartsWith("https://", StringComparison.OrdinalIgnoreCase);

        urls.Add(isAbsolute ? src : siteRoot + src);
    }

    return urls.ToArray();
}