/// <summary>
/// Reads the results listing and appends one <c>Item</c> (numeric id and profile URL)
/// per result row to <c>Items</c>, drawing a spinner and a percentage on the console.
/// </summary>
static void get_items()
{
    var msg_prefix = "-分析列表:";
    // Console column at which the percentage is written (byte length of the prefix in the default encoding).
    var y = Encoding.Default.GetByteCount(msg_prefix);
    Console.Write(msg_prefix);
    var charArr = @"-\|/".ToArray();

    var resp = browser.Get("https://materials.ulprospector.com/zh/results?pageSize=100");
    JumonyParser parser = new JumonyParser();
    var document = parser.Parse(resp);

    // Last space-separated token of the pager text; used below only as the progress denominator.
    var total = document.FindFirst(".pagination.pull-right").InnerText().Split(' ').Last().ToInt();

    // NOTE(review): the loop bound is hard-coded, so only page 1 is ever fetched.
    // Presumably it should run up to `total` - confirm before widening the crawl.
    for (int i = 1; i < 2; i++)
    {
        // Fix: the query parameters must be separated by '&'; the original produced "pageNum=1pageSize=100".
        resp = browser.Get("https://materials.ulprospector.com/zh/results?pageNum=" + i + "&pageSize=100");
        document = parser.Parse(resp);

        // NOTE(review): the selector ends in ". " (an empty class name) - verify it is what was intended.
        var eles = document.Find("table.results tbody tr. ");
        foreach (var v in eles)
        {
            var item = new Item();
            item.Url = v.FindFirst(".entry a").Attribute("href").Value();
            item.Id = v.FindFirst("input").Attribute("value").Value().ToInt();
            Items.Add(item);
        }

        Console.CursorLeft = 0;
        Console.Write(charArr[i % charArr.Length]);
        Console.CursorLeft = y;

        // Fix: avoid DivideByZeroException when the pager reports 0.
        var percent = total > 0 ? i * 100 / total : 0;
        Console.Write(percent + "%");
    }

    Console.WriteLine();
}
/// <summary>
/// Downloads one listing page and passes the third cell of every result row to
/// <c>VertifyAsync</c>.
/// </summary>
/// <param name="cityUri">Address of the listing page to download.</param>
/// <returns>
/// <c>true</c> when the page yielded at least <c>PageSize</c> rows; <c>false</c> when it
/// yielded fewer, reported no data, or could not be recognised.
/// </returns>
private async Task<bool> GetAgentsAsync(Uri cityUri)
{
    LogHelper.Info(cityUri.ToString());
    var pageSource = await HttpClient.GetStringAsync(cityUri);

    while (!pageSource.Contains("查企业"))
    {
        if (pageSource.StartsWith("<script>window.location", StringComparison.Ordinal))
        {
            // The response is a script redirect: hand the quoted target to VertifyCode, then retry.
            VertifyCode(new Uri(pageSource.Split("'")[1]));
            pageSource = await HttpClient.GetStringAsync(cityUri);
        }
        else
        {
            // Fix: the original only left the loop for the "no data" page; any other
            // response left pageSource unchanged and the loop spun forever.
            if (!pageSource.Contains("小查还没找到数据"))
            {
                LogHelper.Info("Unrecognised response for " + cityUri);
            }

            return false;
        }
    }

    // Materialise once so the selector is not evaluated a second time for the count.
    var block = JumonyParser.Parse(pageSource).Find(".m_srchList tbody tr td:nth-child(3)").ToList();
    foreach (var item in block)
    {
        await VertifyAsync(item.InnerHtml());
    }

    return block.Count >= PageSize;
}
/// <summary>
/// Core load: parses a search-result page into <paramref name="result"/>.
/// </summary>
/// <param name="context">HTTP context, forwarded to the base implementation.</param>
/// <param name="url">Requested URL, forwarded to the base implementation.</param>
/// <param name="htmlContent">HTML content of the result page.</param>
/// <param name="result">Target result that receives one entry per parsed row and the paging flags.</param>
/// <returns>Whatever the base implementation returns.</returns>
protected override bool LoadCore(HttpContext<string> context, string url, string htmlContent, ResourceSearchInfo result)
{
    var parser = new JumonyParser();
    var doc = parser.Parse(htmlContent, new Uri(ReferUrlPage));

    // The first row of the table is skipped (presumably a header row - confirm).
    var node = doc.Find("#archiveResult tr").Skip(1);
    foreach (var row in node)
    {
        var title = row.FindFirstOrDefault("td.name")?.InnerText();
        //var size = row.FindFirstOrDefault("td.size")?.InnerText();
        var date = row.FindFirstOrDefault("td.date")?.InnerText()?.ToDateTimeNullable();

        // Fix: the link lookup was the only unguarded access in this loop; a row without
        // an action link raised NullReferenceException and aborted the whole page.
        var href = row.FindFirstOrDefault("td.action a:nth-child(1)")?.Attribute("href")?.AttributeValue;
        if (href == null)
        {
            continue;
        }

        // A 40-character alphanumeric path segment in the link identifies the resource.
        var has = Regex.Match(href, @"/([a-z\d]{40})", RegexOptions.IgnoreCase).GetGroupValue(1);

        var item = CreateResourceInfo(has, title);
        //item.DownloadSize = size;
        item.UpdateTime = date;
        result.Add(item);
    }

    // The pager offers a previous/next page when its first/last child is an anchor.
    var pager = doc.FindFirstOrDefault("div.pagination");
    result.HasPrevious = pager?.FindFirstOrDefault("*:first-child")?.Name == "a";
    result.HasMore = pager?.FindFirstOrDefault("*:last-child")?.Name == "a";

    return base.LoadCore(context, url, htmlContent, result);
}
/// <summary>
/// Downloads the cnblogs.com home page, extracts every hyperlink and binds the list
/// (absolute URL, whether it leaves the site, link target) to <c>DataList</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Page_Load(object sender, EventArgs e)
{
    string html;

    // Fix: WebClient is IDisposable and was never released.
    using (var client = new WebClient())
    {
        html = client.DownloadString("http://www.cnblogs.com/");
    }

    var parser = new JumonyParser();
    var document = parser.Parse(html);
    var links = document.Find("a[href]");
    var baseUrl = new Uri("http://www.cnblogs.com");

    var data = from hyperLink in links
               let url = new Uri(baseUrl, hyperLink.Attribute("href").Value())
               orderby url.AbsoluteUri
               select new
               {
                   Url = url.AbsoluteUri,
                   // Fix: a bare suffix test also accepted hosts such as "evilcnblogs.com";
                   // only the site itself and its sub-domains count as internal.
                   IsLinkingOut = !(url.Host.Equals("cnblogs.com", StringComparison.OrdinalIgnoreCase)
                                    || url.Host.EndsWith(".cnblogs.com", StringComparison.OrdinalIgnoreCase)),
                   Target = hyperLink.Attribute("target").Value() ?? "_self"
               };

    DataList.DataSource = data;
    DataBind();
}
/// <summary>
/// Fetches the cnblogs.com front page and binds one record per hyperlink to
/// <c>DataList</c>: the absolute URL, an external-link flag and the link target.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Page_Load( object sender, EventArgs e )
{
    string html;

    // Fix: dispose the WebClient once the page has been downloaded.
    using ( var client = new WebClient() )
    {
        html = client.DownloadString( "http://www.cnblogs.com/" );
    }

    var parser = new JumonyParser();
    var document = parser.Parse( html );
    var links = document.Find( "a[href]" );
    var baseUrl = new Uri( "http://www.cnblogs.com" );

    var data = from hyperLink in links
               let url = new Uri( baseUrl, hyperLink.Attribute( "href" ).Value() )
               // Fix: match the host exactly or as a sub-domain; the original suffix test
               // classified look-alike hosts such as "evilcnblogs.com" as internal.
               let isInternal = url.Host.Equals( "cnblogs.com", StringComparison.OrdinalIgnoreCase )
                                || url.Host.EndsWith( ".cnblogs.com", StringComparison.OrdinalIgnoreCase )
               orderby url.AbsoluteUri
               select new
               {
                   Url = url.AbsoluteUri,
                   IsLinkingOut = !isInternal,
                   Target = hyperLink.Attribute( "target" ).Value() ?? "_self"
               };

    DataList.DataSource = data;
    DataBind();
}
/// <summary>
/// A class selector whose name contains a hyphen must match the single element
/// carrying that class.
/// </summary>
public void css_class_has_hyphen()
{
    var markup = "<div class=\"css-class\"></div>";

    var document = new JumonyParser().Parse(markup);
    var matchCount = document.Find(".css-class").Count();

    Assert.AreEqual(1, matchCount);
}
/// <summary>
/// Rewrites the file so that every <c>script</c>/<c>link</c> reference to a <c>.js</c> or
/// <c>.css</c> resource carries a <c>?yyyyMMddHHmm</c> stamp, replacing any query string
/// the reference already had. Failures are reported through <c>ProjectHelpers.AddError</c>.
/// </summary>
/// <param name="filePath">Path of the file to rewrite in place.</param>
private void DoAppendInCurrentFile(string filePath)
{
    try
    {
        var body = File.ReadAllText(filePath, Encoding.UTF8);
        var htmlSource = new JumonyParser();
        var bodyHtml = htmlSource.Parse(body);
        var scriptlist = bodyHtml.Find("script").ToList();
        var csslist = bodyHtml.Find("link").ToList();
        var dataNowStr = DateTime.Now.ToString("yyyyMMddHHmm");

        void Replace(List<IHtmlElement> list, string key)
        {
            // Fix: match ".<key>" only where the path ends (or a fragment starts). The
            // original matched it anywhere, so "cdn.js.org/x.js" or "data.json.js" were
            // corrupted, and a URL with the extension only in its query lost that query.
            var extension = new Regex(@"\." + Regex.Escape(key) + @"(?=$|#)", RegexOptions.IgnoreCase);

            foreach (var item in list)
            {
                // First href/src attribute of the element; names compared without culture-sensitive lower-casing.
                var str = item.Attributes()
                    .Where(r => r.Name.Equals("href", StringComparison.OrdinalIgnoreCase)
                             || r.Name.Equals("src", StringComparison.OrdinalIgnoreCase))
                    .Select(r => r.AttributeValue)
                    .FirstOrDefault();
                if (string.IsNullOrEmpty(str))
                {
                    continue;
                }

                // Drop an existing query string so stamps do not pile up.
                var queryStart = str.IndexOf('?');
                var withoutQuery = queryStart >= 0 ? str.Substring(0, queryStart) : str;
                if (!extension.IsMatch(withoutQuery))
                {
                    continue;
                }

                // The evaluator keeps the extension exactly as written (the original
                // replaced it with the lower-case key) and appends the stamp.
                var newStr = extension.Replace(withoutQuery, m => m.Value + "?" + dataNowStr, 1);
                body = ReplaceFirstOccurrence(body, str, newStr);
            }
        }

        Replace(scriptlist, "js");
        Replace(csslist, "css");
        File.WriteAllText(filePath, body);
    }
    catch (Exception ex)
    {
        ProjectHelpers.AddError(_package, "file : " + filePath + "====>" + ex.ToString());
    }
}
/// <summary>
/// Reads the first second-level menu entry (<c>li a</c>) from the given HTML.
/// </summary>
/// <param name="strHtml">HTML to search.</param>
/// <param name="url">Prefix prepended to the anchor's <c>href</c> to form <paramref name="nextUrl"/>.</param>
/// <param name="companyName">Inner text of the first anchor, or an empty string when there is none.</param>
/// <param name="nextUrl"><paramref name="url"/> followed by the first anchor's <c>href</c>, or an empty string when there is none.</param>
private void ParsingHtml(string strHtml, string url, out string companyName, out string nextUrl)
{
    var parsed = new JumonyParser().Parse(strHtml);
    var anchors = parsed.Find("li a");

    // Only the first anchor is used: the loop body returns on its first pass.
    foreach (var anchor in anchors)
    {
        OutLog(anchor.InnerText());
        //OutLog(anchor.Attribute("href").AttributeValue);
        companyName = anchor.InnerText();
        var href = anchor.Attribute("href").AttributeValue;
        nextUrl = url + href;
        return;
    }

    // No anchor found.
    companyName = "";
    nextUrl = "";
}
/// <summary>
/// Downloads a page and collects the <c>data-value</c> attribute of every element
/// matching <paramref name="filter"/>, echoing each element's <c>data-append</c>
/// attribute to the console.
/// </summary>
/// <param name="uri">Page to download.</param>
/// <param name="filter">Selector choosing the elements to read.</param>
/// <returns>The collected <c>data-value</c> strings, in document order of the matches.</returns>
private async Task<List<string>> GetCodeAsync(Uri uri, string filter)
{
    string markup = await HttpClient.GetStringAsync(uri);
    var matches = JumonyParser.Parse(markup).Find(filter);

    var codes = new List<string>();
    foreach (var element in matches)
    {
        var value = element.Attribute("data-value").AttributeValue;
        var label = element.Attribute("data-append").AttributeValue;
        Console.WriteLine(label);
        codes.Add(value);
    }

    return codes;
}
/// <summary>
/// Parses a successfully downloaded case page into a <c>Cases</c> record, downloads the
/// images embedded in its content via <c>FileDown</c> and stores the record via <c>AddCase</c>.
/// </summary>
/// <param name="result">HTTP response; nothing happens unless its status code is 200 OK.</param>
public static void SetCasses(HttpResult result)
{
    // Anything other than 200 OK is ignored (see HttpStatusCode for the other values).
    if (result.StatusCode != System.Net.HttpStatusCode.OK)
    {
        return;
    }

    // HTML returned by the request.
    string html = result.Html;
    //var resultHtml= StripHTML(html);
    var resultHtml = html;

    JumonyParser jp = new JumonyParser();
    var data = jp.Parse(resultHtml);

    Cases obj = new Cases();
    obj.CaseId = Guid.NewGuid();
    obj.Title = data.Find(".flleft>h1.riji_bt").Single().InnerText();
    obj.TagName = data.Find(".biaoqian_all span a").Single().InnerText();
    obj.CreateTime = DateTime.Parse(data.Find("ul.ll_zan>li.time_zx").Single().InnerText());
    // Case content.
    obj.Content = data.Find("div.riji_wenzi").Single().InnerHtml();
    obj.ProjectName = data.Find(".riji_fy>h2").Single().InnerText();
    obj.Price = data.Find(".riji_fy>p>span").Single().InnerText();

    var itempage = data.Find(".riji_fy>ul.biapge_list>li").ToList();

    // Fix: the original wrapped these assignments in a for-loop that never used its
    // index, repeating the same five lookups once per list item. Running them once when
    // the list is non-empty gives the same result, including the out-of-range exception
    // when fewer than five items are present.
    if (itempage.Count > 0)
    {
        obj.TreatmentWay = itempage[0].Find("span.listspan_02").Single().InnerText();
        obj.ResultData = itempage[1].Find("span.listspan_02").Single().InnerText();
        obj.Durations = itempage[2].Find("span.listspan_02").Single().InnerText();
        obj.ResultSpeed = itempage[3].Find("span.listspan_02").Single().InnerText();
        obj.ApplicableCrowd = itempage[4].Find("span.listspan_02").Single().InnerText();
    }

    // Download every image referenced from the case content.
    var fileup = data.Find("div.riji_wenzi>p>img").ToList();
    foreach (var item in fileup)
    {
        var surl = item.Attribute("src").Value();
        FileDown(surl);
    }

    if (AddCase(obj) == 1)
    {
        Console.Write("Downing ok!\n");
    }
}
/// <summary>
/// Downloads the Google Translate (cn) home page and extracts the script statement that
/// defines <c>TKK</c>.
/// </summary>
/// <returns>
/// The text <c>var TKK…);</c> cut from the first script element containing "TKK", or
/// <c>"none"</c> when no script element contains it.
/// </returns>
private static String getTKKScript()
{
    using (var webClient = new WebClient())
    {
        webClient.Headers.Add("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.165063 Safari/537.36 AppEngine-Google.");
        string html = webClient.DownloadString("https://translate.google.cn/");

        var document = new JumonyParser().Parse(html);

        String[] marker = { "TKK" };
        String[] terminator = { ");" };

        foreach (var script in document.Find("script"))
        {
            var scriptText = script.OuterHtml();
            if (scriptText.IndexOf("TKK") <= 0)
            {
                continue;
            }

            // Keep what lies between the first "TKK" and the next ");".
            var afterMarker = scriptText.Split(marker, StringSplitOptions.None)[1];
            var expression = afterMarker.Split(terminator, StringSplitOptions.None)[0];
            return "var TKK" + expression + ");";
        }

        return "none";
    }
}
/// <summary>
/// Migrates the comments of a cnblogs post into the local blog. All migrated comments are
/// attributed to one local placeholder user, because the original commenters have no local id.
/// </summary>
/// <param name="BlogsId">Local blog id the comments are attached to.</param>
/// <param name="postId">cnblogs post id.</param>
/// <param name="blogApp">cnblogs blog user name.</param>
/// <returns>Always <c>"ok"</c>.</returns>
public string testJumonyParser(int BlogsId = 1, string postId = "4368417", string blogApp = "zhaopei")
{
    bool isNext = true;
    int i = 0;
    var BlogUsersId = 1;

    // Find the placeholder user, or create it on first use.
    // NOTE(review): the lookup name (" ") differs from the name given to the created user
    // ("******"), so the lookup may never find a previously created user - confirm.
    BLL.BlogUsersSetBLL userbll = new BlogUsersSetBLL();
    var usertemp = GetDataHelper.GetAllUser().Where(t => t.UserName == " ").FirstOrDefault();
    if (null == usertemp)
    {
        var user = new Blogs.ModelDB.BlogUsersSet()
        {
            UserName = "******",
            UserPass = "******",
            IsDel = false,
            IsLock = false,
            UserMail = "无效",
            CreateTime = DateTime.Now,
            UserInfo = new ModelDB.UserInfo()
        };
        userbll.Add(user);
        userbll.save(false);
        BlogUsersId = user.Id;
    }
    else
    {
        BlogUsersId = usertemp.Id;
    }

    //List<BlogCommentSet> blogcommen = new List<BlogCommentSet>();
    BlogCommentSetBLL blogcommenbll = new BlogCommentSetBLL();

    while (isNext)
    {
        i++;
        var url = "http://www.cnblogs.com/mvc/blog/GetComments.aspx?postId=" + postId + "&blogApp=" + blogApp + "&pageIndex=" + i;
        var jumony = new JumonyParser();
        var htmlSource = jumony.LoadDocument(url).InnerHtml();

        // The endpoint returns JSON whose commentsHtml member holds the rendered comment list.
        JavaScriptSerializer _jsSerializer = new JavaScriptSerializer();
        CnBlogComments comm = _jsSerializer.Deserialize<CnBlogComments>(htmlSource);

        // Fix: stop instead of dereferencing null when the response carries no comment markup.
        if (comm == null || comm.commentsHtml == null)
        {
            break;
        }

        var commentsHtml = jumony.Parse(comm.commentsHtml);

        // Another page follows only while the pager's last element reads "Next >".
        // Fix: an empty pager no longer raises NullReferenceException.
        var pager = commentsHtml.Find("div.pager").FirstOrDefault();
        var lastPagerElement = pager == null ? null : pager.Find("*").LastOrDefault();
        if (lastPagerElement == null || lastPagerElement.InnerText() != "Next >")
        {
            isNext = false;
        }

        var listComment = commentsHtml.Find("div.feedbackItem").ToList();
        foreach (var item in listComment)
        {
            var commentDataNode = item.Find("div.feedbackListSubtitle span.comment_date").FirstOrDefault();
            var commentUserNode = item.Find("div.feedbackListSubtitle a[target='_blank']").FirstOrDefault();
            var contentNode = item.Find("div.blog_comment_body").FirstOrDefault();

            // Fix: skip an item that lacks date, author or body instead of aborting the
            // whole migration with a NullReferenceException.
            if (commentDataNode == null || commentUserNode == null || contentNode == null)
            {
                continue;
            }

            var commentData = DateTime.Parse(commentDataNode.InnerText());
            var commentUser = commentUserNode.InnerText();
            var Content = contentNode.InnerText();

            blogcommenbll.Add(
                new BlogCommentSet()
                {
                    BlogsId = BlogsId,
                    CommentID = -1,
                    IsDel = false,
                    Content = Content,
                    CreateTime = commentData,
                    ReplyUserName = commentUser,
                    BlogUsersId = BlogUsersId,
                    IsInitial = true
                }
            );
        }
    }

    try
    {
        blogcommenbll.save(false);
    }
    catch (Exception)
    {
        // NOTE(review): save failures are discarded and "ok" is still returned; kept as-is
        // because callers may depend on it, but this hides data loss - consider logging.
    }

    return "ok";
}
/// <summary>
/// For every entry in <c>Items</c>: downloads the profile page and the properties tab,
/// fills the item in place and writes it as JSON to <c>result\&lt;id&gt;.json</c>, drawing
/// a spinner and a percentage on the console.
/// </summary>
static void get_detail()
{
    var msg_prefix = "-分析详情:";
    // Console column at which the percentage is written (byte length of the prefix in the default encoding).
    var y = Encoding.Default.GetByteCount(msg_prefix);
    Console.Write(msg_prefix);
    var charArr = @"-\|/".ToArray();
    JumonyParser parser = new JumonyParser();
    var serializer = new JavaScriptSerializer();
    var ItemArr = Items.ToArray();

    // Fix: File.WriteAllText does not create missing directories, so the first write
    // failed on a fresh working directory.
    if (ItemArr.Length > 0)
    {
        System.IO.Directory.CreateDirectory("result");
    }

    for (int i = 0; i < ItemArr.Length; i++)
    {
        var item = ItemArr[i];

        // Header fields from the profile page.
        var resp = browser.Get("https://materials.ulprospector.com/zh/profile/default?e=" + item.Id);
        var document = parser.Parse(resp);
        item.Name = document.FindFirst(".pHdrName").InnerHtml();
        item.Title = document.FindFirst(".pHdrTitle").InnerHtml();
        item.Supplier = document.FindFirst(".supplierWeb").InnerHtml();
        item.SupplierWeb = document.FindFirst(".supplierWeb").Attribute("href").Value();
        item.Description = document.FindFirst(".productDescription").InnerHtml();

        // The properties tab arrives as JSON whose "Data" member is an HTML fragment.
        resp = browser.Get("https://materials.ulprospector.com/pp.axd?CULTURE=zh&ID=TabProperties&A=LOAD&E=" + item.Id);
        dynamic json = serializer.DeserializeObject(resp);
        string html = json["Data"];
        document = parser.Parse(html);

        foreach (var v in document.Find("div.DSSEC"))
        {
            if (v.Attribute("id").Value() == "DATAVIEW_DSSEC_GEN")
            {
                // This section fills the item's main tables: key plus a list of name/url values.
                item.MainTables.GroupName = v.Attribute("title").Value();
                Table<MainTableRow> table = null;
                foreach (var vv in v.Find("table > tr"))
                {
                    if (vv.Class().Any(a => a == "categoryheader"))
                    {
                        // A header row closes the current table and opens the next one.
                        if (table != null)
                        {
                            item.MainTables.Tables.Add(table);
                        }

                        table = new Table<MainTableRow>();
                        // Fix: null-safe like the same lookup in the other branch.
                        table.Caption = vv.FindFirstOrDefault(".catname")?.InnerText();
                    }
                    else
                    {
                        // Fix: a data row before any header used to dereference a null
                        // table; collect such rows in a caption-less table instead.
                        if (table == null)
                        {
                            table = new Table<MainTableRow>();
                        }

                        var tr = new MainTableRow();
                        tr.Key = vv.FindFirstOrDefault(".propname")?.InnerText();
                        foreach (var vvv in vv.Find("li"))
                        {
                            var li = new NameUrl();
                            li.Name = vvv.InnerText();
                            li.Url = vvv.FindFirstOrDefault("a")?.Attribute("href").Value();
                            tr.Values.Add(li);
                        }

                        table.Rows.Add(tr);
                    }
                }

                if (table != null)
                {
                    item.MainTables.Tables.Add(table);
                }
            }
            else
            {
                // Every other section becomes a property group: name / value / unit / test method.
                var group = new TableGroup<PropertyTableRow>();
                group.GroupName = v.Attribute("title").Value();
                Table<PropertyTableRow> table = null;
                foreach (var vv in v.Find("table > tr"))
                {
                    if (vv.Class().Any(a => a == "categoryheader"))
                    {
                        if (table != null)
                        {
                            group.Tables.Add(table);
                        }

                        table = new Table<PropertyTableRow>();
                        table.Caption = vv.FindFirstOrDefault(".catname")?.InnerText();
                    }
                    else
                    {
                        // Fix: same null-table guard as in the main-table branch.
                        if (table == null)
                        {
                            table = new Table<PropertyTableRow>();
                        }

                        var tr = new PropertyTableRow();
                        tr.Name = vv.FindFirstOrDefault(".propname")?.InnerText();
                        tr.Value = vv.FindFirstOrDefault(".dsvalue")?.InnerText();
                        tr.Unit = vv.FindFirstOrDefault(".dsunit")?.InnerText();
                        tr.TestMethod = vv.FindFirstOrDefault(".standard")?.InnerText();
                        table.Rows.Add(tr);
                    }
                }

                if (table != null)
                {
                    group.Tables.Add(table);
                }

                item.PropertyTablesList.Add(group);
            }
        }

        Console.CursorLeft = 0;
        Console.Write(charArr[i % charArr.Length]);
        Console.CursorLeft = y;
        Console.Write((int)(i * 100 / Items.Count) + "%");

        System.IO.File.WriteAllText(@"result\" + item.Id.ToString() + ".json", serializer.Serialize(item));
    }

    Console.WriteLine();
}
/// <summary>
/// Parses an HTML string into a document, passing the page address to the parser.
/// </summary>
/// <param name="url">Absolute address the HTML was loaded from.</param>
/// <param name="html">HTML source to parse.</param>
/// <returns>The parsed document.</returns>
protected IHtmlDocument CreateHtmlDom(string url, string html)
{
    JumonyParser htmlParser = new JumonyParser();
    Uri documentUri = new Uri(url);
    IHtmlDocument dom = htmlParser.Parse(html, documentUri);
    return dom;
}
/// <summary>
/// Single query: searches for one company name through a proxy, follows the result link
/// to the detail page, parses its tables and stores the filled record. The whole attempt
/// is repeated until it either stores a record or finds no search result.
/// </summary>
/// <param name="companyName">Company name to search for.</param>
public void SingelSearch(string companyName)
{
    // NOTE(review): this HttpClient exposes Setting and Create<T>, so it is presumably a
    // project/third-party client rather than System.Net.Http.HttpClient - confirm.
    var httpClient = new HttpClient();
    httpClient.Setting.Timeout = 1000 * 5;
    // Initial request to the entry page; its response is not used.
    httpClient.Create <string>(HttpMethod.Post, firsturl).Send();
    // Retry loop: `continue` starts a new attempt, `break` ends the search.
    // NOTE(review): there is no attempt limit, so a persistent failure retries forever.
    while (true)
    {
        var targetModel = new CrawlerEntity { 搜索名称 = companyName, 操作人姓名 = TaskEntity.OperatorName, 入爬行库时间 = TaskEntity.CreateTime, TaskGuid = TaskEntity.Unique };
        try
        {
            // Proxy handling.
            var proxyEntity = new ProxyDomain().GetByRandom(); // proxy IP
            if (proxyEntity == null)
            {
                // No stored proxy available: fetch one online instead.
                Console.WriteLine("在线代理临时获取策略启动。");
                proxyEntity = Proxy.Proxy.GetInstance().GetHttProxyEntity();
                Console.WriteLine("线上获取到了代理:{0}:{1}", proxyEntity.IpAddress, proxyEntity.Port);
            }
            httpClient.Setting.Proxy = new WebProxy(proxyEntity.IpAddress, proxyEntity.Port);
            var resultBody = httpClient.Create <string>(HttpMethod.Post, targetUrl, data: new { queryStr = targetModel.搜索名称, module = "", idFlag = "qyxy" }).Send();
            // An invalid response, or one flagged by ValidText, discards the proxy and retries.
            if (!resultBody.IsValid())
            {
                RemoveOldIp(proxyEntity);
                continue;
            }
            if (ValidText(resultBody.Result))
            {
                RemoveOldIp(proxyEntity);
                continue;
            }
            // Extract the second-level link.
            var parser = new JumonyParser();
            var urls = parser.Parse(resultBody.Result).Find("li a").ToList();
            var nextUrl = "";
            if (urls.Count < 1)
            {
                // No search hit: record the empty result and stop.
                AddNull(targetModel);
                break;
            }
            // Each pass overwrites the previous one, so the values of the LAST anchor win.
            foreach (var htmlElement in urls)
            {
                targetModel.称 = htmlElement.InnerText();
                nextUrl = url + htmlElement.Attribute("href").AttributeValue;
            }
            // Fetch the target detail page, reusing the query string of the extracted link.
            var resultsecondBody = httpClient.Create <string>(HttpMethod.Get, zhuUrl + new Uri(firsturl + nextUrl).Query).Send();
            var nameValueCollection = new NameValueCollection(URL.GetQueryString(new Uri(firsturl + nextUrl).Query));
            if (!resultsecondBody.IsValid())
            {
                RemoveOldIp(proxyEntity);
                continue;
            }
            if (ValidText(resultsecondBody.Result))
            {
                RemoveOldIp(proxyEntity);
                continue;
            }
            // Detail page processing: header cells are rewritten to plain cells so that a
            // single "tr td" selector collects both.
            var sorceIhtml = new JumonyParser().Parse(resultsecondBody.Result.Replace("<th", "<td"));
            var tableLists = sorceIhtml.Find("table[class='f-lbiao']").ToList();
            var listall = new List <string>();
            foreach (var tableList in tableLists)
            {
                // NOTE(review): the two TrimEnd calls use the same character here; one may
                // have been intended as the full-width colon - verify.
                tableList.Find("tr td") .ForEach(t => listall.Add(t.InnerText().TrimEnd(':').TrimEnd(':').Trim()));
            }
            var fillModel = FillModel(listall);
            // Throws when the link has no reg_bus_ent_id parameter; the catch below then retries.
            fillModel.全局唯一编号 = nameValueCollection["reg_bus_ent_id"].ToUpper();
            new CrawlerDomain().Add(StrategyNo1(fillModel, targetModel));
            // Follow-up bookkeeping: count this use of the proxy and report progress.
            proxyEntity.Usage = proxyEntity.Usage + 1;
            new ProxyDomain().Update(proxyEntity);
            Console.WriteLine("{0} 抓取到:{1}", Task.CurrentId, targetModel.搜索名称);
        }
        catch (Exception e)
        {
            // Any failure is logged and the whole attempt is repeated.
            new LogDomain().Add(new LogEntity { LogType = "error", TaskName = TaskEntity.TaskName, ErrorDetails = Task.CurrentId + "线程: " + e.Message, Details = e.ToString(), TriggerTime = DateTime.Now });
            continue;
        }
        // Reached only after a record was stored.
        break;
    }
}
/// <summary>
/// Gets the account information of the WeChat official-account platform by scraping the
/// account settings page.
/// </summary>
/// <returns>
/// The parsed account information, or <c>null</c> when the request fails, does not return
/// 200 OK, or the page contains no settings entries.
/// </returns>
public WechatAccountInfo GetAccount()
{
    WechatAccountInfo account = null;
    HttpResponseMessage response = null;
    try
    {
        _httpClient = new HttpClient(handler);
        SetHeader();
        response = _httpClient.GetAsync(WeChatUrl.ACCOUNT_INFO_URL + token).Result;
        if (response.StatusCode != HttpStatusCode.OK)
        {
            return account;
        }

        // Connected; read and parse the body.
        string pageHtml = response.Content.ReadAsStringAsync().Result;
        var document = new JumonyParser().Parse(pageHtml);
        var entries = document.Find(".account_setting_area .account_setting_item .meta_content");
        if (entries == null || !entries.Any())
        {
            return account;
        }

        var setting = entries.ToList();
        account = new WechatAccountInfo();

        #region Parse the HTML into the account fields
        // The position of an entry on the page decides which field it fills.
        for (int index = 0; index < setting.Count; index++)
        {
            try
            {
                var entry = setting[index];
                var infoText = entry.InnerText().Trim();

                // Entries 0 and 1 are images, so their text may be blank; other blank entries are skipped.
                if (index > 1 && string.IsNullOrWhiteSpace(infoText))
                {
                    continue;
                }

                switch (index)
                {
                    case 0:
                        account.HeadImage = entry.Find("img").FirstOrDefault().Attribute("src").AttributeValue;
                        break;
                    case 1:
                        account.QRCode = entry.Find("img").FirstOrDefault().Attribute("src").AttributeValue;
                        break;
                    case 2:
                        account.AccountName = infoText;
                        break;
                    case 3:
                        account.WechatNumber = infoText;
                        break;
                    case 4:
                        SetWechatType(account.WechatType, infoText);
                        break;
                    case 5:
                        account.Introduces = infoText;
                        break;
                    case 6:
                        SetAuthenticate(account.Authenticate, infoText);
                        break;
                    case 7:
                        account.PlaceAddress = infoText;
                        break;
                    case 8:
                        account.SubjectInfo = infoText;
                        break;
                    case 9:
                        account.LoginEmail = infoText;
                        break;
                    case 10:
                        account.AccountId = infoText;
                        break;
                    default:
                        break;
                }
            }
            catch (Exception)
            {
                // A failure on one entry leaves that field unset and moves on to the next.
            }
        }
        #endregion
    }
    catch (Exception)
    {
        // Any request or parsing failure yields whatever was collected so far (possibly null).
    }
    finally
    {
        if (response != null)
        {
            response.Dispose();
        }
    }

    return account;
}
/// <summary>
/// Gets the developer settings (AppId, AppSecret and related fields) by scraping the
/// developer page.
/// </summary>
/// <returns>
/// The parsed developer information, or <c>null</c> when the request fails, does not
/// return 200 OK, or the page contains no developer-info block.
/// </returns>
public WechatDevInfo GetWechatDevInfo()
{
    // TODO get AppId, AppSecret
    WechatDevInfo devInfo = null;
    HttpResponseMessage response = null;
    try
    {
        _httpClient = new HttpClient(handler);
        SetHeader();
        response = _httpClient.GetAsync(WeChatUrl.DEV_URL + token).Result;
        if (response.StatusCode != HttpStatusCode.OK)
        {
            return devInfo;
        }

        // Connected; read and parse the body.
        string pageHtml = response.Content.ReadAsStringAsync().Result;
        var document = new JumonyParser().Parse(pageHtml);
        var wrappers = document.Find(".developer_info_wrp");
        if (wrappers == null || !wrappers.Any())
        {
            return devInfo;
        }

        var vertical = wrappers.Find(".frm_vertical_pt").ToList();
        devInfo = new WechatDevInfo();

        #region Parse the HTML into the developer fields
        // The position of an entry on the page decides which field it fills.
        for (int index = 0; index < vertical.Count; index++)
        {
            try
            {
                var infoText = vertical[index].InnerText().Trim();
                if (string.IsNullOrWhiteSpace(infoText))
                {
                    continue;
                }

                switch (index)
                {
                    case 0:
                        devInfo.AppId = infoText;
                        break;
                    case 1:
                        devInfo.AppSecret = infoText;
                        break;
                    case 2:
                        devInfo.URL = infoText;
                        break;
                    case 3:
                        devInfo.Token = infoText;
                        break;
                    case 4:
                        devInfo.EncodingAESKey = infoText;
                        break;
                    case 5:
                        SetEncodingAESType(devInfo.EncodingAESType, infoText);
                        break;
                    default:
                        break;
                }
            }
            catch (Exception)
            {
                // A failure on one entry leaves that field unset and moves on to the next.
            }
        }
        #endregion
    }
    catch (Exception)
    {
        // Any request or parsing failure yields whatever was collected so far (possibly null).
    }
    finally
    {
        if (response != null)
        {
            response.Dispose();
        }
    }

    return devInfo;
}