/// <summary>
/// Downloads the cnblogs.com home page, collects every anchor that carries an
/// <c>href</c> attribute, and binds the resulting list (absolute URL, whether the
/// link leaves cnblogs.com, and the link target) to <c>DataList</c>.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
protected void Page_Load( object sender, EventArgs e )
{
  string html;

  // WebClient is IDisposable; release it as soon as the download has finished.
  using ( var client = new WebClient() )
  {
    html = client.DownloadString( "http://www.cnblogs.com/" );
  }

  var parser = new JumonyParser();
  var document = parser.Parse( html );
  var links = document.Find( "a[href]" );
  var baseUrl = new Uri( "http://www.cnblogs.com" );

  var data = from hyperLink in links
             let url = new Uri( baseUrl, hyperLink.Attribute( "href" ).Value() )
             let host = url.Host
             orderby url.AbsoluteUri
             select new
             {
               Url = url.AbsoluteUri,
               // A link stays on-site only when the host is cnblogs.com itself or one of
               // its sub-domains. A bare suffix test would also accept hosts such as
               // "evilcnblogs.com", and host names are compared ordinally, not by culture.
               IsLinkingOut = !( host.Equals( "cnblogs.com", StringComparison.OrdinalIgnoreCase )
                                 || host.EndsWith( ".cnblogs.com", StringComparison.OrdinalIgnoreCase ) ),
               Target = hyperLink.Attribute( "target" ).Value() ?? "_self"
             };

  DataList.DataSource = data;
  DataBind();
}
/// <summary>
/// Downloads the page referenced by a data row, extracts a fax number from each
/// <c>.tel</c> element on that page, and saves the row through the table adapter.
/// </summary>
/// <param name="obj">
/// The <see cref="DataRow"/> to process. Its "url" column is read; its "fax" and
/// "status" columns are written.
/// </param>
private void CallBack(object obj)
{
    DataRow row = (DataRow)obj;

    string html;
    // WebClient is IDisposable; release it as soon as the download has finished.
    using (WebClient client = new WebClient())
    {
        html = client.DownloadString(row["url"].ToString());
    }

    JumonyParser parser = new JumonyParser();
    IHtmlDocument document = parser.Parse(html);
    IEnumerable<IHtmlElement> telElements = document.Find(".tel");

    // Fragments stripped from the raw fax text. Order matters: "+1" must be
    // removed before the lone "+".
    string[] noiseTokens = { "+1", "+", "(", ")", " ", ".", "-" };

    foreach (IHtmlElement telElement in telElements)
    {
        string text = telElement.InnerText();
        string faxNumber = "无";

        // The marker is a fixed ASCII token, so search ordinally rather than by culture.
        int markerIndex = text.IndexOf("fax", StringComparison.Ordinal);
        if (markerIndex > -1)
        {
            // Everything after the "fax" marker is treated as the number.
            string digits = text.Substring(markerIndex + 3);
            foreach (string token in noiseTokens)
            {
                digits = digits.Replace(token, "");
            }

            row["fax"] = digits;
            faxNumber = digits;
        }

        // The row is marked as processed and saved once per .tel element.
        row["status"] = 1;
        new faxDataSetTableAdapters.kellysearch_faxTableAdapter().Update(row);
        Console.WriteLine(faxNumber);
    }
}
/// <summary>
/// Checks that a class selector containing a hyphen matches exactly one element
/// in a document that holds a single <c>div</c> with that class.
/// </summary>
public void css_class_has_hyphen()
{
  const string markup = "<div class=\"css-class\"></div>";

  var document = new JumonyParser().Parse( markup );
  var matches = document.Find( ".css-class" );

  Assert.AreEqual( 1, matches.Count() );
}
/// <summary>
/// Requests the developer page and reads the developer settings (AppId,
/// AppSecret, URL, token, AES key and AES type) from its HTML.
/// </summary>
/// <returns>
/// The populated <see cref="WechatDevInfo"/>, or <c>null</c> when the request
/// fails, the status is not 200, or the expected container element is missing.
/// </returns>
public WechatDevInfo GetWechatDevInfo()
{
    WechatDevInfo info = null;
    HttpResponseMessage response = null;

    try
    {
        _httpClient = new HttpClient(handler);
        SetHeader();
        response = _httpClient.GetAsync(WeChatUrl.DEV_URL + token).Result;

        if (response.StatusCode != HttpStatusCode.OK)
        {
            return info;
        }

        // Connected: read the response body and parse it as HTML.
        string body = response.Content.ReadAsStringAsync().Result;
        var document = new JumonyParser().Parse(body);
        var wrappers = document.Find(".developer_info_wrp");

        if (wrappers == null || wrappers.Count() <= 0)
        {
            return info;
        }

        var fields = wrappers.Find(".frm_vertical_pt").ToList();
        info = new WechatDevInfo();

        // Each field is identified purely by its position in the list.
        for (int index = 0; index < fields.Count; index++)
        {
            try
            {
                var text = fields[index].InnerText().Trim();
                if (string.IsNullOrWhiteSpace(text))
                {
                    continue;
                }

                switch (index)
                {
                    case 0:
                        info.AppId = text;
                        break;
                    case 1:
                        info.AppSecret = text;
                        break;
                    case 2:
                        info.URL = text;
                        break;
                    case 3:
                        info.Token = text;
                        break;
                    case 4:
                        info.EncodingAESKey = text;
                        break;
                    case 5:
                        SetEncodingAESType(info.EncodingAESType, text);
                        break;
                    default:
                        break;
                }
            }
            catch (Exception)
            {
                // A failure on one field is ignored so the remaining fields are still read.
            }
        }
    }
    catch (Exception)
    {
        // Any failure is ignored; the caller receives whatever was collected so far.
    }
    finally
    {
        if (response != null)
        {
            response.Dispose();
        }
    }

    return info;
}
/// <summary>
/// Requests the account-settings page and reads the account details (avatar,
/// QR code, name, number, type, introduction, and so on) from its HTML.
/// </summary>
/// <returns>
/// The populated <see cref="WechatAccountInfo"/>, or <c>null</c> when the request
/// fails, the status is not 200, or no setting elements are found.
/// </returns>
public WechatAccountInfo GetAccount()
{
    WechatAccountInfo info = null;
    HttpResponseMessage response = null;

    try
    {
        _httpClient = new HttpClient(handler);
        SetHeader();
        response = _httpClient.GetAsync(WeChatUrl.ACCOUNT_INFO_URL + token).Result;

        if (response.StatusCode != HttpStatusCode.OK)
        {
            return info;
        }

        // Connected: read the response body and parse it as HTML.
        string body = response.Content.ReadAsStringAsync().Result;
        var document = new JumonyParser().Parse(body);
        var matches = document.Find(".account_setting_area .account_setting_item .meta_content");

        if (matches == null || matches.Count() <= 0)
        {
            return info;
        }

        var items = matches.ToList();
        info = new WechatAccountInfo();

        // Each setting is identified purely by its position in the list.
        for (int index = 0; index < items.Count; index++)
        {
            try
            {
                var item = items[index];
                var text = item.InnerText().Trim();

                // The first two entries are images, so an empty text is expected there.
                if (index > 1 && string.IsNullOrWhiteSpace(text))
                {
                    continue;
                }

                switch (index)
                {
                    case 0:
                        info.HeadImage = item.Find("img").FirstOrDefault().Attribute("src").AttributeValue;
                        break;
                    case 1:
                        info.QRCode = item.Find("img").FirstOrDefault().Attribute("src").AttributeValue;
                        break;
                    case 2:
                        info.AccountName = text;
                        break;
                    case 3:
                        info.WechatNumber = text;
                        break;
                    case 4:
                        SetWechatType(info.WechatType, text);
                        break;
                    case 5:
                        info.Introduces = text;
                        break;
                    case 6:
                        SetAuthenticate(info.Authenticate, text);
                        break;
                    case 7:
                        info.PlaceAddress = text;
                        break;
                    case 8:
                        info.SubjectInfo = text;
                        break;
                    case 9:
                        info.LoginEmail = text;
                        break;
                    case 10:
                        info.AccountId = text;
                        break;
                    default:
                        break;
                }
            }
            catch (Exception)
            {
                // A failure on one setting is ignored so the remaining ones are still read.
            }
        }
    }
    catch (Exception)
    {
        // Any failure is ignored; the caller receives whatever was collected so far.
    }
    finally
    {
        if (response != null)
        {
            response.Dispose();
        }
    }

    return info;
}
/// <summary>
/// Downloads the given search page, reads the total page count from the
/// <c>maxPage</c> input inside <c>.pagediv</c>, and passes both to <c>GetUrl</c>.
/// </summary>
/// <param name="url">Address of the search page to query.</param>
/// <remarks>
/// When no <c>maxPage</c> input is present the page count defaults to 1. A
/// <c>maxPage</c> value that is not an integer makes <see cref="int.Parse(string)"/> throw.
/// </remarks>
private void GetPage(string url)
{
    string html;
    // WebClient is IDisposable; release it as soon as the download has finished.
    using (WebClient client = new WebClient())
    {
        html = client.DownloadString(url);
    }

    JumonyParser parser = new JumonyParser();
    IHtmlDocument document = parser.Parse(html);
    IEnumerable<IHtmlElement> inputs = document.Find(".pagediv input");

    int page = 1;
    foreach (IHtmlElement input in inputs)
    {
        string name = input.Attribute("name").Value();
        if (name == "maxPage")
        {
            string value = input.Attribute("value").Value();
            page = int.Parse(value);
        }
    }

    GetUrl(url, page);
}
/// <summary>
/// Walks result pages 1 through <paramref name="maxPage"/>, and for every business
/// link found stores a row (name, url, status 0) through the table adapter.
/// </summary>
/// <param name="url">Search URL; <c>&amp;page=N</c> is appended for each page.</param>
/// <param name="maxPage">Number of the last page to fetch (inclusive).</param>
private void GetUrl(string url, int maxPage)
{
    for (int i = 1; i <= maxPage; i++)
    {
        string pageUrl = url + "&page=" + i;

        string html;
        // One WebClient per page, disposed immediately so connections are not leaked
        // across what may be many iterations.
        using (WebClient client = new WebClient())
        {
            html = client.DownloadString(pageUrl);
        }

        JumonyParser parser = new JumonyParser();
        IHtmlDocument document = parser.Parse(html);
        IEnumerable<IHtmlElement> links = document.Find(".searchresult_zonee .heading_address a");

        foreach (IHtmlElement link in links)
        {
            try
            {
                string businessUrl = "http://www.kellysearch.com/" + link.Attribute("href").Value();
                string name = link.InnerText();

                faxDataSet.kellysearch_faxDataTable table = new faxDataSet.kellysearch_faxDataTable();
                DataRow row = table.NewRow();
                row["name"] = name;
                row["status"] = 0;
                row["url"] = businessUrl;
                table.Rows.Add(row);

                faxDataSetTableAdapters.kellysearch_faxTableAdapter adapter = new faxDataSetTableAdapters.kellysearch_faxTableAdapter();
                adapter.Update(table);

                Console.WriteLine(name + businessUrl);
            }
            catch (Exception ex)
            {
                // A failure on one link is reported and skipped so the rest of the page is still processed.
                Console.WriteLine(ex.Message);
            }
        }
    }
}
/// <summary>
/// Searches for a single company by name: posts the query through a proxy,
/// follows the result link, extracts the detail tables and stores the record.
/// The whole attempt is retried with another proxy until it succeeds or the
/// search returns no result links.
/// </summary>
/// <param name="companyName">Company name used as the search keyword.</param>
public void SingelSearch(string companyName)
{
    var httpClient = new HttpClient();
    httpClient.Setting.Timeout = 1000 * 5;
    // Initial request to the first URL; its response is not used.
    httpClient.Create<string>(HttpMethod.Post, firsturl).Send();

    while (true)
    {
        var targetModel = new CrawlerEntity
        {
            搜索名称 = companyName,
            操作人姓名 = TaskEntity.OperatorName,
            入爬行库时间 = TaskEntity.CreateTime,
            TaskGuid = TaskEntity.Unique
        };

        try
        {
            // Proxy selection: take a stored proxy, or fetch one online when none is stored.
            var proxyEntity = new ProxyDomain().GetByRandom();
            if (proxyEntity == null)
            {
                Console.WriteLine("在线代理临时获取策略启动。");
                proxyEntity = Proxy.Proxy.GetInstance().GetHttProxyEntity();
                Console.WriteLine("线上获取到了代理:{0}:{1}", proxyEntity.IpAddress, proxyEntity.Port);
            }

            httpClient.Setting.Proxy = new WebProxy(proxyEntity.IpAddress, proxyEntity.Port);

            var searchResponse = httpClient.Create<string>(HttpMethod.Post, targetUrl, data: new
            {
                queryStr = targetModel.搜索名称,
                module = "",
                idFlag = "qyxy"
            }).Send();

            // An invalid response, or one flagged by ValidText, drops the proxy and retries.
            if (!searchResponse.IsValid() || ValidText(searchResponse.Result))
            {
                RemoveOldIp(proxyEntity);
                continue;
            }

            // Extract the second-level links from the search result.
            var resultLinks = new JumonyParser().Parse(searchResponse.Result).Find("li a").ToList();
            if (resultLinks.Count < 1)
            {
                AddNull(targetModel);
                return;
            }

            // Every link is visited; the values of the last one are the ones kept.
            var nextUrl = "";
            foreach (var link in resultLinks)
            {
                targetModel.名称 = link.InnerText();
                nextUrl = url + link.Attribute("href").AttributeValue;
            }

            // Fetch the detail page, reusing the query string of the result link.
            var detailQuery = new Uri(firsturl + nextUrl).Query;
            var detailResponse = httpClient.Create<string>(HttpMethod.Get, zhuUrl + detailQuery).Send();
            var queryValues = new NameValueCollection(URL.GetQueryString(detailQuery));

            if (!detailResponse.IsValid() || ValidText(detailResponse.Result))
            {
                RemoveOldIp(proxyEntity);
                continue;
            }

            // Detail processing: header cells are turned into data cells so one selector covers both.
            var detailDocument = new JumonyParser().Parse(detailResponse.Result.Replace("<th", "<td"));
            var detailTables = detailDocument.Find("table[class='f-lbiao']").ToList();
            var cellTexts = new List<string>();
            foreach (var table in detailTables)
            {
                table.Find("tr td")
                     .ForEach(cell => cellTexts.Add(cell.InnerText().TrimEnd(':').TrimEnd(':').Trim()));
            }

            var fillModel = FillModel(cellTexts);
            fillModel.全局唯一编号 = queryValues["reg_bus_ent_id"].ToUpper();
            new CrawlerDomain().Add(StrategyNo1(fillModel, targetModel));

            // Follow-up: record one more use of the proxy.
            proxyEntity.Usage = proxyEntity.Usage + 1;
            new ProxyDomain().Update(proxyEntity);
            Console.WriteLine("{0} 抓取到:{1}", Task.CurrentId, targetModel.搜索名称);

            return;
        }
        catch (Exception e)
        {
            // Log the failure; the loop then starts a fresh attempt.
            new LogDomain().Add(new LogEntity
            {
                LogType = "error",
                TaskName = TaskEntity.TaskName,
                ErrorDetails = Task.CurrentId + "线程: " + e.Message,
                Details = e.ToString(),
                TriggerTime = DateTime.Now
            });
        }
    }
}
/// <summary>
/// Migrates the comments of a cnblogs post into the local blog: downloads the
/// comment pages one by one, extracts each comment's date, author and body, and
/// stores them against the given local blog entry.
/// </summary>
/// <param name="BlogsId">Id of the local blog entry that receives the comments.</param>
/// <param name="postId">Id of the cnblogs post whose comments are read.</param>
/// <param name="blogApp">cnblogs user name (blog application name) that owns the post.</param>
/// <returns>Always the string <c>"ok"</c>.</returns>
/// <remarks>
/// Migrated commenters have no local account, so every comment is attributed to
/// one placeholder user, which is created on first use.
/// </remarks>
public string testJumonyParser(int BlogsId = 1, string postId = "4368417", string blogApp = "zhaopei")
{
    int placeholderUserId;
    BLL.BlogUsersSetBLL userbll = new BlogUsersSetBLL();

    // Look up the placeholder user; create it when it does not exist yet.
    var existingUser = GetDataHelper.GetAllUser().Where(t => t.UserName == " ").FirstOrDefault();
    if (null == existingUser)
    {
        var user = new Blogs.ModelDB.BlogUsersSet()
        {
            UserName = "******",
            UserPass = "******",
            IsDel = false,
            IsLock = false,
            UserMail = "无效",
            CreateTime = DateTime.Now,
            UserInfo = new ModelDB.UserInfo()
        };
        userbll.Add(user);
        userbll.save(false);
        placeholderUserId = user.Id;
    }
    else
    {
        placeholderUserId = existingUser.Id;
    }

    BlogCommentSetBLL blogcommenbll = new BlogCommentSetBLL();

    // The serializer holds no per-page state, so one instance serves every page.
    JavaScriptSerializer serializer = new JavaScriptSerializer();

    // Escape the caller-supplied values so special characters cannot corrupt the query string.
    string escapedPostId = Uri.EscapeDataString(postId ?? string.Empty);
    string escapedBlogApp = Uri.EscapeDataString(blogApp ?? string.Empty);

    bool hasNextPage = true;
    int pageIndex = 0;
    while (hasNextPage)
    {
        pageIndex++;
        var url = "http://www.cnblogs.com/mvc/blog/GetComments.aspx?postId=" + escapedPostId
                  + "&blogApp=" + escapedBlogApp + "&pageIndex=" + pageIndex;

        var jumony = new JumonyParser();
        var htmlSource = jumony.LoadDocument(url).InnerHtml();
        CnBlogComments comm = serializer.Deserialize<CnBlogComments>(htmlSource);
        if (comm == null || comm.commentsHtml == null)
        {
            // Nothing usable came back; stop paging and save what has been collected.
            break;
        }

        var commentsHtml = jumony.Parse(comm.commentsHtml);

        // Another page exists only when the last element of the pager reads "Next >".
        var pager = commentsHtml.Find("div.pager").FirstOrDefault();
        var lastPagerElement = pager == null ? null : pager.Find("*").LastOrDefault();
        hasNextPage = lastPagerElement != null && lastPagerElement.InnerText() == "Next >";

        var listComment = commentsHtml.Find("div.feedbackItem").ToList();
        foreach (var item in listComment)
        {
            var commentDateNode = item.Find("div.feedbackListSubtitle span.comment_date").FirstOrDefault();
            var commentUserNode = item.Find("div.feedbackListSubtitle a[target='_blank']").FirstOrDefault();
            var commentBodyNode = item.Find("div.blog_comment_body").FirstOrDefault();

            // Skip an item that lacks any expected part instead of failing the whole migration.
            if (commentDateNode == null || commentUserNode == null || commentBodyNode == null)
            {
                continue;
            }

            var commentDate = DateTime.Parse(commentDateNode.InnerText());
            var commentUser = commentUserNode.InnerText();
            var content = commentBodyNode.InnerText();

            blogcommenbll.Add(
                new BlogCommentSet()
                {
                    BlogsId = BlogsId,
                    CommentID = -1,
                    IsDel = false,
                    Content = content,
                    CreateTime = commentDate,
                    ReplyUserName = commentUser,
                    BlogUsersId = placeholderUserId,
                    IsInitial = true
                }
            );
        }
    }

    try
    {
        blogcommenbll.save(false);
    }
    catch (Exception)
    {
        // NOTE(review): a failed save is ignored and "ok" is still returned; kept as-is
        // because callers may rely on this method never throwing here.
    }

    return "ok";
}
/// <summary>
/// Recursively walks a BSON value and yields one tuple for every local file
/// reference found in its string values.
/// </summary>
/// <param name="typeName">Type name copied unchanged into every yielded tuple.</param>
/// <param name="id">Id copied unchanged into every yielded tuple.</param>
/// <param name="path">Path of <paramref name="v"/> within the document; array items append <c>[i]</c>, document members append <c>.name</c>.</param>
/// <param name="v">The BSON value to inspect.</param>
/// <returns>
/// Tuples of (typeName, id, path, reference). A string starting with
/// <c>/App_Uploads/</c> is yielded as-is; any other string is parsed as HTML and
/// the <c>src</c> of each <c>img</c> is yielded unless it is a data URI or an
/// absolute http, https or file URL. Values of other BSON types yield nothing.
/// </returns>
private static IEnumerable<Tuple<string, long, string, string>> GetAllPathString(string typeName, long id, string path, BsonValue v)
{
    switch (v.BsonType)
    {
        case BsonType.Array:
            foreach (var s in v.AsBsonArray.SelectMany((el, i) => GetAllPathString(typeName, id, path + "[" + i + "]", el)))
                yield return s;
            break;

        case BsonType.Document:
            foreach (var s in v.AsBsonDocument.SelectMany(el => GetAllPathString(typeName, id, path + "." + el.Name, el.Value)))
                yield return s;
            break;

        case BsonType.String:
        {
            var text = v.AsString;

            // A path prefix is an identifier, not linguistic text: compare ordinally.
            if (text.StartsWith("/App_Uploads/", StringComparison.Ordinal))
            {
                yield return new Tuple<string, long, string, string>(typeName, id, path, text);
            }
            else
            {
                var parser = new JumonyParser();
                foreach (var src in parser.Parse(text).Find("img[src]")
                                          .Select(img => img.Attribute("src").AttributeValue)
                                          .Where(IsLocalImageSource))
                    yield return new Tuple<string, long, string, string>(typeName, id, path, src);
            }
            break;
        }
    }
}

/// <summary>
/// Tells whether an image source refers to a local file, that is, it has a value
/// and is neither an inline data URI nor an absolute http, https or file URL.
/// </summary>
private static bool IsLocalImageSource(string src)
{
    // An img may carry a src attribute without a value.
    if (src == null)
        return false;

    return !src.StartsWith("data:image", StringComparison.OrdinalIgnoreCase)
        && !src.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        // https links are just as external as http ones.
        && !src.StartsWith("https://", StringComparison.OrdinalIgnoreCase)
        && !src.StartsWith("file://", StringComparison.OrdinalIgnoreCase);
}