예제 #1
0
  /// <summary>
  /// Downloads the cnblogs.com home page, collects every anchor that carries an
  /// <c>href</c> attribute and binds the resulting link list to <c>DataList</c>.
  /// </summary>
  /// <param name="sender">Event source (not used).</param>
  /// <param name="e">Event data (not used).</param>
  protected void Page_Load( object sender, EventArgs e )
  {

    string html;

    // WebClient is IDisposable; release it as soon as the page has been downloaded.
    using ( var client = new WebClient() )
    {
      html = client.DownloadString( "http://www.cnblogs.com/" );
    }

    var parser = new JumonyParser();
    var document = parser.Parse( html );

    var links = document.Find( "a[href]" );

    var baseUrl = new Uri( "http://www.cnblogs.com" );

    // Relative hrefs are resolved against baseUrl. A link is "internal" only when its
    // host is cnblogs.com itself or a sub-domain of it; a bare suffix test would also
    // accept unrelated hosts such as "notcnblogs.com".
    var data = from hyperLink in links
               let url = new Uri( baseUrl, hyperLink.Attribute( "href" ).Value() )
               let host = url.Host
               let isInternal = host.Equals( "cnblogs.com", StringComparison.OrdinalIgnoreCase )
                                || host.EndsWith( ".cnblogs.com", StringComparison.OrdinalIgnoreCase )
               orderby url.AbsoluteUri
               select new
               {
                 Url = url.AbsoluteUri,
                 IsLinkingOut = !isInternal,
                 // Links without a target attribute fall back to "_self".
                 Target = hyperLink.Attribute( "target" ).Value() ?? "_self"
               };

    DataList.DataSource = data;
    DataBind();

  }
예제 #2
0
        /// <summary>
        /// Downloads the page referenced by the row's "url" column, looks for a fax number
        /// inside every ".tel" element and writes the result back through the table adapter.
        /// </summary>
        /// <param name="obj">The <see cref="DataRow"/> to process, passed as <c>object</c>.</param>
        private void CallBack(object obj)
        {
            DataRow row = (DataRow)obj;

            string html;
            // WebClient is IDisposable; release it once the download has finished.
            using (WebClient client = new WebClient())
            {
                html = client.DownloadString(row["url"].ToString());
            }

            JumonyParser jp = new JumonyParser();
            IHtmlDocument document = jp.Parse(html);
            IEnumerable<IHtmlElement> telElements = document.Find(".tel");

            // One adapter is enough for all updates of this row.
            var adapter = new faxDataSetTableAdapters.kellysearch_faxTableAdapter();

            // Removed from the raw text in this order ("+1" must go before "+").
            string[] noise = { "+1", "+", "(", ")", " ", ".", "-" };

            foreach (IHtmlElement telElement in telElements)
            {
                string text = telElement.InnerText();
                // Ordinal search: "fax" is a literal marker, not linguistic text.
                int markerIndex = text.IndexOf("fax", StringComparison.Ordinal);
                string faxnum = "无";
                if (markerIndex > -1)
                {
                    // Everything after the "fax" marker is treated as the number.
                    string digits = text.Substring(markerIndex + 3);
                    foreach (string token in noise)
                    {
                        digits = digits.Replace(token, "");
                    }
                    row["fax"] = digits;
                    faxnum = digits;
                }
                // The row is marked as processed whether or not a fax number was found.
                row["status"] = 1;
                adapter.Update(row);
                Console.WriteLine(faxnum);
            }
        }
예제 #3
0
    /// <summary>
    /// Checks that a class selector whose name contains a hyphen matches exactly
    /// the one element that carries that class.
    /// </summary>
    public void css_class_has_hyphen()
    {
      const string markup = "<div class=\"css-class\"></div>";

      var document = new JumonyParser().Parse( markup );
      var matches = document.Find( ".css-class" );

      Assert.AreEqual( 1, matches.Count() );
    }
예제 #4
0
        /// <summary>
        /// Gets the developer settings (AppId, AppSecret and related fields) by requesting
        /// <c>WeChatUrl.DEV_URL + token</c> and scraping the returned HTML.
        /// </summary>
        /// <returns>
        /// A populated <c>WechatDevInfo</c>, or <c>null</c> when the request fails, the status
        /// is not 200 OK, or the page contains no ".developer_info_wrp" element.
        /// </returns>
        public WechatDevInfo GetWechatDevInfo()
        {
            WechatDevInfo devInfo = null;
            HttpResponseMessage response = null;
            try
            {
                _httpClient = new HttpClient(handler);
                SetHeader();

                // Blocking on .Result keeps the synchronous signature callers rely on.
                response = _httpClient.GetAsync(WeChatUrl.DEV_URL + token).Result;

                if (response.StatusCode == HttpStatusCode.OK)
                {   // Connected; read the response body.

                    string result = response.Content.ReadAsStringAsync().Result;

                    var parser = new JumonyParser();
                    var htmlDoc = parser.Parse(result);
                    var htmlEles = htmlDoc.Find(".developer_info_wrp");
                    if (htmlEles != null && htmlEles.Any())
                    {
                        var vertical = htmlEles.Find(".frm_vertical_pt").ToList();
                        devInfo = new WechatDevInfo();
                        #region  Parse the HTML and extract the text of each field
                        // Fields are identified purely by their position on the page.
                        for (int i = 0; i < vertical.Count; i++)
                        {
                            try
                            {
                                var infoText = vertical[i].InnerText().Trim();
                                if (string.IsNullOrWhiteSpace(infoText))
                                    continue;
                                switch (i)
                                {
                                    case 0:
                                        devInfo.AppId = infoText;
                                        break;
                                    case 1:
                                        devInfo.AppSecret = infoText;
                                        break;
                                    case 2:
                                        devInfo.URL = infoText;
                                        break;
                                    case 3:
                                        devInfo.Token = infoText;
                                        break;
                                    case 4:
                                        devInfo.EncodingAESKey = infoText;
                                        break;
                                    case 5:
                                        SetEncodingAESType(devInfo.EncodingAESType, infoText);
                                        break;
                                    default:
                                        break;
                                }
                            }
                            catch (Exception ex)
                            {
                                // Best effort: one unreadable field must not abort the others,
                                // but the failure is traced instead of vanishing silently.
                                System.Diagnostics.Trace.TraceWarning(
                                    "GetWechatDevInfo: failed to read field {0}: {1}", i, ex);
                            }
                        }
                        #endregion
                    }
                }
            }
            catch (Exception ex)
            {
                // Callers receive whatever was collected so far (possibly null); the
                // failure is traced so that it can be diagnosed.
                System.Diagnostics.Trace.TraceWarning("GetWechatDevInfo: request failed: {0}", ex);
            }
            finally
            {
                if (response != null)
                    response.Dispose();
            }
            return devInfo;
        }
예제 #5
0
        /// <summary>
        /// Gets the account information of the WeChat official-account platform by requesting
        /// <c>WeChatUrl.ACCOUNT_INFO_URL + token</c> and scraping the returned HTML.
        /// </summary>
        /// <returns>
        /// A populated <c>WechatAccountInfo</c>, or <c>null</c> when the request fails, the
        /// status is not 200 OK, or the page contains none of the expected setting elements.
        /// </returns>
        public WechatAccountInfo GetAccount()
        {
            WechatAccountInfo account = null;
            HttpResponseMessage response = null;
            try
            {
                _httpClient = new HttpClient(handler);
                SetHeader();

                // Blocking on .Result keeps the synchronous signature callers rely on.
                response = _httpClient.GetAsync(WeChatUrl.ACCOUNT_INFO_URL + token).Result;
                if (response.StatusCode == HttpStatusCode.OK)
                {   // Connected; read the response body.

                    string result = response.Content.ReadAsStringAsync().Result;

                    var parser = new JumonyParser();
                    var htmlDoc = parser.Parse(result);
                    var htmlEles = htmlDoc.Find(".account_setting_area .account_setting_item .meta_content");
                    if (htmlEles != null && htmlEles.Any())
                    {
                        var setting = htmlEles.ToList();
                        account = new WechatAccountInfo();

                        #region  Parse the HTML and extract the text of each field
                        // Fields are identified purely by their position on the page.
                        for (int i = 0; i < setting.Count; i++)
                        {
                            try
                            {
                                var infoText = setting[i].InnerText().Trim();
                                // Items 0 and 1 hold images, so empty text is expected there.
                                if (i > 1 && string.IsNullOrWhiteSpace(infoText))
                                    continue;
                                switch (i)
                                {
                                    case 0:
                                    case 1:
                                        {
                                            // Skip the item when it has no <img> or the image has
                                            // no src attribute, instead of relying on a swallowed
                                            // NullReferenceException.
                                            var image = setting[i].Find("img").FirstOrDefault();
                                            var source = image == null ? null : image.Attribute("src");
                                            if (source != null)
                                            {
                                                if (i == 0)
                                                    account.HeadImage = source.AttributeValue;
                                                else
                                                    account.QRCode = source.AttributeValue;
                                            }
                                        }
                                        break;
                                    case 2:
                                        account.AccountName = infoText;
                                        break;
                                    case 3:
                                        account.WechatNumber = infoText;
                                        break;
                                    case 4:
                                        SetWechatType(account.WechatType, infoText);
                                        break;
                                    case 5:
                                        account.Introduces = infoText;
                                        break;
                                    case 6:
                                        SetAuthenticate(account.Authenticate, infoText);
                                        break;
                                    case 7:
                                        account.PlaceAddress = infoText;
                                        break;
                                    case 8:
                                        account.SubjectInfo = infoText;
                                        break;
                                    case 9:
                                        account.LoginEmail = infoText;
                                        break;
                                    case 10:
                                        account.AccountId = infoText;
                                        break;
                                    default:
                                        break;
                                }
                            }
                            catch (Exception ex)
                            {
                                // Best effort: one unreadable field must not abort the others,
                                // but the failure is traced instead of vanishing silently.
                                System.Diagnostics.Trace.TraceWarning(
                                    "GetAccount: failed to read field {0}: {1}", i, ex);
                            }
                        }
                        #endregion
                    }
                }
            }
            catch (Exception ex)
            {
                // Callers receive whatever was collected so far (possibly null); the
                // failure is traced so that it can be diagnosed.
                System.Diagnostics.Trace.TraceWarning("GetAccount: request failed: {0}", ex);
            }
            finally
            {
                if (response != null)
                    response.Dispose();
            }
            return account;
        }
예제 #6
0
        /// <summary>
        /// Downloads the search result page at <paramref name="url"/>, reads the total page
        /// count from the "maxPage" input inside ".pagediv" and hands both to <c>GetUrl</c>.
        /// </summary>
        /// <param name="url">Address of the search result page to query.</param>
        private void GetPage(string url)
        {
            string html;
            // WebClient is IDisposable; release it once the download has finished.
            using (WebClient client = new WebClient())
            {
                html = client.DownloadString(url);
            }

            JumonyParser jp = new JumonyParser();
            IHtmlDocument document = jp.Parse(html);
            IEnumerable<IHtmlElement> inputs = document.Find(".pagediv input");

            // Defaults to a single page when no usable "maxPage" input is present.
            int page = 1;
            foreach (IHtmlElement input in inputs)
            {
                string name = input.Attribute("name").Value();
                if (name == "maxPage")
                {
                    // A missing or non-numeric value keeps the current page count instead
                    // of throwing from int.Parse.
                    int parsed;
                    if (int.TryParse(input.Attribute("value").Value(), out parsed))
                    {
                        page = parsed;
                    }
                }
            }
            GetUrl(url, page);
        }
예제 #7
0
        /// <summary>
        /// Walks result pages 1..<paramref name="maxPage"/> of <paramref name="url"/> and stores
        /// the name and detail URL of every listed business as a new row with status 0.
        /// </summary>
        /// <param name="url">Search URL; "&amp;page=N" is appended for each page.</param>
        /// <param name="maxPage">Number of the last page to fetch (inclusive).</param>
        private void GetUrl(string url, int maxPage)
        {
            for (int i = 1; i <= maxPage; i++)
            {
                string pageUrl = url + "&page=" + i;

                string html;
                // WebClient is IDisposable; previously one instance leaked per page.
                using (WebClient client = new WebClient())
                {
                    html = client.DownloadString(pageUrl);
                }

                JumonyParser jp = new JumonyParser();
                IHtmlDocument document = jp.Parse(html);
                IEnumerable<IHtmlElement> anchors = document.Find(".searchresult_zonee .heading_address a");
                foreach (IHtmlElement anchor in anchors)
                {
                    try
                    {
                        string businessUrl = "http://www.kellysearch.com/" + anchor.Attribute("href").Value();
                        string name = anchor.InnerText();

                        // Each business is inserted through its own one-row table.
                        faxDataSet.kellysearch_faxDataTable dt = new faxDataSet.kellysearch_faxDataTable();
                        DataRow row = dt.NewRow();
                        row["name"] = name;
                        row["status"] = 0;
                        row["url"] = businessUrl;
                        dt.Rows.Add(row);

                        faxDataSetTableAdapters.kellysearch_faxTableAdapter apt = new faxDataSetTableAdapters.kellysearch_faxTableAdapter();
                        apt.Update(dt);
                        Console.WriteLine(name + businessUrl);
                    }
                    catch (Exception ex)
                    {
                        // A failure on one entry is reported and the remaining entries
                        // are still processed.
                        Console.WriteLine(ex.Message);
                    }
                }
            }
        }
예제 #8
0
        /// <summary>
        /// Single-company query: searches for <paramref name="companyName"/> through a proxy,
        /// follows the result link to the detail page, parses its tables and stores the
        /// resulting record. The whole sequence is retried in a loop until it completes
        /// (or until the search returns no links at all).
        /// </summary>
        /// <param name="companyName">Company name used as the search string.</param>
        public void SingelSearch(string companyName)
        {
            var httpClient = new HttpClient();
            // NOTE(review): presumably milliseconds (5 seconds) — confirm against the HTTP library.
            httpClient.Setting.Timeout = 1000 * 5;
            // Initial request to firsturl; the response is not used here.
            httpClient.Create<string>(HttpMethod.Post, firsturl).Send();
            // NOTE(review): there is no retry limit; every failure path below uses `continue`,
            // so a persistent failure keeps this loop running indefinitely.
            while (true)
            {
                var targetModel = new CrawlerEntity { 搜索名称 = companyName, 操作人姓名 = TaskEntity.OperatorName, 入爬行库时间 = TaskEntity.CreateTime, TaskGuid = TaskEntity.Unique };
                try
                {
                    // Proxy IP handling
                    var proxyEntity = new ProxyDomain().GetByRandom(); // proxy IP
                    if (proxyEntity == null)
                    {
                        // No stored proxy available: fall back to fetching one online.
                        Console.WriteLine("在线代理临时获取策略启动。");
                        proxyEntity = Proxy.Proxy.GetInstance().GetHttProxyEntity();
                        Console.WriteLine("线上获取到了代理:{0}:{1}", proxyEntity.IpAddress, proxyEntity.Port);
                    }

                    httpClient.Setting.Proxy = new WebProxy(proxyEntity.IpAddress, proxyEntity.Port);

                    // Submit the search form.
                    var resultBody = httpClient.Create<string>(HttpMethod.Post, targetUrl, data: new
                    {
                        queryStr = targetModel.搜索名称,
                        module = "",
                        idFlag = "qyxy"
                    }).Send();
                    // An invalid response discards the proxy via RemoveOldIp and retries.
                    if (!resultBody.IsValid())
                    {
                        RemoveOldIp(proxyEntity);
                        continue;
                    }
                    // NOTE(review): a true result from ValidText is treated as a failure
                    // (proxy removed, retry) — presumably it detects a block/verification page; confirm.
                    if (ValidText(resultBody.Result))
                    {
                        RemoveOldIp(proxyEntity);
                        continue;
                    }
                    // Extract the second-level (detail) link
                    var parser = new JumonyParser();
                    var urls = parser.Parse(resultBody.Result).Find("li a").ToList();
                    var nextUrl = "";
                    // No result links: record an empty result via AddNull and stop.
                    if (urls.Count < 1)
                    {
                        AddNull(targetModel);
                        break;
                    }
                    // NOTE(review): each iteration overwrites the previous values, so only the
                    // LAST link's text and href are used — confirm this is intended.
                    foreach (var htmlElement in urls)
                    {
                        targetModel.名称 = htmlElement.InnerText();
                        nextUrl = url + htmlElement.Attribute("href").AttributeValue;
                    }
                    // Fetch the target detail page
                    var resultsecondBody =
                        httpClient.Create<string>(HttpMethod.Get, zhuUrl + new Uri(firsturl + nextUrl).Query).Send();
                    // Query-string parameters of the detail link; "reg_bus_ent_id" is read below.
                    var nameValueCollection =
                        new NameValueCollection(URL.GetQueryString(new Uri(firsturl + nextUrl).Query));
                    if (!resultsecondBody.IsValid())
                    {
                        RemoveOldIp(proxyEntity);
                        continue;
                    }
                    if (ValidText(resultsecondBody.Result))
                    {
                        RemoveOldIp(proxyEntity);
                        continue;
                    }
                    // Detail page processing: <th> tags are rewritten to <td> so that a single
                    // "tr td" selector collects header and value cells alike.
                    var sorceIhtml = new JumonyParser().Parse(resultsecondBody.Result.Replace("<th", "<td"));
                    var tableLists = sorceIhtml.Find("table[class='f-lbiao']").ToList();
                    var listall = new List<string>();
                    // NOTE(review): the two TrimEnd(':') calls are identical as written; one of them
                    // may have been meant to strip a full-width colon — confirm.
                    foreach (var tableList in tableLists)
                        tableList.Find("tr td")
                            .ForEach(t => listall.Add(t.InnerText().TrimEnd(':').TrimEnd(':').Trim()));
                    var fillModel = FillModel(listall);
                    // A missing "reg_bus_ent_id" makes this throw; the catch below then logs and retries.
                    fillModel.全局唯一编号 = nameValueCollection["reg_bus_ent_id"].ToUpper();
                    new CrawlerDomain().Add(StrategyNo1(fillModel, targetModel));
                    // Follow-up bookkeeping: bump the proxy usage counter and persist it
                    proxyEntity.Usage = proxyEntity.Usage + 1;
                    new ProxyDomain().Update(proxyEntity);
                    Console.WriteLine("{0} 抓取到:{1}", Task.CurrentId, targetModel.搜索名称);
                }
                catch (Exception e)
                {
                    // Any exception is written to the log and the whole attempt is retried.
                    new LogDomain().Add(new LogEntity { LogType = "error", TaskName = TaskEntity.TaskName, ErrorDetails = Task.CurrentId + "线程: " + e.Message, Details = e.ToString(), TriggerTime = DateTime.Now });
                    continue;
                }
                // Reached only after a fully successful attempt.
                break;
            }
        }
예제 #9
0
        /// <summary>
        /// Migrates the comments of a cnblogs post into the local blog. Comment pages are
        /// fetched one by one until the pager no longer offers a "Next >" link; every
        /// comment is stored under a single placeholder user because the original
        /// commenters have no local account.
        /// </summary>
        /// <param name="BlogsId">Id of the local blog post that receives the comments.</param>
        /// <param name="postId">Id of the cnblogs post.</param>
        /// <param name="blogApp">cnblogs blog user name.</param>
        /// <returns>Always the string "ok", even when saving the comments failed.</returns>
        public string testJumonyParser(int BlogsId = 1, string postId = "4368417", string blogApp = "zhaopei")
        {
            bool isNext = true;
            int i = 0;

            // Resolve (or create) the placeholder user that owns all migrated comments.
            // NOTE(review): the lookup filters on UserName == " " while a new user is created
            // with UserName "******", so the lookup would not find a previously created
            // placeholder — confirm the intended name.
            var BlogUsersId = 1;
            BLL.BlogUsersSetBLL userbll = new BlogUsersSetBLL();
            var usertemp = GetDataHelper.GetAllUser().Where(t => t.UserName == " ").FirstOrDefault();
            if (null == usertemp)
            {
                var user = new Blogs.ModelDB.BlogUsersSet()
                {
                    UserName = "******",
                    UserPass = "******",
                    IsDel = false,
                    IsLock = false,
                    UserMail = "无效",
                    CreateTime = DateTime.Now,
                    UserInfo = new ModelDB.UserInfo()
                };
                userbll.Add(user);
                userbll.save(false);
                BlogUsersId = user.Id;
            }
            else
            {
                BlogUsersId = usertemp.Id;
            }

            BlogCommentSetBLL blogcommenbll = new BlogCommentSetBLL();
            var jumony = new JumonyParser();
            JavaScriptSerializer jsSerializer = new JavaScriptSerializer();

            while (isNext)
            {
                i++;
                var url = "http://www.cnblogs.com/mvc/blog/GetComments.aspx?postId=" + postId + "&blogApp=" + blogApp + "&pageIndex=" + i;
                var htmlSource = jumony.LoadDocument(url).InnerHtml();

                // The endpoint answers with JSON whose commentsHtml member holds the markup.
                CnBlogComments comm = jsSerializer.Deserialize<CnBlogComments>(htmlSource);
                if (comm == null || string.IsNullOrEmpty(comm.commentsHtml))
                {
                    // Nothing to parse on this page: stop paging instead of dereferencing null.
                    break;
                }

                var commentsHtml = jumony.Parse(comm.commentsHtml);

                // Continue only while the last element of the pager reads "Next >".
                var pager = commentsHtml.Find("div.pager").FirstOrDefault();
                var lastPagerItem = pager == null ? null : pager.Find("*").LastOrDefault();
                isNext = lastPagerItem != null && lastPagerItem.InnerText() == "Next >";

                foreach (var item in commentsHtml.Find("div.feedbackItem").ToList())
                {
                    var dateNode = item.Find("div.feedbackListSubtitle span.comment_date").FirstOrDefault();
                    var userNode = item.Find("div.feedbackListSubtitle a[target='_blank']").FirstOrDefault();
                    var bodyNode = item.Find("div.blog_comment_body").FirstOrDefault();

                    // Skip comments whose markup lacks one of the expected parts or whose
                    // date cannot be parsed, rather than aborting the whole migration.
                    DateTime commentDate;
                    if (dateNode == null || userNode == null || bodyNode == null
                        || !DateTime.TryParse(dateNode.InnerText(), out commentDate))
                    {
                        continue;
                    }

                    blogcommenbll.Add(
                        new BlogCommentSet()
                        {
                            BlogsId = BlogsId,
                            CommentID = -1,
                            IsDel = false,
                            Content = bodyNode.InnerText(),
                            CreateTime = commentDate,
                            ReplyUserName = userNode.InnerText(),
                            BlogUsersId = BlogUsersId,
                            IsInitial = true
                        }
                        );
                }
            }

            try
            {
                blogcommenbll.save(false);
            }
            catch (Exception ex)
            {
                // The return value stays "ok" for compatibility with existing callers, but
                // the failure is traced instead of being dropped silently.
                System.Diagnostics.Trace.TraceError("testJumonyParser: saving migrated comments failed: {0}", ex);
            }
            return "ok";
        }
예제 #10
0
 /// <summary>
 /// Recursively walks a BSON value and yields every local file reference found in it:
 /// strings that start with "/App_Uploads/", and the <c>src</c> of every <c>img</c> element
 /// embedded in other strings, except data URIs and http/https/file URLs.
 /// </summary>
 /// <param name="typeName">Passed through unchanged as the first tuple item.</param>
 /// <param name="id">Passed through unchanged as the second tuple item.</param>
 /// <param name="path">Path of <paramref name="v"/> inside its document; extended with
 /// "[index]" for array elements and ".name" for document members.</param>
 /// <param name="v">The BSON value to inspect.</param>
 /// <returns>Tuples of (typeName, id, path of the containing value, referenced path).</returns>
 private static IEnumerable<Tuple<string, long, string, string>> GetAllPathString(string typeName, long id, string path, BsonValue v)
 {
     switch (v.BsonType)
     {
         case BsonType.Array:
             foreach (var s in v.AsBsonArray.SelectMany((el, i) => GetAllPathString(typeName, id, path + "[" + i + "]", el)))
                 yield return s;
             break;
         case BsonType.Document:
             foreach (var s in v.AsBsonDocument.SelectMany(el => GetAllPathString(typeName, id, path + "." + el.Name, el.Value)))
                 yield return s;
             break;
         case BsonType.String:
             var p = v.AsString;
             // Paths are identifiers, not linguistic text: compare ordinally.
             if (p.StartsWith("/App_Uploads/", StringComparison.Ordinal))
                 yield return new Tuple<string, long, string, string>(typeName, id, path, p);
             else
             {
                 // Otherwise treat the string as HTML and collect its image sources.
                 var jp = new JumonyParser();
                 foreach (var s in jp.Parse(p).Find("img[src]")
                     .Select(img => img.Attribute("src").AttributeValue)
                     // A src attribute written without a value yields null; skip it
                     // instead of failing in the prefix checks below.
                     .Where(p0 => p0 != null
                         && !p0.StartsWith("data:image", StringComparison.OrdinalIgnoreCase)
                         && !p0.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                         // https URLs are external as well, exactly like http ones.
                         && !p0.StartsWith("https://", StringComparison.OrdinalIgnoreCase)
                         && !p0.StartsWith("file://", StringComparison.OrdinalIgnoreCase)
                         )
                     )
                     yield return new Tuple<string, long, string, string>(typeName, id, path, s);
             }
             break;
     }
 }