// Sina
/// <summary>
/// Scans <paramref name="nodelist"/> and returns the HTML of the first div that is
/// recognised as a Sina article body.
/// </summary>
/// <param name="nodelist">Nodes to scan; every entry is cast to <c>Div</c>.</param>
/// <returns>The matching div's <c>ToHtml()</c> output, or an empty string when nothing matches.</returns>
public string GetSinaBody(NodeList nodelist)
{
    for (int index = 0; index < nodelist.Size(); index++)
    {
        Div candidate = (Div)nodelist[index];

        string rawId = candidate.GetAttribute("id");
        string rawClass = candidate.GetAttribute("class");
        string idLower = string.IsNullOrEmpty(rawId) ? "" : rawId.ToLower();
        string classLower = string.IsNullOrEmpty(rawClass) ? "" : rawClass.ToLower();

        // id "artibody": regular articles.
        // class containing "articalcontent": history-section articles and blog content.
        // Photo galleries (id "si_cont") are not handled: per the original author's note that
        // markup is only loaded after the page finishes loading and needs another approach
        // (e.g. an iframe whose content is read after onload).
        bool isArticleBody = idLower.Equals("artibody") || classLower.Contains("articalcontent");
        if (isArticleBody)
        {
            return candidate.ToHtml();
        }
    }

    return "";
}
/// <summary>
/// Builds the article HTML for a Sina content page: a centred title heading followed by the
/// first div whose id is "artibody" (regular articles) or whose class contains
/// "articalcontent" (history-section articles and blog content).
/// When no such div exists, the result of <c>GetBodyHtml(html)</c> is used instead.
/// The final text is always passed through <c>ReplaceChinaChar</c>.
/// </summary>
/// <param name="html">Raw page HTML.</param>
/// <returns>The processed article HTML.</returns>
public string GetSinaArticle(string html)
{
    const string headingTemplate = "<h1 style='text-align: center;'>{0}</h1>";

    HtmlPage page = GetPage(html);
    NodeList divs = GetTagList(html, "div");

    string article = "";
    for (int index = 0; index < divs.Size(); index++)
    {
        Div block = (Div)divs[index];

        string rawId = block.GetAttribute("id");
        string rawClass = block.GetAttribute("class");
        string idLower = string.IsNullOrEmpty(rawId) ? "" : rawId.ToLower();
        string classLower = string.IsNullOrEmpty(rawClass) ? "" : rawClass.ToLower();

        // Photo galleries (id "si_cont") are intentionally not handled here: per the original
        // author's note their content is only loaded after the page finishes loading.
        if (idLower.Equals("artibody") || classLower.Contains("articalcontent"))
        {
            article = string.Format(headingTemplate, page.Title) + block.ToHtml();
            break;
        }
    }

    // Nothing recognised: fall back to the body HTML helper.
    if (string.IsNullOrEmpty(article))
    {
        article = GetBodyHtml(html);
    }

    return ReplaceChinaChar(article);
}
// Tencent (QQ)
/// <summary>
/// Returns the HTML of the first div in <paramref name="nodelist"/> whose id or class marks
/// it as a Tencent article body.
/// </summary>
/// <param name="nodelist">Nodes to scan; every entry is cast to <c>Div</c>.</param>
/// <returns>The matching div's <c>ToHtml()</c> output, or an empty string when nothing matches.</returns>
public string GetQQBody(NodeList nodelist)
{
    // Ids that identify a body container (compared against the lower-cased id).
    string[] bodyIds =
    {
        "Cnt-Main-Article-QQ".ToLower(),  // headline news
        "articleContent".ToLower(),       // "today's topic" pages
        "slide_bigimage_temp".ToLower()   // single-image viewer
    };

    // Fragments looked for inside the lower-cased class attribute.
    string[] bodyClassFragments =
    {
        "article_mod".ToLower(),
        "box-left"
    };

    for (int index = 0; index < nodelist.Size(); index++)
    {
        Div block = (Div)nodelist[index];

        string rawId = block.GetAttribute("id");
        string rawClass = block.GetAttribute("class");
        string idLower = string.IsNullOrEmpty(rawId) ? "" : rawId.ToLower();
        string classLower = string.IsNullOrEmpty(rawClass) ? "" : rawClass.ToLower();

        bool matched = false;
        foreach (string bodyId in bodyIds)
        {
            if (idLower.Equals(bodyId))
            {
                matched = true;
                break;
            }
        }

        if (!matched)
        {
            foreach (string fragment in bodyClassFragments)
            {
                if (classLower.Contains(fragment))
                {
                    matched = true;
                    break;
                }
            }
        }

        if (matched)
        {
            return block.ToHtml();
        }
    }

    return "";
}
// CCTV
/// <summary>
/// Returns the HTML of the last div in <paramref name="nodelist"/> that matches one of the
/// CCTV body markers. The scan never stops early, so a later match replaces an earlier one.
/// </summary>
/// <param name="nodelist">Nodes to scan; every entry is cast to <c>Div</c>.</param>
/// <returns>The last matching div's <c>ToHtml()</c> output, or an empty string when nothing matches.</returns>
public string GetCctvBody(NodeList nodelist)
{
    string html = "";

    for (int index = 0; index < nodelist.Size(); index++)
    {
        Div block = (Div)nodelist[index];

        string rawId = block.GetAttribute("id");
        string rawClass = block.GetAttribute("class");
        string idLower = string.IsNullOrEmpty(rawId) ? "" : rawId.ToLower();
        string classLower = string.IsNullOrEmpty(rawClass) ? "" : rawClass.ToLower();

        bool matchesId =
            idLower.Equals("content_body")   // children's channel
            || idLower.Equals("content");    // movies, lottery

        bool matchesClass =
            classLower.Contains("col_w660")        // news, domestic, international, economy, military, commentary,
                                                   // entertainment, children, painting & calligraphy, travel, auto,
                                                   // fashion, history, agriculture, health, variety, opera, music
            || classLower.Contains("col_650")      // commentary
            || classLower.Contains("textcontent"); // games

        if (matchesId || matchesClass)
        {
            html = block.ToHtml();
        }
    }

    return html;
}
// NetEase (163)
/// <summary>
/// Returns the HTML of the first div in <paramref name="nodelist"/> that matches one of the
/// NetEase body markers.
/// </summary>
/// <param name="nodelist">Nodes to scan; every entry is cast to <c>Div</c>.</param>
/// <returns>The matching div's <c>ToHtml()</c> output, or an empty string when nothing matches.</returns>
public string Get163Body(NodeList nodelist)
{
    for (int index = 0; index < nodelist.Size(); index++)
    {
        Div block = (Div)nodelist[index];

        string rawId = block.GetAttribute("id");
        string rawClass = block.GetAttribute("class");
        string idLower = string.IsNullOrEmpty(rawId) ? "" : rawId.ToLower();
        string classLower = string.IsNullOrEmpty(rawClass) ? "" : rawClass.ToLower();

        bool isBody =
            idLower.Equals("endText".ToLower())   // regular content
            || classLower.Contains("con_1".ToLower())
            || classLower.Contains("vdb-plr8".ToLower())
            || classLower.Contains("m-introbox".ToLower())
            || classLower.Contains("txtcont".ToLower());

        if (!isBody)
        {
            continue;
        }

        return block.ToHtml();
    }

    return "";
}
// Xinhuanet
/// <summary>
/// Returns the HTML of the last div in <paramref name="nodelist"/> that matches one of the
/// Xinhuanet body markers. The scan never stops early, so a later match replaces an earlier one.
/// </summary>
/// <param name="nodelist">Nodes to scan; every entry is cast to <c>Div</c>.</param>
/// <returns>The last matching div's <c>ToHtml()</c> output, or an empty string when nothing matches.</returns>
public string getXinhuanetBody(NodeList nodelist)
{
    string html = "";

    for (int index = 0; index < nodelist.Size(); index++)
    {
        Div block = (Div)nodelist[index];

        string rawId = block.GetAttribute("id");
        string rawClass = block.GetAttribute("class");
        string idLower = string.IsNullOrEmpty(rawId) ? "" : rawId.ToLower();
        string classLower = string.IsNullOrEmpty(rawClass) ? "" : rawClass.ToLower();

        bool matched;
        switch (idLower)
        {
            case "article":         // politics, rule of law, international, finance, auto, entertainment, fashion,
                                    // informatization, personnel, theory, Hong Kong & Macao, Japan, education,
                                    // technology, energy, food, travel, health, charity, public opinion, talent, more
            case "content":         // international, overseas Chinese, military, real estate, sports,
                                    // reference material, leadership, Hong Kong & Macao, Taiwan, Singapore
            case "matrix":          // blog
            case "contentblock":    // reference material
            case "news_content":    // "Xuan" space section
            case "detail-content":  // Malaysia
            case "message_1":       // history reading
                matched = true;
                break;

            default:
                // A class equal to "content" (informatization pages) is deliberately not matched;
                // that rule was disabled by the original author.
                matched =
                    classLower.Contains("main pagewidth")        // local news
                    || classLower.Contains("main_content_wrap")
                    || classLower.Contains("wrap");              // "Micro China" section
                break;
        }

        if (matched)
        {
            html = block.ToHtml();
        }
    }

    return html;
}
// People's Daily Online (people.com.cn)
/// <summary>
/// Returns the HTML of the last div in <paramref name="nodelist"/> that matches one of the
/// People's Daily body markers. The scan never stops early, so a later match replaces an earlier one.
/// </summary>
/// <param name="nodelist">Nodes to scan; every entry is cast to <c>Div</c>.</param>
/// <returns>The last matching div's <c>ToHtml()</c> output, or an empty string when nothing matches.</returns>
public string getPeopleBody(NodeList nodelist)
{
    string html = "";
    int position = 0;

    while (position < nodelist.Size())
    {
        Div block = (Div)nodelist[position];
        position++;

        string rawId = block.GetAttribute("id");
        string rawClass = block.GetAttribute("class");
        string idLower = string.IsNullOrEmpty(rawId) ? "" : rawId.ToLower();
        string classLower = string.IsNullOrEmpty(rawClass) ? "" : rawClass.ToLower();

        // "p_content": the general article container. The original note lists news (politics,
        // society, rule of law), local, finance, central enterprises, education, opinion,
        // international, auto, real estate and culture channels, with their sub-channels.
        // "ft_contwrap": interviews.
        // class containing "text_show": theory channel.
        bool matched =
            idLower.Equals("p_content")
            || idLower.Equals("ft_contwrap")
            || classLower.Contains("text_show");

        if (matched)
        {
            html = block.ToHtml();
        }
    }

    return html;
}
// Sohu
/// <summary>
/// Returns the HTML of the last div in <paramref name="nodelist"/> that matches one of the
/// Sohu body markers. The scan never stops early, so a later match replaces an earlier one.
/// </summary>
/// <param name="nodelist">Nodes to scan; every entry is cast to <c>Div</c>.</param>
/// <returns>The last matching div's <c>ToHtml()</c> output, or an empty string when nothing matches.</returns>
public string GetSohuBody(NodeList nodelist)
{
    string html = "";

    for (int index = 0; index < nodelist.Size(); index++)
    {
        Div block = (Div)nodelist[index];

        string rawId = block.GetAttribute("id");
        string rawClass = block.GetAttribute("class");
        string idLower = string.IsNullOrEmpty(rawId) ? "" : rawId.ToLower();
        string classLower = string.IsNullOrEmpty(rawClass) ? "" : rawClass.ToLower();

        // "entry": blog posts.
        // class containing "text clear": news, military, culture, history, sports, social commentary,
        // books, finance, stocks, technology, auto, fashion, health, education, parenting, travel,
        // food, horoscope.
        // Real-estate pages (class containing "new-detail-left") are recognised by the original
        // author's notes but deliberately yield nothing: that extraction is disabled.
        if (idLower.Equals("entry") || classLower.Contains("text clear"))
        {
            html = block.ToHtml();
        }
    }

    return html;
}
// Phoenix New Media (ifeng.com)
/// <summary>
/// Extracts the article body from a list of div nodes of an ifeng.com page.
/// Some markers end the scan immediately (first match wins); the others only record the
/// div and keep scanning, so a later match can replace them.
/// </summary>
/// <param name="nodelist">Nodes to scan; every entry is cast to <c>Div</c>.</param>
/// <returns>The selected div's <c>ToHtml()</c> output, or an empty string when nothing matches.</returns>
public string GetFengBody(NodeList nodelist)
{
    string result = "";

    for (int i = 0; i < nodelist.Size(); i++)
    {
        Div div = (Div)nodelist[i];

        string id = div.GetAttribute("id");
        string css = div.GetAttribute("class");

        // Both attributes are normalised to lower case, so every marker below must be lower case too.
        id = string.IsNullOrEmpty(id) ? "" : id.ToLower();
        css = string.IsNullOrEmpty(css) ? "" : css.ToLower();

        if (id.Equals("artical_real")) // news, finance, entertainment, sports, fashion, tech, books, education, history, military, Buddhism, travel, games, digital, health, parenting, home, horoscope
        {
            result = div.ToHtml();
            break;
        }
        else if (id.Equals("artical")) // culture
        {
            result = div.ToHtml();
        }
        else if (id.Equals("blog_article_content")) // blog
        {
            result = div.ToHtml();
        }
        else if (css.Contains("gc-art-body")) // auto: introductions
        {
            result = div.ToHtml();
            break;
        }
        else if (css.Contains("ce-main mt12")) // auto: review guidelines
        {
            result = div.ToHtml();
            break;
        }
        else if (css.Contains("arl-mian fl")) // auto: articles
        {
            result = div.ToHtml();
        }
        else if (css.Contains("article") && id != "") // real-estate articles
        {
            result = div.ToHtml();
        }
        else if (css.Contains("e_i_deta_lx")) // lottery
        {
            // Fix: this marker used to be spelled "e_i_deta_LX". Because css is lower-cased above,
            // the mixed-case marker could never be found and this branch was unreachable.
            result = div.ToHtml();
        }
    }

    return result;
}
/// <summary>
/// Looks for the "tddetails" frame in the browser's current document and, when it contains
/// any of the account container divs, parses them via <c>ParseAccounts</c>, clicks every
/// "logout" anchor in that frame and moves the scraper to
/// <c>ScraperState.WaitingForLogout</c>.
/// Does nothing when the document or the frame is not available yet.
/// </summary>
private void TryToScrapeAccountInformation()
{
    // NOTE(review): assumes WebBrowser is the WinForms WebBrowser control, whose Document is
    // null until a page has been loaded. Bail out quietly instead of throwing.
    var document = WebBrowser.Document;
    if (document == null)
    {
        return;
    }

    // Check if the frame we're waiting for has loaded
    foreach (HtmlElement frame in document.GetElementsByTagName("frame"))
    {
        // Ordinal, case-insensitive comparison: not affected by the current culture and safe
        // when the frame has no name.
        if (!string.Equals(frame.Name, "tddetails", System.StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        bool foundAccountDivs = false;

        // We found our frame, check for the account details divs
        HtmlWindow detailsFrame = frame.Document.Window.Frames["tddetails"];
        foreach (HtmlElement div in detailsFrame.Document.GetElementsByTagName("div"))
        {
            switch (div.GetAttribute("className"))
            {
                case "td-target-banking":     // Banking accounts
                case "td-target-investing":   // Investment accounts
                    ParseAccounts(div, false);
                    foundAccountDivs = true;
                    break;

                case "td-target-creditcards": // Credit accounts, so flip the sign on the balance
                    ParseAccounts(div, true);
                    foundAccountDivs = true;
                    break;
            }
        }

        if (foundAccountDivs)
        {
            // And now that we've parsed the account details, let's log out
            foreach (HtmlElement anchor in detailsFrame.Document.GetElementsByTagName("a"))
            {
                string caption = anchor.InnerText;
                if (!string.IsNullOrWhiteSpace(caption)
                    && string.Equals(caption.Trim(), "logout", System.StringComparison.OrdinalIgnoreCase))
                {
                    anchor.InvokeMember("click");
                }
            }

            // Presumably we found a logout button above, if not we'll still switch states and hopefully
            // the user will manually click the Logout button after waiting a few seconds
            _ScraperState = ScraperState.WaitingForLogout;
        }

        // Only the first frame named "tddetails" is examined.
        return;
    }
}
// Baidu Baijia
/// <summary>
/// Returns the HTML of the last div in <paramref name="nodelist"/> whose id is "page"
/// (compared after lower-casing). The scan never stops early.
/// </summary>
/// <param name="nodelist">Nodes to scan; every entry is cast to <c>Div</c>.</param>
/// <returns>The last matching div's <c>ToHtml()</c> output, or an empty string when nothing matches.</returns>
public string GetBaijiaBody(NodeList nodelist)
{
    string html = "";
    int position = 0;

    while (position < nodelist.Size())
    {
        Div block = (Div)nodelist[position];
        position++;

        string rawId = block.GetAttribute("id");
        // The class attribute is read here, but no rule below uses it.
        block.GetAttribute("class");

        string idLower = string.IsNullOrEmpty(rawId) ? "" : rawId.ToLower();
        if (idLower.Equals("page")) // Baidu Baijia article container
        {
            html = block.ToHtml();
        }
    }

    return html;
}
/// <summary>
/// Appends <c>_list</c> to <c>appElement</c>, then adds one clickable list item per sample
/// path. Each item stores its target (current window location + path) in an "href"
/// attribute and navigates the window there when clicked.
/// </summary>
/// <returns><c>appElement</c>, after the items have been appended.</returns>
private Element CreateAppElement()
{
    appElement.AppendChild(_list);

    string[] samplePaths =
    {
        "display-alert", "entry-listview", "shared-button", "button", "todo",
        "draw", "files", "dotmatrixclock", "editor", "monkeys",
        "refreshlistview", "searchbar", "slider", "switch-listview", "timepicker",
        "tipcalc", "weatherapp", "xuzzle", "webview", "picker"
    };

    foreach (string path in samplePaths)
    {
        Div item = new Div();
        item.ClassName = "list-group-item";
        item.Style.Cursor = "pointer";
        item.Style.FontWeight = "bold";
        item.Text = path;

        // Remember where this entry should navigate to.
        item.SetAttribute("href", item.Document.Window.Location + path);

        item.Click += (sender, args) =>
        {
            // Log the current location, then navigate to the stored target.
            Console.WriteLine(item.Document.Window.Location);
            item.Document.Window.Location = item.GetAttribute("href", "");
        };

        appElement.AppendChild(item);
    }

    return appElement;
}
/// <summary>
/// Pages through Sogou's WeChat article search for <paramref name="key"/> and collects one
/// row per hit (Title, Link, Author, Cdate, Day, Source) into the table created by
/// <c>GetStruct</c>.
/// </summary>
/// <param name="key">Search keyword; it is URL-encoded into the query.</param>
/// <param name="count">Upper bound used for paging: pages are requested while <c>page * 10 &lt; count</c>.</param>
/// <param name="time">Cutoff compared against the time this call started (see the review note below).</param>
/// <returns>The filled table, or <c>null</c> when <paramref name="key"/> is null or empty.</returns>
public DataTable GetWXBySogou(string key, int count, DateTime time)
{
    if (string.IsNullOrEmpty(key))
    {
        return null;
    }

    const string searchUrlTemplate = "http://weixin.sogou.com/weixin?type=2&query={0}&fr=sgsearch&ie=utf8&_ast=1433216256&_asf=null&w=01059900&cid=null&page={1}";

    DataTable table = GetStruct(key);
    DateTime startedAt = DateTime.Now;

    int pageIndex = 0;
    while (pageIndex * 10 < count)
    {
        // Sogou pages are 1-based.
        string pageUrl = string.Format(searchUrlTemplate, HttpUtility.UrlEncode(key), pageIndex + 1);
        string pageHtml = ieHelp.GetHtmlFromSite(pageUrl);
        HtmlPage page = htmlHelp.GetPage(pageHtml);

        // A "current page" check (GetCurPage) existed here once but is disabled.
        Winista.Text.HtmlParser.Util.NodeList hits =
            page.Body.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "wx-rb wx-rb3"), true);
        if (hits.Count <= 0)
        {
            break;
        }

        // Convert every hit into a row of the result table.
        for (int hitIndex = 0; hitIndex < hits.Count; hitIndex++)
        {
            Winista.Text.HtmlParser.Util.NodeList hitChildren = hits[hitIndex].Children;
            DataRow row = table.NewRow();

            // Title link: an <a> whose parent is an <h4>.
            NodeFilter titleFilter = new AndFilter(new HasParentFilter(new TagNameFilter("h4")), new TagNameFilter("a"));
            ATag titleLink = (ATag)hitChildren.ExtractAllNodesThatMatch(titleFilter, true)[0];
            row["Title"] = titleLink.StringText;
            row["Link"] = titleLink.Link;

            // Author link: <a id="weixin_account">, the name is in its title attribute.
            NodeFilter authorFilter = new AndFilter(new HasAttributeFilter("id", "weixin_account"), new TagNameFilter("a"));
            ATag authorLink = (ATag)hitChildren.ExtractAllNodesThatMatch(authorFilter, true)[0];
            row["Author"] = authorLink.GetAttribute("title");

            // Timestamp: the "t" attribute of the element with class "s-p".
            NodeFilter timeFilter = new HasAttributeFilter("class", "s-p");
            Div timeBlock = (Div)hitChildren.ExtractAllNodesThatMatch(timeFilter, true)[0];
            string unixtime = timeBlock.GetAttribute("t");
            row["Cdate"] = GetDateTime(unixtime);
            row["Day"] = GetDateTime(unixtime).Day;

            row["Source"] = "微信";
            table.Rows.Add(row);
        }

        // NOTE(review): startedAt is never updated inside the loop, so this test has the same
        // outcome on every page. It looks like it was meant to compare article dates against
        // the cutoff - confirm the intent before changing it.
        if (startedAt < time)
        {
            break;
        }

        pageIndex++;
    }

    return table;
}
/// <summary>
/// Step binding: reads the attribute called <paramref name="name"/> from
/// <c>CurrentComponent</c> and asserts, via <c>Assert.NotEmpty</c>, that its value is not empty.
/// </summary>
/// <param name="name">Attribute name captured from the step text.</param>
[Then(@"Check no null attribute (.*)")]
public void CheckAttributeNotNull(string name)
{
    var attributeValue = CurrentComponent.GetAttribute(name);
    Assert.NotEmpty(attributeValue);
}
/// <summary>
/// Posts a fixed "findPlaceByCQSCD" query to the OA JSON endpoint, then downloads
/// <c>SiteUrl</c> and walks the "gridtable" tables under the div with id "myTab0_Content0",
/// reading each section heading (h5) and the id of every "allcd" div.
/// </summary>
/// <param name="crawlAll">Not used by this implementation.</param>
/// <returns>
/// The result list. NOTE(review): nothing is added to it yet - the values read while
/// walking the tables are not turned into <c>MeetInfo</c> items, so the list is always empty.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<MeetInfo>();

    // URL-encoded JSON argument payload for the place query.
    string arguments = "%5B%7B%22jykssj%22%3A%222014-12-10+00%3A00%3A00%22%2C%22jyjssj%22%3A%222014-12-30+23%3A59%3A59%22%2C%22bmkey%22%3A%22%22%2C%22cdzj%22%3A%22CD0036%22%2C%22cdmc%22%3A%22%E7%AC%AC01%E8%AF%84%E6%A0%87%E5%AE%A4%22%2C%22rnrs%22%3A%229%22%2C%22zt%22%3A%2202%22%2C%22cdmj%22%3A%22105%22%2C%22tyy%22%3A%2201%22%2C%22dzbb%22%3A%2201%22%2C%22mkf%22%3A%2201%22%2C%22spcqsb%22%3A%2202%22%2C%22dlkt%22%3A%2202%22%2C%22sfsydzkpb%22%3A%2201%22%2C%22dn%22%3A%2210%22%2C%22ssbm%22%3A%22%E4%B8%AD%E5%BF%83%E6%9C%AC%E9%83%A8%22%2C%22cdlx%22%3A%22%E8%AF%84%E6%A0%87%E5%AE%A4%22%2C%22gm%22%3A%22%E5%A4%A7%22%2C%22dd%22%3A%22%E5%9B%9B%E6%A5%BC%22%2C%22zw%22%3A%22null%22%2C%22zjzy%22%3A%22null%22%2C%22zjdn%22%3A%22null%22%2C%22dldn%22%3A%22null%22%2C%22jhyzy%22%3A%22null%22%2C%22dyj%22%3A%2202%22%2C%22xsq%22%3A%2202%22%2C%22ipdjj%22%3A%2202%22%2C%22znjhp%22%3A%2202%22%2C%22yzj%22%3A%2202%22%2C%22sfxsmx%22%3A%2202%22%7D%5D";
    const string method = "findPlaceByCQSCD";

    string decodedArguments = string.Empty;
    try
    {
        decodedArguments = System.Web.HttpUtility.UrlDecode(arguments);
    }
    catch (Exception)
    {
        // Best effort, as before: if decoding fails the request is sent with empty arguments.
    }

    NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
        new string[] { "service", "arguments", "method" },
        new string[] { "PlaceLentManagerBS", decodedArguments, method });

    // The response of this request is not used; the call itself is kept because it was
    // always issued before the page download.
    this.ToolWebSite.GetHtmlByUrl("http://oa.gzzb.gd.cn/gcpbcOA/json/", nvc, Encoding.UTF8);

    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
    }
    catch
    {
        // Page could not be downloaded: return the (empty) list.
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(
        new AndFilter(
            new HasParentFilter(
                new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "myTab0_Content0")),
                true),
            new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "gridtable"))));
    if (pageNode == null || pageNode.Count <= 0)
    {
        return list;
    }

    // "as" yields null when the node is not a table; guard instead of dereferencing blindly.
    TableTag table = pageNode[0] as TableTag;
    if (table == null)
    {
        return list;
    }

    foreach (TableRow row in table.Rows)
    {
        parser = new Parser(new Lexer(row.ToHtml()));
        NodeList tableNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
        parser.Reset();
        NodeList hNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("h5"));

        // A usable row has both a heading and a nested table.
        if (hNode == null || hNode.Count <= 0 || tableNode == null || tableNode.Count <= 0)
        {
            continue;
        }

        // NOTE(review): address and url below are read but not consumed yet (see <returns>).
        string address = hNode[0].ToNodePlainString();

        TableTag cTable = tableNode[0] as TableTag;
        if (cTable == null)
        {
            continue;
        }

        foreach (TableRow cRow in cTable.Rows)
        {
            foreach (TableColumn col in cRow.Columns)
            {
                parser = new Parser(new Lexer(col.ToHtml()));
                NodeList divNode = parser.ExtractAllNodesThatMatch(
                    new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "allcd")));
                if (divNode == null || divNode.Count <= 0)
                {
                    continue;
                }

                Div div = divNode[0] as Div;
                if (div == null)
                {
                    continue;
                }

                string url = div.GetAttribute("id");
            }
        }
    }

    return list;
}