/// <summary>
/// Downloads the article page for <paramref name="path"/> from http://www.huachuan.gov.cn,
/// reads the author link text (last span under ".maintittwo") and the inner HTML of
/// ".mainnews", and writes both to the t_spider_bslc row whose path column equals
/// <paramref name="path"/>.
/// </summary>
/// <param name="path">
/// Path appended to the site root. Null or empty values, and values that already start
/// with "http" (which cannot be appended to the site root), are skipped.
/// </param>
public static void getContent(string path)
{
    if (string.IsNullOrEmpty(path) || path.StartsWith("http", StringComparison.Ordinal))
    {
        return;
    }

    string contentPage = Util.getHtmlStr("http://www.huachuan.gov.cn" + path, Encoding.Default);
    IHtmlDocument source = new JumonyParser().Parse(contentPage);
    string author = source.FindSingle(".maintittwo").FindLast("span").FindSingle("a").InnerText();
    string content = source.FindSingle(".mainnews").InnerHtml();

    // The values are scraped HTML and end up inside single-quoted SQL literals, so quotes and
    // backslashes must be escaped or the statement breaks (or is rewritten) on ordinary markup.
    // NOTE(review): if DbHelperMySQL offers a parameterized ExecuteSql overload, prefer it.
    Func<string, string> escape = value => (value ?? string.Empty).Replace("\\", "\\\\").Replace("'", "''");

    string sql = string.Format(
        "update t_spider_bslc t set t.author='{0}',t.content='{1}' where t.path='{2}'",
        escape(author),
        escape(content),
        escape(path));
    DbHelperMySQL.ExecuteSql(sql);
}
/// <summary>
/// Requests gkml_view.aspx for the given <paramref name="id"/>, extracts the inner HTML of
/// the ".zwnr" element from the response, and stores it in the content column of the
/// matching t_spider_zwgk row. The work runs in task continuations; this method returns
/// as soon as the request has been started.
/// </summary>
/// <param name="id">
/// Record id; appended to the request URL and formatted unquoted into the SQL WHERE clause.
/// </param>
public static void getContent(string id)
{
    HttpClient httpClient = new HttpClient();
    try
    {
        httpClient
            .PostAsync("http://hd.huachuan.gov.cn/aspx/gkml_view.aspx?id=" + id, null)
            .ContinueWith((postTask) =>
            {
                try
                {
                    // Result rethrows a failed request; it must be read inside a try block
                    // because the method-level catch only covers the synchronous PostAsync call.
                    HttpResponseMessage response = postTask.Result;
                    response.Content.ReadAsStringAsync().ContinueWith((readTask) =>
                    {
                        try
                        {
                            IHtmlDocument source = new JumonyParser().Parse(readTask.Result);
                            string content = source.FindSingle(".zwnr").InnerHtml();

                            // Scraped HTML goes into a single-quoted SQL literal: escape
                            // backslashes and quotes so ordinary markup cannot break the statement.
                            // NOTE(review): id is formatted unquoted; confirm callers only pass numeric ids.
                            string escaped = (content ?? string.Empty).Replace("\\", "\\\\").Replace("'", "''");
                            string sql = string.Format("update t_spider_zwgk t set t.content='{0}' where t.id={1}", escaped, id);
                            DbHelperMySQL.ExecuteSql(sql);
                        }
                        catch (Exception storeError)
                        {
                            Debug.WriteLine("----->【" + id + "】内容存储异常<-----:" + storeError);
                        }
                        finally
                        {
                            response.Dispose();
                            httpClient.Dispose();
                        }
                    });
                }
                catch (Exception requestError)
                {
                    Debug.WriteLine("----->【" + id + "】内容存储异常<-----:" + requestError);
                    httpClient.Dispose();
                }
            });
    }
    catch (Exception startError)
    {
        Debug.WriteLine("----->【" + id + "】内容存储异常<-----:" + startError);
        httpClient.Dispose();
    }
}
/// <summary>
/// Checks that the different attribute syntaxes in SpecificationTest2.html are parsed
/// into the expected attribute values.
/// </summary>
public void SpecificationTest2()
{
    var document = new JumonyParser().LoadDocument(Path.Combine(Environment.CurrentDirectory, "SpecificationTest2.html"));

    // Assertions take (expected, actual) so that failure messages read correctly.
    var element = document.FindSingle("A");
    Assert.AreEqual("abc", element.Attribute("a").AttributeValue); // double-quoted value
    Assert.AreEqual("123", element.Attribute("b").AttributeValue); // single-quoted value
    Assert.AreEqual("d=x", element.Attribute("c").AttributeValue);
    Assert.IsNull(element.Attribute("d")); // whitespace in front of the attribute value
    Assert.IsNull(element.Attribute("e").AttributeValue); // attribute without an equals sign
    Assert.AreEqual("", element.Attribute("f").AttributeValue); // attribute at the end of the tag

    element = document.FindSingle("B");
    Assert.AreEqual("abc", element.Attribute("a").AttributeValue); // whitespace in front of the equals sign
    Assert.AreEqual("", element.Attribute("b").AttributeValue); // empty attribute value
    Assert.IsNull(element.Attribute("c").AttributeValue); // valueless attribute at the end of the tag
}
/// <summary>
/// Posts the pager form for one page of gkml_list.aspx, parses the returned list and, for
/// each entry not yet present in t_spider_zwgk, inserts id/title/time/author and then
/// calls <c>getContent</c> for that id. Processing of the page stops at the first entry
/// that already exists in the table. The work runs in task continuations; this method
/// returns as soon as the request has been started.
/// </summary>
/// <param name="cookie">Accepted for the caller's convenience; not read by this method.</param>
/// <param name="viewstate">Value posted as the __VIEWSTATE form field.</param>
/// <param name="page">Page number, posted as the __EVENTARGUMENT form field.</param>
public static void getByPage(string cookie, string viewstate, int page)
{
    HttpClient httpClient = new HttpClient();
    HttpContent postContent = new FormUrlEncodedContent(new Dictionary<string, string>()
    {
        { "__VIEWSTATE", viewstate },
        { "__VIEWSTATEGENERATOR", "7BE8FDE8" },
        { "__EVENTTARGET", "AspNetPager1" },
        { "__EVENTARGUMENT", page.ToString() },
        { "_keywords", "" },
        { "AspNetPager1_input", "1" },
    });

    // Scraped text is placed inside single-quoted SQL literals. Keep the existing convention of
    // turning ' into " and additionally double backslashes so a trailing \ cannot eat the quote.
    Func<string, string> sanitize = value => (value ?? string.Empty).Replace("\\", "\\\\").Replace('\'', '"');

    httpClient
        .PostAsync("http://hd.huachuan.gov.cn/aspx/gkml_list.aspx", postContent)
        .ContinueWith((postTask) =>
        {
            try
            {
                // Result rethrows a failed request; without this try the failure is never observed.
                HttpResponseMessage response = postTask.Result;
                response.Content.ReadAsStringAsync().ContinueWith((readTask) =>
                {
                    try
                    {
                        IHtmlDocument source = new JumonyParser().Parse(readTask.Result);
                        var itemCount = source.Find(".listbox").Count();
                        for (int i = 1; i <= itemCount; i++)
                        {
                            try
                            {
                                string id = source.FindSingle("#four" + i).Attribute("href").Value().Split('=')[1];
                                string author = source.Find("#con_four_" + i).Find(".li1").Last().InnerText().Replace("发布机构:", "");
                                string time = source.Find("#con_four_" + i).Find(".li2").Last().InnerText().Replace("发文日期:", "");
                                string title = source.Find("#con_four_" + i).Find(".infoname").First().InnerText().Replace("名称:", "");

                                // If the entry already exists the list has not changed since the
                                // last run, so the rest of this page is skipped (return, not continue).
                                string sql = string.Format("select count(*) from t_spider_zwgk t where t.id={0}", id);
                                int count = Convert.ToInt32(DbHelperMySQL.GetSingle(sql));
                                if (count > 0)
                                {
                                    return;
                                }

                                // Not stored yet: insert the row, then fetch its content.
                                sql = string.Format(
                                    "insert into t_spider_zwgk(id,title,time,author) values({0},'{1}','{2}','{3}')",
                                    id,
                                    sanitize(title),
                                    sanitize(time),
                                    sanitize(author));
                                count = DbHelperMySQL.ExecuteSql(sql);
                                if (count == 1)
                                {
                                    getContent(id);
                                }
                            }
                            catch (Exception itemError)
                            {
                                Debug.WriteLine("----->【" + page + "." + i + "】新闻创建异常<-----:" + itemError);
                            }
                        }
                    }
                    catch (Exception pageError)
                    {
                        Debug.WriteLine("----->【" + page + "】列表页解析异常<-----:" + pageError);
                    }
                    finally
                    {
                        response.Dispose();
                        httpClient.Dispose();
                    }
                });
            }
            catch (Exception requestError)
            {
                Debug.WriteLine("----->【" + page + "】列表页请求异常<-----:" + requestError);
                httpClient.Dispose();
            }
        });
}
/// <summary>
/// Checks SpecificationTest8.html: the div must expose exactly one attribute, and the two
/// links under it must keep quoted text that is not part of an attribute value.
/// </summary>
public void SpecificationTest8()
{
    var document = new JumonyParser().LoadDocument(Path.Combine(Environment.CurrentDirectory, "SpecificationTest8.html"));

    // Assertions take (expected, actual) so that failure messages read correctly.
    Assert.AreEqual(1, document.FindSingle("div").Attributes().Count(), "错误的解析了非法的属性");

    var links = document.Find("div a").ToArray();
    Assert.AreEqual(2, links.Length, "错误的解析了不属于属性值的引用内容");
    Assert.AreEqual("Test1", links[0].InnerText(), "错误的解析了不属于属性值的引用内容");
    Assert.AreEqual(" \"Test2", links[1].InnerText(), "错误的解析了不属于属性值的引用内容");
}
/// <summary>
/// Checks that an isolated '&lt;' in SpecificationTest1.html is parsed as text and does not
/// disturb the following &lt;a&gt; element.
/// </summary>
public void SpecificationTest1()
{
    var document = new JumonyParser().LoadDocument(Path.Combine(Environment.CurrentDirectory, "SpecificationTest1.html"));

    // Assertions take (expected, actual) so that failure messages read correctly.
    var element = document.FindSingle("a"); // an <a> element must be found
    Assert.AreEqual("abc", element.InnerHtml()); // and its content is "abc"
    Assert.AreEqual(1, element.Attributes().Count()); // it has exactly one attribute
    Assert.AreEqual("abc", element.Attribute("abc").AttributeValue); // whose value is "abc"

    var textNode = document.Nodes().ElementAt(0) as IHtmlTextNode;
    Assert.IsNotNull(textNode);
    Assert.IsTrue(textNode.HtmlText.Contains('<')); // the first text node holds the isolated '<'
}
/// <summary>
/// Entry point of the crawl: posts to gkml_list.aspx, reads the first Set-Cookie value and
/// the __VIEWSTATE field from the response, parses the total page count from the text of
/// the last "option" element, and calls <c>getByPage</c> for every page from 1 to that count.
/// The work runs in task continuations; this method returns as soon as the request has
/// been started.
/// </summary>
public static void start()
{
    HttpClient httpClient = new HttpClient();
    httpClient
        .PostAsync("http://hd.huachuan.gov.cn/aspx/gkml_list.aspx", null)
        .ContinueWith((postTask) =>
        {
            try
            {
                // Result rethrows a failed request; without this try the failure is never observed.
                HttpResponseMessage response = postTask.Result;

                // Take the value of the first cookie ("name=value; attributes...").
                // TryGetValues avoids the exception GetValues throws when the header is absent.
                string cookie = string.Empty;
                IEnumerable<string> setCookieValues;
                if (response.Headers.TryGetValues("Set-Cookie", out setCookieValues))
                {
                    string firstCookie = setCookieValues.FirstOrDefault() ?? string.Empty;
                    string nameValue = firstCookie.Split(';')[0];
                    int separator = nameValue.IndexOf('=');
                    if (separator >= 0)
                    {
                        // Everything after the first '=' so values containing '=' stay intact.
                        cookie = nameValue.Substring(separator + 1);
                    }
                }
                Debug.WriteLine("----->响应cookie<-----:" + cookie);

                response.Content.ReadAsStringAsync().ContinueWith((readTask) =>
                {
                    try
                    {
                        IHtmlDocument source = new JumonyParser().Parse(readTask.Result);
                        string viewState = source.FindSingle("input[name=__VIEWSTATE]").Attribute("value").Value();
                        int totalPage = int.Parse(source.FindLast("option").InnerText());
                        Debug.WriteLine("----->数据总页数<-----:" + totalPage);
                        for (int i = 1; i <= totalPage; i++)
                        {
                            getByPage(cookie, viewState, i);
                        }
                    }
                    catch (Exception parseError)
                    {
                        Debug.WriteLine("----->列表首页解析异常<-----:" + parseError);
                    }
                    finally
                    {
                        response.Dispose();
                        httpClient.Dispose();
                    }
                });
            }
            catch (Exception requestError)
            {
                Debug.WriteLine("----->列表首页请求异常<-----:" + requestError);
                httpClient.Dispose();
            }
        });
}
/// <summary>
/// Click handler: reads .\config.json as a list of <c>Config</c> entries and, for each entry,
/// starts an <c>AsynTask</c> that loads the entry's url, extracts up to five values with the
/// configured selectors (selector5 as inner HTML, the others as inner text), inserts the
/// result through <c>DbMapper</c>, and advances the progress bar.
/// </summary>
/// <param name="sender">Event source; not used.</param>
/// <param name="e">Event data; not used.</param>
private void button1_Click(object sender, EventArgs e)
{
    var db = new DbMapper();
    this.progressBar1.Value = 0;
    string config = File.ReadAllText(@".\config.json");
    var json = JsonConvert.DeserializeObject<List<Config>>(config);
    // Treat a config that deserializes to null like an empty list instead of throwing.
    var len = json == null ? 0 : json.Count;
    log("开始获取数据!");

    // Number of finished entries. Progress is derived from this counter rather than from the
    // loop index: the loop variable is shared by all lambdas and has usually reached len by the
    // time a task runs, and "i / len * 100" is integer division, which is 0 for every i < len.
    int finished = 0;

    for (Int32 i = 0; i < len; i++)
    {
        var it = json[i];
        var d = new edata();
        d.name = it.name;
        d.url = it.url;
        AsynTask.Task(() =>
        {
            AsynTask.UpdateUI(() =>
            {
                log(d.name);
                log(d.url);
            });

            var dom = new JumonyParser().LoadDocument(it.url);
            if (!string.IsNullOrWhiteSpace(it.selector1))
            {
                d.value1 = dom.FindSingle(it.selector1).InnerText();
            }
            if (!string.IsNullOrWhiteSpace(it.selector2))
            {
                d.value2 = dom.FindSingle(it.selector2).InnerText();
            }
            if (!string.IsNullOrWhiteSpace(it.selector3))
            {
                d.value3 = dom.FindSingle(it.selector3).InnerText();
            }
            if (!string.IsNullOrWhiteSpace(it.selector4))
            {
                d.value4 = dom.FindSingle(it.selector4).InnerText();
            }
            if (!string.IsNullOrWhiteSpace(it.selector5))
            {
                d.value5 = dom.FindSingle(it.selector5).InnerHtml();
            }

            // Persist the extracted values.
            db.insert(d);

            // Multiply before dividing so the percentage is not truncated to 0.
            int done = System.Threading.Interlocked.Increment(ref finished);
            AsynTask.UpdateUI(() =>
            {
                this.progressBar1.Value = done * 100 / len;
            });
        })
        .OnSuccess(() =>
        {
            log("处理完毕!");
        })
        .OnError(ex =>
        {
            log("处理失败了!" + ex.Message);
        })
        .Start();
    }
}