예제 #1
0
 /// <summary>
 /// Downloads the article at <paramref name="path"/> (relative to http://www.huachuan.gov.cn),
 /// extracts its author and body HTML, and writes both to the t_spider_bslc row
 /// whose path column equals <paramref name="path"/>.
 /// </summary>
 /// <param name="path">Site-relative article path. Values starting with "http" are skipped.</param>
 public static void getContent(string path)
 {
     // Paths that already start with "http" are not prefixed with the site root and are not processed.
     if (path.StartsWith("http"))
     {
         return;
     }

     string        contentPage = Util.getHtmlStr("http://www.huachuan.gov.cn" + path, Encoding.Default);
     IHtmlDocument source      = new JumonyParser().Parse(contentPage);
     string        author      = source.FindSingle(".maintittwo").FindLast("span").FindSingle("a").InnerText();
     string        content     = source.FindSingle(".mainnews").InnerHtml();

     // The scraped text is embedded in a quoted SQL literal, so quotes and backslashes must be
     // escaped; otherwise any apostrophe in the page breaks (or hijacks) the statement.
     // NOTE(review): assumes MySQL's default backslash-escape mode. If DbHelperMySQL offers a
     // parameterized overload, prefer it over manual escaping.
     System.Func<string, string> escapeSql = s => (s ?? "").Replace("\\", "\\\\").Replace("'", "''");

     string sql = string.Format(
         "update t_spider_bslc t set t.author='{0}',t.content='{1}' where t.path='{2}'",
         escapeSql(author), escapeSql(content), escapeSql(path));
     DbHelperMySQL.ExecuteSql(sql);
 }
예제 #2
0
        /// <summary>
        /// Requests the detail page for <paramref name="id"/>, extracts the inner HTML of the
        /// ".zwnr" element and stores it in the content column of t_spider_zwgk for that id.
        /// The work is started asynchronously; the method returns before it completes and
        /// failures are written to the debug output instead of being thrown.
        /// </summary>
        /// <param name="id">Record id, used both in the request URL and in the SQL where-clause.</param>
        public static void getContent(string id)
        {
            HttpClient httpClient = new HttpClient();

            // Quotes and backslashes in scraped HTML would otherwise break the quoted SQL literal.
            // NOTE(review): assumes MySQL's default backslash-escape mode. Prefer a parameterized
            // overload if DbHelperMySQL offers one. `id` is still inserted verbatim (unquoted).
            Func<string, string> escapeSql = s => (s ?? "").Replace("\\", "\\\\").Replace("'", "''");

            try
            {
                httpClient
                .PostAsync("http://hd.huachuan.gov.cn/aspx/gkml_view.aspx?id=" + id, null)
                .ContinueWith((postTask) =>
                {
                    HttpResponseMessage response = null;
                    try
                    {
                        // Throws if the request failed. It must be caught here because the outer
                        // try/catch has already been left by the time this continuation runs.
                        response = postTask.Result;
                        response.Content.ReadAsStringAsync().ContinueWith((readTask) =>
                        {
                            try
                            {
                                IHtmlDocument source = new JumonyParser().Parse(readTask.Result);
                                string content       = source.FindSingle(".zwnr").InnerHtml();
                                string sql           = string.Format("update t_spider_zwgk t set t.content='{0}' where t.id={1}", escapeSql(content), id);
                                DbHelperMySQL.ExecuteSql(sql);
                            }
                            catch (Exception e)
                            {
                                Debug.WriteLine("----->【" + id + "】内容存储异常<-----:" + e);
                            }
                            finally
                            {
                                // Last step of the pipeline: release the network resources.
                                response.Dispose();
                                httpClient.Dispose();
                            }
                        });
                    }
                    catch (Exception e)
                    {
                        Debug.WriteLine("----->【" + id + "】内容存储异常<-----:" + e);
                        if (response != null)
                        {
                            response.Dispose();
                        }
                        httpClient.Dispose();
                    }
                });
            }
            catch (Exception e)
            {
                // Only reached when starting the request fails synchronously.
                httpClient.Dispose();
                Debug.WriteLine("----->【" + id + "】内容存储异常<-----:" + e);
            }
        }
예제 #3
0
        // Verifies that the various attribute syntaxes in SpecificationTest2.html are parsed correctly.
        // Assertions use the (expected, actual) argument order so failure messages label values correctly.
        public void SpecificationTest2()
        {
            var document = new JumonyParser().LoadDocument(Path.Combine(Environment.CurrentDirectory, "SpecificationTest2.html"));
            var element  = document.FindSingle("A");

            Assert.AreEqual("abc", element.Attribute("a").AttributeValue); // double-quoted value
            Assert.AreEqual("123", element.Attribute("b").AttributeValue); // single-quoted value
            Assert.AreEqual("d=x", element.Attribute("c").AttributeValue); // value containing '='
            Assert.IsNull(element.Attribute("d"));                         // whitespace before the attribute value
            Assert.IsNull(element.Attribute("e").AttributeValue);          // no equals sign
            Assert.AreEqual("", element.Attribute("f").AttributeValue);    // attribute at the end of the tag

            element = document.FindSingle("B");
            Assert.AreEqual("abc", element.Attribute("a").AttributeValue); // space before the equals sign
            Assert.AreEqual("", element.Attribute("b").AttributeValue);    // empty attribute value
            Assert.IsNull(element.Attribute("c").AttributeValue);          // valueless attribute at the end of the tag
        }
예제 #4
0
        /// <summary>
        /// Posts the pager form for one list page, parses the returned HTML and inserts every
        /// item that is not yet in t_spider_zwgk, then calls getContent for each inserted id.
        /// The work is started asynchronously; failures are written to the debug output.
        /// </summary>
        /// <param name="cookie">Not used by this method; kept so existing callers still compile.</param>
        /// <param name="viewstate">Value sent as the __VIEWSTATE form field.</param>
        /// <param name="page">Page number sent as __EVENTARGUMENT.</param>
        public static void getByPage(string cookie, string viewstate, int page)
        {
            HttpClient  httpClient  = new HttpClient();
            HttpContent postContent = new FormUrlEncodedContent(new Dictionary <string, string>()
            {
                { "__VIEWSTATE", viewstate },
                { "__VIEWSTATEGENERATOR", "7BE8FDE8" },
                { "__EVENTTARGET", "AspNetPager1" },
                { "__EVENTARGUMENT", page.ToString() },
                { "_keywords", "" },
                { "AspNetPager1_input", "1" },
            });

            // Quotes and backslashes in scraped text would otherwise break the quoted SQL literals.
            // NOTE(review): assumes MySQL's default backslash-escape mode. Prefer a parameterized
            // overload if DbHelperMySQL offers one. `id` is still inserted verbatim (unquoted).
            Func<string, string> escapeSql = s => (s ?? "").Replace("\\", "\\\\").Replace("'", "''");

            httpClient
            .PostAsync("http://hd.huachuan.gov.cn/aspx/gkml_list.aspx", postContent)
            .ContinueWith((postTask) =>
            {
                HttpResponseMessage response = null;
                try
                {
                    // Throws if the request failed; caught below so the failure is logged
                    // instead of becoming an unobserved task exception.
                    response = postTask.Result;
                    response.Content.ReadAsStringAsync().ContinueWith((readTask) =>
                    {
                        try
                        {
                            IHtmlDocument source = new JumonyParser().Parse(readTask.Result);
                            var itemCount        = source.Find(".listbox").Count();
                            for (int i = 1; i <= itemCount; i++)
                            {
                                try
                                {
                                    // Element ids on the page are 1-based: #four1/#con_four_1, #four2/...
                                    var    panel  = source.Find("#con_four_" + i);
                                    string id     = source.FindSingle("#four" + i).Attribute("href").Value().Split('=')[1];
                                    string author = panel.Find(".li1").Last().InnerText().Replace("发布机构:", "");
                                    string time   = panel.Find(".li2").Last().InnerText().Replace("发文日期:", "");
                                    string title  = panel.Find(".infoname").First().InnerText().Replace("名称:", "");

                                    // If the item already exists the list has not changed since the
                                    // last crawl, so stop processing this page entirely.
                                    // (Use `continue` instead of `return` to merely skip the item.)
                                    string sql = string.Format("select count(*) from t_spider_zwgk t where t.id={0}", id);
                                    int count  = Convert.ToInt32(DbHelperMySQL.GetSingle(sql));
                                    if (count > 0)
                                    {
                                        return;
                                    }

                                    // Not stored yet: insert it. The title keeps its existing
                                    // quote substitution so stored titles keep the same format.
                                    sql   = string.Format("insert into t_spider_zwgk(id,title,time,author) values({0},'{1}','{2}','{3}')", id, escapeSql(title.Replace('\'', '"')), escapeSql(time), escapeSql(author));
                                    count = DbHelperMySQL.ExecuteSql(sql);
                                    if (count == 1)
                                    {
                                        getContent(id);
                                    }
                                }
                                catch (Exception e)
                                {
                                    Debug.WriteLine("----->【" + page + "." + i + "】新闻创建异常<-----:" + e);
                                }
                            }
                        }
                        catch (Exception e)
                        {
                            // Reading or parsing the response failed for the whole page.
                            Debug.WriteLine("----->【" + page + "】列表解析异常<-----:" + e);
                        }
                        finally
                        {
                            // Last step of the pipeline: release the network resources.
                            response.Dispose();
                            postContent.Dispose();
                            httpClient.Dispose();
                        }
                    });
                }
                catch (Exception e)
                {
                    Debug.WriteLine("----->【" + page + "】列表请求异常<-----:" + e);
                    if (response != null)
                    {
                        response.Dispose();
                    }
                    postContent.Dispose();
                    httpClient.Dispose();
                }
            });
        }
예제 #5
0
        // Checks attribute and link parsing for SpecificationTest8.html: the <div> must expose exactly
        // one attribute and "div a" must match two links with the expected text.
        // Assertions use the (expected, actual) argument order so failure messages label values correctly.
        public void SpecificationTest8()
        {
            var document = new JumonyParser().LoadDocument(Path.Combine(Environment.CurrentDirectory, "SpecificationTest8.html"));

            Assert.AreEqual(1, document.FindSingle("div").Attributes().Count(), "错误的解析了非法的属性");
            var links = document.Find("div a").ToArray();

            Assert.AreEqual(2, links.Length, "错误的解析了不属于属性值的引用内容");
            Assert.AreEqual("Test1", links[0].InnerText(), "错误的解析了不属于属性值的引用内容");
            Assert.AreEqual(" \"Test2", links[1].InnerText(), "错误的解析了不属于属性值的引用内容");
        }
예제 #6
0
        // Verifies that an isolated '<' in SpecificationTest1.html is parsed correctly.
        // Assertions use the (expected, actual) argument order so failure messages label values correctly.
        public void SpecificationTest1()
        {
            var document = new JumonyParser().LoadDocument(Path.Combine(Environment.CurrentDirectory, "SpecificationTest1.html"));
            var element  = document.FindSingle("a");                         // exactly one <a> element must be found

            Assert.AreEqual("abc", element.InnerHtml());                     // its content is "abc"
            Assert.AreEqual(1, element.Attributes().Count());                // it has exactly one attribute
            Assert.AreEqual("abc", element.Attribute("abc").AttributeValue); // whose value is "abc"
            var textNode = document.Nodes().ElementAt(0) as IHtmlTextNode;

            Assert.IsNotNull(textNode);
            Assert.IsTrue(textNode.HtmlText.Contains('<')); // the first text node holds the isolated '<'
        }
예제 #7
0
        /// <summary>
        /// Entry point of the crawl: requests the list page, reads the cookie value, the
        /// __VIEWSTATE field and the total page count from the response, then calls
        /// getByPage for every page from 1 to the total.
        /// The work is started asynchronously; failures are written to the debug output.
        /// </summary>
        public static void start()
        {
            HttpClient httpClient = new HttpClient();

            httpClient
            .PostAsync("http://hd.huachuan.gov.cn/aspx/gkml_list.aspx", null)
            .ContinueWith((postTask) =>
            {
                HttpResponseMessage response = null;
                try
                {
                    // Throws if the request failed or the Set-Cookie header is missing or malformed;
                    // caught below so the failure is logged instead of being silently lost.
                    response = postTask.Result;
                    string cookie = response.Headers.GetValues("Set-Cookie").FirstOrDefault().Split(';')[0].Split('=')[1];
                    Debug.WriteLine("----->响应cookie<-----:" + cookie);
                    response.Content.ReadAsStringAsync().ContinueWith((readTask) =>
                    {
                        try
                        {
                            IHtmlDocument source = new JumonyParser().Parse(readTask.Result);
                            string viewState     = source.FindSingle("input[name=__VIEWSTATE]").Attribute("value").Value();
                            // The text of the last <option> is parsed as the number of pages.
                            int totalPage        = int.Parse(source.FindLast("option").InnerText());
                            Debug.WriteLine("----->数据总页数<-----:" + totalPage);
                            for (int i = 1; i <= totalPage; i++)
                            {
                                getByPage(cookie, viewState, i);
                            }
                        }
                        catch (Exception e)
                        {
                            Debug.WriteLine("----->列表首页解析异常<-----:" + e);
                        }
                        finally
                        {
                            // Last step of the pipeline: release the network resources.
                            response.Dispose();
                            httpClient.Dispose();
                        }
                    });
                }
                catch (Exception e)
                {
                    Debug.WriteLine("----->列表首页请求异常<-----:" + e);
                    if (response != null)
                    {
                        response.Dispose();
                    }
                    httpClient.Dispose();
                }
            });
        }
예제 #8
0
        /// <summary>
        /// Click handler: reads .\config.json, and for every configured entry starts a task that
        /// loads the entry's URL, extracts up to five values with the configured selectors,
        /// stores the result through DbMapper and advances the progress bar.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event arguments (unused).</param>
        private void button1_Click(object sender, EventArgs e)
        {
            var db = new DbMapper();

            this.progressBar1.Value = 0;

            string config = File.ReadAllText(@".\config.json");
            var    json   = JsonConvert.DeserializeObject <List <Config> >(config);
            var    len    = json.Count;

            // Number of entries processed so far. Shared by all tasks, so it is only ever
            // changed through Interlocked.
            int completed = 0;

            log("开始获取数据!");

            for (Int32 i = 0; i < len; i++)
            {
                var it = json[i];
                var d  = new edata();
                d.name = it.name;
                d.url  = it.url;

                // NOTE(review): presumably AsynTask runs this delegate off the UI thread and
                // UpdateUI marshals back to it — confirm. `db` is shared by all tasks.
                AsynTask.Task(() => {
                    AsynTask.UpdateUI(() =>
                    {
                        log(d.name);
                        log(d.url);
                    });

                    var dom = new JumonyParser().LoadDocument(it.url);
                    if (!string.IsNullOrWhiteSpace(it.selector1))
                    {
                        d.value1 = dom.FindSingle(it.selector1).InnerText();
                    }
                    if (!string.IsNullOrWhiteSpace(it.selector2))
                    {
                        d.value2 = dom.FindSingle(it.selector2).InnerText();
                    }
                    if (!string.IsNullOrWhiteSpace(it.selector3))
                    {
                        d.value3 = dom.FindSingle(it.selector3).InnerText();
                    }
                    if (!string.IsNullOrWhiteSpace(it.selector4))
                    {
                        d.value4 = dom.FindSingle(it.selector4).InnerText();
                    }
                    if (!string.IsNullOrWhiteSpace(it.selector5))
                    {
                        // The fifth value stores the raw HTML, not the text.
                        d.value5 = dom.FindSingle(it.selector5).InnerHtml();
                    }

                    //DB
                    db.insert(d);

                    // The previous "i / len * 100" was integer division on the shared loop
                    // variable, so it never produced a meaningful percentage. Count finished
                    // entries instead and multiply before dividing.
                    int percent = System.Threading.Interlocked.Increment(ref completed) * 100 / len;
                    AsynTask.UpdateUI(() =>
                    {
                        // Tasks may report out of order; never move the bar backwards.
                        if (percent > this.progressBar1.Value)
                        {
                            this.progressBar1.Value = percent;
                        }
                    });
                })
                .OnSuccess(() =>
                {
                    log("处理完毕!");
                })
                .OnError(ex =>
                {
                    log("处理失败了!" + ex.Message);
                })
                .Start();
            }
        }