예제 #1
0
파일: crawler.cs 프로젝트: hiepdh/crawler
        //kế toán thiên ưng
        /// <summary>
        /// Crawls the article listing that starts at <c>pageInfo.FirstPageUrl</c>, loads every linked
        /// article and passes the articles that have content to <c>Post2Site_Alo</c>, one batch per
        /// listing page. Posts are tagged with site id "http://ketoanthienung.org".
        /// </summary>
        /// <remarks>
        /// Best-effort crawl: an exception raised while processing a listing page or a single article
        /// is written to the console and the crawl moves on to the next item.
        /// </remarks>
        public void start_kttu()
        {
            // Category id stamped on every post and passed to Post2Site_Alo.
            // NOTE(review): originally annotated as "social/health/unemployment insurance contributions";
            // the same annotation appears next to different ids elsewhere in this file - confirm.
            const string categoryId = "14";

            pageInfo.MaxPage = GetMaxPage(pageInfo.FirstPageUrl);
            for (int i = 0; i < pageInfo.MaxPage; i++)
            {
                try
                {
                    // The first page has its own URL; later pages are built from the PageURL template with page number i + 1.
                    docPage = (i == 0)
                        ? web.Load(pageInfo.FirstPageUrl)
                        : web.Load(String.Format(pageInfo.PageURL, i + 1));

                    // Fresh batch for this listing page.
                    pageInfo.PostList = new List<postInfo>();

                    // SelectNodes returns null (not an empty collection) when nothing matches.
                    HtmlNodeCollection nodes = docPage.DocumentNode.SelectNodes("//*[@id='nc']//a[@class='name']");
                    if (nodes != null)
                    {
                        foreach (HtmlNode node in nodes)
                        {
                            try
                            {
                                postInfo post = BuildKttuPost(node, categoryId);
                                if (post != null)
                                {
                                    pageInfo.PostList.Add(post);
                                }
                            }
                            catch (Exception ex)
                            {
                                // A single broken article must not discard the posts already collected for this page.
                                Console.WriteLine(ex.ToString());
                            }
                        }
                    }

                    Console.WriteLine(String.Format("{0} - Page {1} - {2}", DateTime.Now.ToString("dd/MM/yyyy HH:mm:ss"), i + 1, pageInfo.PageURL));

                    if (pageInfo.PostList.Count > 0)
                    {
                        Post2Site_Alo(pageInfo.PostList, categoryId);
                    }
                }
                catch (Exception ex)
                {
                    Console.WriteLine(ex.ToString());
                }
            }
        }

        // Builds the post for one listing link. Returns null when the link has no usable href
        // or the article page has no non-empty "details" block.
        private postInfo BuildKttuPost(HtmlNode linkNode, string categoryId)
        {
            if (linkNode.Attributes["href"] == null || String.IsNullOrEmpty(linkNode.Attributes["href"].Value))
            {
                return null;
            }

            postInfo post = new postInfo();
            post.Title        = linkNode.InnerText.Trim();
            post.Url          = linkNode.Attributes["href"].Value;
            post.SiteId       = "http://ketoanthienung.org";
            post.CategoryId   = categoryId;
            post.PostId       = "0"; // no per-article id is extracted for this site
            post.Content      = "";
            post.ContentReply = "";

            // Load the article page; the document is kept in the docPageDetail field as before.
            docPageDetail = web.Load(post.Url);

            HtmlNode details = docPageDetail.DocumentNode.SelectSingleNode("//div[@id='nd']//div[@class='details']");
            if (details != null)
            {
                post.Content = details.InnerText;
            }

            // Articles without content are not posted.
            return post.Content != "" ? post : null;
        }
예제 #2
0
파일: crawler.cs 프로젝트: hiepdh/crawler
        /// <summary>
        /// Crawls the search-result listing that starts at <c>pageInfo.FirstPageUrl</c>, loads every
        /// linked detail page from http://www.bhxhdanang.gov.vn/ and passes the posts that have
        /// content to <c>Post2Site_Alo</c>, one batch per listing page.
        /// </summary>
        /// <remarks>
        /// Best-effort crawl: an exception raised while processing a listing page or a single row is
        /// written to the console and the crawl moves on to the next item.
        /// </remarks>
        internal void start_bhdn()
        {
            // Category id stamped on every post and passed to Post2Site_Alo.
            // NOTE(review): originally annotated as "social/health/unemployment insurance contributions";
            // the same annotation appears next to different ids elsewhere in this file - confirm.
            const string categoryId = "13";

            pageInfo.MaxPage = GetMaxPage(pageInfo.FirstPageUrl);
            for (int i = 0; i < pageInfo.MaxPage; i++)
            {
                try
                {
                    // The first page has its own URL; later pages are built from the PageURL template with page number i + 1.
                    docPage = (i == 0)
                        ? web.Load(pageInfo.FirstPageUrl)
                        : web.Load(String.Format(pageInfo.PageURL, i + 1));

                    // Fresh batch for this listing page.
                    pageInfo.PostList = new List<postInfo>();

                    // The original predicate tested 'sectiontableentry1' twice, so only one of the two
                    // alternating row classes was matched.
                    // NOTE(review): assumes the second row class is 'sectiontableentry2' - confirm against the live markup.
                    HtmlNodeCollection nodes = docPage.DocumentNode.SelectNodes("//*[@id='SearchResults']//tr[contains(@class, 'sectiontableentry1') or contains(@class, 'sectiontableentry2')]");
                    if (nodes != null)
                    {
                        foreach (HtmlNode node in nodes)
                        {
                            try
                            {
                                postInfo post = BuildBhdnPost(node, categoryId);
                                if (post != null)
                                {
                                    pageInfo.PostList.Add(post);
                                }
                            }
                            catch (Exception ex)
                            {
                                // A single broken row must not discard the posts already collected for this page.
                                Console.WriteLine(ex.ToString());
                            }
                        }
                    }

                    Console.WriteLine(String.Format("{0} - Page {1} - {2}", DateTime.Now.ToString("dd/MM/yyyy HH:mm:ss"), i + 1, pageInfo.PageURL));

                    if (pageInfo.PostList.Count > 0)
                    {
                        Post2Site_Alo(pageInfo.PostList, categoryId);
                    }
                }
                catch (Exception ex)
                {
                    Console.WriteLine(ex.ToString());
                }
            }
        }

        // Builds the post for one result row. Returns null when the row has no link with an href,
        // no bold title inside a link, or when the detail page has no non-empty content block.
        private postInfo BuildBhdnPost(HtmlNode row, string categoryId)
        {
            // SelectSingleNode returns null when nothing matches, so every lookup is checked before use.
            HtmlNode link = row.SelectSingleNode(".//a");
            if (link == null || link.Attributes["href"] == null || String.IsNullOrEmpty(link.Attributes["href"].Value))
            {
                return null;
            }

            HtmlNode titleNode = row.SelectSingleNode(".//a//b");
            if (titleNode == null)
            {
                return null;
            }

            postInfo post = new postInfo();
            post.Title      = titleNode.InnerText.Trim();
            post.Url        = "http://www.bhxhdanang.gov.vn/" + link.Attributes["href"].Value; // href is appended to the site root
            post.SiteId     = "bhxhdanang.gov.vn";
            post.CategoryId = categoryId;
            post.PostId     = "0"; // no per-article id is extracted for this site
            post.Content    = "";

            // Load the detail page; the document is kept in the docPageDetail field as before.
            docPageDetail = web.Load(post.Url);

            HtmlNode body = docPageDetail.DocumentNode.SelectSingleNode("//*[@id=\"body\"]/div[1]/div[2]");
            if (body != null)
            {
                post.Content = body.InnerHtml;
            }

            // Posts without content are not forwarded.
            return post.Content != "" ? post : null;
        }
예제 #3
0
파일: crawler.cs 프로젝트: hiepdh/crawler
        /// <summary>
        /// Crawls the listing that starts at <c>pageInfo.FirstPageUrl</c>. For every row with an
        /// <c>h3</c> heading it loads the linked detail page, extracts content, price, phone,
        /// province and district, and passes the page's posts to <c>Post2Site</c>.
        /// </summary>
        /// <remarks>
        /// Best-effort crawl: an exception raised while processing a listing page or a single row is
        /// written to the console and the crawl moves on to the next item. Detail fields whose node
        /// is missing are simply left unset on the post.
        /// </remarks>
        public void start()
        {
            pageInfo.MaxPage = GetMaxPage(pageInfo.FirstPageUrl);
            for (int i = 0; i < pageInfo.MaxPage; i++)
            {
                try
                {
                    // The first page has its own URL; later pages are built from the PageURL template with page number i + 1.
                    docPage = (i == 0)
                        ? web.Load(pageInfo.FirstPageUrl)
                        : web.Load(String.Format(pageInfo.PageURL, i + 1));

                    // Fresh batch for this listing page.
                    pageInfo.PostList = new List<postInfo>();

                    // SelectNodes returns null (not an empty collection) when nothing matches.
                    HtmlNodeCollection nodes = docPage.DocumentNode.SelectNodes("//div[@id='colbds']//div[@class='row']");
                    if (nodes != null)
                    {
                        foreach (HtmlNode node in nodes)
                        {
                            try
                            {
                                postInfo post = BuildListingPost(node);
                                if (post != null)
                                {
                                    pageInfo.PostList.Add(post);
                                }
                            }
                            catch (Exception ex)
                            {
                                // A single broken row must not discard the posts already collected for this page.
                                Console.WriteLine(ex.ToString());
                            }
                        }
                    }

                    Console.WriteLine(String.Format("{0} - Page {1} - {2}", DateTime.Now.ToString("dd/MM/yyyy HH:mm:ss"), i + 1, pageInfo.PageURL));
                    Post2Site(pageInfo.PostList);
                }
                catch (Exception ex)
                {
                    Console.WriteLine(ex.ToString());
                }
            }
        }

        // Builds the post for one listing row. Returns null when the row has no h3 heading
        // or the heading has no link with an href.
        private postInfo BuildListingPost(HtmlNode row)
        {
            HtmlNode heading = row.SelectSingleNode(".//h3");
            if (heading == null)
            {
                return null;
            }

            HtmlNode link = row.SelectSingleNode(".//h3/a");
            if (link == null || link.Attributes["href"] == null)
            {
                return null;
            }

            postInfo post = new postInfo();
            post.Title = heading.InnerText.Trim();
            post.Url   = link.Attributes["href"].Value;

            // Load the detail page; the document is kept in the docPageDetail field as before.
            docPageDetail = web.Load(post.Url);
            HtmlNode detailRoot = docPageDetail.DocumentNode;

            HtmlNode contentNode = detailRoot.SelectSingleNode(".//div[@id='colbds']//p");
            if (contentNode != null)
            {
                post.Content = contentNode.InnerText;
            }

            HtmlNode priceNode = detailRoot.SelectSingleNode("//*[@id='colbds']//div[2]//div[2]//span");
            if (priceNode != null)
            {
                post.Price = priceNode.InnerText;
            }

            HtmlNode phoneNode = detailRoot.SelectSingleNode(".//*[@id='colbds']//p[3]");
            if (phoneNode != null)
            {
                post.Phone = phoneNode.InnerText;
            }
            if (post.Phone != null && post.Phone.Contains(":"))
            {
                // Keep only the text after the first ':' (Contains guarantees at least two parts).
                post.Phone = post.Phone.Split(new string[] { ":" }, StringSplitOptions.None)[1];
            }

            // Province and district are read from the 3rd and 4th items of the <ol> inside #colbds;
            // a page without them used to abort the whole listing page.
            HtmlNode provinceNode = detailRoot.SelectSingleNode(".//*[@id='colbds']//ol//li[3]//a");
            if (provinceNode != null)
            {
                post.Province = provinceNode.InnerText;
            }

            HtmlNode districtNode = detailRoot.SelectSingleNode(".//*[@id='colbds']//ol//li[4]//a");
            if (districtNode != null)
            {
                post.District = districtNode.InnerText;
            }

            if (!String.IsNullOrEmpty(post.Phone))
            {
                post.Title = String.Format("{0} - LH: {1}", post.Title, post.Phone);
            }

            return post;
        }
예제 #4
0
파일: crawler.cs 프로젝트: hiepdh/crawler
        /// <summary>
        /// Crawls the news listing that starts at <c>pageInfo.FirstPageUrl</c>, loads every linked
        /// detail page, and passes the posts that have content to <c>Post2Site_Alo</c>, one batch
        /// per listing page. Posts are tagged with site id "baohiemxahoi.gov.vn".
        /// </summary>
        /// <remarks>
        /// Best-effort crawl: an exception raised while processing a listing page or a single item is
        /// written to the console and the crawl moves on to the next item.
        /// </remarks>
        internal void start_bhxhgovvn()
        {
            // Category id stamped on every post and passed to Post2Site_Alo.
            // NOTE(review): originally annotated as "social/health/unemployment insurance contributions";
            // the same annotation appears next to different ids elsewhere in this file - confirm.
            const string categoryId = "7";

            pageInfo.MaxPage = GetMaxPage(pageInfo.FirstPageUrl);
            for (int i = 0; i < pageInfo.MaxPage; i++)
            {
                try
                {
                    // The first page has its own URL; later pages are built from the PageURL template with page number i + 1.
                    string webUrl = (i == 0)
                        ? pageInfo.FirstPageUrl
                        : String.Format(pageInfo.PageURL, i + 1);
                    docPage = web.Load(webUrl);

                    // Fresh batch for this listing page.
                    pageInfo.PostList = new List<postInfo>();

                    // SelectNodes returns null (not an empty collection) when nothing matches.
                    HtmlNodeCollection nodes = docPage.DocumentNode.SelectNodes("//div[contains(@class, 'list-tin') and not(contains(@class, 'tintuc-mobile')) ]");
                    if (nodes != null)
                    {
                        foreach (HtmlNode node in nodes)
                        {
                            try
                            {
                                postInfo post = BuildBhxhGovVnPost(node, webUrl, categoryId);
                                if (post != null)
                                {
                                    pageInfo.PostList.Add(post);
                                }
                            }
                            catch (Exception ex)
                            {
                                // A single broken item must not discard the posts already collected for this page.
                                Console.WriteLine(ex.ToString());
                            }
                        }
                    }

                    Console.WriteLine(String.Format("{0} - Page {1} - {2}", DateTime.Now.ToString("dd/MM/yyyy HH:mm:ss"), i + 1, pageInfo.PageURL));

                    if (pageInfo.PostList.Count > 0)
                    {
                        Post2Site_Alo(pageInfo.PostList, categoryId);
                    }
                }
                catch (Exception ex)
                {
                    Console.WriteLine(ex.ToString());
                }
            }
        }

        // Builds the post for one listing item. Returns null when the item has no link with an href,
        // no title paragraph, or when the detail page has no non-empty content block.
        private postInfo BuildBhxhGovVnPost(HtmlNode item, string listingUrl, string categoryId)
        {
            // SelectSingleNode returns null when nothing matches, so every lookup is checked before use.
            HtmlNode link = item.SelectSingleNode(".//a");
            if (link == null || link.Attributes["href"] == null)
            {
                return null;
            }

            HtmlNode titleNode = item.SelectSingleNode(".//p[contains(@class, 'tieude')]");
            if (titleNode == null)
            {
                return null;
            }

            postInfo post = new postInfo();
            post.Title      = titleNode.InnerText.Trim();
            // NOTE(review): the href is appended to the full listing-page URL, which presumably relies on
            // the site emitting query-only hrefs ("?ItemID=...") - confirm against the live markup.
            post.Url        = listingUrl + link.Attributes["href"].Value;
            post.SiteId     = "baohiemxahoi.gov.vn";
            post.CategoryId = categoryId;

            // Load the detail page; the document is kept in the docPageDetail field as before.
            docPageDetail = web.Load(post.Url);

            // The post id is the ItemID query parameter; "0" is kept as the placeholder when it is absent
            // (the indexer returns null for a missing key).
            string postId = "0";
            var queryDictionary = System.Web.HttpUtility.ParseQueryString(new System.Uri(post.Url).Query);
            if (queryDictionary["ItemID"] != null)
            {
                postId = queryDictionary["ItemID"];
            }
            post.PostId  = postId;
            post.Content = "";

            HtmlNode contentNode = docPageDetail.DocumentNode.SelectSingleNode(".//div[contains(@class, 'tinchitiet')]/div[2]/div[2]");
            if (contentNode != null)
            {
                post.Content = contentNode.InnerHtml;
            }

            // Posts without content are not forwarded.
            return post.Content != "" ? post : null;
        }
예제 #5
0
파일: crawler.cs 프로젝트: hiepdh/crawler
        //bhxhhn.com.vn
        /// <summary>
        /// Crawls the question listing that starts at <c>pageInfo.FirstPageUrl</c>, loads each
        /// question's detail page (question text and reply) and passes the posts to
        /// <c>Post2Site_Alo</c>, one batch per listing page. Posts are tagged with site id
        /// "bhxhhn.com.vn".
        /// </summary>
        /// <remarks>
        /// The source category is read from the <c>pkcm</c> query parameter of
        /// <c>pageInfo.FirstPageUrl</c> and mapped to a target category id; the crawl stops with a
        /// console message when that parameter is absent. Otherwise the crawl is best-effort: an
        /// exception raised while processing a listing page or a single row is written to the
        /// console and the crawl moves on to the next item.
        /// </remarks>
        public void start_bhhn()
        {
            pageInfo.MaxPage = GetMaxPage(pageInfo.FirstPageUrl);
            for (int i = 0; i < pageInfo.MaxPage; i++)
            {
                try
                {
                    // The first page has its own URL; later pages are built from the PageURL template with page number i + 1.
                    docPage = (i == 0)
                        ? web.Load(pageInfo.FirstPageUrl)
                        : web.Load(String.Format(pageInfo.PageURL, i + 1));

                    var firstPageQuery = System.Web.HttpUtility.ParseQueryString(new System.Uri(pageInfo.FirstPageUrl).Query);

                    // The indexer returns null for a missing key; without pkcm no page can be categorised,
                    // so stop instead of failing on every page.
                    string categoryId = firstPageQuery["pkcm"];
                    if (categoryId == null)
                    {
                        Console.WriteLine("FirstPageUrl has no 'pkcm' query parameter; cannot determine the category.");
                        return;
                    }

                    // Source categories, as listed in the site's <option> elements:
                    //   1  Health insurance              7  Sickness and maternity
                    //   2  Social insurance              8  Work accidents, occupational diseases
                    //   3  Unemployment insurance        9  Social insurance book
                    //   4  Other questions              10  Health insurance card
                    //   5  Retirement                   11  Medical treatment under health insurance
                    //   6  Survivorship                 12  One-time social insurance payment
                    // Values outside 1-12 are passed through unchanged.
                    switch (categoryId)
                    {
                    case "1":
                        categoryId = "12";
                        break;

                    case "2":
                        categoryId = "19";
                        break;

                    case "3":
                        categoryId = "16";
                        break;

                    case "4":
                        categoryId = "6";
                        break;

                    case "5":
                    case "6":
                        categoryId = "10";
                        break;

                    case "7":
                        categoryId = "8";
                        break;

                    case "8":
                        categoryId = "9";
                        break;

                    case "9":
                    case "10":
                    case "11":
                        categoryId = "15";
                        break;

                    case "12":
                        categoryId = "14";
                        break;
                    }

                    // Fresh batch for this listing page.
                    pageInfo.PostList = new List<postInfo>();

                    // SelectNodes returns null (not an empty collection) when nothing matches.
                    HtmlNodeCollection nodes = docPage.DocumentNode.SelectNodes("//table[@id='dnn_ctr1675_FE_View_All_view_gvHoiDap']//tr");
                    if (nodes != null)
                    {
                        foreach (HtmlNode node in nodes)
                        {
                            try
                            {
                                postInfo post = BuildBhhnPost(node, categoryId);
                                if (post != null)
                                {
                                    pageInfo.PostList.Add(post);
                                }
                            }
                            catch (Exception ex)
                            {
                                // A single broken row must not discard the posts already collected for this page.
                                Console.WriteLine(ex.ToString());
                            }
                        }
                    }

                    Console.WriteLine(String.Format("{0} - Page {1} - {2}", DateTime.Now.ToString("dd/MM/yyyy HH:mm:ss"), i + 1, pageInfo.PageURL));

                    // Same guard as the other Post2Site_Alo crawlers in this file: nothing to send for an empty page.
                    if (pageInfo.PostList.Count > 0)
                    {
                        Post2Site_Alo(pageInfo.PostList, categoryId);
                    }
                }
                catch (Exception ex)
                {
                    Console.WriteLine(ex.ToString());
                }
            }
        }

        // Builds the post for one table row. Returns null when the row's second cell has no link
        // with an href, or when the link carries no hdId query parameter.
        private postInfo BuildBhhnPost(HtmlNode row, string categoryId)
        {
            // SelectSingleNode returns null when nothing matches (e.g. header rows without a link).
            HtmlNode link = row.SelectSingleNode(".//td[2]//a");
            if (link == null || link.Attributes["href"] == null)
            {
                return null;
            }

            postInfo post = new postInfo();
            post.Title      = link.InnerText.Trim();
            post.Url        = link.Attributes["href"].Value;
            post.SiteId     = "bhxhhn.com.vn";
            post.CategoryId = categoryId;

            // The post id is the hdId query parameter of the detail URL.
            var detailQuery = System.Web.HttpUtility.ParseQueryString(new System.Uri(post.Url).Query);
            if (detailQuery["hdId"] == null)
            {
                Console.WriteLine("Skipping post without 'hdId' query parameter: " + post.Url);
                return null;
            }
            post.PostId = detailQuery["hdId"];

            // Load the detail page; the document is kept in the docPageDetail field as before.
            docPageDetail = web.Load(post.Url);

            HtmlNode questionNode = docPageDetail.DocumentNode.SelectSingleNode("//div[@id='dnn_ContentPane']//p[@class='noidungcauhoi']");
            if (questionNode != null)
            {
                post.Content = questionNode.InnerText;
            }

            // A page without a reply block used to abort the whole listing page; keep the post with an empty reply.
            HtmlNode replyNode = docPageDetail.DocumentNode.SelectSingleNode("//div[@id='dnn_ContentPane']//div[@class='contenttraloi']");
            post.ContentReply = replyNode != null ? replyNode.InnerHtml : "";

            return post;
        }