Esempio n. 1
0
        /// <summary>
        /// Crawls the paged listing at <c>SiteUrl</c>: for every data row of the table
        /// <c>ctl00_Main_Gv_FinishCheck</c> it downloads the row's detail page and, when that page
        /// contains a <c>div</c> with class <c>data_con</c>, adds one <c>ProjectFinish</c> record.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as the result list holds <c>MaxCount</c> records.
        /// </param>
        /// <returns>
        /// The records collected so far. The list is empty when the first page cannot be downloaded.
        /// Pages, rows and detail pages that fail to download or parse are skipped.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list            = new ArrayList();
            string htl             = string.Empty;
            string cookiestr       = string.Empty;
            string viewState       = string.Empty;
            string eventValidation = string.Empty;
            int    page            = 1;

            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8, ref cookiestr);
            }
            catch (Exception)
            {
                // Without the first page there is nothing to crawl.
                return list;
            }

            Parser   parser   = new Parser(new Lexer(htl));
            NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "ctl00_Main_paging_LblPageCount")));

            if (pageList != null && pageList.Count > 0)
            {
                // Best effort: keep the default of one page when the span text is not a number.
                int parsedPageCount;
                if (int.TryParse(pageList.AsString(), out parsedPageCount))
                {
                    page = parsedPageCount;
                }
            }

            for (int i = 1; i <= page; i++)
            {
                if (i > 1)
                {
                    if (i == 2)
                    {
                        // The hidden ASP.NET fields are read once, from the first page's HTML,
                        // and reused for every later postback.
                        viewState       = this.ToolWebSite.GetAspNetViewState(htl);
                        eventValidation = this.ToolWebSite.GetAspNetEventValidation(htl);
                    }

                    NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                        new string[]
                        {
                            "ctl00$ScriptManager1",
                            "__EVENTTARGET",
                            "__EVENTARGUMENT",
                            "ctl00$Main$ddl_type",
                            "ctl00$Main$txt_Title",
                            "ctl00$Main$paging$txtPageIndex",
                            "__VIEWSTATE",
                            "__EVENTVALIDATION",
                            "__VIEWSTATEENCRYPTED",
                            "__ASYNCPOST",
                            "ctl00$Main$paging$btnForward.x",
                            "ctl00$Main$paging$btnForward.y"
                        },
                        new string[]
                        {
                            "ctl00$UpdatePanel1|ctl00$Main$paging$btnForward",
                            string.Empty,
                            string.Empty,
                            "1",
                            string.Empty,
                            i.ToString(),
                            viewState,
                            eventValidation,
                            string.Empty,
                            "true",
                            "10",
                            "11"
                        });

                    try
                    {
                        htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.UTF8);
                    }
                    catch (Exception)
                    {
                        // Skip a page that cannot be fetched and move on to the next one.
                        continue;
                    }
                }

                parser = new Parser(new Lexer(htl));
                NodeList dtList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_Main_Gv_FinishCheck")));
                if (dtList == null || dtList.Count == 0)
                {
                    continue;
                }

                TableTag table = dtList[0] as TableTag;
                if (table == null)
                {
                    continue;
                }

                // Row 0 is skipped (presumably the header row).
                for (int j = 1; j < table.RowCount; j++)
                {
                    TableRow tr = table.Rows[j];

                    // Columns 1, 3 and 4 are read below; skip rows that are too short.
                    if (tr.ColumnCount < 5)
                    {
                        continue;
                    }

                    string dblClick = tr.GetAttribute("ondblclick");
                    if (string.IsNullOrEmpty(dblClick))
                    {
                        // No handler means no detail link for this row.
                        continue;
                    }

                    string relativeUrl = dblClick.Replace("&amp;", "&").Replace(")", "kdxx").GetRegexBegEnd("&#39;", "kdxx");
                    if (relativeUrl == null)
                    {
                        // GetRegexBegEnd is defined elsewhere; guard in case it yields null.
                        continue;
                    }

                    string PrjName  = tr.Columns[3].ToPlainTextString().Trim();
                    string pPrjCode = tr.Columns[1].ToPlainTextString().Trim();
                    string pEndDate = tr.Columns[4].ToPlainTextString().Trim();
                    string pUrl     = "http://www.szbajs.gov.cn/SiteManage/" + relativeUrl.Replace("&#39;", "");

                    string htmldetail;
                    try
                    {
                        htmldetail = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(pUrl), Encoding.UTF8);
                    }
                    catch (Exception)
                    {
                        continue;
                    }

                    Parser   parserdetail = new Parser(new Lexer(htmldetail));
                    NodeList dtnode       = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "data_con")));
                    if (dtnode == null || dtnode.Count == 0)
                    {
                        continue;
                    }

                    string ctx         = dtnode.AsString().Replace(" ", "");
                    string pInfoSource = ctx;

                    // The address keeps any "/" characters; the unit names have them removed.
                    string pPrjAddress = ExtractLabeledValue(ctx, "(建设地点|工程地址)", false, "工程地址", "建设地点");
                    string pDesignUnit = ExtractLabeledValue(ctx, "设计单位", true, "设计单位");
                    string pSuperUnit  = ExtractLabeledValue(ctx, "监理单位", true, "监理单位");
                    string pConstUnit  = ExtractLabeledValue(ctx, "施工单位", true, "施工单位");
                    string pBuildUnit  = ExtractLabeledValue(ctx, "建设单位", true, "建设单位");
                    string pRecordUnit = ExtractLabeledValue(ctx, "备案机关", true, "备案机关");
                    string pLicUnit    = ExtractLabeledValue(ctx, "发证机关", true, "发证机关");
                    if (string.IsNullOrEmpty(pLicUnit))
                    {
                        pLicUnit = "深圳市宝安区建设局";
                    }

                    // These two fields are never populated by this crawler.
                    string prjEndDesc  = string.Empty;
                    string pCreatetime = string.Empty;

                    ProjectFinish info = ToolDb.GenProjectFinish("广东省", pUrl, "深圳市宝安区", pInfoSource, pEndDate, pConstUnit, pSuperUnit, pDesignUnit, prjEndDesc, pPrjAddress, pBuildUnit, pPrjCode, PrjName, pRecordUnit, pCreatetime, "深圳市宝安区建设局", pLicUnit);
                    list.Add(info);
                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return list;
                    }
                }
            }

            return list;
        }

        /// <summary>
        /// Finds the first "label + colon + rest of line" match in <paramref name="text"/> and
        /// returns it with the label text, the colons and (optionally) slashes removed, trimmed.
        /// </summary>
        /// <param name="text">Text to search; a match must end with a CR LF pair.</param>
        /// <param name="labelPattern">Regex fragment matching the label that precedes the colon.</param>
        /// <param name="stripSlashes">When <c>true</c>, every "/" is removed from the matched value.</param>
        /// <param name="labels">Literal label texts to strip from the matched value, in order.</param>
        /// <returns>The cleaned value, or an empty string when nothing matches.</returns>
        private static string ExtractLabeledValue(string text, string labelPattern, bool stripSlashes, params string[] labels)
        {
            // NOTE(review): both alternatives of the separator group are the same character as seen
            // here; presumably one was meant to be a full-width colon - confirm against the site.
            string value = Regex.Match(text, labelPattern + @"(:|:)[^\r\n]+\r\n").Value;

            foreach (string label in labels)
            {
                value = value.Replace(label, "");
            }

            if (stripSlashes)
            {
                value = value.Replace("/", "");
            }

            return value.Replace(":", "").Replace(":", "").Trim();
        }
Esempio n. 2
0
        /// <summary>
        /// Crawls the paged listing at <c>SiteUrl</c> and adds one <c>ProjectFinish</c> record per
        /// data row of the table <c>tblPrjConstBid</c>. No detail pages are visited; only the
        /// project code, name, build unit and end date are taken from the row.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as the result list holds <c>MaxCount</c> records.
        /// </param>
        /// <returns>
        /// The records collected so far. The list is empty when the first page cannot be downloaded.
        /// </returns>
        /// <remarks>
        /// The calling thread is blocked for ten minutes after every 199 records (the counter
        /// starts at 1 and the pause triggers when it reaches 200).
        /// </remarks>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            const int pauseThreshold    = 200;
            const int pauseMilliseconds = 600 * 1000;

            IList  list    = new List<ProjectFinish>();
            string htl     = string.Empty;
            int    pageInt = 1;
            int    count   = 1;

            try
            {
                htl = ToolHtml.GetHtmlByUrlEncode(SiteUrl, Encoding.UTF8);
            }
            catch (Exception)
            {
                // Without the first page there is nothing to crawl.
                return list;
            }

            Parser   parser  = new Parser(new Lexer(htl));
            NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "pageLinkTd")));

            if (tdNodes != null && tdNodes.Count > 0)
            {
                try
                {
                    string temp = tdNodes.AsString().ToNodeString();
                    string s    = temp.GetRegexBegEnd("总页数", "页").Replace(":", "");

                    int parsedPageCount;
                    if (int.TryParse(s, out parsedPageCount))
                    {
                        pageInt = parsedPageCount;
                    }
                }
                catch (Exception)
                {
                    // Best effort: the text helpers are defined elsewhere and may fail on
                    // unexpected markup; fall back to crawling a single page.
                }
            }

            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                        new string[] { "page", "qymc", "ann_serial", "pro_name" },
                        new string[] { i.ToString(), "", "", "" });
                    try
                    {
                        htl = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8);
                    }
                    catch (Exception)
                    {
                        // Skip a page that cannot be fetched and move on to the next one.
                        continue;
                    }
                }

                parser = new Parser(new Lexer(htl));
                NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "tblPrjConstBid")));
                if (listNode == null || listNode.Count == 0)
                {
                    continue;
                }

                TableTag table = listNode[0] as TableTag;
                if (table == null)
                {
                    continue;
                }

                // The first and the last row are skipped (presumably header and pager rows).
                for (int j = 1; j < table.RowCount - 1; j++)
                {
                    TableRow tr = table.Rows[j];

                    // Columns 0 to 3 are read below; skip rows that are too short.
                    if (tr.ColumnCount < 4)
                    {
                        continue;
                    }

                    string pPrjCode   = tr.Columns[0].ToNodePlainString();
                    string PrjName    = tr.Columns[1].ToNodePlainString();
                    string pBuildUnit = tr.Columns[2].ToNodePlainString();
                    string pEndDate   = tr.Columns[3].ToNodePlainString().GetDateRegex();

                    // Nothing in the row supplies a record unit, so the fixed name is always used.
                    string pRecordUnit = "深圳市住房和建设局";

                    // The remaining fields are never populated by this crawler.
                    string pUrl        = string.Empty;
                    string pInfoSource = string.Empty;
                    string pConstUnit  = string.Empty;
                    string pSuperUnit  = string.Empty;
                    string pDesignUnit = string.Empty;
                    string prjEndDesc  = string.Empty;
                    string pPrjAddress = string.Empty;
                    string pCreatetime = string.Empty;
                    string pLicUnit    = string.Empty;

                    ProjectFinish info = ToolDb.GenProjectFinish("广东省", pUrl, "深圳市区", pInfoSource, pEndDate, pConstUnit, pSuperUnit, pDesignUnit, prjEndDesc, pPrjAddress, pBuildUnit, pPrjCode, PrjName, pRecordUnit, pCreatetime, "深圳市住房和建设局", pLicUnit);
                    list.Add(info);
                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return list;
                    }

                    count++;
                    if (count >= pauseThreshold)
                    {
                        count = 1;
                        Thread.Sleep(pauseMilliseconds);
                    }
                }
            }

            return list;
        }
Esempio n. 3
0
        /// <summary>
        /// Crawls the paged listing at <c>SiteUrl</c>: for every data row of the last
        /// <c>table.table1</c> on a page it follows the link in column 2 to the detail page and,
        /// when that page contains a <c>table.table1</c>, adds one <c>ProjectFinish</c> record.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as the result list holds <c>MaxCount</c> records.
        /// </param>
        /// <returns>
        /// The records collected so far. The list is empty when the first page cannot be downloaded.
        /// Pages, rows and detail pages that fail to download or parse are skipped.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list = new List<ProjectFinish>();
            string htl  = string.Empty;
            int    page = 1;

            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
            }
            catch (Exception)
            {
                // Without the first page there is nothing to crawl.
                return list;
            }

            Parser   parser   = new Parser(new Lexer(htl));
            NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagefooter")));

            if (pageList != null && pageList.Count > 0)
            {
                try
                {
                    string temp = pageList.AsString().GetRegexBegEnd(",共有", "页");

                    int parsedPageCount;
                    if (int.TryParse(temp, out parsedPageCount))
                    {
                        page = parsedPageCount;
                    }
                }
                catch (Exception)
                {
                    // Best effort: GetRegexBegEnd is defined elsewhere and may fail on
                    // unexpected markup; fall back to crawling a single page.
                }
            }

            for (int i = 1; i <= page; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "&web_cur_page=" + i, Encoding.UTF8);
                    }
                    catch (Exception)
                    {
                        // Skip a page that cannot be fetched and move on to the next one.
                        continue;
                    }
                }

                parser = new Parser(new Lexer(htl));
                NodeList dtList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "table1")));
                if (dtList == null || dtList.Count == 0)
                {
                    continue;
                }

                // The listing is taken from the last matching table on the page.
                TableTag table = dtList[dtList.Count - 1] as TableTag;
                if (table == null)
                {
                    continue;
                }

                // Row 0 is skipped (presumably the header row).
                for (int j = 1; j < table.RowCount; j++)
                {
                    TableRow tr = table.Rows[j];

                    // Columns 1 to 4 are read below; skip rows that are too short.
                    if (tr.ColumnCount < 5)
                    {
                        continue;
                    }

                    // The detail link lives in column 2; a row without one cannot be followed.
                    NodeList links = tr.Columns[2].SearchFor(typeof(ATag), true);
                    if (links == null || links.Count == 0)
                    {
                        continue;
                    }

                    ATag aTag = links[0] as ATag;
                    if (aTag == null || aTag.Link == null)
                    {
                        continue;
                    }

                    string PrjName    = tr.Columns[3].ToPlainTextString().Trim();
                    string pPrjCode   = tr.Columns[2].ToPlainTextString().Trim();
                    string pEndDate   = tr.Columns[1].ToPlainTextString().Trim();
                    string pBuildUnit = tr.Columns[4].ToPlainTextString().Trim();

                    // Strip the GoDetail('...') wrapper so that only the path inside it remains.
                    string pUrl = "http://www.cb.gov.cn" + aTag.Link.Replace("GoDetail('", "").Replace("');", "");

                    string htmldetail;
                    try
                    {
                        htmldetail = this.ToolWebSite.GetHtmlByUrl(pUrl, Encoding.UTF8).GetJsString();
                    }
                    catch (Exception)
                    {
                        continue;
                    }

                    Parser   parserdetail = new Parser(new Lexer(htmldetail));
                    NodeList dtnode       = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "table1")));
                    if (dtnode == null || dtnode.Count == 0)
                    {
                        continue;
                    }

                    string pInfoSource = dtnode.AsHtml().ToCtxString();
                    string pPrjAddress = pInfoSource.GetRegex("建设地点,工程地址");
                    string pDesignUnit = pInfoSource.GetRegex("设计单位");
                    string pSuperUnit  = pInfoSource.GetRegex("监理单位");
                    string pConstUnit  = pInfoSource.GetRegex("施工单位");
                    string pRecordUnit = pInfoSource.GetRegex("备案机关");
                    string pLicUnit    = pInfoSource.GetRegex("发证机关");
                    if (string.IsNullOrEmpty(pLicUnit))
                    {
                        pLicUnit = "深圳市龙岗区住房和建设局";
                    }

                    // These two fields are never populated by this crawler.
                    string prjEndDesc  = string.Empty;
                    string pCreatetime = string.Empty;

                    ProjectFinish info = ToolDb.GenProjectFinish("广东省", pUrl, "深圳市龙岗区", pInfoSource, pEndDate, pConstUnit, pSuperUnit, pDesignUnit, prjEndDesc, pPrjAddress, pBuildUnit, pPrjCode, PrjName, pRecordUnit, pCreatetime, "深圳市龙岗区住房和建设局", pLicUnit);
                    list.Add(info);
                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return list;
                    }
                }
            }

            return list;
        }
Esempio n. 4
0
        /// <summary>
        /// Crawls a JSON listing: the first page comes from <c>SiteUrl</c>, later pages from the
        /// <c>build.ashx</c> endpoint. For every entry of the <c>DataList</c> array it downloads the
        /// detail page and, when that page contains a <c>table</c> with class
        /// <c>js-table mar-l-4</c>, adds one <c>ProjectFinish</c> record.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as the result list holds <c>MaxCount</c> records.
        /// </param>
        /// <returns>
        /// The records collected so far. The list is empty when the first page cannot be downloaded.
        /// Pages that are not valid JSON and entries that fail to download or parse are skipped.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            // Must agree with the pageSize=20 query parameter of the paging URL below.
            const int pageSize = 20;

            IList  list      = new ArrayList();
            string htl       = string.Empty;
            string cookiestr = string.Empty;
            int    page      = 1;

            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
            }
            catch (Exception)
            {
                // Without the first page there is nothing to crawl.
                return list;
            }

            int index = string.IsNullOrEmpty(htl) ? -1 : htl.IndexOf("RowCount", StringComparison.Ordinal);
            if (index >= 0)
            {
                // Everything after "RowCount" is treated as the number once the JSON punctuation
                // is stripped; this presumably relies on RowCount being the last property.
                string pageStr = htl.Substring(index).Replace("RowCount", "").Replace("}", "").Replace(":", "").Replace("\"", "");

                decimal rowCount;
                if (decimal.TryParse(pageStr, out rowCount))
                {
                    // Round up so that a partially filled last page is still requested.
                    // Convert.ToInt32(decimal) rounds to nearest and would over-count here.
                    decimal pages = Math.Ceiling(rowCount / pageSize);
                    if (pages >= int.MinValue && pages <= int.MaxValue)
                    {
                        page = (int)pages;
                    }
                }
            }

            JavaScriptSerializer serializer = new JavaScriptSerializer();
            for (int i = 1; i <= page; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        htl = this.ToolWebSite.GetHtmlByUrl("http://www.szjs.gov.cn/build/build.ashx?_=1352585430077&menu=%E9%A1%B9%E7%9B%AE%E4%BF%A1%E6%81%AF&type=%E7%AB%A3%E5%B7%A5%E9%AA%8C%E6%94%B6%E5%A4%87%E6%A1%88&pageSize=20&pageIndex=" + i.ToString(), Encoding.UTF8);
                    }
                    catch (Exception)
                    {
                        // Skip a page that cannot be fetched and move on to the next one.
                        continue;
                    }
                }

                Dictionary<string, object> pageJson;
                try
                {
                    pageJson = serializer.DeserializeObject(htl) as Dictionary<string, object>;
                }
                catch (Exception)
                {
                    // A response that is not valid JSON must not discard the records gathered so far.
                    continue;
                }

                object dataList;
                if (pageJson == null || !pageJson.TryGetValue("DataList", out dataList))
                {
                    continue;
                }

                object[] entries = dataList as object[];
                if (entries == null)
                {
                    continue;
                }

                foreach (object entry in entries)
                {
                    Dictionary<string, object> dicSmsType = entry as Dictionary<string, object>;
                    if (dicSmsType == null)
                    {
                        continue;
                    }

                    try
                    {
                        string pPrjCode   = Convert.ToString(dicSmsType["LogSerial"]);
                        string PrjName    = Convert.ToString(dicSmsType["PrjLogName"]);
                        string pBuildUnit = Convert.ToString(dicSmsType["ConstName"]);
                        string pEndDate   = Convert.ToString(dicSmsType["LogDate"]);
                        string pUrl       = "http://www.szjs.gov.cn/build/jgys_detail.aspx?id=" + pPrjCode;

                        string htmldetail;
                        try
                        {
                            htmldetail = this.ToolWebSite.GetHtmlByUrl(pUrl, Encoding.UTF8, ref cookiestr).Trim();
                        }
                        catch (Exception)
                        {
                            continue;
                        }

                        Parser   parser = new Parser(new Lexer(htmldetail));
                        NodeList dtList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "js-table mar-l-4")));
                        if (dtList == null || dtList.Count == 0)
                        {
                            continue;
                        }

                        TableTag table = dtList[0] as TableTag;
                        if (table == null)
                        {
                            continue;
                        }

                        // Flatten the table to one CR LF terminated line per row; the cell texts
                        // of a row are joined without a separator and have line breaks removed.
                        StringBuilder infoBuilder = new StringBuilder();
                        for (int j = 0; j < table.RowCount; j++)
                        {
                            TableRow dr = table.Rows[j];
                            for (int k = 0; k < dr.ColumnCount; k++)
                            {
                                infoBuilder.Append(dr.Columns[k].ToPlainTextString().Trim().Replace("\r", "").Replace("\n", ""));
                            }
                            infoBuilder.Append("\r\n");
                        }
                        string pInfoSource = infoBuilder.ToString();

                        // The address keeps any "/" characters; the unit names have them removed.
                        string pPrjAddress = ExtractLabeledValue(pInfoSource, "(建设地点|工程地址)", false, "工程地址", "建设地点");
                        string pDesignUnit = ExtractLabeledValue(pInfoSource, "设计单位", true, "设计单位");
                        string pSuperUnit  = ExtractLabeledValue(pInfoSource, "监理单位", true, "监理单位");
                        string pConstUnit  = ExtractLabeledValue(pInfoSource, "施工单位", true, "施工单位");
                        string pRecordUnit = ExtractLabeledValue(pInfoSource, "备案机关", true, "备案机关");
                        string pLicUnit    = ExtractLabeledValue(pInfoSource, "发证机关", true, "发证机关");
                        if (string.IsNullOrEmpty(pLicUnit))
                        {
                            pLicUnit = "深圳市住房和建设局";
                        }

                        // These two fields are never populated by this crawler.
                        string prjEndDesc  = string.Empty;
                        string pCreatetime = string.Empty;

                        ProjectFinish info = ToolDb.GenProjectFinish("广东省", pUrl, "深圳市区", pInfoSource, pEndDate, pConstUnit, pSuperUnit, pDesignUnit, prjEndDesc, pPrjAddress, pBuildUnit, pPrjCode, PrjName, pRecordUnit, pCreatetime, "深圳市住房和建设局", pLicUnit);
                        list.Add(info);
                        if (!crawlAll && list.Count >= this.MaxCount)
                        {
                            return list;
                        }
                    }
                    catch (Exception)
                    {
                        // Best effort: an entry with missing keys or unparsable markup is skipped.
                        continue;
                    }
                }
            }

            return list;
        }

        /// <summary>
        /// Finds the first "label + colon + rest of line" match in <paramref name="text"/> and
        /// returns it with the label text, the colons and (optionally) slashes removed, trimmed.
        /// </summary>
        /// <param name="text">Text to search; a match must end with a CR LF pair.</param>
        /// <param name="labelPattern">Regex fragment matching the label that precedes the colon.</param>
        /// <param name="stripSlashes">When <c>true</c>, every "/" is removed from the matched value.</param>
        /// <param name="labels">Literal label texts to strip from the matched value, in order.</param>
        /// <returns>The cleaned value, or an empty string when nothing matches.</returns>
        private static string ExtractLabeledValue(string text, string labelPattern, bool stripSlashes, params string[] labels)
        {
            // NOTE(review): both alternatives of the separator group are the same character as seen
            // here; presumably one was meant to be a full-width colon - confirm against the site.
            string value = Regex.Match(text, labelPattern + @"(:|:)[^\r\n]+\r\n").Value;

            foreach (string label in labels)
            {
                value = value.Replace(label, "");
            }

            if (stripSlashes)
            {
                value = value.Replace("/", "");
            }

            return value.Replace(":", "").Replace(":", "").Trim();
        }