Exemple #1
0
        /// <summary>
        /// Crawls the invitation grid published at <c>SiteUrl</c>. Every data row is turned into an
        /// <c>InviteInfo</c> (listing columns plus the text of its detail page) and handed to
        /// <c>ToolDb.SaveEntity</c>; when that call returns true, attachment links found on the
        /// detail page are stored as well.
        /// </summary>
        /// <param name="crawlAll">When false, processing stops as soon as <c>MaxCount</c> rows have been handled.</param>
        /// <returns>
        /// The result list (this method never adds to it) when the crawl finishes or the first page
        /// cannot be downloaded; <c>null</c> when the <c>MaxCount</c> limit ends the crawl early.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list     = new ArrayList();
            int    sqlCount = 0;
            int    pageInt  = 1;
            string html     = string.Empty;

            try
            {
                html = ToolHtml.GetHtmlByUrlEncode(SiteUrl, Encoding.UTF8);
            }
            catch (Exception ex)
            {
                Logger.Error(ex.ToString());
                return(list);
            }

            // Determine the page count from the pager table (the one with cellspacing="2").
            Parser   parser = new Parser(new Lexer(html));
            NodeList sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("cellspacing", "2"), new TagNameFilter("table")));
            if (sNode != null && sNode.Count > 0)
            {
                // The pager text contains ",共N页,"; strip the wrapper and keep N. On any parse
                // failure the default of a single page stays in effect.
                Match  pageMatch = new Regex(@",共[^页]+页,").Match(sNode.AsString());
                string pageText  = pageMatch.Value.Replace(",共", "").Replace("页,", "").Trim();
                int    parsedPages;
                if (int.TryParse(pageText, out parsedPages))
                {
                    pageInt = parsedPages;
                }
            }

            // Row-independent patterns are built once instead of once per row.
            Regex regPrjAdd  = new Regex(@"(工程地点|工程地址):[^\r\n]+[\r\n]{1}");
            Regex regInvType = new Regex(@"[^\r\n]+[\r\n]{1}");
            // When one of these markers occurs in the announcement text, only the part in front of it is kept.
            Dictionary <string, Regex> dicRegex = new Dictionary <string, Regex>();
            dicRegex.Add("重要提示", new Regex(@"([.\S\s]*)(?=重要提示)"));
            dicRegex.Add("温馨提示", new Regex(@"([.\S\s]*)(?=温馨提示)"));

            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    // Later pages are requested by replaying the GridView postback with the hidden
                    // ASP.NET fields taken from the page downloaded in the previous iteration.
                    string viewState       = this.ToolWebSite.GetAspNetViewState(html);
                    string eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
                    NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "__VIEWSTATEENCRYPTED", "__EVENTVALIDATION", "ctl00$hdnPageCount" }, new string[] { "ctl00$Content$GridView1", "Page$" + i.ToString(), viewState, "", eventValidation, pageInt.ToString() });
                    html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.UTF8);
                }

                parser = new Parser(new Lexer(html));
                NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("id", "ctl00_Content_GridView1"), new TagNameFilter("table")));
                if (nodeList == null || nodeList.Count <= 0)
                {
                    continue;
                }
                TableTag table = nodeList[0] as TableTag;
                if (table == null)
                {
                    continue;
                }

                // The first and the last row are not data rows (presumably header and pager — confirm against the site).
                for (int j = 1; j < table.RowCount - 1; j++)
                {
                    TableRow tr = table.Rows[j] as TableRow;
                    string code      = tr.Columns[1].ToPlainTextString().Trim();
                    string prjName   = tr.Columns[2].ToPlainTextString().Trim();
                    string buildUnit = tr.Columns[3].ToPlainTextString().Trim();
                    string beginDate = tr.Columns[5].ToPlainTextString().Trim();
                    string endDate   = tr.Columns[6].ToPlainTextString().Trim();
                    string remark    = string.Empty;

                    // The detail link is expected as the first child of the project-name cell.
                    // A row without one cannot be followed, so it is skipped instead of crashing the crawl.
                    NodeList nameCellChildren = tr.Columns[2].Children;
                    ATag aTag = (nameCellChildren != null && nameCellChildren.Count > 0) ? nameCellChildren[0] as ATag : null;
                    if (aTag == null)
                    {
                        continue;
                    }
                    string InfoUrl = "http://www.szjsjy.com.cn/BusinessInfo/" + aTag.Link;

                    string HtmlTxt    = string.Empty;
                    string htmldetail = string.Empty;
                    try
                    {
                        // One download serves both views of the page: the raw markup of the content
                        // span (HtmlTxt) and a variant with line breaks turned into CR/LF for text extraction.
                        string rawDetail = ToolHtml.GetHtmlByUrlEncode(InfoUrl, Encoding.UTF8).Replace("&nbsp;", "");
                        Parser   dtlparserHTML = new Parser(new Lexer(rawDetail.Trim()));
                        NodeList dtnodeHTML    = dtlparserHTML.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("id", "lblXXNR"), new TagNameFilter("span")));
                        HtmlTxt    = dtnodeHTML.AsHtml();
                        htmldetail = rawDetail.Replace("</br>", "\r\n").Replace("<br>", "\r\n");
                    }
                    catch (Exception ex)
                    {
                        // A broken detail page only costs this row, but leave a trace of it.
                        Logger.Error(ex.ToString());
                        continue;
                    }

                    Parser   dtlparser = new Parser(new Lexer(htmldetail));
                    NodeList dtnode    = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("id", "lblXXNR"), new TagNameFilter("span")));

                    string inviteCtx  = dtnode.AsString().Replace(" ", "");
                    string prjAddress = regPrjAdd.Match(inviteCtx).Value.Replace("工程地点:", "").Replace("工程地址:", "").Trim();
                    string msgType    = "深圳市建设工程交易中心";
                    string specType   = "建设工程";
                    // The invite type is derived from the first line of the announcement text.
                    string inviteType = ToolHtml.GetInviteTypes(regInvType.Match(inviteCtx).Value);

                    #region Modified 2013-11-19
                    foreach (KeyValuePair <string, Regex> cut in dicRegex)
                    {
                        if (inviteCtx.Contains(cut.Key))
                        {
                            inviteCtx = cut.Value.Match(inviteCtx).Value;
                        }
                    }
                    #endregion

                    InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳市工程", string.Empty, string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, string.Empty, InfoUrl, HtmlTxt);
                    if (!crawlAll && sqlCount >= this.MaxCount)
                    {
                        // This path has always returned null rather than the (empty) list; kept for callers.
                        return(null);
                    }
                    sqlCount++;
                    if (!ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate, this.ExistsHtlCtx))
                    {
                        continue;
                    }

                    // Attachments: every <a> on the detail page that IsAtagAttach() accepts.
                    dtlparser.Reset();
                    NodeList dlNodes = dtlparser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                    if (dlNodes == null || dlNodes.Count <= 0)
                    {
                        continue;
                    }
                    for (int f = 0; f < dlNodes.Count; f++)
                    {
                        ATag fileTag = dlNodes[f] as ATag;
                        if (fileTag == null || !fileTag.IsAtagAttach())
                        {
                            continue;
                        }
                        try
                        {
                            BaseAttach attach = ToolHtml.GetBaseAttach(fileTag.Link.Replace("..", "http://www.szjsjy.com.cn"), fileTag.LinkText, info.Id, "SiteManage\\Files\\InviteAttach\\");
                            if (attach != null)
                            {
                                ToolDb.SaveEntity(attach, "SourceID,AttachServerPath");
                            }
                        }
                        catch (Exception ex)
                        {
                            // Attachments are best effort: log and move on to the next link.
                            Logger.Error(ex.ToString());
                        }
                    }
                }
            }
            return(list);
        }
Exemple #2
0
        /// <summary>
        /// Collects <c>ProjectLic</c> records from the paged listing at <c>SiteUrl</c>. For each data row of
        /// the <c>tblPrjConstBid</c> table the detail form is posted, the label/value cells of the returned
        /// table are flattened into "label:value" lines, and the individual fields are pulled out of that text.
        /// </summary>
        /// <param name="crawlAll">When false, the method returns as soon as the list holds <c>MaxCount</c> records.</param>
        /// <returns>The records gathered so far; an empty list when the first page cannot be downloaded.</returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list           = new List <ProjectLic>();
            int    totalPages     = 1;
            int    sinceLastPause = 0;
            string listingHtml    = string.Empty;

            try
            {
                listingHtml = ToolHtml.GetHtmlByUrlEncode(SiteUrl, Encoding.UTF8);
            }
            catch (Exception)
            {
                return(list);
            }

            // Page count comes from the pager cell; any failure leaves the single-page default.
            Parser   listingParser = new Parser(new Lexer(listingHtml));
            NodeList pagerCells    = listingParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "pageLinkTd")));
            if (pagerCells != null && pagerCells.Count > 0)
            {
                try
                {
                    string pagerText = pagerCells.AsString().ToNodeString();
                    totalPages = int.Parse(pagerText.GetRegexBegEnd("总页数", "页").Replace(":", ""));
                }
                catch (Exception) { }
            }

            for (int pageNo = 1; pageNo <= totalPages; pageNo++)
            {
                if (pageNo > 1)
                {
                    // Pages after the first are fetched by posting the page number with empty search fields.
                    string[] fieldNames  = new string[] { "page", "qymc", "ann_serial", "pro_name" };
                    string[] fieldValues = new string[] { pageNo.ToString(), "", "", "" };
                    NameValueCollection pageForm = this.ToolWebSite.GetNameValueCollection(fieldNames, fieldValues);
                    try
                    {
                        listingHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, pageForm, Encoding.UTF8);
                    }
                    catch (Exception)
                    {
                        continue;
                    }
                }

                listingParser = new Parser(new Lexer(listingHtml));
                NodeList gridNodes = listingParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "tblPrjConstBid")));
                if (gridNodes == null || gridNodes.Count <= 0)
                {
                    continue;
                }

                TableTag grid = (TableTag)gridNodes[0];
                // Row 0 and the final row are not visited.
                for (int rowNo = 1; rowNo < grid.RowCount - 1; rowNo++)
                {
                    // Fields this listing never fills in; they are passed on empty.
                    string buildManager      = string.Empty;
                    string constUnitManager  = string.Empty;
                    string superUnitManager  = string.Empty;
                    string prospUnitManager  = string.Empty;
                    string designUnitManager = string.Empty;
                    string specialPerson     = string.Empty;
                    string prjDesc           = string.Empty;
                    string createTime        = string.Empty;

                    TableRow row       = grid.Rows[rowNo];
                    string   licCode   = row.Columns[0].ToNodePlainString();
                    string   prjCode   = row.Columns[1].ToNodePlainString();
                    string   prjName   = row.Columns[2].ToNodePlainString();
                    string   buildUnit = row.Columns[3].ToNodePlainString();
                    string   licDate   = row.Columns[4].ToPlainTextString().GetDateRegex();
                    string   detailUrl = "http://portal.szjs.gov.cn:8888/gongshi/sgxkz.html";
                    NameValueCollection detailForm = this.ToolWebSite.GetNameValueCollection(new string[] { "instanceGuid", "yxtywlsh" }, new string[] { prjCode, licCode });

                    string detailHtml = string.Empty;
                    try
                    {
                        detailHtml = this.ToolWebSite.GetHtmlByUrl(detailUrl, detailForm, Encoding.UTF8);
                    }
                    catch (Exception)
                    {
                        continue;
                    }

                    Parser   detailParser = new Parser(new Lexer(detailHtml));
                    NodeList detailTables = detailParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "tblPrjConstBid")));
                    if (detailTables == null || detailTables.Count <= 0)
                    {
                        continue;
                    }

                    // Even-indexed cells are followed by ":", odd-indexed ones by a line break,
                    // producing one "label:value" line per cell pair.
                    TableTag detailTable = detailTables[0] as TableTag;
                    string   infoSource  = "";
                    for (int r = 1; r < detailTable.RowCount; r++)
                    {
                        TableRow detailRow = detailTable.Rows[r];
                        for (int c = 0; c < detailRow.ColumnCount; c++)
                        {
                            string separator = (c % 2 == 0) ? ":" : "\r\n";
                            infoSource += detailRow.Columns[c].ToNodePlainString() + separator;
                        }
                    }

                    string startDate    = infoSource.GetRegex("合同开工日期");
                    string endDate      = infoSource.GetRegex("合同竣工日期");
                    string designUnit   = infoSource.GetRegex("设计单位,建设单位 ");
                    string buildAddress = infoSource.GetRegex("工程地址,建设地址");
                    string buildScale   = infoSource.GetRegex("建筑面积,建设规模");
                    string superUnit    = infoSource.GetRegex("监理单位");
                    string constUnit    = infoSource.GetRegex("施工单位");
                    string licUnit      = infoSource.GetRegex("发证机关");
                    string prospUnit    = infoSource.GetRegex("勘察单位");
                    string prjPrice     = infoSource.GetRegex("合同价格");
                    string prjManager   = infoSource.GetRegex("项目经理,项目负责人");
                    if (string.IsNullOrEmpty(licUnit))
                    {
                        licUnit = "深圳市住房和建设局";
                    }

                    ProjectLic info = ToolDb.GenProjectLic(prjName, buildUnit, buildAddress, buildManager, buildScale, prjPrice, startDate, endDate, constUnit, constUnitManager, superUnit, superUnitManager, prospUnit, prospUnitManager, designUnit, designUnitManager, prjManager, specialPerson, licUnit, licCode, licDate, prjDesc, "广东省", "深圳市区", infoSource, detailUrl, createTime, prjCode, "深圳市住房和建设局");
                    list.Add(info);
                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return(list);
                    }

                    // Throttle: once the counter reaches 200, reset it to 1 and pause for 600 seconds.
                    sinceLastPause++;
                    if (sinceLastPause >= 200)
                    {
                        sinceLastPause = 1;
                        Thread.Sleep(600 * 1000);
                    }
                }
            }
            return(list);
        }
Exemple #3
0
        /// <summary>
        /// Reads the notice grid (<c>ctl00_cph_context_GridView1</c>) from <c>SiteUrl</c>, builds a
        /// <c>NotifyInfo</c> for every row whose detail page contains a <c>td.line</c> cell, and saves it
        /// with <c>ToolDb.SaveEntity</c>. When that call returns true, the files listed in the detail
        /// page's download grid are stored as attachments.
        /// </summary>
        /// <param name="crawlAll">When false, the method returns once <c>MaxCount</c> notices have been handled.</param>
        /// <returns>Always <c>null</c>; results are persisted directly instead of being returned.</returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            int    totalPages = 1;
            int    handled    = 0;
            string listHtml   = string.Empty;

            try
            {
                listHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
            }
            catch (Exception)
            {
                return(null);
            }

            // Page count is the text between "共" and "页" in the paging description label.
            Parser   listParser = new Parser(new Lexer(listHtml));
            NodeList pagerSpan  = listParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "ctl00_cph_context_GridViewPaingTwo1_lblGridViewPagingDesc")));
            if (pagerSpan != null && pagerSpan.Count > 0)
            {
                try
                {
                    totalPages = Convert.ToInt32(pagerSpan.AsString().GetRegexBegEnd("共", "页"));
                }
                catch
                {
                    totalPages = 1;
                }
            }

            // NOTE(review): listHtml is downloaded once above and never refreshed inside this loop, so
            // every iteration re-parses the first page's markup — confirm whether a paging postback is missing.
            for (int pageNo = 1; pageNo <= totalPages; pageNo++)
            {
                listParser = new Parser(new Lexer(listHtml));
                NodeList gridNodes = listParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_cph_context_GridView1")));
                if (gridNodes == null || gridNodes.Count <= 0)
                {
                    continue;
                }

                TableTag grid = gridNodes[0] as TableTag;
                for (int rowNo = 1; rowNo < grid.RowCount; rowNo++)
                {
                    TableRow row         = grid.Rows[rowNo];
                    string   infoType    = "办事指南";
                    string   infoSource  = string.Empty;
                    string   headName    = row.Columns[1].ToNodePlainString();
                    string   releaseTime = row.Columns[2].ToPlainTextString().GetDateRegex();
                    string   infoUrl     = "http://www.dgzb.com.cn/DGJYWEB/SiteManage/" + row.Columns[1].GetATagHref();

                    // A failed download leaves the markup empty; the row is then dropped by the
                    // content check below because no td.line cell can be found.
                    string detailHtml = string.Empty;
                    try
                    {
                        detailHtml = ToolHtml.GetHtmlByUrlEncode(infoUrl, Encoding.UTF8);
                    }
                    catch {  }

                    Parser   detailParser = new Parser(new Lexer(detailHtml));
                    NodeList bodyCells    = detailParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "line")));
                    if (bodyCells == null || bodyCells.Count <= 0)
                    {
                        continue;
                    }

                    string ctxHtml = bodyCells.AsHtml();
                    string infoCtx = bodyCells.AsString();
                    string msgType = MsgTypeCosnt.DongGuanMsgType;
                    NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoSource, msgType, infoUrl, ctxHtml, "广东省", "东莞市区", string.Empty, infoCtx, infoType);

                    if (!crawlAll && handled >= this.MaxCount)
                    {
                        return(null);
                    }
                    handled++;
                    if (!ToolDb.SaveEntity(info, this.ExistCompareFields))
                    {
                        continue;
                    }

                    // Attachments live in a separate download grid on the same detail page.
                    Parser   attachParser = new Parser(new Lexer(detailHtml));
                    NodeList fileGrids    = attachParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_cph_context_DownLoadFiles1_GridView1")));
                    if (fileGrids == null || fileGrids.Count <= 0)
                    {
                        continue;
                    }

                    TableTag fileGrid = fileGrids[0] as TableTag;
                    for (int fileRow = 1; fileRow < fileGrid.RowCount; fileRow++)
                    {
                        ATag link = fileGrid.Rows[fileRow].Columns[1].GetATag();
                        if (!link.IsAtagAttach())
                        {
                            continue;
                        }
                        try
                        {
                            BaseAttach attach = ToolHtml.GetBaseAttach("http://www.dgzb.com.cn/DGJYWEB/SiteManage/" + link.Link, link.LinkText, info.Id);
                            if (attach != null)
                            {
                                ToolDb.SaveEntity(attach, string.Empty);
                            }
                        }
                        catch {  }
                    }
                }
            }
            return(null);
        }
Exemple #4
0
        /// <summary>
        /// Collects <c>ProjectLic</c> records from the paged grid <c>ctl00_Main_GV_New</c> at <c>SiteUrl</c>.
        /// Each row's detail page is located through the row's <c>ondblclick</c> attribute, and the
        /// individual fields are extracted from the text of its <c>div.data_con</c> element.
        /// </summary>
        /// <param name="crawlAll">When false, the method returns as soon as the list holds <c>MaxCount</c> records.</param>
        /// <returns>
        /// The records gathered so far; an empty list when the first page cannot be downloaded or its
        /// page-count label cannot be parsed.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list            = new ArrayList();
            string htl             = string.Empty;
            string viewState       = string.Empty;
            string eventValidation = string.Empty;
            int    page            = 1;

            try
            {
                htl             = ToolHtml.GetHtmlByUrlEncode(SiteUrl, Encoding.UTF8);
                viewState       = this.ToolWebSite.GetAspNetViewState(htl);
                eventValidation = this.ToolWebSite.GetAspNetEventValidation(htl);
            }
            catch (Exception)
            {
                return(list);
            }
            Parser   parser  = new Parser(new Lexer(htl));
            NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "ctl00_Main_paging_LblPageCount")));

            if (tdNodes != null && tdNodes.Count > 0)
            {
                try
                {
                    page = int.Parse(tdNodes[0].ToPlainTextString().Trim());
                }
                catch { return(list); }
            }
            for (int i = 1; i <= page; i++)
            {
                if (i > 1)
                {
                    // Later pages are requested with an async postback of the "forward" paging button.
                    // viewState / eventValidation are the values captured from the first page and are
                    // re-sent unchanged for every page; "19AE96F3" is a fixed __VIEWSTATEGENERATOR value.
                    // NOTE(review): confirm the server keeps accepting the first page's state for all pages.
                    NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] {
                        "ctl00$ScriptManager1",
                        "__EVENTTARGET",
                        "__EVENTARGUMENT",
                        "ctl00$Main$ddl_type",
                        "ctl00$Main$txt_Title",
                        "ctl00$Main$paging$txtPageIndex",
                        "__VIEWSTATE",
                        "__VIEWSTATEGENERATOR",
                        "__VIEWSTATEENCRYPTED",
                        "__EVENTVALIDATION",
                        "__ASYNCPOST",
                        "ctl00$Main$paging$btnForward.x",
                        "ctl00$Main$paging$btnForward.y"
                    }, new string[] {
                        "ctl00$UpdatePanel1|ctl00$Main$paging$btnForward",
                        string.Empty,
                        string.Empty,
                        "1",
                        string.Empty,
                        i.ToString(),
                        viewState,
                        "19AE96F3",
                        "",
                        eventValidation,
                        "true",
                        "7", "9"
                    });
                    try
                    {
                        htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.UTF8);
                    }
                    catch (Exception) { continue; }
                }
                parser = new Parser(new Lexer(htl));
                NodeList tableList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_Main_GV_New")));
                if (tableList != null && tableList.Count > 0)
                {
                    TableTag table = (TableTag)tableList[0];
                    for (int j = 1; j < table.RowCount; j++)
                    {
                        string pPrjName = string.Empty, pBuildUnit = string.Empty,
                               pBuildAddress = string.Empty, pBuildManager = string.Empty,
                               pBuildScale = string.Empty, pPrjPrice = string.Empty,
                               pPrjStartDate = string.Empty, PrjEndDate = string.Empty,
                               pConstUnit = string.Empty, pConstUnitManager = string.Empty,
                               pSuperUnit = string.Empty, pSuperUnitManager = string.Empty,
                               pProspUnit = string.Empty, pProspUnitManager = string.Empty,
                               pDesignUnit = string.Empty, pDesignUnitManager = string.Empty,
                               pPrjManager = string.Empty, pSpecialPerson = string.Empty,
                               pLicUnit = string.Empty, pPrjLicCode = string.Empty,
                               PrjLicDate = string.Empty, pPrjDesc = string.Empty,
                               pInfoSource = string.Empty, pUrl = string.Empty,
                               pCreatetime = string.Empty, pPrjCode = string.Empty;
                        TableRow tr = table.Rows[j];
                        pPrjName   = tr.Columns[2].ToPlainTextString().Trim();
                        pPrjCode   = tr.Columns[1].ToPlainTextString().Trim();
                        pBuildUnit = tr.Columns[3].ToPlainTextString().Trim();
                        PrjLicDate = tr.Columns[4].ToPlainTextString().Trim();

                        // The detail address is embedded in the row's ondblclick handler. A row without
                        // that attribute has no detail page to follow, so it is skipped rather than
                        // letting a null reference abort the whole crawl.
                        string dblClick = tr.GetAttribute("ondblclick");
                        if (dblClick == null)
                        {
                            continue;
                        }
                        // ")" is replaced by the marker "kdxx" so the text between "&#39;" and that marker can be cut out.
                        pUrl = "http://www.szbajs.gov.cn/SiteManage/" + dblClick.Replace("&amp;", "&").Replace(")", "kdxx").GetRegexBegEnd("&#39;", "kdxx").Replace("&#39;", "");
                        string htmldetail = string.Empty;
                        try
                        {
                            htmldetail = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(pUrl), Encoding.UTF8);
                        }
                        catch (Exception)
                        {
                            continue;
                        }
                        Parser   parserdetail = new Parser(new Lexer(htmldetail));
                        NodeList dtnode       = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "data_con")));
                        if (dtnode != null && dtnode.Count > 0)
                        {
                            string ctx = dtnode.AsString().Replace(" ", "");
                            pInfoSource = ctx;

                            pDesignUnit   = ctx.GetRegex(new string[] { "设计单位" });
                            pBuildAddress = ctx.GetRegex(new string[] { "工程地址", "工程地点" });
                            pBuildScale   = ctx.GetRegex(new string[] { "建设规模", "建筑面积" });
                            pSuperUnit    = ctx.GetRegex(new string[] { "监理单位" });
                            pConstUnit    = ctx.GetRegex(new string[] { "施工单位" });
                            pLicUnit      = ctx.GetRegex(new string[] { "发证机关" });
                            pProspUnit    = ctx.GetRegex(new string[] { "勘察单位" });
                            pPrjManager   = ctx.GetRegex(new string[] { "项目经理", "项目负责人" });
                            pPrjStartDate = ctx.GetRegex(new string[] { "计划开工日期" });
                            PrjEndDate    = ctx.GetRegex(new string[] { "计划竣工日期" });
                            pPrjPrice     = ctx.GetMoneyRegex(new string[] { "工程造价" });
                            if (string.IsNullOrEmpty(PrjLicDate))
                            {
                                // Listing column was empty: fall back to the issue date on the detail page.
                                PrjLicDate = ctx.GetRegex("发证日期");
                            }

                            if (string.IsNullOrEmpty(pLicUnit))
                            {
                                pLicUnit = "深圳市宝安区建设局";
                            }
                            ProjectLic info = ToolDb.GenProjectLic(pPrjName, pBuildUnit, pBuildAddress, pBuildManager, pBuildScale, pPrjPrice, pPrjStartDate, PrjEndDate, pConstUnit, pConstUnitManager, pSuperUnit, pSuperUnitManager, pProspUnit, pProspUnitManager, pDesignUnit, pDesignUnitManager, pPrjManager, pSpecialPerson, pLicUnit, pPrjLicCode, PrjLicDate, pPrjDesc, "广东省", "深圳市宝安区", pInfoSource, pUrl, pCreatetime, pPrjCode, "深圳市宝安区建设局");
                            list.Add(info);
                            if (!crawlAll && list.Count >= this.MaxCount)
                            {
                                return(list);
                            }
                        }
                    }
                }
            }
            return(list);
        }
Exemple #5
0
        /// <summary>
        /// Collects <c>ProjectFinish</c> records from the paged <c>tblPrjConstBid</c> listing at
        /// <c>SiteUrl</c>. Only the four listing columns (code, name, build unit, end date) are read;
        /// no detail page is visited, so the remaining fields are passed on empty.
        /// </summary>
        /// <param name="crawlAll">When false, the method returns as soon as the list holds <c>MaxCount</c> records.</param>
        /// <returns>The records gathered so far; an empty list when the first page cannot be downloaded.</returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list           = new List <ProjectFinish>();
            string listingHtml    = string.Empty;
            int    totalPages     = 1;
            int    sinceLastPause = 1;

            try
            {
                listingHtml = ToolHtml.GetHtmlByUrlEncode(SiteUrl, Encoding.UTF8);
            }
            catch (Exception)
            {
                return(list);
            }

            // Page count comes from the pager cell; any failure leaves the single-page default.
            Parser   listingParser = new Parser(new Lexer(listingHtml));
            NodeList pagerCells    = listingParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "pageLinkTd")));
            if (pagerCells != null && pagerCells.Count > 0)
            {
                try
                {
                    string pagerText = pagerCells.AsString().ToNodeString();
                    totalPages = int.Parse(pagerText.GetRegexBegEnd("总页数", "页").Replace(":", ""));
                }
                catch (Exception) { }
            }

            for (int pageNo = 1; pageNo <= totalPages; pageNo++)
            {
                if (pageNo > 1)
                {
                    // Pages after the first are fetched by posting the page number with empty search fields.
                    string[] fieldNames  = new string[] { "page", "qymc", "ann_serial", "pro_name" };
                    string[] fieldValues = new string[] { pageNo.ToString(), "", "", "" };
                    NameValueCollection pageForm = this.ToolWebSite.GetNameValueCollection(fieldNames, fieldValues);
                    try
                    {
                        listingHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, pageForm, Encoding.UTF8);
                    }
                    catch (Exception)
                    {
                        continue;
                    }
                }

                listingParser = new Parser(new Lexer(listingHtml));
                NodeList gridNodes = listingParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "tblPrjConstBid")));
                if (gridNodes == null || gridNodes.Count <= 0)
                {
                    continue;
                }

                TableTag grid = (TableTag)gridNodes[0];
                // Row 0 and the final row are not visited.
                for (int rowNo = 1; rowNo < grid.RowCount - 1; rowNo++)
                {
                    // Fields the listing does not provide.
                    string url        = string.Empty;
                    string infoSource = string.Empty;
                    string constUnit  = string.Empty;
                    string superUnit  = string.Empty;
                    string designUnit = string.Empty;
                    string endDesc    = string.Empty;
                    string prjAddress = string.Empty;
                    string createTime = string.Empty;
                    string licUnit    = string.Empty;
                    string recordUnit = string.Empty;

                    TableRow row       = grid.Rows[rowNo];
                    string   prjCode   = row.Columns[0].ToNodePlainString();
                    string   prjName   = row.Columns[1].ToNodePlainString();
                    string   buildUnit = row.Columns[2].ToNodePlainString();
                    string   endDate   = row.Columns[3].ToNodePlainString().GetDateRegex();

                    // Nothing above assigns a recording unit, so this default applies to every row.
                    if (string.IsNullOrEmpty(recordUnit))
                    {
                        recordUnit = "深圳市住房和建设局";
                    }

                    ProjectFinish info = ToolDb.GenProjectFinish("广东省", url, "深圳市区", infoSource, endDate, constUnit, superUnit, designUnit, endDesc, prjAddress, buildUnit, prjCode, prjName, recordUnit, createTime, "深圳市住房和建设局", licUnit);
                    list.Add(info);
                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return(list);
                    }

                    // Throttle: once the counter reaches 200, reset it to 1 and pause for 600 seconds.
                    sinceLastPause++;
                    if (sinceLastPause >= 200)
                    {
                        sinceLastPause = 1;
                        Thread.Sleep(600 * 1000);
                    }
                }
            }
            return(list);
        }