예제 #1
0
        /// <summary>
        /// Crawls the announcement listing under www.shajing.gov.cn, downloads the detail page
        /// of every listed item and converts it into a record built by <c>ToolDb.GenBidInfo</c>.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected;
        /// when <c>true</c>, every listing page is processed.
        /// </param>
        /// <returns>
        /// The collected records. The list is empty when the first listing page cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list    = new ArrayList();
            int    pageInt = 1;
            string html    = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
            }
            catch (Exception)
            {
                // Best effort: without the first listing page there is nothing to crawl.
                return(list);
            }

            // The pager <div class="NewsPage"> holds a createPageHTML(...) call; stripping everything
            // but the remaining digits is expected to leave the page count.
            Parser   parser = new Parser(new Lexer(html));
            NodeList sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "NewsPage")));

            if (sNode != null && sNode.Count > 0)
            {
                try
                {
                    string temp = sNode.AsString().Replace("createPageHTML(", "").Replace("index", "").Replace("html", "").Replace("0,", "").Replace(");", "").Replace(",", "").Replace(";", "").Replace(")", "").Replace("\"", "").Replace(" ", "");
                    pageInt = Convert.ToInt32(temp);
                }
                catch { pageInt = 1; }
            }

            // The patterns never change, so build the regex objects once instead of once per item.
            Regex regDate     = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
            Regex regexHtml   = new Regex(@"<script[^<]*</script>|<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
            Regex regScript   = new Regex("(<script)[\\s\\S]*?(</script>)");
            Regex regAnyTag   = new Regex("<[^>]*>");
            Regex regPrjCode  = new Regex(@"(工程编号|项目编号|招标编号|中标编号|编号)(:|:)[^\r\n]+\r\n");
            Regex regBuidUnit = new Regex(@"(建设单位|招标人|承包人|招标单位|招标方|招标代理机构)(:|:)[^\r\n]+\r\n");
            Regex regMoney    = new Regex(@"(中标价|投标价|总投资|发包价|投标报价|价格|金额)(:|:|)[^\r\n]+\r\n");
            Regex regBidUnit  = new Regex(@"(第一候选人|中标候选人|中标单位|中标人|中标方)(:|:)[^\r\n]+\r\n");
            Regex regprjMgr   = new Regex(@"(项目经理姓名|项目经理|项目负责人|项目总监|建造师|总工程师|监理师)(:|:)[^\r\n]+\r\n");
            // Matches a plain number or one with thousands separators. The previous pattern
            // required at least two digits, so a single-digit amount produced an empty value.
            Regex regBidMoney = new Regex(@"[0-9]{1,3}(,[0-9]{3})+(\.[0-9]+)?|[0-9]+(\.[0-9]+)?");

            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        // Page 1 is SiteUrl itself; page i (i > 1) is index_{i-1}.html.
                        html = this.ToolWebSite.GetHtmlByUrl("http://www.shajing.gov.cn/xxgk_14947/ywxx/zbcg/zhbgg/index_" + (i - 1).ToString() + ".html", Encoding.UTF8);
                    }
                    catch { continue; }
                }

                parser = new Parser(new Lexer(html));
                NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "NewsLiks01Text"))), new TagNameFilter("ul"))), new TagNameFilter("li")));
                if (viewList == null || viewList.Count == 0)
                {
                    continue;
                }

                for (int j = 0; j < viewList.Count; j++)
                {
                    string prjName = string.Empty,
                           buildUnit = string.Empty, bidUnit = string.Empty,
                           bidMoney = string.Empty, code = string.Empty,
                           beginDate = string.Empty,
                           endDate = string.Empty, bidType = string.Empty,
                           specType = string.Empty, InfoUrl = string.Empty,
                           msgType = string.Empty, bidCtx = string.Empty,
                           prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

                    string itemText = viewList[j].ToPlainTextString().Trim();
                    beginDate = regDate.Match(itemText).Value;

                    // string.Replace throws ArgumentException for an empty search string, so only
                    // strip the date when one was actually found. Previously an undated item
                    // aborted the whole crawl because this statement is outside any try block.
                    string temp = beginDate.Length > 0 ? itemText.Replace(beginDate, "") : itemText;
                    try
                    {
                        // The item text embeds script source; the anchor of interest follows "else{".
                        // Any missing marker makes Substring throw, which skips the item.
                        int beg = temp.IndexOf("else{"), end = temp.Length;
                        temp    = temp.Substring(beg, end - beg);
                        beg     = temp.IndexOf("<a");
                        end     = temp.IndexOf("/a>");
                        temp    = temp.Substring(beg, (end - beg) + 3);
                        beg     = temp.IndexOf(">");
                        end     = temp.IndexOf("</");
                        prjName = temp.Substring(beg + 1, end - beg - 1);
                        Parser   p    = new Parser(new Lexer(temp));
                        NodeList l    = p.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                        ATag     aTag = l.SearchFor(typeof(ATag), true)[0] as ATag;
                        InfoUrl = "http://www.shajing.gov.cn/xxgk_14947/ywxx/zbcg/zhbgg/" + aTag.Link.Replace("../", "").Replace("./", "");
                    }
                    catch { continue; }

                    string htlDtl = string.Empty;
                    try
                    {
                        htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                        htlDtl = regexHtml.Replace(htlDtl, "");
                    }
                    catch { continue; }

                    parser = new Parser(new Lexer(htlDtl));
                    NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "DivContent")));
                    if (dtl == null || dtl.Count == 0)
                    {
                        continue;
                    }

                    // HtmlTxt keeps the markup (minus scripts); bidCtx is the tag-free text with
                    // line breaks normalised so the "label:value\r\n" patterns below can match.
                    HtmlTxt = regScript.Replace(dtl.AsHtml(), "");
                    bidCtx  = regScript.Replace(HtmlTxt, "");
                    bidCtx  = regAnyTag.Replace(bidCtx, "").Replace("&nbsp;", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");

                    string compactCtx = bidCtx.Replace(" ", "");

                    code = regPrjCode.Match(compactCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("招标编号", "").Replace("中标编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Trim();

                    buildUnit = regBuidUnit.Match(compactCtx).Value.Replace("招标代理机构", "").Replace("建设单位", "").Replace("招标人", "").Replace("承包人", "").Replace("招标单位", "").Replace("招标方", "").Replace(":", "").Replace(":", "").Trim();

                    bidMoney = regMoney.Match(compactCtx).Value.Replace("中标价", "").Replace("总投资", "").Replace("发包价", "").Replace("投标报价", "").Replace("投标价", "").Replace("价格", "").Replace("金额", "").Replace(":", "").Replace(":", "").Trim();

                    bidUnit = regBidUnit.Match(compactCtx).Value.Replace("中标候选人", "").Replace("第一候选人", "").Replace("中标单位", "").Replace("中标人", "").Replace("中标方", "").Replace(":", "").Replace(":", "").Trim();

                    prjMgr = regprjMgr.Match(compactCtx).Value.Replace("项目经理姓名", "").Replace("总工程师", "").Replace("项目经理", "").Replace("项目总监", "").Replace("建造师", "").Replace("监理师", "").Replace("项目负责人", "").Replace(":", "").Replace(":", "").Trim();

                    // The amount is stored in units of 10,000: a value followed by "万" is taken
                    // as-is, anything else is divided by 10,000.
                    if (bidMoney.Contains("万"))
                    {
                        bidMoney = bidMoney.Remove(bidMoney.IndexOf("万")).Trim();
                        bidMoney = regBidMoney.Match(bidMoney).Value.Replace(",", "");
                    }
                    else
                    {
                        decimal amount;
                        if (decimal.TryParse(regBidMoney.Match(bidMoney).Value.Replace(",", ""), out amount))
                        {
                            decimal scaled = amount / 10000;
                            // Amounts below 0.1 (in units of 10,000) are treated as "no amount".
                            bidMoney = scaled < 0.1m ? "0" : scaled.ToString();
                        }
                        else
                        {
                            bidMoney = "0";
                        }
                    }

                    if (prjMgr.Contains("资格"))
                    {
                        prjMgr = prjMgr.Remove(prjMgr.IndexOf("资格"));
                    }

                    buildUnit = ToolHtml.GetSubString(buildUnit, 150);
                    bidUnit   = ToolHtml.GetSubString(bidUnit, 150);
                    code      = ToolHtml.GetSubString(code, 50);
                    prjMgr    = ToolHtml.GetSubString(prjMgr, 50);
                    if (string.IsNullOrEmpty(buildUnit))
                    {
                        buildUnit = "深圳市宝安区沙井街道办事处";
                    }

                    msgType  = "深圳市宝安区沙井街道办事处";
                    specType = "建设工程";
                    // The bid type is fixed for this source. The earlier keyword-based guess from
                    // the project name was always overwritten by this value, so it was removed.
                    bidType  = "小型工程";
                    prjName  = ToolDb.GetPrjName(prjName);

                    BidInfo info = ToolDb.GenBidInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType,
                                                     bidMoney, InfoUrl, prjMgr, HtmlTxt);
                    list.Add(info);
                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return(list);
                    }
                }
            }
            return(list);
        }
예제 #2
0
파일: NanFangDW.cs 프로젝트: SHNXJMG/Small
        /// <summary>
        /// Crawls the tender listing under www.bidding.csg.cn, downloads the detail page of every
        /// listed item, converts it into a record built by <c>ToolDb.GenInviteInfo</c> and registers
        /// every attachment link found on the detail page in <c>base.AttachList</c>.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected;
        /// when <c>true</c>, every listing page is processed.
        /// </param>
        /// <returns>
        /// The collected records. The list is empty when the first listing page cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list    = new List<InviteInfo>();
            int    pageInt = 1;
            string html    = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
            }
            catch (Exception)
            {
                // Best effort: without the first listing page there is nothing to crawl.
                return(list);
            }

            // The pager text is expected to contain ".../N页"; N is the page count.
            Parser   parser = new Parser(new Lexer(html));
            NodeList noList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "Top10 TxtCenter")));

            if (noList != null && noList.Count > 0)
            {
                string temp = noList.AsString().GetRegexBegEnd("/", "页");
                try
                {
                    pageInt = Convert.ToInt32(temp);
                }
                catch { pageInt = 1; }
            }

            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl("http://www.bidding.csg.cn/zbgg/index_" + i.ToString() + ".jhtml", Encoding.UTF8);
                    }
                    catch
                    {
                        continue;
                    }
                }

                parser = new Parser(new Lexer(html));
                NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "W750 Right")), true), new TagNameFilter("li")));
                if (nodeList == null || nodeList.Count == 0)
                {
                    continue;
                }

                // NOTE(review): iteration starts at index 1, so the first matched <li> is never
                // processed - presumably a header entry; confirm against the page markup.
                for (int j = 1; j < nodeList.Count; j++)
                {
                    string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty,
                           prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty,
                           specType = string.Empty, beginDate = string.Empty, endDate = string.Empty,
                           remark = string.Empty, InfoUrl = string.Empty,
                           msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

                    ATag aTag = nodeList[j].GetATag();
                    if (aTag == null)
                    {
                        // An item without a link cannot be followed; skip it instead of letting a
                        // NullReferenceException abort the whole crawl.
                        continue;
                    }

                    prjName   = aTag.LinkText;
                    beginDate = nodeList[j].ToPlainTextString().GetDateRegex();
                    InfoUrl   = "http://www.bidding.csg.cn" + aTag.Link;

                    string htlDtl = string.Empty;
                    try
                    {
                        htlDtl = ToolHtml.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
                    }
                    catch
                    {
                        continue;
                    }

                    parser = new Parser(new Lexer(htlDtl));
                    NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "Center W1000")));
                    if (dtlNode == null || dtlNode.Count == 0)
                    {
                        continue;
                    }

                    HtmlTxt = dtlNode.AsHtml();

                    // Prefer the headline of the detail page over the link text of the listing.
                    parser = new Parser(new Lexer(HtmlTxt));
                    NodeList nameNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("h1"), new HasAttributeFilter("class", "TxtCenter Padding10")));
                    if (nameNode != null && nameNode.Count > 0)
                    {
                        prjName = nameNode[0].ToNodePlainString();
                    }
                    inviteCtx = HtmlTxt.ToCtxString();

                    inviteType = ToolHtml.GetInviteTypes(prjName);
                    prjAddress = ToolHtml.GetRegexString(inviteCtx, ToolHtml.AddressRegex);
                    buildUnit  = ToolHtml.GetRegexString(inviteCtx, ToolHtml.BuildRegex);
                    code       = ToolHtml.GetRegexString(inviteCtx, ToolHtml.CodeRegex);
                    prjAddress = ToolHtml.GetSubString(prjAddress, 150);
                    buildUnit  = ToolHtml.GetSubString(buildUnit, 150);
                    code       = ToolHtml.GetSubString(code, 50);
                    if (string.IsNullOrEmpty(code))
                    {
                        code = "见招标信息";
                    }
                    if (string.IsNullOrEmpty(prjAddress))
                    {
                        prjAddress = "见招标信息";
                    }
                    specType = "其他";
                    msgType  = "中国南方电网有限责任公司招标服务中心";
                    if (string.IsNullOrEmpty(buildUnit))
                    {
                        buildUnit = "中国南方电网有限责任公司招标服务中心";
                    }

                    InviteInfo info = ToolDb.GenInviteInfo("广东省", "电网专项工程", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                    list.Add(info);

                    // Register every attachment link of the detail page against the new record.
                    parser = new Parser(new Lexer(HtmlTxt));
                    NodeList nodeAtag = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                    if (nodeAtag != null && nodeAtag.Count > 0)
                    {
                        for (int c = 0; c < nodeAtag.Count; c++)
                        {
                            ATag a = nodeAtag[c] as ATag;
                            if (a == null)
                            {
                                // "as" yields null when the node is not an ATag; skip it rather
                                // than dereferencing null.
                                continue;
                            }
                            if (a.Link.IsAtagAttach())
                            {
                                string alink = "http://www.bidding.csg.cn/" + a.Link;
                                try
                                {
                                    BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText.Replace("&nbsp", "").Replace(";", "").Replace(";", ""), info.Id, alink);
                                    base.AttachList.Add(attach);
                                }
                                catch
                                {
                                    // Deliberately best effort: a broken attachment must not
                                    // discard the announcement that was already added.
                                }
                            }
                        }
                    }

                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return(list);
                    }
                }
            }
            return(list);
        }
예제 #3
0
        /// <summary>
        /// Crawls the tender listing at <c>SiteUrl</c> (detail links are resolved against
        /// www.szns.gov.cn), downloads the detail page of every listed item and converts it into
        /// a record built by <c>ToolDb.GenInviteInfo</c>.
        /// </summary>
        /// <remarks>
        /// Only the first listing page is processed. The page count is still read from the pager,
        /// but no request for pages 2..N is implemented; the previous code therefore parsed page
        /// one again for every page, re-downloading each detail page and adding duplicate records.
        /// </remarks>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected.
        /// </param>
        /// <returns>
        /// The collected records. The list is empty when the listing page cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList list = new ArrayList();
            // Determine the page count.
            int    pageInt = 1;
            string html    = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
            }
            catch (Exception)
            {
                // Best effort: without the listing page there is nothing to crawl.
                return(list);
            }

            Parser   parser = new Parser(new Lexer(html));
            NodeList sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "jwpage")));

            if (sNode != null && sNode.Count > 0)
            {
                try
                {
                    string temp = sNode.AsString();
                    Regex  reg  = new Regex(@"/共[^页]+页");
                    pageInt = Convert.ToInt32(reg.Match(temp).Value.Replace("/共", "").Replace("页", ""));
                }
                catch { pageInt = 1; }
            }

            // The patterns never change, so build the regex objects once instead of once per item.
            Regex regDate      = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
            Regex regexHtml    = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
            Regex regPrjAddr   = new Regex(@"(工程位置|工程地点|工程地址)(:|:)[^\r\n]+\r\n");
            // "招标单位(盖章)" in a pattern treats the parentheses as a group and so only matches
            // "招标单位盖章"; the escaped alternative additionally matches the literal "(盖章)"
            // that the Replace chain below strips.
            Regex regBuildUnit = new Regex(@"(招标单位|招标人|招标单位(盖章)|招标单位\(盖章\))(:|:)[^\r\n]+\r\n");
            Regex regPrjCode   = new Regex(@"(工程编号|项目编号|编号)(:|:)[^\r\n]+\r\n");

            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    // TODO: download listing page i once the paging URL scheme is known. Until
                    // then stop here: html still holds page one, and parsing it again would only
                    // repeat the same downloads and add the same records a second time.
                    break;
                }

                parser = new Parser(new Lexer(html));
                NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "jwRercon"))), new TagNameFilter("ul"))), new TagNameFilter("li")));
                if (dtlList == null || dtlList.Count == 0)
                {
                    continue;
                }

                // All anchors below the list items, collected once. Item j is paired with anchor j,
                // which assumes exactly one link per <li> (unchanged from the previous behaviour).
                NodeList linkNodes = dtlList.SearchFor(typeof(ATag), true);

                for (int j = 0; j < dtlList.Count; j++)
                {
                    string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty,
                           prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty,
                           specType = string.Empty, beginDate = string.Empty, endDate = string.Empty,
                           remark = string.Empty, InfoUrl = string.Empty,
                           msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

                    string itemText = dtlList[j].ToPlainTextString().Trim();

                    // The title is everything before the first "[". string.Remove throws for a
                    // negative index, so keep the whole text when there is no bracket.
                    int bracketPos = itemText.IndexOf("[");
                    prjName   = bracketPos >= 0 ? itemText.Remove(bracketPos) : itemText;
                    beginDate = regDate.Match(itemText).Value;

                    ATag aTag = (linkNodes != null && j < linkNodes.Count) ? linkNodes[j] as ATag : null;
                    if (aTag == null)
                    {
                        // No link for this item: nothing to download.
                        continue;
                    }
                    InfoUrl = "http://www.szns.gov.cn" + aTag.Link;

                    string htmDtl = string.Empty;
                    try
                    {
                        htmDtl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.UTF8);
                        htmDtl = regexHtml.Replace(htmDtl, "");
                    }
                    catch { continue; }

                    parser = new Parser(new Lexer(htmDtl));
                    NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "hyxzf2")));
                    if (dtl == null || dtl.Count == 0)
                    {
                        continue;
                    }

                    HtmlTxt   = dtl.AsHtml();
                    inviteCtx = dtl.AsString().Replace("&nbsp;", "").Replace("\n", "\r\n");

                    string compactCtx = inviteCtx.Replace(" ", "");

                    prjAddress = regPrjAddr.Match(compactCtx).Value.Replace("工程位置", "").Replace("工程地点", "").Replace("工程地址", "").Replace(":", "").Replace(":", "").Trim();
                    buildUnit  = regBuildUnit.Match(compactCtx).Value.Replace("招标单位", "").Replace("招标人", "").Replace("(盖章)", "").Replace(":", "").Replace(":", "").Trim();
                    code       = regPrjCode.Match(compactCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Trim();
                    msgType    = "深圳市南山区粤海街道办事处";
                    if (string.IsNullOrEmpty(prjAddress))
                    {
                        prjAddress = "见招标信息";
                    }
                    code      = ToolHtml.GetSubString(code, 50);
                    buildUnit = ToolHtml.GetSubString(buildUnit, 150);
                    specType  = "建设工程";
                    // The invite type is fixed for this source. The earlier keyword-based guess
                    // from the project name was always overwritten by this value, so it was removed.
                    inviteType = "小型工程";
                    if (string.IsNullOrEmpty(buildUnit))
                    {
                        buildUnit = "深圳市南山区粤海街道办事处";
                    }
                    inviteType = ToolHtml.GetInviteType(inviteType);

                    InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "南山区", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                    list.Add(info);
                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return(list);
                    }
                }
            }
            return(list);
        }
예제 #4
0
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList list = new ArrayList();
            //取得页码
            int    pageInt = 1;
            string html    = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8);
            }
            catch (Exception ex)
            {
                return(list);
            }
            Parser   parser = new Parser(new Lexer(html));
            NodeList sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new TagNameFilter("div")), new HasAttributeFilter("id", "page_div")));

            if (sNode != null && sNode.Count > 0)
            {
                string page = ToolHtml.GetRegexString(sNode.AsString(), "共", "页");
                try
                {
                    pageInt = int.Parse(page);
                }
                catch
                {
                    pageInt = 7;
                }
            }
            parser.Reset();
            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl("http://www.conghua.gov.cn/zgch/zbzb/list_" + i.ToString() + ".shtml", Encoding.Default);
                    }
                    catch (Exception ex)
                    {
                        continue;
                    }
                }
                parser = new Parser(new Lexer(html));
                sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "list_list"))), new TagNameFilter("table")));
                if (sNode != null && sNode.Count > 0)
                {
                    TableTag table = sNode[0] as TableTag;
                    for (int j = 0; j < table.RowCount; j++)
                    {
                        TableRow tr          = table.Rows[j];
                        string   projectName = ToolHtml.GetHtmlAtagValue("title", tr.ToHtml());
                        if (!projectName.Contains("中标") && !projectName.Contains("结果") && !projectName.Contains("候选单位公示"))
                        {
                            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty,
                                   prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty,
                                   specType = string.Empty, beginDate = string.Empty, endDate = string.Empty,
                                   remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty,
                                   CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;
                            prjName    = projectName;
                            inviteType = ToolHtml.GetInviteTypes(projectName);
                            beginDate  = ToolHtml.GetRegexDateTime(tr.Columns[1].ToPlainTextString());
                            InfoUrl    = "http://www.conghua.gov.cn" + ToolHtml.GetHtmlAtagValue("href", tr.ToHtml()).Replace("..", "");
                            string htmlDtl = string.Empty;
                            try
                            {
                                htmlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                                htmlDtl = ToolHtml.GetRegexHtlTxt(htmlDtl);
                            }
                            catch { continue; }
                            parser = new Parser(new Lexer(htmlDtl));
                            NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "zoomcon")));
                            if (dtlList != null && dtlList.Count > 0)
                            {
                                HtmlTxt   = dtlList.ToHtml();
                                inviteCtx = dtlList.AsString().Replace("&nbsp;", "");

                                buildUnit = ToolHtml.GetRegexString(inviteCtx, ToolHtml.BuildRegex, true);
                                if (!string.IsNullOrEmpty(buildUnit) && buildUnit.Contains(" "))
                                {
                                    buildUnit = buildUnit.Remove(buildUnit.IndexOf(" "));
                                }

                                buildUnit  = ToolHtml.GetSubString(buildUnit, 150);
                                msgType    = "广州建设工程交易中心";
                                specType   = "建设工程";
                                inviteType = inviteType == "" ? "小型工程" : inviteType;
                                if (string.IsNullOrEmpty(buildUnit))
                                {
                                    buildUnit = "广州建设工程交易中心";
                                }
                                InviteInfo info = ToolDb.GenInviteInfo("广东省", "广州市区", "从化市", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                                list.Add(info);
                                if (!crawlAll && list.Count >= this.MaxCount)
                                {
                                    return(list);
                                }
                            }
                        }
                        else
                        {
                            string prjName = string.Empty,
                                   buildUnit = string.Empty, bidUnit = string.Empty,
                                   bidMoney = string.Empty, code = string.Empty,
                                   bidDate = string.Empty,
                                   beginDate = string.Empty,
                                   endDate = string.Empty, bidType = string.Empty,
                                   specType = string.Empty, InfoUrl = string.Empty,
                                   msgType = string.Empty, bidCtx = string.Empty,
                                   prjAddress = string.Empty, remark = string.Empty,
                                   prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;
                            prjName   = projectName;
                            bidType   = ToolHtml.GetInviteTypes(projectName);
                            beginDate = ToolHtml.GetRegexDateTime(tr.Columns[1].ToPlainTextString());
                            InfoUrl   = "http://www.conghua.gov.cn" + ToolHtml.GetHtmlAtagValue("href", tr.ToHtml()).Replace("..", "");
                            string htmlDtl = string.Empty;
                            try
                            {
                                htmlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                                htmlDtl = ToolHtml.GetRegexHtlTxt(htmlDtl);
                            }
                            catch { continue; }
                            parser = new Parser(new Lexer(htmlDtl));
                            NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "zoomcon")));
                            if (dtlList != null && dtlList.Count > 0)
                            {
                                HtmlTxt   = dtlList.ToHtml();
                                bidCtx    = dtlList.AsString();
                                buildUnit = ToolHtml.GetRegexString(bidCtx, ToolHtml.BuildRegex, true);
                                buildUnit = ToolHtml.GetSubString(buildUnit, 150);
                                msgType   = "广州建设工程交易中心";
                                specType  = "建设工程";
                                bidType   = bidType == "" ? bidType : "小型工程";

                                parser = new Parser(new Lexer(HtmlTxt));
                                NodeList bidNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                                if (bidNode != null && bidNode.Count > 0)
                                {
                                    string   ctx      = string.Empty;
                                    TableTag bidTable = bidNode[0] as TableTag;
                                    try
                                    {
                                        for (int r = 0; r < bidTable.RowCount; r++)
                                        {
                                            ctx += bidTable.Rows[r].Columns[0].ToNodePlainString() + ":";
                                            ctx += bidTable.Rows[r].Columns[1].ToNodePlainString() + "\r\n";
                                        }
                                    }
                                    catch { }

                                    bidUnit  = ctx.GetRegex("单位名称,承包意向人名称");
                                    bidMoney = ctx.GetMoneyRegex();
                                    prjMgr   = ctx.GetMgrRegex();
                                    if (prjMgr.Contains("/"))
                                    {
                                        prjMgr = prjMgr.Remove(prjMgr.IndexOf("/"));
                                    }
                                }

                                if (string.IsNullOrEmpty(buildUnit))
                                {
                                    buildUnit = "广州建设工程交易中心";
                                }
                                BidInfo info = ToolDb.GenBidInfo("广东省", "广州市区", "从化市", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType,
                                                                 bidMoney, InfoUrl, prjMgr, HtmlTxt);
                                list.Add(info);
                                if (!crawlAll && list.Count >= this.MaxCount)
                                {
                                    return(list);
                                }
                            }
                        }
                    }
                }
            }
            return(list);
        }
예제 #5
0
        /// <summary>
        /// Crawls the paged notice list that starts at <c>SiteUrl</c>, downloads every linked
        /// notice page and converts it into an <c>InviteInfo</c> record. Links inside a notice
        /// whose text contains ".doc", ".dwg" or ".xls" are registered in
        /// <c>base.AttachList</c> as attachments of that record.
        /// </summary>
        /// <param name="crawlAll">
        /// When false, crawling stops as soon as <c>MaxCount</c> records have been collected.
        /// </param>
        /// <returns>
        /// The collected records; an empty list when the first list page cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list      = new ArrayList();
            string htl       = string.Empty;
            string cookiestr = string.Empty;
            int    page      = 1;

            // Loop-invariant patterns, built once instead of once per notice.
            Regex regexPage     = new Regex(@"\d+页");
            Regex regeximg      = new Regex(@"<IMG[^>]*>"); // strips images from the stored HTML
            Regex regexHtml     = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
            Regex regcode       = new Regex(@"(项目编号|招标编号)(:|:)[^\r\n]+\r\n");
            Regex regprjAddress = new Regex(@"地址(:|:)[^\r\n]+\r\n");
            Regex regBuidUnit   = new Regex(@"(招标机构|委托单位)(:|:)[^\r\n]+\r\n");

            // Invite types that are classified as construction work.
            string[] constructionTypes = { "设备材料", "小型施工", "专业分包", "劳务分包", "服务", "勘察", "设计", "监理", "施工" };

            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default, ref cookiestr);
            }
            catch (Exception)
            {
                // Without the first list page there is nothing to crawl.
                return list;
            }

            Parser   parser   = new Parser(new Lexer(htl));
            NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("align", "right")));

            // The page count is read from text of the form "<digits>页"; when it is
            // missing or unparsable the default of a single page is kept.
            if (nodeList != null)
            {
                string pageText = regexPage.Match(nodeList.AsString()).Value.Replace("页", "").Trim();
                int    parsedPage;
                if (int.TryParse(pageText, out parsedPage))
                {
                    page = parsedPage;
                }
            }

            for (int i = 1; i <= page; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl + "&page=" + i.ToString()), Encoding.Default);
                    }
                    catch (Exception)
                    {
                        // A page that fails to download is skipped; the remaining pages are still crawled.
                        continue;
                    }
                }

                parser = new Parser(new Lexer(htl));
                NodeList tableNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("align", "center")));

                // The list is read from the fourth matching table, so at least four must exist
                // (the previous "Count > 1" guard let the [3] index throw).
                if (tableNodeList == null || tableNodeList.Count <= 3)
                {
                    continue;
                }
                TableTag table = tableNodeList[3] as TableTag;
                if (table == null)
                {
                    continue;
                }

                // The last row of the table is deliberately not visited (RowCount - 1).
                for (int j = 0; j < table.RowCount - 1; j++)
                {
                    string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty,
                           prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty,
                           specType = string.Empty, beginDate = string.Empty, endDate = string.Empty,
                           remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                           otherType = string.Empty, HtmlTxt = string.Empty;

                    TableRow tr   = table.Rows[j];
                    ATag     aTag = tr.GetATag(1);
                    // NOTE(review): assumes GetATag yields null for a row without a link - confirm.
                    if (aTag == null)
                    {
                        continue;
                    }

                    prjName = aTag.LinkText;
                    if (prjName == "参加网上竞价招标供应商,敬请浏览以下网站")
                    {
                        // Static banner row, not a notice.
                        continue;
                    }
                    if (tr.ColumnCount < 2)
                    {
                        // No date column to read; treat the row as malformed.
                        continue;
                    }
                    beginDate = tr.Columns[1].ToPlainTextString().Trim();
                    InfoUrl   = "http://zhaobiao.szpt.edu.cn/" + aTag.Link;

                    string htmldetail;
                    try
                    {
                        htmldetail = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.Default);
                    }
                    catch (Exception)
                    {
                        continue;
                    }

                    Parser   parserdetail = new Parser(new Lexer(htmldetail));
                    NodeList dtnode       = parserdetail.ExtractAllNodesThatMatch(new TagNameFilter("p"));
                    if (dtnode == null || dtnode.Count == 0)
                    {
                        continue;
                    }

                    HtmlTxt = regeximg.Replace(dtnode.AsHtml(), "");

                    // Plain text of the notice: one line per <p> element.
                    StringBuilder ctxBuilder = new StringBuilder();
                    for (int z = 0; z < dtnode.Count; z++)
                    {
                        ctxBuilder.Append(dtnode[z].ToPlainTextString().Replace("&nbsp;", "").Trim()).Append("\r\n");
                    }
                    inviteCtx = regexHtml.Replace(ctxBuilder.ToString(), "");

                    code = regcode.Match(inviteCtx).Value.Replace("项目编号:", "").Replace("招标编号:", "").Replace(":", "").Trim();
                    code = ToolHtml.GetSubString(code, 30);
                    prjAddress = regprjAddress.Match(inviteCtx).Value.Replace("地址:", "").Trim();
                    buildUnit  = regBuidUnit.Match(inviteCtx).Value.Replace("招标机构:", "").Replace("委托单位:", "").Trim();

                    // The invite type must be known before the speciality is derived from it;
                    // previously it was still empty at this point, so specType was always "其他".
                    inviteType = ToolHtml.GetInviteTypes(prjName);
                    specType   = Array.IndexOf(constructionTypes, inviteType) >= 0 ? "建设工程" : "其他";

                    if (prjAddress == "")
                    {
                        prjAddress = "见招标信息";
                    }
                    msgType = "深职院";

                    InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "",
                                                           string.Empty, code, prjName, prjAddress, buildUnit,
                                                           beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                    list.Add(info);

                    // Second pass over the same page to collect attachment links located under <p>.
                    parserdetail = new Parser(new Lexer(htmldetail));
                    NodeList nodedown = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new TagNameFilter("p"), true), new TagNameFilter("a")));
                    if (nodedown != null && nodedown.Count > 0)
                    {
                        // Resolve the anchor list once rather than on every iteration.
                        NodeList downLinks = nodedown.SearchFor(typeof(ATag), true);
                        for (int k = 0; k < nodedown.Count && k < downLinks.Count; k++)
                        {
                            ATag aTagdown = downLinks[k] as ATag;
                            if (aTagdown == null || aTagdown.LinkText == null)
                            {
                                continue;
                            }
                            string linkText = aTagdown.LinkText;
                            if (linkText.Contains(".doc") || linkText.Contains(".dwg") || linkText.Contains(".xls"))
                            {
                                BaseAttach attach = ToolDb.GenBaseAttach(linkText, info.Id, "http://zhaobiao.szpt.edu.cn" + aTagdown.Link);
                                base.AttachList.Add(attach);
                            }
                        }
                    }

                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return list;
                    }
                }
            }

            // Return what was collected (this used to return null, losing every record of a full crawl).
            return list;
        }
예제 #6
0
        /// <summary>
        /// Crawls the paged bid-result list that starts at <c>SiteUrl</c>, downloads every
        /// linked notice and converts it into a <c>BidInfo</c> record. Project code, build
        /// unit, bid amount, winning bidder and project manager are extracted from the notice
        /// text with regular expressions; when no bidder is found in the text, tables with
        /// class "MsoNormalTable" are read as header-row/value-row pairs instead.
        /// </summary>
        /// <param name="crawlAll">
        /// When false, crawling stops as soon as <c>MaxCount</c> records have been collected.
        /// </param>
        /// <returns>
        /// The collected records; an empty list when the first list page cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list    = new List<BidInfo>();
            int    pageInt = 1;
            string html    = string.Empty;

            // Loop-invariant patterns, built once instead of once per notice.
            Regex regPrjCode  = new Regex(@"(工程编号|项目编号|招标编号|中标编号|编号)(:|:)[^\r\n]+\r\n");
            Regex regBuidUnit = new Regex(@"(建设单位|招标人|承包人|招标单位|招标方|招标代理机构)(:|:)[^\r\n]+\r\n");
            Regex regMoney    = new Regex(@"(中标价|投标价|总投资|发包价|投标报价|价格|金额|总价)(:|:|)[^\r\n]+\r\n");
            Regex regBidUnit  = new Regex(@"(成交供应商|中标供应商|第一候选人|中标候选人|中标单位|中标人|中标方)(:|:)[^\r\n]+\r\n");
            Regex regprjMgr   = new Regex(@"(项目经理姓名|项目经理(或建造师)|项目经理|项目负责人|项目总监|建造师|总工程师|监理师)(:|:)[^\r\n]+\r\n");
            // Integer or decimal number. The previous pattern required at least two digits,
            // so an amount such as "5" (from "5万") produced an empty string.
            Regex regBidMoney = new Regex(@"[0-9]+(?:[.][0-9]+)?");

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
            }
            catch (Exception)
            {
                // Without the first list page there is nothing to crawl.
                return list;
            }

            Parser   parser = new Parser(new Lexer(html));
            NodeList sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "venycms-page")), true), new TagNameFilter("script")));

            if (sNode != null && sNode.Count > 0)
            {
                try
                {
                    // The page count is taken from the pager script's createPageHTML(...) call.
                    string page = sNode.ToString().Replace("createPageHTML(", "").Replace(",", "kd").Replace("****", "").Replace("\n", "");
                    page = page.GetRegexBegEnd("Code", "kd");
                    int parsedPage;
                    if (int.TryParse(page, out parsedPage))
                    {
                        pageInt = parsedPage;
                    }
                }
                catch (Exception)
                {
                    // Best effort: keep the default of a single page when the pager cannot be read.
                }
            }

            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        // Follow-up pages are named index_1.html, index_2.html, ...
                        html = this.ToolWebSite.GetHtmlByUrl("http://sgjd.baoan.gov.cn/zbcg/zhbgg_139208/index_" + (i - 1) + ".html", Encoding.UTF8);
                    }
                    catch (Exception)
                    {
                        continue;
                    }
                }

                parser = new Parser(new Lexer(html));
                NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "content clearfix")), true), new TagNameFilter("li")));
                if (viewList == null || viewList.Count == 0)
                {
                    continue;
                }

                for (int j = 0; j < viewList.Count; j++)
                {
                    string prjName = string.Empty,
                           buildUnit = string.Empty, bidUnit = string.Empty,
                           bidMoney = string.Empty, code = string.Empty,
                           beginDate = string.Empty, endDate = string.Empty,
                           bidType = string.Empty, specType = string.Empty,
                           InfoUrl = string.Empty, msgType = string.Empty,
                           bidCtx = string.Empty, prjMgr = string.Empty,
                           otherType = string.Empty, HtmlTxt = string.Empty;

                    ATag aTag = viewList[j].GetATag();
                    // NOTE(review): assumes GetATag yields null for an item without a link - confirm.
                    if (aTag == null)
                    {
                        continue;
                    }
                    prjName   = aTag.GetAttribute("title");
                    beginDate = viewList[j].ToPlainTextString().GetDateRegex();
                    InfoUrl   = "http://sgjd.baoan.gov.cn/zbcg/zhbgg_139208/" + aTag.Link.GetRegexBegEnd("./", ".html") + ".html";

                    string htlDtl;
                    try
                    {
                        htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
                    }
                    catch (Exception)
                    {
                        continue;
                    }

                    parser = new Parser(new Lexer(htlDtl));
                    NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "con")));
                    if (dtlList == null || dtlList.Count == 0)
                    {
                        continue;
                    }

                    HtmlTxt = dtlList.AsHtml();
                    bidCtx  = HtmlTxt.ToCtxString();

                    // All field patterns run on the text with spaces removed; compute it once.
                    string compactCtx = bidCtx.Replace(" ", "");

                    code      = regPrjCode.Match(compactCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("招标编号", "").Replace("中标编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Trim();
                    buildUnit = regBuidUnit.Match(compactCtx).Value.Replace("招标代理机构", "").Replace("建设单位", "").Replace("招标人", "").Replace("承包人", "").Replace("招标单位", "").Replace("招标方", "").Replace(":", "").Replace(":", "").Trim();
                    bidMoney  = regMoney.Match(compactCtx).Value.Replace("中标价", "").Replace("总投资", "").Replace("发包价", "").Replace("总价", "").Replace("投标报价", "").Replace("投标价", "").Replace("价格", "").Replace("金额", "").Replace(":", "").Replace(":", "").Replace(",", "").Replace(",", "").Trim();
                    bidUnit   = regBidUnit.Match(compactCtx).Value.Replace("成交供应商", "").Replace("中标供应商", "").Replace("中标候选人", "").Replace("第一候选人", "").Replace("中标单位", "").Replace("中标人", "").Replace("中标方", "").Replace(":", "").Replace(":", "").Trim();
                    prjMgr    = regprjMgr.Match(compactCtx).Value.Replace("项目经理(或建造师)", "").Replace("项目经理姓名", "").Replace("总工程师", "").Replace("项目经理", "").Replace("项目总监", "").Replace("建造师", "").Replace("监理师", "").Replace("项目负责人", "").Replace(":", "").Replace(":", "").Trim();

                    // Cut off anything from "资格" onwards in the manager field.
                    if (prjMgr.Contains("资格"))
                    {
                        prjMgr = prjMgr.Remove(prjMgr.IndexOf("资格"));
                    }
                    bidUnit   = ToolHtml.GetStringTemp(bidUnit);
                    buildUnit = ToolHtml.GetSubString(buildUnit, 150);
                    bidUnit   = ToolHtml.GetSubString(bidUnit, 150);
                    code      = ToolHtml.GetSubString(code, 50);
                    prjMgr    = ToolHtml.GetSubString(prjMgr, 50);

                    if (string.IsNullOrWhiteSpace(bidUnit))
                    {
                        // Fallback: read "header:value" pairs from the first two rows of each result table.
                        parser = new Parser(new Lexer(HtmlTxt));
                        NodeList tableNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "MsoNormalTable")));
                        if (tableNode != null && tableNode.Count > 0)
                        {
                            for (int t = 0; t < tableNode.Count; t++)
                            {
                                TableTag table = tableNode[t] as TableTag;
                                if (table == null || table.RowCount == 0)
                                {
                                    // Nothing readable in this table; do not let it abort the crawl.
                                    continue;
                                }

                                StringBuilder pairs = new StringBuilder();
                                for (int r = 0; r < table.Rows[0].ColumnCount; r++)
                                {
                                    try
                                    {
                                        pairs.Append(table.Rows[0].Columns[r].ToNodePlainString()).Append(":");
                                        pairs.Append(table.Rows[1].Columns[r].ToNodePlainString()).Append("\r\n");
                                    }
                                    catch (Exception)
                                    {
                                        // Best effort: the value row may be missing or shorter than the header row.
                                    }
                                }
                                string ctx = pairs.ToString();

                                bidUnit = ctx.GetBidRegex();
                                if (string.IsNullOrEmpty(bidUnit))
                                {
                                    bidUnit = ctx.GetRegex("中标供应商");
                                }
                                if (string.IsNullOrWhiteSpace(code))
                                {
                                    code = ctx.GetCodeRegex();
                                }
                                // Values from a later table overwrite those of an earlier one.
                                bidMoney = ctx.GetMoneyRegex();
                            }
                        }
                    }

                    // A plain number above 100000 is divided by 10000 before being stored.
                    decimal amount;
                    if (decimal.TryParse(bidMoney, out amount) && amount > 100000)
                    {
                        bidMoney = (amount / 10000).ToString();
                    }
                    // An amount suffixed with "万" keeps only its numeric part.
                    if (!string.IsNullOrEmpty(bidMoney) && bidMoney.Contains("万"))
                    {
                        bidMoney = bidMoney.Remove(bidMoney.IndexOf("万")).Trim();
                        bidMoney = regBidMoney.Match(bidMoney).Value;
                    }

                    if (string.IsNullOrEmpty(buildUnit))
                    {
                        buildUnit = "深圳市宝安区松岗街道办事处";
                    }
                    msgType  = "深圳市宝安区松岗街道办事处";
                    specType = "建设工程";
                    // Every record from this source is stored with this bid type.
                    bidType  = "小型工程";

                    BidInfo info = ToolDb.GenBidInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType,
                                                     bidMoney, InfoUrl, prjMgr, HtmlTxt);
                    list.Add(info);
                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return list;
                    }
                }
            }
            return list;
        }
예제 #7
0
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list            = new ArrayList();
            int    pageInt         = 1;
            string html            = string.Empty;
            string viewState       = string.Empty;
            string eventValidation = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
            }
            catch (Exception ex)
            {
                return(list);
            }
            Parser   parser = new Parser(new Lexer(html));
            NodeList sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "list_page")));

            if (sNode != null && sNode.Count > 0)
            {
                try
                {
                    string temp = sNode.AsString().Replace("createPageHTML(", "").Replace("index", "").Replace("html", "").Replace(",0,", "").Replace(");", "").Replace(",", "").Replace(";", "").Replace(")", "").Replace("\"", "").Replace(" ", "");
                    pageInt = Convert.ToInt32(temp);
                }
                catch { pageInt = 1; }
            }
            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl("http://xajdb.baoan.gov.cn/xxgk_11984/ywxx/zbcg/zbxxgs/index_" + (i - 1).ToString() + ".html", Encoding.UTF8);
                    }
                    catch
                    {
                        continue;
                    }
                }
                parser = new Parser(new Lexer(html));
                NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "right_list"))), new TagNameFilter("ul"))), new TagNameFilter("li")));
                if (viewList != null && viewList.Count > 0)
                {
                    for (int j = 0; j < viewList.Count; j++)
                    {
                        string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty,
                               prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty,
                               specType = string.Empty, beginDate = string.Empty, endDate = string.Empty,
                               remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty,
                               CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;
                        Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
                        beginDate = regDate.Match(viewList[j].ToPlainTextString().Trim()).Value;
                        string temp = viewList[j].ToPlainTextString().Trim().Replace(beginDate, "");
                        try
                        {
                            int beg = temp.IndexOf("else"), end = temp.Length;
                            temp    = temp.Substring(beg, end - beg);
                            beg     = temp.IndexOf("<a");
                            end     = temp.IndexOf("/a>");
                            temp    = temp.Substring(beg, (end - beg) + 3);
                            beg     = temp.IndexOf(">");
                            end     = temp.IndexOf("</");
                            prjName = temp.Substring(beg + 1, end - beg - 1);
                            Parser   p    = new Parser(new Lexer(temp));
                            NodeList l    = p.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                            ATag     aTag = l.SearchFor(typeof(ATag), true)[0] as ATag;
                            InfoUrl = "http://xajdb.baoan.gov.cn/xxgk_11984/ywxx/zbcg/zbxxgs/" + aTag.Link.Replace("../", "").Replace("./", "");
                        }
                        catch { continue; }

                        string htmDtl = string.Empty;
                        try
                        {
                            htmDtl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.UTF8);
                            Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
                            htmDtl = regexHtml.Replace(htmDtl, "");
                        }
                        catch { continue; }
                        parser = new Parser(new Lexer(htmDtl));
                        NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "TRS_PreAppend")));
                        if (dtl != null && dtl.Count > 0)
                        {
                            HtmlTxt = dtl.AsHtml();
                            parser  = new Parser(new Lexer(HtmlTxt.Replace("th", "td")));
                            NodeList dtlTab = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "texttable")));
                            if (dtlTab != null && dtlTab.Count > 0)
                            {
                                TableTag table = dtlTab[0] as TableTag;
                                for (int k = 0; k < table.RowCount; k++)
                                {
                                    for (int c = 0; c < table.Rows[k].ColumnCount; c++)
                                    {
                                        string strCtx = table.Rows[k].Columns[c].ToPlainTextString().Replace("&nbsp;", "").Replace(" ", "");
                                        if (strCtx == "工程类型")
                                        {
                                            break;
                                        }
                                        if (c % 2 == 0)
                                        {
                                            inviteCtx += strCtx + ":";
                                        }
                                        else
                                        {
                                            inviteCtx += strCtx + "\r\n";
                                        }
                                    }
                                }
                            }
                            else
                            {
                                inviteCtx = dtl.AsString().Replace("&nbsp;", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n");
                            }
                            string InvType = prjName;
                            if (InvType.Contains("施工"))
                            {
                                inviteType = "施工";
                            }
                            if (InvType.Contains("监理"))
                            {
                                inviteType = "监理";
                            }
                            if (InvType.Contains("设计"))
                            {
                                inviteType = "设计";
                            }
                            if (InvType.Contains("勘察"))
                            {
                                inviteType = "勘察";
                            }
                            if (InvType.Contains("服务"))
                            {
                                inviteType = "服务";
                            }
                            if (InvType.Contains("劳务分包"))
                            {
                                inviteType = "劳务分包";
                            }
                            if (InvType.Contains("专业分包"))
                            {
                                inviteType = "专业分包";
                            }
                            if (InvType.Contains("小型施工"))
                            {
                                inviteType = "小型工程";
                            }
                            if (InvType.Contains("设备材料"))
                            {
                                inviteType = "设备材料";
                            }

                            Regex regPrjAddr = new Regex(@"(工程位置|工程地点|工程地址|地址)(:|:)[^\r\n]+\r\n");
                            prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程位置", "").Replace("工程地点", "").Replace("工程地址", "").Replace(":", "").Replace(":", "").Trim();
                            Regex regBuildUnit = new Regex(@"(招标单位|招标人|招标单位(盖章)|采购人)(:|:)[^\r\n]+\r\n");
                            buildUnit = regBuildUnit.Match(inviteCtx).Value.Replace("招标单位", "").Replace("招标人", "").Replace("(盖章)", "").Replace(":", "").Replace(":", "").Trim();
                            Regex regPrjCode = new Regex(@"(工程编号|项目编号|编号)(:|:)[^\r\n]+\r\n");
                            code    = regPrjCode.Match(inviteCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Trim();
                            msgType = "深圳市宝安区新安街道办事处";
                            if (string.IsNullOrEmpty(prjAddress) || Encoding.Default.GetByteCount(prjAddress) > 150)
                            {
                                prjAddress = "见招标信息";
                            }
                            if (string.IsNullOrWhiteSpace(prjAddress))
                            {
                                prjAddress = inviteCtx.GetAddressRegex();
                            }
                            if (string.IsNullOrWhiteSpace(buildUnit))
                            {
                                inviteCtx.GetBuildRegex();
                            }
                            if (string.IsNullOrWhiteSpace(code))
                            {
                                code = inviteCtx.GetCodeRegex();
                            }
                            if (code.Contains(")"))
                            {
                                code = code.Remove(code.IndexOf(")"));
                            }
                            if (buildUnit.Contains("采购人"))
                            {
                                buildUnit = buildUnit.Replace("采购人", "");
                            }
                            code       = ToolHtml.GetSubString(code, 50);
                            buildUnit  = ToolHtml.GetSubString(buildUnit, 150);
                            specType   = "建设工程";
                            inviteType = "小型工程";
                            if (string.IsNullOrEmpty(buildUnit))
                            {
                                buildUnit = "深圳市宝安区新安街道办事处";
                            }
                            inviteType = ToolHtml.GetInviteType(inviteType);
                            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                            list.Add(info);
                            if (!crawlAll && list.Count >= this.MaxCount)
                            {
                                return(list);
                            }
                        }
                    }
                }
            }
            return(list);
        }
예제 #8
0
        /// <summary>
        /// Crawls the bid-result list that starts at <c>SiteUrl</c>, downloads the detail page of
        /// every list entry, extracts the bid fields from the detail text with regular expressions
        /// and collects one <c>BidInfo</c> (built by <c>ToolDb.GenBidInfo</c>) per entry.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, the method returns as soon as the result list holds at least
        /// <c>MaxCount</c> records; when <c>true</c>, every page reported by the pager is processed.
        /// </param>
        /// <returns>
        /// The collected records. The list is empty when the first list page cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list    = new ArrayList();
            int    pageInt = 1;
            string html    = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
            }
            catch (Exception)
            {
                return(list);
            }

            // Read the total page count from the pager text ("/<n>页"); fall back to one page.
            Parser   parser = new Parser(new Lexer(html));
            NodeList sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagination")));
            if (sNode != null && sNode.Count > 0)
            {
                try
                {
                    string temp = sNode.AsString().Replace(" ", "");
                    Regex  reg  = new Regex(@"/[^页]+页");
                    pageInt = Convert.ToInt32(reg.Match(temp).Value.Replace("/", "").Replace("页", ""));
                }
                catch { pageInt = 1; }
            }

            Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
            for (int i = 1; i <= pageInt; i++)
            {
                // Page 1 was already downloaded above; a page that fails to download is skipped.
                if (i > 1)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl("http://zyjy.huizhou.gov.cn/pages/cms/hzggzyjyzx/html/artList.html?cataId=a000dc84e53b4dc88e1e05d15d7c90f7&pageNo=" + i.ToString(), Encoding.UTF8);
                    }
                    catch { continue; }
                }

                parser = new Parser(new Lexer(html));
                NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "div_list"))), new TagNameFilter("ul")));
                if (viewList == null || viewList.Count == 0)
                {
                    continue;
                }

                // The anchors do not change while the entries are processed, so collect them once per page.
                NodeList anchors = viewList.SearchFor(typeof(ATag), true);
                for (int j = 0; j < viewList.Count; j++)
                {
                    // The j-th anchor is paired with the j-th <ul>; an entry without a usable
                    // anchor is skipped instead of aborting the whole crawl.
                    if (anchors == null || j >= anchors.Count)
                    {
                        continue;
                    }
                    ATag aTag = anchors[j] as ATag;
                    if (aTag == null)
                    {
                        continue;
                    }

                    string endDate   = string.Empty, otherType = string.Empty;
                    string beginDate = regDate.Match(viewList[j].ToPlainTextString()).Value;
                    string prjName   = aTag.GetAttribute("title");
                    string InfoUrl   = "http://zyjy.huizhou.gov.cn" + aTag.Link;

                    // Download the detail page and strip script/style/xml islands before parsing.
                    string htmDtl = string.Empty;
                    try
                    {
                        htmDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                        Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
                        htmDtl = regexHtml.Replace(htmDtl, "");
                    }
                    catch { continue; }

                    parser = new Parser(new Lexer(htmDtl));
                    NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "divZoom")));
                    if (dtl == null || dtl.Count == 0)
                    {
                        continue;
                    }

                    string HtmlTxt = dtl.AsHtml();
                    string bidCtx  = HtmlTxt.ToCtxString();

                    // When the detail page embeds an iframe, the text of its tables is prepended.
                    NodeList ifrm = new Parser(new Lexer(htmDtl)).ExtractAllNodesThatMatch(new TagNameFilter("iframe"));
                    if (ifrm != null && ifrm.Count > 0)
                    {
                        try
                        {
                            IFrameTag frame = ifrm[0] as IFrameTag;
                            if (frame != null)
                            {
                                string   url     = frame.GetAttribute("src");
                                string   htm     = this.ToolWebSite.GetHtmlByUrl(url, Encoding.Default);
                                NodeList tabNode = new Parser(new Lexer(htm)).ExtractAllNodesThatMatch(new TagNameFilter("table"));
                                string   ctx     = tabNode.AsHtml().ToCtxString().Replace("\r\n\t\r\n\t", "\r\n\t").Replace("\r\n\t\r\n\t", "\r\n\t").Replace("\r\n\t\r\n\t", "\r\n\t");
                                bidCtx = ctx + bidCtx;
                            }
                        }
                        catch
                        {
                            // Best effort: the iframe content is optional, keep the main text on failure.
                        }
                    }

                    // Every field is taken from the first "<label>:<value>" line found in the space-free text.
                    // NOTE(review): both alternatives of "(:|:)" are the same ASCII colon as written here;
                    // possibly a full-width colon was intended for one of them - confirm against the original file.
                    string compactCtx = bidCtx.Replace(" ", "");

                    Regex regPrjCode = new Regex(@"(工程编号|项目编号|招标编号|中标编号|编号)(:|:)[^\r\n]+\r\n");
                    string code = regPrjCode.Match(compactCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("招标编号", "").Replace("中标编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Trim();

                    Regex regBuidUnit = new Regex(@"(建设单位|招标人|承包人|招标单位|招标方|招标代理机构)(:|:)[^\r\n]+\r\n");
                    string buildUnit = regBuidUnit.Match(compactCtx).Value.Replace("招标代理机构", "").Replace("建设单位", "").Replace("招标人", "").Replace("承包人", "").Replace("招标单位", "").Replace("招标方", "").Replace(":", "").Replace(":", "").Trim();

                    Regex regMoney = new Regex(@"(中标价|投标价|总投资|发包价|投标报价|价格|金额|总价)(:|:|)[^\r\n]+\r\n");
                    string bidMoney = regMoney.Match(compactCtx).Value.Replace("中标价", "").Replace("总投资", "").Replace("发包价", "").Replace("总价", "").Replace("投标报价", "").Replace("投标价", "").Replace("价格", "").Replace("金额", "").Replace(":", "").Replace(":", "").Replace(",", "").Replace(",", "").Trim();

                    Regex regBidUnit = new Regex(@"(成交供应商|中标供应商|第一候选人|中标候选人|中标单位|中标人|中标方)(:|:)[^\r\n]+\r\n");
                    string bidUnit = regBidUnit.Match(compactCtx).Value.Replace("成交供应商", "").Replace("中标供应商", "").Replace("中标候选人", "").Replace("第一候选人", "").Replace("中标单位", "").Replace("中标人", "").Replace("中标方", "").Replace(":", "").Replace(":", "").Trim();

                    Regex regprjMgr = new Regex(@"(项目经理姓名|项目经理(或建造师)|项目经理|项目负责人|项目总监|建造师|总工程师|监理师)(:|:)[^\r\n]+\r\n");
                    string prjMgr = regprjMgr.Match(compactCtx).Value.Replace("项目经理(或建造师)", "").Replace("项目经理姓名", "").Replace("总工程师", "").Replace("项目经理", "").Replace("项目总监", "").Replace("建造师", "").Replace("监理师", "").Replace("项目负责人", "").Replace(":", "").Replace(":", "").Trim();

                    // Normalise the amount: text containing "万" keeps the number in front of it,
                    // anything else is divided by 10000; values below 0.1 or unparsable text become "0".
                    // NOTE(review): this pattern needs at least two digits, so a single-digit amount
                    // such as "5" yields no match - confirm whether that is intended.
                    Regex regBidMoney = new Regex(@"[0-9]+[.]{0,1}[0-9]+");
                    if (bidMoney.Contains("万"))
                    {
                        bidMoney = bidMoney.Remove(bidMoney.IndexOf("万")).Trim();
                        bidMoney = regBidMoney.Match(bidMoney).Value;
                    }
                    else
                    {
                        try
                        {
                            decimal amount = decimal.Parse(regBidMoney.Match(bidMoney).Value) / 10000;
                            bidMoney = amount < 0.1m ? "0" : amount.ToString();
                        }
                        catch (Exception)
                        {
                            bidMoney = "0";
                        }
                    }

                    // Cut the manager's name at the first "资格" when that word is present.
                    if (prjMgr.Contains("资格"))
                    {
                        prjMgr = prjMgr.Remove(prjMgr.IndexOf("资格"));
                    }

                    bidUnit   = ToolHtml.GetStringTemp(bidUnit).Replace(";", "");
                    buildUnit = ToolHtml.GetSubString(buildUnit, 150);
                    bidUnit   = ToolHtml.GetSubString(bidUnit, 150);
                    code      = ToolHtml.GetSubString(code, 50);
                    prjMgr    = ToolHtml.GetSubString(prjMgr, 50);

                    string msgType  = "惠州市公共资源交易中心";
                    string specType = "建设工程";
                    if (string.IsNullOrEmpty(buildUnit))
                    {
                        // No build unit found in the text: fall back to the publishing organisation.
                        buildUnit = "惠州市公共资源交易中心";
                    }
                    string bidType = ToolHtml.GetInviteTypes(prjName);

                    BidInfo info = ToolDb.GenBidInfo("广东省", "惠州市区", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType,
                                                     bidMoney, InfoUrl, prjMgr, HtmlTxt);
                    list.Add(info);
                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return(list);
                    }
                }
            }
            return(list);
        }
예제 #9
0
        /// <summary>
        /// Crawls the bid-result list that starts at <c>SiteUrl</c>, reads name, date and link of each
        /// entry from the first row of its table, downloads the detail page and collects one
        /// <c>BidInfo</c> (built by <c>ToolDb.GenBidInfo</c>) per entry.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, the method returns as soon as the result list holds at least
        /// <c>MaxCount</c> records; when <c>true</c>, every page reported by the pager is processed.
        /// </param>
        /// <returns>
        /// The collected records. The list is empty when the first list page cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list    = new ArrayList();
            int    pageInt = 1;
            string html    = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default);
            }
            catch (Exception)
            {
                return(list);
            }

            // Read the total page count from the pager text ("/<n>页"); fall back to one page.
            Parser   parser = new Parser(new Lexer(html));
            NodeList sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("vAlign", "middle")));
            if (sNode != null && sNode.Count > 0)
            {
                try
                {
                    string temp = sNode.AsString();
                    Regex  reg  = new Regex(@"/[^页]+页");
                    string page = reg.Match(temp).Value.Replace("/", "").Replace("页", "");
                    pageInt = Convert.ToInt32(page);
                }
                catch { pageInt = 1; }
            }

            for (int i = 1; i <= pageInt; i++)
            {
                // Page 1 was already downloaded above; a page that fails to download is skipped.
                if (i > 1)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl) + "&page=" + i.ToString(), Encoding.Default);
                    }
                    catch
                    {
                        continue;
                    }
                }

                parser = new Parser(new Lexer(html));
                NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("width", "100%"))), new TagNameFilter("table")));
                if (nodeList == null || nodeList.Count == 0)
                {
                    continue;
                }

                for (int j = 0; j < nodeList.Count; j++)
                {
                    // A node that is not a table cannot be an entry; skip it instead of dereferencing null.
                    TableTag table = nodeList[j] as TableTag;
                    if (table == null)
                    {
                        continue;
                    }

                    string prjName, bidType, beginDate, InfoUrl;
                    string endDate = string.Empty, prjMgr = string.Empty, otherType = string.Empty;
                    try
                    {
                        // Column 0 carries the title and the link, column 1 the date.
                        TableRow tr = table.Rows[0];
                        prjName   = tr.Columns[0].ToNodePlainString();
                        bidType   = prjName.GetInviteBidType();
                        beginDate = tr.Columns[1].ToPlainTextString();
                        InfoUrl   = "http://www.cajsw.gov.cn/" + tr.Columns[0].GetATagHref(2);
                    }
                    catch (IndexOutOfRangeException)
                    {
                        // The table has no row or fewer than two columns, so it is not a list entry.
                        continue;
                    }
                    catch (ArgumentOutOfRangeException)
                    {
                        continue;
                    }

                    string htlDtl = string.Empty;
                    try
                    {
                        htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
                        htlDtl = htlDtl.GetJsString();
                    }
                    catch { continue; }

                    parser = new Parser(new Lexer(htlDtl));
                    NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "fontzoom")));
                    if (dtlList == null || dtlList.Count == 0)
                    {
                        continue;
                    }

                    string HtmlTxt = dtlList.ToHtml();
                    string bidCtx  = HtmlTxt.ToCtxString();
                    string ctx     = bidCtx.ToNodeString();

                    // The winner is the text between "中标候选人为" and the next ",", limited via GetSubString(…, 150).
                    string bidUnit = ctx.GetRegexBegEnd("中标候选人为", ",");
                    bidUnit = ToolHtml.GetSubString(bidUnit, 150);

                    // NOTE(review): GetMoney() is applied twice to the price text, exactly as before -
                    // confirm that the helper is safe to apply to its own output.
                    string money    = ctx.GetRegexBegEnd("投标报价", "元").GetMoney();
                    string bidMoney = money.GetMoney();

                    // The address is extracted as before but is not part of the generated record.
                    string prjAddress = bidCtx.GetAddressRegex();
                    string code       = bidCtx.GetCodeRegex();
                    string buildUnit  = bidCtx.GetBuildRegex();
                    string msgType    = "潮州市潮安县住房和城乡建设局";
                    string specType   = "建设工程";

                    BidInfo info = ToolDb.GenBidInfo("广东省", "潮州市区", "潮安县", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType,
                                                     bidMoney, InfoUrl, prjMgr, HtmlTxt);
                    list.Add(info);
                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return(list);
                    }
                }
            }
            return(list);
        }
예제 #10
0
        /// <summary>
        /// Crawls the invitation list that starts at <c>SiteUrl</c>, downloads the detail page of every
        /// list entry, extracts address, build unit and code from the detail text with regular
        /// expressions and collects one <c>InviteInfo</c> (built by <c>ToolDb.GenInviteInfo</c>) per entry.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, the method returns as soon as the result list holds at least
        /// <c>MaxCount</c> records; when <c>true</c>, every page reported by the pager is processed.
        /// </param>
        /// <returns>
        /// The collected records. The list is empty when the first list page cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list    = new ArrayList();
            int    pageInt = 1;
            string html    = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
            }
            catch (Exception)
            {
                return(list);
            }

            // Read the total page count from the pager text ("/<n>页"); fall back to one page.
            Parser   parser = new Parser(new Lexer(html));
            NodeList sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagination")));
            if (sNode != null && sNode.Count > 0)
            {
                try
                {
                    string temp = sNode.AsString().Replace(" ", "");
                    Regex  reg  = new Regex(@"/[^页]+页");
                    pageInt = Convert.ToInt32(reg.Match(temp).Value.Replace("/", "").Replace("页", ""));
                }
                catch { pageInt = 1; }
            }

            Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
            for (int i = 1; i <= pageInt; i++)
            {
                // Page 1 was already downloaded above; a page that fails to download is skipped.
                if (i > 1)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl("http://zyjy.huizhou.gov.cn/pages/cms/hzggzyjyzx/html/artList.html?cataId=54f6d9f3580843d59b9dd64918e7ae4f&pageNo=" + i.ToString(), Encoding.UTF8);
                    }
                    catch { continue; }
                }

                parser = new Parser(new Lexer(html));
                NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "div_list"))), new TagNameFilter("ul")));
                if (viewList == null || viewList.Count == 0)
                {
                    continue;
                }

                // The anchors do not change while the entries are processed, so collect them once per page.
                NodeList anchors = viewList.SearchFor(typeof(ATag), true);
                for (int j = 0; j < viewList.Count; j++)
                {
                    // The j-th anchor is paired with the j-th <ul>; an entry without a usable
                    // anchor is skipped instead of aborting the whole crawl.
                    if (anchors == null || j >= anchors.Count)
                    {
                        continue;
                    }
                    ATag aTag = anchors[j] as ATag;
                    if (aTag == null)
                    {
                        continue;
                    }

                    string endDate = string.Empty, remark = string.Empty, otherType = string.Empty;

                    // The entry text is "<title><date>"; the title is what remains without line breaks and date.
                    string entryText = viewList[j].ToPlainTextString();
                    string beginDate = regDate.Match(entryText).Value;
                    string prjName   = entryText.Replace("\r", "").Replace("\n", "").Replace(beginDate, "");
                    string InfoUrl   = "http://zyjy.huizhou.gov.cn" + aTag.Link;

                    // Download the detail page and strip script/style/xml islands before parsing.
                    string htmDtl = string.Empty;
                    try
                    {
                        htmDtl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.UTF8);
                        Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
                        htmDtl = regexHtml.Replace(htmDtl, "");
                    }
                    catch { continue; }

                    parser = new Parser(new Lexer(htmDtl));
                    NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "divZoom")));
                    if (dtl == null || dtl.Count == 0)
                    {
                        continue;
                    }

                    // HtmlTxt keeps the markup without scripts; inviteCtx is the tag-free text with
                    // "\r\n" line ends and runs of empty lines collapsed by the repeated replacements.
                    string HtmlTxt   = Regex.Replace(dtl.ToHtml(), "(<script)[\\s\\S]*?(</script>)", "");
                    string inviteCtx = Regex.Replace(HtmlTxt, "(<script)[\\s\\S]*?(</script>)", "");
                    inviteCtx = Regex.Replace(inviteCtx, "<[^>]*>", "").Replace("&nbsp;", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\r\n", "\r\n").Replace("\r\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\t", "").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");

                    // Every field is taken from the first "<label>:<value>" line found in the text.
                    // NOTE(review): both alternatives of "(:|:)" are the same ASCII colon as written here;
                    // possibly a full-width colon was intended for one of them - confirm against the original file.
                    Regex regPrjAddr = new Regex(@"(工程位置|工程地点|工程地址|详细地址|地点|地址)(:|:)[^\r\n]+\r\n");
                    string prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程位置", "").Replace("工程地点", "").Replace("工程地址", "").Replace("详细地址", "").Replace("地点", "").Replace("地址", "").Replace(":", "").Replace(":", "").Trim();
                    Regex regBuildUnit = new Regex(@"(招标代理机构|招标单位|招标人|招标单位(盖章))(:|:)[^\r\n]+\r\n");
                    string buildUnit = regBuildUnit.Match(inviteCtx).Value.Replace("招标代理机构", "").Replace("招标单位", "").Replace("招标人", "").Replace("(盖章)", "").Replace(":", "").Replace(":", "").Trim();

                    // Cut the build unit at the first "资质" when that word is present.
                    if (buildUnit.Contains("资质"))
                    {
                        buildUnit = buildUnit.Remove(buildUnit.IndexOf("资质"));
                    }
                    prjAddress = ToolHtml.GetSubString(prjAddress, 150);
                    buildUnit  = ToolHtml.GetSubString(buildUnit, 150);

                    Regex regPrjCode = new Regex(@"(工程编号|项目编号|编号)(:|:)[^\r\n]+\r\n");
                    string code       = regPrjCode.Match(inviteCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Trim();
                    string msgType    = "惠州市公共资源交易中心";
                    string specType   = "建设工程";
                    string inviteType = ToolHtml.GetInviteTypes(prjName);

                    // A missing address or one longer than 150 bytes is replaced by a fixed placeholder;
                    // a code longer than 50 bytes is dropped.
                    if (string.IsNullOrEmpty(prjAddress) || Encoding.Default.GetByteCount(prjAddress) > 150)
                    {
                        prjAddress = "见招标信息";
                    }
                    if (Encoding.Default.GetByteCount(code) > 50)
                    {
                        code = "";
                    }
                    inviteType = ToolHtml.GetInviteType(inviteType);

                    InviteInfo info = ToolDb.GenInviteInfo("广东省", "惠州市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                    list.Add(info);
                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return(list);
                    }
                }
            }

            return(list);
        }
예제 #11
0
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list            = new ArrayList();
            int    pageInt         = 1;
            string html            = string.Empty;
            string viewState       = string.Empty;
            string eventValidation = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
            }
            catch (Exception ex)
            {
                return(list);
            }
            Parser   parser = new Parser(new Lexer(html));
            NodeList sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "NewsPage")));

            if (sNode != null && sNode.Count > 0)
            {
                try
                {
                    string temp = sNode.AsString().Replace("createPageHTML(", "").Replace("index", "").Replace("html", "").Replace(", 0,", "").Replace(");", "").Replace(",", "").Replace(";", "").Replace(")", "").Replace("\"", "").Replace(" ", "");
                    pageInt = Convert.ToInt32(temp);
                }
                catch { pageInt = 1; }
            }
            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl("http://www.shajing.gov.cn/xxgk_14947/ywxx/zbcg/zbgg/index_" + (i - 1).ToString() + ".html", Encoding.UTF8);
                    }
                    catch { continue; }
                }
                parser = new Parser(new Lexer(html));
                NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "NewsLiks01Text"))), new TagNameFilter("ul"))), new TagNameFilter("li")));
                if (viewList != null && viewList.Count > 0)
                {
                    for (int j = 0; j < viewList.Count; j++)
                    {
                        string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty,
                               prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty,
                               specType = string.Empty, beginDate = string.Empty, endDate = string.Empty,
                               remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty,
                               CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;
                        Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
                        beginDate = regDate.Match(viewList[j].ToPlainTextString().Trim()).Value;
                        string temp = viewList[j].ToPlainTextString().Trim().Replace(beginDate, "");
                        try
                        {
                            int beg = temp.IndexOf("else{"), end = temp.Length;
                            temp    = temp.Substring(beg, end - beg);
                            beg     = temp.IndexOf("<a");
                            end     = temp.IndexOf("/a>");
                            temp    = temp.Substring(beg, (end - beg) + 3);
                            beg     = temp.IndexOf(">");
                            end     = temp.IndexOf("</");
                            prjName = temp.Substring(beg + 1, end - beg - 1);
                            Parser   p    = new Parser(new Lexer(temp));
                            NodeList l    = p.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                            ATag     aTag = l.SearchFor(typeof(ATag), true)[0] as ATag;
                            InfoUrl = "http://www.shajing.gov.cn/xxgk_14947/ywxx/zbcg/zbgg/" + aTag.Link.Replace("../", "").Replace("./", "");
                        }
                        catch { continue; }
                        string htlDtl = string.Empty, ctx = string.Empty;
                        try
                        {
                            htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                            Regex regexHtml = new Regex(@"<script[^<]*</script>|<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
                            htlDtl = regexHtml.Replace(htlDtl, "");
                        }
                        catch { continue; }
                        parser = new Parser(new Lexer(htlDtl));
                        NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "DivContent")));
                        if (dtl != null && dtl.Count > 0)
                        {
                            HtmlTxt   = System.Text.RegularExpressions.Regex.Replace(dtl.AsHtml(), "(<script)[\\s\\S]*?(</script>)", "");
                            inviteCtx = System.Text.RegularExpressions.Regex.Replace(HtmlTxt, "(<script)[\\s\\S]*?(</script>)", "");
                            inviteCtx = System.Text.RegularExpressions.Regex.Replace(inviteCtx, "<[^>]*>", "").Replace("&nbsp;", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");
                            string InvType = prjName;
                            if (InvType.Contains("施工"))
                            {
                                inviteType = "施工";
                            }
                            if (InvType.Contains("监理"))
                            {
                                inviteType = "监理";
                            }
                            if (InvType.Contains("设计"))
                            {
                                inviteType = "设计";
                            }
                            if (InvType.Contains("勘察"))
                            {
                                inviteType = "勘察";
                            }
                            if (InvType.Contains("服务"))
                            {
                                inviteType = "服务";
                            }
                            if (InvType.Contains("劳务分包"))
                            {
                                inviteType = "劳务分包";
                            }
                            if (InvType.Contains("专业分包"))
                            {
                                inviteType = "专业分包";
                            }
                            if (InvType.Contains("小型施工"))
                            {
                                inviteType = "小型工程";
                            }
                            if (InvType.Contains("设备材料"))
                            {
                                inviteType = "设备材料";
                            }
                            Regex regPrjAddr = new Regex(@"(工程位置|工程地点|工程地址|详细地址|地点|地址)(:|:)[^\r\n]+\r\n");
                            prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程位置", "").Replace("工程地点", "").Replace("工程地址", "").Replace("详细地址", "").Replace("地点", "").Replace("地址", "").Replace(":", "").Replace(":", "").Trim();
                            Regex regBuildUnit = new Regex(@"(招标代理机构|招标单位|招标人|招标单位(盖章))(:|:)[^\r\n]+\r\n");
                            buildUnit = regBuildUnit.Match(inviteCtx).Value.Replace("招标代理机构", "").Replace("招标单位", "").Replace("招标人", "").Replace("(盖章)", "").Replace(":", "").Replace(":", "").Trim();
                            Regex regPrjCode = new Regex(@"(工程编号|项目编号|编号)(:|:)[^\r\n]+\r\n");
                            code    = regPrjCode.Match(inviteCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Trim();
                            msgType = "深圳市宝安区沙井街道办事处";
                            if (string.IsNullOrEmpty(prjAddress) || Encoding.Default.GetByteCount(prjAddress) > 150)
                            {
                                prjAddress = "见招标信息";
                            }
                            code       = ToolHtml.GetSubString(code, 50);
                            buildUnit  = ToolHtml.GetSubString(buildUnit, 150);
                            specType   = "建设工程";
                            inviteType = "小型工程";
                            if (string.IsNullOrEmpty(buildUnit))
                            {
                                buildUnit = "深圳市宝安区沙井街道办事处";
                            }
                            inviteType = ToolHtml.GetInviteType(inviteType);
                            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "宝安区", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                            list.Add(info);
                            if (!crawlAll && list.Count >= this.MaxCount)
                            {
                                return(list);
                            }
                        }
                    }
                }
            }
            return(list);
        }
예제 #12
0
        /// <summary>
        /// Crawls the paginated notice list, downloads each notice's detail page and converts every
        /// detail page that contains a <c>div#contentbox</c> element into an <c>InviteInfo</c> record.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, the crawl returns as soon as <c>MaxCount</c> records have been collected.
        /// </param>
        /// <returns>
        /// The collected records. An empty list is returned when the first list page cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list    = new List<InviteInfo>();
            int    pageInt = 1;
            string html    = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
            }
            catch
            {
                // Without the first list page there is nothing to crawl.
                return(list);
            }
            Parser   parser = new Parser(new Lexer(html));
            NodeList sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "Normal")));

            if (sNode != null && sNode.Count > 0)
            {
                try
                {
                    // The page count is the text between "/" and "跳转" in the pager cell.
                    string temp = sNode[0].ToNodePlainString().GetRegexBegEnd("/", "跳转");
                    pageInt = Convert.ToInt32(temp);
                }
                catch { pageInt = 1; }
            }

            // The field-extraction patterns do not depend on the notice, so build them once
            // instead of once per detail page.
            // NOTE(review): the alternation "(:|:)" and the doubled Replace(":", "") calls below look
            // like an ASCII/full-width pair that lost its full-width half - confirm the file encoding.
            Regex regPrjAddr   = new Regex(@"(工程位置|工程地点|工程地址|详细地址|地点|地址)(:|:)[^\r\n]+\r\n");
            Regex regBuildUnit = new Regex(@"(招标代理机构|招标单位|招标人|招标单位(盖章))(:|:)[^\r\n]+\r\n");
            Regex regPrjCode   = new Regex(@"(工程编号|项目编号|编号)(:|:)[^\r\n]+\r\n");

            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl("http://glbsc.szlhxq.gov.cn/glbsc/zwgk70/zbcg5/zbxxgs/15158-" + i + ".html", Encoding.UTF8);
                    }
                    catch { continue; }
                }
                parser = new Parser(new Lexer(html));
                NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("style", "border-bottom: 1px dashed #333;")));
                if (viewList != null && viewList.Count > 0)
                {
                    for (int j = 0; j < viewList.Count; j++)
                    {
                        // Skip nodes that are not tables instead of failing the whole crawl.
                        TableTag table = viewList[j] as TableTag;
                        if (table == null)
                        {
                            continue;
                        }

                        string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty,
                               prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty,
                               specType = string.Empty, beginDate = string.Empty, endDate = string.Empty,
                               remark = string.Empty, InfoUrl = string.Empty,
                               msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

                        beginDate = table.ToPlainTextString().GetDateRegex();

                        // A row without a link has no detail page to follow.
                        ATag aTag = table.GetATag();
                        if (aTag == null)
                        {
                            continue;
                        }
                        prjName = aTag.GetAttribute("title");
                        InfoUrl = "http://glbsc.szlhxq.gov.cn" + aTag.Link;
                        string htlDtl = string.Empty;
                        try
                        {
                            htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
                        }
                        catch { continue; }
                        Parser   dtlParser = new Parser(new Lexer(htlDtl));
                        NodeList dtl       = dtlParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "contentbox")));
                        if (dtl != null && dtl.Count > 0)
                        {
                            HtmlTxt   = dtl.AsHtml();
                            inviteCtx = HtmlTxt.ToCtxString();
                            // Strip remaining tags and collapse runs of blank lines.
                            inviteCtx  = System.Text.RegularExpressions.Regex.Replace(inviteCtx, "<[^>]*>", "").Replace("&nbsp;", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");
                            // NOTE(review): this value is overwritten with "小型工程" below, so the
                            // title-based detection currently has no effect - confirm which is intended.
                            inviteType = prjName.GetInviteBidType();

                            // Each field is the first "label:value" line found; the label and colon are removed.
                            prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程位置", "").Replace("工程地点", "").Replace("工程地址", "").Replace("详细地址", "").Replace("地点", "").Replace("地址", "").Replace(":", "").Replace(":", "").Trim();
                            buildUnit  = regBuildUnit.Match(inviteCtx).Value.Replace("招标代理机构", "").Replace("招标单位", "").Replace("招标人", "").Replace("(盖章)", "").Replace(":", "").Replace(":", "").Trim();
                            code       = regPrjCode.Match(inviteCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Replace(")", "").Replace(")", "").Trim();
                            msgType    = "深圳市龙华新区观澜街道办事处";
                            // Missing or over-long addresses are replaced by a placeholder.
                            if (string.IsNullOrEmpty(prjAddress) || Encoding.Default.GetByteCount(prjAddress) > 150)
                            {
                                prjAddress = "见招标信息";
                            }
                            code       = ToolHtml.GetSubString(code, 50);
                            buildUnit  = ToolHtml.GetSubString(buildUnit, 150);
                            specType   = "建设工程";
                            inviteType = "小型工程";
                            if (string.IsNullOrEmpty(buildUnit))
                            {
                                buildUnit = "深圳市龙华新区观澜街道办事处";
                            }
                            inviteType = ToolHtml.GetInviteType(inviteType);
                            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                            list.Add(info);
                            if (!crawlAll && list.Count >= this.MaxCount)
                            {
                                return(list);
                            }
                        }
                    }
                }
            }
            return(list);
        }
예제 #13
0
        /// <summary>
        /// Crawls the paginated list, downloads each entry's detail page and builds one record per entry
        /// whose detail page contains a <c>table.table1</c> element: a <c>BidInfo</c> when the link text
        /// contains "中标" or "结果", otherwise an <c>InviteInfo</c>.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, the crawl returns as soon as <c>MaxCount</c> records have been collected.
        /// </param>
        /// <returns>
        /// The collected records, never <c>null</c>. An empty list is returned when the first list page
        /// cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list    = new ArrayList();
            int    pageInt = 1;
            string html    = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
            }
            catch { return(list); }
            Parser   parser   = new Parser(new Lexer(html));
            NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "table1")));

            if (pageNode != null && pageNode.Count > 0)
            {
                try
                {
                    // The page count is the text between "共" and "页".
                    string temp = pageNode.AsString().GetRegexBegEnd("共", "页");
                    pageInt = int.Parse(temp);
                }
                catch { pageInt = 1; } // unreadable pager: crawl the first page only
            }
            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&page=" + i, Encoding.Default);
                    }
                    catch { continue; }
                }
                parser   = new Parser(new Lexer(html));
                pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "position6")), true), new TagNameFilter("li")));
                if (pageNode != null && pageNode.Count > 0)
                {
                    // The loop starts at index 3, i.e. the first three <li> items are not treated as entries.
                    for (int j = 3; j < pageNode.Count; j++)
                    {
                        INode node = pageNode[j];
                        ATag  aTag = node.GetATag();
                        if (aTag == null)
                        {
                            // No link means no detail page; skip rather than abort the crawl.
                            continue;
                        }
                        string psName = aTag.LinkText;
                        if (psName.Contains("中标") || psName.Contains("结果"))
                        {
                            string prjName = string.Empty,
                                   buildUnit = string.Empty, bidUnit = string.Empty,
                                   bidMoney = string.Empty, code = string.Empty,
                                   bidDate = string.Empty,
                                   beginDate = string.Empty,
                                   endDate = string.Empty, bidType = string.Empty,
                                   specType = string.Empty, InfoUrl = string.Empty,
                                   msgType = string.Empty, bidCtx = string.Empty,
                                   prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;
                            prjName = aTag.GetAttribute("title");
                            InfoUrl = "http://www.zqgcjy.com/" + aTag.Link;
                            string htmldetail = string.Empty;
                            try
                            {
                                htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
                            }
                            catch (Exception)
                            {
                                continue;
                            }
                            Parser   parserdetail = new Parser(new Lexer(htmldetail));
                            NodeList dtnode       = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "table1")));
                            if (dtnode != null && dtnode.Count > 0)
                            {
                                HtmlTxt   = dtnode.AsHtml();
                                bidCtx    = HtmlTxt.ToCtxString();
                                beginDate = bidCtx.GetDateRegex();
                                code      = bidCtx.GetCodeRegex();
                                // Try progressively looser ways of finding the bid amount.
                                bidMoney  = bidCtx.GetMoneyRegex();
                                if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                                {
                                    bidMoney = bidCtx.GetMoneyRegex(null, true);
                                }
                                if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                                {
                                    bidMoney = bidCtx.GetRegex("总额").GetMoney();
                                }
                                prjMgr    = bidCtx.GetMgrRegex();
                                bidUnit   = bidCtx.GetBidRegex();
                                // NOTE(review): bidDate is computed but beginDate is what gets passed
                                // (twice) to GenBidInfo below - confirm against GenBidInfo's signature.
                                bidDate   = bidCtx.GetTimeRegex();
                                buildUnit = bidCtx.GetBuildRegex();

                                // Use a dedicated parser so the page-level parser is left untouched.
                                Parser   tableParser = new Parser(new Lexer(HtmlTxt));
                                NodeList tableNode   = tableParser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                                if (tableNode != null && tableNode.Count > 0)
                                {
                                    for (int t = 0; t < tableNode.Count; t++)
                                    {
                                        TableTag tag = tableNode[t] as TableTag;
                                        if (tag == null)
                                        {
                                            continue;
                                        }
                                        // Tables carrying the "table1" class are skipped; only the other tables are scanned.
                                        string classStr = tag.GetAttribute("class");
                                        if (!string.IsNullOrEmpty(classStr) && classStr.ToLower().Contains("table1"))
                                        {
                                            continue;
                                        }

                                        // Flatten the table to "label:value" lines: odd cells are labels, even cells values.
                                        string ctx = string.Empty;
                                        for (int r = 0; r < tag.RowCount; r++)
                                        {
                                            for (int c = 0; c < tag.Rows[r].ColumnCount; c++)
                                            {
                                                string temp = tag.Rows[r].Columns[c].ToNodePlainString().GetReplace(":,:");
                                                if ((c + 1) % 2 == 0)
                                                {
                                                    ctx += temp + "\r\n";
                                                }
                                                else
                                                {
                                                    ctx += temp + ":";
                                                }
                                            }
                                        }

                                        if (string.IsNullOrEmpty(bidUnit))
                                        {
                                            bidUnit = ctx.GetRegex("成交候选人,中标单位名称,第一中标候选人,第一候选人");
                                        }
                                        if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                                        {
                                            bidMoney = ctx.GetMoneyRegex();
                                        }
                                        if (string.IsNullOrEmpty(prjMgr))
                                        {
                                            prjMgr = ctx.GetMgrRegex();
                                        }
                                        if (string.IsNullOrEmpty(prjMgr))
                                        {
                                            prjMgr = ctx.GetRegex("拟任总监,拟任项目经理");
                                        }

                                        if (!bidUnit.Contains("公司"))
                                        {
                                            // Alternative layout: row 4 holds the labels and row 5 the values, column by column.
                                            ctx = string.Empty;
                                            try
                                            {
                                                for (int r = 1; r < tag.Rows[4].ColumnCount; r++)
                                                {
                                                    string temp = tag.Rows[4].Columns[r].ToNodePlainString().GetReplace(":,:");
                                                    ctx += temp + ":";
                                                    ctx += tag.Rows[5].Columns[r].ToNodePlainString().GetReplace(":,:") + "\r\n";
                                                }
                                                if (string.IsNullOrEmpty(bidUnit))
                                                {
                                                    bidUnit = ctx.GetRegex("成交候选人,中标单位名称,第一中标候选人,第一成交候选人");
                                                }
                                                if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                                                {
                                                    bidMoney = ctx.GetMoneyRegex();
                                                }
                                                if (string.IsNullOrEmpty(prjMgr))
                                                {
                                                    prjMgr = ctx.GetMgrRegex();
                                                }
                                                if (string.IsNullOrEmpty(prjMgr))
                                                {
                                                    prjMgr = ctx.GetRegex("拟任总监,拟任项目经理");
                                                }
                                            }
                                            catch
                                            {
                                                // Best effort: a table without rows 4/5 or with ragged
                                                // columns simply contributes nothing.
                                            }
                                        }
                                    }
                                }
                                msgType  = "肇庆工程交易中心";
                                specType = bidType = "建设工程";
                                BidInfo info = ToolDb.GenBidInfo("广东省", "肇庆市区", "", string.Empty, code, prjName, buildUnit, beginDate,
                                                                 bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType,
                                                                 bidMoney, InfoUrl, prjMgr, HtmlTxt);
                                list.Add(info);

                                if (!crawlAll && list.Count >= this.MaxCount)
                                {
                                    return(list);
                                }
                            }
                        }
                        else
                        {
                            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty,
                                   prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty,
                                   specType = string.Empty, beginDate = string.Empty, endDate = string.Empty,
                                   remark = string.Empty, InfoUrl = string.Empty,
                                   msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

                            prjName = aTag.GetAttribute("title");
                            InfoUrl = "http://www.zqgcjy.com/" + aTag.Link;
                            string htmldtl = string.Empty;
                            try
                            {
                                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
                            }
                            catch (Exception)
                            {
                                continue;
                            }
                            Parser   parserdetail = new Parser(new Lexer(htmldtl));
                            NodeList dtnode       = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "table1")));
                            if (dtnode != null && dtnode.Count > 0)
                            {
                                HtmlTxt    = dtnode.AsHtml();
                                inviteCtx  = HtmlTxt.ToCtxString();
                                buildUnit  = inviteCtx.GetBidUnitDel().GetBuildRegex();
                                beginDate  = inviteCtx.GetDateRegex();
                                prjAddress = ToolHtml.GetRegexString(inviteCtx, ToolHtml.AddressRegex);
                                code       = inviteCtx.GetReplace(" ").GetCodeRegex().GetCodeDel();
                                prjAddress = ToolHtml.GetSubString(prjAddress, 150);

                                // Parse this notice's own HTML. The shared page-level parser was built
                                // for the list page, not for this detail content.
                                Parser   tableParser = new Parser(new Lexer(HtmlTxt));
                                NodeList tableNode   = tableParser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                                if (tableNode != null && tableNode.Count > 0)
                                {
                                    TableTag tag = tableNode[0] as TableTag;
                                    if (tag != null)
                                    {
                                        // Flatten the first table to "label:value" lines and use it to fill missing fields.
                                        string ctx = string.Empty;
                                        for (int r = 0; r < tag.RowCount; r++)
                                        {
                                            for (int c = 0; c < tag.Rows[r].ColumnCount; c++)
                                            {
                                                string temp = tag.Rows[r].Columns[c].ToNodePlainString().GetReplace(":,:");
                                                if ((c + 1) % 2 == 0)
                                                {
                                                    ctx += temp + "\r\n";
                                                }
                                                else
                                                {
                                                    ctx += temp + ":";
                                                }
                                            }
                                        }
                                        if (string.IsNullOrEmpty(code))
                                        {
                                            code = ctx.GetCodeRegex();
                                        }
                                        if (string.IsNullOrEmpty(buildUnit))
                                        {
                                            buildUnit = ctx.GetBuildRegex();
                                        }
                                        if (string.IsNullOrEmpty(prjAddress))
                                        {
                                            prjAddress = ctx.GetAddressRegex();
                                        }
                                    }
                                }
                                // Fall back to a placeholder whenever no address could be extracted.
                                if (string.IsNullOrEmpty(prjAddress))
                                {
                                    prjAddress = "见招标信息";
                                }
                                msgType    = "肇庆工程交易中心";
                                specType   = "建设工程";
                                inviteType = ToolHtml.GetInviteTypes(prjName);
                                InviteInfo info = ToolDb.GenInviteInfo("广东省", "肇庆市区", "",
                                                                       string.Empty, code, prjName, prjAddress, buildUnit,
                                                                       beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                                list.Add(info);
                                if (!crawlAll && list.Count >= this.MaxCount)
                                {
                                    return(list);
                                }
                            }
                        }
                    }
                }
            }
            // Return what was collected; returning null here threw away the result of a complete crawl.
            return(list);
        }
예제 #14
0
        /// <summary>
        /// Crawls the announcement list pages, follows every list entry except the last one on each page
        /// to its detail page, and builds an <c>InviteInfo</c> for each detail page that contains a
        /// <c>div#r_content_right_main</c> element.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, the crawl returns as soon as <c>MaxCount</c> records have been collected.
        /// </param>
        /// <returns>
        /// The collected records. An empty list is returned when the first list page cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            const string listUrlPrefix   = "http://www.gy-center.net/announce/list.jhtml?visi_id=&cid=76&chid=&gid=&thistype=&searchcid=&keyword=&action=yes&interval=&page=";
            const string detailUrlPrefix = "http://www.gy-center.net/announce/";
            const string placeholder     = "见招标信息";
            const string sourceName      = "工网在线";

            IList  results  = new ArrayList();
            string pageHtml = string.Empty;

            try
            {
                pageHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
            }
            catch (Exception)
            {
                return results;
            }

            // Read the total page count from the pager ("/N页"); any failure means a single page.
            int      totalPages = 1;
            Parser   listParser = new Parser(new Lexer(pageHtml));
            NodeList pagerNodes = listParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "yema")));
            if (pagerNodes != null && pagerNodes.Count > 0)
            {
                string pagerText = pagerNodes.AsString();
                try
                {
                    string pageCount = Regex.Match(pagerText, @"/[^页]+页").Value.Replace("页", "").Replace("/", "");
                    totalPages = Convert.ToInt32(pageCount);
                }
                catch
                {
                    totalPages = 1;
                }
            }

            for (int page = 1; page <= totalPages; page++)
            {
                // Page 1 was already downloaded above; later pages that fail to download are skipped.
                if (page > 1)
                {
                    try
                    {
                        pageHtml = this.ToolWebSite.GetHtmlByUrl(listUrlPrefix + page.ToString(), Encoding.Default);
                    }
                    catch
                    {
                        continue;
                    }
                }

                listParser = new Parser(new Lexer(pageHtml));
                NodeList entries = listParser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "tab01"))), new TagNameFilter("ul"))), new TagNameFilter("li")));
                if (entries == null || entries.Count <= 0)
                {
                    continue;
                }

                // The last <li> of each page is deliberately left out of the iteration.
                int entryLimit = entries.Count - 1;
                for (int idx = 0; idx < entryLimit; idx++)
                {
                    string endDate = string.Empty, remark = string.Empty, otherType = string.Empty;

                    string entryText = entries[idx].ToPlainTextString();
                    string entryHtml = entries[idx].ToHtml();
                    string prjName   = ToolHtml.GetHtmlAtagValue("title", entryHtml);
                    string beginDate = ToolHtml.GetRegexDateTime(entryText);
                    string InfoUrl   = detailUrlPrefix + ToolHtml.GetHtmlAtagValue("href", entryHtml);

                    string detailHtml = string.Empty;
                    try
                    {
                        detailHtml = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
                        // Drop <script> blocks before parsing.
                        detailHtml = Regex.Replace(detailHtml, "(<script)[\\s\\S]*?(</script>)", "");
                    }
                    catch
                    {
                        continue;
                    }

                    Parser   detailParser = new Parser(new Lexer(detailHtml));
                    NodeList contentNodes = detailParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "r_content_right_main")));
                    if (contentNodes == null || contentNodes.Count <= 0)
                    {
                        continue;
                    }

                    string HtmlTxt = contentNodes.ToHtml();

                    // Plain text of the notice: tags removed, whitespace and blank lines collapsed.
                    string inviteCtx = Regex.Replace(HtmlTxt, "<[^>]*>", "")
                                            .Replace("&nbsp;", "")
                                            .Replace(" ", "")
                                            .Replace("\t\t", "")
                                            .Replace("\n", "\r\n")
                                            .Replace("\r\n\r\n", "\r\n")
                                            .Replace("\r\n\r\n", "\r\n");

                    string inviteType = ToolHtml.GetInviteTypes(prjName);
                    string prjAddress = ToolHtml.GetRegexString(inviteCtx, ToolHtml.AddressRegex);
                    string buildUnit  = ToolHtml.GetRegexString(inviteCtx, ToolHtml.BuildRegex);
                    string code       = ToolHtml.GetRegexString(inviteCtx, ToolHtml.CodeRegex);
                    prjAddress = ToolHtml.GetSubString(prjAddress, 150);
                    buildUnit  = ToolHtml.GetSubString(buildUnit, 150);
                    code       = ToolHtml.GetSubString(code, 50);

                    // Substitute fixed fallbacks for fields that could not be extracted.
                    code       = string.IsNullOrEmpty(code) ? placeholder : code;
                    prjAddress = string.IsNullOrEmpty(prjAddress) ? placeholder : prjAddress;
                    buildUnit  = string.IsNullOrEmpty(buildUnit) ? sourceName : buildUnit;

                    string specType = "其他";
                    string msgType  = sourceName;

                    InviteInfo record = ToolDb.GenInviteInfo("广东省", "电网专项工程", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                    results.Add(record);

                    bool limitReached = !crawlAll && results.Count >= this.MaxCount;
                    if (limitReached)
                    {
                        return results;
                    }
                }
            }

            return results;
        }
예제 #15
0
파일: BidXiXiang.cs 프로젝트: SHNXJMG/Small
        /// <summary>
        /// Crawls the bid-result listing at <c>SiteUrl</c> page by page, downloads every
        /// linked detail page, extracts the bid fields from the detail text with regular
        /// expressions and returns one <c>BidInfo</c> per listing row.
        /// </summary>
        /// <param name="crawlAll">
        /// When false the crawl returns as soon as <c>MaxCount</c> records were collected;
        /// when true every listing page is processed.
        /// </param>
        /// <returns>
        /// The collected records. The list is empty when the first listing page cannot be
        /// downloaded. Pages or detail documents that fail to download are skipped.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list    = new ArrayList();
            int    pageInt = 1;
            string html    = string.Empty;

            // The patterns do not depend on the row being processed, so build them once.
            Regex regDate     = new Regex(@"\d{4}/\d{1,2}/\d{1,2}");
            Regex regexHtml   = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
            Regex regPrjCode  = new Regex(@"(工程编号|项目编号|招标编号|中标编号|编号)(:|:)[^\r\n]+\r\n");
            Regex regBuidUnit = new Regex(@"(建设单位|招标人|承包人|招标单位|招标方|招标代理机构)(:|:)[^\r\n]+\r\n");
            Regex regMoney    = new Regex(@"(中标价|投标价|总投资|发包价|投标报价|价格|金额|报价)(:|:|)[^\r\n]+\r\n");
            Regex regBidUnit  = new Regex(@"(第一候选人|投标供应商名称|中标候选人|中标供应商|中标单位|中标人|中标方)(:|:)[^\r\n]+\r\n");
            Regex regprjMgr   = new Regex(@"(项目经理姓名|项目经理|项目负责人|项目总监|建造师|总工程师|监理师)(:|:)[^\r\n]+\r\n");
            // Integer with an optional fractional part. The previous pattern
            // "[0-9]+[.]{0,1}[0-9]+" required at least two digits, so a single-digit
            // amount such as "5万" produced an empty value.
            Regex regBidMoney = new Regex(@"[0-9]+([.][0-9]+)?");

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
            }
            catch (Exception)
            {
                return list;
            }

            // The pager cell contains text of the form "...条,N页"; N is the page count.
            Parser   parser = new Parser(new Lexer(html));
            NodeList sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("width", "50%")));
            if (sNode != null && sNode.Count > 0)
            {
                try
                {
                    string temp = sNode.AsString().Replace(" ", "");
                    Regex  reg  = new Regex(@"条,[^页]+页");
                    pageInt = Convert.ToInt32(reg.Match(temp).Value.Replace("条,", "").Replace("页", ""));
                }
                catch
                {
                    // Pager text missing or not numeric: fall back to a single page.
                    pageInt = 1;
                }
            }

            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "&p=" + i.ToString(), Encoding.Default);
                    }
                    catch
                    {
                        continue;
                    }
                }

                parser = new Parser(new Lexer(html));
                NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "Datagrid")));
                if (viewList == null || viewList.Count == 0)
                {
                    continue;
                }
                TableTag tab = viewList[0] as TableTag;
                if (tab == null)
                {
                    continue;
                }

                for (int j = 0; j < tab.RowCount; j++)
                {
                    TableRow tr = tab.Rows[j];

                    // Cells 1 (name + link) and 2 (date) are read below. Rows that lack
                    // them or carry no link (for example a header row) are skipped
                    // instead of raising an exception that would abort the whole crawl.
                    if (tr.ColumnCount < 3)
                    {
                        continue;
                    }
                    NodeList links = tr.Columns[1].SearchFor(typeof(ATag), true);
                    ATag     aTag  = (links != null && links.Count > 0) ? links[0] as ATag : null;
                    if (aTag == null)
                    {
                        continue;
                    }

                    string endDate   = string.Empty;
                    string otherType = string.Empty;
                    string prjName   = tr.Columns[1].ToPlainTextString().Replace("\r", "").Replace("\t", "").Replace("\n", "");
                    string beginDate = regDate.Match(tr.Columns[2].ToPlainTextString()).Value;
                    string InfoUrl   = "http://www.xixiang.gov.cn/" + aTag.Link;

                    string htmDtl = string.Empty;
                    try
                    {
                        htmDtl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.Default);
                        htmDtl = regexHtml.Replace(htmDtl, "");
                    }
                    catch
                    {
                        continue;
                    }

                    parser = new Parser(new Lexer(htmDtl));
                    NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "Lblcontent")));
                    if (dtl == null || dtl.Count == 0)
                    {
                        continue;
                    }

                    string HtmlTxt = dtl.AsHtml();

                    // Text used for field extraction: when the content holds a table, the
                    // header cells are paired with the first data row ("header:value"
                    // lines); otherwise the tag-stripped content is used.
                    parser = new Parser(new Lexer(HtmlTxt));
                    NodeList span        = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "Lblcontent")));
                    string   tableSource = (span != null && span.Count > 0) ? span.AsHtml() : HtmlTxt;

                    // NOTE(review): Replace("th", "td") rewrites every "th" in the
                    // lower-cased markup, not only <th> tags; kept as in the original.
                    parser = new Parser(new Lexer(tableSource.ToLower().Replace("th", "td")));
                    NodeList dtlTab = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));

                    string bidCtx;
                    if (dtlTab != null && dtlTab.Count > 0)
                    {
                        bidCtx = BuildHeaderValueText(dtlTab[0] as TableTag);
                    }
                    else
                    {
                        bidCtx = ConvertDetailHtmlToText(HtmlTxt);
                    }
                    bidCtx = bidCtx.Replace(" ", "");

                    string matchText = bidCtx.Replace(" ", "");

                    string code = regPrjCode.Match(matchText).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("招标编号", "").Replace("中标编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Trim();

                    string buildUnit = regBuidUnit.Match(matchText).Value.Replace("招标代理机构", "").Replace("建设单位", "").Replace("招标人", "").Replace("承包人", "").Replace("招标单位", "").Replace("招标方", "").Replace(":", "").Replace(":", "").Trim();

                    string bidMoney = regMoney.Match(matchText).Value.Replace("报价", "").Replace("中标价", "").Replace("总投资", "").Replace("发包价", "").Replace("投标报价", "").Replace("投标价", "").Replace("价格", "").Replace("金额", "").Replace(":", "").Replace(":", "").Replace("¥", "").Replace(",", "").Trim();

                    string bidUnit = regBidUnit.Match(matchText).Value.Replace("投标供应商名称", "").Replace("中标供应商", "").Replace("中标候选人", "").Replace("第一候选人", "").Replace("中标单位", "").Replace("中标人", "").Replace("中标方", "").Replace(":", "").Replace(":", "").Trim();

                    string prjMgr = regprjMgr.Match(matchText).Value.Replace("项目经理姓名", "").Replace("总工程师", "").Replace("项目经理", "").Replace("项目总监", "").Replace("建造师", "").Replace("监理师", "").Replace("项目负责人", "").Replace(":", "").Replace(":", "").Trim();

                    // Amounts containing "万" are taken as written; any other amount is
                    // divided by 10000, and results below 0.1 or unparsable text become "0".
                    string amountText = regBidMoney.Match(bidMoney).Value;
                    if (bidMoney.Contains("万"))
                    {
                        bidMoney = amountText;
                    }
                    else
                    {
                        decimal amount;
                        if (decimal.TryParse(amountText, out amount))
                        {
                            decimal scaled = amount / 10000;
                            bidMoney = scaled < 0.1m ? "0" : scaled.ToString();
                        }
                        else
                        {
                            bidMoney = "0";
                        }
                    }

                    if (prjMgr.Contains("资格"))
                    {
                        prjMgr = prjMgr.Remove(prjMgr.IndexOf("资格"));
                    }

                    // The stored content is always the tag-stripped detail text, even
                    // when the table form was used for the extraction above.
                    bidCtx    = ConvertDetailHtmlToText(HtmlTxt);
                    bidCtx    = bidCtx.Replace(" ", "");
                    bidUnit   = ToolHtml.GetStringTemp(bidUnit);
                    buildUnit = ToolHtml.GetSubString(buildUnit, 150);
                    bidUnit   = ToolHtml.GetSubString(bidUnit, 150);
                    code      = ToolHtml.GetSubString(code, 50);
                    prjMgr    = ToolHtml.GetSubString(prjMgr, 50);
                    if (string.IsNullOrEmpty(buildUnit))
                    {
                        buildUnit = "深圳市宝安区西乡街道办事处";
                    }

                    string msgType  = "深圳市宝安区西乡街道办事处";
                    string specType = "建设工程";
                    // The bid type is fixed for this source. The original derived a type
                    // from keywords in the project name first, but always overwrote it
                    // with this value, so that dead classification was removed.
                    string bidType = "小型工程";
                    prjName = ToolDb.GetPrjName(prjName);

                    // NOTE(review): beginDate is passed twice, exactly as in the original
                    // call; confirm against the GenBidInfo signature.
                    BidInfo info = ToolDb.GenBidInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType,
                                                     bidMoney, InfoUrl, prjMgr, HtmlTxt);
                    list.Add(info);
                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return list;
                    }
                }
            }
            return list;
        }

        /// <summary>
        /// Pairs each cell of the first table row with the cell at the same position in
        /// the second row and renders the pairs as "header:value" lines.
        /// </summary>
        /// <param name="table">The table to read; may be null.</param>
        /// <returns>
        /// The paired lines followed by a trailing line break; only the line break when
        /// the table is null or has fewer than two rows.
        /// </returns>
        private static string BuildHeaderValueText(TableTag table)
        {
            string pairs = string.Empty;
            if (table != null && table.RowCount > 1)
            {
                TableRow header = table.Rows[0];
                TableRow values = table.Rows[1];
                // Stop at the shorter row so an uneven table cannot index out of range.
                for (int c = 0; c < header.ColumnCount && c < values.ColumnCount; c++)
                {
                    string name  = header.Columns[c].ToPlainTextString().Replace("&nbsp;", "").Replace(" ", "").Replace("\r\n", "").Replace("\n", "");
                    string value = values.Columns[c].ToPlainTextString().Replace("&nbsp;", "").Replace(" ", "").Replace("\r\n", "").Replace("\n", "");
                    pairs += name + ":" + value + "\r\n";
                }
            }
            return pairs.Replace("\n", "").Replace("\r\n\r\n", "\r\n").Replace("\r", "\r\n") + "\r\n";
        }

        /// <summary>
        /// Converts an HTML fragment to lower-cased plain text: &lt;br&gt; tags become line
        /// breaks, all other tags, "&amp;nbsp;" entities and spaces are removed, and line
        /// breaks are normalised.
        /// </summary>
        /// <param name="html">The HTML fragment to convert.</param>
        /// <returns>The plain text followed by a trailing line break.</returns>
        private static string ConvertDetailHtmlToText(string html)
        {
            string ctx = html.ToLower().Replace("<br/>", "\r\n").Replace("<br>", "\r\n");
            return Regex.Replace(ctx, "<[^>]*>", "").Replace("&nbsp;", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\r\n", "\r\n") + "\r\n";
        }
예제 #16
0
        /// <summary>
        /// Crawls the tender-announcement listing at <c>SiteUrl</c>, downloads every linked
        /// detail page, extracts publication date, tendering party and tender number from
        /// the detail text and returns one <c>InviteInfo</c> per listing row.
        /// </summary>
        /// <param name="crawlAll">
        /// When false the crawl returns as soon as <c>MaxCount</c> records were collected;
        /// when true every listing page is processed.
        /// </param>
        /// <returns>
        /// The collected records. The list is empty when the first listing page cannot be
        /// downloaded. Pages or detail documents that fail to download are skipped.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList list = new ArrayList();

            // The patterns do not depend on the row being processed, so build them once.
            Regex regexHtml    = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
            Regex regexUpdated = new Regex(@"更新时间:\d{4}年\d{1,2}月\d{1,2}日");
            // NOTE(review): "[^更新时间]" is a character class (any character except those
            // four), not "anything up to the phrase"; kept as in the original.
            Regex regexCTX     = new Regex(@"作者:[^更新时间]+更新时间:\d{4}年\d{1,2}月\d{1,2}日");
            Regex regexUnit    = new Regex("(招标人|招标单位|招标采购单位):[^\r\n]+[\r\n]{1}");
            Regex regexCode    = new Regex("(招标编号):[^\r\n]+[\r\n]{1}");

            // Download the first listing page; it also carries the total page count.
            string html = string.Empty;
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
            }
            catch (Exception)
            {
                return list;
            }

            int      pageInt  = 1;
            Parser   parser   = new Parser(new Lexer(html));
            NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "totalpage")));
            if (pageNode != null && pageNode.Count > 0)
            {
                try
                {
                    pageInt = Convert.ToInt32(pageNode[0].ToNodePlainString());
                }
                catch
                {
                    // Page count not numeric: keep the default of a single page.
                }
            }

            // The first iteration reuses the page downloaded above; each later iteration
            // downloads "index_<i - 1>.htm".
            // NOTE(review): the descending counter and the (i - 1) suffix are kept exactly
            // as written; confirm they match the site's page numbering.
            for (int i = pageInt; i >= 1; i--)
            {
                if (i < pageInt)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl("http://www.nmgztb.com/Html/gongchengxinxi/zhaobiaogonggao/index_" + (i - 1) + ".htm", Encoding.Default);
                    }
                    catch
                    {
                        continue;
                    }
                }

                parser = new Parser(new Lexer(html));
                NodeList sNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "100%")));
                if (sNodes == null || sNodes.Count == 0)
                {
                    continue;
                }
                TableTag table = sNodes[0] as TableTag;
                if (table == null)
                {
                    continue;
                }

                for (int t = 0; t < table.RowCount; t++)
                {
                    TableRow tr = table.Rows[t] as TableRow;
                    if (tr == null || tr.ColumnCount < 2)
                    {
                        continue;
                    }

                    // Only rows that contain a link describe an announcement.
                    NodeList nodeList = tr.SearchFor(typeof(ATag), true);
                    if (nodeList == null || nodeList.Count == 0)
                    {
                        continue;
                    }
                    ATag aTag = nodeList[0] as ATag;
                    if (aTag == null)
                    {
                        continue;
                    }

                    string prjAddress = string.Empty;
                    string beginDate  = string.Empty;
                    string endDate    = string.Empty;
                    string remark     = string.Empty;

                    string InfoUrl = "http://www.nmgztb.com" + aTag.Link;
                    string prjName = aTag.GetAttribute("title");

                    // The detail page is lower-cased before any parsing.
                    string htmldtl = string.Empty;
                    try
                    {
                        htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).ToLower();
                    }
                    catch (Exception)
                    {
                        continue;
                    }
                    htmldtl = regexHtml.Replace(htmldtl, "");

                    // Publication date, taken from the "更新时间:yyyy年M月d日" banner.
                    Parser   parserdtl = new Parser(new Lexer(htmldtl));
                    NodeList nodesDtl  = parserdtl.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "link_con_con")));
                    if (nodesDtl != null && nodesDtl.Count > 0)
                    {
                        // A failed match has an empty Value, which leaves beginDate empty.
                        beginDate = regexUpdated.Match(nodesDtl.AsString()).Value.Replace("更新时间:", "").Replace("年", "-").Replace("月", "-").Replace("日", "").Trim();
                    }

                    // Announcement body.
                    parserdtl.Reset();
                    nodesDtl = parserdtl.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "oo")));
                    string HtmlTxt = nodesDtl.AsHtml();
                    string str     = nodesDtl.AsString().Replace("&nbsp;", "").Replace(" ", "");

                    // Remove the author/date banner when present. string.Replace throws
                    // ArgumentException for an empty search string, so the replacement
                    // must only run when the pattern actually matched.
                    Match banner = regexCTX.Match(str);
                    if (banner.Success)
                    {
                        str = str.Replace(banner.Value, "");
                    }

                    // Drop the "previous article" navigation and everything after it.
                    int    navIndex = str.IndexOf("上一篇:");
                    string ctx      = navIndex > -1 ? str.Substring(0, navIndex) : str;

                    // Both patterns start with their literal label, so a text without the
                    // label simply yields an empty value.
                    string buildUnit = regexUnit.Match(ctx).Value.Replace("招标人:", "").Replace("招标单位:", "").Replace("招标采购单位:", "").Trim();
                    string code      = regexCode.Match(ctx).Value.Replace("招标编号:", "").ToUpper().Trim();
                    if (code.Length >= 50)
                    {
                        // Values this long are discarded rather than truncated.
                        code = "";
                    }

                    string inviteType = ToolHtml.GetInviteTypes(prjName);
                    buildUnit = ToolHtml.GetSubString(buildUnit, 150);
                    // NOTE(review): the result is not passed to GenInviteInfo (which
                    // receives "" as address); the call is kept to preserve behaviour.
                    prjAddress = ToolHtml.GetAddress(prjAddress);
                    code       = ToolHtml.GetSubString(code, 50);

                    InviteInfo info = ToolDb.GenInviteInfo("内蒙古自治区", "内蒙古自治区及盟市", "", string.Empty, code, prjName, "", buildUnit, beginDate, endDate, ctx, remark, "内蒙古自治区建设工程招标投标服务中心", inviteType, "建设工程", string.Empty, InfoUrl, HtmlTxt);
                    list.Add(info);
                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return list;
                    }
                }
            }
            return list;
        }
예제 #17
0
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            DateTime startDate       = DateTime.Today;
            DateTime endDates        = startDate.AddDays(-90);
            IList    list            = new ArrayList();
            int      pageInt         = 1;
            string   html            = string.Empty;
            string   viewState       = string.Empty;
            string   eventValidation = string.Empty;
            string   cookiestr       = string.Empty;

            try
            {
                NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] {
                    "TIMEBEGIN_SHOW",
                    "TIMEEND_SHOW",
                    "TIMEBEGIN",
                    "TIMEEND",
                    "DEAL_TIME",
                    "DEAL_CLASSIFY",
                    "DEAL_STAGE",
                    "DEAL_PROVINCE",
                    "DEAL_CITY",
                    "DEAL_PLATFORM",
                    "DEAL_TRADE",
                    "isShowAll",
                    "PAGENUMBER",
                    "FINDTXT"
                }, new string[] {
                    endDates.ToString(),
                    startDate.ToString(),
                    endDates.ToString(),
                    startDate.ToString(),
                    "02",
                    "01",
                    "0101",
                    "0",
                    "0",
                    "0",
                    "0",
                    "1",
                    "1",
                    ""
                });
                try
                {
                    html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8, ref cookiestr);
                }
                catch { }
            }
            catch
            {
                return(list);
            }
            Parser   parser   = new Parser(new Lexer(html));
            NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "paging")), true), new TagNameFilter("span")));

            if (pageNode != null && pageNode.Count > 0)
            {
                try
                {
                    string temp = pageNode.AsString().GetRegexBegEnd("共", "页");
                    pageInt = int.Parse(temp);
                }
                catch { }
            }
            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] {
                        "TIMEBEGIN_SHOW",
                        "TIMEEND_SHOW",
                        "TIMEBEGIN",
                        "TIMEEND",
                        "DEAL_TIME",
                        "DEAL_CLASSIFY",
                        "DEAL_STAGE",
                        "DEAL_PROVINCE",
                        "DEAL_CITY",
                        "DEAL_PLATFORM",
                        "DEAL_TRADE",
                        "isShowAll",
                        "PAGENUMBER",
                        "FINDTXT"
                    }, new string[] {
                        endDates.ToString(),
                        startDate.ToString(),
                        endDates.ToString(),
                        startDate.ToString(),
                        "02",
                        "01",
                        "0101",
                        "0",
                        "0",
                        "0",
                        "0",
                        "1",
                        i.ToString(),
                        ""
                    });
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8, ref cookiestr);
                    }
                    catch { continue; }
                }
                parser = new Parser(new Lexer(html));
                NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "publicont")));
                if (listNode != null && listNode.Count > 0)
                {
                    for (int j = 0; j < listNode.Count; j++)
                    {
                        string nlse = string.Empty;
                        string ywlx = string.Empty;
                        string sehu = string.Empty;
                        INode  node = listNode[j];
                        ATag   aTag = node.GetATag();
                        if (aTag == null)
                        {
                            continue;
                        }
                        string nod = node.ToHtml();
                        parser = new Parser(new Lexer(nod));
                        NodeList txtNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "span_on")));
                        if (txtNode != null && txtNode.Count > 0)
                        {
                            sehu = txtNode[0].ToNodePlainString();
                            nlse = txtNode[3].ToNodePlainString();
                            ywlx = txtNode[2].ToNodePlainString();
                        }
                        if (nlse.Contains("招标/资审公告"))
                        {
                            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty,
                                   prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty,
                                   specType = string.Empty, beginDate = string.Empty, endDate = string.Empty,
                                   remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty,
                                   CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;
                            prjName    = aTag.GetAttribute("title");
                            inviteType = ToolHtml.GetInviteTypes(prjName);
                            beginDate  = node.ToPlainTextString().GetDateRegex();
                            InfoUrl    = aTag.Link.GetReplace("amp;");
                            string htmlDtl = string.Empty;
                            try
                            {
                                htmlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                                htmlDtl = ToolHtml.GetRegexHtlTxt(htmlDtl);
                            }
                            catch { continue; }
                            parser = new Parser(new Lexer(htmlDtl));
                            NodeList zsList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "div_0101")));
                            if (zsList != null && zsList.Count > 0)
                            {
                                try
                                {
                                    INode  nodezs = zsList[0];
                                    ATag   aTagzs = nodezs.GetATag();
                                    string urlzs  = aTagzs.GetAttribute("onclick");
                                    string urls   = urlzs.GetReplace("showdetail(this, '0101','", "").GetReplace("')", "").Replace(",", "").Replace(")", "");
                                    urls    = "http://www.ggzy.gov.cn/information" + urls;
                                    htmlDtl = this.ToolWebSite.GetHtmlByUrl(urls, Encoding.UTF8);
                                    htmlDtl = ToolHtml.GetRegexHtlTxt(htmlDtl);
                                }
                                catch (Exception) { throw; }
                            }

                            parser = new Parser(new Lexer(htmlDtl));
                            NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "detail")));
                            if (dtlList != null && dtlList.Count > 0)
                            {
                                string ctxUrl = string.Empty;
                                HtmlTxt = dtlList.AsHtml();

                                inviteCtx = HtmlTxt.Replace("</p>", "\r\n").ToCtxString();
                                try
                                {
                                    Parser   parurl = new Parser(new Lexer(HtmlTxt));
                                    NodeList zsUrl  = parurl.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("p"), new HasAttributeFilter("class", "p_o")));
                                    if (zsUrl != null && zsUrl.Count > 0)
                                    {
                                        INode urlzs   = zsUrl[0];
                                        ATag  aTagurl = urlzs.GetATag();
                                        ctxUrl = "原文链接地址 : " + aTagurl.Link;
                                    }
                                }
                                catch (Exception ex)
                                { }
                                inviteCtx  = inviteCtx + ctxUrl;
                                prjAddress = inviteCtx.GetAddressRegex();
                                buildUnit  = inviteCtx.GetBuildRegex();
                                code       = inviteCtx.GetCodeRegex();
                                if (string.IsNullOrEmpty(buildUnit))
                                {
                                    buildUnit = inviteCtx.GetRegex("招标人");
                                }
                                buildUnit = ToolHtml.GetSubString(buildUnit, 150);
                                if (string.IsNullOrWhiteSpace(code))
                                {
                                    parser = new Parser(new Lexer(HtmlTxt));
                                    NodeList bidNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                                    if (bidNode != null && bidNode.Count > 0)
                                    {
                                        string   ctx      = string.Empty;
                                        TableTag bidTable = bidNode[0] as TableTag;
                                        try
                                        {
                                            for (int r = 0; r < bidTable.RowCount; r++)
                                            {
                                                ctx += bidTable.Rows[r].Columns[0].ToNodePlainString() + ":";
                                                ctx += bidTable.Rows[r].Columns[1].ToNodePlainString() + "\r\n";
                                            }
                                        }
                                        catch { }

                                        if (string.IsNullOrWhiteSpace(buildUnit))
                                        {
                                            buildUnit = ctx.GetBuildRegex();
                                        }

                                        if (string.IsNullOrWhiteSpace(prjAddress))
                                        {
                                            prjAddress = ctx.GetAddressRegex();
                                        }

                                        if (string.IsNullOrWhiteSpace(code))
                                        {
                                            code = ctx.GetCodeRegex();
                                        }
                                    }
                                }

                                msgType    = "国家信息中心";
                                specType   = "建设工程";
                                inviteType = "建设工程";
                                string[] provs = GetPrivoce(sehu);

                                InviteInfo info = ToolDb.GenInviteInfo(provs[0], provs[1], "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                                list.Add(info);
                                try
                                {
                                    parser = new Parser(new Lexer(HtmlTxt));
                                    NodeList nodeFm = parser.ExtractAllNodesThatMatch((new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "detail_content")))));
                                    if (dtlList != null && dtlList.Count > 0)
                                    {
                                        INode      nodFm  = nodeFm[0];
                                        ATag       aTagzs = nodFm.GetATag();
                                        string     dfe    = aTagzs.Link;
                                        BaseAttach attach = ToolDb.GenBaseAttach("内容(点击下载)", info.Id, dfe);
                                        base.AttachList.Add(attach);
                                    }
                                }
                                catch { }
                                if (!crawlAll && list.Count >= this.MaxCount)
                                {
                                    return(list);
                                }
                            }
                        }
                        else
                        {
                            continue;
                        }
                    }
                }
            }
            return(list);
        }
예제 #18
0
        /// <summary>
        /// Crawls the bid-result listing at <c>SiteUrl</c> page by page, follows every listing row to
        /// its detail page and turns the detail table into a <c>BidInfo</c> record.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, the crawl returns as soon as <c>MaxCount</c> records have been collected.
        /// </param>
        /// <returns>
        /// The collected records. The list is empty when the first listing page cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list    = new ArrayList();
            string html    = string.Empty;
            int    pageInt = 1;

            // Loop-invariant patterns applied to the "label:value\r\n" text built from the detail table.
            Regex regPrjAdd    = new Regex(@"(工程地点|工程地址):[^\r\n]+\r\n");
            Regex regMoney     = new Regex(@"(中标价):[^\r\n]+\r\n");
            // The fractional part is optional so that single-digit amounts are recognised as well
            // (the previous pattern required at least two digits).
            Regex regBidMoney  = new Regex(@"[0-9]+([.][0-9]+)?");
            Regex regBidUnit   = new Regex(@"(中标人|中标单位):[^\r\n]+\r\n");
            Regex regprjMgr    = new Regex(@"(项目经理):[^\r\n]+\r\n");
            Regex regOtherType = new Regex(@"(工程类型):[^\r\n]+\r\n");

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8);
            }
            catch (Exception)
            {
                // Without the first listing page there is nothing to crawl.
                return(list);
            }
            Parser   parser  = new Parser(new Lexer(html));
            NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "input-group-addon")));

            if (tdNodes != null && tdNodes.Count > 0)
            {
                try
                {
                    // The page count is taken from the text between "项" and "页" of the pager span,
                    // after removing the item count found between "共" and "项".
                    string reTemp   = tdNodes.AsString().GetRegexBegEnd("共", "项");
                    string pageTemp = tdNodes.AsString().GetRegexBegEnd("项", "页").GetReplace("共,项,页," + reTemp + ",,");
                    pageInt = int.Parse(pageTemp);
                }
                catch (Exception)
                {
                    // Best effort: an unparsable pager leaves pageInt at 1, so only the first page is crawled.
                }
            }
            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        // The "pi" query parameter is sent as (page number - 1).
                        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "?pi=" + (i - 1), Encoding.UTF8);
                    }
                    catch { continue; }
                }
                parser = new Parser(new Lexer(html));
                NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "inside_table")));
                if (nodeList == null || nodeList.Count == 0)
                {
                    continue;
                }

                TableTag tableRow = (TableTag)nodeList[0];
                // Row 0 is skipped (presumably the header row -- confirm against the site markup).
                for (int j = 1; j < tableRow.RowCount; j++)
                {
                    string prjName = string.Empty,
                           buildUnit = string.Empty, bidUnit = string.Empty,
                           bidMoney = string.Empty, code = string.Empty,
                           beginDate = string.Empty,
                           endDate = string.Empty, bidType = string.Empty,
                           specType = string.Empty, InfoUrl = string.Empty,
                           msgType = string.Empty, bidCtx = string.Empty,
                           prjAddress = string.Empty,
                           prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;
                    TableRow tr = tableRow.Rows[j];
                    if (tr.ColumnCount < 4)
                    {
                        // Columns 1..3 are read below; a shorter row would otherwise abort the whole crawl.
                        continue;
                    }
                    beginDate = tr.Columns[3].ToPlainTextString().Trim();
                    prjName   = tr.Columns[1].ToPlainTextString().Trim().GetReplace("&quot;");
                    buildUnit = tr.Columns[2].ToPlainTextString().Trim();
                    InfoUrl   = "http://www.bajsjy.com/" + tr.Columns[1].GetATagHref();

                    string htmldetail = string.Empty;
                    try
                    {
                        // <th> cells are rewritten to <td> so header cells are parsed as ordinary columns.
                        htmldetail = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.UTF8).Replace("<th", "<td").Replace("</th>", "</td>").Replace("&nbsp;", "");
                    }
                    catch (Exception)
                    {
                        continue;
                    }
                    Parser   parserdetail = new Parser(new Lexer(htmldetail));
                    NodeList dtnode       = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "inside_table")));
                    if (dtnode == null || dtnode.Count == 0)
                    {
                        continue;
                    }

                    HtmlTxt = dtnode.AsHtml();
                    TableTag tabledetail = (TableTag)dtnode[0];
                    for (int r = 0; r < tabledetail.RowCount; r++)
                    {
                        TableRow trdetail = tabledetail.Rows[r];
                        try
                        {
                            // Cells are consumed as label/value pairs: c is advanced a second time
                            // inside the loop, giving an effective stride of two.
                            for (int c = 0; c < trdetail.ColumnCount; c++)
                            {
                                string tr1 = trdetail.Columns[c].ToPlainTextString().Trim();
                                string tr2 = trdetail.Columns[c + 1].ToPlainTextString().Trim();
                                bidCtx += tr1 + ":" + tr2 + "\r\n";
                                if (trdetail.ColumnCount > (c + 1))
                                {
                                    c = c + 1;
                                }
                            }
                        }
                        catch
                        {
                            // Fallback kept from the original: when a pair cannot be read (e.g. a row
                            // without a value cell for the last label) the text gathered so far is
                            // replaced by the plain text of the whole table.
                            bidCtx = HtmlTxt.ToCtxString();
                        }
                    }

                    prjAddress = regPrjAdd.Match(bidCtx).Value.Replace("工程地点:", "").Replace("工程地址:", "").Trim();
                    prjAddress = ToolHtml.GetSubString(prjAddress, 50);
                    msgType    = "深圳市建设工程交易中心宝安分中心";
                    specType   = "建设工程";

                    bidMoney = regMoney.Match(bidCtx).Value.Replace("金额", "").Replace("中标价", "").Replace(":", "").Replace("/", "").Replace(",", "").Trim();
                    string moneyDigits = regBidMoney.Match(bidMoney).Value;
                    if (!string.IsNullOrEmpty(moneyDigits))
                    {
                        if (bidMoney.Contains("万"))
                        {
                            // The amount is already expressed in units of ten thousand ("万").
                            bidMoney = moneyDigits;
                        }
                        else
                        {
                            // Otherwise the amount is divided by 10000; results below 0.1 are stored as "0".
                            decimal amount;
                            if (decimal.TryParse(moneyDigits, out amount))
                            {
                                amount   = amount / 10000;
                                bidMoney = amount < 0.1m ? "0" : amount.ToString();
                            }
                            else
                            {
                                bidMoney = "0";
                            }
                        }
                    }

                    // Both labels are removed together with their colon; previously "中标单位" was
                    // stripped without it, which left a leading ":" in the bidder name.
                    bidUnit = regBidUnit.Match(bidCtx).Value.Replace("中标人:", "").Replace("中标单位:", "").Trim();
                    // Truncate with the same helper used for the other fields. The previous code
                    // compared a byte count with 150 and then called Substring(0, 150) on characters,
                    // which throws for multi-byte text shorter than 150 characters.
                    bidUnit = ToolHtml.GetSubString(bidUnit, 150);

                    prjMgr = regprjMgr.Match(bidCtx).Value.Replace("项目经理:", "").Trim();
                    prjMgr = ToolHtml.GetSubString(prjMgr, 30);

                    // Map the free-text project type onto a category; when several keywords are
                    // present the last matching check wins, as in the original sequence of ifs.
                    string oType = regOtherType.Match(bidCtx).Value.Replace("工程类型:", "").Trim();
                    if (oType.Contains("房建"))
                    {
                        otherType = "房建及工业民用建筑";
                    }
                    if (oType.Contains("市政"))
                    {
                        otherType = "市政工程";
                    }
                    if (oType.Contains("园林绿化"))
                    {
                        otherType = "园林绿化工程";
                    }
                    if (oType.Contains("装饰装修"))
                    {
                        otherType = "装饰装修工程";
                    }
                    if (oType.Contains("电力"))
                    {
                        otherType = "电力工程";
                    }
                    if (oType.Contains("水利"))
                    {
                        otherType = "水利工程";
                    }
                    if (oType.Contains("环保"))
                    {
                        otherType = "环保工程";
                    }
                    otherType = ToolHtml.GetSubString(otherType, 50);
                    bidType   = ToolHtml.GetInviteTypes(prjName);
                    buildUnit = ToolHtml.GetSubString(buildUnit, 150);

                    BidInfo info = null;
                    try
                    {
                        info = ToolDb.GenBidInfo("广东省", "深圳宝安区工程", "宝安区", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, beginDate, beginDate, HtmlTxt);
                    }
                    catch (Exception ex)
                    {
                        // Skip the record instead of adding a null entry to the result list.
                        Logger.Error("出错啦: " + InfoUrl + " " + ex.Message);
                        continue;
                    }
                    list.Add(info);
                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return(list);
                    }
                }
            }
            return(list);
        }
예제 #19
0
        /// <summary>
        /// Crawls the announcement listing at <c>SiteUrl</c> page by page, downloads every
        /// announcement's detail page, extracts the bid fields from its text and returns them as
        /// <c>BidInfo</c> records. Attachment links found in the detail content are added to
        /// <c>AttachList</c>.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, the crawl returns as soon as <c>MaxCount</c> records have been collected.
        /// </param>
        /// <returns>
        /// The collected records. The list is empty when the first listing page cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list    = new ArrayList();
            int    pageInt = 1;
            string html    = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
            }
            catch (Exception)
            {
                // Without the first listing page there is nothing to crawl.
                return(list);
            }
            Parser   parser = new Parser(new Lexer(html));
            NodeList noList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "yema")));

            if (noList != null && noList.Count > 0)
            {
                string pagerText = noList.AsString();
                try
                {
                    // The page count is the text between "/" and "页" in the pager div.
                    Regex  reg    = new Regex(@"/[^页]+页");
                    string result = reg.Match(pagerText).Value.Replace("页", "").Replace("/", "");
                    pageInt = Convert.ToInt32(result);
                }
                catch { pageInt = 1; }
            }
            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl("http://www.gy-center.net/announce/list.jhtml?visi_id=&cid=97&chid=&gid=&thistype=&searchcid=&keyword=&action=yes&interval=&page=" + i.ToString(), Encoding.Default);
                    }
                    catch
                    {
                        continue;
                    }
                }
                parser = new Parser(new Lexer(html));
                NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "tab01"))), new TagNameFilter("ul"))), new TagNameFilter("li")));
                if (dtlList == null || dtlList.Count == 0)
                {
                    continue;
                }

                // NOTE(review): the last <li> is never processed (bound is Count - 1). Presumably it
                // is a non-announcement entry such as the pager -- confirm against the site markup.
                for (int j = 0; j < dtlList.Count - 1; j++)
                {
                    string prjName = string.Empty,
                           buildUnit = string.Empty, bidUnit = string.Empty,
                           bidMoney = string.Empty, code = string.Empty,
                           beginDate = string.Empty,
                           endDate = string.Empty, bidType = string.Empty,
                           specType = string.Empty, InfoUrl = string.Empty,
                           msgType = string.Empty, bidCtx = string.Empty,
                           prjAddress = string.Empty,
                           prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

                    string temp    = dtlList[j].ToPlainTextString();
                    string tempHtl = dtlList[j].ToHtml();
                    prjName   = ToolHtml.GetHtmlAtagValue("title", tempHtl);
                    beginDate = ToolHtml.GetRegexDateTime(temp);
                    InfoUrl   = "http://www.gy-center.net/announce/" + ToolHtml.GetHtmlAtagValue("href", tempHtl);
                    string htlDtl = string.Empty;
                    try
                    {
                        htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
                        // Drop <script> blocks before parsing the detail page.
                        htlDtl = Regex.Replace(htlDtl, "(<script)[\\s\\S]*?(</script>)", "");
                    }
                    catch
                    {
                        continue;
                    }
                    parser = new Parser(new Lexer(htlDtl));
                    NodeList htlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "r_content_right_main")));
                    if (htlList == null || htlList.Count == 0)
                    {
                        continue;
                    }

                    HtmlTxt = htlList.ToHtml();
                    // Plain-text version of the detail content: tags removed, whitespace normalised.
                    bidCtx  = Regex.Replace(HtmlTxt, "<[^>]*>", "").Replace("&nbsp;", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\t\t", "").Replace("\r\r", "\r").Replace("\n\n", "\n");
                    bidType = ToolHtml.GetInviteTypes(prjName);

                    // bidStr pairs the first-row cells (labels) of the first "MsoNormalTable" with the
                    // second-row cells (values) as "label:value" lines. It is only used below as a
                    // fallback source for the winning bidder.
                    string bidStr = string.Empty;
                    parser = new Parser(new Lexer(HtmlTxt));
                    NodeList bidList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "MsoNormalTable")));
                    if (bidList != null && bidList.Count > 0)
                    {
                        try
                        {
                            TableTag tab = bidList[0] as TableTag;
                            if (tab != null && tab.RowCount > 1)
                            {
                                // At most the first seven columns are paired, and only when the header
                                // row has at least two columns (same limits as the original branches).
                                int pairCount = Math.Min(tab.Rows[0].ColumnCount, 7);
                                if (pairCount > 1)
                                {
                                    for (int col = 0; col < pairCount; col++)
                                    {
                                        bidStr += tab.Rows[0].Columns[col].ToPlainTextString().ToNodeString() + ":" + tab.Rows[1].Columns[col].ToPlainTextString().ToNodeString() + "\r\n";
                                    }
                                }
                            }
                        }
                        catch
                        {
                            // Best effort: if a cell cannot be read (e.g. the second row is shorter
                            // than the first), keep whatever pairs were collected so far.
                        }
                    }
                    buildUnit  = ToolHtml.GetRegexString(bidCtx, ToolHtml.BuildRegex);
                    prjAddress = ToolHtml.GetRegexString(bidCtx, ToolHtml.AddressRegex);
                    code       = ToolHtml.GetRegexString(bidCtx, ToolHtml.CodeRegex);
                    bidUnit    = ToolHtml.GetRegexString(bidCtx, ToolHtml.BidRegex);
                    if (string.IsNullOrEmpty(bidUnit))
                    {
                        bidUnit = ToolHtml.GetRegexString(bidStr.Replace("  ", ""), ToolHtml.BidRegex, false);
                    }
                    if (string.IsNullOrEmpty(bidUnit))
                    {
                        bidUnit = bidCtx.GetRegexBegEnd("确认", "为");
                    }
                    bidMoney = ToolHtml.GetRegexString(bidCtx, ToolHtml.MoneyRegex);
                    bidMoney = ToolHtml.GetRegexMoney(bidMoney);

                    if (string.IsNullOrEmpty(bidMoney) || bidMoney == "0")
                    {
                        bidMoney = bidCtx.GetRegexBegEnd("¥", "元").GetMoney();
                    }

                    buildUnit  = ToolHtml.GetSubString(buildUnit, 150);
                    prjAddress = ToolHtml.GetSubString(prjAddress, 150);
                    code       = ToolHtml.GetSubString(code, 50);
                    bidUnit    = ToolHtml.GetSubString(bidUnit, 150);

                    bidUnit   = ToolHtml.GetStringTemp(bidUnit);
                    buildUnit = ToolHtml.GetStringTemp(buildUnit);

                    // Placeholder text for fields that could not be extracted.
                    if (string.IsNullOrEmpty(code))
                    {
                        code = "见中标信息";
                    }
                    if (string.IsNullOrEmpty(prjAddress))
                    {
                        prjAddress = "见中标信息";
                    }
                    specType = "其他";
                    msgType  = "工网在线";
                    BidInfo info = ToolDb.GenBidInfo("广东省", "电网专项工程", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType,
                                                     bidMoney, InfoUrl, prjMgr, HtmlTxt);
                    list.Add(info);

                    // Register every attachment link found in the detail content.
                    parser = new Parser(new Lexer(HtmlTxt));
                    NodeList nodeAtag = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                    if (nodeAtag != null && nodeAtag.Count > 0)
                    {
                        for (int c = 0; c < nodeAtag.Count; c++)
                        {
                            ATag a = nodeAtag[c] as ATag;
                            if (a == null)
                            {
                                // The cast can fail for a node that is not an ATag; skip it rather
                                // than abort the crawl with a NullReferenceException.
                                continue;
                            }
                            if (a.Link.IsAtagAttach())
                            {
                                // NOTE(review): this host differs from the listing host
                                // (www.gy-center.net) -- confirm it is the intended attachment base.
                                string     alink  = "http://www.bidding.csg.cn/" + a.Link;
                                BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText.Replace("&nbsp", ""), info.Id, alink);
                                base.AttachList.Add(attach);
                            }
                        }
                    }
                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return(list);
                    }
                }
            }
            return(list);
        }
예제 #20
0
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list            = new List <InviteInfo>();
            int    pageInt         = 1;
            string html            = string.Empty;
            string viewState       = string.Empty;
            string eventValidation = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
            }
            catch (Exception ex)
            {
                return(list);
            }
            Parser   parser = new Parser(new Lexer(html));
            NodeList sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "Normal")));

            if (sNode != null && sNode.Count > 0)
            {
                try
                {
                    string temp = sNode.AsString().Replace("createPageHTML(", "").Replace("index", "").Replace("html", "").Replace(", 0,", "").Replace(");", "").Replace(",", "").Replace(";", "").Replace(")", "").Replace("\"", "").Replace(" ", "").GetRegexBegEnd("/", "跳");
                    pageInt = Convert.ToInt32(temp);
                }
                catch { pageInt = 1; }
            }
            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        string url = "http://lhbsc.szlhxq.gov.cn/lhbsc/bsdt43/qyfw78/zbcg2/zbxxgs49/0e647d73-" + i.ToString() + ".html";
                        html = this.ToolWebSite.GetHtmlByUrl(url, Encoding.UTF8);
                    }
                    catch { continue; }
                }
                parser = new Parser(new Lexer(html));
                //NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("tr"), new HasAttributeFilter("class", ""))), new TagNameFilter("tr")));
                NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("tr"), new HasAttributeFilter("class", "")));
                if (viewList != null && viewList.Count > 0)
                {
                    for (int j = 0; j < viewList.Count; j++)
                    {
                        string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty,
                               prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty,
                               specType = string.Empty, beginDate = string.Empty, endDate = string.Empty,
                               remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty,
                               CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;
                        Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
                        INode node    = viewList[j];
                        ATag  aTag    = node.GetATag();
                        beginDate = regDate.Match(viewList[j].ToPlainTextString().Trim()).Value;
                        InfoUrl   = "http://lhbsc.szlhxq.gov.cn" + aTag.Link.Replace("../", "").Replace("./", "");
                        prjName   = aTag.GetAttribute("title");
                        string htlDtl = string.Empty;
                        try
                        {
                            htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                            Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
                            htlDtl = regexHtml.Replace(htlDtl, "");
                        }
                        catch { continue; }
                        parser = new Parser(new Lexer(htlDtl));
                        NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "contentbox")));
                        if (dtl != null && dtl.Count > 0)
                        {
                            HtmlTxt   = Regex.Replace(dtl.AsHtml(), "(<script)[\\s\\S]*?(</script>)", "");
                            inviteCtx = Regex.Replace(HtmlTxt, "(<script)[\\s\\S]*?(</script>)", "");
                            inviteCtx = Regex.Replace(inviteCtx, "<[^>]*>", "").Replace("&nbsp;", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");
                            string InvType = prjName;
                            inviteType = prjName.GetInviteBidType();
                            Regex regPrjAddr = new Regex(@"(工程位置|工程地点|工程地址|详细地址|地址)(:|:)[^\r\n]+\r\n");
                            prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程位置", "").Replace("工程地点", "").Replace("工程地址", "").Replace("详细地址", "").Replace("地址", "").Replace(":", "").Replace(":", "").Trim();
                            Regex regBuildUnit = new Regex(@"(招标代理机构|采购代理机构|采购人名称|招标单位|招标人|招标单位(盖章))(:|:)[^\r\n]+\r\n");
                            buildUnit = regBuildUnit.Match(inviteCtx).Value.Replace("采购人名称", "").Replace("采购代理机构", "").Replace("招标代理机构", "").Replace("招标单位", "").Replace("招标人", "").Replace("(盖章)", "").Replace(":", "").Replace(":", "").Trim();
                            Regex regPrjCode = new Regex(@"(工程编号|项目编号|编号)(:|:)[^\r\n]+\r\n");
                            code    = regPrjCode.Match(inviteCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Replace("(", "").Replace(")", "").Trim();
                            msgType = "深圳市龙华新区龙华街道办事处";
                            if (string.IsNullOrEmpty(prjAddress) || Encoding.Default.GetByteCount(prjAddress) > 150)
                            {
                                prjAddress = "见招标信息";
                            }
                            code       = ToolHtml.GetSubString(code, 50);
                            buildUnit  = ToolHtml.GetSubString(buildUnit, 150);
                            specType   = "建设工程";
                            inviteType = "小型工程";
                            if (string.IsNullOrEmpty(buildUnit))
                            {
                                buildUnit = "深圳市龙华新区龙华街道办事处";
                            }
                            inviteType = ToolHtml.GetInviteType(inviteType);
                            buildUnit  = ToolHtml.GetSubString(buildUnit, 150);
                            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                            list.Add(info);
                            parser = new Parser(new Lexer(HtmlTxt));
                            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                            if (aNode != null && aNode.Count > 0)
                            {
                                for (int k = 0; k < aNode.Count; k++)
                                {
                                    ATag a = aNode[k] as ATag;
                                    if (a.IsAtagAttach())
                                    {
                                        string link = string.Empty;
                                        if (a.Link.ToLower().Contains("http"))
                                        {
                                            link = a.Link;
                                        }
                                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                                        base.AttachList.Add(attach);
                                    }
                                }
                            }
                            if (!crawlAll && list.Count >= this.MaxCount)
                            {
                                return(list);
                            }
                        }
                    }
                }
            }
            return(list);
        }
예제 #21
0
        /// <summary>
        /// Crawls the paged tender listing at <c>SiteUrl</c>, downloads every linked detail page
        /// and converts each one into an <c>InviteInfo</c> record.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected.
        /// </param>
        /// <returns>
        /// The collected <c>InviteInfo</c> records; an empty list when the first listing page
        /// cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list    = new ArrayList();
            int    pageInt = 1;
            string html    = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
            }
            catch (Exception)
            {
                // Without the first listing page there is nothing to crawl.
                return(list);
            }

            // The page count is read from the pager cell, whose text is matched as "条,<N>页".
            Parser   parser = new Parser(new Lexer(html));
            NodeList sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("width", "50%")));

            if (sNode != null && sNode.Count > 0)
            {
                try
                {
                    string temp = sNode.AsString().Replace(" ", "");
                    Regex  reg  = new Regex(@"条,[^页]+页");
                    pageInt = Convert.ToInt32(reg.Match(temp).Value.Replace("条,", "").Replace("页", ""));
                }
                catch { pageInt = 1; } // Best effort: fall back to a single page.
            }

            // Loop-invariant patterns, built once instead of once per row.
            Regex regDate      = new Regex(@"\d{4}/\d{1,2}/\d{1,2}");
            Regex regexHtml    = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
            Regex regPrjAddr   = new Regex(@"(工程位置|工程地点|工程地址)(:|:)[^\r\n]+\r\n");
            Regex regBuildUnit = new Regex(@"(招标代理机构|招标单位|招标人|招标单位(盖章))(:|:)[^\r\n]+\r\n");
            Regex regPrjCode   = new Regex(@"(工程编号|项目编号|编号)(:|:)[^\r\n]+\r\n");

            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "&p=" + i.ToString(), Encoding.Default);
                    }
                    catch
                    { continue; } // Skip a page that cannot be downloaded.
                }
                parser = new Parser(new Lexer(html));
                NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "Datagrid")));
                if (viewList == null || viewList.Count == 0)
                {
                    continue;
                }

                TableTag tab = viewList[0] as TableTag;
                if (tab == null)
                {
                    continue;
                }

                for (int j = 0; j < tab.RowCount; j++)
                {
                    TableRow tr = tab.Rows[j];

                    // Rows without a title cell, a date cell and a link (e.g. header or pager
                    // rows) carry no record; skip them instead of letting them abort the crawl.
                    if (tr.ColumnCount < 3)
                    {
                        continue;
                    }
                    NodeList links = tr.Columns[1].SearchFor(typeof(ATag), true);
                    if (links == null || links.Count == 0)
                    {
                        continue;
                    }
                    ATag aTag = links[0] as ATag;
                    if (aTag == null)
                    {
                        continue;
                    }

                    string prjName   = tr.Columns[1].ToPlainTextString().Replace("\r", "").Replace("\t", "").Replace("\n", "");
                    string beginDate = regDate.Match(tr.Columns[2].ToPlainTextString()).Value;
                    string InfoUrl   = "http://www.xixiang.gov.cn/" + aTag.Link;

                    string htmDtl = string.Empty;
                    try
                    {
                        htmDtl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.Default);
                        htmDtl = regexHtml.Replace(htmDtl, "");
                    }
                    catch { continue; } // Skip a detail page that cannot be downloaded.

                    parser = new Parser(new Lexer(htmDtl));
                    NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "Lblcontent")));
                    if (dtl == null || dtl.Count == 0)
                    {
                        continue;
                    }

                    string HtmlTxt   = dtl.AsHtml();
                    string inviteCtx = dtl.AsString().Replace("&nbsp;", "");

                    // Pull the labelled fields out of the plain text and strip label and colon.
                    string prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程位置", "").Replace("工程地点", "").Replace("工程地址", "").Replace(":", "").Replace(":", "").Trim();
                    string buildUnit  = regBuildUnit.Match(inviteCtx).Value.Replace("招标代理机构", "").Replace("招标单位", "").Replace("招标人", "").Replace("(盖章)", "").Replace(":", "").Replace(":", "").Trim();
                    string code       = regPrjCode.Match(inviteCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Trim();

                    if (string.IsNullOrEmpty(prjAddress) || Encoding.Default.GetByteCount(prjAddress) > 150)
                    {
                        prjAddress = "见招标信息";
                    }
                    code      = ToolHtml.GetSubString(code, 50);
                    buildUnit = ToolHtml.GetSubString(buildUnit, 150);
                    if (string.IsNullOrEmpty(buildUnit))
                    {
                        buildUnit = "深圳市宝安区西乡街道办事处";
                    }

                    // The invite type is fixed for this source. The former keyword-based guess
                    // from the title was always overwritten before use, so it has been removed.
                    string msgType    = "深圳市宝安区西乡街道办事处";
                    string specType   = "建设工程";
                    string inviteType = ToolHtml.GetInviteType("小型工程");
                    string endDate    = string.Empty;
                    string remark     = string.Empty;
                    string otherType  = string.Empty;

                    InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                    list.Add(info);
                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return(list);
                    }
                }
            }
            return(list);
        }
예제 #22
0
        /// <summary>
        /// Crawls the paged bid-result listing at <c>SiteUrl</c>, downloads every linked detail
        /// page, converts each one into a <c>BidInfo</c> record and registers the attachments
        /// linked from the detail content in <c>AttachList</c>.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected.
        /// </param>
        /// <returns>
        /// The collected <c>BidInfo</c> records; an empty list when the first listing page
        /// cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list    = new List <BidInfo>();
            int    pageInt = 1;
            string html    = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8);
            }
            catch (Exception)
            {
                // Without the first listing page there is nothing to crawl.
                return(list);
            }

            // The page count is carved out of the pager cell's text by stripping the pager
            // script tokens and taking what lies between "/" and "跳".
            Parser   parser = new Parser(new Lexer(html));
            NodeList sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "Normal")));

            if (sNode != null && sNode.Count > 0)
            {
                try
                {
                    string temp = sNode.AsString().Replace("createPageHTML(", "").Replace("index", "").Replace("html", "").Replace(", 0,", "").Replace(");", "").Replace(",", "").Replace(";", "").Replace(")", "").Replace("\"", "").Replace(" ", "").GetRegexBegEnd("/", "跳");
                    pageInt = Convert.ToInt32(temp);
                }
                catch { pageInt = 1; } // Best effort: fall back to a single page.
            }

            // Loop-invariant patterns, built once instead of once per row.
            Regex regDate   = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
            Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");

            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        string url = "http://lhbsc.szlhxq.gov.cn/lhbsc/bsdt43/qyfw78/zbcg2/zbxxgg/065b33d5-" + i.ToString() + ".html";
                        html = this.ToolWebSite.GetHtmlByUrl(url, Encoding.UTF8);
                    }
                    catch { continue; } // Skip a page that cannot be downloaded.
                }
                parser = new Parser(new Lexer(html));
                NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("tr"), new HasAttributeFilter("class", "")));
                if (viewList == null || viewList.Count == 0)
                {
                    continue;
                }

                for (int j = 0; j < viewList.Count; j++)
                {
                    INode node = viewList[j];
                    ATag  aTag = node.GetATag();
                    if (aTag == null)
                    {
                        // A row without a link has no detail page to follow.
                        continue;
                    }

                    string beginDate = regDate.Match(node.ToPlainTextString().Trim()).Value;
                    string prjName   = aTag.GetAttribute("title");
                    string InfoUrl   = "http://lhbsc.szlhxq.gov.cn" + aTag.Link.Replace("../", "").Replace("./", "");

                    string htlDtl = string.Empty;
                    try
                    {
                        htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                        htlDtl = regexHtml.Replace(htlDtl, "");
                    }
                    catch { continue; } // Skip a detail page that cannot be downloaded.

                    parser = new Parser(new Lexer(htlDtl));
                    NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "contentbox")));
                    if (dtl == null || dtl.Count == 0)
                    {
                        continue;
                    }

                    // Regex.Replace returns a new string, so the results must be assigned.
                    // Previously they were discarded, leaving HtmlTxt and bidCtx empty and
                    // every extraction below running on empty text.
                    string HtmlTxt = Regex.Replace(dtl.AsHtml(), "(<script)[\\s\\S]*?(</script>)", "");
                    string bidCtx  = Regex.Replace(HtmlTxt, "<[^>]*>", "").Replace("&nbsp;", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("&yen;", "");

                    string buildUnit = ToolHtml.GetRegexString(bidCtx, "按(建设单位)", "(提供)");
                    string bidMoney  = ToolHtml.GetRegexString(bidCtx, "(中标金额)", "(元)|(万元)|(;)").GetReplace(":", "").GetMoney("万元");

                    string bidUnit = bidCtx.GetBidRegex();
                    if (string.IsNullOrWhiteSpace(bidUnit))
                    {
                        bidUnit = bidCtx.GetRegex("中标供应商名称");
                    }
                    if (bidUnit.Contains("公司"))
                    {
                        // Cut everything after the first "公司".
                        bidUnit = bidUnit.Remove(bidUnit.IndexOf("公司")) + "公司";
                    }
                    if (string.IsNullOrWhiteSpace(bidMoney))
                    {
                        bidMoney = bidCtx.GetRegex("中标金额").GetReplace(":", "");
                    }
                    bidUnit = ToolHtml.GetStringTemp(bidUnit);
                    if (string.IsNullOrWhiteSpace(buildUnit))
                    {
                        buildUnit = bidCtx.GetRegex("采购人名称");
                    }
                    bidUnit = ToolHtml.GetSubString(bidUnit, 150);

                    string code = bidCtx.GetCodeRegex().GetReplace(")", "");
                    if (string.IsNullOrWhiteSpace(code))
                    {
                        code = bidCtx.GetRegexBegEnd("招标编号:", ")");
                    }

                    // Extract the project manager first, then cut the value at "资格".
                    // The cut used to run before the extraction and never had any effect.
                    string prjMgr = bidCtx.GetMgrRegex();
                    if (!string.IsNullOrEmpty(prjMgr) && prjMgr.Contains("资格"))
                    {
                        prjMgr = prjMgr.Remove(prjMgr.IndexOf("资格"));
                    }

                    // Amounts above 100000 are divided by 10000; unparsable text is left as-is.
                    decimal amount;
                    if (decimal.TryParse(bidMoney, out amount) && amount > 100000)
                    {
                        bidMoney = (amount / 10000).ToString();
                    }

                    if (string.IsNullOrEmpty(buildUnit))
                    {
                        buildUnit = "深圳市龙华新区龙华街道办事处";
                    }

                    // The bid type is fixed for this source; the former title-based lookup was
                    // always overwritten before use, so it has been removed.
                    string msgType   = "深圳市龙华新区龙华街道办事处";
                    string specType  = "建设工程";
                    string bidType   = "小型工程";
                    string endDate   = string.Empty;
                    string otherType = string.Empty;
                    prjName = ToolDb.GetPrjName(prjName);

                    BidInfo info = ToolDb.GenBidInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType,
                                                     bidMoney, InfoUrl, prjMgr, HtmlTxt);
                    list.Add(info);

                    // Register every attachment link found in the detail content.
                    parser = new Parser(new Lexer(HtmlTxt));
                    NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                    if (aNode != null && aNode.Count > 0)
                    {
                        for (int k = 0; k < aNode.Count; k++)
                        {
                            ATag a = aNode[k] as ATag;
                            if (a == null || !a.IsAtagAttach())
                            {
                                continue;
                            }

                            // Only links containing "http" are kept; others are stored empty.
                            string link = string.Empty;
                            if (a.Link.ToLower().Contains("http"))
                            {
                                link = a.Link;
                            }
                            BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                            base.AttachList.Add(attach);
                        }
                    }

                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return(list);
                    }
                }
            }
            return(list);
        }