Beispiel #1
0
        public void DealHtml(IList list, string html, bool crawlAll)
        {
            Parser   parserDtl = new Parser(new Lexer(html));
            NodeList aNodes    = parserDtl.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "lefttable")));

            if (aNodes != null && aNodes.Count > 0)
            {
                Type     typs  = typeof(ATag);
                TableTag table = aNodes[0] as TableTag;
                for (int t = 1; t < table.RowCount - 1; t++)
                {
                    string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty,
                           inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, ctx = string.Empty, CreateTime = string.Empty, HtmlTxt = string.Empty;

                    TableRow tr   = table.Rows[t] as TableRow;
                    ATag     aTag = tr.SearchFor(typeof(ATag), true)[0] as ATag;

                    InfoUrl = aTag.Link;
                    prjName = table.Rows[t].Columns[1].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace("&nbsp;", " ").Trim();
                    endDate = table.Rows[t].Columns[2].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace("&nbsp;", " ").Trim();
                    string htmlDtl = string.Empty;
                    try
                    {
                        htmlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
                    }
                    catch (Exception ex)
                    {
                        continue;
                    }
                    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
                    htmlDtl = regexHtml.Replace(htmlDtl, "");
                    Parser parserCtx = new Parser(new Lexer(htmlDtl));

                    NodeList ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "printTb lefttable")));
                    if (ctxNode != null && ctxNode.Count > 0)
                    {
                        Parser   parserdiv = new Parser(new Lexer(htmlDtl));
                        NodeList aNodesdiv = parserdiv.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "biuuu_button")));
                        HtmlTxt = ctxNode.AsHtml().Replace(aNodesdiv.AsHtml(), "").Trim();
                        Type     tp        = typeof(ATag);
                        TableTag tabTag    = ctxNode[0] as TableTag;
                        string   startTime = tabTag.Rows[1].Columns[0].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace("&nbsp;", " ").Trim();
                        Regex    regex     = new Regex(@"时间:\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}");
                        Match    math      = regex.Match(startTime);
                        beginDate = math.Value.Replace("时间:", "").Replace("\r\n", "").Replace("\t", "").Replace("&nbsp;", " ").Trim();

                        Regex regexcode = new Regex("(工程编号|项目编号|招标编号):[^\r\n]+[\r\n]{1}");
                        Match match     = regexcode.Match(tabTag.ToPlainTextString());
                        code = match.Value.Substring(match.Value.IndexOf(":") + 1).Replace("\r\n", "").Replace("\t", "").Replace("&nbsp;", " ").Trim();

                        Regex regexBuildUnit = new Regex("(招标人|建设单位|招标采购单位):[^\r\n]+[\r\n]{1}");
                        Match matchBuildUnit = regexBuildUnit.Match(tabTag.ToPlainTextString());
                        buildUnit = matchBuildUnit.Value.Substring(matchBuildUnit.Value.IndexOf(":") + 1).Replace("\r\n", "").Replace("\t", "").Replace("&nbsp;", " ").Trim();

                        Regex regexAddress = new Regex("(建设地点|项目地点|工程地点):[^\r\n]+[\r\n]{1}");
                        Match matchAddress = regexAddress.Match(tabTag.ToPlainTextString());
                        prjAddress = matchAddress.Value.Substring(matchAddress.Value.IndexOf(":") + 1).Trim();
                        ctx        = tabTag.Rows[2].Columns[0].ToPlainTextString().Replace("&nbsp;", " ").Replace("\r\n\r\n\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");
                        if (ctx.Length > 0)
                        {
                            Regex regexCtx = new Regex("<!--[^<]+-->");
                            ctx = regexCtx.Replace(ctx, "");
                        }
                        if (Encoding.Default.GetByteCount(code) > 50)
                        {
                            code = "";
                        }
                        if (buildUnit == "" || buildUnit == null)
                        {
                            buildUnit = "";
                        }
                        if (Encoding.Default.GetByteCount(buildUnit) > 150)
                        {
                            buildUnit = buildUnit.Substring(0, 150);
                        }
                        if (Encoding.Default.GetByteCount(prjAddress) > 200)
                        {
                            prjAddress = "见招标公告内容";
                        }
                        if (beginDate.Length > 0 && endDate.Length > 0)
                        {
                            DateTime begin = new DateTime();
                            DateTime end   = new DateTime();
                            try
                            {
                                begin = DateTime.Parse(beginDate);
                                end   = DateTime.Parse(endDate);
                            }
                            catch (Exception)
                            {
                            }
                            if (begin > end)
                            {
                                endDate = string.Empty;
                            }
                        }
                    }

                    parserCtx.Reset();

                    ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "toptd_bai")));
                    Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
                    beginDate = regDate.Match(ctxNode.AsString()).Value.Trim();
                    if (beginDate == "")
                    {
                        beginDate = string.Empty;
                    }
                    inviteType = ToolHtml.GetInviteTypes(prjName);
                    InviteInfo info = ToolDb.GenInviteInfo("广东省", "惠州市区", "惠东县", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, ctx, remark, "惠州市建设工程交易中心", inviteType, "建设工程", string.Empty, InfoUrl, HtmlTxt);
                    list.Add(info);
                    ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("target", "_blank")));
                    NodeList aTagNodes = ctxNode.SearchFor(typeof(ATag), true);
                    for (int a = 0; a < aTagNodes.Count; a++)
                    {
                        ATag fileTage = aTagNodes[a] as ATag;
                        if (fileTage.Link.Contains("http://www.ebc.huizhou.gov.cn/index/loadNewsFile"))
                        {
                            string     downloadURL = fileTage.Link;
                            BaseAttach attach      = ToolDb.GenBaseAttach(fileTage.ToPlainTextString(), info.Id, downloadURL);
                            base.AttachList.Add(attach);
                        }
                    }
                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return;
                    }
                }
            }
        }
Beispiel #2
0
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list            = new List <BidInfo>();
            int    pageInt         = 1;
            string html            = string.Empty;
            string viewState       = string.Empty;
            string eventValidation = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
            }
            catch
            {
                return(list);
            }
            Parser   parser = new Parser(new Lexer(html));
            NodeList sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagination")));

            if (sNode != null && sNode.Count > 0)
            {
                try
                {
                    string temp = sNode[0].ToNodePlainString();
                    pageInt = Convert.ToInt32(temp.GetRegexBegEnd("/", "页"));
                }
                catch { pageInt = 1; }
            }
            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&pageNo=" + i, Encoding.UTF8);
                    }
                    catch { continue; }
                }
                parser = new Parser(new Lexer(html));
                NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "div_list")), true), new TagNameFilter("ul")));
                if (viewList != null && viewList.Count > 0)
                {
                    for (int j = 0; j < viewList.Count; j++)
                    {
                        string prjName = string.Empty,
                               buildUnit = string.Empty, bidUnit = string.Empty,
                               bidMoney = string.Empty, code = string.Empty,
                               bidDate = string.Empty,
                               beginDate = string.Empty,
                               endDate = string.Empty, bidType = string.Empty,
                               specType = string.Empty, InfoUrl = string.Empty,
                               msgType = string.Empty, bidCtx = string.Empty,
                               prjAddress = string.Empty, remark = string.Empty,
                               prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

                        ATag aTag = viewList[j].GetATag();
                        prjName   = aTag.GetAttribute("title");
                        InfoUrl   = "http://www.hzzk.cn" + aTag.Link.GetReplace("../");
                        beginDate = viewList[j].ToPlainTextString().GetDateRegex();
                        string htlDtl = string.Empty;
                        try
                        {
                            htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
                        }
                        catch { continue; }
                        parser = new Parser(new Lexer(htlDtl));
                        NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "div_view")));
                        if (dtl != null && dtl.Count > 0)
                        {
                            HtmlTxt = dtl.AsHtml();
                            bidCtx  = HtmlTxt.ToLower().GetReplace("</p>,</br>,<br>", "\r\n").ToCtxString();
                            bidType = prjName.GetInviteBidType();

                            code = bidCtx.GetCodeRegex().GetCodeDel();
                            //if (code.IsChina())
                            //    code = "";
                            buildUnit  = bidCtx.GetBuildRegex();
                            prjAddress = bidCtx.GetAddressRegex();
                            if (buildUnit.Contains("招标代理"))
                            {
                                buildUnit = buildUnit.Remove(buildUnit.IndexOf("招标代理"));
                            }
                            if (buildUnit.Contains("公司"))
                            {
                                buildUnit = buildUnit.Remove(buildUnit.IndexOf("公司")) + "公司";
                            }
                            bidUnit  = bidCtx.GetBidRegex();
                            bidMoney = bidCtx.GetMoneyRegex();

                            if (string.IsNullOrEmpty(bidUnit))
                            {
                                parser = new Parser(new Lexer(HtmlTxt));
                                NodeList tableNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                                if (tableNode != null && tableNode.Count > 0)
                                {
                                    string   ctx   = string.Empty;
                                    TableTag table = null;
                                    if (tableNode.Count == 2)
                                    {
                                        table = tableNode[1] as TableTag;
                                    }
                                    else
                                    {
                                        table = tableNode[0] as TableTag;
                                    }
                                    if (table.ToPlainTextString().Contains("投标人") || table.ToPlainTextString().Contains("投标企业"))
                                    {
                                        if (table.RowCount > 1)
                                        {
                                            for (int r = 0; r < table.Rows[0].ColumnCount; r++)
                                            {
                                                try
                                                {
                                                    ctx += table.Rows[0].Columns[r].ToNodePlainString() + ":";
                                                    ctx += table.Rows[1].Columns[r].ToNodePlainString() + "\r\n";
                                                }
                                                catch { continue; }
                                            }

                                            bidUnit = ctx.GetBidRegex();
                                            if (string.IsNullOrEmpty(bidUnit))
                                            {
                                                bidUnit = ctx.GetRegex("投标人名称,投标企业名称,投标企业,投标人");
                                            }
                                            if (bidUnit.Contains("单位名称"))
                                            {
                                                bidUnit = ctx.GetRegex("第一中标候选人");
                                            }
                                            bidMoney = ctx.GetMoneyRegex();
                                            if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                                            {
                                                bidMoney = ctx.GetMoneyRegex(new string[] { "报价" });
                                            }
                                        }
                                        if (bidUnit.Length < 5)
                                        {
                                            ctx = string.Empty;
                                            if (table.RowCount > 2)
                                            {
                                                for (int r = 0; r < table.Rows[0].ColumnCount; r++)
                                                {
                                                    try
                                                    {
                                                        ctx += table.Rows[0].Columns[r].ToNodePlainString() + ":";
                                                        ctx += table.Rows[2].Columns[r].ToNodePlainString() + "\r\n";
                                                    }
                                                    catch { continue; }
                                                }

                                                bidUnit = ctx.GetBidRegex();
                                                if (string.IsNullOrEmpty(bidUnit))
                                                {
                                                    bidUnit = ctx.GetRegex("投标人名称,投标企业名称,投标企业,投标人");
                                                }
                                                if (bidUnit.Contains("单位名称"))
                                                {
                                                    bidUnit = ctx.GetRegex("第一中标候选人");
                                                }
                                                bidMoney = ctx.GetMoneyRegex();
                                                if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                                                {
                                                    bidMoney = ctx.GetMoneyRegex(new string[] { "报价" });
                                                }
                                            }
                                        }
                                    }
                                    else
                                    {
                                        for (int r = 0; r < table.RowCount; r++)
                                        {
                                            try
                                            {
                                                ctx += table.Rows[r].Columns[0].ToNodePlainString() + ":";
                                                ctx += table.Rows[r].Columns[1].ToNodePlainString() + "\r\n";
                                            }
                                            catch { continue; }
                                        }
                                        bidUnit = ctx.GetBidRegex();
                                        if (string.IsNullOrEmpty(bidUnit))
                                        {
                                            bidUnit = ctx.GetRegex("单位名称");
                                        }
                                        bidMoney = ctx.GetMoneyRegex();

                                        prjMgr = ctx.GetMgrRegex();
                                        if (string.IsNullOrEmpty(prjMgr))
                                        {
                                            prjMgr = ctx.GetRegex("项目负责人姓名及资质证书编号");
                                        }
                                        if (prjMgr.Contains(","))
                                        {
                                            prjMgr = prjMgr.Remove(prjMgr.IndexOf(","));
                                        }
                                        if (prjMgr.Contains(","))
                                        {
                                            prjMgr = prjMgr.Remove(prjMgr.IndexOf(","));
                                        }
                                    }
                                }
                                else
                                {
                                    bidUnit = bidCtx.GetRegexBegEnd("中标人", "公司").GetReplace(":,:") + "公司";
                                    if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                                    {
                                        bidMoney = bidCtx.GetRegexBegEnd("中标价", "元").GetReplace(":,:").GetMoney();
                                    }
                                }
                            }
                            try
                            {
                                if (decimal.Parse(bidMoney) < 1)
                                {
                                    bidMoney = "0";
                                }
                                if (decimal.Parse(bidMoney) > 100000)
                                {
                                    bidMoney = (decimal.Parse(bidMoney) / 10000).ToString();
                                }
                            }
                            catch { }

                            if (bidUnit == "公司" || bidUnit.Length <= 5)
                            {
                                bidUnit = "";
                            }
                            msgType  = "惠州仲恺高新技术产业开发区公共资源交易中心";
                            specType = "政府采购";

                            BidInfo info = ToolDb.GenBidInfo("广东省", "惠州市区", "高新区", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType,
                                                             bidMoney, InfoUrl, prjMgr, HtmlTxt);
                            list.Add(info);
                            parser = new Parser(new Lexer(HtmlTxt));
                            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                            if (aNode != null && aNode.Count > 0)
                            {
                                for (int k = 0; k < aNode.Count; k++)
                                {
                                    ATag a = aNode[k].GetATag();
                                    if (a.IsAtagAttach())
                                    {
                                        string link = string.Empty;
                                        if (a.Link.ToLower().Contains("http"))
                                        {
                                            link = a.Link;
                                        }
                                        else
                                        {
                                            link = "http://www.hzzk.cn/" + a.Link;
                                        }
                                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                                        base.AttachList.Add(attach);
                                    }
                                }
                            }
                            if (!crawlAll && list.Count >= this.MaxCount)
                            {
                                return(list);
                            }
                        }
                    }
                }
            }
            return(list);
        }
Beispiel #3
0
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list            = new List <BidInfo>();
            int    pageInt         = 1;
            string html            = string.Empty;
            string viewState       = string.Empty;
            string eventValidation = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
            }
            catch (Exception ex)
            {
                return(list);
            }
            Parser   parser = new Parser(new Lexer(html));
            NodeList sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "Normal")));

            if (sNode != null && sNode.Count > 0)
            {
                try
                {
                    string temp = sNode[0].ToNodePlainString().GetRegexBegEnd("/", "跳转");
                    pageInt = Convert.ToInt32(temp);
                }
                catch { pageInt = 1; }
            }
            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl("http://glbsc.szlhxq.gov.cn/glbsc/zwgk70/zbcg5/zbxxgs93/15159-" + i + ".html", Encoding.UTF8);
                    }
                    catch { continue; }
                }
                parser = new Parser(new Lexer(html));
                NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("style", "border-bottom: 1px dashed #333;")));
                if (viewList != null && viewList.Count > 0)
                {
                    for (int j = 0; j < viewList.Count; j++)
                    {
                        TableTag table = viewList[j] as TableTag;
                        string   prjName = string.Empty,
                                 buildUnit = string.Empty, bidUnit = string.Empty,
                                 bidMoney = string.Empty, code = string.Empty,
                                 bidDate = string.Empty,
                                 beginDate = string.Empty,
                                 endDate = string.Empty, bidType = string.Empty,
                                 specType = string.Empty, InfoUrl = string.Empty,
                                 msgType = string.Empty, bidCtx = string.Empty,
                                 prjAddress = string.Empty, remark = string.Empty,
                                 prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;
                        beginDate = table.ToPlainTextString().GetDateRegex();
                        ATag aTag = table.GetATag();
                        prjName = aTag.GetAttribute("title");
                        InfoUrl = "http://glbsc.szlhxq.gov.cn" + aTag.Link;
                        string htlDtl = string.Empty;
                        try
                        {
                            htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
                        }
                        catch { continue; }
                        parser = new Parser(new Lexer(htlDtl));
                        NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "contentbox")));
                        if (dtl != null && dtl.Count > 0)
                        {
                            HtmlTxt = dtl.AsHtml();
                            bidCtx  = HtmlTxt.GetReplace("</p>,</br>", "\r\n").ToCtxString();

                            code      = bidCtx.GetCodeRegex().GetCodeDel();
                            buildUnit = bidCtx.GetBuildRegex();
                            bidUnit   = bidCtx.GetBidRegex();
                            bidMoney  = bidCtx.GetMoneyRegex(null, false, "万元");
                            prjMgr    = bidCtx.GetMgrRegex();
                            if (string.IsNullOrEmpty(bidUnit))
                            {
                                parser = new Parser(new Lexer(HtmlTxt));
                                NodeList bidNode  = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                                TableTag bidTable = null;
                                string   ctx      = string.Empty;
                                if (bidNode != null && bidNode.Count > 1)
                                {
                                    bidTable = bidNode[1] as TableTag;
                                }
                                else if (bidNode != null && bidNode.Count > 0)
                                {
                                    bidTable = bidNode[0] as TableTag;
                                }
                                if (bidTable != null)
                                {
                                    for (int r = 0; r < bidTable.RowCount; r++)
                                    {
                                        for (int c = 0; c < bidTable.Rows[r].ColumnCount; c++)
                                        {
                                            if ((c + 1) % 2 == 0)
                                            {
                                                ctx += bidTable.Rows[r].Columns[c].ToNodePlainString() + "\r\n";
                                            }
                                            else
                                            {
                                                ctx += bidTable.Rows[r].Columns[c].ToNodePlainString() + ":";
                                            }
                                        }
                                    }

                                    bidUnit = ctx.GetBidRegex();
                                    if (string.IsNullOrEmpty(bidMoney) || bidMoney == "0")
                                    {
                                        bidMoney = ctx.GetMoneyString().GetMoney("万元");
                                    }
                                    if (string.IsNullOrEmpty(prjAddress))
                                    {
                                        prjAddress = ctx.GetAddressRegex();
                                    }
                                    if (string.IsNullOrEmpty(buildUnit))
                                    {
                                        buildUnit = ctx.GetBuildRegex();
                                    }
                                    if (string.IsNullOrEmpty(code))
                                    {
                                        code = ctx.GetCodeRegex().GetCodeDel();
                                    }
                                    if (bidUnit.Contains("推荐") || bidUnit.Contains("中标") || bidUnit.Contains("地址"))
                                    {
                                        bidUnit = string.Empty;
                                    }
                                    if (string.IsNullOrWhiteSpace(prjMgr))
                                    {
                                        prjMgr = ctx.GetMgrRegex();
                                    }
                                    if (string.IsNullOrEmpty(bidUnit))
                                    {
                                        if (bidTable.RowCount > 1)
                                        {
                                            ctx = string.Empty;
                                            for (int d = 0; d < bidTable.Rows[0].ColumnCount; d++)
                                            {
                                                ctx += bidTable.Rows[0].Columns[d].ToNodePlainString() + ":";
                                                try
                                                {
                                                    ctx += bidTable.Rows[1].Columns[d].ToNodePlainString() + "\r\n";
                                                }
                                                catch { }
                                            }
                                            bidUnit = ctx.GetBidRegex();
                                            if (string.IsNullOrEmpty(bidMoney) || bidMoney == "0")
                                            {
                                                bidMoney = ctx.GetMoneyString().GetMoney();
                                            }
                                            if (string.IsNullOrEmpty(prjAddress))
                                            {
                                                prjAddress = ctx.GetAddressRegex();
                                            }
                                            if (string.IsNullOrEmpty(buildUnit))
                                            {
                                                buildUnit = ctx.GetBuildRegex();
                                            }
                                            if (string.IsNullOrEmpty(code))
                                            {
                                                code = ctx.GetCodeRegex().GetCodeDel();
                                            }
                                        }
                                    }
                                }
                            }
                        }
                        try
                        {
                            if (decimal.Parse(bidMoney) > 1000000)
                            {
                                bidMoney = (decimal.Parse(bidMoney) / 10000).ToString();
                            }
                        }
                        catch { }


                        if (string.IsNullOrEmpty(buildUnit))
                        {
                            buildUnit = "深圳市龙华新区观澜街道办事处";
                        }
                        msgType  = "深圳市龙华新区观澜街道办事处";
                        specType = "建设工程";
                        bidType  = "小型工程";
                        prjName  = ToolDb.GetPrjName(prjName);
                        BidInfo info = ToolDb.GenBidInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType,
                                                         bidMoney, InfoUrl, prjMgr, HtmlTxt);
                        list.Add(info);
                        if (!crawlAll && list.Count >= this.MaxCount)
                        {
                            return(list);
                        }
                    }
                }
            }
            return(list);
        }
Beispiel #4
0
        public void DealHtml(IList list, string html, bool crawlAll)
        {
            Parser   parserDtl = new Parser(new Lexer(html));
            NodeList aNodes    = parserDtl.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "lefttable")));

            if (aNodes != null && aNodes.Count > 0)
            {
                Type     typs  = typeof(ATag);
                TableTag table = aNodes[0] as TableTag;
                for (int t = 1; t < table.RowCount - 1; t++)
                {
                    string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, bidType = string.Empty,
                           inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, ctx = string.Empty, CreateTime = string.Empty, FbTime = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty, HtmlTxt = string.Empty;

                    TableRow tr   = table.Rows[t] as TableRow;
                    ATag     aTag = tr.SearchFor(typeof(ATag), true)[0] as ATag;

                    InfoUrl = aTag.Link;
                    prjName = table.Rows[t].Columns[1].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace("&nbsp;", " ").Trim();
                    endDate = table.Rows[t].Columns[2].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace("&nbsp;", " ").Trim();

                    string htmlDtl = string.Empty;
                    try
                    {
                        htmlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
                    }
                    catch (Exception ex)
                    {
                        continue;
                    }
                    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
                    htmlDtl = regexHtml.Replace(htmlDtl, "");
                    Parser parserCtx = new Parser(new Lexer(htmlDtl));

                    NodeList ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "printTb lefttable")));
                    if (ctxNode != null && ctxNode.Count > 0)
                    {
                        Parser   parserdiv = new Parser(new Lexer(htmlDtl));
                        NodeList aNodesdiv = parserdiv.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "biuuu_button")));
                        HtmlTxt = ctxNode.AsHtml().Replace(aNodesdiv.AsHtml(), "").Trim();
                        Type     tp     = typeof(ATag);
                        TableTag tabTag = ctxNode[0] as TableTag;

                        string startTime = tabTag.Rows[1].Columns[0].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace("&nbsp;", " ").Trim();
                        Regex  regex     = new Regex(@"时间:\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}");
                        Match  math      = regex.Match(startTime);
                        beginDate = math.Value.Replace("时间:", "");

                        Regex regexcode = new Regex("(工程编号|项目编号):[^\r\n]+[\r\n]{1}");
                        Match match     = regexcode.Match(tabTag.ToPlainTextString());
                        if (match.Value.Length > 0)
                        {
                            code = match.Value.Substring(match.Value.IndexOf(":") + 1).Replace("\r\n", "").Replace("\t", "").Replace("&nbsp;", " ").Trim();
                        }
                        Regex regexBuildUnit = new Regex("(中标人|中标单位):[^\r\n]+[\r\n]{1}");
                        Match matchBuildUnit = regexBuildUnit.Match(tabTag.ToPlainTextString());
                        if (matchBuildUnit.Value.Length > 0)
                        {
                            buildUnit = matchBuildUnit.Value.Substring(matchBuildUnit.Value.IndexOf(":") + 1).Replace("\r\n", "").Replace("\t", "").Replace("&nbsp;", " ").Trim();
                        }
                        Regex regexbidUnit = new Regex("(招标人|建设单位|第一中标候选人):[^\r\n]+[\r\n]{1}");
                        Match matchbidUnit = regexbidUnit.Match(tabTag.ToPlainTextString());
                        if (matchbidUnit.Value.Length > 0)
                        {
                            bidUnit = matchbidUnit.Value.Replace("第一中标候选人:", "").Replace("招标人:", "").Replace("建设单位:", "").Replace("\r\n", "").Replace("\t", "").Replace("&nbsp;", " ").Trim();
                            if (bidUnit.Contains(":"))
                            {
                                bidUnit = bidUnit.Remove(bidUnit.IndexOf(":")).ToString().Trim();
                            }
                        }

                        Regex regexMoney = new Regex("(中标价|其中标价为|中标价格):[^\r\n]+[\r\n]{1}");
                        Match matchMoney = regexMoney.Match(tabTag.ToPlainTextString());
                        if (matchMoney.Value.Length > 0)
                        {
                            bidMoney = matchMoney.Value.Replace("中标价:", "").Replace("其中标价为:", "").Replace("中标价格:", "").Replace("\r", "");
                        }
                        Regex regBidMoney = new Regex(@"[0-9]+[.]{0,1}[0-9]+");
                        if (bidMoney.Contains("万"))
                        {
                            bidMoney = bidMoney.Remove(bidMoney.IndexOf("万")).Trim();
                            bidMoney = regBidMoney.Match(bidMoney).Value;
                        }
                        else
                        {
                            try
                            {
                                bidMoney = (decimal.Parse(regBidMoney.Match(bidMoney).Value) / 10000).ToString();
                                if (decimal.Parse(bidMoney) < decimal.Parse("0.1"))
                                {
                                    bidMoney = "0";
                                }
                            }
                            catch (Exception)
                            {
                                bidMoney = "0";
                            }
                        }
                        if (Encoding.Default.GetByteCount(code) > 50)
                        {
                            code = "";
                        }
                        if (buildUnit == "" || buildUnit == null)
                        {
                            buildUnit = "";
                        }
                        if (Encoding.Default.GetByteCount(buildUnit) > 150)
                        {
                            buildUnit = buildUnit.Substring(0, 150);
                        }
                        if (bidUnit == "" || bidUnit == null)
                        {
                            bidUnit = "";
                        }
                        if (Encoding.Default.GetByteCount(bidUnit) > 150)
                        {
                            bidUnit = bidUnit.Substring(0, 150);
                        }
                        ctx = tabTag.Rows[2].Columns[0].ToPlainTextString().Replace("&nbsp;", " ").Replace("\r\n\r\n\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");
                        if (ctx.Length > 0)
                        {
                            Regex regexCtx = new Regex("<!--[^<]+-->");
                            ctx = regexCtx.Replace(ctx, "");
                        }
                    }

                    parserCtx.Reset();
                    ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "toptd_bai")));
                    Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
                    beginDate = regDate.Match(ctxNode.AsString()).Value.Trim();
                    if (ctx.Contains("公示开始时间"))
                    {
                        beginDate = ctx.Substring(ctx.IndexOf("公示开始时间")).ToString();
                        Regex regBeDate = new Regex(@"\d{4}年\d{1,2}月\d{1,2}日");
                        beginDate = regBeDate.Match(beginDate).Value.Trim();
                    }
                    if (beginDate == "")
                    {
                        beginDate = regDate.Match(ctxNode.AsString()).Value.Trim();
                    }
                    if (beginDate == "")
                    {
                        beginDate = string.Empty;
                    }
                    prjName = ToolDb.GetPrjName(prjName);
                    bidType = ToolHtml.GetInviteTypes(prjName);
                    BidInfo info = ToolDb.GenBidInfo("广东省", "惠州市区", "惠阳区", string.Empty, code, prjName, bidUnit, beginDate, buildUnit, beginDate, endDate, ctx, string.Empty, "惠州市建设工程交易中心", bidType, "建设工程", string.Empty, bidMoney, InfoUrl, string.Empty, HtmlTxt);

                    list.Add(info);
                    ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("target", "_blank")));
                    NodeList aTagNodes = ctxNode.SearchFor(typeof(ATag), true);
                    for (int a = 0; a < aTagNodes.Count; a++)
                    {
                        ATag fileTage = aTagNodes[a] as ATag;
                        if (fileTage.Link.Contains("http://www.ebc.huizhou.gov.cn/index/loadNewsFile"))
                        {
                            string     downloadURL = fileTage.Link;
                            BaseAttach attach      = ToolDb.GenBaseAttach(fileTage.ToPlainTextString(), info.Id, downloadURL);
                            base.AttachList.Add(attach);
                        }
                    }
                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return;
                    }
                }
            }
        }
Beispiel #5
0
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list            = new List <InviteInfo>();
            int    pageInt         = 1;
            string html            = string.Empty;
            string viewState       = string.Empty;
            string eventValidation = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);//ToolSocket.Get("http://www.guanhu.gov.cn/NEWS/Public_Edit.aspx?verid=2f51d6aa-816e-41bb-a331-bce28a4f9554", Encoding.Default);
            }
            catch
            {
                return(list);
            }
            Parser   parser = new Parser(new Lexer(html));
            NodeList sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "Normal")));

            if (sNode != null && sNode.Count > 0)
            {
                try
                {
                    string temp = sNode[0].ToNodePlainString().GetRegexBegEnd("/", "跳转");
                    pageInt = Convert.ToInt32(temp);
                }
                catch { pageInt = 1; }
            }
            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl("http://glbsc.szlhxq.gov.cn/glbsc/zwgk70/zbcg5/zbxxgs/15158-" + i + ".html", Encoding.UTF8);
                    }
                    catch { continue; }
                }
                parser = new Parser(new Lexer(html));
                NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("style", "border-bottom: 1px dashed #333;")));
                if (viewList != null && viewList.Count > 0)
                {
                    for (int j = 0; j < viewList.Count; j++)
                    {
                        TableTag table = viewList[j] as TableTag;
                        string   code = string.Empty, buildUnit = string.Empty, prjName = string.Empty,
                                 prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty,
                                 specType = string.Empty, beginDate = string.Empty, endDate = string.Empty,
                                 remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty,
                                 CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

                        beginDate = table.ToPlainTextString().GetDateRegex();
                        ATag aTag = table.GetATag();
                        prjName = aTag.GetAttribute("title");
                        InfoUrl = "http://glbsc.szlhxq.gov.cn" + aTag.Link;
                        string htlDtl = string.Empty;
                        try
                        {
                            htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
                        }
                        catch { continue; }
                        parser = new Parser(new Lexer(htlDtl));
                        NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "contentbox")));
                        if (dtl != null && dtl.Count > 0)
                        {
                            HtmlTxt    = dtl.AsHtml();
                            inviteCtx  = HtmlTxt.ToCtxString();
                            inviteCtx  = System.Text.RegularExpressions.Regex.Replace(inviteCtx, "<[^>]*>", "").Replace("&nbsp;", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");
                            inviteType = prjName.GetInviteBidType();

                            Regex regPrjAddr = new Regex(@"(工程位置|工程地点|工程地址|详细地址|地点|地址)(:|:)[^\r\n]+\r\n");
                            prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程位置", "").Replace("工程地点", "").Replace("工程地址", "").Replace("详细地址", "").Replace("地点", "").Replace("地址", "").Replace(":", "").Replace(":", "").Trim();
                            Regex regBuildUnit = new Regex(@"(招标代理机构|招标单位|招标人|招标单位(盖章))(:|:)[^\r\n]+\r\n");
                            buildUnit = regBuildUnit.Match(inviteCtx).Value.Replace("招标代理机构", "").Replace("招标单位", "").Replace("招标人", "").Replace("(盖章)", "").Replace(":", "").Replace(":", "").Trim();
                            Regex regPrjCode = new Regex(@"(工程编号|项目编号|编号)(:|:)[^\r\n]+\r\n");
                            code    = regPrjCode.Match(inviteCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Replace(")", "").Replace(")", "").Trim();
                            msgType = "深圳市龙华新区观澜街道办事处";
                            if (string.IsNullOrEmpty(prjAddress) || Encoding.Default.GetByteCount(prjAddress) > 150)
                            {
                                prjAddress = "见招标信息";
                            }
                            code       = ToolHtml.GetSubString(code, 50);
                            buildUnit  = ToolHtml.GetSubString(buildUnit, 150);
                            specType   = "建设工程";
                            inviteType = "小型工程";
                            if (string.IsNullOrEmpty(buildUnit))
                            {
                                buildUnit = "深圳市龙华新区观澜街道办事处";
                            }
                            inviteType = ToolHtml.GetInviteType(inviteType);
                            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                            list.Add(info);
                            if (!crawlAll && list.Count >= this.MaxCount)
                            {
                                return(list);
                            }
                        }
                    }
                }
            }
            return(list);
        }