コード例 #1
0
ファイル: InviteSZYTian.cs プロジェクト: SHNXJMG/Small
        /// <summary>
        /// Crawls the tender-notice listing that starts at <c>SiteUrl</c>, downloads the detail page of
        /// every listed notice and turns each one into an <c>InviteInfo</c> record.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected;
        /// when <c>true</c>, every listing page is processed.
        /// </param>
        /// <returns>
        /// The collected <c>InviteInfo</c> records. The list is empty when the first listing page
        /// cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list      = new ArrayList();
            string htl       = string.Empty;
            string cookiestr = string.Empty;
            int    page      = 1;

            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8, ref cookiestr);
            }
            catch (Exception)
            {
                // Without the first listing page there is nothing to crawl.
                return(list);
            }

            Parser   parser        = new Parser(new Lexer(htl));
            NodeList tableNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("align", "right")));

            if (tableNodeList != null && tableNodeList.Count > 0)
            {
                // The pager text contains "共N页". If it is missing or not a valid number we keep the
                // default of one page instead of letting a FormatException abort the whole crawl.
                Match pageMatch = Regex.Match(tableNodeList.AsString(), @"共(\d+)页");
                int   pageCount;
                if (pageMatch.Success && int.TryParse(pageMatch.Groups[1].Value, out pageCount))
                {
                    page = pageCount;
                }
            }

            // These patterns do not depend on the notice being processed, so build them once.
            Regex regexHtml   = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
            Regex regCode     = new Regex(@"(项目序号|招标编号)(:|:)[^\r\n]+\r\n");
            Regex regBuidUnit = new Regex(@"(招标人|建设单位)(:|:)[^\r\n]+\r\n");

            for (int i = 1; i <= page; i++)
            {
                if (i > 1)
                {
                    // Page 1 was already downloaded above; later pages are named index_1, index_2, ...
                    try
                    {
                        htl = this.ToolWebSite.GetHtmlByUrl("http://www.yantian.gov.cn/cn/zwgk/zfcg/zbgg/index_" + (i - 1).ToString() + ".shtml", Encoding.UTF8);
                    }
                    catch (Exception) { continue; }
                }

                parser = new Parser(new Lexer(htl));
                NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "565")), true), new TagNameFilter("table")));
                if (nodeList == null || nodeList.Count == 0)
                {
                    continue;
                }

                // Href of the last accepted row; consecutive tables with the same link are skipped.
                string url = string.Empty;
                for (int j = 0; j < nodeList.Count; j++)
                {
                    // Rows from which no date can be extracted are not treated as notices.
                    string beg = nodeList[j].ToPlainTextString().GetDateRegex();
                    if (string.IsNullOrEmpty(beg))
                    {
                        continue;
                    }

                    string href = nodeList[j].GetATagHref();
                    if (j > 0 && href == url)
                    {
                        continue;
                    }
                    url = href;

                    TableTag table = nodeList[j] as TableTag;
                    if (table == null)
                    {
                        // Defensive: the filter selects <table> tags, but never dereference a failed cast.
                        continue;
                    }

                    string prjName   = table.GetATagValue("title").Replace("&#41;", ")").Replace("&#40;", "(");
                    string InfoUrl   = "http://www.yantian.gov.cn" + table.GetATagValue();
                    string beginDate = beg;
                    if (prjName.Contains("["))
                    {
                        // Drop everything from the first '[' onwards.
                        prjName = prjName.Remove(prjName.IndexOf("["));
                    }

                    string htmldetail;
                    try
                    {
                        htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).Replace("&nbsp;", "").Replace("<br />", "\r\n").Trim();
                    }
                    catch
                    {
                        // A detail page that cannot be fetched only costs this one notice.
                        continue;
                    }

                    Parser   dtlparser = new Parser(new Lexer(htmldetail));
                    NodeList dtnode    = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "content")));
                    if (dtnode == null || dtnode.Count == 0)
                    {
                        continue;
                    }

                    string HtmlTxt   = dtnode.AsHtml();
                    string inviteCtx = dtnode.AsString().Replace(" ", "").Trim();
                    inviteCtx = regexHtml.Replace(inviteCtx, "");

                    string code = regCode.Match(inviteCtx).Value.Replace("招标编号:", "").Replace("项目序号:", "").Trim();
                    if (Encoding.Default.GetByteCount(code) > 50)
                    {
                        // Over-long matches are discarded (presumably a storage limit -- TODO confirm).
                        code = "";
                    }

                    string msgType  = "深圳市盐田区政府采购中心";
                    string specType = "建设工程";

                    string buildUnit = regBuidUnit.Match(inviteCtx).Value.Replace("招标人:", "").Replace("建设单位:", "").Trim();
                    if (Encoding.Default.GetByteCount(buildUnit) > 150)
                    {
                        buildUnit = "";
                    }

                    // No address is extracted from the page, so the fixed placeholder is always used.
                    string prjAddress = "见招标信息";

                    inviteCtx = inviteCtx.Replace("<ahref=", "").Replace("/service/", "").Replace("</a>", "").Replace("您是第", "").Replace("位访问者粤ICP备06000803号", "").Replace(">", "").Trim();
                    prjName   = prjName.Replace("&ldquo;", "").Replace("&rdquo;", "").Trim();
                    string inviteType = ToolHtml.GetInviteTypes(prjName);

                    // Fields this site does not provide are passed as empty strings.
                    string endDate   = string.Empty;
                    string remark    = string.Empty;
                    string otherType = string.Empty;

                    InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "",
                                                           string.Empty, code, prjName, prjAddress, buildUnit,
                                                           beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                    list.Add(info);
                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return(list);
                    }
                }
            }
            return(list);
        }
コード例 #2
0
ファイル: bidSZYTian.cs プロジェクト: SHNXJMG/Small
        /// <summary>
        /// Crawls the bid-result listing that starts at <c>SiteUrl</c>, downloads the detail page of
        /// every listed notice and turns each one into a <c>BidInfo</c> record.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected;
        /// when <c>true</c>, every listing page is processed.
        /// </param>
        /// <returns>
        /// The collected <c>BidInfo</c> records. The list is empty when the first listing page
        /// cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list      = new ArrayList();
            string htl       = string.Empty;
            string cookiestr = string.Empty;
            int    page      = 1;

            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8, ref cookiestr);
            }
            catch (Exception)
            {
                // Without the first listing page there is nothing to crawl.
                return(list);
            }

            Parser   parser        = new Parser(new Lexer(htl));
            NodeList tableNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("align", "right")));

            if (tableNodeList != null && tableNodeList.Count > 0)
            {
                // The pager text contains "共N页". If it is missing or not a valid number we keep the
                // default of one page instead of letting a FormatException abort the whole crawl.
                Match pageMatch = Regex.Match(tableNodeList.AsString(), @"共(\d+)页");
                int   pageCount;
                if (pageMatch.Success && int.TryParse(pageMatch.Groups[1].Value, out pageCount))
                {
                    page = pageCount;
                }
            }

            // These patterns do not depend on the notice being processed, so build them once.
            Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
            Regex regCode   = new Regex(@"\w{14}");

            for (int i = 1; i <= page; i++)
            {
                if (i > 1)
                {
                    // Page 1 was already downloaded above; later pages are named index_1, index_2, ...
                    try
                    {
                        htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode("http://www.yantian.gov.cn/cn/zwgk/zfcg/zb/index_" + (i - 1).ToString() + ".shtml"), Encoding.UTF8);
                    }
                    catch (Exception) { continue; }
                }

                parser = new Parser(new Lexer(htl));
                NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "565")), true), new TagNameFilter("table")));
                if (nodeList == null || nodeList.Count == 0)
                {
                    continue;
                }

                // Decoded href of the last accepted row; consecutive tables with the same link are skipped.
                string url = string.Empty;
                for (int j = 0; j < nodeList.Count; j++)
                {
                    // Rows from which no date can be extracted are not treated as notices.
                    string beg = nodeList[j].ToPlainTextString().GetDateRegex();
                    if (string.IsNullOrEmpty(beg))
                    {
                        continue;
                    }

                    // Decode the HTML entities once and use the decoded link both for the duplicate
                    // check and for building the detail URL, so the two can never disagree.
                    string href = nodeList[j].GetATagHref().GetReplace("&#61;", "=").GetReplace("&amp;", "&");
                    if (j > 0 && href == url)
                    {
                        continue;
                    }
                    url = href;

                    TableTag table = nodeList[j] as TableTag;
                    if (table == null)
                    {
                        // Defensive: the filter selects <table> tags, but never dereference a failed cast.
                        continue;
                    }

                    string prjName   = table.GetATagValue("title");
                    string beginDate = beg;

                    // Links that do not already contain "http" are treated as site-relative.
                    string InfoUrl = url.IndexOf("http", StringComparison.OrdinalIgnoreCase) >= 0
                        ? url
                        : "http://www.yantian.gov.cn" + url;

                    string htmldetail;
                    string htmltext;
                    try
                    {
                        // One download serves both variants: with and without <br /> turned into line breaks.
                        string stripped = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).Replace("&nbsp;", "");
                        htmldetail = stripped.Replace("<br />", "\r\n").Trim();
                        htmltext   = stripped.Trim();
                    }
                    catch
                    {
                        // A detail page that cannot be fetched only costs this one notice.
                        continue;
                    }

                    Parser   dtlparser = new Parser(new Lexer(htmldetail));
                    NodeList dtnode    = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "content")));

                    BidInfo info;
                    if (dtnode != null && dtnode.Count > 0)
                    {
                        // The stored HTML comes from the variant that keeps the original <br /> tags.
                        Parser   texthtml   = new Parser(new Lexer(htmltext));
                        NodeList listtext   = texthtml.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "content")));
                        string   htmldttext = listtext != null ? listtext.AsHtml() : string.Empty;

                        string HtmlTxt = dtnode.AsHtml();
                        string bidCtx  = dtnode.AsString().Replace(" ", "").Trim();
                        bidCtx = regexHtml.Replace(bidCtx, "");

                        string bidUnit;
                        string bidMoney = string.Empty;

                        dtlparser = new Parser(new Lexer(HtmlTxt));
                        NodeList spanNode = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "holder")), true), new TagNameFilter("table")));
                        if (spanNode != null && spanNode.Count > 1)
                        {
                            // Two or more tables: the unit is extracted from the second table, and its
                            // amount is looked up next to the matching cell in the first table.
                            bidUnit = JoinHeaderWithRows(spanNode[1] as TableTag).GetBidRegex();
                            if (!string.IsNullOrEmpty(bidUnit))
                            {
                                bidMoney = FindMoneyBesideUnit(spanNode[0] as TableTag, bidUnit);
                            }
                        }
                        else if (spanNode != null && spanNode.Count > 0)
                        {
                            string ctx = JoinHeaderWithRows(spanNode[0] as TableTag);
                            bidUnit  = ctx.GetBidRegex();
                            bidMoney = ctx.GetMoneyRegex();
                        }
                        else
                        {
                            // No table at all: fall back to the plain text of the notice.
                            bidUnit  = bidCtx.GetBidRegex();
                            bidMoney = bidCtx.GetMoneyRegex();
                        }

                        string code   = string.Empty;
                        int    codeAt = bidCtx.IndexOf("项目编号:", StringComparison.Ordinal);
                        if (codeAt >= 0)
                        {
                            // The code is the first run of 14 word characters after the label.
                            code = regCode.Match(bidCtx.Substring(codeAt)).Value.Trim();
                        }
                        string buildUnit = bidCtx.GetBuildRegex();

                        bidCtx  = bidCtx.Replace("<ahref=", "").Replace("/service/", "").Replace("</a>", "").Replace("您是第", "").Replace("位访问者粤ICP备06000803号", "").Replace(">", "").Trim();
                        bidCtx  = bidCtx.Replace("&lt;chsdatest=&quot;on&quot;year=&quot;2012&quot;month=&quot;01&quot;day=&quot;16&quot;islunardate=&quot;False&quot;isrocdate=&quot;False&quot;&gt;", "").Replace("&lt;/chsdate&gt;", "").Trim();
                        prjName = prjName.Replace("&ldquo;", "").Replace("&rdquo;", "").Trim();

                        string msgType  = "深圳市盐田区政府采购中心";
                        string specType = "建设工程";
                        // The type is derived from the title before ToolDb.GetPrjName rewrites it.
                        string bidType  = ToolHtml.GetInviteTypes(prjName);
                        prjName = ToolDb.GetPrjName(prjName);
                        if (Encoding.Default.GetByteCount(code) > 50)
                        {
                            // Over-long matches are discarded (presumably a storage limit -- TODO confirm).
                            code = "";
                        }

                        // Fields this site does not provide are passed as empty strings.
                        string endDate   = string.Empty;
                        string otherType = string.Empty;
                        string prjMgr    = string.Empty;

                        info = ToolDb.GenBidInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, buildUnit, beginDate,
                                                 bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType,
                                                 bidMoney, InfoUrl, prjMgr, htmldttext);
                    }
                    else
                    {
                        // The page has no div#content: delegate to the class's GetBidInfo.
                        info = GetBidInfo(prjName, InfoUrl, beginDate, htmldetail);
                    }

                    list.Add(info);
                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return(list);
                    }
                }
            }
            return(list);
        }

        /// <summary>
        /// Flattens a table into "header:value" lines, pairing each cell of every data row (rows 1..n)
        /// with the cell at the same column index in the first row.
        /// </summary>
        /// <param name="table">The table to flatten; may be <c>null</c>.</param>
        /// <returns>
        /// One "header:value" line per cell, each terminated by CRLF; empty when the table is
        /// <c>null</c> or has no data rows.
        /// </returns>
        private static string JoinHeaderWithRows(TableTag table)
        {
            if (table == null || table.RowCount < 2)
            {
                return string.Empty;
            }

            StringBuilder text          = new StringBuilder();
            int           headerColumns = table.Rows[0].ColumnCount;
            for (int t = 1; t < table.RowCount; t++)
            {
                // Cells beyond the header width have no label and are skipped rather than
                // indexing past the end of the header row.
                for (int c = 0; c < table.Rows[t].ColumnCount && c < headerColumns; c++)
                {
                    text.Append(table.Rows[0].Columns[c].ToNodePlainString())
                        .Append(":")
                        .Append(table.Rows[t].Columns[c].ToNodePlainString())
                        .Append("\r\n");
                }
            }
            return text.ToString();
        }

        /// <summary>
        /// Searches the data rows of a table for the first cell whose plain text equals
        /// <paramref name="bidUnit"/> and returns the amount parsed from the cell to its right.
        /// </summary>
        /// <param name="table">The table to search; may be <c>null</c>.</param>
        /// <param name="bidUnit">The exact cell text to look for.</param>
        /// <returns>
        /// The amount taken from the neighbouring cell, or an empty string when there is no matching
        /// cell or the neighbouring cell cannot be read.
        /// </returns>
        private static string FindMoneyBesideUnit(TableTag table, string bidUnit)
        {
            if (table == null)
            {
                return string.Empty;
            }

            for (int t = 1; t < table.RowCount; t++)
            {
                for (int c = 0; c < table.Rows[t].ColumnCount; c++)
                {
                    if (table.Rows[t].Columns[c].ToNodePlainString() != bidUnit)
                    {
                        continue;
                    }

                    // Only the first matching cell is considered.
                    try
                    {
                        return table.Rows[t].Columns[c + 1].ToNodePlainString().Replace(",", "").GetMoney();
                    }
                    catch (Exception)
                    {
                        // Deliberately best-effort: a missing or unparsable neighbouring cell leaves
                        // the amount empty instead of failing the notice.
                        return string.Empty;
                    }
                }
            }
            return string.Empty;
        }