Exemplo n.º 1
0
        /// <summary>
        /// Downloads the listing at <c>SiteUrl</c>, reads the number of the last page from the
        /// "&gt;&gt;" pager link and passes every page to <c>DealHtml</c>, which appends the
        /// records it finds to the returned list.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as the list holds <c>MaxCount</c> entries.
        /// </param>
        /// <returns>
        /// The collected records; an empty list when the initial listing cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList list = new ArrayList();
            // Determine the page count; it stays at 1 when no usable ">>" link is found.
            int pageInt = 1;

            string html = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
            }
            catch (Exception)
            {
                // Best effort: without the first listing page there is nothing to crawl.
                return(list);
            }
            Parser   parser = new Parser(new Lexer(html));
            NodeList sNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "scott"))), new HasChildFilter(new TagNameFilter("a")))).SearchFor(typeof(ATag), true);

            for (int i = 0; i < sNodes.Count; i++)
            {
                ATag aTag = sNodes[i] as ATag;
                if (aTag == null || !aTag.ToPlainTextString().Contains(">>") || aTag.Link == null)
                {
                    continue;
                }

                // The link has the form "gopage(N)"; a malformed link must not abort the crawl.
                int lastPage;
                if (int.TryParse(aTag.Link.ToLower().Replace("gopage(", "").Replace(")", ""), out lastPage))
                {
                    pageInt = lastPage;
                }
            }

            // Process every page, including the first one: each page is requested through the
            // paging form post. A single-page listing is crawled as well.
            string cookiestr = string.Empty;
            for (int i = 1; i <= pageInt; i++)
            {
                NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] { "boardId", "eTime", "newstitle", "pageNO", "sTime", "totalRows", "typeId" }, new string[] { "000000000201", string.Empty, string.Empty, i.ToString(), string.Empty, "0", "000000000002" });

                try
                {
                    html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.Default, ref cookiestr);

                    DealHtml(list, html, crawlAll);
                }
                catch (Exception)
                {
                    // Best effort: a page that fails to download or parse is skipped.
                    continue;
                }

                if (!crawlAll && list.Count >= this.MaxCount)
                {
                    return(list);
                }
            }

            return(list);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Parses one listing page: for each data row of the table with id
        /// <c>ctl00_Content_GridView1</c> it downloads the linked detail page, builds a
        /// <c>NoticeInfo</c>, appends it to <paramref name="list"/> and registers the attachment
        /// links found in the <c>trFujian</c> row in <c>AttachList</c>.
        /// </summary>
        /// <param name="list">Receives the generated <c>NoticeInfo</c> records.</param>
        /// <param name="html">HTML of the listing page.</param>
        /// <param name="crawlAll">
        /// When <c>false</c>, processing stops once <paramref name="list"/> holds <c>MaxCount</c> entries.
        /// </param>
        public void DealHtml(IList list, string html, bool crawlAll)
        {
            Parser   parserDtl = new Parser(new Lexer(html));
            NodeList aNodes    = parserDtl.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_Content_GridView1")));

            if (aNodes == null || aNodes.Count == 0)
            {
                return;
            }

            TableTag table = aNodes[0] as TableTag;
            if (table == null)
            {
                return;
            }

            // The first and the last row are not processed (presumably header and pager rows - confirm).
            for (int t = 1; t < table.RowCount - 1; t++)
            {
                TableRow tr = table.Rows[t] as TableRow;

                // Cells 1, 2 and 5 are read below; skip rows that do not have them instead of
                // throwing and losing the remaining rows of the page.
                if (tr == null || tr.ColumnCount < 6)
                {
                    continue;
                }

                NodeList linkNodes = tr.SearchFor(typeof(ATag), true);
                ATag     aTag      = (linkNodes != null && linkNodes.Count > 0) ? linkNodes[0] as ATag : null;
                if (aTag == null)
                {
                    continue;
                }

                string InfoUrl     = "http://www.szjsjy.com.cn/BusinessInfo/" + aTag.Link;
                string prjCode     = tr.Columns[1].ToNodePlainString().Replace(" ", "");
                string InfoTitle   = tr.Columns[2].ToPlainTextString();
                string PublistTime = tr.Columns[5].ToPlainTextString();
                string InfoType    = "标底公示";
                string htmlDtl;
                try
                {
                    htmlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).Replace("<br>", "\r\n").Replace("&nbsp;", "");
                }
                catch (Exception)
                {
                    // Best effort: a detail page that cannot be downloaded is skipped.
                    continue;
                }

                Parser parserCtx = new Parser(new Lexer(htmlDtl));

                NodeList ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "lblXXNR")));
                string   htmlTxt = ctxNode.AsHtml();
                string   InfoCtx = ctxNode.AsString().Replace("&nbsp;", "");

                NoticeInfo info = ToolDb.GenNoticeInfo("广东省", "深圳市工程", string.Empty, string.Empty, InfoTitle, InfoType, InfoCtx, PublistTime, string.Empty, "深圳市建设工程交易中心", InfoUrl, prjCode, string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, htmlTxt);
                list.Add(info);

                // Rewind the parser before running a second extraction over the same document.
                parserCtx.Reset();
                ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("tr"), new HasAttributeFilter("id", "trFujian")));
                NodeList aTagNodes = ctxNode.SearchFor(typeof(ATag), true);
                for (int a = 0; a < aTagNodes.Count; a++)
                {
                    ATag fileTage = aTagNodes[a] as ATag;
                    if (fileTage == null || fileTage.Link == null)
                    {
                        continue;
                    }

                    string     downloadURL = "http://www.szjsjy.com.cn/" + fileTage.Link.Replace("../", "");
                    BaseAttach attach      = ToolDb.GenBaseAttach(fileTage.ToPlainTextString(), info.Id, downloadURL);
                    base.AttachList.Add(attach);
                }

                if (!crawlAll && list.Count >= this.MaxCount)
                {
                    return;
                }
            }
        }
Exemplo n.º 3
0
        /// <summary>
        /// Crawls the paged tender listing: reads the page count from the "1/N页" pager text,
        /// then for every listing row downloads the detail page, builds an <c>InviteInfo</c>
        /// and registers linked ".DOC" files in <c>AttachList</c>.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as the list holds <c>MaxCount</c> entries.
        /// </param>
        /// <returns>
        /// The collected records; an empty list when the first listing page cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList list = new List <InviteInfo>();
            // Determine the page count; it stays at 1 when the pager text cannot be read.
            int    pageInt = 1;
            string html    = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
            }
            catch
            {
                return(list);
            }

            Parser   parser  = new Parser(new Lexer(html));
            NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("bgColor", "#EEF4F9")));

            if (tdNodes != null && tdNodes.Count > 0)
            {
                string pageTemp  = tdNodes.AsString().Replace("&nbsp;", "").Replace(" ", "").Trim();
                Match  pageMatch = new Regex(@"1/[0-9]+页").Match(pageTemp);
                int    totalPages;
                if (pageMatch.Success && int.TryParse(pageMatch.Value.Split('/')[1].Replace("页", "").Trim(), out totalPages))
                {
                    pageInt = totalPages;
                }
            }

            // These patterns do not depend on the row, so they are built once.
            Regex regexHtml    = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
            Regex regCtx       = new Regex(@"([\r\n]+)|([\t]+)");
            Regex regBeginDate = new Regex(@"发布时间:[^\r\n]+\r\n");

            for (int i = 1; i <= pageInt; i++)
            {
                // Page 1 was already downloaded above.
                if (i > 1)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl("http://www.szzdzb.cn/Product-index-id-8-p-" + i + ".html", Encoding.UTF8);
                    }
                    catch
                    {
                        continue;
                    }
                }

                parser = new Parser(new Lexer(html));
                NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "hangao27"))), new TagNameFilter("table")));

                if (nodeList == null || nodeList.Count == 0)
                {
                    continue;
                }

                TableTag table = nodeList[0] as TableTag;
                if (table == null)
                {
                    continue;
                }

                // Row 0 is not processed (presumably the header row - confirm).
                for (int j = 1; j < table.RowCount; j++)
                {
                    string   buildUnit = string.Empty, prjAddress = string.Empty, endDate = string.Empty, remark = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;
                    TableRow tr = table.Rows[j];

                    // Cells 0 and 1 are read below; a row without them or without a link is
                    // skipped instead of aborting the whole crawl.
                    if (tr == null || tr.ColumnCount < 2)
                    {
                        continue;
                    }

                    NodeList linkNodes = tr.Columns[1].SearchFor(typeof(ATag), true);
                    ATag     aTag      = (linkNodes != null && linkNodes.Count > 0) ? linkNodes[0] as ATag : null;
                    if (aTag == null)
                    {
                        continue;
                    }

                    string code       = tr.Columns[0].ToPlainTextString().Trim();
                    string prjName    = tr.Columns[1].ToPlainTextString().Trim();
                    string InfoUrl    = "http://www.szzdzb.cn" + aTag.Link;
                    string htmldetail = string.Empty;
                    try
                    {
                        // Download the detail page once and derive both variants from it:
                        // the raw one for the stored HTML and the line-broken one for the text.
                        string rawDetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).Replace("&nbsp;", "");

                        Parser   dtlparserHTML = new Parser(new Lexer(rawDetail.Trim()));
                        NodeList dtnodeHTML    = dtlparserHTML.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "hangao27"))), new TagNameFilter("table")));
                        HtmlTxt = dtnodeHTML.AsHtml();

                        htmldetail = rawDetail.Replace("</br>", "\r\n").Replace("<br>", "\r\n");
                        htmldetail = regexHtml.Replace(htmldetail, "");
                    }
                    catch (Exception)
                    {
                        // Best effort: a detail page that cannot be downloaded or parsed is skipped.
                        continue;
                    }

                    Parser   dtlparser = new Parser(new Lexer(htmldetail));
                    NodeList dtnode    = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "hangao27"))), new TagNameFilter("table")));

                    string inviteCtx = System.Web.HttpUtility.HtmlDecode(dtnode.AsString().Replace("打印本页 || 关闭窗口", ""));
                    // Collapse runs of line breaks or tabs into a single line break.
                    inviteCtx = regCtx.Replace(inviteCtx, "\r\n");

                    string beginDate  = regBeginDate.Match(inviteCtx).Value.Replace("发布时间", "").Replace(":", "").Trim();
                    string specType   = "其他";
                    string msgType    = "深圳市振东招标代理有限公司";
                    string inviteType = ToolHtml.GetInviteTypes(prjName);

                    InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                    list.Add(info);

                    // Rewind the parser before running a second extraction over the same document.
                    dtlparser.Reset();
                    dtnode = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("bgColor", "#CCCCCC")));
                    NodeList FileTag = dtnode.SearchFor(typeof(ATag), true);
                    if (FileTag != null)
                    {
                        for (int f = 0; f < FileTag.Count; f++)
                        {
                            ATag file = FileTag[f] as ATag;
                            if (file != null && file.Link != null && file.Link.ToUpper().Contains(".DOC"))
                            {
                                BaseAttach attach = ToolDb.GenBaseAttach(file.ToPlainTextString(), info.Id, "http://www.szzdzb.cn" + file.Link);
                                base.AttachList.Add(attach);
                            }
                        }
                    }

                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return(list);
                    }
                }
            }

            return(list);
        }
Exemplo n.º 4
0
        /// <summary>
        /// Parses one listing page: for each data row of the <c>lefttable</c> table it downloads
        /// the linked detail page, extracts code, build unit, address, dates and content, builds
        /// an <c>InviteInfo</c>, appends it to <paramref name="list"/> and registers attachment
        /// links in <c>AttachList</c>.
        /// </summary>
        /// <param name="list">Receives the generated <c>InviteInfo</c> records.</param>
        /// <param name="html">HTML of the listing page.</param>
        /// <param name="crawlAll">
        /// When <c>false</c>, processing stops once <paramref name="list"/> holds <c>MaxCount</c> entries.
        /// </param>
        public void DealHtml(IList list, string html, bool crawlAll)
        {
            Parser   parserDtl = new Parser(new Lexer(html));
            NodeList aNodes    = parserDtl.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "lefttable")));

            if (aNodes == null || aNodes.Count == 0)
            {
                return;
            }

            TableTag table = aNodes[0] as TableTag;
            if (table == null)
            {
                return;
            }

            // These patterns do not depend on the row, so they are built once.
            Regex regexHtml      = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
            Regex regexStart     = new Regex(@"时间:\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}");
            Regex regexcode      = new Regex("(工程编号|项目编号|招标编号):[^\r\n]+[\r\n]{1}");
            Regex regexBuildUnit = new Regex("(招标人|建设单位|招标采购单位):[^\r\n]+[\r\n]{1}");
            Regex regexAddress   = new Regex("(建设地点|项目地点|工程地点):[^\r\n]+[\r\n]{1}");
            Regex regexCtx       = new Regex("<!--[^<]+-->");
            Regex regDate        = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");

            // The first and the last row are not processed (presumably header and pager rows - confirm).
            for (int t = 1; t < table.RowCount - 1; t++)
            {
                string code = string.Empty, buildUnit = string.Empty, prjAddress = string.Empty,
                       beginDate = string.Empty, remark = string.Empty, ctx = string.Empty, HtmlTxt = string.Empty;

                TableRow tr = table.Rows[t] as TableRow;

                // Cells 1 and 2 are read below; a row without them or without a link is skipped
                // instead of throwing and losing the remaining rows of the page.
                if (tr == null || tr.ColumnCount < 3)
                {
                    continue;
                }

                NodeList linkNodes = tr.SearchFor(typeof(ATag), true);
                ATag     aTag      = (linkNodes != null && linkNodes.Count > 0) ? linkNodes[0] as ATag : null;
                if (aTag == null)
                {
                    continue;
                }

                string InfoUrl = aTag.Link;
                string prjName = tr.Columns[1].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace("&nbsp;", " ").Trim();
                string endDate = tr.Columns[2].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace("&nbsp;", " ").Trim();
                string htmlDtl;
                try
                {
                    htmlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
                }
                catch (Exception)
                {
                    // Best effort: a detail page that cannot be downloaded is skipped.
                    continue;
                }
                htmlDtl = regexHtml.Replace(htmlDtl, "");
                Parser parserCtx = new Parser(new Lexer(htmlDtl));

                NodeList ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "printTb lefttable")));
                if (ctxNode != null && ctxNode.Count > 0)
                {
                    Parser   parserdiv = new Parser(new Lexer(htmlDtl));
                    NodeList aNodesdiv = parserdiv.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "biuuu_button")));

                    // string.Replace throws ArgumentException for an empty search string, which
                    // is what AsHtml() yields when the button div is absent.
                    string buttonHtml = aNodesdiv.AsHtml();
                    HtmlTxt = ctxNode.AsHtml();
                    if (!string.IsNullOrEmpty(buttonHtml))
                    {
                        HtmlTxt = HtmlTxt.Replace(buttonHtml, "");
                    }
                    HtmlTxt = HtmlTxt.Trim();

                    TableTag tabTag = ctxNode[0] as TableTag;
                    if (tabTag != null)
                    {
                        if (tabTag.RowCount > 1 && tabTag.Rows[1].ColumnCount > 0)
                        {
                            string startTime = tabTag.Rows[1].Columns[0].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace("&nbsp;", " ").Trim();
                            beginDate = regexStart.Match(startTime).Value.Replace("时间:", "").Replace("\r\n", "").Replace("\t", "").Replace("&nbsp;", " ").Trim();
                        }

                        string tableText = tabTag.ToPlainTextString();

                        // When a pattern does not match, Value is empty and the result is "".
                        Match match = regexcode.Match(tableText);
                        code = match.Value.Substring(match.Value.IndexOf(":") + 1).Replace("\r\n", "").Replace("\t", "").Replace("&nbsp;", " ").Trim();

                        Match matchBuildUnit = regexBuildUnit.Match(tableText);
                        buildUnit = matchBuildUnit.Value.Substring(matchBuildUnit.Value.IndexOf(":") + 1).Replace("\r\n", "").Replace("\t", "").Replace("&nbsp;", " ").Trim();

                        Match matchAddress = regexAddress.Match(tableText);
                        prjAddress = matchAddress.Value.Substring(matchAddress.Value.IndexOf(":") + 1).Trim();

                        if (tabTag.RowCount > 2 && tabTag.Rows[2].ColumnCount > 0)
                        {
                            ctx = tabTag.Rows[2].Columns[0].ToPlainTextString().Replace("&nbsp;", " ").Replace("\r\n\r\n\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");
                        }
                        if (ctx.Length > 0)
                        {
                            // Strip HTML comments from the content.
                            ctx = regexCtx.Replace(ctx, "");
                        }

                        if (Encoding.Default.GetByteCount(code) > 50)
                        {
                            code = "";
                        }
                        if (Encoding.Default.GetByteCount(buildUnit) > 150)
                        {
                            // The limit is checked in bytes, so cut by bytes as well: a string of
                            // multi-byte characters can exceed 150 bytes with fewer than 150
                            // characters, in which case Substring(0, 150) would throw.
                            int keep = Math.Min(buildUnit.Length, 150);
                            while (keep > 0 && Encoding.Default.GetByteCount(buildUnit.Substring(0, keep)) > 150)
                            {
                                keep--;
                            }
                            buildUnit = buildUnit.Substring(0, keep);
                        }
                        if (Encoding.Default.GetByteCount(prjAddress) > 200)
                        {
                            prjAddress = "见招标公告内容";
                        }
                        if (beginDate.Length > 0 && endDate.Length > 0)
                        {
                            // A date that fails to parse is left at DateTime.MinValue, so an
                            // unparsable begin date never clears the end date, while an
                            // unparsable end date does.
                            DateTime begin;
                            DateTime end;
                            DateTime.TryParse(beginDate, out begin);
                            DateTime.TryParse(endDate, out end);
                            if (begin > end)
                            {
                                endDate = string.Empty;
                            }
                        }
                    }
                }

                parserCtx.Reset();

                // NOTE(review): this unconditionally replaces the begin date extracted from the
                // detail table above with the first date found in the "toptd_bai" cell - confirm
                // that this is intended.
                ctxNode   = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "toptd_bai")));
                beginDate = regDate.Match(ctxNode.AsString()).Value.Trim();

                string     inviteType = ToolHtml.GetInviteTypes(prjName);
                InviteInfo info       = ToolDb.GenInviteInfo("广东省", "惠州市区", "惠东县", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, ctx, remark, "惠州市建设工程交易中心", inviteType, "建设工程", string.Empty, InfoUrl, HtmlTxt);
                list.Add(info);

                // Rewind the parser again: the previous extraction consumed the document, so the
                // attachment lookup needs a reset like every other repeated extraction here.
                parserCtx.Reset();
                ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("target", "_blank")));
                NodeList aTagNodes = ctxNode.SearchFor(typeof(ATag), true);
                for (int a = 0; a < aTagNodes.Count; a++)
                {
                    ATag fileTage = aTagNodes[a] as ATag;
                    if (fileTage != null && fileTage.Link != null && fileTage.Link.Contains("http://www.ebc.huizhou.gov.cn/index/loadNewsFile"))
                    {
                        string     downloadURL = fileTage.Link;
                        BaseAttach attach      = ToolDb.GenBaseAttach(fileTage.ToPlainTextString(), info.Id, downloadURL);
                        base.AttachList.Add(attach);
                    }
                }

                if (!crawlAll && list.Count >= this.MaxCount)
                {
                    return;
                }
            }
        }
Exemplo n.º 5
0
        /// <summary>
        /// Crawls the paged bid-result listing: reads the page count from the <c>pagePanel</c>
        /// div, then for every list item downloads the detail page, extracts the bid fields from
        /// its <c>content</c> div, builds a <c>BidInfo</c> and registers attachment links in
        /// <c>AttachList</c>.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as the list holds <c>MaxCount</c> entries.
        /// </param>
        /// <returns>
        /// The collected records; an empty list when the first listing page cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList list = new List <BidInfo>();
            // Determine the page count; it stays at 1 when the pager text cannot be read.
            int    pageInt = 1;
            string html    = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl);
            }
            catch
            {
                return(list);
            }

            Parser   parser  = new Parser(new Lexer(html));
            NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagePanel")));

            if (tdNodes != null && tdNodes.Count > 0)
            {
                string pageTemp = tdNodes.AsString().Replace("&nbsp;", "").Replace(" ", "").Trim();
                try
                {
                    pageInt = int.Parse(pageTemp.GetRegexBegEnd("总", "页"));
                }
                catch (Exception)
                {
                    // Best effort: keep the default of one page when the count cannot be read.
                }
            }
            for (int i = 1; i <= pageInt; i++)
            {
                // Page 1 was already downloaded above.
                if (i > 1)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "&page=" + i);
                    }
                    catch { continue; }
                }
                parser = new Parser(new Lexer(html));
                NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "ST12")), true), new TagNameFilter("li")));

                if (nodeList == null || nodeList.Count == 0)
                {
                    continue;
                }

                for (int j = 0; j < nodeList.Count; j++)
                {
                    string endDate = string.Empty, otherType = string.Empty;

                    // An item without a link is skipped instead of aborting the whole crawl.
                    ATag aTag = nodeList[j].GetATag();
                    if (aTag == null)
                    {
                        continue;
                    }

                    string prjName = aTag.GetAttribute("title");
                    if (prjName == null)
                    {
                        // No title attribute: fall back to the link text.
                        prjName = aTag.ToPlainTextString().Trim();
                    }
                    if (prjName.Contains("声明"))
                    {
                        continue;
                    }

                    string beginDate  = nodeList[j].ToPlainTextString().GetDateRegex();
                    string InfoUrl    = aTag.Link;
                    string htmldetail = string.Empty;
                    try
                    {
                        htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
                    }
                    catch { continue; }

                    Parser   dtlparser = new Parser(new Lexer(htmldetail));
                    NodeList dtnode    = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "content"), new TagNameFilter("div")));

                    if (dtnode == null || dtnode.Count == 0)
                    {
                        continue;
                    }

                    string HtmlTxt = dtnode.AsHtml().GetReplace("<!--[if !supportLists]-->,<!--[endif]-->");
                    string bidCtx  = HtmlTxt.ToCtxString();

                    string code      = bidCtx.GetCodeRegex().GetCodeDel();
                    string buildUnit = bidCtx.GetBuildRegex();
                    if (string.IsNullOrEmpty(buildUnit))
                    {
                        buildUnit = bidCtx.GetRegex("采购单位,招标代理");
                    }

                    // Cut the bidder name right after the first "公司".
                    string bidUnit = bidCtx.GetBidRegex();
                    if (!string.IsNullOrEmpty(bidUnit) && bidUnit.Contains("公司"))
                    {
                        bidUnit = bidUnit.Remove(bidUnit.IndexOf("公司")) + "公司";
                    }

                    string bidMoney = bidCtx.GetMoneyRegex();
                    if (bidMoney == "0" || string.IsNullOrWhiteSpace(bidMoney))
                    {
                        bidMoney = bidCtx.GetMoneyRegex(new string[] { "中标金额" }, false, "万元");
                    }
                    string prjMgr = bidCtx.GetMgrRegex();

                    // Amounts of 100000 or more are divided by 10000; a value that is not a
                    // number is left untouched.
                    decimal money;
                    if (decimal.TryParse(bidMoney, out money) && money >= 100000)
                    {
                        bidMoney = (money / 10000).ToString();
                    }

                    string specType = "政府采购";
                    string msgType  = "中国远东国际招标公司";
                    string bidType  = ToolHtml.GetInviteTypes(prjName);

                    BidInfo info = ToolDb.GenBidInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                    list.Add(info);

                    // Attachments are looked up in the extracted content only.
                    dtlparser = new Parser(new Lexer(HtmlTxt));
                    NodeList FileTag = dtlparser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                    if (FileTag != null)
                    {
                        for (int f = 0; f < FileTag.Count; f++)
                        {
                            ATag file = FileTag[f] as ATag;
                            if (file == null || !file.IsAtagAttach())
                            {
                                continue;
                            }

                            // Links that do not contain "http" are treated as relative to the site root.
                            string link = file.Link.ToLower().Contains("http")
                                ? file.Link
                                : "http://www.cfet.com.cn/" + file.Link;

                            BaseAttach attach = ToolDb.GenBaseAttach(file.ToPlainTextString(), info.Id, link);
                            base.AttachList.Add(attach);
                        }
                    }

                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return(list);
                    }
                }
            }
            return(list);
        }
Exemplo n.º 6
0
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList list = new ArrayList();
            //取得页码
            int    pageInt         = 1;
            int    crawlMax        = 0;
            string html            = string.Empty;
            string viewState       = string.Empty;
            string eventValidation = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default).Replace("&nbsp;", "");
            }
            catch (Exception ex)
            {
                return(list);
            }
            Parser   parser = new Parser(new Lexer(html));
            NodeList sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "ny_21 tc")));

            if (sNode != null && sNode.Count > 0)
            {
                string pageString = sNode.AsString().Trim();
                Regex  regexPage  = new Regex(@"createPageHTML\([^\)]+\)");
                Match  pageMatch  = regexPage.Match(pageString);
                try { pageInt = int.Parse(pageMatch.Value.Replace("createPageHTML(", "").Replace(")", "").Split(',')[0].Trim()); }
                catch (Exception) { }
            }
            string cookiestr = string.Empty;

            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try { html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "index_" + (i - 1).ToString() + ".html", Encoding.Default); }
                    catch (Exception ex) { continue; }
                }
                parser = new Parser(new Lexer(html));
                sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "ny_22"))), new TagNameFilter("li")));
                if (sNode != null && sNode.Count > 0)
                {
                    for (int j = 0; j < sNode.Count; j++)
                    {
                        string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty, code = string.Empty, bidDate = string.Empty, beginDate = string.Empty, endDate = string.Empty, bidType = string.Empty, specType = string.Empty, InfoUrl = string.Empty, msgType = string.Empty, bidCtx = string.Empty, prjAddress = string.Empty, remark = string.Empty, prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;
                        INode  node   = sNode[j];
                        ATag   aTag   = node.Children.SearchFor(typeof(ATag), true)[0] as ATag;
                        Div    divTag = node.Children.SearchFor(typeof(Div), true)[1] as Div;
                        prjName   = aTag.ToPlainTextString().Trim();
                        beginDate = divTag.ToPlainTextString().Trim(new char[] { '[', ']', ' ' });
                        InfoUrl   = aTag.Link.Replace("./", "http://ztb.gaoming.gov.cn/jsgc/zbjg/");

                        string htmldetail = string.Empty;
                        try
                        {
                            htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).Replace("&nbsp;", "").Trim();
                            Parser   dtlparserHTML = new Parser(new Lexer(htmldetail));
                            NodeList dtnodeHTML    = dtlparserHTML.ExtractAllNodesThatMatch(new HasParentFilter(new AndFilter(new HasAttributeFilter("class", "con_10 tl"), new TagNameFilter("div"))));
                            HtmlTxt    = dtnodeHTML.AsHtml();
                            htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).Replace("&nbsp;", "").Replace("</br>", "\r\n").Replace("<br>", "\r\n").Replace("<br/>", "\r\n");
                        }
                        catch (Exception ex) { continue; }
                        Parser   dtlparser = new Parser(new Lexer(htmldetail));
                        NodeList dtnode    = dtlparser.ExtractAllNodesThatMatch(new HasParentFilter(new AndFilter(new HasAttributeFilter("class", "con_10 tl"), new TagNameFilter("div"))));

                        if (dtnode != null && dtnode.Count > 0)
                        {
                            Regex regCtx = new Regex(@"[\n]+");
                            bidCtx = regCtx.Replace(dtnode.AsString().Replace(" ", "").Trim(), "\r\n");
                            TableTag table = dtnode.SearchFor(typeof(TableTag), true)[0] as TableTag;
                            for (int dl = 0; dl < table.RowCount; dl++)
                            {
                                TableRow tr = table.Rows[dl];
                                if (tr.Columns[0].ToPlainTextString().Contains("编号"))
                                {
                                    code = tr.Columns[1].ToPlainTextString().Trim();
                                }
                                else if (tr.Columns[0].ToPlainTextString().Contains("招标单位"))
                                {
                                    buildUnit = tr.Columns[1].ToPlainTextString().Trim();
                                }
                                else if (tr.Columns[0].ToPlainTextString().Contains("中标单位"))
                                {
                                    bidUnit = tr.Columns[1].ToPlainTextString().Trim();
                                }
                                else if (tr.Columns[0].ToPlainTextString().Contains("建造师") || tr.Columns[0].ToPlainTextString().Contains("负责人") || tr.Columns[0].ToPlainTextString().Contains("法定代表人"))
                                {
                                    prjMgr = tr.Columns[1].ToPlainTextString().Replace(" ", "").Trim();
                                }
                                else if (tr.Columns[0].ToPlainTextString().Contains("中标价"))
                                {
                                    Regex  regBidMoney = new Regex(@"[0-9]+[.]{0,1}[0-9]+");
                                    Regex  regmoneyctx = new Regex(@"[0-9]+[\%]");
                                    string bidMoneyctx = regmoneyctx.Replace(tr.Columns[1].ToPlainTextString(), "");
                                    if (!string.IsNullOrEmpty(bidMoneyctx))
                                    {
                                        if (tr.Columns[1].ToPlainTextString().Contains("万元"))
                                        {
                                            bidMoney = regBidMoney.Match(bidMoneyctx).Value;
                                        }
                                        else
                                        {
                                            try
                                            {
                                                bidMoney = (decimal.Parse(regBidMoney.Match(bidMoneyctx).Value) / 10000).ToString();
                                                if (decimal.Parse(bidMoney) < decimal.Parse("0.1"))
                                                {
                                                    bidMoney = "0";
                                                }
                                            }
                                            catch (Exception)
                                            {
                                                bidMoney = "0";
                                            }
                                        }
                                    }
                                }
                            }
                        }
                        if (Encoding.Default.GetByteCount(bidUnit) > 150)
                        {
                            try
                            {
                                if (bidUnit.Contains("第二标段"))
                                {
                                    bidUnit = bidUnit.Remove(bidUnit.IndexOf("\n")).Replace("第一标段", "").Replace(":", "").Replace(":", "");
                                }
                            }
                            catch { }
                        }

                        msgType  = "佛山市高明区建设工程交易中心";
                        specType = "建设工程";
                        prjName  = ToolDb.GetPrjName(prjName);
                        bidType  = ToolHtml.GetInviteTypes(prjName);
                        BidInfo info = ToolDb.GenBidInfo("广东省", "佛山市区", "高明区", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                        list.Add(info);
                        if (!crawlAll && list.Count >= this.MaxCount)
                        {
                            return(list);
                        }
                    }
                }
            }
            return(list);
        }
Exemplo n.º 7
0
        /// <summary>
        /// Crawls the paged listing at <c>SiteUrl</c> and builds one <c>InviteInfo</c> record per listing row,
        /// enriched with the content of the row's detail page.
        /// </summary>
        /// <remarks>
        /// Links found on a detail page whose URL contains ".doc" (case-insensitive) are registered as
        /// attachments in <c>base.AttachList</c>. Pages and rows whose download fails are skipped.
        /// </remarks>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected.
        /// </param>
        /// <returns>
        /// The records collected so far; an empty list when the first listing page cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList list = new ArrayList();
            int pageInt = 1;
            string html = string.Empty;
            string viewState = string.Empty;
            string eventValidation = string.Empty;
            string cookiestr = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default, ref cookiestr);
            }
            catch (Exception)
            {
                // Without the first listing page there is nothing to crawl.
                return list;
            }

            Parser parser = new Parser(new Lexer(html));
            NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "PageDataList")));
            if (tdNodes == null)
            {
                return list;
            }

            // Derive the page count from the record total printed in the pager table.
            // The arithmetic assumes 15 rows per page; when the total cannot be read a single page is crawled.
            string pageTemp = tdNodes.AsString().Replace("&nbsp;", "").Replace(" ", "").Trim();
            Regex regpage = new Regex(@"共[0-9]+条");
            int pageCount;
            if (int.TryParse(regpage.Match(pageTemp).Value.Replace("共", "").Replace("条", "").Trim(), out pageCount))
            {
                pageInt = pageCount / 15 + (pageCount % 15 > 0 ? 1 : 0);
            }

            // Strips <script> blocks and XML processing instructions from detail pages before parsing.
            Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");

            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    // Later pages are requested by posting the ASP.NET form state of the page fetched last.
                    // NOTE(review): the event target is the same for every page - presumably the pager's
                    // "next page" link; confirm against the site.
                    viewState = this.ToolWebSite.GetAspNetViewState(html);
                    eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
                    NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] {
                        "cataId",
                        "find_yn",
                        "key_word",
                        "typeId",
                        "__EVENTARGUMENT",
                        "__EVENTTARGET",
                        "__EVENTVALIDATION",
                        "__VIEWSTATE"
                    }, new string[] {
                        "1,2,3,4,5,6,7,8,",
                        string.Empty,
                        string.Empty,
                        "1,2,3,4,5,6,7,8,",
                        string.Empty,
                        "PageDataList$ctl12$LinkButton1",
                        eventValidation,
                        viewState
                    });
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.Default, ref cookiestr);
                    }
                    catch (Exception)
                    {
                        // Skip this page; html keeps the previously fetched page for the next post-back.
                        continue;
                    }
                }

                parser = new Parser(new Lexer(html));
                NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("id", "StaffList"), new TagNameFilter("table")));
                if (nodeList == null || nodeList.Count == 0)
                {
                    continue;
                }

                TableTag table = nodeList[0] as TableTag;
                if (table == null)
                {
                    continue;
                }

                // Row 0 is skipped (loop starts at 1).
                for (int j = 1; j < table.RowCount; j++)
                {
                    string endDate = string.Empty, remark = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

                    TableRow tr = table.Rows[j];
                    string code = tr.Columns[0].ToPlainTextString().Trim();
                    string prjName = tr.Columns[1].ToPlainTextString().Trim();
                    string beginDate = tr.Columns[2].ToPlainTextString().Trim();

                    // A row without a link has no detail page to visit.
                    NodeList rowLinks = tr.Columns[1].SearchFor(typeof(ATag), true);
                    if (rowLinks == null || rowLinks.Count == 0)
                    {
                        continue;
                    }
                    ATag aTag = rowLinks[0] as ATag;
                    if (aTag == null)
                    {
                        continue;
                    }
                    string InfoUrl = "http://www.cobo91.com/project/" + aTag.Link;

                    string htmldetail = string.Empty;
                    try
                    {
                        // Download the detail page once and derive both variants from the same text:
                        // the HTML snapshot and the line-break-normalised text used for field extraction.
                        string rawDetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
                        string withoutNbsp = rawDetail.Replace("&nbsp;", "");

                        Parser dtlparserHTML = new Parser(new Lexer(withoutNbsp.Trim()));
                        NodeList dtnodeHTML = dtlparserHTML.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "info"), new TagNameFilter("table")));
                        HtmlTxt = dtnodeHTML.AsHtml();

                        htmldetail = regexHtml.Replace(withoutNbsp.Replace("</br>", "\r\n").Replace("<br>", "\r\n"), "");
                    }
                    catch (Exception)
                    {
                        continue;
                    }

                    Parser dtlparser = new Parser(new Lexer(htmldetail));
                    NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "info"), new TagNameFilter("table")));

                    string inviteCtx = System.Web.HttpUtility.HtmlDecode(dtnode.AsString().Replace("【打印本页】", "").Replace("【关闭窗口】", "").Replace("版权所有:中邦国际招标&邦迪工程顾问", ""));
                    string buildUnit = inviteCtx.GetBuildRegex();
                    string prjAddress = inviteCtx.GetAddressRegex();
                    if (string.IsNullOrEmpty(buildUnit))
                    {
                        buildUnit = inviteCtx.GetRegex("采购单位,招标代理");
                    }
                    if (string.IsNullOrEmpty(prjAddress))
                    {
                        prjAddress = inviteCtx.GetRegex("开标地点");
                    }

                    string specType = "其他";
                    string msgType = "中邦国际招标&邦迪工程顾问";
                    string inviteType = ToolHtml.GetInviteTypes(prjName);
                    InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                    list.Add(info);

                    // Rewind the detail parser and collect document links as attachments of this record.
                    dtlparser.Reset();
                    NodeList FileTag = dtlparser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                    if (FileTag != null)
                    {
                        for (int f = 0; f < FileTag.Count; f++)
                        {
                            ATag file = FileTag[f] as ATag;
                            if (file == null || file.Link == null)
                            {
                                continue;
                            }
                            if (file.Link.IndexOf(".doc", StringComparison.OrdinalIgnoreCase) >= 0)
                            {
                                BaseAttach attach = ToolDb.GenBaseAttach(file.ToPlainTextString(), info.Id, "http://www.cobo91.com" + file.Link);
                                base.AttachList.Add(attach);
                            }
                        }
                    }

                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return list;
                    }
                }
            }
            return list;
        }
Exemplo n.º 8
0
        /// <summary>
        /// Parses one listing page of bid results, downloads the detail page of every listed row and
        /// appends one <c>BidInfo</c> record per row to <paramref name="list"/>.
        /// </summary>
        /// <remarks>
        /// Download links found on a detail page are registered in <c>base.AttachList</c>.
        /// Rows whose detail page cannot be downloaded are skipped.
        /// </remarks>
        /// <param name="list">Receives the generated records.</param>
        /// <param name="html">HTML of the listing page; it is expected to contain a table with class "lefttable".</param>
        /// <param name="crawlAll">
        /// When <c>false</c>, processing stops as soon as <paramref name="list"/> holds <c>MaxCount</c> records.
        /// </param>
        public void DealHtml(IList list, string html, bool crawlAll)
        {
            Parser parserDtl = new Parser(new Lexer(html));
            NodeList aNodes = parserDtl.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "lefttable")));
            if (aNodes == null || aNodes.Count == 0)
            {
                return;
            }

            TableTag table = aNodes[0] as TableTag;
            if (table == null)
            {
                return;
            }

            // The patterns do not depend on the row, so they are built once.
            Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
            Regex regexcode = new Regex("(工程编号|项目编号):[^\r\n]+[\r\n]{1}");
            Regex regexBuildUnit = new Regex("(中标人|中标单位):[^\r\n]+[\r\n]{1}");
            Regex regexbidUnit = new Regex("(招标人|建设单位|第一中标候选人):[^\r\n]+[\r\n]{1}");
            Regex regexMoney = new Regex("(中标价|其中标价为|中标价格):[^\r\n]+[\r\n]{1}");
            Regex regBidMoney = new Regex(@"[0-9]+[.]{0,1}[0-9]+");
            Regex regexCtx = new Regex("<!--[^<]+-->");
            Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
            Regex regBeDate = new Regex(@"\d{4}年\d{1,2}月\d{1,2}日");

            // The first and the last table row are not data rows for this loop
            // (presumably header and pager - confirm against the site).
            for (int t = 1; t < table.RowCount - 1; t++)
            {
                string code = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty,
                       ctx = string.Empty, HtmlTxt = string.Empty;

                TableRow tr = table.Rows[t] as TableRow;

                // A row without a link has no detail page to visit.
                NodeList rowLinks = tr.SearchFor(typeof(ATag), true);
                if (rowLinks == null || rowLinks.Count == 0)
                {
                    continue;
                }
                ATag aTag = rowLinks[0] as ATag;
                if (aTag == null)
                {
                    continue;
                }

                string InfoUrl = aTag.Link;
                string prjName = table.Rows[t].Columns[1].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace("&nbsp;", " ").Trim();
                string endDate = table.Rows[t].Columns[2].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace("&nbsp;", " ").Trim();

                string htmlDtl;
                try
                {
                    htmlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
                }
                catch (Exception)
                {
                    continue;
                }
                htmlDtl = regexHtml.Replace(htmlDtl, "");

                Parser parserCtx = new Parser(new Lexer(htmlDtl));
                NodeList ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "printTb lefttable")));
                if (ctxNode != null && ctxNode.Count > 0)
                {
                    // Snapshot of the detail table with the button bar (div#biuuu_button) cut out.
                    Parser parserdiv = new Parser(new Lexer(htmlDtl));
                    NodeList aNodesdiv = parserdiv.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "biuuu_button")));
                    string snapshot = ctxNode.AsHtml();
                    string buttonHtml = aNodesdiv == null ? string.Empty : aNodesdiv.AsHtml();
                    // string.Replace throws ArgumentException for an empty search string,
                    // which is what a page without the button bar would produce.
                    if (!string.IsNullOrEmpty(buttonHtml))
                    {
                        snapshot = snapshot.Replace(buttonHtml, "");
                    }
                    HtmlTxt = snapshot.Trim();

                    TableTag tabTag = ctxNode[0] as TableTag;
                    string plainText = tabTag.ToPlainTextString();

                    Match match = regexcode.Match(plainText);
                    if (match.Value.Length > 0)
                    {
                        code = match.Value.Substring(match.Value.IndexOf(":") + 1).Replace("\r\n", "").Replace("\t", "").Replace("&nbsp;", " ").Trim();
                    }

                    // NOTE: the local names are swapped relative to the labels they read:
                    // buildUnit is filled from the "中标人/中标单位" label and bidUnit from the
                    // "招标人/建设单位/第一中标候选人" labels. The GenBidInfo call below passes them
                    // in the matching swapped order, so both must be kept in step.
                    Match matchBuildUnit = regexBuildUnit.Match(plainText);
                    if (matchBuildUnit.Value.Length > 0)
                    {
                        buildUnit = matchBuildUnit.Value.Substring(matchBuildUnit.Value.IndexOf(":") + 1).Replace("\r\n", "").Replace("\t", "").Replace("&nbsp;", " ").Trim();
                    }

                    Match matchbidUnit = regexbidUnit.Match(plainText);
                    if (matchbidUnit.Value.Length > 0)
                    {
                        bidUnit = matchbidUnit.Value.Replace("第一中标候选人:", "").Replace("招标人:", "").Replace("建设单位:", "").Replace("\r\n", "").Replace("\t", "").Replace("&nbsp;", " ").Trim();
                        if (bidUnit.Contains(":"))
                        {
                            bidUnit = bidUnit.Remove(bidUnit.IndexOf(":")).ToString().Trim();
                        }
                    }

                    Match matchMoney = regexMoney.Match(plainText);
                    if (matchMoney.Value.Length > 0)
                    {
                        bidMoney = matchMoney.Value.Replace("中标价:", "").Replace("其中标价为:", "").Replace("中标价格:", "").Replace("\r", "");
                    }

                    if (bidMoney.Contains("万"))
                    {
                        // Amount is already stated in units of ten thousand: keep the number as written.
                        bidMoney = bidMoney.Remove(bidMoney.IndexOf("万")).Trim();
                        bidMoney = regBidMoney.Match(bidMoney).Value;
                    }
                    else
                    {
                        // Otherwise divide by 10000; unreadable or negligible (< 0.1) amounts become "0".
                        decimal amount;
                        if (decimal.TryParse(regBidMoney.Match(bidMoney).Value, out amount))
                        {
                            decimal scaled = amount / 10000;
                            bidMoney = scaled < 0.1m ? "0" : scaled.ToString();
                        }
                        else
                        {
                            bidMoney = "0";
                        }
                    }

                    // Size limits are expressed in encoded bytes.
                    if (Encoding.Default.GetByteCount(code) > 50)
                    {
                        code = "";
                    }
                    // Truncate by bytes: cutting at 150 characters would throw for a string that exceeds
                    // 150 bytes while being shorter than 150 characters (multi-byte text).
                    buildUnit = TruncateToByteLimit(buildUnit, Encoding.Default, 150);
                    bidUnit = TruncateToByteLimit(bidUnit, Encoding.Default, 150);

                    ctx = tabTag.Rows[2].Columns[0].ToPlainTextString().Replace("&nbsp;", " ").Replace("\r\n\r\n\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");
                    if (ctx.Length > 0)
                    {
                        ctx = regexCtx.Replace(ctx, "");
                    }
                }

                // Start date: prefer the date following "公示开始时间" in the body text,
                // otherwise use the first yyyy-M-d date found in the td.toptd_bai cells.
                parserCtx.Reset();
                ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "toptd_bai")));
                string headerDate = regDate.Match(ctxNode.AsString()).Value.Trim();
                string beginDate = headerDate;
                if (ctx.Contains("公示开始时间"))
                {
                    beginDate = regBeDate.Match(ctx.Substring(ctx.IndexOf("公示开始时间")).ToString()).Value.Trim();
                    if (beginDate == "")
                    {
                        beginDate = headerDate;
                    }
                }

                prjName = ToolDb.GetPrjName(prjName);
                string bidType = ToolHtml.GetInviteTypes(prjName);
                BidInfo info = ToolDb.GenBidInfo("广东省", "惠州市区", "惠阳区", string.Empty, code, prjName, bidUnit, beginDate, buildUnit, beginDate, endDate, ctx, string.Empty, "惠州市建设工程交易中心", bidType, "建设工程", string.Empty, bidMoney, InfoUrl, string.Empty, HtmlTxt);
                list.Add(info);

                // Rewind before scanning again, as is done before the previous extraction;
                // the parser has already been consumed by it.
                parserCtx.Reset();
                ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("target", "_blank")));
                NodeList aTagNodes = ctxNode.SearchFor(typeof(ATag), true);
                for (int a = 0; a < aTagNodes.Count; a++)
                {
                    ATag fileTage = aTagNodes[a] as ATag;
                    if (fileTage == null || fileTage.Link == null)
                    {
                        continue;
                    }
                    if (fileTage.Link.Contains("http://www.ebc.huizhou.gov.cn/index/loadNewsFile"))
                    {
                        BaseAttach attach = ToolDb.GenBaseAttach(fileTage.ToPlainTextString(), info.Id, fileTage.Link);
                        base.AttachList.Add(attach);
                    }
                }

                if (!crawlAll && list.Count >= this.MaxCount)
                {
                    return;
                }
            }
        }

        /// <summary>
        /// Returns the longest prefix of <paramref name="value"/> whose size in
        /// <paramref name="encoding"/> does not exceed <paramref name="maxBytes"/>.
        /// </summary>
        private static string TruncateToByteLimit(string value, Encoding encoding, int maxBytes)
        {
            if (string.IsNullOrEmpty(value) || encoding.GetByteCount(value) <= maxBytes)
            {
                return value;
            }

            // A character never encodes to less than one byte, so start no further than maxBytes characters in.
            int length = value.Length < maxBytes ? value.Length : maxBytes;
            while (length > 0 && encoding.GetByteCount(value.Substring(0, length)) > maxBytes)
            {
                length--;
            }
            return value.Substring(0, length);
        }
Exemplo n.º 9
0
        /// <summary>
        /// Parses the page held in <c>currentHtml</c> and appends one <c>POI</c> to <c>poiList</c>
        /// for every DD entry of the POI list found on it.
        /// </summary>
        /// <remarks>
        /// For each entry the method reads the taste/environment/service scores, the star rank,
        /// the average cost, the comment count, the name, the address with phone, and the tags.
        /// When a section is not found exactly once, a diagnostic line is written to the console and the
        /// corresponding fields are left unset. Numeric fields are parsed with <c>Int32.Parse</c>,
        /// so malformed numbers still raise an exception.
        /// </remarks>
        /// <param name="currentPage">Page number stored in <c>POI.Page</c> of every entry.</param>
        public void GetInfoFromHtml(int currentPage)
        {
            Lexer lexer = new Lexer(currentHtml);
            Parser parser = new Parser(lexer);
            NodeList poiHeadList = parser.Parse(poiListFilter);

            if (poiHeadList.Count != 1)
            {
                Console.WriteLine("获取POI列表出错");
                return;
            }

            NodeList poiNodeList = poiHeadList[0].Children.ExtractAllNodesThatMatch(poiFilter, false);
            int numCount = 0;
            for (int i = 0; i < poiNodeList.Count; i++)
            {
                DefinitionListBullet poiNode = (DefinitionListBullet)poiNodeList[i];
                if (!poiNode.TagName.Equals("DD"))
                {
                    continue;
                }

                POI poi = new POI();
                numCount++;
                poi.Page = currentPage;
                poi.Number = numCount;

                #region Taste, environment and service scores, plus star rank
                NodeList tasteNodeList = poiNode.Children.ExtractAllNodesThatMatch(tasteFilter, true);
                NodeList environmentNodeList = poiNode.Children.ExtractAllNodesThatMatch(environmentFilter, true);
                NodeList serviceNodeList = poiNode.Children.ExtractAllNodesThatMatch(serviceFilter, true);
                if (tasteNodeList.Count == 1 && environmentNodeList.Count == 1 && serviceNodeList.Count == 1)
                {
                    // A score of "-" means "not rated" and leaves the field untouched.
                    Span spanNode = (Span)tasteNodeList[0];
                    string scoreText = spanNode.ToPlainTextString();
                    if (!scoreText.Equals("-"))
                    {
                        poi.TasteRemark = Int32.Parse(scoreText);
                    }

                    spanNode = (Span)environmentNodeList[0];
                    scoreText = spanNode.ToPlainTextString();
                    if (!scoreText.Equals("-"))
                    {
                        poi.EnvironmentRemark = Int32.Parse(scoreText);
                    }

                    spanNode = (Span)serviceNodeList[0];
                    scoreText = spanNode.ToPlainTextString();
                    if (!scoreText.Equals("-"))
                    {
                        poi.ServiceRemark = Int32.Parse(scoreText);
                    }

                    #region Star rank
                    // The rank holder is the second sibling after the service score's parent;
                    // any missing link in that chain simply means no rank is recorded.
                    INode rankNodeOfParent = spanNode.Parent == null ? null : spanNode.Parent.NextSibling;
                    if (rankNodeOfParent != null)
                    {
                        rankNodeOfParent = rankNodeOfParent.NextSibling;
                    }
                    if (rankNodeOfParent != null && rankNodeOfParent.Children != null && rankNodeOfParent.Children.Count >= 1)
                    {
                        INode rankNodeCandidate = rankNodeOfParent.Children[0];
                        if (rankNodeCandidate.GetType().Equals(typeof(Span)))
                        {
                            // The rank is spelled out with a Chinese numeral in the TITLE attribute.
                            string rank = ((Span)rankNodeCandidate).GetAttribute("TITLE");
                            if (rank != null)
                            {
                                if (rank.Contains("五"))
                                {
                                    poi.Rank = 5;
                                }
                                else if (rank.Contains("四"))
                                {
                                    poi.Rank = 4;
                                }
                                else if (rank.Contains("三"))
                                {
                                    poi.Rank = 3;
                                }
                                else if (rank.Contains("二"))
                                {
                                    poi.Rank = 2;
                                }
                                else if (rank.Contains("一"))
                                {
                                    poi.Rank = 1;
                                }
                            }
                        }
                    }
                    #endregion
                }
                else
                {
                    Console.WriteLine("第" + i + "条POI中,判断口味、环境和服务的标准出错!");
                }
                #endregion

                #region Average cost
                NodeList averageNodeList = poiNode.Children.ExtractAllNodesThatMatch(averageFilter, true);
                if (averageNodeList.Count == 1)
                {
                    INode averageNode = averageNodeList[0];
                    // The amount is the text node two siblings after the matched node.
                    INode costNode = averageNode.NextSibling == null ? null : averageNode.NextSibling.NextSibling;
                    if (costNode != null && costNode.GetType().Equals(typeof(TextNode)))
                    {
                        string cost = ((TextNode)costNode).ToPlainTextString();
                        poi.AverageCost = Int32.Parse(cost);
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "条POI中,判断平均消费的标准出错!");
                }
                #endregion

                #region Comment count
                NodeList commentNodeList = poiNode.Children.ExtractAllNodesThatMatch(commentFilter, true);
                if (commentNodeList.Count == 1)
                {
                    INode commentNode = commentNodeList[0];
                    if (commentNode.GetType().Equals(typeof(ATag)))
                    {
                        string commentNum = ((ATag)commentNode).StringText;
                        // EndsWith is safe for text shorter than the suffix, unlike a fixed-offset Substring.
                        if (commentNum.EndsWith("封点评", StringComparison.Ordinal))
                        {
                            commentNum = commentNum.Substring(0, commentNum.Length - 3);
                        }
                        poi.CommentCount = Int32.Parse(commentNum);
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "条POI中,判断点评数的标准出错!");
                }
                #endregion

                #region Name
                NodeList nameNodeList = poiNode.Children.ExtractAllNodesThatMatch(nameFilter, true);
                if (nameNodeList.Count == 1)
                {
                    INode nameNode = nameNodeList[0];
                    if (nameNode.GetType().Equals(typeof(ATag)))
                    {
                        poi.Name = ((ATag)nameNode).StringText;
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "条POI中,判断店名的标准出错!");
                }
                #endregion

                #region Address and phone
                NodeList addressNodeList = poiNode.Children.ExtractAllNodesThatMatch(addressFilter, true);
                if (addressNodeList.Count == 1)
                {
                    NodeList districtNodeList = addressNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)));
                    if (districtNodeList.Count == 1)
                    {
                        ATag districtTag = (ATag)districtNodeList[0];
                        string address = districtTag.ToPlainTextString();
                        if (districtTag.NextSibling != null && districtTag.NextSibling.GetType().Equals(typeof(TextNode)))
                        {
                            TextNode detailAddressNode = (TextNode)districtTag.NextSibling;
                            string detailAddress = detailAddressNode.ToPlainTextString().Trim();
                            if (detailAddress.Length >= 8)
                            {
                                // The last 8 characters are taken as the phone number, the rest as the street address.
                                poi.Phone = detailAddress.Substring(detailAddress.Length - 8, 8);
                                address += detailAddress.Substring(0, detailAddress.Length - 8);
                            }
                            else
                            {
                                // Too short to end with an 8-character phone number: keep it all as address text.
                                address += detailAddress;
                            }
                        }

                        // Remove every space, line feed and tab from the address.
                        char[] removeChrVector = { ' ', '\n', '\t' };
                        foreach (char c in removeChrVector)
                        {
                            address = address.Replace(c.ToString(), "");
                        }
                        poi.Address = address;
                    }
                    else
                    {
                        Console.WriteLine("第" + i + "条POI中,判断含地址的<a>标记的标准出错!");
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "条POI中,判断地址的标准出错!");
                }
                #endregion

                #region Tags
                NodeList tagsNodeList = poiNode.Children.ExtractAllNodesThatMatch(tagsFilter, true);
                if (tagsNodeList.Count == 1)
                {
                    INode tagsNode = tagsNodeList[0];
                    if (tagsNode.Children != null)
                    {
                        // Every direct <a> child contributes its text as one tag.
                        for (int j = 0; j < tagsNode.Children.Count; j++)
                        {
                            INode node = tagsNode.Children[j];
                            if (node.GetType().Equals(typeof(ATag)))
                            {
                                poi.Tags.Add(node.ToPlainTextString());
                            }
                        }
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "条POI中,判断标签的标准出错!");
                }
                #endregion

                poiList.Add(poi);
            }
        }
Exemplo n.º 10
0
        /// <summary>
        /// Crawls the paged bid-result listing at <c>SiteUrl</c>, creates one <c>BidInfo</c> per listing row and
        /// registers every "UploadFiles" link found on the row's detail page as an attachment.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as the result list holds at least <c>MaxCount</c> entries.
        /// </param>
        /// <returns>
        /// The collected <c>BidInfo</c> items. The list is empty when the first listing page cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            // Bid types that are classified as "建设工程"; anything else is classified as "其他".
            // The lookup is a substring match against this comma-separated list.
            const string invType = "施工,设计,勘察,服务,劳务分包,专业分包,小型施工,监理,设备材料,其他";

            IList list = new ArrayList();

            // Total page count; stays at 1 when no usable "尾页" (last page) link is found.
            int    pageInt = 1;
            string html    = string.Empty;
            string HtmlTxt = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default);
            }
            catch (Exception)
            {
                // Without the first page there is nothing to crawl.
                return list;
            }

            Parser   parser = new Parser(new Lexer(html));
            NodeList sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new TagNameFilter("form")), new TagNameFilter("a")));

            if (sNode != null)
            {
                for (int i = 0; i < sNode.Count; i++)
                {
                    ATag pageA = sNode[i] as ATag;
                    if (pageA == null || !pageA.ToPlainTextString().Contains("尾页"))
                    {
                        continue;
                    }

                    try
                    {
                        // The page count is whatever follows the last '=' in the link.
                        pageInt = int.Parse(pageA.Link.Remove(0, pageA.Link.LastIndexOf("=") + 1));
                    }
                    catch (Exception)
                    {
                        // Best effort: an unparsable link leaves the current page count untouched.
                    }
                }
            }

            for (int i = 1; i <= pageInt; i++)
            {
                try
                {
                    html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl + "&Page=" + i.ToString()), Encoding.Default);
                }
                catch (Exception)
                {
                    // A page that cannot be downloaded is skipped; the remaining pages are still tried.
                    continue;
                }

                parser = new Parser(new Lexer(html));
                sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("bordercolor", "#CCCCCC")));

                // The rows are read from the SECOND matching table, so at least two matches are required.
                if (sNode == null || sNode.Count < 2)
                {
                    continue;
                }

                TableTag table = sNode[1] as TableTag;
                if (table == null)
                {
                    continue;
                }

                HtmlTxt = sNode.AsHtml();

                // Row 0 is not read (presumably the header row - confirm against the page layout).
                for (int j = 1; j < table.RowCount; j++)
                {
                    TableRow tr = table.Rows[j];
                    if (tr.ColumnCount < 4)
                    {
                        // Columns 0..3 are read below; a shorter row cannot be interpreted.
                        continue;
                    }

                    // Never populated by this crawler; passed through empty.
                    string code = string.Empty;

                    // Bid type (column 0) and the category derived from it.
                    string bidType  = tr.Columns[0].ToPlainTextString();
                    string specType = invType.Contains(bidType) ? "建设工程" : "其他";

                    // Project name (column 1).
                    string prjName = tr.Columns[1].ToPlainTextString().Replace("&nbsp;", "");

                    // Winning bidder (column 2).
                    string bidUnit = tr.Columns[2].ToPlainTextString().Replace("&nbsp;", "");

                    // Publication date (column 3), with the surrounding square brackets removed.
                    string bidDate = tr.Columns[3].ToPlainTextString().TrimStart('[').TrimEnd(']');

                    // The detail link is the first <a> inside the project-name cell.
                    NodeList cNode = new NodeList();
                    tr.Columns[1].CollectInto(cNode, new TagNameFilter("a"));
                    ATag linkTag = cNode.Count > 0 ? cNode[0] as ATag : null;
                    if (linkTag == null)
                    {
                        // No detail link: skip the row instead of failing the whole crawl.
                        continue;
                    }

                    string InfoUrl = "http://www.chjssz.gov.cn/" + linkTag.Link;
                    prjName = ToolDb.GetPrjName(prjName);
                    bidType = ToolHtml.GetInviteTypes(bidType);
                    BidInfo info = ToolDb.GenBidInfo("广东省", "广州市区", "从化市", string.Empty, code, prjName, string.Empty, bidDate, bidUnit, bidDate, string.Empty, "见附件", string.Empty, "广州建设工程交易中心", bidType, specType, string.Empty, string.Empty, InfoUrl, string.Empty, HtmlTxt);
                    list.Add(info);

                    // Download the detail page to collect attachments. A failure only skips the attachments,
                    // so the MaxCount check below still runs for the item that was just added.
                    string dlHtml = null;
                    try
                    {
                        dlHtml = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.Default);
                    }
                    catch (Exception)
                    {
                        dlHtml = null;
                    }

                    if (dlHtml != null)
                    {
                        Parser   dlParser = new Parser(new Lexer(dlHtml));
                        NodeList dlNodes  = dlParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("background", "pic/abouts_16.jpg")));
                        if (dlNodes != null && dlNodes.Count > 0)
                        {
                            NodeList ddNode = dlNodes.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("A"), new HasAttributeFilter("target", "_blank")), true);
                            if (ddNode != null)
                            {
                                for (int k = 0; k < ddNode.Count; k++)
                                {
                                    ATag ddATag = ddNode[k] as ATag;
                                    if (ddATag != null && ddATag.Link != null && ddATag.Link.Contains("UploadFiles"))
                                    {
                                        BaseAttach attach = ToolDb.GenBaseAttach(ddATag.StringText, info.Id, "http://www.chjssz.gov.cn/" + ddATag.Link);
                                        base.AttachList.Add(attach);
                                    }
                                }
                            }
                        }
                    }

                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return list;
                    }
                }
            }

            return list;
        }
Exemplo n.º 11
0
        /// <summary>
        /// Crawls the bid-notice listing (first page from <c>SiteUrl</c>, later pages from
        /// <c>http://new.sztc.com/bidNotice/index_N.jhtml</c>), downloads every linked detail page and builds a
        /// <c>BidInfo</c> plus attachment records from its <c>div.ninfo-con</c> content.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as the result list holds at least <c>MaxCount</c> entries.
        /// </param>
        /// <returns>
        /// The collected <c>BidInfo</c> items. The list is empty when the first listing page cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList list = new List <BidInfo>();

            // Total page count; stays at 1 when the pagination block is missing or unparsable.
            int    pageInt = 1;
            string html    = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl);
            }
            catch
            {
                return list;
            }

            Parser   parser  = new Parser(new Lexer(html));
            NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagination")));

            if (tdNodes != null && tdNodes.Count > 0)
            {
                string pageTemp = tdNodes.AsString().Replace("&nbsp;", "");
                try
                {
                    // The page count is the text between "/" and "页" in the pagination block.
                    pageInt = int.Parse(pageTemp.GetRegexBegEnd("/", "页"));
                }
                catch
                {
                    // Best effort: fall back to a single page.
                }
            }

            for (int i = 1; i <= pageInt; i++)
            {
                // Page 1 was already downloaded above; only later pages need a new request.
                if (i > 1)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl("http://new.sztc.com/bidNotice/index_" + i + ".jhtml");
                    }
                    catch
                    {
                        continue;
                    }
                }

                parser = new Parser(new Lexer(html));
                NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "lb-link")), true), new TagNameFilter("li")));
                if (nodeList == null || nodeList.Count == 0)
                {
                    continue;
                }

                for (int j = 0; j < nodeList.Count; j++)
                {
                    // These are never populated by this crawler and are passed through empty.
                    string endDate = string.Empty, otherType = string.Empty, prjMgr = string.Empty;

                    ATag aTag = nodeList[j].GetATag();
                    if (aTag == null)
                    {
                        // A list item without a link carries no notice to follow.
                        continue;
                    }

                    string prjName   = aTag.LinkText.ToNodeString().Replace(" ", "");
                    string beginDate = prjName.GetDateRegex();

                    // Strip the date from the title. The guard must be on beginDate:
                    // string.Replace throws when the text to replace is empty.
                    if (!string.IsNullOrEmpty(beginDate))
                    {
                        prjName = prjName.Replace(beginDate, "");
                    }

                    string InfoUrl    = aTag.Link;
                    string htmldetail = string.Empty;
                    try
                    {
                        htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
                    }
                    catch
                    {
                        // Detail page unavailable: skip this notice.
                        continue;
                    }

                    Parser   dtlparser = new Parser(new Lexer(htmldetail));
                    NodeList dtnode    = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "ninfo-con"), new TagNameFilter("div")));
                    if (dtnode == null || dtnode.Count == 0)
                    {
                        continue;
                    }

                    string HtmlTxt = dtnode.AsHtml();
                    string bidCtx  = HtmlTxt.GetReplace("</p>,<br/>", "\r\n").ToCtxString().GetReplace("\t", "").GetReplace("\r\n\r\n,\r\n\r\n,\r\n\r\n,\r\n\r\n,\r\n\r\n,\r\n\r\n,\r\n\r\n,\r\n\r\n,\r\n\r\n,\r\n\r\n,\r\n\r\n", "\r\n");
                    bidCtx = bidCtx.GetReplace("单位:\r\n,单位:\r\n", "单位:").GetReplace("中标人:\r\n,中标人:\r\n", "中标人:").GetReplace("编号:\r\n,编号:\r\n", "编号:");

                    string code      = bidCtx.GetCodeRegex().GetCodeDel();
                    string buildUnit = bidCtx.GetBuildRegex();
                    string bidUnit   = bidCtx.GetBidRegex();

                    // Prefer an amount labelled "中标金额"; otherwise take any amount found in the text.
                    string bidMoney = bidCtx.Replace("和中标金额", "").GetMoneyRegex(new string[] { "中标金额" });
                    if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                    {
                        bidMoney = bidCtx.GetMoneyRegex();
                    }

                    if (string.IsNullOrEmpty(bidUnit))
                    {
                        // Fallback: rebuild "header:value" lines from the first two rows of the first
                        // table in the content and run the same extraction on that text.
                        Parser   tableParser = new Parser(new Lexer(HtmlTxt));
                        NodeList dtlNode     = tableParser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                        if (dtlNode != null && dtlNode.Count > 0)
                        {
                            TableTag table = dtlNode[0] as TableTag;
                            if (table != null && table.RowCount > 0)
                            {
                                string ctx = string.Empty;
                                for (int r = 0; r < table.Rows[0].ColumnCount; r++)
                                {
                                    try
                                    {
                                        ctx += table.Rows[0].Columns[r].ToNodePlainString() + ":";
                                        ctx += table.Rows[1].Columns[r].ToNodePlainString() + "\r\n";
                                    }
                                    catch
                                    {
                                        // A missing value row/cell is tolerated; the header text stays in ctx.
                                    }
                                }
                                bidUnit  = ctx.GetBidRegex();
                                bidMoney = ctx.GetMoneyRegex(new string[] { "中标金额" });
                                if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                                {
                                    bidMoney = ctx.GetMoneyRegex();
                                }
                            }
                        }
                    }

                    // Clean-up of the extracted bidder name.
                    bidUnit = bidUnit.Replace("名称", "");
                    if (bidUnit.Contains("公司"))
                    {
                        // Cut everything after the first "公司".
                        bidUnit = bidUnit.Remove(bidUnit.IndexOf("公司")) + "公司";
                    }
                    if (bidUnit.Contains("包号"))
                    {
                        // A value containing "包号" is discarded.
                        bidUnit = "";
                    }

                    string specType = "政府采购";
                    string msgType  = "深圳市国际招标有限公司";
                    string bidType  = prjName.GetInviteBidType();
                    BidInfo info = ToolDb.GenBidInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                    list.Add(info);

                    // Register every attachment link found in the notice content.
                    dtlparser = new Parser(new Lexer(HtmlTxt));
                    NodeList FileTag = dtlparser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                    if (FileTag != null)
                    {
                        for (int f = 0; f < FileTag.Count; f++)
                        {
                            ATag file = FileTag[f] as ATag;
                            if (file == null || !file.IsAtagAttach())
                            {
                                continue;
                            }

                            // Links that already contain "http" are used as-is; others are made absolute.
                            string link = file.Link.ToLower().Contains("http")
                                ? file.Link
                                : "http://new.sztc.com/" + file.Link;
                            BaseAttach attach = ToolDb.GenBaseAttach(file.ToPlainTextString(), info.Id, link);
                            base.AttachList.Add(attach);
                        }
                    }

                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return list;
                    }
                }
            }

            return list;
        }
Exemplo n.º 12
0
        /// <summary>
        /// Crawls the paged notice listing (first page from <c>SiteUrl</c>, later pages from the hard-coded
        /// <c>www.szgm.gov.cn</c> URL pattern), downloads each linked detail page and builds a <c>BidInfo</c>
        /// from its <c>div.article_body</c> content.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as the result list holds at least <c>MaxCount</c> entries.
        /// </param>
        /// <returns>
        /// The collected <c>BidInfo</c> items. Crawling ends early (returning what was collected so far) when a
        /// listing page cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list      = new ArrayList();
            string htl       = string.Empty;
            string cookiestr = string.Empty;
            int    page      = 1;

            // Removes script / xml / style islands from detail pages. The pattern is constant,
            // so the Regex is built once instead of once per notice.
            Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");

            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8, ref cookiestr);
            }
            catch
            {
                return list;
            }

            Parser   parser   = new Parser(new Lexer(htl));
            NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "box"))), new TagNameFilter("a")));

            if (nodeList != null && nodeList.Count > 0)
            {
                try
                {
                    // The last link in the pager is expected to be "末页" (last page); its "tagname"
                    // attribute carries the page number inside a fixed path.
                    ATag lastLink = nodeList[nodeList.Count - 1] as ATag;
                    if (lastLink.ToPlainTextString().Contains("末页"))
                    {
                        page = int.Parse(lastLink.GetAttribute("tagname").ToLower().Replace("/szgm/132100/xwdt17/135204/151246/8d25503a-", "").Replace(".html", ""));
                    }
                }
                catch
                {
                    // Best effort: the fallback below applies when the page count cannot be read.
                }
            }

            // NOTE(review): hard-coded fallback page count used whenever the count is still 1 - confirm it is still valid.
            if (page == 1)
            {
                page = 82;
            }

            for (int i = 1; i <= page; i++)
            {
                // Page 1 was already downloaded above; only later pages need a new request.
                if (i > 1)
                {
                    try
                    {
                        htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode("http://www.szgm.gov.cn/szgm/132100/xwdt17/135204/151250/897d248a-" + i.ToString() + ".html"), Encoding.UTF8);
                    }
                    catch
                    {
                        // A failed listing page ends the crawl with what has been collected so far.
                        return list;
                    }
                }

                parser = new Parser(new Lexer(htl));
                NodeList tabList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "page_co")), true), new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "100%"))));
                if (tabList == null || tabList.Count == 0)
                {
                    continue;
                }

                for (int j = 0; j < tabList.Count; j++)
                {
                    TableTag listTable = tabList[j] as TableTag;
                    if (listTable == null || listTable.RowCount < 1)
                    {
                        continue;
                    }

                    // Each notice is the first row of its own table: exactly three cells, one holding a link.
                    TableRow tr   = listTable.Rows[0];
                    ATag     aTag = tr.GetATag();
                    if (aTag == null || tr.ColumnCount != 3)
                    {
                        continue;
                    }

                    // These are never populated by this crawler and are passed through empty.
                    string endDate = string.Empty, otherType = string.Empty, prjMgr = string.Empty;

                    string beginDate = tr.Columns[2].ToPlainTextString().GetDateRegex();
                    string prjName   = aTag.GetAttribute("title");
                    string InfoUrl   = "http://www.szgm.gov.cn" + aTag.Link;

                    string htlDtl = string.Empty;
                    try
                    {
                        htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
                        htlDtl = regexHtml.Replace(htlDtl, "");
                    }
                    catch
                    {
                        // Detail page unavailable: skip this notice.
                        continue;
                    }

                    parser = new Parser(new Lexer(htlDtl));
                    NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "article_body")));
                    if (dtl == null || dtl.Count == 0)
                    {
                        continue;
                    }

                    string HtmlTxt = dtl.AsHtml();
                    string bidCtx  = HtmlTxt.ToCtxString();

                    // A "工程名称" value found in the body overrides the link title.
                    string tempName = bidCtx.GetRegex("工程名称");
                    if (!string.IsNullOrWhiteSpace(tempName))
                    {
                        prjName = tempName;
                    }

                    string code      = bidCtx.GetCodeRegex();
                    string buildUnit = bidCtx.GetBuildRegex();

                    // Winning bidder: try progressively looser extractions.
                    string bidUnit = bidCtx.GetBidRegex();
                    if (string.IsNullOrWhiteSpace(bidUnit))
                    {
                        bidUnit = bidCtx.GetRegex("委托单位");
                    }
                    if (string.IsNullOrWhiteSpace(bidUnit))
                    {
                        bidUnit = bidCtx.GetRegexBegEnd("确认", "为中标单位");
                    }

                    // Amount: same idea, three attempts.
                    string bidMoney = bidCtx.GetMoneyRegex();
                    if (string.IsNullOrWhiteSpace(bidMoney) || bidMoney == "0")
                    {
                        bidMoney = bidCtx.GetRegex("合同价").GetMoney();
                    }
                    if (string.IsNullOrWhiteSpace(bidMoney) || bidMoney == "0")
                    {
                        bidMoney = bidCtx.GetRegexBegEnd("人民币", "元").GetMoney();
                    }

                    if (string.IsNullOrWhiteSpace(bidUnit))
                    {
                        // Last resort: rebuild "header:value" lines from the first two rows of a result
                        // table (inside span#holder, else a table.MsoNormalTable) and extract from that text.
                        parser = new Parser(new Lexer(HtmlTxt));
                        NodeList tableNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "holder")), true), new TagNameFilter("table")));
                        if (tableNode == null || tableNode.Count < 1)
                        {
                            parser.Reset();
                            tableNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "MsoNormalTable")));
                        }

                        if (tableNode != null && tableNode.Count > 0)
                        {
                            string   ctx   = string.Empty;
                            TableTag table = tableNode[0] as TableTag;
                            if (table != null && table.RowCount >= 2)
                            {
                                TableRow headerRow = table.Rows[0];
                                TableRow valueRow  = table.Rows[1];

                                // Only pair up columns that exist in both rows, so a shorter value row
                                // cannot cause an out-of-range access.
                                int columnCount = Math.Min(headerRow.ColumnCount, valueRow.ColumnCount);
                                for (int r = 0; r < columnCount; r++)
                                {
                                    string temp = headerRow.Columns[r].ToNodePlainString();
                                    if (temp.Contains("控制金额"))
                                    {
                                        // Columns whose header contains "控制金额" are left out.
                                        continue;
                                    }
                                    ctx += temp + ":";
                                    ctx += valueRow.Columns[r].ToNodePlainString() + "\r\n";
                                }
                            }

                            bidUnit = ctx.GetBidRegex();
                            if (string.IsNullOrWhiteSpace(bidMoney) || bidMoney == "0")
                            {
                                bidMoney = ctx.GetMoneyRegex();
                            }
                            if (string.IsNullOrWhiteSpace(code))
                            {
                                code = ctx.GetCodeRegex();
                            }
                        }
                    }

                    // Amounts above 50000 are divided by 10000; unparsable amounts are left untouched.
                    decimal money;
                    if (decimal.TryParse(bidMoney, out money) && money > 50000)
                    {
                        bidMoney = (money / 10000).ToString();
                    }

                    code = code.GetCodeDel();
                    string msgType  = "深圳市光明新区";
                    string specType = "政府采购";
                    string bidType  = prjName.GetInviteBidType();
                    if (string.IsNullOrEmpty(buildUnit))
                    {
                        // Default used when no building unit could be extracted.
                        buildUnit = "深圳市光明新区公明街道办事处";
                    }

                    BidInfo info = ToolDb.GenBidInfo("广东省", "深圳区及街道工程", "光明新区", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType,
                                                     bidMoney, InfoUrl, prjMgr, HtmlTxt);
                    list.Add(info);
                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return list;
                    }
                }
            }

            return list;
        }
Exemplo n.º 13
0
        /// <summary>
        /// Crawls the bid-bulletin listing (first page from <c>SiteUrl</c>, later pages from
        /// <c>http://new.sztc.com/bidBulletin/index_N.jhtml</c>), downloads every linked detail page and builds an
        /// <c>InviteInfo</c> plus attachment records from its <c>div.ninfo-con</c> content.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as the result list holds at least <c>MaxCount</c> entries.
        /// </param>
        /// <returns>
        /// The collected <c>InviteInfo</c> items. The list is empty when the first listing page cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList list = new List <InviteInfo>();

            // Total page count; stays at 1 when the pagination block is missing or unparsable.
            int    pageInt = 1;
            string html    = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl);
            }
            catch (Exception)
            {
                return list;
            }

            Parser   parser  = new Parser(new Lexer(html));
            NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagination")));

            if (tdNodes != null && tdNodes.Count > 0)
            {
                string pageTemp = tdNodes.AsString().Replace("&nbsp;", "");
                try
                {
                    // The page count is the text between "/" and "页" in the pagination block.
                    pageInt = int.Parse(pageTemp.GetRegexBegEnd("/", "页"));
                }
                catch
                {
                    // Best effort: fall back to a single page.
                }
            }

            for (int i = 1; i <= pageInt; i++)
            {
                // Page 1 was already downloaded above; only later pages need a new request.
                if (i > 1)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl("http://new.sztc.com/bidBulletin/index_" + i + ".jhtml");
                    }
                    catch
                    {
                        continue;
                    }
                }

                parser = new Parser(new Lexer(html));
                NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "lb-link")), true), new TagNameFilter("li")));
                if (nodeList == null || nodeList.Count == 0)
                {
                    continue;
                }

                for (int j = 0; j < nodeList.Count; j++)
                {
                    // These are never populated by this crawler and are passed through empty.
                    string endDate = string.Empty, remark = string.Empty, otherType = string.Empty;

                    ATag aTag = nodeList[j].GetATag();
                    if (aTag == null)
                    {
                        // A list item without a link carries no bulletin to follow.
                        continue;
                    }

                    string prjName   = aTag.LinkText.ToNodeString().Replace(" ", "");
                    string beginDate = prjName.GetDateRegex();

                    // Strip the date from the title. The guard must be on beginDate:
                    // string.Replace throws when the text to replace is empty.
                    if (!string.IsNullOrEmpty(beginDate))
                    {
                        prjName = prjName.Replace(beginDate, "");
                    }

                    string InfoUrl    = aTag.Link;
                    string htmldetail = string.Empty;
                    try
                    {
                        htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
                    }
                    catch
                    {
                        // Detail page unavailable: skip this bulletin.
                        continue;
                    }

                    Parser   dtlparser = new Parser(new Lexer(htmldetail));
                    NodeList dtnode    = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "ninfo-con"), new TagNameFilter("div")));
                    if (dtnode == null || dtnode.Count == 0)
                    {
                        continue;
                    }

                    string HtmlTxt   = dtnode.AsHtml();
                    string inviteCtx = HtmlTxt.ToCtxString();

                    string code       = inviteCtx.GetCodeRegex().GetCodeDel();
                    string buildUnit  = inviteCtx.GetBuildRegex();
                    string prjAddress = inviteCtx.GetAddressRegex();

                    string specType   = "政府采购";
                    string msgType    = "深圳市国际招标有限公司";
                    string inviteType = prjName.GetInviteBidType();
                    InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                    list.Add(info);

                    // Register every attachment link found in the bulletin content.
                    dtlparser = new Parser(new Lexer(HtmlTxt));
                    NodeList FileTag = dtlparser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                    if (FileTag != null)
                    {
                        for (int f = 0; f < FileTag.Count; f++)
                        {
                            ATag file = FileTag[f] as ATag;
                            if (file == null || !file.IsAtagAttach())
                            {
                                continue;
                            }

                            // Links that already contain "http" are used as-is; others are made absolute.
                            string link = file.Link.ToLower().Contains("http")
                                ? file.Link
                                : "http://new.sztc.com/" + file.Link;
                            BaseAttach attach = ToolDb.GenBaseAttach(file.ToPlainTextString(), info.Id, link);
                            base.AttachList.Add(attach);
                        }
                    }

                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return list;
                    }
                }
            }

            return list;
        }
Exemplo n.º 14
0
        /// <summary>
        /// Crawls the bid-result list pages under http://218.20.201.20/www/zbmsg/2008/, opens every
        /// "xzb_show.asp" detail page linked from them, extracts the publish date, the winning
        /// bidder and the bid amount, and builds one BidInfo per detail page. Links inside the
        /// detail content that contain "uploadfile" are registered as attachments in base.AttachList.
        /// </summary>
        /// <param name="crawlAll">
        /// When false, crawling stops as soon as the result list holds this.MaxCount entries.
        /// </param>
        /// <returns>
        /// The collected BidInfo entries. The list is empty when the first list page cannot be
        /// downloaded; list or detail pages that fail to download later are skipped.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList list = new ArrayList();

            // Loop-invariant patterns, built once instead of once per page/node.
            // NOTE(review): "(:|:)" lists the same colon twice; one alternative was probably
            // meant to be the full-width colon - confirm against the original source encoding.
            Regex nonDigits     = new Regex(@"[^0-9]+");
            Regex emptyDiv      = new Regex(@"<div[^>]*>[\s]*</div>");
            Regex anchorTags    = new Regex(@"<a[^>]*>|</a>");
            Regex anchorElement = new Regex(@"<a[^>]*>[^<]*</a>");
            Regex firstNumber   = new Regex(@"\d{1,}[\.]?\d{0,}");
            Regex chineseChar   = new Regex(@"[\u4e00-\u9fa5]");
            Regex multiDigit    = new Regex(@"[0-9]+[.]{0,1}[0-9]+");
            Regex bidUnitLine   = new Regex(@"(中标单位|中标候选单位|中标候选人为|中标人为):[^\r\n]+\r\n");
            Regex moneyLineDiv  = new Regex(@"(金额|价格|报价|中标价)(:|:)[^\r\n]+\r\n");
            Regex moneyLineP    = new Regex(@"(金额|价格|报价|中标价|中标价为)(:|:)[^\r\n]+\r\n");
            Regex scriptOrXml   = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");

            // Labels that must be followed by an anchor before a price is read from an
            // anchor-carrying node ("bid price:" / "tender quote:").
            string[] moneyLabels = new string[] { "中标价:", "投标报价:" };

            // Determine the page count from the pager on the first list page.
            int    pageInt = 1;
            string html    = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default);
            }
            catch (Exception)
            {
                return(list);
            }
            Parser   parser  = new Parser(new Lexer(html));
            NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("cellspacing", "5")));

            if (tdNodes != null && tdNodes.Count > 0)
            {
                NodeList aNodes = new NodeList();
                tdNodes[0].CollectInto(aNodes, new TagNameFilter("a"));
                for (int i = 0; i < aNodes.Count; i++)
                {
                    ATag pagerLink = aNodes[i] as ATag;
                    // "尾页" is the "last page" link.
                    if (pagerLink != null && pagerLink.ToPlainTextString().Contains("尾页"))
                    {
                        // NOTE(review): every digit of the link is concatenated, so a link such as
                        // "...?page=12&id=13828" would yield 1213828 - confirm the link format.
                        int lastPage;
                        if (int.TryParse(nonDigits.Replace(pagerLink.Link, ""), out lastPage))
                        {
                            pageInt = lastPage;
                        }
                        break;
                    }
                }
            }

            for (int i = 1; i <= pageInt; i++)
            {
                try
                {
                    html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode("http://218.20.201.20/www/zbmsg/2008/xzb_list.asp?page=" + i.ToString() + "&id=13828"), Encoding.Default);
                }
                catch (Exception)
                {
                    continue;
                }

                parser  = new Parser(new Lexer(html));
                tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("a"), new HasParentFilter(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "news_list")), true)));
                if (tdNodes == null || tdNodes.Count == 0)
                {
                    continue;
                }

                for (int j = 0; j < tdNodes.Count; j++)
                {
                    ATag aTag = tdNodes[j] as ATag;
                    if (aTag == null || !aTag.Link.Contains("xzb_show.asp"))
                    {
                        continue;
                    }

                    // buildUnit is never populated by this crawler; it is passed through empty.
                    string  buildUnit = string.Empty, bidUnit = string.Empty, beginDate = string.Empty;
                    decimal decMoney  = 0;

                    // Keep only the part of the link before the first '&'. A link without '&'
                    // is used whole (Remove(-1) would throw).
                    string detailLink = aTag.Link;
                    int    ampIndex   = detailLink.IndexOf("&");
                    if (ampIndex >= 0)
                    {
                        detailLink = detailLink.Remove(ampIndex);
                    }
                    string infoUrl = "http://218.20.201.20/www/zbmsg/2008/" + detailLink;

                    string dlHtml = string.Empty;
                    try
                    {
                        dlHtml = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(infoUrl), Encoding.Default).ToLower().Replace("&nbsp;", "");
                    }
                    catch (Exception)
                    {
                        continue;
                    }
                    // Underlined spans are rewritten to anchors so that the anchor-based
                    // extraction below also picks up underlined values.
                    string filterHtml = dlHtml.Replace("\n", "").Replace("\r", "").Replace("<u>", "<a>").Replace("</u>", "</a>");
                    string prjName    = aTag.ToPlainTextString();

                    // Content: direct children of <span class="news_show">.
                    Parser   ctxParser = new Parser(new Lexer(dlHtml));
                    NodeList ctxNodes  = ctxParser.ExtractAllNodesThatMatch(new HasParentFilter(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "news_show")), false));
                    string   ctxText   = ctxNodes.AsString().Replace("&nbsp;", "");
                    string   htmlTxt   = ctxNodes.AsHtml();

                    Parser   dlParser = new Parser(new Lexer(emptyDiv.Replace(filterHtml, "")));
                    NodeList dlNodes  = dlParser.ExtractAllNodesThatMatch(new HasParentFilter(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "news_show")), false));

                    // Collect attachment links.
                    NodeList findFiles = dlNodes.ExtractAllNodesThatMatch(new TagNameFilter("a"), true);
                    NodeList fileNode  = new NodeList();
                    if (findFiles != null)
                    {
                        for (int f = 0; f < findFiles.Count; f++)
                        {
                            ATag fileA = findFiles[f] as ATag;
                            if (fileA != null && fileA.Link.Contains("uploadfile"))
                            {
                                fileNode.Add(fileA);
                            }
                        }
                    }

                    // Publish date: looked up in the table four levels above the first content
                    // node. The walk stops early when the page has no content node or a shallower
                    // structure, instead of dereferencing null.
                    INode ancestor = dlNodes.Count > 0 ? dlNodes[0] : null;
                    for (int up = 0; up < 4 && ancestor != null; up++)
                    {
                        ancestor = ancestor.Parent;
                    }
                    TableTag tb = ancestor as TableTag;
                    if (tb != null)
                    {
                        for (int t = 0; t < tb.RowCount; t++)
                        {
                            string rowText = tb.Rows[t].ToPlainTextString();
                            if (rowText.Contains("发布日期"))
                            {
                                // The date is the text between '[' and ']'.
                                int openIdx  = rowText.IndexOf("[");
                                int closeIdx = rowText.IndexOf("]");
                                if (closeIdx > openIdx)
                                {
                                    beginDate = rowText.Substring(openIdx + 1, closeIdx - openIdx - 1);
                                }
                                break;
                            }
                        }
                    }

                    for (int k = 0; k < dlNodes.Count; k++)
                    {
                        if (!(dlNodes[k] is ITag))
                        {
                            continue;
                        }
                        string nodeText = dlNodes[k].ToPlainTextString();

                        // Winning bidder ("winning candidate is:" / "winner is:").
                        if (nodeText.Contains("中标候选人为:") || nodeText.Contains("中标人为:"))
                        {
                            NodeList bidUnitNode = new NodeList();
                            dlNodes[k].CollectInto(bidUnitNode, new TagNameFilter("a"));
                            if (bidUnitNode.Count > 0)
                            {
                                // Take the text of the first anchor that follows the label.
                                string nodeHtml   = dlNodes[k].ToHtml();
                                string label      = nodeText.Contains("中标候选人为:") ? "中标候选人为:" : "中标人为:";
                                int    labelIndex = nodeHtml.IndexOf(label);
                                // The label can be split by markup in the HTML even though the
                                // plain text contains it; then no anchor is taken from this node.
                                if (labelIndex >= 0)
                                {
                                    MatchCollection matchbidUnit = anchorElement.Matches(nodeHtml.Substring(labelIndex));
                                    if (matchbidUnit.Count > 0)
                                    {
                                        bidUnit = anchorTags.Replace(matchbidUnit[0].ToString(), "");
                                    }
                                }
                                if (string.IsNullOrEmpty(bidUnit) && k + 1 < dlNodes.Count)
                                {
                                    bidUnit = dlNodes[k + 1].ToPlainTextString().Trim();
                                }
                            }
                            else if (k + 1 < dlNodes.Count)
                            {
                                // No anchor in the label node: the value is taken from the next node.
                                bidUnit = dlNodes[k + 1].ToPlainTextString();
                            }
                        }

                        // Bid amount. NOTE(review): "中标价:" is tested twice; one of the two was
                        // probably meant to use a full-width colon - confirm.
                        if (nodeText.Contains("中标价:") || nodeText.Contains("投标报价:") || nodeText.Contains("中标价:") || nodeText.Contains("中标价为"))
                        {
                            NodeList moneyNode = new NodeList();
                            dlNodes[k].CollectInto(moneyNode, new TagNameFilter("a"));

                            bool readAmount = true;
                            if (moneyNode.Count > 0)
                            {
                                // With anchors present, the amount is only read when an anchor
                                // follows one of the labels; a later label overrides an earlier one.
                                string          nodeHtml   = dlNodes[k].ToHtml();
                                MatchCollection matchmoney = null;
                                foreach (string moneyLabel in moneyLabels)
                                {
                                    if (nodeText.Contains(moneyLabel))
                                    {
                                        int labelIndex = nodeHtml.IndexOf(moneyLabel);
                                        if (labelIndex >= 0)
                                        {
                                            matchmoney = anchorElement.Matches(nodeHtml.Substring(labelIndex));
                                        }
                                    }
                                }
                                readAmount = matchmoney != null && matchmoney.Count > 0;
                            }

                            if (readAmount)
                            {
                                // The first number in the node text is the amount. Values not
                                // marked "万元" (ten thousand yuan) are divided by 10000. An
                                // unparsable or missing number leaves decMoney untouched.
                                Match   number = firstNumber.Match(nodeText);
                                decimal parsed;
                                if (number.Success && decimal.TryParse(number.Value, out parsed))
                                {
                                    decMoney = nodeText.Contains("万元") ? parsed : parsed / 10000;
                                }
                            }
                        }
                    }

                    string ctxStr = Regex.Replace(ctxText, @"<[^>]*>", string.Empty, RegexOptions.IgnoreCase);

                    // A bidder name is discarded when it has no Chinese character or when it
                    // contains a multi-digit number.
                    bidUnit = bidUnit.Replace(" ", "").Trim();
                    if (!chineseChar.IsMatch(bidUnit) || multiDigit.Match(bidUnit).Success)
                    {
                        bidUnit = "";
                    }

                    // Fallback 1: rebuild the text line by line from the <div> elements under
                    // span.news_show and search it with regular expressions.
                    if (string.IsNullOrEmpty(bidUnit) || decMoney <= 0)
                    {
                        string   txt       = string.Empty;
                        Parser   divParser = new Parser(new Lexer(dlHtml));
                        NodeList dtList    = divParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasParentFilter(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "news_show")), true)));
                        if (dtList != null && dtList.Count > 1)
                        {
                            for (int k = 0; k < dtList.Count; k++)
                            {
                                string lineText = dtList[k].ToPlainTextString().Trim();
                                if (lineText.Contains("中标候选人") || lineText.Contains("中标人"))
                                {
                                    try
                                    {
                                        bool nextBlank = string.IsNullOrEmpty(dtList[k + 1].ToPlainTextString().Trim());
                                        txt += lineText;
                                        // "text" is empty exactly when the first "为:" in the
                                        // accumulated text sits at its very end, i.e. the label
                                        // has no value after it yet.
                                        string text = txt.Remove(txt.Length - txt.IndexOf("为:") - 2);
                                        if (string.IsNullOrEmpty(text))
                                        {
                                            // Join the label with the next non-blank line.
                                            txt += lineText;
                                            txt += dtList[nextBlank ? k + 2 : k + 1].ToPlainTextString().Trim() + "\r\n";
                                        }
                                        else
                                        {
                                            txt += lineText + "\r\n";
                                        }
                                    }
                                    catch (Exception)
                                    {
                                        // Deliberate best effort: a label on the last line has no
                                        // following line to join; whatever was appended is kept.
                                    }
                                }
                                else
                                {
                                    txt += lineText + "\r\n";
                                }
                            }
                            if (string.IsNullOrEmpty(bidUnit))
                            {
                                bidUnit = bidUnitLine.Match(txt.Replace("\r\n\r\n", "")).Value.Replace("中标候选人为", "").Replace("中标人为", "").Replace("中标单位:", "").Replace("中标候选单位:", "").Replace(":", "").Trim();
                            }
                            if (decMoney <= 0)
                            {
                                string monerystr = moneyLineDiv.Match(txt).Value.Replace("中标价", "").Replace("金额", "").Replace("价格", "").Replace("报价", "").Replace(":", "").Replace(":", "").Replace(",", "").Replace(",", "").Trim();
                                decMoney = ParseFallbackMoney(monerystr, multiDigit, decMoney);
                            }
                        }
                    }

                    // Fallback 2: same idea using the <p> elements under span.news_show.
                    if (string.IsNullOrEmpty(bidUnit) || decMoney <= 0)
                    {
                        string   txt     = string.Empty;
                        Parser   pParser = new Parser(new Lexer(dlHtml));
                        NodeList dtList  = pParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("p"), new HasParentFilter(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "news_show")), true)));
                        if (dtList != null && dtList.Count > 1)
                        {
                            for (int k = 0; k < dtList.Count; k++)
                            {
                                string lineText = dtList[k].ToPlainTextString().Trim();
                                if (lineText.Contains("中标候选人") || lineText.Contains("中标人"))
                                {
                                    // The look-ahead is bounds-checked so that a label in the
                                    // last paragraph is processed instead of failing.
                                    bool hasNext = k + 1 < dtList.Count;
                                    if (hasNext && string.IsNullOrEmpty(dtList[k + 1].ToPlainTextString().Trim()))
                                    {
                                        // Skips the blank paragraph; as in the original, nothing
                                        // is appended for the label paragraph either.
                                        // NOTE(review): possibly meant to join with k + 2 - confirm.
                                        k++;
                                    }
                                    else
                                    {
                                        txt += lineText;
                                        string text = txt.Remove(txt.Length - txt.IndexOf("为:") - 2);
                                        // Everything accumulated so far is discarded; only the
                                        // label line is kept, terminated when it carries a value.
                                        txt = string.IsNullOrEmpty(text) ? lineText : lineText + "\r\n";
                                    }
                                }
                                else
                                {
                                    txt += lineText + "\r\n";
                                }
                                txt = scriptOrXml.Replace(txt, "");
                            }
                            if (string.IsNullOrEmpty(bidUnit))
                            {
                                bidUnit = bidUnitLine.Match(txt.Replace("\r\n\r\n", "")).Value.Replace("中标候选人为", "").Replace("中标人为", "").Replace("中标单位:", "").Replace("中标候选单位:", "").Replace(":", "").Trim();
                            }
                            // Only search for an amount when none was found yet. The original
                            // tested a string that was never assigned, so an amount found earlier
                            // could be overwritten or reset to zero here.
                            if (decMoney <= 0)
                            {
                                string monerystr = moneyLineP.Match(txt).Value.Replace("中标价为", "").Replace("中标价", "").Replace("金额", "").Replace("价格", "").Replace("报价", "").Replace(":", "").Replace(":", "").Replace(",", "").Replace(",", "").Trim();
                                decMoney = ParseFallbackMoney(monerystr, multiDigit, decMoney);
                            }
                        }
                    }

                    prjName = ToolDb.GetPrjName(prjName.Replace(" ", ""));
                    string  bidType = ToolHtml.GetInviteTypes(prjName);
                    BidInfo info    = ToolDb.GenBidInfo("广东省", "广州市区", "番禺区", string.Empty, string.Empty, prjName, buildUnit, beginDate, bidUnit, beginDate, string.Empty, ctxStr, string.Empty, "广州市番禺区建设局", bidType, "建设工程", string.Empty, decMoney.ToString(), infoUrl, string.Empty, htmlTxt);

                    list.Add(info);

                    // Register each attachment under its own index (the original always read
                    // element 0, duplicating the first file for every entry).
                    for (int f = 0; f < fileNode.Count; f++)
                    {
                        try
                        {
                            ATag fileTag = fileNode[f] as ATag;
                            BaseAttach attach = ToolDb.GenBaseAttach(fileTag.StringText, info.Id, "http://218.20.201.20" + fileTag.Link);
                            base.AttachList.Add(attach);
                        }
                        catch (Exception)
                        {
                            // Best effort: a failing attachment must not drop the bid record
                            // or the remaining attachments.
                        }
                    }

                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return(list);
                    }
                }
            }

            return(list);
        }

        /// <summary>
        /// Reads a bid amount from the text of a price line found by the regex fallbacks.
        /// The result is expressed in units of ten thousand: text containing "万元" or "万美元"
        /// is taken as is, any other value is divided by 10000 and dropped to 0 when the
        /// result is below 0.1 or the number cannot be parsed.
        /// </summary>
        /// <param name="moneyText">Price line with labels and separators already removed.</param>
        /// <param name="numberPattern">Pattern that locates the number inside the text.</param>
        /// <param name="current">Value returned unchanged when the text holds no usable number.</param>
        /// <returns>The parsed amount, or <paramref name="current"/> when nothing was found.</returns>
        private static decimal ParseFallbackMoney(string moneyText, Regex numberPattern, decimal current)
        {
            string number = numberPattern.Match(moneyText).Value;
            if (string.IsNullOrEmpty(number))
            {
                return(current);
            }

            decimal parsed;
            bool    parsedOk = decimal.TryParse(number, out parsed);
            if (moneyText.Contains("万元") || moneyText.Contains("万美元"))
            {
                return(parsedOk ? parsed : current);
            }

            decimal inTenThousands = parsedOk ? parsed / 10000 : 0;
            return(inTenThousands < 0.1m ? 0 : inTenThousands);
        }
Exemplo n.º 15
0
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList list = new ArrayList();
            //取得页码
            int    pageInt         = 1;
            int    crawlMax        = 0;
            string html            = string.Empty;
            string viewState       = string.Empty;
            string eventValidation = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(this.SiteUrl + "&page=0"), Encoding.Default).Replace("&nbsp;", "");
            }
            catch (Exception ex)
            {
                return(list);
            }
            Parser   parser = new Parser(new Lexer(html));
            NodeList sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("select"), new HasAttributeFilter("id", "page_PageList")));

            if (sNode != null && sNode.Count > 0)
            {
                SelectTag select = sNode[0] as SelectTag;
                pageInt = select.OptionTags.Length;
            }

            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try { html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&page=" + (i - 1).ToString(), Encoding.Default); }
                    catch (Exception ex) { continue; }
                }
                parser = new Parser(new Lexer(html));
                sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("tr"), new HasAttributeFilter("onmouseover", "this.style.backgroundColor=\"#EFFCD0\";")));
                if (sNode != null && sNode.Count > 0)
                {
                    for (int n = 0; n < sNode.Count; n++)
                    {
                        string   code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;
                        TableRow tr = sNode[n] as TableRow;
                        prjName   = tr.Columns[0].ToPlainTextString().Trim();
                        code      = tr.Columns[1].ToPlainTextString().Trim();
                        beginDate = tr.Columns[2].ToPlainTextString().Trim();
                        ATag aTag = tr.GetATag();
                        if (aTag == null)
                        {
                            continue;
                        }
                        Regex regexLink = new Regex(@"id=[^-]+");
                        InfoUrl = "http://www.sdcin.com.cn/viewzbggnew.php?" + regexLink.Match(aTag.Link).Value;
                        string htmldetail = string.Empty;
                        try
                        {
                            htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).Replace("&nbsp;", "").Trim();
                            Parser   dtlparserHTML = new Parser(new Lexer(htmldetail));
                            NodeList dtnodeHTML    = dtlparserHTML.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("id", "zbtgHTML"), new TagNameFilter("td")));
                            HtmlTxt    = dtnodeHTML.AsHtml();
                            htmldetail = htmldetail.Replace("&nbsp;", "").Replace("</br>", "\r\n").Replace("<br>", "\r\n").Replace("<br/>", "\r\n");
                        }
                        catch (Exception ex) { continue; }
                        Parser   dtlparser = new Parser(new Lexer(htmldetail));
                        NodeList dtnode    = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("id", "zbtgHTML"), new TagNameFilter("td")));
                        if (dtnode != null && dtnode.Count > 0)
                        {
                            inviteCtx = HtmlTxt.ToCtxString().Replace("startprint", "");
                            TableTag table = dtnode[0] as TableTag;
                            if (table != null && table.RowCount > 0)
                            {
                                for (int t = 0; t < table.RowCount; t++)
                                {
                                    for (int c = 0; c < table.Rows[t].ColumnCount; c++)
                                    {
                                        if (table.Rows[t].Columns[c].ToPlainTextString().Replace(" ", "").Contains("招标人"))
                                        {
                                            if (string.IsNullOrEmpty(buildUnit))
                                            {
                                                buildUnit = table.Rows[t].Columns[c + 1].ToPlainTextString().Trim();
                                            }
                                        }
                                        else if (table.Rows[t].Columns[c].ToPlainTextString().Replace(" ", "").Contains("公告时间"))
                                        {
                                            if (string.IsNullOrEmpty(beginDate))
                                            {
                                                beginDate = table.Rows[t].Columns[c + 1].ToPlainTextString().Trim().Replace("年", "-").Replace("月", "-").Replace("日", "");
                                            }
                                        }
                                        else if (table.Rows[t].Columns[c].ToPlainTextString().Replace(" ", "").Contains("工程地点"))
                                        {
                                            if (string.IsNullOrEmpty(prjAddress))
                                            {
                                                prjAddress = table.Rows[t].Columns[c + 1].ToPlainTextString().Trim();
                                            }
                                        }
                                    }
                                }
                            }
                        }
                        if (string.IsNullOrEmpty(beginDate))
                        {
                            Regex regDate = new Regex(@"请于\d{4}年\d{1,2}月\d{1,2}日");
                            beginDate = regDate.Match(inviteCtx.Replace("\r\n", "").Replace(" ", "").Replace("\t", "")).Value.Replace("请于", "");
                        }
                        if (string.IsNullOrEmpty(beginDate))
                        {
                            Regex regDate = new Regex(@"请于\d{4}-\d{1,2}-\d{1,2}");
                            beginDate = regDate.Match(inviteCtx.Replace("\r\n", "").Replace(" ", "").Replace("\t", "")).Value.Replace("请于", "");
                        }
                        if (string.IsNullOrEmpty(beginDate))
                        {
                            if (inviteCtx.Length > 250)
                            {
                                Regex regDate = new Regex(@"\d{4}年\d{1,2}月\d{1,2}日");
                                beginDate = regDate.Match(inviteCtx.Substring(inviteCtx.Length - 250, 250).Replace("\r\n", "").Replace(" ", "").Replace("\t", "")).Value;
                            }
                        }
                        if (string.IsNullOrEmpty(beginDate))
                        {
                            if (inviteCtx.Length > 250)
                            {
                                Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
                                beginDate = regDate.Match(inviteCtx.Substring(inviteCtx.Length - 250, 250).Replace("\r\n", "").Replace(" ", "").Replace("\t", "")).Value;
                            }
                        }
                        if (string.IsNullOrEmpty(beginDate))
                        {
                            beginDate = DateTime.Now.ToString();
                        }
                        msgType    = "佛山市顺德区建设工程交易中心";
                        specType   = "建设工程";
                        inviteType = ToolHtml.GetInviteTypes(prjName);
                        InviteInfo info = ToolDb.GenInviteInfo("广东省", "佛山市区", "顺德区", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                        list.Add(info);
                        NodeList filenode = dtnode.SearchFor(typeof(ATag), true);
                        if (filenode != null && filenode.Count > 0)
                        {
                            for (int f = 0; f < filenode.Count; f++)
                            {
                                ATag fileTag = filenode[f] as ATag;
                                if (fileTag.IsAtagAttach())
                                {
                                    BaseAttach attach = ToolDb.GenBaseAttach(fileTag.ToPlainTextString().Trim(), info.Id, "http://www.sdcin.com.cn" + fileTag.Link);
                                    base.AttachList.Add(attach);
                                }
                            }
                        }
                        if (!crawlAll && list.Count >= this.MaxCount)
                        {
                            return(list);
                        }
                    }
                }
            }
            return(list);
        }
Exemplo n.º 16
0
        /// <summary>
        /// Walks the paged listing found under <c>SiteUrl</c>, downloads the detail page
        /// linked from every list entry and turns it into an <c>InviteInfo</c> record
        /// via <c>ToolDb.GenInviteInfo</c>.
        /// </summary>
        /// <param name="crawlAll">
        /// When false, crawling stops as soon as the result list holds at least
        /// <c>MaxCount</c> records; when true, every listed page is processed.
        /// </param>
        /// <returns>
        /// The records created so far; an empty list if the first listing page cannot be fetched.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList list = new ArrayList();
            // Number of listing pages to walk; stays 1 when no pager call can be parsed.
            int    pageInt = 1;
            string html    = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default).Replace("&nbsp;", "");
            }
            catch (Exception)
            {
                // Without the first page there is nothing to crawl.
                return(list);
            }
            Parser   parser = new Parser(new Lexer(html));
            NodeList sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "ny_21 tc")));

            if (sNode != null && sNode.Count > 0)
            {
                // The pager is rendered by a script call "createPageHTML(<pages>, ...)";
                // its first argument is used as the page count.
                string pageString = sNode.AsString().Trim();
                Regex  regexPage  = new Regex(@"createPageHTML\([^\)]+\)");
                Match  pageMatch  = regexPage.Match(pageString);
                string firstArg   = pageMatch.Value.Replace("createPageHTML(", "").Replace(")", "").Split(',')[0].Trim();
                int    parsedPages;
                if (int.TryParse(firstArg, out parsedPages))
                {
                    pageInt = parsedPages;
                }
            }

            // The extraction patterns do not depend on the item, so build them once.
            Regex regCtx       = new Regex(@"[\n]+");
            Regex regPrjAdd    = new Regex(@"(工程地点|工程地址|项目地址):[^\r\n]+[\r\n]{1}");
            Regex regCode      = new Regex(@"GMJ[0-9]+");
            Regex regbuildUnit = new Regex(@"(招标单位|招标人):[^\r\n]+[\r\n]{1}");

            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    // Page i (i > 1) is fetched from "index_<i-1>.html"; the first page was loaded above.
                    try { html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "index_" + (i - 1).ToString() + ".html", Encoding.Default); }
                    catch (Exception) { continue; }
                }
                parser = new Parser(new Lexer(html));
                sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "ny_22"))), new TagNameFilter("li")));
                if (sNode == null || sNode.Count == 0)
                {
                    continue;
                }

                for (int j = 0; j < sNode.Count; j++)
                {
                    string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, bidType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;
                    INode node = sNode[j];

                    // Skip entries that lack the expected link / second div instead of
                    // letting one malformed <li> abort the whole crawl.
                    if (node == null || node.Children == null)
                    {
                        continue;
                    }
                    NodeList anchors = node.Children.SearchFor(typeof(ATag), true);
                    NodeList divs    = node.Children.SearchFor(typeof(Div), true);
                    if (anchors == null || anchors.Count < 1 || divs == null || divs.Count < 2)
                    {
                        continue;
                    }
                    ATag aTag   = anchors[0] as ATag;
                    Div  divTag = divs[1] as Div;
                    if (aTag == null || divTag == null)
                    {
                        continue;
                    }

                    prjName = aTag.ToPlainTextString().Trim();
                    // Strip surrounding '[', ']' and spaces from the text of the second div.
                    beginDate = divTag.ToPlainTextString().Trim(new char[] { '[', ']', ' ' });

                    // Links containing "../" are resolved against the site root,
                    // all others against the listing directory.
                    string relativeLink = aTag.Link.Replace("../", "").Replace("./", "");
                    if (aTag.Link.Contains("../"))
                    {
                        InfoUrl = "http://ztb.gaoming.gov.cn/" + relativeLink;
                    }
                    else
                    {
                        InfoUrl = "http://ztb.gaoming.gov.cn/jsgc/zbxx/" + relativeLink;
                    }

                    string htmldetail = string.Empty;
                    try
                    {
                        // Download the detail page once and derive both variants from it:
                        // the raw markup of the content div (HtmlTxt) and a copy with
                        // <br> variants turned into line breaks for text extraction.
                        string rawDetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).Replace("&nbsp;", "");
                        Parser   dtlparserHTML = new Parser(new Lexer(rawDetail.Trim()));
                        NodeList dtnodeHTML    = dtlparserHTML.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "con_10 tl"), new TagNameFilter("div")));
                        HtmlTxt    = dtnodeHTML.AsHtml();
                        htmldetail = rawDetail.Replace("</br>", "\r\n").Replace("<br>", "\r\n").Replace("<br/>", "\r\n");
                    }
                    catch (Exception) { continue; }

                    Parser   dtlparser = new Parser(new Lexer(htmldetail));
                    NodeList dtnode    = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "con_10 tl"), new TagNameFilter("div")));

                    // Plain text of the content div with spaces removed and runs of
                    // newlines collapsed to a single CRLF.
                    inviteCtx = dtnode.AsString().Replace(" ", "");
                    inviteCtx = regCtx.Replace(inviteCtx, "\r\n");

                    prjAddress = regPrjAdd.Match(inviteCtx).Value.Replace("工程地点:", "").Replace("工程地址:", "").Replace("项目地址:", "").Replace(")", "").Replace("。", "").Trim();
                    code       = regCode.Match(inviteCtx).Value;
                    buildUnit  = regbuildUnit.Match(inviteCtx).Value.Replace("招标单位:", "").Replace("招标人:", "").Replace("。", "").Trim();

                    if (Encoding.Default.GetByteCount(buildUnit) > 150)
                    {
                        // The guard counts bytes but Substring counts characters: multi-byte
                        // text can exceed 150 bytes with fewer than 150 characters, so clamp
                        // the length to avoid ArgumentOutOfRangeException.
                        // NOTE(review): if the storage limit is 150 *bytes*, a byte-accurate
                        // truncation is needed here — confirm against the schema.
                        buildUnit = buildUnit.Substring(0, Math.Min(150, buildUnit.Length));
                    }
                    if (Encoding.Default.GetByteCount(prjAddress) > 200)
                    {
                        // Over-long matches are replaced by a fixed placeholder rather than truncated.
                        prjAddress = "见招标信息";
                    }
                    msgType  = "佛山市高明区建设工程交易中心";
                    specType = "建设工程";
                    // Classify by project name; the previous code passed the still-empty bidType.
                    bidType = ToolHtml.GetInviteTypes(prjName);
                    InviteInfo info = ToolDb.GenInviteInfo("广东省", "佛山市区", "高明区", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, bidType, specType, otherType, InfoUrl, HtmlTxt);
                    list.Add(info);
                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return(list);
                    }
                }
            }
            return(list);
        }