Beispiel #1
0
        /// <summary>
        /// Downloads the list page at <c>SiteUrl</c>, follows every further list page announced by
        /// the <c>yema</c> pager div, and turns each list entry into a <c>BidInfo</c> built from its
        /// detail page. Anchors in the detail content for which <c>IsAtagAttach()</c> is true are
        /// added to <c>base.AttachList</c>.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as the result holds at least <c>MaxCount</c> entries.
        /// </param>
        /// <returns>
        /// The <c>BidInfo</c> objects collected so far; an empty list when the first page cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list    = new ArrayList();
            int    pageInt = 1;
            string html    = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
            }
            catch (Exception)
            {
                // Best effort: without the first list page there is nothing to crawl.
                return list;
            }

            Parser   parser = new Parser(new Lexer(html));
            NodeList noList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "yema")));

            if (noList != null && noList.Count > 0)
            {
                // The pager text is searched for "/<total>页"; the markers are stripped and the
                // remainder parsed. Anything unparsable falls back to a single page.
                string pagerText = noList.AsString() ?? string.Empty;
                string total     = Regex.Match(pagerText, @"/[^页]+页").Value.Replace("页", "").Replace("/", "");
                if (!int.TryParse(total, out pageInt))
                {
                    pageInt = 1;
                }
            }

            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    // Page 1 is the document already downloaded from SiteUrl.
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl("http://www.gy-center.net/announce/list.jhtml?visi_id=&cid=97&chid=&gid=&thistype=&searchcid=&keyword=&action=yes&interval=&page=" + i.ToString(), Encoding.Default);
                    }
                    catch
                    {
                        // A list page that fails to download is skipped; later pages are still tried.
                        continue;
                    }
                }

                // List entries: <li> elements whose parent is a <ul> whose parent is div#tab01.
                parser = new Parser(new Lexer(html));
                NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "tab01"))), new TagNameFilter("ul"))), new TagNameFilter("li")));
                if (dtlList == null || dtlList.Count <= 0)
                {
                    continue;
                }

                // NOTE(review): the last <li> is never processed (Count - 1); presumably it is
                // not a data row - confirm against the live page.
                for (int j = 0; j < dtlList.Count - 1; j++)
                {
                    // These three are passed to GenBidInfo but never filled in by this crawler.
                    string endDate   = string.Empty;
                    string otherType = string.Empty;
                    string prjMgr    = string.Empty;

                    string itemText  = dtlList[j].ToPlainTextString();
                    string itemHtml  = dtlList[j].ToHtml();
                    string prjName   = ToolHtml.GetHtmlAtagValue("title", itemHtml);
                    string beginDate = ToolHtml.GetRegexDateTime(itemText);
                    string InfoUrl   = "http://www.gy-center.net/announce/" + ToolHtml.GetHtmlAtagValue("href", itemHtml);

                    string htlDtl;
                    try
                    {
                        htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
                        // Drop <script> blocks before parsing the detail page.
                        htlDtl = Regex.Replace(htlDtl, "(<script)[\\s\\S]*?(</script>)", "");
                    }
                    catch
                    {
                        // A detail page that cannot be fetched or cleaned is skipped.
                        continue;
                    }

                    parser = new Parser(new Lexer(htlDtl));
                    NodeList htlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "r_content_right_main")));
                    if (htlList == null || htlList.Count <= 0)
                    {
                        continue;
                    }

                    string HtmlTxt = htlList.ToHtml();
                    // Strip tags, then normalise whitespace and line breaks (order of replacements matters).
                    string bidCtx = Regex.Replace(HtmlTxt, "<[^>]*>", "").Replace("&nbsp;", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\t\t", "").Replace("\r\r", "\r").Replace("\n\n", "\n");
                    string bidType = ToolHtml.GetInviteTypes(prjName);

                    // Pair each header cell of the first MsoNormalTable with the cell below it
                    // ("header:value" per line). Used only as a fallback source for bidUnit.
                    string bidStr = string.Empty;
                    parser = new Parser(new Lexer(HtmlTxt));
                    NodeList bidList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "MsoNormalTable")));
                    if (bidList != null && bidList.Count > 0)
                    {
                        try
                        {
                            TableTag tab = bidList[0] as TableTag;
                            if (tab != null && tab.RowCount > 1 && tab.Rows[0].ColumnCount > 1)
                            {
                                // At most the first seven columns are considered.
                                int pairCount = Math.Min(tab.Rows[0].ColumnCount, 7);
                                for (int col = 0; col < pairCount; col++)
                                {
                                    bidStr += tab.Rows[0].Columns[col].ToPlainTextString().ToNodeString() + ":" + tab.Rows[1].Columns[col].ToPlainTextString().ToNodeString() + "\r\n";
                                }
                            }
                        }
                        catch
                        {
                            // Irregular tables (e.g. a shorter second row) are tolerated:
                            // whatever pairs were built before the failure are kept.
                        }
                    }

                    string buildUnit = ToolHtml.GetRegexString(bidCtx, ToolHtml.BuildRegex);
                    string code      = ToolHtml.GetRegexString(bidCtx, ToolHtml.CodeRegex);
                    string bidUnit   = ToolHtml.GetRegexString(bidCtx, ToolHtml.BidRegex);
                    if (string.IsNullOrEmpty(bidUnit))
                    {
                        // Fallback 1: look in the header/value pairs taken from the table.
                        bidUnit = ToolHtml.GetRegexString(bidStr.Replace("  ", ""), ToolHtml.BidRegex, false);
                    }
                    if (string.IsNullOrEmpty(bidUnit))
                    {
                        // Fallback 2: text between "确认" and "为" in the plain content.
                        bidUnit = bidCtx.GetRegexBegEnd("确认", "为");
                    }

                    string bidMoney = ToolHtml.GetRegexString(bidCtx, ToolHtml.MoneyRegex);
                    bidMoney = ToolHtml.GetRegexMoney(bidMoney);
                    if (string.IsNullOrEmpty(bidMoney) || bidMoney == "0")
                    {
                        // Fallback: amount written between "¥" and "元".
                        bidMoney = bidCtx.GetRegexBegEnd("¥", "元").GetMoney();
                    }

                    buildUnit = ToolHtml.GetSubString(buildUnit, 150);
                    code      = ToolHtml.GetSubString(code, 50);
                    bidUnit   = ToolHtml.GetSubString(bidUnit, 150);

                    bidUnit   = ToolHtml.GetStringTemp(bidUnit);
                    buildUnit = ToolHtml.GetStringTemp(buildUnit);

                    if (string.IsNullOrEmpty(code))
                    {
                        code = "见中标信息";
                    }

                    string specType = "其他";
                    string msgType  = "工网在线";
                    BidInfo info = ToolDb.GenBidInfo("广东省", "电网专项工程", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType,
                                                     bidMoney, InfoUrl, prjMgr, HtmlTxt);
                    list.Add(info);

                    // Record attachment links found inside the detail content.
                    parser = new Parser(new Lexer(HtmlTxt));
                    NodeList nodeAtag = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                    if (nodeAtag != null && nodeAtag.Count > 0)
                    {
                        for (int c = 0; c < nodeAtag.Count; c++)
                        {
                            ATag a = nodeAtag[c] as ATag;
                            if (a == null)
                            {
                                // Not an anchor tag object; skip it instead of aborting the whole crawl.
                                continue;
                            }
                            if (a.Link.IsAtagAttach())
                            {
                                // NOTE(review): attachment host differs from the crawled host
                                // (gy-center.net) - confirm this base URL is intended.
                                string     alink  = "http://www.bidding.csg.cn/" + a.Link;
                                BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText.Replace("&nbsp", ""), info.Id, alink);
                                base.AttachList.Add(attach);
                            }
                        }
                    }

                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return list;
                    }
                }
            }
            return list;
        }
Beispiel #2
0
        /// <summary>
        /// Downloads the list page at <c>SiteUrl</c> and the following <c>index_N.html</c> pages,
        /// and turns each list entry into an <c>InviteInfo</c> built from its detail page.
        /// Anchors in the detail content for which <c>IsAtagAttach()</c> is true are added to
        /// <c>base.AttachList</c>.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as the result holds at least <c>MaxCount</c> entries.
        /// </param>
        /// <returns>
        /// The <c>InviteInfo</c> objects collected so far; an empty list when the first page cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list    = new ArrayList();
            int    pageInt = 1;
            string html    = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
            }
            catch (Exception)
            {
                // Best effort: without the first list page there is nothing to crawl.
                return list;
            }

            Parser   parser = new Parser(new Lexer(html.GetJsString()));
            NodeList sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "foot")));

            if (sNode != null && sNode.Count > 0)
            {
                // The footer text is searched for "共<total>页". The markers must be stripped
                // before parsing; parsing the raw match could never succeed. If the number is
                // still unparsable, keep the historical fallback of 18 pages.
                string footText = (sNode.AsString() ?? string.Empty).Trim();
                string total    = Regex.Match(footText, @"共[^页]+页").Value.Replace("共", "").Replace("页", "").Trim();
                if (!int.TryParse(total, out pageInt))
                {
                    pageInt = 18;
                }
                if (pageInt < 1)
                {
                    // Always process at least the page that was already downloaded.
                    pageInt = 1;
                }
            }

            // Page 1 is the document from SiteUrl; page i (i > 1) lives at index_(i-1).html.
            // Starting at 1 avoids parsing the first page twice.
            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl("http://www.haizhu.gov.cn/site/jsj/bszx/ztbgl/index_" + (i - 1).ToString() + ".html", Encoding.UTF8);
                    }
                    catch
                    {
                        // A list page that fails to download is skipped; later pages are still tried.
                        continue;
                    }
                }

                // List entries: <li> elements whose parent is a <ul> whose parent is a div with class "body".
                parser = new Parser(new Lexer(html));
                NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "body"))), new TagNameFilter("ul"))), new TagNameFilter("li")));
                if (nodeList == null || nodeList.Count <= 0)
                {
                    continue;
                }

                // NOTE(review): the first seven <li> matches are skipped; presumably they are
                // not data rows - confirm against the live page.
                for (int j = 7; j < nodeList.Count; j++)
                {
                    // These three are passed to GenInviteInfo but never filled in by this crawler.
                    string endDate   = string.Empty;
                    string remark    = string.Empty;
                    string otherType = string.Empty;

                    INode  iNode     = nodeList[j];
                    string itemHtml  = iNode.ToHtml();
                    string prjName   = ToolHtml.GetHtmlAtagValue("title", itemHtml, null, 1);
                    string beginDate = iNode.ToPlainTextString().GetDateRegex();

                    // Resolve the entry's link once. An entry without a usable anchor is skipped
                    // instead of letting a null dereference abort the whole crawl.
                    string rawLink = null;
                    try
                    {
                        rawLink = itemHtml.GetATag(1).Link;
                    }
                    catch
                    {
                    }
                    if (rawLink == null)
                    {
                        continue;
                    }

                    // First six characters of the link after the "./jsgczbgl/" prefix; used below
                    // as the sub-folder for attachment URLs. Shorter values are kept whole.
                    string aLinks = rawLink.Replace("./jsgczbgl/", "");
                    if (aLinks.Length >= 6)
                    {
                        aLinks = aLinks.Substring(0, 6);
                    }

                    string InfoUrl    = "http://www.haizhu.gov.cn/site/jsj/bszx/ztbgl/" + rawLink.Replace("./", "");
                    string inviteType = prjName.GetInviteBidType();

                    string htlDtl;
                    try
                    {
                        htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                        htlDtl = htlDtl.GetJsString();
                    }
                    catch
                    {
                        // A detail page that cannot be fetched or cleaned is skipped.
                        continue;
                    }

                    parser = new Parser(new Lexer(htlDtl));
                    NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "img")));
                    if (dtlList == null || dtlList.Count <= 0)
                    {
                        continue;
                    }

                    string HtmlTxt   = dtlList.ToHtml();
                    // Turn paragraph tags into line breaks before converting to plain content.
                    string inviteCtx = HtmlTxt.Replace("<p>", "\r\n").Replace("</p>", "\r\n").ToCtxString();

                    string prjAddress = inviteCtx.GetAddressRegex();
                    string buildUnit  = inviteCtx.GetBuildRegex();
                    string code       = inviteCtx.GetCodeRegex();
                    string msgType    = "广州市海珠区建设和园林绿化局";
                    string specType   = "建设工程";
                    InviteInfo info = ToolDb.GenInviteInfo("广东省", "广州市区", "海珠区", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                    list.Add(info);

                    // Record attachment links found inside the detail content.
                    parser = new Parser(new Lexer(HtmlTxt));
                    NodeList aList = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                    if (aList != null && aList.Count > 0)
                    {
                        for (int c = 0; c < aList.Count; c++)
                        {
                            ATag a = aList[c] as ATag;
                            if (a == null)
                            {
                                // Not an anchor tag object; skip it instead of aborting the whole crawl.
                                continue;
                            }
                            if (a.Link.IsAtagAttach())
                            {
                                string     alink  = "http://www.haizhu.gov.cn/site/jsj/bszx/ztbgl/jsgczbgl/" + aLinks + "/" + a.Link.Replace("./", "");
                                BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText.Replace("&nbsp", "").Replace(";", ""), info.Id, alink);
                                base.AttachList.Add(attach);
                            }
                        }
                    }

                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return list;
                    }
                }
            }
            return list;
        }
Beispiel #3
0
 /// <summary>
 /// Gets an attribute value of an anchor (ATag) from the HTML of the given node list.
 /// Delegates to <c>ToolHtml.GetHtmlAtagValue</c> with the list rendered via <c>ToHtml()</c>.
 /// </summary>
 /// <param name="value">Node list whose HTML is searched.</param>
 /// <param name="strName">Name of the anchor attribute to read; defaults to <c>href</c>.</param>
 /// <param name="i">Forwarded unchanged as the last argument of <c>ToolHtml.GetHtmlAtagValue</c>; presumably selects which anchor to read - confirm against that helper.</param>
 /// <returns>Whatever <c>ToolHtml.GetHtmlAtagValue</c> returns for these arguments.</returns>
 public static string GetATagValue(this NodeList value, string strName = "href", int i = 0)
 {
     string markup = value.ToHtml();
     return ToolHtml.GetHtmlAtagValue(strName, markup, null, i);
 }
Beispiel #4
0
 /// <summary>
 /// Gets an attribute value of an anchor (ATag) from the given HTML string.
 /// Delegates to <c>ToolHtml.GetHtmlAtagValue</c>.
 /// </summary>
 /// <param name="value">HTML text that is searched.</param>
 /// <param name="strName">Name of the anchor attribute to read; defaults to <c>href</c>.</param>
 /// <param name="i">Forwarded unchanged as the last argument of <c>ToolHtml.GetHtmlAtagValue</c>; presumably selects which anchor to read - confirm against that helper.</param>
 /// <returns>Whatever <c>ToolHtml.GetHtmlAtagValue</c> returns for these arguments.</returns>
 public static string GetATagValue(this string value, string strName = "href", int i = 0)
 {
     string attributeValue = ToolHtml.GetHtmlAtagValue(strName, value, null, i);
     return attributeValue;
 }
Beispiel #5
0
        /// <summary>
        /// Downloads the list page at <c>SiteUrl</c>, follows every further list page announced by
        /// the <c>yema</c> pager div, and turns each list entry into an <c>InviteInfo</c> built
        /// from its detail page.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as the result holds at least <c>MaxCount</c> entries.
        /// </param>
        /// <returns>
        /// The <c>InviteInfo</c> objects collected so far; an empty list when the first page cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  results   = new ArrayList();
            int    pageCount = 1;
            string pageHtml;

            try
            {
                pageHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
            }
            catch (Exception)
            {
                // Without the first list page there is nothing to crawl.
                return results;
            }

            Parser   parser     = new Parser(new Lexer(pageHtml));
            NodeList pagerNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "yema")));

            if (pagerNodes != null && pagerNodes.Count > 0)
            {
                // The pager text is searched for "/<total>页"; any failure falls back to one page.
                string pagerText = pagerNodes.AsString();
                try
                {
                    Match totalMatch = new Regex(@"/[^页]+页").Match(pagerText);
                    pageCount = Convert.ToInt32(totalMatch.Value.Replace("页", "").Replace("/", ""));
                }
                catch
                {
                    pageCount = 1;
                }
            }

            for (int page = 1; page <= pageCount; page++)
            {
                // Page 1 is the document already downloaded from SiteUrl.
                if (page > 1)
                {
                    try
                    {
                        pageHtml = this.ToolWebSite.GetHtmlByUrl("http://www.gy-center.net/announce/list.jhtml?visi_id=&cid=76&chid=&gid=&thistype=&searchcid=&keyword=&action=yes&interval=&page=" + page.ToString(), Encoding.Default);
                    }
                    catch
                    {
                        // A list page that fails to download is skipped.
                        continue;
                    }
                }

                // List entries: <li> elements whose parent is a <ul> whose parent is div#tab01.
                parser = new Parser(new Lexer(pageHtml));
                NodeList entries = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "tab01"))), new TagNameFilter("ul"))), new TagNameFilter("li")));
                if (entries == null || entries.Count <= 0)
                {
                    continue;
                }

                // The loop bound deliberately mirrors the original: the last <li> is not visited.
                for (int row = 0; row < entries.Count - 1; row++)
                {
                    // Passed to GenInviteInfo but never filled in by this crawler.
                    string endDate   = string.Empty;
                    string remark    = string.Empty;
                    string otherType = string.Empty;

                    string entryText = entries[row].ToPlainTextString();
                    string entryHtml = entries[row].ToHtml();
                    string prjName   = ToolHtml.GetHtmlAtagValue("title", entryHtml);
                    string beginDate = ToolHtml.GetRegexDateTime(entryText);
                    string InfoUrl   = "http://www.gy-center.net/announce/" + ToolHtml.GetHtmlAtagValue("href", entryHtml);

                    string detailHtml;
                    try
                    {
                        detailHtml = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
                        // Drop <script> blocks before parsing the detail page.
                        detailHtml = Regex.Replace(detailHtml, "(<script)[\\s\\S]*?(</script>)", "");
                    }
                    catch
                    {
                        // A detail page that cannot be fetched or cleaned is skipped.
                        continue;
                    }

                    parser = new Parser(new Lexer(detailHtml));
                    NodeList contentNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "r_content_right_main")));
                    if (contentNodes == null || contentNodes.Count <= 0)
                    {
                        continue;
                    }

                    string HtmlTxt = contentNodes.ToHtml();

                    // Strip tags, then normalise whitespace and line breaks (order of replacements matters).
                    string inviteCtx = Regex.Replace(HtmlTxt, "<[^>]*>", "");
                    inviteCtx = inviteCtx.Replace("&nbsp;", "").Replace(" ", "").Replace("\t\t", "");
                    inviteCtx = inviteCtx.Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");

                    string inviteType = ToolHtml.GetInviteTypes(prjName);
                    string prjAddress = ToolHtml.GetRegexString(inviteCtx, ToolHtml.AddressRegex);
                    string buildUnit  = ToolHtml.GetRegexString(inviteCtx, ToolHtml.BuildRegex);
                    string code       = ToolHtml.GetRegexString(inviteCtx, ToolHtml.CodeRegex);

                    // Truncate to the lengths passed to GetSubString.
                    prjAddress = ToolHtml.GetSubString(prjAddress, 150);
                    buildUnit  = ToolHtml.GetSubString(buildUnit, 150);
                    code       = ToolHtml.GetSubString(code, 50);

                    // Placeholder values for fields that could not be extracted.
                    if (string.IsNullOrEmpty(code))
                    {
                        code = "见招标信息";
                    }
                    if (string.IsNullOrEmpty(prjAddress))
                    {
                        prjAddress = "见招标信息";
                    }
                    if (string.IsNullOrEmpty(buildUnit))
                    {
                        buildUnit = "工网在线";
                    }

                    string specType = "其他";
                    string msgType  = "工网在线";

                    InviteInfo info = ToolDb.GenInviteInfo("广东省", "电网专项工程", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                    results.Add(info);

                    bool limitReached = results.Count >= this.MaxCount;
                    if (!crawlAll && limitReached)
                    {
                        return results;
                    }
                }
            }
            return results;
        }
Beispiel #6
0
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList list = new ArrayList();
            //取得页码
            int    pageInt = 1;
            string html    = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8);
            }
            catch (Exception ex)
            {
                return(list);
            }
            Parser   parser = new Parser(new Lexer(html));
            NodeList sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new TagNameFilter("div")), new HasAttributeFilter("id", "page_div")));

            if (sNode != null && sNode.Count > 0)
            {
                string page = ToolHtml.GetRegexString(sNode.AsString(), "共", "页");
                try
                {
                    pageInt = int.Parse(page);
                }
                catch
                {
                    pageInt = 7;
                }
            }
            parser.Reset();
            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl("http://www.conghua.gov.cn/zgch/zbzb/list_" + i.ToString() + ".shtml", Encoding.Default);
                    }
                    catch (Exception ex)
                    {
                        continue;
                    }
                }
                parser = new Parser(new Lexer(html));
                sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "list_list"))), new TagNameFilter("table")));
                if (sNode != null && sNode.Count > 0)
                {
                    TableTag table = sNode[0] as TableTag;
                    for (int j = 0; j < table.RowCount; j++)
                    {
                        TableRow tr          = table.Rows[j];
                        string   projectName = ToolHtml.GetHtmlAtagValue("title", tr.ToHtml());
                        if (!projectName.Contains("中标") && !projectName.Contains("结果") && !projectName.Contains("候选单位公示"))
                        {
                            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty,
                                   prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty,
                                   specType = string.Empty, beginDate = string.Empty, endDate = string.Empty,
                                   remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty,
                                   CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;
                            prjName    = projectName;
                            inviteType = ToolHtml.GetInviteTypes(projectName);
                            beginDate  = ToolHtml.GetRegexDateTime(tr.Columns[1].ToPlainTextString());
                            InfoUrl    = "http://www.conghua.gov.cn" + ToolHtml.GetHtmlAtagValue("href", tr.ToHtml()).Replace("..", "");
                            string htmlDtl = string.Empty;
                            try
                            {
                                htmlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                                htmlDtl = ToolHtml.GetRegexHtlTxt(htmlDtl);
                            }
                            catch { continue; }
                            parser = new Parser(new Lexer(htmlDtl));
                            NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "zoomcon")));
                            if (dtlList != null && dtlList.Count > 0)
                            {
                                HtmlTxt   = dtlList.ToHtml();
                                inviteCtx = dtlList.AsString().Replace("&nbsp;", "");

                                buildUnit = ToolHtml.GetRegexString(inviteCtx, ToolHtml.BuildRegex, true);
                                if (!string.IsNullOrEmpty(buildUnit) && buildUnit.Contains(" "))
                                {
                                    buildUnit = buildUnit.Remove(buildUnit.IndexOf(" "));
                                }

                                buildUnit  = ToolHtml.GetSubString(buildUnit, 150);
                                msgType    = "广州建设工程交易中心";
                                specType   = "建设工程";
                                inviteType = inviteType == "" ? "小型工程" : inviteType;
                                if (string.IsNullOrEmpty(buildUnit))
                                {
                                    buildUnit = "广州建设工程交易中心";
                                }
                                InviteInfo info = ToolDb.GenInviteInfo("广东省", "广州市区", "从化市", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                                list.Add(info);
                                if (!crawlAll && list.Count >= this.MaxCount)
                                {
                                    return(list);
                                }
                            }
                        }
                        else
                        {
                            string prjName = string.Empty,
                                   buildUnit = string.Empty, bidUnit = string.Empty,
                                   bidMoney = string.Empty, code = string.Empty,
                                   bidDate = string.Empty,
                                   beginDate = string.Empty,
                                   endDate = string.Empty, bidType = string.Empty,
                                   specType = string.Empty, InfoUrl = string.Empty,
                                   msgType = string.Empty, bidCtx = string.Empty,
                                   prjAddress = string.Empty, remark = string.Empty,
                                   prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;
                            prjName   = projectName;
                            bidType   = ToolHtml.GetInviteTypes(projectName);
                            beginDate = ToolHtml.GetRegexDateTime(tr.Columns[1].ToPlainTextString());
                            InfoUrl   = "http://www.conghua.gov.cn" + ToolHtml.GetHtmlAtagValue("href", tr.ToHtml()).Replace("..", "");
                            string htmlDtl = string.Empty;
                            try
                            {
                                htmlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                                htmlDtl = ToolHtml.GetRegexHtlTxt(htmlDtl);
                            }
                            catch { continue; }
                            parser = new Parser(new Lexer(htmlDtl));
                            NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "zoomcon")));
                            if (dtlList != null && dtlList.Count > 0)
                            {
                                HtmlTxt   = dtlList.ToHtml();
                                bidCtx    = dtlList.AsString();
                                buildUnit = ToolHtml.GetRegexString(bidCtx, ToolHtml.BuildRegex, true);
                                buildUnit = ToolHtml.GetSubString(buildUnit, 150);
                                msgType   = "广州建设工程交易中心";
                                specType  = "建设工程";
                                bidType   = bidType == "" ? bidType : "小型工程";

                                parser = new Parser(new Lexer(HtmlTxt));
                                NodeList bidNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                                if (bidNode != null && bidNode.Count > 0)
                                {
                                    string   ctx      = string.Empty;
                                    TableTag bidTable = bidNode[0] as TableTag;
                                    try
                                    {
                                        for (int r = 0; r < bidTable.RowCount; r++)
                                        {
                                            ctx += bidTable.Rows[r].Columns[0].ToNodePlainString() + ":";
                                            ctx += bidTable.Rows[r].Columns[1].ToNodePlainString() + "\r\n";
                                        }
                                    }
                                    catch { }

                                    bidUnit  = ctx.GetRegex("单位名称,承包意向人名称");
                                    bidMoney = ctx.GetMoneyRegex();
                                    prjMgr   = ctx.GetMgrRegex();
                                    if (prjMgr.Contains("/"))
                                    {
                                        prjMgr = prjMgr.Remove(prjMgr.IndexOf("/"));
                                    }
                                }

                                if (string.IsNullOrEmpty(buildUnit))
                                {
                                    buildUnit = "广州建设工程交易中心";
                                }
                                BidInfo info = ToolDb.GenBidInfo("广东省", "广州市区", "从化市", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType,
                                                                 bidMoney, InfoUrl, prjMgr, HtmlTxt);
                                list.Add(info);
                                if (!crawlAll && list.Count >= this.MaxCount)
                                {
                                    return(list);
                                }
                            }
                        }
                    }
                }
            }
            return(list);
        }