Beispiel #1
        /// <summary>
        /// Crawls the listing at <c>SiteUrl</c> page by page, downloads the detail HTML of every
        /// list entry, extracts the bid fields from it and builds a <c>BidInfo</c> through
        /// <c>ToolDb.GenBidInfo</c>.
        /// </summary>
        /// <param name="crawlAll">When false, the visible code compares <c>list.Count</c> with <c>MaxCount</c> after each record; the action taken is not visible in this excerpt.</param>
        /// <returns>The only return visible here is the (still empty) <c>list</c> when the first page download fails.</returns>
        // NOTE(review): this excerpt shows no braces and no `try` keywords (several `catch` clauses
        // have no visible `try`), and the method ends before any closing statement is visible.
        protected override IList ExecuteCrawl(bool crawlAll)
            IList  list            = new List <BidInfo>();
            int    pageInt         = 1;
            string html            = string.Empty;
            // viewState, eventValidation and cookiestr are not referenced in the visible part of this method.
            string viewState       = string.Empty;
            string eventValidation = string.Empty;
            string cookiestr       = string.Empty;

            // Download the first listing page; the catch below returns the empty list on failure.
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
            catch { return(list); }
            Parser   parser   = new Parser(new Lexer(html));
            // Pager links: <a> tags matched through a HasParentFilter on <div class="pagination f_right">.
            NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagination f_right")), true), new TagNameFilter("a")));

            if (pageNode != null && pageNode.Count > 0)
                    // Page count is parsed from the onclick attribute of the LAST pager link
                    // (text between "Info" and ","; GetReplace("(") presumably strips the parenthesis - TODO confirm).
                    string temp = pageNode[pageNode.Count - 1].GetATagValue("onclick").GetRegexBegEnd("Info", ",").GetReplace("(");
                    pageInt = int.Parse(temp);
                // Parse failure is swallowed: pageInt keeps its initial value of 1.
                catch { }
            for (int i = 1; i <= pageInt; i++)
                // Page 1 is already in `html`; later pages are fetched with pageSize=15 and pageNum=i.
                if (i > 1)
                        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&pageSize=15&pageNum=" + i);
                    // A page that fails to download is skipped.
                    catch { continue; }
                parser = new Parser(new Lexer(html));
                // List entries: <li> tags matched through a HasParentFilter on <div class="newsList">.
                NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "newsList")), true), new TagNameFilter("li")));
                if (listNode != null && listNode.Count > 0)
                    for (int j = 0; j < listNode.Count; j++)
                        INode node = listNode[j];
                        ATag  aTag = node.GetATag();
                        // NOTE(review): the statement guarded by this null check is not visible in this excerpt
                        // (presumably `continue;`); aTag is dereferenced right below.
                        if (aTag == null)
                        string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty, code = string.Empty,
                               bidDate = string.Empty, beginDate = string.Empty, endDate = string.Empty, bidType = string.Empty, specType = string.Empty, InfoUrl = string.Empty, msgType = string.Empty, bidCtx = string.Empty, prjAddress = string.Empty, remark = string.Empty, prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty, area = string.Empty;
                        // Area is the text between the first pair of 【 】 after the two GetReplace calls on the entry text.
                        area      = node.ToNodePlainString().GetReplace("[", "【").GetReplace("]", "】").GetRegexBegEnd("【", "】");
                        prjName   = aTag.GetAttribute("title");
                        beginDate = node.ToPlainTextString().GetDateRegex();
                        InfoUrl   = "" + aTag.Link.GetReplace("amp;");
                        // id = everything in the link from "id=" to the end, then passed through GetReplace("id=").
                        // NOTE(review): IndexOf returns -1 when the link has no "id=", which makes Substring throw.
                        string id      = aTag.Link.Substring(aTag.Link.IndexOf("id="), aTag.Link.Length - aTag.Link.IndexOf("id=")).GetReplace("id=");
                        string htmldtl = string.Empty;
                            // NOTE(review): the name/value arrays and the request URL are empty in this excerpt;
                            // `id` is presumably one of the posted values - TODO confirm against the original.
                            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] {
                            }, new string[] {
                            htmldtl = this.ToolWebSite.GetHtmlByUrl("", nvc).GetJsString();
                        // Detail download failure is swallowed; htmldtl then stays empty and is still parsed below.
                        catch { }
                        parser = new Parser(new Lexer(htmldtl));
                        NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "new_detail")));
                        if (dtlNode != null && dtlNode.Count > 0)
                            HtmlTxt   = dtlNode.AsHtml();
                            // Plain-text context is derived from the lower-cased detail HTML.
                            bidCtx    = HtmlTxt.ToLower().GetReplace("</p>,<br/>,<br>", "\r\n").ToCtxString();
                            buildUnit = bidCtx.GetBuildRegex();
                            code      = bidCtx.GetCodeRegex().GetCodeDel();

                            // Winning bidder: generic extractor first, then a label-based fallback.
                            bidUnit = bidCtx.GetBidRegex();
                            if (string.IsNullOrEmpty(bidUnit))
                                bidUnit = bidCtx.GetRegex("中标候选人名称,中签单位,第一成交候选人,成交候选人");
                            // Bid amount: three attempts, each only when the previous one gave "0" or nothing.
                            bidMoney = bidCtx.GetMoneyRegex();
                            if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                                bidMoney = bidCtx.GetMoneyRegex(null, true);
                            if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                                bidMoney = bidCtx.GetRegex("总额").GetMoney();
                            prjMgr = bidCtx.GetMgrRegex();
                            // Fallback when no bidder was found in the text: scan the <table> tags of the detail HTML.
                            if (string.IsNullOrEmpty(bidUnit))
                                parser = new Parser(new Lexer(HtmlTxt));
                                NodeList tableNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                                if (tableNode != null && tableNode.Count > 0)
                                    for (int t = 0; t < tableNode.Count; t++)
                                        // NOTE(review): `as TableTag` is not null-checked before tag.GetAttribute is called.
                                        TableTag tag      = tableNode[t] as TableTag;
                                        string   classStr = tag.GetAttribute("class");
                                        // NOTE(review): the statement guarded by this class test is not visible in this excerpt.
                                        if (!string.IsNullOrEmpty(classStr) && classStr.ToLower().Contains("table_detail"))

                                        // Flatten the table into text: cell text followed by ":" or by a line break
                                        // depending on column parity (the `else` between the two appends is not visible here).
                                        string ctx = string.Empty;
                                        for (int r = 0; r < tag.RowCount; r++)
                                            for (int c = 0; c < tag.Rows[r].ColumnCount; c++)
                                                string temp = tag.Rows[r].Columns[c].ToNodePlainString().GetReplace(":,:");
                                                if ((c + 1) % 2 == 0)
                                                    ctx += temp + "\r\n";
                                                    ctx += temp + ":";
                                        bidUnit = ctx.GetBidRegex();
                                        if (string.IsNullOrEmpty(bidUnit))
                                            bidUnit = ctx.GetRegex("成交候选人,中标单位名称,第一中标候选人,第一成交候选人");
                                        if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                                            bidMoney = ctx.GetMoneyRegex();
                                        if (string.IsNullOrEmpty(prjMgr))
                                            prjMgr = ctx.GetMgrRegex();
                                        if (string.IsNullOrEmpty(prjMgr))
                                            prjMgr = ctx.GetRegex("拟任总监,拟任项目经理");

                                        // Second table layout: when the bidder still lacks "公司", pair the cells of
                                        // row index 4 with the cells of row index 5, starting at column 1.
                                        // NOTE(review): row indices 4 and 5 are hard-coded; the `catch { }` further down
                                        // presumably absorbs out-of-range errors on smaller tables.
                                        if (!bidUnit.Contains("公司"))
                                            ctx = string.Empty;
                                                for (int r = 1; r < tag.Rows[4].ColumnCount; r++)
                                                    string temp = tag.Rows[4].Columns[r].ToNodePlainString().GetReplace(":,:");
                                                    ctx += temp + ":";
                                                    ctx += tag.Rows[5].Columns[r].ToNodePlainString().GetReplace(":,:") + "\r\n";
                                                bidUnit = ctx.GetBidRegex(null, true, 200);
                                                if (string.IsNullOrEmpty(bidUnit))
                                                    bidUnit = ctx.GetRegex("成交候选人,中标单位名称,第一中标候选人,第一成交候选人");
                                                if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                                                    bidMoney = ctx.GetMoneyRegex();
                                                if (string.IsNullOrEmpty(prjMgr))
                                                    prjMgr = ctx.GetMgrRegex();
                                                if (string.IsNullOrEmpty(prjMgr))
                                                    prjMgr = ctx.GetRegex("拟任总监,拟任项目经理");
                                            catch { }
                                // Amounts above 100000 are divided by 10000.
                                // Convert.ToDecimal throws on a non-numeric string (e.g. empty); the `catch { }`
                                // below presumably absorbs that, leaving bidMoney unchanged.
                                if (Convert.ToDecimal(bidMoney) > 100000)
                                    bidMoney = (decimal.Parse(bidMoney) / 10000).ToString();
                            catch { }
                            // Cut the project-manager text at the first occurrence of each marker below.
                            if (prjMgr.Contains("联系"))
                                prjMgr = prjMgr.Remove(prjMgr.IndexOf("联系"));
                            if (prjMgr.Contains("电话"))
                                prjMgr = prjMgr.Remove(prjMgr.IndexOf("电话"));
                            if (prjMgr.Contains("2"))
                                prjMgr = prjMgr.Remove(prjMgr.IndexOf("2"));
                            // NOTE(review): the next two checks are identical in this excerpt (both test "(");
                            // one of them may originally have used a different parenthesis character - TODO confirm.
                            if (prjMgr.Contains("("))
                                prjMgr = prjMgr.Remove(prjMgr.IndexOf("("));
                            if (prjMgr.Contains("("))
                                prjMgr = prjMgr.Remove(prjMgr.IndexOf("("));
                            if (prjMgr.Contains("二"))
                                prjMgr = prjMgr.Remove(prjMgr.IndexOf("二"));
                            if (prjMgr.Contains("注册"))
                                prjMgr = prjMgr.Remove(prjMgr.IndexOf("注册"));
                            if (prjMgr.Contains("业绩"))
                                prjMgr = prjMgr.Remove(prjMgr.IndexOf("业绩"));
                            if (prjMgr.Contains("I"))
                                prjMgr = prjMgr.Remove(prjMgr.IndexOf("I"));
                            // A value that mentions "投标" or is purely numeric is discarded.
                            if (prjMgr.Contains("投标") || prjMgr.IsNumber())
                                prjMgr = "";
                            // Truncate the bidder name right after the first "公司".
                            if (bidUnit.Contains("公司"))
                                bidUnit = bidUnit.Remove(bidUnit.IndexOf("公司")) + "公司";
                            bidUnit  = bidUnit.GetReplace("名称,1,、I标段");
                            prjMgr   = prjMgr.GetReplace("1,、,一,第一中标人,第一中标,第中标人,第名,I标段,第中标候选人,标段").GetCodeDel();
                            specType = bidType = "建设工程";
                            msgType  = "安徽省发展和改革委员会";
                            // beginDate is passed in two argument positions; the local bidDate is never assigned
                            // in the visible code. `id` computed above is not used in any visible line.
                            BidInfo info = ToolDb.GenBidInfo("安徽省", "安徽省及地市", area, string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);

                            // NOTE(review): the excerpt ends here; neither an insertion of `info` into `list`
                            // nor the statement guarded by this limit check is visible.
                            if (!crawlAll && list.Count >= this.MaxCount)
Beispiel #2
        /// <summary>
        /// Crawls an ASP.NET-style paged listing at <c>SiteUrl</c>, downloads each row's detail page,
        /// flattens the detail table into text, extracts the bid fields and builds a <c>BidInfo</c>
        /// plus <c>BaseAttach</c> objects for attachment links.
        /// </summary>
        /// <param name="crawlAll">When false, the visible code compares <c>list.Count</c> with <c>MaxCount</c> after each record; the action taken is not visible in this excerpt.</param>
        /// <returns>The only return visible here is the (still empty) <c>list</c> when the first page download fails.</returns>
        // NOTE(review): this excerpt shows no braces and no `try` keywords (several `catch` clauses
        // have no visible `try`), and the method ends before any closing statement is visible.
        protected override IList ExecuteCrawl(bool crawlAll)
            IList  list            = new List <BidInfo>();
            int    pageInt         = 1;
            string html            = string.Empty;
            string viewState       = string.Empty;
            string eventValidation = string.Empty;
            string cookiestr       = string.Empty;

            // First page is fetched with Encoding.Default; cookiestr is passed by ref and reused for later posts.
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default, ref cookiestr);
            catch { return(list); }
            Parser   parser   = new Parser(new Lexer(html));
            // Page count is read from <span id="lblPageCount">.
            NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "lblPageCount")));

            if (pageNode != null && pageNode.Count > 0)
                    string temp = pageNode[0].ToNodePlainString();
                    pageInt = int.Parse(temp);
                // Parse failure is swallowed: pageInt keeps its initial value of 1.
                catch { }
            for (int i = 1; i <= pageInt; i++)
                if (i > 1)
                    // Later pages are requested by posting a form built from the PREVIOUS page's
                    // view-state and event-validation values.
                    viewState       = this.ToolWebSite.GetAspNetViewState(html);
                    eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
                    // NOTE(review): the field-name array is empty in this excerpt and the value array shows
                    // only empty strings plus "5" and "12"; viewState, eventValidation and the page index
                    // are presumably among the stripped values - TODO confirm against the original.
                    NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] {
                    }, new string[] {
                        "", "", "",
                        "", "", "",
                        "5", "12"
                        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.Default, ref cookiestr);
                    // A page that fails to download is skipped; `html` then still holds the previous page.
                    catch { continue; }
                parser = new Parser(new Lexer(html));
                NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "DataGrid1")));
                if (listNode != null && listNode.Count > 0)
                    // NOTE(review): `as TableTag` is not null-checked before table.RowCount is read.
                    TableTag table = listNode[0] as TableTag;
                    for (int j = 0; j < table.RowCount; j++)
                        TableRow tr   = table.Rows[j];
                        // The link is taken from the second cell; assumes every row has at least two columns.
                        ATag     aTag = tr.Columns[1].GetATag();
                        // NOTE(review): the statement guarded by this null check is not visible in this excerpt
                        // (presumably `continue;`); aTag is dereferenced right below.
                        if (aTag == null)
                        string prjName = string.Empty,
                               buildUnit = string.Empty, bidUnit = string.Empty,
                               bidMoney = string.Empty, code = string.Empty,
                               bidDate = string.Empty,
                               beginDate = string.Empty,
                               endDate = string.Empty, bidType = string.Empty,
                               specType = string.Empty, InfoUrl = string.Empty,
                               msgType = string.Empty, bidCtx = string.Empty,
                               prjAddress = string.Empty, remark = string.Empty,
                               prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;
                        prjName   = aTag.LinkText.GetReplace(" ");
                        beginDate = tr.Columns[2].ToPlainTextString().GetDateRegex();
                        // NOTE(review): the prefix literal is empty in this excerpt.
                        InfoUrl   = "" + aTag.Link;
                        string htmldtl = string.Empty;
                            htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
                        // A row whose detail page cannot be downloaded is skipped.
                        catch { continue; }
                        parser = new Parser(new Lexer(htmldtl));
                        NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "hei_text")));
                        if (dtlNode != null && dtlNode.Count > 0)
                            // NOTE(review): `as TableTag` is not null-checked before dtlTable.ToHtml() is called.
                            TableTag dtlTable = dtlNode[0] as TableTag;
                            HtmlTxt = dtlTable.ToHtml();
                            bidCtx  = "";
                            // Flatten the detail table into bidCtx cell by cell.
                            for (int r = 0; r < dtlTable.RowCount; r++)
                                for (int c = 0; c < dtlTable.Rows[r].ColumnCount; c++)
                                    // Prefer the <br>-aware text; fall back to the plain node text when it has no line break.
                                    string temp = dtlTable.Rows[r].Columns[c].ToHtml().GetReplace("<br>,<br/>", "\r\n").ToCtxString();
                                    if (!temp.Contains("\r\n"))
                                        temp = dtlTable.Rows[r].Columns[c].ToNodePlainString();
                                    // IsTable and GetTableBid are helpers defined outside this excerpt. Rows for which
                                    // IsTable is false are appended with ":" or a line break depending on column parity;
                                    // the `else` lines separating the three appends below are not visible here.
                                    if (!IsTable(dtlTable.Rows[r].ToHtml()))
                                        if ((c + 1) % 2 == 0)
                                            bidCtx += temp + "\r\n";
                                            bidCtx += temp.GetReplace(":,:") + ":";
                                        bidCtx += GetTableBid(dtlTable.Rows[r].ToHtml());
                            bidCtx = bidCtx.GetReplace(":\r\n", ":");
                            code   = bidCtx.GetCodeRegex();

                            buildUnit = bidCtx.GetBuildRegex();
                            if (string.IsNullOrEmpty(buildUnit))
                                buildUnit = bidCtx.GetRegex("建设单位名称");

                            bidUnit = bidCtx.GetBidRegex();
                            // NOTE(review): this label is spelled with "侯" while the other label lists in this file
                            // use "候"; verify which spelling the site actually uses.
                            if (string.IsNullOrEmpty(bidUnit))
                                bidUnit = bidCtx.GetRegex("中标侯选人");
                            bidMoney = bidCtx.GetMoneyRegex();
                            if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                                bidMoney = bidCtx.GetRegex("中标合同额").GetMoney();
                            prjMgr = bidCtx.GetMgrRegex();

                            msgType  = "北京市建设工程发包承包交易中心";
                            specType = "建设工程";
                            bidType  = "勘察设计";
                            // beginDate is passed in two argument positions; the local bidDate is never assigned
                            // in the visible code.
                            BidInfo info = ToolDb.GenBidInfo("北京市", "北京市区", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                            // Collect attachment links from the detail HTML.
                            parser = new Parser(new Lexer(HtmlTxt));
                            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                            if (aNode != null && aNode.Count > 0)
                                for (int k = 0; k < aNode.Count; k++)
                                    ATag a = aNode[k] as ATag;
                                    if (a.IsAtagAttach())
                                        string link = string.Empty;
                                        // Links containing "http" are used as they are; the second assignment is presumably
                                        // the `else` branch (keyword not visible) and its prefix literal is empty here.
                                        if (a.Link.ToLower().Contains("http"))
                                            link = a.Link;
                                            link = "" + a.Link;
                                        // What happens to `attach` afterwards is not visible in this excerpt.
                                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                            // NOTE(review): the excerpt ends here; neither an insertion of `info` into `list`
                            // nor the statement guarded by this limit check is visible.
                            if (!crawlAll && list.Count >= this.MaxCount)
Beispiel #3
        /// <summary>
        /// Crawls a paged table listing, downloads each row's detail page, extracts the bid fields
        /// (with a table-based fallback for the bidder and amount) and builds a <c>BidInfo</c>
        /// plus <c>BaseAttach</c> objects for attachment links.
        /// </summary>
        /// <param name="crawlAll">When false, the visible code compares <c>list.Count</c> with <c>MaxCount</c> after each record; the action taken is not visible in this excerpt.</param>
        /// <returns>No return statement is visible in this excerpt.</returns>
        // NOTE(review): this excerpt shows no braces and no `try` keywords (several `catch` clauses
        // have no visible `try`), and the method ends before any closing statement is visible.
        protected override IList ExecuteCrawl(bool crawlAll)
            IList  list            = new List <BidInfo>();
            int    pageInt         = 1;
            string html            = string.Empty;
            // viewState, eventValidation and cookiestr are not referenced in the visible part of this method.
            string viewState       = string.Empty;
            string eventValidation = string.Empty;
            string cookiestr       = string.Empty;

            // NOTE(review): unlike the later downloads, no `catch` is visible after this first request.
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
            Parser   parser   = new Parser(new Lexer(html));
            NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagebox")));

            if (pageNode != null && pageNode.Count > 0)
                    // Page count is the text between "共" and "页" in the pager block.
                    string temp = pageNode.AsString().GetRegexBegEnd("共", "页");
                    pageInt = int.Parse(temp);
                // Parse failure is swallowed: pageInt keeps its initial value of 1.
                catch { }
            for (int i = 1; i <= pageInt; i++)
                if (i > 1)
                        // NOTE(review): the URL prefix literal is empty in this excerpt; only the page
                        // index and the "&_=" suffix remain.
                        html = this.ToolWebSite.GetHtmlByUrl("" + i + "&_=", Encoding.Default);
                    // A page that fails to download is skipped.
                    catch { continue; }
                parser = new Parser(new Lexer(html));
                // The listing is taken from the FIRST table carrying width="100%".
                NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "100%")));
                if (listNode != null && listNode.Count > 0)
                    // NOTE(review): `as TableTag` is not null-checked before table.RowCount is read.
                    TableTag table = listNode[0] as TableTag;
                    // Iteration starts at row index 1 (row 0 is presumably a header - TODO confirm).
                    for (int j = 1; j < table.RowCount; j++)
                        string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty, code = string.Empty, bidDate = string.Empty, beginDate = string.Empty, endDate = string.Empty, bidType = string.Empty, specType = string.Empty, InfoUrl = string.Empty, msgType = string.Empty, bidCtx = string.Empty, prjAddress = string.Empty, remark = string.Empty, prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty, area = string.Empty;

                        TableRow tr   = table.Rows[j];
                        ATag     aTag = tr.GetATag();
                        // NOTE(review): the statement guarded by this null check is not visible in this excerpt
                        // (presumably `continue;`); aTag is dereferenced right below.
                        if (aTag == null)
                        prjName   = aTag.LinkText;
                        // Date comes from the first cell, requested in "yyyy/MM/dd" form.
                        beginDate = tr.Columns[0].ToNodePlainString().GetDateRegex("yyyy/MM/dd");
                        // NOTE(review): the prefix literal is empty in this excerpt.
                        InfoUrl   = "" + aTag.Link;
                        string htmldtl = string.Empty;
                            htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
                        // A row whose detail page cannot be downloaded is skipped.
                        catch { continue; }
                        parser = new Parser(new Lexer(htmldtl));
                        NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "Content")));
                        if (dtlNode != null && dtlNode.Count > 0)
                            HtmlTxt    = dtlNode.AsHtml();
                            bidCtx     = HtmlTxt.ToCtxString();
                            buildUnit  = bidCtx.GetBuildRegex();
                            prjAddress = bidCtx.GetAddressRegex();
                            code       = bidCtx.GetCodeRegex();
                            bidMoney   = bidCtx.GetMoneyRegex();
                            bidUnit    = bidCtx.GetBidRegex();
                            if (string.IsNullOrWhiteSpace(bidUnit))
                                bidUnit = bidCtx.GetRegex("第一候选供应商");
                            // A bidder value that still contains "第一" is discarded.
                            if (bidUnit.Contains("第一"))
                                bidUnit = string.Empty;

                            // Fallback: read the bidder and amount from the SECOND table of the detail HTML.
                            if (string.IsNullOrWhiteSpace(bidUnit))
                                parser = new Parser(new Lexer(HtmlTxt));
                                NodeList bidNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                                if (bidNode != null && bidNode.Count > 1)
                                    string   ctx = string.Empty;
                                    // NOTE(review): `as TableTag` is not null-checked before tag.RowCount is read.
                                    TableTag tag = bidNode[1] as TableTag;
                                    if (tag.RowCount > 1)
                                        // Pair each cell of row 0 with the cell of row 1 in the same column ("label:value" lines).
                                        // Per-column failures are swallowed by the `catch { }` below.
                                        // NOTE(review): GetReplace is called with an empty argument in this excerpt.
                                        for (int c = 0; c < tag.Rows[0].ColumnCount; c++)
                                                ctx += tag.Rows[0].Columns[c].ToNodePlainString().GetReplace("") + ":";
                                                ctx += tag.Rows[1].Columns[c].ToNodePlainString().GetReplace("") + "\r\n";
                                            catch { }
                                    bidUnit = ctx.GetBidRegex();
                                    if (string.IsNullOrWhiteSpace(bidUnit))
                                        bidUnit = ctx.GetRegex("投标单位,单位名称,投标人名称");
                                    if (bidMoney == "0" || string.IsNullOrWhiteSpace(bidMoney))
                                        bidMoney = ctx.GetMoneyRegex();
                                    if (bidMoney == "0" || string.IsNullOrWhiteSpace(bidMoney))
                                        bidMoney = ctx.GetMoneyString().GetMoney();

                            // Truncate the bidder name right after the first "公司", then repeat the "第一" filter.
                            if (bidUnit.Contains("公司"))
                                bidUnit = bidUnit.Remove(bidUnit.IndexOf("公司")) + "公司";
                            if (bidUnit.Contains("第一"))
                                bidUnit = string.Empty;
                            bidType  = prjName.GetInviteBidType();
                            specType = "建设工程";
                            msgType  = "云南省发展和改革委员会";
                            // beginDate is passed in two argument positions; prjAddress is extracted above but is
                            // not among the arguments. area, bidDate, endDate and prjMgr keep their empty defaults
                            // in the visible code.
                            BidInfo info = ToolDb.GenBidInfo("云南省", "云南省及地市", area, string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                            // Collect attachment links from the detail HTML.
                            parser = new Parser(new Lexer(HtmlTxt));
                            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                            if (aNode != null && aNode.Count > 0)
                                for (int k = 0; k < aNode.Count; k++)
                                    ATag a = aNode[k] as ATag;
                                    if (a.IsAtagAttach())
                                        string link = string.Empty;
                                        // Links containing "http" are used as they are; the second assignment is presumably
                                        // the `else` branch (keyword not visible) and its prefix literal is empty here.
                                        if (a.Link.ToLower().Contains("http"))
                                            link = a.Link;
                                            link = "" + a.Link;
                                        // What happens to `attach` afterwards is not visible in this excerpt.
                                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                            // NOTE(review): the excerpt ends here; neither an insertion of `info` into `list`
                            // nor the statement guarded by this limit check is visible.
                            if (!crawlAll && list.Count >= this.MaxCount)
Beispiel #4
        /// <summary>
        /// For every area key in <c>DicSiteUrl</c>, crawls that area's paged listing, downloads each
        /// entry's detail page, extracts address and tendering unit with regular expressions and
        /// builds an <c>InviteInfo</c> plus <c>BaseAttach</c> objects for attachment links.
        /// </summary>
        /// <param name="crawlAll">When false, the visible code jumps to the <c>Funcs</c> label once <c>count</c> reaches <c>MaxCount</c>.</param>
        /// <returns>No return statement is visible in this excerpt.</returns>
        // NOTE(review): this excerpt shows no braces and no `try` keywords (several `catch` clauses
        // have no visible `try`), and no closing statement of the method is visible.
        protected override IList ExecuteCrawl(bool crawlAll)
            IList list = new List <InviteInfo>();

            foreach (string area in this.DicSiteUrl.Keys)
                // NOTE(review): `count` is compared with MaxCount further down, but no increment is visible in this excerpt.
                int    pageInt = 1, count = 0;
                string html            = string.Empty;
                // viewState and eventValidation are not referenced in the visible part of this method.
                string viewState       = string.Empty;
                string eventValidation = string.Empty;
                    html = this.ToolWebSite.GetHtmlByUrl(this.DicSiteUrl[area], Encoding.UTF8);
                // NOTE(review): the body of this catch is not visible in this excerpt.
                catch (Exception ex)

                Parser   parser = new Parser(new Lexer(html));
                NodeList sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "page")));
                if (sNode != null && sNode.Count > 0)
                        // Page count = the text before the first "," once "createPageHTML(" has been removed
                        // from the pager block text.
                        string page = sNode.AsString().ToNodeString().Replace("createPageHTML(", "");
                        string temp = page.Remove(page.IndexOf(","));
                        pageInt = Convert.ToInt32(temp);
                    // Parse failure is swallowed: pageInt keeps its initial value of 1.
                    catch (Exception) { }

                for (int i = 1; i <= pageInt; i++)
                    if (i > 1)
                            // Page i lives at "index_<i-1>.html" under the area URL. The trailing .ToString()
                            // binds only to the ".html" literal and has no effect.
                            html = this.ToolWebSite.GetHtmlByUrl(this.DicSiteUrl[area] + "index_" + (i - 1) + ".html".ToString(), Encoding.UTF8);
                        // A page that fails to download is skipped.
                        catch { continue; }
                    parser = new Parser(new Lexer(html));
                    // List entries: <li> filtered by parent <ul>, itself filtered by parent <div class="secondrightlistbox">.
                    sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "secondrightlistbox"))), new TagNameFilter("ul"))), new TagNameFilter("li")));
                    if (sNode != null && sNode.Count > 0)
                        for (int t = 0; t < sNode.Count; t++)
                            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

                            beginDate = sNode[t].ToNodePlainString().GetDateRegex();
                            prjName   = sNode[t].GetATagValue("title");

                            // Detail URL = area URL + the entry's href with "./" removed.
                            InfoUrl = this.DicSiteUrl[area] + sNode[t].GetATagHref().Replace("./", "");

                            string htmldetail = string.Empty;
                                // The detail page is downloaded twice: the first copy yields HtmlTxt (raw HTML of
                                // div.contentrightlistbox2), the second has its <br> variants turned into line breaks
                                // and is parsed again below for the plain-text context.
                                htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).Replace("&nbsp;", "").Trim();
                                Parser   dtlparserHTML = new Parser(new Lexer(htmldetail));
                                NodeList dtnodeHTML    = dtlparserHTML.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "contentrightlistbox2")));
                                HtmlTxt    = dtnodeHTML.AsHtml();
                                htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).Replace("&nbsp;", "").Replace("</br>", "\r\n").Replace("<br>", "\r\n").Replace("<br/>", "\r\n");
                            // An entry whose detail page cannot be processed is skipped.
                            catch (Exception ex) { continue; }
                            Parser   dtlparser = new Parser(new Lexer(htmldetail));
                            NodeList dtnode    = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "contentrightlistbox2")));

                            // Remove <script> blocks and <?xml .../> declarations, then all spaces, from the detail text.
                            Regex regexCtx = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
                            inviteCtx = regexCtx.Replace(dtnode.AsString(), "").Replace(" ", "");
                            // Address: one of three labels, a separator and the rest of the line.
                            // NOTE(review): inside the character class `[:|:]` the `|` is a literal pipe and ":" appears
                            // twice; one of the two was presumably a different colon character originally - TODO confirm.
                            Regex regPrjAdd = new Regex(@"(工程地点|工程地址|项目地址)[:|:][^\r\n]+[\r\n]{1}");
                            prjAddress = regPrjAdd.Match(inviteCtx).Value.Replace("工程地点", "").Replace("工程地址", "").Replace("项目地址", "").Replace(":", "").Replace(":", "").Replace(")", "").Trim();

                            // Tendering unit: one of three labels followed by ":" and the rest of the line.
                            Regex regbuildUnit = new Regex(@"(招标单位|招标人|采购人):[^\r\n]+[\r\n]{1}");
                            buildUnit = regbuildUnit.Match(inviteCtx).Value.Replace("招标单位:", "").Replace("招标人:", "").Replace("采购人:", "").Trim();
                            // Cut the unit name at the first occurrence of each marker below.
                            if (buildUnit.Contains("招标代理机构"))
                                buildUnit = buildUnit.Remove(buildUnit.IndexOf("招标代理机构"));
                            if (buildUnit.Contains("联系人"))
                                buildUnit = buildUnit.Remove(buildUnit.IndexOf("联系人"));
                            if (buildUnit.Contains(";"))
                                buildUnit = buildUnit.Remove(buildUnit.IndexOf(";"));
                            if (buildUnit.Contains("地址"))
                                buildUnit = buildUnit.Remove(buildUnit.IndexOf("地址"));
                            msgType    = "佛山市建设工程交易中心";
                            specType   = "政府采购";
                            inviteType = ToolHtml.GetInviteTypes(prjName);
                            // The area key "市直" is stored as an empty area; every other key is passed through.
                            string     are  = area != "市直" ? area : "";
                            InviteInfo info = ToolDb.GenInviteInfo("广东省", "佛山市区", are, string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                            // Collect attachment links from the detail HTML.
                            parser = new Parser(new Lexer(HtmlTxt));
                            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                            if (aNode != null && aNode.Count > 0)
                                for (int k = 0; k < aNode.Count; k++)
                                    ATag a = aNode[k] as ATag;
                                    if (a.IsAtagAttach())
                                        string link = string.Empty;
                                        // Links containing "http" are used as they are; the second assignment is presumably
                                        // the `else` branch (keyword not visible) and its prefix literal is empty here.
                                        if (a.Link.ToLower().Contains("http"))
                                            link = a.Link;
                                            link = "" + a.Link.GetReplace("../,./");
                                        // NOTE(review): the statement guarded by this 500-byte length check is not visible
                                        // in this excerpt.
                                        if (Encoding.Default.GetByteCount(link) > 500)
                                        // What happens to `attach` afterwards is not visible in this excerpt.
                                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                            // Leaves the page and entry loops through the label below once the limit is reached;
                            // no insertion of `info` into `list` is visible in this excerpt.
                            if (!crawlAll && count >= this.MaxCount)
                                goto Funcs;
                // Jump target of the limit check above; its indentation suggests it sits at the end of the
                // per-area loop body - TODO confirm, braces are not visible.
                Funcs :;
Beispiel #5
        protected override IList ExecuteCrawl(bool crawlAll)
            IList  list            = new ArrayList();
            int    pageInt         = 1;
            string html            = string.Empty;
            string viewState       = string.Empty;
            string eventValidation = string.Empty;

                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
            catch (Exception ex)
            Parser   parser = new Parser(new Lexer(html));
            NodeList sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagination")));

            if (sNode != null && sNode.Count > 0)
                    string temp = sNode.AsString().Replace(" ", "");
                    Regex  reg  = new Regex(@"/[^页]+页");
                    pageInt = Convert.ToInt32(reg.Match(temp).Value.Replace("/", "").Replace("页", ""));
                catch { pageInt = 1; }
            for (int i = 1; i <= pageInt; i++)
                if (i > 1)
                        html = this.ToolWebSite.GetHtmlByUrl("" + i.ToString(), Encoding.UTF8);
                    catch { continue; }
                parser = new Parser(new Lexer(html));
                NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "div_list"))), new TagNameFilter("ul")));
                if (viewList != null && viewList.Count > 0)
                    for (int j = 0; j < viewList.Count; j++)
                        string prjName = string.Empty,
                               buildUnit = string.Empty, bidUnit = string.Empty,
                               bidMoney = string.Empty, code = string.Empty,
                               bidDate = string.Empty,
                               beginDate = string.Empty,
                               endDate = string.Empty, bidType = string.Empty,
                               specType = string.Empty, InfoUrl = string.Empty,
                               msgType = string.Empty, bidCtx = string.Empty,
                               prjAddress = string.Empty, remark = string.Empty,
                               prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;
                        Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
                        beginDate = regDate.Match(viewList[j].ToPlainTextString()).Value;
                        //prjName = viewList[j].ToPlainTextString().Replace("\r", "").Replace("\n", "").Replace(beginDate, "");
                        ATag aTag = viewList.SearchFor(typeof(ATag), true)[j] as ATag;
                        prjName = aTag.GetAttribute("title");
                        InfoUrl = "" + aTag.Link;
                        string htmDtl = string.Empty;
                            htmDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                            Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
                            htmDtl = regexHtml.Replace(htmDtl, "");
                        catch { continue; }
                        parser = new Parser(new Lexer(htmDtl));
                        NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "divZoom")));
                        if (dtl != null && dtl.Count > 0)
                            HtmlTxt = dtl.AsHtml();
                            bidCtx  = HtmlTxt.ToCtxString();
                            NodeList ifrm = new Parser(new Lexer(htmDtl)).ExtractAllNodesThatMatch(new TagNameFilter("iframe"));
                            if (ifrm != null && ifrm.Count > 0)
                                IFrameTag frame = ifrm[0] as IFrameTag;
                                string    url   = frame.GetAttribute("src");
                                    string   htm     = this.ToolWebSite.GetHtmlByUrl(url, Encoding.Default);
                                    NodeList tabNode = new Parser(new Lexer(htm)).ExtractAllNodesThatMatch(new TagNameFilter("table"));
                                    string   ctx     = tabNode.AsHtml().ToCtxString().Replace("\r\n\t\r\n\t", "\r\n\t").Replace("\r\n\t\r\n\t", "\r\n\t").Replace("\r\n\t\r\n\t", "\r\n\t");
                                    bidCtx = ctx + bidCtx;
                                catch { }
                            //bidCtx = System.Text.RegularExpressions.Regex.Replace(HtmlTxt, "(<script)[\\s\\S]*?(</script>)", "");
                            //bidCtx = System.Text.RegularExpressions.Regex.Replace(bidCtx, "<[^>]*>", "").Replace("&nbsp;", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\r\n", "\r\n").Replace("\r\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\t", "").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");

                            Regex regPrjCode = new Regex(@"(工程编号|项目编号|招标编号|中标编号|编号)(:|:)[^\r\n]+\r\n");
                            code = regPrjCode.Match(bidCtx.Replace(" ", "")).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("招标编号", "").Replace("中标编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Trim();

                            Regex regBuidUnit = new Regex(@"(建设单位|招标人|承包人|招标单位|招标方|招标代理机构)(:|:)[^\r\n]+\r\n");
                            buildUnit = regBuidUnit.Match(bidCtx.Replace(" ", "")).Value.Replace("招标代理机构", "").Replace("建设单位", "").Replace("招标人", "").Replace("承包人", "").Replace("招标单位", "").Replace("招标方", "").Replace(":", "").Replace(":", "").Trim();

                            Regex regMoney = new Regex(@"(中标价|投标价|总投资|发包价|投标报价|价格|金额|总价)(:|:|)[^\r\n]+\r\n");
                            bidMoney = regMoney.Match(bidCtx.Replace(" ", "")).Value.Replace("中标价", "").Replace("总投资", "").Replace("发包价", "").Replace("总价", "").Replace("投标报价", "").Replace("投标价", "").Replace("价格", "").Replace("金额", "").Replace(":", "").Replace(":", "").Replace(",", "").Replace(",", "").Trim();

                            Regex regBidUnit = new Regex(@"(成交供应商|中标供应商|第一候选人|中标候选人|中标单位|中标人|中标方)(:|:)[^\r\n]+\r\n");
                            bidUnit = regBidUnit.Match(bidCtx.Replace(" ", "")).Value.Replace("成交供应商", "").Replace("中标供应商", "").Replace("中标候选人", "").Replace("第一候选人", "").Replace("中标单位", "").Replace("中标人", "").Replace("中标方", "").Replace(":", "").Replace(":", "").Trim();

                            Regex regprjMgr = new Regex(@"(项目经理姓名|项目经理(或建造师)|项目经理|项目负责人|项目总监|建造师|总工程师|监理师)(:|:)[^\r\n]+\r\n");
                            prjMgr = regprjMgr.Match(bidCtx.Replace(" ", "")).Value.Replace("项目经理(或建造师)", "").Replace("项目经理姓名", "").Replace("总工程师", "").Replace("项目经理", "").Replace("项目总监", "").Replace("建造师", "").Replace("监理师", "").Replace("项目负责人", "").Replace(":", "").Replace(":", "").Trim();

                            Regex regBidMoney = new Regex(@"[0-9]+[.]{0,1}[0-9]+");
                            if (bidMoney.Contains("万"))
                                bidMoney = bidMoney.Remove(bidMoney.IndexOf("万")).Trim();
                                bidMoney = regBidMoney.Match(bidMoney).Value;
                                    bidMoney = (decimal.Parse(regBidMoney.Match(bidMoney).Value) / 10000).ToString();
                                    if (decimal.Parse(bidMoney) < decimal.Parse("0.1"))
                                        bidMoney = "0";
                                catch (Exception)
                                    bidMoney = "0";
                            if (prjMgr.Contains("资格"))
                                prjMgr = prjMgr.Remove(prjMgr.IndexOf("资格"));
                            bidUnit   = ToolHtml.GetStringTemp(bidUnit).Replace(";", "");
                            buildUnit = ToolHtml.GetSubString(buildUnit, 150);
                            bidUnit   = ToolHtml.GetSubString(bidUnit, 150);
                            code      = ToolHtml.GetSubString(code, 50);
                            prjMgr    = ToolHtml.GetSubString(prjMgr, 50);
                            msgType   = "惠州市公共资源交易中心";
                            specType  = "建设工程";
                            if (string.IsNullOrEmpty(buildUnit))
                                buildUnit = "惠州市公共资源交易中心";
                            bidType = ToolHtml.GetInviteTypes(prjName);
                            BidInfo info = ToolDb.GenBidInfo("广东省", "惠州市区", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType,
                                                             bidMoney, InfoUrl, prjMgr, HtmlTxt);
                            if (!crawlAll && list.Count >= this.MaxCount)
Beispiel #6
        /// <summary>
        /// Crawls the listing page at <c>SiteUrl</c>, follows each list entry whose title contains
        /// "招标公告" or "补充公告", and builds an <c>InviteInfo</c> from the detail page.
        /// </summary>
        /// <param name="crawlAll">When false, <c>list.Count</c> is compared against <c>this.MaxCount</c>; the action taken on that condition is not visible in this excerpt.</param>
        /// <returns>Presumably <c>list</c>; no return statement is visible in this excerpt.</returns>
        /// <remarks>
        /// NOTE(review): this excerpt has lost its braces, its <c>try</c> keywords and the method tail,
        /// so the comments below describe only what the visible lines establish.
        /// </remarks>
        protected override IList ExecuteCrawl(bool crawlAll)
            IList  list            = new ArrayList();
            string htl             = string.Empty;
            string cookiestr       = string.Empty;
            string viewState       = string.Empty;
            int    page            = 1;
            string eventValidation = string.Empty;

                // Fetch the first listing page; cookiestr is passed by ref to the downloader.
                htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default, ref cookiestr);
            catch (Exception ex)
            Parser   parser        = new Parser(new Lexer(htl));
            // The pager is looked up as <div class="fanyie">.
            NodeList tableNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "fanyie")));

            if (tableNodeList != null && tableNodeList.Count > 0)
                // The page count is read from pager text matching "共<digits>页".
                // NOTE(review): if the pattern does not match, Match(...).Value is empty and int.Parse throws;
                // no guard is visible around this call.
                Regex regexPage = new Regex(@"共\d+页");
                page = int.Parse(regexPage.Match(tableNodeList.AsString()).Value.Trim(new char[] { '共', '页' }));
            // NOTE(review): the loop starts at 1 and uses "j < page", so with page == 1 the body never runs.
            // The per-page fetch below is commented out, so every iteration re-parses the same htl - confirm intent.
            for (int j = 1; j < page; j++)
                //if (j > 1)
                //    try
                //    {
                //        htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(GetStartUrl() + "&ipage=" + j.ToString()), Encoding.Default);
                //    }
                //    catch (Exception ex) { continue; }
                parser = new Parser(new Lexer(htl));
                // List entries are the <li> nodes under <ul class="list_headnews">.
                NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "list_headnews")), true), new TagNameFilter("li")));
                for (int i = 0; i < nodeList.Count; i++)
                    // The i-th anchor found anywhere in the list is paired with the i-th <li>.
                    // NOTE(review): this assumes exactly one <a> per <li> - verify against the page markup.
                    ATag   aTag = nodeList.SearchFor(typeof(ATag), true)[i] as ATag;
                    string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty,
                           prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty,
                           specType = string.Empty, beginDate = string.Empty, endDate = string.Empty,
                           remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty,
                           CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;
                    prjName = nodeList[i].ToPlainTextString().Replace(" ", "");
                    // NOTE(review): the link is prefixed with an empty string; a base URL may have been removed here.
                    InfoUrl = "" + aTag.Link;
                    // The first yyyy-M-d date in the entry text becomes beginDate and is removed from the title.
                    Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
                    beginDate = regDate.Match(prjName).Value.Trim();
                    prjName   = prjName.Replace(beginDate, "").Trim();
                    // Only entries whose title contains one of these two phrases are processed.
                    if (prjName.Contains("招标公告") || prjName.Contains("补充公告"))
                        inviteType = ToolHtml.GetInviteTypes(prjName);
                        string htmldetail = string.Empty;
                            htmldetail = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.Default).Replace("&nbsp;", "");
                        catch (Exception)
                        Parser   parserdetail = new Parser(new Lexer(htmldetail));
                        // Content is taken from <span class="STYLE10">, falling back to the table with id "Table1".
                        NodeList dtnode       = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "STYLE10")));
                        if (dtnode.Count <= 0)
                            parserdetail = new Parser(new Lexer(htmldetail));
                            dtnode       = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("TABLE"), new HasAttributeFilter("id", "Table1")));
                        if (dtnode.Count > 0)
                            inviteCtx = dtnode.AsString().Replace("\n", "\r\n");
                            HtmlTxt   = dtnode.AsHtml();
                            // Strip <script> blocks and <?xml ... /> fragments from the text content.
                            Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
                            inviteCtx = regexHtml.Replace(inviteCtx, "");
                            // The tendering party is taken from the first line labelled with one of the listed spellings.
                            Regex regBuidUnit = new Regex(@"(招 标 人|招标人|招 标人)(:|:)[^\r\n]+\r\n");
                            buildUnit = regBuidUnit.Match(inviteCtx).Value.Replace("招 标 人:", "").Replace("招 标人:", "").Replace("招标人:", "").Trim();
                            // Drop anything from "招标代理" onward when it shares the matched line.
                            if (buildUnit.Contains("招标代理"))
                                buildUnit = buildUnit.Remove(buildUnit.IndexOf("招标代理")).ToString().Trim();
                            msgType  = "云浮市工程建设交易中心";
                            specType = "建设工程";
                            // NOTE(review): this assigns an empty string to an already-empty string (no-op);
                            // a default value may have been intended - confirm.
                            if (buildUnit == "")
                                buildUnit = "";
                            prjAddress = "见招标信息";
                            InviteInfo info = ToolDb.GenInviteInfo("广东省", "云浮市区", "",
                                                                   string.Empty, code, prjName, prjAddress, buildUnit,
                                                                   beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                            // NOTE(review): the statement guarded by this limit check is not visible in this excerpt.
                            if (!crawlAll && list.Count >= this.MaxCount)

Beispiel #7
        /// <summary>
        /// For each announcement category in <c>ggType</c>, requests a JSON listing, walks its pages,
        /// fetches every item's detail page and builds a <c>BidInfo</c> plus <c>BaseAttach</c> records
        /// for attachment links found in the detail HTML.
        /// </summary>
        /// <param name="crawlAll">When false, the local <c>count</c> is compared against <c>this.MaxCount</c> and control jumps to the <c>Found</c> label.</param>
        /// <returns><c>list</c> on the early-exit path visible below (<c>catch { return(list); }</c>); the final return is not visible in this excerpt.</returns>
        /// <remarks>
        /// NOTE(review): this excerpt has lost its braces, its <c>try</c> keywords, the arguments of
        /// <c>GetNameValueCollection</c> and the method tail; comments describe only what is visible.
        /// </remarks>
        protected override IList ExecuteCrawl(bool crawlAll)
            // NOTE(review): url is initialised to "" and never reassigned in the visible code;
            // the endpoint appears to have been removed from this excerpt.
            string url = "";

            // Category display name -> category code.
            Dictionary <string, string> ggType = new Dictionary <string, string>();

            ggType.Add("勘察设计", "18101");
            ggType.Add("施工", "18102");
            ggType.Add("监理", "18103");
            ggType.Add("设备", "18104");

            IList list = new List <BidInfo>();

            foreach (string key in ggType.Keys)
                int    pageInt          = 1;
                // NOTE(review): count is never incremented in the visible code, yet it drives the MaxCount check below.
                int    count            = 0;
                string html             = string.Empty;
                string viewState        = string.Empty;
                string eventValidation  = string.Empty;
                string cookiestr        = string.Empty;
                // Form fields for the first-page request (the key/value arrays are truncated in this excerpt).
                NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] {
                                                                                  new string[] {
                    html = this.ToolWebSite.GetHtmlByUrl(url, nvc);
                catch { return(list); }
                // The response is deserialized as JSON; its "listpage" object carries "pagecount" and "listdata".
                JavaScriptSerializer        serializer  = new JavaScriptSerializer();
                Dictionary <string, object> smsTypeJson = (Dictionary <string, object>)(((Dictionary <string, object>)serializer.DeserializeObject(html))["listpage"]);

                pageInt = Convert.ToInt32(smsTypeJson["pagecount"]);

                for (int i = 1; i <= pageInt; i++)
                    // Page 1 reuses the response fetched above; later pages are requested again.
                    if (i > 1)
                        nvc = this.ToolWebSite.GetNameValueCollection(new string[] {
                                                                      new string[] {
                            html = this.ToolWebSite.GetHtmlByUrl(url, nvc);
                        catch { continue; }
                        serializer  = new JavaScriptSerializer();
                        smsTypeJson = (Dictionary <string, object>)(((Dictionary <string, object>)serializer.DeserializeObject(html))["listpage"]);

                    object[] listDatas = (object[])smsTypeJson["listdata"];

                    foreach (object obj in listDatas)
                        Dictionary <string, object> dic = (Dictionary <string, object>)obj;

                        string prjName = string.Empty,
                               buildUnit = string.Empty, bidUnit = string.Empty,
                               bidMoney = string.Empty, code = string.Empty,
                               bidDate = string.Empty,
                               beginDate = string.Empty,
                               endDate = string.Empty, bidType = string.Empty,
                               specType = string.Empty, InfoUrl = string.Empty,
                               msgType = string.Empty, bidCtx = string.Empty,
                               prjAddress = string.Empty, remark = string.Empty,
                               prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty, area = string.Empty;

                        // The item name carries the area between "【" and "】"; that bracketed part is removed to get the project name.
                        string tempName = Convert.ToString(dic["Name"]);
                        area      = tempName.GetRegexBegEnd("【", "】");
                        prjName   = tempName.GetReplace("【" + area + "】");
                        beginDate = Convert.ToString(dic["FTime"]);
                        // NOTE(review): the URL is prefixed with an empty string; a base URL may have been removed here.
                        InfoUrl   = "" + dic["SUrl"];
                        string htmldtl = string.Empty;
                            htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
                        catch { continue; }
                        Parser   parser  = new Parser(new Lexer(htmldtl));
                        // The announcement body is the <span id="lblFZBContent"> element.
                        NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "lblFZBContent")));
                        if (dtlNode != null && dtlNode.Count > 0)
                            HtmlTxt = dtlNode.AsHtml();
                            bidCtx  = HtmlTxt.GetReplace("<br/>,<br>,</p>", "\r\n").ToCtxString();
                            //TableTag table = dtlNode[0] as TableTag;
                            //for (int r = 0; r < table.RowCount; r++)
                            //    for (int c = 0; c < table.Rows[r].ColumnCount; c++)
                            //    {
                            //        string temp = table.Rows[r].Columns[c].ToNodePlainString();
                            //        if ((c + 1) % 2 == 0)
                            //            bidCtx += temp.GetReplace(":,:") + "\r\n";
                            //        else
                            //            bidCtx += temp.GetReplace(":,:") + ":";
                            //    }
                            // Field extraction from the text content, each with a keyword-based fallback when the first attempt is empty.
                            prjAddress = bidCtx.GetAddressRegex();
                            buildUnit  = bidCtx.GetBuildRegex();
                            bidUnit    = bidCtx.GetBidRegex();
                            if (string.IsNullOrEmpty(bidUnit))
                                bidUnit = bidCtx.GetRegex("第一名,预中标单位");
                            bidMoney = bidCtx.GetMoneyRegex();
                            if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                                bidMoney = bidCtx.GetRegex("中标造价,造价,预 中 标 价,预中标价格").GetMoney();
                            prjMgr = bidCtx.GetMgrRegex();
                            code   = bidCtx.GetCodeRegex().GetCodeDel();
                            // Trim buildUnit: keep text up to and including the first "公司", and cut at "联系" / "地址".
                            if (buildUnit.Contains("公司"))
                                buildUnit = buildUnit.Remove(buildUnit.IndexOf("公司")) + "公司";
                            if (buildUnit.Contains("联系"))
                                buildUnit = buildUnit.Remove(buildUnit.IndexOf("联系"));
                            if (buildUnit.Contains("地址"))
                                buildUnit = buildUnit.Remove(buildUnit.IndexOf("地址"));
                            if (string.IsNullOrEmpty(code))
                                code = bidCtx.GetRegex("编码");
                            // Trim bidUnit so that it ends at the first organisation suffix found.
                            if (bidUnit.Contains("公司"))
                                bidUnit = bidUnit.Remove(bidUnit.IndexOf("公司")) + "公司";
                            if (bidUnit.Contains("研究院"))
                                bidUnit = bidUnit.Remove(bidUnit.IndexOf("研究院")) + "研究院";
                            if (bidUnit.Contains("研究所"))
                                bidUnit = bidUnit.Remove(bidUnit.IndexOf("研究所")) + "研究所";
                            // The category display name is used as the bid type.
                            bidType  = key;
                            specType = "建设工程";
                            msgType  = "黑龙江住房和城乡建设厅";
                            BidInfo info = ToolDb.GenBidInfo("黑龙江省", "黑龙江省及地市", area, string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                            // Scan the detail HTML for anchors that IsAtagAttach() accepts and record them as attachments.
                            parser = new Parser(new Lexer(HtmlTxt));
                            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                            if (aNode != null && aNode.Count > 0)
                                for (int k = 0; k < aNode.Count; k++)
                                    ATag a = aNode[k] as ATag;
                                    if (a.IsAtagAttach())
                                        string link = string.Empty;
                                        // Links containing "http" are used as-is; others have "../" and "./" segments removed
                                        // (the else keyword and any base URL are not visible in this excerpt).
                                        if (a.Link.ToLower().Contains("http"))
                                            link = a.Link;
                                            link = "" + a.Link.GetReplace("../,./");
                                        // NOTE(review): the statement guarded by this 500-byte length check is not visible in this excerpt.
                                        if (Encoding.Default.GetByteCount(link) > 500)
                                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                            // Leave the page and item loops for this category once the limit is reached.
                            if (!crawlAll && count >= this.MaxCount)
                                goto Found;
                Found :;
Beispiel #8
        /// <summary>
        /// Reads the XML island embedded in the page at <c>SiteUrl</c>, fetches each referenced detail page,
        /// and builds a <c>BidInfo</c> when the title contains "中标" or an <c>InviteInfo</c> in the second
        /// branch, together with <c>BaseAttach</c> records for attachment links.
        /// </summary>
        /// <param name="crawlAll">When false, <c>list.Count</c> is compared against <c>this.MaxCount</c>; the action taken on that condition is not visible in this excerpt.</param>
        /// <returns><c>list</c> on the early-exit path visible below (<c>catch { return(list); }</c>); the final return is not visible in this excerpt.</returns>
        /// <remarks>
        /// NOTE(review): this excerpt has lost its braces, its <c>try</c>/<c>else</c> keywords and the method tail;
        /// comments describe only what the visible lines establish.
        /// </remarks>
        protected override IList ExecuteCrawl(bool crawlAll)
            IList  list            = new ArrayList();
            string html            = string.Empty;
            string viewState       = string.Empty;
            string eventValidation = string.Empty;
            string cookiestr       = string.Empty;

                // The same fetch appears twice in a row; presumably a retry inside a nested try/catch
                // whose structure is not visible here - confirm against the full file.
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8);
                    html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8);
                catch { return(list); }
            // Cut out the text from the last "<xml" up to (not including) the last "</xml>".
            // NOTE(review): LastIndexOf returns -1 when the marker is missing, and Substring would then throw;
            // no guard is visible here.
            int      startIndex = html.LastIndexOf("<xml");
            int      endIndex   = html.LastIndexOf("</xml>");
            // Lower-case the island and rename its tags to HTML ones so the HTML parser can query them.
            // "infourl" is replaced before "info" because "info" is a prefix of "infourl".
            string   xmlstr     = html.Substring(startIndex, endIndex - startIndex).ToLower().GetReplace("infourl", "span").GetReplace("info", "div").GetReplace("publishedtime", "p");
            Parser   parser     = new Parser(new Lexer(xmlstr));
            NodeList pageNode   = parser.ExtractAllNodesThatMatch(new TagNameFilter("div"));

            if (pageNode != null && pageNode.Count > 0)
                for (int i = 0; i < pageNode.Count; i++)
                    // Re-parse the single entry to read its date (<p>) and its detail URL (<span>).
                    // NOTE(review): the same Parser instance is queried twice, and [0] is read without a Count check.
                    parser = new Parser(new Lexer(pageNode[i].ToHtml()));
                    NodeList dateNode  = parser.ExtractAllNodesThatMatch(new TagNameFilter("p"));
                    string   beginDate = dateNode[0].ToPlainTextString().GetDateRegex();
                    NodeList urlNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("span"));
                    // NOTE(review): the URL is prefixed with an empty string; a base URL may have been removed here.
                    string   InfoUrl = "" + urlNode[0].ToPlainTextString();
                    string   htmldtl = string.Empty;
                        // Duplicate fetch, as above (presumably a retry).
                        htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
                            htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
                        catch { continue; }
                    parser = new Parser(new Lexer(htmldtl));
                    // The title comes from <div id="zoomtitl">, the body from the <td width="778"> cell.
                    NodeList titleNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "zoomtitl")));
                    string   prjName   = string.Empty;
                    if (titleNode != null && titleNode.Count > 0)
                        prjName = titleNode[0].ToNodePlainString().GetReplace(" ");
                    NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("width", "778")));
                    if (dtlNode != null && dtlNode.Count > 0)
                        // Branch 1: titles containing "中标" are stored as bid-result records.
                        if (prjName.Contains("中标"))
                            string buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty, code = string.Empty, bidDate = string.Empty, endDate = string.Empty, bidType = string.Empty, specType = string.Empty, msgType = string.Empty, bidCtx = string.Empty, prjAddress = string.Empty, remark = string.Empty, prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

                            HtmlTxt = dtlNode[0].ToHtml();
                            bidCtx  = HtmlTxt.ToLower().GetReplace("</p>,</br>,<br>", "\r\n").ToCtxString();

                            buildUnit = bidCtx.GetBuildRegex();
                            code      = bidCtx.GetCodeRegex().GetCodeDel();
                            bidUnit   = bidCtx.GetBidRegex();

                            // Prefer the amount labelled "中标值"; fall back to the generic money extraction.
                            bidMoney = bidCtx.GetRegex("中标值").GetMoney();
                            if (string.IsNullOrEmpty(bidMoney) || bidMoney == "0")
                                bidMoney = bidCtx.GetMoneyRegex();
                                // Amounts below 1 are reset to "0"; parse failures are swallowed by the catch below.
                                if (decimal.Parse(bidMoney) < 1)
                                    bidMoney = "0";
                            catch { }
                            prjMgr = bidCtx.GetMgrRegex();

                            specType = "政府采购";
                            bidType  = prjName.GetInviteBidType();
                            msgType  = "东莞市中堂镇政府";
                            BidInfo info = ToolDb.GenBidInfo("广东省", "东莞市区", "中堂镇", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                            // Record anchors accepted by IsAtagAttach() as attachments of this record.
                            parser = new Parser(new Lexer(HtmlTxt));
                            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                            if (aNode != null && aNode.Count > 0)
                                for (int k = 0; k < aNode.Count; k++)
                                    ATag a = aNode[k].GetATag();
                                    if (a.IsAtagAttach())
                                        string link = string.Empty;
                                        if (a.Link.ToLower().Contains("http"))
                                            link = a.Link;
                                            link = "" + a.Link;
                                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                            // NOTE(review): the statement guarded by this limit check is not visible in this excerpt.
                            if (!crawlAll && list.Count >= this.MaxCount)
                            // Branch 2 (presumably the else of the "中标" test; the keyword is not visible):
                            // the entry is stored as a tender-invitation record.
                            string code = string.Empty, buildUnit = string.Empty,
                                   prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty,
                                   specType = string.Empty, endDate = string.Empty,
                                   remark = string.Empty, inviteCon = string.Empty,
                                   CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty, area = string.Empty;

                            HtmlTxt   = dtlNode[0].ToHtml();
                            inviteCtx = HtmlTxt.ToLower().GetReplace("</p>,</br>,<br>", "\r\n").ToCtxString();

                            buildUnit  = inviteCtx.GetBuildRegex();
                            prjAddress = inviteCtx.GetAddressRegex();
                            code       = inviteCtx.GetCodeRegex().GetCodeDel();
                            // Cut buildUnit at "招标代理" / "地址" when those follow on the same extracted text.
                            if (buildUnit.Contains("招标代理"))
                                buildUnit = buildUnit.Remove(buildUnit.IndexOf("招标代理"));
                            if (buildUnit.Contains("地址"))
                                buildUnit = buildUnit.Remove(buildUnit.IndexOf("地址"));

                            specType   = "政府采购";
                            inviteType = prjName.GetInviteBidType();
                            msgType    = "东莞市中堂镇政府";

                            InviteInfo info = ToolDb.GenInviteInfo("广东省", "东莞市区", "中堂镇", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                            // Same attachment scan as in the bid-result branch.
                            parser = new Parser(new Lexer(HtmlTxt));
                            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                            if (aNode != null && aNode.Count > 0)
                                for (int k = 0; k < aNode.Count; k++)
                                    ATag a = aNode[k].GetATag();
                                    if (a.IsAtagAttach())
                                        string link = string.Empty;
                                        if (a.Link.ToLower().Contains("http"))
                                            link = a.Link;
                                            link = "" + a.Link;
                                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                            // NOTE(review): the statement guarded by this limit check is not visible in this excerpt.
                            if (!crawlAll && list.Count >= this.MaxCount)

Beispiel #9
        /// <summary>
        /// Crawls the paged listing at <c>SiteUrl</c>, fetches each entry's detail page, builds an
        /// <c>InviteInfo</c> from it and adds attachment links to <c>base.AttachList</c>.
        /// </summary>
        /// <param name="crawlAll">When false, <c>list.Count</c> is compared against <c>this.MaxCount</c>; the action taken on that condition is not visible in this excerpt.</param>
        /// <returns>Presumably <c>list</c>; no return statement is visible in this excerpt.</returns>
        /// <remarks>
        /// NOTE(review): this excerpt has lost its braces, its <c>try</c>/<c>else</c> keywords, part of the
        /// form-field arguments and the method tail; comments describe only what the visible lines establish.
        /// </remarks>
        protected override IList ExecuteCrawl(bool crawlAll)
            IList  list            = new List <InviteInfo>();
            int    pageInt         = 1;
            string html            = string.Empty;
            string viewState       = string.Empty;
            string eventValidation = string.Empty;

                // A failure of the first fetch is swallowed, so parsing continues with an empty html string.
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
            catch { }
            Parser   parser   = new Parser(new Lexer(html));
            // The page count is read from <span class="countPage">, from the text between "共" and "页".
            NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "countPage")));

            if (pageNode != null && pageNode.Count > 0)
                    // On a parse failure pageInt keeps its initial value 1 (the catch is empty).
                    pageInt = int.Parse(pageNode[0].ToNodePlainString().GetRegexBegEnd("共", "页"));
                catch { }
            for (int i = 1; i <= pageInt; i++)
                // Page 1 reuses the html fetched above; later pages are requested with form fields.
                if (i > 1)
                    // NOTE(review): the key array is empty in this excerpt while three values are supplied;
                    // the field names appear to have been removed.
                    NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] {
                    }, new string[] {
                        "1", "", ""
                        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc);
                    catch { continue; }
                parser = new Parser(new Lexer(html));
                // List entries are the <li> nodes under <div id="reflshPage">.
                NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "reflshPage")), true), new TagNameFilter("li")));
                if (listNode != null && listNode.Count > 0)
                    for (int j = 0; j < listNode.Count; j++)
                        string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty,
                               prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty,
                               specType = string.Empty, beginDate = string.Empty, endDate = string.Empty,
                               remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty,
                               CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;
                        ATag aTag = listNode[j].GetATag();
                        // NOTE(review): the statement guarded by this null check is not visible in this excerpt.
                        if (aTag == null)
                        prjName = aTag.GetAttribute("title");
                        // The project code is cut out of the title: the " (项目编号:" prefix is swapped for the
                        // marker "kdxx", then the text between the marker and ")" is taken.
                        string tempCode = prjName.GetReplace(" (项目编号:", "kdxx").GetReplace(")", ")").GetRegexBegEnd("kdxx", ")");
                        code      = tempCode.GetReplace("目编号:,目编号:");
                        beginDate = listNode[j].ToPlainTextString().GetDateRegex();
                        // NOTE(review): the link is prefixed with an empty string; a base URL may have been removed here.
                        InfoUrl   = "" + aTag.Link;
                        string htmldtl = string.Empty;
                            htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
                        catch { continue; }
                        parser = new Parser(new Lexer(htmldtl));
                        // The whole <body> of the detail page is used as the announcement content.
                        NodeList dtlNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("body"));
                        if (dtlNode != null && dtlNode.Count > 0)
                            HtmlTxt    = dtlNode.AsHtml();
                            inviteCtx  = System.Web.HttpUtility.HtmlDecode(HtmlTxt.ToCtxString());
                            buildUnit  = inviteCtx.GetBuildRegex();
                            prjAddress = inviteCtx.GetAddressRegex();
                            inviteType = prjName.GetInviteBidType();
                            specType   = "政府采购";
                            msgType    = "天津政府采购办公室";
                            InviteInfo info = ToolDb.GenInviteInfo("天津市", "天津市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                            // Anchors accepted by IsAtagAttach() are added to base.AttachList with HTML-decoded link text.
                            parser = new Parser(new Lexer(HtmlTxt));
                            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                            if (aNode != null && aNode.Count > 0)
                                for (int a = 0; a < aNode.Count; a++)
                                    ATag aFile = aNode[a] as ATag;
                                    if (aFile.IsAtagAttach())
                                        string link = string.Empty;
                                        // Links containing "http" are used as-is; the else keyword for the second
                                        // assignment is not visible in this excerpt.
                                        if (aFile.Link.Contains("http"))
                                            link = aFile.Link;
                                            link = "" + aFile.Link;
                                        string text = System.Web.HttpUtility.HtmlDecode(aFile.LinkText);
                                        base.AttachList.Add(ToolDb.GenBaseAttach(text, info.Id, link));
                            // NOTE(review): the statement guarded by this limit check is not visible in this excerpt.
                            if (!crawlAll && list.Count >= this.MaxCount)
Beispiel #10
        /// <summary>
        /// Crawls the paged notice list at SiteUrl, saves each notice through ToolDb.SaveEntity,
        /// and then saves the attachments and images linked from the notice body.
        /// </summary>
        /// <param name="crawlAll">When false, the MaxCount check on sqlCount applies; when true it is skipped.</param>
        /// <remarks>
        /// NOTE(review): this listing appears to be missing structural lines (braces, try/else
        /// keywords, part of the form-field arrays, and the statement guarded by the MaxCount check).
        /// Confirm against the original file before relying on the exact control flow.
        /// </remarks>
        protected override IList ExecuteCrawl(bool crawlAll)
            IList  list = new List <NotifyInfo>();
            int    pageInt = 1, sqlCount = 0;
            string html            = string.Empty;
            string viewState       = string.Empty;
            string cookiestr       = string.Empty;
            string eventValidation = string.Empty;

                // Fetch the first list page; cookiestr is passed by ref and reused by the paging request below.
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8, ref cookiestr);
            Parser   parser   = new Parser(new Lexer(html));
            // Collect the <a> tags that sit under <div id="pager">.
            NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "pager")), true), new TagNameFilter("a")));

            if (pageNode != null && pageNode.Count > 0)
                    // Page count = first run of digits in the href of the last pager link.
                    Regex  reg  = new Regex(@"[0-9]+");
                    string temp = reg.Match(pageNode[pageNode.Count - 1].GetATagHref().Replace("&#39;", "")).Value;
                    pageInt = int.Parse(temp);
            for (int i = 1; i <= pageInt; i++)
                if (i > 1)
                    // Pages after the first are requested by posting form fields taken from the previous response.
                    viewState       = this.ToolWebSite.GetAspNetViewState(html);
                    eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
                    NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                        new string[] { "__VIEWSTATE",
                                       "pager_input" },
                        new string[] {
                        (i - 1).ToString()
                        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8, ref cookiestr);
                    // A failed page request skips that page instead of aborting the crawl.
                    catch { continue; }
                parser = new Parser(new Lexer(html));
                NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "GV1")));
                if (listNode != null && listNode.Count > 0)
                    TableTag table = listNode[0] as TableTag;
                    // Rows 1 .. RowCount-2 are read; the first and last rows are skipped
                    // (presumably header and pager rows — TODO confirm).
                    for (int j = 1; j < table.RowCount - 1; j++)
                        string headName = string.Empty, releaseTime = string.Empty, infoScorce = string.Empty, msgType = string.Empty, infoUrl = string.Empty, ctxHtml = string.Empty, infoCtx = string.Empty, infoType = string.Empty;

                        TableRow tr = table.Rows[j];
                        headName    = tr.Columns[0].ToNodePlainString();
                        releaseTime = tr.Columns[1].ToPlainTextString();
                        // NOTE(review): the empty-string prefix looks like a placeholder for a site base URL — confirm.
                        infoUrl     = "" + tr.Columns[0].GetATagHref().Replace("&amp;", "&");
                        string htmldtl = string.Empty;
                            htmldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl).GetJsString();
                        // A detail page that cannot be fetched skips this row.
                        catch { continue; }
                        parser = new Parser(new Lexer(htmldtl));
                        // The notice body is taken from <span id="Lb_nr"> on the detail page.
                        NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "Lb_nr")));
                        if (dtlNode != null && dtlNode.Count > 0)
                            ctxHtml = dtlNode.AsHtml();
                            infoCtx = ctxHtml.ToCtxString();

                            msgType = "银川市公共资源交易中心";
                            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "宁夏回族自治区", "宁夏回族自治区及地市", "银川市", infoCtx, "通知公告");
                            // NOTE(review): the statement guarded by this check, and any increment of sqlCount,
                            // are not visible in this listing.
                            if (!crawlAll && sqlCount >= this.MaxCount)
                            // Attachments and images are processed only when SaveEntity returns true.
                            if (ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate))
                                parser = new Parser(new Lexer(ctxHtml));
                                NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                                if (aNode != null && aNode.Count > 0)
                                    for (int k = 0; k < aNode.Count; k++)
                                        ATag fileATag = aNode[k].GetATag();
                                        if (fileATag.IsAtagAttach())
                                            BaseAttach obj = null;
                                                // Links containing "http" are used as-is; the second assignment is the
                                                // other branch (its else keyword is missing from this listing).
                                                if (fileATag.Link.ToLower().Contains("http"))
                                                    obj = ToolHtml.GetBaseAttach(fileATag.Link, headName, info.Id);
                                                    obj = ToolHtml.GetBaseAttach("" + fileATag.Link, headName, info.Id);
                                            // Best effort: a failed attachment does not stop the crawl.
                                            catch { }
                                            if (obj != null)
                                                ToolDb.SaveEntity(obj, string.Empty);
                                    // NOTE(review): this reuses the parser that was already scanned for <a> tags;
                                    // confirm it can be scanned again without being reset or recreated.
                                    NodeList imgNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
                                    if (imgNode != null && imgNode.Count > 0)
                                        for (int k = 0; k < imgNode.Count; k++)
                                            // Fix: index by k. The original read imgNode[0] on every pass, so the
                                            // first image was saved repeatedly and the others were never saved.
                                            ImageTag   img = imgNode[k] as ImageTag;
                                            BaseAttach obj = null;
                                                if (img.ImageURL.ToLower().Contains("http"))
                                                    obj = ToolHtml.GetBaseAttach(img.ImageURL, headName, info.Id);
                                                    obj = ToolHtml.GetBaseAttach("" + img.ImageURL, headName, info.Id);
                                            catch { }
                                            if (obj != null)
                                                ToolDb.SaveEntity(obj, string.Empty);
Beispiel #11
        /// <summary>
        /// Crawls the paged table "topicChrList_20070702_table" reached from SiteUrl, fetches each
        /// row's detail page, and builds a NotifyInfo of type "政策法规" from it.
        /// </summary>
        /// <param name="crawlAll">When false, the MaxCount check on list.Count applies; when true it is skipped.</param>
        /// <remarks>
        /// NOTE(review): this listing appears to be missing structural lines (braces, try keywords,
        /// the body of the first catch, and whatever follows the MaxCount check). Confirm against
        /// the original file before relying on the exact control flow.
        /// </remarks>
        protected override IList ExecuteCrawl(bool crawlAll)
            IList list = new ArrayList();
            int    pageInt         = 1;
            string html            = string.Empty;
            string viewState       = string.Empty;
            string eventValidation = string.Empty;

                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8);
            // NOTE(review): the handler body for this catch is not visible in this listing.
            catch (Exception ex)
            Parser   parser   = new Parser(new Lexer(html));
            NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("select"), new HasAttributeFilter("name", "__ec_pages")));

            if (pageList != null && pageList.Count > 0)
                    // Page count = number of <option> entries in the "__ec_pages" select.
                    SelectTag selectTag = pageList[0] as SelectTag;
                    pageInt = selectTag.OptionTags.Length;
                // On failure pageInt keeps its initial value of 1.
                catch { }
            for (int i = 1; i <= pageInt; i++)
                if (i > 1)
                    // Pages after the first are requested by posting the list form with the page index
                    // in both "topicChrList_20070702_p" and "__ec_pages".
                    NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] {
                        "ec_i", "topicChrList_20070702_crd", "topicChrList_20070702_f_a",
                        "topicChrList_20070702_p", "topicChrList_20070702_s_name", "id", "method", "__ec_pages",
                        "topicChrList_20070702_rd", "topicChrList_20070702_f_name", "topicChrList_20070702_f_ldate"
                    }, new string[] {
                        "topicChrList_20070702", "20", "", i.ToString(), "", "709", "view", i.ToString(), "20", "", ""
                        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.UTF8);
                    // A failed page request skips that page instead of aborting the crawl.
                    catch { continue; }
                parser = new Parser(new Lexer(html));
                NodeList dtList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("id", "topicChrList_20070702_table"), new TagNameFilter("table")));
                if (dtList != null && dtList.Count > 0)
                    TableTag table = dtList[0] as TableTag;
                    // Data rows start at index 3 (the first three rows are skipped — presumably headers; confirm).
                    for (int j = 3; j < table.RowCount; j++)
                        TableRow tr = table.Rows[j];
                        string   headName = string.Empty, releaseTime = string.Empty, infoScorce = string.Empty, msgType = string.Empty,
                                 infoUrl = string.Empty, ctxHtml = string.Empty, infoCtx = string.Empty, infoType = string.Empty;
                        headName    = tr.Columns[1].ToPlainTextString().Trim();
                        releaseTime = tr.Columns[2].ToPlainTextString().Trim();
                        infoType    = "政策法规";
                        ATag aTag = tr.Columns[1].SearchFor(typeof(ATag), true)[0] as ATag;
                        // NOTE(review): the empty-string prefix looks like a placeholder for a site base URL,
                        // and every "/" is removed from the link — confirm both against the original file.
                        infoUrl = "" + aTag.Link.Replace("/", "");
                        string htmldeil = string.Empty;
                            htmldeil = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(infoUrl), Encoding.UTF8);
                        // A detail page that cannot be fetched skips this row.
                        catch { continue; }
                        // Strip script, style and xml fragments before parsing the detail page.
                        Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");
                        htmldeil = regexHtml.Replace(htmldeil, "");

                        parser = new Parser(new Lexer(htmldeil));
                        NodeFilter filter = new TagNameFilter("body");
                        NodeList   noList = parser.ExtractAllNodesThatMatch(filter);
                        if (noList != null && noList.Count > 0)
                            ctxHtml    = noList.AsHtml().Replace("<br/>", "\r\n").Replace("<BR/>", "");
                            // Plain-text body: spaces, &nbsp;, leftover tags and tabs are removed.
                            infoCtx    = noList.AsString().Replace(" ", "").Replace("&nbsp;", "").Replace("\t\t", "\t").Replace("\t\t", "\t");
                            infoCtx    = Regex.Replace(infoCtx, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase).Replace(" ", "").Replace("\t", "");
                            msgType    = "深圳政府采购";
                            infoScorce = infoScorce.Replace("&nbsp;", "");
                            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "深圳市工程", string.Empty, infoCtx, infoType);
                            // Fix: the cap applies when NOT crawling everything, matching the other
                            // ExecuteCrawl implementations in this file. The original tested crawlAll
                            // un-negated, which capped full crawls and left partial crawls uncapped.
                            if (!crawlAll && list.Count >= this.MaxCount)
Beispiel #12
        /// <summary>
        /// Crawls the paged grid "ctl00_cph_context_ZsjyjgsList2_GridView1" at SiteUrl, builds a
        /// BidProject per row, saves it through ToolDb.SaveEntity, and downloads the files listed
        /// in the detail page's attachment grid.
        /// </summary>
        /// <param name="crawlAll">Not referenced anywhere in the visible body.</param>
        /// <remarks>
        /// NOTE(review): this listing appears to be missing structural lines (braces, try/else
        /// keywords, the return statement). The two attachment sections below are identical apart
        /// from the sqlDelete line and look like the two branches of the "result" check — confirm
        /// against the original file.
        /// </remarks>
        protected override IList ExecuteCrawl(bool crawlAll)
            IList  list            = new ArrayList();
            string htl             = string.Empty;
            string cookiestr       = string.Empty;
            string viewState       = string.Empty;
            int    pageInt         = 1;
            string eventValidation = string.Empty;

                htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8, ref cookiestr);
            // Returns null (not the empty list) when the first page cannot be fetched.
            catch { return(null); }
            Parser   parser   = new Parser(new Lexer(htl));
            NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "ctl00_cph_context_ZsjyjgsList2_GridViewPaging1_PagingDescTd")));

            if (nodeList != null && nodeList.Count > 0)
                    // Page count = last comma-separated piece of the paging text with "共" and "页" removed.
                    string   pagestr = nodeList[0].ToPlainTextString().Trim();
                    string[] page    = pagestr.Split(',');
                    pageInt = int.Parse(page[page.Length - 1].Replace("共", "").Replace("页", ""));
                catch { pageInt = 1; }
            for (int i = 1; i <= pageInt; i++)
                if (i > 1)
                    // Pages after the first are requested by posting the form with the view state and
                    // event validation taken from the previous response.
                    viewState       = this.ToolWebSite.GetAspNetViewState(htl);
                    eventValidation = this.ToolWebSite.GetAspNetEventValidation(htl);
                    // NOTE(review): as listed there are 11 keys but 12 values; a key line may be missing
                    // from this listing — confirm.
                    NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] { "ctl00$ScriptManager1",
                                                                                                     "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE",
                                                                                                     "ctl00$cph_context$ZsjyjgsList2$ddlSearch", "ctl00$cph_context$ZsjyjgsList2$txtTitle",
                                                                                                     "ctl00$cph_context$ZsjyjgsList2$txtStartTime", "ctl00$cph_context$ZsjyjgsList2$txtEndTime",
                                                                                                     "__VIEWSTATEENCRYPTED", "__EVENTVALIDATION", "ctl00$cph_context$ZsjyjgsList2$GridViewPaging1$btnForwardToPage" },
                                                                                      new string[] { "ctl00$cph_context$ZsjyjgsList2$UpdatePanel2|ctl00$cph_context$ZsjyjgsList2$GridViewPaging1$btnForwardToPage", "", "", viewState, "xxbt", "", "", "", i.ToString(), "", eventValidation, "GO" });
                        htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.UTF8, ref cookiestr);
                    // The failure is swallowed without a continue, so htl still holds the previous
                    // page and that page is parsed again.
                    catch { }
                parser = new Parser(new Lexer(htl));
                NodeList dtList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_cph_context_ZsjyjgsList2_GridView1")));
                if (dtList != null && dtList.Count > 0)
                    TableTag table = dtList[0] as TableTag;
                    // Row 0 is skipped (presumably the header row — confirm).
                    for (int j = 1; j < table.RowCount; j++)
                        string bProv = string.Empty, bCity = string.Empty, bArea = string.Empty, bPrjno = string.Empty,
                               bPrjname = string.Empty, bBidresultendtime = string.Empty,
                               bBaseprice = string.Empty, bBiddate = string.Empty, bBuildunit = string.Empty, bBidmethod = string.Empty,
                               bRemark = string.Empty, bInfourl = string.Empty;
                        TableRow tr = table.Rows[j];
                        bPrjname   = tr.Columns[2].ToPlainTextString().Trim();
                        bBuildunit = tr.Columns[3].ToPlainTextString().Trim();
                        bBiddate   = tr.Columns[4].ToPlainTextString().Trim();
                        ATag aTag = tr.Columns[2].SearchFor(typeof(ATag), true)[0] as ATag;
                        // NOTE(review): the empty-string prefix looks like a placeholder for a site base URL — confirm.
                        bInfourl = "" + aTag.Link;
                        BidProject info   = ToolDb.GenResultProject("广东省", "深圳市", "龙岗区", bPrjno, bPrjname, bBidresultendtime, bBaseprice, bBiddate, bBuildunit, bBidmethod, bRemark, bInfourl);
                        // NOTE(review): this SQL is built by string formatting from text scraped off the page;
                        // a quote in PrjName breaks the statement and allows injection. Prefer a parameterized
                        // query if ToolDb offers one.
                        string     sql    = string.Format("select Id from BidProject where 1=1 and PrjNo='{0}' and PrjName='{1}'", info.PrjNo, info.PrjName);
                        string     result = Convert.ToString(ToolDb.ExecuteScalar(sql));
                        // A non-empty result means the query above returned an Id.
                        if (!string.IsNullOrEmpty(result))
                            if (ToolDb.SaveEntity(info, this.ExistCompareFields))
                                string htltxt = string.Empty;
                                    htltxt = this.ToolWebSite.GetHtmlByUrl(bInfourl, Encoding.UTF8);
                                // Best effort: on failure htltxt stays empty and no attachment grid is found.
                                catch { }
                                Parser   par      = new Parser(new Lexer(htltxt));
                                NodeList fileList = par.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_cph_context_AccessoriesControl1_GridView1")));
                                if (fileList != null && fileList.Count > 0)
                                    // NOTE(review): sqlDelete is built but no statement in this listing executes it.
                                    string sqlDelete = string.Format("delete from BaseAttach where SourceId='{0}'", result);
                                    TableTag tab = fileList[0] as TableTag;
                                    for (int k = 1; k < tab.RowCount; k++)
                                        TableRow   dr        = tab.Rows[k];
                                        ATag       aLink     = dr.Columns[1].SearchFor(typeof(ATag), true)[0] as ATag;
                                        // Target sub-folder is "<year><month>\" with no zero padding.
                                        string     data      = DateTime.Now.Year.ToString() + DateTime.Now.Month.ToString() + "\\";
                                        string     annexName = ToolDb.NewGuid;
                                        FilesClass file      = new FilesClass();
                                        file.strUrl = "" + aLink.Link.Replace("../", "");
                                        // The stored file name is a new GUID plus everything from the FIRST "." of the link text.
                                        // NOTE(review): IndexOf returns -1 when the link text has no ".", which makes
                                        // Substring throw ArgumentOutOfRangeException.
                                        int    index    = aLink.LinkText.IndexOf(".");
                                        string fileName = annexName + aLink.LinkText.Substring(index, aLink.LinkText.Length - index);
                                        file.strFileName = fileName;
                                        file.strFile     = data;
                                        long size = file.DownLoadFile();
                                        // Only downloads reporting a size above 1024 are recorded as attachments.
                                        if (size > 1024)
                                            BaseAttach baseInfo = ToolDb.GenBaseAttach(annexName, aLink.LinkText, info.Id, data + fileName, size.ToString(), "");
                                            ToolDb.SaveEntity(baseInfo, "");
                            // Second attachment section: same steps as above, without the sqlDelete line.
                            if (ToolDb.SaveEntity(info, this.ExistCompareFields))
                                string htltxt = string.Empty;
                                    htltxt = this.ToolWebSite.GetHtmlByUrl(bInfourl, Encoding.UTF8);
                                catch { }
                                Parser   par      = new Parser(new Lexer(htltxt));
                                NodeList fileList = par.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_cph_context_AccessoriesControl1_GridView1")));
                                if (fileList != null && fileList.Count > 0)
                                    TableTag tab = fileList[0] as TableTag;
                                    for (int k = 1; k < tab.RowCount; k++)
                                        TableRow   dr        = tab.Rows[k];
                                        ATag       aLink     = dr.Columns[1].SearchFor(typeof(ATag), true)[0] as ATag;
                                        string     data      = DateTime.Now.Year.ToString() + DateTime.Now.Month.ToString() + "\\";
                                        string     annexName = ToolDb.NewGuid;
                                        FilesClass file      = new FilesClass();
                                        file.strUrl = "" + aLink.Link.Replace("../", "");
                                        // Same first-"." extension logic as above, with the same -1 risk.
                                        int    index    = aLink.LinkText.IndexOf(".");
                                        string fileName = annexName + aLink.LinkText.Substring(index, aLink.LinkText.Length - index);
                                        file.strFileName = fileName;
                                        file.strFile     = data;
                                        long size = file.DownLoadFile();
                                        if (size > 1024)
                                            BaseAttach baseInfo = ToolDb.GenBaseAttach(annexName, aLink.LinkText, info.Id, data + fileName, size.ToString(), "");
                                            ToolDb.SaveEntity(baseInfo, "");
Beispiel #13
        /// <summary>
        /// Crawls the paged grid "ctl00_cph_context_GridView1" at SiteUrl and builds one NoticeInfo
        /// of type "补充通知" per row, using the row's own cells as the notice content.
        /// </summary>
        /// <param name="crawlAll">When false, the MaxCount check on list.Count applies; when true it is skipped.</param>
        /// <remarks>
        /// NOTE(review): this listing appears to be missing structural lines (braces, try/catch
        /// keywords, the form-field names, and whatever follows the MaxCount check). Confirm
        /// against the original file before relying on the exact control flow.
        /// </remarks>
        protected override IList ExecuteCrawl(bool crawlAll)
            IList  list            = new ArrayList();
            string html            = string.Empty;
            string viewState       = string.Empty;
            string eventValidation = string.Empty;
            string cookieStr       = string.Empty;
            int    pageInt         = 1;

                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
            // NOTE(review): the handler body for this catch is not visible in this listing.
            catch (Exception ex)
            Parser   parser   = new Parser(new Lexer(html));
            NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "ctl00_cph_context_GridViewPaingTwo1_PagingDescTd")));

            if (pageList != null && pageList.Count > 0)
                    // Page count = the text between "共" and "页" in the paging cell.
                    string temp = pageList.AsString().GetRegexBegEnd("共", "页");
                    pageInt = int.Parse(temp);
                // Falls back to a single page (the catch keyword for this block is missing from the listing).
                { pageInt = 1; }
            for (int i = 1; i <= pageInt; i++)
                if (i > 1)
                        // Pages after the first are requested by posting form fields built from the previous response.
                        viewState       = this.ToolWebSite.GetAspNetViewState(html);
                        eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
                        NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                            new string[] {
                            new string[] {
                            "1", "", "3", i.ToString(), "GO"
                        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.UTF8);
                    // A failed page request skips that page instead of aborting the crawl.
                    catch { continue; }
                parser = new Parser(new Lexer(html));
                NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_cph_context_GridView1")));
                if (nodeList != null && nodeList.Count > 0)
                    TableTag table = nodeList[0] as TableTag;
                    // Row 0 is skipped (presumably the header row — confirm).
                    for (int j = 1; j < table.RowCount; j++)
                        string InfoTitle = string.Empty, InfoType = string.Empty, PublistTime = string.Empty, InfoCtx = string.Empty, InfoUrl = string.Empty, prjCode = string.Empty, buildUnit = string.Empty, htmlTxt = string.Empty;

                        TableRow tr = table.Rows[j];
                        InfoType    = "补充通知";
                        InfoTitle   = tr.Columns[2].ToNodePlainString();
                        prjCode     = tr.Columns[1].ToNodePlainString();
                        PublistTime = tr.Columns[4].ToPlainTextString().GetDateRegex();
                        // No detail page is fetched: the notice URL is the list URL itself.
                        InfoUrl     = SiteUrl;
                        // The notice body is assembled from the row's cells.
                        InfoCtx     = "招标编号:" + prjCode + "\r\n工程名称:" + InfoTitle + "\r\n附件类型:" + tr.Columns[3].ToNodePlainString() + "\r\n上传时间:" + PublistTime;
                        htmlTxt     = InfoCtx;
                        NoticeInfo info = ToolDb.GenNoticeInfo("广东省", "东莞市区", string.Empty, string.Empty, InfoTitle, InfoType, InfoCtx, PublistTime, string.Empty, MsgTypeCosnt.DongGuanMsgType, InfoUrl, prjCode, buildUnit, string.Empty, string.Empty, string.Empty, string.Empty, htmlTxt);
                        ATag fileUrl = tr.Columns[2].GetATag();
                        if (fileUrl != null)
                            // NOTE(review): the empty-string prefix looks like a placeholder for a site base URL — confirm.
                            string     alink  = "" + fileUrl.Link;
                            // NOTE(review): the second Replace(";", "") is a no-op, and no statement in this
                            // listing uses "attach" after it is created.
                            BaseAttach attach = ToolDb.GenBaseAttach(fileUrl.LinkText.Replace("&nbsp", "").Replace(";", "").Replace(";", ""), info.Id, alink);
                        // NOTE(review): the statement guarded by this check is not visible in this listing.
                        if (!crawlAll && list.Count >= this.MaxCount)
Beispiel #14
        /// <summary>
        /// Crawls the paged table "GV1" at SiteUrl, fetches each row's detail page, and builds an
        /// InviteInfo (msgType "银川市公共资源交易中心", specType "政府采购") from it.
        /// </summary>
        /// <param name="crawlAll">When false, the MaxCount check on list.Count applies; when true it is skipped.</param>
        /// <remarks>
        /// NOTE(review): this listing appears to be missing structural lines (braces, try keywords,
        /// part of the form-field arrays, and whatever follows the MaxCount check). Confirm against
        /// the original file before relying on the exact control flow.
        /// </remarks>
        protected override IList ExecuteCrawl(bool crawlAll)
            IList  list            = new List <InviteInfo>();
            int    pageInt         = 1;
            string html            = string.Empty;
            string viewState       = string.Empty;
            string cookiestr       = string.Empty;
            string eventValidation = string.Empty;

                // Fetch the first list page; cookiestr is passed by ref and reused by the paging request below.
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8, ref cookiestr);
            Parser   parser   = new Parser(new Lexer(html));
            // Collect the <a> tags that sit under <div id="pager">.
            NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "pager")), true), new TagNameFilter("a")));

            if (pageNode != null && pageNode.Count > 0)
                    // Page count = first run of digits in the href of the last pager link.
                    Regex  reg  = new Regex(@"[0-9]+");
                    string temp = reg.Match(pageNode[pageNode.Count - 1].GetATagHref().Replace("&#39;", "")).Value;
                    pageInt = int.Parse(temp);
            for (int i = 1; i <= pageInt; i++)
                if (i > 1)
                    // Pages after the first are requested by posting form fields taken from the previous response.
                    viewState       = this.ToolWebSite.GetAspNetViewState(html);
                    eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
                    NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                        new string[] { "__VIEWSTATE",
                                       "pager_input" },
                        new string[] {
                        (i - 1).ToString()
                        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8, ref cookiestr);
                    // A failed page request skips that page instead of aborting the crawl.
                    catch { continue; }
                parser = new Parser(new Lexer(html));
                NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "GV1")));
                if (listNode != null && listNode.Count > 0)
                    TableTag table = listNode[0] as TableTag;
                    // Rows 1 .. RowCount-2 are read; the first and last rows are skipped
                    // (presumably header and pager rows — TODO confirm).
                    for (int j = 1; j < table.RowCount - 1; j++)
                        string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty,
                               prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty,
                               specType = string.Empty, beginDate = string.Empty, endDate = string.Empty,
                               remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty,
                               CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

                        TableRow tr = table.Rows[j];
                        prjName   = tr.Columns[0].ToNodePlainString();
                        beginDate = tr.Columns[1].ToPlainTextString();
                        // NOTE(review): the empty-string prefix looks like a placeholder for a site base URL — confirm.
                        InfoUrl   = "" + tr.Columns[0].GetATagHref().Replace("&amp;", "&");
                        string htmldtl = string.Empty;
                            htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
                        // A detail page that cannot be fetched skips this row.
                        catch { continue; }
                        parser = new Parser(new Lexer(htmldtl));
                        // The announcement body is taken from <span id="Lb_nr"> on the detail page.
                        NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "Lb_nr")));
                        if (dtlNode != null && dtlNode.Count > 0)
                            HtmlTxt   = dtlNode.ToHtml();
                            inviteCtx = HtmlTxt.ToCtxString();
                            // Code, build unit and address are extracted from the plain-text body.
                            code      = inviteCtx.GetCodeRegex();
                            buildUnit = inviteCtx.GetBuildRegex();
                            // Keep only the part of the build unit before its first space.
                            if (buildUnit.Contains(" "))
                                buildUnit = buildUnit.Remove(buildUnit.IndexOf(" "));
                            prjAddress = inviteCtx.GetAddressRegex().Replace(" ", "");
                            inviteType = prjName.GetInviteBidType();
                            specType   = "政府采购";
                            msgType    = "银川市公共资源交易中心";
                            InviteInfo info = ToolDb.GenInviteInfo("宁夏回族自治区", "宁夏回族自治区及地市", "银川市", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                            // NOTE(review): the statement guarded by this check is not visible in this listing.
                            if (!crawlAll && list.Count >= this.MaxCount)
Beispiel #15
        /// <summary>
        /// Crawls the paged link list under div "newlist pList" reached from SiteUrl, fetches each
        /// linked detail page, and builds an InviteInfo (msgType "太原市公共资源交易中心") plus a
        /// BaseAttach for every attachment link found in the detail body.
        /// </summary>
        /// <param name="crawlAll">When false, the MaxCount check on list.Count applies; when true it is skipped.</param>
        /// <remarks>
        /// NOTE(review): this listing appears to be missing structural lines (braces, try/else
        /// keywords, and the statements that follow several if lines). Confirm against the
        /// original file before relying on the exact control flow.
        /// </remarks>
        protected override IList ExecuteCrawl(bool crawlAll)
            IList  list            = new List <InviteInfo>();
            int    pageInt         = 1;
            string html            = string.Empty;
            string viewState       = string.Empty;
            string eventValidation = string.Empty;
            string cookiestr       = string.Empty;

                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
            // If the first page cannot be fetched, the (empty) list is returned.
            catch { return(list); }
            Parser   parser   = new Parser(new Lexer(html));
            // Collect the <a> tags that sit under <div id="pages">.
            NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "pages")), true), new TagNameFilter("a")));

            if (pageNode != null && pageNode.Count > 0)
                    // Page count = whatever follows "p=" (case-insensitive) in the last pager link's href.
                    string temp = pageNode[pageNode.Count - 1].GetATagHref();
                    temp    = temp.Substring(temp.ToLower().IndexOf("p="), temp.Length - temp.ToLower().IndexOf("p=")).GetReplace("P,p,=");
                    pageInt = int.Parse(temp);
                // On failure pageInt keeps its initial value of 1.
                catch { }
            for (int i = 1; i <= pageInt; i++)
                if (i > 1)
                        // NOTE(review): the empty-string prefix looks like a placeholder for the paged list URL — confirm.
                        html = this.ToolWebSite.GetHtmlByUrl("" + i);
                    // A failed page request skips that page instead of aborting the crawl.
                    catch { continue; }
                parser = new Parser(new Lexer(html));
                NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "newlist pList")), true), new TagNameFilter("a")));
                if (listNode != null && listNode.Count > 0)
                    for (int j = 0; j < listNode.Count; j++)
                        string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty,
                               prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty,
                               specType = string.Empty, beginDate = string.Empty, endDate = string.Empty,
                               remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty,
                               CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty, area = string.Empty;
                        INode node = listNode[j];
                        // NOTE(review): aTag is dereferenced on the next line without a null check.
                        ATag  aTag = node.GetATag();
                        prjName   = aTag.LinkText.GetReplace(" ");
                        beginDate = node.ToPlainTextString().GetDateRegex();
                        // When a date was found in the link text, it is removed from the project name.
                        if (!string.IsNullOrEmpty(beginDate))
                            prjName = prjName.GetReplace(beginDate);
                        InfoUrl = "" + aTag.Link;
                        string htmldtl = string.Empty;
                            htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
                        // A detail page that cannot be fetched skips this entry.
                        catch { continue; }
                        parser = new Parser(new Lexer(htmldtl));
                        // The announcement body is taken from <div class="Details"> on the detail page.
                        NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "Details")));
                        if (dtlNode != null && dtlNode.Count > 0)
                            HtmlTxt    = dtlNode.AsHtml();
                            inviteCtx  = HtmlTxt.ToCtxString();
                            prjAddress = inviteCtx.GetAddressRegex();
                            buildUnit  = inviteCtx.GetBuildRegex().GetReplace("&mdash,&emsp");
                            // Cut the build unit right after the first "公司".
                            if (buildUnit.Contains("公司"))
                                buildUnit = buildUnit.Remove(buildUnit.IndexOf("公司")) + "公司";
                            // NOTE(review): this keeps "地址" at the end of the build unit; dropping it may
                            // have been the intent — confirm.
                            if (buildUnit.Contains("地址"))
                                buildUnit = buildUnit.Remove(buildUnit.IndexOf("地址")) + "地址";
                            code       = inviteCtx.GetCodeRegex().GetCodeDel();
                            msgType    = "太原市公共资源交易中心";
                            specType   = "政府采购";
                            inviteType = "建设工程";
                            InviteInfo info = ToolDb.GenInviteInfo("山西省", "山西省及地市", "太原市", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                            // Scan the detail HTML for attachment links.
                            parser = new Parser(new Lexer(HtmlTxt));
                            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                            if (aNode != null && aNode.Count > 0)
                                for (int k = 0; k < aNode.Count; k++)
                                    ATag a = aNode[k] as ATag;
                                    if (a.IsAtagAttach())
                                        string link = string.Empty;
                                        // Links containing "http" are used as-is; the second assignment is the
                                        // other branch (its else keyword is missing from this listing).
                                        if (a.Link.ToLower().Contains("http"))
                                            link = a.Link;
                                            link = "" + a.Link.GetReplace("../,./");
                                        // NOTE(review): the statement guarded by this 500-byte check is not visible in this listing.
                                        if (Encoding.Default.GetByteCount(link) > 500)
                                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                            // NOTE(review): the statement guarded by this check is not visible in this listing.
                            if (!crawlAll && list.Count >= this.MaxCount)
Beispiel #16
        /// <summary>
        /// Crawls the listing pages starting at <c>SiteUrl</c>, opens each row's detail page and builds an
        /// <c>InviteInfo</c> from it; attachment links found in the detail HTML are added to <c>base.AttachList</c>.
        /// </summary>
        /// <param name="crawlAll">When false, <c>list.Count</c> is compared against <c>MaxCount</c> after each record.</param>
        /// <remarks>
        /// NOTE(review): this listing shows <c>catch</c> clauses without a visible <c>try</c> and has no braces, so
        /// some structural lines (and possibly statements such as the return) appear to be missing. Confirm against
        /// the complete source before relying on the control flow described in these comments.
        /// </remarks>
        protected override IList ExecuteCrawl(bool crawlAll)
            IList  list     = new List <InviteInfo>();
            string html     = string.Empty;
            int    pageInt  = 1;
            string nextPage = string.Empty;

                // Fetch the first listing page.
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
            Parser   parser   = new Parser(new Lexer(html));
            // The pager text is read from <td class="Normal"> cells.
            NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "Normal")));

            if (pageNode != null && pageNode.Count > 0)
                // temp: text between "总记录数:" (total records) and ",每页显示"; sum: text between "每页显示" (per page) and "条记录".
                string temp = pageNode.AsString().GetRegexBegEnd("总记录数:", ",每页显示");
                string sum  = pageNode.AsString().GetRegexBegEnd("每页显示", "条记录");
                    // Page count = total / per-page + 1 using integer division, so an exact multiple yields one extra page.
                    pageInt = int.Parse(temp) / int.Parse(sum) + 1;
                // A parse failure is ignored, leaving pageInt at its initial value of 1.
                catch { }
                parser = new Parser(new Lexer(pageNode.AsHtml()));
                NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (aNode != null && aNode.Count > 0)
                    for (int a = 0; a < aNode.Count; a++)
                        ATag aTag = aNode[a].GetATag();
                        // Remember the link of the "下一页" (next page) anchor.
                        // NOTE(review): nextPage is assigned here and below but never read in the visible code.
                        if (aTag.LinkText.Contains("下一页"))
                            nextPage = "" + aTag.Link;
            for (int i = 1; i <= pageInt; i++)
                if (i > 1)
                        // NOTE(review): the prefix is an empty string, so inurl is just "<i>.html".
                        // The base URL looks like it was stripped from this listing; confirm.
                        string inurl = "" + i + ".html";
                        html = this.ToolWebSite.GetHtmlByUrl(inurl);
                    // A page that fails to download is skipped.
                    catch { continue; }
                    parser = new Parser(new Lexer(html));
                    // On follow-up pages the pager anchors are looked up under <div class="partPage">.
                    NodeList aNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "partPage")), true), new TagNameFilter("a")));
                    if (aNode != null && aNode.Count > 0)
                        for (int a = 0; a < aNode.Count; a++)
                            ATag aTag = aNode[a].GetATag();
                            if (aTag.LinkText.Contains("下一页"))
                                nextPage = "" + aTag.Link;
                parser = new Parser(new Lexer(html));
                // The record list is the first table whose style attribute matches this exact string.
                NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("style", "width:100%;border-collapse:collapse;")));
                if (listNode != null && listNode.Count > 0)
                    TableTag table = listNode[0] as TableTag;
                    // Iteration starts at row 1; row 0 is presumably a header row (confirm).
                    for (int j = 1; j < table.RowCount; j++)
                        string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty,
                               prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty,
                               specType = string.Empty, beginDate = string.Empty, endDate = string.Empty,
                               remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty,
                               CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty, area = string.Empty;

                        TableRow tr   = table.Rows[j];
                        // Project name comes from the title attribute of the anchor in the first column.
                        ATag     aTag = tr.Columns[0].GetATag();
                        prjName = aTag.GetAttribute("title");
                        string m = tr.ChildrenHTML.ToString();
                        // Publish date is taken from the first <span>...</span> in the row HTML.
                        beginDate = m.GetRegexBegEnd("<span>", "</span>").GetDateRegex();
                        InfoUrl   = aTag.Link.GetReplace("&amp;", "&");
                        // NOTE(review): prefixing with "" is a no-op as written; a site root was probably intended.
                        InfoUrl   = "" + InfoUrl;
                        string htmldtl = string.Empty;
                            // Detail page is downloaded as UTF-8.
                            htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
                        // Rows whose detail page cannot be downloaded are skipped.
                        catch { continue; }
                        parser = new Parser(new Lexer(htmldtl));
                        NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "page_con")));
                        // Fall back to the <body> element when no <div class="page_con"> is found.
                        if (dtlNode == null || dtlNode.Count < 1)
                            dtlNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("body"));
                        if (dtlNode != null && dtlNode.Count > 0)
                            HtmlTxt    = dtlNode.AsHtml();
                            // Line-break style tags are replaced with CRLF before the HTML is reduced to text.
                            inviteCtx  = HtmlTxt.GetReplace("<br/>,</p>,<br>,<br />", "\r\n").ToCtxString();
                            buildUnit  = inviteCtx.GetBuildRegex();
                            prjAddress = inviteCtx.GetAddressRegex();
                            code       = inviteCtx.GetCodeRegex().GetCodeDel();
                            specType   = "政府采购";
                            inviteType = prjName.GetInviteBidType();
                            msgType    = "深圳市南山区教育局";
                            // area, endDate, remark and otherType are still empty strings at this point.
                            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", area, string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                            // Scan the detail HTML for anchors that IsAtagAttach() accepts and queue them as attachments.
                            parser = new Parser(new Lexer(HtmlTxt));
                            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                            if (aNode != null && aNode.Count > 0)
                                for (int a = 0; a < aNode.Count; a++)
                                    ATag file = aNode[a].GetATag();
                                    if (file.IsAtagAttach())
                                        string link = file.Link;
                                        // NOTE(review): links without "http" are prefixed with "", which changes nothing as written.
                                        if (!link.ToLower().Contains("http"))
                                            link = "" + file.Link;
                                        base.AttachList.Add(ToolDb.GenBaseAttach(file.LinkText, info.Id, link));
                            // NOTE(review): the statement guarded by this condition is not visible in this listing.
                            if (!crawlAll && list.Count >= this.MaxCount)
Beispiel #17
        /// <summary>
        /// Crawls the listing pages starting at <c>SiteUrl</c>, opens each entry's detail page and builds an
        /// <c>InviteInfo</c> whose build unit, address and code are extracted from the detail text with regular expressions.
        /// </summary>
        /// <param name="crawlAll">When false, <c>list.Count</c> is compared against <c>MaxCount</c> after each record.</param>
        /// <remarks>
        /// NOTE(review): this listing shows <c>catch</c> clauses without a visible <c>try</c> and has no braces, so
        /// some structural lines appear to be missing. Confirm against the complete source before relying on the
        /// control flow described in these comments.
        /// </remarks>
        protected override IList ExecuteCrawl(bool crawlAll)
            IList  list            = new ArrayList();
            string htl             = string.Empty;
            string cookiestr       = string.Empty;
            string viewState       = string.Empty;
            int    page            = 1;
            string eventValidation = string.Empty;

                // First page: URL-encoded SiteUrl, read as UTF-8, with cookiestr passed by reference.
                htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8, ref cookiestr);
            // NOTE(review): the body of this handler is not visible in this listing.
            catch (Exception ex)
            Parser   parser    = new Parser(new Lexer(htl));
            NodeList nodeList  = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "hyxma03")));
            // Matches the pager text "共N页" ("N pages in total").
            Regex    regexPage = new Regex(@"共\d+页");

                // Strip the surrounding '共' / '页' characters and parse the remaining digits as the page count.
                page = int.Parse(regexPage.Match(nodeList.AsString()).Value.Trim(new char[] { '共', '页' }));
            // A parse failure is ignored, leaving page at its initial value of 1.
            catch (Exception)
            { }
            for (int i = 1; i <= page; i++)
                if (i > 1)
                        // NOTE(review): the URL prefix is an empty string, so the request target is just "<i>.html"; confirm.
                        htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode("" + i.ToString() + ".html"), Encoding.UTF8);
                    // A page that fails to download is skipped.
                    catch (Exception ex) { continue; }
                parser = new Parser(new Lexer(htl));
                // Each listing entry is a <div class="hyxdianbeijing">.
                NodeList tableNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "hyxdianbeijing")));
                if (tableNodeList.Count > 0)
                    for (int j = 0; j < tableNodeList.Count; j++)
                        // NOTE(review): the anchor list returned by SearchFor is indexed with the same j as the div list,
                        // which assumes one anchor per matched div (confirm).
                        ATag   aTag = tableNodeList.SearchFor(typeof(ATag), true)[j] as ATag;
                        string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty,
                               prjAddress = string.Empty, inviteCtx = string.Empty, bidType = string.Empty,
                               specType = string.Empty, beginDate = string.Empty, endDate = string.Empty,
                               remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty,
                               CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;
                        prjName = aTag.LinkText;
                        // Publish date: first yyyy-M-d style date in the entry's plain text.
                        Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
                        beginDate = regDate.Match(tableNodeList[j].ToPlainTextString()).Value.Trim();
                        // Removing "amp;" turns an HTML-escaped "&amp;" in the link back into "&".
                        InfoUrl   = "" + aTag.Link.Replace("amp;", "").Trim();
                        string htmldetail = string.Empty;
                            htmldetail = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.UTF8).Replace("&nbsp;", "");
                        // NOTE(review): the body of this handler is not visible in this listing.
                        catch (Exception)
                        Parser   parserdetail = new Parser(new Lexer(htmldetail));
                        // Detail content lives in <div class="hyxzf2">.
                        NodeList dtnode       = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "hyxzf2")));
                        if (dtnode.Count > 0)
                            HtmlTxt = dtnode.AsHtml();
                            Regex regeximg = new Regex(@"<img[^>]*>");// remove images
                            HtmlTxt   = regeximg.Replace(HtmlTxt, "");
                            // Plain-text version of the detail node with "\n" expanded to CRLF.
                            inviteCtx = dtnode.AsString().Replace("\n", "\r\n");
                            // Drop <script>...</script> blocks and <?xml ... /> fragments from the text.
                            Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
                            inviteCtx = regexHtml.Replace(inviteCtx, "");
                            // Build unit: the line starting with "招标人" (tenderer) or "建设单位" (construction unit), label removed.
                            Regex regBuidUnit = new Regex(@"(招标人|建设单位)(:|:)[^\r\n]+\r\n");
                            buildUnit = regBuidUnit.Match(inviteCtx).Value.Replace("招标人:", "").Replace("建设单位:", "").Trim();
                            // Project address: the line starting with "工程地点" / "工程地址" (project location), label and colons removed.
                            Regex regPrjAddr = new Regex(@"(工程地点|工程地址)(:|:)[^\r\n]+\r\n");
                            prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程地点:", "").Replace("工程地址", "").Replace(":", "").Trim();
                            // Project code: the line starting with "工程编号" (project number), label removed.
                            Regex regCode = new Regex(@"工程编号(:|:)[^\r\n]+\r\n");
                            code = regCode.Match(inviteCtx).Value.Replace("工程编号:", "").Replace("工程编号:", "").Trim();
                            // Discard the code when it takes more than 50 bytes in the system default encoding.
                            if (Encoding.Default.GetByteCount(code) > 50)
                                code = "";
                            msgType  = "深圳市南山区政府采购及招标中心";
                            specType = "建设工程";
                            // NOTE(review): no-op as written; it assigns "" when buildUnit is already "".
                            if (buildUnit == "")
                                buildUnit = "";
                            // Missing address falls back to "见招标信息" ("see the tender notice").
                            if (prjAddress == "")
                                prjAddress = "见招标信息";
                            bidType = ToolHtml.GetInviteTypes(prjName);
                            // endDate, remark and otherType are still empty strings at this point.
                            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "",
                                                                   string.Empty, code, prjName, prjAddress, buildUnit,
                                                                   beginDate, endDate, inviteCtx, remark, msgType, bidType, specType, otherType, InfoUrl, HtmlTxt);
                            // NOTE(review): the statement guarded by this condition is not visible in this listing.
                            if (!crawlAll && list.Count >= this.MaxCount)
Beispiel #18
        /// <summary>
        /// Crawls the listing pages starting at <c>SiteUrl</c>, reads title/code/date from each anchor's
        /// <c>title</c> attribute, opens the detail page and builds a <c>BidInfo</c> from the extracted text.
        /// </summary>
        /// <param name="crawlAll">When false, <c>list.Count</c> is compared against <c>MaxCount</c> after each record.</param>
        /// <remarks>
        /// NOTE(review): this listing shows <c>catch</c> clauses without a visible <c>try</c>, <c>if</c> conditions
        /// without a visible statement, and no braces, so some structural lines appear to be missing. Confirm against
        /// the complete source before relying on the control flow described in these comments.
        /// </remarks>
        protected override IList ExecuteCrawl(bool crawlAll)
            IList  list            = new List <BidInfo>();
            string html            = string.Empty;
            string cookiestr       = string.Empty;
            string viewState       = string.Empty;
            int    pageInt         = 1;
            string eventValidation = string.Empty;

                // Fetch the first page in the system default encoding.
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
                // Page count: text between <strong> and </strong> with the "<fontcolor=red>1</font>/" prefix removed.
                string temp = html.GetRegexBegEnd("<strong>", "</strong>").GetReplace("<fontcolor=red>1</font>/");
                pageInt = int.Parse(temp);
            // Any failure above is ignored; pageInt keeps whatever value it had (initially 1).
            catch { }
            for (int i = 1; i <= pageInt; i++)
                if (i > 1)
                        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&SpecialID=0&page=" + i, Encoding.Default);
                    // A page that fails to download is skipped.
                    catch { continue; }
                Parser   parser   = new Parser(new Lexer(html));
                // The listing container is a <td height="200">; its anchors are the individual records.
                NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("height", "200")));
                if (listNode != null && listNode.Count > 0)
                    parser = new Parser(new Lexer(listNode.AsHtml()));
                    NodeList fontNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                    if (fontNode != null && fontNode.Count > 0)
                        for (int j = 0; j < fontNode.Count; j++)
                            ATag aTag = fontNode[j] as ATag;
                            // NOTE(review): the statement guarded by this null check is not visible in this listing.
                            if (aTag == null)
                            string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty, code = string.Empty,
                                   bidDate = string.Empty, beginDate = string.Empty, endDate = string.Empty, bidType = string.Empty, specType = string.Empty, InfoUrl = string.Empty, msgType = string.Empty, bidCtx = string.Empty, prjAddress = string.Empty, remark = string.Empty, prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty, area = string.Empty;
                            // The anchor's title attribute is queried for three labelled fields:
                            // "文章标题" (article title), "招标代码" (tender code) and "更新时间" (update time).
                            string temp = aTag.GetAttribute("title");
                            prjName   = temp.GetRegex("文章标题");
                            code      = temp.GetRegex("招标代码");
                            beginDate = temp.GetRegex("更新时间").GetDateRegex("yyyy/MM/dd");
                            // NOTE(review): prefixing with "" is a no-op as written; a site root was probably intended.
                            InfoUrl   = "" + aTag.Link;
                            string htmldtl = string.Empty;
                                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
                            // Records whose detail page cannot be downloaded are skipped.
                            catch { continue; }
                            parser = new Parser(new Lexer(htmldtl));
                            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("height", "200")));
                            if (dtlNode != null && dtlNode.Count > 0)
                                HtmlTxt = dtlNode.AsHtml();
                                // The HTML is lower-cased, then paragraph/line-break tags are replaced with CRLF before conversion to text.
                                bidCtx  = HtmlTxt.ToLower().GetReplace("</p>,<br/>,<br>", "\r\n").ToCtxString();

                                buildUnit = bidCtx.GetBuildRegex();
                                // Winning bidder: try several extractions in turn, each only when the previous one produced nothing.
                                bidUnit   = bidCtx.GetBidRegex().GetReplace("名称");
                                if (string.IsNullOrEmpty(bidUnit))
                                    bidUnit = bidCtx.GetRegex("中标人名称").GetReplace("名称");
                                if (string.IsNullOrEmpty(bidUnit))
                                    bidUnit = bidCtx.GetReplace("中标人名称:,:中标人名称,中标人:,中标人:", "\r\n").GetBidRegex().GetReplace("名称");
                                if (string.IsNullOrEmpty(bidUnit))
                                    // Last resort: flatten the last <table> of the detail HTML into text and search that.
                                    parser = new Parser(new Lexer(HtmlTxt));
                                    NodeList tableNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                                    if (tableNode != null && tableNode.Count > 0)
                                        TableTag tag = tableNode[tableNode.Count - 1] as TableTag;
                                        string   ctx = string.Empty;
                                        for (int r = 0; r < tag.RowCount; r++)
                                            for (int c = 0; c < tag.Rows[r].ColumnCount; c++)
                                                string tempStr = tag.Rows[r].Columns[c].ToNodePlainString();
                                                // Cells in an even 1-based position end a line with CRLF.
                                                // NOTE(review): the ":"-suffixed append below presumably belongs to an else branch
                                                // that is not visible in this listing (label cell followed by value cell); confirm.
                                                if ((c + 1) % 2 == 0)
                                                    ctx += tempStr.GetReplace(":,:") + "\r\n";
                                                    ctx += tempStr.GetReplace(":,:") + ":";
                                        bidUnit = ctx.GetBidRegex().GetReplace("名称");
                                        if (string.IsNullOrEmpty(bidUnit))
                                            bidUnit = ctx.GetRegex("中标人名称").GetReplace("名称");
                                bidMoney = bidCtx.GetMoneyRegex();
                                prjMgr   = bidCtx.GetMgrRegex();
                                // Cut buildUnit right after the first "公司" (company) ...
                                if (buildUnit.Contains("公司"))
                                    buildUnit = buildUnit.Remove(buildUnit.IndexOf("公司")) + "公司";
                                // ... and drop everything from "地址" (address) onward.
                                if (buildUnit.Contains("地址"))
                                    buildUnit = buildUnit.Remove(buildUnit.IndexOf("地址"));
                                msgType  = "辽宁省招标投标协调管理办公室";
                                specType = "建设工程";
                                bidType  = "建设工程";
                                // area, endDate and otherType are still empty strings; note that beginDate is passed twice.
                                BidInfo info = ToolDb.GenBidInfo("辽宁省", "辽宁省及地市", area, string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                                // NOTE(review): the statement guarded by this condition is not visible in this listing.
                                if (!crawlAll && list.Count >= this.MaxCount)
Beispiel #19
        /// <summary>
        /// Downloads a JSON document from <c>SiteUrl + MaxCount</c>, turns every element of its <c>"rows"</c> array
        /// into a <c>NotifyInfo</c>, saves it through <c>ToolDb.SaveEntity</c> and, when the save reports success and
        /// an attachment group id is present, downloads and saves that record's attachments.
        /// </summary>
        /// <param name="crawlAll">When false, <c>sqlCount</c> is compared against <c>MaxCount</c> before each save.</param>
        /// <returns>
        /// <c>null</c> when the initial download throws. NOTE(review): no other return statement is visible in this
        /// listing; confirm what is returned on the normal path.
        /// </returns>
        /// <remarks>
        /// NOTE(review): this listing shows <c>catch</c> clauses without a visible <c>try</c>, <c>if</c> conditions
        /// without a visible statement, and no braces, so some structural lines appear to be missing.
        /// </remarks>
        protected override IList ExecuteCrawl(bool crawlAll)
            int    sqlCount        = 0;
            string html            = string.Empty;
            string viewState       = string.Empty;
            string eventValidation = string.Empty;
            string cookiestr       = string.Empty;

                // MaxCount is appended directly to SiteUrl to form the request URL.
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + this.MaxCount);
            catch { return(null); }
            // Keep only the text from the first '{' to the last '}' so any wrapper around the JSON object is dropped.
            // NOTE(review): if the response contains no '{', IndexOf returns -1 and Substring would throw; confirm this cannot happen.
            int startIndex = html.IndexOf("{");
            int endIndex   = html.LastIndexOf("}");

            html = html.Substring(startIndex, (endIndex + 1) - startIndex);
            JavaScriptSerializer        serializer  = new JavaScriptSerializer();
            Dictionary <string, object> smsTypeJson = (Dictionary <string, object>)serializer.DeserializeObject(html);

            foreach (KeyValuePair <string, object> obj in smsTypeJson)
                // NOTE(review): the statement guarded by this check is not visible; presumably non-"rows" keys are skipped.
                if (obj.Key != "rows")
                object[] array = (object[])obj.Value;
                foreach (object arrValue in array)
                    string headName = string.Empty, releaseTime = string.Empty, infoScorce = string.Empty, msgType = string.Empty, infoUrl = string.Empty, ctxHtml = string.Empty, infoCtx = string.Empty, infoType = string.Empty;

                    // Each row is a JSON object; the fields read below are title, faBuStartDate, tongZhiGuid,
                    // attachFileGroupGuid and content.
                    Dictionary <string, object> dic = (Dictionary <string, object>)arrValue;
                    headName    = Convert.ToString(dic["title"]);
                    releaseTime = Convert.ToString(dic["faBuStartDate"]);
                    // NOTE(review): the URL prefix is an empty string, so infoUrl is just the GUID; confirm.
                    infoUrl     = "" + Convert.ToString(dic["tongZhiGuid"]);

                    string attachId  = Convert.ToString(dic["attachFileGroupGuid"]);
                    // NOTE(review): same empty prefix here, so attachUrl equals attachId as written.
                    string attachUrl = "" + attachId;

                    ctxHtml  = Convert.ToString(dic["content"]);
                    infoCtx  = ctxHtml.ToCtxString();
                    infoType = "通知公告";
                    msgType  = "深圳市建设工程交易中心龙岗分中心";

                    // infoScorce is still an empty string at this point.
                    NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "深圳龙岗区工程", "龙岗区", infoCtx, infoType);
                    // NOTE(review): the guarded statement is not visible, and sqlCount is never modified in the visible code.
                    if (!crawlAll && sqlCount > this.MaxCount)

                    // Attachments are only processed when SaveEntity returns true for the notice itself.
                    if (ToolDb.SaveEntity(info, this.ExistCompareFields))
                        if (!string.IsNullOrWhiteSpace(attachId))
                            // The attachment list is a second JSON document, expected to be an array of objects.
                            string attchJson = this.ToolWebSite.GetHtmlByUrl(attachUrl);
                            JavaScriptSerializer attachSerializer = new JavaScriptSerializer();
                            object[]             dicTypeJson      = (object[])attachSerializer.DeserializeObject(attchJson);
                            foreach (object objJson in dicTypeJson)
                                Dictionary <string, object> dicFiles = objJson as Dictionary <string, object>;
                                string attachName   = Convert.ToString(dicFiles["attachName"]);
                                string attachDelId  = Convert.ToString(dicFiles["attachGuid"]);
                                string attachDelUrl = "" + attachDelId;

                                BaseAttach attach = null;
                                    attach = ToolHtml.GetBaseAttach(attachDelUrl, attachName, info.Id);
                                // A failure while building the attachment is ignored and leaves attach as null.
                                catch { }

                                if (attach != null)
                                    ToolDb.SaveEntity(attach, "AttachServerPath,SourceID");
Beispiel #20
        /// <summary>
        /// Crawls the listing grid at <c>SiteUrl</c> page by page (pages after the first are requested by posting form
        /// values back to the same URL), opens each row's detail page and builds a <c>BidInfo</c>; when the plain text
        /// yields no winning bidder, the tables in the detail HTML are flattened in several ways and searched again.
        /// </summary>
        /// <param name="crawlAll">When false, <c>list.Count</c> is compared against <c>MaxCount</c> after each record.</param>
        /// <returns>
        /// The (empty) <c>list</c> when the first download throws. NOTE(review): no other return statement is visible
        /// in this listing; confirm what is returned on the normal path.
        /// </returns>
        /// <remarks>
        /// NOTE(review): this listing shows <c>catch</c> clauses without a visible <c>try</c>, <c>if</c> branches
        /// whose <c>else</c> is not visible, and no braces, so some structural lines appear to be missing.
        /// </remarks>
        protected override IList ExecuteCrawl(bool crawlAll)
            IList  list            = new List <BidInfo>();
            int    pageInt         = 1;
            string html            = string.Empty;
            string viewState       = string.Empty;
            string eventValidation = string.Empty;
            string cookiestr       = string.Empty;

                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
            catch { return(list); }
            Parser   parser   = new Parser(new Lexer(html));
            // The pager text is read from <td nowrap="true"> cells.
            NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("nowrap", "true")));

            if (pageNode != null && pageNode.Count > 0)
                    // Page count: text between "总页数" (total pages) and "当前页" (current page), with ":" removed.
                    string temp = pageNode.AsString().GetRegexBegEnd("总页数", "当前页").Replace(":", "");
                    pageInt = int.Parse(temp);
                // A parse failure is ignored, leaving pageInt at its initial value of 1.
                catch { }
            for (int i = 1; i <= pageInt; i++)
                if (i > 1)
                    // Later pages are fetched by posting form values (including the view state of the previous page) to SiteUrl.
                    // NOTE(review): the contents of the two string arrays (field names / values) are not visible in this listing.
                    viewState = this.ToolWebSite.GetAspNetViewState(html);
                    NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] {
                    }, new string[] {
                        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.Default);
                    // A page that fails to download is skipped.
                    catch { continue; }
                parser = new Parser(new Lexer(html));
                // The record list is the table with id "MoreInfoList1_DataGrid1".
                NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "MoreInfoList1_DataGrid1")));
                if (listNode != null && listNode.Count > 0)
                    TableTag table = listNode[0] as TableTag;
                    for (int j = 0; j < table.RowCount; j++)
                        string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty, code = string.Empty,
                               bidDate = string.Empty, beginDate = string.Empty, endDate = string.Empty, bidType = string.Empty, specType = string.Empty, InfoUrl = string.Empty,
                               msgType = string.Empty, bidCtx = string.Empty, prjAddress = string.Empty, remark = string.Empty, prjMgr = string.Empty, otherType = string.Empty,
                               HtmlTxt = string.Empty, area = string.Empty;

                        TableRow tr   = table.Rows[j];
                        // Column 1 holds the link (title attribute = project name); column 2 holds the date.
                        ATag     aTag = tr.Columns[1].GetATag();
                        prjName   = aTag.GetAttribute("title");
                        beginDate = tr.Columns[2].ToPlainTextString().GetDateRegex();
                        // NOTE(review): prefixing with "" is a no-op as written; a site root was probably intended.
                        InfoUrl   = "" + aTag.Link;
                        string htmldtl = string.Empty;
                            htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
                        // Rows whose detail page cannot be downloaded are skipped.
                        catch { continue; }
                        parser = new Parser(new Lexer(htmldtl));
                        // Detail content lives in <td id="TDContent">.
                        NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "TDContent")));
                        if (dtlNode != null && dtlNode.Count > 0)
                            HtmlTxt    = dtlNode.AsHtml();
                            // Paragraph/line-break tags become CRLF, then tabs in the resulting text become CRLF as well.
                            bidCtx     = HtmlTxt.GetReplace("</p>,<br />,<br/>", "\r\n").ToCtxString().GetReplace("\t", "\r\n");
                            prjAddress = bidCtx.GetAddressRegex();
                            buildUnit  = bidCtx.GetBuildRegex();
                            bidUnit    = bidCtx.GetBidRegex();
                            bidMoney   = bidCtx.GetMoneyRegex();
                            // Fallbacks when the plain text yields no winning bidder: rebuild "label:value" text from the tables.
                            if (string.IsNullOrWhiteSpace(bidUnit))
                                parser = new Parser(new Lexer(HtmlTxt));
                                NodeList tableNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                                if (tableNode != null && tableNode.Count > 0)
                                    TableTag tableTag = tableNode[0] as TableTag;
                                    string   ctx      = string.Empty;
                                    // Attempt 1 (first table): pair each cell with the cell directly below it (header row over value row).
                                    for (int r = 0; r < tableTag.RowCount; r++)
                                        for (int c = 0; c < tableTag.Rows[r].ColumnCount; c++)
                                            string temp = tableTag.Rows[r].Columns[c].ToNodePlainString();
                                                ctx += temp.GetReplace(":,:") + ":" + tableTag.Rows[r + 1].Columns[c].ToNodePlainString() + "\r\n";
                                                ctx += tableTag.Rows[r].Columns[c + 1].ToNodePlainString().GetReplace(":,:") + ":" + tableTag.Rows[r + 1].Columns[c + 1].ToNodePlainString() + "\r\n";
                                            // Failures here are ignored; presumably r + 1 / c + 1 running past the last row or column (confirm).
                                            catch { }
                                    bidUnit = ctx.GetBidRegex();
                                    // A result that still contains "中标" (winning bid) is treated as a label, not a name, and discarded.
                                    if (bidUnit.Contains("中标"))
                                        bidUnit = "";
                                    bidMoney = ctx.GetMoneyRegex();
                                    if (string.IsNullOrWhiteSpace(bidUnit))
                                        // Attempt 2 (first table): treat cells as alternating label / value within each row.
                                        ctx = "";
                                        for (int r = 0; r < tableTag.RowCount; r++)
                                            for (int c = 0; c < tableTag.Rows[r].ColumnCount; c++)
                                                string temp = tableTag.Rows[r].Columns[c].ToNodePlainString().GetReplace(":,:");

                                                // Even column index: label followed by ":".
                                                // NOTE(review): the CRLF append below presumably belongs to an else branch not visible here; confirm.
                                                if (c % 2 == 0)
                                                    ctx += temp + ":";
                                                    ctx += temp + "\r\n";
                                        bidUnit = ctx.GetBidRegex();
                                        if (bidUnit.Contains("中标"))
                                            bidUnit = "";
                                        bidMoney = ctx.GetMoneyRegex();
                                    if (string.IsNullOrWhiteSpace(bidUnit))
                                        // Attempts 3 and 4 repeat both strategies on the second table.
                                        // NOTE(review): tableNode is only known to hold at least one element here; confirm index 1 is safe.
                                        tableTag = tableNode[1] as TableTag;
                                        if (tableTag != null)
                                            ctx = string.Empty;
                                            for (int r = 0; r < tableTag.RowCount; r++)
                                                for (int c = 0; c < tableTag.Rows[r].ColumnCount; c++)
                                                    string temp = tableTag.Rows[r].Columns[c].ToNodePlainString();
                                                        ctx += temp.GetReplace(":,:") + ":" + tableTag.Rows[r + 1].Columns[c].ToNodePlainString() + "\r\n";
                                                        ctx += tableTag.Rows[r].Columns[c + 1].ToNodePlainString().GetReplace(":,:") + ":" + tableTag.Rows[r + 1].Columns[c + 1].ToNodePlainString() + "\r\n";
                                                    // Failures here are ignored, as in attempt 1.
                                                    catch { }
                                            bidUnit = ctx.GetBidRegex();
                                            if (bidUnit.Contains("中标"))
                                                bidUnit = "";
                                            bidMoney = ctx.GetMoneyRegex();
                                            if (string.IsNullOrWhiteSpace(bidUnit))
                                                ctx = "";
                                                for (int r = 0; r < tableTag.RowCount; r++)
                                                    for (int c = 0; c < tableTag.Rows[r].ColumnCount; c++)
                                                        string temp = tableTag.Rows[r].Columns[c].ToNodePlainString().GetReplace(":,:");

                                                        // Same alternating label / value layout as attempt 2 (else branch not visible here).
                                                        if (c % 2 == 0)
                                                            ctx += temp + ":";
                                                            ctx += temp + "\r\n";
                                                bidUnit = ctx.GetBidRegex();
                                                if (bidUnit.Contains("中标"))
                                                    bidUnit = "";
                                                bidMoney = ctx.GetMoneyRegex();
                            // Trim buildUnit: keep up to and including the first "公司" (company), then cut at
                            // "地址" (address), "联系" (contact) and "指挥部" (headquarters) when present.
                            if (buildUnit.Contains("公司"))
                                buildUnit = buildUnit.Remove(buildUnit.IndexOf("公司")) + "公司";
                            if (buildUnit.Contains("地址"))
                                buildUnit = buildUnit.Remove(buildUnit.IndexOf("地址"));
                            if (buildUnit.Contains("联系"))
                                buildUnit = buildUnit.Remove(buildUnit.IndexOf("联系"));
                            if (buildUnit.Contains("指挥部"))
                                buildUnit = buildUnit.Remove(buildUnit.IndexOf("指挥部"));
                            code = bidCtx.GetCodeRegex().GetCodeDel().GetReplace(".");
                            // Discard bidder values that contain label-like words: "日历天" (calendar days),
                            // "预期中标" (expected winner), "投标人" (bidder) or "中标价" (winning price).
                            if (bidUnit.Contains("日历天") || bidUnit.Contains("预期中标") || bidUnit.Contains("投标人") || bidUnit.Contains("中标价"))
                                bidUnit = string.Empty;

                                // Amounts below 1 are normalised to "0"; a value that does not parse is left unchanged.
                                if (decimal.Parse(bidMoney) < 1)
                                    bidMoney = "0";
                            catch { }

                            msgType   = "襄阳市公共资源交易中心";
                            specType  = "建设工程";
                            bidType   = prjName.GetInviteBidType();
                            buildUnit = buildUnit.Replace(" ", "");
                            // endDate, otherType and prjMgr are still empty strings; note that beginDate is passed twice.
                            BidInfo info = ToolDb.GenBidInfo("湖北省", "湖北省及地市", "襄阳市", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                            // Scan the detail HTML for anchors that IsAtagAttach() accepts and build attachment records for them.
                            parser = new Parser(new Lexer(HtmlTxt));
                            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                            if (aNode != null && aNode.Count > 0)
                                for (int k = 0; k < aNode.Count; k++)
                                    ATag a = aNode[k] as ATag;
                                    if (a.IsAtagAttach())
                                        string link = string.Empty;
                                        // Links containing "http" are used as-is.
                                        // NOTE(review): the ""-prefixed assignment below presumably belongs to an else branch not visible here.
                                        if (a.Link.ToLower().Contains("http"))
                                            link = a.Link;
                                            link = "" + a.Link;
                                        // NOTE(review): what happens to attach after creation is not visible in this listing.
                                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                            // NOTE(review): the statement guarded by this condition is not visible in this listing.
                            if (!crawlAll && list.Count >= this.MaxCount)
Beispiel #21
        /// <summary>
        /// Crawls a paged listing (first page requested as SiteUrl + "&amp;page=0"), downloads the detail
        /// page behind each listing row, flattens the detail table into "label:value" text, extracts
        /// fields from that text and builds one BidInfo per row via ToolDb.GenBidInfo.
        /// NOTE(review): this listing has lost its brace-only lines and bare try/else keywords, and the
        /// method is cut off at the MaxCount check, so the exact nesting and the return statement are
        /// not visible here.
        /// </summary>
        /// <param name="crawlAll">When false, the visible code compares list.Count with this.MaxCount;
        /// the action taken on that condition is outside this view.</param>
        protected override IList ExecuteCrawl(bool crawlAll)
            IList list = new ArrayList();
            int    pageInt         = 1;
            // crawlMax, viewState and eventValidation are not read anywhere in the visible part of this method.
            int    crawlMax        = 0;
            string html            = string.Empty;
            string viewState       = string.Empty;
            string eventValidation = string.Empty;

                // First page is fetched up front; its HTML is reused for page-count discovery and for i == 1 below.
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl + "&page=0"), Encoding.Default).Replace("&nbsp;", "");
            catch (Exception ex)
            Parser   parser = new Parser(new Lexer(html));
            NodeList sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("select"), new HasAttributeFilter("id", "page_PageList")));

            // Page count = number of options in the "page_PageList" select element.
            if (sNode != null && sNode.Count > 0)
                // NOTE(review): the "as" cast result is dereferenced without a null check.
                SelectTag select = sNode[0] as SelectTag;
                pageInt = select.OptionTags.Length;

            for (int i = 1; i <= pageInt; i++)
                if (i > 1)
                    // The page parameter in the URL is zero-based, hence (i - 1); a failed fetch skips the page.
                    try { html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "&page=" + (i - 1).ToString(), Encoding.Default); }
                    catch (Exception ex) { continue; }
                parser = new Parser(new Lexer(html));
                // Listing rows are the <tr> elements carrying this exact onmouseover attribute value.
                sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("tr"), new HasAttributeFilter("onmouseover", "\"#EFFCD0\";")));
                if (sNode != null && sNode.Count > 0)
                    for (int n = 0; n < sNode.Count; n++)
                        string   prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty, code = string.Empty, bidDate = string.Empty, beginDate = string.Empty, endDate = string.Empty, bidType = string.Empty, specType = string.Empty, InfoUrl = string.Empty, msgType = string.Empty, bidCtx = string.Empty, prjAddress = string.Empty, remark = string.Empty, prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;
                        TableRow tr = sNode[n] as TableRow;
                        // Column 0 text is used as the project name, column 1 text as the bid unit.
                        prjName = tr.Columns[0].ToPlainTextString().Trim();
                        bidUnit = tr.Columns[1].ToPlainTextString().Trim();

                        // NOTE(review): SearchFor(...)[0] is indexed without checking that an anchor was found.
                        ATag  aTag      = tr.Columns[0].SearchFor(typeof(ATag), true)[0] as ATag;
                        Regex regexLink = new Regex(@"id=[^-]+");
                        // NOTE(review): the prefix literal is empty as shown, so InfoUrl is only the "id=..." fragment;
                        // the base URL was possibly stripped from this listing - confirm against the real source.
                        InfoUrl = "" + regexLink.Match(aTag.Link).Value;
                        string htmldetail = string.Empty;
                            htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).Replace("&nbsp;", "").GetJsString();
                        catch (Exception ex) { continue; }
                        Parser   dtlparser = new Parser(new Lexer(htmldetail));
                        NodeList dtnode    = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "98%")));
                        if (dtnode != null && dtnode.Count > 0)
                            HtmlTxt = dtnode.AsHtml();
                            string   ctx   = string.Empty;
                            TableTag table = dtnode[0] as TableTag;
                            // Flatten the first matching table into text: the first cell of a row gets a ":" suffix,
                            // the other append ends the line with "\r\n".
                            // NOTE(review): an "else" between the two appends is presumably missing from this listing.
                            for (int k = 0; k < table.RowCount; k++)
                                for (int d = 0; d < table.Rows[k].ColumnCount; d++)
                                    if (d == 0)
                                        // NOTE(review): the two Replace(":", "") calls are identical as shown; one was
                                        // possibly a full-width colon before extraction - confirm.
                                        ctx += table.Rows[k].Columns[d].ToNodePlainString().Replace(":", "").Replace(":", "") + ":";
                                        ctx += table.Rows[k].Columns[d].ToNodePlainString() + "\r\n";
                            bidCtx    = ctx;
                            buildUnit = bidCtx.GetBuildRegex();
                            // Fallback: when no build unit was found, look up the "招标代理" label instead.
                            if (string.IsNullOrEmpty(buildUnit))
                                buildUnit = bidCtx.GetRegex("招标代理");
                            bidMoney   = bidCtx.GetMoneyRegex();
                            prjAddress = bidCtx.GetAddressRegex();
                            // Cut the address at the first "邮政编码" marker when one is present.
                            if (prjAddress.Contains("邮政编码"))
                                prjAddress = prjAddress.Remove(prjAddress.IndexOf("邮政编码"));
                            // beginDate is not assigned anywhere above in this block, so this check sees the
                            // initial empty string and the current date is used.
                            if (string.IsNullOrEmpty(beginDate))
                                beginDate = DateTime.Now.ToString("yyyy-MM-dd");
                            prjMgr   = bidCtx.GetMgrRegex();
                            code     = bidCtx.GetCodeRegex();
                            msgType  = "佛山市顺德区建设工程交易中心";
                            specType = "建设工程";
                            prjName  = ToolDb.GetPrjName(prjName);
                            bidType  = ToolHtml.GetInviteTypes(prjName);
                            // beginDate is passed twice to GenBidInfo (8th and 10th arguments).
                            BidInfo info = ToolDb.GenBidInfo("广东省", "佛山市区", "顺德区", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);

                            // The listing ends here; adding info to list and the body of this check are not visible.
                            if (!crawlAll && list.Count >= this.MaxCount)
Beispiel #22
        /// <summary>
        /// Crawls a JSON listing: each page is deserialized with JavaScriptSerializer, the entries under
        /// the "DataList" key are iterated, the detail page of each entry is downloaded and flattened to
        /// text, labelled fields are pulled out with regular expressions, and a ProjectLic is built via
        /// ToolDb.GenProjectLic.
        /// NOTE(review): this listing has lost its brace-only lines and bare try/else/continue keywords,
        /// and the method is cut off at the MaxCount check, so the exact nesting and the return statement
        /// are not visible here.
        /// </summary>
        /// <param name="crawlAll">When false, the visible code compares list.Count with this.MaxCount;
        /// the action taken on that condition is outside this view.</param>
        protected override IList ExecuteCrawl(bool crawlAll)
            IList  list            = new ArrayList();
            string htl             = string.Empty;
            string cookiestr       = string.Empty;
            // viewState and eventValidation are not read anywhere in the visible part of this method.
            string viewState       = string.Empty;
            int    page            = 1;
            string eventValidation = string.Empty;

                htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
            catch (Exception ex)
            // Page count is derived from the text following "RowCount" in the first response, divided by 20.
            if (htl.Contains("RowCount"))
                    int     index   = htl.IndexOf("RowCount");
                    // Everything after "RowCount" is kept, then the marker itself and the characters } : " are removed.
                    string  pageStr = htl.Substring(index, htl.Length - index).Replace("RowCount", "").Replace("}", "").Replace(":", "").Replace("\"", "");
                    decimal b       = decimal.Parse(pageStr) / 20;
                    // A "." in the quotient's string form is taken to mean a partial last page.
                    // NOTE(review): Convert.ToInt32(decimal) rounds to the nearest integer rather than truncating,
                    // so "+ 1" can overshoot by one page when the fraction is above .5 - confirm intended behavior.
                    // NOTE(review): an "else" between the two assignments is presumably missing from this listing.
                    if (b.ToString().Contains("."))
                        page = Convert.ToInt32(b) + 1;
                        page = Convert.ToInt32(b);
                // Any parse failure leaves page at its initial value of 1.
                catch { }
            for (int i = 1; i <= page; i++)
                if (i > 1)
                        // NOTE(review): the URL literal is empty as shown, so only the page number is requested;
                        // the URL was possibly stripped from this listing - confirm.
                        htl = this.ToolWebSite.GetHtmlByUrl("" + i.ToString(), Encoding.UTF8);
                JavaScriptSerializer        serializer  = new JavaScriptSerializer();
                Dictionary <string, object> smsTypeJson = (Dictionary <string, object>)serializer.DeserializeObject(htl);
                foreach (KeyValuePair <string, object> obj in smsTypeJson)
                    // NOTE(review): the statement guarded by this check is missing from the listing; presumably
                    // entries other than "DataList" are skipped.
                    if (obj.Key != "DataList")
                    object[] array = (object[])obj.Value;
                    foreach (object obj2 in array)
                        Dictionary <string, object> dicSmsType = (Dictionary <string, object>)obj2;
                        string pPrjName = string.Empty, pBuildUnit = string.Empty,
                               pBuildAddress = string.Empty, pBuildManager = string.Empty,
                               pBuildScale = string.Empty, pPrjPrice = string.Empty,
                               pPrjStartDate = string.Empty, PrjEndDate = string.Empty,
                               pConstUnit = string.Empty, pConstUnitManager = string.Empty,
                               pSuperUnit = string.Empty, pSuperUnitManager = string.Empty,
                               pProspUnit = string.Empty, pProspUnitManager = string.Empty,
                               pDesignUnit = string.Empty, pDesignUnitManager = string.Empty,
                               pPrjManager = string.Empty, pSpecialPerson = string.Empty,
                               pLicUnit = string.Empty, pPrjLicCode = string.Empty,
                               PrjLicDate = string.Empty, pPrjDesc = string.Empty,
                               pProvince = string.Empty, pCity = string.Empty,
                               pInfoSource = string.Empty, pUrl = string.Empty,
                               pCreatetime = string.Empty, pPrjCode = string.Empty;
                            // Fields taken directly from the JSON entry.
                            pPrjCode   = Convert.ToString(dicSmsType["AnnSerial"]);
                            pPrjName   = Convert.ToString(dicSmsType["PrjName"]);
                            pBuildUnit = Convert.ToString(dicSmsType["ConstOrg"]);
                            PrjLicDate = Convert.ToString(dicSmsType["IssueDate"]);
                            // NOTE(review): the prefix literal is empty as shown, so pUrl equals pPrjCode; the
                            // detail-page base URL was possibly stripped from this listing - confirm.
                            pUrl       = "" + pPrjCode;
                            string htmldetail = string.Empty;
                                htmldetail = this.ToolWebSite.GetHtmlByUrl(pUrl, Encoding.UTF8, ref cookiestr).Trim();
                            catch (Exception)
                            Parser   parser = new Parser(new Lexer(htmldetail));
                            NodeList dtList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "js-table mar-l-4")));
                            if (dtList != null && dtList.Count > 0)
                                TableTag table = dtList[0] as TableTag;
                                // Each table row becomes one "\r\n"-terminated line of pInfoSource, with the
                                // text of its cells concatenated and inner line breaks removed.
                                for (int j = 0; j < table.RowCount; j++)
                                    TableRow dr  = table.Rows[j];
                                    string   ctx = string.Empty;
                                    for (int k = 0; k < dr.ColumnCount; k++)
                                        ctx += dr.Columns[k].ToPlainTextString().Trim().Replace("\r", "").Replace("\n", "");
                                    pInfoSource += ctx + "\r\n";

                                // Each block below matches one "label:value" line in pInfoSource and strips the
                                // label, "/" and ":" from the match to leave the value.
                                // NOTE(review): the "(:|:)" alternation and the doubled Replace(":", "") are identical
                                // as shown; one side was possibly a full-width colon before extraction - confirm.
                                Regex regpDesignUnit = new Regex(@"设计单位(:|:)[^\r\n]+\r\n");
                                pDesignUnit = regpDesignUnit.Match(pInfoSource).Value.Replace("设计单位", "").Replace("/", "").Replace(":", "").Replace(":", "").Trim();

                                Regex regAddree = new Regex(@"(工程地址|工程地点)(:|:)[^\r\n]+\r\n");
                                pBuildAddress = regAddree.Match(pInfoSource).Value.Replace("工程地址", "").Replace("工程地点", "").Replace("/", "").Replace(":", "").Replace(":", "").Trim();

                                Regex regSca = new Regex(@"(建筑面积|建设规模)(:|:)[^\r\n]+\r\n");
                                pBuildScale = regSca.Match(pInfoSource).Value.Replace("建设规模", "").Replace("建筑面积", "").Replace("/", "").Replace(":", "").Replace(":", "").Trim();

                                Regex regpSuperUnit = new Regex(@"监理单位(:|:)[^\r\n]+\r\n");
                                pSuperUnit = regpSuperUnit.Match(pInfoSource).Value.Replace("监理单位", "").Replace("/", "").Replace(":", "").Replace(":", "").Trim();

                                Regex regpConstUnit = new Regex(@"施工单位(:|:)[^\r\n]+\r\n");
                                pConstUnit = regpConstUnit.Match(pInfoSource).Value.Replace("施工单位", "").Replace("/", "").Replace(":", "").Replace(":", "").Trim();

                                Regex regpLicUnit = new Regex(@"发证机关(:|:)[^\r\n]+\r\n");
                                pLicUnit = regpLicUnit.Match(pInfoSource).Value.Replace("发证机关", "").Replace("/", "").Replace(":", "").Replace(":", "").Trim();

                                Regex regpPosoUnit = new Regex(@"勘察单位(:|:)[^\r\n]+\r\n");
                                pProspUnit = regpPosoUnit.Match(pInfoSource).Value.Replace("勘察单位", "").Replace("/", "").Replace(":", "").Replace(":", "").Trim();

                                Regex regMan = new Regex(@"(项目经理|项目负责人)(:|:)[^\r\n]+\r\n");
                                pPrjManager = regMan.Match(pInfoSource).Value.Replace("项目负责人", "").Replace("项目经理", "").Replace("/", "").Replace(":", "").Replace(":", "").Trim();

                                Regex regBeg = new Regex(@"计划开工日期(:|:)[^\r\n]+\r\n");
                                pPrjStartDate = regBeg.Match(pInfoSource).Value.Replace("计划开工日期", "").Replace("/", "").Replace(":", "").Replace(":", "").Trim();

                                Regex regEnd = new Regex(@"计划竣工日期(:|:)[^\r\n]+\r\n");
                                PrjEndDate = regEnd.Match(pInfoSource).Value.Replace("计划竣工日期", "").Replace("/", "").Replace(":", "").Replace(":", "").Trim();

                                Regex regpPrice = new Regex(@"工程造价(:|:)[^\r\n]+\r\n");
                                pPrjPrice = regpPrice.Match(pInfoSource).Value.Replace("工程造价", "").Replace("/", "").Replace(":", "").Replace(":", "").Trim();
                                Regex regBidMoney = new Regex(@"[0-9]+[.]{0,1}[0-9]+");
                                // When the price text contains "万", everything from "万" on is cut and the number is kept as is.
                                // NOTE(review): an "else" and a "try" before the division branch are presumably missing
                                // from this listing.
                                if (pPrjPrice.Contains("万"))
                                    pPrjPrice = pPrjPrice.Remove(pPrjPrice.IndexOf("万")).Trim();
                                    pPrjPrice = regBidMoney.Match(pPrjPrice).Value;
                                        // NOTE(review): this uses regpPrice (the "工程造价" label pattern) on text from which
                                        // the label was already removed, so the match is empty, decimal.Parse throws and the
                                        // catch below sets "0"; regBidMoney was likely intended - confirm.
                                        pPrjPrice = (decimal.Parse(regpPrice.Match(pPrjPrice).Value) / 10000).ToString();
                                        // Amounts below 0.1 after the division by 10000 are reported as "0".
                                        if (decimal.Parse(pPrjPrice) < decimal.Parse("0.1"))
                                            pPrjPrice = "0";
                                    catch (Exception)
                                        pPrjPrice = "0";
                                // Default issuing authority when none was found on the detail page.
                                if (string.IsNullOrEmpty(pLicUnit))
                                    pLicUnit = "深圳市住房和建设局";
                                ProjectLic info = ToolDb.GenProjectLic(pPrjName, pBuildUnit, pBuildAddress, pBuildManager, pBuildScale, pPrjPrice, pPrjStartDate, PrjEndDate, pConstUnit, pConstUnitManager, pSuperUnit, pSuperUnitManager, pProspUnit, pProspUnitManager, pDesignUnit, pDesignUnitManager, pPrjManager, pSpecialPerson, pLicUnit, pPrjLicCode, PrjLicDate, pPrjDesc, "广东省", "深圳市区", pInfoSource, pUrl, pCreatetime, pPrjCode, "深圳市住房和建设局");
                                // The listing ends here; adding info to list and the body of this check are not visible.
                                if (!crawlAll && list.Count >= this.MaxCount)
Beispiel #23
        /// <summary>
        /// Crawls a listing that is fetched by POSTing "currentPage=N" to an .ashx endpoint, downloads the
        /// detail page behind each list item, builds a NoticeInfo via ToolDb.GenNoticeInfo and a BaseAttach
        /// for every attachment link found in the detail HTML.
        /// NOTE(review): this listing has lost its brace-only lines and bare try/else keywords, and the
        /// method is cut off at the MaxCount check, so the exact nesting and the final return statement
        /// are not visible here.
        /// </summary>
        /// <param name="crawlAll">When false, the visible code compares list.Count with this.MaxCount;
        /// the action taken on that condition is outside this view.</param>
        protected override IList ExecuteCrawl(bool crawlAll)
            IList  list            = new List <NoticeInfo>();
            // The page count is a fixed constant here; it is not read from the site in the visible code.
            int    pageInt         = 37;
            string html            = string.Empty;
            // viewState and eventValidation are not read anywhere in the visible part of this method.
            string viewState       = string.Empty;
            string eventValidation = string.Empty;
            string cookiestr       = string.Empty;
            // NOTE(review): the literal starts with "," - the host/path prefix was possibly stripped from this listing.
            string url             = ",App_Web_rzplwhmc.ashx?_method=getCurrentData&_session=rw";

                // The result of this GET is discarded; cookiestr is passed by ref and reused by the POSTs below
                // (presumably to pick up session cookies - confirm).
                this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8, ref cookiestr);
                string post = "currentPage=1\r\nQuery=";
                html = ToolHtml.GetHtmlByUrlPost(url, post, Encoding.UTF8, ref cookiestr);
            // A failure while fetching the first page returns null, not the empty list.
            catch { return(null); }

            for (int i = 1; i <= pageInt; i++)
                if (i > 1)
                        string post = "currentPage=" + i + "\r\nQuery=";
                        html = ToolHtml.GetHtmlByUrlPost(url, post, Encoding.UTF8, ref cookiestr);
                    catch { continue; }
                Parser   parser   = new Parser(new Lexer(html));
                // Every <li> in the response is treated as one listing entry.
                NodeList listNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("li"));
                if (listNode != null && listNode.Count > 0)
                    for (int j = 0; j < listNode.Count; j++)
                        string InfoTitle = string.Empty, InfoType = string.Empty, PublistTime = string.Empty, InfoCtx = string.Empty, InfoUrl = string.Empty, prjCode = string.Empty, buildUnit = string.Empty, htmlTxt = string.Empty, area = string.Empty;
                        INode  node = listNode[j];

                        // NOTE(review): aTag is used without a null check in the visible code.
                        ATag aTag = node.GetATag();
                        InfoTitle   = aTag.GetAttribute("title");
                        InfoType    = "控制价公示";
                        PublistTime = node.GetSpan().StringText;
                        // NOTE(review): the prefix literal is empty as shown; the base URL was possibly stripped - confirm.
                        InfoUrl     = "" + aTag.Link;
                        string htmldtl = string.Empty;
                            htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
                        catch { continue; }
                        parser = new Parser(new Lexer(htmldtl));
                        NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "ContentPlaceHolder1_InfoHtml")));
                        if (dtlNode != null && dtlNode.Count > 0)
                            htmlTxt = dtlNode.AsHtml();
                            InfoCtx = htmlTxt.ToCtxString();
                            parser  = new Parser(new Lexer(htmlTxt));
                            NodeList tableNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                            if (tableNode != null && tableNode.Count > 0)
                                string   ctx   = string.Empty;
                                TableTag table = tableNode[0] as TableTag;
                                // Flatten the first table into text: one append ends the cell with "\r\n" (guarded by
                                // the even-column check), the other with ":".
                                // NOTE(review): an "else" between the two appends is presumably missing from this listing.
                                for (int r = 0; r < table.RowCount; r++)
                                    for (int c = 0; c < table.Rows[r].ColumnCount; c++)
                                        string temp = table.Rows[r].Columns[c].ToNodePlainString();
                                        if ((c + 1) % 2 == 0)
                                            ctx += temp.GetReplace(":,:") + "\r\n";
                                            ctx += temp.GetReplace(":,:") + ":";
                                buildUnit = ctx.GetBuildRegex();
                            // area and prjCode are never assigned after their empty initialization in the visible code.
                            NoticeInfo info = ToolDb.GenNoticeInfo("甘肃省", "甘肃省及地市", area, string.Empty, InfoTitle, InfoType, InfoCtx, PublistTime, string.Empty, "甘肃省公共资源交易中心", InfoUrl, prjCode, buildUnit, string.Empty, string.Empty, "交通运输工程", string.Empty, htmlTxt);

                            // Collect attachment links from the detail HTML.
                            parser = new Parser(new Lexer(htmlTxt));
                            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                            if (aNode != null && aNode.Count > 0)
                                for (int k = 0; k < aNode.Count; k++)
                                    ATag a = aNode[k].GetATag();
                                    if (a.IsAtagAttach())
                                        string link = string.Empty;
                                        // Links containing "http" are used as they are; the other assignment prepends a
                                        // (here empty) prefix.
                                        // NOTE(review): an "else" between the two assignments is presumably missing.
                                        if (a.Link.ToLower().Contains("http"))
                                            link = a.Link;
                                            link = "" + a.Link;
                                        BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);

                            // The listing ends here; what is done with info/attach and the body of this check are not visible.
                            if (!crawlAll && list.Count >= this.MaxCount)
Beispiel #24
        /// <summary>
        /// Crawls one paged listing per entry of GetCitys() (key used as the area, value used as the URL),
        /// pages through it by posting a form back to the same URL, downloads each row's detail page,
        /// extracts bid fields from the detail text (with table-based fallbacks) and builds a BidInfo via
        /// ToolDb.GenBidInfo plus a BaseAttach for each attachment link.
        /// NOTE(review): this listing has lost its brace-only lines and bare try/else/continue keywords,
        /// and parts of some statements are truncated, so the exact nesting and the return statement are
        /// not visible here.
        /// </summary>
        /// <param name="crawlAll">When false, the visible code jumps to the Funcs label once count reaches
        /// this.MaxCount.</param>
        protected override IList ExecuteCrawl(bool crawlAll)
            IList list = new List <BidInfo>();
            Dictionary <string, string> citys = this.GetCitys();

            foreach (string area in citys.Keys)
                // NOTE(review): count is compared with MaxCount below but no increment is visible in this listing.
                int    count           = 0;
                int    pageInt         = 1;
                string html            = string.Empty;
                string viewState       = string.Empty;
                string eventValidation = string.Empty;
                string cookiestr       = string.Empty;
                    html = this.ToolWebSite.GetHtmlByUrl(citys[area], Encoding.UTF8, ref cookiestr);
                // NOTE(review): this returns from the whole method, so one failing area stops the remaining areas.
                catch { return(list); }
                Parser   parser   = new Parser(new Lexer(html));
                NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("nowrap", "true")));
                if (pageNode != null && pageNode.Count > 0)
                        // Page count is the text between "总页数" and "当前页"; a parse failure keeps pageInt at 1.
                        string temp = pageNode.AsString().GetRegexBegEnd("总页数", "当前页").Replace(":", "");
                        pageInt = int.Parse(temp);
                    catch { }
                for (int i = 1; i <= pageInt; i++)
                    if (i > 1)
                        // Later pages are requested by posting form values taken from the previous page's HTML.
                        viewState       = this.ToolWebSite.GetAspNetViewState(html);
                        eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
                        string viewSTATEGENERATOR = ToolHtml.GetHtmlInputValue(html, "__VIEWSTATEGENERATOR");
                        // NOTE(review): the argument arrays of this call are truncated in the listing.
                        NameValueCollection nvc   = this.ToolWebSite.GetNameValueCollection(new string[] {
                                                                                            new string[] {
                            html = this.ToolWebSite.GetHtmlByUrl(citys[area], nvc, Encoding.UTF8, ref cookiestr);
                        catch { continue; }
                    parser = new Parser(new Lexer(html));
                    NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "MoreInfoList1_DataGrid1")));
                    if (listNode != null && listNode.Count > 0)
                        TableTag table = listNode[0] as TableTag;
                        // Iteration starts at row 1, so row 0 is skipped (presumably a header row - confirm).
                        for (int j = 1; j < table.RowCount; j++)
                            string prjName = string.Empty,
                                   buildUnit = string.Empty, bidUnit = string.Empty,
                                   bidMoney = string.Empty, code = string.Empty,
                                   bidDate = string.Empty,
                                   beginDate = string.Empty,
                                   endDate = string.Empty, bidType = string.Empty,
                                   specType = string.Empty, InfoUrl = string.Empty,
                                   msgType = string.Empty, bidCtx = string.Empty,
                                   prjAddress = string.Empty, remark = string.Empty,
                                   prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

                            TableRow tr   = table.Rows[j];
                            // NOTE(review): aTag is used without a null check in the visible code.
                            ATag     aTag = tr.Columns[1].GetATag();
                            prjName   = aTag.GetAttribute("title").GetReplace("【正在报名】,【报名结束】");
                            beginDate = tr.Columns[2].ToPlainTextString().GetDateRegex();
                            // NOTE(review): the prefix literal is empty as shown; the base URL was possibly stripped - confirm.
                            InfoUrl   = "" + aTag.Link;
                            string htmldtl = string.Empty;
                                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
                            catch { continue; }
                            parser = new Parser(new Lexer(htmldtl));
                            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "TDContent")));
                            if (dtlNode != null && dtlNode.Count > 0)
                                HtmlTxt    = dtlNode.AsHtml();
                                // <br> variants are turned into line breaks before the HTML is reduced to text.
                                bidCtx     = HtmlTxt.GetReplace(new string[] { "<br/>", "<br />", "<br>" }, "\r\n").ToCtxString();
                                prjAddress = bidCtx.GetAddressRegex();
                                buildUnit  = bidCtx.GetBuildRegex();
                                bidUnit    = bidCtx.GetBidRegex();
                                bidMoney   = bidCtx.GetMoneyRegex();
                                prjMgr     = bidCtx.GetMgrRegex();
                                code       = bidCtx.GetCodeRegex().GetCodeDel();

                                // Fallback 1: no bid unit in the plain text, so re-read the first table of the detail
                                // HTML as label/value pairs by column parity.
                                if (string.IsNullOrEmpty(bidUnit))
                                    parser = new Parser(new Lexer(HtmlTxt));
                                    NodeList bidNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                                    if (bidNode != null && bidNode.Count > 0)
                                        string   ctx      = string.Empty;
                                        TableTag bidTable = bidNode[0] as TableTag;
                                        // NOTE(review): an "else" between the two appends is presumably missing from this listing.
                                        for (int r = 0; r < bidTable.RowCount; r++)
                                            for (int c = 0; c < bidTable.Rows[r].ColumnCount; c++)
                                                if ((c + 1) % 2 == 0)
                                                    ctx += bidTable.Rows[r].Columns[c].ToNodePlainString() + "\r\n";
                                                    ctx += bidTable.Rows[r].Columns[c].ToNodePlainString() + ":";

                                        // Fields already found in the plain text are kept; only empty ones are refilled.
                                        bidUnit = ctx.GetBidRegex();
                                        if (string.IsNullOrEmpty(bidMoney) || bidMoney == "0")
                                            bidMoney = ctx.GetMoneyString().GetMoney("万元");
                                        if (string.IsNullOrEmpty(prjAddress))
                                            prjAddress = ctx.GetAddressRegex();
                                        if (string.IsNullOrEmpty(buildUnit))
                                            buildUnit = ctx.GetBuildRegex();
                                        if (string.IsNullOrEmpty(code))
                                            code = ctx.GetCodeRegex().GetCodeDel();
                                        // A bid unit containing any of these words is discarded.
                                        if (bidUnit.Contains("推荐") || bidUnit.Contains("中标") || bidUnit.Contains("地址"))
                                            bidUnit = string.Empty;
                                        // Fallback 2: pair row 0 cells (as labels) with row 1 cells (as values), column by column.
                                        if (string.IsNullOrEmpty(bidUnit))
                                            if (bidTable.RowCount > 1)
                                                ctx = string.Empty;
                                                for (int d = 0; d < bidTable.Rows[0].ColumnCount; d++)
                                                    ctx += bidTable.Rows[0].Columns[d].ToNodePlainString() + ":";
                                                        // Errors while reading the row 1 cell are swallowed by the empty catch
                                                        // below (the "try" is missing from this listing).
                                                        ctx += bidTable.Rows[1].Columns[d].ToNodePlainString() + "\r\n";
                                                    catch { }
                                                bidUnit = ctx.GetBidRegex();
                                                // NOTE(review): GetMoney() is called without the "万元" argument used in fallback 1.
                                                if (string.IsNullOrEmpty(bidMoney) || bidMoney == "0")
                                                    bidMoney = ctx.GetMoneyString().GetMoney();
                                                if (string.IsNullOrEmpty(prjAddress))
                                                    prjAddress = ctx.GetAddressRegex();
                                                if (string.IsNullOrEmpty(buildUnit))
                                                    buildUnit = ctx.GetBuildRegex();
                                                if (string.IsNullOrEmpty(code))
                                                    code = ctx.GetCodeRegex().GetCodeDel();
                                    // Amounts above 10000 are divided by 10000; parse failures are swallowed by the
                                    // empty catch below (the "try" is missing from this listing).
                                    if (decimal.Parse(bidMoney) > 10000)
                                        bidMoney = (decimal.Parse(bidMoney) / 10000).ToString();
                                catch { }
                                // NOTE(review): Replace("1", "") and Replace("2", "") remove every "1" and "2" character
                                // from the bid unit, not only leading numbering.
                                bidUnit = bidUnit.Replace("名称", "").Replace("单位", "").Replace("№", "").Replace("1", "").Replace("2", "").Replace("联合体", "").Replace("(", "");

                                // Cut the bid unit right after the first occurrence of each organisation suffix.
                                if (bidUnit.Contains("公司"))
                                    bidUnit = bidUnit.Remove(bidUnit.IndexOf("公司")) + "公司";
                                if (bidUnit.Contains("研究院"))
                                    bidUnit = bidUnit.Remove(bidUnit.IndexOf("研究院")) + "研究院";
                                if (bidUnit.Contains("研究所"))
                                    bidUnit = bidUnit.Remove(bidUnit.IndexOf("研究所")) + "研究所";
                                bidType  = "房建市政";
                                specType = "建设工程";
                                msgType  = "广西壮族自治区公共资源交易中心";
                                // beginDate is passed twice to GenBidInfo (8th and 10th arguments).
                                BidInfo info = ToolDb.GenBidInfo("广西壮族自治区", "广西壮族自治区及地市", area, string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                                // Collect attachment links from the detail HTML.
                                parser = new Parser(new Lexer(HtmlTxt));
                                NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                                if (aNode != null && aNode.Count > 0)
                                    for (int k = 0; k < aNode.Count; k++)
                                        // NOTE(review): the "as" cast result is dereferenced without a null check.
                                        ATag a = aNode[k] as ATag;
                                        if (a.IsAtagAttach())
                                            string link = string.Empty;
                                            // NOTE(review): an "else" between the two assignments is presumably missing.
                                            if (a.Link.ToLower().Contains("http"))
                                                link = a.Link;
                                                link = "" + a.Link.GetReplace("../,./");
                                            // NOTE(review): the statement guarded by this 500-byte length check is missing
                                            // from the listing.
                                            if (Encoding.Default.GetByteCount(link) > 500)
                                            BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                                // Leaves the nested row/page loops for the current area via the Funcs label.
                                if (!crawlAll && count >= this.MaxCount)
                                    goto Funcs;
                Funcs :;
Beispiel #25
        /// <summary>
        /// Crawls a paged list of items, downloads each item's detail page, extracts code, build unit,
        /// address, bid unit and bid amount from the "news-content" div text, and builds a BidInfo via
        /// ToolDb.GenBidInfo.
        /// NOTE(review): this listing has lost its brace-only lines and bare try/else keywords, and the
        /// method is cut off at the MaxCount check, so the exact nesting and the return statement are
        /// not visible here.
        /// </summary>
        /// <param name="crawlAll">When false, the visible code compares list.Count with this.MaxCount;
        /// the action taken on that condition is outside this view.</param>
        protected override IList ExecuteCrawl(bool crawlAll)
            IList list = new ArrayList();
            int    pageInt         = 1;
            string html            = string.Empty;
            // viewState, eventValidation and cookiestr are not read anywhere in the visible part of this method.
            string viewState       = string.Empty;
            string eventValidation = string.Empty;
            string cookiestr       = string.Empty;

                // NOTE(review): the URL literal is empty as shown; it was possibly stripped from this listing - confirm.
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(""), Encoding.UTF8);
            catch (Exception ex)

            Parser   parser  = new Parser(new Lexer(html));
            NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagelist")));

            if (tdNodes != null && tdNodes.Count > 0)
                string pageTemp = tdNodes.AsString().Replace("&nbsp;", "");
                // Page count is the number following "1/" in the pager text; a parse failure keeps pageInt at 1.
                Regex  regpage  = new Regex(@"1/\d+");
                    pageInt = int.Parse(regpage.Match(pageTemp).Value.Replace("1/", ""));
                catch (Exception ex) { }
            for (int i = 1; i <= pageInt; i++)
                if (i > 1)
                    // NOTE(review): no catch is visible around this page fetch in the listing.
                    html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode("" + i.ToString()), Encoding.UTF8);
                parser = new Parser(new Lexer(html));
                // Every <li> on the page is treated as one listing entry.
                NodeList nodeList = parser.ExtractAllNodesThatMatch(new TagNameFilter("li"));

                if (nodeList != null && nodeList.Count > 0)
                    for (int j = 0; j < nodeList.Count; j++)
                        string       prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty, code = string.Empty, bidDate = string.Empty, beginDate = string.Empty, endDate = string.Empty, bidType = string.Empty, specType = string.Empty, InfoUrl = string.Empty, msgType = string.Empty, bidCtx = string.Empty, prjAddress = string.Empty, remark = string.Empty, prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;
                        CompositeTag obj = nodeList[j] as CompositeTag;

                        // NOTE(review): both SearchFor(...)[0] results are indexed without checking that a match exists.
                        ATag aTag     = obj.SearchFor(typeof(ATag), true)[0] as ATag;
                        Span dateSpan = obj.SearchFor(typeof(Span), true)[0] as Span;
                        prjName   = aTag.GetAttribute("title");
                        // The date text is taken from the span with surrounding square brackets trimmed.
                        beginDate = dateSpan.ToPlainTextString().Trim(new char[] { '[', ']' });
                        // NOTE(review): the prefix literal is empty as shown; the base URL was possibly stripped - confirm.
                        InfoUrl   = "" + aTag.Link;
                        string htmldetail = string.Empty;
                            // The detail page is downloaded twice: this first copy supplies HtmlTxt (the raw
                            // "news-content" HTML), the second copy below has <br> variants turned into line breaks.
                            htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).Replace("&nbsp;", "").Trim();
                            Parser   dtlparserHTML = new Parser(new Lexer(htmldetail));
                            NodeList dtnodeHTML    = dtlparserHTML.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "news-content"), new TagNameFilter("div")));
                            HtmlTxt    = dtnodeHTML.AsHtml();
                            htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).Replace("&nbsp;", "").Replace("</br>", "\r\n").Replace("<br>", "\r\n");
                            // Remove <script>...</script> blocks and <?xml .../> fragments before parsing.
                            Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
                            htmldetail = regexHtml.Replace(htmldetail, "");
                        catch (Exception ex) { continue; }
                        Parser   dtlparser = new Parser(new Lexer(htmldetail));
                        NodeList dtnode    = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "news-content"), new TagNameFilter("div")));

                        bidCtx = dtnode.ToHtml().ToCtxString();
                        // Strip any remaining tags and stray angle brackets, then normalise blank-line runs to "\r\n".
                        bidCtx = Regex.Replace(bidCtx, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase).Replace("<", "").Replace(">", "").Replace("\n\n\n\t", "\r\n").Replace("\n\n", "\r\n");
                        // NOTE(review): the "(:|:)" alternations and doubled Replace(":", "") / Replace(",", "") calls
                        // below are identical as shown; one side was possibly a full-width character - confirm.
                        Regex regCode = new Regex(@"招标编号(:|:)[^\r\n]+\r\n");
                        code = regCode.Match(bidCtx).Value.Replace("招标编号", "").Replace(":", "").Replace(":", "").Trim();
                        // A code longer than 50 bytes in the default encoding is discarded.
                        if (Encoding.Default.GetByteCount(code) > 50)
                            code = "";
                        Regex regbuildUnit = new Regex(@"(采购人|采购单位|采购代理机构)(:|:)[^\r\n]+\r\n");
                        buildUnit  = regbuildUnit.Match(bidCtx).Value.Replace("采购人", "").Replace("采购单位", "").Replace("采购代理机构", "").Replace(":", "").Replace(":", "").Trim();
                        prjAddress = bidCtx.GetAddressRegex();
                        bidUnit    = bidCtx.GetBidRegex();// regBidUnit.Match(bidCtx).Value.Replace("中标单位", "").Replace(":", "").Replace(":", "").Trim();

                        Regex  regBidMoneystr = new Regex(@"(中标价|价格|金额)(:|:)[^\r\n]+\r\n");
                        string monerystr      = regBidMoneystr.Match(bidCtx).Value.Replace("中标价", "").Replace("价格", "").Replace("金额", "").Replace("万元整", "").Replace(":", "").Replace(":", "").Replace(",", "").Replace(",", "").Trim();
                        Regex  regBidMoney    = new Regex(@"[0-9]+[.]{0,1}[0-9]+");

                        if (!string.IsNullOrEmpty(regBidMoney.Match(monerystr).Value))
                            // NOTE(review): "万元整" was already removed from monerystr above, so the last check is always true.
                            // NOTE(review): an "else" and a "try" before the division branch are presumably missing
                            // from this listing.
                            if ((monerystr.Contains("万元") || monerystr.Contains("万美元")) && !monerystr.Contains("万元整"))
                                bidMoney = regBidMoney.Match(monerystr).Value;
                                    // The amount is divided by 10000; results below 0.1 and parse failures become "0".
                                    bidMoney = (decimal.Parse(regBidMoney.Match(monerystr).Value) / 10000).ToString();
                                    if (decimal.Parse(bidMoney) < decimal.Parse("0.1"))
                                        bidMoney = "0";
                                catch (Exception)
                                    bidMoney = "0";
                        specType = "其他";
                        msgType  = "深圳市深水水务咨询有限公司";
                        prjName  = ToolDb.GetPrjName(prjName);
                        bidType  = ToolHtml.GetInviteTypes(prjName);
                        // beginDate is passed twice to GenBidInfo (8th and 10th arguments).
                        BidInfo info = ToolDb.GenBidInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                        // The listing ends here; adding info to list and the body of this check are not visible.
                        if (!crawlAll && list.Count >= this.MaxCount)
Beispiel #26
        /// <summary>
        /// Crawls the paged listing at <c>SiteUrl</c>, builds one <c>NoticeInfo</c> (InfoType "标底公示") for each
        /// listing-table row whose detail page contains a <c>div.showContent</c>, and builds <c>BaseAttach</c>
        /// objects from an <c>inside_table</c> table found inside that detail HTML.
        /// </summary>
        /// <param name="crawlAll">When false, <c>list.Count</c> is compared against <c>MaxCount</c> after each item.</param>
        /// <remarks>
        /// NOTE(review): this excerpt contains no braces, no <c>try</c> keywords and no return statement; nesting
        /// is only suggested by indentation and the statements guarded by some <c>if</c>s are not visible.
        /// </remarks>
        protected override IList ExecuteCrawl(bool crawlAll)
            IList  list            = new List <NoticeInfo>();
            int    pageInt         = 1;
            string html            = string.Empty;
            // viewState, eventValidation and cookiestr are not referenced in the visible part of this method.
            string viewState       = string.Empty;
            string eventValidation = string.Empty;
            string cookiestr       = string.Empty;

                // Fetch the first listing page (URL-encoded SiteUrl, read as UTF-8).
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8);
            Parser   parser  = new Parser(new Lexer(html));
            // The pager text is read from <span class="input-group-addon"> elements.
            NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "input-group-addon")));

            if (tdNodes != null && tdNodes.Count > 0)
                    // reTemp: text between "共" and "项"; pageTemp: text between "项" and "页" after GetReplace
                    // (presumably GetReplace removes each comma-separated token — confirm against the helper).
                    string reTemp   = tdNodes.AsString().GetRegexBegEnd("共", "项");
                    string pageTemp = tdNodes.AsString().GetRegexBegEnd("项", "页").GetReplace("共,项,页," + reTemp + ",,");
                    pageInt = int.Parse(pageTemp);
                // A parse failure is ignored, leaving pageInt at its initial value of 1.
                catch (Exception) { }
            for (int i = 1; i <= pageInt; i++)
                // Page 1 was already fetched above; later pages are requested with "?pi=" + (i - 1).
                if (i > 1)
                        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "?pi=" + (i - 1), Encoding.UTF8);
                    // Any exception from the fetch skips this page.
                    catch { continue; }
                parser = new Parser(new Lexer(html));
                NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "inside_table")));
                if (nodeList != null && nodeList.Count > 0)
                    TableTag table = nodeList[0] as TableTag;
                    // Iteration starts at row 1; row 0 is presumably the header row — confirm.
                    for (int j = 1; j < table.RowCount; j++)
                        string   InfoTitle = string.Empty, InfoType = string.Empty, PublistTime = string.Empty, InfoCtx = string.Empty, InfoUrl = string.Empty, htmlTxt = string.Empty, prjCode = string.Empty, buildUnit = string.Empty, prjType = string.Empty;
                        TableRow tr = table.Rows[j];
                        // Columns 1..5 are read as: project code, title (with link), build unit, project type, publish time.
                        prjCode     = tr.Columns[1].ToNodePlainString().Replace(" ", "");
                        InfoTitle   = tr.Columns[2].ToPlainTextString().Trim();
                        buildUnit   = tr.Columns[3].ToNodePlainString();
                        prjType     = tr.Columns[4].ToNodePlainString();
                        InfoType    = "标底公示";
                        PublistTime = tr.Columns[5].ToPlainTextString().Trim();
                        // NOTE(review): the empty-string prefix looks like a base URL that is absent from this excerpt — confirm.
                        InfoUrl     = "" + tr.Columns[2].GetATagHref();
                        string ctxhtml = string.Empty;
                            // Fetch the detail page for this row.
                            ctxhtml = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);
                        parser = new Parser(new Lexer(ctxhtml));
                        NodeList dtnode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "showContent"), new TagNameFilter("div")));
                        if (dtnode != null && dtnode.Count > 0)
                            htmlTxt = dtnode.AsHtml();
                            // Lower-case the HTML and turn </p>, <br>, <br/> into CRLF before ToCtxString() is applied.
                            InfoCtx = htmlTxt.ToLower().Replace("</p>", "\r\n").Replace("<br>", "\r\n").Replace("<br/>", "\r\n").ToCtxString();
                            NoticeInfo info = ToolDb.GenNoticeInfo("广东省", "深圳宝安区工程", string.Empty, string.Empty, InfoTitle, InfoType, InfoCtx, PublistTime, string.Empty, "深圳市建设工程交易中心宝安分中心", InfoUrl, prjCode, buildUnit, string.Empty, string.Empty, prjType, string.Empty, htmlTxt);
                            // Re-parse the detail HTML and look for an inside_table table to read attachments from.
                            parser = new Parser(new Lexer(htmlTxt));
                            NodeList fileNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "inside_table")));
                            if (fileNode != null && fileNode.Count > 0 && fileNode[0] is TableTag)
                                TableTag fileTable = fileNode[0] as TableTag;
                                // Starts at row 1 (row 0 presumably a header — confirm). Column 1 supplies both the
                                // attachment name and, via its first ATag, the link ("../" replaced by "/").
                                for (int f = 1; f < fileTable.Rows.Length; f++)
                                    BaseAttach attach = ToolDb.GenBaseAttach(fileTable.Rows[f].Columns[1].ToPlainTextString().Trim(), info.Id, "" + (fileTable.Rows[f].Columns[1].SearchFor(typeof(ATag), true)[0] as ATag).Link.Replace("../", "/"));
                            // Item limit check for non-full crawls; the guarded statement is not visible in this excerpt.
                            if (!crawlAll && list.Count >= this.MaxCount)
Beispiel #27
        /// <summary>
        /// Crawls a paged listing starting at <c>SiteUrl</c>, follows each <c>div.hyxdianbeijing</c> entry to its
        /// detail page, extracts bid fields from the <c>div.hyxzf2</c> text with regular expressions and builds a
        /// <c>BidInfo</c> per entry.
        /// </summary>
        /// <param name="crawlAll">When false, <c>list.Count</c> is compared against <c>MaxCount</c> after each item.</param>
        /// <remarks>
        /// NOTE(review): this excerpt contains no braces, no <c>try</c>/<c>else</c> keywords and no return
        /// statement; nesting is only suggested by indentation and several guarded statements are not visible.
        /// </remarks>
        protected override IList ExecuteCrawl(bool crawlAll)
            IList  list            = new ArrayList();
            string htl             = string.Empty;
            string cookiestr       = string.Empty;
            string viewState       = string.Empty;
            int    page            = 1;
            string eventValidation = string.Empty;

                // Fetch the first listing page; cookiestr is passed by reference to the helper.
                htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8, ref cookiestr);
            // The body of this catch is not visible in this excerpt.
            catch (Exception ex)
            Parser   parser    = new Parser(new Lexer(htl));
            NodeList nodeList  = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "hyxma03")));
            Regex    regexPage = new Regex(@"共\d+页");

                // Take the "共<digits>页" match from the div.hyxma03 text, trim the two surrounding characters and parse the number.
                page = int.Parse(regexPage.Match(nodeList.AsString()).Value.Trim(new char[] { '共', '页' }));
            // A parse failure is ignored, leaving page at its initial value of 1.
            catch (Exception)
            { }
            for (int i = 1; i <= page; i++)
                if (i > 1)
                        // NOTE(review): the empty-string prefix looks like a base URL that is absent from this excerpt — confirm.
                        htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode("" + i.ToString() + ".html"), Encoding.UTF8);
                    // NOTE(review): the exception is swallowed without a continue, so htl still holds the previous
                    // page's HTML when parsing proceeds below — confirm this is intended.
                    catch (Exception ex) {  }
                parser = new Parser(new Lexer(htl));
                NodeList tableNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "hyxdianbeijing")));
                if (tableNodeList.Count > 0)
                    for (int j = 0; j < tableNodeList.Count; j++)
                        // Picks the j-th ATag found across the whole node list; this lines up with tableNodeList[j]
                        // only if each entry contains exactly one anchor — TODO confirm.
                        ATag   aTag = tableNodeList.SearchFor(typeof(ATag), true)[j] as ATag;
                        string prjName = string.Empty,
                               buildUnit = string.Empty, bidUnit = string.Empty,
                               bidMoney = string.Empty, code = string.Empty,
                               bidDate = string.Empty,
                               beginDate = string.Empty,
                               endDate = string.Empty, bidType = string.Empty,
                               specType = string.Empty, InfoUrl = string.Empty,
                               msgType = string.Empty, bidCtx = string.Empty,
                               prjAddress = string.Empty, remark = string.Empty,
                               prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;
                        prjName = aTag.LinkText;
                        // beginDate is the first yyyy-M-d style date found in the entry's plain text.
                        Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}");
                        beginDate = regDate.Match(tableNodeList[j].ToPlainTextString()).Value.Trim();

                        // "amp;" is removed from the link text before it is used as the detail URL.
                        InfoUrl = "" + aTag.Link.Replace("amp;", "").Trim();
                        string htmldetail = string.Empty;
                            // Fetch the detail page and drop "&nbsp;" entities.
                            htmldetail = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(InfoUrl), Encoding.UTF8).Replace("&nbsp;", "");
                        // The body of this catch is not visible in this excerpt.
                        catch (Exception)
                        Parser   parserdetail = new Parser(new Lexer(htmldetail));
                        NodeList dtnode       = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "hyxzf2")));
                        if (dtnode.Count > 0)
                            HtmlTxt = dtnode.AsHtml();
                            Regex regeximg = new Regex(@"<img[^>]*>");// remove images
                            HtmlTxt = regeximg.Replace(HtmlTxt, "");
                            // Plain text of the detail block: LF becomes CRLF and spaces are removed, so each
                            // "label:value" line ends in \r\n for the regexes below.
                            bidCtx  = dtnode.AsString().Replace("\n", "\r\n").Replace(" ", "").Trim();
                            Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
                            bidCtx = regexHtml.Replace(bidCtx, "");
                            // Each field below is taken from the first line matching "<label>:...\r\n", with the label text removed.
                            Regex regBuidUnit = new Regex(@"(招标人|建设单位)(:|:)[^\r\n]+\r\n");
                            buildUnit = regBuidUnit.Match(bidCtx).Value.Replace("招标人:", "").Replace("建设单位:", "").Trim();
                            Regex regCode = new Regex(@"工程编号(:|:)[^\r\n]+\r\n");
                            code = regCode.Match(bidCtx).Value.Replace("工程编号:", "").Trim();
                            Regex regBidUnit = new Regex(@"中标人(:|:)[^\r\n]+\r\n");
                            bidUnit = regBidUnit.Match(bidCtx).Value.Replace("中标人:", "").Trim();
                            Regex regMoney = new Regex(@"(中标价|中标价格)(:|:)[^\r\n]+\r\n");
                            bidMoney = regMoney.Match(bidCtx).Value.Replace("中标价:", "").Replace("中标价格:", "").Replace(",", "").Trim();
                            Regex regBidMoney = new Regex(@"[0-9]+[.]{0,1}[0-9]+");
                            // When the amount text contains "万", everything from "万" onward is cut and the numeric part is kept.
                            if (bidMoney.Contains("万"))
                                bidMoney = bidMoney.Remove(bidMoney.IndexOf("万")).Trim();
                                bidMoney = regBidMoney.Match(bidMoney).Value;
                                    // These deeper-indented lines presumably belong to an else/try that is not visible
                                    // here: the numeric part is divided by 10000 and values below 0.1 become "0".
                                    bidMoney = (decimal.Parse(regBidMoney.Match(bidMoney).Value) / 10000).ToString();
                                    if (decimal.Parse(bidMoney) < decimal.Parse("0.1"))
                                        bidMoney = "0";
                                // Any exception while converting the amount results in "0".
                                catch (Exception)
                                    bidMoney = "0";
                            // NOTE(review): the next two ifs assign "" when the value is already "" — no effect as written.
                            if (buildUnit == "")
                                buildUnit = "";
                            if (bidUnit == "")
                                bidUnit = "";
                            Regex regprjMgr = new Regex(@"(总监|建造师|建造师(总监))(:|:)[^\r\n]+\r\n");
                            prjMgr   = regprjMgr.Match(bidCtx).Value.Replace("建造师:", "").Replace("总监:", "").Replace("建造师(总监):", "").Trim();
                            msgType  = "深圳市南山区政府采购及招标中心";
                            specType = "建设工程";
                            // bidType is derived from the raw link text before prjName is passed through GetPrjName.
                            bidType  = ToolHtml.GetInviteTypes(prjName);
                            prjName  = ToolDb.GetPrjName(prjName);
                            // beginDate is passed twice to GenBidInfo (8th and 10th arguments).
                            BidInfo info = ToolDb.GenBidInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, buildUnit, beginDate,
                                                             bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType,
                                                             bidMoney, InfoUrl, prjMgr, HtmlTxt);
                            // Item limit check for non-full crawls; the guarded statement is not visible in this excerpt.
                            if (!crawlAll && list.Count >= this.MaxCount)
Beispiel #28
        /// <summary>
        /// Crawls the paged <c>ul.page_list_ul</c> listing at <c>SiteUrl</c>, builds a <c>NotifyInfo</c> (infoType
        /// "通知公告") per entry and saves it through <c>ToolDb.SaveEntity</c>; attachments are saved either for the
        /// entry link itself or for attachment links found inside the detail content.
        /// </summary>
        /// <param name="crawlAll">When false, <c>sqlCount</c> is compared against <c>MaxCount</c> before saving an item.</param>
        /// <returns>
        /// <c>null</c> when the first page fetch throws; no other return statement is visible in this excerpt.
        /// </returns>
        /// <remarks>
        /// NOTE(review): this excerpt contains no braces and no <c>try</c>/<c>else</c> keywords; nesting is only
        /// suggested by indentation and several guarded statements are not visible.
        /// </remarks>
        protected override IList ExecuteCrawl(bool crawlAll)
            int    pageInt = 1, sqlCount = 0;
            string html            = string.Empty;
            string viewState       = string.Empty;
            string eventValidation = string.Empty;
            string cookiestr       = string.Empty;

                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8);
            // Abort with null if the first listing page cannot be fetched.
            catch { return(null); }
            Parser   parser   = new Parser(new Lexer(html));
            NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "pages")));

            if (pageNode != null && pageNode.Count > 0)
                    // Page count is the text between "共" and "页" in div#pages.
                    string temp = pageNode.AsString().GetRegexBegEnd("共", "页");
                    pageInt = int.Parse(temp);
                // A parse failure is ignored, leaving pageInt at its initial value of 1.
                catch { }
            for (int i = 1; i <= pageInt; i++)
                // Page 1 was already fetched; later pages are at SiteUrl + "index_" + (i - 1) + ".htm".
                if (i > 1)
                        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "index_" + (i - 1) + ".htm");
                    catch { continue; }
                parser = new Parser(new Lexer(html));
                NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "page_list_ul")), true), new TagNameFilter("li")));
                if (listNode != null && listNode.Count > 0)
                    // Starts at index 1, so listNode[0] is never processed (presumably a header item — confirm).
                    for (int j = 1; j < listNode.Count; j++)
                        INode  node = listNode[j];
                        string headName = string.Empty, releaseTime = string.Empty, infoScorce = string.Empty, msgType = string.Empty, infoUrl = string.Empty, ctxHtml = string.Empty, infoCtx = string.Empty, infoType = string.Empty;

                        // NOTE(review): aTag is dereferenced without a null check in the visible code.
                        ATag aTag = node.GetATag();
                        headName    = aTag.GetAttribute("title");
                        releaseTime = node.ToPlainTextString().GetDateRegex();
                        infoType    = "通知公告";
                        msgType     = MsgTypeCosnt.ShenZhenMsgType;
                        // NOTE(review): the empty-string prefix looks like a base URL that is absent from this excerpt — confirm.
                        infoUrl     = "" + aTag.Link.GetReplace("../,./");
                        string htmldtl = string.Empty;
                            htmldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl).GetJsString();
                        // Skip this entry if the detail fetch throws.
                        catch { continue; }
                        // When the entry link itself is an attachment, the NotifyInfo is built with ctxHtml and
                        // infoCtx still empty, saved, and the link is stored as its attachment.
                        if (aTag.IsAtagAttach())
                            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "深圳市工程", "", infoCtx, infoType);
                            if (ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate))
                                    BaseAttach obj = ToolHtml.GetBaseAttach(infoUrl, headName, info.Id);
                                    if (obj != null)
                                        ToolDb.SaveEntity(obj, string.Empty);
                                // Attachment failures are ignored.
                                catch { }
                        // Otherwise (the separating else/continue is not visible here) parse the detail page content.
                        parser = new Parser(new Lexer(htmldtl));
                        NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "detail_contect")));
                        if (dtlNode != null && dtlNode.Count > 0)
                            ctxHtml = dtlNode.AsHtml();
                            infoCtx = ctxHtml.ToCtxString();

                            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "深圳市工程", "", infoCtx, infoType);

                            // Item limit check; the guarded statement is not visible, and no increment of sqlCount
                            // appears anywhere in this excerpt — NOTE(review): confirm it is counted somewhere.
                            if (!crawlAll && sqlCount >= this.MaxCount)
                            if (ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate))
                                // After a successful save, scan the detail content for attachment links.
                                parser = new Parser(new Lexer(ctxHtml));
                                NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                                if (aNode != null && aNode.Count > 0)
                                    for (int k = 0; k < aNode.Count; k++)
                                        ATag tag = aNode[k].GetATag();
                                        if (tag.IsAtagAttach())
                                            // The directory part comes from the listing link (aTag, not tag): everything
                                            // before its last "/" is prepended to the attachment's own href.
                                            string temp  = aTag.Link.GetReplace("../,./");
                                            string alink = "" + temp.Substring(0, temp.LastIndexOf("/")) + tag.Link.GetReplace("./", "/");
                                                BaseAttach obj = ToolHtml.GetBaseAttach(alink, tag.LinkText, info.Id);
                                                if (obj != null)
                                                    ToolDb.SaveEntity(obj, string.Empty);
                                            // Attachment failures are ignored.
                                            catch { }
Beispiel #29
        /// <summary>
        /// For every entry returned by <c>GetCityList()</c>, crawls the paged <c>table#hytab</c> listing at
        /// <c>SiteUrl</c>, reads each row's detail page (<c>div.xmgknr</c>) and builds an <c>ItemPlan</c> from
        /// labelled values extracted out of the detail text.
        /// </summary>
        /// <param name="crawlAll">When false, <c>sqlCount</c> is compared against <c>MaxCount</c> after each item.</param>
        /// <remarks>
        /// NOTE(review): this excerpt contains no braces, no <c>try</c>/<c>else</c> keywords and no return
        /// statement; nesting is only suggested by indentation and several guarded statements are not visible.
        /// </remarks>
        protected override IList ExecuteCrawl(bool crawlAll)
            IList list = new List <ItemPlan>();
            Dictionary <string, string> dic = GetCityList();

            // Guard for a missing or empty city list; the guarded statement is not visible in this excerpt.
            if (dic == null || dic.Count < 1)

            foreach (string key in dic.Keys)
                string html = string.Empty;
                string cookiestr = string.Empty;
                string viewState = string.Empty;
                // pageInt and sqlCount are re-initialised for every city key.
                int    pageInt = 1, sqlCount = 0;
                string eventValidation = string.Empty;
                    // The city URL is requested first with cookiestr passed by reference, then SiteUrl is fetched
                    // with the same cookiestr (presumably the first call selects the city via cookie — confirm).
                    this.ToolWebSite.GetHtmlByUrl(dic[key], Encoding.UTF8, ref cookiestr);
                    html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8, ref cookiestr);
                // Fetch failures are ignored; html then stays empty.
                catch { }
                Parser   parser   = new Parser(new Lexer(html));
                NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "badoo")), true), new TagNameFilter("a")));
                if (pageNode != null && pageNode.Count > 0)
                        // The page count is parsed from the last pager anchor's link after removing "javascript",
                        // "jumpPage(" and ")". NOTE(review): a ':' after "javascript" is not removed here; if Link
                        // contains one, int.Parse throws and pageInt stays 1 — confirm.
                        string temp = pageNode[pageNode.Count - 1].GetATag().Link.Replace("javascript", "").Replace("jumpPage(", "").Replace(")", "");
                        pageInt = int.Parse(temp);
                    catch { }
                for (int i = 1; i <= pageInt; i++)
                    if (i > 1)
                        // Later pages are requested by posting form fields, with "page.pageNo" set to i.
                        // NOTE(review): the closing of this array initializer/call is not present in this excerpt.
                        NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] { "", "param.proofCode", "page.pageNo", "page.orderBy", "page.order" }, new string[] {
                            "", "", i.ToString(), "", ""
                            html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8, ref cookiestr);
                        // Skip this page if the request throws.
                        catch { continue; }
                    parser = new Parser(new Lexer(html));
                    NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "hytab")));
                    if (listNode != null && listNode.Count > 0)
                        TableTag table = listNode[0] as TableTag;
                        // Iteration starts at row 1; row 0 is presumably the header row — confirm.
                        for (int j = 1; j < table.RowCount; j++)
                            string   ItemCode = string.Empty, ItemName = string.Empty, ItemAddress = string.Empty, BuildUnit = string.Empty, BuildNature = string.Empty, TotalInvest = string.Empty, PlanInvest = string.Empty, IssuedPlan = string.Empty, InvestSource = string.Empty, ApprovalUnit = string.Empty, ApprovalDate = string.Empty, ApprovalCode = string.Empty, MsgUnit = string.Empty, PlanDate = string.Empty, PlanType = string.Empty, PlanBeginDate = string.Empty, PlanEndDate = string.Empty, CtxHtml = string.Empty, ItemCtx = string.Empty, ItemContent = string.Empty, InfoUrl = string.Empty, MsgType = string.Empty;
                            TableRow tr = table.Rows[j];
                            // Column 0 supplies the item name and the detail link; column 2 supplies the date.
                            ItemName = tr.Columns[0].ToNodePlainString();
                            PlanDate = tr.Columns[2].ToPlainTextString().GetDateRegex();
                            // NOTE(review): the empty-string prefix looks like a base URL that is absent from this excerpt — confirm.
                            InfoUrl  = "" + tr.Columns[0].GetATagHref();
                            string htmldtl = string.Empty;
                                htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
                            // Skip this row if the detail fetch throws.
                            catch { continue; }
                            parser = new Parser(new Lexer(htmldtl));
                            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "xmgknr")));
                            if (dtlNode != null && dtlNode.Count > 0)
                                CtxHtml = dtlNode.AsHtml();
                                parser  = new Parser(new Lexer(CtxHtml));
                                NodeList tableNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                                if (tableNode != null && tableNode.Count > 0)
                                    TableTag tab = tableNode[0] as TableTag;
                                    // Build ItemCtx as "<column 0>:<column 1>\r\n" lines from the first table, starting at row 1.
                                    for (int k = 1; k < tab.RowCount; k++)
                                        TableRow dr = tab.Rows[k];
                                        // Guard for rows with fewer than two columns; the guarded statement is not visible here.
                                        if (dr.ColumnCount < 2)
                                            ItemCtx += dr.Columns[0].ToNodePlainString() + ":";
                                            ItemCtx += dr.Columns[1].ToNodePlainString() + "\r\n";
                                        // On an exception the URL, item name, city key and page number are logged.
                                        catch (Exception ex) {
                                            Logger.Error(InfoUrl + ItemName + key + i);
                                    // Presumably the else branch for "no table found" (the else is not visible): use the whole content as text.
                                    ItemCtx = CtxHtml.ToCtxString();
                                // Labelled values are looked up in ItemCtx through the GetRegex helper.
                                ApprovalCode = ItemCtx.GetRegex("备案项目编号");
                                ItemAddress  = ItemCtx.GetRegex("项目所在地");
                                TotalInvest  = ItemCtx.GetRegex("项目总投资").Replace("万元", "").Replace("万", "");
                                ItemContent  = ItemCtx.GetRegex("项目规模及内容");
                                ApprovalUnit = ItemCtx.GetRegex("备案机关");
                                ApprovalDate = ItemCtx.GetRegex("复核通过日期");
                                // The start/end value is split on '-'; both dates are set only when exactly two parts result.
                                string   temp     = ItemCtx.GetRegex("项目起止年限");
                                string[] tempPlan = temp.Split('-');
                                if (tempPlan.Length == 2)
                                    PlanBeginDate = tempPlan[0];
                                    PlanEndDate   = tempPlan[1];
                                PlanType = "项目公开";
                                MsgType  = "广东省发展和改革委员会";
                                // The city defaults to the dictionary key; keys containing "顺德" are mapped to "佛山市区".
                                string city = key;
                                if (key.Contains("顺德"))
                                    city = "佛山市区";

                                ItemPlan info = ToolDb.GenItemPlan("广东省", city, "", ItemCode, ItemName, ItemAddress, BuildUnit, BuildNature, TotalInvest, PlanInvest, IssuedPlan, InvestSource, ApprovalUnit, ApprovalDate, ApprovalCode, MsgUnit, PlanDate, PlanType, PlanBeginDate, PlanEndDate, CtxHtml, ItemCtx, ItemContent, MsgType, InfoUrl);
                                // When the per-city limit is reached on a non-full crawl, jump to the "type" label below.
                                // No increment of sqlCount is visible in this excerpt — NOTE(review): confirm it is counted.
                                if (!crawlAll && sqlCount >= this.MaxCount)
                                    goto type;
                // Label targeted by the goto above; by its indentation it presumably sits at the end of the foreach
                // body, so the continue moves on to the next city key — confirm.
                type : continue;
Beispiel #30
        /// <summary>
        /// Crawls the paged <c>div.slidingList</c> listing at <c>SiteUrl</c>, builds an <c>InviteInfo</c> for each
        /// entry whose detail page contains <c>div#ContentPlaceHolder1_AnnoGoodsHtml</c>, and builds
        /// <c>BaseAttach</c> objects for an embedded iframe and for attachment links in the detail content.
        /// </summary>
        /// <param name="crawlAll">When false, <c>list.Count</c> is compared against <c>MaxCount</c> after each item.</param>
        /// <remarks>
        /// NOTE(review): this excerpt contains no braces, no <c>try</c>/<c>else</c> keywords and no return
        /// statement; nesting is only suggested by indentation and several guarded statements are not visible.
        /// </remarks>
        protected override IList ExecuteCrawl(bool crawlAll)
            IList  list            = new List <InviteInfo>();
            int    pageInt         = 1;
            string html            = string.Empty;
            // viewState, eventValidation and cookiestr are not referenced in the visible part of this method.
            string viewState       = string.Empty;
            string eventValidation = string.Empty;
            string cookiestr       = string.Empty;

                // Fetch the first listing page.
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8);
            Parser   parser   = new Parser(new Lexer(html));
            NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "navigation")));

            if (pageNode != null && pageNode.Count > 0)
                // Page count is the text between "总共" and "页" in the first div.navigation, passed through
                // GetReplace (presumably removing the listed bracket characters — confirm against the helper).
                string temp = pageNode[0].ToNodePlainString().GetRegexBegEnd("总共", "页").GetReplace("【,】,[,]");
                    pageInt = int.Parse(temp);
                // A parse failure is ignored, leaving pageInt at its initial value of 1.
                catch { }
            for (int i = 1; i <= pageInt; i++)
                // Page 1 was already fetched; later pages are requested with "&page=" + i.
                if (i > 1)
                        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&page=" + i, Encoding.UTF8);
                    catch { continue; }
                parser = new Parser(new Lexer(html));
                NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "slidingList")), true), new TagNameFilter("li")));
                if (listNode != null && listNode.Count > 0)
                    for (int j = 0; j < listNode.Count; j++)
                        string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty,
                               prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty,
                               specType = string.Empty, beginDate = string.Empty, endDate = string.Empty,
                               remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty,
                               CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

                        INode node = listNode[j];

                        // NOTE(review): aTag is dereferenced without a null check in the visible code.
                        ATag aTag = node.GetATag();
                        prjName   = aTag.GetAttribute("title");
                        beginDate = node.GetSpan().StringText;
                        // Re-formats the span text as "<first 4 chars>-<next 2>-<next 2>".
                        // NOTE(review): Substring throws if the text has fewer than 8 characters — presumably it is always yyyyMMdd; confirm.
                        if (!string.IsNullOrEmpty(beginDate))
                            beginDate = beginDate.Substring(0, 4) + "-" + beginDate.Substring(4, 2) + "-" + beginDate.Substring(6, 2);
                        // NOTE(review): the empty-string prefix looks like a base URL that is absent from this excerpt — confirm.
                        InfoUrl = "" + aTag.Link;
                        string htmldtl = string.Empty;
                            htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString();
                        // Skip this entry if the detail fetch throws.
                        catch { continue; }
                        parser = new Parser(new Lexer(htmldtl));
                        NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "ContentPlaceHolder1_AnnoGoodsHtml")));
                        if (dtlNode != null && dtlNode.Count > 0)
                            HtmlTxt    = dtlNode.AsHtml();
                            inviteCtx  = HtmlTxt.ToCtxString();
                            msgType    = "甘肃省公共资源交易中心";
                            specType   = "政府采购";
                            inviteType = "水利及其他工程";
                            // code, prjAddress, buildUnit, endDate, remark and otherType are still empty strings here.
                            InviteInfo info = ToolDb.GenInviteInfo("甘肃省", "甘肃省及地市", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                            // First attachment source: iframes with id "Iframe" in the full detail page; a non-empty
                            // src becomes an attachment named prjName + ".pdf".
                            parser = new Parser(new Lexer(htmldtl));
                            NodeList aNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("Iframe"), new HasAttributeFilter("id", "Iframe")));
                            if (aNode != null && aNode.Count > 0)
                                for (int k = 0; k < aNode.Count; k++)
                                    IFrameTag itag = aNode[k] as IFrameTag;
                                    string    link = itag.GetAttribute("src");
                                    if (!string.IsNullOrEmpty(link))
                                        BaseAttach attach = ToolDb.GenBaseAttach(prjName + ".pdf", info.Id, link);
                            // Second attachment source: anchors inside the detail content that IsAtagAttach() accepts.
                            parser = new Parser(new Lexer(HtmlTxt));
                            NodeList atagNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                            if (atagNode != null && atagNode.Count > 0)
                                for (int a = 0; a < atagNode.Count; a++)
                                    ATag fileTag = atagNode[a] as ATag;
                                    if (fileTag.IsAtagAttach())
                                        string link = string.Empty;
                                        // Links containing "http" are used as-is; the second assignment presumably sits
                                        // in an else that is not visible here and prefixes a (missing) base URL — confirm.
                                        if (fileTag.Link.Contains("http"))
                                            link = fileTag.Link;
                                            link = "" + fileTag.Link;
                                        BaseAttach attach = ToolDb.GenBaseAttach(fileTag.LinkText, info.Id, link);
                                        // Check that no attachment with the same AttachServerPath is already in
                                        // base.AttachList; the guarded statement is not visible in this excerpt.
                                        if (!base.AttachList.Exists(x => x.AttachServerPath == link))
                            // Item limit check for non-full crawls; the guarded statement is not visible in this excerpt.
                            if (!crawlAll && list.Count >= this.MaxCount)