Exemple #1
0
        /// <summary>
        /// Crawls the tender-announcement listing pages of www.nmgztb.com, downloads the detail
        /// page behind every listed link and turns each one into a record via
        /// <c>ToolDb.GenInviteInfo</c>.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> records have been collected;
        /// when <c>true</c>, every listing page is processed.
        /// </param>
        /// <returns>
        /// The collected records. The list is empty when the first listing page cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList list = new ArrayList();

            // Download the first listing page; it also carries the total page count.
            string html = string.Empty;
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
            }
            catch (Exception)
            {
                // Best effort: without the first page there is nothing to crawl.
                return list;
            }

            // Get the page count from <span id="totalpage">; fall back to a single page.
            int      pageInt  = 1;
            Parser   parser   = new Parser(new Lexer(html));
            NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "totalpage")));
            if (pageNode != null && pageNode.Count > 0)
            {
                int parsedPageCount;
                if (int.TryParse(pageNode[0].ToNodePlainString(), out parsedPageCount))
                {
                    pageInt = parsedPageCount;
                }
            }

            // The patterns are loop-invariant, so build them once instead of once per row.
            Regex scriptAndXmlRegex = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
            Regex updateTimeRegex   = new Regex(@"更新时间:\d{4}年\d{1,2}月\d{1,2}日");
            // NOTE(review): "[^更新时间]+" is a character class (any run of characters other than
            // 更/新/时/间), not "anything up to 更新时间" - kept as-is, confirm the intent.
            Regex authorHeaderRegex = new Regex(@"作者:[^更新时间]+更新时间:\d{4}年\d{1,2}月\d{1,2}日");
            Regex buildUnitRegex    = new Regex("(招标人|招标单位|招标采购单位):[^\r\n]+[\r\n]{1}");
            Regex codeRegex         = new Regex("(招标编号):[^\r\n]+[\r\n]{1}");

            for (int i = pageInt; i >= 1; i--)
            {
                // The first iteration reuses the page downloaded above; later iterations fetch
                // index_{i-1}.htm.
                // NOTE(review): with this arithmetic index_{pageInt-1}.htm is never requested -
                // confirm against the site's paging scheme.
                if (i < pageInt)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl("http://www.nmgztb.com/Html/gongchengxinxi/zhaobiaogonggao/index_" + (i - 1) + ".htm", Encoding.Default);
                    }
                    catch (Exception)
                    {
                        // Best effort: skip a listing page that cannot be downloaded.
                        continue;
                    }
                }

                parser = new Parser(new Lexer(html));
                NodeList sNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "100%")));
                if (sNodes == null || sNodes.Count == 0)
                {
                    continue;
                }

                TableTag table = sNodes[0] as TableTag;
                if (table == null)
                {
                    // The first match is not a table node; nothing to read on this page.
                    continue;
                }

                for (int t = 0; t < table.RowCount; t++)
                {
                    TableRow tr = table.Rows[t] as TableRow;
                    if (tr == null || tr.ColumnCount < 2)
                    {
                        continue;
                    }

                    NodeList nodeList = tr.SearchFor(typeof(ATag), true);
                    if (nodeList.Count == 0)
                    {
                        continue;
                    }

                    ATag aTag = nodeList[0] as ATag;
                    if (aTag == null)
                    {
                        continue;
                    }

                    string infoUrl = "http://www.nmgztb.com" + aTag.Link;
                    string prjName = aTag.GetAttribute("title");

                    string htmldtl;
                    try
                    {
                        htmldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).ToLower();
                    }
                    catch (Exception)
                    {
                        // Best effort: skip an announcement whose detail page cannot be downloaded.
                        continue;
                    }

                    // Strip inline scripts and XML processing tags before parsing the detail page.
                    htmldtl = scriptAndXmlRegex.Replace(htmldtl, "");
                    Parser parserdtl = new Parser(new Lexer(htmldtl));

                    // The publication date comes from the "更新时间:yyyy年M月d日" stamp in div.link_con_con.
                    string   beginDate = string.Empty;
                    NodeList nodesDtl  = parserdtl.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "link_con_con")));
                    if (nodesDtl != null && nodesDtl.Count > 0)
                    {
                        // Regex.Match never returns null; Success is the correct test.
                        Match dateMatch = updateTimeRegex.Match(nodesDtl.AsString());
                        if (dateMatch.Success)
                        {
                            beginDate = dateMatch.Value.Replace("更新时间:", "").Replace("年", "-").Replace("月", "-").Replace("日", "").Trim();
                        }
                    }

                    // The announcement body lives in div.oo.
                    parserdtl.Reset();
                    nodesDtl = parserdtl.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "oo")));
                    string htmlTxt = nodesDtl.AsHtml();
                    string str     = nodesDtl.AsString().Replace("&nbsp;", "").Replace(" ", "");

                    // Remove the "作者:... 更新时间:..." header. String.Replace throws ArgumentException
                    // for an empty search string, so only replace when the header was actually found.
                    Match headerMatch = authorHeaderRegex.Match(str);
                    if (headerMatch.Success)
                    {
                        str = str.Replace(headerMatch.Value, "");
                    }

                    // Cut everything from the "previous article" link onwards.
                    int    previousLinkIndex = str.IndexOf("上一篇:", StringComparison.Ordinal);
                    string ctxText           = previousLinkIndex > -1 ? str.Substring(0, previousLinkIndex) : str;

                    // Tendering party: the text after the label up to the end of that line.
                    string buildUnit      = string.Empty;
                    Match  buildUnitMatch = buildUnitRegex.Match(ctxText);
                    if (buildUnitMatch.Success)
                    {
                        buildUnit = buildUnitMatch.Value.Replace("招标人:", "").Replace("招标单位:", "").Replace("招标采购单位:", "").Trim();
                    }

                    // Tender number: same line-based extraction; values of 50+ characters are discarded.
                    string code      = string.Empty;
                    Match  codeMatch = codeRegex.Match(ctxText);
                    if (codeMatch.Success)
                    {
                        code = codeMatch.Value.Replace("招标编号:", "").ToUpper().Trim();
                        if (code.Length >= 50)
                        {
                            code = "";
                        }
                    }

                    string endDate    = string.Empty;
                    string remark     = string.Empty;
                    string inviteType = ToolHtml.GetInviteTypes(prjName);
                    buildUnit = ToolHtml.GetSubString(buildUnit, 150);
                    // NOTE(review): the address is always empty here and the result is not passed to
                    // GenInviteInfo; the call is kept only to leave the original call sequence unchanged.
                    string prjAddress = ToolHtml.GetAddress(string.Empty);
                    code = ToolHtml.GetSubString(code, 50);

                    InviteInfo info = ToolDb.GenInviteInfo("内蒙古自治区", "内蒙古自治区及盟市", "", string.Empty, code, prjName, "", buildUnit, beginDate, endDate, ctxText, remark, "内蒙古自治区建设工程招标投标服务中心", inviteType, "建设工程", string.Empty, infoUrl, htmlTxt);
                    list.Add(info);

                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return list;
                    }
                }
            }

            return list;
        }