Example #1
0
        /// <summary>
        /// Crawls the award listings linked from the landing page at <c>SiteUrl</c> and turns every
        /// listing row into a <c>CorpMerit</c> record.
        /// </summary>
        /// <remarks>
        /// Three groups of category links on the landing page are followed in turn; each group has its
        /// own table layout, so each is handled by its own helper. A category page or a follow-up page
        /// that cannot be downloaded is skipped, and a row with fewer cells than its layout requires is
        /// skipped, so that one bad page or row does not discard the records already collected.
        /// </remarks>
        /// <param name="crawlAll">Not consulted by this implementation.</param>
        /// <returns>
        /// The records built from all three groups; an empty list when the landing page itself cannot
        /// be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList list = new List<CorpMerit>();

            string html;
            try
            {
                html = ToolWeb.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
            }
            catch
            {
                // Without the landing page there are no category links to follow.
                return list;
            }

            CrawlSpecialtyAwards(html, list);
            CrawlOtherAwards(html, list);
            CrawlRegionalAwards(html, list);
            return list;
        }

        /// <summary>
        /// Downloads <paramref name="url"/> with the default encoding; returns false instead of throwing
        /// when the download fails.
        /// </summary>
        private bool TryGetHtml(string url, out string html)
        {
            try
            {
                html = ToolWeb.GetHtmlByUrl(url, Encoding.Default);
                return true;
            }
            catch
            {
                html = string.Empty;
                return false;
            }
        }

        /// <summary>
        /// Reads the page count from the text between "/" and "页" in the pager nodes; returns 1 when
        /// there is no pager or the text is not a number.
        /// </summary>
        private int ReadPageCount(NodeList pagerNodes)
        {
            if (pagerNodes == null || pagerNodes.Count == 0)
            {
                return 1;
            }

            try
            {
                int pages;
                string text = pagerNodes.AsString().GetRegexBegEnd("/", "页");
                if (int.TryParse(text, out pages))
                {
                    return pages;
                }
            }
            catch
            {
                // Best effort: an unreadable pager is treated as a single page.
            }

            return 1;
        }

        /// <summary>
        /// Returns every anchor node inside <paramref name="table"/>, or null when the table is null.
        /// </summary>
        private NodeList ExtractLinks(TableTag table)
        {
            if (table == null)
            {
                return null;
            }

            Parser parser = new Parser(new Lexer(table.ToHtml()));
            return parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
        }

        /// <summary>
        /// Follows the links of the first category table (records are named "优质专业工程") and appends
        /// one record per listing row to <paramref name="list"/>.
        /// </summary>
        private void CrawlSpecialtyAwards(string html, IList list)
        {
            const string name = "优质专业工程";

            Parser parser = new Parser(new Lexer(html));
            NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("height", "29")), true), new TagNameFilter("table")));
            if (nodeList == null || nodeList.Count == 0)
            {
                return;
            }

            NodeList links = ExtractLinks(nodeList[0] as TableTag);
            if (links == null)
            {
                return;
            }

            for (int j = 0; j < links.Count; j++)
            {
                ATag aTag = links[j].GetATag();
                if (aTag == null)
                {
                    continue;
                }

                string typename = aTag.LinkText.Replace("·", "");
                string url = "http://www.jianzhuxh.com/excellence/" + aTag.Link;

                string pageHtml;
                if (!TryGetHtml(url, out pageHtml))
                {
                    continue;
                }

                parser = new Parser(new Lexer(pageHtml));
                int page = ReadPageCount(parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("align", "center"))));

                for (int d = 1; d <= page; d++)
                {
                    // Page 1 is the HTML already downloaded above; later pages are fetched on demand.
                    if (d > 1 && !TryGetHtml(url + "&page=" + d, out pageHtml))
                    {
                        continue;
                    }

                    parser = new Parser(new Lexer(pageHtml));
                    NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "text")));
                    if (dtlNode == null || dtlNode.Count == 0)
                    {
                        continue;
                    }

                    parser = new Parser(new Lexer(dtlNode.ToHtml()));
                    NodeList dtlNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "844")));
                    if (dtlNodeList == null || dtlNodeList.Count == 0)
                    {
                        continue;
                    }

                    TableTag tableTag = dtlNodeList[0] as TableTag;
                    if (tableTag == null)
                    {
                        continue;
                    }

                    for (int k = 0; k < tableTag.RowCount; k++)
                    {
                        TableRow tr = tableTag.Rows[k];
                        if (tr.ColumnCount < 9)
                        {
                            // Cells 1..8 are read below; a shorter row cannot be a data row.
                            continue;
                        }

                        // Locals mirror the positional arguments of GenCorpMerit; unused ones stay empty.
                        string CorpCode = string.Empty, CorpName = string.Empty, MeritYear = string.Empty,
                               MeritName = string.Empty, MeritDate = string.Empty, MeritLevel = string.Empty,
                               MeritRegion = string.Empty, MeritSector = string.Empty, MeritPrjName = string.Empty,
                               PrjSupporter = string.Empty, Source = string.Empty, Remark = string.Empty,
                               Details = string.Empty, MeritType = string.Empty, PrjMgr = string.Empty,
                               SupMgr = string.Empty, ManCost = string.Empty, ProArea = string.Empty,
                               SupUnit = string.Empty, PileConsUnit = string.Empty, BuildingType = string.Empty;

                        MeritName    = name;
                        MeritType    = typename;
                        MeritPrjName = tr.Columns[1].ToNodePlainString();
                        CorpName     = tr.Columns[2].ToNodePlainString();
                        PrjMgr       = tr.Columns[3].ToNodePlainString();
                        SupUnit      = tr.Columns[4].ToNodePlainString();
                        SupMgr       = tr.Columns[5].ToNodePlainString();
                        ManCost      = tr.Columns[6].ToNodePlainString();
                        if (ManCost.Contains("吨"))
                        {
                            // A value containing "吨" is discarded rather than stored as a cost.
                            ManCost = string.Empty;
                        }
                        ProArea   = tr.Columns[7].ToNodePlainString();
                        MeritYear = tr.Columns[8].ToNodePlainString();

                        CorpMerit info = ToolDb.GenCorpMerit("广东省", "深圳市", "", CorpCode, CorpName, MeritYear, MeritName, MeritDate, MeritLevel, MeritRegion, MeritSector, MeritPrjName, PrjSupporter, Source, url, Remark, Details, MeritType, PrjMgr, SupMgr, ManCost, ProArea, SupUnit, PileConsUnit, BuildingType);
                        list.Add(info);
                    }
                }
            }
        }

        /// <summary>
        /// Follows the links of the third centred table under a height-32 cell (the "其它工程" group) and
        /// appends one record per listing row to <paramref name="list"/>. The column layout depends on
        /// the category name carried by the link text.
        /// </summary>
        private void CrawlOtherAwards(string html, IList list)
        {
            Parser parser = new Parser(new Lexer(html));
            NodeList theNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("height", "32")), true), new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("align", "center"))));
            if (theNode == null || theNode.Count <= 2)
            {
                return;
            }

            NodeList links = ExtractLinks(theNode[2] as TableTag);
            if (links == null)
            {
                return;
            }

            for (int j = 0; j < links.Count; j++)
            {
                ATag aTag = links[j].GetATag();
                if (aTag == null)
                {
                    continue;
                }

                string typename = aTag.LinkText;
                string url = "http://www.jianzhuxh.com/excellence/" + aTag.Link;

                string pageHtml;
                if (!TryGetHtml(url, out pageHtml))
                {
                    continue;
                }

                parser = new Parser(new Lexer(pageHtml));
                int page = ReadPageCount(parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("align", "center"))));

                for (int k = 1; k <= page; k++)
                {
                    // Skip a page that fails to download; parsing the previous page's HTML again
                    // would add its rows a second time.
                    if (k > 1 && !TryGetHtml(url + "&page=" + k.ToString(), out pageHtml))
                    {
                        continue;
                    }

                    parser = new Parser(new Lexer(pageHtml));
                    NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "text16")));
                    if (dtlNode == null || dtlNode.Count == 0)
                    {
                        continue;
                    }

                    TableTag tableTag = dtlNode[0] as TableTag;
                    if (tableTag == null)
                    {
                        continue;
                    }

                    for (int t = 0; t < tableTag.RowCount; t++)
                    {
                        TableRow tr = tableTag.Rows[t];
                        int cols = tr.ColumnCount;

                        // Locals mirror the positional arguments of GenCorpMerit; unused ones stay empty.
                        string CorpCode = string.Empty, CorpName = string.Empty, MeritYear = string.Empty,
                               MeritName = string.Empty, MeritDate = string.Empty, MeritLevel = string.Empty,
                               MeritRegion = string.Empty, MeritSector = string.Empty, MeritPrjName = string.Empty,
                               PrjSupporter = string.Empty, Source = string.Empty, Remark = string.Empty,
                               Details = string.Empty, MeritType = string.Empty, PrjMgr = string.Empty,
                               SupMgr = string.Empty, ManCost = string.Empty, ProArea = string.Empty,
                               SupUnit = string.Empty, PileConsUnit = string.Empty, BuildingType = string.Empty;
                        string costOrArea;

                        MeritName = MeritType = typename;
                        if (typename.Contains("优质工程"))
                        {
                            if (cols < 10)
                            {
                                continue;
                            }
                            MeritName    = MeritType = "深圳市" + typename;
                            MeritPrjName = tr.Columns[2].ToNodePlainString();
                            CorpName     = tr.Columns[3].ToNodePlainString();
                            PrjMgr       = tr.Columns[4].ToNodePlainString();
                            SupUnit      = tr.Columns[5].ToNodePlainString();
                            SupMgr       = tr.Columns[6].ToNodePlainString();
                            PrjSupporter = tr.Columns[7].ToNodePlainString();
                            // One cell holds either a cost (contains "元") or an area.
                            costOrArea = tr.Columns[8].ToNodePlainString();
                            if (costOrArea.Contains("元"))
                            {
                                ManCost = costOrArea;
                            }
                            else
                            {
                                ProArea = costOrArea;
                            }
                            MeritYear = tr.Columns[9].ToNodePlainString();
                        }
                        else if (typename.Contains("优质结构工程"))
                        {
                            if (cols < 11)
                            {
                                continue;
                            }
                            MeritName    = MeritType = "深圳市" + typename;
                            MeritPrjName = tr.Columns[1].ToNodePlainString();
                            CorpName     = tr.Columns[2].ToNodePlainString();
                            PrjMgr       = tr.Columns[3].ToNodePlainString();
                            PileConsUnit = tr.Columns[4].ToNodePlainString();
                            SupUnit      = tr.Columns[5].ToNodePlainString();
                            SupMgr       = tr.Columns[6].ToNodePlainString();
                            costOrArea = tr.Columns[8].ToNodePlainString();
                            if (costOrArea.Contains("元"))
                            {
                                ManCost = costOrArea;
                            }
                            else
                            {
                                ProArea = costOrArea;
                            }
                            MeritYear = tr.Columns[10].ToNodePlainString();
                        }
                        else if (typename.Contains("用户满意工程"))
                        {
                            if (cols < 7)
                            {
                                continue;
                            }
                            MeritName    = MeritType = "深圳市" + typename;
                            MeritPrjName = tr.Columns[1].ToNodePlainString();
                            CorpName     = tr.Columns[2].ToNodePlainString();
                            SupUnit      = tr.Columns[3].ToNodePlainString();
                            BuildingType = tr.Columns[4].ToNodePlainString();
                            ProArea      = tr.Columns[5].ToNodePlainString();
                            MeritYear    = tr.Columns[6].ToNodePlainString();
                        }
                        else if (typename.Contains("绿色施工示范工程"))
                        {
                            if (cols < 11)
                            {
                                continue;
                            }
                            MeritName    = MeritType = "深圳市" + typename;
                            MeritPrjName = tr.Columns[2].ToNodePlainString();
                            CorpName     = tr.Columns[3].ToNodePlainString();
                            PrjMgr       = tr.Columns[4].ToNodePlainString();
                            SupUnit      = tr.Columns[5].ToNodePlainString();
                            SupMgr       = tr.Columns[6].ToNodePlainString();
                            PrjSupporter = tr.Columns[8].ToNodePlainString();
                            MeritYear    = tr.Columns[10].ToNodePlainString();
                        }
                        else if (typename.Contains("文明工地") || typename.Contains("双优工地") || typename.Contains("双优样板工地"))
                        {
                            if (cols < 8)
                            {
                                continue;
                            }
                            MeritPrjName = tr.Columns[1].ToNodePlainString();
                            CorpName     = tr.Columns[2].ToNodePlainString();
                            PrjMgr       = tr.Columns[3].ToNodePlainString();
                            SupUnit      = tr.Columns[4].ToNodePlainString();
                            SupMgr       = tr.Columns[5].ToNodePlainString();
                            costOrArea = tr.Columns[6].ToNodePlainString();
                            if (costOrArea.Contains("元"))
                            {
                                ManCost = costOrArea;
                            }
                            else
                            {
                                ProArea = costOrArea;
                            }
                            MeritYear = tr.Columns[7].ToNodePlainString();
                        }

                        // A category matching none of the layouts still yields a record that carries
                        // only the category name.
                        CorpMerit info = ToolDb.GenCorpMerit("广东省", "深圳市", "", CorpCode, CorpName, MeritYear, MeritName, MeritDate, MeritLevel, MeritRegion, MeritSector, MeritPrjName, PrjSupporter, Source, url, Remark, Details, MeritType, PrjMgr, SupMgr, ManCost, ProArea, SupUnit, PileConsUnit, BuildingType);
                        list.Add(info);
                    }
                }
            }
        }

        /// <summary>
        /// Follows the links of the second category table under a height-29 cell (the "深圳地区" group)
        /// and appends one record per listing row, skipping the first row of each table, to
        /// <paramref name="list"/>.
        /// </summary>
        private void CrawlRegionalAwards(string html, IList list)
        {
            Parser parser = new Parser(new Lexer(html));
            NodeList areaNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("height", "29")), true), new TagNameFilter("table")));
            // The second match is the one used, so at least two tables must be present.
            if (areaNode == null || areaNode.Count < 2)
            {
                return;
            }

            NodeList links = ExtractLinks(areaNode[1] as TableTag);
            if (links == null)
            {
                return;
            }

            for (int j = 0; j < links.Count; j++)
            {
                ATag aTag = links[j].GetATag();
                if (aTag == null)
                {
                    continue;
                }

                string typename = aTag.LinkText.Replace("·", "");
                string url = "http://www.jianzhuxh.com/excellence/" + aTag.Link;

                string pageHtml;
                if (!TryGetHtml(url, out pageHtml))
                {
                    continue;
                }

                parser = new Parser(new Lexer(pageHtml));
                int page = ReadPageCount(parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("form"), new HasAttributeFilter("name", "gopage"))));

                for (int k = 1; k <= page; k++)
                {
                    if (k > 1 && !TryGetHtml(url + "?page=" + k.ToString(), out pageHtml))
                    {
                        continue;
                    }

                    parser = new Parser(new Lexer(pageHtml));
                    // The "鲁班奖" listing uses a different table class from the other categories.
                    string tableClass = typename.Contains("鲁班奖") ? "py_tbl" : "text18";
                    NodeList tableNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", tableClass)));
                    if (tableNode == null || tableNode.Count == 0)
                    {
                        continue;
                    }

                    TableTag tableTag = tableNode[0] as TableTag;
                    if (tableTag == null)
                    {
                        continue;
                    }

                    for (int t = 1; t < tableTag.RowCount; t++)
                    {
                        TableRow tr = tableTag.Rows[t];
                        if (tr.ColumnCount < 7)
                        {
                            // Cells 1..6 are read below; a shorter row cannot be a data row.
                            continue;
                        }

                        // Locals mirror the positional arguments of GenCorpMerit; unused ones stay empty.
                        string CorpCode = string.Empty, CorpName = string.Empty, MeritYear = string.Empty,
                               MeritName = string.Empty, MeritDate = string.Empty, MeritLevel = string.Empty,
                               MeritRegion = string.Empty, MeritSector = string.Empty, MeritPrjName = string.Empty,
                               PrjSupporter = string.Empty, Source = string.Empty, Remark = string.Empty,
                               Details = string.Empty, MeritType = string.Empty, PrjMgr = string.Empty,
                               SupMgr = string.Empty, ManCost = string.Empty, ProArea = string.Empty,
                               SupUnit = string.Empty, PileConsUnit = string.Empty, BuildingType = string.Empty;

                        MeritName    = MeritType = typename;
                        MeritPrjName = tr.Columns[1].ToNodePlainString();
                        CorpName     = tr.Columns[2].ToNodePlainString();
                        // NOTE(review): both colon Replace calls strip the same character as written;
                        // one was possibly meant for the full-width colon - confirm.
                        PrjSupporter = tr.Columns[3].ToNodePlainString().Replace("参建单位", "").Replace(":", "").Replace(":", "");
                        SupUnit      = tr.Columns[4].ToNodePlainString();
                        PrjMgr       = tr.Columns[5].ToNodePlainString();
                        MeritYear    = tr.Columns[6].ToNodePlainString();

                        CorpMerit info = ToolDb.GenCorpMerit("广东省", "深圳市", "", CorpCode, CorpName, MeritYear, MeritName, MeritDate, MeritLevel, MeritRegion, MeritSector, MeritPrjName, PrjSupporter, Source, url, Remark, Details, MeritType, PrjMgr, SupMgr, ManCost, ProArea, SupUnit, PileConsUnit, BuildingType);
                        list.Add(info);
                    }
                }
            }
        }
Example #2
0
        /// <summary>
        /// Crawls award records from the search form at <c>SiteUrl</c>, querying one 30-day date window
        /// at a time from 1980-01-01 up to the current date, and saves every record found through
        /// <c>ToolDb.SaveEntity</c>.
        /// </summary>
        /// <remarks>
        /// For each window the form is posted once to obtain the first result page and the total record
        /// count; further pages are requested by posting the pager event again. Each result row links to
        /// a detail page whose first table is flattened into "label:value" lines from which level,
        /// remark and description are extracted. A result page or detail page that fails to download is
        /// skipped; a failure of the initial query for a window ends the crawl.
        /// </remarks>
        /// <param name="crawlAll">Not consulted by this implementation.</param>
        /// <returns>Always <c>null</c>; records are saved as they are found rather than returned.</returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            const int RowsPerPage = 15;

            string   cookiestr = string.Empty;
            DateTime dateTime  = DateTime.Now;
            // Built from components so the start date does not depend on the current culture.
            DateTime begin     = new DateTime(1980, 1, 1);

            for (DateTime t = begin; t <= dateTime; t = t.AddDays(30))
            {
                string startDate = t.ToString("yyyy-MM-dd");
                string endDate   = t.AddDays(30).ToString("yyyy-MM-dd");
                string html;
                try
                {
                    html = ToolWeb.GetHtmlByUrl(SiteUrl, Encoding.UTF8, ref cookiestr);
                    NameValueCollection firstQuery = BuildPagerPostback(html, "1", startDate, endDate);
                    html = ToolWeb.GetHtmlByUrl(SiteUrl, firstQuery, Encoding.UTF8, ref cookiestr);
                }
                catch
                {
                    return null;
                }

                // Reset for every window: a result without a pager must not inherit the page count
                // of the previous window.
                int pageInt = 1;
                Parser parser = new Parser(new Lexer(html));
                NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "ctl00_ContentPlaceHolder1_AspNetPager2")));
                if (pageList != null && pageList.Count > 0)
                {
                    try
                    {
                        // The pager text carries the total record count between "共" and "条".
                        string temp  = pageList[0].ToPlainTextString().GetRegexBegEnd("共", "条");
                        int    total = int.Parse(temp);
                        pageInt = total / RowsPerPage;
                        if (total % RowsPerPage != 0)
                        {
                            pageInt++;
                        }
                    }
                    catch
                    {
                        pageInt = 1;
                    }
                }

                for (int i = 1; i <= pageInt; i++)
                {
                    if (i > 1)
                    {
                        // The hidden fields are taken from the most recently downloaded page.
                        NameValueCollection nextQuery = BuildPagerPostback(html, i.ToString(), startDate, endDate);
                        try
                        {
                            html = ToolWeb.GetHtmlByUrl(SiteUrl, nextQuery, Encoding.UTF8, ref cookiestr);
                        }
                        catch
                        {
                            continue;
                        }
                    }

                    parser = new Parser(new Lexer(html));
                    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "tab_ent")));
                    if (nodeList == null || nodeList.Count == 0)
                    {
                        continue;
                    }

                    TableTag table = nodeList[0] as TableTag;
                    if (table == null)
                    {
                        continue;
                    }

                    // Row 0 is skipped; data rows start at index 1.
                    for (int j = 1; j < table.RowCount; j++)
                    {
                        TableRow tr = table.Rows[j];
                        if (tr.ColumnCount < 4)
                        {
                            // Cells 1..3 are read below; a shorter row cannot be a data row.
                            continue;
                        }

                        // Locals mirror the positional arguments of GenCorpMerit; unused ones stay empty.
                        string CorpCode = string.Empty, CorpName = string.Empty, MeritYear = string.Empty,
                               MeritName = string.Empty, MeritDate = string.Empty, MeritLevel = string.Empty,
                               MeritRegion = string.Empty, MeritSector = string.Empty, MeritPrjName = string.Empty,
                               PrjSupporter = string.Empty, Source = string.Empty, Url = string.Empty,
                               Remark = string.Empty, Details = string.Empty;

                        CorpName  = tr.Columns[2].ToNodePlainString();
                        MeritName = tr.Columns[1].ToNodePlainString();
                        MeritDate = tr.Columns[3].ToPlainTextString().GetDateRegex();
                        Url       = "http://113.108.219.40/PlatForm/SearchCenter/" + tr.Columns[1].GetATagHref();

                        string htlDtl;
                        try
                        {
                            htlDtl = ToolWeb.GetHtmlByUrl(Url, Encoding.UTF8);
                        }
                        catch
                        {
                            continue;
                        }

                        parser = new Parser(new Lexer(htlDtl));
                        NodeList dtlList = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                        if (dtlList == null || dtlList.Count == 0)
                        {
                            continue;
                        }

                        TableTag tab = dtlList[0] as TableTag;
                        if (tab == null)
                        {
                            continue;
                        }

                        // Flatten the detail table: cells at even indexes are written as "label:" and
                        // cells at odd indexes as the value followed by a line break.
                        StringBuilder ctxBuilder = new StringBuilder();
                        for (int k = 0; k < tab.RowCount; k++)
                        {
                            TableRow detailRow = tab.Rows[k];
                            for (int d = 0; d < detailRow.ColumnCount; d++)
                            {
                                if ((d + 1) % 2 == 0)
                                {
                                    ctxBuilder.Append(detailRow.Columns[d].ToNodePlainString()).Append("\r\n");
                                }
                                else
                                {
                                    // NOTE(review): both Replace calls strip the same character as
                                    // written; one was possibly meant for the full-width colon - confirm.
                                    ctxBuilder.Append(detailRow.Columns[d].ToNodePlainString().Replace(":", "").Replace(":", "")).Append(":");
                                }
                            }
                        }
                        string ctx = ctxBuilder.ToString();

                        MeritLevel = ctx.GetRegex("获奖等级");
                        Remark     = ctx.GetRegex("备注");
                        Details    = ctx.GetRegex("表彰内容描述");
                        Source     = "广东省住房和城乡建设厅";
                        // Placeholder remarks ("无备注" / "无") are stored as null.
                        if (Remark != null && (Remark.Contains("无备注") || Remark == "无"))
                        {
                            Remark = null;
                        }

                        CorpMerit info = ToolDb.GenCorpMerit("广东省", "广东地区", "", CorpCode, CorpName, MeritYear, MeritName, MeritDate, MeritLevel, MeritRegion, MeritSector, MeritPrjName, PrjSupporter, Source, Url, Remark, Details);
                        ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate, this.ExistsHtlCtx);
                    }
                }
            }
            return null;
        }

        /// <summary>
        /// Builds the form fields for a pager postback: the view state and the value of the
        /// <c>txtRanNum</c> input are read from <paramref name="html"/>, and the pager event argument
        /// and date range are filled in from the remaining parameters.
        /// </summary>
        /// <param name="html">The page whose hidden fields are echoed back.</param>
        /// <param name="pageArgument">Value sent as <c>__EVENTARGUMENT</c> (the requested page number).</param>
        /// <param name="startDate">Start date field value, formatted yyyy-MM-dd.</param>
        /// <param name="endDate">End date field value, formatted yyyy-MM-dd.</param>
        private NameValueCollection BuildPagerPostback(string html, string pageArgument, string startDate, string endDate)
        {
            Parser parser = new Parser(new Lexer(html));
            NodeList pageInputNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("input"), new HasAttributeFilter("id", "ctl00_ContentPlaceHolder1_ValidateCode1_txtRanNum")));
            string pageValiCode = string.Empty;
            if (pageInputNode != null && pageInputNode.Count > 0)
            {
                InputTag input = pageInputNode[0] as InputTag;
                if (input != null)
                {
                    pageValiCode = input.GetAttribute("value");
                }
            }

            string viewState = ToolWeb.GetAspNetViewState(html);
            return ToolWeb.GetNameValueCollection(
                new string[] {
                    "ctl00_ContentPlaceHolder1_toolkitScriptManager1_HiddenField",
                    "__EVENTTARGET",
                    "__EVENTARGUMENT",
                    "__VIEWSTATE",
                    "ctl00$ContentPlaceHolder1$txtEnt_name",
                    "ctl00$ContentPlaceHolder1$txtAWARD_NAME",
                    "ctl00$ContentPlaceHolder1$txtStartDate",
                    "ctl00$ContentPlaceHolder1$txtEndDate",
                    "ctl00$ContentPlaceHolder1$ValidateCode1$txtValidateCode",
                    "ctl00$ContentPlaceHolder1$ValidateCode1$txtRanNum"
                },
                new string[] {
                    "",
                    "ctl00$ContentPlaceHolder1$AspNetPager2",
                    pageArgument,
                    viewState,
                    "", "", startDate, endDate, "",
                    pageValiCode
                });
        }