Exemple #1
0
 /// <summary>
 /// Registers one image of a personal page for download: derives a folder and file
 /// name from the image's ALT text, creates the folder, records the target path in
 /// <c>currentImgFileNameSet</c> and hands the URL to <c>thunderAgent.AddTask2</c>.
 /// </summary>
 /// <param name="imgNode">Image tag whose SRC (or DATA-CFSRC) and ALT attributes are read.</param>
 /// <param name="fileIndex">Number used in console messages and in the generated file name.</param>
 /// <param name="type">1 selects <c>saveOneDayOneBeautyBasePath</c>; any other value selects
 /// <c>saveBeautyFlowBasePath</c>. For type 2 (pages crawled from the "正妹流" stream) the
 /// first four characters of the ALT text are dropped.</param>
 static void GetPicUrlsFromBeautyPersonalPage(ImageTag imgNode, int fileIndex, int type)
 {
     bool hasSrc = imgNode.Attributes.ContainsKey("SRC");
     if (!hasSrc && !imgNode.Attributes.ContainsKey("DATA-CFSRC"))
     {
         Console.WriteLine("无法获取第" + fileIndex + "张图片!");
         return;
     }

     // SRC wins; DATA-CFSRC is only the fallback. (The URL rewriting that used to
     // follow was disabled on 2014-05-16 after the page structure changed.)
     string imgUrl = hasSrc ? imgNode.GetAttribute("SRC") : imgNode.GetAttribute("DATA-CFSRC");
     if (imgFileNameSet.Contains(imgUrl))
     {
         // This URL has already been queued.
         return;
     }

     if (!imgNode.Attributes.ContainsKey("ALT"))
     {
         Console.WriteLine("第" + fileIndex + "张图片无法获取alt属性!");
         return;
     }

     string imgName = imgNode.GetAttribute("ALT");
     if (type == 2)
     {
         // Drop the 4-character prefix. An ALT text shorter than that used to make
         // Substring throw; it now yields an empty name, exactly like a 4-character one.
         imgName = imgName.Length > 4 ? imgName.Substring(4) : string.Empty;
     }
     imgFileNameSet.Add(imgUrl);

     // The name is used both as a folder name and as a file name, so strip every
     // character that is invalid in either. Split + Concat removes all of them in one pass.
     imgName = string.Concat(imgName.Split(Path.GetInvalidPathChars()));
     imgName = string.Concat(imgName.Split(Path.GetInvalidFileNameChars()));

     // type also decides which base path the image is stored under.
     string completeImgName = type == 1 ? saveOneDayOneBeautyBasePath + imgName : saveBeautyFlowBasePath + imgName;

     // CreateDirectory does nothing when the directory already exists.
     Directory.CreateDirectory(completeImgName);

     string fileName = imgName + " (" + fileIndex + ").jpg";
     currentImgFileNameSet.Add(imgUrl, completeImgName + "\\" + fileName);
     thunderAgent.AddTask2(imgUrl, fileName, "D:\\Download\\" + completeImgName + "\\", "", "", 1, 0, 1);

     // NOTE(review): the original ended with fileIndex++, which had no effect because
     // fileIndex is passed by value; callers must advance their own counter.
 }
Exemple #2
0
        /// <summary>
        /// Rewrites the <c>src</c> of the images found in an HTML fragment: every non-empty
        /// <c>src</c> that does not start with <c>http:</c> is passed through
        /// <c>xkHttp.getDealUrl</c> and written back into the content.
        /// </summary>
        /// <param name="tmp_content">HTML content; replaced in place with the rewritten markup.</param>
        /// <param name="url">URL handed to <c>getDealUrl</c> together with each image src.</param>
        public static void ImageSrc(ref string tmp_content, string url)
        {
            // Parse a copy in which upper-case tag names are lowered so the filter sees "img".
            // NOTE(review): replacements below still target the un-normalised tmp_content,
            // so tags written as <IMG ...> may not be matched - confirm this is intended.
            string normalized = tmp_content.Replace("<IMG", "<img").Replace("<Img", "<img");
            Parser   imgParser = new Parser(new Lexer(normalized));
            NodeList images    = imgParser.Parse(new TagNameFilter("img"));

            // Walk the images from last to first.
            int index = images.Count;
            while (--index >= 0)
            {
                ImageTag image = (ImageTag)images.ElementAt(index);
                string   src   = image.GetAttribute("src");

                if (string.IsNullOrEmpty(src))
                {
                    continue;
                }

                if (!Regex.IsMatch(src, @"^http:"))
                {
                    src = new xkHttp().getDealUrl(url, src);
                    string before = image.ToHtml();

                    // Drop the old src (and any onclick), then splice the new src in at
                    // offset 5 of the re-rendered tag, i.e. right after "<img ".
                    image.RemoveAttribute("src");
                    image.RemoveAttribute("onclick");
                    string srcAttribute = "src=\"" + src + "\" ";
                    string after        = image.ToHtml().Insert(5, srcAttribute);

                    tmp_content = tmp_content.Replace(before, after);
                }

                // A result that still starts with ".." leads to the tag's current markup
                // being removed from the content.
                if (!string.IsNullOrEmpty(src) && Regex.IsMatch(src, @"^\.\."))
                {
                    string current = image.ToHtml();
                    tmp_content = tmp_content.Replace(current, "");
                }
            }
        }
Exemple #3
0
        /// <summary>
        /// Walks the paged notice list at <c>SiteUrl</c>, downloads every notice's detail
        /// page, builds a <c>NotifyInfo</c> from it and passes it to <c>ToolDb.SaveEntity</c>.
        /// For notices for which <c>SaveEntity</c> returns true, the images (except .gif)
        /// and attachment links inside the notice body are also turned into
        /// <c>BaseAttach</c> entities.
        /// </summary>
        /// <param name="crawlAll">When false, the crawl stops as soon as <c>MaxCount</c>
        /// notices have been built.</param>
        /// <returns>Always null.</returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            const string siteRoot = "http://www.szpark.com.cn";

            int    totalPages = 1;
            int    processed  = 0;
            string pageHtml   = string.Empty;

            // Without the first page there is nothing to crawl.
            try
            {
                pageHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
            }
            catch (Exception)
            {
                return null;
            }

            // The page count is read from the text between "共" and "页" in the pager link.
            NodeList pagerNodes = new Parser(new Lexer(pageHtml)).ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("id", "PageDataList__ctl7_LinkButton1")));
            if (pagerNodes != null && pagerNodes.Count > 0)
            {
                try
                {
                    totalPages = Convert.ToInt32(pagerNodes.AsString().GetRegexBegEnd("共", "页"));
                }
                catch
                {
                    totalPages = 1;
                }
            }

            for (int page = 1; page <= totalPages; page++)
            {
                if (page > 1)
                {
                    // Later pages are requested by posting the form back with the view
                    // state of the previously loaded page; a failed request skips the page.
                    try
                    {
                        string   viewState  = this.ToolWebSite.GetAspNetViewState(pageHtml);
                        string[] fieldNames =
                        {
                            "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "head1:username",
                            "head1:Password", "head1:rbLoginType", "Tb_keyword", "ddlNewsType", "ddlistaddnewsdate"
                        };
                        string[] fieldValues =
                        {
                            "PageDataList$_ctl" + (page + 1).ToString() + "$LinkButton1", "", viewState, "", "", "unit", "", "20", ""
                        };
                        NameValueCollection form = this.ToolWebSite.GetNameValueCollection(fieldNames, fieldValues);
                        pageHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, form, Encoding.UTF8);
                    }
                    catch
                    {
                        continue;
                    }
                }

                NodeList tableNodes = new Parser(new Lexer(pageHtml)).ExtractAllNodesThatMatch(
                    new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", " tb_list")));
                if (tableNodes == null || tableNodes.Count <= 0)
                {
                    continue;
                }

                TableTag listTable = tableNodes[0] as TableTag;
                for (int rowIndex = 0; rowIndex < listTable.RowCount; rowIndex++)
                {
                    TableRow row = listTable.Rows[rowIndex];

                    // Column 2 supplies the date, column 1 the title and the detail link.
                    string infoType    = "通知公告";
                    string releaseTime = row.Columns[2].ToPlainTextString().GetDateRegex();
                    string headName    = row.Columns[1].ToNodePlainString();
                    string infoUrl     = siteRoot + row.Columns[1].GetATagHref();

                    // A failed download leaves the detail HTML empty, so no content node is
                    // found below and the row is skipped.
                    string detailHtml = string.Empty;
                    try
                    {
                        detailHtml = ToolHtml.GetHtmlByUrl(infoUrl, Encoding.UTF8).GetJsString();
                    }
                    catch
                    {
                    }

                    NodeList contentNodes = new Parser(new Lexer(detailHtml)).ExtractAllNodesThatMatch(
                        new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "newsinfo")));
                    if (contentNodes == null || contentNodes.Count <= 0)
                    {
                        continue;
                    }

                    string ctxHtml = contentNodes.AsHtml().Replace("<br/>", "\r\n").Replace("<BR/>", "");

                    // Plain-text version: strip spaces, entities, leftover tags and tabs,
                    // then collapse runs of blank lines.
                    string infoCtx = ctxHtml.ToCtxString();
                    infoCtx = infoCtx.Replace(" ", "").Replace("&nbsp;", "").Replace("\t\t", "\t").Replace("\t\t", "\t");
                    infoCtx = Regex.Replace(infoCtx, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
                    infoCtx = infoCtx.Replace(" ", "").Replace("\t", "");
                    infoCtx = infoCtx.Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");

                    string msgType = MsgTypeCosnt.ShenZhenFJYLMsgType;

                    // The source field is never filled in by this crawler; it stays empty.
                    string infoSource = string.Empty;
                    infoSource = infoSource.Replace("&nbsp;", "");

                    NotifyInfo notice = ToolDb.GenNotifyInfo(headName, releaseTime, infoSource, msgType, infoUrl, ctxHtml, "广东省", "深圳市工程", string.Empty, infoCtx, infoType);
                    processed++;
                    if (!crawlAll && processed >= this.MaxCount)
                    {
                        return null;
                    }

                    // Attachments are only collected when SaveEntity reports true.
                    if (!ToolDb.SaveEntity(notice, this.ExistCompareFields))
                    {
                        continue;
                    }

                    // Images embedded in the notice body (.gif files are ignored).
                    NodeList imgNodes = new Parser(new Lexer(ctxHtml)).ExtractAllNodesThatMatch(new TagNameFilter("img"));
                    if (imgNodes != null && imgNodes.Count > 0)
                    {
                        for (int imgIndex = 0; imgIndex < imgNodes.Count; imgIndex++)
                        {
                            try
                            {
                                ImageTag image = imgNodes[imgIndex] as ImageTag;
                                string   src   = image.GetAttribute("src");
                                if (src.ToLower().Contains(".gif"))
                                {
                                    continue;
                                }

                                // Anything without "http" is treated as relative to the site root.
                                string attachUrl = src.Contains("http")
                                    ? src
                                    : siteRoot + src.Replace("../", "/").Replace("./", "/");
                                BaseAttach attach = ToolHtml.GetBaseAttach(attachUrl, headName, notice.Id);
                                if (attach != null)
                                {
                                    ToolDb.SaveEntity(attach, string.Empty);
                                }
                            }
                            catch
                            {
                                // Best effort: a failing image does not stop the crawl.
                            }
                        }
                    }

                    // Links in the notice body that IsAtagAttach() accepts as attachments.
                    NodeList linkNodes = new Parser(new Lexer(ctxHtml)).ExtractAllNodesThatMatch(new TagNameFilter("a"));
                    if (linkNodes != null && linkNodes.Count > 0)
                    {
                        for (int linkIndex = 0; linkIndex < linkNodes.Count; linkIndex++)
                        {
                            ATag link = linkNodes[linkIndex] as ATag;
                            if (!link.IsAtagAttach())
                            {
                                continue;
                            }

                            try
                            {
                                string href      = link.GetATagHref();
                                string attachUrl = href.Contains("http")
                                    ? href
                                    : siteRoot + href.Replace("../", "/").Replace("./", "/");
                                BaseAttach attach = ToolHtml.GetBaseAttach(attachUrl, link.LinkText, notice.Id);
                                if (attach != null)
                                {
                                    ToolDb.SaveEntity(attach, string.Empty);
                                }
                            }
                            catch
                            {
                                // Best effort: a failing attachment does not stop the crawl.
                            }
                        }
                    }
                }
            }

            return null;
        }
Exemple #4
0
        /// <summary>
        /// Crawls the notice lists of several Shenzhen district procurement sites
        /// (one list URL per district), downloads each notice's detail page, builds a
        /// <c>NotifyInfo</c> from it and passes it to <c>ToolDb.SaveEntity</c>.
        /// </summary>
        /// <param name="crawlAll">When false, at most <c>MaxCount</c> notices are built per
        /// district (the counter restarts for every district).</param>
        /// <returns>
        /// The list created at the start of the method, which is never added to; or null
        /// when the per-district limit is hit while handling a detail page through the
        /// body fallback branch.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList list = new List <NotifyInfo>();

            // District name -> URL of the first page of that district's notice list.
            Dictionary <string, string> dic = new Dictionary <string, string>();

            // The Yantian district entry is disabled.
            //dic.Add("盐田区", "http://yt.szzfcg.cn/portal/topicView.do?method=view&id=50074439");
            dic.Add("龙华新区", "http://lhxq.szzfcg.cn/portal/topicView.do?method=view&id=110074439");
            dic.Add("大鹏新区", "http://dp.szzfcg.cn/portal/topicView.do?method=view&id=100074439");
            dic.Add("坪山新区", "http://ps.szzfcg.cn/portal/topicView.do?method=view&id=90074439");
            dic.Add("龙岗区", "http://lg.szzfcg.cn/portal/topicView.do?method=view&id=70074439");
            dic.Add("光明新区", "http://gm.szzfcg.cn/portal/topicView.do?method=view&id=10170626");
            dic.Add("福田区", "http://ft.szzfcg.cn/portal/topicView.do?method=view&id=30074439");
            dic.Add("罗湖区", "http://lh.szzfcg.cn/portal/topicView.do?method=view&id=20074439");
            dic.Add("南山区", "http://ns.szzfcg.cn/portal/topicView.do?method=view&id=40074439");

            // District name -> sub-domain used to build absolute detail and image URLs.
            Dictionary <string, string> dicCity = new Dictionary <string, string>();

            //dicCity.Add("盐田区", "yt");
            dicCity.Add("龙华新区", "lhxq");
            dicCity.Add("大鹏新区", "dp");
            dicCity.Add("坪山新区", "ps");
            dicCity.Add("龙岗区", "lg");
            dicCity.Add("光明新区", "gm");
            dicCity.Add("福田区", "ft");
            dicCity.Add("罗湖区", "lh");
            dicCity.Add("南山区", "ns");

            foreach (string key in dic.Keys)
            {
                int    pageInt = 1, sqlCount = 0;
                string html    = string.Empty;

                // A district whose first page cannot be loaded is skipped.
                try
                {
                    html = this.ToolWebSite.GetHtmlByUrl(dic[key]);
                }
                catch { continue; }

                // The page count is the value of the last option of the page selector;
                // if it cannot be read, only the first page is crawled.
                Parser   parser   = new Parser(new Lexer(html));
                NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("select"), new HasAttributeFilter("name", "__ec_pages")));
                if (pageNode != null && pageNode.Count > 0)
                {
                    SelectTag select = pageNode[0] as SelectTag;
                    try
                    {
                        pageInt = int.Parse(select.OptionTags[select.OptionTags.Length - 1].Value);
                    }
                    catch { }
                }

                for (int i = 1; i <= pageInt; i++)
                {
                    if (i > 1)
                    {
                        // Everything from "id" to the end of the list URL, minus "id=",
                        // i.e. the topic id that is posted back with the paging form.
                        string id = dic[key].Substring(dic[key].IndexOf("id")).Replace("id=", "");
                        NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] {
                            "ec_i",
                            "topicChrList_20070702_crd",
                            "topicChrList_20070702_f_a",
                            "topicChrList_20070702_p",
                            "topicChrList_20070702_s_name",
                            "topicChrList_20070702_s_topName",
                            "id",
                            "method",
                            "__ec_pages",
                            "topicChrList_20070702_rd",
                            "topicChrList_20070702_f_name",
                            "topicChrList_20070702_f_topName",
                            "topicChrList_20070702_f_ldate",
                        }, new string[] {
                            "topicChrList_20070702",
                            "20",
                            "",
                            i.ToString(),
                            "",
                            "",
                            id,
                            "view",
                            i.ToString(),
                            "20",
                            "",
                            "",
                            ""
                        });

                        // NOTE(review): unlike the first-page request this call is not
                        // guarded, so an exception here ends the whole crawl - confirm intended.
                        html = this.ToolWebSite.GetHtmlByUrl(dic[key], nvc);
                    }

                    parser = new Parser(new Lexer(html));
                    NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "topicChrList_20070702_table")));

                    // Fixed: this used the non-short-circuit '&', which evaluated
                    // listNode.Count even when listNode was null.
                    if (listNode != null && listNode.Count > 0)
                    {
                        TableTag table = listNode[0] as TableTag;

                        // Processing starts at row index 3; earlier rows are skipped.
                        for (int j = 3; j < table.RowCount; j++)
                        {
                            string headName = string.Empty, releaseTime = string.Empty, infoScorce = string.Empty, msgType = string.Empty, infoUrl = string.Empty, ctxHtml = string.Empty, infoCtx = string.Empty, infoType = string.Empty;

                            TableRow tr = table.Rows[j];
                            headName    = tr.Columns[1].ToNodePlainString();
                            releaseTime = tr.Columns[3].ToPlainTextString();
                            infoType    = "通知公告";
                            msgType     = "深圳市" + key + "政府采购中心";

                            infoUrl = "http://" + dicCity[key] + ".szzfcg.cn" + tr.Columns[1].GetATagHref();
                            string htmldtl = string.Empty;
                            try
                            {
                                htmldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl).GetJsString();
                            }
                            catch { continue; }

                            // Preferred content: the first centre-aligned table of the detail page.
                            parser = new Parser(new Lexer(htmldtl));
                            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("align", "center")));
                            if (dtlNode != null && dtlNode.Count > 0)
                            {
                                ctxHtml = dtlNode[0].ToHtml();
                                infoCtx = ctxHtml.ToCtxString();
                                NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "深圳政府采购", key, infoCtx, infoType);
                                sqlCount++;
                                if (!crawlAll && sqlCount >= this.MaxCount)
                                {
                                    // Limit reached for this district: move on to the next one.
                                    goto type;
                                }
                                ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate);
                            }
                            else
                            {
                                // Fallback: use the whole <body> of the detail page.
                                parser.Reset();
                                NodeList bodyNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("body"));
                                if (bodyNode != null && bodyNode.Count > 0)
                                {
                                    ctxHtml = bodyNode.AsHtml();
                                    infoCtx = ctxHtml.ToCtxString();
                                    NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "深圳政府采购", key, infoCtx, infoType);
                                    sqlCount++;
                                    if (!crawlAll && sqlCount >= this.MaxCount)
                                    {
                                        // NOTE(review): this ends the whole crawl and returns null,
                                        // whereas the branch above only skips to the next district -
                                        // confirm which behaviour is intended.
                                        return(null);
                                    }
                                    if (ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate))
                                    {
                                        // Images found in the detail page become BaseAttach entities.
                                        parser.Reset();
                                        NodeList imgList = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
                                        if (imgList != null && imgList.Count > 0)
                                        {
                                            for (int m = 0; m < imgList.Count; m++)
                                            {
                                                try
                                                {
                                                    ImageTag   img = imgList[m] as ImageTag;
                                                    string     src = img.GetAttribute("src");
                                                    BaseAttach obj = null;
                                                    if (src.Contains("http"))
                                                    {
                                                        obj = ToolHtml.GetBaseAttach(src, headName, info.Id);
                                                    }
                                                    else
                                                    {
                                                        // Anything without "http" is treated as relative to the district site.
                                                        obj = ToolHtml.GetBaseAttach("http://" + dicCity[key] + ".szzfcg.cn" + src, headName, info.Id);
                                                    }
                                                    if (obj != null)
                                                    {
                                                        ToolDb.SaveEntity(obj, string.Empty);
                                                    }
                                                }
                                                catch { }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
                type : continue;
            }
            return(list);
        }
Exemple #5
0
        /// <summary>
        /// Walks the paged article list at <c>SiteUrl</c>, downloads every article's detail
        /// page, builds a <c>NotifyInfo</c> from it and passes it to <c>ToolDb.SaveEntity</c>.
        /// For articles for which <c>SaveEntity</c> returns true, the images (except .gif)
        /// and attachment links inside the article body are also turned into
        /// <c>BaseAttach</c> entities.
        /// </summary>
        /// <param name="crawlAll">When false, the crawl stops as soon as <c>MaxCount</c>
        /// articles have been built.</param>
        /// <returns>Always null.</returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            const string siteRoot = "http://www.szmea.net";

            int    totalPages = 1;
            int    processed  = 0;
            string pageHtml   = string.Empty;

            // Without the first page there is nothing to crawl.
            try
            {
                pageHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
            }
            catch (Exception)
            {
                return null;
            }

            // One <option> of the page selector per page.
            NodeList pageOptions = new Parser(new Lexer(pageHtml)).ExtractAllNodesThatMatch(
                new AndFilter(
                    new HasParentFilter(new AndFilter(new TagNameFilter("select"), new HasAttributeFilter("id", "dnn_ctr467_ArticleList_cboPages")), true),
                    new TagNameFilter("option")));
            if (pageOptions != null && pageOptions.Count > 0)
            {
                try
                {
                    totalPages = pageOptions.Count;
                }
                catch
                {
                    totalPages = 1;
                }
            }

            for (int page = 1; page <= totalPages; page++)
            {
                if (page > 1)
                {
                    // Later pages are requested by posting the form back with the view
                    // state of the previously loaded page; a failed request skips the page.
                    try
                    {
                        string   viewState  = this.ToolWebSite.GetAspNetViewState(pageHtml);
                        string[] fieldNames =
                        {
                            "__EVENTARGUMENT", "dnn:ctr467:ArticleList:cboPages",
                            "ScrollTop", "__dnnVariable", "__VIEWSTATE"
                        };
                        string[] fieldValues = { "", (page - 1).ToString(), "", "", viewState };
                        NameValueCollection form = this.ToolWebSite.GetNameValueCollection(fieldNames, fieldValues);
                        pageHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, form, Encoding.Default);
                    }
                    catch
                    {
                        continue;
                    }
                }

                NodeList tableNodes = new Parser(new Lexer(pageHtml)).ExtractAllNodesThatMatch(
                    new AndFilter(
                        new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "dnn_ctr467_ArticleList_PanelA")), true),
                        new TagNameFilter("table")));
                if (tableNodes == null || tableNodes.Count <= 0)
                {
                    continue;
                }

                TableTag listTable = tableNodes[0] as TableTag;
                for (int rowIndex = 0; rowIndex < listTable.RowCount; rowIndex++)
                {
                    TableRow row = listTable.Rows[rowIndex];

                    // Column 2 supplies a yy-MM-dd date (prefixed with "20"), column 1 the
                    // title and the detail link.
                    string infoType    = "通知公告";
                    string releaseTime = "20" + row.Columns[2].ToPlainTextString().GetDateRegex("yy-MM-dd");
                    string headName    = row.Columns[1].ToNodePlainString();
                    string infoUrl     = siteRoot + row.Columns[1].GetATagHref();

                    // A failed download leaves the detail HTML empty, so no content node is
                    // found below and the row is skipped.
                    string detailHtml = string.Empty;
                    try
                    {
                        detailHtml = ToolHtml.GetHtmlByUrl(SiteUrl, infoUrl, Encoding.Default).GetJsString();
                    }
                    catch
                    {
                    }

                    NodeList contentNodes = new Parser(new Lexer(detailHtml)).ExtractAllNodesThatMatch(
                        new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "dnn_ctr391_ArticleShow_lblContent")));
                    if (contentNodes == null || contentNodes.Count <= 0)
                    {
                        continue;
                    }

                    string ctxHtml = contentNodes.AsHtml().Replace("<br/>", "\r\n").Replace("<BR/>", "");

                    // Plain-text version: strip spaces, entities, leftover tags and tabs,
                    // then collapse runs of blank lines.
                    string infoCtx = ctxHtml.ToCtxString();
                    infoCtx = infoCtx.Replace(" ", "").Replace("&nbsp;", "").Replace("\t\t", "\t").Replace("\t\t", "\t");
                    infoCtx = Regex.Replace(infoCtx, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
                    infoCtx = infoCtx.Replace(" ", "").Replace("\t", "");
                    infoCtx = infoCtx.Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");

                    string msgType = MsgTypeCosnt.ShenZhenJLGCMsgType;

                    // The source field is never filled in by this crawler; it stays empty.
                    string infoSource = string.Empty;
                    infoSource = infoSource.Replace("&nbsp;", "");

                    NotifyInfo notice = ToolDb.GenNotifyInfo(headName, releaseTime, infoSource, msgType, infoUrl, ctxHtml, "广东省", "深圳市工程", string.Empty, infoCtx, infoType);
                    processed++;
                    if (!crawlAll && processed >= this.MaxCount)
                    {
                        return null;
                    }

                    // Attachments are only collected when SaveEntity reports true.
                    if (!ToolDb.SaveEntity(notice, this.ExistCompareFields))
                    {
                        continue;
                    }

                    // Images embedded in the article body (.gif files are ignored).
                    NodeList imgNodes = new Parser(new Lexer(ctxHtml)).ExtractAllNodesThatMatch(new TagNameFilter("img"));
                    if (imgNodes != null && imgNodes.Count > 0)
                    {
                        for (int imgIndex = 0; imgIndex < imgNodes.Count; imgIndex++)
                        {
                            try
                            {
                                ImageTag image = imgNodes[imgIndex] as ImageTag;
                                string   src   = image.GetAttribute("src");
                                if (src.ToLower().Contains(".gif"))
                                {
                                    continue;
                                }

                                // Anything without "http" is treated as relative to the site root.
                                string attachUrl = src.Contains("http")
                                    ? src
                                    : siteRoot + src.Replace("../", "/").Replace("./", "/");
                                BaseAttach attach = ToolHtml.GetBaseAttach(attachUrl, headName, notice.Id);
                                if (attach != null)
                                {
                                    ToolDb.SaveEntity(attach, string.Empty);
                                }
                            }
                            catch
                            {
                                // Best effort: a failing image does not stop the crawl.
                            }
                        }
                    }

                    // Links in the article body that IsAtagAttach() accepts as attachments.
                    NodeList linkNodes = new Parser(new Lexer(ctxHtml)).ExtractAllNodesThatMatch(new TagNameFilter("a"));
                    if (linkNodes != null && linkNodes.Count > 0)
                    {
                        for (int linkIndex = 0; linkIndex < linkNodes.Count; linkIndex++)
                        {
                            ATag link = linkNodes[linkIndex] as ATag;
                            if (!link.IsAtagAttach())
                            {
                                continue;
                            }

                            try
                            {
                                string href      = link.GetATagHref();
                                string attachUrl = href.Contains("http")
                                    ? href
                                    : siteRoot + href.Replace("../", "/").Replace("./", "/");
                                BaseAttach attach = ToolHtml.GetBaseAttach(attachUrl, link.LinkText, notice.Id);
                                if (attach != null)
                                {
                                    ToolDb.SaveEntity(attach, string.Empty);
                                }
                            }
                            catch
                            {
                                // Best effort: a failing attachment just moves on to the next link.
                                continue;
                            }
                        }
                    }
                }
            }

            return null;
        }
Exemple #6
0
        /// <summary>
        /// Crawls the paged listing that starts at <c>this.SiteUrl</c>, downloads every linked
        /// detail page and turns each one into a <c>BidInfo</c> entry.
        /// </summary>
        /// <param name="crawlAll">
        /// When false, crawling stops as soon as the result list holds <c>this.MaxCount</c> entries.
        /// </param>
        /// <returns>
        /// The collected <c>BidInfo</c> entries; an empty list when the first listing page
        /// cannot be downloaded.
        /// </returns>
        /// <remarks>
        /// Image and attachment links found in a detail page are added to <c>base.AttachList</c>.
        /// Download failures of individual pages are skipped on purpose (best-effort crawl).
        /// </remarks>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list    = new List<BidInfo>();
            int    pageInt = 1;
            string html    = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
            }
            catch { return list; }

            Parser   parser   = new Parser(new Lexer(html));
            NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "mtop pages")));
            if (pageNode != null && pageNode.Count > 0)
            {
                try
                {
                    // The pager text is expected to contain "1/<total>页"; on any failure keep a single page.
                    string temp = pageNode.AsString().GetRegexBegEnd("1/", "页");
                    pageInt = int.Parse(temp);
                }
                catch { }
            }

            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    // Page 1 was already downloaded above; later pages have their own URL.
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl("http://www.gsei.com.cn/index.php/cms/item-list-category-1337-page-" + i + ".shtml", Encoding.Default);
                    }
                    catch { continue; }
                }

                parser = new Parser(new Lexer(html));
                NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "label_ul_b")), true), new TagNameFilter("li")));
                if (listNode == null || listNode.Count < 1)
                {
                    continue;
                }

                for (int j = 0; j < listNode.Count; j++)
                {
                    ATag aTag = listNode[j].GetATag();
                    if (aTag == null)
                    {
                        continue;
                    }

                    string prjName   = aTag.GetAttribute("title");
                    string beginDate = listNode[j].ToPlainTextString().GetDateRegex();
                    string infoUrl   = aTag.Link;
                    string endDate   = string.Empty;
                    string otherType = string.Empty;

                    string htmldtl = string.Empty;
                    try
                    {
                        htmldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString();
                    }
                    catch { continue; }

                    parser = new Parser(new Lexer(htmldtl));
                    NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "p8_content_show")));
                    if (dtlNode == null || dtlNode.Count < 1)
                    {
                        continue;
                    }

                    string htmlTxt = dtlNode.AsHtml();
                    string bidCtx  = htmlTxt.ToLower().GetReplace("</p>,<br/>", "\r\n").ToCtxString();

                    // First attempt: pull every field out of the plain detail text.
                    string bidUnit = bidCtx.GetBidRegex();
                    if (string.IsNullOrWhiteSpace(bidUnit))
                    {
                        bidUnit = bidCtx.GetRegex("第一成交候选人,第一名,中标人为,中标单位名称");
                    }
                    string bidMoney = bidCtx.GetMoneyRegex(null, false, "万元");
                    if (string.IsNullOrWhiteSpace(bidMoney) || bidMoney == "0")
                    {
                        bidMoney = bidCtx.GetMoneyRegex(new string[] { "中标造价" }, false, "万元");
                    }
                    string prjMgr    = bidCtx.GetMgrRegex();
                    string buildUnit = bidCtx.GetBuildRegex();
                    string code      = bidCtx.GetCodeRegex().GetCodeDel();

                    // Second attempt: the winner is often only present in the first table of the page.
                    if (string.IsNullOrEmpty(bidUnit))
                    {
                        parser = new Parser(new Lexer(htmlTxt));
                        NodeList tableNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                        if (tableNode != null && tableNode.Count > 0)
                        {
                            // Guard the cast: a non-TableTag node used to abort the whole crawl.
                            TableTag table = tableNode[0] as TableTag;
                            if (table != null)
                            {
                                ExtractFieldsFromTable(table, ref bidUnit, ref bidMoney, ref buildUnit, ref prjMgr, ref code);
                            }
                        }
                    }

                    // The extraction helpers may hand back null; normalise before calling instance methods.
                    bidUnit   = bidUnit ?? string.Empty;
                    buildUnit = buildUnit ?? string.Empty;

                    if (bidUnit.Contains("公司"))
                    {
                        bidUnit = bidUnit.Remove(bidUnit.IndexOf("公司")) + "公司";
                    }
                    if (bidUnit.Contains("中标价"))
                    {
                        bidUnit = "";
                    }
                    if (buildUnit.Contains("公司"))
                    {
                        buildUnit = buildUnit.Remove(buildUnit.IndexOf("公司")) + "公司";
                    }
                    if (buildUnit.Contains("地址"))
                    {
                        buildUnit = buildUnit.Remove(buildUnit.IndexOf("地址"));
                    }

                    List<string> imgList = CollectImageLinks(ref htmlTxt);

                    // Discard a winner that does not look like an organisation name.
                    if (!bidUnit.Contains("公司") && !bidUnit.Contains("研究院") && !bidUnit.Contains("管理局") && !bidUnit.Contains("院"))
                    {
                        bidUnit = "";
                    }

                    string msgType  = "甘肃省信息中心";
                    string specType = "政府采购";
                    string bidType  = "建设工程";
                    BidInfo info = ToolDb.GenBidInfo("甘肃省", "甘肃省及地市", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, infoUrl, prjMgr, htmlTxt);
                    list.Add(info);

                    foreach (string img in imgList)
                    {
                        // NOTE(review): the name keeps the leading "/" of the last path segment,
                        // exactly as before; confirm whether consumers expect it.
                        string linkName = img.Contains("/") ? img.Substring(img.LastIndexOf("/")) : img;
                        BaseAttach attach = ToolDb.GenBaseAttach(linkName, info.Id, img);
                        base.AttachList.Add(attach);
                    }

                    parser = new Parser(new Lexer(htmlTxt));
                    NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                    if (aNode != null && aNode.Count > 0)
                    {
                        for (int k = 0; k < aNode.Count; k++)
                        {
                            ATag a = aNode[k] as ATag;
                            if (a == null || !a.IsAtagAttach())
                            {
                                continue;
                            }

                            string link = a.Link.ToLower().Contains("http")
                                ? a.Link
                                : "http://www.gsei.com.cn/" + a.Link.GetReplace("../,./");

                            // Links longer than 500 bytes are skipped (presumably a storage limit; confirm).
                            if (Encoding.Default.GetByteCount(link) > 500)
                            {
                                continue;
                            }
                            BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link);
                            base.AttachList.Add(attach);
                        }
                    }

                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return list;
                    }
                }
            }
            return list;
        }

        /// <summary>
        /// Tries up to four progressively different readings of <paramref name="table"/> to find the
        /// winning bidder, filling the other fields from the same text whenever they are still missing.
        /// Stops after the first reading that yields a non-empty, non-numeric bidder.
        /// </summary>
        private static void ExtractFieldsFromTable(TableTag table, ref string bidUnit, ref string bidMoney, ref string buildUnit, ref string prjMgr, ref string code)
        {
            // Reading 1: key/value rows, then a header block whose values sit two rows below.
            string ctx = BuildTextUpToHeaderRow(table, true);
            ApplyTableText(ctx, true, ref bidUnit, ref bidMoney, ref buildUnit, ref prjMgr, ref code);
            if (!IsMissingOrNumeric(bidUnit))
            {
                return;
            }

            // Reading 2: every row treated as alternating key/value cells.
            StringBuilder allRows = new StringBuilder();
            for (int r = 0; r < table.RowCount; r++)
            {
                AppendCellPairs(allRows, table, r);
            }
            ctx = allRows.ToString();
            // This reading never had the "中标候选人名称" fallback; kept that way.
            ApplyTableText(ctx, false, ref bidUnit, ref bidMoney, ref buildUnit, ref prjMgr, ref code);
            if (!IsMissingOrNumeric(bidUnit))
            {
                return;
            }

            // Reading 3: first row holds the headers, second row the values.
            StringBuilder firstRows = new StringBuilder();
            if (table.RowCount > 0) // an empty table used to throw here, outside any try block
            {
                for (int c = 0; c < table.Rows[0].ColumnCount; c++)
                {
                    try
                    {
                        firstRows.Append(ReadCell(table, 0, c) + ":");
                        firstRows.Append(ReadCell(table, 1, c) + "\r\n");
                    }
                    catch { } // missing cell/row: keep whatever was collected so far
                }
            }
            ctx = firstRows.ToString();
            ApplyTableText(ctx, true, ref bidUnit, ref bidMoney, ref buildUnit, ref prjMgr, ref code);
            if (!IsMissingOrNumeric(bidUnit))
            {
                return;
            }

            // Reading 4: appended to the text of reading 3 (the text is intentionally not reset).
            ctx += BuildTextUpToHeaderRow(table, false);
            ApplyTableText(ctx, true, ref bidUnit, ref bidMoney, ref buildUnit, ref prjMgr, ref code);
        }

        /// <summary>
        /// Re-derives the bidder from <paramref name="ctx"/> and fills money, build unit, project
        /// manager and code from it, but only where those are still empty (or "0" / numeric).
        /// </summary>
        private static void ApplyTableText(string ctx, bool useCandidateNameFallback, ref string bidUnit, ref string bidMoney, ref string buildUnit, ref string prjMgr, ref string code)
        {
            bidUnit = ctx.GetBidRegex().GetReplace("第一名,第二名,第三名,名次");
            if (useCandidateNameFallback && string.IsNullOrEmpty(bidUnit))
            {
                bidUnit = ctx.GetRegex("中标候选人名称");
            }
            if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
            {
                bidMoney = ctx.GetMoneyRegex();
            }
            if (string.IsNullOrEmpty(buildUnit))
            {
                buildUnit = ctx.GetBuildRegex();
            }
            if (IsMissingOrNumeric(prjMgr))
            {
                prjMgr = ctx.GetMgrRegex();
            }
            if (string.IsNullOrEmpty(code))
            {
                code = ctx.GetCodeRegex().GetCodeDel();
            }
        }

        /// <summary>
        /// Serialises rows as "key:value" lines until the first row mentioning
        /// "中标候选人名称" or "中标价"; that row is serialised as a header block and ends the scan.
        /// </summary>
        /// <param name="table">Table to read.</param>
        /// <param name="mergedHeaderLayout">
        /// True: seven header cells are taken from the header row (first three) and the row below it
        /// (remaining four), with values two rows below. False: each header cell is paired with the
        /// cell directly below it.
        /// </param>
        private static string BuildTextUpToHeaderRow(TableTag table, bool mergedHeaderLayout)
        {
            StringBuilder text = new StringBuilder();
            for (int r = 0; r < table.RowCount; r++)
            {
                string rowName = table.Rows[r].ToNodePlainString();
                bool isHeaderRow = rowName.Contains("中标候选人名称") || rowName.Contains("中标价");
                if (!isHeaderRow)
                {
                    AppendCellPairs(text, table, r);
                    continue;
                }

                int headerCells = mergedHeaderLayout ? 7 : table.Rows[r].ColumnCount;
                for (int c = 0; c < headerCells; c++)
                {
                    try
                    {
                        if (!mergedHeaderLayout)
                        {
                            text.Append(ReadCell(table, r, c) + ":");
                            text.Append(ReadCell(table, r + 1, c) + "\r\n");
                        }
                        else
                        {
                            if (c < 3)
                            {
                                text.Append(ReadCell(table, r, c) + ":");
                            }
                            else
                            {
                                text.Append(ReadCell(table, r + 1, c - 3) + ":");
                            }
                            text.Append(ReadCell(table, r + 2, c) + "\r\n");
                        }
                    }
                    catch { } // rows/cells beyond the table are expected; keep the partial text
                }
                break;
            }
            return text.ToString();
        }

        /// <summary>Appends the cells of row <paramref name="r"/> as alternating "key:" / "value\r\n" pieces.</summary>
        private static void AppendCellPairs(StringBuilder text, TableTag table, int r)
        {
            for (int c = 0; c < table.Rows[r].ColumnCount; c++)
            {
                bool isValueCell = (c + 1) % 2 == 0;
                text.Append(ReadCell(table, r, c) + (isValueCell ? "\r\n" : ":"));
            }
        }

        /// <summary>Returns the plain text of one cell after passing it through <c>GetReplace(":,:")</c>.</summary>
        private static string ReadCell(TableTag table, int row, int col)
        {
            return table.Rows[row].Columns[col].ToNodePlainString().GetReplace(":,:");
        }

        /// <summary>True when <paramref name="value"/> is null/empty or consists of a number only.</summary>
        private static bool IsMissingOrNumeric(string value)
        {
            return string.IsNullOrEmpty(value) || value.IsNumber();
        }

        /// <summary>
        /// Collects the image URLs of <paramref name="htmlTxt"/> and rewrites relative <c>src</c>
        /// values inside it to point at the site root.
        /// </summary>
        /// <returns>One URL per distinct image; empty when the HTML contains no usable image.</returns>
        private static List<string> CollectImageLinks(ref string htmlTxt)
        {
            List<string> imgList = new List<string>();
            Parser   parser  = new Parser(new Lexer(htmlTxt));
            NodeList imgNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
            if (imgNode == null)
            {
                return imgList;
            }

            for (int m = 0; m < imgNode.Count; m++)
            {
                ImageTag tag  = imgNode[m] as ImageTag;
                string   link = tag == null ? null : tag.GetAttribute("src");
                if (string.IsNullOrEmpty(link))
                {
                    // No usable src: previously produced a bogus attachment pointing at the site root.
                    continue;
                }

                // Same absolute-link test the attachment anchors use; absolute URLs are left untouched.
                bool   isAbsolute = link.ToLower().Contains("http");
                string webLink    = isAbsolute ? link : "http://www.gsei.com.cn/" + link;
                if (imgList.Contains(webLink))
                {
                    // Already rewritten; replacing again would prefix the URL a second time.
                    continue;
                }
                if (!isAbsolute)
                {
                    htmlTxt = htmlTxt.GetReplace(link, webLink);
                }
                imgList.Add(webLink);
            }
            return imgList;
        }
Exemple #7
0
        protected List <BidInfo> GetBidInfo(string url)
        {
            List <BidInfo> list            = new List <BidInfo>();
            int            pageInt         = 1;
            string         html            = string.Empty;
            string         viewState       = string.Empty;
            string         eventValidation = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(url, Encoding.Default);
            }
            catch
            {
                return(list);
            }
            Parser   parser   = new Parser(new Lexer(html));
            NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "cn6")));

            if (pageNode != null && pageNode.Count > 0)
            {
                try
                {
                    string temp = pageNode.AsString().Replace("(", "kdxx").GetRegexBegEnd("kdxx", ",");
                    pageInt = int.Parse(temp);
                }
                catch { }
            }
            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl(url + "index_" + (i - 1).ToString() + ".htm", Encoding.Default);
                    }
                    catch { continue; }
                }
                parser = new Parser(new Lexer(html));
                NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "position2")), true), new TagNameFilter("li")));
                if (nodeList != null && nodeList.Count > 0)
                {
                    for (int j = 0; j < nodeList.Count; j++)
                    {
                        string prjName = string.Empty,
                               buildUnit = string.Empty, bidUnit = string.Empty,
                               bidMoney = string.Empty, code = string.Empty,
                               bidDate = string.Empty,
                               beginDate = string.Empty,
                               endDate = string.Empty, bidType = string.Empty,
                               specType = string.Empty, InfoUrl = string.Empty,
                               msgType = string.Empty, bidCtx = string.Empty,
                               prjAddress = string.Empty, remark = string.Empty,
                               prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty,
                               city = string.Empty;

                        prjName = nodeList[j].GetATagValue("title");
                        if (string.IsNullOrEmpty(prjName))
                        {
                            continue;
                        }
                        if (prjName.Contains("项目所在地区"))
                        {
                            city    = "广州市区";
                            prjName = prjName.Replace("[", "").Replace("]", "").Replace("my:项目所在地区notset区", "");
                        }
                        if (prjName.Contains("广东省"))
                        {
                            city    = "广州市区";
                            prjName = prjName.Replace("[", "").Replace("]", "").Replace("广东省", "");
                        }
                        else
                        {
                            string temp = prjName.Replace("[", "kdxx").Replace("]", "xxdk").GetRegexBegEnd("kdxx", "xxdk");
                            if (!string.IsNullOrEmpty(temp))
                            {
                                prjName = prjName.Replace("[", "").Replace("]", "").Replace(temp, "");
                                city    = temp + "区";
                            }
                            else
                            {
                                prjName = prjName.Replace("[", "").Replace("]", "");
                                city    = "广州市区";
                            }
                        }
                        prjName = prjName.Replace("--中标结果", "");
                        if (!string.IsNullOrEmpty(prjName))
                        {
                            if (prjName[0] == '-')
                            {
                                prjName = prjName.Substring(1, prjName.Length - 1);
                            }
                        }

                        InfoUrl = url + nodeList[j].GetATagHref().Replace("../", "").Replace("./", "");
                        string htldtl = string.Empty;
                        try
                        {
                            htldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString();
                        }
                        catch
                        {
                            Logger.Error(nodeList[j].ToNodePlainString());
                            continue;
                        }
                        parser = new Parser(new Lexer(htldtl));
                        NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("cellSpacing", "1")));
                        if (dtlNode != null && dtlNode.Count > 0)
                        {
                            HtmlTxt = dtlNode.AsHtml().Replace("<br/>", "\r\n").Replace("<br>", "\r\n").Replace("<BR/>", "\r\n").Replace("<BR>", "\r\n").Replace("<p>", "\r\n").Replace("</p>", "\r\n").Replace("<P>", "\r\n").Replace("</P>", "\r\n");
                            bidCtx  = HtmlTxt.ToCtxString();

                            buildUnit  = bidCtx.GetBuildRegex();
                            code       = bidCtx.GetCodeRegex();
                            prjAddress = bidCtx.GetAddressRegex();
                            bidType    = prjName.GetInviteBidType();

                            bidUnit  = bidCtx.GetBidRegex();
                            bidMoney = bidCtx.GetMoneyRegex(null, false, "万元");
                            if (bidMoney != "0")
                            {
                                try
                                {
                                    int money = int.Parse(bidMoney);
                                    if (money > 100000)
                                    {
                                        bidMoney = bidCtx.GetMoneyRegex();
                                    }
                                }
                                catch { }
                            }
                            if (string.IsNullOrEmpty(bidUnit))
                            {
                                bidUnit = bidCtx.GetRegexBegEnd("评标结果,", "为该项目");
                            }
                            if (string.IsNullOrEmpty(bidUnit))
                            {
                                bidUnit = bidCtx.GetRegexBegEnd("第一中标候选人,", "为本工程");
                            }
                            if (string.IsNullOrEmpty(bidUnit))
                            {
                                string ctx = string.Empty;
                                parser = new Parser(new Lexer(HtmlTxt));
                                NodeList bidNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "MsoNormalTable")));
                                if (bidNode != null && bidNode.Count > 0)
                                {
                                    TableTag table = bidNode[0] as TableTag;
                                    if (table.RowCount > 1)
                                    {
                                        try
                                        {
                                            for (int cell = 0; cell < table.Rows[0].ColumnCount; cell++)
                                            {
                                                ctx += table.Rows[0].Columns[cell].ToNodePlainString().Replace(" ", "").Replace(" ", "") + ":"; ctx += table.Rows[1].Columns[cell].ToNodePlainString().Replace(" ", "").Replace(" ", "") + "\r\n";
                                            }
                                        }
                                        catch { }
                                    }
                                    else
                                    {
                                        parser.Reset();
                                        bidNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "MsoTableGrid")));
                                        if (bidNode != null && bidNode.Count > 0)
                                        {
                                            table = bidNode[0] as TableTag;
                                            if (table.RowCount > 1)
                                            {
                                                try
                                                {
                                                    for (int r = 0; r < table.RowCount; r++)
                                                    {
                                                        ctx += table.Rows[r].Columns[0].ToNodePlainString() + ":";
                                                        ctx += table.Rows[r].Columns[1].ToNodePlainString() + "\r\n";
                                                    }
                                                }
                                                catch { }
                                            }
                                        }
                                    }
                                    bidUnit = ctx.GetRegex("投标单位,第一候选人,单位名称");
                                    if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                                    {
                                        bidMoney = ctx.GetMoneyRegex(null, false, "万元");
                                        if (bidMoney != "0")
                                        {
                                            try
                                            {
                                                int money = int.Parse(bidMoney);
                                                if (money > 100000)
                                                {
                                                    bidMoney = ctx.GetMoneyRegex();
                                                }
                                            }
                                            catch { }
                                        }
                                    }
                                }
                                else
                                {
                                    parser.Reset();
                                    bidNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "MsoTableGrid")));
                                    if (bidNode == null || bidNode.Count < 1)
                                    {
                                        parser.Reset();
                                        bidNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                                    }
                                    if (bidNode != null && bidNode.Count > 0)
                                    {
                                        ctx = "";
                                        TableTag table = null;
                                        if (bidNode.Count > 1)
                                        {
                                            table = bidNode[1] as TableTag;
                                        }
                                        else
                                        {
                                            table = bidNode[bidNode.Count - 1] as TableTag;
                                        }
                                        if (table.RowCount > 1)
                                        {
                                            try
                                            {
                                                for (int r = 0; r < table.RowCount; r++)
                                                {
                                                    ctx += table.Rows[r].Columns[0].ToNodePlainString() + ":";
                                                    ctx += table.Rows[r].Columns[1].ToNodePlainString() + "\r\n";
                                                }
                                            }
                                            catch { }
                                        }
                                        bidUnit = ctx.GetRegex("投标单位,单位名称");
                                        if (bidMoney == "0" || string.IsNullOrEmpty(bidMoney))
                                        {
                                            bidMoney = ctx.GetMoneyRegex(null, false, "万元");
                                            if (bidMoney != "0")
                                            {
                                                try
                                                {
                                                    int money = int.Parse(bidMoney);
                                                    if (money > 100000)
                                                    {
                                                        bidMoney = ctx.GetMoneyRegex();
                                                    }
                                                }
                                                catch { }
                                            }
                                        }
                                    }
                                }
                            }
                            if (bidUnit.Contains("中标价"))
                            {
                                bidUnit = bidUnit.Remove(bidUnit.IndexOf("中标价"));
                            }
                            if (bidUnit.Contains("报价"))
                            {
                                bidUnit = bidUnit.Remove(bidUnit.IndexOf("报价"));
                            }
                            if (bidUnit.Contains("项目负责人"))
                            {
                                bidUnit = bidUnit.Remove(bidUnit.IndexOf("项目负责人"));
                            }

                            if (bidUnit.Contains("公司"))
                            {
                                bidUnit = bidUnit.Remove(bidUnit.LastIndexOf("公司")) + "公司";
                            }



                            bidUnit  = bidUnit.Replace("(牵头人)", "");
                            msgType  = "广东省招标投标监管网";
                            specType = "建设工程";
                            if (bidCtx.IndexOf("发布日期") != -1)
                            {
                                string ctx = bidCtx.Substring(bidCtx.IndexOf("发布日期"), bidCtx.Length - bidCtx.IndexOf("发布日期"));
                                beginDate = ctx.GetDateRegex();
                            }
                            else if (bidCtx.IndexOf("发布时间") != -1)
                            {
                                string ctx = bidCtx.Substring(bidCtx.IndexOf("发布时间"), bidCtx.Length - bidCtx.IndexOf("发布时间"));
                                beginDate = ctx.GetDateRegex();
                            }
                            if (string.IsNullOrEmpty(beginDate))
                            {
                                beginDate = DateTime.Now.ToString("yyyy-MM-dd");
                            }
                            BidInfo info = ToolDb.GenBidInfo("广东省", city, "", string.Empty, code, prjName, buildUnit, beginDate,
                                                             bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType,
                                                             bidMoney, InfoUrl, prjMgr, HtmlTxt);
                            parser = new Parser(new Lexer(HtmlTxt));
                            NodeList imgNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
                            if (imgNode != null && imgNode.Count > 0)
                            {
                                for (int img = 0; img < imgNode.Count; img++)
                                {
                                    ImageTag imgTag = imgNode[img] as ImageTag;
                                    string   tagUrl = imgTag.GetAttribute("src");
                                    if (!string.IsNullOrWhiteSpace(tagUrl))
                                    {
                                        string     srcUrl     = InfoUrl.Remove(InfoUrl.LastIndexOf("/"));
                                        string     src        = srcUrl + tagUrl.Replace("./", "/");
                                        string     attachName = tagUrl.Replace("./", "");
                                        BaseAttach attach     = null;
                                        if (Encoding.Default.GetByteCount(attachName) < 400)
                                        {
                                            attach = ToolDb.GenBaseAttach(attachName, info.Id, src);
                                        }
                                        base.AttachList.Add(attach);
                                        info.CtxHtml = info.CtxHtml.Replace(tagUrl, src);
                                    }
                                }
                            }
                            list.Add(info);
                            if (list.Count >= this.MaxCount)
                            {
                                return(list);
                            }
                        }
                    }
                }
            }
            return(list);
        }
Exemple #8
0
        /// <summary>
        /// Crawls the notice list at <c>SiteUrl</c> page by page. For every listed notice whose detail
        /// page contains a <c>table</c> with class <c>text18</c>, a <c>NotifyInfo</c> is built and saved;
        /// when the save succeeds, the images and attachment links found in that table are saved too.
        /// </summary>
        /// <param name="crawlAll">When false, the crawl stops as soon as <c>MaxCount</c> notices have been built.</param>
        /// <returns>Always <c>null</c>; notices are written through <c>ToolDb.SaveEntity</c> instead of being returned.</returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            int    pageInt  = 1;
            int    sqlCount = 0;
            string html     = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
            }
            catch (Exception)
            {
                // Without the first list page there is nothing to crawl.
                return(null);
            }
            Parser   parser   = new Parser(new Lexer(html));
            NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "f12")));

            // Read the page count from the text between "/" and "页"; fall back to one page when it cannot be parsed.
            if (pageList != null && pageList.Count > 0)
            {
                try
                {
                    string temp = pageList.AsString();
                    pageInt = Convert.ToInt32(temp.GetRegexBegEnd("/", "页"));
                }
                catch { pageInt = 1; }
            }
            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "&page=" + i.ToString(), Encoding.Default);
                    }
                    catch { continue; } // a list page that cannot be fetched is skipped
                }
                parser = new Parser(new Lexer(html));
                NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("height", "32")), true), new TagNameFilter("table")));
                if (nodeList == null || nodeList.Count < 1)
                {
                    continue;
                }
                TableTag table = nodeList[0] as TableTag;
                if (table == null)
                {
                    continue;
                }
                for (int j = 0; j < table.RowCount - 1; j++)
                {
                    // Only rows at even indexes (0, 2, 4, ...) are read; the last row is never visited.
                    if ((j + 1) % 2 == 0)
                    {
                        continue;
                    }
                    TableRow tr = table.Rows[j];
                    // Cells 1 and 2 are read below; a shorter row is skipped instead of aborting the
                    // whole crawl with an index exception.
                    if (tr == null || tr.ColumnCount < 3)
                    {
                        continue;
                    }
                    string infoType    = "通知公告";
                    string infoScorce  = string.Empty; // this crawler never fills in a source
                    string releaseTime = tr.Columns[2].ToPlainTextString().GetDateRegex();
                    string headName    = tr.Columns[1].ToNodePlainString();
                    string infoUrl     = "http://www.jianzhuxh.com/news/" + tr.Columns[1].GetATagValue("onclick").GetRegexBegEnd("'", "'");
                    string htldtl      = string.Empty;
                    try
                    {
                        htldtl = ToolHtml.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString();
                    }
                    catch { continue; } // a detail page that cannot be fetched is skipped
                    parser = new Parser(new Lexer(htldtl));
                    NodeList noList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "text18")));
                    if (noList == null || noList.Count < 1)
                    {
                        continue;
                    }
                    string ctxHtml = noList[0].ToHtml().Replace("<br/>", "\r\n").Replace("<BR/>", "");
                    string infoCtx = ctxHtml.ToCtxString().Replace(" ", "").Replace("&nbsp;", "").Replace("\t\t", "\t").Replace("\t\t", "\t");
                    // Strip any remaining tags, then collapse runs of blank lines.
                    infoCtx = Regex.Replace(infoCtx, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase).Replace(" ", "").Replace("\t", "").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");
                    string     msgType = MsgTypeCosnt.ShenZhenJZYMsgType;
                    NotifyInfo info    = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "深圳市工程", string.Empty, infoCtx, infoType);
                    sqlCount++;
                    // NOTE(review): the limit is tested before the save, so the MaxCount-th notice is
                    // built but never saved - confirm this is intended.
                    if (!crawlAll && sqlCount >= this.MaxCount)
                    {
                        return(null);
                    }
                    if (!ToolDb.SaveEntity(info, this.ExistCompareFields))
                    {
                        continue;
                    }

                    // Images embedded in the notice body (.gif files are ignored).
                    parser = new Parser(new Lexer(ctxHtml));
                    NodeList imgList = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
                    if (imgList != null && imgList.Count > 0)
                    {
                        for (int m = 0; m < imgList.Count; m++)
                        {
                            ImageTag img = imgList[m] as ImageTag;
                            string   src = img == null ? null : img.GetAttribute("src");
                            // Ordinal, case-insensitive test so the result does not depend on the current culture.
                            if (string.IsNullOrEmpty(src) || src.IndexOf(".gif", StringComparison.OrdinalIgnoreCase) >= 0)
                            {
                                continue;
                            }
                            try
                            {
                                BaseAttach obj = null;
                                if (src.Contains("http"))
                                {
                                    obj = ToolHtml.GetBaseAttach(src, headName, info.Id);
                                }
                                else
                                {
                                    obj = ToolHtml.GetBaseAttach("http://www.jianzhuxh.com" + src.Replace("../", "/").Replace("./", "/"), headName, info.Id);
                                }
                                if (obj != null)
                                {
                                    ToolDb.SaveEntity(obj, string.Empty);
                                }
                            }
                            catch { } // best effort: one failed attachment must not stop the crawl
                        }
                    }

                    // Links in the notice body that IsAtagAttach() recognises as attachments.
                    parser = new Parser(new Lexer(ctxHtml));
                    NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                    if (aNode != null && aNode.Count > 0)
                    {
                        for (int a = 0; a < aNode.Count; a++)
                        {
                            ATag aTag = aNode[a] as ATag;
                            if (aTag == null || !aTag.IsAtagAttach())
                            {
                                continue;
                            }
                            try
                            {
                                BaseAttach obj  = null;
                                string     href = aTag.GetATagHref();
                                if (href.Contains("http"))
                                {
                                    obj = ToolHtml.GetBaseAttach(href, aTag.LinkText, info.Id);
                                }
                                else
                                {
                                    obj = ToolHtml.GetBaseAttach("http://www.jianzhuxh.com" + href.Replace("../", "/").Replace("./", "/"), aTag.LinkText, info.Id);
                                }
                                if (obj != null)
                                {
                                    ToolDb.SaveEntity(obj, string.Empty);
                                }
                            }
                            catch { } // best effort: one failed attachment must not stop the crawl
                        }
                    }
                }
            }
            return(null);
        }
Exemple #9
0
        /// <summary>
        /// Walks every page of the warning list at <c>SiteUrl</c> and turns each data row of the
        /// table with id <c>bean</c> into a <c>CorpWarning</c>.
        /// </summary>
        /// <param name="crawlAll">When false, the crawl returns as soon as <c>MaxCount</c> warnings have been collected.</param>
        /// <returns>The warnings collected so far; an empty list when the first page cannot be fetched.</returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  warnings      = new List<CorpWarning>();
            int    rowsSinceNap  = 1;
            int    lastPage      = 1;
            string pageHtml      = string.Empty;

            try
            {
                pageHtml = ToolWeb.GetHtmlByUrl(SiteUrl, Encoding.Default);
            }
            catch
            {
                return(warnings);
            }

            // The link with id "lx" carries the page count in its href, between "page=" and "&".
            Parser   parser    = new Parser(new Lexer(pageHtml));
            NodeList pagerLink = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("id", "lx")));
            if (pagerLink != null && pagerLink.Count > 0)
            {
                try
                {
                    lastPage = int.Parse(pagerLink.GetATagHref().GetRegexBegEnd("page=", "&"));
                }
                catch
                {
                    // keep the single-page default when the count cannot be read
                }
            }

            for (int page = 1; page <= lastPage; page++)
            {
                // Page 1 is already loaded; later pages are fetched here and skipped on failure.
                if (page != 1)
                {
                    try
                    {
                        pageHtml = ToolWeb.GetHtmlByUrl(this.SiteUrl + "&page=" + page.ToString(), Encoding.Default);
                    }
                    catch
                    {
                        continue;
                    }
                }

                parser = new Parser(new Lexer(pageHtml));
                NodeList tables = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "bean")));
                if (tables == null || tables.Count <= 0)
                {
                    continue;
                }

                TableTag grid = tables[0] as TableTag;
                // Row 0 is not read; data rows start at index 1.
                for (int row = 1; row < grid.RowCount; row++)
                {
                    TableRow cells = grid.Rows[row];

                    // The first image in cell 1 decides the colour flag: "1" when its src contains "red", otherwise "0".
                    ImageTag lamp    = cells.Columns[1].SearchFor(typeof(ImageTag), true)[0] as ImageTag;
                    string   lampSrc = lamp.GetAttribute("src").ToLower();
                    string   color;
                    if (lampSrc.Contains("red"))
                    {
                        color = "1";
                    }
                    else
                    {
                        color = "0";
                    }

                    string warningName = cells.Columns[2].ToNodePlainString();
                    string prjName     = cells.Columns[3].ToNodePlainString();
                    string warningType = "直接红黄色警示";
                    string msgType     = "深圳市住房和建设局";

                    // Fields this list does not provide are passed as empty strings.
                    CorpWarning warning = ToolDb.GenCorpWarning(
                        "广东省", "深圳市区", "",
                        string.Empty,   // code
                        warningName,
                        string.Empty,   // deliveryDate
                        warningType,
                        string.Empty,   // punishmentType
                        string.Empty,   // prjNumber
                        string.Empty,   // totalScore
                        string.Empty,   // resultScore
                        string.Empty,   // corpType
                        string.Empty,   // publicEndDate
                        string.Empty,   // warningEndDate
                        prjName,
                        string.Empty,   // badInfo
                        msgType,
                        color);
                    warnings.Add(warning);
                    if (!crawlAll && warnings.Count >= this.MaxCount)
                    {
                        return(warnings);
                    }

                    // Pause for 480000 ms each time the row counter reaches 200, then restart it at 1.
                    rowsSinceNap++;
                    if (rowsSinceNap >= 200)
                    {
                        rowsSinceNap = 1;
                        Thread.Sleep(480000);
                    }
                }
            }
            return(warnings);
        }
Exemple #10
0
        /// <summary>
        /// Crawls the list at <c>SiteUrl</c>: every <c>li</c> under <c>div.secondrightlistbox ul</c> is
        /// treated as one notice. Its detail page is fetched, the <c>div.content2</c> block becomes the
        /// notice body, and the images referenced by that block are stored as attachments once the
        /// notice itself has been saved.
        /// </summary>
        /// <param name="crawlAll">When false, the crawl stops once <c>MaxCount</c> notices have been processed.</param>
        /// <returns>Always <c>null</c>; notices are written through <c>ToolDb.SaveEntity</c> instead of being returned.</returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            // Page count and number of notices processed so far.
            int    pageInt  = 1;
            int    sqlCount = 0;
            string html     = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8);
            }
            catch (Exception)
            {
                // Without the first list page there is nothing to crawl.
                return(null);
            }
            Parser   parser   = new Parser(new Lexer(html));
            NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "page")));

            // The page count is the text between "HTML" and the next "," with "(" removed;
            // fall back to a single page when it cannot be parsed.
            if (pageList != null && pageList.Count > 0)
            {
                try
                {
                    string temp = pageList.AsString().GetRegexBegEnd("HTML", ",");
                    pageInt = int.Parse(temp.Replace("(", ""));
                }
                catch { pageInt = 1; }
            }
            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    // TODO: fetching of follow-up list pages was never implemented here. Stop after the
                    // first page instead of re-parsing the same HTML (and re-requesting every detail
                    // page) once per reported page.
                    break;
                }
                parser = new Parser(new Lexer(html));

                NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "secondrightlistbox")), true), new TagNameFilter("ul")), true), new TagNameFilter("li")));

                if (nodeList == null || nodeList.Count < 1)
                {
                    continue;
                }
                for (int j = 0; j < nodeList.Count; j++)
                {
                    // Once the limit is reached nothing more is saved, so stop before requesting another detail page.
                    if (!crawlAll && sqlCount >= this.MaxCount)
                    {
                        return(null);
                    }

                    string infoScorce  = string.Empty; // this crawler never fills in a source
                    string infoType    = "办事指南";
                    string headName    = nodeList[j].ToNodePlainString().Replace("[", "").Replace("]", "");
                    string releaseTime = nodeList[j].ToPlainTextString().GetDateRegex();
                    // string.Replace throws when the text to replace is null or empty, so the date is
                    // only stripped from the title when one was actually found.
                    if (!string.IsNullOrEmpty(releaseTime))
                    {
                        headName = headName.Replace(releaseTime, "");
                    }
                    string relativeHref = nodeList[j].GetATagHref().Replace("../", "").Replace("./", "");
                    string infoUrl      = "http://www.fsggzy.cn/" + relativeHref;
                    string htldtl       = string.Empty;
                    try
                    {
                        htldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.UTF8).GetJsString();
                    }
                    catch { continue; } // a detail page that cannot be fetched is skipped
                    parser = new Parser(new Lexer(htldtl));
                    NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "content2")));
                    if (dtlList == null || dtlList.Count < 1)
                    {
                        // No content block at the site root: retry the same link under /gcjy/gc_jyzn/.
                        infoUrl = "http://www.fsggzy.cn/gcjy/gc_jyzn/" + relativeHref;
                        try
                        {
                            htldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.UTF8).GetJsString();
                        }
                        catch { continue; }
                        parser  = new Parser(new Lexer(htldtl));
                        dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "content2")));
                    }
                    if (dtlList == null || dtlList.Count < 1)
                    {
                        continue;
                    }

                    string ctxHtml = dtlList.AsHtml();

                    // Collect the absolute address of every image in the body. Relative addresses are
                    // resolved against the directory of the detail page (infoUrl up to its last "/").
                    List<string> imgUrlLen = new List<string>();
                    string       pageDir   = infoUrl.Substring(0, infoUrl.LastIndexOf('/') + 1);
                    parser = new Parser(new Lexer(ctxHtml));
                    NodeList imgList = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
                    if (imgList != null && imgList.Count > 0)
                    {
                        for (int d = 0; d < imgList.Count; d++)
                        {
                            ImageTag img = imgList[d] as ImageTag;
                            string   src = img == null ? null : img.GetAttribute("src");
                            if (string.IsNullOrEmpty(src))
                            {
                                continue; // an <img> without a src has nothing to download
                            }
                            string url = src.Replace("../", "").Replace("./", "");
                            // Both http and https addresses are already absolute.
                            bool isAbsolute = url.IndexOf("http:", StringComparison.OrdinalIgnoreCase) >= 0 ||
                                              url.IndexOf("https:", StringComparison.OrdinalIgnoreCase) >= 0;
                            imgUrlLen.Add(isAbsolute ? url : pageDir + url);
                        }
                    }

                    string     infoCtx = dtlList.AsString().ToCtxString();
                    string     msgType = MsgTypeCosnt.FouShanMsgType;
                    NotifyInfo info    = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "佛山市区", string.Empty, infoCtx, infoType);
                    sqlCount++;
                    // Attach the images only after the notice itself has been saved.
                    // NOTE(review): assumes SaveEntity returns true on success, as the sibling
                    // crawlers in this file do - confirm against ToolDb.
                    if (ToolDb.SaveEntity(info, this.ExistCompareFields))
                    {
                        for (int c = 0; c < imgUrlLen.Count; c++)
                        {
                            try
                            {
                                BaseAttach obj = ToolHtml.GetBaseAttach(imgUrlLen[c], headName, info.Id);
                                if (obj != null)
                                {
                                    ToolDb.SaveEntity(obj, string.Empty);
                                }
                            }
                            catch { } // best effort: one failed image must not stop the crawl
                        }
                    }
                }
            }
            return(null);
        }
        /// <summary>
        /// 从网页版微博中获取微博信息
        /// </summary>
        /// <param name="fansList">保存爬得的粉丝数组</param>
        public void GetInfoFromHtml(List <Fan> fansList)
        {
            Lexer  lexer  = new Lexer(currentHtmlContent);
            Parser parser = new Parser(lexer);
            //获取包含每条微博的div标记列表
            NodeList fansNodeList = parser.Parse(fanFilter);

            for (int i = 0; i < fansNodeList.Size(); i++)
            {
                Fan fan = new Fan();
                //获取包含一个粉丝的<li>标记
                Bullet fanBullet = (Bullet)fansNodeList[i];

                #region 获取该粉丝头像
                NodeList fanPortraitNodeList = fanBullet.Children.ExtractAllNodesThatMatch(portraitFilter, true);
                if (fanPortraitNodeList.Size() == 1)
                {
                    Div      fanPortraitDiv = (Div)fanPortraitNodeList[0];
                    NodeList imgNodeList    = fanPortraitDiv.Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ImageTag)), true);
                    if (imgNodeList.Size() == 1)
                    {
                        ImageTag imgNode = (ImageTag)imgNodeList[0];
                        if (imgNode.Attributes.ContainsKey("SRC") && imgNode.Attributes.ContainsKey("ALT"))
                        {
                            string imgUrl  = imgNode.GetAttribute("SRC");
                            string imgName = imgNode.GetAttribute("ALT");
                            fan.Name = imgName;
                            WebClient wc = new WebClient();//使用WebClient是因为下载用户头像不用登录cookie
                            wc.DownloadFileAsync(new Uri(imgUrl), @"portrait\" + imgName + ".jpg");
                            wc.DownloadFileCompleted += wc_DownloadFileCompleted;
                        }
                        else
                        {
                            Console.WriteLine("第" + i + "个粉丝中,<img>标记缺少必要的属性!");
                        }
                    }
                    else
                    {
                        Console.WriteLine("第" + i + "个粉丝中,获取img标记出错!");
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "个粉丝中,获取粉丝头像的标准出错!");
                }
                #endregion

                #region 获取该粉丝的关注数/粉丝数/微博数
                NodeList fanConnectNodeList = fanBullet.Children.ExtractAllNodesThatMatch(fanConnectFilter, true);
                if (fanConnectNodeList.Size() == 1)
                {
                    NodeList ATagList = fanConnectNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)), true);
                    if (ATagList.Size() == 3)
                    {
                        for (int j = 0; j < 3; j++)
                        {
                            ATag aTag = (ATag)ATagList[j];
                            switch (j)
                            {
                            case 0:
                                if (aTag.Attributes.ContainsKey("HREF") && aTag.GetAttribute("HREF").Contains("follow"))
                                {
                                    fan.FollowCount = Int32.Parse(aTag.StringText);
                                }
                                else
                                {
                                    Console.WriteLine("第" + i + "个粉丝中,获取粉丝的关注数出错!");
                                }
                                break;

                            case 1:
                                if (aTag.Attributes.ContainsKey("HREF") && aTag.GetAttribute("HREF").Contains("fans"))
                                {
                                    fan.FansCount = Int32.Parse(aTag.StringText);
                                }
                                else
                                {
                                    Console.WriteLine("第" + i + "个粉丝中,获取粉丝的粉丝数出错!");
                                }
                                break;

                            default:
                                fan.FeedsCount = Int32.Parse(aTag.StringText);
                                break;
                            }
                        }
                    }
                    else
                    {
                        Console.WriteLine("第" + i + "个粉丝中,获取粉丝关注数/粉丝数/微博数的数量出错!");
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "个粉丝中,获取粉丝关注数/粉丝数/微博数的标准出错!");
                }
                #endregion

                #region 获取该粉丝的简介信息
                NodeList fanInfoNodeList = fanBullet.Children.ExtractAllNodesThatMatch(fanInfoFilter, true);
                if (fanInfoNodeList.Size() == 1)
                {
                    //Console.WriteLine(fanInfoNodeList[0].Parent.ToHtml());
                    Div    fanInfoDiv = (Div)fanInfoNodeList[0];
                    string intro      = fanInfoDiv.StringText;
                    if (intro.Substring(0, 2).Equals("简介"))
                    {
                        fan.Introduction = intro.Substring(3, intro.Length - 3).Replace("\n", " ").Replace("\t", " ");
                    }
                }
                else
                {
                    if (fanInfoNodeList.Size() == 0)
                    {
                        fan.Introduction = "";
                    }
                    else
                    {
                        Console.WriteLine("第" + i + "个粉丝中,获取粉丝简介的标准出错!");
                    }
                }
                #endregion

                #region 获取该粉丝的UserID、地点和性别信息;校验该粉丝的用户名信息
                NodeList fanLocationNodeList = fanBullet.Children.ExtractAllNodesThatMatch(fanNameFilter, true);
                if (fanLocationNodeList.Size() == 1)
                {
                    //获取粉丝的UserID信息;校验该粉丝的用户名信息
                    NodeList aTagNodeList = fanLocationNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)), true);
                    if (aTagNodeList.Size() >= 1)
                    {
                        ATag nameNode = (ATag)aTagNodeList[0];
                        if (nameNode.Attributes.ContainsKey("USERCARD") && nameNode.Attributes.ContainsKey("HREF"))
                        {
                            //获取粉丝的UserID信息
                            string uidStr = nameNode.GetAttribute("USERCARD");
                            if (uidStr.Substring(0, 3).Equals("id="))
                            {
                                fan.UserID = uidStr.Substring(3, uidStr.Length - 3);
                            }

                            //获取粉丝的微博链接
                            string linkUrl = nameNode.GetAttribute("HREF");
                            fan.LinkURL = "http://www.weibo.com" + linkUrl;
                        }
                        else
                        {
                            Console.WriteLine("第" + i + "个粉丝中,包含用户id和链接的<a>标记中缺少必要的属性!");
                        }
                        //校验该粉丝的用户名信息
                        if (!nameNode.StringText.Equals(fan.Name))
                        {
                            Console.WriteLine("第" + i + "个粉丝中,用户名与用户头像文字描述不一致!");
                        }
                    }

                    //获取粉丝的性别和地点信息
                    NodeList locationNodeList = fanLocationNodeList[0].Children.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "addr"), true);
                    if (locationNodeList.Size() == 1)
                    {
                        string locationStr = "";
                        for (int j = 0; j < locationNodeList[0].Children.Size(); j++)
                        {
                            INode node = locationNodeList[0].Children[j];
                            if (node.GetType().Equals(typeof(TextNode)))
                            {
                                TextNode tNode = (TextNode)node;
                                locationStr += tNode.ToPlainTextString();
                            }
                            if (node.GetType().Equals(typeof(TagNode)))
                            {
                                TagNode tNode = (TagNode)node;
                                if (tNode.Attributes.ContainsKey("CLASS"))
                                {
                                    if (tNode.GetAttribute("CLASS").Contains("female"))//必须先female,因为female中也含有male,如果male在前,则所有用户均符合该条件了= =
                                    {
                                        fan.Gender = "female";
                                    }
                                    else
                                    {
                                        if (tNode.GetAttribute("CLASS").Contains("male"))
                                        {
                                            fan.Gender = "male";
                                        }
                                        else
                                        {
                                            fan.Gender = "unknown";
                                            Console.WriteLine("第" + i + "个粉丝性别不明!");
                                        }
                                    }
                                }
                            }
                        }
                        fan.Location = locationStr.Trim();
                    }
                    else
                    {
                        Console.WriteLine("第" + i + "个粉丝中,获取粉丝地点的标准出错!");
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "个粉丝中,获取该粉丝的UserID、地点和性别信息的标准出错!");
                }
                #endregion

                #region 获取该粉丝关注用户的方式
                NodeList followMethodNodeList = fanBullet.Children.ExtractAllNodesThatMatch(followMethodFilter, true);
                if (followMethodNodeList.Size() == 1)
                {
                    NodeList methodNodeList = followMethodNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)));
                    if (methodNodeList.Size() == 1)
                    {
                        ATag methodNode = (ATag)methodNodeList[0];
                        fan.FollowMethod = methodNode.StringText.Trim();
                    }
                    else
                    {
                        Console.WriteLine("第" + i + "个粉丝中,获取该粉丝关注用户的方式的数量出错!");
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "个粉丝中,获取该粉丝关注用户的方式的标准出错!");
                }
                #endregion

                fansList.Add(fan);
            }
        }
Exemple #12
0
        /// <summary>
        /// Crawls the paginated listing at <c>SiteUrl</c>, follows every list entry to its
        /// detail page, builds a <c>BidInfo</c> from the detail content and persists it with
        /// <c>ToolDb.SaveEntity</c>. Images found in a newly saved entry are stored as attachments.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> entries have been processed.
        /// </param>
        /// <returns>Always <c>null</c>; results are written to storage instead of being returned.</returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            const string doubledIndentedBreak = "\r\n\t\r\n\t";
            const string blankLineBeforeIndent = "\r\n\r\n\t";
            const string doubledBreak = "\r\n\r\n";
            const string indentedBreak = "\r\n\t";
            const string singleBreak = "\r\n";

            int    sqlCount  = 0;
            string htl       = string.Empty;
            string cookiestr = string.Empty;
            int    page      = 1;

            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8, ref cookiestr);
            }
            catch
            {
                // Without the first listing page there is nothing to crawl.
                return null;
            }

            Parser   parser   = new Parser(new Lexer(htl));
            NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pagination")), true), new TagNameFilter("a")));

            if (nodeList != null && nodeList.Count > 0)
            {
                try
                {
                    // The last pagination link carries the page count after its "=" sign.
                    string temp      = nodeList[nodeList.Count - 1].GetATagHref();
                    string pageCount = temp.Replace(temp.Remove(temp.IndexOf("=")), "").Replace("=", "");
                    page = int.Parse(pageCount);
                }
                catch
                {
                    // Best effort: if the page count cannot be read, only the first page is crawled.
                }
            }

            for (int i = 1; i <= page; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "?page=" + i.ToString(), Encoding.UTF8, ref cookiestr);
                    }
                    catch
                    {
                        // A page that fails to download is skipped; the remaining pages are still tried.
                        continue;
                    }
                }

                parser = new Parser(new Lexer(htl));
                NodeList liNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "list")), true), new TagNameFilter("li")));
                if (liNode == null || liNode.Count < 1)
                {
                    continue;
                }

                for (int j = 0; j < liNode.Count; j++)
                {
                    string prjName = string.Empty,
                           buildUnit = string.Empty, bidUnit = string.Empty,
                           bidMoney = string.Empty, code = string.Empty,
                           beginDate = string.Empty,
                           endDate = string.Empty, bidType = string.Empty,
                           specType = string.Empty, InfoUrl = string.Empty,
                           msgType = string.Empty, bidCtx = string.Empty,
                           prjAddress = string.Empty,
                           prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

                    beginDate = liNode[j].ToPlainTextString().GetDateRegex("yyyy/MM/dd");
                    prjName   = liNode[j].ToPlainTextString().Replace(beginDate, "").ToNodeString().Replace(" ", "").Replace("·", "");

                    ATag aTag = liNode[j].GetATag();
                    if (aTag == null)
                    {
                        // An entry without a link has no detail page; skip it rather than
                        // aborting the whole crawl with a NullReferenceException.
                        continue;
                    }
                    InfoUrl = "http://www.yjggzy.cn" + aTag.Link;

                    string htmldetail = string.Empty;
                    try
                    {
                        htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).Replace("&nbsp;", "");
                    }
                    catch
                    {
                        continue;
                    }

                    // The detail body lives in <dl class="acticlecontent">, or failing that in <div id="nr">.
                    Parser   parserdetail = new Parser(new Lexer(htmldetail));
                    NodeList dtlNode      = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("dl"), new HasAttributeFilter("class", "acticlecontent")));
                    if (dtlNode == null || dtlNode.Count < 1)
                    {
                        parserdetail.Reset();
                        dtlNode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "nr")));
                    }
                    if (dtlNode == null || dtlNode.Count < 1)
                    {
                        continue;
                    }

                    HtmlTxt = dtlNode.ToHtml();

                    // Collapse runs of blank lines. The pass counts and their order match the
                    // original hand-written sequence of Replace calls exactly.
                    bidCtx = HtmlTxt.ToCtxString();
                    for (int pass = 0; pass < 5; pass++)
                    {
                        bidCtx = bidCtx.Replace(doubledIndentedBreak, indentedBreak);
                    }
                    for (int pass = 0; pass < 4; pass++)
                    {
                        bidCtx = bidCtx.Replace(blankLineBeforeIndent, indentedBreak);
                    }
                    for (int pass = 0; pass < 5; pass++)
                    {
                        bidCtx = bidCtx.Replace(doubledBreak, singleBreak);
                    }
                    bidCtx = bidCtx.Replace(doubledIndentedBreak, indentedBreak);
                    bidCtx = bidCtx.Replace(doubledBreak, singleBreak);
                    bidCtx = bidCtx.Replace(doubledIndentedBreak, indentedBreak);

                    parser = new Parser(new Lexer(HtmlTxt));
                    NodeList dtlNodeList = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
                    if (dtlNodeList != null && dtlNodeList.Count > 0)
                    {
                        // Flatten the first table into "label:value" lines for the regex helpers below.
                        string   ctx      = string.Empty;
                        TableTag tableTag = dtlNodeList[0] as TableTag;
                        if (tableTag != null)
                        {
                            foreach (TableRow row in tableTag.Rows)
                            {
                                int colIndex = 0;
                                foreach (TableColumn col in row.Columns)
                                {
                                    string colspan    = col.GetAttribute("colspan");
                                    bool   hasColspan = !string.IsNullOrEmpty(colspan);

                                    if (row.Columns.Length == 3)
                                    {
                                        // Three-cell rows: the first cell is never emitted, the second
                                        // becomes the label and the third the value.
                                        if (colIndex == 0 && colspan != "2")
                                        {
                                            colIndex++;
                                            continue;
                                        }
                                        else if (colspan == "2" && colIndex == 1)
                                        {
                                            ctx += col.ToNodePlainString() + ":";
                                        }
                                        else if (hasColspan && colIndex == 2)
                                        {
                                            ctx += col.ToNodePlainString() + "\r\n";
                                        }
                                        else if (!hasColspan && colIndex == 1)
                                        {
                                            ctx += col.ToNodePlainString() + ":";
                                        }
                                        else if (!hasColspan && colIndex == 2)
                                        {
                                            ctx += col.ToNodePlainString() + "\r\n";
                                        }
                                        colIndex++;
                                        continue;
                                    }

                                    if (row.Columns.Length == 2)
                                    {
                                        // Two-cell rows: label followed by value.
                                        if (colIndex == 0)
                                        {
                                            ctx += col.ToNodePlainString() + ":";
                                        }
                                        else if (colIndex == 1)
                                        {
                                            ctx += col.ToNodePlainString() + "\r\n";
                                        }
                                        colIndex++;
                                        continue;
                                    }

                                    // Any other row width: cell positions shift by one when the
                                    // leading cell spans two columns.
                                    if (colIndex == 0 && colspan != "2")
                                    {
                                        colIndex++;
                                        continue;
                                    }
                                    else if (colIndex == 1 && colspan != "2")
                                    {
                                        ctx += col.ToNodePlainString() + ":";
                                    }
                                    else if (colIndex == 2 && colspan != "2")
                                    {
                                        ctx += col.ToNodePlainString() + "\r\n";
                                    }
                                    else if (colspan == "2" && colIndex == 0)
                                    {
                                        ctx += col.ToNodePlainString() + ":";
                                    }
                                    else if (hasColspan && colIndex == 1)
                                    {
                                        ctx += col.ToNodePlainString() + "\r\n";
                                    }

                                    colIndex++;
                                }
                            }
                        }

                        buildUnit  = ctx.GetBuildRegex();
                        bidUnit    = ctx.GetBidRegex();
                        code       = ctx.GetCodeRegex();
                        prjAddress = ctx.GetAddressRegex();
                        prjMgr     = ctx.GetMgrRegex();
                        if (string.IsNullOrEmpty(prjMgr))
                        {
                            prjMgr = ctx.GetRegex("项目负责人姓名", true, 50);
                        }
                        bidMoney = ctx.GetMoneyRegex();
                    }
                    else
                    {
                        // No table: if the body contains an image, make the first image's
                        // source absolute inside the stored HTML.
                        parser = new Parser(new Lexer(HtmlTxt));
                        NodeList imgNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
                        if (imgNode != null && imgNode.Count > 0)
                        {
                            ImageTag img = imgNode[0] as ImageTag;
                            if (img != null)
                            {
                                string link = "http://www.yjggzy.cn" + img.GetAttribute("src");
                                HtmlTxt = HtmlTxt.GetReplace(img.GetAttribute("src"), link);
                            }
                        }
                    }

                    msgType  = "阳江市建设工程交易中心";
                    specType = "建设工程";
                    bidType  = ToolHtml.GetInviteTypes(prjName);
                    BidInfo info = ToolDb.GenBidInfo("广东省", "阳江市区", "", string.Empty, code, prjName, buildUnit, beginDate,
                                                     bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType,
                                                     bidMoney, InfoUrl, prjMgr, HtmlTxt);
                    sqlCount++;
                    if (ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate, this.ExistsHtlCtx))
                    {
                        // The entity was saved: store every image of the detail HTML as an attachment.
                        parser = new Parser(new Lexer(HtmlTxt));
                        NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
                        if (aNode != null && aNode.Count > 0)
                        {
                            for (int a = 0; a < aNode.Count; a++)
                            {
                                ImageTag img = aNode[a] as ImageTag;
                                try
                                {
                                    BaseAttach attach = ToolHtml.GetBaseAttach(img.GetAttribute("src"), prjName, info.Id, "SiteManage\\Files\\InviteAttach\\");
                                    if (attach != null)
                                    {
                                        ToolDb.SaveEntity(attach, "SourceID,AttachServerPath");
                                    }
                                }
                                catch
                                {
                                    // Best effort: a failed attachment must not stop the crawl.
                                }
                            }
                        }
                    }
                    if (!crawlAll && sqlCount >= this.MaxCount)
                    {
                        return null;
                    }
                }
            }
            return null;
        }
Exemple #13
0
        /// <summary>
        /// Crawls the paginated listing at <c>SiteUrl</c>, follows every list entry to its
        /// detail page and builds a <c>BidInfo</c> from the detail content.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> entries have been collected.
        /// </param>
        /// <returns>
        /// The collected <c>BidInfo</c> items; an empty list when the first listing page cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list    = new List <BidInfo>();
            int    pageInt = 1;
            string html    = string.Empty;

            // Strips script/style/xml fragments from detail pages; built once because the pattern never changes.
            Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
            }
            catch
            {
                // Without the first listing page there is nothing to crawl.
                return list;
            }

            Parser   parser = new Parser(new Lexer(html));
            NodeList sNode  = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "Normal")));

            if (sNode != null && sNode.Count > 0)
            {
                try
                {
                    // The pager text holds the page count between "/" and "跳".
                    string temp = sNode.AsString().GetRegexBegEnd("/", "跳");
                    pageInt = int.Parse(temp);
                }
                catch
                {
                    // Best effort: fall back to crawling the first page only.
                    pageInt = 1;
                }
            }

            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl("http://www.szgm.gov.cn/gmbsc/143049/143173/143181/ecdc3e5c-" + i.ToString() + ".html", Encoding.UTF8);
                    }
                    catch
                    {
                        // A page that fails to download is skipped; the remaining pages are still tried.
                        continue;
                    }
                }

                parser = new Parser(new Lexer(html));
                NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "hyxejc")), true), new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "100%"))));
                if (viewList == null || viewList.Count < 1)
                {
                    continue;
                }

                // The first and last matching tables are deliberately excluded
                // (presumably header and pager - TODO confirm against the live page).
                for (int j = 1; j < viewList.Count - 1; j++)
                {
                    TableTag listTable = viewList[j] as TableTag;
                    if (listTable == null || listTable.RowCount < 1)
                    {
                        // Not a usable table; skip it instead of failing the whole crawl.
                        continue;
                    }

                    TableRow tr   = listTable.Rows[0];
                    ATag     aTag = tr.GetATag();
                    if (aTag == null || tr.ColumnCount != 3)
                    {
                        continue;
                    }

                    string prjName = string.Empty,
                           buildUnit = string.Empty, bidUnit = string.Empty,
                           bidMoney = string.Empty, code = string.Empty,
                           beginDate = string.Empty,
                           endDate = string.Empty, bidType = string.Empty,
                           specType = string.Empty, InfoUrl = string.Empty,
                           msgType = string.Empty, bidCtx = string.Empty,
                           prjAddress = string.Empty,
                           prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

                    beginDate = tr.Columns[2].ToPlainTextString().GetDateRegex();
                    prjName   = aTag.GetAttribute("title");

                    InfoUrl = "http://www.szgm.gov.cn" + aTag.Link;
                    string htlDtl = string.Empty;
                    try
                    {
                        htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString();
                        htlDtl = regexHtml.Replace(htlDtl, "");
                    }
                    catch
                    {
                        continue;
                    }

                    parser = new Parser(new Lexer(htlDtl));
                    NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "page_con")));
                    if (dtl == null || dtl.Count < 1)
                    {
                        continue;
                    }

                    HtmlTxt = dtl.AsHtml();
                    bidCtx  = HtmlTxt.ToCtxString();

                    // A project name found in the detail text overrides the link title.
                    string tempName = bidCtx.GetRegex("工程名称");
                    if (!string.IsNullOrWhiteSpace(tempName))
                    {
                        prjName = tempName;
                    }
                    code       = bidCtx.GetCodeRegex().GetCodeDel();
                    buildUnit  = bidCtx.GetBuildRegex();
                    prjAddress = bidCtx.GetAddressRegex();

                    // Winning bidder: try progressively looser extraction patterns.
                    bidUnit = bidCtx.GetBidRegex();
                    if (string.IsNullOrWhiteSpace(bidUnit))
                    {
                        bidUnit = bidCtx.GetRegex("委托单位");
                    }
                    if (string.IsNullOrWhiteSpace(bidUnit))
                    {
                        bidUnit = bidCtx.GetRegexBegEnd("确认", "为中标单位");
                    }

                    // Bid amount: same fallback approach.
                    bidMoney = bidCtx.GetMoneyRegex();
                    if (string.IsNullOrWhiteSpace(bidMoney) || bidMoney == "0")
                    {
                        bidMoney = bidCtx.GetRegex("合同价").GetMoney();
                    }
                    if (string.IsNullOrWhiteSpace(bidMoney) || bidMoney == "0")
                    {
                        bidMoney = bidCtx.GetRegexBegEnd("人民币", "元").GetMoney();
                    }

                    if (string.IsNullOrWhiteSpace(bidUnit))
                    {
                        // Last resort: pair the header row of the first bordered table with
                        // its first data row and run the extractors on that text.
                        parser = new Parser(new Lexer(HtmlTxt));
                        NodeList tableNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("border", "1")));
                        if (tableNode != null && tableNode.Count > 0)
                        {
                            TableTag tableBid = tableNode[0] as TableTag;
                            if (tableBid != null && tableBid.RowCount > 1)
                            {
                                string ctx = string.Empty;
                                for (int c = 0; c < tableBid.Rows[0].ColumnCount; c++)
                                {
                                    try
                                    {
                                        ctx += tableBid.Rows[0].Columns[c].ToNodePlainString() + ":";
                                        ctx += tableBid.Rows[1].Columns[c].ToNodePlainString() + "\r\n";
                                    }
                                    catch
                                    {
                                        // The data row may have fewer cells than the header; ignore the gap.
                                    }
                                }

                                bidUnit = ctx.GetBidRegex();
                                if (string.IsNullOrEmpty(bidMoney) || bidMoney == "0")
                                {
                                    bidMoney = ctx.GetMoneyRegex(null, false, "万元");
                                }
                            }
                        }
                    }

                    // Amounts above 50000 are divided by 10000 (presumably yuan to 万元 - TODO confirm).
                    // TryParse leaves unparsable or missing values untouched without an exception.
                    decimal money;
                    if (decimal.TryParse(bidMoney, out money) && money > 50000)
                    {
                        bidMoney = (money / 10000).ToString();
                    }

                    // Make relative image sources absolute inside the stored HTML.
                    parser = new Parser(new Lexer(HtmlTxt));
                    NodeList imgNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
                    if (imgNode != null && imgNode.Count > 0)
                    {
                        // Each distinct source is rewritten once. Replacing the same source a
                        // second time would match inside the already-prefixed URL and duplicate the host.
                        HashSet<string> rewrittenSources = new HashSet<string>();
                        for (int k = 0; k < imgNode.Count; k++)
                        {
                            ImageTag image = imgNode[k] as ImageTag;
                            if (image == null)
                            {
                                continue;
                            }

                            string url = image.GetAttribute("src");
                            if (string.IsNullOrEmpty(url))
                            {
                                // string.Replace throws on a null or empty search value.
                                continue;
                            }
                            if (url.StartsWith("http", StringComparison.OrdinalIgnoreCase))
                            {
                                // Already absolute; prefixing the host would corrupt it.
                                continue;
                            }
                            if (!rewrittenSources.Add(url))
                            {
                                continue;
                            }

                            string saveUrl = "http://www.szgm.gov.cn" + url;
                            HtmlTxt = HtmlTxt.Replace(url, saveUrl);
                        }
                    }

                    msgType = "深圳市光明新区公明街道办事处";
                    if (string.IsNullOrEmpty(prjAddress))
                    {
                        prjAddress = "见招标信息";
                    }
                    specType = "政府采购";
                    bidType  = "小型工程";
                    if (string.IsNullOrEmpty(buildUnit))
                    {
                        buildUnit = "深圳市光明新区公明街道办事处";
                    }
                    BidInfo info = ToolDb.GenBidInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType,
                                                     bidMoney, InfoUrl, prjMgr, HtmlTxt);
                    list.Add(info);
                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return list;
                    }
                }
            }
            return list;
        }
Exemple #14
0
        /// <summary>
        /// Crawls the paged listing at <c>SiteUrl</c> (stored with info type "办事指南"),
        /// downloads the detail page of every listed entry and saves it through
        /// <c>ToolDb</c>, together with the images and attachment links found in the
        /// detail content.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, the crawl stops as soon as <c>MaxCount</c> entries have been
        /// handled; when <c>true</c>, every listing page is walked.
        /// </param>
        /// <returns>
        /// Always <c>null</c>; entries are persisted as a side effect rather than returned.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            // Prefix put in front of the hrefs taken from the listing and of attachment
            // links that do not contain "http".
            const string siteRoot = "http://www.gzzb.gd.cn";

            int    pageInt  = 1;
            int    sqlCount = 0;
            string html     = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default).GetJsString();
            }
            catch (Exception)
            {
                // The first listing page is required; without it there is nothing to crawl.
                return null;
            }

            // Determine the page count from the "page=" value of the link that GetATag
            // yields for the last matched pager cell.
            Parser   parser   = new Parser(new Lexer(html));
            NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "font9green2")));

            if (pageList != null && pageList.Count > 0)
            {
                try
                {
                    string temp = pageList.GetATag(pageList.Count - 1).Link.Replace("&", "kdxx") + "kdxx";
                    temp    = temp.GetRegexBegEnd("page=", "kdxx").Replace("&amp;", "");
                    pageInt = Convert.ToInt32(temp);
                }
                catch
                {
                    // Pager missing or not numeric: fall back to a single page.
                    pageInt = 1;
                }
            }

            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "&page=" + i.ToString(), Encoding.Default).GetJsString();
                    }
                    catch
                    {
                        // Skip a listing page that cannot be downloaded.
                        continue;
                    }
                }

                parser = new Parser(new Lexer(html));
                NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "font9grey1")));
                if (nodeList == null || nodeList.Count == 0)
                {
                    continue;
                }

                TableTag table = nodeList[0] as TableTag;
                if (table == null)
                {
                    // The matched node is not a table tag; nothing to read on this page.
                    continue;
                }

                for (int j = 0; j < table.RowCount; j++)
                {
                    TableRow tr = table.Rows[j];

                    // Columns 1 (title + link) and 2 (date) are read below; a shorter row
                    // would otherwise throw and abort the whole crawl.
                    if (tr.ColumnCount < 3)
                    {
                        continue;
                    }

                    string infoType    = "办事指南";
                    string infoSource  = string.Empty;
                    string headName    = tr.Columns[1].ToNodePlainString();
                    string releaseTime = tr.Columns[2].ToPlainTextString().GetDateRegex();
                    string infoUrl     = siteRoot + tr.Columns[1].GetATagHref();

                    string htldtl = string.Empty;
                    try
                    {
                        htldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString();
                    }
                    catch
                    {
                        // Skip an entry whose detail page cannot be downloaded.
                        continue;
                    }

                    parser = new Parser(new Lexer(htldtl));
                    NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "contentDiv")));
                    if (dtlList == null || dtlList.Count == 0)
                    {
                        continue;
                    }

                    string ctxHtml = dtlList.AsHtml();
                    string infoCtx = ctxHtml.ToCtxString();
                    string msgType = MsgTypeCosnt.GuangZhouMsgType;
                    NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoSource, msgType, infoUrl, ctxHtml, "广东省", "广州市区", string.Empty, infoCtx, infoType);
                    sqlCount++;

                    if (ToolDb.SaveEntity(info, this.ExistCompareFields))
                    {
                        // Store the images embedded in the detail content.
                        parser = new Parser(new Lexer(ctxHtml));
                        NodeList imgList = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
                        if (imgList != null && imgList.Count > 0)
                        {
                            for (int img = 0; img < imgList.Count; img++)
                            {
                                ImageTag imgTag = imgList[img] as ImageTag;
                                try
                                {
                                    string     src       = imgTag.GetAttribute("src");
                                    string     attachUrl = src.Contains("http") ? src : siteRoot + src;
                                    BaseAttach obj       = ToolHtml.GetBaseAttach(attachUrl, headName, info.Id);
                                    if (obj != null)
                                    {
                                        ToolDb.SaveEntity(obj, string.Empty);
                                    }
                                }
                                catch
                                {
                                    // Best effort: one broken image must not stop the crawl.
                                }
                            }
                        }

                        // Store the links that IsAtagAttach reports as attachments.
                        parser = new Parser(new Lexer(ctxHtml));
                        NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                        if (aNode != null && aNode.Count > 0)
                        {
                            for (int a = 0; a < aNode.Count; a++)
                            {
                                ATag aTag = aNode[a] as ATag;
                                if (aTag == null || !aTag.IsAtagAttach())
                                {
                                    continue;
                                }
                                try
                                {
                                    string     link      = aTag.Link;
                                    string     attachUrl = link.Contains("http") ? link : siteRoot + link;
                                    BaseAttach obj       = ToolHtml.GetBaseAttach(attachUrl, aTag.LinkText, info.Id);
                                    if (obj != null)
                                    {
                                        ToolDb.SaveEntity(obj, string.Empty);
                                    }
                                }
                                catch
                                {
                                    // Best effort: one broken attachment must not stop the crawl.
                                }
                            }
                        }
                    }

                    // The limit is checked only after the entry has been stored, so the
                    // entry that reaches MaxCount is saved instead of being discarded.
                    if (!crawlAll && sqlCount >= this.MaxCount)
                    {
                        return null;
                    }
                }
            }
            return null;
        }
Exemple #15
0
        /// <summary>
        /// Crawls the paged listing at <c>SiteUrl</c> (stored with info type "政策法规"),
        /// downloads the detail page of every listed entry and saves it through
        /// <c>ToolDb</c>, together with the images and attachment links found in the
        /// detail content.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, the crawl stops as soon as <c>MaxCount</c> entries have been
        /// handled; when <c>true</c>, every listing page is walked.
        /// </param>
        /// <returns>
        /// Always <c>null</c>; entries are persisted as a side effect rather than returned.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            // Folder that holds the follow-up listing pages and the detail pages.
            const string listRoot = "http://www.fsggzy.cn/gcjy/gc_zcfg/";

            int    pageInt  = 1;
            int    sqlCount = 0;
            string html     = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
            }
            catch (Exception)
            {
                // The first listing page is required; without it there is nothing to crawl.
                return null;
            }

            // Determine the page count from the text between "HTML" and "," in the pager block.
            Parser   parser   = new Parser(new Lexer(html));
            NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "page")));

            if (pageList != null && pageList.Count > 0)
            {
                try
                {
                    string temp = pageList.AsString().GetRegexBegEnd("HTML", ",").Replace("(", "");
                    pageInt = Convert.ToInt32(temp);
                }
                catch
                {
                    // Pager missing or not numeric: fall back to a single page.
                    pageInt = 1;
                }
            }

            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        // Follow-up pages are addressed as index_1.html, index_2.html, ...
                        html = this.ToolWebSite.GetHtmlByUrl(listRoot + "index_" + (i - 1).ToString() + ".html", Encoding.UTF8).GetJsString();
                    }
                    catch
                    {
                        // Skip a listing page that cannot be downloaded.
                        continue;
                    }
                }

                parser = new Parser(new Lexer(html));
                NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "secondrightlistbox")), true), new TagNameFilter("table")));
                if (nodeList == null || nodeList.Count == 0)
                {
                    continue;
                }

                TableTag table = nodeList[0] as TableTag;
                if (table == null)
                {
                    // The matched node is not a table tag; nothing to read on this page.
                    continue;
                }

                // Only odd-indexed rows are read; row 0 and every even row are skipped.
                for (int j = 1; j < table.RowCount; j += 2)
                {
                    TableRow tr = table.Rows[j];

                    // Columns 0 (title + link) and 1 (date) are read below; a shorter row
                    // would otherwise throw and abort the whole crawl.
                    if (tr.ColumnCount < 2)
                    {
                        continue;
                    }

                    string infoType    = "政策法规";
                    string infoSource  = string.Empty;
                    string headName    = tr.Columns[0].ToNodePlainString();
                    string releaseTime = tr.Columns[1].ToPlainTextString().GetDateRegex();
                    string infoUrl     = listRoot + tr.Columns[0].GetATagHref().Replace("../", "").Replace("./", "");

                    string htldtl = string.Empty;
                    try
                    {
                        htldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.UTF8).GetJsString();
                    }
                    catch
                    {
                        // Skip an entry whose detail page cannot be downloaded.
                        continue;
                    }

                    parser = new Parser(new Lexer(htldtl));
                    NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "content2")));
                    if (dtlList == null || dtlList.Count == 0)
                    {
                        continue;
                    }

                    string ctxHtml = dtlList.AsHtml();
                    string infoCtx = ctxHtml.ToCtxString();
                    // NOTE(review): the Guangzhou message type is used although the entry is
                    // stored for "佛山市区" - confirm this is intended.
                    string msgType = MsgTypeCosnt.GuangZhouMsgType;
                    NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoSource, msgType, infoUrl, ctxHtml, "广东省", "佛山市区", string.Empty, infoCtx, infoType);
                    sqlCount++;

                    if (ToolDb.SaveEntity(info, this.ExistCompareFields))
                    {
                        // Folder of the detail page, used to resolve relative links.
                        // infoUrl always starts with listRoot, so it always contains "/".
                        string baseUrl = infoUrl.Remove(infoUrl.LastIndexOf("/"));

                        // Store the images embedded in the detail content.
                        parser = new Parser(new Lexer(ctxHtml));
                        NodeList imgList = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
                        if (imgList != null && imgList.Count > 0)
                        {
                            for (int img = 0; img < imgList.Count; img++)
                            {
                                ImageTag imgTag = imgList[img] as ImageTag;
                                try
                                {
                                    string src       = imgTag.GetAttribute("src");
                                    string attachUrl = src.Contains("http")
                                        ? src
                                        : baseUrl + src.Replace("../", "/").Replace("./", "/");
                                    BaseAttach obj = ToolHtml.GetBaseAttach(attachUrl, headName, info.Id);
                                    if (obj != null)
                                    {
                                        ToolDb.SaveEntity(obj, string.Empty);
                                    }
                                }
                                catch
                                {
                                    // Best effort: one broken image must not stop the crawl.
                                }
                            }
                        }

                        // Store the links that IsAtagAttach reports as attachments.
                        parser = new Parser(new Lexer(ctxHtml));
                        NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                        if (aNode != null && aNode.Count > 0)
                        {
                            for (int a = 0; a < aNode.Count; a++)
                            {
                                ATag aTag = aNode[a] as ATag;
                                if (aTag == null || !aTag.IsAtagAttach())
                                {
                                    continue;
                                }
                                try
                                {
                                    string link      = aTag.Link;
                                    string attachUrl = link.Contains("http")
                                        ? link
                                        : baseUrl + link.Replace("../", "/").Replace("./", "/");
                                    BaseAttach obj = ToolHtml.GetBaseAttach(attachUrl, aTag.LinkText, info.Id);
                                    if (obj != null)
                                    {
                                        ToolDb.SaveEntity(obj, string.Empty);
                                    }
                                }
                                catch
                                {
                                    // Best effort: one broken attachment must not stop the crawl.
                                }
                            }
                        }
                    }

                    // The limit is checked only after the entry has been stored, so the
                    // entry that reaches MaxCount is saved instead of being discarded.
                    if (!crawlAll && sqlCount >= this.MaxCount)
                    {
                        return null;
                    }
                }
            }
            return null;
        }
Exemple #16
0
        /// <summary>
        /// Crawls the form-paged listing at <c>SiteUrl</c> (stored with info type
        /// "政策法规"), downloads the detail page of every listed entry and saves it through
        /// <c>ToolDb</c>, together with the images and attachment links found in the
        /// detail content.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, the crawl stops once <c>MaxCount</c> entries have been
        /// handled; when <c>true</c>, every listing page is walked.
        /// </param>
        /// <returns>
        /// Always <c>null</c>; entries are persisted as a side effect rather than returned.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            int    pageInt  = 1;
            int    sqlCount = 0;
            string html     = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
            }
            catch (Exception)
            {
                // The first listing page is required; without it there is nothing to crawl.
                return null;
            }

            // Determine the page count from the text between "(" and ")" of the last pager link.
            Parser   parser   = new Parser(new Lexer(html));
            NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "scott")), true), new TagNameFilter("a")));

            if (pageList != null && pageList.Count > 0)
            {
                try
                {
                    string temp = pageList[pageList.Count - 1].GetATagValue().Replace("(", "kdxx").Replace(")", "xxdk").GetRegexBegEnd("kdxx", "xxdk");
                    pageInt = Convert.ToInt32(temp);
                }
                catch
                {
                    // Pager missing or not numeric: fall back to a single page.
                    pageInt = 1;
                }
            }

            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        // Follow-up pages are requested by posting back the form values of
                        // the previously loaded page plus the wanted page number.
                        string typeId           = ToolHtml.GetHtmlInputValue(html, "typeId");
                        string boardId          = ToolHtml.GetHtmlInputValue(html, "boardId");
                        string totalRows        = ToolHtml.GetHtmlInputValue(html, "totalRows");
                        NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] {
                            "typeId", "boardId", "totalRows", "pageNO"
                        }, new string[] {
                            typeId, boardId, totalRows, i.ToString()
                        });
                        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.Default);
                    }
                    catch
                    {
                        // Skip a listing page that cannot be downloaded.
                        continue;
                    }
                }

                parser = new Parser(new Lexer(html));
                NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "lefttable")));
                if (nodeList == null || nodeList.Count == 0)
                {
                    continue;
                }

                TableTag table = nodeList[0] as TableTag;
                if (table == null)
                {
                    // The matched node is not a table tag; nothing to read on this page.
                    continue;
                }

                // The first and the last row of the table are not read.
                for (int j = 1; j < table.RowCount - 1; j++)
                {
                    // Stop before downloading another detail page once the limit is reached;
                    // the set of saved entries is the same as with a later check.
                    if (!crawlAll && sqlCount >= this.MaxCount)
                    {
                        return null;
                    }

                    TableRow tr = table.Rows[j];

                    // Columns 1 (title + link) and 2 (date) are read below; a shorter row
                    // would otherwise throw and abort the whole crawl.
                    if (tr.ColumnCount < 3)
                    {
                        continue;
                    }

                    string infoType    = "政策法规";
                    string infoSource  = string.Empty;
                    string headName    = tr.Columns[1].ToNodePlainString();
                    string releaseTime = tr.Columns[2].ToPlainTextString().GetDateRegex();
                    // The href is used as-is - presumably already absolute on this site; TODO confirm.
                    string infoUrl     = tr.Columns[1].GetATagHref();

                    string htldtl = string.Empty;
                    try
                    {
                        htldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString();
                    }
                    catch
                    {
                        // Skip an entry whose detail page cannot be downloaded.
                        continue;
                    }

                    parser = new Parser(new Lexer(htldtl));
                    NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "context_div")));
                    if (dtlList == null || dtlList.Count == 0)
                    {
                        continue;
                    }

                    string ctxHtml = dtlList.AsHtml();
                    string infoCtx = ctxHtml.ToCtxString();
                    string msgType = MsgTypeCosnt.ZhongShanMsgType;
                    NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoSource, msgType, infoUrl, ctxHtml, "广东省", "中山市区", string.Empty, infoCtx, infoType);
                    sqlCount++;

                    if (!ToolDb.SaveEntity(info, this.ExistCompareFields))
                    {
                        continue;
                    }

                    // Store the images embedded in the detail content.
                    parser = new Parser(new Lexer(ctxHtml));
                    NodeList imgList = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
                    if (imgList != null && imgList.Count > 0)
                    {
                        for (int img = 0; img < imgList.Count; img++)
                        {
                            ImageTag imgTag = imgList[img] as ImageTag;
                            try
                            {
                                BaseAttach baseInfo = ToolHtml.GetBaseAttachByUrl(imgTag.GetAttribute("src"), headName, info.Id);
                                if (baseInfo != null)
                                {
                                    ToolDb.SaveEntity(baseInfo, string.Empty);
                                }
                            }
                            catch
                            {
                                // Best effort, same as for attachment links below: one broken
                                // image must not abort the whole crawl.
                            }
                        }
                    }

                    // Store the links that IsAtagAttach reports as attachments.
                    parser = new Parser(new Lexer(ctxHtml));
                    NodeList attachList = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                    if (attachList != null && attachList.Count > 0)
                    {
                        for (int a = 0; a < attachList.Count; a++)
                        {
                            ATag aTag = attachList[a] as ATag;
                            if (aTag == null || !aTag.IsAtagAttach())
                            {
                                continue;
                            }
                            try
                            {
                                BaseAttach obj = ToolHtml.GetBaseAttachByUrl(aTag.Link, aTag.LinkText, info.Id);
                                if (obj != null)
                                {
                                    ToolDb.SaveEntity(obj, string.Empty);
                                }
                            }
                            catch
                            {
                                // Best effort: one broken attachment must not stop the crawl.
                            }
                        }
                    }
                }
            }
            return null;
        }
Exemple #17
0
        /// <summary>
        /// Walks the postback-paged listing at <c>SiteUrl</c>, builds a <c>BidInfo</c> for
        /// every list item whose detail page contains a <c>div</c> of class
        /// <c>contentbox2</c>, saves it through <c>ToolDb</c> and stores the images found
        /// in that detail content as attachments.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, the crawl ends once <c>MaxCount</c> entries have been processed.
        /// </param>
        /// <returns>Always <c>null</c>; entries are persisted as a side effect.</returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            int    processed   = 0;
            int    totalPages  = 1;
            string cookiestr   = string.Empty;
            string listingHtml = string.Empty;

            try
            {
                listingHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8, ref cookiestr);
            }
            catch
            {
                // No first page, nothing to crawl.
                return null;
            }

            // The page count is the plain text of the dedicated pager span.
            Parser   pagerParser = new Parser(new Lexer(listingHtml));
            NodeList pagerNodes  = pagerParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "ctl00_ContentPlaceHolder1_total_page_count")));
            if (pagerNodes != null && pagerNodes.Count > 0)
            {
                try
                {
                    totalPages = int.Parse(pagerNodes[0].ToNodePlainString());
                }
                catch
                {
                    // Not numeric: keep the single-page default.
                }
            }

            for (int page = 1; page <= totalPages; page++)
            {
                if (page > 1)
                {
                    // Request the next page by posting the form state taken from the page
                    // currently held in listingHtml.
                    string viewState       = this.ToolWebSite.GetAspNetViewState(listingHtml);
                    string eventValidation = this.ToolWebSite.GetAspNetEventValidation(listingHtml);
                    string[] fieldNames =
                    {
                        "__EVENTTARGET",
                        "__EVENTARGUMENT",
                        "__VIEWSTATE",
                        "__EVENTVALIDATION",
                        "ctl00$ContentPlaceHolder1$CP"
                    };
                    string[] fieldValues =
                    {
                        "ctl00$ContentPlaceHolder1$DownPage",
                        "",
                        viewState,
                        eventValidation,
                        page.ToString()
                    };
                    NameValueCollection form = this.ToolWebSite.GetNameValueCollection(fieldNames, fieldValues);
                    try
                    {
                        listingHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, form, Encoding.UTF8, ref cookiestr);
                    }
                    catch
                    {
                        // Page could not be loaded; move on to the next one.
                        continue;
                    }
                }

                Parser   listParser = new Parser(new Lexer(listingHtml));
                NodeList items      = listParser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "secondrightlistbox")), true), new TagNameFilter("li")));
                if (items == null || items.Count == 0)
                {
                    continue;
                }

                for (int row = 0; row < items.Count; row++)
                {
                    INode item = items[row];
                    ATag  link = item.GetATag();
                    if (link == null)
                    {
                        // List item without a link carries no entry.
                        continue;
                    }

                    // Fields this listing does not provide are passed on as empty strings.
                    string code      = string.Empty;
                    string buildUnit = string.Empty;
                    string bidUnit   = string.Empty;
                    string bidMoney  = string.Empty;
                    string endDate   = string.Empty;
                    string bidCtx    = string.Empty;
                    string prjMgr    = string.Empty;
                    string otherType = string.Empty;

                    string prjName   = link.GetAttribute("title");
                    string beginDate = item.ToPlainTextString().GetDateRegex();
                    string infoUrl   = "http://www.swggzy.cn/" + link.Link.GetReplace("amp;");

                    string detailHtml = string.Empty;
                    try
                    {
                        detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl).GetJsString();
                    }
                    catch
                    {
                        // Detail page could not be loaded; skip this entry.
                        continue;
                    }

                    Parser   detailParser = new Parser(new Lexer(detailHtml));
                    NodeList detailNodes  = detailParser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "contentbox2")));
                    if (detailNodes == null || detailNodes.Count == 0)
                    {
                        continue;
                    }

                    string htmlTxt  = detailNodes.AsHtml();
                    string bidType  = prjName.GetInviteBidType();
                    string msgType  = "汕尾市公共资源交易中心";
                    string specType = "建设工程";
                    BidInfo info = ToolDb.GenBidInfo("广东省", "汕尾市区", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, infoUrl, prjMgr, htmlTxt);
                    processed++;

                    if (ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate, this.ExistsHtlCtx))
                    {
                        // Store every image of the detail content as an attachment.
                        Parser   imageParser = new Parser(new Lexer(htmlTxt));
                        NodeList images      = imageParser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
                        if (images != null)
                        {
                            for (int k = 0; k < images.Count; k++)
                            {
                                ImageTag image = images[k] as ImageTag;
                                try
                                {
                                    BaseAttach attach = ToolHtml.GetBaseAttach(image.GetAttribute("src"), prjName, info.Id, "SiteManage\\Files\\InviteAttach\\");
                                    if (attach != null)
                                    {
                                        ToolDb.SaveEntity(attach, "SourceID,AttachServerPath");
                                    }
                                }
                                catch
                                {
                                    // Best effort: ignore an image that cannot be stored.
                                }
                            }
                        }
                    }

                    // Limit is evaluated after the entry has been handled.
                    if (!crawlAll && processed >= this.MaxCount)
                    {
                        return null;
                    }
                }
            }
            return null;
        }