예제 #1
0
파일: Fetch.cs 프로젝트: zq535228/renzhex3
        /// <summary>
        /// Decides whether the entries of an index page are stored in <c>li</c>
        /// elements or in a <c>table</c>.
        /// </summary>
        /// <param name="rawtext">Page source with style and other irrelevant tags already removed.</param>
        /// <returns>
        /// <c>true</c> when the page is table-style (no <c>li</c> element contains a
        /// two-digit:two-digit time stamp); <c>false</c> when it is li-style.
        /// </returns>
        public static bool GetPattern(string rawtext)
        {
            Parser   liParser = new Parser(new Lexer(rawtext));
            NodeList liNodes  = liParser.Parse(new TagNameFilter("li"));

            // An li entry only counts when it carries a publication time.
            Regex timeStamp = new Regex(@"\d\d:\d\d");

            for (int index = 0; index < liNodes.Count; index++)
            {
                if (timeStamp.IsMatch(liNodes[index].ToHtml()))
                {
                    // At least one dated li entry exists: the page is li-style.
                    return false;
                }
            }

            // Either there are no li elements at all, or none of them holds a
            // time stamp: treat the page as table-style.
            return true;
        }
        /// <summary>
        /// Rebuilds an HTML document from the first node matched by the "TR" tag-name
        /// filter: its first child is emitted in the head position, and the tag
        /// children of the first child of its second child are emitted as body content.
        /// </summary>
        /// <param name="sourceHtml">HTML source; line breaks are stripped before parsing.</param>
        /// <returns>The reassembled HTML string.</returns>
        private string HtmlText(string sourceHtml)
        {
            string         flattened = sourceHtml.Replace(System.Environment.NewLine, "");
            hParser.Parser parser    = hParser.Parser.CreateParser(flattened, "utf-8");

            hParser.INode root      = parser.Parse(new TagNameFilter("TR"))[0];
            string        headHtml  = root.Children[0].ToHtml();
            hParser.INode container = root.Children[1].Children[0];

            StringBuilder output = new StringBuilder();

            output.Append("<html>");
            output.Append(headHtml);
            output.Append("<body>");

            // Re-open the container from its own text. NOTE(review): this presumably
            // yields the tag name plus attributes; the closing tag below is a
            // hard-coded </div>, so the container is assumed to be a div - confirm.
            output.Append("<").Append(container.GetText()).Append(">");

            // Only tag nodes are kept; other child node kinds are skipped.
            for (int index = 0; index < container.Children.Count; index++)
            {
                hParser.INode child = container.Children[index];
                if (child is hParser.ITag)
                {
                    output.Append(child.ToHtml());
                }
            }

            output.Append("</div>");
            output.Append("</body>");
            output.Append("</html>");
            return output.ToString();
        }
예제 #3
0
 /// <summary>
 /// Forwards the query-string pagination and tag-name filter to the tag service
 /// and returns the paginated result it produces.
 /// </summary>
 /// <param name="pagination">Paging options bound from the query string.</param>
 /// <param name="tagNameFilter">Tag-name filter bound from the query string.</param>
 /// <returns>The page of tags returned by the tag service.</returns>
 public async Task<PaginatedData<TagOutput>> GetAllTagsAsync(
     [FromQuery] Pagination pagination,
     [FromQuery] TagNameFilter tagNameFilter)
     => await _tagService.FindAllTagsAsync(tagNameFilter, pagination);
예제 #4
0
        /// <summary>
        /// Returns the plain text of the first TITLE node found in the content.
        /// </summary>
        /// <param name="content">Markup to search; it is passed through DropComment before parsing.</param>
        /// <returns>The title text, or an empty string when no TITLE node is found.</returns>
        public static string GetTitleFromContent(string content)
        {
            string   stripped   = DropComment(content);
            Parser   parser     = new Parser(new Lexer(stripped));
            NodeList titleNodes = parser.ExtractAllNodesThatMatch(new TagNameFilter("TITLE"));

            if (titleNodes.Count == 0)
            {
                return "";
            }

            return titleNodes[0].ToPlainTextString();
        }
예제 #5
0
            /// <summary>
            /// Converts the elements of an HTML document that match a tag name into an XML string.
            /// </summary>
            /// <param name="html">HTML source text.</param>
            /// <param name="targetTag">Name of the tag whose elements are converted.</param>
            /// <returns>
            /// XML text consisting of an XML declaration and a <c>Tags</c> root element
            /// that holds one child element per matched tag.
            /// </returns>
            /// <exception cref="Exception">
            /// Thrown for any failure during conversion; the original exception is
            /// attached as <c>InnerException</c> so its type and stack trace are kept.
            /// </exception>
            public static string CovertHtmlToXml(string html, string targetTag)
            {
                try
                {
                    XmlDocument doc = new XmlDocument();
                    doc.AppendChild(doc.CreateXmlDeclaration("1.0", "utf-8", null));

                    // Parse the HTML with htmlparser and keep only the requested tags.
                    Parser   parser = Parser.CreateParser(html, "GBK");
                    NodeList nodes  = parser.Parse(new TagNameFilter(targetTag));

                    // Root element that collects every converted tag.
                    XmlElement root = doc.CreateElement("Tags");

                    for (int i = 0; i < nodes.Size(); i++)
                    {
                        TagNode tagNode = nodes[i] as TagNode;
                        if (tagNode == null)
                        {
                            // Not a tag node: nothing to convert for this entry.
                            continue;
                        }

                        XmlElement parent = doc.CreateElement(tagNode.TagName);

                        // Copy the attributes whose names pass the validName pattern.
                        foreach (DictionaryEntry ent in tagNode.Attributes)
                        {
                            string attrName = ent.Key.ToString();
                            if (Regex.IsMatch(attrName, validName))
                            {
                                XmlAttribute attr = doc.CreateAttribute(attrName);
                                // An attribute entry without a value becomes an empty string.
                                attr.Value = ent.Value == null ? string.Empty : ent.Value.ToString();
                                parent.Attributes.Append(attr);
                            }
                        }

                        AppendChild(tagNode, parent, doc);

                        root.AppendChild(parent);
                    }
                    doc.AppendChild(root);

                    return doc.OuterXml;
                }
                catch (Exception ex)
                {
                    // Keep the original failure as the inner exception instead of discarding it.
                    throw new Exception("转换html内容出错:" + ex.Message, ex);
                }
            }
예제 #6
0
        /// <summary>
        /// Returns the HTML of the page body after removing script nodes from the body node list.
        /// </summary>
        /// <param name="html">HTML source handed to GetPage.</param>
        /// <returns>The HTML rendered from the body node list.</returns>
        public string GetBodyHtml(string html)
        {
            NodeList bodyNodes = GetPage(html).Body;

            // Collect the script tags (the second argument presumably enables a
            // recursive search through child nodes).
            NodeList scripts = bodyNodes.ExtractAllNodesThatMatch(new TagNameFilter("script"), true);

            // NOTE(review): Remove is called on the body list itself, so script tags
            // nested inside other elements may be left in place - confirm against
            // the NodeList.Remove implementation.
            int index = 0;
            while (index < scripts.Size())
            {
                bodyNodes.Remove(scripts[index]);
                index++;
            }

            return bodyNodes.ToHtml();
        }
예제 #7
0
        /// <summary>
        /// Queries the tag repository, applies the name filter when a non-blank name
        /// is supplied, and returns the requested page of results.
        /// </summary>
        /// <param name="tagNameFilter">Name to search for and whether to match it exactly or as a substring.</param>
        /// <param name="pagination">Paging options passed to PaginateAsync.</param>
        /// <returns>The paginated tag outputs.</returns>
        public async Task<PaginatedData<TagOutput>> FindAllTagsAsync(TagNameFilter tagNameFilter, Pagination pagination)
        {
            var query = _tagRepository.FindAll();

            bool hasNameFilter = !string.IsNullOrWhiteSpace(tagNameFilter.Name);
            if (hasNameFilter)
            {
                if (tagNameFilter.SearchType == SearchType.Equals)
                {
                    // Exact name match.
                    query = query.Where(tag => tagNameFilter.Name == tag.Name);
                }
                else
                {
                    // Any other search type falls back to a substring match.
                    query = query.Where(tag => tag.Name.Contains(tagNameFilter.Name !));
                }
            }

            var outputs = query.Select(tag => new TagOutput(tag));
            return await outputs.PaginateAsync(pagination);
        }
예제 #8
0
        /// <summary>
        /// Removes every script element from the given HTML (used for news content).
        /// </summary>
        /// <remarks>
        /// The previous implementation recursed but discarded the recursive result,
        /// so only the first script element was ever removed. This version repeats
        /// the strip step until the parser finds no more script elements.
        /// </remarks>
        /// <param name="html">HTML content.</param>
        /// <returns>
        /// The HTML with script elements removed; the input itself when it contains none.
        /// </returns>
        public static String HtmlFilter(String html)
        {
            String current = html;

            while (true)
            {
                Parser   parser = Parser.CreateParser(current, "utf-8");
                NodeList nodes  = parser.Parse(new TagNameFilter("script"));

                if (nodes.Count == 0)
                {
                    return current;
                }

                String pageText   = nodes[0].Page.GetText();
                String scriptHtml = nodes[0].ToHtml();

                // String.Replace rejects an empty search string; nothing can be stripped.
                if (String.IsNullOrEmpty(scriptHtml))
                {
                    return pageText;
                }

                String stripped = pageText.Replace(scriptHtml, "");

                // If the rendered script text does not occur verbatim in the page
                // text, no progress is possible; stop instead of looping forever.
                if (stripped.Length == pageText.Length)
                {
                    return stripped;
                }

                current = stripped;
            }
        }
예제 #9
0
        /// <summary>
        /// Extracts the elements selected by the filter model from the HTML and
        /// returns their markup. May later be reworked to rely on filters only.
        /// </summary>
        /// <remarks>
        /// Original author's note: OR and AND can each only accept two operands at a time.
        /// </remarks>
        /// <param name="html">HTML source to search.</param>
        /// <param name="model">
        /// Selection criteria: element type, optional id and class values, whether
        /// scripts are allowed, and optional start/end markers used to cut the result.
        /// </param>
        /// <returns>
        /// The result of GetTitle when the element type is "title" (compared
        /// case-insensitively); otherwise the HTML of the matched nodes, cut down
        /// via GetValueBySE when both start and end markers are supplied.
        /// </returns>
        public string GetByFilter(string html, FilterModel model)
        {
            // Ordinal, case-insensitive comparison: independent of the current
            // culture and safe when EType is null.
            if (string.Equals(model.EType, "title", System.StringComparison.OrdinalIgnoreCase))
            {
                return GetTitle(html);
            }

            NodeList nodes = GetTagList(html, model.EType);

            if (!string.IsNullOrEmpty(model.ID))
            {
                nodes = nodes.ExtractAllNodesThatMatch(new HasAttributeFilter("id", model.ID));
            }
            if (!string.IsNullOrEmpty(model.CSS))
            {
                nodes = nodes.ExtractAllNodesThatMatch(new HasAttributeFilter("class", model.CSS));
            }
            if (!model.AllowScript)
            {
                // NOTE(review): the extracted list is discarded, so this call does not
                // appear to remove script tags from `nodes` - confirm the intent and
                // the library behaviour before relying on AllowScript = false.
                nodes.ExtractAllNodesThatMatch(new TagNameFilter("script"), true);
            }

            // Store the referenced images locally.
            NodeList imgs = nodes.ExtractAllNodesThatMatch(new TagNameFilter("img"), true);
            for (int i = 0; i < imgs.Count; i++)
            {
                ImageTag img = imgs[i] as ImageTag;
                if (img == null)
                {
                    // The node was not materialised as an ImageTag; nothing to localise.
                    continue;
                }

                string savepath = function.VToP(vdir + Path.GetFileName(img.ImageURL));
                if (File.Exists(savepath))
                {
                    // Avoid downloading the same image again.
                    continue;
                }
                img.ImageURL = httpHelp.DownFile(baseurl, img.ImageURL, savepath);
            }

            string result = nodes.AsHtml();
            if (!string.IsNullOrWhiteSpace(model.Start) && !string.IsNullOrWhiteSpace(model.End))
            {
                result = regHelper.GetValueBySE(result, model.Start, model.End);
            }
            return result;
        }
예제 #10
0
        /// <summary>
        /// Reads the <c>href</c> and <c>title</c> attributes from the tag that follows
        /// the first child of the <paramref name="num"/>-th <c>dd</c> element.
        /// </summary>
        /// <param name="HtmlString">HTML source of the index page.</param>
        /// <param name="num">Zero-based index of the <c>dd</c> element to read.</param>
        /// <param name="href">
        /// Receives the tag's <c>href</c> attribute; an empty string when the
        /// element or tag cannot be located.
        /// </param>
        /// <param name="title">
        /// Receives the tag's <c>title</c> attribute; an empty string when the
        /// element or tag cannot be located.
        /// </param>
        public static void parseIndexHtml(string HtmlString, int num, out string href, out string title)
        {
            href  = "";
            title = "";

            // Parse the page and select every dd element.
            Parser     parser = Parser.CreateParser(HtmlString, "utf-8");
            NodeFilter filter = new TagNameFilter("dd");
            NodeList   nodes  = parser.Parse(filter);

            // Out-of-range index (including negative values): leave the outputs empty.
            if (num < 0 || nodes.Size() <= num)
            {
                return;
            }

            INode first = nodes[num].FirstChild;
            if (first == null)
            {
                return;
            }

            ITag tag = getTag(first.NextSibling);
            if (tag == null)
            {
                return;
            }

            href  = tag.GetAttribute("href");
            title = tag.GetAttribute("title");
        }
예제 #11
0
        /// <summary>
        /// Parses an inventory report page into a table with one row per report line.
        /// </summary>
        /// <remarks>
        /// The report rows are read from the table at index 22 of the page, starting
        /// at its third TR. A row whose first cell has no children reuses the article
        /// number of the previous row. The indexer calls throw when the page has
        /// fewer tables, cells, or "-" separators than this layout expects.
        /// </remarks>
        /// <param name="strBuff">HTML source of the report page.</param>
        /// <returns>
        /// A DataTable with the columns ArticleNo, StoreCode, StoreName, Inventory,
        /// Sales, Date, BarCode, TopStoreCode and TopStoreName.
        /// </returns>
        public DataTable InventoryData(string strBuff)
        {
            #region Parse the HTML nodes of the page

            // Layout-specific positions within the report page.
            const int reportTableIndex = 22;
            const int firstDataRow     = 2;
            const int storeCell        = 2;
            const int inventoryCell    = 4;
            const int salesCell        = 9;

            DataTable dt = new DataTable();
            dt.Columns.Add("ArticleNo");
            dt.Columns.Add("StoreCode");
            dt.Columns.Add("StoreName");
            dt.Columns.Add("Inventory");
            dt.Columns.Add("Sales");
            dt.Columns.Add("Date");
            dt.Columns.Add("BarCode");
            dt.Columns.Add("TopStoreCode");
            dt.Columns.Add("TopStoreName");

            // Parse the page once and pick the report table.
            Parser   pageParser = new Parser(new Lexer(strBuff));
            NodeList tables     = pageParser.Parse(new TagNameFilter("Table"));

            // Re-parse just that table to obtain its rows.
            Parser   rowParser = new Parser(new Lexer(tables[reportTableIndex].ToHtml()));
            NodeList rows      = rowParser.Parse(new TagNameFilter("TR"));

            // Every row of one import carries the same date stamp.
            string strDate      = DateTime.Now.ToString("yyyy-MM-dd");
            string strArticleNo = string.Empty;

            for (int i = firstDataRow; i < rows.Count; i++)
            {
                NodeList cells = rows[i].Children;

                // An empty first cell means "same article as the previous row".
                if (cells[0].Children != null)
                {
                    strArticleNo = cells[0].Children[0].ToHtml().Replace("&nbsp;", "");
                }

                // The store cell has the form "<code>-<name>".
                string[] storeParts   = cells[storeCell].Children[0].ToHtml().Split('-');
                string   strStoreCode = storeParts[0];
                string   strStoreName = storeParts[1];
                string   strInventory = cells[inventoryCell].Children[0].ToHtml();
                string   strSales     = cells[salesCell].Children[0].ToHtml();

                // Look up the bar code for the article.
                string barCode = DataHelper.GetBarCode(strArticleNo, _dtGoods);

                // Look up the customer code and customer short name for the store.
                string strTopStoreCode = "";
                string strTopStoreName = "";
                DataHelper.GetTopStrore(strStoreCode, _dtTopEnterprise, ref strTopStoreCode, ref strTopStoreName);

                dt.Rows.Add(
                    strArticleNo,
                    strStoreCode,
                    strStoreName,
                    strInventory,
                    strSales,
                    strDate,
                    barCode,
                    strTopStoreCode,
                    strTopStoreName);
            }

            #endregion

            return dt;
        }
예제 #12
0
        /// <summary>
        /// Crawls the yododo.com question list: walks the area links found in the
        /// first list item of the "miniarea-list clearfix" list, opens each area
        /// page, and passes every sub-area link whose text matches a level-3 route
        /// class name to paserData. Prints "success" when finished.
        /// </summary>
        /// <param name="context">HTTP context handed to Print for the final output.</param>
        private void GrapBaiduMsg(HttpContext context)
        {
            string sRslt = GetHtmlStr("http://www.yododo.com/ask/list/");

            // Route classes selected with the "classLevel = 3" condition; used below
            // to map link texts to class ids.
            ClassLibrary.BLL.RouteClass          rcBll  = new ClassLibrary.BLL.RouteClass();
            List <ClassLibrary.Model.RouteClass> rcList = rcBll.GetModelList("classLevel = 3");

            // Locate the area list (ul with class "miniarea-list clearfix") and keep
            // the HTML of its first match. NOTE(review): indexing [0] throws when the
            // page contains no such list.
            Parser     parser   = Parser.CreateParser(sRslt, "utf-8");
            NodeFilter filterUL = new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "miniarea-list clearfix"));
            NodeList   liList   = parser.Parse(filterUL);
            string     links    = liList[0].ToHtml();

            // Split that list into its li items.
            parser = Parser.CreateParser(links, "utf-8");
            NodeFilter filterLI = new TagNameFilter("li"); //new NodeClassFilter(typeof(ATag));
            NodeList   nodelist = parser.Parse(filterLI);

            //string strGn = nodelist[1].ToHtml();
            // Only the first li item is processed.
            string strCj = nodelist[0].ToHtml();

            //parser = Parser.CreateParser(nodelist.ToHtml(), "utf-8");
            // Filter selecting anchor tags; reused for both link levels below.
            NodeFilter filterA = new NodeClassFilter(typeof(ATag));

            /*NodeList aGnList = parser.Parse(filterA);
             * for (int i = 0; i < aGnList.Count; i++)
             * {
             *  ITag tag = getTag(aGnList[i]);
             *  string url = "http://www.yododo.com" + tag.GetAttribute("href") + "s1";  // resolved questions
             *  string className = tag.ToPlainTextString();
             *  if (className == "全部") continue;
             *
             *  ClassLibrary.Model.RouteClass model = rcList.Find(delegate(ClassLibrary.Model.RouteClass rc) { return rc.ClassName == className; });
             *  if (model == null) continue;
             *
             *  paserData(aGnList[i], url, model.ID);
             * }*/

            // First level: the anchors inside the first li item.
            parser = Parser.CreateParser(strCj, "utf-8");
            NodeList areaCjList = parser.Parse(filterA);

            for (int i = 0; i < areaCjList.Count; i++)
            {
                ITag   tag       = getTag(areaCjList[i]);
                string url       = "http://www.yododo.com" + tag.GetAttribute("href"); // one link per continent
                string className = tag.ToPlainTextString();
                // Skip the two link texts that are excluded from this crawl.
                if (className == "全部" || className == "中国")
                {
                    continue;
                }

                // Open the area page and locate its own area list.
                parser = Parser.CreateParser(GetHtmlStr(url), "utf-8");
                //NodeFilter filterUL = new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "miniarea-list clearfix"));
                NodeList liListCj = parser.Parse(filterUL);
                string   linksCj  = liListCj[0].ToHtml();

                // Second level: every anchor inside that list.
                parser = Parser.CreateParser(linksCj, "utf-8");
                //NodeFilter filterA = new NodeClassFilter(typeof(ATag));
                NodeList aCjList = parser.Parse(filterA);
                for (int j = 0; j < aCjList.Count; j++)
                {
                    ITag   cjtag      = getTag(aCjList[j]);
                    string url1       = "http://www.yododo.com" + cjtag.GetAttribute("href") + "s1"; // resolved questions
                    string className1 = cjtag.ToPlainTextString();
                    if (className1 == "全部")
                    {
                        continue;
                    }

                    // Only links whose text equals a known route class name are crawled.
                    ClassLibrary.Model.RouteClass model = rcList.Find(delegate(ClassLibrary.Model.RouteClass rc) { return(rc.ClassName == className1); });
                    if (model == null)
                    {
                        continue;
                    }

                    paserData(aCjList[j], url1, model.ID);
                }
            }

            Print(context, "success");
        }
예제 #13
0
        /// <summary>
        /// Crawls the paged notice list at SiteUrl, opens each notice's detail page,
        /// builds a NotifyInfo from it, saves it, and registers the attachment links
        /// of newly saved entries.
        /// </summary>
        /// <param name="crawlAll">
        /// When false, the crawl stops once MaxCount entries have been processed.
        /// </param>
        /// <returns>Always null; results are persisted through ToolDb.SaveEntity.</returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            // Determine the number of pages.
            int    pageInt = 1, sqlCount = 0;
            string html            = string.Empty;
            string viewState       = string.Empty;
            string eventValidation = string.Empty;

            // Fetch the first page; give up when it cannot be loaded.
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
            }
            catch (Exception ex)
            {
                return(null);
            }
            Parser   parser   = new Parser(new Lexer(html));
            NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "ctl00_cph_context_RightList1_GridViewPaging1_lblGridViewPagingDesc")));

            if (pageList != null && pageList.Count > 0)
            {
                string pageStr = pageList[0].ToPlainTextString().Trim();
                // Extract the total page count from the paging description text.
                // Any parse failure is swallowed and leaves pageInt at 1.
                try
                {
                    Regex regexPage = new Regex(@"页,共[^页]+页");
                    Match pageMatch = regexPage.Match(pageStr);
                    pageInt = int.Parse(pageMatch.Value.Replace("页,共", "").Replace("页", "").Trim());
                }
                catch
                { }
            }
            for (int i = 1; i <= pageInt; i++)
            {
                // Page 1 is already loaded; later pages are requested by posting the
                // ASP.NET form with the view state and event validation values taken
                // from the previously loaded page.
                if (i > 1)
                {
                    viewState       = this.ToolWebSite.GetAspNetViewState(html);
                    eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
                    NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] {
                        "ctl00$ScriptManager1",
                        "__EVENTTARGET",
                        "__EVENTARGUMENT",
                        "__VIEWSTATE",
                        "ctl00$cph_context$RightList1$search",
                        "ctl00$cph_context$RightList1$txtTitle",
                        "ctl00$cph_context$RightList1$GridViewPaging1$txtGridViewPagingForwardTo",
                        "__VIEWSTATEENCRYPTED",
                        "__EVENTVALIDATION",
                        "ctl00$cph_context$RightList1$GridViewPaging1$btnForwardToPage"
                    }, new string[] {
                        "ctl00$cph_context$RightList1$update1|ctl00$cph_context$RightList1$GridViewPaging1$btnForwardToPage",
                        "", "", viewState,
                        "标题", "", i.ToString(), "", eventValidation, "GO"
                    });
                    // A page that fails to load is skipped.
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.UTF8);
                    }
                    catch { continue; }
                }
                // Locate the grid table that lists the notices on this page.
                parser = new Parser(new Lexer(html));
                NodeList dtList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_cph_context_RightList1_GridView1")));
                if (dtList != null && dtList.Count > 0)
                {
                    // NOTE(review): the "as" cast is dereferenced without a null check.
                    TableTag table = dtList[0] as TableTag;
                    // Row 0 is skipped (presumably the header row - confirm).
                    for (int j = 1; j < table.RowCount; j++)
                    {
                        string headName = string.Empty, releaseTime = string.Empty, infoScorce = string.Empty, msgType = string.Empty,
                               infoUrl = string.Empty, ctxHtml = string.Empty, infoCtx = string.Empty, infoType = string.Empty;
                        TableRow tr = table.Rows[j];
                        // Column 1 = title, column 2 = source, column 3 = release time.
                        headName    = tr.Columns[1].ToPlainTextString().Trim();
                        releaseTime = tr.Columns[3].ToPlainTextString().Trim();
                        infoScorce  = tr.Columns[2].ToPlainTextString().Trim();
                        infoType    = "通知公告";
                        // The detail link is the first anchor inside the title column.
                        ATag aTag = tr.Columns[1].SearchFor(typeof(ATag), true)[0] as ATag;
                        infoUrl = "http://jyzx.cb.gov.cn/LGjyzxWeb/SiteManage/" + aTag.Link;
                        // Undo double-encoded percent signs in the link.
                        if (infoUrl.Contains("%25"))
                        {
                            infoUrl = infoUrl.Replace("%25", "%");
                        }
                        string htmldetailtxt = string.Empty;
                        // A detail page that fails to load is skipped.
                        try
                        {
                            htmldetailtxt = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.UTF8);
                        }
                        catch { continue; }
                        // The notice body lives in the div with class "text_contend".
                        parser = new Parser(new Lexer(htmldetailtxt));
                        NodeList noList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "text_contend")));
                        if (noList != null && noList.Count > 0)
                        {
                            // ctxHtml keeps the markup; infoCtx is the text with spaces,
                            // &nbsp; entities, tabs and any remaining tags removed.
                            ctxHtml    = noList.AsHtml().Replace("<br/>", "\r\n").Replace("<BR/>", "");
                            infoCtx    = noList.AsString().Replace(" ", "").Replace("&nbsp;", "").Replace("\t\t", "\t").Replace("\t\t", "\t");
                            infoCtx    = Regex.Replace(infoCtx, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase).Replace(" ", "").Replace("\t", "");
                            msgType    = "深圳市龙岗建设工程交易中心";
                            infoScorce = infoScorce.Replace("&nbsp;", "");
                            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "深圳龙岗区工程", string.Empty, infoCtx, infoType);
                            // Unless crawling everything, stop once MaxCount entries were processed.
                            if (!crawlAll && sqlCount >= this.MaxCount)
                            {
                                return(null);
                            }
                            else
                            {
                                sqlCount++;
                                // Attachments are collected only when SaveEntity returns true.
                                if (ToolDb.SaveEntity(info, this.ExistCompareFields))
                                {
                                    // Look for the attachment grid inside the notice body.
                                    parser = new Parser(new Lexer(ctxHtml));
                                    NodeList fileList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_cph_context_JyxxUploadFile1_GridView1")));
                                    if (fileList != null && fileList.Count > 0)
                                    {
                                        string fileHtl = fileList.AsHtml();
                                        parser = new Parser(new Lexer(fileHtl));
                                        NodeFilter aLink = new TagNameFilter("a");
                                        NodeList   aList = parser.ExtractAllNodesThatMatch(aLink);
                                        if (aList != null && aList.Count > 0)
                                        {
                                            for (int k = 0; k < aList.Count; k++)
                                            {
                                                // NOTE(review): SearchFor is re-run on every
                                                // iteration and indexed with k; confirm it always
                                                // returns at least aList.Count entries.
                                                ATag a = aList.SearchFor(typeof(ATag), true)[k] as ATag;
                                                if (a != null)
                                                {
                                                    // Make the relative attachment link absolute.
                                                    AddBaseFile("http://jyzx.cb.gov.cn/LGjyzxWeb/" + a.Link.Replace("../", ""), a.LinkText, info);
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
            return(null);
        }
예제 #14
0
파일: Fetch.cs 프로젝트: zq535228/renzhex3
        /// <summary>
        /// Function: ItemRetrival_1.
        /// Extracts the post URLs, post titles and post times from a table-based
        /// post list page and appends them to the supplied lists.
        /// </summary>
        /// <param name="url">URL of the post list page.</param>
        /// <param name="encode">
        /// Documented as receiving the page's character encoding; it is not read or
        /// written inside this method.
        /// </param>
        /// <param name="listUrl">Receives one entry per anchor found in the selected rows.</param>
        /// <param name="listTitle">Receives the anchor texts, in the same order as <paramref name="listUrl"/>.</param>
        /// <param name="listTime">Receives the time-stamp fragments found in the selected rows.</param>
        public static void ItemRetrival_1(string url, ref Encoding encode, ref List <string> listUrl, ref List <string> listTitle,
                                          ref List <string> listTime)
        {
            // Fetch the page source.
            string rawtext = GetDataFromUrl(url);
            // Strip irrelevant style, select and script elements as well as HTML comments.
            string reg1 = @"<style[\s\S]+?/style>|<select[\s\S]+?/select>|<script[\s\S]+?/script>|<\!\-\-[\s\S]*?\-\->";

            rawtext = new Regex(reg1, RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(rawtext, "");


            // Use htmlparser to extract the target tables from the source.

            Lexer lexer = new Lexer(rawtext);
            // Parse out the table elements.
            Parser     parser    = new Parser(lexer);
            NodeFilter filter    = new TagNameFilter("table");
            NodeList   htmlNodes = parser.Parse(filter);
            // Drop tables that contain nested tables (more than one opening table tag).
            Regex f1 = new Regex(@"<table.*?>");

            // Iterate backwards so removing by index does not shift unvisited entries.
            for (int i = htmlNodes.Count - 1; i >= 0; i--)
            {
                MatchCollection myCollection = f1.Matches(htmlNodes[i].ToHtml());
                if (myCollection.Count > 1)
                {
                    htmlNodes.Remove(i);
                }
            }

            // Drop tables without a time stamp; such tables are considered irrelevant.
            Regex f2 = new Regex(@"\d\d:\d\d");

            for (int i = htmlNodes.Count - 1; i >= 0; i--)
            {
                if (!f2.IsMatch(htmlNodes[i].ToHtml()))
                {
                    htmlNodes.Remove(i);
                }
            }



            // The code below extracts the three target items (time, URL, title).

            // Re-parse the remaining tables to obtain their rows.
            string     final      = htmlNodes.ToHtml();
            Lexer      lex2       = new Lexer(final);
            Parser     par2       = new Parser(lex2);
            NodeFilter filter2    = new TagNameFilter("tr");
            NodeList   finalNodes = par2.Parse(filter2);
            // Extract the posting time information.
            RegexFilter rf = new RegexFilter(@"\d\d:\d\d");

            for (int i = 0; i < finalNodes.Count; i++)
            {
                // Each row is parsed on its own so the regex filter only sees that row.
                Lexer    lexerTmp  = new Lexer(finalNodes[i].ToHtml());
                Parser   parserTmp = new Parser(lexerTmp);
                NodeList tmp       = parserTmp.Parse(rf);
                if (tmp.Count > 0)
                {
                    for (int j = 0; j < tmp.Count; j++)
                    {
                        string temp = tmp[j].ToHtml();
                        ModifyRawText(ref temp);
                        listTime.Add(temp);
                    }
                }
            }
            // Extract the post URLs and post titles.
            string     atagAssist = finalNodes.ToHtml();
            Lexer      lex3       = new Lexer(atagAssist);
            Parser     par3       = new Parser(lex3);
            NodeFilter filter3    = new TagNameFilter("a");
            NodeList   atagNodes  = par3.Parse(filter3);
            // Scheme and host of the list URL (text from "http://" up to the next "/").
            string     urlpart    = new Regex(@"http://.*?(?=/)").Match(url).Value;

            for (int i = 0; i < atagNodes.Count; i++)
            {
                ATag   link  = (ATag)atagNodes.ElementAt(i);
                string temp1 = link.GetAttribute("href");
                string temp2 = link.StringText;

                // NOTE(review): a null href is added to listUrl unchanged.
                if (temp1 != null && !new Regex("http").IsMatch(temp1)) // a relative URL: prefix it with the host to make it absolute
                {
                    temp1 = urlpart + temp1;                            // build the complete URL
                }
                ModifyRawText(ref temp2);
                listUrl.Add(temp1);
                listTitle.Add(temp2);
            }
        }
예제 #15
0
        /// <summary>
        /// Downloads a page, passes its decoded HTML to download_pic, and then
        /// recursively visits every anchor target that has not been seen before.
        /// </summary>
        /// <remarks>
        /// The response is read completely and disposed before any link is
        /// followed, so a deep recursion does not keep one open connection per
        /// level. Anchors without an href are skipped. Any exception raised while
        /// fetching or processing the page is caught and reported as "404", as before.
        /// </remarks>
        /// <param name="url">Absolute URL of the page to download.</param>
        static void download_url(string url)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

            request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393";
            request.Timeout   = 30000;
            try
            {
                string encode;

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    if (response.StatusCode != HttpStatusCode.OK)
                    {
                        Console.WriteLine("Error");
                        return;
                    }

                    using (Stream s = response.GetResponseStream())
                    using (StreamReader sr = new StreamReader(s, Encoding.UTF8))
                    {
                        encode = HttpUtility.HtmlDecode(sr.ReadToEnd());
                    }
                }

                download_pic(encode);

                Parser   par   = new Parser(new Lexer(encode));
                NodeList nodes = par.ExtractAllNodesThatMatch(new TagNameFilter("a"));

                for (int i = 0; i < nodes.Count; i++)
                {
                    ITag tag = nodes[i] as ITag;
                    if (tag == null)
                    {
                        continue;
                    }

                    string href = tag.GetAttribute("href");

                    // Skip anchors without a target and targets that were already visited.
                    if (string.IsNullOrEmpty(href) || links.Contains(href))
                    {
                        continue;
                    }

                    links.Add(href);

                    string target  = "http://taylorpictures.net/" + href;
                    string message = "accessing " + target;

                    Console.WriteLine(message);
                    // Appends UTF-8 text without a byte-order mark, like the manual
                    // FileStream write it replaces.
                    File.AppendAllText(@"e:/Photos/crawl_log.txt", message + "\r\n");

                    download_url(target);
                }
            }
            catch
            {
                Console.WriteLine("404");
            }
        }
예제 #16
0
파일: Fetch.cs 프로젝트: zq535228/renzhex3
        /// <summary>
        /// Downloads a post-list page and extracts, from its <c>li</c> entries, the post times,
        /// post URLs and post titles.
        /// </summary>
        /// <remarks>
        /// Steps performed:
        /// 1. Fetch the page with <c>GetDataFromUrl</c> and strip style/select/script blocks and HTML comments.
        /// 2. Keep only the <c>li</c> elements whose HTML contains a "two digits, colon, two digits" time pattern.
        /// 3. Append every node accepted by the time <c>RegexFilter</c> to <paramref name="listTime"/>.
        /// 4. Append the href and text of every <c>a</c> tag found in the kept entries to
        ///    <paramref name="listUrl"/> and <paramref name="listTitle"/>.
        /// Times and links are collected in separate passes, so <paramref name="listTime"/> can end up
        /// with a different number of entries than the other two lists.
        /// </remarks>
        /// <param name="url">URL of the post-list page.</param>
        /// <param name="encode">Not read or written by this method; kept so existing callers still compile.</param>
        /// <param name="listUrl">Receives one entry per link (the href may be <c>null</c> when the tag has none).</param>
        /// <param name="listTitle">Receives the link text of each link after <c>ModifyRawText</c>.</param>
        /// <param name="listTime">Receives the time fragments after <c>ModifyRawText</c>.</param>
        public static void ItemRetrival_2(string url, ref Encoding encode, ref List <string> listUrl, ref List <string> listTitle,
                                          ref List <string> listTime)
        {
            // Fetch the page source and drop style/select/script blocks and HTML comments.
            string rawtext = GetDataFromUrl(url);
            string reg1    = @"<style[\s\S]+?/style>|<select[\s\S]+?/select>|<script[\s\S]+?/script>|<\!\-\-[\s\S]*?\-\->";

            rawtext = new Regex(reg1, RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(rawtext, "");

            // Parse out the li elements.
            Lexer      lexer     = new Lexer(rawtext);
            Parser     parser    = new Parser(lexer);
            NodeFilter filter    = new TagNameFilter("li");
            NodeList   htmlNodes = parser.Parse(filter);

            // Drop the entries that do not contain a time; iterate backwards so removal
            // does not shift the indices still to be visited.
            Regex f2 = new Regex(@"\d\d:\d\d");

            for (int i = htmlNodes.Count - 1; i >= 0; i--)
            {
                if (!f2.IsMatch(htmlNodes[i].ToHtml()))
                {
                    htmlNodes.Remove(i);
                }
            }

            // Collect the time fragments of every remaining entry.
            RegexFilter rf = new RegexFilter(@"\d\d:\d\d");

            for (int i = 0; i < htmlNodes.Count; i++)
            {
                Parser   parserTmp = new Parser(new Lexer(htmlNodes[i].ToHtml()));
                NodeList tmp       = parserTmp.Parse(rf);
                for (int j = 0; j < tmp.Count; j++)
                {
                    string temp = tmp[j].ToHtml();
                    ModifyRawText(ref temp);
                    listTime.Add(temp);
                }
            }

            // Extract the post URLs and titles from the a tags of the remaining entries.
            Parser   par3      = new Parser(new Lexer(htmlNodes.ToHtml()));
            NodeList atagNodes = par3.Parse(new TagNameFilter("a"));

            // Scheme + host of the list page, used to complete relative links.
            // Computed once (it does not depend on the link) and also recognises https
            // and list URLs that have no path component.
            string urlpart = new Regex(@"https?://[^/]+").Match(url).Value;

            for (int i = 0; i < atagNodes.Count; i++)
            {
                ATag   link  = (ATag)atagNodes.ElementAt(i);
                string temp1 = link.GetAttribute("href");
                string temp2 = link.StringText;

                // A link is absolute only when it STARTS with http/https; an "http" that merely
                // appears later in the href (e.g. in a query string) must not suppress the prefix.
                if (temp1 != null && !temp1.StartsWith("http", System.StringComparison.OrdinalIgnoreCase))
                {
                    temp1 = urlpart + temp1;
                }
                ModifyRawText(ref temp2);
                listUrl.Add(temp1);
                listTitle.Add(temp2);
            }
        }
예제 #17
0
        /// <summary>
        /// Crawls the paged listing at <c>SiteUrl</c> and builds one <c>InviteInfo</c> for every
        /// row of the listing table that links to a detail page.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> items have been collected;
        /// when <c>true</c>, every page is processed.
        /// </param>
        /// <returns>The collected items; an empty list when the first page cannot be downloaded.</returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list    = new ArrayList();
            int    pageInt = 1; // page count; stays 1 when no pager link yields a number
            string html    = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8);
            }
            catch
            {
                return(list);
            }

            // Determine the page count from the pager div.
            Parser   parser  = new Parser(new Lexer(html));
            NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "AspNetPager1")));

            if (tdNodes != null && tdNodes.Count > 0)
            {
                parser = new Parser(new Lexer(tdNodes.ToHtml()));
                NodeList pageList = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (pageList != null && pageList.Count > 0)
                {
                    // Look the anchors up once instead of once per iteration.
                    NodeList anchors = pageList.SearchFor(typeof(ATag), true);

                    // Walk the pager links from the end; the first one whose target reduces
                    // to a number gives the page count.
                    for (int i = pageList.Count - 1; i >= 0; i--)
                    {
                        try
                        {
                            ATag   pagerLink = anchors[i] as ATag;
                            string pageTemp  = pagerLink.Link.Replace("main.aspx?flg=3&id=6&page=", "");
                            pageInt = int.Parse(pageTemp);
                            break;
                        }
                        catch
                        {
                            // Best effort: this link does not carry a page number, try the previous one.
                        }
                    }
                }
            }

            // Hoisted: the pattern does not depend on the row being processed.
            Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");

            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl + "&page=" + i.ToString()), Encoding.UTF8);
                    }
                    catch
                    {
                        // A page that fails to download is skipped so the items gathered so far are kept.
                        continue;
                    }
                }
                parser = new Parser(new Lexer(html));
                NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "760")));

                if (nodeList == null || nodeList.Count == 0)
                {
                    continue;
                }

                TableTag table = nodeList[0] as TableTag;
                if (table == null)
                {
                    continue;
                }

                for (int j = 0; j < table.RowCount; j++)
                {
                    string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty,
                           inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty,
                           endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                           otherType = string.Empty, HtmlTxt = string.Empty;
                    TableRow tr = table.Rows[j];

                    beginDate = tr.Columns[2].ToPlainTextString().Trim();
                    ATag aTag = tr.Columns[1].SearchFor(typeof(ATag), true)[0] as ATag;
                    if (aTag == null)
                    {
                        // Row without a detail link: nothing to crawl.
                        continue;
                    }
                    prjName = aTag.LinkText;
                    InfoUrl = "http://www.uho.cn/" + aTag.Link;

                    string htmldetail = string.Empty;
                    try
                    {
                        // Download the detail page ONCE and derive both variants from it.
                        string rawDetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);

                        // Variant 1: original casing, used for the stored HTML.
                        Parser   dtlparserHTML = new Parser(new Lexer(rawDetail.Replace("&nbsp;", "").Trim()));
                        NodeList dtnodeHTML    = dtlparserHTML.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("bordercolor", "#FFFFFF"), new TagNameFilter("table")));
                        HtmlTxt = dtnodeHTML.AsHtml();

                        // Variant 2: lower-cased, line breaks made explicit, used for the plain text.
                        htmldetail = rawDetail.ToLower().Replace("&nbsp;", "").Replace("</br>", "\r\n").Replace("<br>", "\r\n").Replace("<br/>", "\r\n");
                        htmldetail = regexHtml.Replace(htmldetail, "");
                    }
                    catch
                    {
                        continue;
                    }

                    Parser   dtlparser = new Parser(new Lexer(htmldetail));
                    NodeList dtnode    = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("bordercolor", "#ffffff"), new TagNameFilter("table")));
                    inviteCtx  = dtnode.AsString();
                    specType   = "其他";
                    msgType    = "深圳市友和保险经纪有限公司";
                    inviteType = ToolHtml.GetInviteTypes(prjName);
                    InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                    list.Add(info);

                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return(list);
                    }
                }
            }
            return(list);
        }
예제 #18
0
        /// <summary>
        /// Crawls the paged tender list (<c>ul.zhaobiao_list</c>) starting at <c>SiteUrl</c>, opens
        /// every linked detail page and builds one <c>InviteInfo</c> per detail page that contains
        /// a <c>div.right_content</c> block.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as 20 items have been collected;
        /// when <c>true</c>, every page is processed.
        /// </param>
        /// <returns>
        /// The collected items. The list is empty when the first page cannot be downloaded or
        /// when it contains no <c>div.fenye123</c> pager block.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list    = new ArrayList();
            int    pageInt = 1; // page count; stays 1 when it cannot be read from the pager text
            string html    = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default);
            }
            catch
            {
                return(list);
            }

            Parser   parser  = new Parser(new Lexer(html));
            NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "fenye123")));

            // Nothing is crawled when the pager block is missing (unchanged behaviour).
            if (tdNodes == null || tdNodes.Count == 0)
            {
                return(list);
            }

            string pageTemp = tdNodes.AsString().Replace("&nbsp;", "").Trim();
            try
            {
                pageInt = int.Parse(ToolHtml.GetRegexString(pageTemp, "共", "页"));
            }
            catch
            {
                // Best effort: keep the default of a single page.
            }

            // Field patterns, built once instead of once per item.
            // NOTE(review): the two alternatives of "(:|:)" are identical as written; presumably one
            // was meant to be the full-width colon - confirm against the original file encoding.
            Regex regexName    = new Regex(@"(工程名称|项目名称)(:|:)[^\r\n]+\r\n");
            Regex regexCode    = new Regex(@"(工程编号|招标编号)(:|:)[^\r\n]+\r\n");
            Regex regexAddress = new Regex(@"(地址|项目地址)(:|:)[^\r\n]+\r\n");
            Regex regexUnit    = new Regex(@"(招标单位|招标机构)(:|:)[^\r\n]+\r\n");
            Regex regexDate    = new Regex(@"(开始日期|发布日期)(:|:)[^\r\n]+\r\n");

            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode("http://www.sz-otc.com/zhaobiao/index_" + i.ToString()) + ".html", Encoding.Default);
                    }
                    catch
                    {
                        continue;
                    }
                }

                parser = new Parser(new Lexer(html));
                NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "zhaobiao_list")));
                if (nodeList == null || nodeList.Count == 0)
                {
                    continue;
                }

                for (int j = 0; j < nodeList.Count; j++)
                {
                    Parser   ul     = new Parser(new Lexer(nodeList[j].ToHtml()));
                    NodeList liList = ul.ExtractAllNodesThatMatch(new TagNameFilter("li"));
                    if (liList == null || liList.Count == 0)
                    {
                        continue;
                    }

                    // Look the anchors up once per list instead of once per item.
                    NodeList anchors = liList.SearchFor(typeof(ATag), true);

                    for (int k = 0; k < liList.Count; k++)
                    {
                        string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty,
                               inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty,
                               endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty,
                               otherType = string.Empty, HtmlTxt = string.Empty;

                        ATag aTag = k < anchors.Count ? anchors[k] as ATag : null;
                        if (aTag == null)
                        {
                            // No link available for this item: nothing to crawl.
                            continue;
                        }

                        InfoUrl = "http://www.sz-otc.com" + aTag.Link;
                        prjName = aTag.LinkText.Replace("[新]", "").Replace("&#160;", "");

                        // Drop a leading "[...]" tag: keep only what follows the first ']'.
                        int beg = prjName.IndexOf(']');
                        if (beg >= 0)
                        {
                            prjName = prjName.Substring(beg + 1);
                        }

                        string htmldetail = string.Empty;
                        try
                        {
                            htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
                        }
                        catch
                        {
                            // Skip only this item. Returning null here used to throw away every
                            // item already collected and handed callers a null list.
                            continue;
                        }

                        Parser   dtlparser = new Parser(new Lexer(htmldetail));
                        NodeList dtnode    = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "right_content"), new TagNameFilter("div")));
                        if (dtnode == null || dtnode.Count == 0)
                        {
                            continue;
                        }

                        HtmlTxt   = dtnode.ToHtml();
                        inviteCtx = dtnode.AsString().Replace("&nbsp;", "").Replace(" ", "");
                        // Variant of the text in which the "点击" / "发布人" markers become line
                        // breaks, so that the code and date patterns can terminate on them.
                        string invite = inviteCtx.Replace("点击", "\r\n").Replace("发布人", "\r\n");

                        if (string.IsNullOrEmpty(prjName))
                        {
                            prjName = regexName.Match(inviteCtx).Value.Replace("工程名称", "").Replace("项目名称", "").Replace(":", "").Replace(":", "").Trim();
                        }

                        code = regexCode.Match(invite).Value.Replace("工程编号", "").Replace("招标编号", "").Replace(":", "").Replace(":", "").Trim();

                        // Strip the longer label first; removing "地址" first turned "项目地址"
                        // into a stray "项目" prefix on the extracted address.
                        prjAddress = regexAddress.Match(inviteCtx).Value.Replace("项目地址", "").Replace("地址", "").Replace(":", "").Replace(":", "").Trim();

                        buildUnit = regexUnit.Match(inviteCtx).Value.Replace("招标单位", "").Replace("招标机构", "").Replace(":", "").Replace(":", "").Trim();

                        beginDate = regexDate.Match(invite).Value.Replace("开始日期", "").Replace("发布日期", "").Replace(":", "").Replace(":", "").Trim();

                        // Keep at most the first 10 characters of the date text.
                        if (beginDate.Length > 10)
                        {
                            beginDate = beginDate.Substring(0, 10);
                        }

                        specType   = "其他";
                        msgType    = "深圳市东方招标有限公司";
                        inviteType = ToolHtml.GetInviteTypes(prjName);
                        InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                        list.Add(info);

                        if (!crawlAll && list.Count >= 20)
                        {
                            return(list);
                        }
                    }
                }
            }
            return(list);
        }
예제 #19
0
        /// <summary>
        /// Crawls the paged document table (<c>topicChrList_20070702_table</c>) at <c>SiteUrl</c>,
        /// opens every linked document and builds one <c>NotifyInfo</c> per document that has a
        /// <c>body</c> element.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> items have been collected;
        /// when <c>true</c>, every page is processed.
        /// </param>
        /// <returns>The collected items; an empty list when the first page cannot be downloaded.</returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list    = new ArrayList();
            int    pageInt = 1; // page count; stays 1 when the page selector cannot be read
            string html    = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8);
            }
            catch
            {
                return(list);
            }

            // The page count is the number of options of the page <select>.
            Parser   parser   = new Parser(new Lexer(html));
            NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("select"), new HasAttributeFilter("name", "__ec_pages")));

            if (pageList != null && pageList.Count > 0)
            {
                try
                {
                    SelectTag selectTag = pageList[0] as SelectTag;
                    pageInt = selectTag.OptionTags.Length;
                }
                catch
                {
                    // Best effort: keep the default of a single page.
                }
            }

            // Hoisted: the pattern does not depend on the row being processed.
            Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");

            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    // Pages after the first are requested by posting the table's paging form fields.
                    NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] {
                        "ec_i", "topicChrList_20070702_crd", "topicChrList_20070702_f_a",
                        "topicChrList_20070702_p", "topicChrList_20070702_s_name", "id", "method", "__ec_pages",
                        "topicChrList_20070702_rd", "topicChrList_20070702_f_name", "topicChrList_20070702_f_ldate"
                    }, new string[] {
                        "topicChrList_20070702", "20", "", i.ToString(), "", "709", "view", i.ToString(), "20", "", ""
                    });
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.UTF8);
                    }
                    catch
                    {
                        continue;
                    }
                }

                parser = new Parser(new Lexer(html));
                NodeList dtList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("id", "topicChrList_20070702_table"), new TagNameFilter("table")));
                if (dtList == null || dtList.Count == 0)
                {
                    continue;
                }

                TableTag table = dtList[0] as TableTag;
                if (table == null)
                {
                    continue;
                }

                // The first three rows are not read as data rows.
                for (int j = 3; j < table.RowCount; j++)
                {
                    TableRow tr = table.Rows[j];
                    string   headName = string.Empty, releaseTime = string.Empty, infoScorce = string.Empty, msgType = string.Empty,
                             infoUrl = string.Empty, ctxHtml = string.Empty, infoCtx = string.Empty, infoType = string.Empty;
                    headName    = tr.Columns[1].ToPlainTextString().Trim();
                    releaseTime = tr.Columns[2].ToPlainTextString().Trim();
                    infoType    = "政策法规";

                    ATag aTag = tr.Columns[1].SearchFor(typeof(ATag), true)[0] as ATag;
                    if (aTag == null)
                    {
                        // Row without a document link: nothing to crawl.
                        continue;
                    }
                    infoUrl = "http://www.szzfcg.cn/portal/documentView.do?method=view&id=" + aTag.Link.Replace("/viewer.do?id=", "");

                    string htmldeil = string.Empty;
                    try
                    {
                        htmldeil = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(infoUrl), Encoding.UTF8);
                    }
                    catch
                    {
                        continue;
                    }
                    htmldeil = regexHtml.Replace(htmldeil, "");

                    Parser   detailParser = new Parser(new Lexer(htmldeil));
                    NodeList noList       = detailParser.ExtractAllNodesThatMatch(new TagNameFilter("body"));
                    if (noList == null || noList.Count == 0)
                    {
                        continue;
                    }

                    ctxHtml = noList.AsHtml().Replace("<br/>", "\r\n").Replace("<BR/>", "");
                    infoCtx = noList.AsString().Replace(" ", "").Replace("&nbsp;", "").Replace("\t\t", "\t").Replace("\t\t", "\t");
                    infoCtx = Regex.Replace(infoCtx, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase).Replace(" ", "").Replace("\t", "");
                    msgType = "深圳政府采购";
                    // infoScorce is never populated by this crawler and is passed on empty.
                    NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "深圳市工程", string.Empty, infoCtx, infoType);
                    list.Add(info);

                    // The limit applies to partial crawls only. The previous condition was inverted:
                    // it capped full crawls and left partial crawls unbounded.
                    if (!crawlAll && list.Count >= this.MaxCount)
                    {
                        return(list);
                    }
                }
            }
            return(list);
        }
예제 #20
0
        /// <summary>
        /// Builds the order table from an order-list page: for every order link found in the
        /// 22nd table of <paramref name="strBuff"/>, reads the matching per-order detail file from
        /// disk and emits one row per article line of that order.
        /// </summary>
        /// <param name="strBuff">HTML of the order-list page.</param>
        /// <returns>
        /// A table with the columns ArticleNo, StoreCode, StoreName, OrderQuantity, OrderAmount,
        /// OrderNo, BarCode, TopStoreCode and TopStoreName. StoreName is always empty and
        /// OrderAmount is always "0".
        /// </returns>
        public DataTable OrderData(string strBuff)
        {
            #region Parse the HTML nodes of the page

            DataTable dt = new DataTable();
            dt.Columns.Add("ArticleNo");
            dt.Columns.Add("StoreCode");
            dt.Columns.Add("StoreName");
            dt.Columns.Add("OrderQuantity");
            dt.Columns.Add("OrderAmount");
            dt.Columns.Add("OrderNo");
            dt.Columns.Add("BarCode");
            dt.Columns.Add("TopStoreCode");
            dt.Columns.Add("TopStoreName");

            // The order links live in the table at index 21 of the list page.
            Parser   parserOrder    = new Parser(new Lexer(strBuff));
            NodeList htmlNodeOrders = parserOrder.Parse(new TagNameFilter("Table"));
            Parser   parserOrders   = new Parser(new Lexer(htmlNodeOrders[21].ToHtml()));
            NodeList htmlNodeOrder  = parserOrders.Parse(new TagNameFilter("A"));

            for (int i = 0; i < htmlNodeOrder.Count; i++)
            {
                // The order number is the first child of the link, or the second one when the
                // link's HTML contains a '*'.
                string strOrderNumber = htmlNodeOrder[i].Children[0].ToHtml();
                if (htmlNodeOrder[i].ToHtml().Contains("*"))
                {
                    strOrderNumber = htmlNodeOrder[i].Children[1].ToHtml();
                }

                // The store code is the text after the second '=' of the link's HREF.
                string strStoreCode = string.Empty;
                ITag   tag          = htmlNodeOrder[i] as ITag;
                if (tag != null && !tag.IsEndTag() &&
                    tag.Attributes != null && tag.Attributes.Count > 0 &&
                    tag.Attributes["HREF"] != null)
                {
                    strStoreCode = tag.Attributes["HREF"].ToString();
                    strStoreCode = strStoreCode.Split('=')[2];
                }

                if (string.IsNullOrEmpty(strStoreCode))
                {
                    continue;
                }

                string path2    = AppDomain.CurrentDomain.BaseDirectory + @"Areas\\BusinessData\Test" + $@"\b_{strOrderNumber.Trim('*')}_{strStoreCode}.txt";
                string strBuff2 = DataHelper.Read(path2, Encoding.Default);
                if (string.IsNullOrEmpty(strBuff2))
                {
                    continue;
                }

                // The article lines are the rows of the table at index 23 of the detail page.
                Parser   parser    = new Parser(new Lexer(strBuff2));
                NodeList htmlNodes = parser.Parse(new TagNameFilter("Table"));
                Parser   parsers   = new Parser(new Lexer(htmlNodes[23].ToHtml()));
                NodeList htmlNode  = parsers.Parse(new TagNameFilter("TR"));

                // Rows 0 and 1 are not read as article lines.
                for (int j = 2; j < htmlNode.Count; j++)
                {
                    string strArticleNo = htmlNode[j].Children[1].Children[0].ToHtml();
                    string strStoreName = string.Empty;

                    // The quantity cell is child 6 when the order number contains '*', child 5 otherwise.
                    string strOrderQuantity;
                    if (strOrderNumber.Contains("*"))
                    {
                        strOrderQuantity = htmlNode[j].Children[6].Children[0].ToHtml();
                    }
                    else
                    {
                        strOrderQuantity = htmlNode[j].Children[5].Children[0].ToHtml();
                    }

                    // Look up the bar code.
                    string barCode = DataHelper.GetBarCode(strArticleNo, _dtGoods);

                    // Look up the customer number and customer short name.
                    string strTopStoreCode = "";
                    string strTopStoreName = "";
                    DataHelper.GetTopStrore(strStoreCode, _dtTopEnterprise, ref strTopStoreCode, ref strTopStoreName);

                    DataRow dr = dt.NewRow();
                    dr[0] = strArticleNo;
                    dr[1] = strStoreCode;
                    dr[2] = strStoreName;
                    dr[3] = strOrderQuantity;
                    dr[4] = "0";
                    dr[5] = strOrderNumber.Trim('*');
                    // These three lookups were computed but never stored, leaving the BarCode,
                    // TopStoreCode and TopStoreName columns empty.
                    dr[6] = barCode;
                    dr[7] = strTopStoreCode;
                    dr[8] = strTopStoreName;
                    dt.Rows.Add(dr);
                }
            }
            #endregion

            return(dt);
        }
예제 #21
0
        /// <summary>
        /// Builds the inventory table from the second table of <paramref name="strBuff"/>,
        /// emitting one row per visited child node of that table.
        /// </summary>
        /// <param name="strBuff">HTML of the inventory page.</param>
        /// <returns>
        /// A table with the columns ArticleNo, StoreCode, StoreName, Inventory, Date, BarCode,
        /// TopStoreCode and TopStoreName. Date is today's local date formatted as yyyy-MM-dd.
        /// </returns>
        public DataTable InventoryData(string strBuff)
        {
            #region Parse the HTML nodes of the page

            DataTable result = new DataTable();
            foreach (string columnName in new[] { "ArticleNo", "StoreCode", "StoreName", "Inventory", "Date", "BarCode", "TopStoreCode", "TopStoreName" })
            {
                result.Columns.Add(columnName);
            }

            // Single pass (the original paging loop ran exactly once).
            Parser   tableParser = new Parser(new Lexer(strBuff));
            NodeList tables      = tableParser.Parse(new TagNameFilter("Table"));

            string articleNo = "";
            int    span      = 1; // value returned by GetStrArticleNo for the current article cell
            int    seen      = 1; // rows consumed so far for the current article

            // Whitespace between rows counts as a child node too, hence the step of 2.
            for (int i = 9; i <= tables[1].Children.Count - 4; i += 2)
            {
                var row = tables[1].Children[i];

                // The previous article's span is used up: the next row starts a new article.
                if (seen == span)
                {
                    span = 1;
                }

                string storeCode;
                string storeName;
                if (span <= 1)
                {
                    // First row of an article: parse the article cell, which also yields the span.
                    var articleCellHtml = row.Children[1].ToHtml().Replace("  ", " ").Replace("\"", "");
                    span      = GetStrArticleNo(articleCellHtml);
                    articleNo = row.Children[1].ToPlainTextString();
                    storeCode = row.Children[9].ToPlainTextString();
                    storeName = row.Children[9].ToPlainTextString();
                    seen      = 1;
                }
                else
                {
                    // Continuation row: the article number carries over from the previous row
                    // and the store text is read from child 1.
                    storeCode = row.Children[1].ToPlainTextString();
                    storeName = row.Children[1].ToPlainTextString();
                    if (seen <= span)
                    {
                        seen++;
                    }
                }

                // The inventory figure is the second-to-last child of the row.
                string inventory = row.Children[row.Children.Count - 2].ToPlainTextString();
                string today     = DateTime.Now.ToString("yyyy-MM-dd");

                // Look up the bar code.
                string barCode = DataHelper.GetBarCode(articleNo, _dtGoods);

                // Look up the customer number and customer short name.
                string topStoreCode = "";
                string topStoreName = "";
                DataHelper.GetTopStrore(storeCode, _dtTopEnterprise, ref topStoreCode, ref topStoreName);

                DataRow record = result.NewRow();
                record["ArticleNo"]    = articleNo;
                record["StoreCode"]    = storeCode;
                record["StoreName"]    = storeName;
                record["Inventory"]    = inventory;
                record["Date"]         = today;
                record["BarCode"]      = barCode;
                record["TopStoreCode"] = topStoreCode;
                record["TopStoreName"] = topStoreName;
                result.Rows.Add(record);
            }

            #endregion

            return(result);
        }
예제 #22
0
        /// <summary>
        /// Crawls the paged project list at <c>SiteUrl</c>, follows the detail link built from every
        /// data row of the <c>ctl00_Main_GV_New</c> table and turns each detail page into a
        /// <c>BaseProject</c>.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> projects have been collected.
        /// </param>
        /// <returns>
        /// The projects collected so far. The list is empty when the first page cannot be downloaded
        /// or its page counter cannot be parsed.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList  list            = new ArrayList();
            string htl             = string.Empty;
            string cookiestr       = string.Empty;
            string viewState       = string.Empty;
            string eventValidation = string.Empty;
            int    page            = 1;

            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8, ref cookiestr);
            }
            catch (Exception)
            {
                // Without the first page there is nothing to crawl.
                return(list);
            }

            Parser   parser  = new Parser(new Lexer(htl));
            NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "ctl00_Main_paging_LblPageCount")));

            // The page counter is optional; when it is missing only the first page is processed.
            if (tdNodes != null && tdNodes.Count > 0)
            {
                try
                {
                    page = int.Parse(tdNodes[0].ToPlainTextString().Trim());
                }
                catch { return(list); }
            }

            // The patterns do not depend on the row, so they are built once instead of once per row.
            Regex regPrjAddr     = new Regex(@"(工程地点|工程地址)(:|:)[^\r\n]+\r\n");
            Regex regpProspUnit  = new Regex(@"勘查单位(:|:)[^\r\n]+\r\n");
            Regex regpDesignUnit = new Regex(@"设计单位(:|:)[^\r\n]+\r\n");
            Regex regpSuperUnit  = new Regex(@"监理单位(:|:)[^\r\n]+\r\n");
            Regex regConst       = new Regex(@"施工单位(:|:)[^\r\n]+\r\n");

            for (int i = 1; i <= page; i++)
            {
                if (i > 1)
                {
                    if (i < 3)
                    {
                        // Only reached for i == 2, i.e. the state fields are read once from the initial page.
                        // NOTE(review): presumably later responses are partial postbacks without these fields - confirm.
                        viewState       = this.ToolWebSite.GetAspNetViewState(htl);
                        eventValidation = this.ToolWebSite.GetAspNetEventValidation(htl);
                    }
                    NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] {
                        "ctl00$ScriptManager1",
                        "__EVENTTARGET",
                        "__EVENTARGUMENT",
                        "ctl00$Main$ddl_type",
                        "ctl00$Main$txt_Title",
                        "ctl00$Main$paging$txtPageIndex",
                        "__VIEWSTATE",
                        "__VIEWSTATEENCRYPTED",
                        "__EVENTVALIDATION",
                        "__ASYNCPOST",
                        "ctl00$Main$paging$btnForward.x",
                        "ctl00$Main$paging$btnForward.y"
                    }, new string[] {
                        "ctl00$UpdatePanel1|ctl00$Main$paging$btnForward",
                        string.Empty,
                        string.Empty,
                        "1",
                        string.Empty,
                        i.ToString(),
                        viewState, "", eventValidation, "true", "8", "9"
                    });
                    try
                    {
                        htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.UTF8);
                    }
                    catch (Exception)
                    {
                        // Skip a page that cannot be downloaded instead of aborting the whole crawl.
                        continue;
                    }
                }

                parser = new Parser(new Lexer(htl));
                NodeList tableList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_Main_GV_New")));
                if (tableList != null && tableList.Count > 0)
                {
                    TableTag table = (TableTag)tableList[0];

                    // Row 0 is skipped (presumably the header row - confirm against the page).
                    for (int j = 1; j < table.RowCount; j++)
                    {
                        string pUrl = string.Empty, pInfoSource = string.Empty, pBeginDate = string.Empty,
                               pBuilTime = string.Empty, pEndDate = string.Empty, pConstUnit = string.Empty,
                               pSuperUnit = string.Empty, pDesignUnit = string.Empty, pProspUnit = string.Empty,
                               pInviteArea = string.Empty, pBuildArea = string.Empty, pPrjClass = string.Empty,
                               pProClassLevel = string.Empty, pChargeDept = string.Empty, pPrjAddress = string.Empty,
                               pBuildUnit = string.Empty, pPrjCode = string.Empty, PrjName = string.Empty,
                               pCreatetime = string.Empty;
                        TableRow tr = table.Rows[j];
                        PrjName    = tr.Columns[2].ToPlainTextString().Trim();
                        pBuildUnit = tr.Columns[3].ToPlainTextString().Trim();
                        string aLink = string.Empty;
                        ATag   aTag  = new ATag();
                        try
                        {
                            // The row carries its target in an "ondblclick" attribute. Rewriting the opening
                            // <tr ...> into an <a href=...> element lets the parser extract it as a link.
                            aLink  = tr.ToHtml().Replace("ondblclick", "href").Replace("<tr", "<a");
                            aLink  = aLink.Remove(aLink.IndexOf("<td")) + "</a>";
                            parser = new Parser(new Lexer(aLink));
                            NodeFilter a     = new TagNameFilter("a");
                            NodeList   aList = parser.ExtractAllNodesThatMatch(a);
                            if (aList != null && aList.Count > 0)
                            {
                                aTag = aList.SearchFor(typeof(ATag), true)[0] as ATag;
                            }
                            if (aTag.Link.Contains("PrjManager") || aTag.Link.Contains("View"))
                            {
                                // Keep everything up to and including "View", then rebase it on the site root.
                                // A link without "View" or "PrjManager" makes Remove/Substring throw, which the
                                // catch below turns into skipping the row.
                                pUrl = aTag.Link.Remove(aTag.Link.IndexOf("View")).Replace("&amp;", "&") + "View";
                                int index = pUrl.IndexOf("PrjManager");
                                pUrl = "http://www.szbajs.gov.cn/SiteManage/" + pUrl.Substring(index);
                            }
                            else
                            {
                                continue;
                            }
                        }
                        catch (Exception)
                        {
                            // Rows from which no detail URL can be derived are skipped.
                            continue;
                        }

                        string htmldetail = string.Empty;
                        try
                        {
                            htmldetail = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(pUrl), Encoding.UTF8);
                        }
                        catch (Exception)
                        {
                            continue;
                        }

                        Parser   parserdetail = new Parser(new Lexer(htmldetail));
                        NodeList dtnode       = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "data_con")));

                        // The null test has to come first; the original evaluated Count before it.
                        if (dtnode != null && dtnode.Count > 0)
                        {
                            pInfoSource = dtnode.AsString().Replace(" ", "");

                            // Each field is the text that follows its label up to the end of that line.
                            pPrjAddress = regPrjAddr.Match(pInfoSource).Value.Replace("工程地址", "").Replace("工程地点", "").Replace(":", "").Replace(":", "").Trim();
                            pProspUnit  = regpProspUnit.Match(pInfoSource).Value.Replace("勘查单位", "").Replace(":", "").Replace(":", "").Trim();
                            pDesignUnit = regpDesignUnit.Match(pInfoSource).Value.Replace("设计单位", "").Replace(":", "").Replace(":", "").Trim();
                            pSuperUnit  = regpSuperUnit.Match(pInfoSource).Value.Replace("监理单位", "").Replace(":", "").Replace(":", "").Trim();
                            pConstUnit  = regConst.Match(pInfoSource).Value.Replace("施工单位", "").Replace(":", "").Replace(":", "").Trim();

                            if (string.IsNullOrEmpty(pChargeDept))
                            {
                                pChargeDept = "宝安区建设局";
                            }
                            BaseProject info = ToolDb.GenBaseProject("广东省", pUrl, "深圳市宝安区", pInfoSource, pBuilTime, pBeginDate, pEndDate, pConstUnit, pSuperUnit, pDesignUnit, pProspUnit, pInviteArea,
                                                                     pBuildArea, pPrjClass, pProClassLevel, pChargeDept, pPrjAddress, pBuildUnit, pPrjCode, PrjName, pCreatetime, "深圳市宝安区建设局");

                            list.Add(info);
                            if (!crawlAll && list.Count >= this.MaxCount)
                            {
                                return(list);
                            }
                        }
                    }
                }
            }
            return(list);
        }
예제 #23
0
        /// <summary>
        /// Crawls the paged tender list at <c>SiteUrl</c>, downloads the detail page of every row of the
        /// <c>topicChrList_20070702_table</c> table, stores it as an <c>InviteInfo</c> through
        /// <c>ToolDb.SaveEntity</c> and stores the attachments linked from the detail page.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, processing stops once <c>MaxCount</c> entries have been handled.
        /// </param>
        /// <returns>
        /// The local list, to which nothing is added in this method (entries are saved directly),
        /// or <c>null</c> when the <c>MaxCount</c> limit stops the crawl.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList list     = new List <InviteInfo>();
            int   sqlCount = 0;
            int   count    = 0;
            // Number of pages to visit.
            int    pageInt = 1;
            string html    = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8);
            }
            catch
            {
                return(list);
            }
            Parser   parser   = new Parser(new Lexer(html));
            NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("select"), new HasAttributeFilter("name", "__ec_pages")));

            if (pageNode != null && pageNode.Count > 0)
            {
                // "as" yields null when the node is not a SelectTag; keep the default of one page then.
                SelectTag selectTag = pageNode[0] as SelectTag;
                if (selectTag != null)
                {
                    pageInt = selectTag.OptionTags.Length;
                }
            }
            string cookiestr = string.Empty;

            // The patterns do not depend on the row, so they are built once instead of once per row.
            Regex regexLink = new Regex(@"id=[^-]+");
            Regex regCtx    = new Regex(@"[\r\n]+");
            Regex regcode   = new Regex(@"(招标编号|项目编号)(:|:)([0-9]|[A-Za-z]|[-])+");
            Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");

            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] { "ec_i", "topicChrList_20070702_crd", "topicChrList_20070702_f_a", "topicChrList_20070702_p", "topicChrList_20070702_s_name", "id", "method", "__ec_pages", "topicChrList_20070702_rd", "topicChrList_20070702_f_name", "topicChrList_20070702_f_ldate" }, new string[] { "topicChrList_20070702", "20", string.Empty, i.ToString(), string.Empty, "1660", "view", (i - 1).ToString(), "20", string.Empty, string.Empty });
                        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.UTF8, ref cookiestr);
                    }
                    catch
                    {
                        // "html" still holds the previous page; skip this page rather than
                        // processing the previous page's rows a second time.
                        continue;
                    }
                }
                parser = new Parser(new Lexer(html));
                NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "topicChrList_20070702_table")));
                if (tdNodes != null && tdNodes.Count > 0)
                {
                    TableTag table = tdNodes[0] as TableTag;

                    // Rows 0-2 are skipped (presumably header/filter rows - confirm against the page).
                    for (int t = 3; t < table.RowCount; t++)
                    {
                        string   code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty, HtmlTxt = string.Empty;
                        TableRow tr = table.Rows[t];
                        prjName    = tr.Columns[2].ToPlainTextString().Trim().ToRegString();
                        inviteType = tr.Columns[3].ToPlainTextString().Trim();
                        beginDate  = tr.Columns[4].ToPlainTextString().Trim();

                        ATag aTag = tr.Columns[2].SearchFor(typeof(ATag), true)[0] as ATag;

                        // The detail URL is rebuilt from the "id=..." fragment of the row's link.
                        InfoUrl = "http://www.szzfcg.cn/portal/documentView.do?method=view&" + regexLink.Match(aTag.Link).Value;
                        string htmldetail = string.Empty;
                        try
                        {
                            // Download once and derive both variants from the same response
                            // (the original requested the identical URL twice).
                            string rawDetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).Replace("&nbsp;", "");
                            htmldetail = rawDetail.Trim();
                            Parser   dtlparserHTML = new Parser(new Lexer(htmldetail));
                            NodeList dtnodeHTML    = dtlparserHTML.ExtractAllNodesThatMatch(new TagNameFilter("body"));
                            HtmlTxt    = dtnodeHTML.AsHtml();
                            // Line breaks are converted only for the plain-text extraction below.
                            htmldetail = rawDetail.Replace("</br>", "\r\n").Replace("<br>", "\r\n");
                        }
                        catch (Exception)
                        {
                            // Best effort: the fallbacks below work with whatever was obtained so far.
                        }
                        Parser   dtlparser = new Parser(new Lexer(htmldetail));
                        NodeList dtnode    = dtlparser.ExtractAllNodesThatMatch(new TagNameFilter("body"));
                        inviteCtx = dtnode.AsString().Replace(" ", "").Replace("\t", "").Trim("\r\n".ToCharArray()).Replace("&ldquo;", "“").Replace("&rdquo;", "”").Replace("双击鼠标自动滚屏[打印此页][关闭此页]", "");
                        inviteCtx = System.Web.HttpUtility.HtmlDecode(inviteCtx);
                        // Collapse runs of line breaks into a single CRLF.
                        inviteCtx = regCtx.Replace(inviteCtx, "\r\n");
                        code = regcode.Match(inviteCtx).Value.Replace("招标编号", "").Replace("项目编号", "").Replace(":", "").Replace(":", "").Trim();

                        // Fallback 1: take the <body> text and markup without any cleanup.
                        if (string.IsNullOrEmpty(inviteCtx) || string.IsNullOrEmpty(HtmlTxt))
                        {
                            parser = new Parser(new Lexer(htmldetail));
                            NodeFilter filter  = new TagNameFilter("body");
                            NodeList   ctxList = parser.ExtractAllNodesThatMatch(filter);
                            inviteCtx = ctxList.AsString();
                            HtmlTxt   = ctxList.AsHtml();
                        }
                        // Fallback 2: strip script/style/xml blocks and all tags with regular expressions.
                        if (string.IsNullOrEmpty(inviteCtx) || string.IsNullOrEmpty(HtmlTxt))
                        {
                            HtmlTxt   = regexHtml.Replace(htmldetail, "");
                            inviteCtx = Regex.Replace(HtmlTxt, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase).Replace(" ", "").Replace("\t", "");
                        }
                        msgType    = "深圳政府采购";
                        specType   = "政府采购";
                        prjAddress = "深圳市";
                        if (inviteType.Contains("160"))
                        {
                            inviteType = ToolHtml.GetInviteTypes(prjName);
                        }
                        InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳政府采购", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, string.Empty, InfoUrl, HtmlTxt);
                        if (!crawlAll && sqlCount >= this.MaxCount)
                        {
                            // NOTE(review): this path returns null while the other exits return the list -
                            // kept as is because callers may rely on it; confirm.
                            return(null);
                        }
                        sqlCount++;
                        if (ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate, this.ExistsHtlCtx))
                        {
                            count++;
                            parser = new Parser(new Lexer(htmldetail));
                            NodeList fileNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                            if (fileNode != null && fileNode.Count > 0)
                            {
                                for (int f = 0; f < fileNode.Count; f++)
                                {
                                    ATag tag = fileNode[f] as ATag;
                                    if (tag.IsAtagAttach())
                                    {
                                        try
                                        {
                                            // Links containing ".com" or ".cn" are used as given; all others
                                            // are prefixed with the site root.
                                            BaseAttach attach = null;
                                            if (tag.Link.ToLower().Contains(".com") || tag.Link.ToLower().Contains(".cn"))
                                            {
                                                attach = ToolHtml.GetBaseAttachByUrl(tag.Link.Replace("&amp;", "&"), tag.LinkText, info.Id, "SiteManage\\Files\\InviteAttach\\");
                                            }
                                            else
                                            {
                                                attach = ToolHtml.GetBaseAttachByUrl("http://www.szzfcg.cn" + tag.Link.Replace("&amp;", "&"), tag.LinkText, info.Id, "SiteManage\\Files\\InviteAttach\\");
                                            }
                                            if (attach != null)
                                            {
                                                ToolDb.SaveEntity(attach, "SourceID,AttachServerPath");
                                            }
                                        }
                                        catch
                                        {
                                            // A failing attachment must not stop the remaining ones.
                                        }
                                    }
                                }
                            }
                            if (count >= 10)
                            {
                                // Pause for 300 seconds after every 10 saved entries.
                                count = 0;
                                Thread.Sleep(1000 * 300);
                            }
                        }
                    }
                }
            }
            return(list);
        }
예제 #24
0
        /// <summary>
        /// Extracts sales rows from the second table found in <paramref name="strBuff"/> and returns
        /// them as a <see cref="DataTable"/>.
        /// </summary>
        /// <param name="strBuff">HTML source of the sales report page.</param>
        /// <returns>
        /// A table with the columns ArticleNo, BarCode, StoreCode, StoreName, Sales, Date,
        /// TopStoreCode and TopStoreName, one row per parsed report row.
        /// </returns>
        public DataTable SalesData(string strBuff)
        {
            #region 分析网页html节点

            DataTable result = new DataTable();
            string[] columnNames =
            {
                "ArticleNo",    // article code
                "BarCode",      // article barcode
                "StoreCode",    // region / warehouse
                "StoreName",    // region / warehouse
                "Sales",        // sales
                "Date",
                "TopStoreCode",
                "TopStoreName"
            };
            foreach (string columnName in columnNames)
            {
                result.Columns.Add(columnName);
            }

            // Only a single page is processed.
            Parser     tableParser = new Parser(new Lexer(strBuff));
            NodeFilter tableFilter = new TagNameFilter("Table");
            NodeList   tables      = tableParser.Parse(tableFilter);

            // Whitespace between elements is a child node as well, hence the step of two.
            for (int rowIndex = 9; rowIndex <= tables[1].Children.Count - 4; rowIndex += 2)
            {
                var cells = tables[1].Children[rowIndex].Children;

                string articleNo = cells[1].ToPlainTextString();
                string barCode   = cells[5].ToPlainTextString();
                // NOTE(review): store code and store name are both read from child 11 - confirm this is intended.
                string storeCode = cells[11].ToPlainTextString();
                string storeName = cells[11].ToPlainTextString();

                string sales = "";
                if (cells != null)
                {
                    // Sales figure = value of child 13 minus value of child 15.
                    int sold = int.Parse(cells[13].ToPlainTextString());
                    int back = int.Parse(cells[15].ToPlainTextString());
                    sales = (sold - back).ToString();
                }
                string today = DateTime.Now.ToString("yyyy-MM-dd");

                // Look the article number up by barcode when the page does not provide it.
                if (string.IsNullOrEmpty(articleNo))
                {
                    articleNo = DataHelper.GetArticalNo(barCode, _dtGoods);
                }

                // Resolve the customer code and customer short name for the store.
                string topStoreCode = "";
                string topStoreName = "";
                DataHelper.GetTopStrore(storeCode, _dtTopEnterprise, ref topStoreCode, ref topStoreName);

                string[] values = { articleNo, barCode, storeCode, storeName, sales, today, topStoreCode, topStoreName };
                DataRow newRow = result.NewRow();
                for (int c = 0; c < values.Length; c++)
                {
                    newRow[c] = values[c];
                }
                result.Rows.Add(newRow);
            }

            #endregion

            return(result);
        }
예제 #25
0
        /// <summary>
        /// Crawls the paged result list at <c>SiteUrl</c>, downloads the detail page linked from every
        /// row of the 760-wide table and turns each one into a <c>BidInfo</c>.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> entries have been collected.
        /// </param>
        /// <returns>
        /// The entries collected so far; the list is empty when the first page cannot be downloaded.
        /// </returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            IList list = new ArrayList();
            // Number of pages to visit.
            int    pageInt = 1;
            string html    = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8);
            }
            catch (Exception)
            {
                return(list);
            }
            Parser   parser  = new Parser(new Lexer(html));
            NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "AspNetPager1")));

            if (tdNodes != null && tdNodes.Count > 0)
            {
                string htlPage = tdNodes.ToHtml();
                parser = new Parser(new Lexer(htlPage));
                NodeFilter filer    = new TagNameFilter("a");
                NodeList   pageList = parser.ExtractAllNodesThatMatch(filer);
                if (pageList != null && pageList.Count > 0)
                {
                    // Walk the pager links from the last one backwards and take the first link
                    // whose target ends in a parsable page number.
                    for (int i = pageList.Count - 1; i >= 0; i--)
                    {
                        try
                        {
                            ATag   aTag     = pageList.SearchFor(typeof(ATag), true)[i] as ATag;
                            string pageTemp = aTag.Link.Replace("main.aspx?flg=10&id=6&page=", "");
                            pageInt = int.Parse(pageTemp);
                            break;
                        }
                        catch (Exception)
                        {
                            // Not a numeric page link; try the previous one.
                        }
                    }
                }
            }

            Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");

            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl + "&page=" + i.ToString()), Encoding.UTF8);
                    }
                    catch
                    {
                        // "html" still holds the previous page; skip this page rather than
                        // adding the previous page's rows to the result a second time.
                        continue;
                    }
                }
                parser = new Parser(new Lexer(html));
                NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "760")));

                if (nodeList != null && nodeList.Count > 0)
                {
                    TableTag table = nodeList[0] as TableTag;
                    for (int j = 0; j < table.RowCount; j++)
                    {
                        string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty, code = string.Empty, beginDate = string.Empty, endDate = string.Empty, bidType = string.Empty, specType = string.Empty, InfoUrl = string.Empty, msgType = string.Empty, bidCtx = string.Empty, prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;

                        TableRow tr = table.Rows[j];
                        beginDate = tr.Columns[2].ToPlainTextString().Trim();
                        ATag aTag = tr.Columns[1].SearchFor(typeof(ATag), true)[0] as ATag;
                        prjName = aTag.LinkText;
                        InfoUrl = "http://www.uho.cn/" + aTag.Link;
                        string htmldetail = string.Empty;
                        try
                        {
                            // The page is lower-cased so the tag replacements and filters below match
                            // regardless of the casing used in the markup.
                            htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).ToLower().Replace("&nbsp;", "").Replace("</br>", "\r\n").Replace("<br>", "\r\n").Replace("<br/>", "\r\n");
                            htmldetail = regexHtml.Replace(htmldetail, "");
                        }
                        catch (Exception) { continue; }
                        parser = new Parser(new Lexer(htmldetail));
                        NodeList deaiList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "sdg")));
                        if (deaiList != null && deaiList.Count > 0)
                        {
                            HtmlTxt = deaiList.AsHtml();
                            bidCtx  = HtmlTxt.ToCtxString();
                            code    = bidCtx.GetRegexBegEnd("编号:", ")", 50);
                            if (!string.IsNullOrEmpty(code))
                            {
                                code = code.ToUpper();
                            }
                            string ctx = string.Empty;
                            parser = new Parser(new Lexer(HtmlTxt));
                            NodeList bidNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "100%")));
                            if (bidNode != null && bidNode.Count > 0)
                            {
                                TableTag bidTable = bidNode[0] as TableTag;
                                try
                                {
                                    // Pair every cell of row 0 with the cell below it in row 1,
                                    // producing "label:value" lines.
                                    for (int d = 0; d < bidTable.Rows[0].ColumnCount; d++)
                                    {
                                        ctx += bidTable.Rows[0].Columns[d].ToNodePlainString() + ":";
                                        ctx += bidTable.Rows[1].Columns[d].ToNodePlainString() + "\r\n";
                                    }
                                    bidUnit  = ctx.GetBidRegex();
                                    bidMoney = ctx.GetMoneyRegex();
                                }
                                catch
                                {
                                    // Unexpected table layout; the body-text fallbacks below take over.
                                }
                            }
                            if (string.IsNullOrEmpty(bidUnit))
                            {
                                bidUnit = bidCtx.GetBidRegex();
                            }
                            // Fall back to the body text when no amount was extracted. The empty case covers a
                            // missing or unreadable result table, where bidMoney was never assigned.
                            if (string.IsNullOrEmpty(bidMoney) || bidMoney == "0")
                            {
                                bidMoney = bidCtx.GetMoneyRegex();
                            }

                            specType = "其他";
                            msgType  = "深圳市友和保险经纪有限公司";
                            prjName  = ToolDb.GetPrjName(prjName);
                            prjName  = prjName.Replace(" ", "").Trim();
                            bidType  = ToolHtml.GetInviteTypes(prjName);
                            BidInfo info = ToolDb.GenBidInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
                            list.Add(info);

                            if (!crawlAll && list.Count >= this.MaxCount)
                            {
                                return(list);
                            }
                        }
                    }
                }
            }
            return(list);
        }
예제 #26
0
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            //取得页码
            int    pageInt = 1, sqlCount = 0;
            string html            = string.Empty;
            string viewState       = string.Empty;
            string eventValidation = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
            }
            catch (Exception ex)
            {
                return(null);
            }
            Parser   parser   = new Parser(new Lexer(html));
            NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "GridView1")), true), new TagNameFilter("table")));

            if (pageList != null && pageList.Count > 0)
            {
                try
                {
                    TableTag table = pageList[0] as TableTag;
                    pageInt = table.Rows[0].ColumnCount + 1;
                }
                catch { pageInt = 1; }
            }
            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    try
                    {
                        viewState       = this.ToolWebSite.GetAspNetViewState(html);
                        eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
                        NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] {
                            "__EVENTTARGET",
                            "__EVENTARGUMENT",
                            "__VIEWSTATE",
                            "__VIEWSTATEENCRYPTED",
                            "__EVENTVALIDATION",
                            "sel",
                            "beginDate",
                            "endDate",
                            "infotitle"
                        },
                                                                                          new string[] {
                            "GridView1", "Page$" + i.ToString(), viewState, "", eventValidation, "1", "", "", ""
                        });
                        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.UTF8);
                    }
                    catch { continue; }
                }
                parser = new Parser(new Lexer(html));
                NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "GridView1")));
                if (nodeList != null && nodeList.Count > 0)
                {
                    TableTag table = nodeList[0] as TableTag;
                    for (int j = 1; j < table.RowCount - 1; j++)
                    {
                        string headName = string.Empty, releaseTime = string.Empty, infoScorce = string.Empty, msgType = string.Empty,
                               infoUrl = string.Empty, ctxHtml = string.Empty, infoCtx = string.Empty, infoType = string.Empty;

                        TableRow tr = table.Rows[j];
                        headName    = tr.Columns[1].ToNodePlainString();
                        releaseTime = tr.Columns[3].ToPlainTextString().GetDateRegex();
                        if (string.IsNullOrEmpty(releaseTime))
                        {
                            releaseTime = tr.Columns[3].ToPlainTextString().GetDateRegex("yyyy/MM/dd");
                        }
                        infoScorce = tr.Columns[2].ToNodePlainString();
                        infoType   = "通知公告";
                        infoUrl    = "http://www.szjsjy.com.cn/Notify/" + tr.Columns[1].GetATagHref();//"http://www.szjsjy.com.cn/Notify/InformContent.aspx?id=117750";//
                        string htldtl = string.Empty;
                        try
                        {
                            htldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.UTF8).GetJsString();
                        }
                        catch { continue; }
                        parser = new Parser(new Lexer(htldtl));
                        NodeList noList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("background", "../img/A-3_17.gif")));
                        if (noList != null && noList.Count > 0)
                        {
                            ctxHtml    = noList.AsHtml().Replace("<br/>", "\r\n").Replace("<BR/>", "");
                            infoCtx    = noList.AsString().Replace(" ", "").Replace("&nbsp;", "").Replace("\t\t", "\t").Replace("\t\t", "\t");
                            infoCtx    = Regex.Replace(infoCtx, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase).Replace(" ", "").Replace("\t", "").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");
                            msgType    = MsgTypeCosnt.ShenZhenMsgType;
                            infoScorce = infoScorce.Replace("&nbsp;", "");
                            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "深圳市工程", string.Empty, infoCtx, infoType);
                            if (!crawlAll && sqlCount >= this.MaxCount)
                            {
                                return(null);
                            }
                            else
                            {
                                sqlCount++;
                                if (ToolDb.SaveEntity(info, this.ExistCompareFields))
                                {
                                    parser = new Parser(new Lexer(ctxHtml));
                                    NodeFilter aLink = new TagNameFilter("a");
                                    NodeList   aList = parser.ExtractAllNodesThatMatch(aLink);
                                    if (aList != null && aList.Count > 0)
                                    {
                                        for (int k = 0; k < aList.Count; k++)
                                        {
                                            ATag a = aList[k].GetATag();
                                            if (a != null)
                                            {
                                                if (!a.LinkText.Contains("返回"))
                                                {
                                                    try
                                                    {
                                                        BaseAttach obj = ToolHtml.GetBaseAttach("http://www.szjsjy.com.cn/" + a.Link.Replace("../", ""), a.LinkText, info.Id);
                                                        if (obj != null)
                                                        {
                                                            ToolDb.SaveEntity(obj, string.Empty);
                                                        }
                                                    }
                                                    catch { }
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
            return(null);
        }
예제 #27
0
        /// <summary>
        /// Crawls the paged notice list at <c>SiteUrl</c> (the table with id <c>GridView1</c>),
        /// downloads the detail page of every listed notice, saves each notice through
        /// <c>ToolDb.SaveEntity</c> and, for newly saved notices, registers the links found
        /// in the notice body as attachments via <c>AddBaseFile</c>.
        /// </summary>
        /// <param name="crawlAll">
        /// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> notices have been passed
        /// to the database layer; when <c>true</c>, every page is processed.
        /// </param>
        /// <returns>Always <c>null</c>; results are persisted rather than returned.</returns>
        protected override IList ExecuteCrawl(bool crawlAll)
        {
            // Determine the number of pages.
            int    pageInt = 1, sqlCount = 0;
            string html            = string.Empty;
            string viewState       = string.Empty;
            string eventValidation = string.Empty;

            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
            }
            catch (Exception)
            {
                // The first list page could not be downloaded, so there is nothing to crawl.
                return null;
            }

            // The pager is the table nested inside GridView1; the anchor at index
            // (column count - 2) of its first row is read as the last page number.
            Parser   parser   = new Parser(new Lexer(html));
            NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "GridView1")), true), new TagNameFilter("table")));

            if (pageList != null && pageList.Count > 0)
            {
                try
                {
                    TableTag table    = pageList[0] as TableTag;
                    int      pageAtag = table.Rows[0].ColumnCount;
                    pageInt = int.Parse((table.Rows[0].SearchFor(typeof(ATag), true)[pageAtag - 2] as ATag).LinkText);
                }
                catch
                {
                    // Pager missing or not in the expected shape: fall back to a single page.
                    pageInt = 1;
                }
            }

            for (int i = 1; i <= pageInt; i++)
            {
                if (i > 1)
                {
                    // Pages after the first are requested by posting the ASP.NET form back with the
                    // hidden state fields taken from the most recently downloaded page.
                    viewState       = this.ToolWebSite.GetAspNetViewState(html);
                    eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
                    NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                        new string[] {
                            "__EVENTTARGET",
                            "__EVENTARGUMENT",
                            "__VIEWSTATE",
                            "__VIEWSTATEENCRYPTED",
                            "__EVENTVALIDATION",
                            "sel",
                            "beginDate",
                            "endDate",
                            "infotitle"
                        },
                        new string[] {
                            "GridView1", "Page$" + i.ToString(), viewState, "", eventValidation, "1", "", "", ""
                        });
                    try
                    {
                        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), nvc, Encoding.UTF8);
                    }
                    catch
                    {
                        // This page failed to load; skip it and try the next one.
                        continue;
                    }
                }

                parser = new Parser(new Lexer(html));
                NodeList dtList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "GridView1")));
                if (dtList == null || dtList.Count == 0)
                {
                    continue;
                }

                TableTag listTable = dtList[0] as TableTag;
                if (listTable == null)
                {
                    continue;
                }

                // The first and the last row are skipped, as in the original loop bounds
                // (presumably the header row and the pager row).
                for (int j = 1; j < listTable.RowCount - 1; j++)
                {
                    // Check the limit before doing any network work for this row, so that no
                    // detail page is downloaded only to be thrown away.
                    if (!crawlAll && sqlCount >= this.MaxCount)
                    {
                        return null;
                    }

                    TableRow tr = listTable.Rows[j];
                    if (tr.ColumnCount < 4)
                    {
                        // Columns 1..3 are read below; a shorter row cannot be a notice row.
                        continue;
                    }

                    string headName    = tr.Columns[1].ToPlainTextString().Trim();
                    string infoScorce  = tr.Columns[2].ToPlainTextString().Trim();
                    string releaseTime = tr.Columns[3].ToPlainTextString().Trim();
                    string infoType    = "通知公告";

                    // The title cell must contain the link to the detail page; skip the row otherwise
                    // instead of letting one malformed row abort the whole crawl.
                    NodeList titleLinks = tr.Columns[1].SearchFor(typeof(ATag), true);
                    if (titleLinks == null || titleLinks.Count == 0)
                    {
                        continue;
                    }
                    ATag aTag = titleLinks[0] as ATag;
                    if (aTag == null)
                    {
                        continue;
                    }
                    string infoUrl = "http://www.szjsjy.com.cn/Notify/" + aTag.Link;

                    string htmldetailtxt = string.Empty;
                    try
                    {
                        htmldetailtxt = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.UTF8);
                    }
                    catch
                    {
                        // Detail page unavailable; move on to the next notice.
                        continue;
                    }

                    // The notice body is the table identified by its background image attribute.
                    parser = new Parser(new Lexer(htmldetailtxt));
                    NodeList noList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("background", "../img/A-3_17.gif")));
                    if (noList == null || noList.Count == 0)
                    {
                        continue;
                    }

                    string ctxHtml = noList.AsHtml().Replace("<br/>", "\r\n").Replace("<BR/>", "");
                    string infoCtx = noList.AsString().Replace(" ", "").Replace("&nbsp;", "").Replace("\t\t", "\t").Replace("\t\t", "\t");
                    infoCtx    = Regex.Replace(infoCtx, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase).Replace(" ", "").Replace("\t", "");
                    string msgType = "深圳市建设工程交易中心";
                    infoScorce = infoScorce.Replace("&nbsp;", "");
                    NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "深圳市工程", string.Empty, infoCtx, infoType);

                    sqlCount++;
                    if (!ToolDb.SaveEntity(info, this.ExistCompareFields))
                    {
                        // Attachments are only collected when SaveEntity reports success.
                        continue;
                    }

                    parser = new Parser(new Lexer(ctxHtml));
                    NodeFilter aLink = new TagNameFilter("a");
                    NodeList   aList = parser.ExtractAllNodesThatMatch(aLink);
                    if (aList == null || aList.Count == 0)
                    {
                        continue;
                    }

                    // Resolve the anchor list once instead of once per loop iteration, and bound the
                    // loop by both counts so a size mismatch can never index past the end.
                    NodeList attachLinks = aList.SearchFor(typeof(ATag), true);
                    if (attachLinks == null)
                    {
                        continue;
                    }
                    for (int k = 0; k < aList.Count && k < attachLinks.Count; k++)
                    {
                        ATag a = attachLinks[k] as ATag;
                        if (a == null)
                        {
                            continue;
                        }
                        // Links whose text contains "返回" are navigation links, not attachments.
                        if (!a.LinkText.Contains("返回"))
                        {
                            AddBaseFile("http://www.szjsjy.com.cn/" + a.Link.Replace("../", ""), a.LinkText, info);
                        }
                    }
                }
            }
            return null;
        }