Exemplos de código com HasAttributeFilter, Winista.Text.HtmlParser.Filters em C# (CSharp)

Exemplo n.º 1

0

Exibir arquivo

Arquivo: HtmlHandle.cs Projeto: czl032405/MKHXHHEHEHE

        /// <summary>
        /// Extracts products from listing HTML. Every element whose class attribute is
        /// "product" yields one <see cref="Product"/> built from its title, price, image
        /// and optional promo nodes.
        /// </summary>
        /// <param name="html">Raw HTML of the listing page; null or empty yields an empty list.</param>
        /// <returns>
        /// The products that could be parsed completely. Entries lacking a title, price or
        /// image node, or whose price text cannot be parsed, are skipped.
        /// </returns>
        public static List<Product> LoadGoods(string html)
        {
            // Number of leading characters dropped from the price text before parsing.
            // NOTE(review): presumably a currency prefix; confirm against the scraped markup.
            const int pricePrefixLength = 7;

            List<Product> result = new List<Product>();
            if (string.IsNullOrEmpty(html))
            {
                return result;
            }

            Parser parser = new Parser(new Lexer(html));
            NodeList products = parser.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "product"));

            for (int i = 0; i < products.Count; i++)
            {
                try
                {
                    ITag product = products[i] as ITag;
                    if (product == null || product.Children == null)
                    {
                        continue;
                    }

                    // name
                    ITag name = FindFirstTag(product.Children, new HasAttributeFilter("class", "product-title"));
                    // price
                    ITag price = FindFirstTag(product.Children, new HasAttributeFilter("class", "product-price"));
                    // img
                    ITag img = FindFirstTag(product.Children, new TagNameFilter("img"));
                    if (name == null || price == null || img == null)
                    {
                        continue;
                    }

                    string priceText = price.ToPlainTextString().Trim();
                    decimal parsedPrice;
                    if (priceText.Length <= pricePrefixLength
                        || !Decimal.TryParse(priceText.Substring(pricePrefixLength), out parsedPrice))
                    {
                        continue;
                    }

                    // promo (optional: stays empty when the product has no promo node)
                    string promoText = "";
                    ITag promo = FindFirstTag(product.Children, new HasAttributeFilter("class", "promo"));
                    if (promo != null)
                    {
                        promoText = promo.GetAttribute("data-promo");
                    }

                    Product p = new Product();
                    p.img = img.GetAttribute("DATA-KS-LAZYLOAD");
                    p.name = name.ToPlainTextString().Trim();
                    p.price = parsedPrice;
                    p.promo = promoText;
                    result.Add(p);
                }
                catch (Exception)
                {
                    // Best effort: a product whose markup makes the parser throw is skipped
                    // so that one bad entry does not abort the whole page.
                }
            }

            return result;
        }

        // Returns the first node under children (searched recursively) that matches filter
        // as a tag, or null when nothing matches or the match is not a tag.
        private static ITag FindFirstTag(NodeList children, NodeFilter filter)
        {
            NodeList matches = children.ExtractAllNodesThatMatch(filter, true);
            return matches.Count > 0 ? matches[0] as ITag : null;
        }

Exemplo n.º 2

0

Exibir arquivo

Arquivo: GetGanJiJobs.cs Projeto: wangsying/SpiderJobs

        /// <summary>
        /// Loads the page at <paramref name="url"/> and returns every node whose class
        /// attribute is "list_title".
        /// </summary>
        /// <param name="url">Address handed to <c>ParserHelp.GetParser</c>.</param>
        /// <returns>The matching nodes as returned by the parser.</returns>
        public NodeList GetListUrl(string url)
        {
            Parser parser = ParserHelp.GetParser(url);
            NodeFilter filter = new HasAttributeFilter("class", "list_title");

            // The result is returned directly; no placeholder NodeList is allocated first.
            return parser.ExtractAllNodesThatMatch(filter);
        }

Exemplo n.º 3

0

Exibir arquivo

Arquivo: Program.cs Projeto: CaseyYang/WebProjects

 /// <summary>
 /// Reads the chapter links from the local file "catalogue.htm", downloads every chapter
 /// page from www.mlxiaoshuo.com and writes the paragraph text of each chapter to
 /// "革命逸事.txt".
 /// </summary>
 static void GetStoryOfRevolution()
 {
     string catalogueHtml;
     using (StreamReader reader = new StreamReader("catalogue.htm"))
     {
         catalogueHtml = reader.ReadToEnd();
     }

     Lexer lexer = new Lexer(catalogueHtml);
     Parser parser = new Parser(lexer);
     HasAttributeFilter linkFilterByParent = new HasAttributeFilter("class", "row zhangjieUl");
     HasAttributeFilter linkFilterByClass = new HasAttributeFilter("class", "fontStyle2 colorStyleLink");
     AndFilter linkFilter = new AndFilter(new HasParentFilter(linkFilterByParent, true), linkFilterByClass);
     NodeList linkNodeList = parser.Parse(linkFilter);

     // Download phase: every response and reader is disposed as soon as its page has been
     // read, so no connection stays open and an empty link list no longer hits a null reader.
     List<string> chapterHtmlContentList = new List<string>(linkNodeList.Size());
     for (int i = 0; i < linkNodeList.Size(); i++)
     {
         ATag linkNode = (ATag)linkNodeList[i];
         HttpWebRequest httpWebRequest = HttpWebRequest.CreateHttp("http://www.mlxiaoshuo.com" + linkNode.Link);
         using (WebResponse response = httpWebRequest.GetResponse())
         using (StreamReader chapterReader = new StreamReader(new BufferedStream(response.GetResponseStream(), 4 * 200 * 1024)))
         {
             chapterHtmlContentList.Add(chapterReader.ReadToEnd());
         }
         Console.WriteLine("第" + (i + 1) + "个页面获取完毕！");
     }

     // Extraction phase: exactly one container node is expected per chapter page.
     HasAttributeFilter praghFilter = new HasAttributeFilter("class", "textP fontStyle2 colorStyleText");
     using (StreamWriter writer = new StreamWriter("革命逸事.txt"))
     {
         for (int i = 0; i < chapterHtmlContentList.Count; i++)
         {
             writer.WriteLine("第" + (i + 1) + "章");
             lexer = new Lexer(chapterHtmlContentList[i]);
             parser = new Parser(lexer);
             NodeList praghNodeList = parser.Parse(praghFilter);
             if (praghNodeList.Size() == 1)
             {
                 NodeList children = praghNodeList[0].Children;
                 if (children != null)
                 {
                     for (int j = 0; j < children.Size(); j++)
                     {
                         // Exact type match on purpose: only direct ParagraphTag nodes are written.
                         if (children[j].GetType().Equals(typeof(ParagraphTag)))
                         {
                             ParagraphTag praghTag = (ParagraphTag)children[j];
                             writer.WriteLine("    " + praghTag.StringText);
                         }
                     }
                 }
                 writer.WriteLine();
             }
             else
             {
                 Console.WriteLine("第" + (i + 1) + "页中，判断段落的标准出错！");
             }
         }
     }
 }

Exemplo n.º 4

0

Exibir arquivo

Arquivo: PlayTimeSpider.cs Projeto: Letractively/swift-test-one

        /// <summary>
        /// Parses a saved cinema page and returns one <see cref="PlayTime"/> per node whose
        /// CLASS attribute is "px14".
        /// </summary>
        /// <param name="xmlFile">
        /// Path of the saved page. The first four-digit run in the path itself is used as
        /// the cinema id.
        /// </param>
        /// <returns>
        /// The parsed play times, or <c>null</c> when the page contains no matching node
        /// (kept for callers that test for null).
        /// </returns>
        public List<PlayTime> getPlayTimes(string xmlFile)
        {
            Match match = Regex.Match(xmlFile, @"\d\d\d\d");
            string cinemaID = match.Value; // cinema id taken from the path
            List<PlayTime> playTimes = new List<PlayTime>();
            string html = File.ReadAllText(xmlFile);

            Lexer lexer = new Lexer(html);
            Parser playParser = new Parser(lexer);
            NodeFilter playFilter = new HasAttributeFilter("CLASS", "px14");
            NodeList playTimeList = playParser.ExtractAllNodesThatMatch(playFilter);
            if (playTimeList.Count < 1)
            {
                return null;
            }

            // Loop-invariant: parsed once instead of once per play time.
            int cinemaNumber = int.Parse(cinemaID);
            char[] releaseSuffix = {'上','映'};

            for (int i = 0; i < playTimeList.Count; i++)
            {
                PlayTime playTime = new PlayTime();
                ITag playTag = (playTimeList[i] as ITag);

                // The first child may be a text node; MovieID is then left unset, as it
                // already is when the link carries no 5- or 6-digit id.
                ITag idTag = (playTag.FirstChild as ITag);
                object href = (idTag != null && idTag.Attributes != null) ? idTag.Attributes["HREF"] : null;
                if (href != null)
                {
                    string strID = href.ToString();
                    // Prefer a six-digit id anywhere in the link, then fall back to five digits.
                    Match idMatch = Regex.Match(strID, @"\d\d\d\d\d\d");
                    if (!idMatch.Success)
                    {
                        idMatch = Regex.Match(strID, @"\d\d\d\d\d");
                    }
                    if (idMatch.Success)
                    {
                        playTime.MovieID = int.Parse(idMatch.Value);
                    }
                }

                string strTime = playTag.NextSibling.NextSibling.ToPlainTextString();
                strTime = strTime.Trim(releaseSuffix);
                playTime.Playtime = DateTime.Parse(strTime);
                playTime.CinemaID = cinemaNumber;
                playTime.PlayState = true;

                playTimes.Add(playTime);
            }
            return playTimes;
        }

Exemplo n.º 5

0

Exibir arquivo

Arquivo: Get1010Jobs.cs Projeto: wangsying/SpiderJobs

        /// <summary>
        /// Downloads a job detail page and fills a <see cref="Job"/> from the node whose
        /// class attribute is "d_left": description, publish time, first e-mail address
        /// and first phone number found in the description text.
        /// </summary>
        /// <param name="url">Address of the job detail page.</param>
        /// <returns>
        /// The populated job; an otherwise empty <see cref="Job"/> when the detail node is
        /// missing. The title is left empty here.
        /// </returns>
        public Job GetJobInfoParser(string url)
        {
            Job jobinfo = new Job();

            string title = string.Empty;
            string email = string.Empty;

            Parser parser = new Parser(new HttpProtocol(new Uri(url)));

            NodeFilter detail = new HasAttributeFilter("class", "d_left");

            NodeList nodeDetail = parser.ExtractAllNodesThatMatch(detail);
            if (nodeDetail == null || nodeDetail.Count == 0)
            {
                return jobinfo;
            }

            string description = GetDetailString(nodeDetail);

            // Fall back to the current time unless a publish timestamp is present AND parses.
            // TryParse writes DateTime.MinValue into its out argument on failure, so it must
            // not target the fallback variable directly.
            DateTime dt = DateTime.Now;
            Match m = Regex.Match(description, @"发布时间：(?<date>\d\d\d\d-\d{1,2}\-\d{1,2} \d{1,2}\:\d{1,2})");
            DateTime published;
            if (m.Success && m.Groups["date"].Success && DateTime.TryParse(m.Groups["date"].Value, out published))
            {
                dt = published;
            }

            Match emailMatch = Regex.Match(description, @"([a-zA-Z0-9_\-\.]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?)");
            if (emailMatch.Success)
            {
                email = emailMatch.Value;
            }

            // 11-digit mobile number starting with 13, 15 or 18. The former pattern put '|'
            // inside the character classes, which also accepted a literal pipe character.
            Match telMatch = Regex.Match(description, @"1[358][0-9]\d{8}");
            if (telMatch.Success)
            {
                jobinfo.tel = telMatch.Value;
            }

            jobinfo.category_id = Catalog.id;
            jobinfo.title = title;
            jobinfo.description = description;
            jobinfo.created_on = dt;
            jobinfo.is_active = true;
            jobinfo.city_id = Catalog.city.id;
            jobinfo.sp1010url = url;
            jobinfo.poster_email = email;

            return jobinfo;
        }

Exemplo n.º 6

0

Exibir arquivo

Arquivo: CinemaSpider.cs Projeto: Letractively/swift-test-one

 /// <summary>
 /// Extracts five facility descriptions from a cinema page. For every node whose CLASS is
 /// "c_000", the CLASS of its previous sibling selects which output receives the plain
 /// text of the node's parent.
 /// </summary>
 /// <param name="html">HTML of the cinema page.</param>
 /// <param name="dining">Parent text where the sibling class is "ico_cside1 mr12"; empty if absent.</param>
 /// <param name="park">Parent text where the sibling class is "ico_cside2 mr12"; empty if absent.</param>
 /// <param name="gameCenter">Parent text where the sibling class is "ico_cside3 mr12"; empty if absent.</param>
 /// <param name="intro3D">Parent text where the sibling class is "ico_cside5 mr12"; empty if absent.</param>
 /// <param name="introVIP">Parent text where the sibling class is "ico_cside7 mr12"; empty if absent.</param>
 /// <returns>Always <c>true</c>.</returns>
 public bool getCinemaFive(string html, out string dining, out string park, out string gameCenter, out string intro3D, out string introVIP)
 {
     dining = string.Empty;
     park = string.Empty;
     gameCenter = string.Empty;
     intro3D = string.Empty;
     introVIP = string.Empty;
     Lexer lexer = new Lexer(html);
     Parser parser = new Parser(lexer);
     NodeFilter nodeFilter = new HasAttributeFilter("CLASS", "c_000");
     NodeList nodeList = parser.ExtractAllNodesThatMatch(nodeFilter);
     for (int i = 0; i < nodeList.Count; i++)
     {
         INode node = nodeList[i];
         ITag tagPar = (node.Parent as ITag);
         ITag tagSib = (node.PreviousSibling as ITag);

         // The previous sibling can be missing or a text node, and the parent may not be
         // a tag; such nodes carry no usable icon class and are skipped.
         if (tagPar == null || tagSib == null || tagSib.Attributes == null)
         {
             continue;
         }

         object siblingClass = tagSib.Attributes["CLASS"];
         if (siblingClass == null)
         {
             continue;
         }

         switch (siblingClass.ToString())
         {
             case "ico_cside1 mr12":
                 dining = tagPar.ToPlainTextString();
                 break;
             case "ico_cside2 mr12":
                 park = tagPar.ToPlainTextString();
                 break;
             case "ico_cside3 mr12":
                 gameCenter = tagPar.ToPlainTextString();
                 break;
             case "ico_cside5 mr12":
                 intro3D = tagPar.ToPlainTextString();
                 break;
             case "ico_cside7 mr12":
                 introVIP = tagPar.ToPlainTextString();
                 break;
         }
     }
     return true;
 }

Exemplo n.º 7

0

Exibir arquivo

Arquivo: Spider.cs Projeto: Letractively/swift-test-one

 /// <summary>
 /// Finds the single node whose <paramref name="attribute"/> equals
 /// <paramref name="attValue"/> and returns the value of another attribute on that node.
 /// </summary>
 /// <param name="html">HTML to search.</param>
 /// <param name="tag">Not used by this method.</param>
 /// <param name="attribute">Name of the attribute used to locate the node.</param>
 /// <param name="attValue">Required value of <paramref name="attribute"/>.</param>
 /// <param name="attributeV">
 /// Key of the attribute to read from the located node.
 /// NOTE(review): presumably keys are stored upper-case by the parser; confirm.
 /// </param>
 /// <returns>
 /// The attribute value, or <c>null</c> when the number of matching nodes is not exactly
 /// one, or the node does not carry <paramref name="attributeV"/>.
 /// </returns>
 public static string getAttValue(string html, string tag, string attribute, string attValue, string attributeV)
 {
     Parser parser = new Parser(new Lexer(html));
     NodeFilter nodeFilter = new HasAttributeFilter(attribute, attValue);
     NodeList nodeList = parser.ExtractAllNodesThatMatch(nodeFilter);
     if (nodeList.Count != 1)
     {
         return null;
     }

     ITag tagNode = (nodeList[0] as ITag);
     if (tagNode == null || tagNode.Attributes == null)
     {
         return null;
     }

     // A missing key yields null from the indexer; report "not found" instead of throwing.
     object found = tagNode.Attributes[attributeV];
     return found == null ? null : found.ToString();
 }

Exemplo n.º 8

0

Exibir arquivo

Arquivo: GetGanJiJobs.cs Projeto: wangsying/SpiderJobs

        /// <summary>
        /// Loads a job detail page and fills a <see cref="Job"/> from the nodes whose id is
        /// "miaoshu" or whose class is "mainBox".
        /// </summary>
        /// <param name="url">Address handed to <c>ParserHelp.GetParser</c>.</param>
        /// <returns>
        /// The populated job, or an empty <see cref="Job"/> when fewer than two matching
        /// nodes are found.
        /// </returns>
        public Job GetDetail(string url)
        {
            Job info = new Job();

            Parser parser = ParserHelp.GetParser(url);

            NodeFilter miaoShu = new HasAttributeFilter("id", "miaoshu");
            NodeFilter mainBox = new HasAttributeFilter("class", "mainBox");
            NodeFilter orfilter = new OrFilter(miaoShu, mainBox);

            // Parse result is used directly; no placeholder NodeList is allocated first.
            NodeList list = parser.Parse(orfilter);
            if (list == null || list.Count < 2)
            {
                return info;
            }

            GetMiaoShu(list, ref info);
            GetContartInfo(list, ref info);

            return info;
        }

Exemplo n.º 9

0

Exibir arquivo

Arquivo: Program.cs Projeto: CaseyYang/WebProjects

 // Analyses the HtmlContents entry at the given index and records one Commodity in
 // TradeList for every product name that has exactly one matching price node.
 private static void GetInfoFromHtml(int index)
 {
     // Attribute filters: the two arguments are the attribute name and its required value.
     HasAttributeFilter byNameClass = new HasAttributeFilter("class", "lrg");
     HasAttributeFilter byPriceClass = new HasAttributeFilter("class", "bld lrg red");

     // Parse the page with Winista.HtmlParser and collect every node that matches the name filter.
     Parser htmlParser = new Parser(new Lexer(HtmlContents[index]));
     NodeList nameNodes = htmlParser.Parse(byNameClass);

     for (int pos = 0; pos < nameNodes.Size(); pos++)
     {
         // The name node is expected to be a Span; HtmlDecode turns HTML entities back
         // into text so that Chinese characters display correctly.
         Span nameSpan = (Span)nameNodes[pos];
         string name = HttpUtility.HtmlDecode(nameSpan.StringText);

         // Walk up two parents and across two siblings to the price container, then
         // search its children recursively (second argument) with the price filter.
         // Note: whitespace-only text counts as a node for Winista.HtmlParser, which is
         // why NextSibling is applied twice.
         NodeList priceContainer = nameNodes[pos].Parent.Parent.NextSibling.NextSibling.Children;
         NodeList priceNodes = priceContainer.ExtractAllNodesThatMatch(byPriceClass, true);

         if (priceNodes.Size() != 1)
         {
             badRecordCount++;
             continue;
         }

         // Drop the two leading characters of the price text before parsing the number.
         string priceText = ((Span)priceNodes[0]).StringText;
         double price = Double.Parse(priceText.Substring(2));
         TradeList.Add(new Commodity(name, price, "RMB"));
     }

     Console.WriteLine("第" + (index + 1) + "个页面处理完成！");
 }

Exemplo n.º 10

0

Exibir arquivo

Arquivo: SpiderRrxf.cs Projeto: wangsying/SpiderJobs

        /// <summary>
        /// Collects the category links of a catalogue page: inside the first node whose
        /// class is "fenlei_list", every child that contains a matching product link
        /// contributes its first child when that child is an anchor.
        /// </summary>
        /// <param name="html">HTML of the catalogue page.</param>
        /// <returns>
        /// The anchors found; empty when the navigation node is missing or has no children.
        /// Entries whose first child is not an anchor are skipped rather than added as null.
        /// </returns>
        public List<ATag> ParseCatelog(string html)
        {
            List<ATag> atags = new List<ATag>();

            Lexer lexer = new Lexer(html);
            Parser parser = new Parser(lexer);

            NodeFilter nav = new HasAttributeFilter("class", "fenlei_list");
            NodeList navNodes = parser.Parse(nav);

            // Without the navigation block there is nothing to index into.
            if (navNodes == null || navNodes.Count == 0 || navNodes[0].Children == null)
            {
                return atags;
            }

            NodeFilter catelog = new LinkRegexFilter(@"^\.\./product/index\.php\?cplm\=\-\d\d\d\-$");
            catelog = new HasChildFilter(catelog);
            NodeList catelogNodes = navNodes[0].Children.ExtractAllNodesThatMatch(catelog);

            if (catelogNodes == null)
            {
                return atags;
            }

            int length = catelogNodes.Count;
            for (int i = 0; i < length; i++)
            {
                INode node = catelogNodes[i];
                ATag a = node.Children[0] as ATag;
                if (a != null)
                {
                    atags.Add(a);
                }
            }

            return atags;
        }

Exemplo n.º 11

0

Exibir arquivo

Arquivo: Get1010Jobs.cs Projeto: wangsying/SpiderJobs

        /// <summary>
        /// Crawls one list page ("index{idx}.html" under <c>Catalog.sp1010</c>): extracts
        /// the links whose class is "Linklist", and for each job not already stored
        /// fetches its details, prints them to the console and inserts the job.
        /// </summary>
        /// <param name="idx">Page index used to build the list page URL.</param>
        public void SpiderCurrentPage(int idx)
        {
            const int maxAttempts = 5;

            ParserConf.GetConfiguration().RootPath = AppDomain.CurrentDomain.BaseDirectory;
            string url = Catalog.sp1010 + string.Format("index{0}.html", idx);
            NodeList nodeList = null;
            bool extracted = false;

            // Node extraction is retried up to maxAttempts times; a failure to create the
            // parser aborts immediately.
            for (int attempt = 0; attempt < maxAttempts && !extracted; attempt++)
            {
                SpiderEventLog.WriteSourceLog("Spider " + url, url, EventLogEntryType.Information);

                Parser parser;
                try
                {
                    parser = new Parser(new HttpProtocol(new Uri(url)));
                }
                catch (Exception ex)
                {
                    SpiderEventLog.WriteWarningLog("获取列表页面数据错误:" + url + "\r\n" + ex.ToString());
                    return;
                }

                try
                {
                    NodeFilter filter = new HasAttributeFilter("class", "Linklist");
                    nodeList = parser.ExtractAllNodesThatMatch(filter);
                    extracted = true;
                }
                catch (Exception ex)
                {
                    SpiderEventLog.WriteWarningLog("获取列表页面数据错误:" + url + "\r\n" + ex.ToString());
                }
            }

            if (nodeList == null)
            {
                return;
            }

            string separator = new string('=', 120);
            int length = nodeList.Count;
            for (int i = 0; i < length; i++)
            {
                // The class filter can match tags other than anchors; only links are usable.
                ATag node = nodeList[i] as ATag;
                if (node == null)
                {
                    continue;
                }

                if (IsExistJob(node.Link))
                {
                    SpiderEventLog.WriteLog(string.Format("职务 [{0}] 已存在",node.LinkText));
                    continue;
                }

                Job jobinfo = GetJobInfoParser(node.Link);
                // Strip HTML entities such as "&nbsp;" from the link text.
                jobinfo.title = Regex.Replace(node.LinkText,"&[^&;]{0,};", "",RegexOptions.IgnoreCase);

                ConsoleColor color = Console.ForegroundColor;
                Console.ForegroundColor = ConsoleColor.Red;
                try
                {
                    Console.WriteLine(separator);
                    Console.WriteLine("title:{0}", jobinfo.title);
                    Console.WriteLine("url:{0}", jobinfo.sp1010url);
                    Console.WriteLine("tel:{0}", jobinfo.tel);
                    Console.WriteLine("email:{0}", jobinfo.poster_email);
                    Console.WriteLine("desc:{0}", jobinfo.description);
                    Console.WriteLine(separator);
                }
                finally
                {
                    // Restore the caller's colour even if writing to the console throws.
                    Console.ForegroundColor = color;
                }

                InsertJobInfo(jobinfo);
            }
        }

Exemplo n.º 12

0

Exibir arquivo

Arquivo: FansAndFollowCrawler.cs Projeto: CaseyYang/WebProjects

 /// <summary>
 /// Builds the HTML node filters used by this crawler and stores them in the
 /// corresponding static fields.
 /// </summary>
 private static void MakeFilters()
 {
     // Fan list container: node-type "userListBox" AND class "cnfList".
     AndFilter listContainer = new AndFilter(
         new HasAttributeFilter("node-type", "userListBox"),
         new HasAttributeFilter("class", "cnfList"));

     // One fan entry: direct child of the list container with class "clearfix S_line1".
     fanFilter = new AndFilter(
         new HasParentFilter(listContainer, false),
         new HasAttributeFilter("class", "clearfix S_line1"));

     // Portrait: direct child of a class "left" node with class "face mbspace".
     HasAttributeFilter leftColumn = new HasAttributeFilter("class", "left");
     portraitFilter = new AndFilter(
         new HasParentFilter(leftColumn, false),
         new HasAttributeFilter("class", "face mbspace"));

     // The remaining filters all select direct children of the class "con_left" node.
     HasAttributeFilter contentColumn = new HasAttributeFilter("class", "con_left");
     fanNameFilter = new AndFilter(
         new HasParentFilter(contentColumn, false),
         new HasAttributeFilter("class", "name"));
     fanConnectFilter = new AndFilter(
         new HasParentFilter(contentColumn, false),
         new HasAttributeFilter("class", "connect"));
     fanInfoFilter = new AndFilter(
         new HasParentFilter(contentColumn, false),
         new HasAttributeFilter("class", "info"));
     followMethodFilter = new AndFilter(
         new HasParentFilter(contentColumn, false),
         new HasAttributeFilter("class", "from W_textb"));
 }

Exemplo n.º 13

0

Exibir arquivo

Arquivo: DianPinCrawler.cs Projeto: CaseyYang/WebProjects

 /// <summary>
 /// Builds the HTML node filters used by this crawler and stores them in the
 /// corresponding static fields.
 /// </summary>
 private static void MakeFilters()
 {
     // POI list: DefinitionList nodes that are direct children of the node with id "searchList".
     HasAttributeFilter searchListById = new HasAttributeFilter("id", "searchList");
     NodeClassFilter definitionLists = new NodeClassFilter(typeof(DefinitionList));
     poiListFilter = new AndFilter(new HasParentFilter(searchListById, false), definitionLists);

     // A single POI is a DefinitionListBullet node.
     poiFilter = new NodeClassFilter(typeof(DefinitionListBullet));

     // Plain class-based filters.
     tasteFilter = new HasAttributeFilter("class", "score1");
     environmentFilter = new HasAttributeFilter("class", "score2");
     serviceFilter = new HasAttributeFilter("class", "score3");
     averageFilter = new HasAttributeFilter("class", "average");

     // Comment link: class "B" AND module "list-readreview".
     HasAttributeFilter commentByClass = new HasAttributeFilter("class", "B");
     HasAttributeFilter commentByModule = new HasAttributeFilter("module", "list-readreview");
     commentFilter = new AndFilter(commentByClass, commentByModule);

     // Shop name: class "BL" node that is a direct child of the class "shopname" node.
     HasAttributeFilter shopNameParent = new HasAttributeFilter("class", "shopname");
     nameFilter = new AndFilter(
         new HasParentFilter(shopNameParent, false),
         new HasAttributeFilter("class", "BL"));

     addressFilter = new HasAttributeFilter("class", "address");
     tagsFilter = new HasAttributeFilter("class", "tags");
 }

Exemplo n.º 14

0

Exibir arquivo

Arquivo: SpiderRrxf.cs Projeto: wangsying/SpiderJobs

        /// <summary>
        /// Finds the first image whose class is "Picture220" among <paramref name="nodes"/>
        /// (searched recursively), rewrites its relative URL to an absolute rrxf.cn URL,
        /// prints it and downloads the picture.
        /// </summary>
        /// <param name="nodes">Nodes of the product page to search.</param>
        private void ParseProductShowPhoto(NodeList nodes)
        {
            NodeFilter show = new HasAttributeFilter("class", "Picture220");
            NodeList showNodes = nodes.ExtractAllNodesThatMatch(show, true);

            // A product page may have no show photo, or the class may sit on a non-image
            // tag; there is nothing to download in either case.
            ImageTag showTag = showNodes.Count > 0 ? showNodes[0] as ImageTag : null;
            if (showTag == null)
            {
                return;
            }

            showTag.ImageURL = showTag.ImageURL.Replace("../../", "http://rrxf.cn/");

            Console.WriteLine(showTag.ImageURL);
            DownloadPicture(showTag.ImageURL);
        }

Exemplo n.º 15

0

Exibir arquivo

Arquivo: SpiderRrxf.cs Projeto: wangsying/SpiderJobs

 /// <summary>
 /// Downloads every picture among <paramref name="nodes"/> (searched recursively) whose
 /// class attribute is "Picture40".
 /// </summary>
 private void ParseProductDemoPhoto(NodeList nodes)
 {
     DownloadPictures(nodes.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "Picture40"), true));
 }

Exemplo n.º 16

0

Exibir arquivo

Arquivo: SpiderRrxf.cs Projeto: wangsying/SpiderJobs

        /// <summary>
        /// Downloads the images found inside the nodes whose class is "miao", then prints
        /// the HTML of those nodes with picture URLs and path separators rewritten.
        /// </summary>
        /// <param name="nodes">Nodes of the product page to search.</param>
        private void ParsePorductDescribe(NodeList nodes)
        {
            NodeList descriptionArea = nodes.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "miao"), true);

            // Fetch every image inside the description before its HTML is rendered.
            NodeList imageNodes = descriptionArea.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ImageTag)), true);
            DownloadPictures(imageNodes);

            // 1) point rrxf.cn URLs at pictureURL, 2) turn "pic/" and "bigpic/" into
            // "pic_" and "bigpic_", 3) replace every '-' with '_'.
            string rewritten = Regex.Replace(descriptionArea.AsHtml(), @"http\://(www\.|)rrxf\.cn/", pictureURL + "/", RegexOptions.IgnoreCase);
            rewritten = Regex.Replace(rewritten, @"(pic|bigpic)/", "$1_", RegexOptions.IgnoreCase);

            Console.WriteLine(rewritten.Replace("-", "_"));
        }

Exemplo n.º 17

0

Exibir arquivo

Arquivo: SpiderTaobao.cs Projeto: wangsying/SpiderJobs

        /// <summary>
        /// Parses <paramref name="html"/> and returns the nodes that are either
        /// (id "detail" AND class "box") or (id "J_DivItemDesc" AND class "content").
        /// </summary>
        /// <param name="html">Input handed to <c>GetParser</c>.</param>
        /// <returns>The nodes matching either combination.</returns>
        public NodeList GetDetailPageForHtml(string html)
        {
            Parser detailParser = GetParser(html);

            // Item overview box.
            AndFilter overviewBox = new AndFilter(
                new HasAttributeFilter("id", "detail"),
                new HasAttributeFilter("class", "box"));

            // Item description content.
            AndFilter descriptionContent = new AndFilter(
                new HasAttributeFilter("id", "J_DivItemDesc"),
                new HasAttributeFilter("class", "content"));

            return detailParser.Parse(new OrFilter(overviewBox, descriptionContent));
        }

Exemplo n.º 18

0

Exibir arquivo

Arquivo: PaperRefGenerator.cs Projeto: Wali8822/ReferenceMaker

        /// <summary>
        /// Looks up a paper by name and derives its id from the link of the first matching
        /// result: the last path segment of the href, cut at its first '.'.
        /// </summary>
        /// <param name="paper_name">Name of the paper to look up.</param>
        /// <returns>The id, or <c>null</c> when the page, a matching link or its href is missing.</returns>
        protected string getPaperID(string paper_name)
        {
            string pageHtml = _HttpUtil.getPaperIDHTML(paper_name);
            if (string.IsNullOrEmpty(pageHtml))
            {
                return null;
            }

            Parser pageParser = new Parser(new Lexer(pageHtml));

            // <A target="_blank"> elements that have a child accepted by PaperFilter.
            AndFilter blankTargetAnchor = new AndFilter(
                new TagNameFilter("A"),
                new HasAttributeFilter("target", "_blank"));
            AndFilter paperAnchor = new AndFilter(
                blankTargetAnchor,
                new HasChildFilter(new PaperFilter(paper_name)));

            NodeList matches = pageParser.ExtractAllNodesThatMatch(paperAnchor);
            if (matches == null || matches.Count <= 0)
            {
                // Paper not found
                return null;
            }

            // TODO: several papers may match; only the first is used.
            ITag anchor = matches[0] as ITag;
            if (anchor == null)
            {
                return null;
            }

            string href = anchor.GetAttribute("href");
            if (string.IsNullOrEmpty(href))
            {
                return null;
            }

            string[] segments = href.Split('/');
            string lastSegment = segments[segments.Length - 1];
            return lastSegment.Split('.')[0];
        }

Exemplo n.º 19

0

Exibir arquivo

Arquivo: Spider.cs Projeto: Letractively/swift-test-one

 /// <summary>
 /// Returns the plain text of every tag whose <paramref name="attribute"/> equals
 /// <paramref name="attValue"/>.
 /// </summary>
 /// <param name="html">HTML to search.</param>
 /// <param name="tag">Not used by this method.</param>
 /// <param name="attribute">Name of the attribute to match.</param>
 /// <param name="attValue">Required value of <paramref name="attribute"/>.</param>
 /// <returns>The plain text of each matching tag, in document order; empty when nothing matches.</returns>
 public static List<string> getValues(string html, string tag, string attribute, string attValue)
 {
     Parser parser = new Parser(new Lexer(html));
     List<string> values = new List<string>();
     NodeFilter nodeFilter = new HasAttributeFilter(attribute, attValue);
     NodeList nodeList = parser.ExtractAllNodesThatMatch(nodeFilter);
     for (int i = 0; i < nodeList.Count; i++)
     {
         // Skip anything that is not a tag or carries no attributes instead of
         // dereferencing a failed cast.
         ITag tagNode = (nodeList[i] as ITag);
         if (tagNode == null || tagNode.Attributes == null || tagNode.Attributes.Count == 0)
         {
             continue;
         }

         values.Add(tagNode.ToPlainTextString());
     }
     return values;
 }

Exemplo n.º 20

0

Exibir arquivo

Arquivo: SpiderTaobao.cs Projeto: wangsying/SpiderJobs

        /// <summary>
        /// Fetches the page at <paramref name="url"/> and appends to <c>ItemLink</c> every
        /// Taobao item link that also carries the class "EventCanSelect".
        /// </summary>
        /// <param name="url">Address passed to <c>GetHtml</c>.</param>
        public void GetLinkForPage(string url)
        {
            Parser pageParser = new Parser(new Lexer(GetHtml(url)));
            pageParser.Encoding = "gb2312";

            // Both conditions must hold: the href matches the item URL pattern and the
            // class attribute is "EventCanSelect".
            AndFilter selectableItemLink = new AndFilter(
                new LinkRegexFilter(@"^http\://item\.taobao\.com/item\.htm\?id\=\d+$"),
                new HasAttributeFilter("class", "EventCanSelect"));
            NodeList matches = pageParser.Parse(selectableItemLink);

            for (int index = 0, total = matches.Count; index < total; index++)
            {
                ItemLink.Add(matches[index]);
            }
        }

Exemplo n.º 21

0

Exibir arquivo

Arquivo: WebCrawler.cs Projeto: CaseyYang/WebProjects

        /// <summary>
        /// Helper: extracts the user's information from HTML and stores it on <c>user</c>
        /// (NickName, RemarkName, LinkURL, SelfIntroduction, Tags and Profile). The same
        /// parser is reused for every lookup and reset between extractions; a diagnostic
        /// line is written to the console whenever an expected node is not found.
        /// </summary>
        /// <param name="currentUserHtml">HTML text containing the Weibo user's information</param>
        private void GetUserInfoFromHtml(string currentUserHtml)
        {
            // Set up the filters used below
            HasAttributeFilter nickNameFilter = new HasAttributeFilter("class", "name");
            HasAttributeFilter remarkNameFilter = new HasAttributeFilter("class", "CH");
            HasAttributeFilter linkUrlFilter = new HasAttributeFilter("class", "pf_lin S_link1");
            HasAttributeFilter selfIntroFilter = new HasAttributeFilter("class", "pf_intro bsp");
            HasAttributeFilter tagsFilter = new HasAttributeFilter("class", "S_func1");
            HasAttributeFilter profileFilter = new HasAttributeFilter("class", "tags");

            Lexer lexer = new Lexer(currentUserHtml);
            Parser parser = new Parser(lexer);

            // Get the Weibo nickname
            NodeList nickNameNodeList = parser.ExtractAllNodesThatMatch(nickNameFilter);

            if (nickNameNodeList.Size() == 1)
            {
                user.NickName = ((Span)nickNameNodeList[0]).ToPlainTextString();
            }
            else
            {
                Console.WriteLine("判断微博名的标准出错！");
            }
            // Note: to reuse the parser, Reset must be called after one use has finished and
            // before the next one starts; otherwise errors occur
            parser.Reset();
            // Get the remark name
            NodeList remarkNameNodeList = parser.ExtractAllNodesThatMatch(remarkNameFilter);

            if (remarkNameNodeList.Size() == 1 && remarkNameNodeList[0].GetType().Equals(typeof(Span)))
            {
                string str = ((Span)remarkNameNodeList[0]).ToPlainTextString();
                // Strip the leading and trailing brackets
                user.RemarkName = str.Substring(1, str.Length - 2);
            }
            else
            {
                Console.WriteLine("判断微博备注名称的标准出错！");
            }
            parser.Reset();
            // Get the Weibo link URL
            NodeList linkUrlNodeList = parser.ExtractAllNodesThatMatch(linkUrlFilter);
            if (linkUrlNodeList.Size() == 1 && linkUrlNodeList[0].GetType().Equals(typeof(ATag)))
            {
                user.LinkURL = ((ATag)linkUrlNodeList[0]).StringText;
            }
            else
            {
                Console.WriteLine("判断微博链接地址的标准出错！");
            }
            parser.Reset();
            // Get the self-introduction (read from the TITLE attribute of the second child)
            NodeList selfIntroNodeList = parser.ExtractAllNodesThatMatch(selfIntroFilter);
            if (selfIntroNodeList.Size() == 1 && selfIntroNodeList[0].Children[1].GetType().Equals(typeof(Span)))
            {
                user.SelfIntroduction = ((Span)selfIntroNodeList[0].Children[1]).GetAttribute("TITLE");
            }
            else
            {
                Console.WriteLine("判断自我描述的标准出错！");
            }
            parser.Reset();
            // Get the tags: the text of every matching Span, each followed by a space
            NodeList tagsNodeList = parser.ExtractAllNodesThatMatch(tagsFilter);
            string str2 = "";
            for (int i = 0; i < tagsNodeList.Size(); i++)
            {
                if (tagsNodeList[i].GetType().Equals(typeof(Span)))
                {
                    str2 += ((Span)tagsNodeList[i]).ToPlainTextString() + " ";
                }
            }
            user.Tags = str2;
            parser.Reset();
            // Get the profile attributes
            NodeList profileNodeList = parser.ExtractAllNodesThatMatch(profileFilter);
            if (profileNodeList.Size() == 1)
            {
                // Analysis showed that all useful information sits inside <a> tags, so collect
                // by <a> tag, then decide whether to take its text or the title of an <em> in it
                NodeClassFilter aTagFilter = new NodeClassFilter(typeof(ATag));
                NodeList profileList = profileNodeList[0].Children.ExtractAllNodesThatMatch(aTagFilter, true);
                for (int j = 0; j < profileList.Size(); j++)
                {
                    ATag aTag = (ATag)profileList[j];
                    if (aTag.Attributes.Contains("TITLE"))
                    {
                        user.Profile += aTag.GetAttribute("TITLE") + " ";
                    }
                    else
                    {
                        // A node with node-type="infoSlide" means all attributes have been visited
                        if (aTag.Attributes.Contains("NODE-TYPE") && aTag.GetAttribute("NODE-TYPE").Equals("infoSlide"))
                        {
                            break;
                        }
                        else
                        {
                            // Case where the <a> contains an <em> child node
                            if (aTag.Children[0].GetType().Equals(typeof(TagNode)))
                            {
                                TagNode tagNode = (TagNode)aTag.Children[0];
                                user.Profile += tagNode.GetAttribute("TITLE") + " ";
                            }
                            else
                            {
                                // Otherwise take the text contained in the <a> tag directly
                                user.Profile += aTag.StringText + " ";
                            }
                        }
                    }
                }
            }
            else
            {
                Console.WriteLine("判断用户属性信息的标准出错！");
            }
        }

Exemplo n.º 22

0

Exibir arquivo

Arquivo: WebCrawler.cs Projeto: CaseyYang/WebProjects

 /// <summary>
 /// Builds the HTML node filters used by the crawler and stores them in the class's filter fields.
 /// </summary>
 private static void MakeFilters()
 {
     // When crawling a personal home page, these filters pick out the divs that carry the
     // mid attribute (mid is related to maid and endid).
     HasAttributeFilter feedTypeDiv = new HasAttributeFilter("class", "WB_feed_type SW_fun  ");
     // The div of each individual post.
     feedFilter = new HasAttributeFilter("class", "WB_feed_datail S_line2 clearfix");
     idFilter = new List<HasAttributeFilter> { feedTypeDiv, feedFilter };

     // Parent condition reused by several of the filters below.
     HasAttributeFilter detailDiv = new HasAttributeFilter("class", "WB_detail");

     // Div holding the post's author. Reposted-post divs also carry class="WB_info",
     // so combining two filters is more reliable.
     HasAttributeFilter authorInfo = new HasAttributeFilter("class", "WB_info");
     HasParentFilter authorParent = new HasParentFilter(detailDiv, false);
     feedAuthorFilter = new AndFilter(authorInfo, authorParent);

     // Div holding the post's content. Reposted-post divs also carry class="WB_text",
     // so combining two filters is more reliable.
     HasAttributeFilter contentText = new HasAttributeFilter("class", "WB_text");
     HasAttributeFilter contentNodeType = new HasAttributeFilter("node-type", "feed_list_content");
     feedContentFilter = new AndFilter(contentText, contentNodeType);

     // Div holding a reposted post.
     reFeedFilter = new HasAttributeFilter("node-type", "feed_list_forwardContent");

     // Div holding the original author of a reposted post; two filters for the same reason as above.
     HasAttributeFilter reAuthorInfo = new HasAttributeFilter("class", "WB_info");
     HasParentFilter reAuthorParent = new HasParentFilter(reFeedFilter, true);
     reFeedAuthorFilter = new AndFilter(reAuthorInfo, reAuthorParent);

     // Content of a reposted post; two filters for the same reason as above.
     HasAttributeFilter reContentText = new HasAttributeFilter("class", "WB_text");
     HasAttributeFilter reContentNodeType = new HasAttributeFilter("node-type", "feed_list_reason");
     reFeedContentFilter = new AndFilter(reContentText, reContentNodeType);

     // Reposted post that has been deleted (case where the div sits under the div matched by reFeedFilter).
     refeedDeletedFilter1 = new HasAttributeFilter("class", "WB_deltxt");
     // Reposted post that has been deleted (case where the div sits under <div class="WB_datail">).
     HasParentFilter deletedParent = new HasParentFilter(detailDiv, true);
     refeedDeletedFilter2 = new AndFilter(deletedParent, refeedDeletedFilter1);

     // <b> tag holding the repost count of the original post.
     HasAttributeFilter countClass = new HasAttributeFilter("class", "S_spetxt");
     HasAttributeFilter countNodeType = new HasAttributeFilter("node-type", "followNum");
     similarFeedCountFilter = new AndFilter(countClass, countNodeType);

     // Tag holding similar reposts of the original post.
     HasAttributeFilter noLineFeedDiv = new HasAttributeFilter("class", "WB_feed_datail S_line2 clearfix WB_feed_noLine");
     HasParentFilter similarParent = new HasParentFilter(noLineFeedDiv, false);
     similarFeedFilter = new AndFilter(detailDiv, similarParent);

     // Div holding the location the post was sent from.
     HasAttributeFilter mapData = new HasAttributeFilter("class", "map_data");
     HasParentFilter mapParent = new HasParentFilter(detailDiv, false);
     feedLocationFilter = new AndFilter(mapData, mapParent);

     // Div holding the post's send time, send method, repost count and comment count.
     HasAttributeFilter mediaExpand = new HasAttributeFilter("class", "WB_media_expand SW_fun2 S_line1 S_bg1");
     NotFilter outsideMediaExpand = new NotFilter(new HasParentFilter(mediaExpand, true));
     HasAttributeFilter funcClass = new HasAttributeFilter("class", "WB_func clearfix");
     AndFilter metaDataDiv = new AndFilter(outsideMediaExpand, funcClass);

     // Div holding the repost count and comment count.
     HasParentFilter handleParent = new HasParentFilter(metaDataDiv, false);
     HasAttributeFilter handleClass = new HasAttributeFilter("class", "WB_handle");
     AndFilter handleDiv = new AndFilter(handleParent, handleClass);

     // Div holding the send time and send method.
     HasParentFilter fromParent = new HasParentFilter(metaDataDiv, false);
     HasAttributeFilter fromClass = new HasAttributeFilter("class", "WB_from");
     feedFromFilter = new AndFilter(fromParent, fromClass);

     // Link tag holding the "like" count.
     HasParentFilter likeParent = new HasParentFilter(handleDiv, false);
     HasAttributeFilter likeAction = new HasAttributeFilter("action-type", "fl_like");
     feedLikeFilter = new AndFilter(likeParent, likeAction);

     // Link tag holding the repost count.
     HasParentFilter forwardParent = new HasParentFilter(handleDiv, false);
     HasAttributeFilter forwardAction = new HasAttributeFilter("action-type", "fl_forward");
     feedForwardFilter = new AndFilter(forwardParent, forwardAction);

     // Link tag holding the comment count.
     HasParentFilter commentParent = new HasParentFilter(handleDiv, false);
     HasAttributeFilter commentAction = new HasAttributeFilter("action-type", "fl_comment");
     feedCommentFilter = new AndFilter(commentParent, commentAction);

     // Link tag holding the post's send time.
     HasParentFilter timeParent = new HasParentFilter(feedFromFilter, false);
     HasAttributeFilter timeClass = new HasAttributeFilter("class", "S_link2 WB_time");
     feedTimeFilter = new AndFilter(timeParent, timeClass);

     // Link tag holding the post's send method.
     HasParentFilter sendTypeParent = new HasParentFilter(feedFromFilter, false);
     HasAttributeFilter sendTypeClass = new HasAttributeFilter("class", "S_link2");
     feedSendTypeFilter = new AndFilter(sendTypeParent, sendTypeClass);
 }

Exemplo n.º 23

0

Exibir arquivo

Arquivo: CinemaSpider.cs Projeto: Letractively/swift-test-one

        /// <summary>
        /// Extracts a cinema's grade from the HTML of its page.
        /// </summary>
        /// <param name="html">HTML text to parse.</param>
        /// <returns>
        /// The grade formed by concatenating the plain text of the first and last child tags of the
        /// single node whose CLASS attribute is "point ml12 px18". Returns 7.0 when that node is not
        /// found exactly once, when either child is not a tag, or when the text is not a number.
        /// </returns>
        public float getCinemaGrade(string html)
        {
            // Fallback the method has always used when the grade node cannot be located.
            const float defaultGrade = 7.0f;

            Lexer lexer = new Lexer(html);
            Parser parser = new Parser(lexer);
            NodeFilter nodeFilter = new HasAttributeFilter("CLASS", "point ml12 px18");
            NodeList nodeList = parser.ExtractAllNodesThatMatch(nodeFilter);
            if (nodeList == null || nodeList.Count != 1)
            {
                return defaultGrade;
            }

            INode node = nodeList[0];
            ITag tagLeft = node.FirstChild as ITag;
            ITag tagRight = node.LastChild as ITag;
            if (tagLeft == null || tagRight == null)
            {
                // The 'as' casts yield null for text/remark children; treat that like "not found".
                return defaultGrade;
            }

            string strGrade = tagLeft.ToPlainTextString() + tagRight.ToPlainTextString();

            // Scraped numbers are machine data: parse with the invariant culture so a '.' decimal
            // separator is read the same way regardless of the host's regional settings.
            float grade;
            bool parsed = float.TryParse(
                strGrade,
                System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                System.Globalization.CultureInfo.InvariantCulture,
                out grade);
            return parsed ? grade : defaultGrade;
        }

Exemplo n.º 24

0

Exibir arquivo

Arquivo: SpiderRrxf.cs Projeto: wangsying/SpiderJobs

        /// <summary>
        /// Downloads the product page a link points to and parses its title, photos, description
        /// and colour attributes.
        /// </summary>
        /// <param name="a">Anchor tag whose Link is the product page URL.</param>
        public void ParseProduct(ATag a)
        {
            string html = GetHtml(a.Link);

            Lexer lexer = new Lexer(html);
            Parser parser = new Parser(lexer);

            NodeFilter productArea = new HasAttributeFilter("id", "productyou");
            NodeList nodes = parser.ExtractAllNodesThatMatch(productArea);

            ParseProductTitle(nodes);
            ParseProductShowPhoto(nodes);
            ParseProductDemoPhoto(nodes);
            ParsePorductDescribe(nodes);

            // NOTE(review): an extraction of class="chans" nodes used to sit here, but its result
            // was never read, so it has been dropped. Confirm nothing relied on it.
            NodeFilter productAttributes = new HasAttributeFilter("class", "cph");
            NodeList productAttributeNodes = nodes.ExtractAllNodesThatMatch(productAttributes, true);

            int length = productAttributeNodes.Count;
            for (int i = 0; i < length; i++)
            {
                // An attribute node without children has no label to inspect; skip the lookup
                // instead of indexing into a missing child list.
                NodeList children = productAttributeNodes[i].Children;
                if (children != null && children.Count > 0)
                {
                    INode n = children[0];
                    string t = n.ToPlainTextString();
                    // The label must start (after optional whitespace) with the colour marker.
                    if (Regex.IsMatch(t, @"^\s{0,}颜色", RegexOptions.IgnoreCase))
                    {
                        ParseProductColors(n);
                    }
                }
                Console.WriteLine();
            }
        }

Exemplo n.º 25

0

Exibir arquivo

Arquivo: SpiderRrxf.cs Projeto: wangsying/SpiderJobs

        /// <summary>
        /// Downloads a product listing page and runs ParseProduct on every product link found in it.
        /// </summary>
        /// <param name="a">Anchor tag whose Link is the listing URL; a "../" in it is replaced by the site root.</param>
        public void ParseProducts(ATag a)
        {
            string listingUrl = a.Link.Replace("../", "http://rrxf.cn/");
            string html = GetHtml(listingUrl);

            Parser parser = new Parser(new Lexer(html));

            NodeFilter nav = new HasAttributeFilter("class", "photoyi");
            NodeList navNodes = parser.Parse(nav);

            if (navNodes != null)
            {
                // The node count is captured once, before any product is processed.
                int total = navNodes.Count;
                int index = 0;
                while (index < total)
                {
                    string entryHtml = navNodes[index].ToHtml();
                    ATag productLink = ParseProductUrl(entryHtml);
                    Console.WriteLine(productLink.Link);
                    ParseProduct(productLink);
                    index++;
                }
            }
        }

Exemplo n.º 26

0

Exibir arquivo

Arquivo: SpiderRrxf.cs Projeto: wangsying/SpiderJobs

        /// <summary>
        /// Writes the plain text of the first class="prouductx" node found under the given nodes
        /// to the console. Writes nothing when no such node exists.
        /// </summary>
        /// <param name="nodes">Nodes of the product area to search (searched recursively).</param>
        private static void ParseProductTitle(NodeList nodes)
        {
            if (nodes == null)
            {
                return;
            }

            NodeFilter title = new HasAttributeFilter("class", "prouductx");
            NodeList titleNodes = nodes.ExtractAllNodesThatMatch(title, true);

            // A page without a title node previously made the unconditional [0] access fail.
            if (titleNodes == null || titleNodes.Count == 0)
            {
                return;
            }

            INode first = titleNodes[0];
            if (first != null)
            {
                Console.WriteLine(first.ToPlainTextString());
            }
        }

Exemplo n.º 27

0

Exibir arquivo

Arquivo: PaperRefGenerator.cs Projeto: Wali8822/ReferenceMaker

        /// <summary>
        /// Fetches the reference page for the given paper IDs and extracts one reference string per
        /// child tag of the div with id="export_container".
        /// </summary>
        /// <param name="paper_id">Paper IDs, passed through to the HTTP helper.</param>
        /// <returns>
        /// The extracted reference strings, or null when the page is empty or contains no
        /// export container. An export container without children yields an empty list.
        /// </returns>
        protected ArrayList getPaperReferenceByID(ArrayList paper_id)
        {
            string html_page = _HttpUtil.getPaperReferenceHTML(paper_id);

            if (string.IsNullOrEmpty(html_page))
            {
                return null;
            }

            Parser p = new Parser(new Lexer(html_page));

            TagNameFilter tag_f = new TagNameFilter("div");
            HasAttributeFilter attr_f = new HasAttributeFilter("id", "export_container");

            AndFilter af = new AndFilter(tag_f, attr_f);

            NodeList childs = p.ExtractAllNodesThatMatch(af);

            if (childs == null || childs.Count <= 0)
            {
                return null;
            }

            INode node = childs[0];

            NodeList ref_childs = node.Children;
            ArrayList ref_list = new ArrayList();

            // Guard against a container whose child list is missing; it simply has no references.
            int child_count = ref_childs == null ? 0 : ref_childs.Count;

            for (int i = 0; i < child_count; ++i)
            {
                // Only tag children carry a reference; other nodes (e.g. text between tags) are skipped.
                ITag tag = ref_childs[i] as ITag;
                if (tag == null)
                {
                    continue;
                }

                string str = tag.ToPlainTextString();

                // Flatten line breaks so each reference is a single line.
                str = str.Replace('\r', ' ').Replace('\n', ' ');

                // Drop everything up to and including the first ']'. When there is no ']',
                // IndexOf returns -1 and the whole string is kept.
                str = str.Substring(str.IndexOf(']') + 1);

                ref_list.Add(str);
            }

            if (_Progressable != null)
            {
                _Progressable.onFinish(ref_list);
            }

            return ref_list;
        }

Exemplo n.º 28

0

Exibir arquivo

Arquivo: PlaySpider.cs Projeto: Letractively/swift-test-one

 /// <summary>
 /// Reads a saved showtime page from disk and builds one Model.Play per showtime entry in it.
 /// </summary>
 /// <param name="xmlFile">
 /// Path of the saved page. The first run of four digits in the path is used as the cinema ID.
 /// </param>
 /// <returns>
 /// The plays that could be extracted, or null when the page contains no node with
 /// METHOD="mdShowtime". Movies without a recognisable ID and showtime entries with missing or
 /// malformed attributes are skipped.
 /// </returns>
 public List<Model.Play> getPlays(string xmlFile)
 {
     Match strCinema = Regex.Match(xmlFile, @"\d\d\d\d");
     string cinemaID = strCinema.Value;
     string html = File.ReadAllText(xmlFile);
     List<Model.Play> plays = new List<Model.Play>();
     Lexer lexer = new Lexer(html);
     Parser parser = new Parser(lexer);
     // Locate the nodes that make up the movie list.
     string listAttribute = "METHOD";
     string listAttValue = "mdShowtime";
     NodeFilter movieListFilter = new HasAttributeFilter(listAttribute, listAttValue);
     NodeList movieNodeList = parser.ExtractAllNodesThatMatch(movieListFilter);
     if (movieNodeList.Count < 1)
     {
         return null;
     }

     for (int i = 0; i < movieNodeList.Count; i++)
     {
         // Each movie's HTML is re-parsed on its own so the filters below see only that movie.
         INode node = movieNodeList[i];
         string movieHtml = node.ToHtml();
         Lexer movieLexer = new Lexer(movieHtml);
         Parser movieParser = new Parser(movieLexer);

         // Movie ID: taken from the HREF of the first CLASS="c_000" node.
         NodeFilter idFilter = new HasAttributeFilter("CLASS", "c_000");
         NodeList idNodes = movieParser.ExtractAllNodesThatMatch(idFilter);
         string strID = string.Empty;
         if (idNodes.Count >= 1)
         {
             ITag idTag = idNodes[0] as ITag;
             object href = (idTag != null && idTag.Attributes != null) ? idTag.Attributes["HREF"] : null;
             if (href != null)
             {
                 string str = href.ToString();
                 // Prefer a six-digit run anywhere in the link; fall back to a five-digit run.
                 Match match = Regex.Match(str, @"\d\d\d\d\d\d");
                 if (!match.Success)
                 {
                     match = Regex.Match(str, @"\d\d\d\d\d");
                 }
                 if (match.Success)
                 {
                     strID = match.Value;
                 }
             }
         }

         // Without a numeric movie ID none of this movie's showtimes can be stored.
         int movieId;
         if (!int.TryParse(strID, out movieId))
         {
             continue;
         }

         // Showtime entries of this movie.
         Lexer lexer2 = new Lexer(movieHtml);
         Parser movieParser2 = new Parser(lexer2);
         NodeFilter playFilter = new HasAttributeFilter("_TYPE", "expiry");
         NodeList playNodes = movieParser2.ExtractAllNodesThatMatch(playFilter);
         for (int j = 0; j < playNodes.Count; j++)
         {
             ITag playTag = playNodes[j] as ITag;
             if (playTag == null || playTag.Attributes == null)
             {
                 continue;
             }

             // Check the raw attribute values before converting them: calling ToString()
             // on a missing attribute is what the old null check could never catch.
             object showtimeValue = playTag.Attributes["SHOWTIMEID"];
             object timeValue = playTag.Attributes["TIME"];
             if (showtimeValue == null || timeValue == null)
             {
                 continue;
             }

             int playId;
             if (!int.TryParse(showtimeValue.ToString(), out playId))
             {
                 continue;
             }

             // The first 10 characters of TIME are stripped; the remainder becomes the play name.
             // NOTE(review): presumably a date prefix - confirm against real page data.
             string strTime = timeValue.ToString().Trim();
             if (strTime.Length < 10)
             {
                 continue;
             }

             Model.Play play = new Model.Play();
             play.CinemaID = int.Parse(cinemaID);
             play.MovieID = movieId;
             play.PlayID = playId;
             play.PlayName = strTime.Remove(0, 10).Trim();

             // Price text lives at: first child -> next sibling -> first child -> next sibling.
             // Any missing link in that chain means "no price", which is stored as 0.
             INode priceNode = playTag.FirstChild;
             priceNode = priceNode == null ? null : priceNode.NextSibling;
             priceNode = priceNode == null ? null : priceNode.FirstChild;
             priceNode = priceNode == null ? null : priceNode.NextSibling;
             string strPrice = priceNode == null ? null : priceNode.ToPlainTextString();

             play.Price = 0f;
             if (strPrice != null)
             {
                 strPrice = strPrice.Trim();
                 // The leading character is dropped before parsing
                 // (NOTE(review): presumably a currency symbol - confirm).
                 float price;
                 if (strPrice.Length > 1 &&
                     float.TryParse(
                         strPrice.Remove(0, 1),
                         System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                         System.Globalization.CultureInfo.InvariantCulture,
                         out price))
                 {
                     play.Price = price;
                 }
             }

             plays.Add(play);
         }
     }

     return plays;
 }

Exemplos de Winista.Text.HtmlParser.Filters HasAttributeFilter em C# (CSharp)