Ejemplo n.º 1
0
 /// <summary>
 /// Returns the plain text of the first <paramref name="tag"/> element that has an
 /// attribute matching <paramref name="attribute"/> whose value equals <paramref name="attValue"/>.
 /// </summary>
 /// <param name="html">HTML source to search.</param>
 /// <param name="tag">Tag name passed to the <c>TagNameFilter</c>.</param>
 /// <param name="attribute">
 /// Attribute name. It is tested with <c>string.Contains</c> against every attribute key,
 /// so a key that merely contains this text also matches.
 /// </param>
 /// <param name="attValue">Exact value the attribute must have.</param>
 /// <returns>
 /// The result of <c>ToPlainTextString()</c> on the first matching tag, or <c>null</c>
 /// when <paramref name="html"/> is null/empty or no tag matches.
 /// </returns>
 public static string getValue(string html, string tag, string attribute, string attValue)
 {
     if (string.IsNullOrEmpty(html))
     {
         // Nothing to parse: report it the same way as an unsuccessful search.
         return null;
     }

     Lexer lexer = new Lexer(html);
     Parser parser = new Parser(lexer);
     NodeFilter nodeFilter = new TagNameFilter(tag);
     NodeList nodeList = parser.Parse(nodeFilter);
     for (int i = 0; i < nodeList.Count; i++)
     {
         ITag tagNode = nodeList[i] as ITag;
         if (tagNode == null || tagNode.Attributes == null || tagNode.Attributes.Count == 0)
         {
             // Not a tag, or a tag without attributes: it cannot match.
             continue;
         }

         foreach (string key in tagNode.Attributes.Keys)
         {
             // NOTE(review): keys containing "<TAGNAME>" are presumably the parser's
             // bookkeeping entry for the tag name rather than a real attribute - confirm.
             if (key.Contains("<TAGNAME>"))
             {
                 continue;
             }
             if (!key.Contains(attribute))
             {
                 continue;
             }

             // An attribute entry may carry no value; treat that as "does not match"
             // instead of failing with a NullReferenceException.
             object rawValue = tagNode.Attributes[key];
             if (rawValue != null && rawValue.ToString() == attValue)
             {
                 return tagNode.ToPlainTextString();
             }
         }
     }
     return null;
 }
        /// <summary>
        /// Rebuilds an HTML document from <paramref name="sourceHtml"/>. The output keeps the
        /// first child of the HTML node and, from the second child of the HTML node's second
        /// child, only those children that are tags.
        /// </summary>
        /// <param name="sourceHtml">HTML source; line breaks are removed before parsing.</param>
        /// <returns>The reassembled HTML string.</returns>
        public static string HtmlPaser(string sourceHtml)
        {
            string flattened = sourceHtml.Replace(System.Environment.NewLine, "");
            Winista.Text.HtmlParser.Parser htmlParser = Parser.CreateParser(flattened, "utf-8");

            NodeFilter htmlFilter = new TagNameFilter("HTML");
            INode root = htmlParser.Parse(htmlFilter)[0];

            // The layout is addressed purely by position: child 0 is emitted as the head
            // section, child 1 is taken as the body, and the body's child 1 as the container.
            string headHtml = root.Children[0].ToHtml();
            INode container = root.Children[1].Children[1];

            // Collect the markup of the container's tag children; non-tag nodes are dropped.
            StringBuilder innerHtml = new StringBuilder();
            int index = 0;
            while (index < container.Children.Count)
            {
                INode child = container.Children[index];
                if (child is ITag)
                {
                    innerHtml.Append(child.ToHtml());
                }
                index++;
            }

            StringBuilder output = new StringBuilder();
            output.Append("<html>");
            output.Append(headHtml);
            output.Append("<body>");
            // The opening tag is rebuilt from the container's own text; the closing tag
            // below is always "</div>".
            output.Append(string.Format("<{0}>", container.GetText()));
            output.Append(innerHtml.ToString());
            output.Append("</div>");
            output.Append("</body>");
            output.Append("</html>");
            return output.ToString();
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Extracts products from an HTML page. Every node with <c>class="product"</c> is
        /// inspected for a title (<c>class="product-title"</c>), a price
        /// (<c>class="product-price"</c>), an image (<c>img</c> tag) and an optional promotion
        /// (<c>class="promo"</c>).
        /// </summary>
        /// <param name="html">HTML source of the product listing.</param>
        /// <returns>
        /// The products that could be read completely. A product node that lacks a title,
        /// a parsable price or an image is skipped.
        /// </returns>
        public static List<Product> LoadGoods(string html)
        {
            // Number of leading characters dropped from the price text before parsing.
            // NOTE(review): presumably the length of a currency prefix - confirm against the page.
            const int pricePrefixLength = 7;

            Lexer lexer = new Lexer(html);
            Parser parser = new Parser(lexer);
            NodeFilter filter = new HasAttributeFilter("class", "product");
            NodeList products = parser.ExtractAllNodesThatMatch(filter);

            List<Product> result = new List<Product>();
            for (int i = 0; i < products.Count; i++)
            {
                ITag product = products[i] as ITag;
                if (product == null || product.Children == null)
                {
                    continue;
                }

                // name (required)
                ITag name = FindFirstTag(product, new HasAttributeFilter("class", "product-title"));
                string nameText = name == null ? null : name.ToPlainTextString();
                if (nameText == null)
                {
                    continue;
                }

                // price (required)
                ITag price = FindFirstTag(product, new HasAttributeFilter("class", "product-price"));
                string priceText = price == null ? null : price.ToPlainTextString();
                if (priceText == null)
                {
                    continue;
                }
                priceText = priceText.Trim();

                // Parse with the invariant culture so the result does not depend on the
                // regional settings of the machine running the code.
                decimal pprice;
                if (priceText.Length <= pricePrefixLength ||
                    !decimal.TryParse(
                        priceText.Substring(pricePrefixLength),
                        System.Globalization.NumberStyles.Number,
                        System.Globalization.CultureInfo.InvariantCulture,
                        out pprice))
                {
                    continue;
                }

                // img (required tag; the lazy-load attribute itself may be absent)
                ITag img = FindFirstTag(product, new TagNameFilter("img"));
                if (img == null)
                {
                    continue;
                }

                // promo (optional)
                string ppromo = "";
                ITag promo = FindFirstTag(product, new HasAttributeFilter("class", "promo"));
                if (promo != null)
                {
                    ppromo = promo.GetAttribute("data-promo");
                }

                Product p = new Product();
                p.img = img.GetAttribute("DATA-KS-LAZYLOAD");
                p.name = nameText.Trim();
                p.price = pprice;
                p.promo = ppromo;
                result.Add(p);
            }

            return result;
        }

        /// <summary>
        /// Returns the first descendant of <paramref name="parent"/> accepted by
        /// <paramref name="filter"/> as a tag, or <c>null</c> when there is none.
        /// </summary>
        private static ITag FindFirstTag(ITag parent, NodeFilter filter)
        {
            NodeList matches = parent.Children.ExtractAllNodesThatMatch(filter, true);
            if (matches == null || matches.Count == 0)
            {
                return null;
            }
            return matches[0] as ITag;
        }
Ejemplo n.º 4
0
            /// <summary>
            /// Converts every <paramref name="targetTag"/> element found in the HTML source
            /// into an XML document string.
            /// </summary>
            /// <param name="html">HTML source.</param>
            /// <param name="targetTag">Name of the tag to convert.</param>
            /// <returns>
            /// The outer XML of a document with an XML declaration and a <c>Tags</c> root
            /// element that holds one child element per matched tag.
            /// </returns>
            /// <exception cref="Exception">
            /// Thrown when the conversion fails; the original failure is available through
            /// <see cref="Exception.InnerException"/>.
            /// </exception>
            public static string CovertHtmlToXml(string html, string targetTag)
            {
                try
                {
                    XmlDocument doc = new XmlDocument();
                    XmlNode xmlDeclaration = doc.CreateXmlDeclaration("1.0", "utf-8", null);
                    doc.AppendChild(xmlDeclaration);

                    // Parse the HTML content with the HTML parser.
                    Parser parser = Parser.CreateParser(html, "GBK");
                    // Select the requested nodes.
                    TagNameFilter tnf = new TagNameFilter(targetTag);
                    NodeList nodes = parser.Parse(tnf);

                    // Create the root node.
                    XmlElement root = doc.CreateElement("Tags");

                    for (int i = 0; i < nodes.Size(); i++)
                    {
                        TagNode tagNode = nodes[i] as TagNode;
                        if (tagNode == null)
                        {
                            // Not a tag node: there is nothing to convert.
                            continue;
                        }

                        XmlElement parent = doc.CreateElement(tagNode.TagName);

                        // Copy the attributes.
                        Hashtable ht = tagNode.Attributes;
                        if (ht != null)
                        {
                            foreach (DictionaryEntry ent in ht)
                            {
                                string attrName = ent.Key.ToString();

                                // Only copy attributes whose name passes the validName pattern.
                                if (Regex.IsMatch(attrName, validName))
                                {
                                    XmlAttribute attr = doc.CreateAttribute(attrName);
                                    // An attribute without a value is written as an empty string
                                    // instead of aborting the whole conversion.
                                    attr.Value = ent.Value == null ? string.Empty : ent.Value.ToString();
                                    parent.Attributes.Append(attr);
                                }
                            }
                        }

                        AppendChild(tagNode, parent, doc);

                        root.AppendChild(parent);
                    }
                    doc.AppendChild(root);

                    return doc.OuterXml;
                }
                catch (Exception ex)
                {
                    // Same exception type and message as before, but keep the original
                    // exception (and its stack trace) as the inner exception.
                    throw new Exception("转换html内容出错:" + ex.Message, ex);
                }
            }
Ejemplo n.º 5
0
        /// <summary>
        /// Downloads the reference page for <paramref name="paper_id"/> and extracts the
        /// reference entries listed inside the <c>div</c> with <c>id="export_container"</c>.
        /// </summary>
        /// <param name="paper_id">Value passed on to <c>_HttpUtil.getPaperReferenceHTML</c>.</param>
        /// <returns>
        /// A list of reference strings (one per tag child of the container), or <c>null</c>
        /// when the page is empty or the container is not found. In the <c>null</c> cases
        /// <c>_Progressable.onFinish</c> is not invoked.
        /// </returns>
        protected ArrayList getPaperReferenceByID(ArrayList paper_id)
        {
            string html_page = _HttpUtil.getPaperReferenceHTML(paper_id);

            if (string.IsNullOrEmpty(html_page))
            {
                return null;
            }

            Parser p = new Parser(new Lexer(html_page));

            TagNameFilter tag_f = new TagNameFilter("div");
            HasAttributeFilter attr_f = new HasAttributeFilter("id", "export_container");

            AndFilter af = new AndFilter(tag_f, attr_f);

            NodeList childs = p.ExtractAllNodesThatMatch(af);

            if (childs == null || childs.Count <= 0)
            {
                return null;
            }

            INode node = childs[0];

            NodeList ref_childs = node.Children;
            ArrayList ref_list = new ArrayList();

            // Guard against a container that has no child list at all; it is treated
            // the same as a container without any tag children (empty result).
            int child_count = ref_childs == null ? 0 : ref_childs.Count;

            for (int i = 0; i < child_count; ++i)
            {
                ITag tag = ref_childs[i] as ITag;
                if (tag == null)
                {
                    // Only tag children carry a reference entry.
                    continue;
                }

                string str = tag.ToPlainTextString();
                if (str == null)
                {
                    continue;
                }

                // Flatten line breaks into spaces.
                str = str.Replace('\r', ' ').Replace('\n', ' ');

                // Keep the text after the first ']'. When there is no ']', IndexOf
                // returns -1 and the whole string is kept.
                str = str.Substring(str.IndexOf(']') + 1);

                ref_list.Add(str);
            }

            if (_Progressable != null)
            {
                _Progressable.onFinish(ref_list);
            }

            return ref_list;
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Looks up the identifier of the paper called <paramref name="paper_name"/>.
        /// The search page is scanned for an <c>A</c> tag with <c>target="_blank"</c> that has
        /// a child accepted by <c>PaperFilter</c>; the identifier is taken from its <c>href</c>.
        /// </summary>
        /// <param name="paper_name">Paper name passed to <c>_HttpUtil.getPaperIDHTML</c> and to <c>PaperFilter</c>.</param>
        /// <returns>
        /// The last path segment of the link's <c>href</c>, cut at its first '.', or
        /// <c>null</c> when the page is empty, no link matches, or the link has no <c>href</c>.
        /// </returns>
        protected string getPaperID(string paper_name)
        {
            string page = _HttpUtil.getPaperIDHTML(paper_name);
            if (string.IsNullOrEmpty(page))
            {
                return null;
            }

            Parser pageParser = new Parser(new Lexer(page));

            TagNameFilter anchorFilter = new TagNameFilter("A");
            HasAttributeFilter targetFilter = new HasAttributeFilter("target", "_blank");
            HasChildFilter titleFilter = new HasChildFilter(new PaperFilter(paper_name));
            AndFilter combined = new AndFilter(new AndFilter(anchorFilter, targetFilter), titleFilter);

            NodeList matches = pageParser.ExtractAllNodesThatMatch(combined);
            if (matches == null || matches.Count <= 0)
            {
                // Paper not found.
                return null;
            }

            // TODO: several papers may match; only the first one is used.
            ITag link = matches[0] as ITag;
            if (link == null)
            {
                return null;
            }

            string href = link.GetAttribute("href");
            if (string.IsNullOrEmpty(href))
            {
                return null;
            }

            // Text after the last '/' (the whole href when it contains no '/').
            string lastSegment = href.Substring(href.LastIndexOf('/') + 1);

            // Cut at the first '.', if there is one.
            int dotIndex = lastSegment.IndexOf('.');
            if (dotIndex < 0)
            {
                return lastSegment;
            }
            return lastSegment.Substring(0, dotIndex);
        }