/// <summary>
/// Parses <paramref name="html"/>, looks at every tag named <paramref name="tag"/> and returns
/// the plain text of the first one that has a matching attribute.
/// </summary>
/// <param name="html">HTML source to search.</param>
/// <param name="tag">Name of the tag to look for.</param>
/// <param name="attribute">Text that the attribute name must contain (substring match).</param>
/// <param name="attValue">Value the attribute must be equal to.</param>
/// <returns>Plain text of the first matching tag, or <c>null</c> when no tag matches.</returns>
public static string getValue(string html, string tag, string attribute, string attValue)
{
    Parser parser = new Parser(new Lexer(html));
    NodeList nodeList = parser.Parse(new TagNameFilter(tag));

    for (int i = 0; i < nodeList.Count; i++)
    {
        ITag tagNode = nodeList[i] as ITag;

        // Skip anything that is not a tag or carries no attributes at all.
        if (tagNode == null || tagNode.Attributes == null || tagNode.Attributes.Count == 0)
        {
            continue;
        }

        foreach (string key in tagNode.Attributes.Keys)
        {
            // The attribute table also holds an entry keyed "<TAGNAME>"; it is not a real attribute.
            if (key.Contains("<TAGNAME>") || !key.Contains(attribute))
            {
                continue;
            }

            // Guard against a null entry (presumably an attribute written without a value)
            // instead of letting ToString() throw a NullReferenceException.
            object rawValue = tagNode.Attributes[key];
            if (rawValue != null && rawValue.ToString() == attValue)
            {
                return tagNode.ToPlainTextString();
            }
        }
    }

    return null;
}
/// <summary>
/// Re-assembles an HTML document from <paramref name="sourceHtml"/>: the first child of the HTML
/// element is copied verbatim, followed by a body that contains one wrapper element holding only
/// the tag children of the second child of the HTML element's second child.
/// </summary>
/// <param name="sourceHtml">HTML source; line breaks are removed before parsing.</param>
/// <returns>The rebuilt HTML document as a string.</returns>
public static string HtmlPaser(string sourceHtml)
{
    string singleLineHtml = sourceHtml.Replace(System.Environment.NewLine, "");
    Winista.Text.HtmlParser.Parser parser = Parser.CreateParser(singleLineHtml, "utf-8");

    // The positional lookups below assume a fixed document layout
    // (first child, second child, then that child's second child).
    INode root = parser.Parse(new TagNameFilter("HTML"))[0];
    string headHtml = root.Children[0].ToHtml();
    INode container = root.Children[1].Children[1];

    // Keep only the children of the container that are tags.
    StringBuilder innerHtml = new StringBuilder();
    int index = 0;
    while (index < container.Children.Count)
    {
        INode child = container.Children[index];
        if (child is ITag)
        {
            innerHtml.Append(child.ToHtml());
        }
        index++;
    }

    StringBuilder document = new StringBuilder();
    document.Append("<html>")
            .Append(headHtml)
            .Append("<body>")
            .Append("<" + container.GetText() + ">")
            .Append(innerHtml.ToString())
            .Append("</div>")
            .Append("</body>")
            .Append("</html>");
    return document.ToString();
}
/// <summary>
/// Extracts products from <paramref name="html"/>. Every node selected by the
/// class="product" filter is scanned for a title (class "product-title"), a price
/// (class "product-price"), its first img tag and an optional promo (class "promo").
/// </summary>
/// <param name="html">HTML source of the product listing.</param>
/// <returns>
/// The products that could be read completely. A product whose title, price or image is missing
/// or malformed is skipped; the list is never <c>null</c>.
/// </returns>
public static List<Product> LoadGoods(string html)
{
    // Number of leading characters dropped from the price text before it is parsed.
    // NOTE(review): presumably a currency prefix - confirm against the scraped markup.
    const int pricePrefixLength = 7;

    Parser parser = new Parser(new Lexer(html));
    NodeList products = parser.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "product"));
    List<Product> result = new List<Product>();

    for (int i = 0; i < products.Count; i++)
    {
        try
        {
            ITag product = products[i] as ITag;
            if (product == null || product.Children == null)
            {
                continue;
            }

            // name
            ITag name = FindFirstTag(product.Children, new HasAttributeFilter("class", "product-title"));
            if (name == null)
            {
                continue;
            }
            string nameText = name.ToPlainTextString();
            if (nameText == null)
            {
                continue;
            }

            // price
            ITag price = FindFirstTag(product.Children, new HasAttributeFilter("class", "product-price"));
            if (price == null)
            {
                continue;
            }
            string priceText = price.ToPlainTextString();
            if (priceText == null)
            {
                continue;
            }
            priceText = priceText.Trim();
            if (priceText.Length < pricePrefixLength)
            {
                continue;
            }
            // Scraped numbers are machine data: parse them culture-independently so the result
            // does not depend on the regional settings of the machine running the scraper.
            decimal pprice;
            if (!decimal.TryParse(
                    priceText.Substring(pricePrefixLength),
                    System.Globalization.NumberStyles.Number,
                    System.Globalization.CultureInfo.InvariantCulture,
                    out pprice))
            {
                continue;
            }

            // img
            ITag img = FindFirstTag(product.Children, new TagNameFilter("img"));
            if (img == null)
            {
                continue;
            }

            // promo (optional)
            string ppromo = "";
            ITag promo = FindFirstTag(product.Children, new HasAttributeFilter("class", "promo"));
            if (promo != null)
            {
                ppromo = promo.GetAttribute("data-promo");
            }

            Product p = new Product();
            p.img = img.GetAttribute("DATA-KS-LAZYLOAD");
            p.name = nameText.Trim();
            p.price = pprice;
            p.promo = ppromo;
            result.Add(p);
        }
        catch (Exception)
        {
            // Deliberate best-effort: an unexpected parser failure on one product must not
            // abort the whole listing, so that product is skipped.
        }
    }

    return result;
}

/// <summary>
/// Returns the first node under <paramref name="scope"/> (searched recursively) that matches
/// <paramref name="filter"/> and is a tag, or <c>null</c> when there is none.
/// </summary>
private static ITag FindFirstTag(NodeList scope, NodeFilter filter)
{
    NodeList matches = scope.ExtractAllNodesThatMatch(filter, true);
    if (matches == null || matches.Count == 0)
    {
        return null;
    }
    return matches[0] as ITag;
}
/// <summary>
/// Converts the tags named <paramref name="targetTag"/> found in <paramref name="html"/> into an
/// XML document whose root element is "Tags".
/// </summary>
/// <param name="html">HTML source.</param>
/// <param name="targetTag">Name of the tag to convert.</param>
/// <returns>The XML document as a string.</returns>
/// <exception cref="Exception">
/// Thrown when parsing or conversion fails; the original failure is available as the inner exception.
/// </exception>
public static string CovertHtmlToXml(string html, string targetTag)
{
    try
    {
        XmlDocument doc = new XmlDocument();
        doc.AppendChild(doc.CreateXmlDeclaration("1.0", "utf-8", null));

        // Parse the HTML with htmlparser and pick out the requested tags.
        Parser parser = Parser.CreateParser(html, "GBK");
        NodeList nodes = parser.Parse(new TagNameFilter(targetTag));

        // Root element that collects one child per matched tag.
        XmlElement root = doc.CreateElement("Tags");

        for (int i = 0; i < nodes.Size(); i++)
        {
            TagNode tagNode = nodes[i] as TagNode;
            if (tagNode == null)
            {
                continue;
            }

            XmlElement parent = doc.CreateElement(tagNode.TagName);

            // Copy the attributes whose names pass the validName pattern.
            Hashtable ht = tagNode.Attributes;
            if (ht != null)
            {
                foreach (DictionaryEntry ent in ht)
                {
                    string attrName = ent.Key.ToString();
                    if (!Regex.IsMatch(attrName, validName))
                    {
                        continue;
                    }

                    XmlAttribute attr = doc.CreateAttribute(attrName);
                    // A null entry (presumably an attribute written without a value) becomes an
                    // empty string instead of failing the whole conversion.
                    attr.Value = ent.Value == null ? string.Empty : ent.Value.ToString();
                    parent.Attributes.Append(attr);
                }
            }

            AppendChild(tagNode, parent, doc);
            root.AppendChild(parent);
        }

        doc.AppendChild(root);
        return doc.OuterXml;
    }
    catch (Exception ex)
    {
        // Keep the original exception as InnerException so its type and stack trace are not lost.
        throw new Exception("转换html内容出错:" + ex.Message, ex);
    }
}
/// <summary>
/// Downloads the reference page for <paramref name="paper_id"/> and collects the plain text of
/// every tag found directly inside the div with id "export_container".
/// </summary>
/// <param name="paper_id">Paper identifiers passed on to the HTTP helper.</param>
/// <returns>
/// A list of reference strings (text up to and including the first ']' is removed from each),
/// or <c>null</c> when the page is empty or the container div is not found.
/// </returns>
protected ArrayList getPaperReferenceByID(ArrayList paper_id)
{
    string html_page = _HttpUtil.getPaperReferenceHTML(paper_id);
    if (string.IsNullOrEmpty(html_page))
    {
        return null;
    }

    Parser p = new Parser(new Lexer(html_page));
    AndFilter af = new AndFilter(
        new TagNameFilter("div"),
        new HasAttributeFilter("id", "export_container"));
    NodeList childs = p.ExtractAllNodesThatMatch(af);
    if (childs == null || childs.Count <= 0)
    {
        return null;
    }

    // A container without any child nodes exposes a null child list; treat that as
    // "no references" instead of failing with a NullReferenceException.
    NodeList ref_childs = childs[0].Children;
    int ref_count = ref_childs == null ? 0 : ref_childs.Count;

    ArrayList ref_list = new ArrayList();
    for (int i = 0; i < ref_count; ++i)
    {
        ITag tag = ref_childs[i] as ITag;
        if (tag == null)
        {
            continue;
        }

        string str = tag.ToPlainTextString();
        str = str.Replace('\r', ' ').Replace('\n', ' ');
        // Drop everything up to and including the first ']'; when there is no ']',
        // IndexOf returns -1 and the whole string is kept.
        str = str.Substring(str.IndexOf(']') + 1);
        ref_list.Add(str);
    }

    if (_Progressable != null)
    {
        _Progressable.onFinish(ref_list);
    }
    return ref_list;
}
/// <summary>
/// Looks up the identifier of the paper named <paramref name="paper_name"/>. The search page is
/// scanned for an A tag with target="_blank" that has a child accepted by PaperFilter; the
/// identifier is the last path segment of its href, cut at the first '.'.
/// </summary>
/// <param name="paper_name">Name of the paper to search for.</param>
/// <returns>The identifier, or <c>null</c> when the page is empty or no usable link is found.</returns>
protected string getPaperID(string paper_name)
{
    string html_page = _HttpUtil.getPaperIDHTML(paper_name);
    if (string.IsNullOrEmpty(html_page))
    {
        return null;
    }

    Parser pageParser = new Parser(new Lexer(html_page));
    AndFilter anchorFilter = new AndFilter(
        new TagNameFilter("A"),
        new HasAttributeFilter("target", "_blank"));
    AndFilter paperLinkFilter = new AndFilter(
        anchorFilter,
        new HasChildFilter(new PaperFilter(paper_name)));

    NodeList matches = pageParser.ExtractAllNodesThatMatch(paperLinkFilter);
    if (matches == null || matches.Count <= 0)
    {
        // Paper not found
        return null;
    }

    // TODO: several papers may match; only the first one is used.
    ITag link = matches[0] as ITag;
    if (link == null)
    {
        return null;
    }

    string href = link.GetAttribute("href");
    if (string.IsNullOrEmpty(href))
    {
        return null;
    }

    // Last path segment, then everything before its first '.'.
    string lastSegment = href.Substring(href.LastIndexOf('/') + 1);
    int dotIndex = lastSegment.IndexOf('.');
    return dotIndex < 0 ? lastSegment : lastSegment.Substring(0, dotIndex);
}