/// <summary> Construct an HTML page that draws its nodes from the given parser.
/// Initializes the title to the empty string and all node collections to empty lists.
/// </summary>
/// <param name="parser">The parser supplying nodes for this page.</param>
public HtmlPage(Parser parser) : base(true)
{
    title = "";
    m_Images = new NodeList();
    tables = new NodeList();
    nodesInBody = new NodeList();
}
/// <summary> Create an abstract node spanning the given positions on a page.
/// Records the source page plus the start and end cursor offsets; the node
/// starts out detached (no parent, no children).
/// </summary>
/// <param name="page">The page this tag was read from.</param>
/// <param name="start">The starting offset of this node within the page.</param>
/// <param name="end">The ending offset of this node within the page.</param>
public AbstractNode(Page page, int start, int end)
{
    parent = null;
    children = null;
    mPage = page;
    nodeBegin = start;
    nodeEnd = end;
}
/// <summary> Fetch the page at the given URL and collect every node whose
/// "class" attribute equals "list_title".
/// </summary>
/// <param name="url">The URL of the page to parse.</param>
/// <returns>The list of matching nodes (possibly empty).</returns>
public NodeList GetListUrl(string url)
{
    Parser parser = ParserHelp.GetParser(url);
    NodeFilter filter = new HasAttributeFilter("class", "list_title");
    // The original allocated an empty NodeList and immediately overwrote it;
    // return the extraction result directly instead.
    return parser.ExtractAllNodesThatMatch(filter);
}
/// <summary> Search the given node and pick up any objects of the given type.</summary>
/// <param name="node">The node to search.</param>
/// <param name="type">The class to search for.</param>
/// <returns> A node array with the matching nodes.</returns>
public static INode[] FindTypeInNode(INode node, System.Type type)
{
    NodeList collected = new NodeList();
    INodeFilter filter = new NodeClassFilter(type);
    node.CollectInto(collected, filter);
    return collected.ToNodeArray();
}
/// <summary> Scan for script.
/// Accumulates text from the page, until &lt;/[a-zA-Z] is encountered.
/// </summary>
/// <param name="tag">The tag this scanner is responsible for.</param>
/// <param name="lexer">The source of CDATA.</param>
/// <param name="stack">The parse stack, <em>not used</em>.</param>
public override ITag Scan(ITag tag, Lexer lexer, NodeList stack)
{
    System.String language;
    System.String code;
    INode content;
    int position;
    INode node;
    TagAttribute attribute;
    System.Collections.ArrayList vector;
    if (tag is ScriptTag)
    {
        language = ((ScriptTag) tag).Language;
        // Microsoft script-encoded blocks (JScript.Encode / VBScript.Encode)
        // must be decoded before the script code is stored on the tag.
        if ((null != language) && (language.ToUpper().Equals("JScript.Encode".ToUpper()) || language.ToUpper().Equals("VBScript.Encode".ToUpper())))
        {
            code = ScriptDecoder.Decode(lexer.Page, lexer.Cursor);
            ((ScriptTag) tag).ScriptCode = code;
        }
    }
    // Consume everything up to a plausible end tag as raw CDATA (non-strict mode).
    content = lexer.ParseCDATA(!STRICT);
    // Remember the cursor so we can rewind if the next node is not our end tag.
    position = lexer.Position;
    node = lexer.NextNode(false);
    if (null != node)
        // Accept only an end tag whose name matches this tag's primary id;
        // anything else is pushed back by restoring the lexer position.
        if (!(node is ITag) || !(((ITag) node).IsEndTag() && ((ITag) node).TagName.Equals(tag.Ids[0])))
        {
            lexer.Position = position;
            node = null;
        }
    // build new end tag if required
    if (null == node)
    {
        attribute = new TagAttribute("/script", null);
        vector = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
        vector.Add(attribute);
        // Zero-length virtual end tag at the current position.
        node = lexer.NodeFactory.CreateTagNode(lexer.Page, position, position, vector);
    }
    tag.SetEndTag((ITag) node);
    if (null != content)
    {
        tag.Children = new NodeList(content);
        content.Parent = tag;
    }
    node.Parent = tag;
    tag.DoSemanticAction();
    return (tag);
}
/// <summary> Extract the company name from the first node's description text
/// and store it on the job record.
/// </summary>
/// <param name="list">Parsed nodes; the first element's text is scanned.</param>
/// <param name="info">The job record to fill in.</param>
public void GetContartInfo(NodeList list, ref Job info)
{
    string miaoshu = list[0].ToString();
    if (string.IsNullOrEmpty(miaoshu))
    {
        return;
    }
    // Strip literal "\t" sequences and all whitespace before matching.
    miaoshu = Regex.Replace(miaoshu, @"(\\t|\s)", "");
    Match company = Regex.Match(miaoshu, @"Txt\(4903\[108\,12\]\,4935\[110\,16\]\)\:\\n(?<company>\w*)\\n...End", RegexOptions.Multiline);
    if (company.Success)
    {
        // BUG FIX: the original assigned company.Value — the ENTIRE match,
        // including the "Txt(...)" scaffolding — even though the pattern
        // defines a named capture group for just the company name.
        info.company = company.Groups["company"].Value;
    }
}
/// <summary> Get the next node.</summary>
/// <returns> The next node in the HTML stream, or null if there are no more nodes.</returns>
/// <exception cref="ParserException">If an unrecoverable error occurs.</exception>
public virtual INode NextNode()
{
    ITag tag;
    IScanner scanner;
    NodeList stack;
    INode ret;
    try
    {
        ret = mLexer.NextNode();
        if (null != ret)
        {
            // kick off recursion for the top level node
            if (ret is ITag)
            {
                tag = (ITag) ret;
                if (!tag.IsEndTag())
                {
                    // now recurse if there is a scanner for this type of tag
                    scanner = tag.ThisScanner;
                    if (null != scanner)
                    {
                        stack = new NodeList();
                        ret = scanner.Scan(tag, mLexer, stack);
                    }
                }
            }
        }
    }
    catch (ParserException)
    {
        // BUG FIX: was "throw pe;", which resets the stack trace;
        // a bare rethrow preserves the original trace. No need to wrap
        // an existing ParserException.
        throw;
    }
    catch (System.Exception e)
    {
        // Wrap any unexpected error with page context, report it, then rethrow.
        System.Text.StringBuilder msgBuffer = new System.Text.StringBuilder();
        msgBuffer.Append("Unexpected Exception occurred while reading ");
        msgBuffer.Append(mLexer.Page.Url);
        msgBuffer.Append(", in nextNode");
        // TODO: appendLineDetails (msgBuffer);
        ParserException ex = new ParserException(msgBuffer.ToString(), e);
        mFeedback.Error(msgBuffer.ToString(), ex);
        throw ex;
    }
    return (ret);
}
/// <summary>
/// Convert relative img paths in an HTML fragment into complete network URLs.
/// </summary>
/// <param name="html">The HTML content to convert.</param>
/// <param name="url">The site root to prepend, e.g. http://www.z01.com</param>
/// <returns>The converted HTML, or the input unchanged when either argument is empty.</returns>
public string ConvertImgUrl(string html, string url)
{
    if (string.IsNullOrEmpty(html) || string.IsNullOrEmpty(url))
    {
        return html;
    }
    HtmlPage page = GetPage("<html><body>" + html + "</body></html>");
    Winista.Text.HtmlParser.Util.NodeList nodes = page.Body.ExtractAllNodesThatMatch(new TagNameFilter("IMG"), true);
    for (int i = 0; i < nodes.Count; i++)
    {
        ImageTag image = (ImageTag) nodes[i];
        // FIX: "://" contains no letters, so the original ToLower() call was a
        // pointless per-node string allocation; an absolute URL is detected the same way.
        if (!image.ImageURL.Contains("://"))
        {
            image.ImageURL = url.TrimEnd('/') + "/" + image.ImageURL.TrimStart('/');
        }
    }
    return page.Body.ToHtml();
}
/// <summary> Fetch a job-detail page and populate a <see cref="Job"/> record
/// from the "miaoshu" and "mainBox" sections.
/// </summary>
/// <param name="url">The detail page URL.</param>
/// <returns>The populated job record; empty when fewer than two sections were found.</returns>
public Job GetDetail(string url)
{
    Job info = new Job();
    Parser parser = ParserHelp.GetParser(url);
    NodeFilter miaoShu = new HasAttributeFilter("id", "miaoshu");
    NodeFilter mainBox = new HasAttributeFilter("class", "mainBox");
    NodeFilter orfilter = new OrFilter(miaoShu, mainBox);
    // The original allocated an empty NodeList and immediately overwrote it;
    // assign the parse result directly.
    NodeList list = parser.Parse(orfilter);
    // Both sections are required by the extractors below.
    if (list == null || list.Count < 2)
    {
        return info;
    }
    GetMiaoShu(list, ref info);
    GetContartInfo(list, ref info);
    return info;
}
/// <summary> Scan for style definitions.
/// Accumulates text from the page, until &lt;/[a-zA-Z] is encountered.
/// </summary>
/// <param name="tag">The tag this scanner is responsible for.</param>
/// <param name="lexer">The source of CDATA.</param>
/// <param name="stack">The parse stack, <em>not used</em>.</param>
public override ITag Scan(ITag tag, Lexer lexer, NodeList stack)
{
    INode content;
    int position;
    INode node;
    TagAttribute attribute;
    System.Collections.ArrayList vector;
    // Consume the style sheet text up to a plausible end tag as raw CDATA.
    content = lexer.ParseCDATA();
    // Remember the cursor so we can rewind if the next node is not our end tag.
    position = lexer.Position;
    node = lexer.NextNode(false);
    if (null != node)
        // Accept only an end tag whose name matches this tag's primary id;
        // anything else is pushed back by restoring the lexer position.
        if (!(node is ITag) || !(((ITag) node).IsEndTag() && ((ITag) node).TagName.Equals(tag.Ids[0])))
        {
            lexer.Position = position;
            node = null;
        }
    // build new end tag if required
    if (null == node)
    {
        attribute = new TagAttribute("/style", null);
        vector = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
        vector.Add(attribute);
        // Zero-length virtual end tag at the current position.
        node = lexer.NodeFactory.CreateTagNode(lexer.Page, position, position, vector);
    }
    tag.SetEndTag((ITag) node);
    if (null != content)
    {
        tag.Children = new NodeList(content);
        content.Parent = tag;
    }
    node.Parent = tag;
    tag.DoSemanticAction();
    return (tag);
}
/// <summary> Extract subtitle text from the given nodes and write it to
/// "&lt;subtitleType&gt;.txt", one line per &lt;br&gt;-delimited run of text.
/// </summary>
/// <param name="nodeList">Nodes whose following TableColumn siblings hold the subtitle text.</param>
/// <param name="subtitleType">Used both as console label and output file base name.</param>
static void GetSubtitleFromHtml(NodeList nodeList, string subtitleType)
{
    Console.WriteLine(subtitleType);
    // FIX: the writer was closed manually at the end; an exception mid-loop
    // leaked the file handle. A using block guarantees disposal.
    using (StreamWriter writer = new StreamWriter(subtitleType + ".txt"))
    {
        for (int i = 0; i < nodeList.Count; i++)
        {
            INode currentNode = (INode) nodeList[i];
            // Advance to the next sibling that is exactly a TableColumn
            // (deliberate exact-type check, not an 'is' subclass match).
            while (currentNode.NextSibling != null && !currentNode.NextSibling.GetType().Equals(typeof(TableColumn)))
            {
                currentNode = currentNode.NextSibling;
            }
            if (currentNode.NextSibling != null)
            {
                TableColumn contentNode = (TableColumn) currentNode.NextSibling;
                string content = "";
                NodeList childrenNode = contentNode.Children;
                if (childrenNode != null)
                {
                    for (int j = 0; j < childrenNode.Count; j++)
                    {
                        // A <br> ends the current subtitle line: flush and reset.
                        if (childrenNode[j].GetText().Equals("br"))
                        {
                            writer.WriteLine(content);
                            content = "";
                            continue;
                        }
                        string tmpStr = HttpUtility.HtmlDecode(childrenNode[j].ToPlainTextString());
                        tmpStr = tmpStr.Trim();
                        content += tmpStr;
                    }
                    // Flush whatever text followed the last <br>.
                    writer.WriteLine(content);
                }
            }
        }
    }
}
/// <summary> Collect this node into the given list if it satisfies the filter.
/// <P>
/// This is the leaf-node implementation of a mechanism that lets powerful
/// filtering code be written very easily, without separately handling tags
/// embedded inside other tags (e.g. links nested inside form tags). Composite
/// nodes override this to also visit their children, so code such as:
/// <pre>
/// NodeList collectionList = new NodeList();
/// NodeFilter filter = new TagNameFilter ("A");
/// for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
///     e.nextNode().collectInto(collectionList, filter);
/// </pre>
/// gathers all matching nodes regardless of nesting depth.
/// </summary>
/// <param name="list">The node list to collect acceptable nodes into.</param>
/// <param name="filter">The filter to determine which nodes are retained.</param>
public virtual void CollectInto(NodeList list, INodeFilter filter)
{
    // Guard clause: nodes rejected by the filter are simply skipped.
    if (!filter.Accept(this))
    {
        return;
    }
    list.Add(this);
}
/// <summary> Filter the list with the given filter.</summary>
/// <param name="filter">The filter to use.</param>
/// <param name="recursive">If <code>true</code> digs into the children recursively.</param>
/// <returns> A new node list containing the nodes accepted by the filter.
/// This is a linear list and preserves the nested structure of the returned
/// nodes only.
/// </returns>
public virtual NodeList ExtractAllNodesThatMatch(NodeFilter filter, bool recursive)
{
    NodeList matches = new NodeList();
    for (int index = 0; index < m_iSize; index++)
    {
        INode candidate = nodeData[index];
        if (filter.Accept(candidate))
        {
            matches.Add(candidate);
        }
        if (!recursive)
        {
            continue;
        }
        // Descend into children (when present) and append their matches too.
        NodeList descendants = candidate.Children;
        if (descendants != null)
        {
            matches.Add(descendants.ExtractAllNodesThatMatch(filter, recursive));
        }
    }
    return matches;
}
/// <summary> Extract the product-description area (class "miao"), download any
/// pictures it references, and print the description HTML with picture URLs
/// rewritten to point at the local picture host.
/// </summary>
/// <param name="nodes">The parsed page nodes to search.</param>
private void ParsePorductDescribe(NodeList nodes)
{
    // Locate the description area and every image inside it.
    NodeFilter miao = new HasAttributeFilter("class", "miao");
    NodeList miaoArea = nodes.ExtractAllNodesThatMatch(miao, true);
    NodeList pictureNodes = miaoArea.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ImageTag)), true);
    DownloadPictures(pictureNodes);

    // Rewrite the source-site URLs onto our picture host and flatten the
    // pic/bigpic directory structure into underscore-prefixed file names.
    string describeHtml = miaoArea.AsHtml();
    describeHtml = Regex.Replace(describeHtml, @"http\://(www\.|)rrxf\.cn/", pictureURL + "/", RegexOptions.IgnoreCase);
    describeHtml = Regex.Replace(describeHtml, @"(pic|bigpic)/", "$1_", RegexOptions.IgnoreCase);
    describeHtml = describeHtml.Replace("-", "_");
    Console.WriteLine(describeHtml);
}
/// <summary> Push the current range onto the stack and make the current node's
/// children (recursively flattened) the new range.
/// </summary>
/// <returns> True when the node has at least one descendant; false otherwise
/// (the current range is left untouched in that case).</returns>
public virtual bool NodePushRangeChildren()
{
    NodeList nl = m_node.Children;
    // FIX: Children is null for leaf nodes (other callers in this codebase
    // null-check it); the original dereferenced null and threw instead of
    // reporting "nothing to push".
    if (nl == null)
    {
        return false;
    }
    nl = nl.ExtractAllNodesThatMatch(AndFilter.TrueFilter, true);
    if (nl.Count > 0)
    {
        // Save the current traversal state so NodePopRange can restore it.
        m_nodestack.Push(m_node);
        m_nodestack.Push(m_nodeenum);
        m_nodestack.Push(m_nodelist);
        m_nodelist = nl;
        m_nodeenum = m_nodelist.Elements();
        m_node = null;
        return true;
    }
    else
        return false;
}
/// <summary> Push the current range onto the stack and make every node in the
/// parsed document the new range.
/// </summary>
/// <returns> True when the document yielded at least one node; false otherwise
/// (the current range is left untouched in that case).</returns>
public virtual bool NodePushRangeAll()
{
    NodeList allNodes = m_parser.Parse(AndFilter.TrueFilter);
    if (allNodes.Count == 0)
    {
        return false;
    }
    // Save the current traversal state so NodePopRange can restore it.
    m_nodestack.Push(m_node);
    m_nodestack.Push(m_nodeenum);
    m_nodestack.Push(m_nodelist);
    m_nodelist = allNodes;
    m_nodeenum = allNodes.Elements();
    m_node = null;
    return true;
}
/// <summary> Load the given URL and reset all traversal state to the new page.
/// Errors are logged rather than propagated.
/// </summary>
/// <param name="url">The URL to fetch and parse.</param>
public virtual void Navigate(string url)
{
    try
    {
        // Build a fresh parser over the downloaded HTML, then clear every
        // piece of traversal state left over from the previous page.
        m_parser = new Parser(new Lexer(getHtml(url, null)), null);
        m_nodestack.Clear();
        m_node = null;
        m_nodeenum = null;
        m_nodelist = null;
        m_url = url;
    }
    catch (Exception e)
    {
        log.Error("Navigate: " + url, e);
    }
}
/// <summary> Extract all nodes matching the given filter.</summary>
/// <param name="filter">The filter to be applied to the nodes.</param>
/// <throws> ParserException If a parse error occurs. </throws>
/// <returns> A list of nodes matching the filter criteria,
/// i.e. for which the filter's accept method
/// returned <code>true</code>.
/// </returns>
public virtual NodeList ExtractAllNodesThatMatch(INodeFilter filter)
{
    NodeList matches = new NodeList();
    // Each top-level node recursively collects its own matching descendants.
    for (INodeIterator iterator = Elements(); iterator.HasMoreNodes(); )
    {
        iterator.NextNode().CollectInto(matches, filter);
    }
    return matches;
}
/// <summary> Parse the given resource, using the filter provided.
/// This can be used to extract information from specific nodes.
/// When used with a <code>null</code> filter it returns an
/// entire page which can then be modified and converted back to HTML
/// (Note: the synthesis use-case is not handled very well; the parser
/// is more often used to extract information from a web page).
/// <p>For example, to replace the entire contents of the HEAD with a
/// single TITLE tag you could do this:
/// <pre>
/// NodeList nl = parser.parse (null); // here is your two node list
/// NodeList heads = nl.extractAllNodesThatMatch (new TagNameFilter ("HEAD"))
/// if (heads.size () > 0) // there may not be a HEAD tag
/// {
/// Head head = heads.elementAt (0); // there should be only one
/// head.removeAll (); // clean out the contents
/// Tag title = new TitleTag ();
/// title.setTagName ("title");
/// title.setChildren (new NodeList (new TextNode ("The New Title")));
/// Tag title_end = new TitleTag ();
/// title_end.setTagName ("/title");
/// title.setEndTag (title_end);
/// head.add (title);
/// }
/// System.out.println (nl.toHtml ()); // output the modified HTML
/// </pre>
/// </p>
/// </summary>
/// <returns> The list of matching nodes (for a <code>null</code>
/// filter this is all the top level nodes).
/// </returns>
/// <param name="filter">The filter to apply to the parsed nodes,
/// or <code>null</code> to retrieve all the top level nodes.
/// </param>
/// <throws> ParserException If a parsing error occurs. </throws>
public virtual NodeList Parse(INodeFilter filter)
{
    NodeList result = new NodeList();
    for (INodeIterator iterator = Elements(); iterator.HasMoreNodes(); )
    {
        INode current = iterator.NextNode();
        if (filter == null)
        {
            // No filter: keep every top-level node as-is.
            result.Add(current);
        }
        else
        {
            // Filtered: let the node recursively collect matching descendants.
            current.CollectInto(result, filter);
        }
    }
    return result;
}
/// <summary> Find the product-title node (class "prouductx") and print its
/// plain text to the console.
/// </summary>
/// <param name="nodes">The parsed page nodes to search.</param>
private static void ParseProductTitle(NodeList nodes)
{
    NodeFilter title = new HasAttributeFilter("class", "prouductx");
    NodeList titleNodes = nodes.ExtractAllNodesThatMatch(title, true);
    // FIX: the original indexed [0] unconditionally and threw when the page
    // had no matching node; skip silently instead.
    if (titleNodes.Count > 0)
    {
        Console.WriteLine(titleNodes[0].ToPlainTextString());
    }
}
/// <summary> Find the main product photo (class "Picture220"), rewrite its
/// relative URL to an absolute one, print it, and download it.
/// </summary>
/// <param name="nodes">The parsed page nodes to search.</param>
private void ParseProductShowPhoto(NodeList nodes)
{
    NodeFilter show = new HasAttributeFilter("class", "Picture220");
    NodeList showNodes = nodes.ExtractAllNodesThatMatch(show, true);
    // FIX: the original dereferenced the result of an 'as' cast without a
    // null check, throwing when nothing matched or the first match was not
    // an ImageTag; bail out quietly in those cases.
    if (showNodes.Count == 0)
    {
        return;
    }
    ImageTag showTag = showNodes[0] as ImageTag;
    if (showTag == null)
    {
        return;
    }
    showTag.ImageURL = showTag.ImageURL.Replace("../../", "http://rrxf.cn/");
    Console.WriteLine(showTag.ImageURL);
    DownloadPicture(showTag.ImageURL);
}
/// <summary> Find all demo photos (class "Picture40") and download them.</summary>
/// <param name="nodes">The parsed page nodes to search.</param>
private void ParseProductDemoPhoto(NodeList nodes)
{
    NodeFilter demoFilter = new HasAttributeFilter("class", "Picture40");
    DownloadPictures(nodes.ExtractAllNodesThatMatch(demoFilter, true));
}
/// <summary> Add another node list to this one.</summary>
/// <param name="list">The list to add.</param>
public virtual void Add(NodeList list)
{
    // Re-read m_iSize each iteration so self-appends behave as before.
    for (int index = 0; index < list.m_iSize; index++)
    {
        Add(list.nodeData[index]);
    }
}
/// <summary> Restore the traversal state saved by the most recent push:
/// pops the node list, its enumerator, and the current node (reverse of
/// the push order used by the NodePushRange* methods).
/// </summary>
/// <returns> Always true.</returns>
public virtual bool NodePopRange()
{
    m_nodelist = (NodeList) m_nodestack.Pop();
    m_nodeenum = (ISimpleNodeIterator) m_nodestack.Pop();
    m_node = (INode) m_nodestack.Pop();
    return true;
}
/// <summary> Construct an iterator bound to the enclosing node list.</summary>
/// <param name="enclosingInstance">The NodeList this iterator walks over.</param>
public SimpleNodeIterator(NodeList enclosingInstance)
{
    InitBlock(enclosingInstance);
}
/// <summary> Push the current range onto the stack and make the nodes whose
/// "name" attribute equals the given value the new range.
/// </summary>
/// <param name="elementname">The value the "name" attribute must match.</param>
/// <returns> True when at least one node matched; false otherwise
/// (the current range is left untouched in that case).</returns>
public virtual bool NodePushRangeByName(string elementname)
{
    NodeList named = m_parser.Parse(new HasAttributeFilter("name", elementname));
    if (named.Count == 0)
    {
        return false;
    }
    // Save the current traversal state so NodePopRange can restore it.
    m_nodestack.Push(m_node);
    m_nodestack.Push(m_nodeenum);
    m_nodestack.Push(m_nodelist);
    m_nodelist = named;
    m_nodeenum = named.Elements();
    m_node = null;
    return true;
}
/// <summary> Record the enclosing node list this iterator belongs to.</summary>
/// <param name="enclosingInstance">The owning NodeList.</param>
private void InitBlock(NodeList enclosingInstance)
{
    m_enclosingInstance = enclosingInstance;
}
/// <summary> Push the current range onto the stack and make all SCRIPT tags
/// in the document the new range.
/// </summary>
/// <returns> True when at least one SCRIPT tag was found; false otherwise
/// (the current range is left untouched in that case).</returns>
public virtual bool NodePushRangeScripts()
{
    NodeList scripts = m_parser.Parse(new TagNameFilter("SCRIPT"));
    if (scripts.Count == 0)
    {
        return false;
    }
    // Save the current traversal state so NodePopRange can restore it.
    m_nodestack.Push(m_node);
    m_nodestack.Push(m_nodeenum);
    m_nodestack.Push(m_nodelist);
    m_nodelist = scripts;
    m_nodeenum = scripts.Elements();
    m_node = null;
    return true;
}
/// <summary> Rewrite each image node's relative URL to an absolute one, print
/// it, and download the picture.
/// </summary>
/// <param name="photoNodes">Nodes expected to be ImageTags; non-images are skipped.</param>
private void DownloadPictures(NodeList photoNodes)
{
    // FIX: the original built a List<ImageTag> that was never read — dead
    // code, removed.
    int length = photoNodes.Count;
    for (int i = 0; i < length; i++)
    {
        ImageTag imgTag = photoNodes[i] as ImageTag;
        // FIX: the 'as' cast result was dereferenced without a null check;
        // skip nodes that are not image tags instead of throwing.
        if (imgTag == null)
        {
            continue;
        }
        imgTag.ImageURL = imgTag.ImageURL.Replace("../../", "http://rrxf.cn/");
        Console.WriteLine(imgTag.ImageURL);
        DownloadPicture(imgTag.ImageURL);
    }
}
/// <summary> Release resources held by this instance. Safe to call repeatedly.</summary>
/// <param name="Disposing">True when invoked from Dispose(); false when invoked
/// from a finalizer (managed fields must not be touched in that case).</param>
protected void Dispose(bool Disposing)
{
    if (!IsDisposed && Disposing)
    {
        // Release managed resources by dropping every reference.
        m_parser = null;
        m_nodelist = null;
        m_node = null;
        m_nodeenum = null;
        m_nodestack = null;
    }
    // Release unmanaged resources here (none currently held).
    IsDisposed = true;
}
/// <summary> Scan the tag.
/// For this implementation, the only operation is to perform the tag's
/// semantic action; the tag itself is returned unchanged.
/// </summary>
/// <param name="tag">The tag to scan.</param>
/// <param name="lexer">Provides html page access.</param>
/// <param name="stack">The parse stack. May contain pending tags that enclose
/// this tag.</param>
/// <returns> The resultant tag (may be unchanged).</returns>
public virtual ITag Scan(ITag tag, Winista.Text.HtmlParser.Lex.Lexer lexer, NodeList stack)
{
    tag.DoSemanticAction();
    return tag;
}