/// <summary>
/// Adds this node to <paramref name="list"/> when it is accepted by
/// <paramref name="filter"/>.
/// <para>
/// Composite tags override this method to also recurse into their children,
/// so invoking it on each top-level node of a page collects matching nodes at
/// any nesting depth — e.g. links buried inside form tags. Typical usage to
/// gather every link on a page:
/// <code>
/// NodeList collectionList = new NodeList();
/// NodeFilter filter = new TagNameFilter("A");
/// for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
///     e.nextNode().collectInto(collectionList, filter);
/// </code>
/// Afterwards <c>collectionList</c> holds all link nodes regardless of depth.
/// A <c>TagClassFilter</c> could be used instead, though that is slightly less
/// specific because one tag class may be registered for several node names.
/// </para>
/// </summary>
/// <param name="list">The node list to collect acceptable nodes into.</param>
/// <param name="filter">The filter to determine which nodes are retained.</param>
public virtual void CollectInto(NodeList list, INodeFilter filter)
{
    // Guard clause: only nodes the filter accepts are collected.
    if (!filter.Accept(this))
        return;

    list.Add(this);
}
/// <summary>
/// Filters this list with the given filter, returning the accepted nodes.
/// </summary>
/// <param name="filter">The filter to use.</param>
/// <param name="recursive">If <code>true</code>, digs into each node's
/// children recursively.</param>
/// <returns>A new node list containing the nodes accepted by the filter.
/// This is a linear list and preserves the nested structure of the returned
/// nodes only.</returns>
public virtual NodeList ExtractAllNodesThatMatch(NodeFilter filter, bool recursive)
{
    NodeList matches = new NodeList();

    for (int index = 0; index < m_iSize; index++)
    {
        INode candidate = nodeData[index];

        if (filter.Accept(candidate))
            matches.Add(candidate);

        if (!recursive)
            continue;

        // Descend into the child list, when the node has one.
        NodeList kids = candidate.Children;
        if (kids != null)
            matches.Add(kids.ExtractAllNodesThatMatch(filter, recursive));
    }

    return matches;
}
/// <summary>
/// Parses the resource, using the filter provided to select nodes.
/// <para>
/// With a <code>null</code> filter, the entire page is returned as top-level
/// nodes, which can then be modified and converted back to HTML. (Note: the
/// synthesis use-case is not handled very well; the parser is more often used
/// to extract information from a web page.) For example, to replace the whole
/// contents of the HEAD with a single TITLE tag:
/// <code>
/// NodeList nl = parser.parse (null); // here is your two node list
/// NodeList heads = nl.extractAllNodesThatMatch (new TagNameFilter ("HEAD"))
/// if (heads.size () > 0) // there may not be a HEAD tag
/// {
///     Head head = heads.elementAt (0); // there should be only one
///     head.removeAll (); // clean out the contents
///     Tag title = new TitleTag ();
///     title.setTagName ("title");
///     title.setChildren (new NodeList (new TextNode ("The New Title")));
///     Tag title_end = new TitleTag ();
///     title_end.setTagName ("/title");
///     title.setEndTag (title_end);
///     head.add (title);
/// }
/// System.out.println (nl.toHtml ()); // output the modified HTML
/// </code>
/// </para>
/// </summary>
/// <param name="filter">The filter to apply to the parsed nodes, or
/// <code>null</code> to retrieve all the top level nodes.</param>
/// <returns>The list of matching nodes (for a <code>null</code> filter this
/// is all the top level nodes).</returns>
/// <throws>ParserException If a parsing error occurs.</throws>
public virtual NodeList Parse(INodeFilter filter)
{
    NodeList matches = new NodeList();

    for (INodeIterator iterator = Elements(); iterator.HasMoreNodes(); )
    {
        INode current = iterator.NextNode();
        if (filter == null)
            matches.Add(current); // no filter: keep every top-level node
        else
            current.CollectInto(matches, filter); // recurses into composites
    }

    return matches;
}
/// <summary> Collect the children.
/// <p>An initial test is performed for an empty XML tag, in which case
/// the start tag and end tag of the returned tag are the same and it has
/// no children.<p>
/// If it's not an empty XML tag, the lexer is repeatedly asked for
/// subsequent nodes until an end tag is found or a node is encountered
/// that matches the tag ender set or end tag ender set.
/// In the latter case, a virtual end tag is created.
/// Each node found that is not the end tag is added to
/// the list of children. The end tag is special and not a child.<p>
/// Nodes that also have a CompositeTagScanner as their scanner are
/// recursed into, which provides the nested structure of an HTML page.
/// This method operates in two possible modes, depending on a private boolean.
/// It can recurse on the JVM stack, which has caused some overflow problems
/// in the past, or it can use the supplied stack argument to nest scanning
/// of child tags within itself. The former is left as an option in the code,
/// mostly to help subsequent modifiers visualize what the internal nesting
/// is doing.
/// NOTE(review): throughout the loop, setting <c>node</c> to <c>null</c> is the
/// signal that this tag is complete; the <c>while (null != node)</c> test at the
/// bottom then exits (unless the fake-recursion unwind below re-arms it).
/// </summary>
/// <param name="tag">The tag this scanner is responsible for.
/// </param>
/// <param name="lexer">The source of subsequent nodes.
/// </param>
/// <param name="stack">The parse stack. May contain pending tags that enclose
/// this tag.
/// </param>
/// <returns> The resultant tag (may be unchanged).
/// </returns>
public override ITag Scan(ITag tag, Lexer lexer, NodeList stack)
{
    INode node;
    ITag next;
    System.String name;
    IScanner scanner;
    ITag ret;

    ret = tag;

    if (ret.EmptyXmlTag)
    {
        // An empty XML tag (<foo/>) is its own end tag and has no children.
        ret.SetEndTag(ret);
    }
    else
        do
        {
            node = lexer.NextNode(false);
            if (null != node)
            {
                if (node is ITag)
                {
                    next = (ITag) node;
                    name = next.TagName;
                    // check for normal end tag
                    if (next.IsEndTag() && name.Equals(ret.TagName))
                    {
                        ret.SetEndTag(next);
                        node = null; // tag complete; exit the loop
                    }
                    else if (IsTagToBeEndedFor(ret, next)) // check DTD
                    {
                        // backup one node; insert a virtual end tag later.
                        // Rewinding the lexer lets the enclosing scan re-read
                        // this tag after the current one is closed off.
                        lexer.Position = next.StartPosition;
                        node = null;
                    }
                    else if (!next.IsEndTag())
                    {
                        // now recurse if there is a scanner for this type of tag
                        scanner = next.ThisScanner;
                        if (null != scanner)
                        {
                            if (mUseJVMStack)
                            {
                                // JVM stack recursion
                                node = scanner.Scan(next, lexer, stack);
                                AddChild(ret, node);
                            }
                            else
                            {
                                // fake recursion:
                                if (scanner == this)
                                {
                                    if (next.EmptyXmlTag)
                                    {
                                        // self-closed child needs no descent
                                        next.SetEndTag(next);
                                        FinishTag(next, lexer);
                                        AddChild(ret, next);
                                    }
                                    else
                                    {
                                        // push the current tag and make the
                                        // child the one being scanned
                                        stack.Add(ret);
                                        ret = next;
                                    }
                                }
                                else
                                {
                                    // normal recursion if switching scanners
                                    node = scanner.Scan(next, lexer, stack);
                                    AddChild(ret, node);
                                }
                            }
                        }
                        else
                            AddChild(ret, next);
                    }
                    else
                    {
                        if (!mUseJVMStack && !mLeaveEnds)
                        {
                            // Since all non-end tags are consumed by the
                            // previous clause, we're here because we have an
                            // end tag with no opening tag... this could be bad.
                            // There are two cases...
                            // 1) The tag hasn't been registered, in which case
                            // we just add it as a simple child, like it's
                            // opening tag
                            // 2) There may be an opening tag further up the
                            // parse stack that needs closing.
                            // So, we ask the factory for a node like this one
                            // (since end tags never have scanners) and see
                            // if it's scanner is a composite tag scanner.
                            // If it is we walk up the parse stack looking for
                            // something that needs this end tag to finish it.
                            // If there is something, we close off all the tags
                            // walked over and continue on as if nothing
                            // happened.
                            System.Collections.ArrayList attributes = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
                            attributes.Add(new TagAttribute(name, null));
                            ITag opener = lexer.NodeFactory.CreateTagNode(lexer.Page, next.StartPosition, next.EndPosition, attributes);
                            scanner = opener.ThisScanner;
                            if ((null != scanner) && (scanner == this))
                            {
                                // uh-oh
                                int index = - 1;
                                for (int i = stack.Size() - 1; (- 1 == index) && (i >= 0); i--)
                                {
                                    // short circuit here... assume everything
                                    // on the stack has this as it's scanner;
                                    // we'll need to stop if either of those
                                    // conditions isn't met
                                    ITag boffo = (ITag) stack.ElementAt(i);
                                    if (name.Equals(boffo.TagName))
                                        index = i;
                                    else if (IsTagToBeEndedFor(boffo, next)) // check DTD
                                        index = i;
                                }
                                if (- 1 != index)
                                {
                                    // finish off the current one first
                                    FinishTag(ret, lexer);
                                    AddChild((ITag) stack.ElementAt(stack.Size() - 1), ret);
                                    // close off every tag between the top of
                                    // the stack and the matched opener
                                    for (int i = stack.Size() - 1; i > index; i--)
                                    {
                                        ITag fred = (ITag) stack.Remove(i);
                                        FinishTag(fred, lexer);
                                        AddChild((ITag) stack.ElementAt(i - 1), fred);
                                    }
                                    ret = (ITag) stack.Remove(index);
                                    node = null;
                                }
                                else
                                    AddChild(ret, next); // default behaviour
                            }
                            else
                                AddChild(ret, next); // default behaviour
                        }
                        else
                            AddChild(ret, next);
                    }
                }
                else
                {
                    // non-tag node (text, remark, ...) is simply a child
                    AddChild(ret, node);
                    node.DoSemanticAction();
                }
            }

            if (!mUseJVMStack)
            {
                // handle coming out of fake recursion
                if (null == node)
                {
                    int depth = stack.Size();
                    if (0 != depth)
                    {
                        node = stack.ElementAt(depth - 1);
                        if (node is ITag)
                        {
                            ITag precursor = (ITag) node;
                            scanner = precursor.ThisScanner;
                            if (scanner == this)
                            {
                                // pop the enclosing tag we pushed earlier and
                                // resume scanning it (node is now non-null,
                                // so the loop continues)
                                stack.Remove(depth - 1);
                                FinishTag(ret, lexer);
                                AddChild(precursor, ret);
                                ret = precursor;
                            }
                            else
                                node = null; // normal recursion
                        }
                        else
                            node = null; // normal recursion
                    }
                }
            }
        }
        while (null != node);

    FinishTag(ret, lexer);

    return (ret);
}
/// <summary>
/// Finds text nodes containing the search text, however deeply embedded they
/// might be, and returns them. Each text node retains links to its parents,
/// so further navigation is possible.
/// </summary>
/// <param name="searchText">The text to search for.</param>
/// <returns>The array of text nodes (recursively) found.</returns>
public virtual IText[] DigupStringNode(System.String searchText)
{
    NodeList found = SearchFor(searchText);
    NodeList textNodes = new NodeList();

    for (int i = 0; i < found.Size(); i++)
    {
        INode candidate = found.ElementAt(i);
        if (candidate is IText)
        {
            textNodes.Add(candidate);
        }
        else if (candidate is CompositeTag)
        {
            // Composite match: dig into it and keep the nested text nodes.
            IText[] nested = ((CompositeTag) candidate).DigupStringNode(searchText);
            foreach (IText nestedNode in nested)
                textNodes.Add(nestedNode);
        }
    }

    // Copy the accumulated list into a strongly-typed array.
    IText[] result = new IText[textNodes.Size()];
    for (int i = 0; i < result.Length; i++)
        result[i] = (IText) textNodes.ElementAt(i);

    return result;
}
/// <summary>
/// Searches for all child nodes whose text representation contains the search
/// string, collecting them into a NodeList. For example, to find any textareas
/// in a form tag containing "hello world":
/// <code>
/// NodeList nodeList = formTag.searchFor("Hello World");
/// </code>
/// </summary>
/// <param name="searchString">Search criterion.</param>
/// <param name="caseSensitive">If <code>true</code> this search should be case
/// sensitive. Otherwise, the search string and the node text are converted
/// to uppercase using the locale provided.</param>
/// <param name="locale">The locale for uppercase conversion.</param>
/// <returns>A collection of nodes whose string contents or representation
/// have the <code>searchString</code> in them.</returns>
public virtual NodeList SearchFor(System.String searchString, bool caseSensitive, System.Globalization.CultureInfo locale)
{
    NodeList ret = new NodeList();

    if (!caseSensitive)
        searchString = searchString.ToUpper(locale);

    for (ISimpleNodeIterator e = GetChildren(); e.HasMoreNodes(); )
    {
        INode node = e.NextNode();
        System.String text = node.ToPlainTextString();
        if (!caseSensitive)
            text = text.ToUpper(locale);
        // Use an ordinal search: the bare IndexOf(string) overload performs a
        // culture-sensitive comparison with the *current* culture (CA1310),
        // which need not match `locale` and can yield surprising matches or
        // misses. Case-insensitivity is already handled by the ToUpper calls.
        if (-1 != text.IndexOf(searchString, System.StringComparison.Ordinal))
            ret.Add(node);
    }

    return (ret);
}