/// <summary>
/// Return all objects matching a selector, starting the search from each item in <paramref name="list"/>.
/// </summary>
/// <param name="list">The source sequence; each item is used as a root (depth 0) for matching.</param>
/// <param name="selector">The selector to evaluate.</param>
/// <returns>
/// For an HTML selector, the objects created from <c>selector.Html</c>. For a result-list position
/// selector, whatever <c>GetResultPositionMatches</c> yields. Otherwise the distinct set of objects
/// that matched while walking the source items (and, unless the traversal type is Filter, their
/// element descendants).
/// </returns>
protected IEnumerable<IDomObject> GetMatches(IEnumerable<IDomObject> list, Selector selector)
{
    // HTML selectors do not search anything: they create new objects from the markup.
    if (selector.SelectorType == SelectorType.HTML)
    {
        HtmlParser.DomElementFactory factory = new HtmlParser.DomElementFactory(Document);

        foreach (var obj in factory.CreateObjects(selector.Html))
        {
            yield return obj;
        }
        yield break;
    }

    // Result-list position selectors are simple -- skip out of the main matching code if so.
    if (selector.SelectorType.HasFlag(SelectorType.Position) && selector.IsResultListPosition)
    {
        foreach (var obj in GetResultPositionMatches(list, selector))
        {
            yield return obj;
        }
        yield break;
    }

    // Maintain a hashset of every element already searched. Since result sets frequently contain
    // items which are children of other items in the list, we would otherwise end up searching the
    // tree repeatedly. The set is local, so it starts empty on every call.
    HashSet<IDomObject> uniqueElements = new HashSet<IDomObject>();

    // Matches are collected in a set so that an object reached by more than one path is returned once.
    HashSet<IDomObject> temporaryResults = new HashSet<IDomObject>();

    // Otherwise, try to match each element individually using an explicit depth-first stack.
    Stack<MatchElement> stack = new Stack<MatchElement>();

    foreach (var e in list)
    {
        // We must check everything again when looking for a specific depth of children;
        // otherwise there is no point re-searching something already seen.
        // NOTE(review): uniqueElements is only added to below when the traversal type IS Child, but
        // it is only consulted here when the traversal type is NOT Child, so as written this check
        // never skips anything. Confirm which condition was intended before changing it.
        if (selector.TraversalType != TraversalType.Child && uniqueElements.Contains(e))
        {
            continue;
        }

        stack.Push(new MatchElement(e, 0));

        while (stack.Count != 0)
        {
            var current = stack.Pop();

            if (Matches(selector, current.Object, current.Depth))
            {
                temporaryResults.Add(current.Object);
            }

            // Don't keep going to children once the target child depth has been reached. Though the
            // match would still fail, those nodes would end up in the unique list, and we might need
            // to test them later if they appear directly in the source list.
            if (selector.TraversalType != TraversalType.Filter &&
                current.Object is IDomElement &&
                (selector.TraversalType != TraversalType.Child || selector.ChildDepth > current.Depth))
            {
                SelectorType selectorType = selector.SelectorType;
                IDomElement elm = current.Element;

                // When the children of this element are at the target depth and the selector is a
                // DOM-index position selector, the position part is resolved against the parent in
                // one step and then removed from the flags still to be tested.
                if (selector.TraversalType == TraversalType.Child
                    && selector.ChildDepth == current.Depth + 1
                    && selector.IsDomIndexPosition)
                {
                    temporaryResults.AddRange(GetDomPositionMatches(elm, selector));
                    selectorType &= ~SelectorType.Position;
                }

                // No selector flags left to test, so there is no reason to descend.
                if (selectorType == 0)
                {
                    continue;
                }

                // Add children to the stack in reverse order, so they are processed in document
                // order when popped.
                for (int j = elm.ChildNodes.Count - 1; j >= 0; j--)
                {
                    IDomObject obj = elm[j];

                    // For Child traversal, skip any child that has already been recorded.
                    if (selector.TraversalType == TraversalType.Child && !uniqueElements.Add(obj))
                    {
                        continue;
                    }

                    // Only element nodes are traversed further.
                    if (obj.NodeType == NodeType.ELEMENT_NODE)
                    {
                        stack.Push(new MatchElement(obj, current.Depth + 1));
                    }
                }
            }
        }
    }

    foreach (var obj in temporaryResults)
    {
        yield return obj;
    }
}
/// <summary>
/// Enumerates the objects that satisfy <paramref name="selector"/>, using every item of
/// <paramref name="list"/> as a starting point.
/// </summary>
/// <param name="list">Source objects; each one is examined at depth 0.</param>
/// <param name="selector">Selector describing what to match and how to traverse.</param>
/// <returns>
/// HTML selectors yield the objects built from <c>selector.Html</c>; result-list position selectors
/// yield the output of <c>GetResultPositionMatches</c>; every other selector yields each matched
/// object once.
/// </returns>
protected IEnumerable<IDomObject> GetMatches(IEnumerable<IDomObject> list, Selector selector)
{
    // An HTML selector creates content instead of searching for it.
    if (selector.SelectorType == SelectorType.HTML)
    {
        var factory = new HtmlParser.DomElementFactory(Document);

        foreach (var created in factory.CreateObjects(selector.Html))
        {
            yield return created;
        }
        yield break;
    }

    // Result-list position selectors are handled entirely by a dedicated helper.
    if (selector.SelectorType.HasFlag(SelectorType.Position) && selector.IsResultListPosition)
    {
        foreach (var positioned in GetResultPositionMatches(list, selector))
        {
            yield return positioned;
        }
        yield break;
    }

    // Everything below is the general case. The working collections are created only now because
    // the two early exits above never touch them.
    var matched = new HashSet<IDomObject>();        // distinct results
    var uniqueElements = new HashSet<IDomObject>();  // nodes already queued during Child traversal
    var pending = new Stack<MatchElement>();         // explicit depth-first work list

    foreach (var root in list)
    {
        // Child traversal must always re-check (depth matters); other traversals may skip a root
        // that was already searched.
        // NOTE(review): uniqueElements only receives items under Child traversal (see the loop over
        // children below), while this test only runs for non-Child traversal, so it cannot currently
        // be true. Verify the intended conditions against the callers.
        if (selector.TraversalType != TraversalType.Child && uniqueElements.Contains(root))
        {
            continue;
        }

        pending.Push(new MatchElement(root, 0));

        while (pending.Count != 0)
        {
            var node = pending.Pop();

            if (Matches(selector, node.Object, node.Depth))
            {
                matched.Add(node.Object);
            }

            // Filter traversal never descends, and only elements have children to descend into.
            if (selector.TraversalType == TraversalType.Filter || !(node.Object is IDomElement))
            {
                continue;
            }

            // For Child traversal stop once the target depth is reached: going deeper could not
            // match, and would put nodes into uniqueElements that may still need testing if they
            // show up directly in the source list.
            if (selector.TraversalType == TraversalType.Child && !(selector.ChildDepth > node.Depth))
            {
                continue;
            }

            SelectorType remaining = selector.SelectorType;
            IDomElement parent = node.Element;

            // If this element's children sit at the target depth and the selector is a DOM-index
            // position selector, resolve the position part against the parent right away and
            // clear that flag from what is left to test.
            if (selector.TraversalType == TraversalType.Child
                && selector.ChildDepth == node.Depth + 1
                && selector.IsDomIndexPosition)
            {
                matched.AddRange(GetDomPositionMatches(parent, selector));
                remaining &= ~SelectorType.Position;
            }

            // Nothing left to test on the children.
            if (remaining == 0)
            {
                continue;
            }

            // Push in reverse so that popping visits the children in their original order.
            for (int index = parent.ChildNodes.Count - 1; index >= 0; index--)
            {
                IDomObject child = parent[index];

                // Under Child traversal a child that was already recorded is not queued again.
                if (selector.TraversalType == TraversalType.Child && !uniqueElements.Add(child))
                {
                    continue;
                }

                // Non-element nodes are not searched.
                if (child.NodeType == NodeType.ELEMENT_NODE)
                {
                    pending.Push(new MatchElement(child, node.Depth + 1));
                }
            }
        }
    }

    foreach (var result in matched)
    {
        yield return result;
    }
}