/// <summary>
/// Select implementation. The public method automatically remaps a selector with the knowledge
/// that the context is external (and not part of a chain).
/// </summary>
///
/// <remarks>
/// Each clause of the bound selector is processed in turn. Where possible an initial result set
/// is pulled from the document index (ranged or simple, depending on what the index implements
/// and on the traversal type); whatever part of the clause the index could not answer is then
/// matched manually through GetMatches. Results of clause groups are accumulated, de-duplicated
/// and returned ordered by node path.
/// </remarks>
///
/// <exception cref="ArgumentNullException">
/// Thrown when the bound Selector is null.
/// </exception>
///
/// <param name="context">
/// The context in which the selector applies. If null, the selector is run against the entire
/// Document. If not, the selector is run against this sequence of elements.
/// </param>
///
/// <returns>
/// A list of elements. This method returns a list (rather than a sequence) because the sequence
/// must be enumerated to ensure that end-users don't cause the selector to be rerun repeatedly,
/// and that the values are not mutable (e.g. if the underlying source changes).
/// </returns>
public IList<IDomObject> Select(IEnumerable<IDomObject> context)
{
    // this holds the final output
    HashSet<IDomObject> output = new HashSet<IDomObject>();

    if (Selector == null)
    {
        // The single-string constructor treats its argument as the parameter name, so use the
        // (paramName, message) overload to get a meaningful message.
        throw new ArgumentNullException("Selector", "The selector cannot be null.");
    }

    if (Selector.Count == 0)
    {
        return EmptyEnumerable().ToList();
    }

    ActiveSelectors = new List<SelectorClause>(Selector);

    // First just check if we ended up here with an HTML selector; if so, hand it off.
    var firstSelector = ActiveSelectors[0];
    if (firstSelector.SelectorType == SelectorType.HTML)
    {
        return CsQuery.Implementation.
            DomDocument.Create(firstSelector.Html, HtmlParsingMode.Fragment)
            .ChildNodes
            .ToList();
    }

    // this holds any results that carried over from the previous loop for chaining
    IEnumerable<IDomObject> lastResult = null;

    // this is the source from which selections are made in a given iteration; it could be the DOM
    // root, a context, or the previous result set.
    IEnumerable<IDomObject> selectionSource = null;

    // The index is usable when there is no context at all. When there is a context, it is only
    // usable if the first context element is connected, indexed, and belongs to the same
    // document this selector is bound to.
    bool useIndex;
    if (context.IsNullOrEmpty())
    {
        useIndex = true;
    }
    else
    {
        IDomObject first = context.First();
        useIndex = !first.IsDisconnected && first.IsIndexed && first.Document == Document;
    }

    // Determine which features can be used for this query by casting the index to the known
    // interfaces. Both stay null when the index must not be used.
    IDomIndexRanged rangedIndex = null;
    IDomIndexSimple simpleIndex = null;
    if (useIndex)
    {
        rangedIndex = Document.DocumentIndex as IDomIndexRanged;
        simpleIndex = Document.DocumentIndex as IDomIndexSimple;
    }

    for (activeSelectorId = 0; activeSelectorId < ActiveSelectors.Count; activeSelectorId++)
    {
        // we will alter the selector during each iteration to remove the parts that have already
        // been parsed, so use a copy.
        var selector = ActiveSelectors[activeSelectorId].Clone();

        if (lastResult != null
            && (selector.CombinatorType == CombinatorType.Root
                || selector.CombinatorType == CombinatorType.Context))
        {
            // This is a selector that was chained with the selector grouping combinator "," --
            // we always output the results so far when beginning a new group.
            output.AddRange(lastResult);
            lastResult = null;
        }

        // For "and" combinator types, we want to leave everything as it was -- the results of this
        // selector should compound with the prior. This is not an actual CSS combinator, this is the
        // equivalent of grouping parenthesis. That is, in CSS there's no way to say "(input[submit],
        // button):visible" - that is group the results on selector part and apply a filter to it. But
        // we need to do exactly this for certain selector types (for example the jQuery :button
        // selector).
        if (selector.CombinatorType != CombinatorType.Grouped)
        {
            selectionSource = GetSelectionSource(selector, context, lastResult);
            lastResult = null;
        }

        var key = new List<ulong>();
        SelectorType removeSelectorType = 0;

        // determine the type of traversal & depth for this selector
        int depth = 0;
        bool descendants = true;

        switch (selector.TraversalType)
        {
            case TraversalType.Child:
                depth = selector.ChildDepth;
                descendants = false;
                break;
            case TraversalType.Filter:
            case TraversalType.Adjacent:
            case TraversalType.Sibling:
                depth = 0;
                descendants = false;
                break;
            case TraversalType.Descendent:
                depth = 1;
                descendants = true;
                break;
            // default: fall through with default values set above.
        }

        // The simple index is only consulted for an unscoped query that keeps the default
        // traversal values (descendants, depth 0).
        bool canUseBasicIndex = (selectionSource == null) && descendants && depth == 0;

        // build index keys when possible for the active index type. The parentheses matter:
        // NoIndex must suppress index use for BOTH index kinds, not just the simple one.
        if ((rangedIndex != null || (simpleIndex != null && canUseBasicIndex))
            && !selector.NoIndex)
        {
            // We don't want to use the index for "NotEquals" selectors because a missing attribute
            // is considered a valid match
            if (selector.SelectorType.HasFlag(SelectorType.AttributeValue)
                && selector.AttributeSelectorType != AttributeSelectorType.NotExists
                && selector.AttributeSelectorType != AttributeSelectorType.NotEquals)
            {
                key.Add('!');
                key.Add(HtmlData.Tokenize(selector.AttributeName));

                // AttributeValue must still be matched manually - so remove this flag only if the
                // selector is conclusive without further checking
                if (selector.AttributeSelectorType == AttributeSelectorType.Exists)
                {
                    removeSelectorType = SelectorType.AttributeValue;
                }
            }
            else if (selector.SelectorType.HasFlag(SelectorType.Tag))
            {
                key.Add('+');
                key.Add(HtmlData.Tokenize(selector.Tag));
                removeSelectorType = SelectorType.Tag;
            }
            else if (selector.SelectorType.HasFlag(SelectorType.ID))
            {
                key.Add('#');
                key.Add(HtmlData.TokenizeCaseSensitive(selector.ID));
                removeSelectorType = SelectorType.ID;
            }
            else if (selector.SelectorType.HasFlag(SelectorType.Class))
            {
                key.Add('.');
                key.Add(HtmlData.TokenizeCaseSensitive(selector.Class));
                removeSelectorType = SelectorType.Class;
            }
        }

        // If part of the selector was indexed, key will not be empty. Return initial set from the
        // index. If any selectors remain after this they will be searched the hard way.
        IEnumerable<IDomObject> result = null;

        if (key.Count > 0)
        {
            // This is the main index access point: if we have an index key, we'll get as much as we
            // can from the index. Anything else will be handled manually.
            if (selectionSource == null)
            {
                // A key is only built when the ranged index is available or the basic index is
                // both available and sufficient. Prefer the simple index when it is all we need
                // because it could be faster; otherwise the ranged index must answer the query so
                // that depth/descendants are honoured.
                if (canUseBasicIndex && simpleIndex != null)
                {
                    result = simpleIndex.QueryIndex(key.ToArray());
                }
                else
                {
                    result = rangedIndex.QueryIndex(key.ToArray(), depth, descendants);
                }
            }
            else
            {
                // Scoped query: look up the key beneath each source element's node path and
                // merge the matches, de-duplicating through the set.
                HashSet<IDomObject> elementMatches = new HashSet<IDomObject>();
                result = elementMatches;

                foreach (IDomObject obj in selectionSource)
                {
                    var subKey = key.Concat(HtmlData.indexSeparator).Concat(obj.NodePath).ToArray();
                    var matches = rangedIndex.QueryIndex(subKey, depth, descendants);
                    elementMatches.AddRange(matches);
                }
            }

            selector.SelectorType &= ~removeSelectorType;

            // Special case for attribute selectors: when Attribute Value attribute selector is present, we
            // still need to filter for the correct value afterwards. But we need to change the traversal
            // type because any nodes with the correct attribute type have already been selected.
            if (selector.SelectorType.HasFlag(SelectorType.AttributeValue))
            {
                selector.TraversalType = TraversalType.Filter;
            }
        }

        // If any selectors were not handled via the index, match them manually
        if (selector.SelectorType != 0)
        {
            // if there are no temporary results (b/c there was no indexed selector) then use selection
            // source instead (e.g. start from the same point that the index would have)
            result = GetMatches(result ?? selectionSource ?? Document.ChildElements, selector);
        }

        lastResult = lastResult == null
            ? result
            : lastResult.Concat(result);
    }

    // After the loop has finished, output any results from the last iteration.
    output.AddRange(lastResult);

    // Return the results as a list so that any user will not cause the selector to be run again
    return output
        .OrderBy(item => item.NodePath, Implementation.PathKeyComparer.Comparer)
        .ToList();
}
/// <summary>
/// Select from the bound Document using index. First non-class/tag/id selector will result in
/// this being passed off to GetMatches.
/// </summary>
///
/// <remarks>
/// Each clause of the bound selector is processed in turn. When the index may be used, a string
/// key is built for the first indexable part of the clause (attribute, tag, ID or class) and an
/// initial result set is queried from Document.DocumentIndex; any remaining parts of the clause
/// are matched manually through GetMatches. Results are de-duplicated and returned ordered by
/// element path (ordinal comparison).
/// </remarks>
///
/// <exception cref="ArgumentNullException">
/// Thrown when the bound Selector is null.
/// </exception>
///
/// <param name="context">
/// The context in which the selector applies. If null, the selector is run against the entire
/// Document. If not, the selector is run against this sequence of elements.
/// </param>
///
/// <returns>
/// A list of elements matching the selector.
/// </returns>
public List<IDomObject> Select(IEnumerable<IDomObject> context)
{
    // this holds the final output
    HashSet<IDomObject> output = new HashSet<IDomObject>();

    if (Selector == null)
    {
        // The single-string constructor treats its argument as the parameter name, so use the
        // (paramName, message) overload to get a meaningful message.
        throw new ArgumentNullException("Selector", "The selector cannot be null.");
    }

    if (Selector.Count == 0)
    {
        return EmptyEnumerable().ToList();
    }

    ActiveSelectors = new List<SelectorClause>(Selector);

    // First just check if we ended up here with an HTML selector; if so, hand it off.
    var firstSelector = ActiveSelectors[0];
    if (firstSelector.SelectorType == SelectorType.HTML)
    {
        HtmlParser.HtmlElementFactory factory = new HtmlParser.HtmlElementFactory(firstSelector.Html);

        // Return the factory output as a list because otherwise the enumerator could end up
        // as the actual source of the selection, meaning it would get re-parsed each time
        return factory.ParseAsFragment();
    }

    // this holds any results that carried over from the previous loop for chaining
    IEnumerable<IDomObject> lastResult = null;

    // this is the source from which selections are made in a given iteration; it could be the DOM
    // root, a context, or the previous result set.
    IEnumerable<IDomObject> selectionSource = null;

    // The index is usable when there is no context at all; with a context it is only usable if
    // the first context element is connected and indexed. The first element is fetched once so
    // a lazily-evaluated context is not enumerated repeatedly.
    bool useIndex;
    if (context.IsNullOrEmpty())
    {
        useIndex = true;
    }
    else
    {
        IDomObject first = context.First();
        useIndex = !first.IsDisconnected && first.IsIndexed;
    }

    for (activeSelectorId = 0; activeSelectorId < ActiveSelectors.Count; activeSelectorId++)
    {
        // we will alter the selector during each iteration to remove the parts that have already
        // been parsed, so use a copy.
        var selector = ActiveSelectors[activeSelectorId].Clone();

        if (lastResult != null && selector.CombinatorType == CombinatorType.Root)
        {
            // This is a selector that was chained with the selector grouping combinator "," --
            // we always output the results so far when beginning a new group.
            output.AddRange(lastResult);
            lastResult = null;
        }

        // For "and" combinator types, we want to leave everything as it was -- the results of this
        // selector should compound with the prior. This is not an actual CSS combinator, this is the
        // equivalent of grouping parenthesis. That is, in CSS there's no way to say "(input[submit],
        // button):visible" - that is group the results on selector part and apply a filter to it. But
        // we need to do exactly this for certain selector types (for example the jQuery :button
        // selector).
        if (selector.CombinatorType != CombinatorType.Grouped)
        {
            selectionSource = GetSelectionSource(selector, context, lastResult);
            lastResult = null;
        }

        string key = "";
        SelectorType removeSelectorType = 0;

        if (useIndex && !selector.NoIndex)
        {
#if DEBUG_PATH
            // Debug builds key the index on readable text instead of tokens.
            if (selector.SelectorType.HasFlag(SelectorType.AttributeValue)
                && selector.AttributeSelectorType != AttributeSelectorType.NotExists
                && selector.AttributeSelectorType != AttributeSelectorType.NotEquals)
            {
                key = "!" + selector.AttributeName.ToLower();

                // AttributeValue must still be matched manually - so remove this flag only if the
                // selector is conclusive without further checking
                if (selector.AttributeSelectorType == AttributeSelectorType.Exists)
                {
                    removeSelectorType = SelectorType.AttributeValue;
                }
            }
            else if (selector.SelectorType.HasFlag(SelectorType.Tag))
            {
                key = "+" + selector.Tag.ToLower();
                removeSelectorType = SelectorType.Tag;
            }
            else if (selector.SelectorType.HasFlag(SelectorType.ID))
            {
                key = "#" + selector.ID;
                removeSelectorType = SelectorType.ID;
            }
            else if (selector.SelectorType.HasFlag(SelectorType.Class))
            {
                key = "." + selector.Class;
                removeSelectorType = SelectorType.Class;
            }
#else
            // We don't want to use the index for "NotEquals" selectors because a missing attribute
            // is considered a valid match
            if (selector.SelectorType.HasFlag(SelectorType.AttributeValue)
                && selector.AttributeSelectorType != AttributeSelectorType.NotExists
                && selector.AttributeSelectorType != AttributeSelectorType.NotEquals)
            {
                key = "!" + (char)HtmlData.Tokenize(selector.AttributeName);

                // AttributeValue must still be matched manually - so remove this flag only if the
                // selector is conclusive without further checking
                if (selector.AttributeSelectorType == AttributeSelectorType.Exists)
                {
                    removeSelectorType = SelectorType.AttributeValue;
                }
            }
            else if (selector.SelectorType.HasFlag(SelectorType.Tag))
            {
                key = "+" + (char)HtmlData.Tokenize(selector.Tag);
                removeSelectorType = SelectorType.Tag;
            }
            else if (selector.SelectorType.HasFlag(SelectorType.ID))
            {
                key = "#" + (char)HtmlData.TokenizeCaseSensitive(selector.ID);
                removeSelectorType = SelectorType.ID;
            }
            else if (selector.SelectorType.HasFlag(SelectorType.Class))
            {
                key = "." + (char)HtmlData.TokenizeCaseSensitive(selector.Class);
                removeSelectorType = SelectorType.Class;
            }
#endif
        }

        // If part of the selector was indexed, key will not be empty. Return initial set from the
        // index. If any selectors remain after this they will be searched the hard way.
        IEnumerable<IDomObject> result = null;

        if (key != String.Empty)
        {
            // This is the main index access point: if we have an index key, we'll get as much as we
            // can from the index. Anything else will be handled manually.
            int depth = 0;
            bool descendants = true;

            switch (selector.TraversalType)
            {
                case TraversalType.Child:
                    depth = selector.ChildDepth;
                    descendants = false;
                    break;
                case TraversalType.Filter:
                case TraversalType.Adjacent:
                case TraversalType.Sibling:
                    depth = 0;
                    descendants = false;
                    break;
                case TraversalType.Descendent:
                    depth = 1;
                    descendants = true;
                    break;
            }

            if (selectionSource == null)
            {
                // Unscoped query: key followed by the separator only.
                result = Document.DocumentIndex.QueryIndex(key + HtmlData.indexSeparator, depth, descendants);
            }
            else
            {
                // Scoped query: look up the key beneath each source element's path and merge
                // the matches, de-duplicating through the set.
                HashSet<IDomObject> elementMatches = new HashSet<IDomObject>();
                result = elementMatches;

                foreach (IDomObject obj in selectionSource)
                {
                    elementMatches.AddRange(
                        Document.DocumentIndex.QueryIndex(key + HtmlData.indexSeparator + obj.Path, depth, descendants));
                }
            }

            selector.SelectorType &= ~removeSelectorType;

            // Special case for attribute selectors: when Attribute Value attribute selector is present, we
            // still need to filter for the correct value afterwards. But we need to change the traversal
            // type because any nodes with the correct attribute type have already been selected.
            if (selector.SelectorType.HasFlag(SelectorType.AttributeValue))
            {
                selector.TraversalType = TraversalType.Filter;
            }
        }
        else if (selector.SelectorType.HasFlag(SelectorType.Elements))
        {
            // Explicit element list: keep only those of the selector's elements that the index
            // reports under one of the candidate source nodes.
            HashSet<IDomObject> elementMatches = new HashSet<IDomObject>();
            result = elementMatches;

            foreach (IDomObject obj in GetAllChildOrDescendants(selector.TraversalType, selectionSource))
            {
                HashSet<IDomObject> srcKeys = new HashSet<IDomObject>(
                    Document.DocumentIndex.QueryIndex(HtmlData.indexSeparator + obj.Path));

                foreach (IDomObject match in selector.SelectElements)
                {
                    if (srcKeys.Contains(match))
                    {
                        elementMatches.Add(match);
                    }
                }
            }

            selector.SelectorType &= ~SelectorType.Elements;
        }

        // If any selectors were not handled via the index, match them manually
        if (selector.SelectorType != 0)
        {
            // if there are no temporary results (b/c there was no indexed selector) then use selection
            // source instead (e.g. start from the same point that the index would have)
            result = GetMatches(result ?? selectionSource ?? Document.ChildElements, selector);
        }

        lastResult = lastResult == null
            ? result
            : lastResult.Concat(result);
    }

    // After the loop has finished, output any results from the last iteration.
    output.AddRange(lastResult);

    // Return the results as a list so that any user will not cause the selector to be run again
    return output
        .OrderBy(item => item.Path, StringComparer.Ordinal)
        .ToList();
}