Esempio n. 1
0
        private void Populate(char[] html, HtmlParsingMode htmlParsingMode)
        {
            if (html != null && html.Length > 0)
            {
                SourceHtml = html;
            }

            HtmlElementFactory factory = new HtmlParser.HtmlElementFactory(this);

            Populate(factory.Parse(htmlParsingMode));
        }
Esempio n. 2
0
        /// <summary>
        /// Select from the bound Document using index. First non-class/tag/id selector will result in
        /// this being passed off to GetMatches.
        /// </summary>
        ///
        /// <exception cref="ArgumentNullException">
        /// Thrown when one or more required arguments are null.
        /// </exception>
        ///
        /// <param name="context">
        /// The context in which the selector applies. If null, the selector is run against the entire
        /// Document. If not, the selector is run against this sequence of elements.
        /// </param>
        ///
        /// <returns>
        /// A list of elements matching the selector.
        /// </returns>

        public List<IDomObject> Select(IEnumerable<IDomObject> context)
        {
            // this holds the final output

            HashSet<IDomObject> output = new HashSet<IDomObject>();

            if (Selector == null )
            {
                throw new ArgumentNullException("The selector cannot be null.");
            }

            if (Selector.Count == 0)
            {
                return EmptyEnumerable().ToList();
            }

            ActiveSelectors = new List<SelectorClause>(Selector);

            // First just check if we ended up here with an HTML selector; if so, hand it off.
            
            var firstSelector = ActiveSelectors[0];
            if (firstSelector.SelectorType == SelectorType.HTML)
            {

                HtmlParser.HtmlElementFactory factory = 
                    new HtmlParser.HtmlElementFactory(firstSelector.Html);

                // Return the factory ouptut as a list because otherwise the enumerator could end up
                // as the actual source of the selection, meaning it would get re-parsed each time
                
                return factory.ParseAsFragment();
            } 

            // this holds any results that carried over from the previous loop for chaining

            IEnumerable<IDomObject> lastResult = null;

            // this is the source from which selections are made in a given iteration; it could be the DOM
            // root, a context, or the previous result set. 
            
            IEnumerable<IDomObject> selectionSource=null;

            // Disable the index if there is no context (e.g. disconnected elements)
            // or if the first element is not indexed.

            bool useIndex = context.IsNullOrEmpty() || 
                (!context.First().IsDisconnected && context.First().IsIndexed);


            for (activeSelectorId = 0; activeSelectorId < ActiveSelectors.Count; activeSelectorId++)
            {

                var selector = ActiveSelectors[activeSelectorId].Clone();

                if (lastResult != null)
                {
                    // we will alter the selector during each iteration to remove the parts that have already been
                    // parsed, so use a copy. This is a selector that was chained with the selector grouping
                    // combinator "," -- we always output the results so far when beginning a new group. 

                    if (selector.CombinatorType == CombinatorType.Root && lastResult != null)
                    {
                        output.AddRange(lastResult);
                        lastResult = null;
                    }
                }

                // For "and" combinator types, we want to leave everything as it was -- the results of this
                // selector should compound with the prior. This is not an actual CSS combinator, this is the
                // equivalent of grouping parenthesis. That is, in CSS there's no way to say "(input[submit],
                // button):visible" - that is group the results on selector part and apply a filter to it. But
                // we need to do exactly this for certain selector types (for example the jQuery :button
                // selector). 

                if (selector.CombinatorType != CombinatorType.Grouped)
                {
                    selectionSource = GetSelectionSource(selector, context, lastResult);
                    lastResult = null;
                }
                
                string key = "";
                SelectorType removeSelectorType = 0;

                if (useIndex && !selector.NoIndex)
                {

#if DEBUG_PATH

                    if (selector.SelectorType.HasFlag(SelectorType.AttributeValue) 
                        && selector.AttributeSelectorType != AttributeSelectorType.NotExists
                        && selector.AttributeSelectorType != AttributeSelectorType.NotEquals)
                    {
                        key = "!" + selector.AttributeName.ToLower();

                        // AttributeValue must still be matched manually - so remove this flag only if the
                        // selector is conclusive without further checking
                        
                        if (selector.AttributeSelectorType == AttributeSelectorType.Exists)
                        {
                            removeSelectorType = SelectorType.AttributeValue;
                        }
                    }
                    else if (selector.SelectorType.HasFlag(SelectorType.Tag))
                    {
                        key = "+"+selector.Tag.ToLower();
                        removeSelectorType=SelectorType.Tag;
                    }
                    else if (selector.SelectorType.HasFlag(SelectorType.ID))
                    {
                        key = "#" + selector.ID;
                        removeSelectorType=SelectorType.ID;
                    }
                    else if (selector.SelectorType.HasFlag(SelectorType.Class))
                    {
                        key = "." + selector.Class;
                        removeSelectorType=SelectorType.Class;
                    }

#else
                    // We don't want to use the index for "NotEquals" selectors because a missing attribute
                    // is considered a valid match
                    
                    if (selector.SelectorType.HasFlag(SelectorType.AttributeValue)
                        && selector.AttributeSelectorType != AttributeSelectorType.NotExists
                        && selector.AttributeSelectorType != AttributeSelectorType.NotEquals)
                    {
                        key = "!" + (char)HtmlData.Tokenize(selector.AttributeName);

                        // AttributeValue must still be matched manually - so remove this flag only if the
                        // selector is conclusive without further checking
                        
                        if (selector.AttributeSelectorType == AttributeSelectorType.Exists)
                        {
                            removeSelectorType = SelectorType.AttributeValue;
                        }
                    }
                    else if (selector.SelectorType.HasFlag(SelectorType.Tag))
                    {
                        key = "+" + (char)HtmlData.Tokenize(selector.Tag);
                        removeSelectorType=SelectorType.Tag;
                    }
                    else if (selector.SelectorType.HasFlag(SelectorType.ID))
                    {
                        key = "#" + (char)HtmlData.TokenizeCaseSensitive(selector.ID);
                        removeSelectorType=SelectorType.ID;
                    }
                    else if (selector.SelectorType.HasFlag(SelectorType.Class))
                    {
                        key = "." + (char)HtmlData.TokenizeCaseSensitive(selector.Class);
                        removeSelectorType=SelectorType.Class;
                    }
#endif
                }

                // If part of the selector was indexed, key will not be empty. Return initial set from the
                // index. If any selectors remain after this they will be searched the hard way.

                IEnumerable<IDomObject> result = null;

                if (key != String.Empty)
                {
                    // This is the main index access point: if we have an index key, we'll get as much as we can from the index.
                    // Anything else will be handled manually.

                    int depth = 0;
                    bool descendants = true;
                    
                    switch (selector.TraversalType)
                    {
                        case TraversalType.Child:
                            depth = selector.ChildDepth;
                            descendants = false;
                            break;
                        case TraversalType.Filter:
                        case TraversalType.Adjacent:
                        case TraversalType.Sibling:
                            depth = 0;
                            descendants = false;
                            break;
                        case TraversalType.Descendent:
                            depth = 1;
                            descendants = true;
                            break;
                    }                    

                    if (selectionSource == null)
                    {
                        result = Document.DocumentIndex.QueryIndex(key + HtmlData.indexSeparator, depth, descendants);
                    }
                    else
                    {
                        HashSet<IDomObject> elementMatches = new HashSet<IDomObject>();
                        result = elementMatches;

                        foreach (IDomObject obj in selectionSource)
                        {
                            elementMatches.AddRange(Document.DocumentIndex.QueryIndex(key + HtmlData.indexSeparator + obj.Path,
                                    depth, descendants));
                        }

                    }
                    selector.SelectorType &= ~removeSelectorType;

                    // Special case for attribute selectors: when Attribute Value attribute selector is present, we
                    // still need to filter for the correct value afterwards. But we need to change the traversal
                    // type because any nodes with the correct attribute type have already been selected. 
                    
                    if (selector.SelectorType.HasFlag(SelectorType.AttributeValue))
                    {
                        selector.TraversalType = TraversalType.Filter;
                    }
                    
                }
                else if (selector.SelectorType.HasFlag(SelectorType.Elements))
                {
                    HashSet<IDomObject> elementMatches = new HashSet<IDomObject>();
                    result = elementMatches;
                    foreach (IDomObject obj in GetAllChildOrDescendants(selector.TraversalType,selectionSource))
                    {

                        //key = HtmlData.indexSeparator + obj.Path;
                        HashSet<IDomObject> srcKeys = new HashSet<IDomObject>(Document.DocumentIndex.QueryIndex(HtmlData.indexSeparator + obj.Path));
                        foreach (IDomObject match in selector.SelectElements)
                        {
                            if (srcKeys.Contains(match))
                            {
                                elementMatches.Add(match);
                            }
                        }
                    }

                    selector.SelectorType &= ~SelectorType.Elements;
                }

                // If any selectors were not handled via the index, match them manually
                
                if (selector.SelectorType != 0)
                {
      
                    // if there are no temporary results (b/c there was no indexed selector) then use selection
                    // source instead (e.g. start from the same point that the index would have) 

                    result = GetMatches(result ?? selectionSource ?? Document.ChildElements, selector);
                }
                
                lastResult = lastResult == null ?
                    result : lastResult.Concat(result); 
                
            }

            // After the loop has finished, output any results from the last iteration.

            output.AddRange(lastResult);

            // Return the results as a list so that any user will not cause the selector to be run again
            
            return output.OrderBy(item => item.Path, StringComparer.Ordinal).ToList();

        }
Esempio n. 3
0
        /// <summary>
        /// Select from the bound Document using index. First non-class/tag/id selector will result in
        /// this being passed off to GetMatches.
        /// </summary>
        ///
        /// <exception cref="ArgumentNullException">
        /// Thrown when one or more required arguments are null.
        /// </exception>
        ///
        /// <param name="context">
        /// The context in which the selector applies. If null, the selector is run against the entire
        /// Document. If not, the selector is run against this sequence of elements.
        /// </param>
        ///
        /// <returns>
        /// A list of elements matching the selector.
        /// </returns>

        public List <IDomObject> Select(IEnumerable <IDomObject> context)
        {
            // this holds the final output

            HashSet <IDomObject> output = new HashSet <IDomObject>();

            if (Selector == null)
            {
                throw new ArgumentNullException("The selector cannot be null.");
            }

            if (Selector.Count == 0)
            {
                return(EmptyEnumerable().ToList());
            }

            ActiveSelectors = new List <SelectorClause>(Selector);

            // First just check if we ended up here with an HTML selector; if so, hand it off.

            var firstSelector = ActiveSelectors[0];

            if (firstSelector.SelectorType == SelectorType.HTML)
            {
                HtmlParser.HtmlElementFactory factory =
                    new HtmlParser.HtmlElementFactory(firstSelector.Html);

                // Return the factory ouptut as a list because otherwise the enumerator could end up
                // as the actual source of the selection, meaning it would get re-parsed each time

                return(factory.ParseAsFragment());
            }

            // this holds any results that carried over from the previous loop for chaining

            IEnumerable <IDomObject> lastResult = null;

            // this is the source from which selections are made in a given iteration; it could be the DOM
            // root, a context, or the previous result set.

            IEnumerable <IDomObject> selectionSource = null;

            // Disable the index if there is no context (e.g. disconnected elements)
            // or if the first element is not indexed.

            bool useIndex = context.IsNullOrEmpty() ||
                            (!context.First().IsDisconnected&& context.First().IsIndexed);


            for (activeSelectorId = 0; activeSelectorId < ActiveSelectors.Count; activeSelectorId++)
            {
                var selector = ActiveSelectors[activeSelectorId].Clone();

                if (lastResult != null)
                {
                    // we will alter the selector during each iteration to remove the parts that have already been
                    // parsed, so use a copy. This is a selector that was chained with the selector grouping
                    // combinator "," -- we always output the results so far when beginning a new group.

                    if (selector.CombinatorType == CombinatorType.Root && lastResult != null)
                    {
                        output.AddRange(lastResult);
                        lastResult = null;
                    }
                }

                // For "and" combinator types, we want to leave everything as it was -- the results of this
                // selector should compound with the prior. This is not an actual CSS combinator, this is the
                // equivalent of grouping parenthesis. That is, in CSS there's no way to say "(input[submit],
                // button):visible" - that is group the results on selector part and apply a filter to it. But
                // we need to do exactly this for certain selector types (for example the jQuery :button
                // selector).

                if (selector.CombinatorType != CombinatorType.Grouped)
                {
                    selectionSource = GetSelectionSource(selector, context, lastResult);
                    lastResult      = null;
                }

                string       key = "";
                SelectorType removeSelectorType = 0;

                if (useIndex && !selector.NoIndex)
                {
#if DEBUG_PATH
                    if (selector.SelectorType.HasFlag(SelectorType.AttributeValue) &&
                        selector.AttributeSelectorType != AttributeSelectorType.NotExists &&
                        selector.AttributeSelectorType != AttributeSelectorType.NotEquals)
                    {
                        key = "!" + selector.AttributeName.ToLower();

                        // AttributeValue must still be matched manually - so remove this flag only if the
                        // selector is conclusive without further checking

                        if (selector.AttributeSelectorType == AttributeSelectorType.Exists)
                        {
                            removeSelectorType = SelectorType.AttributeValue;
                        }
                    }
                    else if (selector.SelectorType.HasFlag(SelectorType.Tag))
                    {
                        key = "+" + selector.Tag.ToLower();
                        removeSelectorType = SelectorType.Tag;
                    }
                    else if (selector.SelectorType.HasFlag(SelectorType.ID))
                    {
                        key = "#" + selector.ID;
                        removeSelectorType = SelectorType.ID;
                    }
                    else if (selector.SelectorType.HasFlag(SelectorType.Class))
                    {
                        key = "." + selector.Class;
                        removeSelectorType = SelectorType.Class;
                    }
#else
                    // We don't want to use the index for "NotEquals" selectors because a missing attribute
                    // is considered a valid match

                    if (selector.SelectorType.HasFlag(SelectorType.AttributeValue) &&
                        selector.AttributeSelectorType != AttributeSelectorType.NotExists &&
                        selector.AttributeSelectorType != AttributeSelectorType.NotEquals)
                    {
                        key = "!" + (char)HtmlData.Tokenize(selector.AttributeName);

                        // AttributeValue must still be matched manually - so remove this flag only if the
                        // selector is conclusive without further checking

                        if (selector.AttributeSelectorType == AttributeSelectorType.Exists)
                        {
                            removeSelectorType = SelectorType.AttributeValue;
                        }
                    }
                    else if (selector.SelectorType.HasFlag(SelectorType.Tag))
                    {
                        key = "+" + (char)HtmlData.Tokenize(selector.Tag);
                        removeSelectorType = SelectorType.Tag;
                    }
                    else if (selector.SelectorType.HasFlag(SelectorType.ID))
                    {
                        key = "#" + (char)HtmlData.TokenizeCaseSensitive(selector.ID);
                        removeSelectorType = SelectorType.ID;
                    }
                    else if (selector.SelectorType.HasFlag(SelectorType.Class))
                    {
                        key = "." + (char)HtmlData.TokenizeCaseSensitive(selector.Class);
                        removeSelectorType = SelectorType.Class;
                    }
#endif
                }

                // If part of the selector was indexed, key will not be empty. Return initial set from the
                // index. If any selectors remain after this they will be searched the hard way.

                IEnumerable <IDomObject> result = null;

                if (key != String.Empty)
                {
                    // This is the main index access point: if we have an index key, we'll get as much as we can from the index.
                    // Anything else will be handled manually.

                    int  depth       = 0;
                    bool descendants = true;

                    switch (selector.TraversalType)
                    {
                    case TraversalType.Child:
                        depth       = selector.ChildDepth;
                        descendants = false;
                        break;

                    case TraversalType.Filter:
                    case TraversalType.Adjacent:
                    case TraversalType.Sibling:
                        depth       = 0;
                        descendants = false;
                        break;

                    case TraversalType.Descendent:
                        depth       = 1;
                        descendants = true;
                        break;
                    }

                    if (selectionSource == null)
                    {
                        result = Document.DocumentIndex.QueryIndex(key + HtmlData.indexSeparator, depth, descendants);
                    }
                    else
                    {
                        HashSet <IDomObject> elementMatches = new HashSet <IDomObject>();
                        result = elementMatches;

                        foreach (IDomObject obj in selectionSource)
                        {
                            elementMatches.AddRange(Document.DocumentIndex.QueryIndex(key + HtmlData.indexSeparator + obj.Path,
                                                                                      depth, descendants));
                        }
                    }
                    selector.SelectorType &= ~removeSelectorType;

                    // Special case for attribute selectors: when Attribute Value attribute selector is present, we
                    // still need to filter for the correct value afterwards. But we need to change the traversal
                    // type because any nodes with the correct attribute type have already been selected.

                    if (selector.SelectorType.HasFlag(SelectorType.AttributeValue))
                    {
                        selector.TraversalType = TraversalType.Filter;
                    }
                }
                else if (selector.SelectorType.HasFlag(SelectorType.Elements))
                {
                    HashSet <IDomObject> elementMatches = new HashSet <IDomObject>();
                    result = elementMatches;
                    foreach (IDomObject obj in GetAllChildOrDescendants(selector.TraversalType, selectionSource))
                    {
                        //key = HtmlData.indexSeparator + obj.Path;
                        HashSet <IDomObject> srcKeys = new HashSet <IDomObject>(Document.DocumentIndex.QueryIndex(HtmlData.indexSeparator + obj.Path));
                        foreach (IDomObject match in selector.SelectElements)
                        {
                            if (srcKeys.Contains(match))
                            {
                                elementMatches.Add(match);
                            }
                        }
                    }

                    selector.SelectorType &= ~SelectorType.Elements;
                }

                // If any selectors were not handled via the index, match them manually

                if (selector.SelectorType != 0)
                {
                    // if there are no temporary results (b/c there was no indexed selector) then use selection
                    // source instead (e.g. start from the same point that the index would have)

                    result = GetMatches(result ?? selectionSource ?? Document.ChildElements, selector);
                }

                lastResult = lastResult == null ?
                             result : lastResult.Concat(result);
            }

            // After the loop has finished, output any results from the last iteration.

            output.AddRange(lastResult);

            // Return the results as a list so that any user will not cause the selector to be run again

            return(output.OrderBy(item => item.Path, StringComparer.Ordinal).ToList());
        }