Пример #1
0
        /// <summary>
        /// Parses the HTML supplied by <paramref name="reader"/> as a fragment,
        /// building the result through an <c>XmlDomBuilder</c> rooted at <paramref name="node"/>.
        /// </summary>
        /// <param name="node">Node handed to the <c>XmlDomBuilder</c> as its root.</param>
        /// <param name="reader">Source of the HTML tokens.</param>
        /// <param name="options">Parser settings, passed through unchanged.</param>
        public override void ParseFragment(XmlNode node, HtmlTextReader reader, ParserOptions options)
        {
            var fragmentBuilder = new XmlDomBuilder(node);
            var streamParser    = new HtmlStreamParser <XmlNode>();

            // Fragments start directly in the 'InBody' insertion mode
            streamParser.Parse(fragmentBuilder, reader, options, InsersionMode.InBody);
        }
Пример #2
0
        /// <summary>
        /// Parses the HTML supplied by <paramref name="reader"/>, building the result
        /// through an <c>XmlDomBuilder</c> rooted at <paramref name="doc"/>.
        /// </summary>
        /// <param name="doc">Document handed to the <c>XmlDomBuilder</c> as its root.</param>
        /// <param name="reader">Source of the HTML tokens.</param>
        /// <param name="options">Parser settings, passed through unchanged.</param>
        public override void Parse(XmlDocument doc, HtmlTextReader reader, ParserOptions options)
        {
            var documentBuilder = new XmlDomBuilder(doc);
            var streamParser    = new HtmlStreamParser <XmlNode>();

            // No insertion mode is given, so the parser's default is used
            streamParser.Parse(documentBuilder, reader, options);
        }
Пример #3
0
        /// <summary>
        /// Parses the HTML read from <paramref name="reader"/> as a fragment, building the
        /// result through an <c>XDomBuilder</c> rooted at <paramref name="node"/>.
        /// </summary>
        /// <param name="node">Node handed to the <c>XDomBuilder</c> as its root.</param>
        /// <param name="reader">Text source; it is wrapped in an <c>HtmlTextReader</c>.</param>
        /// <param name="options">Parser settings, passed through unchanged.</param>
        public static void LoadHtmlFragment(XNode node, TextReader reader, ParserOptions options)
        {
            var fragmentBuilder = new XDomBuilder(node);
            var streamParser    = new HtmlStreamParser <XNode>();

            // Fragments start directly in the 'InBody' insertion mode
            streamParser.Parse(fragmentBuilder, new HtmlTextReader(reader), options, InsersionMode.InBody);
        }
Пример #4
0
        /// <summary>
        /// Parses the HTML read from <paramref name="reader"/>, building the result
        /// through an <c>XDomBuilder</c> rooted at <paramref name="doc"/>.
        /// </summary>
        /// <param name="doc">Document handed to the <c>XDomBuilder</c> as its root.</param>
        /// <param name="reader">Text source; it is wrapped in an <c>HtmlTextReader</c>.</param>
        /// <param name="options">Parser settings, passed through unchanged.</param>
        internal static void LoadHtml(XDocument doc, TextReader reader, ParserOptions options)
        {
            var documentBuilder = new XDomBuilder(doc);
            var streamParser    = new HtmlStreamParser <XNode>();

            // No insertion mode is given, so the parser's default is used
            streamParser.Parse(documentBuilder, new HtmlTextReader(reader), options);
        }
Пример #5
0
        /// <summary>
        /// Obtains an <c>HtmlTextReader</c> for <paramref name="url"/> asynchronously and
        /// parses its content into <paramref name="doc"/>.
        /// </summary>
        /// <param name="doc">Document handed to the <c>XmlDomBuilder</c> as its root.</param>
        /// <param name="url">Address passed to <c>HtmlClient.GetHtmlTextReaderAsync</c>; also used as the base url when none is configured.</param>
        /// <param name="options">Loader settings; a new <c>LoaderOptions</c> is used when null.
        /// Note that <c>ParserOptions.BaseUrl</c> on this object is assigned by this method.</param>
        /// <returns>A task that completes once parsing has finished and the reader has been disposed.</returns>
        internal static async Task LoadWebPageAsync(XmlDocument doc, string url, LoaderOptions options)
        {
            LoaderOptions effectiveOptions = options ?? new LoaderOptions();

            // Fall back to the page url when no base url has been configured
            string configuredBaseUrl = effectiveOptions.ParserOptions.BaseUrl;
            effectiveOptions.ParserOptions.BaseUrl = string.IsNullOrEmpty(configuredBaseUrl) ? url : configuredBaseUrl;

            var documentBuilder = new XmlDomBuilder(doc);
            var streamParser    = new HtmlStreamParser <XmlNode>();

            // Fetch the Html asynchronously, then parse it into the Xml document
            using (HtmlTextReader htmlReader = await HtmlClient.GetHtmlTextReaderAsync(url, effectiveOptions))
            {
                streamParser.Parse(documentBuilder, htmlReader, effectiveOptions.ParserOptions);
            }
        }
Пример #6
0
 /// <summary>
 /// Abstract entry point for parsing an HTML fragment into <paramref name="node"/>.
 /// The actual behavior is defined by the overriding implementation.
 /// </summary>
 /// <param name="node">Target node for the parsed content (presumably the fragment's parent; confirm against the override).</param>
 /// <param name="htmlTextReader">Reader that supplies the HTML to parse.</param>
 /// <param name="options">Parser settings.</param>
 public abstract void ParseFragment(XmlNode node, HtmlTextReader htmlTextReader, ParserOptions options);
Пример #7
0
 /// <summary>
 /// Abstract entry point for parsing an HTML document into <paramref name="doc"/>.
 /// The actual behavior is defined by the overriding implementation.
 /// </summary>
 /// <param name="doc">Target document for the parsed content.</param>
 /// <param name="htmlTextReader">Reader that supplies the HTML to parse.</param>
 /// <param name="options">Parser settings.</param>
 public abstract void Parse(XmlDocument doc, HtmlTextReader htmlTextReader, ParserOptions options);
Пример #8
0
        /// <summary>
        /// Reads every pending attribute from <paramref name="reader"/> and adds it to <paramref name="tag"/>.
        /// </summary>
        /// <param name="dom">Builder used to attach the attributes.</param>
        /// <param name="tag">Node that receives the attributes.</param>
        /// <param name="tagName">Name of the tag; used to decide whether an attribute holds a url for this tag.</param>
        /// <param name="reader">Token source; attributes are consumed while its state is <c>ParseState.AttributeName</c>.</param>
        /// <param name="originatingUrl">
        /// Optional base url. When it is a valid absolute url, url attributes whose value does not
        /// contain "://" are resolved against it. When it is null, empty or not a valid absolute url,
        /// attribute values are left as they are.
        /// </param>
        private static void AddAttributes(DomBuilder <DomNode> dom, DomNode tag, string tagName, HtmlTextReader reader, string originatingUrl = null)
        {
            // The base url is the same for every attribute, so resolve it once.
            // TryCreate is used so that a malformed base url simply disables url
            // qualification instead of throwing in the middle of a parse.
            Uri baseUri = null;
            if (!string.IsNullOrEmpty(originatingUrl))
            {
                Uri.TryCreate(originatingUrl, UriKind.Absolute, out baseUri);
            }

            while (reader.ParseState == ParseState.AttributeName)
            {
                // Get the attribute Name and Value (attributes without a value get an empty string)
                string attrName  = reader.ReadAttributeName();
                string attrValue = reader.ParseState == ParseState.AttributeValue ? reader.ReadAttributeValue() : string.Empty;

                // Skip attributes without a name. The reader has already been advanced above,
                // so the loop still makes progress.
                if (string.IsNullOrEmpty(attrName))
                {
                    continue;
                }

                // Make sure the attribute name is a valid XML name...
                attrName = XmlConvert.EncodeLocalName(attrName);

                // Values can have html encodings - we want them decoded
                attrValue = HtmlDecode(attrValue);

                // Fully-qualify UrlAttributes if a usable BaseUrl was supplied
                if (baseUri != null)
                {
                    // See if the given attribute is a url attribute inside the given tag
                    Dictionary <string, bool> relevantTags = null;
                    _urlAttributes.TryGetValue(attrName, out relevantTags);
                    if (relevantTags != null && relevantTags.ContainsKey(tagName) && !attrValue.Contains("://"))
                    {
                        // Pages in the wild contain malformed urls; keep the original
                        // value when it cannot be combined with the base url.
                        Uri combinedUri;
                        if (Uri.TryCreate(baseUri, attrValue, out combinedUri))
                        {
                            attrValue = combinedUri.ToString();
                        }
                    }
                }

                // Hand the attribute to the builder
                dom.AddAttribute(tag, attrName, attrValue);
            }
        }
Пример #9
0
        /// <summary>
        /// Reads tokens from <paramref name="reader"/> until it reports <c>ParseState.Done</c> and
        /// builds the corresponding tree through <paramref name="dom"/>.
        /// </summary>
        /// <param name="dom">Builder used to create elements, text, comments and attributes.
        /// Its root node is cleared before parsing, and again whenever parsing restarts.</param>
        /// <param name="reader">Token source. It is rewound with a new encoding when a meta tag yields an
        /// encoding different from the current one while the current encoding is tentative.</param>
        /// <param name="options">Parser settings; a new <c>ParserOptions</c> is used when null.</param>
        /// <param name="mode">Initial insertion mode: <c>InBody</c> for fragments, <c>BeforeHtml</c> (the default) otherwise.</param>
        public void Parse(DomBuilder <DomNode> dom, HtmlTextReader reader, ParserOptions options, InsersionMode mode = InsersionMode.BeforeHtml)
        {
            ParserOptions parserOptions = options == null ? new ParserOptions() : options;

            // A Stopwatch is monotonic; wall-clock time can jump (DST, clock adjustments).
            // It is started before the restart label, so a restart is included in the total.
            System.Diagnostics.Stopwatch stopwatch = System.Diagnostics.Stopwatch.StartNew();

Beginning:

            // Make sure root node is clear
            dom.RemoveAll(dom.RootNode);

            // DOM Node pointers
            DomNode currNode = dom.RootNode;
            DomNode htmlNode = default(DomNode);
            DomNode headNode = default(DomNode);
            DomNode bodyNode = default(DomNode);

            // If we are parsing a fragment, the initial insertion mode will
            // be: 'InBody', otherwise it will be: 'BeforeHtml'
            InsersionMode insertionMode = mode;

            while (reader.ParseState != ParseState.Done)
            {
                // Capture the state BEFORE reading: it describes the token about to be read
                ParseState state = reader.ParseState;

                // Read the next token. Ignore empty tokens
                string tok = reader.ReadNext();
                if (IsNullOrWhiteSpace(tok))
                {
                    continue;
                }

                // Find the insertion point for the token based on our mode.
                // This switch only creates the html/head/body skeleton and moves the
                // current-node pointer; the token itself is added by the switch below.
                switch (insertionMode)
                {
                case InsersionMode.BeforeHtml:

                    // Comment is valid at the top
                    if (state == ParseState.Comment)
                    {
                        break;
                    }

                    // <html> is valid at the top - switch to BeforeHead
                    if (state == ParseState.OpenTag && tok == "html")
                    {
                        htmlNode      = dom.AddElement(dom.RootNode, "html");
                        currNode      = htmlNode;
                        insertionMode = InsersionMode.BeforeHead;
                        break;
                    }

                    // Got a tag that should be in the head. Create html/head node structure...
                    if (state == ParseState.OpenTag && IsHeadTag(tok))
                    {
                        htmlNode      = dom.AddElement(dom.RootNode, "html");
                        headNode      = dom.AddElement(htmlNode, "head");
                        currNode      = headNode;
                        insertionMode = InsersionMode.InHead;
                        break;
                    }

                    // Got anything else - put it in the body
                    htmlNode      = dom.AddElement(dom.RootNode, "html");
                    headNode      = dom.AddElement(htmlNode, "head");
                    bodyNode      = dom.AddElement(htmlNode, "body");
                    currNode      = bodyNode;
                    insertionMode = InsersionMode.InBody;
                    break;

                case InsersionMode.BeforeHead:

                    // Comment is valid
                    if (state == ParseState.Comment)
                    {
                        break;
                    }

                    // <head> itself, or a tag that 'should' be in the head: create head
                    // and switch to InHead
                    if (state == ParseState.OpenTag && (tok == "head" || IsHeadTag(tok)))
                    {
                        headNode      = dom.AddElement(htmlNode, "head");
                        currNode      = headNode;
                        insertionMode = InsersionMode.InHead;
                        break;
                    }

                    // Got anything else, including <body> - put it in the body
                    headNode      = dom.AddElement(htmlNode, "head");
                    bodyNode      = dom.AddElement(htmlNode, "body");
                    currNode      = bodyNode;
                    insertionMode = InsersionMode.InBody;
                    break;

                case InsersionMode.InHead:

                    // Comment is valid here
                    if (state == ParseState.Comment)
                    {
                        break;
                    }

                    // Head tags are valid here
                    if (state == ParseState.OpenTag && IsHeadTag(tok))
                    {
                        break;
                    }

                    // Anything else must go into the body
                    bodyNode      = dom.AddElement(htmlNode, "body");
                    currNode      = bodyNode;
                    insertionMode = InsersionMode.InBody;
                    break;
                }

                // Add the token to the DOM
                switch (state)
                {
                case ParseState.Comment:
                    // Xml Comments cannot have '--', and they cannot end in '-'
                    string commentText = tok.Replace("-", "#");
                    dom.AddComment(currNode, commentText);
                    break;

                case ParseState.Text:
                    // Decode the text, to convert all encoded values (eg: '&gt;' to '>')
                    string textContent = HtmlDecode(tok);
                    dom.AddText(currNode, textContent);
                    break;

                case ParseState.OpenTag:

                    // Look up the properties of the tag
                    TagProperties properties = TagProperties.None;
                    _tagProperties.TryGetValue(tok, out properties);

                    // For top-level <html>, & <body> tags, all we do is add attributes to
                    // the already created nodes. <head> tags get ignored.
                    // NOTE(review): when neither branch applies (e.g. <head>), the tag's
                    // attributes are not consumed here - confirm that the resulting attribute
                    // tokens cannot trip the insertion-mode switch above.
                    if ((properties & TagProperties.TopLevel) > 0)
                    {
                        if (tok == "html" && htmlNode != null)
                        {
                            AddAttributes(dom, htmlNode, tok, reader);
                            break;
                        }
                        if (tok == "body" && bodyNode != null)
                        {
                            AddAttributes(dom, bodyNode, tok, reader);
                            break;
                        }
                        break;
                    }


                    // Create the new tag, add attributes, and append to DOM
                    string  tagName = XmlConvert.EncodeLocalName(tok);
                    DomNode tag     = dom.AddElement(currNode, tagName);
                    AddAttributes(dom, tag, tagName, reader, (parserOptions.FullyQualifyUrls ? parserOptions.BaseUrl : null));

                    // If this is a meta tag, and our underlying stream
                    // lets us ReWind, and we are tentative about the encoding, then check for a new charset
                    if (((properties & TagProperties.IsHeadTag) > 0) && (reader.CurrentEncodingConfidence == EncodingConfidence.Tentative) && reader.CanRewind && (tok == "meta"))
                    {
                        Encoding encoding = CheckForNewEncoding(tag, dom);

                        // If we found a new encoding, we will need to start over!
                        string newEncodingName     = encoding != null ? encoding.WebName : String.Empty;
                        string currentEncodingName = reader.CurrentEncoding != null ? reader.CurrentEncoding.WebName : String.Empty;
                        if (encoding != null && (newEncodingName != currentEncodingName))
                        {
                            // Start over from the beginning!
                            reader.Rewind(encoding);     // Rewind underlying stream, set new encoding
                            goto Beginning;
                        }
                    }

                    // If this is a self closing tag, we are done. Don't move pointer.
                    if ((properties & TagProperties.SelfClosing) > 0)
                    {
                        break;
                    }

                    // If this is an RCData tag, get the text value for it and add it.
                    if ((properties & TagProperties.RCData) > 0)
                    {
                        tok = reader.ReadRCData(tok);
                        dom.AddText(tag, tok);
                        break;
                    }

                    // Set current tag pointer to the newly added tag
                    currNode = tag;

                    break;

                case ParseState.CloseTag:

                    // Look up our ancestor chain for the corresponding open tag.
                    // An unmatched close tag leaves the pointer where it is.
                    DomNode parent = dom.FindAncestor(currNode, tok);
                    currNode = parent != null ? parent : currNode;

                    // If we moved up beyond the body (for example if there were tags or text
                    // after the </html> tag - set the pointer to the <body> tag.
                    if (Equals(currNode, htmlNode) || Equals(currNode, dom.RootNode))
                    {
                        if (bodyNode != null)       // possible we don't have a body node
                        {
                            currNode = bodyNode;
                        }
                    }

                    break;
                }
            }

            stopwatch.Stop();

            // See if the user wants to include metadata in <head>
            if (parserOptions.IncludeMetaData && headNode != null)
            {
                // Parse Time. Formatted with the invariant culture so the value is
                // machine-readable regardless of the current locale (no "12,5").
                DomNode totalMillisecondsNode = dom.AddElement(headNode, "meta");
                dom.AddAttribute(totalMillisecondsNode, "source", "HtmlStreamParser");
                dom.AddAttribute(totalMillisecondsNode, "totalMilliseconds", stopwatch.Elapsed.TotalMilliseconds.ToString(System.Globalization.CultureInfo.InvariantCulture));

                // Originating URL
                DomNode originatingUrlNode = dom.AddElement(headNode, "meta");
                dom.AddAttribute(originatingUrlNode, "source", "HtmlStreamParser");
                dom.AddAttribute(originatingUrlNode, "originatingUrl", reader.OriginatingUrl);

                // Detected Encodings. The encodings are null-checked, mirroring the guard
                // used when comparing encodings during the meta charset check above.
                string currentEncodingWebName = reader.CurrentEncoding != null ? reader.CurrentEncoding.WebName : String.Empty;
                string initialEncodingWebName = reader.InitialEncoding != null ? reader.InitialEncoding.WebName : String.Empty;

                DomNode encodingNode = dom.AddElement(headNode, "meta");
                dom.AddAttribute(encodingNode, "source", "HtmlStreamParser");
                dom.AddAttribute(encodingNode, "encoding", currentEncodingWebName);
                dom.AddAttribute(encodingNode, "confidence", reader.CurrentEncodingConfidence.ToString());

                DomNode encodingNode2 = dom.AddElement(headNode, "meta");
                dom.AddAttribute(encodingNode2, "source", "HtmlStreamParser");
                dom.AddAttribute(encodingNode2, "initalEncoding", initialEncodingWebName);
                dom.AddAttribute(encodingNode2, "confidence", reader.InitialEncodingConfidence.ToString());

                // Add headers metadata (there may be no headers at all)
                var originatingHeaders = reader.OriginatingHttpHeaders;
                if (originatingHeaders != null)
                {
                    foreach (var header in originatingHeaders)
                    {
                        DomNode headerNode = dom.AddElement(headNode, "meta");
                        dom.AddAttribute(headerNode, "source", "HttpHeaders");
                        dom.AddAttribute(headerNode, "name", header.Key);
                        dom.AddAttribute(headerNode, "content", header.Value);
                    }
                }
            }
        }