Exemplo n.º 1
0
        /// <summary>
        /// Prepares the tag parser for work by storing the supplied objects and data buffer,
        /// and derives the cut-off offset used when heuristics are applied
        /// </summary>
        /// <param name="p_oP">HTML parser object, stored in oP</param>
        /// <param name="p_oChunk">Chunk object, stored in oChunk</param>
        /// <param name="p_sText">Dynamic string object, stored in sText</param>
        /// <param name="p_bHTML">Byte buffer, stored in bHTML</param>
        /// <param name="p_iDataLength">Data length, stored in iDataLength (presumably the number of
        /// usable bytes in p_bHTML - confirm against caller)</param>
        /// <param name="p_oE">Entities object, stored in oE</param>
        /// <param name="p_oHE">Heuristics object, stored in oHE</param>
        internal void Init(HTMLparser p_oP, HTMLchunk p_oChunk, DynaString p_sText, byte[] p_bHTML, int p_iDataLength, HTMLentities p_oE, HTMLheuristics p_oHE)
        {
            // helper objects supplied by the caller
            oP = p_oP;
            oE = p_oE;
            oHE = p_oHE;

            // output targets
            oChunk = p_oChunk;
            sText = p_sText;

            // raw data and its length
            bHTML = p_bHTML;
            iDataLength = p_iDataLength;

            // heuristics must stop short of the end of data, so keep a safety margin
            iMaxHeuDataLength = iDataLength - MIN_DATA_SIZE_FOR_HEURISTICS;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Releases the references held by this object; only the first call has any effect
        /// </summary>
        /// <param name="bDisposing">Not consulted by this implementation</param>
        private void Dispose(bool bDisposing)
        {
            // already cleaned up - nothing left to do
            if (bDisposed)
            {
                return;
            }

            bDisposed = true;

            // drop every reference so the objects can be collected
            oP = null;
            oE = null;
            sText = null;
            oChunk = null;
            bHTML = null;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Handles META tags that set page encoding
        /// </summary>
        /// <param name="oP">HTML parser object that is used for parsing; its SetEncoding method is
        /// called with the value of the "content" attribute</param>
        /// <param name="oChunk">Parsed chunk that should contain tag META; its parameters are converted
        /// to hash mode if they are not in that mode already</param>
        /// <param name="bEncodingSet">Your own flag that shows whether encoding was already set or not, if set
        /// once then it should not be changed - this is the logic applied by major browsers. It is switched
        /// to true here when SetEncoding succeeds</param>
        /// <returns>True if this was a META tag whose http-equiv is "content-type" or "content-category"
        /// (even when the encoding itself was not changed), false otherwise</returns>
        public static bool HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
        {
            // cheap length/first-char checks first, full string comparison last
            if (oChunk.sTag.Length != 4 || oChunk.sTag[0] != 'm' || oChunk.sTag != "meta")
            {
                return false;
            }

            // if we do not use hashmode already then we call conversion explicitly
            // this is slow, but METAs are very rare so performance penalty is low
            if (!oChunk.bHashMode)
            {
                oChunk.ConvertParamsToHash();
            }

            string sKey = oChunk.oParams["http-equiv"] as string;

            if (sKey == null)
            {
                return false;
            }

            // ordinal case-insensitive comparison: no lower-cased copy is allocated and the
            // result does not depend on the current culture
            // "content-category" is a rare variant (appears to work in IE) reported to exist in some pages
            bool bEncodingMeta =
                string.Equals(sKey, "content-type", System.StringComparison.OrdinalIgnoreCase) ||
                string.Equals(sKey, "content-category", System.StringComparison.OrdinalIgnoreCase);

            if (!bEncodingMeta)
            {
                return false;
            }

            // once encoding is set it should not be changed, but you can be damn
            // sure there are web pages out there that do that!!!
            if (!bEncodingSet)
            {
                // it is possible we have broken META tag without Content part
                string sData = oChunk.oParams["content"] as string;

                // SetEncoding returning false means the encoding was not applied (bad encoding
                // string, missing codepage, ...) - the flag is then left untouched
                if (sData != null && oP.SetEncoding(sData))
                {
                    // here you need to reencode any text that you found so far
                    // most likely it will be just TITLE, the rest can be ignored anyway
                    bEncodingSet = true;
                }
            }

            return true;
        }
Exemplo n.º 4
0
 /// <summary>
 /// Asks the given parser for its next chunk and hands it back unchanged
 /// </summary>
 /// <param name="parser">Parser to read from</param>
 /// <returns>Whatever parser.ParseNext() returned</returns>
 private HTMLchunk ParseNext(HTMLparser parser)
 {
     return parser.ParseNext();
 }
Exemplo n.º 5
0
        /// <summary>
        /// Parses markup into a tree of block elements. Text spans and unrecognised tags are added
        /// character by character to the current block; recognised tags either change the active
        /// style, add an atom, or open a nested block.
        /// </summary>
        /// <param name="html">Markup to parse</param>
        /// <returns>The root block that everything else was added to</returns>
        private BlockElement ParseHtmlToBlocks(string html)
        {
            IResourceProvider provider = ServiceRegistry.GetService<IResourceProvider>();
            StyleParser styles = new StyleParser(provider);

            BlockElement root, currentBlock;
            root = currentBlock = new BlockElement("root", styles.Style); // this is the root!

            // the parser instance is created on first use and re-initialised for each input
            if (m_Parser == null)
                m_Parser = new HTMLparser();
            m_Parser.Init(html);
            HTMLchunk chunk;

            while ((chunk = ParseNext(m_Parser)) != null)
            {
                if (chunk.oHTML != string.Empty)
                {
                    // This is a span of text.
                    AppendTextAtoms(currentBlock, styles, chunk.oHTML);
                    continue;
                }

                // This is a tag. interpret the tag and edit the openTags list.
                // It may also be an atom, in which case we should add it to the list of atoms!
                if (chunk.bClosure && !chunk.bEndClosure)
                {
                    styles.CloseOneTag(chunk);
                    // Step back out of the current block when its own tag is closed. A block without
                    // a parent (presumably only the root - confirm) is kept, so that a stray closing
                    // tag cannot leave currentBlock null for the next AddAtom call.
                    if (currentBlock.Tag == chunk.sTag && currentBlock.Parent != null)
                    {
                        currentBlock = currentBlock.Parent;
                    }
                    continue;
                }

                AElement atom = null;
                bool isBlockTag = false;
                switch (chunk.sTag)
                {
                    // ======================================================================
                    // Anchor elements are added to the open tag collection as HREFs.
                    // ======================================================================
                    case "a":
                        styles.InterpretHREF(chunk, null);
                        break;
                    // ======================================================================
                    // These html elements are ignored.
                    // ======================================================================
                    case "body":
                        break;
                    // ======================================================================
                    // These html elements are blocks but can also have styles
                    // ======================================================================
                    case "center":
                    case "left":
                    case "right":
                    case "div":
                        atom = new BlockElement(chunk.sTag, styles.Style);
                        styles.ParseTag(chunk, atom);
                        isBlockTag = true;
                        break;
                    // ======================================================================
                    // These html elements are styles, and are added to the StyleParser.
                    // ======================================================================
                    case "span":
                    case "font":
                    case "b":
                    case "i":
                    case "u":
                    case "outline":
                    case "big":
                    case "basefont":
                    case "medium":
                    case "small":
                        styles.ParseTag(chunk, null);
                        break;
                    // ======================================================================
                    // These html elements are added as atoms only. They cannot impart style
                    // onto other atoms.
                    // ======================================================================
                    case "br":
                        atom = new CharacterElement(styles.Style, '\n');
                        break;
                    case "gumpimg":
                        // draw a gump image
                        atom = new ImageElement(styles.Style, ImageElement.ImageTypes.UI);
                        styles.ParseTag(chunk, atom);
                        break;
                    case "itemimg":
                        // draw a static image
                        atom = new ImageElement(styles.Style, ImageElement.ImageTypes.Item);
                        styles.ParseTag(chunk, atom);
                        break;
                    // ======================================================================
                    // Every other element is not interpreted, but rendered as text. Easy!
                    // ======================================================================
                    default:
                        AppendTextAtoms(currentBlock, styles, html.Substring(chunk.iChunkOffset, chunk.iChunkLength));
                        break;
                }

                if (atom != null)
                {
                    currentBlock.AddAtom(atom);
                    // a block tag that is not self-closed becomes the new current block
                    if (isBlockTag && !chunk.bEndClosure)
                        currentBlock = (BlockElement)atom;
                }

                styles.CloseAnySoloTags();
            }

            return root;
        }

        /// <summary>
        /// Replaces escape characters in the text and adds every resulting character to the block
        /// as a CharacterElement carrying the parser's current style.
        /// </summary>
        private void AppendTextAtoms(BlockElement block, StyleParser styles, string rawText)
        {
            // make sure to replace escape characters!
            string text = EscapeCharacters.ReplaceEscapeCharacters(rawText);
            for (int i = 0; i < text.Length; i++)
                block.AddAtom(new CharacterElement(styles.Style, text[i]));
        }
Exemplo n.º 6
0
        /// <summary>
        /// Handles META tags that set page encoding
        /// </summary>
        /// <param name="oP">HTML parser object that is used for parsing; its SetEncoding method is
        /// called with the value of the "content" attribute</param>
        /// <param name="oChunk">Parsed chunk that should contain tag META; its parameters are converted
        /// to hash mode if they are not in that mode already</param>
        /// <param name="bEncodingSet">Your own flag that shows whether encoding was already set or not, if set
        /// once then it should not be changed - this is the logic applied by major browsers. It is switched
        /// to true here when SetEncoding succeeds</param>
        /// <returns>True if this was a META tag whose http-equiv is "content-type" or "content-category"
        /// (even when the encoding itself was not changed), false otherwise</returns>
        public static bool HandleMetaEncoding(HTMLparser oP,HTMLchunk oChunk,ref bool bEncodingSet)
        {
            // cheap length/first-char checks first, full string comparison last
            if(oChunk.sTag.Length!=4 || oChunk.sTag[0]!='m' || oChunk.sTag!="meta")
                return false;

            // if we do not use hashmode already then we call conversion explicitly
            // this is slow, but METAs are very rare so performance penalty is low
            if(!oChunk.bHashMode)
                oChunk.ConvertParamsToHash();

            string sKey=oChunk.oParams["http-equiv"] as string;

            if(sKey==null)
                return false;

            // compare without lower-casing: ordinal ignore-case needs no temporary string and
            // is independent of the current culture
            // "content-category" is a rare variant (appears to work in IE) reported to exist in some pages
            if(!string.Equals(sKey,"content-type",System.StringComparison.OrdinalIgnoreCase)
                && !string.Equals(sKey,"content-category",System.StringComparison.OrdinalIgnoreCase))
                return false;

            // we might have charset here that may hint at necessity to decode page
            // once encoding is set it should not be changed, but you can be damn
            // sure there are web pages out there that do that!!!
            if(!bEncodingSet)
            {
                // it is possible we have broken META tag without Content part
                string sData=oChunk.oParams["content"] as string;

                // when SetEncoding returns false the encoding was not applied (bad encoding
                // string, missing codepage, ...) and the flag stays as it was
                if(sData!=null && oP.SetEncoding(sData))
                {
                    // here you need to reencode any text that you found so far
                    // most likely it will be just TITLE, the rest can be ignored anyway
                    bEncodingSet=true;
                }
            }

            return true;
        }