SetEncoding() public method

Sets the current encoding from a charset string given in the format used in HTTP headers and HTML META tags
public SetEncoding ( string p_sCharSet ) : bool
p_sCharSet string
return bool
Esempio n. 1
0
        /// <summary>
        /// Pulls chunks from the parser one at a time and writes a description of each to the console.
        /// If <c>bReadLineDelay</c> is set, ENTER must be pressed after every printed chunk; otherwise
        /// each printed chunk is simply terminated with a new line.
        /// </summary>
        /// <param name="oP">Parser object</param>
        void ParseAndPrint(HTMLparser oP)
        {
            if (bReadLineDelay)
            {
                Console.WriteLine("Parsing HTML, will print each parsed chunk, press ENTER after each to continue");
            }

            // Chunk handed back by the parser. Per the original author's warning it must not be
            // destroyed before parsing ends, because HTMLparser re-uses this object.
            HTMLchunk oToken = null;

            // Content with non-Latin characters needs the correct encoding. It is often declared in
            // the HTML itself (META) and sometimes only in the HTTP Content-Type header; if it comes
            // from the header, the encoding would have to be set here as well.
            //
            // The flag records whether the encoding has been set: text seen BEFORE the META tag
            // (typically only TITLE) may need re-encoding once the encoding becomes known.
            bool bEncodingSet = false;

            // debug:
            oP.SetEncoding(System.Text.Encoding.GetEncoding("iso-8859-1"));

            // ParseNext() returning null marks the end of parsing
            while ((oToken = oP.ParseNext()) != null)
            {
                // raised by the cases below when the chunk's parameters must be listed after its header
                bool bListParams = false;

                switch (oToken.oType)
                {
                    // open tag, ie <a href="">: parameters are always listed
                    case HTMLchunkType.OpenTag:
                        Console.Write("Open tag: "+oToken.sTag);
                        bListParams = true;
                        break;

                    // close tag, ie </a>: parameters are listed only when there are some
                    case HTMLchunkType.CloseTag:
                        Console.Write("Closed tag: "+oToken.sTag);
                        bListParams = oToken.iParams > 0;
                        break;

                    // data between <script></script> tags; the raw HTML is requested explicitly
                    // when neither keep-flag is switched on
                    case HTMLchunkType.Script:
                        if (!(oP.bAutoKeepScripts || oP.bKeepRawHTML))
                        {
                            oP.SetRawHTML(oToken);
                        }

                        Console.Write(oToken.oHTML.Length > 0
                            ? "Script: "+oToken.oHTML
                            : "Script: [ignored for performance reasons]");

                        bListParams = oToken.iParams > 0;
                        break;

                    // HTML comment, the stuff between <!-- and -->; its text is printed only when
                    // one of the keep-flags is on (oP.SetRawHTML(oToken) could fetch it otherwise)
                    case HTMLchunkType.Comment:
                        if (oP.bKeepRawHTML || oP.bAutoKeepComments)
                        {
                            Console.Write("Comment: "+oToken.oHTML);
                        }
                        else
                        {
                            Console.Write("Comment: [ignored for performance reasons]");
                        }
                        break;

                    // normal text
                    case HTMLchunkType.Text:
                        // pure whitespace is skipped entirely (no prompt, no new line)
                        if (oP.bCompressWhiteSpaceBeforeTag && oToken.oHTML.Trim().Length == 0 && bReadLineDelay)
                        {
                            continue;
                        }

                        Console.Write("Text: '{0}'", oToken.oHTML);
                        break;
                }

                if (bListParams)
                {
                    // META tags may announce the page encoding, so they are passed on to the
                    // encoding handler before their parameters are printed
                    if (oToken.sTag.Length == 4 && oToken.sTag == "meta")
                    {
                        HandleMetaEncoding(oP, oToken, ref bEncodingSet);
                    }

                    // oToken.GenerateParamsHTML() is mentioned by the original author as a ready-made
                    // alternative; the code below shows how to reach individual parameter values
                    if (oToken.bHashMode)
                    {
                        if (oToken.oParams.Count > 0)
                        {
                            foreach (string sName in oToken.oParams.Keys)
                            {
                                string sText = oToken.oParams[sName].ToString();

                                if (sText.Length > 0)
                                {
                                    Console.Write(" {0}='{1}'", sName, sText);
                                }
                                else
                                {
                                    Console.Write(" {0}", sName);
                                }
                            }
                        }
                    }
                    else
                    {
                        // array based access to the parameters; described by the original author as
                        // less convenient but much cheaper in CPU terms than hash mode
                        for (int iIdx = 0; iIdx < oToken.iParams; iIdx++)
                        {
                            if (oToken.cParamChars[iIdx] != (byte)' ')
                            {
                                // re-use the very same quote character that surrounded the value
                                Console.Write(" {0}={1}{2}{1}", oToken.sParams[iIdx], (char)oToken.cParamChars[iIdx], oToken.sValues[iIdx]);
                            }
                            else if (oToken.sValues[iIdx].Length == 0)
                            {
                                Console.Write(" {0}", oToken.sParams[iIdx]);
                            }
                            else
                            {
                                Console.Write(" {0}={1}", oToken.sParams[iIdx], oToken.sValues[iIdx]);
                            }
                        }
                    }
                }

                if (bReadLineDelay)
                {
                    Console.ReadLine();
                }
                else
                {
                    Console.WriteLine("");
                }
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Iterates over every chunk the parser produces and prints what was found (tag, script,
        /// comment or text) to the console. With <c>bReadLineDelay</c> switched on the method pauses
        /// for ENTER after each printed chunk, otherwise it just ends the current line.
        /// </summary>
        /// <param name="oP">Parser object</param>
        void ParseAndPrint(HTMLparser oP)
        {
            if (bReadLineDelay)
            {
                Console.WriteLine("Parsing HTML, will print each parsed chunk, press ENTER after each to continue");
            }

            // Remembers whether the page encoding has been set. Encoding may come from a META tag
            // or from the HTTP Content-Type header (which would have to be applied here); text that
            // appeared before the META tag, typically TITLE, may need re-encoding afterwards.
            bool bEncodingSet = false;

            // debug:
            oP.SetEncoding(System.Text.Encoding.GetEncoding("iso-8859-1"));

            // The parser returns null once everything is parsed. The returned chunk must not be
            // destroyed while parsing is in progress: the original author warns that HTMLparser
            // re-uses the object.
            for (HTMLchunk oCurrent = oP.ParseNext(); oCurrent != null; oCurrent = oP.ParseNext())
            {
                HTMLchunkType eKind = oCurrent.oType;

                // true when the parameter list has to follow the header printed below
                bool bWithParams = false;

                if (eKind == HTMLchunkType.OpenTag)
                {
                    // open tag, ie <a href="">
                    Console.Write("Open tag: " + oCurrent.sTag);
                    bWithParams = true;
                }
                else if (eKind == HTMLchunkType.CloseTag)
                {
                    // close tag, ie </a>
                    Console.Write("Closed tag: " + oCurrent.sTag);
                    bWithParams = oCurrent.iParams > 0;
                }
                else if (eKind == HTMLchunkType.Script)
                {
                    // content between <script></script>; raw HTML is requested by hand unless
                    // one of the keep-flags is already on
                    bool bAlreadyKept = oP.bAutoKeepScripts || oP.bKeepRawHTML;
                    if (!bAlreadyKept)
                    {
                        oP.SetRawHTML(oCurrent);
                    }

                    if (oCurrent.oHTML.Length == 0)
                    {
                        Console.Write("Script: [ignored for performance reasons]");
                    }
                    else
                    {
                        Console.Write("Script: " + oCurrent.oHTML);
                    }

                    bWithParams = oCurrent.iParams > 0;
                }
                else if (eKind == HTMLchunkType.Comment)
                {
                    // HTML comment, ie text between <!-- and -->. Its text is shown only when a
                    // keep-flag is on; calling oP.SetRawHTML(oCurrent) would be the way to get it
                    // in the other case.
                    bool bHaveText = oP.bKeepRawHTML || oP.bAutoKeepComments;
                    if (!bHaveText)
                    {
                        Console.Write("Comment: [ignored for performance reasons]");
                    }
                    else
                    {
                        Console.Write("Comment: " + oCurrent.oHTML);
                    }
                }
                else if (eKind == HTMLchunkType.Text)
                {
                    // whitespace-only text is dropped silently and the loop moves to the next chunk
                    if (oP.bCompressWhiteSpaceBeforeTag && oCurrent.oHTML.Trim().Length == 0 && bReadLineDelay)
                    {
                        continue;
                    }

                    Console.Write("Text: '{0}'", oCurrent.oHTML);
                }

                if (bWithParams)
                {
                    // a META tag can carry the page encoding, so it goes through the encoding
                    // handler before its parameters are printed
                    if (oCurrent.sTag.Length == 4 && oCurrent.sTag == "meta")
                    {
                        HandleMetaEncoding(oP, oCurrent, ref bEncodingSet);
                    }

                    if (!oCurrent.bHashMode)
                    {
                        // array mode: names, values and quote characters live in parallel arrays
                        int iIndex = 0;
                        while (iIndex < oCurrent.iParams)
                        {
                            if (oCurrent.cParamChars[iIndex] == (byte)' ')
                            {
                                // value was not quoted in the source
                                if (oCurrent.sValues[iIndex].Length == 0)
                                {
                                    Console.Write(" {0}", oCurrent.sParams[iIndex]);
                                }
                                else
                                {
                                    Console.Write(" {0}={1}", oCurrent.sParams[iIndex], oCurrent.sValues[iIndex]);
                                }
                            }
                            else
                            {
                                // print the value wrapped in the same quote character it came with
                                Console.Write(" {0}={1}{2}{1}", oCurrent.sParams[iIndex], (char)oCurrent.cParamChars[iIndex], oCurrent.sValues[iIndex]);
                            }

                            iIndex++;
                        }
                    }
                    else if (oCurrent.oParams.Count > 0)
                    {
                        // hash mode: parameters are looked up by name
                        foreach (string sKeyName in oCurrent.oParams.Keys)
                        {
                            string sContent = oCurrent.oParams[sKeyName].ToString();

                            if (sContent.Length == 0)
                            {
                                Console.Write(" {0}", sKeyName);
                            }
                            else
                            {
                                Console.Write(" {0}='{1}'", sKeyName, sContent);
                            }
                        }
                    }
                }

                if (!bReadLineDelay)
                {
                    Console.WriteLine("");
                }
                else
                {
                    Console.ReadLine();
                }
            }
        }
Esempio n. 3
0
        /// <summary>
        /// Handles META tags that set page encoding: when the chunk is a META tag whose
        /// http-equiv attribute is "content-type" (or the rare "content-category"), the value of
        /// its "content" attribute is passed to the parser's SetEncoding.
        /// </summary>
        /// <param name="oP">HTML parser object that is used for parsing</param>
        /// <param name="oChunk">Parsed chunk that should contain tag META; it is converted to hash
        /// mode if it is not in that mode already</param>
        /// <param name="bEncodingSet">Your own flag that shows whether encoding was already set or not, if set
        /// once then it should not be changed - this is the logic applied by major browsers. It is
        /// switched to true when the parser accepts the new encoding</param>
        /// <returns>True if this was META tag setting Encoding (even when the encoding itself was
        /// left unchanged), false otherwise</returns>
        public static bool HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
        {
            // string equality already compares lengths first, so no hand-written length or
            // first-character pre-checks are needed; a missing chunk or tag is simply "not META"
            if (oChunk == null || oChunk.sTag != "meta")
            {
                return false;
            }

            // if we do not use hashmode already then we call conversion explicitly
            // this is slow, but METAs are very rare so performance penalty is low
            if (!oChunk.bHashMode)
            {
                oChunk.ConvertParamsToHash();
            }

            string sKey = oChunk.oParams["http-equiv"] as string;

            if (sKey == null)
            {
                return false;
            }

            // Ordinal, case-insensitive comparison: no lower-cased copy of the attribute value is
            // allocated and the result does not depend on the current culture.
            // "content-category" is a rare variant (appears to work in IE) reported to exist in
            // some pages by Martin Bächtold.
            bool bDeclaresContentType =
                string.Equals(sKey, "content-type", System.StringComparison.OrdinalIgnoreCase) ||
                string.Equals(sKey, "content-category", System.StringComparison.OrdinalIgnoreCase);

            if (!bDeclaresContentType)
            {
                return false;
            }

            // once encoding is set it should not be changed, even though there are web pages
            // out there that try to do exactly that
            if (!bEncodingSet)
            {
                // a broken META tag may lack the content part altogether
                string sData = oChunk.oParams["content"] as string;

                if (sData != null && oP.SetEncoding(sData))
                {
                    // from here on the caller may need to re-encode any text found so far,
                    // most likely just TITLE
                    bEncodingSet = true;
                }

                // When SetEncoding returns false the encoding string was most likely incorrect
                // or the machine lacks the codepage - a warning could be reported here.
            }

            return true;
        }