예제 #1
0
        /// <summary>
        /// Tries to pick up the document encoding from a META chunk, unless an encoding
        /// has already been chosen, and reports to the console when that attempt fails.
        /// </summary>
        /// <param name="oP">Parser passed through to <c>HTMLparser.HandleMetaEncoding</c></param>
        /// <param name="oChunk">Chunk passed through to <c>HTMLparser.HandleMetaEncoding</c></param>
        /// <param name="bEncodingSet">True once an encoding has been set; may be updated by the call</param>
        private void HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
        {
            // The first encoding wins (this mirrors what major browsers do), so the static
            // handler is only consulted while no encoding is set. Short-circuit evaluation
            // keeps the order: check flag, call handler, re-check flag after the call.
            bool bSetFailed = !bEncodingSet
                && HTMLparser.HandleMetaEncoding(oP, oChunk, ref bEncodingSet)
                && !bEncodingSet;

            if (bSetFailed)
            {
                Console.WriteLine("Failed to set encoding from META: {0}", oChunk.GenerateHTML());
            }
        }
예제 #2
0
        /// <summary>
        /// Tests parser by parsing a chunk of data, regenerating HTML from the parsed chunks
        /// and comparing the result to the expected HTML: any discrepancy fails the assertion.
        /// </summary>
        /// <param name="bData">Data to parse</param>
        /// <param name="sExpectedHTML">Expected HTML as it gets generated by this very function;
        /// when null nothing is parsed and nothing is asserted</param>
        void TestParser(byte[] bData, string sExpectedHTML)
        {
            if (sExpectedHTML == null)
            {
                return;
            }

            StringBuilder oSB = new StringBuilder(512);

            bool bEncodingSet = false;

            oP.Init(bData);

            // ok lets parse HTML and save the HTML that we view back into string
            HTMLchunk oChunk;

            // we don't want to use hashes as they would change order in which params are made
            oP.SetChunkHashMode(false);

            // we parse until returned oChunk is null indicating we reached end of parsing
            while ((oChunk = oP.ParseNext()) != null)
            {
                switch (oChunk.oType)
                {
                case HTMLchunkType.OpenTag:

                    oSB.AppendFormat("<{0}", oChunk.sTag);

                // shared tail for every chunk that carries params: the CloseTag and Script
                // cases jump here so that params and the closing bracket are emitted the same way
                PrintParams:

                    // only the first META encoding is honoured, so stop asking once it is set
                    if (oChunk.sTag == "meta" && !bEncodingSet)
                    {
                        // possible Title re-encoding should happen here if bEncodingSet becomes true
                        HTMLparser.HandleMetaEncoding(oP, oChunk, ref bEncodingSet);
                    }

                    // oChunk.GenerateParamsHTML() would do the job for you - the long code below
                    // is left to demonstrate how to access individual param values
                    if (oChunk.bHashMode)
                    {
                        foreach (string sParam in oChunk.oParams.Keys)
                        {
                            string sValue = oChunk.oParams[sParam].ToString();

                            if (sValue.Length > 0)
                            {
                                oSB.AppendFormat(" {0}='{1}'", sParam, oP.ChangeToEntities(sValue));
                            }
                            else
                            {
                                oSB.AppendFormat(" {0}", sParam);
                            }
                        }
                    }
                    else
                    {
                        // this is alternative method of getting params -- it may look less convenient
                        // but it saves a LOT of CPU ticks while parsing. It makes sense when you only need
                        // params for a few
                        for (int i = 0; i < oChunk.iParams; i++)
                        {
                            string sValue = oChunk.sValues[i];

                            if (oChunk.bEntities)
                            {
                                sValue = oP.ChangeToEntities(sValue);
                            }

                            if (oChunk.cParamChars[i] == (byte)' ')
                            {
                                // space is used as the quote char when the param was not quoted
                                if (oChunk.sValues[i].Length == 0)
                                {
                                    oSB.AppendFormat(" {0}", oChunk.sParams[i]);
                                }
                                else
                                {
                                    oSB.AppendFormat(" {0}={1}", oChunk.sParams[i], sValue);
                                }
                            }
                            else
                            {
                                // here we can use exactly the same single/double quotes as they
                                // were used on params
                                oSB.AppendFormat(" {0}={1}{2}{1}", oChunk.sParams[i], (char)oChunk.cParamChars[i], sValue);
                            }
                        }
                    }

                    if (oChunk.bClosure && !oP.bAutoMarkClosedTagsWithParamsAsOpen)
                    {
                        oSB.Append("/>");
                    }
                    else
                    {
                        oSB.Append(">");
                    }
                    break;

                // matched close tag, ie </a>
                case HTMLchunkType.CloseTag:

                    if (oChunk.iParams > 0)
                    {
                        oSB.AppendFormat("<{0}", oChunk.sTag);
                        goto PrintParams;
                    }

                    if (oChunk.bEndClosure)
                    {
                        oSB.AppendFormat("<{0}/>", oChunk.sTag);
                    }
                    else
                    {
                        oSB.AppendFormat("</{0}>", oChunk.sTag);
                    }

                    break;

                // NOTE: you have to call finalisation because it is not done for Scripts or comments
                // Matched data between <script></script> tags
                case HTMLchunkType.Script:

                    if (!oP.bAutoKeepScripts && !oP.bKeepRawHTML)
                    {
                        oP.SetRawHTML(oChunk);
                    }

                    // Append, not AppendFormat: the script text is data, not a format string.
                    // Script bodies routinely contain '{' and '}', which AppendFormat would try
                    // to interpret as format items and fail with FormatException.
                    oSB.Append(oChunk.oHTML);

                    if (oChunk.iParams > 0)
                    {
                        goto PrintParams;
                    }

                    break;

                // NOTE: you have to call finalisation because it is not done for Scripts or comments
                // matched HTML comment, that's stuff between <!-- and -->
                case HTMLchunkType.Comment:
                    if (!oP.bAutoExtractBetweenTagsOnly)
                    {
                        oSB.Append(oChunk.oHTML);
                    }
                    else
                    {
                        oSB.AppendFormat("<!--{0}-->", oChunk.oHTML);
                    }
                    break;

                // matched normal text
                case HTMLchunkType.Text:

                    // skip pure whitespace that we are not really interested in
                    if (oP.bCompressWhiteSpaceBeforeTag && oChunk.oHTML.Trim().Length == 0)
                    {
                        continue;
                    }

                    oSB.Append(oChunk.bEntities ? oP.ChangeToEntities(oChunk.oHTML) : oChunk.oHTML);
                    break;
                }
            }

            // now compare parsed HTML with the one we expect
            Assert.AreEqual(sExpectedHTML, oSB.ToString());
        }