/// <summary>
/// Attempts to pick up the document encoding from a META chunk, unless an encoding
/// has already been set.
/// </summary>
/// <param name="oP">Parser instance passed through to <c>HTMLparser.HandleMetaEncoding</c></param>
/// <param name="oChunk">Chunk to inspect; its generated HTML is printed if the attempt fails</param>
/// <param name="bEncodingSet">
/// In: true when an encoding was already chosen (nothing is done in that case).
/// Out: whatever <c>HTMLparser.HandleMetaEncoding</c> leaves in it.
/// </param>
private void HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
{
    // Major browsers treat the first encoding they see as the correct one,
    // so only attempt to set an encoding while none has been set yet.
    if (!bEncodingSet)
    {
        bool bHandled = HTMLparser.HandleMetaEncoding(oP, oChunk, ref bEncodingSet);

        // NOTE(review): a true result with the flag still unset is reported as a failure;
        // presumably it means an encoding declaration was found but could not be applied.
        if (bHandled && !bEncodingSet)
        {
            Console.WriteLine("Failed to set encoding from META: {0}", oChunk.GenerateHTML());
        }
    }
}
/// <summary>
/// Tests parser by parsing chunk of data and then generating HTML on the basis of parsing
/// and comparing this to expected HTML: in case of any discrepancies assertion will be fired
/// </summary>
/// <remarks>
/// Script content is appended verbatim rather than being used as a composite format string,
/// so scripts containing '{' or '}' no longer throw <see cref="System.FormatException"/>
/// or lose doubled braces.
/// </remarks>
/// <param name="bData">Data to parse</param>
/// <param name="sExpectedHTML">
/// Expected HTML as it gets generated by this very function; when null nothing is parsed
/// and no assertion is made
/// </param>
void TestParser(byte[] bData, string sExpectedHTML)
{
    if (sExpectedHTML == null)
    {
        return;
    }

    StringBuilder oSB = new StringBuilder(512);
    bool bEncodingSet = false;

    oP.Init(bData);

    // ok lets parse HTML and save the HTML that we view back into string
    HTMLchunk oChunk;

    // we don't want to use hashes as they would change order in which params are made
    oP.SetChunkHashMode(false);

    // we parse until returned oChunk is null indicating we reached end of parsing
    while ((oChunk = oP.ParseNext()) != null)
    {
        switch (oChunk.oType)
        {
            case HTMLchunkType.OpenTag:

                oSB.AppendFormat("<{0}", oChunk.sTag);

            // shared tail that prints params and the closing bracket; the CloseTag and
            // Script cases jump here when their chunk carries params
            PrintParams:

                if (oChunk.sTag.Length == 4 && oChunk.sTag == "meta")
                {
                    // only the first encoding found is used
                    if (!bEncodingSet)
                    {
                        if (HTMLparser.HandleMetaEncoding(oP, oChunk, ref bEncodingSet))
                        {
                            if (bEncodingSet)
                            {
                                // possible Title re-encoding should happen here
                            }
                        }
                    }
                }

                // commented out call to code that will do the job for you - long code below
                // is left to demonstrate how to access individual param values
                // Console.WriteLine(oChunk.GenerateParamsHTML());

                if (oChunk.bHashMode)
                {
                    if (oChunk.oParams.Count > 0)
                    {
                        foreach (string sParam in oChunk.oParams.Keys)
                        {
                            string sValue = oChunk.oParams[sParam].ToString();

                            if (sValue.Length > 0)
                            {
                                oSB.AppendFormat(" {0}='{1}'", sParam, oP.ChangeToEntities(sValue));
                            }
                            else
                            {
                                oSB.AppendFormat(" {0}", sParam);
                            }
                        }
                    }
                }
                else
                {
                    // this is alternative method of getting params -- it may look less convenient
                    // but it saves a LOT of CPU ticks while parsing. It makes sense when you only
                    // need params for a few tags
                    if (oChunk.iParams > 0)
                    {
                        for (int i = 0; i < oChunk.iParams; i++)
                        {
                            // here we can use exactly the same single/double quotes as they
                            // were used on params
                            string sValue = oChunk.sValues[i];

                            if (oChunk.bEntities)
                            {
                                sValue = oP.ChangeToEntities(sValue);
                            }

                            switch (oChunk.cParamChars[i])
                            {
                                // a space as the quote char is printed without any quotes
                                case (byte)' ':
                                    if (oChunk.sValues[i].Length == 0)
                                    {
                                        oSB.AppendFormat(" {0}", oChunk.sParams[i]);
                                    }
                                    else
                                    {
                                        oSB.AppendFormat(" {0}={1}", oChunk.sParams[i], sValue);
                                    }
                                    break;

                                // any other char is used as the quote on both sides of the value
                                default:
                                    oSB.AppendFormat(" {0}={1}{2}{1}", oChunk.sParams[i], (char)oChunk.cParamChars[i], sValue);
                                    break;
                            }
                        }
                    }
                }

                if (oChunk.bClosure && !oP.bAutoMarkClosedTagsWithParamsAsOpen)
                {
                    oSB.Append("/>");
                }
                else
                {
                    oSB.Append(">");
                }

                break;

            // matched close tag, ie </a>
            case HTMLchunkType.CloseTag:

                if (oChunk.iParams > 0)
                {
                    // a close tag carrying params is printed like an open tag
                    oSB.AppendFormat("<{0}", oChunk.sTag);
                    goto PrintParams;
                }
                else
                {
                    if (oChunk.bEndClosure)
                    {
                        oSB.AppendFormat("<{0}/>", oChunk.sTag);
                    }
                    else
                    {
                        oSB.AppendFormat("</{0}>", oChunk.sTag);
                    }
                }

                break;

            // NOTE: you have to call finalisation because it is not done for Scripts or comments
            // Matched data between <script></script> tags
            case HTMLchunkType.Script:

                if (!oP.bAutoKeepScripts && !oP.bKeepRawHTML)
                {
                    oP.SetRawHTML(oChunk);
                }

                // Append verbatim: script text is data, not a format string. Passing it to
                // AppendFormat would throw FormatException on '{' / '}' in the script body.
                oSB.Append(oChunk.oHTML);

                if (oChunk.iParams > 0)
                {
                    goto PrintParams;
                }

                break;

            // NOTE: you have to call finalisation because it is not done for Scripts or comments
            // matched HTML comment, that's stuff between <!-- and -->
            case HTMLchunkType.Comment:

                if (!oP.bAutoExtractBetweenTagsOnly)
                {
                    oSB.Append(oChunk.oHTML);
                }
                else
                {
                    // NOTE(review): presumably oHTML holds only the comment's inner text in
                    // this mode, so the delimiters are added back here - confirm against parser
                    oSB.AppendFormat("<!--{0}-->", oChunk.oHTML);
                }

                break;

            // matched normal text
            case HTMLchunkType.Text:

                // skip pure whitespace that we are not really interested in
                if (oP.bCompressWhiteSpaceBeforeTag && oChunk.oHTML.Trim().Length == 0)
                {
                    continue;
                }

                oSB.Append(oChunk.bEntities ? oP.ChangeToEntities(oChunk.oHTML) : oChunk.oHTML);

                break;
        }
    }

    // now compare parsed HTML with the one we expect
    Assert.AreEqual(sExpectedHTML, oSB.ToString());
}