/// <summary>
/// Initialises the tag parser by storing the objects and data it will work with
/// </summary>
/// <param name="p_oP">HTML parser object, stored in oP</param>
/// <param name="p_oChunk">Chunk object, stored in oChunk</param>
/// <param name="p_sText">Text buffer, stored in sText</param>
/// <param name="p_bHTML">Byte array, stored in bHTML (presumably the HTML data being parsed - confirm against caller)</param>
/// <param name="p_iDataLength">Data length, stored in iDataLength and used to derive iMaxHeuDataLength</param>
/// <param name="p_oE">Entities object, stored in oE</param>
/// <param name="p_oHE">Heuristics object, stored in oHE</param>
internal void Init(HTMLparser p_oP, HTMLchunk p_oChunk, DynaString p_sText, byte[] p_bHTML, int p_iDataLength, HTMLentities p_oE, HTMLheuristics p_oHE)
{
    // data buffer and its length
    bHTML = p_bHTML;
    iDataLength = p_iDataLength;

    // heuristics must stop short of the end of data, so keep a safety margin
    iMaxHeuDataLength = p_iDataLength - MIN_DATA_SIZE_FOR_HEURISTICS;

    // collaborating objects
    oP = p_oP;
    oChunk = p_oChunk;
    sText = p_sText;
    oE = p_oE;
    oHE = p_oHE;
}
/// <summary>
/// Drops the references held by this object. Only the first call has any effect.
/// </summary>
/// <param name="bDisposing">Not consulted by this implementation</param>
private void Dispose(bool bDisposing)
{
    // already disposed - nothing left to release
    if (bDisposed)
    {
        return;
    }

    bDisposed = true;

    // clear references so that this object no longer keeps them alive
    oP = null;
    oE = null;
    sText = null;
    oChunk = null;
    bHTML = null;
}
/// <summary>
/// Handles META tags that set page encoding
/// </summary>
/// <param name="oP">HTML parser object that is used for parsing</param>
/// <param name="oChunk">Parsed chunk that should contain tag META</param>
/// <param name="bEncodingSet">Your own flag that shows whether encoding was already set or not, if set
/// once then it should not be changed - this is the logic applied by major browsers</param>
/// <returns>True if this was a META tag whose http-equiv is content-type (or content-category),
/// even when the encoding itself could not be applied; false otherwise</returns>
public static bool HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
{
    // only META tags are of interest; a missing tag name is treated as "not META"
    if (!string.Equals(oChunk.sTag, "meta", System.StringComparison.Ordinal))
    {
        return false;
    }

    // if we do not use hashmode already then we call conversion explicitly
    // this is slow, but METAs are very rare so performance penalty is low
    if (!oChunk.bHashMode)
    {
        oChunk.ConvertParamsToHash();
    }

    string sKey = oChunk.oParams["http-equiv"] as string;
    if (sKey == null)
    {
        return false;
    }

    // case-insensitive ordinal comparison: no lower-cased copy of the key is allocated
    // and the result does not depend on the current culture
    // "content-category" is a rare case (appears to work in IE) reported to exist
    // in some pages by Martin Bächtold
    bool bEncodingMeta =
        string.Equals(sKey, "content-type", System.StringComparison.OrdinalIgnoreCase) ||
        string.Equals(sKey, "content-category", System.StringComparison.OrdinalIgnoreCase);

    if (!bEncodingMeta)
    {
        return false;
    }

    // we might have charset here that may hint at necessity to decode page
    // once encoding is set it should not be changed, even though there are
    // web pages out there that try to do that
    if (!bEncodingSet)
    {
        // it is possible we have broken META tag without Content part
        string sData = oChunk.oParams["content"] as string;

        if (sData != null && oP.SetEncoding(sData))
        {
            // here the caller needs to reencode any text found so far
            // most likely it will be just TITLE, the rest can be ignored anyway
            bEncodingSet = true;
        }
        // otherwise setting the encoding failed - most likely encoding string was
        // incorrect or the machine lacks codepages - a warning could be logged here
    }

    return true;
}
/// <summary>
/// Asks the given parser for its next chunk and hands it back unchanged.
/// </summary>
/// <param name="parser">Parser to read the next chunk from.</param>
/// <returns>Whatever <c>parser.ParseNext()</c> returns.</returns>
private HTMLchunk ParseNext(HTMLparser parser)
{
    return parser.ParseNext();
}
/// <summary>
/// Parses an HTML string into a tree of block elements. Text spans and unrecognised tags become
/// character atoms, recognised tags either update the style parser, add an atom, or open a nested block.
/// </summary>
/// <param name="html">The HTML source text to parse.</param>
/// <returns>The root block (tagged "root") to which all parsed atoms were added, directly or via nested blocks.</returns>
private BlockElement ParseHtmlToBlocks(string html)
{
    IResourceProvider provider = ServiceRegistry.GetService<IResourceProvider>();
    StyleParser styles = new StyleParser(provider);

    // this is the root! currentBlock tracks the block that new atoms are added to.
    BlockElement root = new BlockElement("root", styles.Style);
    BlockElement currentBlock = root;

    if (m_Parser == null)
    {
        m_Parser = new HTMLparser();
    }
    m_Parser.Init(html);

    HTMLchunk chunk;
    while ((chunk = ParseNext(m_Parser)) != null)
    {
        if (chunk.oHTML != string.Empty)
        {
            // This is a span of text.
            AppendTextAtoms(currentBlock, styles, chunk.oHTML);
            continue;
        }

        // This is a tag. Interpret the tag and edit the open tags.
        // It may also be an atom, in which case we should add it to the current block!
        if (chunk.bClosure && !chunk.bEndClosure)
        {
            styles.CloseOneTag(chunk);
            if (currentBlock.Tag == chunk.sTag)
            {
                // Step out of the block that was just closed, but never step out of a block that
                // has no parent (e.g. a stray "</root>"): that would leave no block to add atoms to.
                BlockElement parent = currentBlock.Parent;
                if (parent != null)
                {
                    currentBlock = parent;
                }
            }
            continue;
        }

        AElement atom = null;
        bool isBlockTag = false;

        switch (chunk.sTag)
        {
            // ======================================================================
            // Anchor elements are added to the open tag collection as HREFs.
            // ======================================================================
            case "a":
                styles.InterpretHREF(chunk, null);
                break;

            // ======================================================================
            // These html elements are ignored.
            // ======================================================================
            case "body":
                break;

            // ======================================================================
            // These html elements are blocks but can also have styles
            // ======================================================================
            case "center":
            case "left":
            case "right":
            case "div":
                atom = new BlockElement(chunk.sTag, styles.Style);
                styles.ParseTag(chunk, atom);
                isBlockTag = true;
                break;

            // ======================================================================
            // These html elements are styles, and are added to the StyleParser.
            // ======================================================================
            case "span":
            case "font":
            case "b":
            case "i":
            case "u":
            case "outline":
            case "big":
            case "basefont":
            case "medium":
            case "small":
                styles.ParseTag(chunk, null);
                break;

            // ======================================================================
            // These html elements are added as atoms only. They cannot impart style
            // onto other atoms.
            // ======================================================================
            case "br":
                atom = new CharacterElement(styles.Style, '\n');
                break;

            case "gumpimg":
                // draw a gump image
                atom = new ImageElement(styles.Style, ImageElement.ImageTypes.UI);
                styles.ParseTag(chunk, atom);
                break;

            case "itemimg":
                // draw a static image
                atom = new ImageElement(styles.Style, ImageElement.ImageTypes.Item);
                styles.ParseTag(chunk, atom);
                break;

            // ======================================================================
            // Every other element is not interpreted, but rendered as text. Easy!
            // ======================================================================
            default:
                AppendTextAtoms(currentBlock, styles, html.Substring(chunk.iChunkOffset, chunk.iChunkLength));
                break;
        }

        if (atom != null)
        {
            currentBlock.AddAtom(atom);

            // an opening block tag that is not self-closed becomes the new current block
            if (isBlockTag && !chunk.bEndClosure)
            {
                currentBlock = (BlockElement)atom;
            }
        }

        styles.CloseAnySoloTags();
    }

    return root;
}

/// <summary>
/// Replaces escape characters in the text, then adds every resulting character to the block
/// as a CharacterElement using the style parser's current style.
/// </summary>
private static void AppendTextAtoms(BlockElement block, StyleParser styles, string text)
{
    // make sure to replace escape characters!
    text = EscapeCharacters.ReplaceEscapeCharacters(text);

    // styles.Style is read per character, exactly as the inline loops did
    for (int i = 0; i < text.Length; i++)
    {
        block.AddAtom(new CharacterElement(styles.Style, text[i]));
    }
}
/// <summary>
/// Handles META tags that set page encoding
/// </summary>
/// <param name="oP">HTML parser object that is used for parsing</param>
/// <param name="oChunk">Parsed chunk that should contain tag META</param>
/// <param name="bEncodingSet">Your own flag that shows whether encoding was already set or not, if set
/// once then it should not be changed - this is the logic applied by major browsers</param>
/// <returns>True if this was a META tag whose http-equiv is content-type (or content-category),
/// even when the encoding itself could not be applied; false otherwise</returns>
public static bool HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
{
    // a chunk without a tag name, or with any tag other than META, is not handled here
    if (!string.Equals(oChunk.sTag, "meta", System.StringComparison.Ordinal))
    {
        return false;
    }

    // if we do not use hashmode already then we call conversion explicitly
    // this is slow, but METAs are very rare so performance penalty is low
    if (!oChunk.bHashMode)
    {
        oChunk.ConvertParamsToHash();
    }

    string sKey = oChunk.oParams["http-equiv"] as string;

    // compare ignoring case without lower-casing: avoids an allocation and any
    // dependence on the current culture
    // "content-category" is a rare case (appears to work in IE) reported to exist
    // in some pages by Martin Bächtold
    if (sKey == null ||
        (!string.Equals(sKey, "content-type", System.StringComparison.OrdinalIgnoreCase) &&
         !string.Equals(sKey, "content-category", System.StringComparison.OrdinalIgnoreCase)))
    {
        return false;
    }

    // we might have charset here that may hint at necessity to decode page
    // once encoding is set it should not be changed, even though some pages try
    if (!bEncodingSet)
    {
        // it is possible we have broken META tag without Content part
        string sData = oChunk.oParams["content"] as string;

        if (sData != null)
        {
            if (oP.SetEncoding(sData))
            {
                // here the caller needs to reencode any text found so far
                // most likely it will be just TITLE, the rest can be ignored anyway
                bEncodingSet = true;
            }
            // else: failed to set encoding - most likely encoding string was incorrect
            // or the machine lacks codepages - might be a good place for a warning
        }
    }

    return true;
}