/// <summary>
/// Turns a string into the list of atoms that represents it. When
/// <paramref name="parseHTML"/> is false every character is handed straight to
/// addCharacter. Otherwise the text is run through Parsing.HTMLparser: text
/// chunks are un-escaped and added character by character, and each recognised
/// tag updates the open-tag list, the current color, the stack of open
/// hyperlinks, or the most recently added atom.
/// </summary>
/// <param name="inText">Raw text, optionally containing markup.</param>
/// <param name="parseHTML">True to interpret tags; false to treat the input as plain characters.</param>
/// <returns>The atoms produced for <paramref name="inText"/>.</returns>
private List<AAtom> decodeText(string inText, bool parseHTML)
{
    List<AAtom> outAtoms = new List<AAtom>();
    List<string> openTags = new List<string>();
    Color currentColor = Color.White;
    List<HREF_Attributes> openHREFs = new List<HREF_Attributes>();

    // If this is not HTML, do not parse tags: every character is emitted as-is.
    if (!parseHTML)
    {
        for (int i = 0; i < inText.Length; i++)
        {
            addCharacter(inText[i], outAtoms, openTags, currentColor, openHREFs);
        }
        return outAtoms;
    }

    Parsing.HTMLparser parser = new Parsing.HTMLparser(inText);
    Parsing.HTMLchunk chunk;
    while ((chunk = parser.ParseNext()) != null)
    {
        if (!(chunk.oHTML == string.Empty))
        {
            // This is a span of text; replace escape sequences before adding its characters.
            string span = EscapeCharacters.ReplaceEscapeCharacters(chunk.oHTML);
            for (int i = 0; i < span.Length; i++)
            {
                addCharacter(span[i], outAtoms, openTags, currentColor, openHREFs);
            }
            continue;
        }

        // This is a tag. Interpret it and edit the openTags list.
        bool readParams = true;
        bool isClosing = chunk.bClosure;
        switch (chunk.sTag)
        {
            case "font":
                // Nothing to open or close; only its parameters (e.g. color) matter.
                break;
            case "br":
                addCharacter('\n', outAtoms, openTags, currentColor, openHREFs);
                break;
            case "b":
                editOpenTags(openTags, isClosing, "b");
                break;
            case "i":
                editOpenTags(openTags, isClosing, "i");
                break;
            case "u":
                editOpenTags(openTags, isClosing, "u");
                break;
            case "outline":
                editOpenTags(openTags, isClosing, "outline");
                break;
            case "big":
                editOpenTags(openTags, isClosing, "big");
                break;
            case "basefont":
            case "medium":
                editOpenTags(openTags, isClosing, "medium");
                break;
            case "small":
                editOpenTags(openTags, isClosing, "small");
                break;
            case "center":
                editOpenTags(openTags, isClosing, "center");
                break;
            case "left":
                editOpenTags(openTags, isClosing, "left");
                break;
            case "right":
                editOpenTags(openTags, isClosing, "right");
                break;
            case "gumpimg":
                addGumpImage(outAtoms, openTags, openHREFs);
                break;
            case "span":
                addSpan(outAtoms, openTags, openHREFs);
                break;
            case "a":
                editOpenTags(openTags, isClosing, "a");
                if (isClosing)
                {
                    // Closing a hyperlink - restore the previous address, if any.
                    if (openHREFs.Count > 0)
                    {
                        openHREFs.RemoveAt(openHREFs.Count - 1);
                    }
                }
                else
                {
                    // Hyperlink with attributes; they are filled in by the parameter loop below.
                    openHREFs.Add(new HREF_Attributes());
                }
                break;
            default:
                // Unknown tag: emit its source text verbatim and ignore its parameters.
                readParams = false;
                for (int i = 0; i < chunk.iChunkLength; i++)
                {
                    addCharacter(inText[i + chunk.iChunkOffset], outAtoms, openTags, currentColor, openHREFs);
                }
                break;
        }

        if (!readParams)
        {
            continue;
        }

        foreach (DictionaryEntry param in chunk.oParams)
        {
            string key = param.Key.ToString();
            string value = param.Value.ToString();
            // Drop one trailing '/' (presumably left over from a self-closing tag - confirm against the parser).
            if (value.EndsWith("/"))
            {
                value = value.Substring(0, value.Length - 1);
            }

            switch (key)
            {
                case "href":
                    if (chunk.sTag == "a")
                    {
                        openHREFs[openHREFs.Count - 1].HREF = value;
                    }
                    else
                    {
                        Logger.Warn("href paramater used outside of an 'a' tag link. href is ignored in this case.");
                    }
                    break;

                case "color":
                case "hovercolor":
                case "activecolor":
                    {
                        string color = value;
                        // Check the length first: an empty value would otherwise throw on color[0].
                        if (color.Length > 0 && color[0] == '#')
                        {
                            color = color.Substring(1);
                        }
                        if (color.Length == 3 || color.Length == 6)
                        {
                            Color c = Utility.ColorFromHexString(color);
                            // Only "color" changes the color used for subsequent text.
                            if (key == "color")
                            {
                                currentColor = c;
                            }
                            if (chunk.sTag == "a")
                            {
                                switch (key)
                                {
                                    case "color":
                                        openHREFs[openHREFs.Count - 1].UpHue = UltimaData.HuesXNA.GetWebSafeHue(c);
                                        break;
                                    case "hovercolor":
                                        openHREFs[openHREFs.Count - 1].OverHue = UltimaData.HuesXNA.GetWebSafeHue(c);
                                        break;
                                    case "activecolor":
                                        openHREFs[openHREFs.Count - 1].DownHue = UltimaData.HuesXNA.GetWebSafeHue(c);
                                        break;
                                }
                            }
                        }
                        else
                        {
                            Logger.Warn("Improperly formatted color:" + color);
                        }
                    }
                    break;

                case "text-decoration":
                    switch (value)
                    {
                        case "none":
                            if (chunk.sTag == "a")
                            {
                                openHREFs[openHREFs.Count - 1].Underline = false;
                            }
                            break;
                        default:
                            Logger.Warn(string.Format("Unknown text-decoration:{0}", value));
                            break;
                    }
                    break;

                case "src":
                case "hoversrc":
                case "activesrc":
                    switch (chunk.sTag)
                    {
                        case "gumpimg":
                            {
                                // Malformed numbers are reported and skipped instead of aborting the whole decode.
                                int imageValue;
                                if (!int.TryParse(value, out imageValue))
                                {
                                    Logger.Warn("Improperly formatted " + key + ":" + value);
                                    break;
                                }
                                // The gumpimg case above has just appended the image atom.
                                ImageAtom image = (ImageAtom)outAtoms[outAtoms.Count - 1];
                                if (key == "src")
                                {
                                    image.Value = imageValue;
                                }
                                else if (key == "hoversrc")
                                {
                                    image.ValueOver = imageValue;
                                }
                                else if (key == "activesrc")
                                {
                                    image.ValueDown = imageValue;
                                }
                                break;
                            }
                        default:
                            Logger.Warn("src param encountered within " + chunk.sTag + " which does not use this param.");
                            break;
                    }
                    break;

                case "width":
                    switch (chunk.sTag)
                    {
                        case "gumpimg":
                        case "span":
                            {
                                int width;
                                if (int.TryParse(value, out width))
                                {
                                    outAtoms[outAtoms.Count - 1].Width = width;
                                }
                                else
                                {
                                    Logger.Warn("Improperly formatted width:" + value);
                                }
                                break;
                            }
                        default:
                            Logger.Warn("width param encountered within " + chunk.sTag + " which does not use this param.");
                            break;
                    }
                    break;

                case "height":
                    switch (chunk.sTag)
                    {
                        case "gumpimg":
                        case "span":
                            {
                                int height;
                                if (int.TryParse(value, out height))
                                {
                                    // Previously this wrote to Width, so a height attribute clobbered the width.
                                    // NOTE(review): relies on the atom type exposing a settable Height next to Width - confirm.
                                    outAtoms[outAtoms.Count - 1].Height = height;
                                }
                                else
                                {
                                    Logger.Warn("Improperly formatted height:" + value);
                                }
                                break;
                            }
                        default:
                            Logger.Warn("height param encountered within " + chunk.sTag + " which does not use this param.");
                            break;
                    }
                    break;

                default:
                    Logger.Warn(string.Format("Unknown parameter:{0}", key));
                    break;
            }
        }
    }

    return outAtoms;
}
/// <summary>
/// Turns a string into the list of atoms that represents it. When
/// <paramref name="parseHTML"/> is false every character is handed straight to
/// addCharacter. Otherwise the text is run through Parsing.HTMLparser: text
/// chunks are added character by character, and each recognised tag updates the
/// open-tag list, the current color, the stack of open hyperlinks, or the most
/// recently added atom.
/// </summary>
/// <param name="inText">Raw text, optionally containing markup.</param>
/// <param name="parseHTML">True to interpret tags; false to treat the input as plain characters.</param>
/// <returns>The atoms produced for <paramref name="inText"/>.</returns>
private List<AHTMLAtom> decodeText(string inText, bool parseHTML)
{
    List<AHTMLAtom> outAtoms = new List<AHTMLAtom>();
    List<string> openTags = new List<string>();
    Color currentColor = Color.White;
    List<HREF_Attributes> openHREFs = new List<HREF_Attributes>();

    // If this is not HTML, do not parse tags: every character is emitted as-is.
    if (!parseHTML)
    {
        for (int i = 0; i < inText.Length; i++)
        {
            addCharacter(inText[i], outAtoms, openTags, currentColor, openHREFs);
        }
        return outAtoms;
    }

    Parsing.HTMLparser parser = new Parsing.HTMLparser(inText);
    Parsing.HTMLchunk chunk;
    while ((chunk = parser.ParseNext()) != null)
    {
        if (!(chunk.oHTML == ""))
        {
            // This is text. Add its characters to the output list.
            for (int i = 0; i < chunk.oHTML.Length; i++)
            {
                addCharacter(chunk.oHTML[i], outAtoms, openTags, currentColor, openHREFs);
            }
            continue;
        }

        // This is a tag. Interpret it and edit the openTags list.
        bool readParams = true;
        bool isClosing = chunk.bClosure;
        switch (chunk.sTag)
        {
            case "font":
                // Nothing to open or close; only its parameters (e.g. color) matter.
                break;
            case "br":
                addCharacter('\n', outAtoms, openTags, currentColor, openHREFs);
                break;
            case "b":
                editOpenTags(openTags, isClosing, "b");
                break;
            case "i":
                editOpenTags(openTags, isClosing, "i");
                break;
            case "u":
                editOpenTags(openTags, isClosing, "u");
                break;
            case "big":
                editOpenTags(openTags, isClosing, "big");
                break;
            case "basefont":
            case "medium":
                editOpenTags(openTags, isClosing, "medium");
                break;
            case "small":
                editOpenTags(openTags, isClosing, "small");
                break;
            case "center":
                editOpenTags(openTags, isClosing, "center");
                break;
            case "left":
                editOpenTags(openTags, isClosing, "left");
                break;
            case "right":
                editOpenTags(openTags, isClosing, "right");
                break;
            case "gumpimg":
                addGumpImage(outAtoms, openTags, openHREFs);
                break;
            case "span":
                addSpan(outAtoms, openTags, openHREFs);
                break;
            case "a":
                editOpenTags(openTags, isClosing, "a");
                if (isClosing)
                {
                    // Closing a hyperlink - restore the previous address, if any.
                    if (openHREFs.Count > 0)
                    {
                        openHREFs.RemoveAt(openHREFs.Count - 1);
                    }
                }
                else
                {
                    // Hyperlink with attributes; they are filled in by the parameter loop below.
                    openHREFs.Add(new HREF_Attributes());
                }
                break;
            default:
                // Unknown tag: emit its source text verbatim and ignore its parameters.
                readParams = false;
                for (int i = 0; i < chunk.iChunkLength; i++)
                {
                    addCharacter(inText[i + chunk.iChunkOffset], outAtoms, openTags, currentColor, openHREFs);
                }
                break;
        }

        if (!readParams)
        {
            continue;
        }

        foreach (DictionaryEntry param in chunk.oParams)
        {
            string key = param.Key.ToString();
            string value = param.Value.ToString();
            // Drop one trailing '/' (presumably left over from a self-closing tag - confirm against the parser).
            if (value.EndsWith("/"))
            {
                value = value.Substring(0, value.Length - 1);
            }

            switch (key)
            {
                case "href":
                    if (chunk.sTag == "a")
                    {
                        openHREFs[openHREFs.Count - 1].HREF = value;
                    }
                    else
                    {
                        Logger.Warn("href paramater used outside of an 'a' tag link. href is ignored in this case.");
                    }
                    break;

                case "color":
                case "hovercolor":
                case "activecolor":
                    {
                        string color = value;
                        // Check the length first: an empty value would otherwise throw on color[0].
                        if (color.Length > 0 && color[0] == '#')
                        {
                            color = color.Substring(1);
                        }
                        if (color.Length == 3 || color.Length == 6)
                        {
                            Color c = Utility.ColorFromHexString(color);
                            // Only "color" changes the color used for subsequent text.
                            if (key == "color")
                            {
                                currentColor = c;
                            }
                            if (chunk.sTag == "a")
                            {
                                switch (key)
                                {
                                    case "color":
                                        openHREFs[openHREFs.Count - 1].UpHue = UltimaData.HuesXNA.GetWebSafeHue(c);
                                        break;
                                    case "hovercolor":
                                        openHREFs[openHREFs.Count - 1].OverHue = UltimaData.HuesXNA.GetWebSafeHue(c);
                                        break;
                                    case "activecolor":
                                        openHREFs[openHREFs.Count - 1].DownHue = UltimaData.HuesXNA.GetWebSafeHue(c);
                                        break;
                                }
                            }
                        }
                        else
                        {
                            Logger.Warn("Improperly formatted color:" + color);
                        }
                    }
                    break;

                case "text-decoration":
                    switch (value)
                    {
                        case "none":
                            if (chunk.sTag == "a")
                            {
                                openHREFs[openHREFs.Count - 1].Underline = false;
                            }
                            break;
                        default:
                            Logger.Warn(string.Format("Unknown text-decoration:{0}", value));
                            break;
                    }
                    break;

                case "src":
                case "hoversrc":
                case "activesrc":
                    switch (chunk.sTag)
                    {
                        case "gumpimg":
                            {
                                // Malformed numbers are reported and skipped instead of aborting the whole decode.
                                int imageValue;
                                if (!int.TryParse(value, out imageValue))
                                {
                                    Logger.Warn("Improperly formatted " + key + ":" + value);
                                    break;
                                }
                                // The gumpimg case above has just appended the image atom.
                                HTMLImageGump image = (HTMLImageGump)outAtoms[outAtoms.Count - 1];
                                if (key == "src")
                                {
                                    image.Value = imageValue;
                                }
                                else if (key == "hoversrc")
                                {
                                    image.ValueOver = imageValue;
                                }
                                else if (key == "activesrc")
                                {
                                    image.ValueDown = imageValue;
                                }
                                break;
                            }
                        default:
                            Logger.Warn("src param encountered within " + chunk.sTag + " which does not use this param.");
                            break;
                    }
                    break;

                case "width":
                    switch (chunk.sTag)
                    {
                        case "gumpimg":
                        case "span":
                            {
                                int width;
                                if (int.TryParse(value, out width))
                                {
                                    outAtoms[outAtoms.Count - 1].Width = width;
                                }
                                else
                                {
                                    Logger.Warn("Improperly formatted width:" + value);
                                }
                                break;
                            }
                        default:
                            Logger.Warn("width param encountered within " + chunk.sTag + " which does not use this param.");
                            break;
                    }
                    break;

                case "height":
                    switch (chunk.sTag)
                    {
                        case "gumpimg":
                        case "span":
                            {
                                int height;
                                if (int.TryParse(value, out height))
                                {
                                    // Previously this wrote to Width, so a height attribute clobbered the width.
                                    // NOTE(review): relies on the atom type exposing a settable Height next to Width - confirm.
                                    outAtoms[outAtoms.Count - 1].Height = height;
                                }
                                else
                                {
                                    Logger.Warn("Improperly formatted height:" + value);
                                }
                                break;
                            }
                        default:
                            Logger.Warn("height param encountered within " + chunk.sTag + " which does not use this param.");
                            break;
                    }
                    break;

                default:
                    Logger.Warn(string.Format("Unknown parameter:{0}", key));
                    break;
            }
        }
    }

    return outAtoms;
}
/// <summary>
/// Inits tag parser: stores the supplied collaborators and data buffer in this
/// instance and derives the heuristics length limit from the data length.
/// </summary>
/// <param name="p_oP">Parser object; stored in oP.</param>
/// <param name="p_oChunk">Chunk object; stored in oChunk.</param>
/// <param name="p_sText">Text buffer; stored in sText.</param>
/// <param name="p_bHTML">Byte array holding the HTML data; stored in bHTML.</param>
/// <param name="p_iDataLength">Length of the data; stored in iDataLength.</param>
/// <param name="p_oE">Entities object; stored in oE.</param>
/// <param name="p_oHE">Heuristics object; stored in oHE.</param>
internal void Init(
    HTMLparser p_oP,
    HTMLchunk p_oChunk,
    DynaString p_sText,
    byte[] p_bHTML,
    int p_iDataLength,
    HTMLentities p_oE,
    HTMLheuristics p_oHE)
{
    // Collaborating objects.
    oP = p_oP;
    oChunk = p_oChunk;
    oE = p_oE;
    oHE = p_oHE;

    // Buffers being worked on.
    sText = p_sText;
    bHTML = p_bHTML;
    iDataLength = p_iDataLength;

    // We don't want to be too close to the end of data when dealing with
    // heuristics, so keep a margin of MIN_DATA_SIZE_FOR_HEURISTICS.
    iMaxHeuDataLength = iDataLength - MIN_DATA_SIZE_FOR_HEURISTICS;
}
/// <summary>
/// Drops the references this instance holds to its buffers and collaborators.
/// Only the first call does any work; later calls return immediately.
/// </summary>
/// <param name="bDisposing">Accepted but not consulted by this implementation.</param>
private void Dispose(bool bDisposing)
{
    if (bDisposed)
    {
        return;
    }

    // Mark as disposed before releasing anything so a repeated call is a no-op.
    bDisposed = true;

    oP = null;
    oE = null;
    oChunk = null;
    sText = null;
    bHTML = null;
}
/// <summary>
/// Handles META tags that set page encoding.
/// </summary>
/// <param name="oP">HTML parser object that is used for parsing</param>
/// <param name="oChunk">Parsed chunk that should contain tag META</param>
/// <param name="bEncodingSet">Your own flag that shows whether encoding was already set or not, if set
/// once then it should not be changed - this is the logic applied by major browsers. It is set to true
/// here when the parser accepts the encoding found in the tag.</param>
/// <returns>True if this was a META tag whose http-equiv is content-type or content-category
/// (whether or not the encoding was actually changed), false otherwise</returns>
public static bool HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
{
    // Cheap length / first-character checks reject most tags before the full string compare.
    if (oChunk.sTag.Length != 4 || oChunk.sTag[0] != 'm' || oChunk.sTag != "meta")
    {
        return false;
    }

    // If we do not use hashmode already then we call conversion explicitly.
    // This is slow, but METAs are very rare so the performance penalty is low.
    if (!oChunk.bHashMode)
    {
        oChunk.ConvertParamsToHash();
    }

    string sKey = oChunk.oParams["http-equiv"] as string;
    if (sKey == null)
    {
        return false;
    }

    // Ordinal, case-insensitive comparison: no lower-cased copy of the key is
    // allocated and the result does not depend on the current culture.
    // "content-category" is a rare case (appears to work in IE) reported to
    // exist in some pages by Martin Bächtold.
    bool bIsEncodingMeta =
        string.Equals(sKey, "content-type", System.StringComparison.OrdinalIgnoreCase) ||
        string.Equals(sKey, "content-category", System.StringComparison.OrdinalIgnoreCase);
    if (!bIsEncodingMeta)
    {
        return false;
    }

    // We might have a charset here that hints at the necessity to decode the page.
    // Once encoding is set it should not be changed, even though there are
    // pages out there that try to do that.
    if (!bEncodingSet)
    {
        string sData = oChunk.oParams["content"] as string;

        // It is possible we have a broken META tag without the Content part.
        if (sData != null)
        {
            if (oP.SetEncoding(sData))
            {
                // Here you need to re-encode any text that you found so far -
                // most likely it will be just TITLE, the rest can be ignored anyway.
                bEncodingSet = true;
            }
            else
            {
                // Failed to set encoding - most likely the encoding string was
                // incorrect or the machine lacks codepages or something else;
                // might be a good idea to put a warning message here.
            }
        }
    }

    return true;
}
/// <summary>
/// Handles META tags that set page encoding.
/// </summary>
/// <param name="oP">HTML parser object that is used for parsing</param>
/// <param name="oChunk">Parsed chunk that should contain tag META</param>
/// <param name="bEncodingSet">Your own flag that shows whether encoding was already set or not, if set
/// once then it should not be changed - this is the logic applied by major browsers. It is set to true
/// here when the parser accepts the encoding found in the tag.</param>
/// <returns>True if this was a META tag whose http-equiv is content-type or content-category
/// (whether or not the encoding was actually changed), false otherwise</returns>
public static bool HandleMetaEncoding(HTMLparser oP,HTMLchunk oChunk,ref bool bEncodingSet)
{
    // Cheap length / first-character checks reject most tags before the full string compare.
    if(oChunk.sTag.Length!=4 || oChunk.sTag[0]!='m' || oChunk.sTag!="meta")
        return false;

    // If we do not use hashmode already then we call conversion explicitly.
    // This is slow, but METAs are very rare so the performance penalty is low.
    if(!oChunk.bHashMode)
        oChunk.ConvertParamsToHash();

    string sKey=oChunk.oParams["http-equiv"] as string;
    if(sKey==null)
        return false;

    // Ordinal, case-insensitive comparison: no lower-cased copy of the key is
    // allocated and the result does not depend on the current culture.
    // "content-category" is a rare case (appears to work in IE) reported to
    // exist in some pages by Martin Bächtold.
    bool bIsEncodingMeta=
        string.Equals(sKey,"content-type",System.StringComparison.OrdinalIgnoreCase) ||
        string.Equals(sKey,"content-category",System.StringComparison.OrdinalIgnoreCase);
    if(!bIsEncodingMeta)
        return false;

    // We might have a charset here that hints at the necessity to decode the page.
    // Once encoding is set it should not be changed, even though there are
    // pages out there that try to do that.
    if(!bEncodingSet)
    {
        string sData=oChunk.oParams["content"] as string;

        // It is possible we have a broken META tag without the Content part.
        if(sData!=null)
        {
            if(oP.SetEncoding(sData))
            {
                // Here you need to re-encode any text that you found so far -
                // most likely it will be just TITLE, the rest can be ignored anyway.
                bEncodingSet=true;
            }
            else
            {
                // Failed to set encoding - most likely the encoding string was
                // incorrect or the machine lacks codepages or something else;
                // might be a good idea to put a warning message here.
            }
        }
    }

    return true;
}