/// <summary>
/// This function will be called when &amp; is found, and it will peek forward
/// to check if an entity follows. On success (indicated by a non-zero return
/// value) the pointer is left at the first byte after the entity; in every
/// other case the pointer is put back to where it was on entry.
/// </summary>
/// <param name="bHTML">Buffer that is being parsed</param>
/// <param name="iCurPos">Position of the first byte to inspect (the byte that may be '#' or the
/// first letter of an entity name); only moved forward when an entity was decoded</param>
/// <param name="iDataLength">Scanning stops once the position reaches this value</param>
/// <returns>Char (not byte) that corresponds to the entity or 0 if it was not entity</returns>
internal char CheckForEntity(byte[] bHTML, ref int iCurPos, int iDataLength)
{
    if (!bDecodeEntities && !bMiniEntities)
    {
        return((char)0);
    }

    // remembered so that every "not an entity" outcome rewinds to exactly the same place
    int iStart = iCurPos;

    int iChars = 0;
    byte cChar;

    // if true it means we are getting hex or decimal value of the byte
    bool bCharCode = false;
    bool bCharCodeHex = false;

    // iFrom/iEntLen delimit the entity name or the digits (without '#' and 'x')
    int iEntLen = 0;
    int iFrom = iCurPos;

    try
    {
        while (iCurPos < iDataLength)
        {
            cChar = bHTML[iCurPos++];

            if (++iChars <= 2)
            {
                if (iChars == 1)
                {
                    // the first byte for numbers should be #
                    if (cChar == '#')
                    {
                        iFrom++;
                        bCharCode = true;
                        continue;
                    }
                }
                else if (bCharCode && cChar == 'x')
                {
                    // "#x" prefix: hexadecimal reference, the prefix is not part of the digits
                    iFrom++;
                    bCharCodeHex = true;
                    continue;
                }
            }

            // Terminate on:
            // 1) ; - proper end of entity
            // 2) number 10-based entity but current byte is not a number
            // TODO: browsers appear to be lax about ; requirement for end of entity
            // we should really do the same and treat whitespace as termination of entity
            if (cChar == ';' || (bCharCode && !bCharCodeHex && !(cChar >= '0' && cChar <= '9')))
            {
                if (!bCharCode)
                {
                    // lets try speculative quick lookup using just first 2 chars
                    // this should be successful in almost all cases thus removing need for
                    // expensive creation of a string
                    if (iEntLen > 1)
                    {
                        object oQuick = oEntities.GetLikelyPresentValue(bHTML[iFrom], bHTML[iFrom + 1]);

                        if (oQuick != null)
                        {
                            return((char)((int)oQuick));
                        }
                    }

                    if (iEntLen >= iMinEntityLen && iEntLen <= iMaxEntityLen)
                    {
                        // entity names are plain ASCII, so decode with a fixed encoding rather
                        // than the machine-dependent default code page
                        string sEntity = Encoding.ASCII.GetString(bHTML, iFrom, iEntLen);

                        object oChar = oEntities.GetLikelyPresentValue(sEntity);

                        if (oChar != null)
                        {
                            return((char)((int)oChar));
                        }
                    }
                }
                else if (iEntLen > 0 && !bMiniEntities)
                {
                    // (if mini entities mode is set then all numerics are ignored)
                    if (!bCharCodeHex)
                    {
                        // digits are parsed straight from the buffer - no string needed
                        char cDecimal = (char)ParseUInt(bHTML, iFrom, iEntLen);

                        // 0 means "not an entity" to the caller, so it must not be
                        // reported with the pointer already moved past the reference
                        if (cDecimal != (char)0)
                        {
                            // we have to backdown one char in case when entity did not end with ;
                            // otherwise we will lose next char in the stream, this correction suggested by Kurt Carlson!
                            if (cChar != ';')
                            {
                                iCurPos--;
                            }

                            return(cDecimal);
                        }
                    }
                    else
                    {
                        string sHex = Encoding.ASCII.GetString(bHTML, iFrom, iEntLen);
                        int iChar;

#if DOTNET20
                        // we want to avoid exceptions if possible as they are very slow
                        if (!int.TryParse(sHex, System.Globalization.NumberStyles.HexNumber, null, out iChar))
                        {
                            break;
                        }
#else
                        // NOTE: this may fail due to wrong data format, in which case the
                        // catch below makes us return 0 and the entity will be ignored
                        iChar = int.Parse(sHex, NumberStyles.HexNumber);
#endif

                        char cHex = (char)iChar;

                        if (cHex != (char)0)
                        {
                            return(cHex);
                        }
                    }
                }

                // a terminator was reached and nothing matched: this is not an entity.
                // Scanning further would only feed the terminator (or other garbage)
                // into the lookups and the number parser.
                break;
            }

            // as soon as entity length exceed max length of entity known to us
            // we break up the loop and return nothing found
            if (iEntLen > iMaxEntityLen)
            {
                break;
            }

            iEntLen++;
        }
    }
    catch
    {
        // deliberate best effort: malformed input is treated as "not an entity"
    }

    // if we have not found squat, then we will need to put pointer back
    // to where it was before this function was called
    iCurPos = iStart;

    return((char)0);
}
/// <summary>
/// This function will decode any entities found in a string - not fast!
/// Named entities must end with ';'. Decimal references (&amp;#65;) end at ';', at the
/// first non-digit, or at the end of the string. Hex references (&amp;#x41;) end at ';'
/// or at the end of the string. Anything that cannot be decoded is copied as is.
/// </summary>
/// <param name="sData">Text that may contain entities; null is returned unchanged</param>
/// <returns>Possibly decoded string; the original string if parsing failed with an exception</returns>
internal static string DecodeEntities(string sData)
{
    // nothing to do for null, empty or ampersand-free input
    if (sData == null || sData.IndexOf('&') < 0)
    {
        return(sData);
    }

    char cChar;
    StringBuilder oSB = new StringBuilder(sData.Length);
    string sEntity = "";

    try
    {
        for (int i = 0; i < sData.Length; i++)
        {
            cChar = sData[i];

            // ordinary char, or an ampersand that is the very last char
            if (cChar != '&' || (i + 1 >= sData.Length))
            {
                oSB.Append(cChar);
                continue;
            }

            // if true it means we are getting hex or decimal value of the byte
            bool bCharCode = false;
            bool bCharCodeHex = false;

            // iFrom/iEntLen delimit the entity name or the digits (without '#' and 'x')
            int iEntLen = 0;
            int iChars = 0;
            int j = i + 1;
            int iFrom = i + 1;

            for (; j < sData.Length; j++)
            {
                cChar = sData[j];

                if (++iChars <= 2)
                {
                    if (iChars == 1)
                    {
                        // the first char for numbers should be #
                        if (cChar == '#')
                        {
                            iFrom++;
                            bCharCode = true;
                            continue;
                        }
                    }
                    else if (bCharCode && cChar == 'x' && !bCharCodeHex)
                    {
                        iFrom++;
                        bCharCodeHex = true;
                        continue;
                    }
                }

                bool bLastChar = j + 1 >= sData.Length;
                bool bDigit = cChar >= '0' && cChar <= '9';

                // Terminate on:
                // 1) ; - proper end of entity
                // 2) number 10-based entity but current char is not a number
                // 3) numeric entity that runs into the end of the string
                if (cChar == ';' || (bCharCode && !bCharCodeHex && !bDigit) || (bCharCode && bLastChar))
                {
                    // at the end of the string the current char is part of the number itself,
                    // but only if it can be: a non-digit that ends a decimal reference
                    // is ordinary text and must not be swallowed
                    if (bLastChar && cChar != ';' && (bCharCodeHex || bDigit))
                    {
                        iEntLen++;
                    }

                    if (bCharCode)
                    {
                        int iChar = 0;
                        bool bSuccess = false;

                        // a reference without any digits ("&#;", "&#x;", "&#+5;") is not an entity
                        if (iEntLen > 0)
                        {
                            sEntity = sData.Substring(iFrom, iEntLen);

                            try
                            {
                                if (!bCharCodeHex)
                                {
                                    iChar = (int)uint.Parse(sEntity);
                                }
                                else
                                {
                                    iChar = int.Parse(sEntity, NumberStyles.HexNumber);
                                }

                                bSuccess = true;
                            }
                            catch
                            {
                                // some numbers might not be parsed correctly so we will ignore them
                            }
                        }

                        if (bSuccess)
                        {
                            oSB.Append((char)iChar);

                            // move back once when we got decimal number done without ; at the end
                            // of it - Firefox and IE do it this way
                            if (!bCharCodeHex && cChar != ';' && !bDigit)
                            {
                                j--;
                            }
                        }
                        else
                        {
                            // add entity as is - probably broken or maybe not entity at all
                            oSB.Append('&');
                            j = i;
                        }

                        break;
                    }

                    // lets try speculative quick lookup using just first 2 chars
                    // this should be successful in almost all cases thus removing need for
                    // expensive creation of a string
                    if (iEntLen > 1)
                    {
                        // make sure we aint at the end of string, and that both chars really
                        // fit into a byte - truncating wider chars would look up unrelated bytes
                        if (i + 2 < sData.Length && sData[i + 1] <= byte.MaxValue && sData[i + 2] <= byte.MaxValue)
                        {
                            object oQuick = oAllEntities.GetLikelyPresentValue((byte)sData[i + 1], (byte)sData[i + 2]);

                            if (oQuick != null)
                            {
                                oSB.Append((char)((int)oQuick));
                                break;
                            }
                        }
                    }

                    if (iEntLen >= iAllMinEntityLen && iEntLen <= iAllMaxEntityLen)
                    {
                        sEntity = sData.Substring(iFrom, iEntLen);

                        object oChar = oAllEntities.GetLikelyPresentValue(sEntity);

                        if (oChar != null)
                        {
                            oSB.Append((char)((int)oChar));
                            break;
                        }

                        // this will force to add entity as is - probably broken
                        // or maybe not entity at all
                        iEntLen = iAllMaxEntityLen + 1;
                    }
                }

                // as soon as entity length exceed max length of entity known to us
                // (or the string ends) we give up and keep the text as it was
                if (iEntLen > iAllMaxEntityLen || bLastChar)
                {
                    // append char that triggered entity thingy in the first place
                    // and resume right after it
                    oSB.Append('&');
                    j = i;
                    break;
                }

                iEntLen++;
            }

            // the loop ran off the end without a decision: the string ends with "&#"
            // or "&#x" - keep that text instead of dropping it
            if (j >= sData.Length)
            {
                oSB.Append('&');
                j = i;
            }

            i = j;
        }
    }
    catch (Exception oEx)
    {
        Console.WriteLine("Entity parsing exception: " + oEx.ToString());
        return(sData);
    }

    return(oSB.ToString());
}