/// <summary>
/// Replaces HTML character references in <paramref name="html"/> with the
/// characters they denote and returns the resulting string.
/// </summary>
/// <remarks>
/// Three kinds of reference are recognised:
/// <list type="bullet">
/// <item><description>Hexadecimal (<c>&amp;#xHH;</c>): the terminating semicolon is required.</description></item>
/// <item><description>Decimal (<c>&amp;#DDD</c>): a terminating semicolon is consumed if present but is not required.</description></item>
/// <item><description>Named (<c>&amp;name</c>): at most 11 ASCII letters/digits after the ampersand are
/// considered; the name is resolved through <c>EntityEscaper.Code</c>.</description></item>
/// </list>
/// Numeric references whose value is 0 or greater than U+10FFFF are left in the
/// output unchanged. Values above U+FFFF are emitted as a UTF-16 surrogate pair.
/// <para>
/// When <paramref name="unEscapeMode"/> is <c>UnEscapeMode.Attribute</c> only the
/// complete name is looked up. In every other mode, proper prefixes of the name
/// are tried first (shortest first), so a known entity immediately followed by
/// more letters can still match. Presumably this mirrors lenient browser handling
/// of text such as <c>&amp;pounda</c>, which must not be applied to attribute
/// values because it would corrupt URLs; confirm against <c>EntityEscaper</c>.
/// </para>
/// </remarks>
/// <param name="html">Text that may contain character references; may be null.</param>
/// <param name="unEscapeMode">Selects strict (attribute) or lenient matching of named entities.</param>
/// <returns>The unescaped text, or null when <paramref name="html"/> is null.</returns>
public static string UnEscapeEntities(string html, UnEscapeMode unEscapeMode)
{
    if (html == null)
    {
        return null;
    }

    // Largest valid Unicode code point; anything above cannot be represented.
    const int MaxCodePoint = 0x10FFFF;

    StringBuilder output = new StringBuilder(html.Length);
    int len = html.Length;
    for (int i = 0; i < len; i++)
    {
        char c0 = html[i];
        if (c0 == '&')
        {
            if (i + 1 < len)
            {
                char c1 = html[i + 1];
                switch (c1)
                {
                    case '#':
                    {
                        if (i + 2 < len)
                        {
                            char c2 = html[i + 2];
                            switch (c2)
                            {
                                case 'x':
                                case 'X':
                                {
                                    // Hexadecimal reference: digits start after "&#x".
                                    bool semicolonTerminated = false;
                                    bool outOfRange = false;
                                    int charVal = 0;
                                    int j;
                                    for (j = i + 3; j < len; j++)
                                    {
                                        int hexVal = ToHexValue(html[j]);
                                        if (hexVal == -1)
                                        {
                                            // j stays on the semicolon; the outer loop's i++ steps past it.
                                            if (html[j] == ';')
                                            {
                                                semicolonTerminated = true;
                                            }
                                            break;
                                        }

                                        // Stop accumulating once out of range so the int cannot overflow,
                                        // but keep scanning to find the end of the digit run.
                                        if (!outOfRange)
                                        {
                                            charVal = (charVal * 16) + hexVal;
                                            outOfRange = charVal > MaxCodePoint;
                                        }
                                    }

                                    if (semicolonTerminated && charVal != 0 && !outOfRange)
                                    {
                                        i = j;
                                        if (charVal > 0xFFFF)
                                        {
                                            // Supplementary-plane code point: needs a surrogate pair.
                                            output.Append(char.ConvertFromUtf32(charVal));
                                        }
                                        else
                                        {
                                            output.Append((char)charVal);
                                        }
                                        continue;
                                    }

                                    // Zero, unterminated or out of range: emit the text literally.
                                    break;
                                }

                                case '0':
                                case '1':
                                case '2':
                                case '3':
                                case '4':
                                case '5':
                                case '6':
                                case '7':
                                case '8':
                                case '9':
                                {
                                    // Decimal reference: digits start after "&#".
                                    bool outOfRange = false;
                                    int charVal = 0;
                                    int j;
                                    for (j = i + 2; j < len; j++)
                                    {
                                        char c = html[j];
                                        if (c < '0' || c > '9')
                                        {
                                            // Consume an optional terminating semicolon.
                                            if (c == ';')
                                            {
                                                ++j;
                                            }
                                            break;
                                        }

                                        if (!outOfRange)
                                        {
                                            charVal = (charVal * 10) + (c - '0');
                                            outOfRange = charVal > MaxCodePoint;
                                        }
                                    }

                                    if (charVal != 0 && !outOfRange)
                                    {
                                        // j is one past the reference; the outer loop's i++ restores it.
                                        i = j - 1;
                                        if (charVal > 0xFFFF)
                                        {
                                            output.Append(char.ConvertFromUtf32(charVal));
                                        }
                                        else
                                        {
                                            output.Append((char)charVal);
                                        }
                                        continue;
                                    }

                                    // Zero or out of range: emit the text literally.
                                    break;
                                }
                            }
                        }
                        break;
                    }

                    default:
                    {
                        // Named reference: collect up to 11 ASCII letters/digits after '&'.
                        int j;
                        int end = Math.Min(len, i + 12);
                        for (j = i + 1; j < end; j++)
                        {
                            char c = html[j];
                            if (c == ';' || (!(c >= 'a' && c <= 'z') && !(c >= 'A' && c <= 'Z') && !(c >= '0' && c <= '9')))
                            {
                                break;
                            }
                        }

                        string entityRef = html.Substring(i + 1, j - (i + 1));
                        if (unEscapeMode != UnEscapeMode.Attribute)
                        {
                            // k = number of characters in entityRef that we are using
                            int k, code = -1;
                            for (k = 1; k < entityRef.Length; k++)
                            {
                                if (-1 != (code = EntityEscaper.Code(entityRef.Substring(0, k), true)))
                                {
                                    break;
                                }
                            }

                            // No prefix matched: fall back to the complete name.
                            if (code == -1)
                            {
                                code = EntityEscaper.Code(entityRef, false);
                            }

                            if (code != -1)
                            {
                                output.Append((char)code);
                                i += 1 + k;

                                // Bound by the string length, not the scan window, so a semicolon
                                // sitting exactly on the window boundary is still consumed.
                                if (i < len && html[i] == ';')
                                {
                                    ++i;
                                }
                                --i;
                                continue;
                            }
                        }
                        else
                        {
                            int code = EntityEscaper.Code(entityRef, false);
                            if (code != -1)
                            {
                                output.Append((char)code);
                                i += 1 + entityRef.Length;
                                if (i < len && html[i] == ';')
                                {
                                    ++i;
                                }
                                --i;
                                continue;
                            }
                        }
                        break;
                    }
                }
            }
        }

        // Not part of a recognised reference: copy the character through.
        output.Append(c0);
    }

    return output.ToString();
}
/// <summary>
/// Decodes HTML character references found in <paramref name="html"/> and
/// returns the decoded text.
/// </summary>
/// <remarks>
/// <para>
/// Hexadecimal references (<c>&amp;#xHH;</c>) must end with a semicolon. Decimal
/// references (<c>&amp;#DDD</c>) may omit it. A numeric value of 0, or one above
/// U+10FFFF, is not decoded and the original text is kept. Values above U+FFFF
/// are written as a UTF-16 surrogate pair.
/// </para>
/// <para>
/// For named references, at most 11 ASCII letters/digits following the ampersand
/// are examined and resolved through <c>EntityEscaper.Code</c>. With
/// <c>UnEscapeMode.Attribute</c> only the whole name is looked up. With any other
/// mode, proper prefixes of the name are tried first, shortest first, before the
/// whole name. Presumably the prefix matching imitates lenient browser behaviour
/// on body text and is disabled for attributes because it would break URLs;
/// verify against <c>EntityEscaper</c>.
/// </para>
/// </remarks>
/// <param name="html">Input text, possibly containing character references; may be null.</param>
/// <param name="unEscapeMode">Chooses between attribute-safe and lenient named-entity matching.</param>
/// <returns>The decoded text, or null when <paramref name="html"/> is null.</returns>
public static string UnEscapeEntities(string html, UnEscapeMode unEscapeMode)
{
    if (html == null)
    {
        return null;
    }

    // Highest code point Unicode defines; larger values are not decodable.
    const int MaxCodePoint = 0x10FFFF;

    int len = html.Length;
    StringBuilder output = new StringBuilder(len);

    for (int i = 0; i < len; i++)
    {
        char c0 = html[i];

        if (c0 == '&' && i + 1 < len)
        {
            if (html[i + 1] == '#')
            {
                if (i + 2 < len)
                {
                    char c2 = html[i + 2];
                    if (c2 == 'x' || c2 == 'X')
                    {
                        // Hexadecimal reference: digits begin after "&#x".
                        bool semicolonTerminated = false;
                        bool outOfRange = false;
                        int charVal = 0;
                        int j;
                        for (j = i + 3; j < len; j++)
                        {
                            int hexVal = ToHexValue(html[j]);
                            if (hexVal == -1)
                            {
                                semicolonTerminated = html[j] == ';';
                                break;
                            }

                            // Freeze the value once it leaves the Unicode range so the
                            // accumulator cannot overflow on long digit runs.
                            if (!outOfRange)
                            {
                                charVal = (charVal * 16) + hexVal;
                                outOfRange = charVal > MaxCodePoint;
                            }
                        }

                        if (semicolonTerminated && charVal != 0 && !outOfRange)
                        {
                            // j rests on the semicolon; the loop increment moves past it.
                            i = j;
                            if (charVal > 0xFFFF)
                            {
                                // Outside the BMP: a single char cannot hold it.
                                output.Append(char.ConvertFromUtf32(charVal));
                            }
                            else
                            {
                                output.Append((char)charVal);
                            }
                            continue;
                        }
                    }
                    else if (c2 >= '0' && c2 <= '9')
                    {
                        // Decimal reference: digits begin after "&#".
                        bool outOfRange = false;
                        int charVal = 0;
                        int j;
                        for (j = i + 2; j < len; j++)
                        {
                            char c = html[j];
                            if (c < '0' || c > '9')
                            {
                                // An optional terminating semicolon belongs to the reference.
                                if (c == ';')
                                {
                                    ++j;
                                }
                                break;
                            }

                            if (!outOfRange)
                            {
                                charVal = (charVal * 10) + (c - '0');
                                outOfRange = charVal > MaxCodePoint;
                            }
                        }

                        if (charVal != 0 && !outOfRange)
                        {
                            // j is one past the reference; compensate for the loop increment.
                            i = j - 1;
                            if (charVal > 0xFFFF)
                            {
                                output.Append(char.ConvertFromUtf32(charVal));
                            }
                            else
                            {
                                output.Append((char)charVal);
                            }
                            continue;
                        }
                    }
                }
            }
            else
            {
                // Named reference: gather up to 11 ASCII letters/digits after '&'.
                int end = Math.Min(len, i + 12);
                int j;
                for (j = i + 1; j < end; j++)
                {
                    char c = html[j];
                    bool isAsciiAlphaNumeric =
                        (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9');
                    if (!isAsciiAlphaNumeric)
                    {
                        // Covers ';' as well as any other delimiter.
                        break;
                    }
                }

                string entityRef = html.Substring(i + 1, j - (i + 1));
                if (unEscapeMode != UnEscapeMode.Attribute)
                {
                    // k = number of characters in entityRef that we are using
                    int k;
                    int code = -1;
                    for (k = 1; k < entityRef.Length; k++)
                    {
                        code = EntityEscaper.Code(entityRef.Substring(0, k), true);
                        if (code != -1)
                        {
                            break;
                        }
                    }

                    // No prefix resolved: try the complete name.
                    if (code == -1)
                    {
                        code = EntityEscaper.Code(entityRef, false);
                    }

                    if (code != -1)
                    {
                        output.Append((char)code);
                        i += 1 + k;

                        // Check against the string length rather than the scan window so a
                        // semicolon exactly at the window boundary is still consumed.
                        if (i < len && html[i] == ';')
                        {
                            ++i;
                        }
                        --i;
                        continue;
                    }
                }
                else
                {
                    int code = EntityEscaper.Code(entityRef, false);
                    if (code != -1)
                    {
                        output.Append((char)code);
                        i += 1 + entityRef.Length;
                        if (i < len && html[i] == ';')
                        {
                            ++i;
                        }
                        --i;
                        continue;
                    }
                }
            }
        }

        // No reference was decoded at this position: copy the character as-is.
        output.Append(c0);
    }

    return output.ToString();
}