/// <summary>
/// Compares two IDN strings for equivalence. Both inputs are converted with
/// ConvertIDNToASCII using the same options, and the two ASCII results are
/// handed to CompareCaseInsensitiveASCII. According to the IDN RFC, two
/// labels are considered equal if and only if their ASCII forms (obtained by
/// applying toASCII) match using a case-insensitive ASCII comparison, and two
/// domain names match if and only if all labels match regardless of whether
/// the label separators match.
/// </summary>
///
/// <param name="s1">First IDN string as UCharacterIterator</param>
/// <param name="s2">Second IDN string as UCharacterIterator</param>
/// <param name="options">A bit set of options:
/// - IDNA.DEFAULT: use default options, i.e., do not process unassigned
///   code points and do not use STD3 ASCII rules. If unassigned code points
///   are found the operation fails with a parse exception.
/// - IDNA.ALLOW_UNASSIGNED: unassigned values can be converted to ASCII for
///   query operations. If this option is set, unassigned code points in the
///   input are treated as normal Unicode code points.
/// - IDNA.USE_STD3_RULES: use STD3 ASCII rules for host name syntax
///   restrictions. If this option is set and the input does not satisfy
///   STD3 rules, the operation fails with a parse exception.</param>
/// <returns>The result of CompareCaseInsensitiveASCII on the two ASCII forms:
/// 0 if the strings are equal, a value &gt; 0 if s1 &gt; s2 and a value
/// &lt; 0 if s1 &lt; s2</returns>
/// <exception cref="ArgumentException">If s1 or s2 is null</exception>
/// <exception cref="StringPrepParseException">If either input cannot be converted to ASCII</exception>
/// @stable ICU 2.8
// TODO: optimize
public static int Compare(UCharacterIterator s1, UCharacterIterator s2, int options) {
    if (s1 != null && s2 != null) {
        // Reduce both names to their ASCII form, then compare those forms.
        StringBuilder firstAscii = ConvertIDNToASCII(s1.GetText(), options);
        StringBuilder secondAscii = ConvertIDNToASCII(s2.GetText(), options);
        return CompareCaseInsensitiveASCII(firstAscii, secondAscii);
    }

    throw new ArgumentException("One of the source buffers is null");
}
/// <summary>
/// Convenience function that implements the IDNToUnicode operation as
/// defined in the IDNA RFC. This operation is done on complete domain names,
/// e.g: "www.example.com". The name is cut into labels at the positions
/// reported by GetSeparatorIndex, each label is passed through
/// ConvertToUnicode, and the original separator characters are copied to the
/// output unchanged.
/// <b>Note:</b> IDNA RFC specifies that a conformant application should
/// divide a domain name into separate labels, decide whether to apply
/// allowUnassigned and useSTD3ASCIIRules on each, and then convert. This
/// function does not offer that level of granularity. The options once set
/// will apply to all labels in the domain name.
/// </summary>
///
/// <param name="src">The input string to be processed</param>
/// <param name="options">A bit set of options:
/// - IDNA.DEFAULT: use default options, i.e., do not process unassigned
///   code points and do not use STD3 ASCII rules. If unassigned code points
///   are found the operation fails with a parse exception.
/// - IDNA.ALLOW_UNASSIGNED: unassigned values can be converted to ASCII for
///   query operations. If this option is set, unassigned code points in the
///   input are treated as normal Unicode code points.
/// - IDNA.USE_STD3_RULES: use STD3 ASCII rules for host name syntax
///   restrictions. If this option is set and the input does not satisfy
///   STD3 rules, the operation fails with a parse exception.</param>
/// <returns>A StringBuilder holding the converted domain name</returns>
/// <exception cref="StringPrepParseException">If an empty label is found
/// before the end of the input, or if the output is longer than
/// MAX_DOMAIN_NAME_LENGTH</exception>
/// @stable ICU 2.8
public static StringBuilder ConvertIDNToUnicode(String src, int options) {
    char[] chars = src.ToCharArray();
    StringBuilder output = new StringBuilder();

    // Start of the label currently being processed.
    int labelStart = 0;
    while (true) {
        int labelEnd = GetSeparatorIndex(chars, labelStart, chars.Length);
        bool isLastLabel = (labelEnd == chars.Length);

        String label = new String(chars, labelStart, labelEnd - labelStart);
        // An empty label is only tolerated at the very end of the input.
        if (!isLastLabel && label.Length == 0) {
            throw new StringPrepParseException(
                "Found zero length lable after NamePrep.",
                IBM.ICU.Text.StringPrepParseException.ZERO_LENGTH_LABEL);
        }

        UCharacterIterator labelIter = IBM.ICU.Text.UCharacterIterator.GetInstance(label);
        output.Append(ConvertToUnicode(labelIter, options));

        if (isLastLabel) {
            break;
        }

        // Unlike the ToASCII operation we don't normalize the label
        // separators: copy the one found in the input.
        output.Append(chars[labelEnd]);
        // Continue right after the separator.
        labelStart = labelEnd + 1;
    }

    if (output.Length > MAX_DOMAIN_NAME_LENGTH) {
        throw new StringPrepParseException(
            "The output exceed the max allowed length.",
            IBM.ICU.Text.StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR);
    }
    return output;
}
/// <summary>
/// Convenience function that implements the IDNToASCII operation as defined
/// in the IDNA RFC. This operation is done on complete domain names, e.g:
/// "www.example.com". The name is cut into labels at the positions reported
/// by GetSeparatorIndex, each label is passed through ConvertToASCII, and
/// every separator is written to the output as FULL_STOP. It is important to
/// note that this operation can fail. If it fails, then the input domain
/// name cannot be used as an Internationalized Domain Name and the
/// application should have methods defined to deal with the failure.
/// <b>Note:</b> IDNA RFC specifies that a conformant application should
/// divide a domain name into separate labels, decide whether to apply
/// allowUnassigned and useSTD3ASCIIRules on each, and then convert. This
/// function does not offer that level of granularity. The options once set
/// will apply to all labels in the domain name.
/// </summary>
///
/// <param name="src">The input string to be processed</param>
/// <param name="options">A bit set of options:
/// - IDNA.DEFAULT: use default options, i.e., do not process unassigned
///   code points and do not use STD3 ASCII rules. If unassigned code points
///   are found the operation fails with a parse exception.
/// - IDNA.ALLOW_UNASSIGNED: unassigned values can be converted to ASCII for
///   query operations. If this option is set, unassigned code points in the
///   input are treated as normal Unicode code points.
/// - IDNA.USE_STD3_RULES: use STD3 ASCII rules for host name syntax
///   restrictions. If this option is set and the input does not satisfy
///   STD3 rules, the operation fails with a parse exception.</param>
/// <returns>A StringBuilder holding the converted domain name</returns>
/// <exception cref="StringPrepParseException">If a label cannot be
/// converted, or if the output is longer than MAX_DOMAIN_NAME_LENGTH</exception>
/// @stable ICU 2.8
public static StringBuilder ConvertIDNToASCII(String src, int options) {
    char[] chars = src.ToCharArray();
    StringBuilder output = new StringBuilder();

    // Start of the label currently being processed.
    int labelStart = 0;
    while (true) {
        int labelEnd = GetSeparatorIndex(chars, labelStart, chars.Length);
        bool isLastLabel = (labelEnd == chars.Length);

        String label = new String(chars, labelStart, labelEnd - labelStart);
        // Skip only the empty label that follows a trailing (root) separator;
        // every other label, empty or not, goes through ConvertToASCII.
        if (label.Length != 0 || !isLastLabel) {
            UCharacterIterator labelIter = IBM.ICU.Text.UCharacterIterator.GetInstance(label);
            output.Append(ConvertToASCII(labelIter, options));
        }

        if (isLastLabel) {
            break;
        }

        // Continue right after the separator, which is emitted as FULL_STOP
        // whatever character it was in the input.
        labelStart = labelEnd + 1;
        output.Append((char)FULL_STOP);
    }

    if (output.Length > MAX_DOMAIN_NAME_LENGTH) {
        throw new StringPrepParseException(
            "The output exceed the max allowed length.",
            IBM.ICU.Text.StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR);
    }
    return output;
}
/*
 * boolean isLabelSeparator(int ch){ int result = getCodePointValue(ch); if(
 * (result & 0x07) == LABEL_SEPARATOR){ return true; } return false; }
 */
/*
 * 1) Map -- For each character in the input, check if it has a mapping and,
 * if so, replace it with its mapping.
 *
 * 2) Normalize -- Possibly normalize the result of step 1 using Unicode
 * normalization.
 *
 * 3) Prohibit -- Check for any characters that are not allowed in the
 * output. If any are found, return an error.
 *
 * 4) Check bidi -- Possibly check for right-to-left characters, and if any
 * are found, make sure that the whole string satisfies the requirements for
 * bidirectional strings. If the string does not satisfy the requirements
 * for bidirectional strings, return an error. [Unicode3.2] defines several
 * bidirectional categories; each character has one bidirectional category
 * assigned to it. For the purposes of the requirements below, an
 * "RandALCat character" is a character that has Unicode bidirectional
 * categories "R" or "AL"; an "LCat character" is a character that has
 * Unicode bidirectional category "L". Note that there are many characters
 * which fall in neither of the above definitions; Latin digits (<U+0030>
 * through <U+0039>) are examples of this because they have bidirectional
 * category "EN".
 *
 * In any profile that specifies bidirectional character handling, all three
 * of the following requirements MUST be met:
 *
 * 1) The characters in section 5.8 MUST be prohibited.
 *
 * 2) If a string contains any RandALCat character, the string MUST NOT
 * contain any LCat character.
 *
 * 3) If a string contains any RandALCat character, a RandALCat character
 * MUST be the first character of the string, and a RandALCat character MUST
 * be the last character of the string.
 */
/// <summary>
/// Prepare the input buffer for use in applications with the given profile.
/// The input is mapped with Map, normalized with Normalize when doNFKC is
/// set, and the result is then scanned for prohibited code points and, when
/// checkBiDi is set, checked against BiDi requirements 2 and 3 described in
/// the comment above (the order defined by RFC 3454).
/// </summary>
///
/// <param name="src">A UCharacterIterator object containing the source string</param>
/// <param name="options">A bit set of options:
/// - StringPrep.NONE: prohibit processing of unassigned code points in the
///   input.
/// - StringPrep.ALLOW_UNASSIGNED: treat the unassigned code points in the
///   input as normal Unicode code points.</param>
/// <returns>A StringBuilder containing the mapped (and, if enabled,
/// normalized) output</returns>
/// <exception cref="StringPrepParseException">If a prohibited code point is
/// found or the BiDi requirements are violated</exception>
/// @stable ICU 2.8
public StringBuilder Prepare(UCharacterIterator src, int options) {
    // Steps 1 and 2: map, then normalize only if the profile requests it.
    StringBuilder mapped = Map(src, options);
    StringBuilder prepared;
    if (doNFKC) {
        prepared = Normalize(mapped);
    } else {
        prepared = mapped;
    }

    UCharacterIterator scan = IBM.ICU.Text.UCharacterIterator.GetInstance(prepared);
    StringPrep.Values values = new StringPrep.Values();

    // CHAR_DIRECTION_COUNT doubles as the "no direction recorded yet" marker.
    int unsetDirection = IBM.ICU.Lang.UCharacterEnums.ECharacterDirection.CHAR_DIRECTION_COUNT;
    int firstDirection = unsetDirection;
    int lastDirection = unsetDirection;
    int lastRtlIndex = -1;
    int lastLtrIndex = -1;
    bool sawRtl = false;
    bool sawLtr = false;

    // Steps 3 and 4 (data collection): one pass over the prepared text.
    for (int codePoint = scan.NextCodePoint();
            codePoint != IBM.ICU.Text.UForwardCharacterIterator_Constants.DONE;
            codePoint = scan.NextCodePoint()) {
        char trieValue = GetCodePointValue(codePoint);
        GetValues(trieValue, values);

        if (values.type == PROHIBITED) {
            throw new StringPrepParseException(
                "A prohibited code point was found in the input",
                IBM.ICU.Text.StringPrepParseException.PROHIBITED_ERROR,
                scan.GetText(), values.value_ren);
        }

        if (!checkBiDi) {
            continue;
        }

        // After the loop this holds the direction of the last code point.
        lastDirection = bdp.GetClass(codePoint);
        if (firstDirection == unsetDirection) {
            firstDirection = lastDirection;
        }
        if (lastDirection == IBM.ICU.Lang.UCharacterEnums.ECharacterDirection.LEFT_TO_RIGHT) {
            sawLtr = true;
            lastLtrIndex = scan.GetIndex() - 1;
        }
        if (lastDirection == IBM.ICU.Lang.UCharacterEnums.ECharacterDirection.RIGHT_TO_LEFT
                || lastDirection == IBM.ICU.Lang.UCharacterEnums.ECharacterDirection.RIGHT_TO_LEFT_ARABIC) {
            sawRtl = true;
            lastRtlIndex = scan.GetIndex() - 1;
        }
    }

    // Both BiDi requirements only matter once an R/AL character was seen.
    if (checkBiDi && sawRtl) {
        bool firstIsRtl =
            firstDirection == IBM.ICU.Lang.UCharacterEnums.ECharacterDirection.RIGHT_TO_LEFT
            || firstDirection == IBM.ICU.Lang.UCharacterEnums.ECharacterDirection.RIGHT_TO_LEFT_ARABIC;
        bool lastIsRtl =
            lastDirection == IBM.ICU.Lang.UCharacterEnums.ECharacterDirection.RIGHT_TO_LEFT
            || lastDirection == IBM.ICU.Lang.UCharacterEnums.ECharacterDirection.RIGHT_TO_LEFT_ARABIC;

        // Requirement 2: no L character alongside R/AL characters.
        // Requirement 3: the first and the last character must both be R/AL.
        if (sawLtr || !firstIsRtl || !lastIsRtl) {
            // Report whichever recorded position is further into the text.
            int errorIndex = lastLtrIndex;
            if (lastRtlIndex > lastLtrIndex) {
                errorIndex = lastRtlIndex;
            }
            throw new StringPrepParseException(
                "The input does not conform to the rules for BiDi code points.",
                IBM.ICU.Text.StringPrepParseException.CHECK_BIDI_ERROR,
                scan.GetText(), errorIndex);
        }
    }

    return prepared;
}
/// <summary>
/// Performs the mapping step on every code point delivered by the iterator
/// and collects the result in a new StringBuilder. For each code point the
/// profile data is looked up with GetCodePointValue and GetValues; depending
/// on the resulting type the code point is rejected (unassigned and not
/// allowed), replaced by a run of characters from mappingData, shifted by a
/// delta, dropped, or copied unchanged.
/// </summary>
///
/// <param name="iter">Iterator over the text to map</param>
/// <param name="options">Bit set of options; only ALLOW_UNASSIGNED is examined here</param>
/// <returns>A StringBuilder containing the mapped text</returns>
/// <exception cref="StringPrepParseException">If an unassigned code point is
/// found and ALLOW_UNASSIGNED is not set</exception>
private StringBuilder Map(UCharacterIterator iter, int options) {
    StringPrep.Values values = new StringPrep.Values();
    StringBuilder mapped = new StringBuilder();
    bool allowUnassigned = (options & ALLOW_UNASSIGNED) > 0;

    for (int codePoint = iter.NextCodePoint();
            codePoint != IBM.ICU.Text.UForwardCharacterIterator_Constants.DONE;
            codePoint = iter.NextCodePoint()) {
        char trieValue = GetCodePointValue(codePoint);
        GetValues(trieValue, values);

        // Unassigned code points are an error unless explicitly allowed.
        if (values.type == UNASSIGNED && !allowUnassigned) {
            throw new StringPrepParseException(
                "An unassigned code point was found in the input",
                IBM.ICU.Text.StringPrepParseException.UNASSIGNED_ERROR,
                iter.GetText(), iter.GetIndex());
        }

        int outputCodePoint = codePoint;
        if (values.type == MAP) {
            if (values.isIndex) {
                // The value is an offset into mappingData. The offset range it
                // falls in gives the mapping length; otherwise the length is
                // stored in mappingData just before the mapping itself.
                int start = values.value_ren;
                int count;
                if (start >= indexes[ONE_UCHAR_MAPPING_INDEX_START]
                        && start < indexes[TWO_UCHARS_MAPPING_INDEX_START]) {
                    count = 1;
                } else if (start >= indexes[TWO_UCHARS_MAPPING_INDEX_START]
                        && start < indexes[THREE_UCHARS_MAPPING_INDEX_START]) {
                    count = 2;
                } else if (start >= indexes[THREE_UCHARS_MAPPING_INDEX_START]
                        && start < indexes[FOUR_UCHARS_MAPPING_INDEX_START]) {
                    count = 3;
                } else {
                    count = mappingData[start++];
                }

                /* copy mapping to destination */
                mapped.Append(mappingData, start, count);
                continue;
            }

            // The value is a delta to subtract from the code point.
            outputCodePoint -= values.value_ren;
        } else if (values.type == DELETE) {
            // just consume the code point and continue
            continue;
        }

        // copy the (possibly shifted) source code point into the destination
        IBM.ICU.Text.UTF16.Append(mapped, outputCodePoint);
    }

    return mapped;
}
/// <summary>
/// Convenience function that implements the IDNToUnicode operation as
/// defined in the IDNA RFC. This operation is done on complete domain names,
/// e.g: "www.example.com". This overload takes the text of the iterator and
/// delegates to the overload that accepts a string.
/// <b>Note:</b> IDNA RFC specifies that a conformant application should
/// divide a domain name into separate labels, decide whether to apply
/// allowUnassigned and useSTD3ASCIIRules on each, and then convert. This
/// function does not offer that level of granularity. The options once set
/// will apply to all labels in the domain name.
/// </summary>
///
/// <param name="src">The input string as UCharacterIterator to be processed</param>
/// <param name="options">A bit set of options:
/// - IDNA.DEFAULT: use default options, i.e., do not process unassigned
///   code points and do not use STD3 ASCII rules. If unassigned code points
///   are found the operation fails with a parse exception.
/// - IDNA.ALLOW_UNASSIGNED: unassigned values can be converted to ASCII for
///   query operations. If this option is set, unassigned code points in the
///   input are treated as normal Unicode code points.
/// - IDNA.USE_STD3_RULES: use STD3 ASCII rules for host name syntax
///   restrictions. If this option is set and the input does not satisfy
///   STD3 rules, the operation fails with a parse exception.</param>
/// <returns>A StringBuilder holding the converted domain name</returns>
/// <exception cref="StringPrepParseException">If the string overload reports a failure</exception>
/// @stable ICU 2.8
public static StringBuilder ConvertIDNToUnicode(UCharacterIterator src, int options) {
    string domainName = src.GetText();
    return ConvertIDNToUnicode(domainName, options);
}
/// <summary>
/// Function that implements the ToUnicode operation as defined in the IDNA
/// RFC. This operation is done on <b>single labels</b> before sending it to
/// something that expects Unicode names. A label is an individual part of a
/// domain name. Labels are usually separated by dots; e.g.
/// "www.example.com" is composed of 3 labels "www", "example", and "com".
/// A copy of the original text is returned when name preparation fails,
/// when the prepared label does not start with the ACE prefix, when Punycode
/// decoding fails, or when converting the decoded label back to ASCII does
/// not reproduce the prepared label.
/// </summary>
///
/// <param name="src">The input string as UCharacterIterator to be processed</param>
/// <param name="options">A bit set of options:
/// - IDNA.DEFAULT: use default options, i.e., do not process unassigned
///   code points and do not use STD3 ASCII rules. If unassigned code points
///   are found the operation fails with a parse exception.
/// - IDNA.ALLOW_UNASSIGNED: unassigned values can be converted to ASCII for
///   query operations. If this option is set, unassigned code points in the
///   input are treated as normal Unicode code points.
/// - IDNA.USE_STD3_RULES: use STD3 ASCII rules for host name syntax
///   restrictions. If this option is set and the input does not satisfy
///   STD3 rules, the operation fails with a parse exception.</param>
/// <returns>A StringBuilder holding the decoded label, or a copy of the
/// original text</returns>
/// <exception cref="StringPrepParseException">NOTE(review): presumably only
/// when the ConvertToASCII call of the verification step throws - confirm
/// against that overload</exception>
/// @stable ICU 2.8
public static StringBuilder ConvertToUnicode(UCharacterIterator src, int options) {
    // No case flags are requested from the Punycode decoder.
    bool[] noCaseFlags = null;

    int startIndex = src.GetIndex();

    // step 1: find out if all the code units in src are ASCII. The iterator
    // is always read to its end.
    bool asciiOnly = true;
    for (int unit = src.Next();
            unit != IBM.ICU.Text.UForwardCharacterIterator_Constants.DONE;
            unit = src.Next()) {
        if (unit > 0x7F) {
            asciiOnly = false;
        }
    }

    StringBuilder prepared;
    if (asciiOnly) {
        // just work on a copy of the source
        prepared = new StringBuilder(src.GetText());
    } else {
        try {
            // step 2: process the string from where the scan started
            src.SetIndex(startIndex);
            prepared = singleton.namePrep.Prepare(src, options);
        } catch (StringPrepParseException) {
            return new StringBuilder(src.GetText());
        }
    }

    // TODO:
    // The RFC states that
    // <quote>
    // ToUnicode never fails. If any step fails, then the original input
    // is returned immediately in that step.
    // </quote>

    // step 3: verify ACE Prefix
    // (An STD3 ASCII rules check for labels without the prefix used to be
    // sketched here as commented-out code; it is not performed.)
    if (!StartsWithPrefix(prepared)) {
        return new StringBuilder(src.GetText());
    }

    // step 4: Remove the ACE Prefix
    int prefixLength = ACE_PREFIX.Length;
    String encoded = prepared.ToString(prefixLength, prepared.Length - prefixLength);

    // step 5: Decode using punycode; a decoding failure means "no result"
    StringBuilder decoded = null;
    try {
        decoded = IBM.ICU.Text.Punycode.Decode(new StringBuilder(encoded), noCaseFlags);
    } catch (StringPrepParseException) {
        decoded = null;
    }

    if (decoded != null) {
        // step 6: Apply toASCII
        StringBuilder roundTrip = ConvertToASCII(decoded, options);

        // step 7: verify; a mismatch is not reported as an error, the
        // original text is returned instead.
        // step 8: return output of step 5
        if (CompareCaseInsensitiveASCII(prepared, roundTrip) == 0) {
            return decoded;
        }
    }

    return new StringBuilder(src.GetText());
}
/// <summary>
/// This function implements the ToUnicode operation as defined in the IDNA
/// RFC. This operation is done on <b>single labels</b> before sending it to
/// something that expects Unicode names. A label is an individual part of a
/// domain name. Labels are usually separated by dots; e.g.
/// "www.example.com" is composed of 3 labels "www", "example", and "com".
/// This overload wraps the buffer in a UCharacterIterator and delegates to
/// the iterator-based overload.
/// </summary>
///
/// <param name="src">The input string as StringBuilder to be processed</param>
/// <param name="options">A bit set of options:
/// - IDNA.DEFAULT: use default options, i.e., do not process unassigned
///   code points and do not use STD3 ASCII rules. If unassigned code points
///   are found the operation fails with a parse exception.
/// - IDNA.ALLOW_UNASSIGNED: unassigned values can be converted to ASCII for
///   query operations. If this option is set, unassigned code points in the
///   input are treated as normal Unicode code points.
/// - IDNA.USE_STD3_RULES: use STD3 ASCII rules for host name syntax
///   restrictions. If this option is set and the input does not satisfy
///   STD3 rules, the operation fails with a parse exception.</param>
/// <returns>A StringBuilder holding the converted label</returns>
/// <exception cref="StringPrepParseException">If the iterator-based overload reports a failure</exception>
/// @stable ICU 2.8
public static StringBuilder ConvertToUnicode(StringBuilder src, int options) {
    return ConvertToUnicode(IBM.ICU.Text.UCharacterIterator.GetInstance(src), options);
}
/// <summary>
/// This function implements the ToASCII operation as defined in the IDNA
/// RFC. This operation is done on <b>single labels</b> before sending it to
/// something that expects ASCII names. A label is an individual part of a
/// domain name. Labels are usually separated by dots; e.g.
/// "www.example.com" is composed of 3 labels "www", "example", and "com".
/// Labels that are not pure ASCII are run through name preparation, encoded
/// with Punycode, lower-cased with ToASCIILower and prefixed with ACE_PREFIX.
/// </summary>
///
/// <param name="src">The input string as UCharacterIterator to be processed</param>
/// <param name="options">A bit set of options:
/// - IDNA.DEFAULT: use default options, i.e., do not process unassigned
///   code points and do not use STD3 ASCII rules. If unassigned code points
///   are found the operation fails with a parse exception.
/// - IDNA.ALLOW_UNASSIGNED: unassigned values can be converted to ASCII for
///   query operations. If this option is set, unassigned code points in the
///   input are treated as normal Unicode code points.
/// - IDNA.USE_STD3_RULES: use STD3 ASCII rules for host name syntax
///   restrictions. If this option is set and the input does not satisfy
///   STD3 rules, the operation fails with a parse exception.</param>
/// <returns>A StringBuilder holding the converted label</returns>
/// <exception cref="StringPrepParseException">If the prepared label is
/// empty, violates the STD3 ASCII rules while USE_STD3_RULES is set, is
/// non-ASCII yet already starts with the ACE prefix, or yields a result
/// longer than MAX_LABEL_LENGTH; also when name preparation itself fails</exception>
/// @stable ICU 2.8
public static StringBuilder ConvertToASCII(UCharacterIterator src, int options) {
    bool useSTD3ASCIIRules = (options & USE_STD3_RULES) != 0;

    // step 1: find out whether the whole source is ASCII
    bool srcIsASCII = true;
    int ch;
    while ((ch = src.Next()) != IBM.ICU.Text.UForwardCharacterIterator_Constants.DONE) {
        if (ch > 0x7F) {
            srcIsASCII = false;
        }
    }
    src.SetToStart();

    // step 2 is performed only if the source contains non-ASCII
    StringBuilder processOut;
    if (srcIsASCII) {
        processOut = new StringBuilder(src.GetText());
    } else {
        processOut = singleton.namePrep.Prepare(src, options);
    }

    int poLen = processOut.Length;
    if (poLen == 0) {
        throw new StringPrepParseException(
            "Found zero length lable after NamePrep.",
            IBM.ICU.Text.StringPrepParseException.ZERO_LENGTH_LABEL);
    }

    // steps 3 & 4: re-examine the prepared text, since preparation may have
    // changed which characters are present.
    bool outIsASCII = true;
    bool outIsLDH = true;
    int failPos = -1;
    for (int j = 0; j < poLen; j++) {
        ch = processOut[j];
        if (ch > 0x7F) {
            outIsASCII = false;
        } else if (!IsLDHChar(ch)) {
            // here we do not assemble surrogates
            // since we know that LDH code points
            // are in the ASCII range only
            outIsLDH = false;
            failPos = j;
        }
    }

    if (useSTD3ASCIIRules) {
        // verify 3a: only letters, digits and hyphens
        if (!outIsLDH) {
            throw new StringPrepParseException(
                "The input does not conform to the STD 3 ASCII rules",
                IBM.ICU.Text.StringPrepParseException.STD3_ASCII_RULES_ERROR,
                processOut.ToString(),
                (failPos > 0) ? (failPos - 1) : failPos);
        }
        // verify 3b: no leading hyphen ...
        if (processOut[0] == HYPHEN) {
            throw new StringPrepParseException(
                "The input does not conform to the STD 3 ASCII rules",
                IBM.ICU.Text.StringPrepParseException.STD3_ASCII_RULES_ERROR,
                processOut.ToString(), 0);
        }
        // ... and no trailing hyphen (poLen is known to be > 0 here)
        if (processOut[poLen - 1] == HYPHEN) {
            throw new StringPrepParseException(
                "The input does not conform to the STD 3 ASCII rules",
                IBM.ICU.Text.StringPrepParseException.STD3_ASCII_RULES_ERROR,
                processOut.ToString(), poLen - 1);
        }
    }

    StringBuilder dest;
    if (outIsASCII) {
        // Already ASCII: the prepared text is the result.
        dest = processOut;
    } else {
        // step 5: verify the sequence does not begin with the ACE prefix.
        // The error is raised when the prefix IS present, so the message
        // must say so.
        if (StartsWithPrefix(processOut)) {
            throw new StringPrepParseException(
                "The input starts with the ACE Prefix.",
                IBM.ICU.Text.StringPrepParseException.ACE_PREFIX_ERROR,
                processOut.ToString(), 0);
        }

        // step 6: encode the sequence with punycode
        bool[] caseFlags = new bool[poLen];
        StringBuilder punyout = IBM.ICU.Text.Punycode.Encode(processOut, caseFlags);

        // convert all code points to lower case ASCII
        StringBuilder lowerOut = ToASCIILower(punyout);

        // step 7: prepend the ACE prefix, then copy the encoded label
        dest = new StringBuilder();
        dest.Append(ACE_PREFIX, 0, ACE_PREFIX.Length);
        dest.Append(lowerOut);
    }

    // step 8: enforce the label length limit
    if (dest.Length > MAX_LABEL_LENGTH) {
        throw new StringPrepParseException(
            "The labels in the input are too long. Length > 63.",
            IBM.ICU.Text.StringPrepParseException.LABEL_TOO_LONG_ERROR,
            dest.ToString(), 0);
    }
    return dest;
}