/// <summary>
/// Converts fully translated Unicode data (e.g. text containing Japanese Kanji characters)
/// into a string whose characters represent the raw ISO character repertoire form of that
/// data, i.e. with escape sequences, but carried as a Unicode string.
/// </summary>
/// <param name="unicodeData">Fully translated Unicode data</param>
/// <param name="repertoire">Target repertoire to be transformed into</param>
/// <returns>The repertoire-encoded bytes turned back into characters through <c>IsomorphicEncoding</c></returns>
private static string Encode(string unicodeData, CharacterSetInfo repertoire)
{
    // The byte-array overload performs the actual repertoire encoding.
    byte[] encodedBytes;
    Encode(unicodeData, repertoire, out encodedBytes);

    // Carry the encoded bytes as characters so they can travel inside a string.
    return new string(IsomorphicEncoding.GetChars(encodedBytes));
}
/// <summary>
/// Initializes a new instance that stores a Specific Character Set string together
/// with a default repertoire and a set of extension repertoires.
/// </summary>
/// <param name="specificCharacterSet">Value stored in <c>SpecificCharacterSet</c></param>
/// <param name="defaultRepertoire">Value stored in <c>DefaultRepertoire</c></param>
/// <param name="extensionRepertoires">Value stored in <c>ExtensionRepertoires</c></param>
public Last(
    string specificCharacterSet,
    CharacterSetInfo defaultRepertoire,
    Dictionary<string, CharacterSetInfo> extensionRepertoires)
{
    this.SpecificCharacterSet = specificCharacterSet;
    this.DefaultRepertoire = defaultRepertoire;
    this.ExtensionRepertoires = extensionRepertoires;
}
/// <summary>
/// Initializes a new instance holding the given Specific Character Set string,
/// default repertoire and extension repertoires. The arguments are stored as passed;
/// no copies are made here.
/// </summary>
/// <param name="specificCharacterSet">Value assigned to <c>SpecificCharacterSet</c></param>
/// <param name="defaultRepertoire">Value assigned to <c>DefaultRepertoire</c></param>
/// <param name="extensionRepertoires">Value assigned to <c>ExtensionRepertoires</c></param>
public Last(
    string specificCharacterSet,
    CharacterSetInfo defaultRepertoire,
    Dictionary<string, CharacterSetInfo> extensionRepertoires)
{
    this.SpecificCharacterSet = specificCharacterSet;
    this.DefaultRepertoire = defaultRepertoire;
    this.ExtensionRepertoires = extensionRepertoires;
}
/// <summary>
/// Takes a string that stands in for the raw sequence of bytes of text encoded with an
/// ISO repertoire (currently carried as a Unicode string) and returns the true Unicode
/// string, e.g. one containing the Japanese Kanji characters.
/// </summary>
/// <param name="rawData">Sequence of bytes formatted in Unicode</param>
/// <param name="repertoire">Original ISO repertoire used in the encoding</param>
/// <returns>True Unicode string</returns>
private static string Decode(string rawData, CharacterSetInfo repertoire)
{
    // Turn the string back into its byte-array form. This needs a character set that
    // covers both the GR and GL areas (characters up to \xff in binary value); the
    // original author noted that Windows-1252 seemed to work better than ISO-8859-1.
    // The byte-array overload then does the real decoding.
    return Decode(IsomorphicEncoding.GetBytes(rawData), repertoire);
}
/// <summary>
/// Takes text encoded with an ISO repertoire, as a raw sequence of bytes, and returns
/// the fully translated Unicode representation, e.g. with the correct Japanese Kanji
/// characters in Unicode.
/// </summary>
/// <param name="rawData">Byte sequence encoded using the ISO repertoire</param>
/// <param name="repertoire">Repertoire the byte sequence is encoded with</param>
/// <returns>
/// Unicode string, with every occurrence of the repertoire's G1 escape sequence removed
/// when that sequence is non-empty
/// </returns>
private static string Decode(byte[] rawData, CharacterSetInfo repertoire)
{
    Encoding rawEncoding = Encoding.GetEncoding(repertoire.MicrosoftCodePage);
    string rawDataDecoded = rawEncoding.GetString(rawData);

    // Get rid of any escape sequences that survive in the decoded string, as happens
    // for Korean with code page 20949.
    // string.Replace throws for both a null and an empty search value, so both cases
    // must bypass the call; the previous check only covered the empty string.
    string g1Sequence = repertoire.G1Sequence;
    if (string.IsNullOrEmpty(g1Sequence))
    {
        return rawDataDecoded;
    }

    return rawDataDecoded.Replace(g1Sequence, "");
}
/// <summary>
/// Decodes a raw byte sequence that was encoded with an ISO repertoire into a fully
/// translated Unicode string, e.g. with the correct Japanese Kanji characters.
/// </summary>
/// <param name="rawData">Byte sequence encoded using the ISO repertoire</param>
/// <param name="repertoire">Repertoire the byte sequence is encoded with</param>
/// <returns>
/// Unicode string; occurrences of the repertoire's G1 escape sequence are stripped
/// when that sequence is neither null nor empty
/// </returns>
private static string Decode(byte[] rawData, CharacterSetInfo repertoire)
{
    Encoding rawEncoding = Encoding.GetEncoding(repertoire.MicrosoftCodePage);
    string rawDataDecoded = rawEncoding.GetString(rawData);

    // Escape sequences can appear in the decoded string (the Korean case, using code
    // page 20949), so they are removed here.
    // A null G1 sequence used to slip past the "" comparison and make string.Replace
    // throw ArgumentNullException; treat null exactly like "no sequence to strip".
    string g1Sequence = repertoire.G1Sequence;
    return string.IsNullOrEmpty(g1Sequence)
        ? rawDataDecoded
        : rawDataDecoded.Replace(g1Sequence, "");
}
/// <summary>
/// Takes a string that represents the raw sequence of bytes of ISO-repertoire-encoded
/// text (currently carried as a Unicode string) and returns the true Unicode string,
/// e.g. one containing the Japanese Kanji characters.
/// </summary>
/// <param name="rawData">Sequence of bytes formatted in Unicode</param>
/// <param name="repertoire">Original ISO repertoire used in the encoding</param>
/// <returns>True Unicode string</returns>
private static string Decode(string rawData, CharacterSetInfo repertoire)
{
    // Recover the byte-array form using a character set that includes both the GR and
    // GL areas (characters up to \xff in binary value); the original author noted that
    // Windows-1252 seemed to work better than ISO-8859-1.
    Encoding isomorphic = Encoding.GetEncoding(IsomorphicCodePage);
    byte[] originalBytes = isomorphic.GetBytes(rawData);

    // The byte-array overload performs the repertoire-specific decoding.
    return Decode(originalBytes, repertoire);
}
/// <summary>
/// Converts fully translated Unicode data (e.g. text with Japanese Kanji characters)
/// into a raw byte array holding its representation in the target repertoire.
/// </summary>
/// <param name="unicodeData">Fully translated Unicode data</param>
/// <param name="repertoire">Target repertoire; its <c>MicrosoftCodePage</c> selects the encoding</param>
/// <param name="encoded">Output: byte array that receives the encoded result</param>
private static void Encode(string unicodeData, CharacterSetInfo repertoire, out byte[] encoded)
{
    Encoding targetEncoding = Encoding.GetEncoding(repertoire.MicrosoftCodePage);
    encoded = targetEncoding.GetBytes(unicodeData);
}
/// <summary>
/// Converts fully translated Unicode data (e.g. text containing Japanese Kanji characters)
/// into a string of characters representing the raw ISO character repertoire form,
/// i.e. with escape sequences, but carried as a Unicode string.
/// </summary>
/// <param name="unicodeData">Fully translated Unicode data</param>
/// <param name="repertoire">Target repertoire to be transformed into</param>
/// <returns>The repertoire-encoded bytes turned back into characters using the <c>IsomorphicCodePage</c> encoding</returns>
private static string Encode(string unicodeData, CharacterSetInfo repertoire)
{
    // Encode into the target repertoire first, using the byte-array overload.
    byte[] encodedBytes;
    Encode(unicodeData, repertoire, out encodedBytes);

    // Then carry those bytes as characters via the isomorphic code page.
    Encoding isomorphic = Encoding.GetEncoding(IsomorphicCodePage);
    return new string(isomorphic.GetChars(encodedBytes));
}
/// <summary>
/// Splits a Specific Character Set value on backslashes and resolves the default
/// repertoire (from the first value) and the extension repertoires (from the remaining
/// values) against <c>CharacterSetDatabase</c>.
/// </summary>
/// <param name="specificCharacterSet">Backslash-separated list of defined terms</param>
/// <param name="defaultRepertoire">
/// Output: database entry for the first value, or the "ISO 2022 IR 6" entry when the
/// first value is not in the database
/// </param>
/// <param name="extensionRepertoires">
/// Output: repertoires for the remaining distinct values, keyed by the value as written
/// </param>
private static void GetRepertoires(string specificCharacterSet, out CharacterSetInfo defaultRepertoire, out Dictionary<string, CharacterSetInfo> extensionRepertoires)
{
    // TODO:
    // Specific Character Set may carry up to n values when Code Extensions are used.
    // We accommodate that by parsing out every defined term. What is NOT handled yet
    // is escaping between character sets from different code pages inside a single
    // string. DICOM implies that JIS-encoded Japanese, ISO European characters, Thai
    // characters and Korean characters may share one line via Code Extensions (escape
    // sequences). (Chinese is excluded, since it is only supported through GB18030
    // and UTF-8, neither of which supports Code Extensions.)
    string[] definedTerms = specificCharacterSet.Split('\\');

    // Value 1 selects the default repertoire.
    defaultRepertoire = null;
    if (definedTerms.Length > 0 && !CharacterSetDatabase.TryGetValue(definedTerms[0], out defaultRepertoire))
    {
        // Fall back to the default repertoire. Technically it may be ISO_IR 6 rather
        // than ISO 2022 IR 6, but the information we need is the same.
        defaultRepertoire = CharacterSetDatabase["ISO 2022 IR 6"];
    }

    // The same character set may be listed more than once; keep only the distinct
    // ones, in order. It should never really happen, but it does with a particular
    // dataset when querying JDicom.
    List<string> distinctExtensionTerms = new List<string>();
    for (int index = 1; index < definedTerms.Length; index++)
    {
        string term = definedTerms[index];
        if (term == defaultRepertoire.DefinedTerm || distinctExtensionTerms.Contains(term))
        {
            continue;
        }

        distinctExtensionTerms.Add(term);
    }

    // Resolve the extension repertoires.
    extensionRepertoires = new Dictionary<string, CharacterSetInfo>();
    foreach (string term in distinctExtensionTerms)
    {
        CharacterSetInfo knownRepertoire;
        if (CharacterSetDatabase.TryGetValue(term, out knownRepertoire) && !extensionRepertoires.ContainsKey(term))
        {
            // Special robustness handling of GB18030 and UTF-8: these two character
            // sets can't use code extensions, so such a term ends up as the only
            // entry and processing stops.
            bool standsAlone = term == "GB18030" || term == "ISO_IR 192";
            if (standsAlone)
            {
                extensionRepertoires.Clear();
            }

            extensionRepertoires.Add(term, knownRepertoire);

            if (standsAlone)
            {
                break;
            }
        }
        else if (!extensionRepertoires.ContainsKey("ISO 2022 IR 6"))
        {
            // Otherwise the term is stored under its own name but mapped to the
            // default repertoire. Technically it may be ISO_IR 6 rather than
            // ISO 2022 IR 6, but the information we need is the same.
            extensionRepertoires.Add(term, SpecificCharacterSetParser.CharacterSetDatabase["ISO 2022 IR 6"]);
        }
    }
}
/// <summary>
/// Resolves the default repertoire (from the first value) and the extension repertoires
/// (from the remaining values) of a backslash-separated Specific Character Set value
/// against <c>CharacterSetDatabase</c>, reusing the most recent result held in
/// <c>_last</c> when the same string is requested again.
/// </summary>
/// <param name="specificCharacterSet">Backslash-separated list of defined terms</param>
/// <param name="defaultRepertoire">
/// Output: database entry for the first value, or the "ISO 2022 IR 6" entry when the
/// first value is not in the database
/// </param>
/// <param name="extensionRepertoires">
/// Output: repertoires for the remaining distinct values, keyed by the value as written
/// </param>
private static void GetRepertoires(string specificCharacterSet, out CharacterSetInfo defaultRepertoire, out Dictionary<string, CharacterSetInfo> extensionRepertoires)
{
    // Most of the time, especially on the same thread, the specific character set is
    // the same as last time. This cheap comparison avoids repeating the expensive
    // resolution below. The field is read once into a local so the null check and the
    // member reads all see the same instance.
    // NOTE(review): the cached dictionary instance is handed to every caller asking for
    // the same character set; presumably callers must not mutate it - confirm.
    var cached = _last;
    if (cached != null && specificCharacterSet == cached.SpecificCharacterSet)
    {
        defaultRepertoire = cached.DefaultRepertoire;
        extensionRepertoires = cached.ExtensionRepertoires;
        return;
    }

    // TODO:
    // Specific Character Set may carry up to n values when Code Extensions are used.
    // We accommodate that by parsing out every defined term. What is NOT handled yet
    // is escaping between character sets from different code pages inside a single
    // string. DICOM implies that JIS-encoded Japanese, ISO European characters, Thai
    // characters and Korean characters may share one line via Code Extensions (escape
    // sequences). (Chinese is excluded, since it is only supported through GB18030
    // and UTF-8, neither of which supports Code Extensions.)
    string[] definedTerms = specificCharacterSet.Split('\\');

    // Value 1 selects the default repertoire.
    defaultRepertoire = null;
    if (definedTerms.Length > 0 && !CharacterSetDatabase.TryGetValue(definedTerms[0], out defaultRepertoire))
    {
        // Fall back to the default repertoire. Technically it may be ISO_IR 6 rather
        // than ISO 2022 IR 6, but the information we need is the same.
        defaultRepertoire = CharacterSetDatabase["ISO 2022 IR 6"];
    }

    // The same character set may be listed more than once; keep only the distinct
    // ones, in order. It should never really happen, but it does with a particular
    // dataset when querying JDicom.
    var distinctExtensionTerms = new List<string>();
    for (int index = 1; index < definedTerms.Length; index++)
    {
        string term = definedTerms[index];
        if (term == defaultRepertoire.DefinedTerm || distinctExtensionTerms.Contains(term))
        {
            continue;
        }

        distinctExtensionTerms.Add(term);
    }

    // Resolve the extension repertoires.
    extensionRepertoires = new Dictionary<string, CharacterSetInfo>();
    foreach (string term in distinctExtensionTerms)
    {
        CharacterSetInfo knownRepertoire;
        if (CharacterSetDatabase.TryGetValue(term, out knownRepertoire) && !extensionRepertoires.ContainsKey(term))
        {
            // Special robustness handling of GB18030 and UTF-8: these two character
            // sets can't use code extensions, so such a term ends up as the only
            // entry and processing stops.
            bool standsAlone = term == "GB18030" || term == "ISO_IR 192";
            if (standsAlone)
            {
                extensionRepertoires.Clear();
            }

            extensionRepertoires.Add(term, knownRepertoire);

            if (standsAlone)
            {
                break;
            }
        }
        else if (!extensionRepertoires.ContainsKey("ISO 2022 IR 6"))
        {
            // Otherwise the term is stored under its own name but mapped to the
            // default repertoire. Technically it may be ISO_IR 6 rather than
            // ISO 2022 IR 6, but the information we need is the same.
            extensionRepertoires.Add(term, CharacterSetDatabase["ISO 2022 IR 6"]);
        }
    }

    // Remember this result for the next call with the same character set string.
    _last = new Last(specificCharacterSet, defaultRepertoire, extensionRepertoires);
}