예제 #1
0
 /// <summary>
 /// Encodes fully translated Unicode data (e.g. text containing Japanese Kanji characters)
 /// into the target repertoire and returns the encoded bytes re-expressed as a string
 /// by passing them through <c>IsomorphicEncoding</c>.
 /// </summary>
 /// <remarks>
 /// NOTE(review): the result is presumably one character per raw byte, including any
 /// escape sequences the target encoding emits - confirm against IsomorphicEncoding.
 /// </remarks>
 /// <param name="unicodeData">Fully translated Unicode data</param>
 /// <param name="repertoire">Target repertoire to be transformed into</param>
 /// <returns>The encoded bytes represented as a string</returns>
 private static string Encode(string unicodeData, CharacterSetInfo repertoire)
 {
     // Delegate the actual encoding to the byte-array overload.
     byte[] encodedBytes;
     Encode(unicodeData, repertoire, out encodedBytes);

     // Map the raw bytes back onto characters so they can travel as a string.
     return new string(IsomorphicEncoding.GetChars(encodedBytes));
 }
예제 #2
0
 /// <summary>
 /// Bundles a Specific Character Set value with the default and extension
 /// repertoires that belong to it.
 /// </summary>
 /// <param name="specificCharacterSet">The Specific Character Set value</param>
 /// <param name="defaultRepertoire">Default repertoire associated with the value</param>
 /// <param name="extensionRepertoires">Extension repertoires associated with the value</param>
 public Last(
     string specificCharacterSet,
     CharacterSetInfo defaultRepertoire,
     Dictionary<string, CharacterSetInfo> extensionRepertoires)
 {
     this.SpecificCharacterSet = specificCharacterSet;
     this.DefaultRepertoire = defaultRepertoire;
     this.ExtensionRepertoires = extensionRepertoires;
 }
 /// <summary>
 /// Stores a Specific Character Set value together with its default repertoire
 /// and its extension repertoires.
 /// </summary>
 /// <param name="specificCharacterSet">The Specific Character Set value</param>
 /// <param name="defaultRepertoire">Default repertoire associated with the value</param>
 /// <param name="extensionRepertoires">Extension repertoires associated with the value</param>
 public Last(
     string specificCharacterSet,
     CharacterSetInfo defaultRepertoire,
     Dictionary<string, CharacterSetInfo> extensionRepertoires)
 {
     this.SpecificCharacterSet = specificCharacterSet;
     this.DefaultRepertoire = defaultRepertoire;
     this.ExtensionRepertoires = extensionRepertoires;
 }
예제 #4
0
 /// <summary>
 /// Takes a string that stands in for a raw sequence of bytes encoded using an
 /// ISO repertoire (currently held as a Unicode string) and gives back a true
 /// Unicode string, e.g. one containing the Japanese Kanji characters.
 /// </summary>
 /// <param name="rawData">Sequence of bytes formatted in Unicode</param>
 /// <param name="repertoire">Original ISO repertoire used in the encoding</param>
 /// <returns>True Unicode string</returns>
 private static string Decode(string rawData, CharacterSetInfo repertoire)
 {
     // Recover the byte array through IsomorphicEncoding and hand it to the
     // byte-array overload. The encoding needs to cover both the GR and GL areas
     // (characters up to \xff in binary value); the original author noted that
     // Windows-1252 seemed to work better than ISO-8859-1 for this.
     return Decode(IsomorphicEncoding.GetBytes(rawData), repertoire);
 }
예제 #5
0
        /// <summary>
        /// Decodes a raw byte sequence with the encoding for the repertoire's Microsoft
        /// code page and returns the resulting Unicode string, with every occurrence of
        /// the repertoire's G1 escape sequence removed.
        /// </summary>
        /// <param name="rawData">Byte sequence encoded using ISO repertoire</param>
        /// <param name="repertoire">Repertoire the byte sequence is encoded using</param>
        /// <returns>Unicode string</returns>
        private static string Decode(byte[] rawData, CharacterSetInfo repertoire)
        {
            Encoding rawEncoding = Encoding.GetEncoding(repertoire.MicrosoftCodePage);
            string decoded = rawEncoding.GetString(rawData);

            // Escape sequences can survive decoding (the original author observed this
            // for Korean with code page 20949), so strip them. string.Replace rejects
            // both a null and an empty search string, hence the combined check: a
            // repertoire without a G1 sequence simply returns the decoded text.
            string g1Sequence = repertoire.G1Sequence;
            if (string.IsNullOrEmpty(g1Sequence))
            {
                return decoded;
            }

            return decoded.Replace(g1Sequence, "");
        }
        /// <summary>
        /// Decodes a raw byte sequence with the encoding for the repertoire's Microsoft
        /// code page and returns the resulting Unicode string, with every occurrence of
        /// the repertoire's G1 escape sequence removed.
        /// </summary>
        /// <param name="rawData">Byte sequence encoded using ISO repertoire</param>
        /// <param name="repertoire">Repertoire the byte sequence is encoded using</param>
        /// <returns>Unicode string</returns>
        private static string Decode(byte[] rawData, CharacterSetInfo repertoire)
        {
            Encoding rawEncoding = Encoding.GetEncoding(repertoire.MicrosoftCodePage);
            string decoded = rawEncoding.GetString(rawData);

            // Escape sequences can survive decoding (the original author observed this
            // for Korean with code page 20949), so strip them. string.Replace rejects
            // both a null and an empty search string, hence the combined check: a
            // repertoire without a G1 sequence simply returns the decoded text.
            string g1Sequence = repertoire.G1Sequence;
            if (string.IsNullOrEmpty(g1Sequence))
            {
                return decoded;
            }

            return decoded.Replace(g1Sequence, "");
        }
 /// <summary>
 /// Takes a string that stands in for a raw sequence of bytes encoded using an
 /// ISO repertoire (currently held as a Unicode string) and gives back a true
 /// Unicode string, e.g. one containing the Japanese Kanji characters.
 /// </summary>
 /// <param name="rawData">Sequence of bytes formatted in Unicode</param>
 /// <param name="repertoire">Original ISO repertoire used in the encoding</param>
 /// <returns>True Unicode string</returns>
 private static string Decode(string rawData, CharacterSetInfo repertoire)
 {
     // The intermediate encoding needs to cover both the GR and GL areas
     // (characters up to \xff in binary value); the original author noted that
     // Windows-1252 seemed to work better than ISO-8859-1 for this.
     Encoding isomorphic = Encoding.GetEncoding(IsomorphicCodePage);

     // Recover the byte array and let the byte-array overload do the decoding.
     return Decode(isomorphic.GetBytes(rawData), repertoire);
 }
 /// <summary>
 /// Converts fully translated Unicode data (e.g. text containing Japanese Kanji
 /// characters) into the raw bytes produced by the encoding for the target
 /// repertoire's Microsoft code page.
 /// </summary>
 /// <param name="unicodeData">Fully translated Unicode data</param>
 /// <param name="repertoire">Target repertoire to be transformed into</param>
 /// <param name="encoded">Output: byte array holding the encoded result</param>
 private static void Encode(string unicodeData, CharacterSetInfo repertoire, out byte[] encoded)
 {
     Encoding targetEncoding = Encoding.GetEncoding(repertoire.MicrosoftCodePage);
     encoded = targetEncoding.GetBytes(unicodeData);
 }
 /// <summary>
 /// Encodes fully translated Unicode data (e.g. text containing Japanese Kanji characters)
 /// into the target repertoire and returns the encoded bytes re-expressed as a string
 /// by decoding them with the encoding for <c>IsomorphicCodePage</c>.
 /// </summary>
 /// <remarks>
 /// NOTE(review): the result is presumably one character per raw byte, including any
 /// escape sequences the target encoding emits - confirm against IsomorphicCodePage.
 /// </remarks>
 /// <param name="unicodeData">Fully translated Unicode data</param>
 /// <param name="repertoire">Target repertoire to be transformed into</param>
 /// <returns>The encoded bytes represented as a string</returns>
 private static string Encode(string unicodeData, CharacterSetInfo repertoire)
 {
     // Delegate the actual encoding to the byte-array overload.
     byte[] encodedBytes;
     Encode(unicodeData, repertoire, out encodedBytes);

     // Map the raw bytes back onto characters so they can travel as a string.
     Encoding isomorphic = Encoding.GetEncoding(IsomorphicCodePage);
     return new string(isomorphic.GetChars(encodedBytes));
 }
        /// <summary>
        /// Splits a backslash-separated Specific Character Set value into its defined terms
        /// and resolves them against <c>CharacterSetDatabase</c>: the first term selects the
        /// default repertoire, the remaining distinct terms select the extension repertoires.
        /// </summary>
        /// <param name="specificCharacterSet">Backslash-separated list of defined terms</param>
        /// <param name="defaultRepertoire">Output: repertoire for the first term, or the "ISO 2022 IR 6" entry when that term is not in the database</param>
        /// <param name="extensionRepertoires">Output: repertoires resolved for the remaining terms, keyed by defined term</param>
        private static void GetRepertoires(string specificCharacterSet, out CharacterSetInfo defaultRepertoire, out Dictionary<string, CharacterSetInfo> extensionRepertoires)
        {
            // TODO:
            // Specific Character Set may carry up to n values when Code Extensions
            // are used, so every defined term is parsed out here. What is NOT handled
            // yet is escaping between character sets from different code pages inside
            // a single string. DICOM implies that JIS-encoded Japanese, ISO European,
            // Thai and Korean characters may share one line via Code Extensions
            // (escape sequences). Chinese is excluded because it is only supported
            // through GB18030 and UTF-8, neither of which supports Code Extensions.
            string[] definedTerms = specificCharacterSet.Split('\\');

            // Value 1 selects the default repertoire. An unknown term falls back to
            // the "ISO 2022 IR 6" entry; technically it might be ISO_IR 6 instead,
            // but the information needed from it is the same.
            defaultRepertoire = null;
            if (definedTerms.Length > 0 && !CharacterSetDatabase.TryGetValue(definedTerms[0], out defaultRepertoire))
            {
                defaultRepertoire = CharacterSetDatabase["ISO 2022 IR 6"];
            }

            // Collect the remaining terms once each, skipping the default repertoire's
            // own term. Repeats should never occur, but they do with a particular
            // dataset when querying JDicom.
            List<string> distinctTerms = new List<string>();
            for (int index = 1; index < definedTerms.Length; ++index)
            {
                string term = definedTerms[index];
                if (term == defaultRepertoire.DefinedTerm)
                {
                    continue;
                }

                if (distinctTerms.Contains(term))
                {
                    continue;
                }

                distinctTerms.Add(term);
            }

            // Resolve each distinct term into an extension repertoire.
            extensionRepertoires = new Dictionary<string, CharacterSetInfo>();
            foreach (string term in distinctTerms)
            {
                bool resolvable = CharacterSetDatabase.ContainsKey(term) && !extensionRepertoires.ContainsKey(term);
                if (!resolvable)
                {
                    // Fallback: the term is stored under its own key with the
                    // "ISO 2022 IR 6" entry, but only while the dictionary has no
                    // "ISO 2022 IR 6" key yet.
                    if (!extensionRepertoires.ContainsKey("ISO 2022 IR 6"))
                    {
                        extensionRepertoires.Add(term, SpecificCharacterSetParser.CharacterSetDatabase["ISO 2022 IR 6"]);
                    }

                    continue;
                }

                CharacterSetInfo resolved = CharacterSetDatabase[term];

                // Special robustness handling of GB18030 and UTF-8: these two cannot
                // use code extensions, so the term becomes the only entry and the
                // remaining terms are ignored.
                if ("GB18030" == term || "ISO_IR 192" == term)
                {
                    extensionRepertoires.Clear();
                    extensionRepertoires.Add(term, resolved);
                    break;
                }

                extensionRepertoires.Add(term, resolved);
            }
        }
예제 #11
0
 /// <summary>
 /// Converts fully translated Unicode data (e.g. text containing Japanese Kanji
 /// characters) into the raw bytes produced by the encoding for the target
 /// repertoire's Microsoft code page.
 /// </summary>
 /// <param name="unicodeData">Fully translated Unicode data</param>
 /// <param name="repertoire">Target repertoire to be transformed into</param>
 /// <param name="encoded">Output: byte array holding the encoded result</param>
 private static void Encode(string unicodeData, CharacterSetInfo repertoire, out byte[] encoded)
 {
     // Look up the encoding by code page and write its output straight to the out parameter.
     encoded = Encoding.GetEncoding(repertoire.MicrosoftCodePage).GetBytes(unicodeData);
 }
예제 #12
0
        /// <summary>
        /// Splits a backslash-separated Specific Character Set value into its defined terms
        /// and resolves them against <c>CharacterSetDatabase</c>: the first term selects the
        /// default repertoire, the remaining distinct terms select the extension repertoires.
        /// The most recent result is remembered in <c>_last</c> and handed back as-is when
        /// the same value is requested again.
        /// </summary>
        /// <param name="specificCharacterSet">Backslash-separated list of defined terms</param>
        /// <param name="defaultRepertoire">Output: repertoire for the first term, or the "ISO 2022 IR 6" entry when that term is not in the database</param>
        /// <param name="extensionRepertoires">Output: repertoires resolved for the remaining terms, keyed by defined term</param>
        private static void GetRepertoires(string specificCharacterSet, out CharacterSetInfo defaultRepertoire, out Dictionary<string, CharacterSetInfo> extensionRepertoires)
        {
            // Most of the time, especially on the same thread, the specific character
            // set is the same as last time. Reusing the previous answer avoids
            // recomputing it over and over, which gets expensive.
            var cached = _last;
            bool cacheHit = cached != null && specificCharacterSet == cached.SpecificCharacterSet;
            if (cacheHit)
            {
                defaultRepertoire = cached.DefaultRepertoire;
                extensionRepertoires = cached.ExtensionRepertoires;
                return;
            }

            // TODO:
            // Specific Character Set may carry up to n values when Code Extensions
            // are used, so every defined term is parsed out here. What is NOT handled
            // yet is escaping between character sets from different code pages inside
            // a single string. DICOM implies that JIS-encoded Japanese, ISO European,
            // Thai and Korean characters may share one line via Code Extensions
            // (escape sequences). Chinese is excluded because it is only supported
            // through GB18030 and UTF-8, neither of which supports Code Extensions.
            string[] definedTerms = specificCharacterSet.Split('\\');

            // Value 1 selects the default repertoire. An unknown term falls back to
            // the "ISO 2022 IR 6" entry; technically it might be ISO_IR 6 instead,
            // but the information needed from it is the same.
            defaultRepertoire = null;
            if (definedTerms.Length > 0 && !CharacterSetDatabase.TryGetValue(definedTerms[0], out defaultRepertoire))
            {
                defaultRepertoire = CharacterSetDatabase["ISO 2022 IR 6"];
            }

            // Collect the remaining terms once each, skipping the default repertoire's
            // own term. Repeats should never occur, but they do with a particular
            // dataset when querying JDicom.
            List<string> distinctTerms = new List<string>();
            for (int index = 1; index < definedTerms.Length; ++index)
            {
                string term = definedTerms[index];
                if (term == defaultRepertoire.DefinedTerm)
                {
                    continue;
                }

                if (distinctTerms.Contains(term))
                {
                    continue;
                }

                distinctTerms.Add(term);
            }

            // Resolve each distinct term into an extension repertoire.
            extensionRepertoires = new Dictionary<string, CharacterSetInfo>();
            foreach (string term in distinctTerms)
            {
                bool resolvable = CharacterSetDatabase.ContainsKey(term) && !extensionRepertoires.ContainsKey(term);
                if (!resolvable)
                {
                    // Fallback: the term is stored under its own key with the
                    // "ISO 2022 IR 6" entry, but only while the dictionary has no
                    // "ISO 2022 IR 6" key yet.
                    if (!extensionRepertoires.ContainsKey("ISO 2022 IR 6"))
                    {
                        extensionRepertoires.Add(term, CharacterSetDatabase["ISO 2022 IR 6"]);
                    }

                    continue;
                }

                CharacterSetInfo resolved = CharacterSetDatabase[term];

                // Special robustness handling of GB18030 and UTF-8: these two cannot
                // use code extensions, so the term becomes the only entry and the
                // remaining terms are ignored.
                if ("GB18030" == term || "ISO_IR 192" == term)
                {
                    extensionRepertoires.Clear();
                    extensionRepertoires.Add(term, resolved);
                    break;
                }

                extensionRepertoires.Add(term, resolved);
            }

            // Remember this answer for the next call with the same value.
            _last = new Last(specificCharacterSet, defaultRepertoire, extensionRepertoires);
        }