Encapsulates a Character Set ECI, according to "Extended Channel Interpretations" 5.3.1.1 of ISO 18004.
Inheritance: ZXing.Common.ECI
        /// <summary>
        /// Registers a character set ECI under its numeric value and under a single encoding name.
        /// </summary>
        /// <param name="value">ECI value; used as the key into VALUE_TO_ECI</param>
        /// <param name="encodingName">encoding name; passed to the CharacterSetECI constructor and used as the key into NAME_TO_ECI</param>
        private static void addCharacterSet(int value, String encodingName)
        {
            // Constructed directly (can't use valueOf) and then stored in both lookup tables.
            var characterSet = new CharacterSetECI(value, encodingName);

            VALUE_TO_ECI[value] = characterSet;
            NAME_TO_ECI[encodingName] = characterSet;
        }
Esempio n. 2
0
 /// <summary>
 /// Returns the encoding object for the specified character set ECI.
 /// </summary>
 /// <param name="charsetECI">character set ECI whose EncodingName is looked up; may be null</param>
 /// <returns>null when <paramref name="charsetECI"/> is null; otherwise whatever the
 /// name-based getEncoding overload returns for charsetECI.EncodingName</returns>
 public static System.Text.Encoding getEncoding(CharacterSetECI charsetECI)
 {
     return charsetECI == null
         ? null
         : getEncoding(charsetECI.EncodingName);
 }
        /// <summary>
        /// Registers a character set ECI under its numeric value and under every one of the given encoding names.
        /// </summary>
        /// <param name="value">ECI value; used as the key into VALUE_TO_ECI</param>
        /// <param name="encodingNames">names mapped to the ECI in NAME_TO_ECI; the first entry is the name passed to the CharacterSetECI constructor</param>
        private static void addCharacterSet(int value, String[] encodingNames)
        {
            // Constructed directly (can't use valueOf); the first name is the one stored on the ECI itself.
            var characterSet = new CharacterSetECI(value, encodingNames[0]);

            VALUE_TO_ECI[value] = characterSet;
            for (int i = 0; i < encodingNames.Length; i++)
            {
                NAME_TO_ECI[encodingNames[i]] = characterSet;
            }
        }
Esempio n. 4
0
 /// <summary>
 /// Looks up the ECI that corresponds to a numeric ECI value.
 /// </summary>
 /// <param name="value_Renamed">ECI value; must be in the range 0..999999</param>
 /// <returns>for values below 900, the result of CharacterSetECI.getCharacterSetECIByValue;
 /// null for any other value in range (legal but unsupported)</returns>
 /// <exception cref="System.ArgumentException">if the ECI value is negative or greater than 999999</exception>
 public static ECI getECIByValue(int value_Renamed)
 {
     bool inRange = value_Renamed >= 0 && value_Renamed <= 999999;
     if (!inRange)
     {
         throw new System.ArgumentException("Bad ECI value: " + value_Renamed);
     }

     // Character set ECIs use 000000 - 000899; every other range is reported as unsupported.
     return value_Renamed < 900
         ? CharacterSetECI.getCharacterSetECIByValue(value_Renamed)
         : null;
 }
Esempio n. 5
0
        /// <summary>
        /// Initializes the platform default encoding and the cached encodings used when guessing charsets.
        /// </summary>
        static StringUtils()
        {
#if (NETFX_CORE || PORTABLE || NETSTANDARD)
            // On these targets UTF-8 is used as the platform default.
            PLATFORM_DEFAULT_ENCODING   = UTF8;
            PLATFORM_DEFAULT_ENCODING_T = Encoding.UTF8;
#else
            // Upper-case with the invariant culture: a culture-sensitive ToUpper() would turn 'i' into
            // the dotted capital 'İ' under a Turkish locale and corrupt names such as "windows-1254".
            PLATFORM_DEFAULT_ENCODING   = Encoding.Default.WebName.ToUpperInvariant();
            PLATFORM_DEFAULT_ENCODING_T = Encoding.Default;
#endif
            SHIFT_JIS_ENCODING = CharacterSetECI.getEncoding(SHIFT_JIS);
            GB2312_ENCODING    = CharacterSetECI.getEncoding(GB2312);
            EUC_JP_ENCODING    = CharacterSetECI.getEncoding(EUC_JP);
            ISO88591_ENCODING  = CharacterSetECI.getEncoding(ISO88591);

            // Shift_JIS is assumed when the platform default is itself Shift_JIS or EUC-JP.
            ASSUME_SHIFT_JIS   =
                PLATFORM_DEFAULT_ENCODING_T.Equals(SHIFT_JIS_ENCODING) ||
                PLATFORM_DEFAULT_ENCODING_T.Equals(EUC_JP_ENCODING);
        }
Esempio n. 6
0
      /// <summary>
      /// Appends an ECI header to the bit array: the ECI mode indicator followed by the ECI value.
      /// </summary>
      /// <param name="eci">character set ECI whose Value is written</param>
      /// <param name="bits">bit array that receives the header</param>
      private static void appendECI(CharacterSetECI eci, BitArray bits)
      {
         const int modeIndicatorBits = 4;
         const int eciValueBits = 8;

         bits.appendBits(Mode.ECI.Bits, modeIndicatorBits);

         // A single 8-bit field is correct for values up to 127, which is all we need now.
         bits.appendBits(eci.Value, eciValueBits);
      }
Esempio n. 7
0
 /// <summary>
 /// Registers a character set ECI under its numeric value and under each of the supplied encoding names.
 /// </summary>
 /// <param name="value">ECI value; used as the key into VALUE_TO_ECI</param>
 /// <param name="encodingNames">names mapped to the ECI in NAME_TO_ECI; the first entry is passed to the CharacterSetECI constructor</param>
 private static void addCharacterSet(int value, String[] encodingNames)
 {
    // Constructed directly (can't use valueOf) from the first name in the list.
    var characterSet = new CharacterSetECI(value, encodingNames[0]);
    VALUE_TO_ECI[value] = characterSet;

    for (int index = 0; index < encodingNames.Length; index++)
    {
       NAME_TO_ECI[encodingNames[index]] = characterSet;
    }
 }
Esempio n. 8
0
 /// <summary>
 /// Registers a character set ECI under its numeric value and under one encoding name.
 /// </summary>
 /// <param name="value">ECI value; used as the key into VALUE_TO_ECI</param>
 /// <param name="encodingName">encoding name; passed to the CharacterSetECI constructor and used as the key into NAME_TO_ECI</param>
 private static void addCharacterSet(int value, String encodingName)
 {
    // Constructed directly (can't use valueOf), then stored in both lookup tables.
    var characterSet = new CharacterSetECI(value, encodingName);
    VALUE_TO_ECI[value] = characterSet;
    NAME_TO_ECI[encodingName] = characterSet;
 }
		/// <summary>
		/// Reads <paramref name="count"/> bytes from the bit source, decodes them to text and appends the text to <paramref name="result"/>.
		/// </summary>
		/// <param name="bits">bit source the bytes are read from, 8 bits at a time</param>
		/// <param name="result">receives the decoded text</param>
		/// <param name="count">number of bytes to read</param>
		/// <param name="currentCharacterSetECI">ECI whose EncodingName selects the encoding; when null the encoding is guessed from the bytes</param>
		/// <param name="byteSegments">receives the raw bytes once they were decoded successfully</param>
		/// <param name="hints">decode hints, forwarded to StringUtils.guessEncoding</param>
		/// <returns>false if fewer than count * 8 bits are available or the bytes could not be decoded; true otherwise</returns>
		private static bool decodeByteSegment (BitSource bits,
		                                          StringBuilder result,
		                                          int count,
		                                          CharacterSetECI currentCharacterSetECI,
		                                          IList<byte[]> byteSegments,
		                                          IDictionary<DecodeHintType, object> hints)
		{
			// Don't crash trying to read more bits than we have available (8 bits per byte).
			int requiredBits = count << 3;
			if (requiredBits > bits.available ()) {
				return false;
			}

			byte[] segmentBytes = new byte[count];
			int index = 0;
			while (index < count) {
				segmentBytes [index] = (byte)bits.readBits (8);
				index++;
			}

			// The spec isn't clear on this mode; see section 6.4.5: it does not say
			// which encoding to assume upon decoding. ISO-8859-1 has been seen in use
			// as well as Shift_JIS -- without anything like an ECI designator to give
			// a hint. So without an ECI the encoding is guessed from the bytes.
			String encoding = currentCharacterSetECI == null
				? StringUtils.guessEncoding (segmentBytes, hints)
				: currentCharacterSetECI.EncodingName;

			try {
				Encoding decoder = Encoding.GetEncoding (encoding);
				result.Append (decoder.GetString (segmentBytes, 0, segmentBytes.Length));
			}
#if (WINDOWS_PHONE70 || WINDOWS_PHONE71 || SILVERLIGHT4 || SILVERLIGHT5 || NETFX_CORE || MONOANDROID || MONOTOUCH)
			catch (ArgumentException)
			{
				try
				{
					// Silverlight only supports a limited number of character sets, trying fallback to UTF-8
					Encoding fallback = Encoding.GetEncoding ("UTF-8");
					result.Append (fallback.GetString (segmentBytes, 0, segmentBytes.Length));
				}
				catch (Exception)
				{
					return false;
				}
			}
#endif
#if WindowsCE
			catch (PlatformNotSupportedException)
			{
				try
				{
					// WindowsCE doesn't support all encodings, and which ones depends on the device.
					// So a few different ones are tried here.
					Encoding fallback = encoding == "ISO-8859-1"
						? Encoding.GetEncoding (1252)
						: Encoding.GetEncoding ("UTF-8");
					result.Append (fallback.GetString (segmentBytes, 0, segmentBytes.Length));
				}
				catch (Exception)
				{
					return false;
				}
			}
#endif
			catch (Exception) {
				return false;
			}

			byteSegments.Add (segmentBytes);
			return true;
		}
Esempio n. 10
0
        /// <summary>
        /// Guesses the encoding of a byte sequence. A CHARACTER_SET hint that resolves to an encoding
        /// wins outright; otherwise a UTF-16 byte order mark is checked, and after that a single pass
        /// over the bytes gathers evidence for or against UTF-8, ISO-8859-1 and Shift_JIS.
        /// </summary>
        /// <param name="bytes">bytes encoding a string, whose encoding should be guessed</param>
        /// <param name="hints">decode hints if applicable; may be null</param>
        /// <returns>Encoding of the guessed charset; at the moment will only guess one of:
        ///  Shift_JIS, UTF-8, ISO-8859-1, UTF-16 (little or big endian, according to the
        ///  byte order mark), or the platform default encoding if
        ///  none of these can possibly be correct</returns>
        public static Encoding guessCharset(byte[] bytes, IDictionary <DecodeHintType, object> hints)
        {
            // An explicit character set hint is used whenever it resolves to an encoding.
            object characterSetHint;
            if (hints != null && hints.TryGetValue(DecodeHintType.CHARACTER_SET, out characterSetHint))
            {
                String characterSet = (String)characterSetHint;
                if (characterSet != null)
                {
                    var encoding = CharacterSetECI.getEncoding(characterSet);
                    if (encoding != null)
                    {
                        return(encoding);
                    }
                }
            }

            // First try UTF-16, assuming anything with its BOM is UTF-16.
            // Encoding.Unicode is little endian and does not inspect the BOM while decoding,
            // so the byte order has to be selected here from the mark itself.
            if (bytes.Length > 2)
            {
                if (bytes[0] == (byte)0xFE && bytes[1] == (byte)0xFF)
                {
                    return(Encoding.BigEndianUnicode);
                }
                if (bytes[0] == (byte)0xFF && bytes[1] == (byte)0xFE)
                {
                    return(Encoding.Unicode);
                }
            }

            // For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS,
            // which should be by far the most common encodings.
            int  length                       = bytes.Length;
            bool canBeISO88591                = true;
            bool canBeShiftJIS                = true;
            bool canBeUTF8                    = true;
            int  utf8BytesLeft                = 0;
            int  utf2BytesChars               = 0;
            int  utf3BytesChars               = 0;
            int  utf4BytesChars               = 0;
            int  sjisBytesLeft                = 0;
            int  sjisKatakanaChars            = 0;
            int  sjisCurKatakanaWordLength    = 0;
            int  sjisCurDoubleBytesWordLength = 0;
            int  sjisMaxKatakanaWordLength    = 0;
            int  sjisMaxDoubleBytesWordLength = 0;
            int  isoHighOther                 = 0;

            // UTF-8 byte order mark (EF BB BF) followed by at least one more byte
            bool utf8bom = bytes.Length > 3 &&
                           bytes[0] == 0xEF &&
                           bytes[1] == 0xBB &&
                           bytes[2] == 0xBF;

            // Single pass; stops early once all three candidates have been ruled out.
            for (int i = 0;
                 i < length && (canBeISO88591 || canBeShiftJIS || canBeUTF8);
                 i++)
            {
                int value = bytes[i] & 0xFF;

                // UTF-8 stuff
                if (canBeUTF8)
                {
                    if (utf8BytesLeft > 0)
                    {
                        // inside a multi-byte sequence: the high bit must be set
                        if ((value & 0x80) == 0)
                        {
                            canBeUTF8 = false;
                        }
                        else
                        {
                            utf8BytesLeft--;
                        }
                    }
                    else if ((value & 0x80) != 0)
                    {
                        // lead byte: the number of leading 1 bits gives the sequence length
                        if ((value & 0x40) == 0)
                        {
                            canBeUTF8 = false;
                        }
                        else
                        {
                            utf8BytesLeft++;
                            if ((value & 0x20) == 0)
                            {
                                utf2BytesChars++;
                            }
                            else
                            {
                                utf8BytesLeft++;
                                if ((value & 0x10) == 0)
                                {
                                    utf3BytesChars++;
                                }
                                else
                                {
                                    utf8BytesLeft++;
                                    if ((value & 0x08) == 0)
                                    {
                                        utf4BytesChars++;
                                    }
                                    else
                                    {
                                        canBeUTF8 = false;
                                    }
                                }
                            }
                        }
                    }
                }

                // ISO-8859-1 stuff
                if (canBeISO88591)
                {
                    if (value > 0x7F && value < 0xA0)
                    {
                        // 0x80..0x9F rules ISO-8859-1 out
                        canBeISO88591 = false;
                    }
                    else if (value > 0x9F)
                    {
                        // count high bytes below 0xC0 plus 0xD7 and 0xF7 ("upper" not-alphanumeric Latin1)
                        if (value < 0xC0 || value == 0xD7 || value == 0xF7)
                        {
                            isoHighOther++;
                        }
                    }
                }

                // Shift_JIS stuff
                if (canBeShiftJIS)
                {
                    if (sjisBytesLeft > 0)
                    {
                        // second byte of a double-byte character
                        if (value < 0x40 || value == 0x7F || value > 0xFC)
                        {
                            canBeShiftJIS = false;
                        }
                        else
                        {
                            sjisBytesLeft--;
                        }
                    }
                    else if (value == 0x80 || value == 0xA0 || value > 0xEF)
                    {
                        canBeShiftJIS = false;
                    }
                    else if (value > 0xA0 && value < 0xE0)
                    {
                        // single-byte katakana range: extend the current katakana run
                        sjisKatakanaChars++;
                        sjisCurDoubleBytesWordLength = 0;
                        sjisCurKatakanaWordLength++;
                        if (sjisCurKatakanaWordLength > sjisMaxKatakanaWordLength)
                        {
                            sjisMaxKatakanaWordLength = sjisCurKatakanaWordLength;
                        }
                    }
                    else if (value > 0x7F)
                    {
                        // first byte of a double-byte character: extend the current double-byte run
                        sjisBytesLeft++;
                        sjisCurKatakanaWordLength = 0;
                        sjisCurDoubleBytesWordLength++;
                        if (sjisCurDoubleBytesWordLength > sjisMaxDoubleBytesWordLength)
                        {
                            sjisMaxDoubleBytesWordLength = sjisCurDoubleBytesWordLength;
                        }
                    }
                    else
                    {
                        // low (7-bit) byte: ends both runs
                        sjisCurKatakanaWordLength    = 0;
                        sjisCurDoubleBytesWordLength = 0;
                    }
                }
            }

            // A multi-byte sequence that is still open at the end of the input is invalid.
            if (canBeUTF8 && utf8BytesLeft > 0)
            {
                canBeUTF8 = false;
            }
            if (canBeShiftJIS && sjisBytesLeft > 0)
            {
                canBeShiftJIS = false;
            }

            // Easy -- if there is BOM or at least 1 valid not-single byte character (and no evidence it can't be UTF-8), done
            if (canBeUTF8 && (utf8bom || utf2BytesChars + utf3BytesChars + utf4BytesChars > 0))
            {
                return(Encoding.UTF8);
            }
            // Easy -- if assuming Shift_JIS or >= 3 valid consecutive not-ascii characters (and no evidence it can't be), done
            if (canBeShiftJIS && (ASSUME_SHIFT_JIS || sjisMaxKatakanaWordLength >= 3 || sjisMaxDoubleBytesWordLength >= 3))
            {
                return(SHIFT_JIS_ENCODING);
            }
            // Distinguishing Shift_JIS and ISO-8859-1 can be a little tough for short words. The crude heuristic is:
            // - If we saw
            //   - only two consecutive katakana chars in the whole text, or
            //   - at least 10% of bytes that could be "upper" not-alphanumeric Latin1,
            // - then we conclude Shift_JIS, else ISO-8859-1
            if (canBeISO88591 && canBeShiftJIS)
            {
                return((sjisMaxKatakanaWordLength == 2 && sjisKatakanaChars == 2) || isoHighOther * 10 >= length
                    ? SHIFT_JIS_ENCODING : ISO88591_ENCODING);
            }

            // Otherwise, try in order ISO-8859-1, Shift JIS, UTF-8 and fall back to default platform encoding
            if (canBeISO88591)
            {
                return(ISO88591_ENCODING);
            }
            if (canBeShiftJIS)
            {
                return(SHIFT_JIS_ENCODING);
            }
            if (canBeUTF8)
            {
                return(Encoding.UTF8);
            }
            // Otherwise, we take a wild guess with platform encoding
            return(PLATFORM_DEFAULT_ENCODING_T);
        }