コード例 #1
0
ファイル: ISCIIEncoding.cs プロジェクト: noahfalk/corefx
        [System.Security.SecurityCritical]  // auto-generated
        public override unsafe int GetChars(byte* bytes, int byteCount,
                                                char* chars, int charCount, DecoderNLS baseDecoder)
        {
            // Just need to ASSERT, this is called by something else internal that checked parameters already
            // Allow null chars for counting
            Debug.Assert(bytes != null, "[ISCIIEncoding.GetChars]bytes is null");
            Debug.Assert(byteCount >= 0, "[ISCIIEncoding.GetChars]byteCount is negative");
            //            Debug.Assert(chars != null, "[ISCIIEncoding.GetChars]chars is null");
            Debug.Assert(charCount >= 0, "[ISCIIEncoding.GetChars]charCount is negative");

            // Need the ISCII Decoder
            ISCIIDecoder decoder = (ISCIIDecoder)baseDecoder;

            // Get our info.
            EncodingCharBuffer buffer = new EncodingCharBuffer(this, decoder, chars, charCount, bytes, byteCount);

            int currentCodePage = _defaultCodePage;
            bool bLastATR = false;
            bool bLastVirama = false;
            bool bLastDevenagariStressAbbr = false;
            char cLastCharForNextNukta = '\0';
            char cLastCharForNoNextNukta = '\0';

            // See if there's anything in our decoder
            if (decoder != null)
            {
                currentCodePage = decoder.currentCodePage;
                bLastATR = decoder.bLastATR;
                bLastVirama = decoder.bLastVirama;
                bLastDevenagariStressAbbr = decoder.bLastDevenagariStressAbbr;
                cLastCharForNextNukta = decoder.cLastCharForNextNukta;
                cLastCharForNoNextNukta = decoder.cLastCharForNoNextNukta;
            }

            bool bLastSpecial = bLastVirama | bLastATR | bLastDevenagariStressAbbr |
                (cLastCharForNextNukta != '\0');

            // Get our current code page index (some code pages are dups)
            int currentCodePageIndex = -1;
            Debug.Assert(currentCodePage >= CodeDevanagari && currentCodePage <= CodePunjabi,
                "[ISCIIEncoding.GetChars]Decoder code page must be >= Devanagari and <= Punjabi, not " + currentCodePage);

            if (currentCodePage >= CodeDevanagari && currentCodePage <= CodePunjabi)
            {
                currentCodePageIndex = s_IndicMappingIndex[currentCodePage];
            }

            // Loop through our input
            while (buffer.MoreData)
            {
                byte b = buffer.GetNextByte();

                // See if last one was special
                if (bLastSpecial)
                {
                    // Now it won't be
                    bLastSpecial = false;

                    // One and only one of our flags should be set
                    Debug.Assert(((bLastVirama ? 1 : 0) + (bLastATR ? 1 : 0) +
                               (bLastDevenagariStressAbbr ? 1 : 0) +
                               ((cLastCharForNextNukta > 0) ? 1 : 0)) == 1,
                        String.Format(CultureInfo.InvariantCulture,
                            "[ISCIIEncoding.GetChars]Special cases require 1 and only 1 special case flag: LastATR {0} Dev. {1} Nukta {2}",
                            bLastATR, bLastDevenagariStressAbbr, cLastCharForNextNukta));
                    // If the last one was an ATR, then we'll have to do ATR stuff
                    if (bLastATR)
                    {
                        // We only support Devanagari - Punjabi
                        if (b >= (0x40 | CodeDevanagari) && b <= (0x40 | CodePunjabi))
                        {
                            // Remember the code page
                            currentCodePage = b & 0xf;
                            currentCodePageIndex = s_IndicMappingIndex[currentCodePage];
                            // No longer last ATR
                            bLastATR = false;
                            continue;
                        }

                        // Change back to default?
                        if (b == 0x40)
                        {
                            currentCodePage = _defaultCodePage;
                            currentCodePageIndex = -1;

                            if (currentCodePage >= CodeDevanagari && currentCodePage <= CodePunjabi)
                            {
                                currentCodePageIndex = s_IndicMappingIndex[currentCodePage];
                            }
                            // No longer last ATR
                            bLastATR = false;
                            continue;
                        }

                        // We don't support Roman
                        if (b == 0x41)
                        {
                            currentCodePage = _defaultCodePage;
                            currentCodePageIndex = -1;

                            if (currentCodePage >= CodeDevanagari && currentCodePage <= CodePunjabi)
                            {
                                currentCodePageIndex = s_IndicMappingIndex[currentCodePage];
                            }

                            // Even though we don't know how to support Roman, windows didn't add a ? so we don't either.
                            // No longer last ATR
                            bLastATR = false;
                            continue;
                        }

                        // Other code pages & ATR codes not supported, fallback the ATR
                        // If fails, decrements the buffer, which is OK, we remember ATR state.
                        if (!buffer.Fallback(ControlATR))
                            break;

                        // No longer last ATR (fell back)
                        bLastATR = false;

                        // we know we can't have any of these other modes
                        Debug.Assert(bLastVirama == false, "[ISCIIEncoding.GetChars] Expected no bLastVirama in bLastATR mode");
                        Debug.Assert(bLastDevenagariStressAbbr == false, "[ISCIIEncoding.GetChars] Expected no bLastDevenagariStressAbbr in bLastATR mode");
                        Debug.Assert(cLastCharForNextNukta == (char)0, "[ISCIIEncoding.GetChars] Expected no cLastCharForNextNukta in bLastATR mode");
                        Debug.Assert(cLastCharForNoNextNukta == (char)0, "[ISCIIEncoding.GetChars] Expected no cLastCharForNoNextNukta in bLastATR mode");
                        // Keep processing this byte
                    }
                    else if (bLastVirama)
                    {
                        // If last was Virama, then we might need ZWNJ or ZWJ instead
                        if (b == Virama)
                        {
                            // If no room, then stop
                            if (!buffer.AddChar(ZWNJ))
                                break;
                            bLastVirama = false;
                            continue;
                        }
                        if (b == Nukta)
                        {
                            // If no room, then stop
                            if (!buffer.AddChar(ZWJ))
                                break;
                            bLastVirama = false;
                            continue;
                        }

                        // No longer in this mode, fall through to handle character
                        // (Virama itself was added when flag was set last iteration)
                        bLastVirama = false;

                        // We know we can't have any of these other modes
                        Debug.Assert(bLastATR == false, "[ISCIIEncoding.GetChars] Expected no bLastATR in bLastVirama mode");
                        Debug.Assert(bLastDevenagariStressAbbr == false, "[ISCIIEncoding.GetChars] Expected no bLastDevenagariStressAbbr in bLastVirama mode");
                        Debug.Assert(cLastCharForNextNukta == (char)0, "[ISCIIEncoding.GetChars] Expected no cLastCharForNextNukta in bLastVirama mode");
                        Debug.Assert(cLastCharForNoNextNukta == (char)0, "[ISCIIEncoding.GetChars] Expected no cLastCharForNoNextNukta in bLastVirama mode");
                    }
                    else if (bLastDevenagariStressAbbr)
                    {
                        // Last byte was an 0xf0 (ext).
                        // If current is b8 or bf, then we have 952 or 970.  Otherwise fallback
                        if (b == 0xb8)
                        {
                            // It was a 0xb8
                            if (!buffer.AddChar('\x0952'))         // Devanagari stress sign anudatta
                                break;
                            bLastDevenagariStressAbbr = false;
                            continue;
                        }

                        if (b == 0xbf)
                        {
                            // It was a 0xbf
                            if (!buffer.AddChar('\x0970'))         // Devanagari abbr. sign
                                break;
                            bLastDevenagariStressAbbr = false;
                            continue;
                        }

                        // Wasn't an expected pattern, do fallback for f0 (ext)
                        // if fails, fallback will back up our buffer
                        if (!buffer.Fallback(DevenagariExt))
                            break;

                        // Keep processing this byte (turn off mode)
                        // (last character was added when mode was set)
                        bLastDevenagariStressAbbr = false;

                        Debug.Assert(bLastATR == false, "[ISCIIEncoding.GetChars] Expected no bLastATR in bLastDevenagariStressAbbr mode");
                        Debug.Assert(bLastVirama == false, "[ISCIIEncoding.GetChars] Expected no bLastVirama in bLastDevenagariStressAbbr mode");
                        Debug.Assert(cLastCharForNextNukta == (char)0, "[ISCIIEncoding.GetChars] Expected no cLastCharForNextNukta in bLastDevenagariStressAbbr mode");
                        Debug.Assert(cLastCharForNoNextNukta == (char)0, "[ISCIIEncoding.GetChars] Expected no cLastCharForNoNextNukta in bLastDevenagariStressAbbr mode");
                    }
                    else
                    {
                        // We were checking for next char being a nukta
                        Debug.Assert(cLastCharForNextNukta > 0 && cLastCharForNoNextNukta > 0,
                            "[ISCIIEncoding.GetChars]No other special case found, but cLastCharFor(No)NextNukta variable(s) aren't set.");

                        // We'll either add combined char or last char
                        if (b == Nukta)
                        {
                            // We combine nukta with previous char
                            if (!buffer.AddChar(cLastCharForNextNukta))
                                break;

                            // Done already
                            cLastCharForNextNukta = cLastCharForNoNextNukta = '\0';
                            continue;
                        }

                        // No Nukta, just add last character and keep processing current byte
                        if (!buffer.AddChar(cLastCharForNoNextNukta))
                            break;

                        // Keep processing this byte, turn off mode.
                        cLastCharForNextNukta = cLastCharForNoNextNukta = '\0';

                        Debug.Assert(bLastATR == false, "[ISCIIEncoding.GetChars] Expected no bLastATR in cLastCharForNextNukta mode");
                        Debug.Assert(bLastVirama == false, "[ISCIIEncoding.GetChars] Expected no bLastVirama in cLastCharForNextNukta mode");
                        Debug.Assert(bLastDevenagariStressAbbr == false, "[ISCIIEncoding.GetChars] Expected no bLastDevenagariStressAbbr in cLastCharForNextNukta mode");
                    }
                }

                // Now bLastSpecial should be false and all flags false.
                Debug.Assert(!bLastSpecial && !bLastDevenagariStressAbbr && !bLastVirama && !bLastATR &&
                          cLastCharForNextNukta == '\0',
                          "[ISCIIEncoding.GetChars]No special state for last code point should exist at this point.");

                // If its a simple byte, just add it
                if (b < MultiByteBegin)
                {
                    if (!buffer.AddChar((char)b))
                        break;
                    continue;
                }

                // See if its an ATR marker
                if (b == ControlATR)
                {
                    bLastATR = bLastSpecial = true;
                    continue;
                }

                Debug.Assert(currentCodePageIndex != -1, "[ISCIIEncoding.GetChars]Expected valid currentCodePageIndex != -1");
                char ch = s_IndicMapping[currentCodePageIndex, 0, b - MultiByteBegin];
                char cAlt = s_IndicMapping[currentCodePageIndex, 1, b - MultiByteBegin];

                // If no 2nd char, just add it, also lonely Nuktas get added as well.
                if (cAlt == 0 || b == Nukta)
                {
                    // If it was an unknown character do fallback

                    // ? if not known.
                    if (ch == 0)
                    {
                        // Fallback the unknown byte
                        if (!buffer.Fallback(b))
                            break;
                    }
                    else
                    {
                        // Add the known character
                        if (!buffer.AddChar(ch))
                            break;
                    }
                    continue;
                }

                // if b == Virama set last Virama so we can do ZWJ or ZWNJ next time if needed.
                if (b == Virama)
                {
                    // Add Virama
                    if (!buffer.AddChar(ch))
                        break;
                    bLastVirama = bLastSpecial = true;
                    continue;
                }

                // See if its one that changes with a Nukta
                if ((cAlt & 0xF000) == 0)
                {
                    // It could change if next char is a nukta
                    bLastSpecial = true;
                    cLastCharForNextNukta = cAlt;
                    cLastCharForNoNextNukta = ch;
                    continue;
                }

                // We must be the Devenagari special case for F0, B8 & F0, BF
                Debug.Assert(currentCodePage == CodeDevanagari && b == DevenagariExt,
                    String.Format(CultureInfo.InvariantCulture,
                        "[ISCIIEncoding.GetChars] Devenagari special case must {0} not {1} or in Devanagari code page {2} not {3}.",
                        DevenagariExt, b, CodeDevanagari, currentCodePage));
                bLastDevenagariStressAbbr = bLastSpecial = true;
            }

            // If we don't have a decoder, or if we had to flush, then we need to get rid
            // of last ATR, LastNoNextNukta and LastDevenagariExt.
            if (decoder == null || decoder.MustFlush)
            {
                // If these fail (because of Convert with insufficient buffer), then they'll turn off MustFlush as well.
                if (bLastATR)
                {
                    // Have to add ATR fallback
                    if (buffer.Fallback(ControlATR))
                        bLastATR = false;
                    else
                        // If not successful, convert will maintain state for next time, also
                        // AddChar will have decremented our byte count, however we need it to remain the same
                        buffer.GetNextByte();
                }
                else if (bLastDevenagariStressAbbr)
                {
                    // Have to do fallback for DevenagariExt
                    if (buffer.Fallback(DevenagariExt))
                        bLastDevenagariStressAbbr = false;
                    else
                        // If not successful, convert will maintain state for next time, also
                        // AddChar will have decremented our byte count, however we need it to remain the same
                        buffer.GetNextByte();
                }
                else if (cLastCharForNoNextNukta != '\0')
                {
                    // Have to add our last char because there was no next nukta
                    if (buffer.AddChar(cLastCharForNoNextNukta))
                        cLastCharForNoNextNukta = cLastCharForNextNukta = '\0';
                    else
                        // If not successful, convert will maintain state for next time, also
                        // AddChar will have decremented our byte count, however we need it to remain the same
                        buffer.GetNextByte();
                }
                // LastVirama is unimportant for flushing decoder.
            }

            // Remember any left over stuff
            // (only remember if we aren't counting)
            if (decoder != null && chars != null)
            {
                // If not flushing or have state (from convert) then need to remember state
                if (!decoder.MustFlush ||
                    cLastCharForNoNextNukta != '\0' || bLastATR || bLastDevenagariStressAbbr)
                {
                    // Either not flushing or had state (from convert)
                    Debug.Assert(!decoder.MustFlush || !decoder.m_throwOnOverflow,
                        "[ISCIIEncoding.GetChars]Expected no state or not converting or not flushing");
                    decoder.currentCodePage = currentCodePage;
                    decoder.bLastVirama = bLastVirama;
                    decoder.bLastATR = bLastATR;
                    decoder.bLastDevenagariStressAbbr = bLastDevenagariStressAbbr;
                    decoder.cLastCharForNextNukta = cLastCharForNextNukta;
                    decoder.cLastCharForNoNextNukta = cLastCharForNoNextNukta;
                }
                else
                {
                    decoder.currentCodePage = _defaultCodePage;
                    decoder.bLastVirama = false;
                    decoder.bLastATR = false;
                    decoder.bLastDevenagariStressAbbr = false;
                    decoder.cLastCharForNextNukta = '\0';
                    decoder.cLastCharForNoNextNukta = '\0';
                }
                decoder.m_bytesUsed = buffer.BytesUsed;
            }
            // Otherwise we already did fallback and added extra things

            // Return the # of characters we found
            return buffer.Count;
        }
コード例 #2
0
ファイル: GB18030Encoding.cs プロジェクト: noahfalk/corefx
        [System.Security.SecurityCritical]  // auto-generated
        public override unsafe int GetChars(byte* bytes, int byteCount,
                                                char* chars, int charCount, DecoderNLS baseDecoder)
        {
            // Just need to ASSERT, this is called by something else internal that checked parameters already
            // We'll allow null chars as a count
            Debug.Assert(bytes != null, "[GB18030Encoding.GetChars]bytes is null");
            Debug.Assert(byteCount >= 0, "[GB18030Encoding.GetChars]byteCount is negative");
            //            Debug.Assert(chars != null, "[GB18030Encoding.GetChars]chars is null");
            Debug.Assert(charCount >= 0, "[GB18030Encoding.GetChars]charCount is negative");

            // Fix our decoder
            GB18030Decoder decoder = (GB18030Decoder)baseDecoder;

            // Get our info.
            EncodingCharBuffer buffer = new EncodingCharBuffer(this, decoder, chars, charCount, bytes, byteCount);

            // Need temp bytes because we can't muss up decoder
            short byte1 = -1;
            short byte2 = -1;
            short byte3 = -1;
            short byte4 = -1;

            // See if there was anything to get out of the decoder
            if (decoder != null && decoder.bLeftOver1 != -1)
            {
                // Need temp bytes because we can't muss up decoder
                byte1 = decoder.bLeftOver1;
                byte2 = decoder.bLeftOver2;
                byte3 = decoder.bLeftOver3;
                byte4 = decoder.bLeftOver4;

                // Loop because we might have too many in buffer
                // This could happen if we are working on a 4 byte sequence, but it isn't valid.
                while (byte1 != -1)
                {
                    // If its not a lead byte, use ? or its value, then scoot them down & try again
                    // This could happen if we previously had a bad 4 byte sequence and this is a trail byte
                    if (!IsGBLeadByte(byte1))
                    {
                        // This is either a ? or ASCII, need 1 char output
                        if (byte1 <= 0x7f)
                        {
                            if (!buffer.AddChar((char)byte1))      // Its ASCII
                                break;
                        }
                        else
                        {
                            if (!buffer.Fallback((byte)byte1))     // Not a valid byte
                                break;
                        }

                        byte1 = byte2;
                        byte2 = byte3;
                        byte3 = byte4;
                        byte4 = -1;
                        continue;
                    }

                    // Read in more bytes as needed
                    while (byte2 == -1 ||
                           (IsGBFourByteTrailing(byte2) && byte4 == -1))
                    {
                        // Do we have room?
                        if (!buffer.MoreData)
                        {
                            // No input left to read, do we have to flush?
                            if (!decoder.MustFlush)
                            {
                                // Don't stick stuff in decoder when counting
                                if (chars != null)
                                {
                                    // Don't have to flush, won't have any chars
                                    // Decoder is correct, just return
                                    decoder.bLeftOver1 = byte1;
                                    decoder.bLeftOver2 = byte2;
                                    decoder.bLeftOver3 = byte3;
                                    decoder.bLeftOver4 = byte4;
                                }

                                decoder.m_bytesUsed = buffer.BytesUsed;
                                return buffer.Count;
                            }

                            // We'll have to flush, add a ? and scoot them down to try again
                            // We could be trying for a 4 byte sequence but byte 3 could be ascii and should be spit out
                            // Breaking will do this because we have zeros
                            break;
                        }

                        // Read them in
                        if (byte2 == -1) byte2 = buffer.GetNextByte();
                        else if (byte3 == -1) byte3 = buffer.GetNextByte();
                        else byte4 = buffer.GetNextByte();
                    }

                    // Now we have our 2 or 4 bytes
                    if (IsGBTwoByteTrailing(byte2))
                    {
                        //
                        // The trailing byte is a GB18030 two-byte sequence trailing byte.
                        //
                        int iTwoBytes = byte1 << 8;
                        iTwoBytes |= unchecked((byte)byte2);
                        if (!buffer.AddChar(mapBytesToUnicode[iTwoBytes], 2))
                            break;

                        // We're done with it
                        byte1 = -1;
                        byte2 = -1;
                    }
                    else if (IsGBFourByteTrailing(byte2) &&
                             IsGBLeadByte(byte3) &&
                             IsGBFourByteTrailing(byte4))
                    {
                        //
                        // Four-byte GB18030
                        //

                        int sFourBytesOffset = GetFourBytesOffset(
                            byte1, byte2, byte3, byte4);

                        // What kind is it?
                        if (sFourBytesOffset <= GBLast4ByteCode)
                        {
                            //
                            // The Unicode will be in the BMP range.
                            //
                            if (!buffer.AddChar(map4BytesToUnicode[sFourBytesOffset], 4))
                                break;
                        }
                        else if (sFourBytesOffset >= GBSurrogateOffset &&
                                 sFourBytesOffset <= GBLastSurrogateOffset)
                        {
                            //
                            // This will be converted to a surrogate pair, need another char
                            //

                            // Use our surrogate
                            sFourBytesOffset -= GBSurrogateOffset;
                            if (!buffer.AddChar(unchecked((char)(0xd800 + (sFourBytesOffset / 0x400))),
                                                unchecked((char)(0xdc00 + (sFourBytesOffset % 0x400))), 4))
                                break;
                        }
                        else
                        {
                            // Real GB18030 codepoint, but can't be mapped to unicode
                            // We already checked our buffer space.
                            // Do fallback here if we implement decoderfallbacks.
                            if (!buffer.Fallback((byte)byte1, (byte)byte2, (byte)byte3, (byte)byte4))
                                break;
                        }

                        // We're done with this one
                        byte1 = -1;
                        byte2 = -1;
                        byte3 = -1;
                        byte4 = -1;
                    }
                    else
                    {
                        // Not a valid sequence, use '?' for 1st byte & scoot them all down 1
                        if (!buffer.Fallback((byte)byte1))
                            break;

                        // Move all bytes down 1
                        byte1 = byte2;
                        byte2 = byte3;
                        byte3 = byte4;
                        byte4 = -1;
                    }
                }
            }

            // Loop, just do '?' replacement because we don't have fallbacks for decodings.
            while (buffer.MoreData)
            {
                byte ch = buffer.GetNextByte();

                // ASCII case is easy
                if (ch <= 0x7f)
                {
                    // ASCII, have room?
                    if (!buffer.AddChar((char)ch))
                        break;              // No room in convert buffer, so stop
                }
                // See if its a lead byte
                else if (IsGBLeadByte(ch))
                {
                    // ch is a lead byte, have room for more?
                    if (buffer.MoreData)
                    {
                        byte ch2 = buffer.GetNextByte();
                        if (IsGBTwoByteTrailing(ch2))
                        {
                            //
                            // The trailing byte is a GB18030 two-byte sequence trailing byte.
                            //

                            //
                            // Two-byte GB18030
                            //
                            int iTwoBytes = ch << 8;
                            iTwoBytes |= ch2;
                            if (!buffer.AddChar(mapBytesToUnicode[iTwoBytes], 2))
                                break;
                        }
                        else if (IsGBFourByteTrailing(ch2))
                        {
                            // Do we have room for Four Byte Sequence? (already have 1 byte)
                            if (buffer.EvenMoreData(2))
                            {
                                // Is it a valid 4 byte sequence?
                                byte ch3 = buffer.GetNextByte();
                                byte ch4 = buffer.GetNextByte();
                                if (IsGBLeadByte(ch3) &&
                                    IsGBFourByteTrailing(ch4))
                                {
                                    //
                                    // Four-byte GB18030
                                    //
                                    int sFourBytesOffset = GetFourBytesOffset(ch, ch2, ch3, ch4);

                                    // What kind is it?
                                    // We'll be at least 1 BMP char or a '?' char.

                                    if (sFourBytesOffset <= GBLast4ByteCode)
                                    {
                                        //
                                        // The Unicode will be in the BMP range.
                                        //
                                        if (!buffer.AddChar(map4BytesToUnicode[sFourBytesOffset], 4))
                                            break;
                                    }
                                    else if (sFourBytesOffset >= GBSurrogateOffset &&
                                             sFourBytesOffset <= GBLastSurrogateOffset)
                                    {
                                        //
                                        // This will be converted to a surrogate pair, need another char
                                        //

                                        // Use our surrogate
                                        sFourBytesOffset -= GBSurrogateOffset;
                                        if (!buffer.AddChar(unchecked((char)(0xd800 + (sFourBytesOffset / 0x400))),
                                                             unchecked((char)(0xdc00 + (sFourBytesOffset % 0x400))), 4))
                                            break;
                                    }
                                    else
                                    {
                                        // Real GB18030 codepoint, but can't be mapped to unicode
                                        if (!buffer.Fallback(ch, ch2, ch3, ch4))
                                            break;
                                    }
                                }
                                else
                                {
                                    // Not a valid 2 or 4 byte sequence, use '?' for ch and try other 3 again
                                    buffer.AdjustBytes(-3);
                                    if (!buffer.Fallback(ch))
                                        break;
                                }
                            }
                            else
                            {
                                // No room for 4 bytes, have 2 already, may be one more
                                // Lead byte but no place to stick it
                                if (decoder != null && !decoder.MustFlush)
                                {
                                    // (make sure not to set decoder if counting, so check chars)
                                    if (chars != null)
                                    {
                                        // We'll be able to stick the remainder in the decoder
                                        byte1 = ch;
                                        byte2 = ch2;

                                        if (buffer.MoreData)
                                            byte3 = buffer.GetNextByte();
                                        else
                                            byte3 = -1;

                                        byte4 = -1;
                                    }
                                    break;
                                }

                                // Won't go in decoder, we'll use '?' for it.
                                if (!buffer.Fallback(ch, ch2))
                                    break;
                            }
                        }
                        else
                        {
                            // Unknown byte sequence, fall back lead byte and try 2nd one again
                            buffer.AdjustBytes(-1);
                            if (!buffer.Fallback(ch))
                                break;
                        }
                    }
                    else
                    {
                        // Lead byte but don't know about trail byte
                        // (make sure not to set decoder if counting, so check bytes)
                        if (decoder != null && !decoder.MustFlush)
                        {
                            // We'll be able to stick it in the decoder
                            // (don't actually do it when counting though)
                            if (chars != null)
                            {
                                byte1 = ch;
                                byte2 = -1;
                                byte3 = -1;
                                byte4 = -1;
                            }
                            break;
                        }

                        if (!buffer.Fallback(ch))
                            break;
                    }
                }
                else
                {
                    // Not ASCII and not a lead byte, we'll use '?' for it if we have room
                    if (!buffer.Fallback(ch))
                        break;
                }
            }

            // Need to flush the decoder if necessary
            // (make sure not to set decoder if counting, so check bytes)
            if (decoder != null)
            {
                if (chars != null)
                {
                    decoder.bLeftOver1 = byte1;
                    decoder.bLeftOver2 = byte2;
                    decoder.bLeftOver3 = byte3;
                    decoder.bLeftOver4 = byte4;
                }
                decoder.m_bytesUsed = buffer.BytesUsed;
            }

            // Return the # of characters we found
            return buffer.Count;
        }
コード例 #3
0
        [System.Security.SecurityCritical]  // auto-generated
        private unsafe int GetCharsCP52936(byte* bytes, int byteCount,
                                                char* chars, int charCount, ISO2022Decoder decoder)
        {
            Debug.Assert(byteCount >= 0, "[ISO2022Encoding.GetCharsCP52936]count >=0");
            Debug.Assert(bytes != null, "[ISO2022Encoding.GetCharsCP52936]bytes!=null");

            // Get our info.
            EncodingCharBuffer buffer = new EncodingCharBuffer(this, decoder, chars, charCount, bytes, byteCount);

            // No mode information yet
            ISO2022Modes currentMode = ISO2022Modes.ModeASCII;
            int byteLeftOver = -1;
            bool bUsedDecoder = false;

            if (decoder != null)
            {
                currentMode = decoder.currentMode;
                // See if we have leftover decoder buffer to use
                // Don't want to mess up decoder if we're counting or throw an exception
                if (decoder.bytesLeftOverCount != 0)
                {
                    // Load our bytesLeftOver
                    byteLeftOver = decoder.bytesLeftOver[0];
                }
            }

            // Do this until the end, just do '?' replacement because we don't have fallbacks for decodings.
            while (buffer.MoreData || byteLeftOver >= 0)
            {
                byte ch;

                // May have a left over byte
                if (byteLeftOver >= 0)
                {
                    ch = (byte)byteLeftOver;
                    byteLeftOver = -1;
                }
                else
                {
                    ch = buffer.GetNextByte();
                }

                // We're in escape mode
                if (ch == '~')
                {
                    // Next char is type of switch
                    if (!buffer.MoreData)
                    {
                        // We don't have anything left, it'll be in decoder or a ?
                        // don't fail if we are allowing overflows
                        if (decoder == null || decoder.MustFlush)
                        {
                            // We'll be a '?'
                            buffer.Fallback(ch);
                            // break if we fail & break if we don't (because !MoreData)
                            // Add succeeded, continue
                            break;
                        }

                        // Stick it in decoder
                        if (decoder != null)
                            decoder.ClearMustFlush();

                        if (chars != null)
                        {
                            decoder.bytesLeftOverCount = 1;
                            decoder.bytesLeftOver[0] = (byte)'~';
                            bUsedDecoder = true;
                        }
                        break;
                    }

                    // What type is it?, get 2nd byte
                    ch = buffer.GetNextByte();

                    if (ch == '~' && currentMode == ISO2022Modes.ModeASCII)
                    {
                        // Its just a ~~ replacement for ~, add it
                        if (!buffer.AddChar((char)ch, 2))
                            // Add failed, break for converting
                            break;

                        // Add succeeded, continue
                        continue;
                    }
                    else if (ch == '{')
                    {
                        // Switching to Double Byte mode
                        currentMode = ISO2022Modes.ModeHZ;
                        continue;
                    }
                    else if (ch == '}')
                    {
                        // Switching to ASCII mode
                        currentMode = ISO2022Modes.ModeASCII;
                        continue;
                    }
                    else if (ch == '\n')
                    {
                        // Ignore ~\n sequence
                        continue;
                    }
                    else
                    {
                        // Unknown escape, back up and try the '~' as a "normal" byte or lead byte
                        buffer.AdjustBytes(-1);
                        ch = (byte)'~';
                    }
                }

                // go ahead and add our data
                if (currentMode != ISO2022Modes.ModeASCII)
                {
                    // Should be ModeHZ
                    Debug.Assert(currentMode == ISO2022Modes.ModeHZ, "[ISO2022Encoding.GetCharsCP52936]Expected ModeHZ");
                    char cm;

                    // Everett allowed characters < 0x20 to be passed as if they were ASCII
                    if (ch < 0x20)
                    {
                        // Emit it as ASCII
                        goto STOREASCII;
                    }

                    // Its multibyte, should have another byte
                    if (!buffer.MoreData)
                    {
                        // No bytes left
                        // don't fail if we are allowing overflows
                        if (decoder == null || decoder.MustFlush)
                        {
                            // Not enough bytes, fallback lead byte
                            buffer.Fallback(ch);

                            // Break if we fail & break because !MoreData
                            break;
                        }

                        if (decoder != null)
                            decoder.ClearMustFlush();

                        // Stick it in decoder
                        if (chars != null)
                        {
                            decoder.bytesLeftOverCount = 1;
                            decoder.bytesLeftOver[0] = ch;
                            bUsedDecoder = true;
                        }
                        break;
                    }

                    // Everett uses space as an escape character for single SBCS bytes
                    byte ch2 = buffer.GetNextByte();
                    ushort iBytes = (ushort)(ch << 8 | ch2);

                    if (ch == ' ' && ch2 != 0)
                    {
                        // Get next char and treat it like ASCII (Everett treated space like an escape
                        // allowing the next char to be just ascii)
                        cm = (char)ch2;
                        goto STOREMULTIBYTE;
                    }

                    // Bytes should be in range: lead byte 0x21-0x77, trail byte: 0x21 - 0x7e
                    if ((ch < 0x21 || ch > 0x77 || ch2 < 0x21 || ch2 > 0x7e) &&
                    // Everett allowed high bit mappings for same characters (but only if both bits set)
                        (ch < 0xa1 || ch > 0xf7 || ch2 < 0xa1 || ch2 > 0xfe))
                    {
                        // For some reason Everett allowed XX20 to become unicode 3000... (ideo sp)
                        if (ch2 == 0x20 && 0x21 <= ch && ch <= 0x7d)
                        {
                            iBytes = 0x2121;
                            goto MULTIBYTE;
                        }

                        // Illegal char, use fallback.  If lead byte is 0 have to do it special and do it first
                        if (!buffer.Fallback((byte)(iBytes >> 8), (byte)(iBytes)))
                            break;
                        continue;
                    }

                MULTIBYTE:
                    iBytes |= 0x8080;
                    // Look up the multibyte char to stick it in our data

                    // We have a iBytes to try to convert.
                    cm = mapBytesToUnicode[iBytes];

                STOREMULTIBYTE:

                    // See if it was unknown
                    if (cm == UNKNOWN_CHAR_FLAG && iBytes != 0)
                    {
                        // Fall back the unknown stuff
                        if (!buffer.Fallback((byte)(iBytes >> 8), (byte)(iBytes)))
                            break;
                        continue;
                    }

                    if (!buffer.AddChar(cm, 2))
                        break;              // convert ran out of buffer, stop
                    continue;
                }

            // Just ASCII
            // We allow some chars > 7f because everett did, so we have to look them up.
            STOREASCII:
                char c = mapBytesToUnicode[ch];

                // Check if it was unknown
                if ((c == UNKNOWN_CHAR_FLAG || c == 0) && (ch != 0))
                {
                    // fallback the unkown bytes
                    if (!buffer.Fallback((byte)ch))
                        break;
                    continue;
                }

                // Go ahead and add our ASCII character
                if (!buffer.AddChar(c))
                    break;                  // convert ran out of buffer, stop
            }

            // Need to remember our state, IF we're not counting
            if (chars != null && decoder != null)
            {
                if (!bUsedDecoder)
                {
                    // If we didn't use it, clear the byte left over
                    decoder.bytesLeftOverCount = 0;
                }

                if (decoder.MustFlush && decoder.bytesLeftOverCount == 0)
                {
                    decoder.currentMode = ISO2022Modes.ModeASCII;
                }
                else
                {
                    // Either not flushing or had state (from convert)
                    Debug.Assert(!decoder.MustFlush || !decoder.m_throwOnOverflow,
                        "[ISO2022Encoding.GetCharsCP52936]Expected no state or not converting or not flushing");

                    decoder.currentMode = currentMode;
                }
                decoder.m_bytesUsed = buffer.BytesUsed;
            }

            // Return # of characters we found
            return buffer.Count;
        }
コード例 #4
0
        [System.Security.SecurityCritical]  // auto-generated
        private unsafe int GetCharsCP5022xJP(byte* bytes, int byteCount,
                                                  char* chars, int charCount, ISO2022Decoder decoder)
        {
            // Get our info.
            EncodingCharBuffer buffer = new EncodingCharBuffer(this, decoder, chars, charCount, bytes, byteCount);

            // No mode information yet
            ISO2022Modes currentMode = ISO2022Modes.ModeASCII;      // Our current Mode
            ISO2022Modes shiftInMode = ISO2022Modes.ModeASCII;      // Mode that we'll shift in to
            byte[] escapeBytes = new byte[4];
            int escapeCount = 0;

            if (decoder != null)
            {
                currentMode = decoder.currentMode;
                shiftInMode = decoder.shiftInOutMode;

                // See if we have leftover decoder buffer to use
                // Load our bytesLeftOver
                escapeCount = decoder.bytesLeftOverCount;

                // Don't want to mess up decoder if we're counting or throw an exception
                for (int i = 0; i < escapeCount; i++)
                    escapeBytes[i] = decoder.bytesLeftOver[i];
            }

            // Do this until the end
            while (buffer.MoreData || escapeCount > 0)
            {
                byte ch;

                if (escapeCount > 0)
                {
                    // Get more escape sequences if necessary
                    if (escapeBytes[0] == ESCAPE)
                    {
                        // Stop if no more input
                        if (!buffer.MoreData)
                        {
                            if (decoder != null && !decoder.MustFlush)
                                break;
                        }
                        else
                        {
                            // Add it to the sequence we can check
                            escapeBytes[escapeCount++] = buffer.GetNextByte();

                            // We have an escape sequence
                            ISO2022Modes modeReturn =
                                CheckEscapeSequenceJP(escapeBytes, escapeCount);

                            if (modeReturn != ISO2022Modes.ModeInvalidEscape)
                            {
                                if (modeReturn != ISO2022Modes.ModeIncompleteEscape)
                                {
                                    // Processed escape correctly
                                    escapeCount = 0;

                                    // We're now this mode
                                    currentMode = shiftInMode = modeReturn;
                                }

                                // Either way, continue to get next escape or real byte
                                continue;
                            }
                        }
                        // If ModeInvalidEscape, or no input & must flush, then fall through to add escape.
                    }

                    // Read next escape byte and move them down one.
                    ch = DecrementEscapeBytes(ref escapeBytes, ref escapeCount);
                }
                else
                {
                    // Get our next byte
                    ch = buffer.GetNextByte();

                    if (ch == ESCAPE)
                    {
                        // We'll have an escape sequence, use it if we don't have one buffered already
                        if (escapeCount == 0)
                        {
                            // Start this new escape sequence
                            escapeBytes[0] = ch;
                            escapeCount = 1;
                            continue;
                        }

                        // Flush the previous escape sequence, then reuse this escape byte
                        buffer.AdjustBytes(-1);
                    }
                }

                if (ch == SHIFT_OUT)
                {
                    shiftInMode = currentMode;
                    currentMode = ISO2022Modes.ModeHalfwidthKatakana;
                    continue;
                }
                else if (ch == SHIFT_IN)
                {
                    currentMode = shiftInMode;
                    continue;
                }

                // Get our full character
                ushort iBytes = ch;
                bool b2Bytes = false;

                if (currentMode == ISO2022Modes.ModeJIS0208)
                {
                    //
                    //  To handle errors, we need to check:
                    //    1. if trailbyte is there
                    //    2. if code is valid
                    //
                    if (escapeCount > 0)
                    {
                        // Let another escape fall through
                        if (escapeBytes[0] != ESCAPE)
                        {
                            // Move them down one & get the next data
                            iBytes <<= 8;
                            iBytes |= DecrementEscapeBytes(ref escapeBytes, ref escapeCount);
                            b2Bytes = true;
                        }
                    }
                    else if (buffer.MoreData)
                    {
                        iBytes <<= 8;
                        iBytes |= buffer.GetNextByte();
                        b2Bytes = true;
                    }
                    else
                    {
                        // Not enough input, use decoder if possible
                        if (decoder == null || decoder.MustFlush)
                        {
                            // No decoder, do fallback for this byte
                            buffer.Fallback(ch);
                            break;
                        }

                        // Stick it in the decoder if we're not counting
                        if (chars != null)
                        {
                            escapeBytes[0] = ch;
                            escapeCount = 1;
                        }
                        break;
                    }

                    // MLang treated JIS 0208 '*' lead byte like a single halfwidth katakana
                    // escape, so use 0x8e00 as katakana lead byte and keep same trail byte.
                    // 0x2a lead byte range is normally unused in JIS 0208, so shouldn't have
                    // any wierd compatibility issues.
                    if ((b2Bytes == true) && ((iBytes & 0xff00) == 0x2a00))
                    {
                        iBytes = (ushort)(iBytes & 0xff);
                        iBytes |= (LEADBYTE_HALFWIDTH << 8);   // Put us in the halfwidth katakana range
                    }
                }
                else if (iBytes >= 0xA1 && iBytes <= 0xDF)
                {
                    // Everett accidentally mapped Katakana like shift-jis (932),
                    // even though this is a 7 bit code page.  We keep that mapping
                    iBytes |= (LEADBYTE_HALFWIDTH << 8);    // Map to halfwidth katakana range
                    iBytes &= 0xff7f;                       // remove extra 0x80
                }
                else if (currentMode == ISO2022Modes.ModeHalfwidthKatakana)
                {
                    // Add 0x10 lead byte that our encoding expects for Katakana:
                    iBytes |= (LEADBYTE_HALFWIDTH << 8);
                }

                // We have an iBytes to try to convert.
                char c = mapBytesToUnicode[iBytes];

                // See if it was unknown
                if (c == UNKNOWN_CHAR_FLAG && iBytes != 0)
                {
                    // Have to do fallback
                    if (b2Bytes)
                    {
                        if (!buffer.Fallback((byte)(iBytes >> 8), (byte)iBytes))
                            break;
                    }
                    else
                    {
                        if (!buffer.Fallback(ch))
                            break;
                    }
                }
                else
                {
                    // If we were JIS 0208, then we consumed an extra byte
                    if (!buffer.AddChar(c, b2Bytes ? 2 : 1))
                        break;
                }
            }

            // Make sure our decoder state matches our mode, if not counting
            if (chars != null && decoder != null)
            {
                // Remember it if we don't flush
                if (!decoder.MustFlush || escapeCount != 0)
                {
                    // Either not flushing or had state (from convert)
                    Debug.Assert(!decoder.MustFlush || !decoder.m_throwOnOverflow,
                        "[ISO2022Encoding.GetCharsCP5022xJP]Expected no state or not converting or not flushing");

                    decoder.currentMode = currentMode;
                    decoder.shiftInOutMode = shiftInMode;

                    // Remember escape buffer
                    decoder.bytesLeftOverCount = escapeCount;
                    decoder.bytesLeftOver = escapeBytes;
                }
                else
                {
                    // We flush, clear buffer
                    decoder.currentMode = ISO2022Modes.ModeASCII;
                    decoder.shiftInOutMode = ISO2022Modes.ModeASCII;
                    decoder.bytesLeftOverCount = 0;
                    // Slightly different if counting/not counting
                }

                decoder.m_bytesUsed = buffer.BytesUsed;
            }

            // Return # of characters we found
            return buffer.Count;
        }
コード例 #5
0
        [System.Security.SecurityCritical]  // auto-generated
        private unsafe int GetCharsCP50225KR(byte* bytes, int byteCount,
                                                   char* chars, int charCount, ISO2022Decoder decoder)
        {
            // Get our info.
            EncodingCharBuffer buffer = new EncodingCharBuffer(this, decoder, chars, charCount, bytes, byteCount);

            // No mode information yet
            ISO2022Modes currentMode = ISO2022Modes.ModeASCII;      // Our current Mode

            byte[] escapeBytes = new byte[4];
            int escapeCount = 0;

            if (decoder != null)
            {
                currentMode = decoder.currentMode;

                // See if we have leftover decoder buffer to use
                // Load our bytesLeftOver
                escapeCount = decoder.bytesLeftOverCount;

                // Don't want to mess up decoder if we're counting or throw an exception
                for (int i = 0; i < escapeCount; i++)
                    escapeBytes[i] = decoder.bytesLeftOver[i];
            }

            // Do this until the end, just do '?' replacement because we don't have fallbacks for decodings.
            while (buffer.MoreData || escapeCount > 0)
            {
                byte ch;

                if (escapeCount > 0)
                {
                    // Get more escape sequences if necessary
                    if (escapeBytes[0] == ESCAPE)
                    {
                        // Stop if no more input
                        if (!buffer.MoreData)
                        {
                            if (decoder != null && !decoder.MustFlush)
                                break;
                        }
                        else
                        {
                            // Add it to the sequence we can check
                            escapeBytes[escapeCount++] = buffer.GetNextByte();

                            // We have an escape sequence
                            ISO2022Modes modeReturn =
                                CheckEscapeSequenceKR(escapeBytes, escapeCount);

                            if (modeReturn != ISO2022Modes.ModeInvalidEscape)
                            {
                                if (modeReturn != ISO2022Modes.ModeIncompleteEscape)
                                {
                                    // Processed escape correctly, no effect (we know about KR mode)
                                    escapeCount = 0;
                                }

                                // Either way, continue to get next escape or real byte
                                continue;
                            }
                        }
                        // If ModeInvalidEscape, or no input & must flush, then fall through to add escape.
                    }

                    // Still have something left over in escape buffer
                    // Get it and move them down one
                    ch = DecrementEscapeBytes(ref escapeBytes, ref escapeCount);
                }
                else
                {
                    // Get our next byte
                    ch = buffer.GetNextByte();

                    if (ch == ESCAPE)
                    {
                        // We'll have an escape sequence, use it if we don't have one buffered already
                        if (escapeCount == 0)
                        {
                            // Start this new escape sequence
                            escapeBytes[0] = ch;
                            escapeCount = 1;
                            continue;
                        }

                        // Flush previous escape sequence, then reuse this escape byte
                        buffer.AdjustBytes(-1);
                    }
                }

                if (ch == SHIFT_OUT)
                {
                    currentMode = ISO2022Modes.ModeKR;
                    continue;
                }
                else if (ch == SHIFT_IN)
                {
                    currentMode = ISO2022Modes.ModeASCII;
                    continue;
                }

                // Get our full character
                ushort iBytes = ch;
                bool b2Bytes = false;

                // MLANG was passing through ' ', '\t' and '\n', so we do so as well, but I don't see that in the RFC.
                if (currentMode == ISO2022Modes.ModeKR && ch != ' ' && ch != '\t' && ch != '\n')
                {
                    //
                    //  To handle errors, we need to check:
                    //    1. if trailbyte is there
                    //    2. if code is valid
                    //
                    if (escapeCount > 0)
                    {
                        // Let another escape fall through
                        if (escapeBytes[0] != ESCAPE)
                        {
                            // Move them down one & get the next data
                            iBytes <<= 8;
                            iBytes |= DecrementEscapeBytes(ref escapeBytes, ref escapeCount);
                            b2Bytes = true;
                        }
                    }
                    else if (buffer.MoreData)
                    {
                        iBytes <<= 8;
                        iBytes |= buffer.GetNextByte();
                        b2Bytes = true;
                    }
                    else
                    {
                        // Not enough input, use decoder if possible
                        if (decoder == null || decoder.MustFlush)
                        {
                            // No decoder, do fallback for lonely 1st byte
                            buffer.Fallback(ch);
                            break;
                        }

                        // Stick it in the decoder if we're not counting
                        if (chars != null)
                        {
                            escapeBytes[0] = ch;
                            escapeCount = 1;
                        }
                        break;
                    }
                }

                // We have a iBytes to try to convert.
                char c = mapBytesToUnicode[iBytes];

                // See if it was unknown
                if (c == UNKNOWN_CHAR_FLAG && iBytes != 0)
                {
                    // Have to do fallback
                    if (b2Bytes)
                    {
                        if (!buffer.Fallback((byte)(iBytes >> 8), (byte)iBytes))
                            break;
                    }
                    else
                    {
                        if (!buffer.Fallback(ch))
                            break;
                    }
                }
                else
                {
                    if (!buffer.AddChar(c, b2Bytes ? 2 : 1))
                        break;
                }
            }

            // Make sure our decoder state matches our mode, if not counting
            if (chars != null && decoder != null)
            {
                // Remember it if we don't flush
                if (!decoder.MustFlush || escapeCount != 0)
                {
                    // Either not flushing or had state (from convert)
                    Debug.Assert(!decoder.MustFlush || !decoder.m_throwOnOverflow,
                        "[ISO2022Encoding.GetCharsCP50225KR]Expected no state or not converting or not flushing");

                    decoder.currentMode = currentMode;

                    // Remember escape buffer
                    decoder.bytesLeftOverCount = escapeCount;
                    decoder.bytesLeftOver = escapeBytes;
                }
                else
                {
                    // We flush, clear buffer
                    decoder.currentMode = ISO2022Modes.ModeASCII;
                    decoder.shiftInOutMode = ISO2022Modes.ModeASCII;
                    decoder.bytesLeftOverCount = 0;
                }

                decoder.m_bytesUsed = buffer.BytesUsed;
            }

            // Return # of characters we found
            return buffer.Count;
        }