C# (CSharp) JIS Examples

Programming Language: C# (CSharp)

Class/Type: JIS

Examples at hotexamples.com: 2

C# (CSharp) JIS - 2 examples found. These are the top rated real world C# (CSharp) examples of JIS extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

GetEncoding(1)

hasSOSI(1)

Example #1

Show file

File: CharCode.cs Project: Nordes/ReadJEnc

 /// <summary>
 /// Decodes the first <paramref name="len"/> bytes of <paramref name="bytes"/>, handling the
 /// JIS supplementary kanji escape sequence (ESC $ ( D) separately from the rest of the data.
 /// </summary>
 /// <param name="bytes">Byte array holding the encoded text.</param>
 /// <param name="len">Number of bytes, counted from the start of the array, to decode.</param>
 /// <returns>The decoded text, or null when a decoder fallback is raised (e.g. an unmapped character).</returns>
 public override string GetString(byte[] bytes, int len)
 {
     try
     {
         StringBuilder text = new StringBuilder(len);
         int cursor = 0;
         while (cursor < len)
         {
             // Phase 1: find the stretch that precedes the next supplementary kanji escape sequence.
             int segmentStart = cursor;
             for (; cursor < len; cursor++)
             {
                 bool atSupplementaryEscape = bytes[cursor] == 0x1B
                                              && cursor + 3 < len
                                              && bytes[cursor + 1] == 0x24
                                              && bytes[cursor + 2] == 0x28
                                              && bytes[cursor + 3] == 0x44;
                 if (atSupplementaryEscape)
                 {
                     break;
                 }
             }

             int plainLength = cursor - segmentStart;
             if (plainLength > 0)
             {   // Ordinary stretch: decode with the regular JIS (CP5022X) encoding.
                 text.Append(JIS.GetEncoding().GetString(bytes, segmentStart, plainLength));
             }

             if (cursor >= len)
             {   // No escape sequence was found before the end of the data: nothing left to decode.
                 break;
             }

             // Phase 2: drop the 4-byte escape sequence and take everything up to the next ESC byte.
             cursor += 4;
             segmentStart = cursor;
             while (cursor < len && bytes[cursor] != 0x1B)
             {
                 cursor++;
             }

             int kanjiLength = cursor - segmentStart;
             if (kanjiLength <= 0)
             {
                 continue;
             }

             // Re-shape the bytes for CP20932: the first byte of each pair gets its high bit set
             // (0xA1-0xFE) while the second byte is left in the 0x21-0x7E range.
             byte[] eucBytes = new byte[kanjiLength];
             for (int offset = 0; offset < kanjiLength; offset++)
             {
                 byte value = bytes[segmentStart + offset];
                 eucBytes[offset] = (offset & 1) == 0 ? (byte)(value | 0x80) : value;
             }

             // Decode the supplementary kanji with the EUC supplementary kanji (CP20932) encoding.
             text.Append(EUCH.GetEncoding().GetString(eucBytes, 0, kanjiLength));
         }
         return text.ToString();
     }
     catch (DecoderFallbackException)
     {   // Decoding failed (for instance, the data contained a character with no mapping).
         return null;
     }
 }

Example #2

Show file

        // 文字コード判別メソッド================================================

        /// <summary>Scans the whole byte array and automatically detects its character encoding.</summary>
        /// <param name="bytes">Byte array to examine.</param>
        /// <param name="len">File size (number of bytes, counted from the start of the array, to be decoded).</param>
        /// <param name="text">Receives the text decoded with the detected encoding (null when the data is judged to be non-text).</param>
        /// <returns>The detected encoding, or null when the data is judged to be non-text.</returns>
        public CharCode GetEncoding(byte[] bytes, int len, out string text)
        {
            if (len == 0)
            {   // Empty file: judged to be non-text
                text = null;
                return(null);
            }

            byte b1 = bytes[0]; // general-purpose "current byte" variable, primed with the first byte

            // [1] Scan the 7-bit range (ASCII check / locate the first non-ASCII byte), plus UTF16N and JIS checks
            JIS escapeSequenceChecker = null; // JIS escape sequence evaluator, created on the first ESC byte
            int asciiEndPos           = 0;    // loop index; doubles as the position of the first non-ASCII byte

            while (b1 < DEL)                  // leave the loop at the first non-ASCII byte; b1 is always loaded beforehand
            {
                if (b1 <= BINARY)
                {   // Binary byte found: within the first two bytes it may still be UTF16N, otherwise the data is binary
                    CharCode ret = (asciiEndPos < 2 ? SeemsUTF16N(bytes, len) : null);
                    if (ret != null && (text = ret.GetString(bytes, len)) != null)
                    {   // UTF16N decode succeeded: check that no non-text characters are mixed in
                        int i;
                        for (i = -3; i <= BINARY; i++)
                        {   // 0xFFFD, 0xFFFE, 0xFFFF (i = -3..-1 cast to char), 0..BINARY and DEL all mark the data as non-text
                            if (text.IndexOf((char)i, 0, text.Length) != -1)
                            {
                                break;
                            }
                        }
                        if (i > BINARY && text.IndexOf((char)DEL, 0, text.Length) == -1)
                        {   // Result: UTF16N (no non-text characters found)
                            return(ret);
                        }
                    }
                    text = null;
                    return(null); // Result: binary
                }
                if (b1 == 0x1B)
                {   // Evaluate the escape sequence (and skip over its contents)
                    if (escapeSequenceChecker == null)
                    {
                        escapeSequenceChecker = new JIS(bytes, len, asciiEndPos);
                    }
                    asciiEndPos += escapeSequenceChecker.GetEncoding(asciiEndPos);
                }
                // Move on to the next byte
                if ((++asciiEndPos) >= len)
                {   // Every byte has been checked and no non-ASCII byte was found: JIS or ASCII
                    if (escapeSequenceChecker != null)
                    {   // See whether an encoding can be derived from the escape sequences
                        CharCode ret = escapeSequenceChecker.GetEncoding(out text);
                        if (ret != null)
                        {   // Result: the encoding indicated by the escape sequences
                            return(ret);
                        }
                    }
                    else if (JIS.hasSOSI(bytes, len))
                    {   // SO/SI shifting detected: consider JIS with half-width kana.
                        // This branch is only reached when no escape sequence checker exists.
                        if ((text = CharCode.JIS50222.GetString(bytes, len)) != null)
                        {   // Result: JIS that uses only SO/SI half-width kana, without escape sequences
                            return(CharCode.JIS50222);
                        }
                    }
                    // Result: ASCII (or binary if the decode fails)
                    return(((text = CharCode.ASCII.GetString(bytes, len)) != null) ? CharCode.ASCII : null);
                }
                b1 = bytes[asciiEndPos];
            }

            // [2] Scan the range that contains non-ASCII bytes: CP1252-family / UTF8 / EUC checks, remaining JIS checks
            byte b2;
            int  cp1252Score   = 0;                                     // each score is set to int.MinValue once that encoding is ruled out
            int  utfScore      = 0;
            int  eucScore      = (this.EUC == null ? int.MinValue : 0); // no EUC candidate: excluded from the start
            int  sjisScore     = 0;
            bool existsEUC0x8F = false;                                 // set to true once an EUC supplementary kanji is found
            uint NODEF         = this.NODEF;                            // copied into a local for performance

            for (int cp1252Pos = asciiEndPos; cp1252Pos < len;)         // cp1252Pos is advanced at various points inside the body
            {
                if (b1 == DEL)
                {   // Control byte 0x7F: everything except a slim chance of JIS is ruled out; see whether JIS can be ruled out too
                    cp1252Score = int.MinValue;
                    utfScore    = int.MinValue;
                    eucScore    = int.MinValue;
                    sjisScore   = int.MinValue;
                    // Pre-increment: the bounds test must apply to the position of the byte that follows DEL,
                    // because that is the position read by the next operand. A DEL in the last position must
                    // end here rather than index at len.
                    if (escapeSequenceChecker == null || (++cp1252Pos) >= len || (b1 = bytes[cp1252Pos]) < 0x21 || b1 >= DEL)
                    {   // No JIS escape seen so far, or no byte follows DEL, or the following byte is outside 0x21-0x7E: JIS is ruled out as well
                        text = null;
                        return(null); // Result: binary
                    }
                }
                // CP1252-family check, and determination of the run of bytes >= 0x80 (notAsciiStart..cp1252Pos). b1 is already loaded
                int notAsciiStart = cp1252Pos;
                switch (cp1252Score)
                {
                case int.MinValue:     // CP1252 family already ruled out: just skip the non-ASCII bytes
                    while (b1 > DEL && (++cp1252Pos) < len)
                    {
                        b1 = bytes[cp1252Pos];
                    }
                    break;

                default:               // CP1252 family still possible: check for undefined code points and add points
                    while (b1 > DEL)
                    {   // Check whether this byte value is an undefined code (the bit is shifted to line up with the NODEF bitmap)
                        if (b1 <= 0x9F && (NODEF & (1u << (b1 % 32))) != 0)
                        {   // Bit is set = undefined code: CP1252 family is ruled out
                            cp1252Score = int.MinValue;
                            goto case int.MinValue; // continue with the plain non-ASCII skip
                        }
                        if ((++cp1252Pos) >= len)
                        {
                            break;
                        }
                        b1 = bytes[cp1252Pos];
                    }
                    // End of the non-ASCII run: add evaluation points
                    if (cp1252Pos == notAsciiStart + 1)
                    {   // A single non-ASCII byte (more likely CP1252 than SJIS): scored like an SJIS kanji lead byte, higher than SJIS kana
                        cp1252Score += 2;
                    }
                    else if (cp1252Pos == notAsciiStart + 2 && (b2 = bytes[cp1252Pos - 1]) >= 0xC0)
                    {   // Exactly two non-ASCII bytes: adjust the score when they look like letters with diacritical marks
                        if (b2 == (b2 = bytes[cp1252Pos - 2]))
                        {   // The same character twice in a row is quite distinctive (more likely than SJIS kana)
                            cp1252Score += 5;
                        }
                        else if (b2 >= 0xC0)
                        {   // If the following or the preceding ASCII byte looks alphabetic, rank it above SJIS kana
                            if (b1 > 0x40 || (notAsciiStart > 0 && bytes[notAsciiStart - 1] > 0x40))
                            {
                                cp1252Score += 5;
                            }
                            else
                            {   // Neither: still rank it above EUC
                                cp1252Score += 3;
                            }
                        }
                        else
                        {   // Otherwise add only a small amount
                            cp1252Score++;
                        }
                    }
                    else
                    {   // None of the above: add a somewhat low amount
                        cp1252Score++;
                    }
                    break;
                }
                // UTF8 check over notAsciiStart..cp1252Pos
                if (utfScore >= 0)
                {
                    bool prevIsKanji = false;
                    for (int utfPos = notAsciiStart; utfPos < cp1252Pos; utfPos++)
                    {
                        // Note: the lead byte is checked strictly; for the trailing bytes, overlong encodings, surrogates
                        // and the like are tolerated (on the premise that they get rejected at decode time)
                        b1 = bytes[utfPos];
                        // Check the 1st and 2nd bytes (both are already known to be >= 0x80)
                        if (b1 < 0xC2 || (++utfPos) >= cp1252Pos || bytes[utfPos] > 0xBF)
                        {   // UTF8 is ruled out
                            utfScore = int.MinValue;
                            break;
                        }
                        else if (b1 < 0xE0)
                        {   // Valid 2-byte character (scored as a half-width character)
                            if (prevIsKanji == false)
                            {
                                utfScore += 6;
                            }
                            else
                            {
                                utfScore += 2;
                                prevIsKanji = false;
                            }
                        }
                        // Check the 3rd byte (already known to be >= 0x80)
                        else if ((++utfPos) >= cp1252Pos || bytes[utfPos] > 0xBF)
                        {   // UTF8 is ruled out
                            utfScore = int.MinValue;
                            break;
                        }
                        else if (b1 < 0xF0)
                        {   // Valid 3-byte character (scored as a full-width character)
                            if (prevIsKanji == true)
                            {
                                utfScore += 8;
                            }
                            else
                            {
                                utfScore += 4;
                                prevIsKanji = true;
                            }
                        }
                        // Check the 4th byte (already known to be >= 0x80)
                        else if ((++utfPos) >= cp1252Pos || bytes[utfPos] > 0xBF)
                        {   // UTF8 is ruled out
                            utfScore = int.MinValue;
                            break;
                        }
                        else if (b1 < 0xF5)
                        {   // Valid 4-byte character (scored as a full-width character)
                            if (prevIsKanji == true)
                            {
                                utfScore += 12;
                            }
                            else
                            {
                                utfScore += 6;
                                prevIsKanji = true;
                            }
                        }
                        else
                        {   // UTF8 is ruled out (lead bytes from 0xF5 upward are undefined in UTF8)
                            utfScore = int.MinValue;
                            break;
                        }
                    }
                }
                // EUC check over notAsciiStart..cp1252Pos. Note: the EUC byte ranges are nearly common to EUC-JP/TW/CN/KR
                if (eucScore >= 0)
                {                               // constants used to check continuity with the previous character
                    const int PREV_KANA    = 1; // previous character was half-width kana
                    const int PREV_ZENKAKU = 2; // previous character was full-width
                    int       prevChar     = 0; // previous character was neither KANA nor ZENKAKU
                    for (int eucPos = notAsciiStart; eucPos < cp1252Pos; eucPos++)
                    {   // Check the 1st byte (0xA1-0xFE, 0x8E, 0x8F) and the 2nd byte (valid range depends on the 1st byte)
                        b1 = bytes[eucPos];
                        if (b1 == 0xFF || (++eucPos) >= cp1252Pos)
                        {   // EUC is ruled out
                            eucScore = int.MinValue;
                            break;
                        }
                        b2 = bytes[eucPos];
                        if (b1 >= 0xA1)
                        {   // 1st byte announces a full-width character: check the 2-byte full-width character
                            if (b2 < 0xA1 || b2 == 0xFF)
                            {   // EUC is ruled out
                                eucScore = int.MinValue;
                                break;
                            }
                            // Valid 2-byte character (full-width)
                            if (prevChar == PREV_ZENKAKU)
                            {
                                eucScore += 5;
                            }
                            else
                            {
                                eucScore += 2;
                                prevChar = PREV_ZENKAKU;
                            }
                        }
                        else if (b1 == 0x8E)
                        {   // 1st byte announces EUC-JP kana (or an EUC-TW 4-byte character): check the 2-byte half-width kana
                            if (b2 < 0xA1 || b2 > 0xDF)
                            {   // EUC is ruled out
                                eucScore = int.MinValue;
                                break;
                            }
                            // Valid: add to the EUC score (half-width character)
                            if (prevChar == PREV_KANA)
                            {
                                eucScore += 6;
                            }
#if (!JPONLY)
                            // Among the CJK text encodings, only EUC-TW treats this like a full-width character
                            // (reused to judge the 4-byte characters 0x8E,0xA2-0xB0,0xA1-0xFE,0xA1-0xFE)
                            else if (this.EUC == CharCode.EUCTW)
                            {
                                if (prevChar == PREV_ZENKAKU)
                                {
                                    eucScore += 6;
                                }
                                else
                                {
                                    eucScore += 2;
                                    prevChar = PREV_ZENKAKU;
                                }
                            }
#endif
                            else
                            {
                                eucScore += 2;
                                prevChar = PREV_KANA;
                            }
                        }
                        else if (b1 == 0x8F &&
                                 b2 >= 0xA1 && b2 < 0xFF &&
                                 (++eucPos) < cp1252Pos &&
                                 (b2 = bytes[eucPos]) >= 0xA1 && b2 < 0xFF)
                        {   // The only possibility left is a 3-byte character: if valid, add to the EUC score (full-width, supplementary kanji)
                            if (prevChar == PREV_ZENKAKU)
                            {
                                eucScore += 8;
                            }
                            else
                            {
                                eucScore += 3;
                                prevChar = PREV_ZENKAKU;
                            }
                            existsEUC0x8F = true; // note: supplementary kanji present
                        }
                        else
                        {   // EUC is ruled out
                            eucScore = int.MinValue;
                            break;
                        }
                    }
                }

                // Skip the ASCII range with binary and JIS checks; b1 ends up holding the byte at the next non-ASCII position
                while (cp1252Pos < len && (b1 = bytes[cp1252Pos]) < DEL)
                {
                    if (b1 <= BINARY)
                    {   // Result: binary
                        text = null;
                        return(null);
                    }
                    if (b1 == 0x1B)
                    {   // Evaluate the escape sequence (and skip over its contents)
                        if (escapeSequenceChecker == null)
                        {
                            escapeSequenceChecker = new JIS(bytes, len, cp1252Pos);
                        }
                        cp1252Pos += escapeSequenceChecker.GetEncoding(cp1252Pos);
                    }
                    cp1252Pos++;
                }
            }

            // [3] Check SJIS and the other national encodings (starting where the first non-ASCII byte appeared;
            //     skipped when the possibility was already ruled out, e.g. after a DEL)
            if (sjisScore != int.MinValue)
            {
                sjisScore = GetEncoding(bytes, asciiEndPos, len);
            }

            // [4] Decide the encoding according to the scores (accepted when decoding with that encoding actually succeeds)
            if (escapeSequenceChecker != null)
            {   // JIS family is likely: see whether an encoding can be derived from the escape sequences
                CharCode ret = escapeSequenceChecker.GetEncoding(out text);
                if (ret != null)
                {   // Result: the encoding indicated by the escape sequences
                    return(ret);
                }
            }
            if (eucScore > 0 && eucScore > sjisScore && eucScore > utfScore)
            {   // EUC is likely
                if (cp1252Score > eucScore)
                {   // ...but try the CP1252 family first when it scored higher
                    if ((text = this.CP125X.GetString(bytes, len)) != null)
                    {   // Result: decoded successfully as CP1252 family
                        return(this.CP125X);
                    }
                }
                if (existsEUC0x8F && (text = CharCode.EUCH.GetString(bytes, len)) != null)
                {   // Result: decoded successfully as EUC with supplementary kanji
                    return(CharCode.EUCH);
                }
                if ((text = this.EUC.GetString(bytes, len)) != null)
                {   // Result: decoded successfully as EUC
                    return(this.EUC);
                }
            }
            if (utfScore > 0 && utfScore >= sjisScore)
            {   // UTF8 is likely
                if ((text = CharCode.UTF8N.GetString(bytes, len)) != null)
                {   // Result: decoded successfully as UTF-8N
                    return(CharCode.UTF8N);
                }
            }
            if (sjisScore >= 0)
            {   // The data matched SJIS or another national encoding: try decoding with it (CP1252 family first when it scored higher)
                if (cp1252Score > sjisScore && (text = this.CP125X.GetString(bytes, len)) != null)
                {   // Result: decoded successfully as CP1252 family
                    return(this.CP125X);
                }
                if ((text = this.CharCode.GetString(bytes, len)) != null)
                {   // Result: decoded successfully with the national encoding
                    return(this.CharCode);
                }
            }
            if (cp1252Score > 0)
            {   // Only the CP1252 family remains as a candidate, so try it
                if ((text = this.CP125X.GetString(bytes, len)) != null)
                {   // Result: decoded successfully as CP1252 family
                    return(this.CP125X);
                }
            }
            // Result: nothing matched, so the data is treated as a binary file
            text = null;
            return(null);
        }