Ejemplo n.º 1
0
 /// <summary>
 /// Reports whether <paramref name="input"/> falls in one of the CJK-related
 /// Unicode blocks checked below (Han ideographs, half/full-width forms,
 /// Hangul and kana).
 /// </summary>
 /// <param name="input">Character to classify.</param>
 /// <returns>
 /// <c>true</c> if the character's Unicode block is one of the listed blocks;
 /// otherwise <c>false</c>.
 /// </returns>
 public static bool IsCJKCharacter(char input)
 {
     Character.UnicodeBlock block = Character.UnicodeBlock.Of(input);

     // Han ideographs.
     if (block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS ||
         block == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS ||
         block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A)
     {
         return true;
     }

     // Full-width digit characters and Japanese/Korean characters.
     if (block == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)
     {
         return true;
     }

     // Korean character sets.
     if (block == Character.UnicodeBlock.HANGUL_SYLLABLES ||
         block == Character.UnicodeBlock.HANGUL_JAMO ||
         block == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO)
     {
         return true;
     }

     // Japanese character sets: hiragana, katakana and the katakana phonetic extensions.
     return block == Character.UnicodeBlock.HIRAGANA ||
            block == Character.UnicodeBlock.KATAKANA ||
            block == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS;
 }
Ejemplo n.º 2
0
 /// <summary>
 /// Determines whether a character counts as Chinese, including Chinese punctuation.
 /// </summary>
 /// <remarks>
 /// Besides the Han ideograph blocks, three punctuation-bearing blocks are accepted:
 /// GeneralPunctuation (covers the Chinese opening quotation mark),
 /// CjkSymbolsAndPunctuation (covers the Chinese full stop) and
 /// HalfwidthAndFullwidthForms (covers the Chinese comma).
 /// </remarks>
 /// <param name="c">Character to test.</param>
 /// <returns><c>true</c> if the character's Unicode block is one of the accepted blocks.</returns>
 public static bool IsChinese(char c)
 {
     Character.UnicodeBlock block = Character.UnicodeBlock.Of(c);

     bool isHanIdeograph =
         block == Character.UnicodeBlock.CjkUnifiedIdeographs ||
         block == Character.UnicodeBlock.CjkCompatibilityIdeographs ||
         block == Character.UnicodeBlock.CjkUnifiedIdeographsExtensionA ||
         block == Character.UnicodeBlock.CjkUnifiedIdeographsExtensionB;

     // The punctuation blocks are only consulted when the character is not an ideograph.
     return isHanIdeograph ||
            block == Character.UnicodeBlock.CjkSymbolsAndPunctuation ||
            block == Character.UnicodeBlock.HalfwidthAndFullwidthForms ||
            block == Character.UnicodeBlock.GeneralPunctuation;
 }
Ejemplo n.º 3
0
        /// <summary>
        /// Normalizes <paramref name="in"/> one Unicode code point at a time (a valid
        /// surrogate pair is treated as a single unit), applying three independent options.
        /// </summary>
        /// <param name="in">Text to normalize.</param>
        /// <param name="ascii">
        /// Leave: no change. Ascii: U+FF01..U+FF5E are shifted down to U+0021..U+007E.
        /// Fullwidth: U+0021..U+007E are shifted up to U+FF01..U+FF5E.
        /// </param>
        /// <param name="spaceChar">
        /// Leave: no change. Ascii: space characters become U+0020. Fullwidth: space characters
        /// become U+3000. Delete: space characters are dropped. DeleteExceptBetweenAscii: space
        /// characters are dropped unless both neighbours satisfy IsAsciiLowHigh.
        /// Any other value is ignored (this switch has no default branch).
        /// </param>
        /// <param name="midDot">
        /// Leave: no change. Normalize: characters accepted by IsMidDot become U+00B7.
        /// Fullwidth: they become U+30FB. Delete: they are dropped.
        /// </param>
        /// <returns>The normalized text.</returns>
        /// <exception cref="ArgumentException">
        /// <paramref name="ascii"/> or <paramref name="midDot"/> is not one of the handled values.
        /// </exception>
        private static string NormalizeUnicode(string @in, int ascii, int spaceChar, int midDot)
        {
            // Distance between a full-width form (U+FF01..U+FF5E) and its ASCII twin (U+0021..U+007E).
            const int FullwidthOffset = 0xFF00 - 0x0020;

            StringBuilder @out = new StringBuilder();
            int len = @in.Length;
            // Previous code point, as it looked after normalization.
            int cpp = 0;

            int offset = 0;
            while (offset < len)
            {
                int cp = ReadCodePoint(@in, offset);
                // Number of UTF-16 units the code point occupies in the input. Captured before
                // cp is rewritten so the loop always advances by what was actually read.
                int width = cp > char.MaxValue ? 2 : 1;

                Character.UnicodeBlock cub = Character.UnicodeBlock.Of(cp);
                if (cub == Character.UnicodeBlock.PrivateUseArea || cub == Character.UnicodeBlock.SupplementaryPrivateUseAreaA || cub == Character.UnicodeBlock.SupplementaryPrivateUseAreaB)
                {
                    EncodingPrintWriter.Err.Println("ChineseUtils.normalize warning: private use area codepoint U+" + cp.ToString("x") + " in " + @in);
                }

                bool delete = false;
                switch (ascii)
                {
                case Leave:
                {
                    break;
                }

                case Ascii:
                {
                    if (cp >= '\uFF01' && cp <= '\uFF5E')
                    {
                        cp -= FullwidthOffset;
                    }
                    break;
                }

                case Fullwidth:
                {
                    if (cp >= '\u0021' && cp <= '\u007E')
                    {
                        cp += FullwidthOffset;
                    }
                    break;
                }

                default:
                {
                    throw new ArgumentException("ChineseUtils: Unsupported parameter option: ascii=" + ascii);
                }
                }

                switch (spaceChar)
                {
                case Leave:
                {
                    break;
                }

                case Ascii:
                {
                    if (IsSpaceCodePoint(cp))
                    {
                        cp = ' ';
                    }
                    break;
                }

                case Fullwidth:
                {
                    if (IsSpaceCodePoint(cp))
                    {
                        cp = '\u3000';
                    }
                    break;
                }

                case Delete:
                {
                    if (IsSpaceCodePoint(cp))
                    {
                        delete = true;
                    }
                    break;
                }

                case DeleteExceptBetweenAscii:
                {
                    // Peek at the following code point; 0 stands for "no neighbour".
                    int nextOffset = offset + width;
                    int cpn = 0;
                    if (nextOffset < len)
                    {
                        cpn = ReadCodePoint(@in, nextOffset);
                    }
                    if (IsSpaceCodePoint(cp) && !(IsAsciiLowHigh(cpp) && IsAsciiLowHigh(cpn)))
                    {
                        delete = true;
                    }
                    break;
                }
                }

                switch (midDot)
                {
                case Leave:
                {
                    break;
                }

                case Normalize:
                {
                    if (IsMidDot(cp))
                    {
                        cp = '\u00B7';
                    }
                    break;
                }

                case Fullwidth:
                {
                    if (IsMidDot(cp))
                    {
                        cp = '\u30FB';
                    }
                    break;
                }

                case Delete:
                {
                    if (IsMidDot(cp))
                    {
                        delete = true;
                    }
                    break;
                }

                default:
                {
                    throw new ArgumentException("ChineseUtils: Unsupported parameter option: midDot=" + midDot);
                }
                }

                if (!delete)
                {
                    if (cp > char.MaxValue)
                    {
                        @out.Append(char.ConvertFromUtf32(cp));
                    }
                    else
                    {
                        // Covers BMP characters and unpaired surrogates, which
                        // char.ConvertFromUtf32 would reject.
                        @out.Append((char)cp);
                    }
                }

                // The previous-code-point tracker is updated even when the character was deleted.
                cpp = cp;
                offset += width;
            }

            return @out.ToString();
        }

        /// <summary>
        /// Returns the code point that starts at <paramref name="index"/>. A valid surrogate
        /// pair yields the combined supplementary code point; any other UTF-16 unit,
        /// including an unpaired surrogate, is returned as-is rather than throwing.
        /// </summary>
        private static int ReadCodePoint(string text, int index)
        {
            char first = text[index];
            if (char.IsHighSurrogate(first) && index + 1 < text.Length && char.IsLowSurrogate(text[index + 1]))
            {
                return char.ConvertToUtf32(first, text[index + 1]);
            }
            return first;
        }

        /// <summary>
        /// Returns true when the code point is a Unicode space, line or paragraph separator.
        /// Only BMP values are tested; supplementary code points are reported as non-space.
        /// </summary>
        private static bool IsSpaceCodePoint(int codePoint)
        {
            return codePoint <= char.MaxValue && char.IsSeparator((char)codePoint);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Normalizes <paramref name="in"/> one UTF-16 unit at a time. Surrogates are not
        /// combined: a high surrogate only triggers a warning and is then processed like
        /// any other <c>char</c>.
        /// </summary>
        /// <param name="in">Text to normalize.</param>
        /// <param name="ascii">
        /// Leave: no change. Ascii: U+FF01..U+FF5E are shifted down to U+0021..U+007E.
        /// Fullwidth: U+0021..U+007E are shifted up to U+FF01..U+FF5E.
        /// </param>
        /// <param name="spaceChar">
        /// Leave: no change. Ascii: space characters become U+0020. Fullwidth: space characters
        /// become U+3000. Delete: space characters are dropped. DeleteExceptBetweenAscii: space
        /// characters are dropped unless both neighbours satisfy IsAsciiLowHigh.
        /// Any other value is ignored (this switch has no default branch).
        /// </param>
        /// <param name="midDot">
        /// Leave: no change. Normalize: characters accepted by IsMidDot become U+00B7.
        /// Fullwidth: they become U+30FB. Delete: they are dropped.
        /// </param>
        /// <returns>The normalized text.</returns>
        /// <exception cref="ArgumentException">
        /// <paramref name="ascii"/> or <paramref name="midDot"/> is not one of the handled values.
        /// </exception>
        private static string NormalizeBMP(string @in, int ascii, int spaceChar, int midDot)
        {
            // Distance between a full-width form (U+FF01..U+FF5E) and its ASCII twin (U+0021..U+007E).
            const char FullwidthOffset = (char)(0xFF00 - 0x0020);

            StringBuilder @out = new StringBuilder();
            int len = @in.Length;

            for (int i = 0; i < len; i++)
            {
                char cp = @in[i];
                if (char.IsHighSurrogate(cp))
                {
                    if (i + 1 < len)
                    {
                        // Report the combined code point when the pair is valid; otherwise
                        // report the lone surrogate value itself.
                        char following = @in[i + 1];
                        int full = char.IsLowSurrogate(following) ? char.ConvertToUtf32(cp, following) : cp;
                        log.Warn("ChineseUtils.normalize warning: non-BMP codepoint U+" + full.ToString("x") + " in " + @in);
                    }
                    else
                    {
                        log.Warn("ChineseUtils.normalize warning: unmatched high surrogate character U+" + ((int)cp).ToString("x") + " in " + @in);
                    }
                }

                Character.UnicodeBlock cub = Character.UnicodeBlock.Of(cp);
                if (cub == Character.UnicodeBlock.PrivateUseArea || cub == Character.UnicodeBlock.SupplementaryPrivateUseAreaA || cub == Character.UnicodeBlock.SupplementaryPrivateUseAreaB)
                {
                    EncodingPrintWriter.Err.Println("ChineseUtils.normalize warning: private use area codepoint U+" + ((int)cp).ToString("x") + " in " + @in);
                }

                bool delete = false;
                switch (ascii)
                {
                case Leave:
                {
                    break;
                }

                case Ascii:
                {
                    if (cp >= '\uFF01' && cp <= '\uFF5E')
                    {
                        cp -= FullwidthOffset;
                    }
                    break;
                }

                case Fullwidth:
                {
                    if (cp >= '\u0021' && cp <= '\u007E')
                    {
                        cp += FullwidthOffset;
                    }
                    break;
                }

                default:
                {
                    throw new ArgumentException("ChineseUtils: Unsupported parameter option: ascii=" + ascii);
                }
                }

                switch (spaceChar)
                {
                case Leave:
                {
                    break;
                }

                case Ascii:
                {
                    if (char.IsSeparator(cp))
                    {
                        cp = ' ';
                    }
                    break;
                }

                case Fullwidth:
                {
                    if (char.IsSeparator(cp))
                    {
                        cp = '\u3000';
                    }
                    break;
                }

                case Delete:
                {
                    if (char.IsSeparator(cp))
                    {
                        delete = true;
                    }
                    break;
                }

                case DeleteExceptBetweenAscii:
                {
                    // Neighbouring UTF-16 units from the input; '\0' stands for "no neighbour".
                    char cpp = i > 0 ? @in[i - 1] : '\0';
                    char cpn = i < (len - 1) ? @in[i + 1] : '\0';
                    if (char.IsSeparator(cp) && !(IsAsciiLowHigh(cpp) && IsAsciiLowHigh(cpn)))
                    {
                        delete = true;
                    }
                    break;
                }
                }

                switch (midDot)
                {
                case Leave:
                {
                    break;
                }

                case Normalize:
                {
                    if (IsMidDot(cp))
                    {
                        cp = '\u00B7';
                    }
                    break;
                }

                case Fullwidth:
                {
                    if (IsMidDot(cp))
                    {
                        cp = '\u30FB';
                    }
                    break;
                }

                case Delete:
                {
                    if (IsMidDot(cp))
                    {
                        delete = true;
                    }
                    break;
                }

                default:
                {
                    throw new ArgumentException("ChineseUtils: Unsupported parameter option: midDot=" + midDot);
                }
                }

                if (!delete)
                {
                    @out.Append(cp);
                }
            }

            return @out.ToString();
        }