コード例 #1
0
ファイル: UnicodeToAnsel.cs プロジェクト: gjunge/MARC4J.Net
 /// <summary>
 /// Reports whether every character of <paramref name="str"/> is accepted by
 /// <c>rct.CharHasMatch</c>.
 /// </summary>
 /// <param name="rct">the reverse code table that is asked about each character</param>
 /// <param name="str">the string whose characters are checked one by one</param>
 /// <returns>
 /// <c>true</c> when no character was rejected (which includes the empty string);
 /// <c>false</c> as soon as one character has no match
 /// </returns>
 private static bool AllCharsHaveMatch(ReverseCodeTable rct, String str)
 {
     // Advance while characters keep matching; stopping early means a mismatch was hit.
     int index = 0;
     while (index < str.Length && rct.CharHasMatch(str[index]))
     {
         index++;
     }
     return index == str.Length;
 }
コード例 #2
0
ファイル: UnicodeToAnsel.cs プロジェクト: gjunge/MARC4J.Net
        /// <summary>
        /// Does the actual work of converting UCS/Unicode data to MARC-8.
        ///
        /// <para>
        /// If the Unicode data has been normalized into composed form, and the composed character
        /// does not have a corresponding MARC8 character, this routine will normalize that character into
        /// its decomposed form, and try to translate that equivalent string into MARC8.
        /// </para>
        /// <para>
        /// A character that cannot be translated even after decomposition is written as a
        /// four-digit hexadecimal numeric character reference (<c>&amp;#xXXXX;</c>).
        /// </para>
        /// </summary>
        /// <param name="data">the UCS/Unicode data in an array of char</param>
        /// <param name="sb">the buffer that receives the MARC-8 data</param>
        private void ConvertPortion(char[] data, StringBuilder sb)
        {
            for (int i = 0; i < data.Length; i++)
            {
                var c         = data[i];
                var marc      = new StringBuilder();
                int charValue = (int)c;
                if (charValue == 0x20 && rct.GetPreviousG0() != (int)'1')
                {
                    // A space seen while G0 is not the '1' set is written as a literal space with no
                    // escape sequence.  (A nested "G0 == '1'" check would be unreachable here, so
                    // none is made.)  NOTE(review): '1' is presumably the multibyte CJK set - confirm.
                    marc.Append(" ");
                }
                else if (!rct.CharHasMatch(c))
                {
                    // Unicode character c has no match in the Marc8 tables.  Try unicode-decompose on it
                    // to see whether the decomposed form can be represented.  If when decomposed, all of
                    // the characters can be translated to marc8, then use that.  If not and the decomposed form
                    // is three (or more) characters long (which indicates multiple diacritic marks), then
                    // re-compose the main character with the first diacritic, and check whether that
                    // and the remaining diacritics can be translated. If so go with that, otherwise, give up
                    // and merely use the &#xXXXX; Numeric char Reference form to represent the original
                    // unicode character
                    String tmpnorm   = c.ToString();
                    String tmpNormed = tmpnorm.Normalize(NormalizationForm.FormD);
                    if (!tmpNormed.Equals(tmpnorm))
                    {
                        if (AllCharsHaveMatch(rct, tmpNormed))
                        {
                            // Every decomposed character is representable: convert those instead.
                            ConvertPortion(tmpNormed.ToCharArray(), sb);
                            continue;
                        }
                        else if (tmpNormed.Length > 2)
                        {
                            String firstTwo      = tmpNormed.Substring(0, 2);
                            String remainder     = tmpNormed.Substring(2);
                            String partialNormed = firstTwo.Normalize(NormalizationForm.FormC);
                            if (!partialNormed.Equals(firstTwo) && AllCharsHaveMatch(rct, partialNormed) &&
                                AllCharsHaveMatch(rct, remainder))
                            {
                                ConvertPortion((partialNormed + remainder).ToCharArray(), sb);
                                continue;
                            }
                        }
                    }

                    // No usable decomposition: switch G0 back to ASCII if necessary so that the
                    // numeric character reference below is written in ASCII.
                    if (rct.GetPreviousG0() != ASCII)
                    {
                        sb.Append(ESC);
                        sb.Append(G0);
                        sb.Append((char)ASCII);
                        rct.SetPreviousG0(ASCII);
                    }

                    // "X4" yields upper-case hex zero-padded to four digits; a char never needs more.
                    sb.Append("&#x" + charValue.ToString("X4") + ";");
                    continue;
                }
                else if (rct.InPreviousG0CharEntry(c))
                {
                    marc.Append(rct.GetCurrentG0CharEntry(c));
                }
                else if (rct.InPreviousG1CharEntry(c))
                {
                    marc.Append(rct.GetCurrentG1CharEntry(c));
                }
                else // need to change character set
                {
                    // if several MARC-8 character sets contain the given Unicode character, select the
                    // best char set to use for encoding the character.  Preference is given to character
                    // sets that have been used previously in the field being encoded.  Since the default
                    // character sets for Basic and extended latin are pre-loaded, usually if a character
                    // can be encoded by one of those character sets, that is what will be chosen.
                    int    charset = rct.GetBestCharSet(c);
                    char[] marc8   = rct.GetCharEntry(c, charset);

                    // Every character-set switch starts with ESC; what follows depends on the target set.
                    marc.Append(ESC);
                    if (marc8.Length == 3)
                    {
                        // Three-char entry: designate the set as a multibyte G0 set.
                        marc.Append(G0multibyte);
                        rct.SetPreviousG0(charset);
                    }
                    else if (marc8[0] < 0x80)
                    {
                        // Sets 0x62 and 0x70 get no G0 intermediate: ESC is followed directly by the
                        // set identifier.  NOTE(review): presumably the subscript/superscript sets
                        // selected with the short escape form - confirm against the MARC-8 spec.
                        if (charset != 0x62 && charset != 0x70)
                        {
                            marc.Append(G0);
                        }
                        rct.SetPreviousG0(charset);
                    }
                    else
                    {
                        // Entry starts at 0x80 or above: designate the set as G1.
                        marc.Append(G1);
                        rct.SetPreviousG1(charset);
                    }
                    marc.Append((char)charset);
                    marc.Append(marc8);
                }

                if (rct.IsCombining(c) && sb.Length > 0)
                {
                    // A combining mark is placed in front of the last character already written,
                    // because MARC-8 puts a diacritic before the character it modifies.
                    sb.Insert(sb.Length - 1, marc);

                    // Special case handling to handle the COMBINING DOUBLE INVERTED BREVE
                    // and the COMBINING DOUBLE TILDE where a single double wide accent character
                    // in unicode is represented by two half characters in Marc8
                    if (charValue == 0x360)
                    {
                        sb.Append((char)(0xfb));
                    }
                    if (charValue == 0x361)
                    {
                        sb.Append((char)(0xec));
                    }
                }
                else
                {
                    sb.Append(marc);
                }
            }
        }