private void Set_cdt(CodeTracker cdt, int g0_or_g1, char[] data, int addnlOffset, bool multibyte) { if (data[cdt.offset + addnlOffset] == '!' && data[cdt.offset + addnlOffset + 1] == 'E') { addnlOffset++; } else if (data[cdt.offset + addnlOffset] == ' ') { if (errorList != null) { errorList.AddError(Error.ERROR_TYPO, "Extraneous space character found within MARC8 character set escape sequence. Skipping over space."); } else { throw new MarcException("Extraneous space character found within MARC8 character set escape sequence"); } addnlOffset++; } else if ("(,)-$!".IndexOf(data[cdt.offset + addnlOffset]) != -1) { if (errorList != null) { errorList.AddError(Error.MINOR_ERROR, "Extraneaous intermediate character found following escape character. Discarding intermediate character."); } else { throw new MarcException("Extraneaous intermediate character found following escape character."); } addnlOffset++; } if ("34BE1NQS2".IndexOf(data[cdt.offset + addnlOffset]) == -1) { cdt.offset += 1; cdt.multibyte = false; if (errorList != null) { errorList.AddError(Error.MINOR_ERROR, "Unknown character set code found following escape character. Discarding escape character."); } else { throw new MarcException("Unknown character set code found following escape character."); } } else // All is well, proceed normally { if (g0_or_g1 == 0) { cdt.g0 = data[cdt.offset + addnlOffset]; } else { cdt.g1 = data[cdt.offset + addnlOffset]; } cdt.offset += 1 + addnlOffset; cdt.multibyte = multibyte; } }
/// <summary> /// Converts MARC-8 data to UCS/Unicode. /// </summary> /// <param name="data">the MARC-8 data in an array of char</param> /// <returns>the UCS/Unicode data</returns> public override String Convert(char[] data) { StringBuilder sb = new StringBuilder(); int len = data.Length; CodeTracker cdt = new CodeTracker(); cdt.g0 = 0x42; cdt.g1 = 0x45; cdt.multibyte = false; cdt.offset = 0; CheckMode(data, cdt); var diacritics = new Queue <char>(); while (cdt.offset < data.Length) { if (ct.IsCombining(data[cdt.offset], cdt.g0, cdt.g1) && HasNext(cdt.offset, len)) { while (cdt.offset < len && ct.IsCombining(data[cdt.offset], cdt.g0, cdt.g1) && HasNext(cdt.offset, len)) { char c = GetChar(data[cdt.offset], cdt.g0, cdt.g1); if (c != 0) { diacritics.Enqueue(c); } cdt.offset++; CheckMode(data, cdt); } if (cdt.offset >= len) { if (errorList != null) { errorList.AddError(Error.MINOR_ERROR, "Diacritic found at the end of field, without the character that it is supposed to decorate"); break; } } char c2 = GetChar(data[cdt.offset], cdt.g0, cdt.g1); cdt.offset++; CheckMode(data, cdt); if (c2 != 0) { sb.Append(c2); } while (!diacritics.Any()) { char c1 = diacritics.Dequeue(); sb.Append(c1); } } else if (cdt.multibyte) { if (data[cdt.offset] == 0x20) { // if a 0x20 byte occurs amidst a sequence of multibyte characters // skip over it and output a space. sb.Append(GetChar(data[cdt.offset], cdt.g0, cdt.g1)); cdt.offset += 1; } else if (cdt.offset + 3 <= data.Length && (errorList == null || data[cdt.offset + 1] != 0x20 && data[cdt.offset + 2] != 0x20)) { char c = GetMBChar(MakeMultibyte(data[cdt.offset], data[cdt.offset + 1], data[cdt.offset + 2])); if (errorList == null || c != 0) { sb.Append(c); cdt.offset += 3; } else if (cdt.offset + 6 <= data.Length && data[cdt.offset + 4] != 0x20 && data[cdt.offset + 5] != 0x20 && GetMBChar(MakeMultibyte(data[cdt.offset + 3], data[cdt.offset + 4], data[cdt.offset + 5])) != 0) { if (errorList != null) { errorList.AddError(Error.MINOR_ERROR, "Erroneous MARC8 multibyte character, Discarding bad character and continuing reading Multibyte characters"); sb.Append("[?]"); cdt.offset += 3; } } else if (cdt.offset + 4 <= data.Length && data[cdt.offset] > 0x7f && GetMBChar(MakeMultibyte(data[cdt.offset + 1], data[cdt.offset + 2], data[cdt.offset + 3])) != 0) { if (errorList != null) { errorList.AddError(Error.MINOR_ERROR, "Erroneous character in MARC8 multibyte character, Copying bad character and continuing reading Multibyte characters"); sb.Append(GetChar(data[cdt.offset], 0x42, 0x45)); cdt.offset += 1; } } else { if (errorList != null) { errorList.AddError(Error.MINOR_ERROR, "Erroneous MARC8 multibyte character, inserting change to default character set"); } cdt.multibyte = false; cdt.g0 = 0x42; cdt.g1 = 0x45; } } else if (errorList != null && cdt.offset + 4 <= data.Length && (data[cdt.offset + 1] == 0x20 || data[cdt.offset + 2] == 0x20)) { int multiByte = MakeMultibyte(data[cdt.offset], ((data[cdt.offset + 1] != 0x20) ? data[cdt.offset + 1] : data[cdt.offset + 2]), data[cdt.offset + 3]); char c = GetMBChar(multiByte); if (c != 0) { if (errorList != null) { errorList.AddError(Error.ERROR_TYPO, "Extraneous space found within MARC8 multibyte character"); } sb.Append(c); sb.Append(' '); cdt.offset += 4; } else { if (errorList != null) { errorList.AddError(Error.MINOR_ERROR, "Erroneous MARC8 multibyte character, inserting change to default character set"); } cdt.multibyte = false; cdt.g0 = 0x42; cdt.g1 = 0x45; } } else if (cdt.offset + 3 > data.Length || cdt.offset + 3 == data.Length && (data[cdt.offset + 1] == 0x20 || data[cdt.offset + 2] == 0x20)) { if (errorList != null) { errorList.AddError(Error.MINOR_ERROR, "Partial MARC8 multibyte character, inserting change to default character set"); cdt.multibyte = false; cdt.g0 = 0x42; cdt.g1 = 0x45; } // if a field ends with an incomplete encoding of a multibyte character // simply discard that final partial character. else { cdt.offset += 3; } } } else { char c = GetChar(data[cdt.offset], cdt.g0, cdt.g1); if (c != 0) { sb.Append(c); } else { String val = "0000" + ((int)data[cdt.offset]).ToString("X"); sb.Append("<U+" + (val.Substring(val.Length - 4, 4)) + ">"); } cdt.offset += 1; } if (HasNext(cdt.offset, len)) { CheckMode(data, cdt); } } String dataElement = sb.ToString(); if (translateNCR && Regex.IsMatch(dataElement, "[^&]*&#x[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f];.*")) { dataElement = Regex.Replace(dataElement, "&#x([0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f]);", a => GetCharFromCodePoint(a.Value)); } return(dataElement); }
private void CheckMode(char[] data, CodeTracker cdt) { int extra = 0; int extra2 = 0; while (cdt.offset + extra + extra2 < data.Length && IsEscape(data[cdt.offset])) { if (cdt.offset + extra + extra2 + 1 == data.Length) { cdt.offset += 1; if (errorList != null) { errorList.AddError(Error.MINOR_ERROR, "Escape character found at end of field, discarding it."); } else { throw new MarcException("Escape character found at end of field"); } break; } switch (data[cdt.offset + 1 + extra]) { case (char)0x28: // '(' case (char)0x2c: // ',' Set_cdt(cdt, 0, data, 2 + extra, false); break; case (char)0x29: // ')' case (char)0x2d: // '-' Set_cdt(cdt, 1, data, 2 + extra, false); break; case (char)0x24: // '$' if (!loadedMultibyte) { LoadMultibyte(); loadedMultibyte = true; } switch (data[cdt.offset + 2 + extra + extra2]) { case (char)0x29: // ')' case (char)0x2d: // '-' Set_cdt(cdt, 1, data, 3 + extra + extra2, true); break; case (char)0x2c: // ',' Set_cdt(cdt, 0, data, 3 + extra + extra2, true); break; case (char)0x31: // '1' cdt.g0 = data[cdt.offset + 2 + extra + extra2]; cdt.offset += 3 + extra + extra2; cdt.multibyte = true; break; case (char)0x20: // ' ' // space found in escape code: look ahead and try to proceed extra2++; break; default: // unknown code character found: discard escape sequence and return cdt.offset += 1; if (errorList != null) { errorList.AddError(Error.MINOR_ERROR, "Unknown character set code found following escape character. Discarding escape character."); } else { throw new MarcException("Unknown character set code found following escape character."); } break; } break; case (char)0x67: // 'g' case (char)0x62: // 'b' case (char)0x70: // 'p' cdt.g0 = data[cdt.offset + 1 + extra]; cdt.offset += 2 + extra; cdt.multibyte = false; break; case (char)0x73: // 's' cdt.g0 = 0x42; cdt.offset += 2 + extra; cdt.multibyte = false; break; case (char)0x20: // ' ' // space found in escape code: look ahead and try to proceed if (errorList == null) { throw new MarcException("Extraneous space character found within MARC8 character set escape sequence"); } extra++; break; default: // unknown code character found: discard escape sequence and return cdt.offset += 1; if (errorList != null) { errorList.AddError(Error.MINOR_ERROR, "Unknown character set code found following escape character. Discarding escape character."); } else { throw new MarcException("Unknown character set code found following escape character."); } break; } } if (errorList != null && (extra != 0 || extra2 != 0)) { errorList.AddError(Error.ERROR_TYPO, "" + (extra + extra2) + " extraneous space characters found within MARC8 character set escape sequence"); } }