public static DictionaryMatcher LoadDictionaryFor(string dictType) { ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.GetBundleInstance(ICUData.IcuBreakIteratorBaseName); // com/ibm/icu/impl/data/icudt60b/brkitr string dictFileName = rb.GetStringWithFallback("dictionaries/" + dictType); // ICU4N TODO: Possibly rename the above and use this syntax instead...? //var rm = new ResourceManager(ICUData.ICU_BRKITR_BASE_NAME, typeof(DictionaryData).GetTypeInfo().Assembly); //string dictFileName = rm.GetString("dictionaries_" + dictType); dictFileName = ICUData.IcuBreakIteratorName + '/' + dictFileName; ByteBuffer bytes = ICUBinary.GetRequiredData(dictFileName); ICUBinary.ReadHeader(bytes, DATA_FORMAT_ID, null); int[] indexes = new int[IX_COUNT]; // TODO: read indexes[IX_STRING_TRIE_OFFSET] first, then read a variable-length indexes[] for (int i = 0; i < IX_COUNT; i++) { indexes[i] = bytes.GetInt32(); } int offset = indexes[IX_STRING_TRIE_OFFSET]; Assert.Assrt(offset >= (4 * IX_COUNT)); if (offset > (4 * IX_COUNT)) { int diff = offset - (4 * IX_COUNT); ICUBinary.SkipBytes(bytes, diff); } int trieType = indexes[IX_TRIE_TYPE] & TRIE_TYPE_MASK; int totalSize = indexes[IX_TOTAL_SIZE] - offset; DictionaryMatcher m = null; if (trieType == TRIE_TYPE_BYTES) { int transform = indexes[IX_TRANSFORM]; byte[] data = new byte[totalSize]; bytes.Get(data); m = new BytesDictionaryMatcher(data, transform); } else if (trieType == TRIE_TYPE_UCHARS) { Assert.Assrt(totalSize % 2 == 0); string data = ICUBinary.GetString(bytes, totalSize / 2, totalSize & 1); m = new CharsDictionaryMatcher(data); } else { m = null; } return(m); }
/// <summary> /// Get an <see cref="RBBIDataWrapper"/> from an InputStream onto a pre-compiled set /// of RBBI rules. /// </summary> /// <param name="bytes"></param> /// <returns></returns> internal static RBBIDataWrapper Get(ByteBuffer bytes) { RBBIDataWrapper This = new RBBIDataWrapper(); ICUBinary.ReadHeader(bytes, DATA_FORMAT, IS_ACCEPTABLE); This.isBigEndian = bytes.Order == ByteOrder.BigEndian; // Read in the RBBI data header... This.fHeader = new RBBIDataHeader(); This.fHeader.fMagic = bytes.GetInt32(); This.fHeader.fFormatVersion[0] = bytes.Get(); This.fHeader.fFormatVersion[1] = bytes.Get(); This.fHeader.fFormatVersion[2] = bytes.Get(); This.fHeader.fFormatVersion[3] = bytes.Get(); This.fHeader.fLength = bytes.GetInt32(); This.fHeader.fCatCount = bytes.GetInt32(); This.fHeader.fFTable = bytes.GetInt32(); This.fHeader.fFTableLen = bytes.GetInt32(); This.fHeader.fRTable = bytes.GetInt32(); This.fHeader.fRTableLen = bytes.GetInt32(); This.fHeader.fSFTable = bytes.GetInt32(); This.fHeader.fSFTableLen = bytes.GetInt32(); This.fHeader.fSRTable = bytes.GetInt32(); This.fHeader.fSRTableLen = bytes.GetInt32(); This.fHeader.fTrie = bytes.GetInt32(); This.fHeader.fTrieLen = bytes.GetInt32(); This.fHeader.fRuleSource = bytes.GetInt32(); This.fHeader.fRuleSourceLen = bytes.GetInt32(); This.fHeader.fStatusTable = bytes.GetInt32(); This.fHeader.fStatusTableLen = bytes.GetInt32(); ICUBinary.SkipBytes(bytes, 6 * 4); // uint32_t fReserved[6]; if (This.fHeader.fMagic != 0xb1a0 || !IS_ACCEPTABLE.IsDataVersionAcceptable(This.fHeader.fFormatVersion)) { throw new IOException("Break Iterator Rule Data Magic Number Incorrect, or unsupported data version."); } // Current position in the buffer. int pos = 24 * 4; // offset of end of header, which has 24 fields, all int32_t (4 bytes) // // Read in the Forward state transition table as an array of shorts. // // Quick Sanity Check if (This.fHeader.fFTable < pos || This.fHeader.fFTable > This.fHeader.fLength) { throw new IOException("Break iterator Rule data corrupt"); } // Skip over any padding preceding this table ICUBinary.SkipBytes(bytes, This.fHeader.fFTable - pos); pos = This.fHeader.fFTable; This.fFTable = ICUBinary.GetInt16s( bytes, This.fHeader.fFTableLen / 2, This.fHeader.fFTableLen & 1); pos += This.fHeader.fFTableLen; // // Read in the Reverse state table // // Skip over any padding in the file ICUBinary.SkipBytes(bytes, This.fHeader.fRTable - pos); pos = This.fHeader.fRTable; // Create & fill the table itself. This.fRTable = ICUBinary.GetInt16s( bytes, This.fHeader.fRTableLen / 2, This.fHeader.fRTableLen & 1); pos += This.fHeader.fRTableLen; // // Read in the Safe Forward state table // if (This.fHeader.fSFTableLen > 0) { // Skip over any padding in the file ICUBinary.SkipBytes(bytes, This.fHeader.fSFTable - pos); pos = This.fHeader.fSFTable; // Create & fill the table itself. This.fSFTable = ICUBinary.GetInt16s( bytes, This.fHeader.fSFTableLen / 2, This.fHeader.fSFTableLen & 1); pos += This.fHeader.fSFTableLen; } // // Read in the Safe Reverse state table // if (This.fHeader.fSRTableLen > 0) { // Skip over any padding in the file ICUBinary.SkipBytes(bytes, This.fHeader.fSRTable - pos); pos = This.fHeader.fSRTable; // Create & fill the table itself. This.fSRTable = ICUBinary.GetInt16s( bytes, This.fHeader.fSRTableLen / 2, This.fHeader.fSRTableLen & 1); pos += This.fHeader.fSRTableLen; } // Rule Compatibility Hacks // If a rule set includes reverse rules but does not explicitly include safe reverse rules, // the reverse rules are to be treated as safe reverse rules. if (This.fSRTable == null && This.fRTable != null) { This.fSRTable = This.fRTable; This.fRTable = null; } // // Unserialize the Character categories TRIE // Because we can't be absolutely certain where the Trie deserialize will // leave the buffer, leave position unchanged. // The seek to the start of the next item following the TRIE will get us // back in sync. // ICUBinary.SkipBytes(bytes, This.fHeader.fTrie - pos); // seek buffer from end of pos = This.fHeader.fTrie; // previous section to the start of the trie bytes.Mark(); // Mark position of start of TRIE in the input // and tell Java to keep the mark valid so long // as we don't go more than 100 bytes past the // past the end of the TRIE. This.fTrie = Trie2.CreateFromSerialized(bytes); // Deserialize the TRIE, leaving buffer // at an unknown position, preceding the // padding between TRIE and following section. bytes.Reset(); // Move buffer back to marked position at // the start of the serialized TRIE. Now our // "pos" variable and the buffer are in // agreement. // // Read the Rule Status Table // if (pos > This.fHeader.fStatusTable) { throw new IOException("Break iterator Rule data corrupt"); } ICUBinary.SkipBytes(bytes, This.fHeader.fStatusTable - pos); pos = This.fHeader.fStatusTable; This.fStatusTable = ICUBinary.GetInt32s( bytes, This.fHeader.fStatusTableLen / 4, This.fHeader.fStatusTableLen & 3); pos += This.fHeader.fStatusTableLen; // // Put the break rule source into a String // if (pos > This.fHeader.fRuleSource) { throw new IOException("Break iterator Rule data corrupt"); } ICUBinary.SkipBytes(bytes, This.fHeader.fRuleSource - pos); pos = This.fHeader.fRuleSource; This.fRuleSource = ICUBinary.GetString( bytes, This.fHeader.fRuleSourceLen / 2, This.fHeader.fRuleSourceLen & 1); if (RuleBasedBreakIterator.fDebugEnv != null && RuleBasedBreakIterator.fDebugEnv.IndexOf("data", StringComparison.Ordinal) >= 0) { This.Dump(Console.Out); } return(This); }
internal static void Read(CollationTailoring @base, ByteBuffer inBytes, CollationTailoring tailoring) { tailoring.Version = ICUBinary.ReadHeader(inBytes, DATA_FORMAT, IS_ACCEPTABLE); if (@base != null && @base.GetUCAVersion() != tailoring.GetUCAVersion()) { throw new ICUException("Tailoring UCA version differs from base data UCA version"); } int inLength = inBytes.Remaining; if (inLength < 8) { throw new ICUException("not enough bytes"); } int indexesLength = inBytes.GetInt32(); // inIndexes[IX_INDEXES_LENGTH] if (indexesLength < 2 || inLength < indexesLength * 4) { throw new ICUException("not enough indexes"); } int[] inIndexes = new int[IX_TOTAL_SIZE + 1]; inIndexes[0] = indexesLength; for (int i = 1; i < indexesLength && i < inIndexes.Length; ++i) { inIndexes[i] = inBytes.GetInt32(); } for (int i = indexesLength; i < inIndexes.Length; ++i) { inIndexes[i] = -1; } if (indexesLength > inIndexes.Length) { ICUBinary.SkipBytes(inBytes, (indexesLength - inIndexes.Length) * 4); } // Assume that the tailoring data is in initial state, // with null pointers and 0 lengths. // Set pointers to non-empty data parts. // Do this in order of their byte offsets. (Should help porting to Java.) int index; // one of the indexes[] slots int offset; // byte offset for the index part int length; // number of bytes in the index part if (indexesLength > IX_TOTAL_SIZE) { length = inIndexes[IX_TOTAL_SIZE]; } else if (indexesLength > IX_REORDER_CODES_OFFSET) { length = inIndexes[indexesLength - 1]; } else { length = 0; // only indexes, and inLength was already checked for them } if (inLength < length) { throw new ICUException("not enough bytes"); } CollationData baseData = @base == null ? null : @base.Data; int[] reorderCodes; int reorderCodesLength; index = IX_REORDER_CODES_OFFSET; offset = inIndexes[index]; length = inIndexes[index + 1] - offset; if (length >= 4) { if (baseData == null) { // We assume for collation settings that // the base data does not have a reordering. throw new ICUException("Collation base data must not reorder scripts"); } reorderCodesLength = length / 4; reorderCodes = ICUBinary.GetInts(inBytes, reorderCodesLength, length & 3); // The reorderRanges (if any) are the trailing reorderCodes entries. // Split the array at the boundary. // Script or reorder codes do not exceed 16-bit values. // Range limits are stored in the upper 16 bits, and are never 0. int reorderRangesLength = 0; while (reorderRangesLength < reorderCodesLength && (reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0xffff0000) != 0) { ++reorderRangesLength; } Debug.Assert(reorderRangesLength < reorderCodesLength); reorderCodesLength -= reorderRangesLength; } else { reorderCodes = new int[0]; reorderCodesLength = 0; ICUBinary.SkipBytes(inBytes, length); } // There should be a reorder table only if there are reorder codes. // However, when there are reorder codes the reorder table may be omitted to reduce // the data size. byte[] reorderTable = null; index = IX_REORDER_TABLE_OFFSET; offset = inIndexes[index]; length = inIndexes[index + 1] - offset; if (length >= 256) { if (reorderCodesLength == 0) { throw new ICUException("Reordering table without reordering codes"); } reorderTable = new byte[256]; inBytes.Get(reorderTable); length -= 256; } else { // If we have reorder codes, then build the reorderTable at the end, // when the CollationData is otherwise complete. } ICUBinary.SkipBytes(inBytes, length); if (baseData != null && baseData.numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000L)) { throw new ICUException("Tailoring numeric primary weight differs from base data"); } CollationData data = null; // Remains null if there are no mappings. index = IX_TRIE_OFFSET; offset = inIndexes[index]; length = inIndexes[index + 1] - offset; if (length >= 8) { tailoring.EnsureOwnedData(); data = tailoring.OwnedData; data.Base = baseData; data.numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000L; data.trie = tailoring.Trie = Trie2_32.CreateFromSerialized(inBytes); int trieLength = data.trie.GetSerializedLength(); if (trieLength > length) { throw new ICUException("Not enough bytes for the mappings trie"); // No mappings. } length -= trieLength; } else if (baseData != null) { // Use the base data. Only the settings are tailored. tailoring.Data = baseData; } else { throw new ICUException("Missing collation data mappings"); // No mappings. } ICUBinary.SkipBytes(inBytes, length); index = IX_RESERVED8_OFFSET; offset = inIndexes[index]; length = inIndexes[index + 1] - offset; ICUBinary.SkipBytes(inBytes, length); index = IX_CES_OFFSET; offset = inIndexes[index]; length = inIndexes[index + 1] - offset; if (length >= 8) { if (data == null) { throw new ICUException("Tailored ces without tailored trie"); } data.ces = ICUBinary.GetLongs(inBytes, length / 8, length & 7); } else { ICUBinary.SkipBytes(inBytes, length); } index = IX_RESERVED10_OFFSET; offset = inIndexes[index]; length = inIndexes[index + 1] - offset; ICUBinary.SkipBytes(inBytes, length); index = IX_CE32S_OFFSET; offset = inIndexes[index]; length = inIndexes[index + 1] - offset; if (length >= 4) { if (data == null) { throw new ICUException("Tailored ce32s without tailored trie"); } data.ce32s = ICUBinary.GetInts(inBytes, length / 4, length & 3); } else { ICUBinary.SkipBytes(inBytes, length); } int jamoCE32sStart = inIndexes[IX_JAMO_CE32S_START]; if (jamoCE32sStart >= 0) { if (data == null || data.ce32s == null) { throw new ICUException("JamoCE32sStart index into non-existent ce32s[]"); } data.jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH]; // ICU4N specific - added extension method to IList<T> to handle "copy to" data.ce32s.CopyTo(jamoCE32sStart, data.jamoCE32s, 0, CollationData.JAMO_CE32S_LENGTH); } else if (data == null) { // Nothing to do. } else if (baseData != null) { data.jamoCE32s = baseData.jamoCE32s; } else { throw new ICUException("Missing Jamo CE32s for Hangul processing"); } index = IX_ROOT_ELEMENTS_OFFSET; offset = inIndexes[index]; length = inIndexes[index + 1] - offset; if (length >= 4) { int rootElementsLength = length / 4; if (data == null) { throw new ICUException("Root elements but no mappings"); } if (rootElementsLength <= CollationRootElements.IX_SEC_TER_BOUNDARIES) { throw new ICUException("Root elements array too short"); } data.rootElements = new long[rootElementsLength]; for (int i = 0; i < rootElementsLength; ++i) { data.rootElements[i] = inBytes.GetInt32() & 0xffffffffL; // unsigned int -> long } long commonSecTer = data.rootElements[CollationRootElements.IX_COMMON_SEC_AND_TER_CE]; if (commonSecTer != Collation.COMMON_SEC_AND_TER_CE) { throw new ICUException("Common sec/ter weights in base data differ from the hardcoded value"); } long secTerBoundaries = data.rootElements[CollationRootElements.IX_SEC_TER_BOUNDARIES]; if ((secTerBoundaries.TripleShift(24)) < CollationKeys.SEC_COMMON_HIGH) { // [fixed last secondary common byte] is too low, // and secondary weights would collide with compressed common secondaries. throw new ICUException("[fixed last secondary common byte] is too low"); } length &= 3; } ICUBinary.SkipBytes(inBytes, length); index = IX_CONTEXTS_OFFSET; offset = inIndexes[index]; length = inIndexes[index + 1] - offset; if (length >= 2) { if (data == null) { throw new ICUException("Tailored contexts without tailored trie"); } data.contexts = ICUBinary.GetString(inBytes, length / 2, length & 1); } else { ICUBinary.SkipBytes(inBytes, length); } index = IX_UNSAFE_BWD_OFFSET; offset = inIndexes[index]; length = inIndexes[index + 1] - offset; if (length >= 2) { if (data == null) { throw new ICUException("Unsafe-backward-set but no mappings"); } if (baseData == null) { // Create the unsafe-backward set for the root collator. // Include all non-zero combining marks and trail surrogates. // We do this at load time, rather than at build time, // to simplify Unicode version bootstrapping: // The root data builder only needs the new FractionalUCA.txt data, // but it need not be built with a version of ICU already updated to // the corresponding new Unicode Character Database. // // The following is an optimized version of // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]"). // It is faster and requires fewer code dependencies. tailoring.UnsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff); // trail surrogates data.nfcImpl.AddLcccChars(tailoring.UnsafeBackwardSet); } else { // Clone the root collator's set contents. tailoring.UnsafeBackwardSet = baseData.unsafeBackwardSet.CloneAsThawed(); } // Add the ranges from the data file to the unsafe-backward set. USerializedSet sset = new USerializedSet(); char[] unsafeData = ICUBinary.GetChars(inBytes, length / 2, length & 1); length = 0; sset.GetSet(unsafeData, 0); int count = sset.CountRanges(); int[] range = new int[2]; for (int i = 0; i < count; ++i) { sset.GetRange(i, range); tailoring.UnsafeBackwardSet.Add(range[0], range[1]); } // Mark each lead surrogate as "unsafe" // if any of its 1024 associated supplementary code points is "unsafe". int c = 0x10000; for (int lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) { if (!tailoring.UnsafeBackwardSet.ContainsNone(c, c + 0x3ff)) { tailoring.UnsafeBackwardSet.Add(lead); } } tailoring.UnsafeBackwardSet.Freeze(); data.unsafeBackwardSet = tailoring.UnsafeBackwardSet; } else if (data == null) { // Nothing to do. } else if (baseData != null) { // No tailoring-specific data: Alias the root collator's set. data.unsafeBackwardSet = baseData.unsafeBackwardSet; } else { throw new ICUException("Missing unsafe-backward-set"); } ICUBinary.SkipBytes(inBytes, length); // If the fast Latin format version is different, // or the version is set to 0 for "no fast Latin table", // then just always use the normal string comparison path. index = IX_FAST_LATIN_TABLE_OFFSET; offset = inIndexes[index]; length = inIndexes[index + 1] - offset; if (data != null) { data.fastLatinTable = null; data.fastLatinTableHeader = null; if (((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin.VERSION) { if (length >= 2) { char header0 = inBytes.GetChar(); int headerLength = header0 & 0xff; data.fastLatinTableHeader = new char[headerLength]; data.fastLatinTableHeader[0] = header0; for (int i = 1; i < headerLength; ++i) { data.fastLatinTableHeader[i] = inBytes.GetChar(); } int tableLength = length / 2 - headerLength; data.fastLatinTable = ICUBinary.GetChars(inBytes, tableLength, length & 1); length = 0; if ((header0 >> 8) != CollationFastLatin.VERSION) { throw new ICUException("Fast-Latin table version differs from version in data header"); } } else if (baseData != null) { data.fastLatinTable = baseData.fastLatinTable; data.fastLatinTableHeader = baseData.fastLatinTableHeader; } } } ICUBinary.SkipBytes(inBytes, length); index = IX_SCRIPTS_OFFSET; offset = inIndexes[index]; length = inIndexes[index + 1] - offset; if (length >= 2) { if (data == null) { throw new ICUException("Script order data but no mappings"); } int scriptsLength = length / 2; CharBuffer inChars = inBytes.AsCharBuffer(); data.numScripts = inChars.Get(); // There must be enough entries for both arrays, including more than two range starts. int scriptStartsLength = scriptsLength - (1 + data.numScripts + 16); if (scriptStartsLength <= 2) { throw new ICUException("Script order data too short"); } inChars.Get(data.scriptsIndex = new char[data.numScripts + 16]); inChars.Get(data.scriptStarts = new char[scriptStartsLength]); if (!(data.scriptStarts[0] == 0 && data.scriptStarts[1] == ((Collation.MERGE_SEPARATOR_BYTE + 1) << 8) && data.scriptStarts[scriptStartsLength - 1] == (Collation.TRAIL_WEIGHT_BYTE << 8))) { throw new ICUException("Script order data not valid"); } } else if (data == null) { // Nothing to do. } else if (baseData != null) { data.numScripts = baseData.numScripts; data.scriptsIndex = baseData.scriptsIndex; data.scriptStarts = baseData.scriptStarts; } ICUBinary.SkipBytes(inBytes, length); index = IX_COMPRESSIBLE_BYTES_OFFSET; offset = inIndexes[index]; length = inIndexes[index + 1] - offset; if (length >= 256) { if (data == null) { throw new ICUException("Data for compressible primary lead bytes but no mappings"); } data.compressibleBytes = new bool[256]; for (int i = 0; i < 256; ++i) { data.compressibleBytes[i] = inBytes.Get() != 0; } length -= 256; } else if (data == null) { // Nothing to do. } else if (baseData != null) { data.compressibleBytes = baseData.compressibleBytes; } else { throw new ICUException("Missing data for compressible primary lead bytes"); } ICUBinary.SkipBytes(inBytes, length); index = IX_RESERVED18_OFFSET; offset = inIndexes[index]; length = inIndexes[index + 1] - offset; ICUBinary.SkipBytes(inBytes, length); CollationSettings ts = tailoring.Settings.ReadOnly; int options = inIndexes[IX_OPTIONS] & 0xffff; char[] fastLatinPrimaries = new char[CollationFastLatin.LATIN_LIMIT]; int fastLatinOptions = CollationFastLatin.GetOptions( tailoring.Data, ts, fastLatinPrimaries); if (options == ts.Options && ts.VariableTop != 0 && Arrays.Equals(reorderCodes, ts.ReorderCodes) && fastLatinOptions == ts.FastLatinOptions && (fastLatinOptions < 0 || Arrays.Equals(fastLatinPrimaries, ts.FastLatinPrimaries))) { return; } CollationSettings settings = tailoring.Settings.CopyOnWrite(); settings.Options = options; // Set variableTop from options and scripts data. settings.VariableTop = tailoring.Data.GetLastPrimaryForGroup( ReorderCodes.First + settings.MaxVariable); if (settings.VariableTop == 0) { throw new ICUException("The maxVariable could not be mapped to a variableTop"); } if (reorderCodesLength != 0) { settings.AliasReordering(baseData, reorderCodes, reorderCodesLength, reorderTable); } settings.FastLatinOptions = CollationFastLatin.GetOptions( tailoring.Data, settings, settings.FastLatinPrimaries); }