/// <summary>
/// Picks the encoding/k-coding to use for a match, converts <paramref name="input"/> to a CLR string
/// accordingly, and returns the result of <c>TransformPattern</c> for the chosen encoding and k-coding.
/// </summary>
/// <param name="encoding">
/// Encoding of the current operation. May be overwritten with the pattern's encoding (when the regex options
/// carry an encoding flag) or with <c>RubyEncoding.Binary</c> (when the pattern has escaped non-ASCII bytes).
/// </param>
/// <param name="input">The string to be matched; must not be null.</param>
/// <param name="start">Passed to <c>ForceEncoding</c>; only used on the k-coded path.</param>
/// <param name="strInput">Receives the string form of <paramref name="input"/>.</param>
/// <returns>The value returned by <c>TransformPattern</c>.</returns>
private Regex/*!*/ Transform(ref RubyEncoding encoding, MutableString/*!*/ input, int start, out string strInput) {
    ContractUtils.RequiresNotNull(input, "input");

    // Determine the k-coding of this operation. An encoding flag present in the regex options selects
    // the pattern's own encoding; only when no such flag is set is the k-coding derived from the
    // encoding supplied by the caller.
    RubyRegexOptions kcoding = _options & RubyRegexOptions.EncodingMask;
    if (kcoding == 0) {
        kcoding = RubyEncoding.ToRegexOption(encoding);
    } else {
        encoding = _pattern.Encoding;
    }

    // No k-coding in effect: convert the input without forcing an encoding.
    if (kcoding == 0) {
        if (input.Encoding.IsKCoding) {
            // The input itself is k-coded, so read it through the binary encoding.
            strInput = input.ToString(BinaryEncoding.Instance);
        } else {
            _pattern.RequireCompatibleEncoding(input);
            input.PrepareForCharacterRead();
            strInput = input.ConvertToString();
        }

        return TransformPattern(encoding, kcoding);
    }

    // K-coding is in effect: the input has to be forced into the selected encoding.
    //
    // Handling of multi-byte K-coded characters is not entirely correct here.
    // There are three cases to consider:
    // 1) A multi-byte character is explicitly contained in the pattern: /€*/
    // 2) Subsequent escapes form a complete character: /\342\202\254*/ or /\xe2\x82\xac*/
    // 3) Subsequent escapes form an incomplete character: /[\x7f-\xff]{1,3}/
    //
    // In cases 1 and 2 we want to "group" the byte triplet so that regex operators such as *, +, ? and {n,m}
    // apply to the entire character rather than to its last byte. We could unescape the bytes, replace them
    // with complete Unicode characters and encode the input using the same K-coding to get a match.
    // Case 3 requires the opposite: to match individual bytes the input must be encoded using the binary
    // encoding, which makes *, + and ? operate on the last byte (encoded as a UTF-16 character).
    //
    // The right solution would require the regex engine to handle multi-byte escaped characters, which it doesn't.
    //
    // TODO:
    // A correct workaround would be to wrap the byte sequence that forms a character into a non-capturing group,
    // for example transform /\342\202\254*/ to /(?:\342\202\254)*/, and use binary encoding on both input and pattern.
    // For now we only detect whether the pattern contains any non-ASCII character escapes. If it does, we use
    // the binary encoding, accommodating case 3 but breaking cases 1 and 2. Otherwise we encode using the
    // k-coding so that case 1 matches.
    if (HasEscapedNonAsciiBytes(_pattern)) {
        encoding = RubyEncoding.Binary;
        kcoding = 0;
    }

    strInput = ForceEncoding(input, encoding.Encoding, start);
    return TransformPattern(encoding, kcoding);
}