/// <summary>
/// Picks the encoding to match with, converts <paramref name="input"/> into a CLR string accordingly,
/// and returns the regex produced by <c>TransformPattern</c> for that encoding.
/// </summary>
/// <param name="encoding">
/// Encoding supplied by the caller. It is overwritten with the pattern's encoding when the regex options
/// carry an encoding flag, and with <c>RubyEncoding.Binary</c> when, in addition, the pattern contains
/// escaped non-ASCII bytes.
/// </param>
/// <param name="input">The string to be matched; must not be null.</param>
/// <param name="start">Forwarded to <c>ForceEncoding</c> on the k-coded path; unused on the other path.</param>
/// <param name="strInput">Receives the string form of <paramref name="input"/> that should be matched.</param>
private Regex/*!*/ Transform(ref RubyEncoding encoding, MutableString/*!*/ input, int start, out string strInput) {
    ContractUtils.RequiresNotNull(input, "input");

    // TODO:
    // K-coding of the current operation (the current KCODE should get preference over the KCODE regex option).
    // For now only the encoding flags stored in the regex options are consulted.
    RubyRegexOptions kcoding = _options & RubyRegexOptions.EncodingMask;
    if (kcoding == 0) {
        // NOTE(review): presumably NONE is the zero value, which would make this assignment a no-op - confirm.
        kcoding = RubyRegexOptions.NONE;
    } else {
        encoding = _pattern.Encoding;
    }

    if (kcoding == 0) {
        // No k-coding in effect: the input only has to be encoding-compatible with the pattern
        // and is converted to a string as a whole.
        _pattern.RequireCompatibleEncoding(input);
        input.PrepareForCharacterRead();
        strInput = input.ConvertToString();
        return TransformPattern(encoding, kcoding);
    }

    // K-coded matching. Multi-byte K-coded characters are not handled entirely correctly here.
    // There are three situations:
    //   1) The pattern literally contains a multi-byte character:      /€*/
    //   2) Consecutive escapes add up to a complete character:         /\342\202\254*/ or /\xe2\x82\xac*/
    //   3) Consecutive escapes add up to an incomplete character:      /[\x7f-\xff]{1,3}/
    //
    // For 1) and 2) the byte triplet should be "grouped" so that *, +, ? and {n,m} apply to the whole
    // character rather than to its last byte. Unescaping the bytes into complete Unicode characters and
    // encoding the input with the same K-coding would give a match.
    // Situation 3) needs the opposite: the bytes can only be matched if the input is encoded using
    // the binary encoding, which in turn makes *+? apply to the last byte (encoded as a UTF16 character).
    //
    // A proper solution needs a regex engine that understands multi-byte escaped characters; this one doesn't.
    //
    // TODO:
    // A correct workaround would wrap each byte sequence forming a character in a non-capturing group,
    // e.g. rewrite /\342\202\254*/ as /(?:\342\202\254)*/, and use binary encoding for both input and pattern.
    // Until then we only check whether the pattern has any non-ASCII character escapes. If it does we switch
    // to binary encoding, accommodating case 3) but breaking cases 1) and 2). If it doesn't we keep the
    // k-coding so that case 1) matches.
    if (HasEscapedNonAsciiBytes(_pattern)) {
        encoding = RubyEncoding.Binary;
        kcoding = 0;
    }

    strInput = ForceEncoding(input, encoding.Encoding, start);
    return TransformPattern(encoding, kcoding);
}
/// <summary>
/// Initializes this regex from the given pattern and options: records the options (without
/// <c>RubyRegexOptions.Once</c>), stores a frozen copy of the pattern, and runs <c>TransformPattern</c>.
/// </summary>
/// <param name="pattern">The pattern source; must not be null. A frozen copy is stored, not this instance.</param>
/// <param name="options">Regex options; the <c>Once</c> flag is stripped before they are stored.</param>
public void Set(MutableString/*!*/ pattern, RubyRegexOptions options) {
    ContractUtils.RequiresNotNull(pattern, "pattern");

    // The Once flag only influences how the Regexp object gets created and cached.
    // It is not a property of the resulting object: /foo/ should compare equal with /foo/o.
    _options = options & ~RubyRegexOptions.Once;

    // Null when the options do not select an encoding (the code below explicitly handles that case).
    RubyEncoding regexEncoding = RubyEncoding.GetRegexEncoding(options);

    // The pattern is kept as raw bytes when the options select an encoding or the pattern itself is k-coded;
    // otherwise a character-based clone is stored.
    bool keepAsBytes = regexEncoding != null || pattern.Encoding.IsKCoding;

    _pattern = keepAsBytes
        ? MutableString.CreateBinary(pattern.ToByteArray(), regexEncoding ?? RubyEncoding.Binary).Freeze()
        : pattern.PrepareForCharacterRead().Clone().Freeze();

    // The result of TransformPattern is discarded here.
    // NOTE(review): presumably it is invoked for its side effects (validating/caching the pattern) - confirm.
    RubyRegexOptions encodingFlags = options & RubyRegexOptions.EncodingMask;
    TransformPattern(regexEncoding, encodingFlags);
}