Example #1
0
        /// <summary>
        /// Converts <paramref name="input"/> into a CLR string and returns the result of
        /// <c>TransformPattern</c> for the encoding and k-coding selected here.
        /// </summary>
        /// <param name="encoding">
        /// Passed by reference. Replaced with the pattern's encoding when this regex's options carry
        /// encoding bits, and with <c>RubyEncoding.Binary</c> when <c>HasEscapedNonAsciiBytes</c>
        /// reports true for the pattern. Left untouched otherwise.
        /// </param>
        /// <param name="input">The string to be matched; must not be null.</param>
        /// <param name="start">Forwarded to <c>ForceEncoding</c>; not used on the non-k-coded path.</param>
        /// <param name="strInput">Receives the string form of <paramref name="input"/>.</param>
        /// <returns>Whatever <c>TransformPattern</c> returns for the chosen encoding and k-coding.</returns>
        private Regex /*!*/ Transform(ref RubyEncoding encoding, MutableString /*!*/ input, int start, out string strInput)
        {
            ContractUtils.RequiresNotNull(input, "input");

            // TODO:

            // K-coding of the current operation, taken from the encoding bits of this regex's options.
            // NOTE(review): an earlier comment said the current KCODE should take preference over the
            // regex option; nothing visible here consults a global KCODE - confirm the intended behavior.
            RubyRegexOptions kcoding = _options & RubyRegexOptions.EncodingMask;

            if (kcoding == 0)
            {
                kcoding = RubyRegexOptions.NONE;
            }
            else
            {
                encoding = _pattern.Encoding;
            }

            // No k-coding in effect: the input only has to be compatible with the pattern's encoding.
            if (kcoding == 0)
            {
                _pattern.RequireCompatibleEncoding(input);
                input.PrepareForCharacterRead();
                strInput = input.ConvertToString();

                return TransformPattern(encoding, kcoding);
            }

            // K-coding is in effect: the input has to be forced into an encoding.
            //
            // Multi-byte K-coded characters are not handled entirely correctly here. There are three cases:
            //   1) The multi-byte character appears literally in the pattern: /€*/
            //   2) Consecutive escapes add up to a complete character: /\342\202\254*/ or /\xe2\x82\xac*/
            //   3) Consecutive escapes add up to an incomplete character: /[\x7f-\xff]{1,3}/
            //
            // For 1) and 2) the byte triplet should be "grouped" so that operators such as *, +, ? and {n,m}
            // apply to the whole character rather than to its last byte. The bytes could be unescaped and
            // replaced with complete Unicode characters; encoding the input with the same K-coding would
            // then produce a match.
            // Case 3) needs the opposite: matching raw bytes requires the input to be encoded with the binary
            // encoding, which in turn makes *+? operate on the last byte (encoded as a UTF16 character).
            //
            // The proper solution needs a regex engine that understands multi-byte escaped characters,
            // which this one does not.
            //
            // TODO:
            // A correct workaround would wrap each byte sequence that forms a character in a non-capturing
            // group, e.g. rewrite /\342\202\254*/ as /(?:\342\202\254)*/, and use binary encoding for both
            // the input and the pattern.
            // For now we only detect whether the pattern has any non-ASCII character escapes. If it does,
            // binary encoding is used, accommodating case 3) but breaking cases 1) and 2). If it does not,
            // the k-coding is used so that case 1) matches.
            bool matchRawBytes = HasEscapedNonAsciiBytes(_pattern);

            if (matchRawBytes)
            {
                encoding = RubyEncoding.Binary;
                kcoding = 0;
            }

            strInput = ForceEncoding(input, encoding.Encoding, start);

            return TransformPattern(encoding, kcoding);
        }
Example #2
0
        /// <summary>
        /// Stores the given pattern and options on this regex and runs <c>TransformPattern</c>
        /// for the resulting encoding.
        /// </summary>
        /// <param name="pattern">Pattern source; must not be null. A frozen copy is stored, not the argument itself.</param>
        /// <param name="options">Regex options. The <c>Once</c> flag is removed before the options are stored.</param>
        public void Set(MutableString /*!*/ pattern, RubyRegexOptions options)
        {
            ContractUtils.RequiresNotNull(pattern, "pattern");

            // The Once flag only controls how the Regexp object is created and cached; it is not
            // a property of the resulting object, so that /foo/ compares equal with /foo/o.
            _options = options & ~RubyRegexOptions.Once;

            RubyEncoding regexEncoding = RubyEncoding.GetRegexEncoding(options);
            RubyRegexOptions encodingBits = options & RubyRegexOptions.EncodingMask;

            // The pattern is kept as raw bytes when the options name an encoding or the pattern is K-coded.
            bool keepAsBytes = regexEncoding != null || pattern.Encoding.IsKCoding;

            if (!keepAsBytes)
            {
                _pattern = pattern.PrepareForCharacterRead().Clone().Freeze();
            }
            else
            {
                // Fall back to the binary encoding when the options did not yield one.
                RubyEncoding byteEncoding = regexEncoding ?? RubyEncoding.Binary;
                _pattern = MutableString.CreateBinary(pattern.ToByteArray(), byteEncoding).Freeze();
            }

            // The return value is not used here.
            TransformPattern(regexEncoding, encodingBits);
        }