Example #1
0
        /// <summary>
        /// Reads characters until two slashes have been seen and then checks whether the buffered text
        /// ends with one of the allowed schemes (e.g. http(s?):// or ftp(s?)://). When a scheme is found
        /// the buffer is trimmed so that it starts at the scheme, and the scheme index is recorded at 0.
        /// </summary>
        /// <returns>
        /// True if an allowed scheme was found; false if not. When the input stops looking like a scheme,
        /// the result of <c>ReadUserPass(0)</c> is returned instead.
        /// </returns>
        private bool ReadScheme()
        {
            var originalLength = _buffer.Length;
            var numSlashes     = 0;

            while (!_reader.Eof())
            {
                var curr = _reader.Read();

                // A scheme ends with "//": on a slash, buffer it and look for a second one.
                if (curr == '/')
                {
                    _buffer.Append(curr);
                    if (numSlashes == 1)
                    {
                        // Second slash seen. Succeed only if the buffer currently ends with an approved
                        // scheme. For a buffer such as ":u(https://" only the trailing "https://" counts;
                        // the leading characters are dropped below.
                        var bufferedUrlContent = _buffer.ToString().ToLowerInvariant();

                        // For efficiency, first check for an exact match of the whole buffer.
                        if (ValidSchemesSuffixed.Contains(bufferedUrlContent))
                        {
                            _currentUrlMarker.SetIndex(UrlPart.SCHEME, 0);
                            return true;
                        }

                        // No exact match: look for a valid scheme at the end of the buffer, walking the
                        // ordered list backwards so that longer schemes are tried first
                        // (e.g. sftp:// rather than ftp://) - assumes the list is ordered shortest-first.
                        for (var i = ValidSchemesSuffixedOrdered.Length - 1; i >= 0; i--)
                        {
                            var vss = ValidSchemesSuffixedOrdered[i];

                            // Ordinal comparison: the Remove below relies on the suffix matching
                            // character-for-character, which a culture-sensitive EndsWith does not guarantee.
                            if (bufferedUrlContent.EndsWith(vss, System.StringComparison.Ordinal))
                            {
                                // Drop any extra characters that precede the scheme in the buffer.
                                if (bufferedUrlContent.Length > vss.Length)
                                {
                                    _buffer.Remove(0, bufferedUrlContent.Length - vss.Length);
                                }
                                _currentUrlMarker.SetIndex(UrlPart.SCHEME, 0);
                                return true;
                            }
                        }
                        return false;
                    }

                    numSlashes++;
                }
                else if (curr == ' ' || CheckMatchingCharacter(curr) != CharacterMatch.CharacterNotMatched)
                {
                    // A space, or a character that CheckMatchingCharacter reports as matched, ends the
                    // search: buffer it and report that no scheme was found.
                    _buffer.Append(curr);
                    return false;
                }
                else if (curr == '[')
                {
                    // Possibly the start of an ipv6 address: unread the '[' so that the ipv6 search
                    // can start from it.
                    _reader.GoBack();
                    return false;
                }
                else if (originalLength > 0 || numSlashes > 0 || !CharUtils.IsAlpha(curr))
                {
                    // Either the buffer was not empty on entry, a single slash was followed by another
                    // character, or this character is not a-z / A-Z: assume we aren't matching a scheme
                    // but a username and password instead.
                    _reader.GoBack();
                    return ReadUserPass(0);
                }
            }

            // Input exhausted before a complete scheme was seen.
            return false;
        }
Example #2
0
        /// <summary>
        /// Reads and parses the current string to make sure the domain name started where it was supposed to,
        /// and the current domain name is correct. Updates the domain start position, the dot count, the
        /// label/top-level length counters and the numeric/bracket flags as it scans.
        /// </summary>
        /// <returns>
        /// <c>ReaderNextState.InvalidDomainName</c> if the current string cannot be (part of) a domain name,
        /// otherwise <c>ReaderNextState.ValidDomainName</c>.
        /// </returns>
        private ReaderNextState ReadCurrent()
        {
            if (_current != null)
            {
                // The current string is nothing but a single dot (e.g. the leading dot of ".hello").
                if (_current.Length == 1 && CharUtils.IsDot(_current[0]))
                {
                    return ReaderNextState.InvalidDomainName;
                }

                // Same, but the single dot is url encoded. Ordinal comparison: this is a protocol token,
                // not linguistic text.
                if (_current.Length == 3 && _current.Equals("%" + HEX_ENCODED_DOT, StringComparison.OrdinalIgnoreCase))
                {
                    return ReaderNextState.InvalidDomainName;
                }

                // The location where the domain name started.
                _startDomainName = _buffer.Length - _current.Length;

                // Flag that the domain is currently all numbers and/or dots.
                _numeric = true;

                // If an invalid char is found, we can just restart the domain from there.
                var newStart = 0;

                var currArray = _current.ToCharArray();
                var length    = currArray.Length;

                // Hex special case: a "0x" / "0X" prefix followed by at least one more character.
                var isAllHexSoFar = length > 2 && currArray[0] == '0' && (currArray[1] == 'x' || currArray[1] == 'X');

                // Skip the "0x" prefix when present.
                var index = isAllHexSoFar ? 2 : 0;
                var done  = false;

                while (index < length && !done)
                {
                    // Get the current character and update length counts.
                    var curr = currArray[index];
                    _currentLabelLength++;
                    _topLevelLength = _currentLabelLength;

                    // Reject labels that have grown past the maximum label length.
                    if (_currentLabelLength > MAX_LABEL_LENGTH)
                    {
                        return ReaderNextState.InvalidDomainName;
                    }

                    if (CharUtils.IsDot(curr))
                    {
                        // Found a dot. Increment dot count, and reset the label length.
                        _dots++;
                        _currentLabelLength = 0;
                    }
                    else if (curr == '[')
                    {
                        _seenBracket = true;
                        _numeric     = false;
                    }
                    else if (curr == '%' && index + 2 < length && CharUtils.IsHex(currArray[index + 1]) &&
                             CharUtils.IsHex(currArray[index + 2]))
                    {
                        // Percent-encoded character. Hex digits in an escape are case-insensitive, so
                        // both "%2e" and "%2E" are an encoded dot.
                        var isEncodedDot = currArray[index + 1] == '2' &&
                                           (currArray[index + 2] == 'e' || currArray[index + 2] == 'E');
                        if (isEncodedDot)
                        {
                            _dots++;
                            _currentLabelLength = 0;
                        }
                        else
                        {
                            _numeric = false;
                        }

                        // Skip the two hex digits of the escape.
                        index += 2;
                    }
                    else if (isAllHexSoFar)
                    {
                        // First non-hex character after a "0x" prefix: this is not a hex number.
                        if (!CharUtils.IsHex(curr))
                        {
                            _numeric      = false;
                            isAllHexSoFar = false;

                            // Backtrack to rerun this character knowing it isn't hex.
                            // NOTE(review): the rerun increments _currentLabelLength a second time for
                            // this character - confirm that is intended.
                            index--;
                        }
                    }
                    else if (CharUtils.IsAlpha(curr) || curr == '-' || curr >= INTERNATIONAL_CHAR_START)
                    {
                        _numeric = false;
                    }
                    else if (!CharUtils.IsNumeric(curr) && !_options.HasFlag(UrlDetectorOptions.ALLOW_SINGLE_LEVEL_DOMAIN))
                    {
                        // Neither numeric nor alphabetical: restart searching for a domain after this
                        // character and reset all counters.
                        newStart            = index + 1;
                        _currentLabelLength = 0;
                        _topLevelLength     = 0;
                        _numeric            = true;
                        _dots = 0;
                        done  = true;
                    }

                    index++;
                }

                // An invalid character for the domain was found somewhere in the current string.
                // Cut the first part of the domain out. For example:
                // http://asdf%asdf.google.com <- asdf.google.com is still valid, so restart from the %
                if (newStart > 0)
                {
                    // Only cut when something remains after the invalid character.
                    if (newStart < _current.Length)
                    {
                        _buffer.Clear();
                        _buffer.Append(_current.Substring(newStart));

                        // Cut out the previous part, so now the domain name has to be from here.
                        _startDomainName = 0;
                    }

                    // Invalid if nothing remains (the invalid character was the last one) or if all
                    // that is left in the buffer is a single ".".
                    if (newStart >= _current.Length || _buffer.ToString().Equals("."))
                    {
                        return ReaderNextState.InvalidDomainName;
                    }
                }
            }
            else
            {
                // Nothing read yet: the domain name starts at the end of what is already buffered.
                _startDomainName = _buffer.Length;
            }

            // All else is good, return OK.
            return ReaderNextState.ValidDomainName;
        }