Example #1
0
        /// <summary>
        /// Consumes input while looking for a <c>username:password@</c> section, as in
        /// <c>http://username:password@...</c>.
        /// </summary>
        /// <remarks>
        /// When an '@' is read, the user-info start index is recorded on the current URL marker and parsing
        /// continues with the domain name. When a terminating character is read, or a '.' / '[' was seen and
        /// the input ran out, everything appended by this call is removed from the buffer and the reader is
        /// moved back so the same characters can be re-read.
        /// </remarks>
        /// <param name="beginningOfUsername">Index in the buffer at which the username began.</param>
        /// <returns>
        /// The result of <c>ReadDomainName</c> when an '@' was found; <c>false</c> after a rollback;
        /// otherwise the result of <c>ReadEnd(ReadEndState.InvalidUrl)</c>.
        /// </returns>
        private bool ReadUserPass(int beginningOfUsername)
        {
            //buffer length on entry; anything beyond this mark was appended by this call.
            var bufferMark = _buffer.Length;

            //set once a character that cannot belong to a username/password has been read.
            var hitTerminator = false;

            //set when a '.', '[' or terminator was seen, i.e. this may not be user info at all.
            var mustRollback = false;

            while (!hitTerminator && !_reader.Eof())
            {
                var ch = _reader.Read();

                if (ch == '@')
                {
                    //user info confirmed: record where it started and move on to the domain name.
                    _buffer.Append(ch);
                    _currentUrlMarker.SetIndex(UrlPart.USERNAME_PASSWORD, beginningOfUsername);
                    return ReadDomainName("");
                }

                var isDotOrBracket = CharUtils.IsDot(ch) || ch == '[';

                if (!isDotOrBracket &&
                    (ch == '#' || ch == ' ' || ch == '/' ||
                     CheckMatchingCharacter(ch) != CharacterMatch.CharacterNotMatched))
                {
                    //terminating character: it is not appended, and the loop ends after this iteration.
                    mustRollback  = true;
                    hitTerminator = true;
                    continue;
                }

                //still plausible user info, so keep the character.
                _buffer.Append(ch);

                if (isDotOrBracket)
                {
                    //a dot or '[' means this could be a domain name instead; remember to backtrack.
                    mustRollback = true;
                }
            }

            if (!mustRollback)
            {
                //input ended without an '@' and without anything that requires backtracking.
                return ReadEnd(ReadEndState.InvalidUrl);
            }

            //no '@' was found: drop what this call appended to the buffer.
            var consumed = _buffer.Length - bufferMark;
            _buffer.Remove(bufferMark, consumed);

            //move the reader back over the appended characters, plus the terminator if one was read.
            var terminatorWidth = hitTerminator ? 1 : 0;
            var rewindTo        = Math.Max(_reader.GetPosition() - consumed - terminatorWidth, 0);
            _reader.Seek(rewindTo);

            return false;
        }
Example #2
0
        /// <summary>
        /// Reads a domain name (host) from the reader, one character at a time, appending accepted
        /// characters to the buffer and updating the label/dot/IPv6 tracking fields as it goes.
        /// </summary>
        /// <remarks>
        /// The text already collected in <c>_current</c> is validated first via <c>ReadCurrent</c>.
        /// Reading stops at the first character that starts another URL part ('/', ':', '?', '#'),
        /// at an '@' (the text may have been user info), at an invalid character, or at end of input.
        /// </remarks>
        /// <returns>
        /// <c>ReaderNextState.InvalidDomainName</c> when the domain is rejected here or by <c>ReadCurrent</c>;
        /// <c>ReaderNextState.ReadUserPass</c> when an '@' is found; otherwise whatever
        /// <c>CheckDomainNameValid</c> returns for the proposed next state.
        /// </returns>
        public ReaderNextState ReadDomainName()
        {
            //Validate what was read before this call; if it is bad, just return.
            if (ReadCurrent() == ReaderNextState.InvalidDomainName)
            {
                return(ReaderNextState.InvalidDomainName);
            }

            //If this is the first domain part (nothing in _current), check whether the upcoming
            //input starts with the hex radix prefix "0x" (case-insensitive), i.e. a hex ip address.
            //Similar to the hex special case in ReadCurrent.
            bool isAllHexSoFar = (_current == null || _current.Equals("")) &&
                                 _reader.CanReadChars(3) &&
                                 ("0x".Equals(_reader.Peek(2), StringComparison.InvariantCultureIgnoreCase));

            if (isAllHexSoFar)
            {
                //Consume and append the hex radix symbol characters (0x); they count towards the label length.
                _buffer.Append(_reader.Read());
                _buffer.Append(_reader.Read());
                _currentLabelLength += 2;
                _topLevelLength      = _currentLabelLength;
            }

            //while not done and not end of input keep reading.
            var done = false;

            while (!done && !_reader.Eof())
            {
                var curr = _reader.Read();

                if (curr == '/')
                {
                    //continue by reading the path
                    return(CheckDomainNameValid(ReaderNextState.ReadPath, curr));
                }

                if (curr == ':' && (!_seenBracket || _seenCompleteBracketSet))
                {
                    //Only treat ':' as a port separator when we are not in the middle of an ipv6 address
                    //(no '[' seen yet, or the bracket pair is already closed).
                    //continue by reading the port.
                    return(CheckDomainNameValid(ReaderNextState.ReadPort, curr));
                }

                if (curr == '?')
                {
                    //continue by reading the query string
                    return(CheckDomainNameValid(ReaderNextState.ReadQueryString, curr));
                }

                if (curr == '#')
                {
                    //continue by reading the fragment
                    return(CheckDomainNameValid(ReaderNextState.ReadFragment, curr));
                }
                else if (curr == '@')
                {
                    //this may not have been a domain after all, but rather a username/password instead.
                    //NOTE(review): GoBack presumably un-reads the '@' so the next state sees it - confirm.
                    _reader.GoBack();
                    return(ReaderNextState.ReadUserPass);
                }
                else if (CharUtils.IsDot(curr) ||
                         curr == '%' && _reader.CanReadChars(2) && _reader.Peek(2).Equals(HEX_ENCODED_DOT, StringComparison.InvariantCultureIgnoreCase))
                {
                    //the current character is a dot, or a '%' followed by the url encoded dot digits.

                    //an empty label before the dot handles the case: hello..
                    if (_currentLabelLength < 1)
                    {
                        done = true;
                    }
                    else
                    {
                        //append the "." (or the '%' of an encoded dot) to the domain name
                        _buffer.Append(curr);

                        //if it was not a normal dot, then it is url encoded:
                        //read and append the next two chars, which are the hex representation
                        if (!CharUtils.IsDot(curr))
                        {
                            _buffer.Append(_reader.Read());
                            _buffer.Append(_reader.Read());
                        }

                        //increment the dots only if it's not part of the zone index and reset the last length.
                        if (!_zoneIndex)
                        {
                            _dots++;
                            _currentLabelLength = 0;
                        }

                        //if the length of the last section is longer than or equal to the maximum, it's too long
                        //to be a valid domain. Because the length was just reset above when _zoneIndex is false,
                        //this can only trigger while inside a zone index.
                        if (_currentLabelLength >= MAX_LABEL_LENGTH)
                        {
                            return(ReaderNextState.InvalidDomainName);
                        }
                    }
                }
                else if (_seenBracket && (CharUtils.IsHex(curr) || curr == ':' || curr == '[' || curr == ']' || curr == '%') &&
                         !_seenCompleteBracketSet)
                {
                    //inside an unclosed '[': treat this as part of an ipv6 address.
                    switch (curr)
                    {
                    case ':':
                        //group separator: start counting a new group.
                        _currentLabelLength = 0;
                        break;

                    case '[':
                        // if we read another '[', we need to restart by re-reading from this bracket instead.
                        _reader.GoBack();
                        return(ReaderNextState.InvalidDomainName);

                    case ']':
                        _seenCompleteBracketSet = true;                     //means that we already have a complete ipv6 address.
                        _zoneIndex = false;                                 //set this back off so that we can keep counting dots after ipv6 is over.
                        break;

                    case '%':                             //set flag to subtract subsequent dots because it's part of the zone index
                        _zoneIndex = true;
                        break;

                    default:
                        //a hex digit of the current group.
                        _currentLabelLength++;
                        break;
                    }

                    //an ipv6 address is never treated as a plain numeric host; keep the character.
                    _numeric = false;
                    _buffer.Append(curr);
                }
                else if (CharUtils.IsAlphaNumeric(curr) || curr == '-' || curr >= INTERNATIONAL_CHAR_START)
                {
                    //Valid domain name character. Either a-z, A-Z, 0-9, -, or international character
                    if (_seenCompleteBracketSet)
                    {
                        //covers case of [fe80::]www.google.com : the bracketed address is complete,
                        //so step back and stop here.
                        _reader.GoBack();
                        done = true;
                    }
                    else
                    {
                        //in hex mode a non-hex character means the host is not numeric.
                        if (isAllHexSoFar && !CharUtils.IsHex(curr))
                        {
                            _numeric = false;
                        }
                        //outside hex mode, remember when a character is not a digit.
                        if (!isAllHexSoFar && !CharUtils.IsNumeric(curr))
                        {
                            _numeric = false;
                        }

                        //append to the buffer and update the label length counters.
                        _buffer.Append(curr);
                        _currentLabelLength++;
                        _topLevelLength = _currentLabelLength;
                    }
                }
                else if (curr == '[' && !_seenBracket)
                {
                    //first opening bracket: start of an ipv6 address.
                    _seenBracket = true;
                    _numeric     = false;
                    _buffer.Append(curr);
                }
                else if (curr == '[' && _seenCompleteBracketSet)
                {
                    //Case where [::][ ... : step back and stop at the second bracketed section.
                    _reader.GoBack();
                    done = true;
                }
                else if (curr == '%' && _reader.CanReadChars(2) && CharUtils.IsHex(_reader.PeekChar(0)) &&
                         CharUtils.IsHex(_reader.PeekChar(1)))
                {
                    //a percent-encoded character (other than the encoded dot handled above):
                    //append all three characters and count them towards the label length.
                    _buffer.Append(curr);
                    _buffer.Append(_reader.Read());
                    _buffer.Append(_reader.Read());
                    _currentLabelLength += 3;
                    _topLevelLength      = _currentLabelLength;
                }
                else
                {
                    //hand the character to the character handler
                    //(the original comment says this increments the count of matching characters).

                    _characterHandler(curr);

                    //invalid character, we are done.
                    done = true;
                }
            }

            //Reading stopped without a follow-on URL part; check the domain name to make sure its ok.
            return(CheckDomainNameValid(ReaderNextState.ValidDomainName, null));
        }
Example #3
0
        /// <summary>
        /// Validates the text already collected in <c>_current</c> as the beginning of a domain name and
        /// primes the domain-tracking fields (<c>_startDomainName</c>, <c>_numeric</c>, <c>_dots</c>,
        /// <c>_currentLabelLength</c>, <c>_topLevelLength</c>, <c>_seenBracket</c>) from it.
        /// </summary>
        /// <remarks>
        /// When <c>_current</c> is <c>null</c> the domain simply starts at the end of the buffer.
        /// When a character that cannot be part of a domain is found, the buffer is cut so that the domain
        /// restarts just after that character.
        /// A percent-encoded dot is recognised regardless of the case of its hex digit
        /// (<c>%2e</c> and <c>%2E</c>), matching the case-insensitive comparisons used elsewhere for it.
        /// </remarks>
        /// <returns>
        /// <c>ReaderNextState.InvalidDomainName</c> when <c>_current</c> is just a dot (plain or encoded),
        /// a label exceeds <c>MAX_LABEL_LENGTH</c>, or nothing usable remains after cutting at an invalid
        /// character; otherwise <c>ReaderNextState.ValidDomainName</c>.
        /// </returns>
        private ReaderNextState ReadCurrent()
        {
            if (_current != null)
            {
                //Handles the case where the string is ".hello"
                if (_current.Length == 1 && CharUtils.IsDot(_current[0]))
                {
                    return(ReaderNextState.InvalidDomainName);
                }

                //Same case, but with a url encoded dot.
                if (_current.Length == 3 && _current.Equals("%" + HEX_ENCODED_DOT, StringComparison.InvariantCultureIgnoreCase))
                {
                    return(ReaderNextState.InvalidDomainName);
                }

                //The location where the domain name started.
                _startDomainName = _buffer.Length - _current.Length;

                //flag that the domain is currently all numbers and/or dots.
                _numeric = true;

                //If an invalid char is found, we can just restart the domain from there.
                var newStart = 0;

                var currArray = _current.ToCharArray();
                var length    = currArray.Length;

                //hex special case: the text starts with "0x"/"0X" followed by at least one more character.
                var isAllHexSoFar = length > 2 && currArray[0] == '0' && (currArray[1] == 'x' || currArray[1] == 'X');

                //skip the radix prefix when present.
                var index = isAllHexSoFar ? 2 : 0;
                var done  = false;

                while (index < length && !done)
                {
                    //get the current character and update length counts.
                    var curr = currArray[index];
                    _currentLabelLength++;
                    _topLevelLength = _currentLabelLength;

                    //Is the label now longer than the maximum (checked after the increment above)
                    if (_currentLabelLength > MAX_LABEL_LENGTH)
                    {
                        return(ReaderNextState.InvalidDomainName);
                    }

                    if (CharUtils.IsDot(curr))
                    {
                        //found a dot. Increment dot count, and reset last length
                        _dots++;
                        _currentLabelLength = 0;
                    }
                    else if (curr == '[')
                    {
                        _seenBracket = true;
                        _numeric     = false;
                    }
                    else if (curr == '%' && index + 2 < length && CharUtils.IsHex(currArray[index + 1]) &&
                             CharUtils.IsHex(currArray[index + 2]))
                    {
                        //handle url encoded dot. Hex digits in a percent-encoding are case-insensitive,
                        //so both "%2e" and "%2E" are a dot.
                        var lowDigit = currArray[index + 2];

                        if (currArray[index + 1] == '2' && (lowDigit == 'e' || lowDigit == 'E'))
                        {
                            _dots++;
                            _currentLabelLength = 0;
                        }
                        else
                        {
                            _numeric = false;
                        }

                        //skip the two hex digits that were just consumed.
                        index += 2;
                    }
                    else if (isAllHexSoFar)
                    {
                        //if it's a valid character in the domain that is not numeric
                        if (!CharUtils.IsHex(curr))
                        {
                            _numeric      = false;
                            isAllHexSoFar = false;
                            //backtrack to rerun last character knowing it isn't hex.
                            //NOTE(review): the rerun increments _currentLabelLength a second time for
                            //this character - confirm whether that is intended.
                            index--;
                        }
                    }
                    else if (CharUtils.IsAlpha(curr) || curr == '-' || curr >= INTERNATIONAL_CHAR_START)
                    {
                        _numeric = false;
                    }
                    else if (!CharUtils.IsNumeric(curr) && !_options.HasFlag(UrlDetectorOptions.ALLOW_SINGLE_LEVEL_DOMAIN))
                    {
                        //if its not _numeric and not alphabetical, then restart searching for a domain from this point.
                        newStart            = index + 1;
                        _currentLabelLength = 0;
                        _topLevelLength     = 0;
                        _numeric            = true;
                        _dots = 0;
                        done  = true;
                    }

                    index++;
                }

                //An invalid character for the domain was found somewhere in the current buffer.
                //cut the first part of the domain out. For example:
                // http://asdf%asdf.google.com <- asdf.google.com is still valid, so restart from the %
                if (newStart > 0)
                {
                    //make sure the location is not at the end. Otherwise the thing is just invalid.
                    if (newStart < _current.Length)
                    {
                        _buffer.Clear();
                        _buffer.Append(_current.Substring(newStart));

                        //cut out the previous part, so now the domain name has to be from here.
                        _startDomainName = 0;
                    }

                    //now after cutting if the buffer is just "." newStart > current (last character in current is invalid)
                    if (newStart >= _current.Length || _buffer.ToString().Equals("."))
                    {
                        return(ReaderNextState.InvalidDomainName);
                    }
                }
            }
            else
            {
                //nothing was read before the domain, so it starts at the current end of the buffer.
                _startDomainName = _buffer.Length;
            }

            //all else is good, return OK
            return(ReaderNextState.ValidDomainName);
        }