/// <summary>
/// Scans the input for the '@' that terminates a username/password section.
/// Handles:
/// http://username:password@...
/// </summary>
/// <param name="beginningOfUsername">Index of the buffer of where the username began.</param>
/// <returns>
/// The result of reading the domain name when an '@' is found; false when the consumed input is
/// rolled back because no '@' was found; otherwise the result of ending the read as an invalid url.
/// </returns>
private bool ReadUserPass(int beginningOfUsername)
{
    // Buffer length on entry; everything appended past this mark is discarded on rollback.
    var bufferMark = _buffer.Length;

    // Set once a character that cannot be part of a username/password has been read.
    var stopped = false;

    // Set when the text may really be a domain name (dot or '[' seen) or is plainly invalid.
    var mustRollback = false;

    while (!stopped && !_reader.Eof())
    {
        var ch = _reader.Read();

        // Found the separator: record where the username started and carry on with the domain.
        if (ch == '@')
        {
            _buffer.Append(ch);
            _currentUrlMarker.SetIndex(UrlPart.USERNAME_PASSWORD, beginningOfUsername);
            return ReadDomainName("");
        }

        // Still acceptable, but remember that we may have to backtrack.
        if (CharUtils.IsDot(ch) || ch == '[')
        {
            _buffer.Append(ch);
            mustRollback = true;
            continue;
        }

        // Any of these characters means this cannot be a username/password.
        var isTerminator = ch == '#'
            || ch == ' '
            || ch == '/'
            || CheckMatchingCharacter(ch) != CharacterMatch.CharacterNotMatched;

        if (isTerminator)
        {
            mustRollback = true;
            stopped = true;
        }
        else
        {
            // Anything else is assumed to be fine so far.
            _buffer.Append(ch);
        }
    }

    if (!mustRollback)
    {
        return ReadEnd(ReadEndState.InvalidUrl);
    }

    // No '@' was found: drop what this method appended and rewind the reader accordingly.
    var appended = _buffer.Length - bufferMark;
    _buffer.Remove(bufferMark, appended);

    // The terminating character was read but never appended, so step back over it as well.
    var terminatorOffset = stopped ? 1 : 0;
    var rewindTo = Math.Max(_reader.GetPosition() - appended - terminatorOffset, 0);
    _reader.Seek(rewindTo);

    return false;
}
/// <summary>
/// Reads the domain name and returns the next state the state machine should take: either a state
/// that throws this candidate out, or the state that continues processing after a valid domain name.
/// </summary>
/// <returns>The next state to take.</returns>
public ReaderNextState ReadDomainName()
{
    // Validate what has already been collected; if it is bad, just return.
    if (ReadCurrent() == ReaderNextState.InvalidDomainName)
    {
        return ReaderNextState.InvalidDomainName;
    }

    // If this is the first domain part, check whether it starts with a hex radix prefix ("0x"),
    // similar to what is done in ReadCurrent. Ordinal comparison: this is protocol text, not
    // linguistic text.
    bool isAllHexSoFar = string.IsNullOrEmpty(_current)
        && _reader.CanReadChars(3)
        && "0x".Equals(_reader.Peek(2), StringComparison.OrdinalIgnoreCase);

    if (isAllHexSoFar)
    {
        // Append the hex radix symbol characters (0x).
        _buffer.Append(_reader.Read());
        _buffer.Append(_reader.Read());
        _currentLabelLength += 2;
        _topLevelLength = _currentLabelLength;
    }

    // While not done and not at the end of the input, keep reading.
    var done = false;
    while (!done && !_reader.Eof())
    {
        var curr = _reader.Read();

        if (curr == '/')
        {
            // Continue by reading the path.
            return CheckDomainNameValid(ReaderNextState.ReadPath, curr);
        }

        if (curr == ':' && (!_seenBracket || _seenCompleteBracketSet))
        {
            // Don't check for a port if we are in the middle of an ipv6 address.
            // Continue by reading the port.
            return CheckDomainNameValid(ReaderNextState.ReadPort, curr);
        }

        if (curr == '?')
        {
            // Continue by reading the query string.
            return CheckDomainNameValid(ReaderNextState.ReadQueryString, curr);
        }

        if (curr == '#')
        {
            // Continue by reading the fragment.
            return CheckDomainNameValid(ReaderNextState.ReadFragment, curr);
        }

        if (curr == '@')
        {
            // This may not have been a domain after all, but rather a username/password instead.
            _reader.GoBack();
            return ReaderNextState.ReadUserPass;
        }

        if (CharUtils.IsDot(curr)
            || (curr == '%'
                && _reader.CanReadChars(2)
                && _reader.Peek(2).Equals(HEX_ENCODED_DOT, StringComparison.OrdinalIgnoreCase)))
        {
            // The current character is a dot or the start of a url encoded dot.
            if (_currentLabelLength < 1)
            {
                // Handles the case: hello..
                done = true;
            }
            else
            {
                // Append the "." to the domain name.
                _buffer.Append(curr);

                // If it was not a normal dot, then it is url encoded:
                // read the next two chars, which are the hex representation.
                if (!CharUtils.IsDot(curr))
                {
                    _buffer.Append(_reader.Read());
                    _buffer.Append(_reader.Read());
                }

                // The label that this dot terminates must be validated BEFORE its length counter
                // is reset below; checking after the reset would compare 0 against the limit and
                // the over-long label would never be rejected.
                if (_currentLabelLength >= MAX_LABEL_LENGTH)
                {
                    return ReaderNextState.InvalidDomainName;
                }

                // Increment the dots only if it's not part of the zone index, and reset the
                // length of the current label.
                if (!_zoneIndex)
                {
                    _dots++;
                    _currentLabelLength = 0;
                }
            }
        }
        else if (_seenBracket
                 && (CharUtils.IsHex(curr) || curr == ':' || curr == '[' || curr == ']' || curr == '%')
                 && !_seenCompleteBracketSet)
        {
            // Inside an (incomplete) bracketed ipv6 address.
            switch (curr)
            {
                case ':':
                    _currentLabelLength = 0;
                    break;
                case '[':
                    // If we read another '[', we need to restart by re-reading from this bracket instead.
                    _reader.GoBack();
                    return ReaderNextState.InvalidDomainName;
                case ']':
                    // We now have a complete ipv6 address.
                    _seenCompleteBracketSet = true;
                    // Switch this back off so that dots are counted again after the ipv6 part.
                    _zoneIndex = false;
                    break;
                case '%':
                    // Subsequent dots are part of the zone index and must not be counted.
                    _zoneIndex = true;
                    break;
                default:
                    _currentLabelLength++;
                    break;
            }

            _numeric = false;
            _buffer.Append(curr);
        }
        else if (CharUtils.IsAlphaNumeric(curr) || curr == '-' || curr >= INTERNATIONAL_CHAR_START)
        {
            // Valid domain name character: a-z, A-Z, 0-9, '-', or an international character.
            if (_seenCompleteBracketSet)
            {
                // Covers the case of [fe80::]www.google.com
                _reader.GoBack();
                done = true;
            }
            else
            {
                // With a hex prefix any non-hex character makes the name non-numeric;
                // without it any non-decimal character does.
                if (isAllHexSoFar && !CharUtils.IsHex(curr))
                {
                    _numeric = false;
                }

                if (!isAllHexSoFar && !CharUtils.IsNumeric(curr))
                {
                    _numeric = false;
                }

                // Append to the states.
                _buffer.Append(curr);
                _currentLabelLength++;
                _topLevelLength = _currentLabelLength;
            }
        }
        else if (curr == '[' && !_seenBracket)
        {
            _seenBracket = true;
            _numeric = false;
            _buffer.Append(curr);
        }
        else if (curr == '[' && _seenCompleteBracketSet)
        {
            // Case where [::][ ...
            _reader.GoBack();
            done = true;
        }
        else if (curr == '%'
                 && _reader.CanReadChars(2)
                 && CharUtils.IsHex(_reader.PeekChar(0))
                 && CharUtils.IsHex(_reader.PeekChar(1)))
        {
            // A url encoded character: append all three characters to the states.
            _buffer.Append(curr);
            _buffer.Append(_reader.Read());
            _buffer.Append(_reader.Read());
            _currentLabelLength += 3;
            _topLevelLength = _currentLabelLength;
        }
        else
        {
            // Hand the character to the character handler (used to count matching characters).
            _characterHandler(curr);

            // Invalid character, we are done.
            done = true;
        }
    }

    // Check the domain name to make sure it is ok.
    return CheckDomainNameValid(ReaderNextState.ValidDomainName, null);
}
/// <summary>
/// Reads and parses the current string to make sure the domain name started where it was supposed to,
/// and that the current domain name is correct. Updates the dot count, label lengths, numeric flag,
/// bracket flag and the domain start index as it goes.
/// </summary>
/// <returns>The next state to use after reading the current.</returns>
private ReaderNextState ReadCurrent()
{
    if (_current != null)
    {
        // Handles the case where the string is ".hello".
        if (_current.Length == 1 && CharUtils.IsDot(_current[0]))
        {
            return ReaderNextState.InvalidDomainName;
        }

        // Same case, but with a url encoded dot. Ordinal comparison: this is protocol text.
        if (_current.Length == 3 && _current.Equals("%" + HEX_ENCODED_DOT, StringComparison.OrdinalIgnoreCase))
        {
            return ReaderNextState.InvalidDomainName;
        }

        // The location where the domain name started.
        _startDomainName = _buffer.Length - _current.Length;

        // Flag that the domain is currently all numbers and/or dots.
        _numeric = true;

        // If an invalid char is found, we can just restart the domain from there.
        var newStart = 0;

        var currArray = _current.ToCharArray();
        var length = currArray.Length;

        // Hex special case: skip over a leading "0x" / "0X".
        var isAllHexSoFar = length > 2 && currArray[0] == '0' && (currArray[1] == 'x' || currArray[1] == 'X');

        var index = isAllHexSoFar ? 2 : 0;
        var done = false;

        while (index < length && !done)
        {
            // Get the current character and update length counts.
            var curr = currArray[index];
            _currentLabelLength++;
            _topLevelLength = _currentLabelLength;

            // Is the length of the last part too long (plus one since we just incremented)?
            if (_currentLabelLength > MAX_LABEL_LENGTH)
            {
                return ReaderNextState.InvalidDomainName;
            }

            if (CharUtils.IsDot(curr))
            {
                // Found a dot. Increment dot count, and reset last length.
                _dots++;
                _currentLabelLength = 0;
            }
            else if (curr == '[')
            {
                _seenBracket = true;
                _numeric = false;
            }
            else if (curr == '%'
                     && index + 2 < length
                     && CharUtils.IsHex(currArray[index + 1])
                     && CharUtils.IsHex(currArray[index + 2]))
            {
                // Handle a url encoded dot. Hex digits in a percent-encoding are case-insensitive,
                // so both "%2e" and "%2E" must count as a dot.
                var isEncodedDot = currArray[index + 1] == '2'
                    && (currArray[index + 2] == 'e' || currArray[index + 2] == 'E');

                if (isEncodedDot)
                {
                    _dots++;
                    _currentLabelLength = 0;
                }
                else
                {
                    _numeric = false;
                }

                // Skip the two hex digits that were just consumed.
                index += 2;
            }
            else if (isAllHexSoFar)
            {
                // A character that is not a hex digit ends the "all hex" assumption.
                if (!CharUtils.IsHex(curr))
                {
                    _numeric = false;
                    isAllHexSoFar = false;

                    // Backtrack to rerun this character knowing it isn't hex.
                    // NOTE(review): the rerun passes through the increment at the top of the loop
                    // again, so this character is counted twice in _currentLabelLength - confirm
                    // whether that is intended.
                    index--;
                }
            }
            else if (CharUtils.IsAlpha(curr) || curr == '-' || curr >= INTERNATIONAL_CHAR_START)
            {
                _numeric = false;
            }
            else if (!CharUtils.IsNumeric(curr) && !_options.HasFlag(UrlDetectorOptions.ALLOW_SINGLE_LEVEL_DOMAIN))
            {
                // Neither numeric nor alphabetical: restart searching for a domain from this point.
                newStart = index + 1;
                _currentLabelLength = 0;
                _topLevelLength = 0;
                _numeric = true;
                _dots = 0;
                done = true;
            }

            index++;
        }

        // An invalid character for the domain was found somewhere in the current buffer.
        // Cut the first part of the domain out. For example:
        // http://asdf%asdf.google.com <- asdf.google.com is still valid, so restart from the %
        if (newStart > 0)
        {
            // Make sure the location is not at the end. Otherwise the thing is just invalid.
            if (newStart < _current.Length)
            {
                _buffer.Clear();
                _buffer.Append(_current.Substring(newStart));

                // The previous part was cut out, so the domain name now has to start here.
                _startDomainName = 0;
            }

            // Invalid if the last character in current was the invalid one (nothing left after
            // the cut), or if the buffer is now just ".".
            if (newStart >= _current.Length || _buffer.ToString().Equals("."))
            {
                return ReaderNextState.InvalidDomainName;
            }
        }
    }
    else
    {
        _startDomainName = _buffer.Length;
    }

    // All else is good, return OK.
    return ReaderNextState.ValidDomainName;
}