/// <summary>
/// Reads the scheme and returns true if the scheme is in our allowed collection
/// (e.g. http(s?):// or ftp(s?)://).
/// </summary>
/// <returns>True if the scheme was found, else false.</returns>
private bool ReadScheme()
{
    var originalLength = _buffer.Length;
    var numSlashes = 0;

    while (!_reader.Eof())
    {
        var curr = _reader.Read();

        if (curr == '/')
        {
            // If we match a slash, look for a second one.
            _buffer.Append(curr);
            if (numSlashes == 1)
            {
                // Second slash seen: return true only if the buffer currently ends with an approved
                // protocol. When we have buffered a string like ":u(https://", we consider this to
                // have found a scheme (the "https://" bit only; the leading junk is trimmed below).
                var bufferedUrlContent = _buffer.ToString().ToLowerInvariant();

                // For efficiency, first check for an exact match of the whole buffer.
                if (ValidSchemesSuffixed.Contains(bufferedUrlContent))
                {
                    _currentUrlMarker.SetIndex(UrlPart.SCHEME, 0);
                    return true;
                }

                // If no exact match was found, try to find a valid scheme in the trailing content of
                // the buffer. The array is walked from its last element to its first so that the
                // longest matches are tried first (e.g. sftp:// rather than ftp://).
                for (var i = ValidSchemesSuffixedOrdered.Length - 1; i >= 0; i--)
                {
                    var vss = ValidSchemesSuffixedOrdered[i];

                    // Ordinal comparison: schemes are matched character for character. A
                    // culture-sensitive EndsWith is a linguistic match and is not guaranteed to cover
                    // exactly vss.Length characters, which the Remove call below relies on.
                    if (bufferedUrlContent.EndsWith(vss, StringComparison.Ordinal))
                    {
                        // Remove any extra characters from the start of the buffer so that only the
                        // scheme remains.
                        if (bufferedUrlContent.Length > vss.Length)
                        {
                            _buffer.Remove(0, bufferedUrlContent.Length - vss.Length);
                        }

                        _currentUrlMarker.SetIndex(UrlPart.SCHEME, 0);
                        return true;
                    }
                }

                return false;
            }

            numSlashes++;
        }
        else if (curr == ' ' || CheckMatchingCharacter(curr) != CharacterMatch.CharacterNotMatched)
        {
            // A space, or a character that CheckMatchingCharacter reports as matched, means no
            // scheme was found. The character is still buffered before giving up.
            _buffer.Append(curr);
            return false;
        }
        else if (curr == '[')
        {
            // We may be starting to see an ipv6 address: unread the '[' so that the caller can
            // start looking for ipv6.
            _reader.GoBack();
            return false;
        }
        else if (originalLength > 0 || numSlashes > 0 || !CharUtils.IsAlpha(curr))
        {
            // If it's not a character a-z or A-Z (or the buffer already had content, or a slash was
            // already seen) then assume we aren't matching a scheme, but instead matching username
            // and password.
            _reader.GoBack();
            return ReadUserPass(0);
        }

        // Otherwise: an alphabetic character with an initially empty buffer and no slash seen yet.
        // It is consumed from the reader without being appended to the buffer.
    }

    // Input was exhausted before a scheme could be confirmed.
    return false;
}
/// <summary>
/// Reads and parses the current string to make sure the domain name started where it was supposed to,
/// and the current domain name is correct.
/// </summary>
/// <returns>The next state to use after reading the current.</returns>
private ReaderNextState ReadCurrent()
{
    if (_current != null)
    {
        // Handles the case where the string is ".hello" (the current text is just a dot).
        if (_current.Length == 1 && CharUtils.IsDot(_current[0]))
        {
            return ReaderNextState.InvalidDomainName;
        }

        // Same case, but with the dot url-encoded. This is a fixed ASCII token, so compare ordinally.
        if (_current.Length == 3 && _current.Equals("%" + HEX_ENCODED_DOT, StringComparison.OrdinalIgnoreCase))
        {
            return ReaderNextState.InvalidDomainName;
        }

        // The location where the domain name started.
        _startDomainName = _buffer.Length - _current.Length;

        // Flag that the domain is currently all numbers and/or dots.
        _numeric = true;

        // If an invalid char is found, we can just restart the domain from there.
        var newStart = 0;

        var currArray = _current.ToCharArray();
        var length = currArray.Length;

        // Hex special case: a leading "0x"/"0X" followed by at least one more character.
        // The two prefix characters are skipped by starting the scan at index 2.
        var isAllHexSoFar = length > 2 && currArray[0] == '0' && (currArray[1] == 'x' || currArray[1] == 'X');

        var index = isAllHexSoFar ? 2 : 0;
        var done = false;

        while (index < length && !done)
        {
            // Get the current character and update length counts.
            var curr = currArray[index];
            _currentLabelLength++;
            _topLevelLength = _currentLabelLength;

            // Is the length of the last part over the limit (counted after the increment above)?
            if (_currentLabelLength > MAX_LABEL_LENGTH)
            {
                return ReaderNextState.InvalidDomainName;
            }

            if (CharUtils.IsDot(curr))
            {
                // Found a dot. Increment dot count, and reset last length.
                _dots++;
                _currentLabelLength = 0;
            }
            else if (curr == '[')
            {
                _seenBracket = true;
                _numeric = false;
            }
            else if (curr == '%' && index + 2 < length
                     && CharUtils.IsHex(currArray[index + 1]) && CharUtils.IsHex(currArray[index + 2]))
            {
                // A percent-escape. Hex digits in an escape are case-insensitive, so both "%2e" and
                // "%2E" are a url-encoded dot, consistent with the case-insensitive whole-string
                // check at the top of this method.
                var high = currArray[index + 1];
                var low = currArray[index + 2];
                if (high == '2' && (low == 'e' || low == 'E'))
                {
                    _dots++;
                    _currentLabelLength = 0;
                }
                else
                {
                    _numeric = false;
                }

                // Skip the two hex digits of the escape.
                index += 2;
            }
            else if (isAllHexSoFar)
            {
                // If it's a valid character in the domain that is not numeric.
                if (!CharUtils.IsHex(curr))
                {
                    _numeric = false;
                    isAllHexSoFar = false;

                    // Backtrack to rerun this character knowing it isn't hex.
                    // NOTE(review): the rerun increments _currentLabelLength a second time for the
                    // same character - confirm whether that is intended.
                    index--;
                }
            }
            else if (CharUtils.IsAlpha(curr) || curr == '-' || curr >= INTERNATIONAL_CHAR_START)
            {
                _numeric = false;
            }
            else if (!CharUtils.IsNumeric(curr) && !_options.HasFlag(UrlDetectorOptions.ALLOW_SINGLE_LEVEL_DOMAIN))
            {
                // If it's not numeric and not alphabetical, then restart searching for a domain from
                // the character after this one, resetting all counters.
                newStart = index + 1;
                _currentLabelLength = 0;
                _topLevelLength = 0;
                _numeric = true;
                _dots = 0;
                done = true;
            }

            index++;
        }

        // An invalid character for the domain was found somewhere in the current buffer.
        // Cut the first part of the domain out. For example:
        // http://asdf%asdf.google.com <- asdf.google.com is still valid, so restart from the %
        if (newStart > 0)
        {
            // Make sure the location is not at the end. Otherwise the thing is just invalid.
            if (newStart < _current.Length)
            {
                _buffer.Clear();
                _buffer.Append(_current.Substring(newStart));

                // Cut out the previous part, so now the domain name has to be from here.
                _startDomainName = 0;
            }

            // Invalid if the last character in current was the invalid one (newStart is at or past
            // the end), or if after cutting the buffer is just ".".
            if (newStart >= _current.Length || _buffer.ToString().Equals("."))
            {
                return ReaderNextState.InvalidDomainName;
            }
        }
    }
    else
    {
        // Nothing read so far: the domain name starts at the current end of the buffer.
        _startDomainName = _buffer.Length;
    }

    // All else is good, return OK.
    return ReaderNextState.ValidDomainName;
}