/// <summary>
/// The default input reader. Scans the input one character at a time and, when a
/// character that can delimit part of a url is seen (space, '%', dot variants, '@',
/// '[', '/', ':'), hands the text collected in the buffer to the domain-name reader
/// or resets the detection state.
/// </summary>
private void ReadDefault()
{
    // Offset into _buffer from which the candidate domain name starts; used to cut the
    // domain name out of the buffer when calling ReadDomainName.
    var length = 0;

    // Number of times each input position has been the starting position of a loop iteration.
    var contentReadByIndexCount = new byte[_reader.ContentLength];

    // Read until the end of the input.
    while (!_reader.Eof())
    {
        // Backtracking (GoBack/Seek) can revisit the same input position repeatedly. To keep
        // that from turning into an infinite loop we count the visits per position and, once a
        // position has been visited too often, step forward to one that has not.
        var currentIndex = _reader.GetPosition();

        // Saturate rather than wrap: a wrapped byte counter would restart at 0 and silently
        // disable the guard for this position.
        if (contentReadByIndexCount[currentIndex] < byte.MaxValue)
        {
            contentReadByIndexCount[currentIndex]++;
        }

        // Forcibly step over positions that have been read too many times. The bounds check
        // stops the skip at the end of the input instead of indexing past the array.
        while (currentIndex < contentReadByIndexCount.Length
               && contentReadByIndexCount[currentIndex] >= ContentReadByIndexMaximum)
        {
            _reader.Read();
            currentIndex = _reader.GetPosition();
        }

        // The skip above may have consumed the remainder of the input; nothing left to read.
        if (_reader.Eof())
        {
            break;
        }

        // Read the next char to process.
        var curr = _reader.Read();

        switch (curr)
        {
            case ' ':
                // Space was found; with a scheme and buffered text, check whether the buffer
                // holds a valid single level domain.
                if (_options.HasFlag(UrlDetectorOptions.ALLOW_SINGLE_LEVEL_DOMAIN) && _buffer.Length > 0 && _hasScheme)
                {
                    // Unread the space so the domain reader sees it.
                    _reader.GoBack();
                    if (!ReadDomainName(_buffer.ToString().Substring(length)))
                    {
                        ReadEnd(ReadEndState.InvalidUrl);
                    }
                }

                _buffer.Append(curr);
                ReadEnd(ReadEndState.InvalidUrl);
                length = 0;
                break;

            case '%':
                // Only a '%' followed by at least two more characters is considered.
                if (_reader.CanReadChars(2))
                {
                    if (_reader.Peek(2).Equals("3a", StringComparison.OrdinalIgnoreCase))
                    {
                        // "%3a" is a url-encoded ':'; consume it and treat it like a colon.
                        _buffer.Append(curr);
                        _buffer.Append(_reader.Read());
                        _buffer.Append(_reader.Read());
                        length = ProcessColon(length);
                    }
                    else if (CharUtils.IsHex(_reader.PeekChar(0)) && CharUtils.IsHex(_reader.PeekChar(1)))
                    {
                        // Any other "%XX" escape: consume it and try to read a domain name.
                        _buffer.Append(curr);
                        _buffer.Append(_reader.Read());
                        _buffer.Append(_reader.Read());

                        if (!ReadDomainName(_buffer.ToString().Substring(length)))
                        {
                            ReadEnd(ReadEndState.InvalidUrl);
                        }

                        length = 0;
                    }
                }

                break;

            case '\u3002': // non-standard dots
            case '\uFF0E':
            case '\uFF61':
            case '.':
                // "." was found, read the domain name using the start from length.
                _buffer.Append(curr);
                if (!ReadDomainName(_buffer.ToString().Substring(length)))
                {
                    ReadEnd(ReadEndState.InvalidUrl);
                }

                length = 0;
                break;

            case '@':
                // Check the domain name after a username; an '@' with an empty buffer is ignored.
                if (_buffer.Length > 0)
                {
                    _currentUrlMarker.SetIndex(UrlPart.USERNAME_PASSWORD, length);
                    _buffer.Append(curr);
                    if (!ReadDomainName(null))
                    {
                        ReadEnd(ReadEndState.InvalidUrl);
                    }

                    length = 0;
                }

                break;

            case '[':
                if (_dontMatchIpv6)
                {
                    // Check if we need to match characters. If we match characters and this is a
                    // start or stop of range, either way reset the world and start processing again.
                    if (CheckMatchingCharacter(curr) != CharacterMatch.CharacterNotMatched)
                    {
                        ReadEnd(ReadEndState.InvalidUrl);
                        length = 0;
                    }
                }

                // Position just after the '[', so we can rescan the bracket contents on failure.
                var beginning = _reader.GetPosition();

                // If it doesn't have a scheme, clear the buffer.
                if (!_hasScheme)
                {
                    _buffer.Clear();
                }

                _buffer.Append(curr);

                // NOTE(review): the buffer may just have been cleared while length is unchanged;
                // if length can exceed the buffer length here Substring will throw. Confirm that
                // length is always 0 whenever _hasScheme is false.
                if (!ReadDomainName(_buffer.ToString().Substring(length)))
                {
                    // If we didn't find an ipv6 address, then check inside the brackets for urls.
                    ReadEnd(ReadEndState.InvalidUrl);
                    _reader.Seek(beginning);
                    _dontMatchIpv6 = true;
                }

                length = 0;
                break;

            case '/':
                // "/" was found, then we either read a scheme, or if we already read a scheme,
                // then we are reading a url in the format http://123123123/asdf
                if (_hasScheme || (_options.HasFlag(UrlDetectorOptions.ALLOW_SINGLE_LEVEL_DOMAIN) && _buffer.Length > 1))
                {
                    // We already have the scheme, so then we already read:
                    // http://something/ <- if something is all numeric then its a valid url.
                    // OR we are searching for single level domains. The buffer length > 1
                    // condition weeds out infinite backtracking in cases of html5 roots.
                    // Unread this "/" and continue to check the domain name starting from the
                    // beginning of the domain.
                    _reader.GoBack();
                    if (!ReadDomainName(_buffer.ToString().Substring(length)))
                    {
                        ReadEnd(ReadEndState.InvalidUrl);
                    }

                    length = 0;
                }
                else
                {
                    // We don't have a scheme already, so clear state, then check for an html5
                    // root such as "//google.com/".
                    ReadEnd(ReadEndState.InvalidUrl);
                    _buffer.Append(curr);
                    _hasScheme = ReadHtml5Root();
                    length = _buffer.Length;
                }

                break;

            case ':':
                // Add the ":" to the url and check for scheme/username.
                _buffer.Append(curr);
                length = ProcessColon(length);
                break;

            default:
                // Check if we need to match characters. If we match characters and this is a
                // start or stop of range, either way reset the world and start processing again.
                if (CheckMatchingCharacter(curr) != CharacterMatch.CharacterNotMatched)
                {
                    ReadEnd(ReadEndState.InvalidUrl);
                    length = 0;
                }
                else
                {
                    _buffer.Append(curr);
                }

                break;
        }
    }

    // End of input: with a scheme and buffered text, give single level domains one last chance.
    if (_options.HasFlag(UrlDetectorOptions.ALLOW_SINGLE_LEVEL_DOMAIN) && _buffer.Length > 0 && _hasScheme)
    {
        if (!ReadDomainName(_buffer.ToString().Substring(length)))
        {
            ReadEnd(ReadEndState.InvalidUrl);
        }
    }
}
/// <summary>
/// Reads and parses the current string to make sure the domain name started where it was
/// supposed to, and the current domain name is correct. Updates the dot count, label length,
/// numeric flag, bracket flag and domain start position as it scans.
/// </summary>
/// <returns>The next state to use after reading the current string.</returns>
private ReaderNextState ReadCurrent()
{
    // Nothing was passed in: the domain name starts at the end of what is buffered so far.
    if (_current == null)
    {
        _startDomainName = _buffer.Length;
        return ReaderNextState.ValidDomainName;
    }

    // Handles the case where the string is just a dot, e.g. the start of ".hello".
    if (_current.Length == 1 && CharUtils.IsDot(_current[0]))
    {
        return ReaderNextState.InvalidDomainName;
    }

    // Same case, but with the dot url-encoded.
    if (_current.Length == 3 && _current.Equals("%" + HEX_ENCODED_DOT, StringComparison.OrdinalIgnoreCase))
    {
        return ReaderNextState.InvalidDomainName;
    }

    // The location where the domain name started.
    _startDomainName = _buffer.Length - _current.Length;

    // Flag that the domain is currently all numbers and/or dots.
    _numeric = true;

    // If an invalid char is found, we can just restart the domain from there.
    var newStart = 0;

    var currArray = _current.ToCharArray();
    var length = currArray.Length;

    // Hex special case: a leading "0x"/"0X" is skipped and the following characters are
    // treated as hex digits until a non-hex character shows up.
    var isAllHexSoFar = length > 2 && currArray[0] == '0' && (currArray[1] == 'x' || currArray[1] == 'X');
    var index = isAllHexSoFar ? 2 : 0;
    var done = false;

    while (index < length && !done)
    {
        // Get the current character and update length counts.
        var curr = currArray[index];
        _currentLabelLength++;
        _topLevelLength = _currentLabelLength;

        // A label longer than the maximum makes the whole domain name invalid.
        if (_currentLabelLength > MAX_LABEL_LENGTH)
        {
            return ReaderNextState.InvalidDomainName;
        }

        if (CharUtils.IsDot(curr))
        {
            // Found a dot. Increment dot count, and reset last length.
            _dots++;
            _currentLabelLength = 0;
        }
        else if (curr == '[')
        {
            _seenBracket = true;
            _numeric = false;
        }
        else if (curr == '%' && index + 2 < length && CharUtils.IsHex(currArray[index + 1])
                 && CharUtils.IsHex(currArray[index + 2]))
        {
            // Handle url encoded dot. Hex digits in a percent-escape are case-insensitive,
            // so both "%2e" and "%2E" count as a dot.
            if (currArray[index + 1] == '2' && (currArray[index + 2] == 'e' || currArray[index + 2] == 'E'))
            {
                _dots++;
                _currentLabelLength = 0;
            }
            else
            {
                _numeric = false;
            }

            // Skip the two hex digits of the escape.
            index += 2;
        }
        else if (isAllHexSoFar)
        {
            // If it's a valid character in the domain that is not numeric.
            if (!CharUtils.IsHex(curr))
            {
                _numeric = false;
                isAllHexSoFar = false;

                // Backtrack to rerun this character knowing it isn't hex. Undo the length
                // increment from this pass so the character is not counted twice.
                index--;
                _currentLabelLength--;
            }
        }
        else if (CharUtils.IsAlpha(curr) || curr == '-' || curr >= INTERNATIONAL_CHAR_START)
        {
            _numeric = false;
        }
        else if (!CharUtils.IsNumeric(curr) && !_options.HasFlag(UrlDetectorOptions.ALLOW_SINGLE_LEVEL_DOMAIN))
        {
            // If its not numeric and not alphabetical, then restart searching for a domain
            // from this point.
            newStart = index + 1;
            _currentLabelLength = 0;
            _topLevelLength = 0;
            _numeric = true;
            _dots = 0;
            done = true;
        }

        index++;
    }

    // An invalid character for the domain was found somewhere in the current buffer.
    // Cut the first part of the domain out. For example:
    // http://asdf%asdf.google.com <- asdf.google.com is still valid, so restart from the %
    if (newStart > 0)
    {
        // Make sure the location is not at the end. Otherwise the thing is just invalid.
        if (newStart < _current.Length)
        {
            _buffer.Clear();
            _buffer.Append(_current.Substring(newStart));

            // Cut out the previous part, so now the domain name has to be from here.
            _startDomainName = 0;
        }

        // Invalid if the last character in current was the invalid one (nothing left after
        // cutting), or if after cutting the buffer is just ".".
        if (newStart >= _current.Length || _buffer.ToString().Equals("."))
        {
            return ReaderNextState.InvalidDomainName;
        }
    }

    // All else is good, return OK.
    return ReaderNextState.ValidDomainName;
}