/// <summary>
/// The default input reader. Scans the input one character at a time and reacts to marker
/// characters (space, '%', dots, '@', '[', '/', ':') by attempting to read a domain name,
/// a scheme or an HTML5 root; every other character is either accumulated in the buffer
/// or, when it is a matching character, resets the current state.
/// </summary>
private void ReadDefault()
{
    // Number of buffered characters that precede the domain name, so the domain name can
    // later be cut out of the buffer with Substring(length).
    var length = 0;

    // How many times the main loop has started an iteration at each index of the raw input.
    var contentReadByIndexCount = new byte[_reader.ContentLength];

    // Until end of string read the contents.
    while (!_reader.Eof())
    {
        // We want to ensure that backtracking and looping on content cannot turn into an
        // infinite loop, so we track each time an element of the input is read, and if it has
        // been hit too many times we step forwards until we find an element that has NOT.
        var currentIndex = _reader.GetPosition();

        // Saturate rather than overflow: a wrapped byte would fall back to 0 (or throw in a
        // checked build) and silently switch the guard off again for this index.
        if (contentReadByIndexCount[currentIndex] < byte.MaxValue)
        {
            contentReadByIndexCount[currentIndex] += 1;
        }

        while (contentReadByIndexCount[currentIndex] >= ContentReadByIndexMaximum)
        {
            // Forcibly step to the next character in the input, so we jump out of infinite loops.
            _reader.Read();

            // Stepping forward may consume the last character; never index past the input.
            if (_reader.Eof())
            {
                break;
            }

            currentIndex = _reader.GetPosition();
        }

        if (_reader.Eof())
        {
            // Nothing left to process; fall through to the final single-level-domain check.
            break;
        }

        // Read the next char to process.
        var curr = _reader.Read();

        switch (curr)
        {
            case ' ':
                // Space was found, check if it's a valid single level domain.
                if (_options.HasFlag(UrlDetectorOptions.ALLOW_SINGLE_LEVEL_DOMAIN) && _buffer.Length > 0 && _hasScheme)
                {
                    _reader.GoBack();
                    if (!ReadDomainName(_buffer.ToString().Substring(length)))
                    {
                        ReadEnd(ReadEndState.InvalidUrl);
                    }
                }

                _buffer.Append(curr);
                ReadEnd(ReadEndState.InvalidUrl);
                length = 0;
                break;

            case '%':
                if (_reader.CanReadChars(2))
                {
                    // "%3a" is a url-encoded ':'; this is a protocol token, so compare ordinally.
                    if (_reader.Peek(2).Equals("3a", StringComparison.OrdinalIgnoreCase))
                    {
                        _buffer.Append(curr);
                        _buffer.Append(_reader.Read());
                        _buffer.Append(_reader.Read());
                        length = ProcessColon(length);
                    }
                    else if (CharUtils.IsHex(_reader.PeekChar(0)) && CharUtils.IsHex(_reader.PeekChar(1)))
                    {
                        // Any other percent-encoded character: keep it and try to read a domain name.
                        _buffer.Append(curr);
                        _buffer.Append(_reader.Read());
                        _buffer.Append(_reader.Read());
                        if (!ReadDomainName(_buffer.ToString().Substring(length)))
                        {
                            ReadEnd(ReadEndState.InvalidUrl);
                        }

                        length = 0;
                    }
                }

                break;

            case '\u3002': // non-standard dots
            case '\uFF0E':
            case '\uFF61':
            case '.':
                // "." was found, read the domain name using the start from length.
                _buffer.Append(curr);
                if (!ReadDomainName(_buffer.ToString().Substring(length)))
                {
                    ReadEnd(ReadEndState.InvalidUrl);
                }

                length = 0;
                break;

            case '@':
                // Check the domain name after a username.
                if (_buffer.Length > 0)
                {
                    _currentUrlMarker.SetIndex(UrlPart.USERNAME_PASSWORD, length);
                    _buffer.Append(curr);
                    if (!ReadDomainName(null))
                    {
                        ReadEnd(ReadEndState.InvalidUrl);
                    }

                    length = 0;
                }

                break;

            case '[':
                if (_dontMatchIpv6)
                {
                    // Check if we need to match characters. If we match characters and this is a
                    // start or stop of range, either way reset the world and start processing again.
                    if (CheckMatchingCharacter(curr) != CharacterMatch.CharacterNotMatched)
                    {
                        ReadEnd(ReadEndState.InvalidUrl);
                        length = 0;
                    }
                }

                // Remember where we are so we can come back and look inside the brackets.
                var beginning = _reader.GetPosition();

                // If it doesn't have a scheme, clear the buffer.
                if (!_hasScheme)
                {
                    _buffer.Remove(0, _buffer.Length);
                }

                _buffer.Append(curr);

                if (!ReadDomainName(_buffer.ToString().Substring(length)))
                {
                    // If we didn't find an ipv6 address, then check inside the brackets for urls.
                    ReadEnd(ReadEndState.InvalidUrl);
                    _reader.Seek(beginning);
                    _dontMatchIpv6 = true;
                }

                length = 0;
                break;

            case '/':
                // "/" was found, then we either read a scheme, or if we already read a scheme, then
                // we are reading a url in the format http://123123123/asdf
                if (_hasScheme || (_options.HasFlag(UrlDetectorOptions.ALLOW_SINGLE_LEVEL_DOMAIN) && _buffer.Length > 1))
                {
                    // We already have the scheme, so then we already read:
                    // http://something/ <- if something is all numeric then it's a valid url.
                    // OR we are searching for single level domains. We have the buffer length > 1
                    // condition to weed out infinite backtrack in cases of html5 roots.

                    // Unread this "/" and continue to check the domain name starting from the
                    // beginning of the domain.
                    _reader.GoBack();
                    if (!ReadDomainName(_buffer.ToString().Substring(length)))
                    {
                        ReadEnd(ReadEndState.InvalidUrl);
                    }

                    length = 0;
                }
                else
                {
                    // We don't have a scheme already, so clear state, then check for an html5 root
                    // such as "//google.com/".
                    ReadEnd(ReadEndState.InvalidUrl);
                    _buffer.Append(curr);
                    _hasScheme = ReadHtml5Root();
                    length = _buffer.Length;
                }

                break;

            case ':':
                // Add the ":" to the url and check for scheme/username.
                _buffer.Append(curr);
                length = ProcessColon(length);
                break;

            default:
                // Check if we need to match characters. If we match characters and this is a start
                // or stop of range, either way reset the world and start processing again.
                if (CheckMatchingCharacter(curr) != CharacterMatch.CharacterNotMatched)
                {
                    ReadEnd(ReadEndState.InvalidUrl);
                    length = 0;
                }
                else
                {
                    _buffer.Append(curr);
                }

                break;
        }
    }

    // End of input: whatever is left in the buffer may still be a single level domain.
    if (_options.HasFlag(UrlDetectorOptions.ALLOW_SINGLE_LEVEL_DOMAIN) && _buffer.Length > 0 && _hasScheme)
    {
        if (!ReadDomainName(_buffer.ToString().Substring(length)))
        {
            ReadEnd(ReadEndState.InvalidUrl);
        }
    }
}
/// <summary>
/// Reads the domain name from the reader, appending accepted characters to the buffer, and
/// decides which state the state machine should take next: continue with the part that
/// follows the domain name, or throw the input out as an invalid domain name.
/// </summary>
/// <returns>The next state to take.</returns>
public ReaderNextState ReadDomainName()
{
    // Read the current, and if it's bad, just return.
    if (ReadCurrent() == ReaderNextState.InvalidDomainName)
    {
        return ReaderNextState.InvalidDomainName;
    }

    // If this is the first domain part, check whether it is an ip address written in hex,
    // similar to what is done in the ReadCurrent method. "0x" is a protocol token, so it is
    // compared ordinally rather than with culture-sensitive rules.
    bool isAllHexSoFar = (_current == null || _current.Equals(""))
        && _reader.CanReadChars(3)
        && "0x".Equals(_reader.Peek(2), StringComparison.OrdinalIgnoreCase);

    if (isAllHexSoFar)
    {
        // Append the hex radix symbol characters (0x).
        _buffer.Append(_reader.Read());
        _buffer.Append(_reader.Read());
        _currentLabelLength += 2;
        _topLevelLength = _currentLabelLength;
    }

    // While not done and not end of string keep reading.
    var done = false;
    while (!done && !_reader.Eof())
    {
        var curr = _reader.Read();

        if (curr == '/')
        {
            // Continue by reading the path.
            return CheckDomainNameValid(ReaderNextState.ReadPath, curr);
        }
        else if (curr == ':' && (!_seenBracket || _seenCompleteBracketSet))
        {
            // Don't check for a port if it's in the middle of an ipv6 address;
            // otherwise continue by reading the port.
            return CheckDomainNameValid(ReaderNextState.ReadPort, curr);
        }
        else if (curr == '?')
        {
            // Continue by reading the query string.
            return CheckDomainNameValid(ReaderNextState.ReadQueryString, curr);
        }
        else if (curr == '#')
        {
            // Continue by reading the fragment.
            return CheckDomainNameValid(ReaderNextState.ReadFragment, curr);
        }
        else if (curr == '@')
        {
            // This may not have been a domain after all, but rather a username/password instead.
            _reader.GoBack();
            return ReaderNextState.ReadUserPass;
        }
        else if (CharUtils.IsDot(curr)
                 || (curr == '%' && _reader.CanReadChars(2)
                     && _reader.Peek(2).Equals(HEX_ENCODED_DOT, StringComparison.OrdinalIgnoreCase)))
        {
            // The current character is a dot or a url-encoded dot.

            if (_currentLabelLength < 1)
            {
                // Handles the case: hello..
                done = true;
            }
            else
            {
                // Append the "." to the domain name.
                _buffer.Append(curr);

                // If it was not a normal dot, then it is url encoded:
                // read the next two chars, which are the hex representation.
                if (!CharUtils.IsDot(curr))
                {
                    _buffer.Append(_reader.Read());
                    _buffer.Append(_reader.Read());
                }

                // Increment the dots only if it's not part of the zone index, and reset the last length.
                if (!_zoneIndex)
                {
                    _dots++;
                    _currentLabelLength = 0;
                }

                // If the length of the last section is longer than or equal to the maximum label
                // length, it's too long to be a valid domain.
                // NOTE(review): outside a zone index the label length was just reset to 0 above, so
                // (assuming MAX_LABEL_LENGTH is positive) this check can only fire inside a zone
                // index. Confirm whether it was meant to run before the reset; left unchanged here
                // because moving it would change which urls are detected.
                if (_currentLabelLength >= MAX_LABEL_LENGTH)
                {
                    return ReaderNextState.InvalidDomainName;
                }
            }
        }
        else if (_seenBracket
                 && (CharUtils.IsHex(curr) || curr == ':' || curr == '[' || curr == ']' || curr == '%')
                 && !_seenCompleteBracketSet)
        {
            // We are inside the brackets of an ipv6 address.
            switch (curr)
            {
                case ':':
                    _currentLabelLength = 0;
                    break;

                case '[':
                    // If we read another '[', we need to restart by re-reading from this bracket instead.
                    _reader.GoBack();
                    return ReaderNextState.InvalidDomainName;

                case ']':
                    // Means that we already have a complete ipv6 address.
                    _seenCompleteBracketSet = true;

                    // Set this back off so that we can keep counting dots after ipv6 is over.
                    _zoneIndex = false;
                    break;

                case '%':
                    // Set flag to subtract subsequent dots because it's part of the zone index.
                    _zoneIndex = true;
                    break;

                default:
                    _currentLabelLength++;
                    break;
            }

            _numeric = false;
            _buffer.Append(curr);
        }
        else if (CharUtils.IsAlphaNumeric(curr) || curr == '-' || curr >= INTERNATIONAL_CHAR_START)
        {
            // Valid domain name character. Either a-z, A-Z, 0-9, -, or international character.
            if (_seenCompleteBracketSet)
            {
                // Covers case of [fe80::]www.google.com
                _reader.GoBack();
                done = true;
            }
            else
            {
                // In hex mode a non-hex character means the domain is not numeric.
                if (isAllHexSoFar && !CharUtils.IsHex(curr))
                {
                    _numeric = false;
                }

                // Otherwise, if it's not numeric, remember that.
                if (!isAllHexSoFar && !CharUtils.IsNumeric(curr))
                {
                    _numeric = false;
                }

                // Append to the states.
                _buffer.Append(curr);
                _currentLabelLength++;
                _topLevelLength = _currentLabelLength;
            }
        }
        else if (curr == '[' && !_seenBracket)
        {
            // First opening bracket: start of a bracketed (ipv6) section.
            _seenBracket = true;
            _numeric = false;
            _buffer.Append(curr);
        }
        else if (curr == '[' && _seenCompleteBracketSet)
        {
            // Case where [::][ ...
            _reader.GoBack();
            done = true;
        }
        else if (curr == '%' && _reader.CanReadChars(2)
                 && CharUtils.IsHex(_reader.PeekChar(0)) && CharUtils.IsHex(_reader.PeekChar(1)))
        {
            // A percent-encoded character: append all three characters to the states.
            _buffer.Append(curr);
            _buffer.Append(_reader.Read());
            _buffer.Append(_reader.Read());
            _currentLabelLength += 3;
            _topLevelLength = _currentLabelLength;
        }
        else
        {
            // Called to increment the count of matching characters.
            _characterHandler(curr);

            // Invalid character, we are done.
            done = true;
        }
    }

    // Check the domain name to make sure it's ok.
    return CheckDomainNameValid(ReaderNextState.ValidDomainName, null);
}