C# (CSharp) urldetector.detection CharUtils примеры использования

Язык программирования: C# (CSharp)

Пространство имен/Пакет: urldetector.detection

Класс/Тип: CharUtils

Примеров на hotexamples.com: 8

C# (CSharp) urldetector.detection CharUtils - 8 примеров найдено. Это лучшие примеры C# (CSharp) кода для urldetector.detection.CharUtils, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

IsHex(4)

IsDot(3)

IsNumeric(3)

IsAlpha(2)

IsAlphaNumeric(1)

IsUnreserved(1)

IsWhiteSpace(1)

SplitByDot(1)

Пример #1

Показать файл

        /// <summary>
        /// Consumes the characters that follow the ":" of a host and treats them as a port number.
        /// </summary>
        /// <remarks>
        /// Reading stops at the first "/", "?" or "#" (which hands over to the path, query string or
        /// fragment reader), at a character that closes a matched range, or at any non-numeric character.
        /// </remarks>
        /// <returns>
        /// The result of the reader that the port hands over to (path, query string, fragment), or the
        /// result of <c>ReadEnd(ReadEndState.ValidUrl)</c> when the port simply ends.
        /// </returns>
        private bool ReadPort()
        {
            _currentUrlMarker.SetIndex(UrlPart.PORT, _buffer.Length);

            // Number of characters consumed by this method, including the one that terminates the port.
            var consumed = 0;

            while (!_reader.Eof())
            {
                var next = _reader.Read();
                consumed++;

                // Delimiters that start the next url part are appended and delegated right away.
                switch (next)
                {
                    case '/':
                        _buffer.Append(next);
                        return ReadPath();

                    case '?':
                        _buffer.Append(next);
                        return ReadQueryString();

                    case '#':
                        _buffer.Append(next);
                        return ReadFragment();
                }

                var portEnded = CheckMatchingCharacter(next) == CharacterMatch.CharacterMatchStop ||
                                !CharUtils.IsNumeric(next);

                if (!portEnded)
                {
                    // A digit: it belongs to the port.
                    _buffer.Append(next);
                    continue;
                }

                // Everything buffered so far is a valid url; push the terminating character back unread.
                _reader.GoBack();

                if (consumed == 1)
                {
                    // Nothing numeric followed the ":" (e.g. google.com:hello.world), so drop the ":".
                    _buffer.Remove(_buffer.Length - 1, 1);
                }

                // NOTE(review): the PORT marker is unset on this path even when digits were read - confirm intended.
                _currentUrlMarker.UnsetIndex(UrlPart.PORT);
                return ReadEnd(ReadEndState.ValidUrl);
            }

            // Input ended while reading the port.
            return ReadEnd(ReadEndState.ValidUrl);
        }

Пример #2

Показать файл

        /// <summary>
        /// The default input reader: walks the input one character at a time and hands over to the
        /// domain-name / scheme readers whenever a character that can delimit a url is met.
        /// </summary>
        /// <remarks>
        /// Every index of the raw input has a visit counter. Once an index has been visited
        /// <c>ContentReadByIndexMaximum</c> times the reader is forced forwards, which bounds the
        /// amount of backtracking. If that forced stepping reaches the end of the input the scan stops
        /// instead of reading (and indexing the counter array) past the end.
        /// </remarks>
        private void ReadDefault()
        {
            // Offset into _buffer at which the candidate domain name starts, so it can be cut out later.
            var length = 0;

            // How many times each index of the raw input has been visited by the loop below.
            var contentReadByIndexCount = new byte[_reader.ContentLength];

            while (!_reader.Eof())
            {
                // Track each visit of an input index; when an index has been hit too many times,
                // step forwards until we find one that has NOT been read too many times.
                var currentIndex = _reader.GetPosition();
                contentReadByIndexCount[currentIndex] += 1;
                while (contentReadByIndexCount[currentIndex] >= ContentReadByIndexMaximum)
                {
                    // Forcibly step to the next character in the input, so we jump out of infinite loops.
                    _reader.Read();

                    // Stepping may exhaust the input; the position would then no longer be a valid
                    // index into the counter array (assumes Eof() means position == ContentLength).
                    if (_reader.Eof())
                    {
                        break;
                    }

                    currentIndex = _reader.GetPosition();
                }

                // Nothing left to process once the forced stepping has consumed the rest of the input.
                if (_reader.Eof())
                {
                    break;
                }

                // Read the next char to process.
                var curr = _reader.Read();

                switch (curr)
                {
                    case ' ':
                        // Space was found, check if it's a valid single level domain.
                        if (_options.HasFlag(UrlDetectorOptions.ALLOW_SINGLE_LEVEL_DOMAIN) && _buffer.Length > 0 && _hasScheme)
                        {
                            _reader.GoBack();
                            if (!ReadDomainName(_buffer.ToString().Substring(length)))
                            {
                                ReadEnd(ReadEndState.InvalidUrl);
                            }
                        }

                        _buffer.Append(curr);
                        ReadEnd(ReadEndState.InvalidUrl);
                        length = 0;
                        break;

                    case '%':
                        // Only percent-escapes followed by two more characters are considered.
                        if (_reader.CanReadChars(2))
                        {
                            if (_reader.Peek(2).Equals("3a", StringComparison.InvariantCultureIgnoreCase))
                            {
                                // "%3a" is an encoded ":" - buffer it and process it like a colon.
                                _buffer.Append(curr);
                                _buffer.Append(_reader.Read());
                                _buffer.Append(_reader.Read());
                                length = ProcessColon(length);
                            }
                            else if (CharUtils.IsHex(_reader.PeekChar(0)) && CharUtils.IsHex(_reader.PeekChar(1)))
                            {
                                // Any other "%hh" escape: buffer it and try to read a domain name.
                                _buffer.Append(curr);
                                _buffer.Append(_reader.Read());
                                _buffer.Append(_reader.Read());

                                if (!ReadDomainName(_buffer.ToString().Substring(length)))
                                {
                                    ReadEnd(ReadEndState.InvalidUrl);
                                }
                                length = 0;
                            }
                        }

                        break;

                    case '\u3002': // non-standard dots
                    case '\uFF0E':
                    case '\uFF61':
                    case '.': // "." was found, read the domain name using the start from length.
                        _buffer.Append(curr);
                        if (!ReadDomainName(_buffer.ToString().Substring(length)))
                        {
                            ReadEnd(ReadEndState.InvalidUrl);
                        }
                        length = 0;
                        break;

                    case '@': // Check the domain name after a username
                        if (_buffer.Length > 0)
                        {
                            _currentUrlMarker.SetIndex(UrlPart.USERNAME_PASSWORD, length);
                            _buffer.Append(curr);
                            if (!ReadDomainName(null))
                            {
                                ReadEnd(ReadEndState.InvalidUrl);
                            }
                            length = 0;
                        }

                        break;

                    case '[':
                        if (_dontMatchIpv6)
                        {
                            // Check if we need to match characters. If we match characters and this is a start
                            // or stop of range, either way reset the world and start processing again.
                            if (CheckMatchingCharacter(curr) != CharacterMatch.CharacterNotMatched)
                            {
                                ReadEnd(ReadEndState.InvalidUrl);
                                length = 0;
                            }
                        }

                        // Remember where we are so the bracket contents can be re-scanned on failure.
                        var beginning = _reader.GetPosition();

                        // If it doesn't have a scheme, clear the buffer.
                        if (!_hasScheme)
                        {
                            _buffer.Remove(0, _buffer.Length);
                        }

                        _buffer.Append(curr);

                        if (!ReadDomainName(_buffer.ToString().Substring(length)))
                        {
                            // If we didn't find an ipv6 address, then check inside the brackets for urls.
                            ReadEnd(ReadEndState.InvalidUrl);
                            _reader.Seek(beginning);
                            _dontMatchIpv6 = true;
                        }

                        length = 0;
                        break;

                    case '/':
                        // "/" was found, then we either read a scheme, or if we already read a scheme, then
                        // we are reading a url in the format http://123123123/asdf
                        if (_hasScheme || (_options.HasFlag(UrlDetectorOptions.ALLOW_SINGLE_LEVEL_DOMAIN) && _buffer.Length > 1))
                        {
                            // We already have the scheme, so then we already read:
                            // http://something/ <- if something is all numeric then its a valid url.
                            // OR we are searching for single level domains. The buffer length > 1 condition
                            // weeds out infinite backtracking in cases of html5 roots.

                            // Unread this "/" and continue to check the domain name from its beginning.
                            _reader.GoBack();
                            if (!ReadDomainName(_buffer.ToString().Substring(length)))
                            {
                                ReadEnd(ReadEndState.InvalidUrl);
                            }

                            length = 0;
                        }
                        else
                        {
                            // We don't have a scheme already: clear state, then check for an html5 root
                            // such as "//google.com/".
                            ReadEnd(ReadEndState.InvalidUrl);
                            _buffer.Append(curr);
                            _hasScheme = ReadHtml5Root();
                            length     = _buffer.Length;
                        }

                        break;

                    case ':':
                        // Add the ":" to the url and check for scheme/username.
                        _buffer.Append(curr);
                        length = ProcessColon(length);
                        break;

                    default:
                        // Check if we need to match characters. If we match characters and this is a start
                        // or stop of range, either way reset the world and start processing again.
                        if (CheckMatchingCharacter(curr) != CharacterMatch.CharacterNotMatched)
                        {
                            ReadEnd(ReadEndState.InvalidUrl);
                            length = 0;
                        }
                        else
                        {
                            _buffer.Append(curr);
                        }

                        break;
                }
            }

            // Input is exhausted: whatever is still buffered may be a single level domain.
            if (_options.HasFlag(UrlDetectorOptions.ALLOW_SINGLE_LEVEL_DOMAIN) && _buffer.Length > 0 && _hasScheme)
            {
                if (!ReadDomainName(_buffer.ToString().Substring(length)))
                {
                    ReadEnd(ReadEndState.InvalidUrl);
                }
            }
        }

Пример #3

Показать файл

        /// <summary>
        /// Reads the "//" that follows a scheme and checks whether the buffered text ends with one of
        /// the allowed schemes (e.g. http(s?):// or ftp(s?)://).
        /// </summary>
        /// <remarks>
        /// When the buffer holds extra leading text in front of an allowed scheme (for example
        /// ":u(https://"), that leading text is removed so the buffer starts at the scheme.
        /// </remarks>
        /// <returns>
        /// True if an allowed scheme was found. Otherwise false, or the result of
        /// <c>ReadUserPass(0)</c> when the input looks like a username/password instead.
        /// </returns>
        private bool ReadScheme()
        {
            var originalLength = _buffer.Length;
            var numSlashes     = 0;

            while (!_reader.Eof())
            {
                var curr = _reader.Read();

                // If we match a slash, look for a second one.
                if (curr == '/')
                {
                    _buffer.Append(curr);
                    if (numSlashes == 1)
                    {
                        // Second slash: decide based on what the buffer currently ends with.
                        var bufferedUrlContent = _buffer.ToString().ToLowerInvariant();

                        // For efficiency, first check for an exact match of the whole buffer.
                        if (ValidSchemesSuffixed.Contains(bufferedUrlContent))
                        {
                            _currentUrlMarker.SetIndex(UrlPart.SCHEME, 0);
                            return true;
                        }

                        // Otherwise look for an allowed scheme in the trailing content of the buffer, walking
                        // the ordered list from its end (per the original note: longest first, e.g. sftp://
                        // before ftp:// - presumably the list is sorted by ascending length).
                        for (var i = ValidSchemesSuffixedOrdered.Length - 1; i >= 0; i--)
                        {
                            var vss = ValidSchemesSuffixedOrdered[i];

                            // Ordinal comparison: the match feeds character-count arithmetic below, so it must
                            // compare exact characters instead of using culture-sensitive (linguistic) rules.
                            if (bufferedUrlContent.EndsWith(vss, System.StringComparison.Ordinal))
                            {
                                // Remove any extra characters from the start of the buffer.
                                if (bufferedUrlContent.Length > vss.Length)
                                {
                                    _buffer.Remove(0, bufferedUrlContent.Length - vss.Length);
                                }

                                _currentUrlMarker.SetIndex(UrlPart.SCHEME, 0);
                                return true;
                            }
                        }

                        return false;
                    }

                    numSlashes++;
                }
                else if (curr == ' ' || CheckMatchingCharacter(curr) != CharacterMatch.CharacterNotMatched)
                {
                    // A space or a range-matching character ends the attempt: no scheme found.
                    _buffer.Append(curr);
                    return false;
                }
                else if (curr == '[')
                {
                    // Possibly the start of an ipv6 address: unread the '[' so the caller can look for it.
                    _reader.GoBack();
                    return false;
                }
                else if (originalLength > 0 || numSlashes > 0 || !CharUtils.IsAlpha(curr))
                {
                    // Not a plain a-z / A-Z run before the slashes, so assume we are not matching a scheme
                    // but a username and password instead.
                    _reader.GoBack();
                    return ReadUserPass(0);
                }
            }

            return false;
        }

Пример #4

Показать файл

Файл: InputTextReader.cs Проект: eladaus/URL-Detector

        /// <summary>
        /// Reads a single char from the content and advances the read index by one.
        /// </summary>
        /// <returns>
        /// A plain space when <c>CharUtils.IsWhiteSpace</c> reports the char as whitespace;
        /// otherwise the char that was read.
        /// </returns>
        public char Read()
        {
            // Advance first, then look up: the index moves forward even if the lookup throws.
            var position = _index++;
            var current  = _content[position];

            if (CharUtils.IsWhiteSpace(current))
            {
                return ' ';
            }

            return current;
        }

Пример #5

Показать файл

Файл: DomainNameReader.cs Проект: eladaus/URL-Detector

        /// <summary>
        /// Validates a bracketed ipv6 literal by walking it character by character, counting sections
        /// and hex digits. Handles an ipv4-formatted tail, zone indices ("%...") and the truncated
        /// "::" notation.
        /// </summary>
        /// <param name="testDomain">The candidate text; it must start with '[' and end with ']'.</param>
        /// <returns>True if the text is accepted as an ipv6 address, otherwise false.</returns>
        private bool IsValidIpv6(string testDomain)
        {
            var total = testDomain.Length;

            // Must look like "[...]" with at least one character inside the brackets.
            if (total < 3 || testDomain[total - 1] != ']' || testDomain[0] != '[')
            {
                return false;
            }

            // "[:8000: ...]" is rejected; only a leading "::" ("[::8000: ...]") is okay.
            if (testDomain[1] == ':' && testDomain[2] != ':')
            {
                return false;
            }

            var sectionCount  = 1;
            var hexDigitCount = 0;
            var previous      = '\0';

            // Collects the current section so an ipv4 tail can be validated at the end.
            var ipv4Candidate = new StringBuilder();
            var sectionIsHex  = true;

            // Set once a '%' is seen. Example: http://[::ffff:0xC0.0x00.0x02.0xEB%251]
            var inZoneIndex = false;

            // Only one "::" is allowed per address.
            var sawDoubleColon = false;

            for (var pos = 0; pos < total; pos++)
            {
                var ch = testDomain[pos];

                switch (ch)
                {
                    case '[':
                        // Opening bracket: nothing to do.
                        break;

                    case '%':
                    case ']':
                        if (ch == '%')
                        {
                            // Url-encoded dot ("%2e"). When only two characters remain, the next one is the
                            // closing ']' (checked above), so the '2' test fails before pos + 2 is read.
                            if (total - pos >= 2 && testDomain[pos + 1] == '2' && testDomain[pos + 2] == 'e')
                            {
                                ipv4Candidate.Append("%2e");
                                pos         += 2;
                                sectionIsHex = false;
                                break;
                            }

                            inZoneIndex = true;
                        }

                        // A non-hex last section must be a valid ipv4 address. Inside a zone index this is
                        // only checked on the '%' that opens it.
                        var mustBeIpv4 = !sectionIsHex && (!inZoneIndex || ch == '%');
                        if (mustBeIpv4)
                        {
                            if (!IsValidIpv4(ipv4Candidate.ToString()))
                            {
                                return false;
                            }

                            sectionCount++; // ipv4 takes up 2 sections.
                        }

                        break;

                    case ':':
                        if (previous == ':')
                        {
                            if (sawDoubleColon)
                            {
                                // A second "::" is not allowed.
                                return false;
                            }

                            sawDoubleColon = true;
                        }

                        // The section that just ended contained non-hex characters.
                        if (!sectionIsHex)
                        {
                            return false;
                        }

                        // Start a fresh section.
                        sectionIsHex  = true;
                        hexDigitCount = 0;
                        sectionCount++;
                        ipv4Candidate.Clear();
                        break;

                    default:
                        if (inZoneIndex)
                        {
                            // Zone index characters only need to be unreserved.
                            if (!CharUtils.IsUnreserved(ch))
                            {
                                return false;
                            }

                            break;
                        }

                        // Collect the possible ipv4 address and keep counting hex digits while the
                        // section is still purely hexadecimal.
                        ipv4Candidate.Append(ch);
                        sectionIsHex = sectionIsHex && CharUtils.IsHex(ch);
                        if (sectionIsHex)
                        {
                            hexDigitCount++;
                        }

                        break;
                }

                if (hexDigitCount > 4 || sectionCount > 8)
                {
                    return false;
                }

                // pos may have been advanced past an encoded dot, so re-read the character at pos.
                previous = testDomain[pos];
            }

            // A single section means something like "[adf]". Otherwise the address needs either all
            // 8 sections or a "::" standing in for the missing ones.
            if (sectionCount == 1)
            {
                return false;
            }

            return sectionCount >= 8 || sawDoubleColon;
        }

Пример #6

Показать файл

Файл: DomainNameReader.cs Проект: eladaus/URL-Detector

        /// <summary>
        /// Checks whether the text is an ipv4 address. Handles the undotted hexadecimal, octal and
        /// decimal forms (when no dots were counted) and the dotted decimal / hex / octal forms (when
        /// exactly three dots were counted).
        /// </summary>
        /// <param name="testDomain">The string we're testing.</param>
        /// <returns>True if it's a valid ipv4 address, otherwise false.</returns>
        private bool IsValidIpv4(string testDomain)
        {
            var valid = false;

            if (testDomain.Length > 0)
            {
                // Handling format without dots. Ex: http://2123123123123/path/a, http://0x8242343/aksdjf
                if (_dots == 0)
                {
                    try
                    {
                        long value;
                        if (testDomain.Length > 2 && testDomain[0] == '0' && testDomain[1] == 'x')
                        {
                            // hex
                            var isParsed = long.TryParse(testDomain.Substring(2), NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture, out value);
                            if (!isParsed)
                            {
                                // Unparsable hex is not an address (matches the decimal branch below).
                                return false;
                            }
                        }
                        else if (testDomain[0] == '0')
                        {
                            // octal
                            var possibleDomain = testDomain.Substring(1);
                            if (OctalEncodingHelper.LooksLikeOctal(possibleDomain.AsSpan()))
                            {
                                value = Convert.ToInt64(possibleDomain, 8);
                            }
                            else
                            {
                                return false;
                            }
                        }
                        else
                        {
                            // decimal; parsed with the invariant culture so the result does not depend
                            // on the culture of the current thread.
                            var isParsed = long.TryParse(testDomain, NumberStyles.Integer, CultureInfo.InvariantCulture, out value);
                            if (!isParsed)
                            {
                                return false;
                            }
                        }

                        valid = value <= MAX_NUMERIC_DOMAIN_VALUE && value >= MIN_NUMERIC_DOMAIN_VALUE;
                    }
                    catch (Exception)
                    {
                        // Best effort: a conversion failure (e.g. octal overflow) just means "not an address".
                        valid = false;
                    }
                }
                else if (_dots == 3)
                {
                    // Dotted decimal/hex/octal format
                    var parts = CharUtils.SplitByDot(testDomain);
                    valid = true;

                    // Check each part of the ip and make sure it's valid.
                    for (var i = 0; i < parts.Length && valid; i++)
                    {
                        var part = parts[i];
                        if (part.Length > 0)
                        {
                            string parsedNum;
                            int    @base;
                            if (part.Length > 2 && part[0] == '0' && part[1] == 'x')
                            {
                                // dotted hex
                                parsedNum = part.Substring(2);
                                @base     = 16;
                            }
                            else if (part[0] == '0')
                            {
                                // dotted octal
                                parsedNum = part.Substring(1);
                                @base     = 8;
                            }
                            else
                            {
                                // dotted decimal
                                parsedNum = part;
                                @base     = 10;
                            }

                            int section;
                            if (parsedNum.Length == 0)
                            {
                                // A lone "0" leaves nothing after the octal prefix is stripped.
                                section = 0;
                            }
                            else
                            {
                                // For efficiency, we try to avoid try/catch and instead use TryParse.
                                if (@base == 16)
                                {
                                    var isParsed = int.TryParse(parsedNum, NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture, out section);
                                    if (!isParsed)
                                    {
                                        return false;
                                    }
                                }
                                else if (@base == 10)
                                {
                                    // Digits only: a part produced by splitting on dots has no decimal point.
                                    var isParsed = int.TryParse(parsedNum, NumberStyles.None, CultureInfo.InvariantCulture, out section);
                                    if (!isParsed)
                                    {
                                        return false;
                                    }
                                }
                                else
                                {
                                    // For other bases, fall back to try/catch.
                                    if (@base == 8 && OctalEncodingHelper.LooksLikeOctal(parsedNum.AsSpan()))
                                    {
                                        try
                                        {
                                            section = Convert.ToInt32(parsedNum, @base);
                                        }
                                        catch (Exception)
                                        {
                                            return false;
                                        }
                                    }
                                    else
                                    {
                                        return false;
                                    }
                                }
                            }

                            if (section < MIN_IP_PART || section > MAX_IP_PART)
                            {
                                valid = false;
                            }
                        }
                        else
                        {
                            // Empty part, e.g. two dots in a row.
                            valid = false;
                        }
                    }
                }
            }

            return valid;
        }

Пример #7

Показать файл

Файл: DomainNameReader.cs Проект: eladaus/URL-Detector

        /// <summary>
        /// Reads the Dns and returns the next state the state machine should take in throwing this out, or continue processing
        /// if this is a valid domain name.
        /// @return The next state to take.
        /// </summary>
        public ReaderNextState ReadDomainName()
        {
            //Read the current, and if its bad, just return.
            if (ReadCurrent() == ReaderNextState.InvalidDomainName)
            {
                return(ReaderNextState.InvalidDomainName);
            }

            //If this is the first domain part, check if it's ip address in is hexa
            //similar to what is done on 'readCurrent' method
            bool isAllHexSoFar = (_current == null || _current.Equals("")) &&
                                 _reader.CanReadChars(3) &&
                                 ("0x".Equals(_reader.Peek(2), StringComparison.InvariantCultureIgnoreCase));

            if (isAllHexSoFar)
            {
                //Append hexa radix symbol characters (0x)
                _buffer.Append(_reader.Read());
                _buffer.Append(_reader.Read());
                _currentLabelLength += 2;
                _topLevelLength      = _currentLabelLength;
            }

            //while not done and not end of string keep reading.
            var done = false;

            while (!done && !_reader.Eof())
            {
                var curr = _reader.Read();

                if (curr == '/')
                {
                    //continue by reading the path
                    return(CheckDomainNameValid(ReaderNextState.ReadPath, curr));
                }

                if (curr == ':' && (!_seenBracket || _seenCompleteBracketSet))
                {
                    //Don't check for a port if it's in the middle of an ipv6 address
                    //continue by reading the port.
                    return(CheckDomainNameValid(ReaderNextState.ReadPort, curr));
                }

                if (curr == '?')
                {
                    //continue by reading the query string
                    return(CheckDomainNameValid(ReaderNextState.ReadQueryString, curr));
                }

                if (curr == '#')
                {
                    //continue by reading the fragment
                    return(CheckDomainNameValid(ReaderNextState.ReadFragment, curr));
                }
                else if (curr == '@')
                {
                    //this may not have been a domain after all, but rather a username/password instead
                    _reader.GoBack();
                    return(ReaderNextState.ReadUserPass);
                }
                else if (CharUtils.IsDot(curr) ||
                         curr == '%' && _reader.CanReadChars(2) && _reader.Peek(2).Equals(HEX_ENCODED_DOT, StringComparison.InvariantCultureIgnoreCase))
                {
                    //if the current character is a dot or a urlEncodedDot

                    //handles the case: hello..
                    if (_currentLabelLength < 1)
                    {
                        done = true;
                    }
                    else
                    {
                        //append the "." to the domain name
                        _buffer.Append(curr);

                        //if it was not a normal dot, then it is url encoded
                        //read the next two chars, which are the hex representation
                        if (!CharUtils.IsDot(curr))
                        {
                            _buffer.Append(_reader.Read());
                            _buffer.Append(_reader.Read());
                        }

                        //increment the dots only if it's not part of the zone index and reset the last length.
                        if (!_zoneIndex)
                        {
                            _dots++;
                            _currentLabelLength = 0;
                        }

                        //if the length of the last section is longer than or equal to 64, it's too long to be a valid domain
                        if (_currentLabelLength >= MAX_LABEL_LENGTH)
                        {
                            return(ReaderNextState.InvalidDomainName);
                        }
                    }
                }
                else if (_seenBracket && (CharUtils.IsHex(curr) || curr == ':' || curr == '[' || curr == ']' || curr == '%') &&
                         !_seenCompleteBracketSet)
                {
                    //if this is an ipv6 address.
                    switch (curr)
                    {
                    case ':':
                        _currentLabelLength = 0;
                        break;

                    case '[':
                        // if we read another '[', we need to restart by re-reading from this bracket instead.
                        _reader.GoBack();
                        return(ReaderNextState.InvalidDomainName);

                    case ']':
                        _seenCompleteBracketSet = true;                     //means that we already have a complete ipv6 address.
                        _zoneIndex = false;                                 //set this back off so that we can keep counting dots after ipv6 is over.
                        break;

                    case '%':                             //set flag to subtract subsequent dots because it's part of the zone index
                        _zoneIndex = true;
                        break;

                    default:
                        _currentLabelLength++;
                        break;
                    }

                    _numeric = false;
                    _buffer.Append(curr);
                }
                else if (CharUtils.IsAlphaNumeric(curr) || curr == '-' || curr >= INTERNATIONAL_CHAR_START)
                {
                    //Valid domain name character. Either a-z, A-Z, 0-9, -, or international character
                    if (_seenCompleteBracketSet)
                    {
                        //covers case of [fe80::]www.google.com
                        _reader.GoBack();
                        done = true;
                    }
                    else
                    {
                        if (isAllHexSoFar && !CharUtils.IsHex(curr))
                        {
                            _numeric = false;
                        }
                        //if its not numeric, remember that;
                        if (!isAllHexSoFar && !CharUtils.IsNumeric(curr))
                        {
                            _numeric = false;
                        }

                        //append to the states.
                        _buffer.Append(curr);
                        _currentLabelLength++;
                        _topLevelLength = _currentLabelLength;
                    }
                }
                else if (curr == '[' && !_seenBracket)
                {
                    _seenBracket = true;
                    _numeric     = false;
                    _buffer.Append(curr);
                }
                else if (curr == '[' && _seenCompleteBracketSet)
                {
                    //Case where [::][ ...
                    _reader.GoBack();
                    done = true;
                }
                else if (curr == '%' && _reader.CanReadChars(2) && CharUtils.IsHex(_reader.PeekChar(0)) &&
                         CharUtils.IsHex(_reader.PeekChar(1)))
                {
                    //append to the states.
                    _buffer.Append(curr);
                    _buffer.Append(_reader.Read());
                    _buffer.Append(_reader.Read());
                    _currentLabelLength += 3;
                    _topLevelLength      = _currentLabelLength;
                }
                else
                {
                    //called to increment the count of matching characters
                    //_characterHandler.addCharacter(curr);

                    _characterHandler(curr);

                    //invalid character, we are done.
                    done = true;
                }
            }

            //Check the domain name to make sure its ok.
            return(CheckDomainNameValid(ReaderNextState.ValidDomainName, null));
        }

Пример #8

Показать файл

Файл: DomainNameReader.cs Проект: eladaus/URL-Detector

        /// <summary>
        /// Validates the text already held in <c>_current</c> as the beginning of a domain name and
        /// seeds the reader state from it: <c>_startDomainName</c>, <c>_numeric</c>, <c>_dots</c>,
        /// <c>_seenBracket</c>, <c>_currentLabelLength</c> and <c>_topLevelLength</c>.
        /// </summary>
        /// <remarks>
        /// When a character that cannot be part of a domain name is found, everything up to and including
        /// it is dropped from <c>_buffer</c> and the domain name is restarted right after it.
        /// </remarks>
        /// <returns>
        /// <see cref="ReaderNextState.InvalidDomainName"/> when <c>_current</c> is a lone (possibly url encoded)
        /// dot, when a label grows beyond <c>MAX_LABEL_LENGTH</c>, or when nothing usable remains after a
        /// restart; otherwise <see cref="ReaderNextState.ValidDomainName"/>.
        /// </returns>
        private ReaderNextState ReadCurrent()
        {
            if (_current == null)
            {
                //nothing was handed to us, so the domain name starts at the current end of the buffer.
                _startDomainName = _buffer.Length;
                return(ReaderNextState.ValidDomainName);
            }

            //Handles the case where the string is ".hello"
            if (_current.Length == 1 && CharUtils.IsDot(_current[0]))
            {
                return(ReaderNextState.InvalidDomainName);
            }

            //Same situation, but the lone dot is url encoded ('%' followed by HEX_ENCODED_DOT, any casing).
            //Ordinal comparison: this is a protocol token, not linguistic text.
            if (_current.Length == 3 && _current.Equals("%" + HEX_ENCODED_DOT, StringComparison.OrdinalIgnoreCase))
            {
                return(ReaderNextState.InvalidDomainName);
            }

            //The location where the domain name started.
            _startDomainName = _buffer.Length - _current.Length;

            //flag that the domain is currently all numbers and/or dots.
            _numeric = true;

            //If an invalid char is found, we can just restart the domain from there.
            var newStart = 0;
            var length   = _current.Length;

            //hex special case: a leading "0x"/"0X" is skipped and what follows is checked as hex digits.
            var isAllHexSoFar = length > 2 && _current[0] == '0' && (_current[1] == 'x' || _current[1] == 'X');

            var index = isAllHexSoFar ? 2 : 0;

            while (index < length)
            {
                //get the current character and update length counts.
                var curr = _current[index];
                _currentLabelLength++;
                _topLevelLength = _currentLabelLength;

                //Is the label longer than allowed (the counter already includes the current character)
                if (_currentLabelLength > MAX_LABEL_LENGTH)
                {
                    return(ReaderNextState.InvalidDomainName);
                }

                if (CharUtils.IsDot(curr))
                {
                    //found a dot. Increment dot count, and reset last length
                    _dots++;
                    _currentLabelLength = 0;
                }
                else if (curr == '[')
                {
                    _seenBracket = true;
                    _numeric     = false;
                }
                else if (curr == '%' && index + 2 < length && CharUtils.IsHex(_current[index + 1]) &&
                         CharUtils.IsHex(_current[index + 2]))
                {
                    //handle url encoded dot. Hex digits of a percent-encoding are case-insensitive,
                    //so both "%2e" and "%2E" count as a dot.
                    var secondHexDigit = _current[index + 2];
                    if (_current[index + 1] == '2' && (secondHexDigit == 'e' || secondHexDigit == 'E'))
                    {
                        _dots++;
                        _currentLabelLength = 0;
                    }
                    else
                    {
                        _numeric = false;
                    }

                    //skip the two hex digits that were just consumed.
                    index += 2;
                }
                else if (isAllHexSoFar)
                {
                    //if it's a valid character in the domain that is not numeric
                    if (!CharUtils.IsHex(curr))
                    {
                        _numeric      = false;
                        isAllHexSoFar = false;

                        //backtrack to rerun last character knowing it isn't hex.
                        //NOTE(review): the rerun increments _currentLabelLength a second time for this character.
                        index--;
                    }
                }
                else if (CharUtils.IsAlpha(curr) || curr == '-' || curr >= INTERNATIONAL_CHAR_START)
                {
                    _numeric = false;
                }
                else if (!CharUtils.IsNumeric(curr) && !_options.HasFlag(UrlDetectorOptions.ALLOW_SINGLE_LEVEL_DOMAIN))
                {
                    //if its not _numeric and not alphabetical, then restart searching for a domain from this point.
                    newStart            = index + 1;
                    _currentLabelLength = 0;
                    _topLevelLength     = 0;
                    _numeric            = true;
                    _dots               = 0;
                    break;
                }

                index++;
            }

            //An invalid character for the domain was found somewhere in the current buffer.
            //cut the first part of the domain out. For example:
            // http://asdf%asdf.google.com <- asdf.google.com is still valid, so restart from the %
            if (newStart > 0)
            {
                //make sure the location is not at the end. Otherwise the thing is just invalid.
                if (newStart < _current.Length)
                {
                    _buffer.Clear();
                    _buffer.Append(_current.Substring(newStart));

                    //cut out the previous part, so now the domain name has to be from here.
                    _startDomainName = 0;
                }

                //now after cutting if the buffer is just "." newStart > current (last character in current is invalid)
                if (newStart >= _current.Length || _buffer.ToString().Equals("."))
                {
                    return(ReaderNextState.InvalidDomainName);
                }
            }

            //all else is good, return OK
            return(ReaderNextState.ValidDomainName);
        }