Пример #1
0
        /// <summary>
        /// Reads the next line from the client and splits it into tokens.
        /// </summary>
        /// <returns>None when the client returns a null line (end of input);
        /// otherwise the tokens found on the line, which may be an empty list.
        /// If tokenizing throws an <see cref="ArgumentException"/>, the error is
        /// reported through the client and an empty list is returned.</returns>
        private Maybe<IReadOnlyList<string>> ReadInput()
        {
            const TokenizerOptions inputTokenizerOptions =
                TokenizerOptions.HandleDoubleQuoteAsTokenDelimiter;

            var rawLine = _client.ReadLine();
            if (rawLine is null)
            {
                // Nothing more to read.
                return new None();
            }

            // Preprocessing happens outside the try block so that its own
            // failures are not reported as tokenizing errors.
            var preprocessed = Preprocess(rawLine);

            try
            {
                return StringUtilities
                    .Tokenize(preprocessed, inputTokenizerOptions)
                    .Select(token => token.ToString())
                    .ToArray();
            }
            catch (ArgumentException ex)
            {
                var message = string.Format(
                    CultureInfo.CurrentCulture,
                    Strings.ExceptionWasThrownParsingInputLine,
                    ex);

                _client.OnError(message);
                return Array.Empty<string>();
            }
        }
Пример #2
0
        /// <summary>
        /// Produces the candidate completions for a command line, based on the
        /// token that the cursor is currently positioned in.
        /// </summary>
        /// <param name="type">Type of the parsed arguments object.</param>
        /// <param name="commandLineToComplete">The command line to complete. Both
        /// single and double quote characters delimit single tokens that may
        /// contain embedded whitespace.</param>
        /// <param name="charIndexOfCursor">Character index of the completion
        /// cursor.</param>
        /// <param name="tokensToSkip">Number of tokens to skip from the start of
        /// the command line.</param>
        /// <param name="options">Parsing options.</param>
        /// <returns>The candidate completions; empty when the cursor's token is
        /// one of the skipped tokens.</returns>
        public static IEnumerable<string> GetCompletions(Type type, string commandLineToComplete, int charIndexOfCursor, int tokensToSkip, CommandLineParserOptions options)
        {
            const TokenizerOptions completionTokenizerOptions =
                TokenizerOptions.AllowPartialInput |
                TokenizerOptions.HandleDoubleQuoteAsTokenDelimiter |
                TokenizerOptions.HandleSingleQuoteAsTokenDelimiter;

            var tokens = StringUtilities.Tokenize(commandLineToComplete, completionTokenizerOptions).ToList();

            // First token whose outer span contains the cursor; when no token
            // contains it, the index is one past the last token.
            var cursorTokenIndex = tokens.FindIndex(candidate =>
                charIndexOfCursor >= candidate.OuterStartingOffset &&
                charIndexOfCursor <= candidate.OuterEndingOffset);
            if (cursorTokenIndex < 0)
            {
                cursorTokenIndex = tokens.Count;
            }

            // The cursor sits inside the skipped prefix: nothing to complete.
            if (cursorTokenIndex < tokensToSkip)
            {
                return Enumerable.Empty<string>();
            }

            var remainingTokens = tokens.Skip(tokensToSkip).Select(t => t.ToString());

            return GetCompletions(
                type,
                remainingTokens,
                cursorTokenIndex - tokensToSkip,
                options);
        }
Пример #3
0
        /// <summary>
        /// Creates a tokenizer for the given language version and options and
        /// initializes it over <paramref name="reader"/>.
        /// </summary>
        /// <param name="version">Language version passed to the tokenizer.</param>
        /// <param name="optionSet">Tokenizer options.</param>
        /// <param name="reader">Reader supplying the text to tokenize.</param>
        /// <param name="initialSourceLocation">Starting location; when omitted,
        /// <c>SourceLocation.MinValue</c> is used.</param>
        /// <returns>The initialized tokenizer.</returns>
        private Tokenizer MakeTokenizer(PythonLanguageVersion version, TokenizerOptions optionSet, StringReader reader,
                                        SourceLocation? initialSourceLocation = null)
        {
            var result = new Tokenizer(version, options: optionSet);

            var startLocation = initialSourceLocation ?? SourceLocation.MinValue;
            result.Initialize(null, reader, startLocation);

            return result;
        }
Пример #4
0
        /// <summary>
        /// Initializes a new instance of the <see cref="StringTokenizer"/> class
        /// over a range of <paramref name="text"/>.
        /// </summary>
        /// <param name="text">The text for tokenizing.</param>
        /// <param name="start">The start index.</param>
        /// <param name="end">The end index.</param>
        /// <param name="localTextOffset">The local text offset, forwarded to the base class.</param>
        /// <param name="options">The options, forwarded to the base class.</param>
        public StringTokenizer(string text, int start, int end, int localTextOffset, TokenizerOptions options)
            : base(localTextOffset, options)
        {
            Text  = text;
            Start = start;
            End   = end;

            // The next position to read begins at the start of the range.
            Next = Start;
        }
Пример #5
0
    /// <summary>
    /// Initializes a new instance of the <see cref="TokenizingEnumerator"/> struct.
    /// </summary>
    /// <param name="value">The value to tokenize.</param>
    /// <param name="tokenizerOptions">The tokenizer options; a default-constructed
    /// <see cref="TokenizerOptions"/> is used when null.</param>
    public TokenizingEnumerator(ReadOnlySpan<char> value, TokenizerOptions? tokenizerOptions = null)
    {
        var effectiveOptions = tokenizerOptions ?? new TokenizerOptions();

        _tokenizerOptions = effectiveOptions;
        _splitEnumerator  = new SpanSplitEnumerator(value, effectiveOptions);

        // Remaining state starts out at its default value.
        _current = default;
        _segment = default;
        _isInCombinedShortNameSegment = default;
    }
Пример #6
0
        /// <summary>
        /// Tokenizes a Thai sentence with <c>MatchingMode.Longest</c> and verifies
        /// the tokens against <c>GlobalExpectedResult.GetExpectedResult1()</c>.
        /// </summary>
        public void Test1()
        {
            // Arrange
            var input    = "ปลาที่ใหญ่ที่สุดในโลกคือปารีสชุบแป้งทอด";
            var expected = GlobalExpectedResult.GetExpectedResult1();

            var tokenizer = new ThaiTokenizer(new TokenizerOptions
            {
                MatchingMode = MatchingMode.Longest
            });

            // Act + Assert
            Verify(tokenizer, input, expected);
        }
Пример #7
0
        /// <summary>
        /// Verifies that a single Thai word is returned as one token when using
        /// <c>MatchingMode.Shortest</c> together with <c>PreferDecodableWord</c>.
        /// </summary>
        public void Test2()
        {
            // Arrange
            var input    = "เจริญ";
            var expected = new List<string> { "เจริญ" };

            var tokenizer = new ThaiTokenizer(new TokenizerOptions
            {
                MatchingMode        = MatchingMode.Shortest,
                PreferDecodableWord = true
            });

            // Act + Assert
            Verify(tokenizer, input, expected);
        }
Пример #8
0
 /// <summary>
 /// Initializes a new instance of the <see cref="ConsentCheckingPreExecutionEvent"/> class.
 /// </summary>
 /// <param name="privacy">The privacy service.</param>
 /// <param name="commandService">The command service.</param>
 /// <param name="options">The responder options; the wrapped value is stored.</param>
 /// <param name="interactionAPI">The interaction API.</param>
 /// <param name="tokenizerOptions">The tokenizer options; the wrapped value is stored.</param>
 /// <param name="treeSearchOptions">The tree search options; the wrapped value is stored.</param>
 /// <param name="feedback">The feedback service.</param>
 public ConsentCheckingPreExecutionEvent
 (
     PrivacyService privacy,
     CommandService commandService,
     IOptions<CommandResponderOptions> options,
     IDiscordRestInteractionAPI interactionAPI,
     IOptions<TokenizerOptions> tokenizerOptions,
     IOptions<TreeSearchOptions> treeSearchOptions,
     FeedbackService feedback
 )
 {
     // Services are stored as given.
     _privacy = privacy;
     _commandService = commandService;
     _interactionAPI = interactionAPI;
     _feedback = feedback;

     // Option wrappers are unwrapped once, here.
     _options = options.Value;
     _tokenizerOptions = tokenizerOptions.Value;
     _treeSearchOptions = treeSearchOptions.Value;
 }
Пример #9
0
        /// <summary>
        /// Creates a parser with the given options and registers the built-in
        /// transformers and validators.
        /// </summary>
        /// <param name="options">Tokenizer options stored in <c>Options</c>.</param>
        public TokenParser(TokenizerOptions options)
        {
            log = LogProvider.For <TokenParser>();

            Options = options;

            // The type lists must exist before any Register* call below.
            // NOTE(review): presumably RegisterTransformer/RegisterValidator add to
            // these lists; their bodies are not visible here.
            transformers = new List <Type>();
            validators   = new List <Type>();

            // Default transformers.
            RegisterTransformer <ToDateTimeTransformer>();
            RegisterTransformer <ToDateTimeUtcTransformer>();
            RegisterTransformer <ToLowerTransformer>();
            RegisterTransformer <ToUpperTransformer>();
            RegisterTransformer <TrimTransformer>();
            RegisterTransformer <SubstringAfterTransformer>();
            RegisterTransformer <SubstringBeforeTransformer>();
            RegisterTransformer <SetTransformer>();
            RegisterTransformer <ReplaceTransformer>();
            RegisterTransformer <RemoveTransformer>();
            RegisterTransformer <SubstringAfterLastTransformer>();
            RegisterTransformer <SubstringBeforeLastTransformer>();
            RegisterTransformer <RemoveEndTransformer>();
            RegisterTransformer <RemoveStartTransformer>();
            RegisterTransformer <SplitTransformer>();

            // Default validators.
            RegisterValidator <IsNumericValidator>();
            RegisterValidator <MaxLengthValidator>();
            RegisterValidator <MinLengthValidator>();
            RegisterValidator <IsDomainNameValidator>();
            RegisterValidator <IsPhoneNumberValidator>();
            RegisterValidator <IsEmailValidator>();
            RegisterValidator <IsUrlValidator>();
            RegisterValidator <IsLooseUrlValidator>();
            RegisterValidator <IsLooseAbsoluteUrlValidator>();
            RegisterValidator <IsDateTimeValidator>();
            RegisterValidator <IsNotEmptyValidator>();
            RegisterValidator <IsNotValidator>();
            RegisterValidator <StartsWithValidator>();
            RegisterValidator <EndsWithValidator>();
            RegisterValidator <ContainsValidator>();
        }
Пример #10
0
    /// <summary>
    /// Tokenizes <paramref name="value"/> with <c>RetainQuotationMarks</c> enabled and
    /// asserts that the produced token types and values match the expected sequences.
    /// </summary>
    /// <param name="value">The text to tokenize.</param>
    /// <param name="expectedTokenTypes">The expected token types, in order.</param>
    /// <param name="expectedTokenValues">The expected token values, in order.</param>
    internal void RetainsQuotationMarksCorrectly
    (
        string value,
        IEnumerable<TokenType> expectedTokenTypes,
        IEnumerable<string> expectedTokenValues
    )
    {
        // Arrange
        var options = new TokenizerOptions(RetainQuotationMarks: true);
        var seenTypes = new List<TokenType>();
        var seenValues = new List<string>();

        // Act
        foreach (var current in new TokenizingEnumerator(value, options))
        {
            seenTypes.Add(current.Type);
            seenValues.Add(current.Value.ToString());
        }

        // Assert
        Assert.Equal(expectedTokenTypes, seenTypes);
        Assert.Equal(expectedTokenValues, seenValues);
    }
Пример #11
0
        /// <summary>
        /// Creates a parser with the given options and registers the built-in
        /// transformers and validators.
        /// </summary>
        /// <param name="options">Tokenizer options stored in <c>Options</c>.</param>
        public TokenParser(TokenizerOptions options)
        {
            log = LogProvider.For <TokenParser>();

            Options = options;

            // The type lists must exist before any Register* call below.
            // NOTE(review): presumably RegisterTransformer/RegisterValidator add to
            // these lists; their bodies are not visible here.
            transformers = new List <Type>();
            validators   = new List <Type>();

            // Default transformers.
            RegisterTransformer <ToDateTimeTransformer>();
            RegisterTransformer <ToDateTimeUtcTransformer>();
            RegisterTransformer <ToLowerTransformer>();
            RegisterTransformer <ToUpperTransformer>();
            RegisterTransformer <TrimTransformer>();
            RegisterTransformer <SubstringAfterTransformer>();
            RegisterTransformer <SubstringBeforeTransformer>();

            // Default validators.
            RegisterValidator <IsNumericValidator>();
            RegisterValidator <MaxLengthValidator>();
            RegisterValidator <MinLengthValidator>();
        }
        /// <summary>
        /// Tokenizes <paramref name="originalText"/> and checks that appending each
        /// token's preceding whitespace and verbatim image rebuilds the input
        /// character for character. On the first mismatch, context around the
        /// differing position is written to the console before the assertion fails.
        /// </summary>
        /// <param name="version">Language version passed to the tokenizer.</param>
        /// <param name="optionSet">Tokenizer options.</param>
        /// <param name="originalText">The text to tokenize.</param>
        /// <returns>Every token read before the end-of-file token, with its span.</returns>
        private static List<TokenWithSpan> TestOneString(PythonLanguageVersion version, TokenizerOptions optionSet, string originalText)
        {
            // Number of characters shown on each side of a mismatch.
            const int contextSize = 50;

            var rebuilt   = new StringBuilder();
            var tokenizer = new Tokenizer(version, options: optionSet);

            tokenizer.Initialize(new StringReader(originalText));

            var collected = new List<TokenWithSpan>();
            var verifiedLength = 0;

            for (Token token = tokenizer.GetNextToken(); token != Tokens.EndOfFileToken; token = tokenizer.GetNextToken())
            {
                collected.Add(new TokenWithSpan(token, tokenizer.TokenSpan));

                rebuilt.Append(tokenizer.PrecedingWhiteSpace);
                rebuilt.Append(token.VerbatimImage);

                // Only the text appended for this token needs to be compared.
                var comparable = Math.Min(originalText.Length, rebuilt.Length);
                for (var i = verifiedLength; i < comparable; i++)
                {
                    if (originalText[i] == rebuilt[i])
                    {
                        continue;
                    }

                    // Render the surrounding text of both strings plus a marker row.
                    var originalContext = new StringBuilder();
                    var rebuiltContext  = new StringBuilder();
                    var marker          = new StringBuilder();

                    var contextEnd = Math.Min(comparable, i + contextSize);
                    for (var j = Math.Max(0, i - contextSize); j < contextEnd; j++)
                    {
                        originalContext.AppendRepr(originalText[j]);
                        rebuiltContext.AppendRepr(rebuilt[j]);
                        marker.Append(j == i ? "^" : " ");
                    }

                    Console.WriteLine("Mismatch context at {0}:", i);
                    Console.WriteLine("Original: {0}", originalContext.ToString());
                    Console.WriteLine("New     : {0}", rebuiltContext.ToString());
                    Console.WriteLine("Differs : {0}", marker.ToString());
                    Console.WriteLine("Token   : {0}", token);

                    Assert.AreEqual(originalText[i], rebuilt[i], String.Format("Characters differ at {0}, got {1}, expected {2}", i, rebuilt[i], originalText[i]));
                }

                verifiedLength = rebuilt.Length;
            }

            // Whitespace after the last token is still part of the round trip.
            rebuilt.Append(tokenizer.PrecedingWhiteSpace);

            Assert.AreEqual(originalText.Length, rebuilt.Length);
            return collected;
        }
Пример #13
0
        /// <summary>
        /// Consumes one element from <paramref name="enumerator"/> while a token name is
        /// being read. Depending on the element this extends the name, sets a token flag
        /// (<c>$</c>, <c>?</c>, <c>*</c>, <c>!</c>), switches parser state (<c>:</c>,
        /// <c>=</c>), or finishes the token (<c>}</c>, or a newline inside a front matter
        /// token).
        /// </summary>
        /// <param name="template">Template that receives finished tokens.</param>
        /// <param name="token">Token being built; replaced with a fresh one when finished.</param>
        /// <param name="enumerator">Source of pattern elements.</param>
        /// <param name="state">Parser state, updated on transitions.</param>
        /// <param name="inFrontMatterToken">Whether the token is a front matter token; cleared when such a token ends.</param>
        /// <param name="tokenContent">Accumulates the raw text of the token.</param>
        /// <param name="options">Options forwarded to <c>AppendToken</c>.</param>
        private void ParseTokenName(PreTemplate template, ref PreToken token, PreTokenEnumerator enumerator, ref FlatTokenParserState state, ref bool inFrontMatterToken, ref StringBuilder tokenContent, TokenizerOptions options)
        {
            var next = enumerator.Next();
            var peek = enumerator.Peek();

            tokenContent.Append(next);

            // True when the look-ahead element equals one of the listed values.
            bool PeekIsAnyOf(params string[] allowed)
            {
                foreach (var candidate in allowed)
                {
                    if (candidate == peek)
                    {
                        return true;
                    }
                }

                return false;
            }

            // Builds the shared "invalid character" error. The token name is passed in
            // because ref parameters cannot be captured by a local function.
            ParsingException InvalidCharacter(string character, string tokenName)
            {
                return new ParsingException($"Invalid character '{character}' in token '{tokenName}'", enumerator);
            }

            switch (next)
            {
                case "{":
                    throw new ParsingException($"Unexpected character '{{' in token '{token.Name}'", enumerator);

                case "}":
                    if (inFrontMatterToken)
                    {
                        throw InvalidCharacter(next, token.Name);
                    }

                    AppendToken(template, token, ref tokenContent, options);
                    token = new PreToken();
                    state = FlatTokenParserState.InPreamble;
                    break;

                case "$":
                    token.TerminateOnNewline = true;
                    if (!PeekIsAnyOf(" ", "?", "*", "}", ":", "!"))
                    {
                        throw InvalidCharacter(peek, token.Name);
                    }
                    break;

                case "?":
                    token.Optional = true;
                    if (!PeekIsAnyOf(" ", "$", "*", "}", ":", "!"))
                    {
                        throw InvalidCharacter(peek, token.Name);
                    }

                    if (token.Required)
                    {
                        throw new ParsingException($"Required token {token.Name} can't be Optional", enumerator);
                    }
                    break;

                case "*":
                    token.Repeating = true;
                    token.Optional  = true;
                    if (!PeekIsAnyOf(" ", "$", "?", "}", ":", "!"))
                    {
                        throw InvalidCharacter(peek, token.Name);
                    }
                    break;

                case "!":
                    token.Required = true;
                    if (!PeekIsAnyOf(" ", "*", "$", "?", "}", ":"))
                    {
                        throw InvalidCharacter(peek, token.Name);
                    }

                    if (token.Optional)
                    {
                        throw new ParsingException($"Optional token {token.Name} can't be Required", enumerator);
                    }
                    break;

                case ":":
                    state = FlatTokenParserState.InDecorator;
                    break;

                case "=":
                    state = FlatTokenParserState.InTokenValue;
                    break;

                case " ":
                {
                    // A space may be followed by another space, a flag or terminator,
                    // or (in a front matter token) a newline. Anything else is only
                    // tolerated while the name is still blank.
                    var peekAccepted =
                        PeekIsAnyOf(" ", "*", "$", "?", "}", ":", "!", "=") ||
                        (peek == "\n" && inFrontMatterToken);

                    if (!peekAccepted && string.IsNullOrWhiteSpace(token.Name) == false)
                    {
                        throw InvalidCharacter(peek, token.Name);
                    }
                    break;
                }

                case "\n":
                    if (!inFrontMatterToken)
                    {
                        throw InvalidCharacter(next, token.Name);
                    }

                    // A newline ends a front matter token.
                    token.IsFrontMatterToken = true;
                    AppendToken(template, token, ref tokenContent, options);
                    token = new PreToken();
                    inFrontMatterToken = false;
                    state = FlatTokenParserState.InFrontMatter;
                    break;

                default:
                    if (!ValidTokenNameCharacters.Contains(next))
                    {
                        throw InvalidCharacter(next, token.Name);
                    }

                    token.AppendName(next);
                    break;
            }
        }
Пример #14
0
        /// <summary>
        /// Consumes one element from <paramref name="enumerator"/> while a decorator name
        /// is being read. The element extends the name, marks the decorator as negated
        /// (<c>!</c>), starts an argument list (<c>(</c>), ends the decorator (<c>,</c>),
        /// or ends the whole token (<c>}</c>, or a newline inside a front matter token).
        /// </summary>
        /// <param name="template">Template that receives finished tokens.</param>
        /// <param name="token">Token being built; replaced with a fresh one when finished.</param>
        /// <param name="enumerator">Source of pattern elements.</param>
        /// <param name="state">Parser state, updated on transitions.</param>
        /// <param name="decorator">Decorator being built; replaced with a fresh one when finished.</param>
        /// <param name="inFrontMatterToken">Whether the token is a front matter token; cleared when the token ends.</param>
        /// <param name="tokenContent">Accumulates the raw text of the token.</param>
        /// <param name="options">Options forwarded to <c>AppendToken</c>.</param>
        private void ParseDecorator(PreTemplate template, ref PreToken token, PreTokenEnumerator enumerator, ref FlatTokenParserState state, ref PreTokenDecorator decorator, ref bool inFrontMatterToken, ref StringBuilder tokenContent, TokenizerOptions options)
        {
            var next = enumerator.Next();

            tokenContent.Append(next);

            var isNewline = next == "\n";

            // Whitespace is ignored, except for a newline inside a front matter token.
            if (string.IsNullOrWhiteSpace(next) && !(inFrontMatterToken && isNewline))
            {
                return;
            }

            var isClosingBrace = next == "}";

            if ((isClosingBrace && !inFrontMatterToken) || (isNewline && inFrontMatterToken))
            {
                // The terminator valid for this kind of token: finish decorator and token.
                token.IsFrontMatterToken = inFrontMatterToken;
                AppendDecorator(enumerator, token, decorator);
                AppendToken(template, token, ref tokenContent, options);
                token     = new PreToken();
                decorator = new PreTokenDecorator();

                state = inFrontMatterToken
                    ? FlatTokenParserState.InFrontMatter
                    : FlatTokenParserState.InPreamble;
                inFrontMatterToken = false;
            }
            else if (next == ",")
            {
                AppendDecorator(enumerator, token, decorator);
                decorator = new PreTokenDecorator();
            }
            else if (next == "(")
            {
                state = FlatTokenParserState.InDecoratorArgument;
            }
            else if (isClosingBrace || isNewline)
            {
                // Only the mismatched terminators remain here: "}" in a front matter
                // token, or a newline outside one (already filtered as whitespace above).
                throw  new ParsingException($"'{decorator.Name}' unexpected character: {next}", enumerator);
            }
            else if (next == "!")
            {
                // Negation is only allowed before the decorator name has started.
                if (!string.IsNullOrWhiteSpace(decorator.Name))
                {
                    throw  new ParsingException($"'{decorator.Name}' unexpected character: {next}", enumerator);
                }

                decorator.IsNotDecorator = true;
            }
            else
            {
                decorator.AppendName(next);
            }
        }
        /// <summary>
        /// Reads the whole file at <paramref name="filename"/> and runs
        /// <c>TestOneString</c> over its contents; the returned tokens are discarded.
        /// </summary>
        /// <param name="filename">Path of the file to read.</param>
        /// <param name="version">Language version passed to the tokenizer.</param>
        /// <param name="optionSet">Tokenizer options.</param>
        private static void TestOneFile(string filename, PythonLanguageVersion version, TokenizerOptions optionSet)
        {
            TestOneString(version, optionSet, File.ReadAllText(filename));
        }
Пример #16
0
        /// <summary>
        /// Generates completions for the token under the cursor in the given
        /// input text.
        /// </summary>
        /// <param name="inputText">The input text string.</param>
        /// <param name="cursorIndex">The current cursor index into the string.
        /// </param>
        /// <param name="tokenCompleter">Token completion handler to invoke.
        /// </param>
        /// <param name="existingTokenStartIndex">Receives the start index of
        /// the current token, or the cursor index when the cursor is past every
        /// token.</param>
        /// <param name="existingTokenLength">Receives the length of the current
        /// token, or 0 when the cursor is past every token.</param>
        /// <returns>The generated completions, quoted where needed.</returns>
        private static IReadOnlyList<string> Create(string inputText, int cursorIndex, ITokenCompleter tokenCompleter, out int existingTokenStartIndex, out int existingTokenLength)
        {
            const TokenizerOptions completionTokenizerOptions =
                TokenizerOptions.AllowPartialInput |
                TokenizerOptions.HandleDoubleQuoteAsTokenDelimiter;

            // Tokenize the line; partial input is permitted by the options above.
            var tokens = StringUtilities.Tokenize(inputText, completionTokenizerOptions).ToList();

            // Find the token the cursor is in. If the cursor lies in the gap
            // before a token, an empty token is inserted at that position.
            var tokenIndex = 0;
            while (tokenIndex < tokens.Count)
            {
                var candidate = tokens[tokenIndex];
                if (cursorIndex <= candidate.OuterEndingOffset)
                {
                    if (cursorIndex < candidate.OuterStartingOffset)
                    {
                        tokens.Insert(
                            tokenIndex,
                            new Token(new Substring(inputText, cursorIndex, 0)));
                    }

                    break;
                }

                ++tokenIndex;
            }

            if (tokenIndex >= tokens.Count)
            {
                // The cursor is beyond the last token.
                existingTokenStartIndex = cursorIndex;
                existingTokenLength     = 0;
            }
            else
            {
                var current = tokens[tokenIndex];

                existingTokenStartIndex = current.OuterStartingOffset;
                existingTokenLength     = current.OuterLength;
            }

            // Ask the completer, handing it the tokens with quotes removed.
            var unquotedTokens = tokens.Select(t => RemoveQuotes(t.ToString())).ToArray();
            var completions = tokenCompleter.GetCompletions(unquotedTokens, tokenIndex).ToList();

            // Quote any completion that needs it and is not already quoted.
            for (var i = 0; i < completions.Count; i++)
            {
                if (completions[i].StartsWith(QuoteStr, StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                completions[i] = StringUtilities.QuoteIfNeeded(completions[i], QuoteChar);
            }

            return completions;
        }
Пример #17
0
        /// <summary>
        /// Parses <paramref name="pattern"/> into a <see cref="RawTemplate"/> by running
        /// a state machine over the pattern's elements until they are exhausted.
        /// </summary>
        /// <param name="pattern">The template pattern to parse.</param>
        /// <param name="options">Options; a clone is stored on the returned template.</param>
        /// <returns>The parsed template.</returns>
        /// <exception cref="TokenizerException">The parser reached a state it has no handler for.</exception>
        public RawTemplate Parse(string pattern, TokenizerOptions options)
        {
            var template = new RawTemplate
            {
                Options = options.Clone()
            };

            var enumerator = new RawTokenEnumerator(pattern);
            if (enumerator.IsEmpty)
            {
                return template;
            }

            // Working state shared by the per-state handlers.
            var parserState       = FlatTokenParserState.AtStart;
            var currentToken      = new RawToken();
            var currentDecorator  = new RawTokenDecorator();
            var decoratorArgument = string.Empty;
            var optionName        = new StringBuilder();
            var optionValue       = new StringBuilder();

            while (!enumerator.IsEmpty)
            {
                switch (parserState)
                {
                    case FlatTokenParserState.AtStart:
                        ParseStart(enumerator, ref parserState);
                        break;

                    case FlatTokenParserState.InFrontMatter:
                        ParseFrontMatter(enumerator, ref optionName, ref parserState);
                        break;

                    case FlatTokenParserState.InFrontMatterComment:
                        ParseFrontMatterComment(enumerator, ref parserState);
                        break;

                    case FlatTokenParserState.InFrontMatterOption:
                        ParseFrontMatterOption(enumerator, ref optionName, ref parserState);
                        break;

                    case FlatTokenParserState.InFrontMatterOptionValue:
                        ParseFrontMatterOptionValue(template, enumerator, ref optionName, ref optionValue, ref parserState);
                        break;

                    case FlatTokenParserState.InPreamble:
                        ParsePreamble(ref currentToken, enumerator, ref parserState);
                        break;

                    case FlatTokenParserState.InTokenName:
                        ParseTokenName(template, ref currentToken, enumerator, ref parserState);
                        break;

                    case FlatTokenParserState.InDecorator:
                        ParseDecorator(template, ref currentToken, enumerator, ref parserState, ref currentDecorator);
                        break;

                    case FlatTokenParserState.InDecoratorArgument:
                        ParseDecoratorArgument(enumerator, ref parserState, ref currentDecorator, ref decoratorArgument);
                        break;

                    case FlatTokenParserState.InDecoratorArgumentSingleQuotes:
                        ParseDecoratorArgumentInSingleQuotes(enumerator, ref parserState, ref currentDecorator, ref decoratorArgument);
                        break;

                    case FlatTokenParserState.InDecoratorArgumentDoubleQuotes:
                        ParseDecoratorArgumentInDoubleQuotes(enumerator, ref parserState, ref currentDecorator, ref decoratorArgument);
                        break;

                    case FlatTokenParserState.InDecoratorArgumentRunOff:
                        ParseDecoratorArgumentRunOff(enumerator, ref parserState, ref currentDecorator, ref decoratorArgument);
                        break;

                    default:
                        throw new TokenizerException($"Unknown FlatTokenParserState: {parserState}");
                }
            }

            // A trailing token is kept only when its preamble is not blank. Such a
            // token has no value and serves to truncate the last token in the template.
            if (!string.IsNullOrWhiteSpace(currentToken.Preamble))
            {
                template.Tokens.Add(currentToken);
            }

            return template;
        }
Пример #18
0
        /// <summary>
        /// Finalizes <paramref name="token"/> and adds it to <paramref name="template"/>.
        /// A repeating token for which <c>GetRepeatingMultilinePreamble</c> returns a
        /// non-empty preamble is added as two tokens: the token itself (no longer
        /// repeating) followed by an optional, repeating copy that depends on it.
        /// </summary>
        /// <param name="template">Template that receives the token(s).</param>
        /// <param name="token">The token to finalize.</param>
        /// <param name="tokenContent">Raw text collected for the token; cleared by this call.</param>
        /// <param name="options">Options that may trim the preamble or force termination on newline.</param>
        private void AppendToken(PreTemplate template, PreToken token, ref StringBuilder tokenContent, TokenizerOptions options)
        {
            token.Content = tokenContent.ToString();
            token.Id      = template.Tokens.Count + 1;

            // "null" is a keyword, not linguistic text, so it is matched ordinally
            // (ignoring case) rather than with culture-sensitive comparison rules.
            token.IsNull = string.Equals(token.Name, "null", StringComparison.OrdinalIgnoreCase);

            if (options.TrimPreambleBeforeNewLine)
            {
                token.TrimPreambleBeforeNewLine();
            }

            if (options.TerminateOnNewline)
            {
                token.TerminateOnNewline = true;
            }

            tokenContent.Clear();

            var preamble = GetRepeatingMultilinePreamble(token);

            if (string.IsNullOrEmpty(preamble) || !token.Repeating)
            {
                template.Tokens.Add(token);
                return;
            }

            // Split into a single occurrence plus a repeating follow-up token.
            token.Repeating = false;
            template.Tokens.Add(token);

            var repeat = new PreToken
            {
                Optional           = true,
                Repeating          = true,
                TerminateOnNewline = token.TerminateOnNewline,
                Content            = token.Content
            };

            repeat.AppendName(token.Name);
            repeat.AppendPreamble(preamble);
            repeat.AppendDecorators(token.Decorators);

            // The id is taken after the first token was added, so it follows it.
            repeat.Id          = template.Tokens.Count + 1;
            repeat.DependsOnId = token.Id;
            template.Tokens.Add(repeat);
        }
Пример #19
0
 /// <summary>
 /// Initializes a new instance of the <see cref="StringTokenizer"/> class
 /// that covers the whole of <paramref name="text"/>.
 /// </summary>
 /// <param name="text">The text for tokenizing.</param>
 /// <param name="localTextOffset">The local text offset.</param>
 /// <param name="options">The options.</param>
 public StringTokenizer(string text, int localTextOffset, TokenizerOptions options)
     : this(text, start: 0, end: text.Length, localTextOffset: localTextOffset, options: options)
 {
     // All initialization is done by the range-based constructor.
 }
Пример #20
0
        /// <summary>
        /// Tokenizes the provided input text line, observing quotes.
        /// </summary>
        /// <param name="line">Input line to parse.</param>
        /// <param name="options">Options for tokenizing.</param>
        /// <returns>Enumeration of tokens.</returns>
        public static IEnumerable <Token> Tokenize(string line, TokenizerOptions options)
        {
            // The option flags never change while scanning, so decode them once.
            var partialInputAllowed = options.HasFlag(TokenizerOptions.AllowPartialInput);
            var doubleQuoteDelimits = options.HasFlag(TokenizerOptions.HandleDoubleQuoteAsTokenDelimiter);
            var singleQuoteDelimits = options.HasFlag(TokenizerOptions.HandleSingleQuoteAsTokenDelimiter);

            // Scanner state for the token currently being assembled:
            //   startedWithQuote - the token opened with a quote character; stays
            //                      true even after its closing quote is seen.
            //   openQuote        - the quote character that opened the token, but
            //                      only while the closing quote is still pending.
            //   sawClosingQuote  - the closing quote was found; remains false for
            //                      a partial-input token that never closes.
            //   start / end      - token bounds; null until known.
            var startedWithQuote = false;
            char? openQuote = null;
            var sawClosingQuote = false;
            int? start = null;
            int? end = null;

            // Visit every character, plus one extra position just past the end
            // of the string so that a token still in progress can be flushed.
            for (var position = 0; position <= line.Length; position++)
            {
                var atEnd = position == line.Length;

                if (atEnd || char.IsWhiteSpace(line[position]))
                {
                    // Whitespace or end of input may terminate the token, but
                    // whitespace inside an open quoted token belongs to it.
                    var finished = false;

                    if (start.HasValue && !openQuote.HasValue)
                    {
                        // A token is in progress and no quote is pending, so
                        // the token stops here.
                        finished = true;
                        sawClosingQuote = startedWithQuote;
                    }
                    else if (atEnd && openQuote.HasValue && partialInputAllowed)
                    {
                        // Input ran out inside a quoted token and the caller
                        // accepts partial input: emit the token as it stands,
                        // leaving sawClosingQuote false.
                        Debug.Assert(start.HasValue);
                        finished = true;
                    }

                    if (finished)
                    {
                        // A closing quote fixes the end index in advance;
                        // otherwise the token stops at the current position.
                        var first = start.Value;
                        var length = (end ?? position) - first;

                        yield return new Token(
                            new Substring(line, first, length),
                            startedWithQuote,
                            sawClosingQuote);

                        // Start afresh for whatever token follows.
                        start = null;
                        end = null;
                        startedWithQuote = false;
                        openQuote = null;
                        sawClosingQuote = false;
                    }
                }
                else
                {
                    var current = line[position];
                    var isDelimiter = (current == '\"' && doubleQuoteDelimits) ||
                                      (current == '\'' && singleQuoteDelimits);

                    if (!isDelimiter)
                    {
                        // An ordinary character: it either continues the
                        // current token or begins a new, unquoted one.
                        if (!start.HasValue)
                        {
                            start = position;
                        }
                    }
                    else if (!start.HasValue)
                    {
                        // A quote character with no token in progress opens a
                        // quoted token whose content begins after the quote.
                        Debug.Assert(!openQuote.HasValue);

                        openQuote = current;
                        startedWithQuote = true;
                        start = position + 1;
                    }
                    else if (startedWithQuote)
                    {
                        Debug.Assert(openQuote.HasValue);

                        // Only the same character that opened the token can
                        // close it; the other quote kind is plain content.
                        if (openQuote.Value == current)
                        {
                            var followedByBoundary = (position + 1 == line.Length) ||
                                                     char.IsWhiteSpace(line[position + 1]);

                            if (followedByBoundary)
                            {
                                // This really is the closing quote.
                                openQuote = null;
                                sawClosingQuote = true;
                                end = position;
                            }
                            else if (!partialInputAllowed)
                            {
                                // A closing quote followed directly by more
                                // text is an error unless partial input is
                                // tolerated, in which case it is ignored.
                                throw new ArgumentOutOfRangeException(nameof(line), Strings.TerminatingQuotesNotEndOfToken);
                            }
                        }
                    }

                    // A quote character inside an unquoted token needs no
                    // action: it simply stays part of that token.
                }
            }

            // A token still pending after the final flush can only be an
            // unterminated quoted token with partial input disallowed.
            if (start.HasValue)
            {
                Debug.Assert(openQuote.HasValue);
                Debug.Assert(!partialInputAllowed);

                throw new ArgumentOutOfRangeException(nameof(line), Strings.UnterminatedQuotes);
            }
        }
Пример #21
0
        /// <summary>
        /// Consumes the next item from <paramref name="enumerator"/> and appends it to
        /// <paramref name="tokenContent"/>. Whitespace is skipped, <c>":"</c> switches to the
        /// decorator state, and the terminator (<c>"\n"</c> inside a front-matter token,
        /// <c>"}"</c> otherwise) finishes the token via <c>AppendToken</c>.
        /// </summary>
        /// <exception cref="TokenizerException">Any other non-whitespace item is encountered.</exception>
        private void ParseTokenValueRunOff(PreTokenEnumerator enumerator, ref PreTemplate template, ref PreToken token, ref FlatTokenParserState state, ref bool inFrontMatterToken, ref StringBuilder tokenContent, TokenizerOptions options)
        {
            var next = enumerator.Next();
            tokenContent.Append(next);

            // The item that ends the current token depends on the token kind.
            var terminator = inFrontMatterToken ? "\n" : "}";

            // Whitespace is ignored, except for the newline that ends a
            // front-matter token.
            if (string.IsNullOrWhiteSpace(next) && next != terminator)
            {
                return;
            }

            if (next == ":")
            {
                state = FlatTokenParserState.InDecorator;
                return;
            }

            if (next != terminator)
            {
                throw new TokenizerException($"Unexpected character: '{next}'");
            }

            // Terminator reached: store the finished token and start a new one.
            token.IsFrontMatterToken = inFrontMatterToken;
            AppendToken(template, token, ref tokenContent, options);
            token = new PreToken();

            // A front-matter token returns to the front matter; any other
            // token returns to the preamble.
            state = inFrontMatterToken
                ? FlatTokenParserState.InFrontMatter
                : FlatTokenParserState.InPreamble;
            inFrontMatterToken = false;
        }
Пример #22
0
 /// <summary>
 /// Convenience overload: wraps <paramref name="text"/> in a <see cref="StringReader"/>
 /// and forwards all arguments to the reader-based <c>MakeTokenizer</c> overload.
 /// </summary>
 private Tokenizer MakeTokenizer(PythonLanguageVersion version, TokenizerOptions optionSet, string text,
                                 SourceLocation?initialSourceLocation = null)
 {
     var reader = new StringReader(text);

     return MakeTokenizer(version, optionSet, reader, initialSourceLocation);
 }
Пример #23
0
        /// <summary>
        /// Reads the whole of <paramref name="filename"/> and passes its text to
        /// <c>TestOneString</c>; the token list that call returns is discarded.
        /// </summary>
        private static void TestOneFile(string filename, PythonLanguageVersion version, TokenizerOptions optionSet) {
            TestOneString(version, optionSet, File.ReadAllText(filename));
        }
Пример #24
0
        /// <summary>
        /// Tokenizes <paramref name="originalText"/> and rebuilds the text from each token's
        /// preceding whitespace and verbatim image, asserting that the rebuilt text matches
        /// the original character by character and in total length.
        /// </summary>
        /// <returns>Every token read before the end-of-file token, paired with its span.</returns>
        private static List<TokenWithSpan> TestOneString(PythonLanguageVersion version, TokenizerOptions optionSet, string originalText) {
            // Number of characters shown on either side of a mismatch.
            const int contextSize = 50;

            var rebuilt = new StringBuilder();
            var collected = new List<TokenWithSpan>();

            var tokenizer = new Tokenizer(version, options: optionSet);
            tokenizer.Initialize(new StringReader(originalText));

            // Everything before this offset has already been compared.
            int verifiedUpTo = 0;

            for (Token token = tokenizer.GetNextToken(); token != Tokens.EndOfFileToken; token = tokenizer.GetNextToken()) {
                collected.Add(new TokenWithSpan(token, tokenizer.TokenSpan));

                rebuilt.Append(tokenizer.PreceedingWhiteSpace);
                rebuilt.Append(token.VerbatimImage);

                // Compare only the part appended for this token.
                for (int pos = verifiedUpTo; pos < originalText.Length && pos < rebuilt.Length; pos++) {
                    if (originalText[pos] == rebuilt[pos]) {
                        continue;
                    }

                    // Print a window around the mismatch before failing.
                    int windowStart = Math.Max(0, pos - contextSize);
                    int windowEnd = Math.Min(Math.Min(originalText.Length, rebuilt.Length), pos + contextSize);

                    var expectedContext = new StringBuilder();
                    var actualContext = new StringBuilder();
                    var marker = new StringBuilder();
                    for (int k = windowStart; k < windowEnd; k++) {
                        expectedContext.AppendRepr(originalText[k]);
                        actualContext.AppendRepr(rebuilt[k]);
                        marker.Append(k == pos ? "^" : " ");
                    }

                    Console.WriteLine("Mismatch context at {0}:", pos);
                    Console.WriteLine("Original: {0}", expectedContext.ToString());
                    Console.WriteLine("New     : {0}", actualContext.ToString());
                    Console.WriteLine("Differs : {0}", marker.ToString());
                    Console.WriteLine("Token   : {0}", token);

                    Assert.AreEqual(originalText[pos], rebuilt[pos], String.Format("Characters differ at {0}, got {1}, expected {2}", pos, rebuilt[pos], originalText[pos]));
                }

                verifiedUpTo = rebuilt.Length;
            }

            // Whitespace after the last real token also counts towards the
            // rebuilt text.
            rebuilt.Append(tokenizer.PreceedingWhiteSpace);

            Assert.AreEqual(originalText.Length, rebuilt.Length);
            return collected;
        }
Пример #25
0
 /// <summary>
 /// Creates the tokenizer base, recording the supplied offset and options
 /// in the corresponding properties.
 /// </summary>
 /// <param name="localTokenOffset">Value stored in <c>LocalTokenOffset</c>.</param>
 /// <param name="options">Value stored in <c>Options</c>.</param>
 protected ITokenizerBase(int localTokenOffset, TokenizerOptions options)
 {
     LocalTokenOffset = localTokenOffset;
     Options = options;
 }
Пример #26
0
        /// <summary>
        /// Consumes the next item of a token's value. The item is always appended to
        /// <paramref name="tokenContent"/>; depending on what it is, the parser then finishes
        /// the token, changes <paramref name="state"/>, or appends the item to the token value.
        /// </summary>
        /// <exception cref="ParsingException">
        /// The item is <c>"{"</c>, is the terminator of the other token kind, or is a space
        /// followed by an unexpected item while the token already has a value.
        /// </exception>
        private void ParseTokenValue(PreTemplate template, ref PreToken token, PreTokenEnumerator enumerator, ref FlatTokenParserState state, ref bool inFrontMatterToken, ref StringBuilder tokenContent, TokenizerOptions options)
        {
            var next = enumerator.Next();
            var peek = enumerator.Peek();

            tokenContent.Append(next);

            // A front-matter token ends at a newline, any other token at "}".
            var terminator = inFrontMatterToken ? "\n" : "}";

            switch (next)
            {
            case "{":
                throw new ParsingException($"Unexpected character '{{' in token '{token.Name}'", enumerator);

            case "}":
            case "\n":
                // Of these two only the terminator for the current token kind
                // is legal; the other one is an error.
                if (next != terminator)
                {
                    throw new ParsingException($"'{token.Name}' unexpected character: {next}", enumerator);
                }

                token.IsFrontMatterToken = inFrontMatterToken;
                AppendToken(template, token, ref tokenContent, options);
                token = new PreToken();

                // A front-matter token returns to the front matter; any
                // other token returns to the preamble.
                state = inFrontMatterToken
                    ? FlatTokenParserState.InFrontMatter
                    : FlatTokenParserState.InPreamble;
                inFrontMatterToken = false;
                break;

            case ":":
                state = FlatTokenParserState.InDecorator;
                break;

            case "'":
                state = FlatTokenParserState.InTokenValueSingleQuotes;
                break;

            case "\"":
                state = FlatTokenParserState.InTokenValueDoubleQuotes;
                break;

            case " ":
                // A space may only be followed by another space, a decorator
                // separator or the terminator once the token has a value.
                var peekIsAcceptable = peek == " " || peek == ":" || peek == terminator;
                if (!peekIsAcceptable && token.HasValue)
                {
                    throw new ParsingException($"Invalid character '{peek}' in token '{token.Name}'", enumerator);
                }

                break;

            default:
                token.AppendValue(next);
                break;
            }
        }