public void IgnoreTokenLazyQuantificationTest(Encoding encoding)
{
    // Verifies that text matched by an ignored terminal (a /* ... */ comment)
    // is absent from the token stream, while the numbers and whitespace around
    // it are still reported with the expected classes and start offsets.
    // Offsets are computed in bytes of the supplied encoding.
    var tokenizer = new RegExpTokenizer();
    tokenizer.SetTransitionFunction(new TableDrivenTransitionFunction());

    // Registration order: number, whitespace, then the ignored comment terminal.
    var digit = RegExp.Range('0', '9', encoding);
    var number = tokenizer.UseTerminal(RegExp.AtLeastOneOf(digit));

    var blank = RegExp.Choice(
        RegExp.Literal(' ', encoding),
        RegExp.Literal('\t', encoding),
        RegExp.Literal('\n', encoding));
    var whitespace = tokenizer.UseTerminal(RegExp.AtLeastOneOf(blank));

    var comment = RegExp.Sequence(
        RegExp.Literal("/*", encoding),
        RegExp.AnyNumberOf(RegExp.Range((char)0, (char)255, encoding)),
        RegExp.Literal("*/", encoding));
    tokenizer.IgnoreTerminal(comment);

    tokenizer.BuildTransitions();

    // Each digit in the first row marks where a lexeme starts (last digit of its ordinal).
    // Lexeme no.: 1  23  45       67 89     01
    // Char index: 012345678901234567890123456789
    // Text:       123 456 /*cdnp*/ 87 /*ae*/ 789
    const string input = "123 456 /*cdnp*/ 87 /*ae*/ 789";

    // One slot per input byte: an upper bound on the number of tokens.
    int bufferLength = encoding.GetByteCount(input);
    var tokenClasses = new int[bufferLength];
    var tokenIndices = new int[bufferLength];
    var tokenLengths = new int[bufferLength];

    int numberClass = number.TokenClassID;
    int whitespaceClass = whitespace.TokenClassID;
    int[] expectedTokenClasses =
    {
        numberClass, whitespaceClass, numberClass, whitespaceClass, // "123" " " "456" " "
        whitespaceClass, numberClass, whitespaceClass,              // " " "87" " "
        whitespaceClass, numberClass                                // " " "789"
    };

    // Walk the lexemes in input order, recording the byte offset at which each
    // non-ignored lexeme starts. Lexemes 4 and 8 are the two comments.
    // In single-byte encodings this yields 0, 3, 4, 7, 16, 17, 19, 26, 27.
    var lexemes = new[] { "123", " ", "456", " ", "/*cdnp*/", " ", "87", " ", "/*ae*/", " ", "789" };
    var expectedTokenIndices = new List<int>(15);
    int byteOffset = 0;
    for (int k = 0; k < lexemes.Length; k++)
    {
        bool isIgnoredComment = k == 4 || k == 8;
        if (!isIgnoredComment)
        {
            expectedTokenIndices.Add(byteOffset);
        }

        byteOffset += encoding.GetByteCount(lexemes[k]);
    }

    // The end-of-input offset is stored as a final entry; the assertions below never read it.
    expectedTokenIndices.Add(byteOffset);

    byte[] rawInput = encoding.GetBytes(input);
    tokenizer.TokensClasses = tokenClasses;
    tokenizer.TokensIndices = tokenIndices;
    tokenizer.TokensLengths = tokenLengths;

    // NOTE(review): the +1 suggests Tokenize returns the index of the last token
    // rather than a count - confirm against RegExpTokenizer.
    int lastTokenResult = tokenizer.Tokenize(rawInput, 0, rawInput.Length);
    int tokensNum = lastTokenResult + 1;

    Assert.That(tokensNum, Is.EqualTo(expectedTokenClasses.Length));

    // NOTE(review): tokenLengths is filled by the tokenizer but is not verified here.
    int position = 0;
    while (position < tokensNum)
    {
        Assert.That(
            tokenClasses[position],
            Is.EqualTo(expectedTokenClasses[position]),
            "Error On token class comparison: " + position);
        Assert.That(
            tokenIndices[position],
            Is.EqualTo(expectedTokenIndices[position]),
            "Error On token index comparison: " + position);
        position++;
    }
}
public void Temptest(string encodingStr)
{
    // Scratch test: builds a tokenizer whose ignored terminal is "/*" followed by
    // RegExp.Not(Literal("*/"), false), runs it over a short input and asserts
    // nothing. Useful only for stepping through the tokenizer in a debugger.
    var encoding = CommonTestRoutines.GetEncoding(encodingStr);

    var tokenizer = new RegExpTokenizer();
    tokenizer.SetTransitionFunction(new TableDrivenTransitionFunction());

    // Earlier experiments (no longer active) described the comment as
    //   Sequence(Literal("/*"), AnyNumberOf(Range((char)0, (char)255)), Literal("*/"))
    // and as a Choice between AnyNumberOf(Range((char)0, (char)255)) and the Not(...) below.
    // NOTE(review): the meaning of the 'false' argument to RegExp.Not is not visible here - confirm.
    var commentOpener = RegExp.Literal("/*", encoding);
    var commentCloser = RegExp.Literal("*/", encoding);
    var anythingButCloser = RegExp.Not(commentCloser, false);
    tokenizer.IgnoreTerminal(RegExp.Sequence(commentOpener, anythingButCloser));

    // Registered after the ignored terminal: digits first, then spaces.
    var digits = RegExp.AtLeastOneOf(RegExp.Range('0', '9', encoding));
    tokenizer.UseTerminal(digits);
    var spaces = RegExp.AtLeastOneOf(RegExp.Literal(' ', encoding));
    tokenizer.UseTerminal(spaces);

    tokenizer.BuildTransitions();

    const string input = "/*111*/ 222 /*333*/ 444";

    // Output buffers get one slot per input byte.
    int bufferLength = encoding.GetByteCount(input);
    tokenizer.TokensClasses = new int[bufferLength];
    tokenizer.TokensIndices = new int[bufferLength];
    tokenizer.TokensLengths = new int[bufferLength];

    var rawInput = encoding.GetBytes(input);
    // Debugging override that was used at some point:
    // rawInput = new byte[] {00, 49, 00, 50, 00, 51, 00, 32};

    // The result is computed but never asserted on.
    int tokenCount = tokenizer.Tokenize(rawInput, 0, rawInput.Length) + 1;
}