public void DoesTokenizationWorkAtAll(Encoding encoding)
{
    // Indices:              01234567890123456
    const string SOURCE = @"lexem123ley,6.7&#";
    var buffer = encoding.GetBytes(SOURCE);
    var lexemes = new[] { "lexem", "123", "ley", ",", "6", ".", "7", "&", "#" };

    int[] tokensIndices = new int[SOURCE.Length + 1];
    int[] tokensClasses = new int[SOURCE.Length];
    int[] tokenLengths = new int[SOURCE.Length];
    var tokenizer = new RegExpTokenizer
    {
        TokensClasses = tokensClasses,
        TokensIndices = tokensIndices,
        TokensLengths = tokenLengths
    };
    tokenizer.SetTransitionFunction(new TableDrivenTransitionFunction());

    Array.ForEach(lexemes, s => tokenizer.UseTerminal(RegExp.Literal(s, encoding)));
    tokenizer.BuildTransitions();

    var lastTokenIndex = tokenizer.Tokenize(buffer, 0, buffer.Length);
    Assert.That(lastTokenIndex + 1 == lexemes.Length); // correct #tokens?

    // Check that each token has been recognized correctly
    // ("<=" so the last token is verified as well)
    for (int i = 0; i <= lastTokenIndex; i++)
    {
        var tokenValue = encoding.GetString(buffer, tokensIndices[i], tokenLengths[i]);
        Assert.AreEqual(lexemes[i], tokenValue);
    }
}
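// A minimal helper sketch (not part of the original API; the name is illustrative):
// decodes every recognized token back to a string using the index/length arrays the
// tokenizer fills in, assuming Tokenize returns the index of the last token, as the
// test above relies on. Handy for table-driven assertions like the loop above.
private static string[] DecodeTokens(byte[] buffer, Encoding encoding,
                                     int[] tokensIndices, int[] tokenLengths,
                                     int lastTokenIndex)
{
    var result = new string[lastTokenIndex + 1];
    for (int i = 0; i <= lastTokenIndex; i++)
        result[i] = encoding.GetString(buffer, tokensIndices[i], tokenLengths[i]);
    return result;
}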
public void UnicodeTest()
{
    ITokenizer tokenizer = new RegExpTokenizer();
    tokenizer.SetTransitionFunction(new TableDrivenTransitionFunction());
    var hebrewWord = tokenizer.UseTerminal(RegExp.Literal("עברית", Encoding.Unicode));    // class 1
    var russianWord = tokenizer.UseTerminal(RegExp.Literal("русский", Encoding.Unicode)); // class 2
    var englishWord = tokenizer.UseTerminal(RegExp.Literal("english", Encoding.Unicode)); // class 3
    var whitespace = tokenizer.UseTerminal(RegExp.AtLeastOneOf(RegExp.Choice(
        RegExp.Literal(' ', Encoding.Unicode),
        RegExp.Literal('\t', Encoding.Unicode),
        RegExp.Literal('\n', Encoding.Unicode))));
    tokenizer.BuildTransitions();

    // Tokens:            1    23      45      67    89      01      23
    // Indices:           0123456789012345678901234567890123456789012345678
    const string input = "עברית русский english עברית english русский עברית";

    int[] tokenClasses = new int[input.Length];
    int[] tokenIndices = new int[input.Length];
    int[] tokenLengths = new int[input.Length];

    int hebClass = hebrewWord.TokenClassID;
    int engClass = englishWord.TokenClassID;
    int rusClass = russianWord.TokenClassID;
    int wsClass = whitespace.TokenClassID;
    int[] expectedTokenClasses = new[]
    {
        hebClass, wsClass, rusClass, wsClass, engClass, wsClass, hebClass,
        wsClass, engClass, wsClass, rusClass, wsClass, hebClass
    };
    int[] expectedTokenIndices = new[] { 0, 5, 6, 13, 14, 21, 22, 27, 28, 35, 36, 43, 44 };

    var rawInput = Encoding.Unicode.GetBytes(input);
    tokenizer.TokensClasses = tokenClasses;
    tokenizer.TokensIndices = tokenIndices;
    tokenizer.TokensLengths = tokenLengths;

    int tokensNum = tokenizer.Tokenize(rawInput, 0, rawInput.Length) + 1;

    Assert.That(tokensNum, Is.EqualTo(expectedTokenClasses.Length));
    for (int i = 0; i < tokensNum; i++)
    {
        Assert.That(tokenClasses[i], Is.EqualTo(expectedTokenClasses[i]));
        Assert.That(tokenIndices[i], Is.EqualTo(expectedTokenIndices[i] * 2)); // each UTF-16 code unit takes 2 bytes
    }
}
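// Sketch (hypothetical helper, not part of the tokenizer API): UnicodeTest hard-codes
// "* 2" because UTF-16 encodes every character of its input in two bytes. Converting
// a character index into the byte offset the tokenizer reports works for any encoding
// by taking the byte count of the prefix:
private static int CharIndexToByteOffset(string text, int charIndex, Encoding encoding)
{
    return encoding.GetByteCount(text.Substring(0, charIndex));
}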
public void GeneralTest()
{
    ITokenizer tokenizer = new RegExpTokenizer();
    tokenizer.SetTransitionFunction(new TableDrivenTransitionFunction());
    var number = tokenizer.UseTerminal(RegExp.AtLeastOneOf(RegExp.Choice(
        RegExp.Literal('0'), RegExp.Literal('1'), RegExp.Literal('2'),
        RegExp.Literal('3'), RegExp.Literal('4'), RegExp.Literal('5'),
        RegExp.Literal('6'), RegExp.Literal('7'), RegExp.Literal('8'),
        RegExp.Literal('9'))));
    var whitespace = tokenizer.UseTerminal(RegExp.AtLeastOneOf(RegExp.Choice(
        RegExp.Literal(' '),
        RegExp.Literal('\t'),
        RegExp.Literal('\n'))));
    tokenizer.BuildTransitions();

    // Number of tokens:  1  23  45  67    890123  45
    // Indices:           01234567890123456789012345678901
    const string input = "123 456 789 02348 0 3 452 55555";

    int[] tokenClasses = new int[input.Length];
    int[] tokenIndices = new int[input.Length];
    int[] tokenLengths = new int[input.Length];

    int numClass = number.TokenClassID;
    int wsClass = whitespace.TokenClassID;
    int[] expectedTokenClasses = new[]
    {
        numClass, wsClass, numClass, wsClass, numClass, wsClass, numClass, wsClass,
        numClass, wsClass, numClass, wsClass, numClass, wsClass, numClass
    };
    int[] expectedTokenIndices = new[] { 0, 3, 4, 7, 8, 11, 12, 17, 18, 19, 20, 21, 22, 25, 26 };

    var rawInput = Encoding.ASCII.GetBytes(input);
    tokenizer.TokensClasses = tokenClasses;
    tokenizer.TokensIndices = tokenIndices;
    tokenizer.TokensLengths = tokenLengths;

    int tokensNum = tokenizer.Tokenize(rawInput, 0, rawInput.Length) + 1;

    Assert.That(tokensNum, Is.EqualTo(expectedTokenClasses.Length));
    for (int i = 0; i < tokensNum; i++)
    {
        Assert.That(tokenClasses[i], Is.EqualTo(expectedTokenClasses[i]));
        Assert.That(tokenIndices[i], Is.EqualTo(expectedTokenIndices[i]));
    }
}
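// Note: the ten-literal Choice above could presumably be collapsed with RegExp.Range,
// as IgnoreTokenLazyQuantificationTest does below (assumption: passing Encoding.ASCII
// explicitly matches the default encoding used by RegExp.Literal('0') here):
//
//     var number = tokenizer.UseTerminal(
//         RegExp.AtLeastOneOf(RegExp.Range('0', '9', Encoding.ASCII)));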
public void IgnoreTokenLazyQuantificationTest(Encoding encoding)
{
    var tokenizer = new RegExpTokenizer();
    tokenizer.SetTransitionFunction(new TableDrivenTransitionFunction());
    var number = tokenizer.UseTerminal(RegExp.AtLeastOneOf(RegExp.Range('0', '9', encoding)));
    var whitespace = tokenizer.UseTerminal(RegExp.AtLeastOneOf(RegExp.Choice(
        RegExp.Literal(' ', encoding),
        RegExp.Literal('\t', encoding),
        RegExp.Literal('\n', encoding))));
    tokenizer.IgnoreTerminal(RegExp.Sequence(
        RegExp.Literal("/*", encoding),
        RegExp.AnyNumberOf(RegExp.Range((char)0, (char)255, encoding)),
        RegExp.Literal("*/", encoding)));
    tokenizer.BuildTransitions();

    // Number of tokens:  1  23  45        67 89      01
    // Indices:           012345678901234567890123456789
    const string input = "123 456 /*cdnp*/ 87 /*ae*/ 789";

    int bufferLength = encoding.GetByteCount(input);
    int[] tokenClasses = new int[bufferLength];
    int[] tokenIndices = new int[bufferLength];
    int[] tokenLengths = new int[bufferLength];

    int numClass = number.TokenClassID;
    int wsClass = whitespace.TokenClassID;
    int[] expectedTokenClasses = new[]
    {
        numClass, wsClass, numClass, wsClass, wsClass,
        numClass, wsClass, wsClass, numClass
    };

    // Compute expected byte offsets from the lexeme byte lengths; for ASCII this
    // yields { 0, 3, 4, 7, 16, 17, 19, 26, 27 } once the ignored comments are removed.
    var expectedTokenIndices = new List<int>(15);
    var tokens = new[] { "123", " ", "456", " ", "/*cdnp*/", " ", "87", " ", "/*ae*/", " ", "789" };
    expectedTokenIndices.Add(0);
    for (int i = 0; i < tokens.Length; i++)
    {
        string token = tokens[i];
        expectedTokenIndices.Add(expectedTokenIndices[i] + encoding.GetByteCount(token));
    }
    // Remove the start offsets of the ignored comment tokens
    expectedTokenIndices.RemoveAt(8);
    expectedTokenIndices.RemoveAt(4);

    var rawInput = encoding.GetBytes(input);
    tokenizer.TokensClasses = tokenClasses;
    tokenizer.TokensIndices = tokenIndices;
    tokenizer.TokensLengths = tokenLengths;

    int tokensNum = tokenizer.Tokenize(rawInput, 0, rawInput.Length) + 1;

    Assert.That(tokensNum, Is.EqualTo(expectedTokenClasses.Length));
    for (int i = 0; i < tokensNum; i++)
    {
        Assert.That(tokenClasses[i], Is.EqualTo(expectedTokenClasses[i]), "Error on token class comparison: " + i);
        Assert.That(tokenIndices[i], Is.EqualTo(expectedTokenIndices[i]), "Error on token index comparison: " + i);
    }
}
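// The cumulative-offset computation above, factored into a reusable sketch
// (hypothetical helper; the caller still removes the offsets of ignored lexemes,
// exactly as the test does with RemoveAt):
private static List<int> CumulativeByteOffsets(IEnumerable<string> lexemes, Encoding encoding)
{
    var offsets = new List<int> { 0 };
    foreach (var lexeme in lexemes)
        offsets.Add(offsets[offsets.Count - 1] + encoding.GetByteCount(lexeme));
    return offsets;
}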
public void Temptest(string encodingStr)
{
    var encoding = CommonTestRoutines.GetEncoding(encodingStr);
    var tokenizer = new RegExpTokenizer();
    tokenizer.SetTransitionFunction(new TableDrivenTransitionFunction());

    // Exploratory: ignore C-style comments via RegExp.Not instead of the greedy
    // AnyNumberOf(Range(...)) body used in IgnoreTokenLazyQuantificationTest.
    tokenizer.IgnoreTerminal(RegExp.Sequence(
        RegExp.Literal("/*", encoding),
        RegExp.Not(RegExp.Literal("*/", encoding), false)));
    tokenizer.UseTerminal(RegExp.AtLeastOneOf(RegExp.Range('0', '9', encoding)));
    tokenizer.UseTerminal(RegExp.AtLeastOneOf(RegExp.Literal(' ', encoding)));
    tokenizer.BuildTransitions();

    const string input = "/*111*/ 222 /*333*/ 444";
    int bufferLength = encoding.GetByteCount(input);
    tokenizer.TokensClasses = new int[bufferLength];
    tokenizer.TokensIndices = new int[bufferLength];
    tokenizer.TokensLengths = new int[bufferLength];

    var rawInput = encoding.GetBytes(input);
    int tokensNum = tokenizer.Tokenize(rawInput, 0, rawInput.Length) + 1;
    // No assertions yet: this scratch test only checks that tokenization completes.
}
public void LexicalActionTest()
{
    // Classes:              2 13 14 15
    // Indices:              01234567890
    const string SOURCE = @"aa bb cc dd";
    var buffer = Encoding.ASCII.GetBytes(SOURCE);

    int[] tokensIndices = new int[SOURCE.Length];
    int[] tokensClasses = new int[SOURCE.Length];
    int[] tokenLengths = new int[SOURCE.Length];
    var tokenizer = new RegExpTokenizer
    {
        TokensClasses = tokensClasses,
        TokensIndices = tokensIndices,
        TokensLengths = tokenLengths
    };
    tokenizer.SetTransitionFunction(new TableDrivenTransitionFunction());

    var lexicalActionExecuted = false;
    Token tokenBB = null, tokenCC = null;

    tokenizer.UseTerminal(RegExp.Literal(" "));  // class 1
    tokenizer.UseTerminal(RegExp.Literal("aa")); // class 2
    tokenizer.UseTerminal(RegExp.Literal("bb"), t =>
    {
        tokenBB = t;
        lexicalActionExecuted = true;
        return true; // pass this token to the parser
    });
    tokenizer.UseTerminal(RegExp.Literal("cc"), t =>
    {
        tokenCC = t;
        lexicalActionExecuted &= true; // stays true only if the "bb" action ran first
        return false; // ignore this token
    });
    tokenizer.UseTerminal(RegExp.Literal("dd"));
    tokenizer.BuildTransitions();

    var tokensCount = tokenizer.Tokenize(buffer, 0, SOURCE.Length) + 1;

    Assert.That(tokenBB.Buffer == buffer);
    Assert.That(tokenBB.Offset, Is.EqualTo(3));
    Assert.That(tokenBB.Class, Is.EqualTo(3));
    Assert.That(tokenBB.Length, Is.EqualTo(2));

    Assert.That(tokenCC.Buffer == buffer);
    Assert.That(tokenCC.Offset, Is.EqualTo(6));
    Assert.That(tokenCC.Class, Is.EqualTo(4));
    Assert.That(tokenCC.Length, Is.EqualTo(2));

    Assert.True(lexicalActionExecuted, "Some or all of the lexical actions were not executed");

    // "cc" was dropped by its action, so only six tokens remain
    Assert.That(tokensCount, Is.EqualTo(6));
    Assert.That(tokensClasses[0], Is.EqualTo(2)); // aa
    Assert.That(tokensClasses[1], Is.EqualTo(1)); // ' '
    Assert.That(tokensClasses[2], Is.EqualTo(3)); // bb
    Assert.That(tokensClasses[3], Is.EqualTo(1)); // ' '
    Assert.That(tokensClasses[4], Is.EqualTo(1)); // ' ' (cc dropped)
    Assert.That(tokensClasses[5], Is.EqualTo(5)); // dd
}
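// A sketch of the lexical-action contract demonstrated above, using only behavior
// the test asserts: the action receives the matched Token (Buffer/Offset/Class/Length
// already populated) and returns true to keep it in the output arrays or false to
// drop it. The collecting list and method names here are illustrative, not API.
private static readonly List<Token> CollectedTokens = new List<Token>();

private static bool CollectAndKeep(Token t)
{
    CollectedTokens.Add(t); // record the token for later inspection
    return true;            // keep it in TokensClasses/TokensIndices/TokensLengths
}
// Usage: tokenizer.UseTerminal(RegExp.Literal("dd"), CollectAndKeep);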
public void UnendedTokenAtTheEndTest()
{
    // Indices:              00000000001111111111222222222
    //                       01234567890123456789012345678
    const string SAMPLE = "windows.bugsNum=long.Max;Linu";

    var tokensClasses = new int[SAMPLE.Length];
    var tokensIndices = new int[SAMPLE.Length];
    var tokensLengths = new int[SAMPLE.Length];
    var tokenizer = new RegExpTokenizer
    {
        TokensClasses = tokensClasses,
        TokensIndices = tokensIndices,
        TokensLengths = tokensLengths
    };
    tokenizer.SetTransitionFunction(new TableDrivenTransitionFunction());

    tokenizer.UseTerminal(RegExp.Literal("windows")); // class 1
    tokenizer.UseTerminal(RegExp.Literal("."));       // class 2
    tokenizer.UseTerminal(RegExp.Literal("bugsNum")); // class 3
    tokenizer.UseTerminal(RegExp.Literal("="));       // class 4
    tokenizer.UseTerminal(RegExp.Literal("long"));    // class 5
    tokenizer.UseTerminal(RegExp.Literal("Max"));     // class 6
    tokenizer.UseTerminal(RegExp.Literal(";"));       // class 7
    tokenizer.UseTerminal(RegExp.Literal("Linux"));   // class 8
    tokenizer.BuildTransitions();

    var tokensNum = tokenizer.Tokenize(Encoding.ASCII.GetBytes(SAMPLE), 0, SAMPLE.Length) + 1;

    Assert.That(tokensNum == 9);

    Assert.That(tokensIndices[0] == 0);  // windows
    Assert.That(tokensIndices[1] == 7);  // .
    Assert.That(tokensIndices[2] == 8);  // bugsNum
    Assert.That(tokensIndices[3] == 15); // =
    Assert.That(tokensIndices[4] == 16); // long
    Assert.That(tokensIndices[5] == 20); // .
    Assert.That(tokensIndices[6] == 21); // Max
    Assert.That(tokensIndices[7] == 24); // ;
    Assert.That(tokensIndices[8] == 25); // "Linu" (unfinished "Linux")

    Assert.That(tokensClasses[0] == 1);
    Assert.That(tokensClasses[1] == 2);
    Assert.That(tokensClasses[2] == 3);
    Assert.That(tokensClasses[3] == 4);
    Assert.That(tokensClasses[4] == 5);
    Assert.That(tokensClasses[5] == 2);
    Assert.That(tokensClasses[6] == 6);
    Assert.That(tokensClasses[7] == 7);
}
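// Note: "Linu" is a prefix of "Linux" that the input ends before completing, yet
// Tokenize still reports it as a ninth token starting at index 25. Its class is
// deliberately left unasserted above, since classifying an unfinished trailing
// token is up to the tokenizer implementation.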
public void TokenizerClassTest()
{
    // Expected segmentation: aa, aab, abc
    const string SAMPLE = "aaaababc";

    var tokensClasses = new int[SAMPLE.Length];
    var tokensIndices = new int[SAMPLE.Length];
    var tokensLengths = new int[SAMPLE.Length];
    var tokenizer = new RegExpTokenizer
    {
        TokensClasses = tokensClasses,
        TokensIndices = tokensIndices,
        TokensLengths = tokensLengths
    };
    tokenizer.SetTransitionFunction(new TableDrivenTransitionFunction());

    tokenizer.UseTerminal(RegExp.Literal("aab")); // class 1
    tokenizer.UseTerminal(RegExp.Literal("acb")); // class 2
    tokenizer.UseTerminal(RegExp.Literal("abc")); // class 3
    tokenizer.UseTerminal(RegExp.Literal("aa"));  // class 4
    tokenizer.BuildTransitions();

    var tokensNum = tokenizer.Tokenize(Encoding.ASCII.GetBytes(SAMPLE), 0, SAMPLE.Length) + 1;

    Assert.That(tokensNum == 3);

    Assert.That(tokensClasses[0] == 4); // aa
    Assert.That(tokensClasses[1] == 1); // aab
    Assert.That(tokensClasses[2] == 3); // abc

    Assert.That(tokensIndices[0] == 0);
    Assert.That(tokensIndices[1] == 2);
    Assert.That(tokensIndices[2] == 5);
}
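// The segmentation above exercises longest-match selection: at offset 0 only "aa"
// (class 4) can match ("aab" would need 'b' at offset 2), at offset 2 the longer
// "aab" (class 1) beats "aa", and "abc" (class 3) consumes the rest:
//
//     "aaaababc" -> "aa" + "aab" + "abc"   (classes 4, 1, 3 at offsets 0, 2, 5)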