public void EmbeddingTest()
{
    // Lex an embedding whose quoted section contains an escaped quote:  "pre<"\">">post"
    WaebricLexer lexer = new WaebricLexer(new StringReader("\"pre<\"\\\">\">post\""));
    lexer.LexicalizeStream();
    TokenIterator tokens = lexer.GetTokenIterator();

    // The whole input should collapse into exactly one EMBEDDING token
    Assert.AreEqual(1, tokens.GetSize());
    Assert.AreEqual(TokenType.EMBEDDING, tokens.Peek(1).GetType());

    // The embedding's inner token stream must contain these values, in order
    EmbeddingToken embedding = (EmbeddingToken)tokens.NextToken();
    TokenIterator inner = embedding.GetTokenIterator();
    string[] expected = { "\"", "pre", "<", "\\\">", ">", "post", "\"" };

    Assert.AreEqual(expected.Length, inner.GetSize());
    for (int i = 0; i < expected.Length; i++)
    {
        Assert.AreEqual(expected[i], inner.Peek(i + 1).GetValue().ToString());
    }
}
public void ComplexEmbeddingTest()
{
    // Lex an embedding containing a markup call with a quoted attribute value
    // and a quoted text argument:  "<a(href="http://www.microsoft.com") "Microsoft Corp">"
    WaebricLexer lexer = new WaebricLexer(new StringReader("\"<a(href=\"http://www.microsoft.com\") \"Microsoft Corp\">\""));
    lexer.LexicalizeStream();
    TokenIterator tokens = lexer.GetTokenIterator();

    // The whole input should collapse into exactly one EMBEDDING token
    Assert.AreEqual(1, tokens.GetSize());
    Assert.AreEqual(TokenType.EMBEDDING, tokens.Peek(1).GetType());

    // The embedding's inner token stream must contain these values, in order
    // (note the empty pre-text token at position 2)
    EmbeddingToken embedding = (EmbeddingToken)tokens.NextToken();
    TokenIterator inner = embedding.GetTokenIterator();
    string[] expected =
    {
        "\"", "", "<", "a", "(", "href", "=",
        "http://www.microsoft.com", ")", "Microsoft Corp", ">", "\""
    };

    Assert.AreEqual(expected.Length, inner.GetSize());
    for (int i = 0; i < expected.Length; i++)
    {
        Assert.AreEqual(expected[i], inner.Peek(i + 1).GetValue().ToString());
    }
}
/// <summary>
/// Lexicalizes an embedding: a quoted string that contains one or more
/// &lt;...&gt; embeds. Builds an inner token list (opening quote, pre-text,
/// the embed's own tokens, post-text, closing quote) and appends a single
/// EMBEDDING token wrapping that list to TokenStream.
/// </summary>
/// <param name="text">Text scanned before this call; used as the initial
/// buffer contents (the pre-text preceding the first embed).</param>
private void LexicalizeEmbedding(String text)
{
    List<Token> embeddingTokens = new List<Token>();
    String buffer = "";
    char currentChar = '\0';
    char previousChar = '\0';
    bool embedded = false;   // true while scanning inside a <...> embed
    bool quoted = false;     // true while inside a nested "..." section

    // Add opening " token
    embeddingTokens.Add(new Token('"', TokenType.SYMBOL, tokenizer.GetScannedLines()));

    // Seed the buffer with the already-scanned pre-text
    buffer = text;
    // Line number captured up-front so embed content is attributed to the
    // line where the embedding started, not where it ends
    int tempLinenumber = tokenizer.GetScannedLines();

    // Scan until end of embedding found
    currentChar = tokenizer.GetCharacterValue();
    do
    {
        if (CurrentToken == StreamTokenizer.EOF)
        {
            // Abrupt stop of stream before the closing quote
            throw new StreamTokenizerException("Unclosed embedding", tokenizer.GetScannedLines());
        }
        // Unescaped " toggles the nested-quote state; \" does not
        if (currentChar == '"' && previousChar != '\\')
        {
            quoted = !quoted;
        }
        if (currentChar == '<' && !quoted)
        {
            // Detected start of embed: emit everything buffered so far as
            // pre-text (may be an empty string — callers rely on that token)
            embeddingTokens.Add(new Token(buffer, TokenType.TEXT, tokenizer.GetScannedLines()));
            buffer = ""; // Clean buffer
            embedded = true;
        }

        // NOTE(review): relies on tokenizer.ToString() yielding the current
        // token's text — confirm against the StreamTokenizer implementation
        buffer += tokenizer.ToString();
        previousChar = currentChar;

        if (currentChar == '>' && !quoted)
        {
            // Detected end of embed: tokenize the buffered embed content
            // (including the surrounding < and >) into embeddingTokens
            LexicalizeBuffer(embeddingTokens, buffer, tempLinenumber);
            buffer = "";
            embedded = false;
        }

        CurrentToken = tokenizer.NextToken();
        currentChar = tokenizer.GetCharacterValue();
        // Keep scanning until an unescaped " is reached outside any embed
    } while ((currentChar != '"' || previousChar == '\\') || embedded);

    if (!buffer.Equals(""))
    {
        // Emit the post-text that followed the last embed
        embeddingTokens.Add(new Token(buffer, TokenType.TEXT, tokenizer.GetScannedLines()));
    }
    if (CurrentToken != StreamTokenizer.EOF)
    {
        // Attach closing " token
        embeddingTokens.Add(new Token('"', TokenType.SYMBOL, tokenizer.GetScannedLines()));
        // Skip past the closing " so the outer loop resumes after it
        CurrentToken = tokenizer.NextToken();
    }

    // Wrap the collected inner tokens into a single EMBEDDING token
    Token embedding = new EmbeddingToken(embeddingTokens, TokenType.EMBEDDING, tokenizer.GetScannedLines());
    TokenStream.Add(embedding);
}