/// <summary>
/// Drives the tokenizer over the whole character stream, reading it in
/// fixed-size (2048-char) chunks. Strips a leading U+FEFF BOM from the
/// first chunk and carries CR state across chunk boundaries via
/// <c>lastWasCR</c> / <c>UTF16Buffer.Adjust</c>.
/// </summary>
/// <param name="reader">Character source to tokenize; must not be null.</param>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="reader"/> is null.
/// </exception>
private void Tokenize(TextReader reader)
{
    if (reader == null)
    {
        // Fix: the original passed the message text as the paramName
        // argument of the single-arg ctor; use nameof + message overload.
        throw new ArgumentNullException(nameof(reader), "reader was null.");
    }
    tokenizer.Start();
    bool swallowBom = true;
    try
    {
        char[] buffer = new char[2048];
        UTF16Buffer bufr = new UTF16Buffer(buffer, 0, 0);
        bool lastWasCR = false;
        int len = -1;
        // TextReader.Read returns 0 only at end of stream.
        if ((len = reader.Read(buffer, 0, buffer.Length)) != 0)
        {
            int streamOffset = 0;
            int offset = 0;
            int length = len;
            if (swallowBom)
            {
                if (buffer[0] == '\uFEFF')
                {
                    // Offset base of -1 so positions reported downstream
                    // stay aligned with the original stream despite the
                    // swallowed BOM char.
                    streamOffset = -1;
                    offset = 1;
                    length--;
                }
            }
            if (length > 0)
            {
                tokenizer.SetTransitionBaseOffset(streamOffset);
                bufr.Start = offset;
                bufr.End = offset + length;
                while (bufr.HasMore)
                {
                    // Adjust handles a CR that ended the previous
                    // tokenize pass (CR/LF normalization across calls).
                    bufr.Adjust(lastWasCR);
                    lastWasCR = false;
                    if (bufr.HasMore)
                    {
                        lastWasCR = tokenizer.TokenizeBuffer(bufr);
                    }
                }
            }
            // Note: 'length' (len minus a swallowed BOM) — consistent with
            // the -1 base offset used above when a BOM was present.
            streamOffset = length;
            while ((len = reader.Read(buffer, 0, buffer.Length)) != 0)
            {
                tokenizer.SetTransitionBaseOffset(streamOffset);
                bufr.Start = 0;
                bufr.End = len;
                while (bufr.HasMore)
                {
                    bufr.Adjust(lastWasCR);
                    lastWasCR = false;
                    if (bufr.HasMore)
                    {
                        lastWasCR = tokenizer.TokenizeBuffer(bufr);
                    }
                }
                streamOffset += len;
            }
        }
        tokenizer.Eof();
    }
    finally
    {
        // Always signal end-of-tokenization, even on exceptions.
        tokenizer.End();
    }
}
/// <summary>
/// Runs the tokenizer state machine over one buffer of UTF-16 code units,
/// restoring the state saved by the previous call and advancing
/// <c>buffer.Start</c> past the consumed characters.
/// </summary>
/// <param name="buffer">The buffer slice to consume.</param>
/// <returns>
/// The value of <c>lastCR</c> after the pass, i.e. whether processing
/// stopped on a CR that may be followed by an LF in the next buffer.
/// </returns>
public bool TokenizeBuffer(UTF16Buffer buffer)
{
    TokenizerState state = stateSave;
    TokenizerState returnState = returnStateSave;
    char c = '\u0000';
    shouldSuspend = false;
    lastCR = false;

    int start = buffer.Start;
    // Index of the last char read from the buffer; primed one before
    // start so the state loop pre-increments into position.
    int pos = start - 1;

    // In the pure text-emitting states a coalesced run of character
    // tokens can begin immediately at 'start'; in every other state
    // int.MaxValue is the "no run in progress" sentinel for cstart.
    bool inTextState =
        state == TokenizerState.DATA
        || state == TokenizerState.RCDATA
        || state == TokenizerState.SCRIPT_DATA
        || state == TokenizerState.PLAINTEXT
        || state == TokenizerState.RAWTEXT
        || state == TokenizerState.CDATA_SECTION
        || state == TokenizerState.SCRIPT_DATA_ESCAPED
        || state == TokenizerState.SCRIPT_DATA_ESCAPE_START
        || state == TokenizerState.SCRIPT_DATA_ESCAPE_START_DASH
        || state == TokenizerState.SCRIPT_DATA_ESCAPED_DASH
        || state == TokenizerState.SCRIPT_DATA_ESCAPED_DASH_DASH
        || state == TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPE_START
        || state == TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPED
        || state == TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN
        || state == TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPED_DASH
        || state == TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH
        || state == TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPE_END;
    cstart = inTextState ? start : int.MaxValue;

    // [NOCPP[
    pos = StateLoop(state, c, pos, buffer.Buffer, false, returnState, buffer.End);
    // ]NOCPP]

    // pos == buffer.End means the loop exited because the buffer ran out:
    // resume exactly there. Otherwise it stopped on a consumed char, so
    // resume one past it.
    buffer.Start = (pos == buffer.End) ? pos : pos + 1;
    return lastCR;
}
/// <summary>
/// InterLexer variant of the buffer-tokenizing entry point: restores the
/// saved inter-lexer state, marks where a coalesced character-token run
/// may begin, and dispatches to <c>StateLoop3</c>.
/// NOTE(review): this looks like an in-progress refactor — the original
/// StateLoop call is commented out below, StateLoop3 takes no buffer
/// arguments, and <c>pos</c> is never advanced, so <c>buffer.Start</c>
/// ends up back at its original value. Confirm StateLoop3 tracks the
/// buffer position through other shared state.
/// </summary>
/// <param name="buffer">The UTF-16 buffer slice to consume.</param>
/// <returns>Whether the pass ended on a CR (for cross-buffer CR/LF handling).</returns>
public bool TokenizeBuffer(UTF16Buffer buffer)
{
    InterLexerState state = stateSave;
    InterLexerState returnState = returnStateSave;
    char c = '\u0000';
    shouldSuspend = false;
    lastCR = false;
    int start = buffer.Start;
    /**
     * The index of the last <code>char</code> read from <code>buf</code>.
     */
    int pos = start - 1;
    /**
     * The index of the first <code>char</code> in <code>buf</code> that is
     * part of a coalesced run of character tokens or
     * <code>Integer.MAX_VALUE</code> if there is not a current run being
     * coalesced.
     */
    // The text-emitting states live in several per-sublexer enums
    // (CDataLexerState, ScriptDataLexerState) cast to the shared
    // InterLexerState numbering — presumably the numeric values are
    // disjoint across these enums; TODO confirm no collisions.
    switch (state)
    {
        case InterLexerState.s01_DATA_i:
        case (InterLexerState)CDataLexerState.s03_RCDATA_p:
        case (InterLexerState)ScriptDataLexerState.s06_SCRIPT_DATA_p:
        case (InterLexerState)CDataLexerState.s07_PLAINTEXT_p:
        case (InterLexerState)CDataLexerState.s05_RAWTEXT_p:
        case (InterLexerState)CDataLexerState.s68_CDATA_SECTION_p:
        case (InterLexerState)ScriptDataLexerState.s22_SCRIPT_DATA_ESCAPED_p:
        case (InterLexerState)ScriptDataLexerState.s20_SCRIPT_DATA_ESCAPE_START_p:
        case (InterLexerState)ScriptDataLexerState.s21_SCRIPT_DATA_ESCAPE_START_DASH_p:
        case (InterLexerState)ScriptDataLexerState.s23_SCRIPT_DATA_ESCAPED_DASH_p:
        case (InterLexerState)ScriptDataLexerState.s24_SCRIPT_DATA_ESCAPED_DASH_DASH_p:
        case (InterLexerState)ScriptDataLexerState.s28_SCRIPT_DATA_DOUBLE_ESCAPE_START_p:
        case (InterLexerState)ScriptDataLexerState.s29_SCRIPT_DATA_DOUBLE_ESCAPED_p:
        case (InterLexerState)ScriptDataLexerState.s32_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_p:
        case (InterLexerState)ScriptDataLexerState.s30_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_p:
        case (InterLexerState)ScriptDataLexerState.s31_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_p:
        case (InterLexerState)ScriptDataLexerState.s33_SCRIPT_DATA_DOUBLE_ESCAPE_END_p:
            // Text state: a character-token run may start immediately.
            cstart = start;
            break;
        default:
            // Sentinel: no run currently being coalesced.
            cstart = int.MaxValue;
            break;
    }
    /**
     * The number of <code>char</code>s in <code>buf</code> that have
     * meaning. (The rest of the array is garbage and should not be
     * examined.)
     */
    // [NOCPP[
    // Superseded call kept for reference during the StateLoop3 migration:
    //pos = StateLoop(state, c, pos, buffer.Buffer, false, returnState, buffer.End);
    StateLoop3(state, returnState);
    // ]NOCPP]
    // NOTE(review): pos is still start - 1 here, so this branch always
    // takes the else arm and leaves buffer.Start == start — verify this
    // is intended under the StateLoop3 design.
    if (pos == buffer.End)
    {
        // exiting due to end of buffer
        buffer.Start = pos;
    }
    else
    {
        buffer.Start = pos + 1;
    }
    return lastCR;
}
/// <summary>
/// Drives the tokenizer over <c>ActiveStreamReader</c> in chunks of
/// <c>tokenizerBlockChars</c>, stripping a leading BOM, honoring tokenizer
/// suspension (<c>tokenizer.IsSuspended</c>) between and within chunks,
/// and re-checking the stream encoding after each chunk via
/// <c>CheckForReEncode</c>.
/// </summary>
/// <exception cref="ArgumentNullException">
/// Thrown when <c>ActiveStreamReader</c> has not been set.
/// </exception>
private void Tokenize()
{
    if (ActiveStreamReader == null)
    {
        // Fix: the original passed the message text as the paramName
        // argument; name the member actually being validated and keep
        // the original message.
        throw new ArgumentNullException(nameof(ActiveStreamReader), "reader was null.");
    }
    ConfigureTreeBuilderForParsingMode();
    tokenizer.Start();
    bool swallowBom = true;
    try
    {
        char[] buffer = new char[tokenizerBlockChars];
        UTF16Buffer bufr = new UTF16Buffer(buffer, 0, 0);
        bool lastWasCR = false;
        int len = -1;
        // TextReader.Read returns 0 only at end of stream.
        if ((len = ActiveStreamReader.Read(buffer, 0, buffer.Length)) != 0)
        {
            int offset = 0;
            int length = len;
            if (swallowBom)
            {
                if (buffer[0] == '\uFEFF')
                {
                    // Base offset of -1 keeps reported positions aligned
                    // with the original stream despite the swallowed BOM.
                    ActiveStreamOffset = -1;
                    offset = 1;
                    length--;
                }
            }
            if (length > 0)
            {
                tokenizer.SetTransitionBaseOffset(ActiveStreamOffset);
                bufr.Start = offset;
                bufr.End = offset + length;
                while (bufr.HasMore && !tokenizer.IsSuspended)
                {
                    // Adjust handles a CR that ended the previous pass.
                    bufr.Adjust(lastWasCR);
                    lastWasCR = false;
                    if (bufr.HasMore && !tokenizer.IsSuspended)
                    {
                        lastWasCR = tokenizer.TokenizeBuffer(bufr);
                    }
                }
            }
            CheckForReEncode();
            ActiveStreamOffset = length;
            while (!tokenizer.IsSuspended
                && (len = ActiveStreamReader.Read(buffer, 0, buffer.Length)) != 0)
            {
                tokenizer.SetTransitionBaseOffset(ActiveStreamOffset);
                bufr.Start = 0;
                bufr.End = len;
                while (bufr.HasMore && !tokenizer.IsSuspended)
                {
                    bufr.Adjust(lastWasCR);
                    lastWasCR = false;
                    if (bufr.HasMore && !tokenizer.IsSuspended)
                    {
                        lastWasCR = tokenizer.TokenizeBuffer(bufr);
                    }
                }
                ActiveStreamOffset += len;
                CheckForReEncode();
            }
        }
        // Only signal EOF if tokenization ran to completion.
        if (!tokenizer.IsSuspended)
        {
            tokenizer.Eof();
        }
    }
    finally
    {
        tokenizer.End();
    }
}
/// <summary>
/// Tokenizes the stream from <paramref name="reader"/> in 2048-char
/// chunks. When the parsing mode is <c>Auto</c>, the mode (document,
/// content, or fragment with a context element) is detected from the
/// first chunk via <c>GetContext</c> before the tree builder and
/// tokenizer are configured and started.
/// </summary>
/// <param name="reader">Character source to tokenize; must not be null.</param>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="reader"/> is null.
/// </exception>
private void Tokenize(TextReader reader)
{
    if (reader == null)
    {
        // Fix: the original passed the message text as the paramName
        // argument of the single-arg ctor; use nameof + message overload.
        throw new ArgumentNullException(nameof(reader), "reader was null.");
    }
    if (HtmlParsingMode != HtmlParsingMode.Auto)
    {
        // Mode already known: configure and start up front. In Auto mode
        // this is deferred until the first chunk reveals the context.
        ConfigureTreeBuilderForParsingMode();
        tokenizer.Start();
    }
    bool swallowBom = true;
    try
    {
        char[] buffer = new char[2048];
        UTF16Buffer bufr = new UTF16Buffer(buffer, 0, 0);
        bool lastWasCR = false;
        int len = -1;
        // TextReader.Read returns 0 only at end of stream.
        if ((len = reader.Read(buffer, 0, buffer.Length)) != 0)
        {
            if (HtmlParsingMode == HtmlParser.HtmlParsingMode.Auto)
            {
                // Sniff the parsing context from the first chunk.
                string ctx = GetContext(buffer);
                switch (ctx)
                {
                    case "*document":
                        HtmlParsingMode = HtmlParsingMode.Document;
                        break;
                    case "*content":
                        HtmlParsingMode = HtmlParsingMode.Content;
                        break;
                    default:
                        // Any other context string names the fragment's
                        // container element.
                        HtmlParsingMode = HtmlParsingMode.Fragment;
                        treeBuilder.SetFragmentContext(ctx);
                        break;
                }
                ConfigureTreeBuilderForParsingMode();
                tokenizer.Start();
            }
            int streamOffset = 0;
            int offset = 0;
            int length = len;
            if (swallowBom)
            {
                if (buffer[0] == '\uFEFF')
                {
                    // Base offset of -1 keeps reported positions aligned
                    // with the original stream despite the swallowed BOM.
                    streamOffset = -1;
                    offset = 1;
                    length--;
                }
            }
            if (length > 0)
            {
                tokenizer.SetTransitionBaseOffset(streamOffset);
                bufr.Start = offset;
                bufr.End = offset + length;
                while (bufr.HasMore)
                {
                    // Adjust handles a CR that ended the previous pass
                    // (CR/LF normalization across TokenizeBuffer calls).
                    bufr.Adjust(lastWasCR);
                    lastWasCR = false;
                    if (bufr.HasMore)
                    {
                        lastWasCR = tokenizer.TokenizeBuffer(bufr);
                    }
                }
            }
            streamOffset = length;
            while ((len = reader.Read(buffer, 0, buffer.Length)) != 0)
            {
                tokenizer.SetTransitionBaseOffset(streamOffset);
                bufr.Start = 0;
                bufr.End = len;
                while (bufr.HasMore)
                {
                    bufr.Adjust(lastWasCR);
                    lastWasCR = false;
                    if (bufr.HasMore)
                    {
                        lastWasCR = tokenizer.TokenizeBuffer(bufr);
                    }
                }
                streamOffset += len;
            }
        }
        tokenizer.Eof();
    }
    finally
    {
        // Always signal end-of-tokenization, even on exceptions.
        tokenizer.End();
    }
}