public void Run()
{
    if (endOfFile)
    {
        return;
    }

    char[] buffer = null;
    var start = 0;
    var current = 0;
    var end = 0;

    if (!input.ReadMore(ref buffer, ref start, ref current, ref end))
    {
        // cannot decode more data until next input chunk is available
        return;
    }

    if (input.EndOfFile)
    {
        endOfFile = true;
    }

    if (end - start != 0)
    {
        if (!gotAnyText)
        {
            if (output is ConverterEncodingOutput)
            {
                var encodingOutput = output as ConverterEncodingOutput;

                if (encodingOutput.CodePageSameAsInput)
                {
                    if (input is ConverterDecodingInput)
                    {
                        encodingOutput.Encoding = (input as ConverterDecodingInput).Encoding;
                    }
                    else
                    {
                        encodingOutput.Encoding = Encoding.UTF8;
                    }
                }
            }

            gotAnyText = true;
        }

        output.Write(buffer, start, end - start);
        input.ReportProcessed(end - start);
    }

    if (endOfFile)
    {
        output.Flush();
    }
}
public TextTokenId Parse()
{
    char ch, chT;
    CharClass charClass, charClassT;
    bool forceFlushToken;
    int runStart;

    if (tokenBuilder.Valid)
    {
        // start the new token
        input.ReportProcessed(parseCurrent - parseStart);
        parseStart = parseCurrent;

        tokenBuilder.Reset();
        InternalDebug.Assert(tokenBuilder.TotalLength == 0);
    }

    // main loop: refill the buffer if needed, skip/compact invalid characters, then collect text runs
    while (true)
    {
        InternalDebug.Assert(parseThreshold > 0);

        // try to read and decode more input data if necessary
        forceFlushToken = false;

        if (parseCurrent + parseThreshold > parseEnd)
        {
            if (!endOfFile)
            {
                if (!input.ReadMore(ref parseBuffer, ref parseStart, ref parseCurrent, ref parseEnd))
                {
                    // cannot decode more data until next input chunk is available;
                    // we may have an incomplete token at this point
                    InternalDebug.Assert(!tokenBuilder.Valid);
                    return TextTokenId.None;
                }

                // NOTE: in case of success, ReadMore can move the token in the buffer and/or
                // switch to a new buffer
                tokenBuilder.BufferChanged(parseBuffer, parseStart);

                var decodingInput = input as ConverterDecodingInput;

                if (decodingInput != null && decodingInput.EncodingChanged)
                {
                    // reset the flag as required by the ConverterInput protocol
                    decodingInput.EncodingChanged = false;

                    // signal the encoding change to the caller
                    return tokenBuilder.MakeEmptyToken(TextTokenId.EncodingChange, decodingInput.Encoding.CodePage);
                }

                if (input.EndOfFile)
                {
                    endOfFile = true;
                }

                if (!endOfFile && parseEnd - parseStart < input.MaxTokenSize)
                {
                    // we have successfully read "something", ensure it is above the threshold
                    continue;
                }
            }

            // end of file, or the token is too long; need to flush the token as is (split)
            forceFlushToken = true;
        }

        // we should have read something unless this is EOF
        InternalDebug.Assert(parseEnd > parseCurrent || forceFlushToken);

        // compact the buffer so that the next character (or the next parseThreshold characters) is valid

        // get the next input character
        ch = parseBuffer[parseCurrent];
        charClass = ParseSupport.GetCharClass(ch);

        if (ParseSupport.InvalidUnicodeCharacter(charClass) || parseThreshold > 1)
        {
            // skip leading invalid characters
            while (ParseSupport.InvalidUnicodeCharacter(charClass) && parseCurrent < parseEnd)
            {
                ch = parseBuffer[++parseCurrent];
                charClass = ParseSupport.GetCharClass(ch);
            }

            if (parseThreshold > 1 && parseCurrent + 1 < parseEnd)
            {
                InternalDebug.Assert(parseCurrent == parseStart);
                InternalDebug.Assert(!ParseSupport.InvalidUnicodeCharacter(ParseSupport.GetCharClass(parseBuffer[parseCurrent])));

                // squeeze out any embedded invalid characters so that at least parseThreshold
                // valid characters are contiguous at parseCurrent
                var src = parseCurrent + 1;
                var dst = parseCurrent + 1;

                while (src < parseEnd && dst < parseCurrent + parseThreshold)
                {
                    chT = parseBuffer[src];
                    charClassT = ParseSupport.GetCharClass(chT);

                    if (!ParseSupport.InvalidUnicodeCharacter(charClassT))
                    {
                        if (src != dst)
                        {
                            InternalDebug.Assert(ParseSupport.InvalidUnicodeCharacter(ParseSupport.GetCharClass(parseBuffer[dst])));

                            parseBuffer[dst] = chT;     // move source character
                            parseBuffer[src] = '\0';    // replace source character with an invalid (zero) one
                        }

                        dst++;
                    }

                    src++;
                }

                if (src == parseEnd && parseCurrent + parseThreshold > dst)
                {
                    Array.Copy(parseBuffer, parseCurrent, parseBuffer, parseEnd - (dst - parseCurrent), dst - parseCurrent);

                    parseCurrent = parseEnd - (dst - parseCurrent);

                    // reporting all invalid characters consumed
                    input.ReportProcessed(parseCurrent - parseStart);
                    parseStart = parseCurrent;
                }
            }

            if (parseCurrent + parseThreshold > parseEnd)
            {
                // we are still below the threshold...
                if (!forceFlushToken)
                {
                    // go back and try to read more
                    continue;
                }

                // this is the end of file

                if (parseCurrent == parseEnd && !tokenBuilder.IsStarted && endOfFile)
                {
                    // EOF and the token is empty, just return the EOF token
                    break;
                }

                // this is the end of file; we cannot get above the threshold but we still have some input data
            }

            // reset the threshold to its default value
            parseThreshold = 1;
        }

        // now parse the buffer content
        runStart = parseCurrent;

        InternalDebug.Assert(!tokenBuilder.IsStarted);
        tokenBuilder.StartText(runStart);

        // collect text runs while the token builder can accept more
        while (tokenBuilder.PrepareToAddMoreRuns(9, runStart, RunKind.Text))
        {
            while (ParseSupport.TextUriCharacter(charClass))
            {
                ch = parseBuffer[++parseCurrent];
                charClass = ParseSupport.GetCharClass(ch);
            }

            if (ParseSupport.TextNonUriCharacter(charClass))
            {
                if (parseCurrent != runStart)
                {
                    // we have a nonempty non-whitespace run
                    AddTextRun(RunTextType.NonSpace, runStart, parseCurrent);
                }

                runStart = parseCurrent;

                do
                {
                    ch = parseBuffer[++parseCurrent];
                    charClass = ParseSupport.GetCharClass(ch);
                }
                while (ParseSupport.NbspCharacter(charClass));

                AddTextRun(RunTextType.NonSpace, runStart, parseCurrent);
            }
            else if (ParseSupport.WhitespaceCharacter(charClass))
            {
                if (parseCurrent != runStart)
                {
                    // we have a nonempty non-whitespace run
                    AddTextRun(RunTextType.NonSpace, runStart, parseCurrent);
                }

                runStart = parseCurrent;

                if (ch == ' ')
                {
                    // ordinary space
                    chT = parseBuffer[parseCurrent + 1];
                    charClassT = ParseSupport.GetCharClass(chT);

                    if (!ParseSupport.WhitespaceCharacter(charClassT))
                    {
                        // add a single space to the text run
                        ch = chT;
                        charClass = charClassT;
                        parseCurrent++;

                        AddTextRun(RunTextType.Space, runStart, parseCurrent);

                        runStart = parseCurrent;
                        continue;
                    }
                }

                // this is potentially collapsible whitespace, accumulate whitespace run(s)
                ParseWhitespace(ch, charClass);

                if (parseThreshold > 1)
                {
                    // terminate the text parse loop to read more data
                    break;
                }

                runStart = parseCurrent;

                ch = parseBuffer[parseCurrent];
                charClass = ParseSupport.GetCharClass(ch);
            }
            else if (ParseSupport.NbspCharacter(charClass))
            {
                if (parseCurrent != runStart)
                {
                    AddTextRun(RunTextType.NonSpace, runStart, parseCurrent);
                }

                runStart = parseCurrent;

                do
                {
                    ch = parseBuffer[++parseCurrent];
                    charClass = ParseSupport.GetCharClass(ch);
                }
                while (ParseSupport.NbspCharacter(charClass));

                AddTextRun(RunTextType.Nbsp, runStart, parseCurrent);
            }
            else
            {
                InternalDebug.Assert(ParseSupport.InvalidUnicodeCharacter(charClass));

                // finish the "non-whitespace" run
                if (parseCurrent != runStart)
                {
                    AddTextRun(RunTextType.NonSpace, runStart, parseCurrent);
                }

                if (parseCurrent >= parseEnd)
                {
                    // end of available input (EOB), flush the current text token
                    break;
                }

                // this is just an embedded invalid character, skip any such invalid
                // characters and try to continue collecting text
                do
                {
                    ch = parseBuffer[++parseCurrent];
                    charClass = ParseSupport.GetCharClass(ch);
                }
                while (ParseSupport.InvalidUnicodeCharacter(charClass) && parseCurrent < parseEnd);
            }

            // prepare for a new run
            runStart = parseCurrent;
        }

        if (token.IsEmpty)
        {
            // the text token is empty, we need more data
            tokenBuilder.Reset();   // reset the open text token

            // reporting everything below parseCurrent consumed
            input.ReportProcessed(parseCurrent - parseStart);
            parseStart = parseCurrent;

            continue;
        }

        // finish the current text token, return anything we have collected so far
        tokenBuilder.EndText();

        return (TextTokenId)token.TokenId;
    }

    return tokenBuilder.MakeEmptyToken(TextTokenId.EndOfFile);
}
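// The sketch below is a hypothetical usage illustration, not part of this class: it shows how a
// caller might drive Parse() given the token ids it returns above. Only Parse() and the
// TextTokenId values come from this file; "parser", "SupplyNextInputChunk" and
// "ConsumeTextToken" are placeholder names invented for the example.
//
//     TextTokenId tokenId;
//
//     while ((tokenId = parser.Parse()) != TextTokenId.EndOfFile)
//     {
//         if (tokenId == TextTokenId.None)
//         {
//             // Parse() could not decode more data; feed the next input chunk before retrying
//             SupplyNextInputChunk();
//         }
//         else if (tokenId == TextTokenId.EncodingChange)
//         {
//             // the decoding input switched encodings; reconfigure the output side if needed
//         }
//         else
//         {
//             // a text token was produced; consume the runs accumulated by the token builder
//             ConsumeTextToken();
//         }
//     }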