/// <summary> /// Internal: parses tag that started from current position /// </summary> /// <returns>HTMLchunk with tag information</returns> internal HTMLchunk ParseTag(ref int curPos) { /* * WARNING: this code was optimised for performance rather than for readability, * so be extremely careful at changing it -- your changes could easily result in wrongly parsed HTML * * This routine takes about 60% of CPU time, in theory its the best place to gain extra speed, * but I've spent plenty of time doing it, so it won't be easy... and if it is easy then please post * your changes for everyone to enjoy! * */ //var whiteSpaceHere = false; //var paramValue = false; byte c = 0; byte cPeek = 0; // if true it means we have parsed complete tag //var gotTag = false; //var equalIdx = 0; // we reach this function immediately after tag's byte (<) was // detected, so we need to save it in order to keep correct HTML copy // _hunk.Append((byte)'<'); // (byte)'<' /* * _chunk.Buffer[0] = 60; * _chunk.BufPos = 1; * _chunk.HTMLen = 1; */ // initialise peeked char - this will point to the next after < character if (curPos < _dataLength) { cPeek = _html[curPos]; // in case of comments ! 
must follow immediately after < if (cPeek == (byte)'!') { if (curPos + 2 < _dataLength && _html[curPos + 1] == (byte)'-' && _html[curPos + 2] == (byte)'-') { // we detected start of comments here, instead of parsing the rest here we will // call special function tuned to do the job much more effectively _chunk.Tag = "!--"; _chunk.Type = HTMLchunkType.Comment; _chunk.Comments = true; // _chunk.Append((byte)'!'); // _chunk.Append((byte)'-'); // _chunk.Append((byte)'-'); curPos += 3; _chunk = ParseComments(ref curPos, out bool fullTag); _chunk.ChunkLength = curPos - _chunk.ChunkOffset; if (_p.AutoKeepComments || _p.KeepRawHTML) { if (!_p.AutoExtractBetweenTagsOnly) { _chunk.Html = GetString(_chunk.ChunkOffset, _chunk.ChunkLength); } else { _chunk.Html = GetString(_chunk.ChunkOffset + 4, _chunk.ChunkLength - fullTag ? 7 : 4); } } return(_chunk); } // ok we might have here CDATA element of XML: // ref: http://www.w3schools.com/xml/xml_cdata.asp if (curPos + 7 < _dataLength && _html[curPos + 1] == (byte)'[' && _html[curPos + 2] == (byte)'C' && _html[curPos + 3] == (byte)'D' && _html[curPos + 4] == (byte)'A' && _html[curPos + 5] == (byte)'T' && _html[curPos + 6] == (byte)'A' && _html[curPos + 7] == (byte)'[') { // we detected start of comments here, instead of parsing the rest here we will // call special function tuned to do the job much more effectively _chunk.Tag = "![CDATA["; _chunk.Type = HTMLchunkType.Comment; _chunk.Comments = true; // _chunk.Append((byte)'!'); // _chunk.Append((byte)'-'); // _chunk.Append((byte)'-'); curPos += 8; _chunk = ParseCDATA(ref curPos, out bool fullTag); _chunk.ChunkLength = curPos - _chunk.ChunkOffset; if (_p.AutoKeepComments || _p.KeepRawHTML) { if (!_p.AutoExtractBetweenTagsOnly) { _chunk.Html = GetString(_chunk.ChunkOffset, _chunk.ChunkLength); } else { _chunk.Html = GetString(_chunk.ChunkOffset + 4 + 5, _chunk.ChunkLength - fullTag ? 
7 + 5 : 4 + 5); } } return(_chunk); } } } else { // empty tag but its not closed, so we will call it open... _chunk.Type = HTMLchunkType.OpenTag; // end of data... before it started return(_chunk); } // tag ID, non-zero if matched by heuristics engine var tagId = 0; // STAGE 0: lets try some heuristics to see if we can quickly identify most common tags // that should be present most of the time, this should save a lot of looping and string creation if (EnableHeuristics && curPos < _maxHeuDataLength) { // check if we have got closure of the tag if (cPeek == (byte)'/') { _chunk.Closure = true; _chunk.EndClosure = false; _chunk.Type = HTMLchunkType.CloseTag; curPos++; cPeek = _html[curPos]; } c = _html[curPos + 1]; // probability of having a match is very high (or so we expect) tagId = _he.MatchTag(cPeek, c); if (tagId != 0) { if (tagId < 0) { tagId *= -1; // single character tag _chunk.Tag = _he.GetString(tagId); // see if we got fully closed tag if (c == (byte)'>') { curPos += 2; goto ReturnChunk; } cPeek = c; curPos++; // everything else means we need to continue scanning as we may have params and stuff goto AttributeParsing; } else { // ok, we have here 2 or more character string that we need to check further // often when we have full 2 char match the next char will be >, if that's the case // then we definately matched our tag var nextChar = _html[curPos + 2]; if (nextChar == (byte)'>') { //oChunk.sTag=oHE.GetString(iTagID); _chunk.Tag = _he.GetTwoCharString(cPeek, c); curPos += 3; goto ReturnChunk; } // ok, check next char for space, if that's the case we still got our tag // but need to skip to attribute parsing if (nextChar == (byte)' ') { //_chunk.Tag = _he.GetString(tagId); _chunk.Tag = _he.GetTwoCharString(cPeek, c); curPos += 2; cPeek = nextChar; goto AttributeParsing; } // ok, we are not very lucky, but it is still worth fighting for // now we need to check fully long string against what we have matched, maybe // we got exact match and we can avoid full 
parsing of the tag var tag = _he.GetStringData(tagId); if (curPos + tag.Length + 5 >= _dataLength) { goto TagParsing; } // in a loop (and this is not an ideal solution, but still) for (int i = 2; i < tag.Length; i++) { // if a single char is not matched, then we if (tag[i] != _html[curPos + i]) { goto TagParsing; } } // ok we matched full long word, but we need to be sure that char // after the word is ' ' or '>' as otherwise we may have matched prefix of even longer word nextChar = _html[curPos + tag.Length]; if (nextChar == (byte)'>') { _chunk.Tag = _he.GetString(tagId); curPos += tag.Length + 1; goto ReturnChunk; } if (nextChar == (byte)' ') { cPeek = nextChar; _chunk.Tag = _he.GetString(tagId); curPos += tag.Length; goto AttributeParsing; } // no luck: we need to parse tag fully as our heuristical matching failed miserably :'o( } } } TagParsing: _text.Clear(); var charType = 0; // STAGE 1: parse tag (anything until > or /> or whitespace leading to start of attribute) while (cPeek != 0) { charType = _tagCharTypes[cPeek]; //if (cPeek <= 32 && whiteSpace[cPeek] == 1) if (charType == (byte)TagCharType.WhiteSpace) { curPos++; // speculative loop unroll -- we have a very good chance of seeing non-space char next // so instead of setting up loop we will just read it directly, this should save ticks // on having to prepare while() loop if (curPos < _dataLength) { c = _html[curPos++]; } else { c = 0; } charType = _tagCharTypes[c]; //if (c == ' ' || c == '\t' || c == 13 || c == 10) //if (c <= 32 && whiteSpace[c] == 1) if (charType == (byte)TagCharType.WhiteSpace) { while (curPos < _dataLength) { c = _html[curPos++]; charType = _tagCharTypes[c]; if (charType == (byte)TagCharType.WhiteSpace) //if(c != ' ' && c != '\t' && c != 13 && c != 10) { //cPeek = _html[curPos]; continue; } break; } if (curPos >= _dataLength) { c = 0; } } //whiteSpaceHere = true; // now, if we have already got tag it means that we are most likely // going to need to parse tag attributes if 
(_text._bufPos > 0) { _chunk.Tag = _text.SetToStringASCII(); // _chunk.Append((byte)' '); curPos--; if (curPos < _dataLength) { cPeek = _html[curPos]; } else { cPeek = 0; } break; } } else { // reuse Peeked char from previous run //c = cPeek; curPos++; if (curPos < _dataLength) { c = _html[curPos++]; } else { c = 0; } } if (curPos < _dataLength) { cPeek = _html[curPos]; } else { cPeek = 0; } // most likely we should have lower-cased ASCII char if (charType == (byte)TagCharType.LowerCasedASCIIorDigit) { _text._buffer[_text._bufPos++] = c; // _chunk.Append(c); continue; } // tag end - we did not have any params if (c == (byte)'>') { if (_text._bufPos > 0) { _chunk.Tag = _text.SetToStringASCII(); } if (!_chunk.Closure) { _chunk.Type = HTMLchunkType.OpenTag; } return(_chunk); } // closure of tag sign if (c == (byte)'/') { _chunk.Closure = true; _chunk.EndClosure = (_text._bufPos > 0); _chunk.Type = HTMLchunkType.CloseTag; continue; } // 03/08/08 XML support: ?xml tags - grrr if (c == (byte)'?') { _text._buffer[_text._bufPos++] = c; continue; } // nope, we have got upper cased ASCII char - this seems to be LESS likely than > and / //if (c >= 65 && c <= 90) if (charType > 32) { // bCharType in this case contains already lower-cased char _text._buffer[_text._bufPos++] = charType; // _chunk.Append(bCharType); continue; } // we might have namespace : sign here - all text before would have to be // saved as namespace and we will need to continue parsing actual tag if (charType == (byte)TagCharType.NameSpaceColon) { // ok here we got a choice - we can just continue and treat the whole // thing as a single tag with namespace stuff prefixed, OR // we can separate first part into namespace and keep tag as normal _text._buffer[_text._bufPos++] = (byte)':'; continue; } // ok, we have got some other char - we break out to deal with it in attributes part break; } if (cPeek == 0) { return(_chunk); } // if true then equal sign was found //var equalsSign = false; // STAGE 2: parse 
attributes (if any available) // attribute name can be standalone or with value after = // attribute itself can't have entities or anything like this - we expect it to be in ASCII characters AttributeParsing: string attrName; if (tagId != 0) { // first, skip whitespace: if (cPeek <= 32 && _tagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace) { // most likely next char is not-whitespace curPos++; if (curPos >= _dataLength) { goto ReturnChunk; } cPeek = _html[curPos]; if (cPeek <= 32 && _tagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace) { // ok long loop here then while (curPos < _dataLength) { cPeek = _html[curPos++]; if (cPeek <= 32 && _tagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace) { continue; } break; } if (cPeek == (byte)'>') { goto ReturnChunk; } curPos--; if (curPos >= _dataLength) { goto ReturnChunk; } } if (curPos >= _dataLength) { goto ReturnChunk; } } // ok we have got matched tag, it is possible that we might be able to quickly match // attribute name known to be used for that tag: var attrId = _he.MatchAttr(cPeek, tagId); if (attrId > 0) { var attr = _he.GetAttrData(attrId); if (curPos + attr.Length + 2 >= _dataLength) { goto ActualAttributeParsing; } // in a loop (and this is not an ideal solution, but still) for (var i = 1; i < attr.Length; i++) { // if a single char is not matched, then we if (attr[i] != _html[curPos + i]) { goto ActualAttributeParsing; } } var nextChar = _html[curPos + attr.Length]; // ok, we expect next symbol to be = if (nextChar == (byte)'=') { attrName = _he.GetAttr(attrId); curPos += attr.Length + 1; cPeek = _html[curPos]; goto AttributeValueParsing; } } } ActualAttributeParsing: _text.Clear(); // doing exactly the same thing as in tag parsing while (cPeek != 0) { charType = _tagCharTypes[cPeek]; //if (cPeek <= 32 && whiteSpace[cPeek] == 1) if (charType == (byte)TagCharType.WhiteSpace) { curPos++; // speculative loop unroll -- we have a very good chance of seeing non-space char next // so instead of setting up loop we 
will just read it directly, this should save ticks // on having to prepare while() loop if (curPos < _dataLength) { c = _html[curPos++]; } else { cPeek = 0; break; } charType = _tagCharTypes[c]; //if (c == ' ' || c == '\t' || c == 13 || c == 10) //if (c <= 32 && whiteSpace[c] == 1) if (charType == (byte)TagCharType.WhiteSpace) { while (curPos < _dataLength) { c = _html[curPos++]; charType = _tagCharTypes[c]; if (charType == (byte)TagCharType.WhiteSpace) //if(c != ' ' && c != '\t' && c != 13 && c != 10) { //cPeek = _html[curPos]; continue; } //if (c == (byte)'>') // goto ReturnChunk; //curPos--; break; } if (curPos >= _dataLength) { c = 0; cPeek = 0; break; } } //whiteSpaceHere = true; // now, if we have already got attribute name it means that we need to go to parse value (which may not be present) if (_text._bufPos > 0) { // _chunk.Append((byte)' '); curPos--; if (curPos < _dataLength) { cPeek = _html[curPos]; } else { cPeek = 0; } // ok, we have got attribute name and now we have got next char there // most likely we have got = here and then value if (cPeek == (byte)'=') { //equalsSign = true; // move forward one char curPos++; if (curPos < _dataLength) { cPeek = _html[curPos]; } else { cPeek = 0; } break; } // or we can have end of tag itself, doh! 
if (cPeek == (byte)'>') { // move forward one char curPos++; if (_text._bufPos > 0) { _chunk.AddParam(_text.SetToStringASCII(), "", (byte)' '); } if (!_chunk.Closure) { _chunk.Type = HTMLchunkType.OpenTag; } return(_chunk); } // closure if (cPeek == (byte)'/') { _chunk.Closure = true; _chunk.EndClosure = true; _chunk.Type = HTMLchunkType.CloseTag; continue; } // ok, we have got new char starting after current attribute name is fully parsed // this means the attribute name is on its own and the char we found is start // of a new attribute _chunk.AddParam(_text.SetToStringASCII(), "", (byte)' '); _text.Clear(); goto AttributeParsing; } } else { // reuse Peeked char from previous run //c = cPeek; curPos++; if (curPos < _dataLength) { c = _html[curPos++]; } else { c = 0; } } if (curPos < _dataLength) { cPeek = _html[curPos]; } else { cPeek = 0; } // most likely we should have lower-cased ASCII char here if (charType == (byte)TagCharType.LowerCasedASCIIorDigit) { _text._buffer[_text._bufPos++] = c; // _chunk.Append(cChar); continue; } // = with attribute value to follow if (c == (byte)'=') { //equalsSign=true; break; } // nope, we have got upper cased ASCII char - this seems to be LESS likely than > and / //if(c >= 65 && c <= 90) if (charType > 32) { // bCharType in this case contains already lower-cased char _text._buffer[_text._bufPos++] = charType; // _chunk.Append(bCharType); continue; } // tag end - we did not have any params if (c == (byte)'>') { if (_text._bufPos > 0) { _chunk.AddParam(_text.SetToStringASCII(), "", (byte)' '); } if (!_chunk.Closure) { _chunk.Type = HTMLchunkType.OpenTag; } return(_chunk); } // closure of tag sign if (c == (byte)'/') { _chunk.Closure = true; _chunk.EndClosure = true; _chunk.Type = HTMLchunkType.CloseTag; continue; } // some other char _text._buffer[_text._bufPos++] = c; // _chunk.Append(cChar); } if (cPeek == 0) { if (_text._bufPos > 0) { _chunk.AddParam(_text.SetToStringASCII(), "", (byte)' '); } if (!_chunk.Closure) { 
_chunk.Type = HTMLchunkType.OpenTag; } return(_chunk); } attrName = _text.SetToStringASCII(); AttributeValueParsing: /// *********************************************************************** /// STAGE 3: parse attribute value /// *********************************************************************** // the value could be just string, or in quotes (single or double) // or we can have next attribute name start, in which case we will jump back to attribute parsing // for tracking quotes purposes var quotes = cPeek; int valueStartOffset; // skip whitespace if any if (cPeek <= 32 && _tagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace) { curPos++; // speculative loop unroll -- we have a very good chance of seeing non-space char next // so instead of setting up loop we will just read it directly, this should save ticks // on having to prepare while() loop if (curPos < _dataLength) { cPeek = _html[curPos]; } else { valueStartOffset = curPos - 1; goto AttributeValueEnd; } //if (c == ' ' || c == '\t' || c == 13 || c == 10) //if (c <= 32 && whiteSpace[c] == 1) if (cPeek <= 32 && _tagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace) { while (curPos < _dataLength) { cPeek = _html[curPos++]; if (cPeek <= 32 && _tagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace) //if(c != ' ' && c != '\t' && c != 13 && c != 10) { //cPeek = _html[curPos]; continue; } curPos--; break; } if (curPos >= _dataLength) { valueStartOffset = curPos - 1; goto AttributeValueEnd; } } quotes = cPeek; } // because we deal with VALUE of the attribute it means we can't lower-case it, // or skip whitespace (if in quotes), which in practice means that we don't need to copy // it to temporary string buffer, we can just remember starting offset and then create string from // data in bHTML // ok, first char can be one of the quote chars or something else if (cPeek != '\"' && cPeek != '\'') { valueStartOffset = curPos; quotes = (byte)' '; // any other char here means we have value up until next whitespace or end 
of tag // this gives us good opportunity to scan fairly quickly without otherwise redundant // checks - this should happen fairly rarely, however loop dealing with data between quotes // will happen often enough and its best to eliminate as much stuff from it as possible //sText.bBuffer[sText.iBufPos++]=cPeek; // move to next char if (curPos < _dataLength) { cPeek = _html[curPos++]; } else { goto AttributeValueEnd; } while (cPeek != 0) { // if whitespace then we got our value and need to go back to param if (cPeek <= 32 && _tagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace) { _chunk.AddParam(attrName, GetString(valueStartOffset, curPos - valueStartOffset - 1), (byte)' '); curPos--; goto AttributeParsing; } // end of tag? if (cPeek == (byte)'>') { //curPos--; break; } if (curPos < _dataLength) { cPeek = _html[curPos++]; } else { curPos = _dataLength + 1; goto AttributeValueEnd; } } // ok we are done, add outstanding attribute _chunk.AddParam(attrName, GetString(valueStartOffset, curPos - valueStartOffset - 1), (byte)' '); goto ReturnChunk; } // move one step forward curPos++; valueStartOffset = curPos; if (curPos < _dataLength) { cPeek = _html[curPos++]; } else { goto AttributeValueEnd; } // attribute value parsing from between two quotes while (cPeek != 0) { // check whether we have got possible entity (can be anything starting with &) if (cPeek == 38) { var prevPos = curPos; var entityChar = _e.CheckForEntity(_html, ref curPos, _dataLength); // restore current symbol if (entityChar == 0) { if (curPos < _dataLength) { cPeek = _html[curPos++]; } else { break; } //_text.Buffer[_text.BufPos++] = 38; //(byte)'&';; continue; } else { // okay we have got an entity, our hope of not having to copy stuff into variable // is over, we have to continue in a slower fashion :( // but thankfully this should happen very rarely, so, annoying to code, but // most codepaths will run very fast! 
var preEntLen = prevPos - valueStartOffset - 1; // 14/05/08 need to clear text - it contains attribute name text _text.Clear(); // copy previous data if (preEntLen > 0) { Array.Copy(_html, valueStartOffset, _text._buffer, 0, preEntLen); _text._bufPos = preEntLen; } // we have to skip now to next byte, since // some converted chars might well be control chars like > _chunk.Entities = true; if (c == (byte)'<') { _chunk.LtEntity = true; } // unless is space we will ignore it // note that this won't work if is defined as it should // byte int value of 160, rather than 32. //if (c != ' ') _text.Append(entityChar); if (curPos < _dataLength) { cPeek = _html[curPos++]; } else { goto AttributeValueEnd; } // okay, we continue here using in effect new inside loop as we might have more entities here // attribute value parsing from between two quotes while (cPeek != 0) { // check whether we have got possible entity (can be anything starting with &) if (cPeek == 38) { var newEntityChar = _e.CheckForEntity(_html, ref curPos, _dataLength); // restore current symbol if (newEntityChar != 0) { if (newEntityChar == (byte)'<') { _chunk.LtEntity = true; } _text.Append(newEntityChar); if (curPos < _dataLength) { cPeek = _html[curPos++]; } else { goto AttributeValueEnd; } continue; } } // check if is end of quotes if (cPeek == quotes) { // ok we finished scanning it: add param with value and then go back to param name parsing _chunk.AddParam(attrName, _text.SetToString(), quotes); if (curPos < _dataLength) { cPeek = _html[curPos]; } else { break; } goto AttributeParsing; } _text._buffer[_text._bufPos++] = cPeek; //_text.Append(cPeek); if (curPos < _dataLength) { cPeek = _html[curPos++]; } else { break; } } _chunk.AddParam(attrName, _text.SetToString(), quotes); goto ReturnChunk; } } // check if is end of quotes if (cPeek == quotes) { // ok we finished scanning it: add param with value and then go back to param name parsing //_text.Clear(); _chunk.AddParam(attrName, 
GetString(valueStartOffset, curPos - valueStartOffset - 1), quotes); if (curPos < _dataLength) { cPeek = _html[curPos]; } else /*curPos++;*/ break { ; } goto AttributeParsing; } if (curPos < _dataLength) { cPeek = _html[curPos++]; } else /*curPos++;*/ break { ; }
/// <summary>
/// Parses the next chunk starting at the current position and returns it.
/// A leading '&lt;' is handed to <c>GetNextTag()</c>; anything else is scanned as text
/// up to the next '&lt;' or the end of the data.
/// </summary>
/// <returns>HTMLchunk, or null if the end of data has been reached</returns>
public HTMLchunk ParseNext()
{
    if (_curPos >= _dataLength)
    {
        return null;
    }

    _chunk.Clear();
    _chunk.ChunkOffset = _curPos;

    var c = _html[_curPos++];

    // tag parsing route - no text chars were consumed before this point
    if (c == (byte)'<')
    {
        return GetNextTag();
    }

    // Whitespace directly in front of a tag (typically line ends between tags) is
    // returned as a fixed single-space string instead of building a new string.
    if (CompressWhiteSpaceBeforeTag && c <= 32 && _whiteSpace[c] == 1)
    {
        while (_curPos < _dataLength)
        {
            c = _html[_curPos++];

            if (c <= 32 && _whiteSpace[c] == 1)
            {
                continue;
            }

            if (c == (byte)'<')
            {
                // leave the '<' unread so the next call parses the tag
                _curPos--;
                _chunk.Type = HTMLchunkType.Text;
                _chunk.Html = " ";
                return _chunk;
            }

            // some other char: this is ordinary text, fall through to the text scan
            break;
        }
    }

    if (!_e.CanDecodeEntities && !_e.MiniEntities)
    {
        // no entity handling requested: just find the next tag start (or end of data)
        while (_curPos < _dataLength && _html[_curPos] != (byte)'<')
        {
            _curPos++;
        }
    }
    else
    {
        // TODO: scanning in 4-byte groups might help here, but it needs a reliable
        // bitmask test that stops as soon as any byte of the group is '&' or '<'.

        // On entry c is the most recently consumed text char; each pass first checks
        // it for an entity start and then moves on to the next char.
        while (true)
        {
            if (c == (byte)'&')
            {
                // offset of the '&' itself (_curPos is already one past it)
                var entityStart = _curPos - 1;
                var entityChar = _e.CheckForEntity(_html, ref _curPos, _dataLength);

                if (entityChar != 0)
                {
                    // An entity means the text can no longer be taken straight from the
                    // source, so it is accumulated in _text from here on.
                    _text.Clear();

                    // Keep everything consumed before the entity. This also covers an
                    // entity that directly follows skipped leading whitespace, which
                    // used to be lost.
                    var prefixLength = entityStart - _chunk.ChunkOffset;
                    if (prefixLength > 0)
                    {
                        Array.Copy(_html, _chunk.ChunkOffset, _text._buffer, 0, prefixLength);
                        _text._bufPos = prefixLength;
                    }

                    _chunk.Entities = true;

                    if (entityChar == (byte)'<')
                    {
                        _chunk.LtEntity = true;
                    }

                    _text.Append(entityChar);

                    return ParseTextWithEntities();
                }
            }

            // stop at end of data or in front of the next tag (the '<' stays unread)
            if (_curPos >= _dataLength || _html[_curPos] == (byte)'<')
            {
                break;
            }

            c = _html[_curPos++];
        }
    }

    _chunk.ChunkLength = _curPos - _chunk.ChunkOffset;

    if (_chunk.ChunkLength == 0)
    {
        return null;
    }

    _chunk.Type = HTMLchunkType.Text;
    _chunk.Html = _originalHtml.Substring(_chunk.ChunkOffset, _chunk.ChunkLength);

    return _chunk;
}