/// <summary>
/// Parses the tag that begins at the current position (the point just after the
/// left angle bracket was found) and returns the resulting chunk.
/// </summary>
/// <returns>
/// The chunk produced by the tag parser, or the chunk produced by the script parser
/// when the tag is an opening "script" tag.
/// </returns>
internal HTMLchunk GetNextTag()
{
    oChunk = oTP.ParseTag(ref iCurPos);

    // Backwards compatibility: a closed tag that carries params is reported as an open tag.
    bool bReportClosedAsOpen = oChunk.iParams > 0
        && bAutoMarkClosedTagsWithParamsAsOpen
        && oChunk.oType == HTMLchunkType.CloseTag;

    if (bReportClosedAsOpen)
    {
        oChunk.oType = HTMLchunkType.OpenTag;
    }

    // Start of a script: length and first-char tests run before the full string comparison.
    bool bScriptStart = oChunk.sTag.Length == 6
        && oChunk.sTag[0] == 's'
        && oChunk.sTag == "script"
        && !oChunk.bClosure;

    if (bScriptStart)
    {
        oChunk.oType = HTMLchunkType.Script;
        oChunk = oTP.ParseScript(ref iCurPos);
        return oChunk;
    }

    // Regular tag: the chunk spans from its offset up to the current position.
    oChunk.iChunkLength = iCurPos - oChunk.iChunkOffset;

    if (bKeepRawHTML)
    {
        oChunk.oHTML = oEnc.GetString(bHTML, oChunk.iChunkOffset, oChunk.iChunkLength);
    }

    return oChunk;
}
/// <summary>
/// Releases the objects held by this instance. Only the first call does any work;
/// later calls return immediately.
/// </summary>
/// <param name="bDisposing">Not consulted by this implementation.</param>
private void Dispose(bool bDisposing)
{
    if (bDisposed)
    {
        return;
    }

    // Flag first so the cleanup below runs at most once.
    bDisposed = true;

    if (oChunk != null)
    {
        oChunk.Dispose();
        oChunk = null;
    }

    if (sText != null)
    {
        sText.Dispose();
        sText = null;
    }

    // The raw data buffer is simply dropped.
    bHTML = null;

    if (oE != null)
    {
        oE.Dispose();
        oE = null;
    }

    if (oTP != null)
    {
        oTP.Dispose();
        oTP = null;
    }
}
/// <summary>
/// Drops the references held by this instance. The referenced objects are only
/// released here, not disposed. Only the first call does any work.
/// </summary>
/// <param name="bDisposing">Not consulted by this implementation.</param>
private void Dispose(bool bDisposing)
{
    if (bDisposed)
    {
        return;
    }

    bDisposed = true;

    // Clear every reference handed over to this instance.
    bHTML = null;
    oChunk = null;
    sText = null;
    oE = null;
    oP = null;
}
/// <summary>
/// Initialises the tag parser by storing the collaborating objects and the data
/// buffer it will work on.
/// </summary>
/// <param name="p_oP">Parser instance, stored in oP.</param>
/// <param name="p_oChunk">Chunk object, stored in oChunk.</param>
/// <param name="p_sText">Text buffer, stored in sText.</param>
/// <param name="p_bHTML">Byte buffer with the data to parse, stored in bHTML.</param>
/// <param name="p_iDataLength">Length of the data, stored in iDataLength.</param>
/// <param name="p_oE">Entities object, stored in oE.</param>
/// <param name="p_oHE">Heuristics object, stored in oHE.</param>
internal void Init(HtmlParser p_oP, HTMLchunk p_oChunk, DynaString p_sText, byte[] p_bHTML, int p_iDataLength, HTMLentities p_oE, HTMLheuristics p_oHE)
{
    // Collaborating objects.
    oP = p_oP;
    oChunk = p_oChunk;
    sText = p_sText;
    oE = p_oE;
    oHE = p_oHE;

    // Data to parse.
    bHTML = p_bHTML;
    iDataLength = p_iDataLength;

    // Heuristics must not run too close to the end of the data, so their limit
    // is pulled back by the minimum data size they need.
    iMaxHeuDataLength = iDataLength - MIN_DATA_SIZE_FOR_HEURISTICS;
}
/// <summary>
/// Sets the oHTML variable of a chunk to the raw HTML that was parsed for that chunk.
/// The text is decoded from this parser's data buffer using the chunk's offset and length.
/// </summary>
/// <param name="oChunk">Chunk returned by ParseNext function, it must belong to the same HTMLparser that
/// was initiated with the same HTML data that this chunk belongs to</param>
/// <exception cref="System.ArgumentNullException">Thrown when <paramref name="oChunk"/> is null.</exception>
public void SetRawHTML(HTMLchunk oChunk)
{
    // Validate at the public boundary so a null chunk is reported by name
    // instead of surfacing as an anonymous NullReferenceException.
    if (oChunk == null)
    {
        throw new System.ArgumentNullException("oChunk");
    }

    oChunk.oHTML = oEnc.GetString(bHTML, oChunk.iChunkOffset, oChunk.iChunkLength);
}
/// <summary>
/// Parses HTML content chunk by chunk and fills <paramref name="lines"/> with the
/// resulting text lines. Open tags, close tags and text chunks are dispatched to
/// HandleTag, HandleClosure and HandleWords; parsing stops early once the
/// <c>finished</c> flag is set.
/// </summary>
/// <param name="content">HTML content to parse.</param>
/// <param name="lines">List that receives the result; it is cleared first.</param>
/// <returns>
/// Number of characters obtained by decoding, as UTF-8, the bytes the parser had
/// consumed when parsing stopped.
/// </returns>
public int Parse(string content, ArrayList lines)
{
    lines.Clear();
    result = lines;

    HtmlParser parser = new HtmlParser();
    parser.SetChunkHashMode(true);
    parser.Init(content);
    parser.SetEncoding(System.Text.Encoding.UTF8);

    HTMLchunk chunk = null;
    Start();

    while (!finished && (chunk = parser.ParseNext()) != null)
    {
        switch (chunk.oType)
        {
            case HTMLchunkType.OpenTag:
                HandleTag(chunk);
                break;

            case HTMLchunkType.CloseTag:
                HandleClosure(chunk.sTag);
                break;

            case HTMLchunkType.Text:
            {
                offset = chunk.iChunkOffset;
                string text = ClearExtraBlanks(chunk.oHTML);
                text = HTMLentities.DecodeEntities(text);
                HandleWords(text);
                break;
            }
        }
    }

    NewLine(true);

    if (!finished)
    {
        TextLine textLine = new TextLine();

        /* Raffaele Russo - 19/04/2011 - Start - Changed the argument of the TextSegment constructor: the fourth
         * parameter was changed from "context.Font.Height" to "context.Font.GetHeight(96)" */
        textLine.SetSegments(new ArrayList(line), context.Font.GetHeight(96));
        // Raffaele Russo - 19/04/2011 - End

        textLine.Alignment = context.Alignment;

        // Only keep the trailing line when it still fits inside the bounds.
        if (height + textLine.Height <= bounds.Height)
        {
            result.Add(textLine);
        }

        previousLine = null;
    }

    // The tag parser can push the position past the end of the buffer on truncated
    // markup, so clamp it: copying iCurPos bytes unclamped would throw in that case.
    int consumedBytes = Math.Min(parser.iCurPos, parser.bHTML.Length);

    // Count the decoded characters directly instead of copying the bytes and
    // building a throw-away string just to read its Length.
    return System.Text.Encoding.UTF8.GetCharCount(parser.bHTML, 0, consumedBytes);
}
/// <summary>
/// Reacts to an opening tag. "b", "i", "font" and "div" (only when it has an
/// "align" param) push the current context on the stack and switch to a new one;
/// "br" and "p" start a new line. Any other tag is ignored.
/// </summary>
/// <param name="tag">Chunk describing the opening tag.</param>
private void HandleTag(HTMLchunk tag)
{
    switch (tag.sTag)
    {
        case "b":
        {
            Font boldFont = new Font(context.Font.FontFamily, context.Font.Size, context.Font.Style | FontStyle.Bold);
            Context boldContext = new Context("b", boldFont, context.Color, context.Alignment);
            stack.Push(context);
            context = boldContext;
            break;
        }

        case "i":
        {
            Font italicFont = new Font(context.Font.FontFamily, context.Font.Size, context.Font.Style | FontStyle.Italic);
            Context italicContext = new Context("i", italicFont, context.Color, context.Alignment);
            stack.Push(context);
            context = italicContext;
            break;
        }

        case "div":
        {
            // A div without an align param changes nothing.
            object align = tag.oParams["align"];
            if (align == null)
            {
                break;
            }

            NewLine(false);

            // Any value other than center/right/justify falls back to left alignment.
            TextField.TextAlignmentType alignment;
            switch (align.ToString())
            {
                case "center":
                    alignment = TextField.TextAlignmentType.Center;
                    break;
                case "right":
                    alignment = TextField.TextAlignmentType.Right;
                    break;
                case "justify":
                    alignment = TextField.TextAlignmentType.Justified;
                    break;
                default:
                    alignment = TextField.TextAlignmentType.Left;
                    break;
            }

            Context divContext = new Context("div", context.Font, context.Color, alignment);
            stack.Push(context);
            context = divContext;
            break;
        }

        case "font":
        {
            // Start from the current context; each param overrides one aspect.
            float size = context.Font.Size;
            string family = context.Font.FontFamily.Name;
            FontStyle style = context.Font.Style;
            Color textColor = context.Color;

            // Each override is best-effort: a value that cannot be resolved is
            // ignored and the inherited setting stays in place.
            object sizeParam = tag.oParams["size"];
            if (sizeParam != null)
            {
                try
                {
                    size = ResolveFontSize(sizeParam.ToString().Trim());
                }
                catch (Exception)
                {
                }
            }

            object faceParam = tag.oParams["face"];
            if (faceParam != null)
            {
                try
                {
                    family = faceParam.ToString();
                }
                catch (Exception)
                {
                }
            }

            object colorParam = tag.oParams["color"];
            if (colorParam != null)
            {
                try
                {
                    // "ff" is prepended to the value after stripping any leading '#'.
                    textColor = ResolveColor("ff" + colorParam.ToString().TrimStart('#'));
                }
                catch (Exception)
                {
                }
            }

            Font customFont = new Font(family, size, style);
            Context fontContext = new Context("font", customFont, textColor, context.Alignment);
            stack.Push(context);
            context = fontContext;
            break;
        }

        case "br":
        case "p":
            NewLine(true);
            break;
    }
}
/// <summary>
/// Internal: parses tag that started from current position.
/// Works in stages: comment detection, optional heuristic matching of the tag name
/// (STAGE 0), full tag name parsing (STAGE 1), attribute name parsing (STAGE 2) and
/// attribute value parsing (STAGE 3). The stages are connected with gotos, so the
/// statement order matters.
/// </summary>
/// <param name="iCurPos">Current position in the data; expected to point at the char
/// that follows the opening &lt;. It is advanced as data is consumed.</param>
/// <returns>HTMLchunk with tag information</returns>
internal HTMLchunk ParseTag(ref int iCurPos)
{
    byte cChar = 0;
    byte cPeek = 0;

    // initialise peeked char - this will point to the next after < character
    if (iCurPos < iDataLength)
    {
        cPeek = bHTML[iCurPos];

        // in case of comments ! must follow immediately after <
        if (cPeek == (byte)'!')
        {
            if (iCurPos + 2 < iDataLength && bHTML[iCurPos + 1] == (byte)'-' && bHTML[iCurPos + 2] == (byte)'-')
            {
                // we detected start of comments here, instead of parsing the rest here we will
                // call special function tuned to do the job much more effectively
                oChunk.sTag = "!--";
                oChunk.oType = HTMLchunkType.Comment;
                oChunk.bComments = true;
                iCurPos += 3;

                bool bFullTag;
                oChunk = ParseComments(ref iCurPos, out bFullTag);

                oChunk.iChunkLength = iCurPos - oChunk.iChunkOffset;

                if (oP.bAutoKeepComments || oP.bKeepRawHTML)
                {
                    if (!oP.bAutoExtractBetweenTagsOnly)
                    {
                        oChunk.oHTML = GetString(oChunk.iChunkOffset, oChunk.iChunkLength);
                    }
                    else
                    {
                        // skip the 4 leading bytes and, for a full tag, 3 trailing bytes as well
                        oChunk.oHTML = GetString(oChunk.iChunkOffset + 4, oChunk.iChunkLength - (bFullTag ? 7 : 4));
                    }
                }

                return(oChunk);
            }
        }
    }
    else
    {
        // empty tag but its not closed, so we will call it open...
        oChunk.oType = HTMLchunkType.OpenTag;

        // end of data... before it started
        return(oChunk);
    }

    // tag ID, non-zero if matched by heuristics engine
    int iTagID = 0;

    // STAGE 0: lets try some heuristics to see if we can quickly identify most common tags
    // that should be present most of the time, this should save a lot of looping and string creation
    if (bEnableHeuristics && iCurPos < iMaxHeuDataLength)
    {
        // check if we have got closure of the tag
        if (cPeek == (byte)'/')
        {
            oChunk.bClosure = true;
            oChunk.bEndClosure = false;
            oChunk.oType = HTMLchunkType.CloseTag;
            iCurPos++;
            cPeek = bHTML[iCurPos];
        }

        cChar = bHTML[iCurPos + 1];

        // probability of having a match is very high (or so we expect)
        iTagID = oHE.MatchTag(cPeek, cChar);

        if (iTagID != 0)
        {
            if (iTagID < 0)
            {
                // a negative ID marks a single character tag
                iTagID *= -1;

                // single character tag
                oChunk.sTag = oHE.GetString(iTagID);

                // see if we got fully closed tag
                if (cChar == (byte)'>')
                {
                    iCurPos += 2;
                    goto ReturnChunk;
                }

                cPeek = cChar;
                iCurPos++;

                // everything else means we need to continue scanning as we may have params and stuff
                goto AttributeParsing;
            }
            else
            {
                // ok, we have here 2 or more character string that we need to check further
                // often when we have full 2 char match the next char will be >, if that's the case
                // then we definitely matched our tag
                byte cNextChar = bHTML[iCurPos + 2];

                if (cNextChar == (byte)'>')
                {
                    //oChunk.sTag=oHE.GetString(iTagID);
                    oChunk.sTag = oHE.GetTwoCharString(cPeek, cChar);
                    iCurPos += 3;
                    goto ReturnChunk;
                }

                // ok, check next char for space, if that's the case we still got our tag
                // but need to skip to attribute parsing
                if (cNextChar == (byte)' ')
                {
                    //oChunk.sTag=oHE.GetString(iTagID);
                    oChunk.sTag = oHE.GetTwoCharString(cPeek, cChar);
                    iCurPos += 2;
                    cPeek = cNextChar;
                    goto AttributeParsing;
                }

                // ok, we are not very lucky, but it is still worth fighting for
                // now we need to check fully long string against what we have matched, maybe
                // we got exact match and we can avoid full parsing of the tag
                byte[] bTag = oHE.GetStringData(iTagID);

                if (iCurPos + bTag.Length + 5 >= iDataLength)
                {
                    goto TagParsing;
                }

                // in a loop (and this is not an ideal solution, but still)
                for (int i = 2; i < bTag.Length; i++)
                {
                    // if a single char is not matched, then we fall back to full tag parsing
                    if (bTag[i] != bHTML[iCurPos + i])
                    {
                        goto TagParsing;
                    }
                }

                // ok we matched full long word, but we need to be sure that char
                // after the word is ' ' or '>' as otherwise we may have matched prefix of even longer
                // word
                cNextChar = bHTML[iCurPos + bTag.Length];

                if (cNextChar == (byte)'>')
                {
                    oChunk.sTag = oHE.GetString(iTagID);
                    iCurPos += bTag.Length + 1;
                    goto ReturnChunk;
                }

                if (cNextChar == (byte)' ')
                {
                    cPeek = cNextChar;
                    oChunk.sTag = oHE.GetString(iTagID);
                    iCurPos += bTag.Length;
                    goto AttributeParsing;
                }

                // no luck: we need to parse tag fully as our heuristical matching failed miserably :'o(
            }
        }
    }

TagParsing:

    sText.Clear();

    byte bCharType = 0;

    // STAGE 1: parse tag (anything until > or /> or whitespace leading to start of attribute)
    while (cPeek != 0)
    {
        bCharType = bTagCharTypes[cPeek];

        //if(cPeek<=32 && bWhiteSpace[cPeek]==1)
        if (bCharType == (byte)TagCharType.WhiteSpace)
        {
            iCurPos++;

            // speculative loop unroll -- we have a very good chance of seeing non-space char next
            // so instead of setting up loop we will just read it directly, this should save ticks
            // on having to prepare while() loop
            if (iCurPos < iDataLength)
            {
                cChar = bHTML[iCurPos++];
            }
            else
            {
                cChar = 0;
            }

            bCharType = bTagCharTypes[cChar];

            //if(cChar==' ' || cChar=='\t' || cChar==13 || cChar==10)
            //if(cChar<=32 && bWhiteSpace[cChar]==1)
            if (bCharType == (byte)TagCharType.WhiteSpace)
            {
                while (iCurPos < iDataLength)
                {
                    cChar = bHTML[iCurPos++];

                    bCharType = bTagCharTypes[cChar];
                    //if(cChar!=' ' && cChar!='\t' && cChar!=13 && cChar!=10)
                    if (bCharType == (byte)TagCharType.WhiteSpace)
                    {
                        //cPeek=bHTML[iCurPos];
                        continue;
                    }

                    break;
                }

                if (iCurPos >= iDataLength)
                {
                    cChar = 0;
                }
            }

            //bWhiteSpaceHere=true;

            // now, if we have already got tag it means that we are most likely
            // going to need to parse tag attributes
            if (sText.iBufPos > 0)
            {
                oChunk.sTag = sText.SetToStringASCII();

                // oChunk.Append((byte)' ');

                // step back so that the char just read becomes the peeked char again
                iCurPos--;

                if (iCurPos < iDataLength)
                {
                    cPeek = bHTML[iCurPos];
                }
                else
                {
                    cPeek = 0;
                }

                break;
            }
        }
        else
        {
            // reuse Peeked char from previous run
            //cChar=cPeek; iCurPos++;
            if (iCurPos < iDataLength)
            {
                cChar = bHTML[iCurPos++];
            }
            else
            {
                cChar = 0;
            }
        }

        if (iCurPos < iDataLength)
        {
            cPeek = bHTML[iCurPos];
        }
        else
        {
            cPeek = 0;
        }

        // most likely we should have lower-cased ASCII char
        if (bCharType == (byte)TagCharType.LowerCasedASCIIorDigit)
        {
            sText.bBuffer[sText.iBufPos++] = cChar;
            // oChunk.Append(cChar);
            continue;
        }

        // tag end - we did not have any params
        if (cChar == (byte)'>')
        {
            if (sText.iBufPos > 0)
            {
                oChunk.sTag = sText.SetToStringASCII();
            }

            if (!oChunk.bClosure)
            {
                oChunk.oType = HTMLchunkType.OpenTag;
            }

            return(oChunk);
        }

        // closure of tag sign
        if (cChar == (byte)'/')
        {
            oChunk.bClosure = true;
            // the slash counts as an end closure only when a tag name was already collected
            oChunk.bEndClosure = (sText.iBufPos > 0);
            oChunk.oType = HTMLchunkType.CloseTag;
            continue;
        }

        // nope, we have got upper cased ASCII char - this seems to be LESS likely than > and /
        //if(cChar>=65 && cChar<=90)
        if (bCharType > 32)
        {
            // bCharType in this case contains already lower-cased char
            sText.bBuffer[sText.iBufPos++] = bCharType;
            // oChunk.Append(bCharType);
            continue;
        }

        // ok, we have got some other char - we break out to deal with it in attributes part
        break;
    }

    if (cPeek == 0)
    {
        return(oChunk);
    }

    // if true then equal sign was found
    //bool bEqualsSign=false;

    // STAGE 2: parse attributes (if any available)
    // attribute name can be standalone or with value after =
    // attribute itself can't have entities or anything like this - we expect it to be in ASCII characters

AttributeParsing:

    string sAttrName;

    if (iTagID != 0)
    {
        // first, skip whitespace:
        if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
        {
            // most likely next char is not-whitespace
            iCurPos++;

            if (iCurPos >= iDataLength)
            {
                goto ReturnChunk;
            }

            cPeek = bHTML[iCurPos];

            if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
            {
                // ok long loop here then
                while (iCurPos < iDataLength)
                {
                    cPeek = bHTML[iCurPos++];

                    if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                    {
                        continue;
                    }

                    break;
                }

                if (cPeek == (byte)'>')
                {
                    goto ReturnChunk;
                }

                iCurPos--;

                if (iCurPos >= iDataLength)
                {
                    goto ReturnChunk;
                }
            }

            if (iCurPos >= iDataLength)
            {
                goto ReturnChunk;
            }
        }

        // ok we have got matched tag, it is possible that we might be able to quickly match
        // attribute name known to be used for that tag:
        int iAttrID = oHE.MatchAttr(cPeek, iTagID);

        if (iAttrID > 0)
        {
            byte[] bAttr = oHE.GetAttrData(iAttrID);

            if (iCurPos + bAttr.Length + 2 >= iDataLength)
            {
                goto ActualAttributeParsing;
            }

            // in a loop (and this is not an ideal solution, but still)
            for (int i = 1; i < bAttr.Length; i++)
            {
                // if a single char is not matched, then we fall back to regular attribute parsing
                if (bAttr[i] != bHTML[iCurPos + i])
                {
                    goto ActualAttributeParsing;
                }
            }

            byte cNextChar = bHTML[iCurPos + bAttr.Length];

            // ok, we expect next symbol to be =
            if (cNextChar == (byte)'=')
            {
                sAttrName = oHE.GetAttr(iAttrID);
                iCurPos += bAttr.Length + 1;
                cPeek = bHTML[iCurPos];

                goto AttributeValueParsing;
            }
        }
    }

ActualAttributeParsing:

    sText.Clear();

    // doing exactly the same thing as in tag parsing
    while (cPeek != 0)
    {
        bCharType = bTagCharTypes[cPeek];

        //if(cPeek<=32 && bWhiteSpace[cPeek]==1)
        if (bCharType == (byte)TagCharType.WhiteSpace)
        {
            iCurPos++;

            // speculative loop unroll -- we have a very good chance of seeing non-space char next
            // so instead of setting up loop we will just read it directly, this should save ticks
            // on having to prepare while() loop
            if (iCurPos < iDataLength)
            {
                cChar = bHTML[iCurPos++];
            }
            else
            {
                cPeek = 0;
                break;
            }

            bCharType = bTagCharTypes[cChar];

            //if(cChar==' ' || cChar=='\t' || cChar==13 || cChar==10)
            //if(cChar<=32 && bWhiteSpace[cChar]==1)
            if (bCharType == (byte)TagCharType.WhiteSpace)
            {
                while (iCurPos < iDataLength)
                {
                    cChar = bHTML[iCurPos++];

                    bCharType = bTagCharTypes[cChar];
                    //if(cChar!=' ' && cChar!='\t' && cChar!=13 && cChar!=10)
                    if (bCharType == (byte)TagCharType.WhiteSpace)
                    {
                        //cPeek=bHTML[iCurPos];
                        continue;
                    }

                    //if(cChar==(byte)'>')
                    //	goto ReturnChunk;

                    //iCurPos--;
                    break;
                }

                if (iCurPos >= iDataLength)
                {
                    cChar = 0;
                    cPeek = 0;
                    break;
                }
            }

            //bWhiteSpaceHere=true;

            // now, if we have already got attribute name it means that we need to go to parse value (which may not be present)
            if (sText.iBufPos > 0)
            {
                // oChunk.Append((byte)' ');

                // step back so that the char just read becomes the peeked char again
                iCurPos--;

                if (iCurPos < iDataLength)
                {
                    cPeek = bHTML[iCurPos];
                }
                else
                {
                    cPeek = 0;
                }

                // ok, we have got attribute name and now we have got next char there

                // most likely we have got = here and then value
                if (cPeek == (byte)'=')
                {
                    //bEqualsSign=true;

                    // move forward one char
                    iCurPos++;

                    if (iCurPos < iDataLength)
                    {
                        cPeek = bHTML[iCurPos];
                    }
                    else
                    {
                        cPeek = 0;
                    }

                    break;
                }

                // or we can have end of tag itself, doh!
                if (cPeek == (byte)'>')
                {
                    // move forward one char
                    iCurPos++;

                    if (sText.iBufPos > 0)
                    {
                        oChunk.AddParam(sText.SetToStringASCII(), "", (byte)' ');
                    }

                    if (!oChunk.bClosure)
                    {
                        oChunk.oType = HTMLchunkType.OpenTag;
                    }

                    return(oChunk);
                }

                // closure
                if (cPeek == (byte)'/')
                {
                    oChunk.bClosure = true;
                    oChunk.bEndClosure = true;
                    oChunk.oType = HTMLchunkType.CloseTag;
                    continue;
                }

                // ok, we have got new char starting after current attribute name is fully parsed
                // this means the attribute name is on its own and the char we found is start
                // of a new attribute
                oChunk.AddParam(sText.SetToStringASCII(), "", (byte)' ');
                sText.Clear();
                goto AttributeParsing;
            }
        }
        else
        {
            // reuse Peeked char from previous run
            //cChar=cPeek; iCurPos++;
            if (iCurPos < iDataLength)
            {
                cChar = bHTML[iCurPos++];
            }
            else
            {
                cChar = 0;
            }
        }

        if (iCurPos < iDataLength)
        {
            cPeek = bHTML[iCurPos];
        }
        else
        {
            cPeek = 0;
        }

        // most likely we should have lower-cased ASCII char here
        if (bCharType == (byte)TagCharType.LowerCasedASCIIorDigit)
        {
            sText.bBuffer[sText.iBufPos++] = cChar;
            // oChunk.Append(cChar);
            continue;
        }

        // = with attribute value to follow
        if (cChar == (byte)'=')
        {
            //bEqualsSign=true;
            break;
        }

        // nope, we have got upper cased ASCII char - this seems to be LESS likely than > and /
        //if(cChar>=65 && cChar<=90)
        if (bCharType > 32)
        {
            // bCharType in this case contains already lower-cased char
            sText.bBuffer[sText.iBufPos++] = bCharType;
            // oChunk.Append(bCharType);
            continue;
        }

        // tag end - we did not have any params
        if (cChar == (byte)'>')
        {
            if (sText.iBufPos > 0)
            {
                oChunk.AddParam(sText.SetToStringASCII(), "", (byte)' ');
            }

            if (!oChunk.bClosure)
            {
                oChunk.oType = HTMLchunkType.OpenTag;
            }

            return(oChunk);
        }

        // closure of tag sign
        if (cChar == (byte)'/')
        {
            oChunk.bClosure = true;
            oChunk.bEndClosure = true;
            oChunk.oType = HTMLchunkType.CloseTag;
            continue;
        }

        // some other char
        sText.bBuffer[sText.iBufPos++] = cChar;
        // oChunk.Append(cChar);
    }

    if (cPeek == 0)
    {
        if (sText.iBufPos > 0)
        {
            oChunk.AddParam(sText.SetToStringASCII(), "", (byte)' ');
        }

        if (!oChunk.bClosure)
        {
            oChunk.oType = HTMLchunkType.OpenTag;
        }

        return(oChunk);
    }

    sAttrName = sText.SetToStringASCII();

AttributeValueParsing:

    // ***********************************************************************
    // STAGE 3: parse attribute value
    // ***********************************************************************

    // the value could be just string, or in quotes (single or double)
    // or we can have next attribute name start, in which case we will jump back to attribute parsing

    // for tracking quotes purposes
    byte cQuotes = cPeek;

    int iValueStartOffset;

    // skip whitespace if any
    if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
    {
        iCurPos++;

        // speculative loop unroll -- we have a very good chance of seeing non-space char next
        // so instead of setting up loop we will just read it directly, this should save ticks
        // on having to prepare while() loop
        if (iCurPos < iDataLength)
        {
            cPeek = bHTML[iCurPos];
        }
        else
        {
            iValueStartOffset = iCurPos - 1;
            goto AttributeValueEnd;
        }

        //if(cChar==' ' || cChar=='\t' || cChar==13 || cChar==10)
        //if(cChar<=32 && bWhiteSpace[cChar]==1)
        if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
        {
            while (iCurPos < iDataLength)
            {
                cPeek = bHTML[iCurPos++];

                //if(cChar!=' ' && cChar!='\t' && cChar!=13 && cChar!=10)
                if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                {
                    //cPeek=bHTML[iCurPos];
                    continue;
                }

                iCurPos--;
                break;
            }

            if (iCurPos >= iDataLength)
            {
                iValueStartOffset = iCurPos - 1;
                goto AttributeValueEnd;
            }
        }

        cQuotes = cPeek;
    }

    // because we deal with VALUE of the attribute it means we can't lower-case it,
    // or skip whitespace (if in quotes), which in practice means that we don't need to copy
    // it to temporary string buffer, we can just remember starting offset and then create string from
    // data in bHTML

    // ok, first char can be one of the quote chars or something else
    if (cPeek != '\"' && cPeek != '\'')
    {
        iValueStartOffset = iCurPos;

        cQuotes = (byte)' ';

        // any other char here means we have value up until next whitespace or end of tag
        // this gives us good opportunity to scan fairly quickly without otherwise redundant
        // checks - this should happen fairly rarely, however loop dealing with data between quotes
        // will happen often enough and its best to eliminate as much stuff from it as possible
        //sText.bBuffer[sText.iBufPos++]=cPeek;

        // move to next char
        if (iCurPos < iDataLength)
        {
            cPeek = bHTML[iCurPos++];
        }
        else
        {
            goto AttributeValueEnd;
        }

        while (cPeek != 0)
        {
            // if whitespace then we got our value and need to go back to param
            if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
            {
                oChunk.AddParam(sAttrName, GetString(iValueStartOffset, iCurPos - iValueStartOffset - 1), (byte)' ');
                iCurPos--;
                goto AttributeParsing;
            }

            // end of tag?
            if (cPeek == (byte)'>')
            {
                //iCurPos--;
                break;
            }

            if (iCurPos < iDataLength)
            {
                cPeek = bHTML[iCurPos++];
            }
            else
            {
                // one past the end of data, so that the length computed at
                // AttributeValueEnd (iCurPos - start - 1) includes the last byte
                iCurPos = iDataLength + 1;
                goto AttributeValueEnd;
            }
        }

        // ok we are done, add outstanding attribute
        oChunk.AddParam(sAttrName, GetString(iValueStartOffset, iCurPos - iValueStartOffset - 1), (byte)' ');

        goto ReturnChunk;
    }

    // move one step forward
    iCurPos++;

    iValueStartOffset = iCurPos;

    if (iCurPos < iDataLength)
    {
        cPeek = bHTML[iCurPos++];
    }
    else
    {
        goto AttributeValueEnd;
    }

    // attribute value parsing from between two quotes
    while (cPeek != 0)
    {
        // check whether we have got possible entity (can be anything starting with &)
        if (cPeek == 38)
        {
            int iPrevPos = iCurPos;

            char cEntityChar = oE.CheckForEntity(bHTML, ref iCurPos, iDataLength);

            // restore current symbol
            if (cEntityChar == 0)
            {
                if (iCurPos < iDataLength)
                {
                    cPeek = bHTML[iCurPos++];
                }
                else
                {
                    break;
                }

                //sText.bBuffer[sText.iBufPos++]=38; //(byte)'&';;
                continue;
            }
            else
            {
                // okay we have got an entity, our hope of not having to copy stuff into variable
                // is over, we have to continue in a slower fashion :(
                // but thankfully this should happen very rarely, so, annoying to code, but
                // most codepaths will run very fast!
                int iPreEntLen = iPrevPos - iValueStartOffset - 1;

                // 14/05/08 need to clear text - it contains attribute name text
                sText.Clear();

                // copy previous data
                if (iPreEntLen > 0)
                {
                    Array.Copy(bHTML, iValueStartOffset, sText.bBuffer, 0, iPreEntLen);
                    sText.iBufPos = iPreEntLen;
                }

                // we have to skip now to next byte, since
                // some converted chars might well be control chars like >
                oChunk.bEntities = true;

                // FIX: test the decoded entity, not cChar. cChar is only a leftover byte
                // from tag/attribute-name scanning, so the flag was not set for a first
                // entity decoding to '<'. The follow-up loop below already tests the
                // decoded char.
                if (cEntityChar == (byte)'<')
                {
                    oChunk.bLtEntity = true;
                }

                // unless is space we will ignore it
                // note that this won't work if the non-breaking space entity is defined as it should
                // byte int value of 160, rather than 32.
                //if(cChar!=' ')
                sText.Append(cEntityChar);

                if (iCurPos < iDataLength)
                {
                    cPeek = bHTML[iCurPos++];
                }
                else
                {
                    goto AttributeValueEnd;
                }

                // okay, we continue here using in effect new inside loop as we might have more entities here
                // attribute value parsing from between two quotes
                while (cPeek != 0)
                {
                    // check whether we have got possible entity (can be anything starting with &)
                    if (cPeek == 38)
                    {
                        char cNewEntityChar = oE.CheckForEntity(bHTML, ref iCurPos, iDataLength);

                        // restore current symbol
                        if (cNewEntityChar != 0)
                        {
                            if (cNewEntityChar == (byte)'<')
                            {
                                oChunk.bLtEntity = true;
                            }

                            sText.Append(cNewEntityChar);

                            if (iCurPos < iDataLength)
                            {
                                cPeek = bHTML[iCurPos++];
                            }
                            else
                            {
                                goto AttributeValueEnd;
                            }

                            continue;
                        }
                    }

                    // check if is end of quotes
                    if (cPeek == cQuotes)
                    {
                        // ok we finished scanning it: add param with value and then go back to param name parsing
                        oChunk.AddParam(sAttrName, sText.SetToString(), cQuotes);

                        if (iCurPos < iDataLength)
                        {
                            cPeek = bHTML[iCurPos];
                        }
                        else
                        {
                            break;
                        }

                        goto AttributeParsing;
                    }

                    sText.bBuffer[sText.iBufPos++] = cPeek;
                    //sText.Append(cPeek);

                    if (iCurPos < iDataLength)
                    {
                        cPeek = bHTML[iCurPos++];
                    }
                    else
                    {
                        break;
                    }
                }

                oChunk.AddParam(sAttrName, sText.SetToString(), cQuotes);
                goto ReturnChunk;
            }
        }

        // check if is end of quotes
        if (cPeek == cQuotes)
        {
            // ok we finished scanning it: add param with value and then go back to param name parsing
            //sText.Clear();

            oChunk.AddParam(sAttrName, GetString(iValueStartOffset, iCurPos - iValueStartOffset - 1), cQuotes);

            if (iCurPos < iDataLength)
            {
                cPeek = bHTML[iCurPos];
            }
            else
            {
                //iCurPos++;
                break;
            }

            goto AttributeParsing;
        }

        if (iCurPos < iDataLength)
        {
            cPeek = bHTML[iCurPos++];
        }
        else
        {
            //iCurPos++;
            break;
        }
    }

AttributeValueEnd:

    // ok we are done, add outstanding attribute
    int iLen = iCurPos - iValueStartOffset - 1;

    if (iLen > 0)
    {
        oChunk.AddParam(sAttrName, GetString(iValueStartOffset, iLen), cQuotes);
    }
    else
    {
        oChunk.AddParam(sAttrName, "", cQuotes);
    }

ReturnChunk:

    if (oChunk.bClosure)
    {
        oChunk.oType = HTMLchunkType.CloseTag;
    }
    else
    {
        oChunk.oType = HTMLchunkType.OpenTag;
    }

    return(oChunk);
}