/// <summary>
/// Prepares the tag parser for a new block of HTML data by storing the
/// collaborating objects and the data buffer it will work on
/// </summary>
/// <param name="p_oP">HTML parser that owns this tag parser</param>
/// <param name="p_oChunk">Chunk object that parsed tag information is written to</param>
/// <param name="p_sText">Scratch text buffer used while collecting names and values</param>
/// <param name="p_bHTML">Byte array holding the HTML data to parse</param>
/// <param name="p_iDataLength">Number of bytes of <paramref name="p_bHTML"/> that are valid data</param>
/// <param name="p_oE">Entities helper</param>
/// <param name="p_oHE">Heuristics helper</param>
internal void Init(HTMLparser p_oP,HTMLchunk p_oChunk,DynaString p_sText,byte[] p_bHTML,int p_iDataLength,HTMLentities p_oE,HTMLheuristics p_oHE)
{
    // collaborating objects
    oP=p_oP;
    oChunk=p_oChunk;
    sText=p_sText;

    // data buffer and its usable length
    bHTML=p_bHTML;
    iDataLength=p_iDataLength;

    // heuristics must stop a safety margin before the end of data,
    // so remember the last position at which they may still be tried
    iMaxHeuDataLength=iDataLength-MIN_DATA_SIZE_FOR_HEURISTICS;

    // helper engines
    oE=p_oE;
    oHE=p_oHE;
}
/// <summary>
/// Internal: parses the tag that starts at the current position. The caller has
/// already consumed the opening angle bracket, so <paramref name="iCurPos"/> points
/// at the first byte after it.
///
/// The work is done in stages that are connected with goto labels:
/// comments / CDATA detection, STAGE 0 heuristic tag matching, STAGE 1 tag name
/// parsing (TagParsing), STAGE 2 attribute name parsing (AttributeParsing /
/// ActualAttributeParsing) and STAGE 3 attribute value parsing
/// (AttributeValueParsing / AttributeValueEnd). ReturnChunk finalises the chunk type.
/// </summary>
/// <param name="iCurPos">Current position in the HTML data; advanced as bytes are consumed</param>
/// <returns>HTMLchunk with tag information</returns>
internal HTMLchunk ParseTag(ref int iCurPos)
{
    /*
     *  WARNING: this code was optimised for performance rather than for readability,
     *  so be extremely careful at changing it -- your changes could easily result in wrongly parsed HTML
     *
     *  This routine takes about 60% of CPU time, in theory its the best place to gain extra speed,
     *  but I've spent plenty of time doing it, so it won't be easy... and if it is easy then please post
     *  your changes for everyone to enjoy!
     */

    // cChar is the byte most recently consumed, cPeek is the byte at (or about to be read from) iCurPos;
    // cPeek==0 is used throughout as the "no more data" marker
    byte cChar=0;
    byte cPeek=0;

    // initialise peeked char - this will point to the next after < character
    if(iCurPos<iDataLength)
    {
        cPeek=bHTML[iCurPos];

        // in case of comments ! must follow immediately after <
        if(cPeek==(byte)'!')
        {
            if(iCurPos+2<iDataLength && bHTML[iCurPos+1]==(byte)'-' && bHTML[iCurPos+2]==(byte)'-')
            {
                // we detected start of comments here, instead of parsing the rest here we will
                // call special function tuned to do the job much more effectively
                oChunk.sTag="!--";
                oChunk.oType=HTMLchunkType.Comment;
                oChunk.bComments=true;
                iCurPos+=3;

                bool bFullTag;
                oChunk=ParseComments(ref iCurPos,out bFullTag);
                oChunk.iChunkLength=iCurPos-oChunk.iChunkOffset;

                if(oP.bAutoKeepComments || oP.bKeepRawHTML)
                {
                    if(!oP.bAutoExtractBetweenTagsOnly)
                        oChunk.oHTML=GetString(oChunk.iChunkOffset,oChunk.iChunkLength);
                    else
                    {
                        // keep only the text between the markers: skip 4 leading bytes and drop
                        // 7 bytes in total when the comment was properly terminated (bFullTag), else 4
                        oChunk.oHTML=GetString(oChunk.iChunkOffset+4,oChunk.iChunkLength-(bFullTag ? 7 : 4));
                    }
                }

                return oChunk;
            }

            // ok we might have here CDATA element of XML:
            // ref: http://www.w3schools.com/xml/xml_cdata.asp
            if(iCurPos+7<iDataLength &&
                bHTML[iCurPos+1]==(byte)'[' &&
                bHTML[iCurPos+2]==(byte)'C' &&
                bHTML[iCurPos+3]==(byte)'D' &&
                bHTML[iCurPos+4]==(byte)'A' &&
                bHTML[iCurPos+5]==(byte)'T' &&
                bHTML[iCurPos+6]==(byte)'A' &&
                bHTML[iCurPos+7]==(byte)'['
                )
            {
                // we detected start of CDATA here, it is reported as a comment chunk and, like comments,
                // handed to a special function tuned to do the job much more effectively
                oChunk.sTag="![CDATA[";
                oChunk.oType=HTMLchunkType.Comment;
                oChunk.bComments=true;
                iCurPos+=8;

                bool bFullTag;
                oChunk=ParseCDATA(ref iCurPos,out bFullTag);
                oChunk.iChunkLength=iCurPos-oChunk.iChunkOffset;

                if(oP.bAutoKeepComments || oP.bKeepRawHTML)
                {
                    if(!oP.bAutoExtractBetweenTagsOnly)
                        oChunk.oHTML=GetString(oChunk.iChunkOffset,oChunk.iChunkLength);
                    else
                    {
                        // same trimming as for comments, with 5 extra bytes for the longer CDATA opener
                        oChunk.oHTML=GetString(oChunk.iChunkOffset+4+5, oChunk.iChunkLength-(bFullTag ? 7+5 : 4+5));
                    }
                }

                return oChunk;
            }
        }
    }
    else
    {
        // empty tag but its not closed, so we will call it open...
        oChunk.oType=HTMLchunkType.OpenTag;
        // end of data... before it started
        return oChunk;
    }

    // tag ID, non-zero if matched by heuristics engine
    int iTagID=0;

    // STAGE 0: lets try some heuristics to see if we can quickly identify most common tags
    // that should be present most of the time, this should save a lot of looping and string creation
    // (iMaxHeuDataLength keeps us away from the end of data, so the look-aheads below are not bounds checked)
    if(bEnableHeuristics && iCurPos<iMaxHeuDataLength)
    {
        // check if we have got closure of the tag
        if(cPeek==(byte)'/')
        {
            oChunk.bClosure=true;
            oChunk.bEndClosure=false;
            oChunk.oType=HTMLchunkType.CloseTag;
            iCurPos++;
            cPeek=bHTML[iCurPos];
        }

        cChar=bHTML[iCurPos+1];

        // probability of having a match is very high (or so we expect)
        iTagID=oHE.MatchTag(cPeek,cChar);

        if(iTagID!=0)
        {
            if(iTagID<0)
            {
                // negative ID marks a single character tag
                iTagID*=-1;
                oChunk.sTag=oHE.GetString(iTagID);

                // see if we got fully closed tag
                if(cChar==(byte)'>')
                {
                    iCurPos+=2;
                    goto ReturnChunk;
                }

                cPeek=cChar;
                iCurPos++;

                // everything else means we need to continue scanning as we may have params and stuff
                goto AttributeParsing;
            }
            else
            {
                // ok, we have here 2 or more character string that we need to check further
                // often when we have full 2 char match the next char will be >, if that's the case
                // then we definitely matched our tag
                byte cNextChar=bHTML[iCurPos+2];

                if(cNextChar==(byte)'>')
                {
                    oChunk.sTag=oHE.GetTwoCharString(cPeek,cChar);
                    iCurPos+=3;
                    goto ReturnChunk;
                }

                // ok, check next char for space, if that's the case we still got our tag
                // but need to skip to attribute parsing
                if(cNextChar==(byte)' ')
                {
                    oChunk.sTag=oHE.GetTwoCharString(cPeek,cChar);
                    iCurPos+=2;
                    cPeek=cNextChar;
                    goto AttributeParsing;
                }

                // ok, we are not very lucky, but it is still worth fighting for
                // now we need to check fully long string against what we have matched, maybe
                // we got exact match and we can avoid full parsing of the tag
                byte[] bTag=oHE.GetStringData(iTagID);

                if(iCurPos+bTag.Length+5>=iDataLength)
                    goto TagParsing;

                // first two bytes were matched by MatchTag already, compare the remainder
                for(int i=2; i<bTag.Length; i++)
                {
                    // a single mismatching char means heuristics failed and we parse the slow way
                    if(bTag[i]!=bHTML[iCurPos+i])
                    {
                        goto TagParsing;
                    }
                }

                // ok we matched full long word, but we need to be sure that char
                // after the word is ' ' or '>' as otherwise we may have matched prefix of even longer word
                cNextChar=bHTML[iCurPos+bTag.Length];

                if(cNextChar==(byte)'>')
                {
                    oChunk.sTag=oHE.GetString(iTagID);
                    iCurPos+=bTag.Length+1;
                    goto ReturnChunk;
                }

                if(cNextChar==(byte)' ')
                {
                    cPeek=cNextChar;
                    oChunk.sTag=oHE.GetString(iTagID);
                    iCurPos+=bTag.Length;
                    goto AttributeParsing;
                }

                // no luck: we need to parse tag fully as our heuristical matching failed miserably :'o(
            }
        }
    }

TagParsing:

    sText.Clear();

    byte bCharType=0;

    // STAGE 1: parse tag (anything until > or /> or whitespace leading to start of attribute)
    while(cPeek!=0)
    {
        bCharType=bTagCharTypes[cPeek];

        if(bCharType==(byte)TagCharType.WhiteSpace)
        {
            // step over the whitespace byte that cPeek refers to
            iCurPos++;

            // speculative loop unroll -- we have a very good chance of seeing non-space char next
            // so instead of setting up loop we will just read it directly, this should save ticks
            // on having to prepare while() loop
            if(iCurPos<iDataLength)
                cChar=bHTML[iCurPos++];
            else
                cChar=0;

            bCharType=bTagCharTypes[cChar];

            if(bCharType==(byte)TagCharType.WhiteSpace)
            {
                // long run of whitespace: consume until the first non-whitespace byte
                while(iCurPos<iDataLength)
                {
                    cChar=bHTML[iCurPos++];
                    bCharType=bTagCharTypes[cChar];
                    if(bCharType==(byte)TagCharType.WhiteSpace)
                    {
                        continue;
                    }
                    break;
                }

                if(iCurPos>=iDataLength)
                    cChar=0;
            }

            // now, if we have already got tag it means that we are most likely
            // going to need to parse tag attributes
            if(sText.iBufPos>0)
            {
                oChunk.sTag=sText.SetToStringASCII();

                // un-read the non-whitespace byte so that attribute parsing starts on it
                iCurPos--;

                if(iCurPos<iDataLength)
                    cPeek=bHTML[iCurPos];
                else
                    cPeek=0;

                break;
            }
        }
        else
        {
            // consume the byte that was peeked on the previous run
            if(iCurPos<iDataLength)
                cChar=bHTML[iCurPos++];
            else
                cChar=0;
        }

        if(iCurPos<iDataLength)
            cPeek=bHTML[iCurPos];
        else
            cPeek=0;

        // most likely we should have lower-cased ASCII char
        if(bCharType==(byte)TagCharType.LowerCasedASCIIorDigit)
        {
            sText.bBuffer[sText.iBufPos++]=cChar;
            continue;
        }

        // tag end - we did not have any params
        if(cChar==(byte)'>')
        {
            if(sText.iBufPos>0)
                oChunk.sTag=sText.SetToStringASCII();

            if(!oChunk.bClosure)
                oChunk.oType=HTMLchunkType.OpenTag;

            return oChunk;
        }

        // closure of tag sign; it is an "end closure" when the tag name was already seen before the slash
        if(cChar==(byte)'/')
        {
            oChunk.bClosure=true;
            oChunk.bEndClosure=(sText.iBufPos>0);
            oChunk.oType=HTMLchunkType.CloseTag;
            continue;
        }

        // 03/08/08 XML support: ?xml tags - grrr
        if(cChar==(byte)'?')
        {
            sText.bBuffer[sText.iBufPos++]=cChar;
            continue;
        }

        // nope, we have got upper cased ASCII char - this seems to be LESS likely than > and /
        if(bCharType>32)
        {
            // bCharType in this case contains already lower-cased char
            sText.bBuffer[sText.iBufPos++]=bCharType;
            continue;
        }

        // we might have namespace : sign here - all text before would have to be
        // saved as namespace and we will need to continue parsing actual tag
        if(bCharType==(byte)TagCharType.NameSpaceColon)
        {
            // ok here we got a choice - we can just continue and treat the whole
            // thing as a single tag with namespace stuff prefixed, OR
            // we can separate first part into namespace and keep tag as normal;
            // the first option is what is done here
            sText.bBuffer[sText.iBufPos++]=(byte)':';
            continue;
        }

        // ok, we have got some other char - we break out to deal with it in attributes part
        break;
    }

    // ran out of data while still in the tag name
    if(cPeek==0)
    {
        return oChunk;
    }

    // STAGE 2: parse attributes (if any available)
    // attribute name can be standalone or with value after =
    // attribute itself can't have entities or anything like this - we expect it to be in ASCII characters

AttributeParsing:

    string sAttrName;

    // fast path: when the tag was identified by heuristics we may also be able to
    // identify the attribute name without copying it byte by byte
    if(iTagID!=0)
    {
        // first, skip whitespace:
        if(cPeek<=32 && bTagCharTypes[cPeek]==(byte)TagCharType.WhiteSpace)
        {
            // most likely next char is not-whitespace
            iCurPos++;

            if(iCurPos>=iDataLength)
                goto ReturnChunk;

            cPeek=bHTML[iCurPos];

            if(cPeek<=32 && bTagCharTypes[cPeek]==(byte)TagCharType.WhiteSpace)
            {
                // ok long loop here then
                while(iCurPos<iDataLength)
                {
                    cPeek=bHTML[iCurPos++];

                    if(cPeek<=32 && bTagCharTypes[cPeek]==(byte)TagCharType.WhiteSpace)
                        continue;

                    break;
                }

                // tag ended right after the whitespace (iCurPos is already past the >)
                if(cPeek==(byte)'>')
                    goto ReturnChunk;

                // un-read the non-whitespace byte
                iCurPos--;

                if(iCurPos>=iDataLength)
                    goto ReturnChunk;
            }

            if(iCurPos>=iDataLength)
                goto ReturnChunk;
        }

        // ok we have got matched tag, it is possible that we might be able to quickly match
        // attribute name known to be used for that tag:
        int iAttrID=oHE.MatchAttr(cPeek,iTagID);

        if(iAttrID>0)
        {
            byte[] bAttr=oHE.GetAttrData(iAttrID);

            if(iCurPos+bAttr.Length+2>=iDataLength)
                goto ActualAttributeParsing;

            // first byte was matched by MatchAttr already, compare the remainder
            for(int i=1; i<bAttr.Length; i++)
            {
                // a single mismatching char means we parse the attribute name the slow way
                if(bAttr[i]!=bHTML[iCurPos+i])
                {
                    goto ActualAttributeParsing;
                }
            }

            byte cNextChar=bHTML[iCurPos+bAttr.Length];

            // ok, we expect next symbol to be =
            if(cNextChar==(byte)'=')
            {
                sAttrName=oHE.GetAttr(iAttrID);
                iCurPos+=bAttr.Length+1;
                cPeek=bHTML[iCurPos];

                goto AttributeValueParsing;
            }
        }
    }

ActualAttributeParsing:

    sText.Clear();

    // doing exactly the same thing as in tag parsing
    while(cPeek!=0)
    {
        bCharType=bTagCharTypes[cPeek];

        if(bCharType==(byte)TagCharType.WhiteSpace)
        {
            // step over the whitespace byte that cPeek refers to
            iCurPos++;

            // speculative loop unroll -- we have a very good chance of seeing non-space char next
            // so instead of setting up loop we will just read it directly, this should save ticks
            // on having to prepare while() loop
            if(iCurPos<iDataLength)
                cChar=bHTML[iCurPos++];
            else
            {
                cPeek=0;
                break;
            }

            bCharType=bTagCharTypes[cChar];

            if(bCharType==(byte)TagCharType.WhiteSpace)
            {
                // long run of whitespace: consume until the first non-whitespace byte
                while(iCurPos<iDataLength)
                {
                    cChar=bHTML[iCurPos++];
                    bCharType=bTagCharTypes[cChar];
                    if(bCharType==(byte)TagCharType.WhiteSpace)
                    {
                        continue;
                    }
                    break;
                }

                if(iCurPos>=iDataLength)
                {
                    cChar=0;
                    cPeek=0;
                    break;
                }
            }

            // now, if we have already got attribute name it means that we need to go to parse value (which may not be present)
            if(sText.iBufPos>0)
            {
                // un-read the non-whitespace byte and look at it
                iCurPos--;

                if(iCurPos<iDataLength)
                    cPeek=bHTML[iCurPos];
                else
                    cPeek=0;

                // ok, we have got attribute name and now we have got next char there
                // most likely we have got = here and then value
                if(cPeek==(byte)'=')
                {
                    // move forward one char
                    iCurPos++;

                    if(iCurPos<iDataLength)
                        cPeek=bHTML[iCurPos];
                    else
                        cPeek=0;

                    break;
                }

                // or we can have end of tag itself, doh!
                if(cPeek==(byte)'>')
                {
                    // move forward one char
                    iCurPos++;

                    if(sText.iBufPos>0)
                        oChunk.AddParam(sText.SetToStringASCII(),"",(byte)' ');

                    if(!oChunk.bClosure)
                        oChunk.oType=HTMLchunkType.OpenTag;

                    return oChunk;
                }

                // closure
                if(cPeek==(byte)'/')
                {
                    oChunk.bClosure=true;
                    oChunk.bEndClosure=true;
                    oChunk.oType=HTMLchunkType.CloseTag;
                    continue;
                }

                // ok, we have got new char starting after current attribute name is fully parsed
                // this means the attribute name is on its own and the char we found is start
                // of a new attribute
                oChunk.AddParam(sText.SetToStringASCII(),"",(byte)' ');
                sText.Clear();
                goto AttributeParsing;
            }
        }
        else
        {
            // consume the byte that was peeked on the previous run
            if(iCurPos<iDataLength)
                cChar=bHTML[iCurPos++];
            else
                cChar=0;
        }

        if(iCurPos<iDataLength)
            cPeek=bHTML[iCurPos];
        else
            cPeek=0;

        // most likely we should have lower-cased ASCII char here
        if(bCharType==(byte)TagCharType.LowerCasedASCIIorDigit)
        {
            sText.bBuffer[sText.iBufPos++]=cChar;
            continue;
        }

        // = with attribute value to follow
        if(cChar==(byte)'=')
        {
            break;
        }

        // nope, we have got upper cased ASCII char - this seems to be LESS likely than > and /
        if(bCharType>32)
        {
            // bCharType in this case contains already lower-cased char
            sText.bBuffer[sText.iBufPos++]=bCharType;
            continue;
        }

        // tag end - any collected name is a value-less attribute
        if(cChar==(byte)'>')
        {
            if(sText.iBufPos>0)
                oChunk.AddParam(sText.SetToStringASCII(),"",(byte)' ');

            if(!oChunk.bClosure)
                oChunk.oType=HTMLchunkType.OpenTag;

            return oChunk;
        }

        // closure of tag sign
        if(cChar==(byte)'/')
        {
            oChunk.bClosure=true;
            oChunk.bEndClosure=true;
            oChunk.oType=HTMLchunkType.CloseTag;
            continue;
        }

        // some other char - kept as part of the attribute name
        sText.bBuffer[sText.iBufPos++]=cChar;
    }

    // ran out of data while in attribute name: register what we have as a value-less attribute
    if(cPeek==0)
    {
        if(sText.iBufPos>0)
            oChunk.AddParam(sText.SetToStringASCII(),"",(byte)' ');

        if(!oChunk.bClosure)
            oChunk.oType=HTMLchunkType.OpenTag;

        return oChunk;
    }

    sAttrName=sText.SetToStringASCII();

AttributeValueParsing:

    // ***********************************************************************
    // STAGE 3: parse attribute value
    // ***********************************************************************

    // the value could be just string, or in quotes (single or double)
    // or we can have next attribute name start, in which case we will jump back to attribute parsing

    // for tracking quotes purposes
    byte cQuotes=cPeek;

    int iValueStartOffset;

    // skip whitespace if any
    if(cPeek<=32 && bTagCharTypes[cPeek]==(byte)TagCharType.WhiteSpace)
    {
        iCurPos++;

        // speculative loop unroll -- we have a very good chance of seeing non-space char next
        // so instead of setting up loop we will just read it directly, this should save ticks
        // on having to prepare while() loop
        if(iCurPos<iDataLength)
            cPeek=bHTML[iCurPos];
        else
        {
            iValueStartOffset=iCurPos-1;
            goto AttributeValueEnd;
        }

        if(cPeek<=32 && bTagCharTypes[cPeek]==(byte)TagCharType.WhiteSpace)
        {
            while(iCurPos<iDataLength)
            {
                cPeek=bHTML[iCurPos++];
                if(cPeek<=32 && bTagCharTypes[cPeek]==(byte)TagCharType.WhiteSpace)
                {
                    continue;
                }

                // un-read the non-whitespace byte: iCurPos must point at cPeek again
                iCurPos--;
                break;
            }

            if(iCurPos>=iDataLength)
            {
                iValueStartOffset=iCurPos-1;
                goto AttributeValueEnd;
            }
        }

        cQuotes=cPeek;
    }

    // because we deal with VALUE of the attribute it means we can't lower-case it,
    // or skip whitespace (if in quotes), which in practice means that we don't need to copy
    // it to temporary string buffer, we can just remember starting offset and then create string from
    // data in bHTML

    // ok, first char can be one of the quote chars or something else
    if(cPeek!='\"' && cPeek!='\'')
    {
        // unquoted value: it is reported with ' ' as its quote char
        iValueStartOffset=iCurPos;

        cQuotes=(byte)' ';

        // any other char here means we have value up until next whitespace or end of tag
        // this gives us good opportunity to scan fairly quickly without otherwise redundant
        // checks - this should happen fairly rarely, however loop dealing with data between quotes
        // will happen often enough and its best to eliminate as much stuff from it as possible

        // move to next char
        if(iCurPos<iDataLength)
            cPeek=bHTML[iCurPos++];
        else
        {
            goto AttributeValueEnd;
        }

        while(cPeek!=0)
        {
            // if whitespace then we got our value and need to go back to param
            if(cPeek<=32 && bTagCharTypes[cPeek]==(byte)TagCharType.WhiteSpace)
            {
                // iCurPos is one past the terminating byte, hence the -1
                oChunk.AddParam(sAttrName,GetString(iValueStartOffset,iCurPos-iValueStartOffset-1),(byte)' ');
                iCurPos--;
                goto AttributeParsing;
            }

            // end of tag?
            if(cPeek==(byte)'>')
            {
                break;
            }

            if(iCurPos<iDataLength)
                cPeek=bHTML[iCurPos++];
            else
            {
                // out of data: position one past the end so that the length computed
                // at AttributeValueEnd covers the value up to the last byte
                iCurPos=iDataLength+1;
                goto AttributeValueEnd;
            }
        }

        // ok we are done, add outstanding attribute
        oChunk.AddParam(sAttrName,GetString(iValueStartOffset,iCurPos-iValueStartOffset-1),(byte)' ');

        goto ReturnChunk;
    }

    // quoted value: move one step forward, past the opening quote
    iCurPos++;

    iValueStartOffset=iCurPos;

    if(iCurPos<iDataLength)
        cPeek=bHTML[iCurPos++];
    else
    {
        goto AttributeValueEnd;
    }

    // attribute value parsing from between two quotes
    while(cPeek!=0)
    {
        // check whether we have got possible entity (can be anything starting with &, byte 38)
        if(cPeek==38)
        {
            int iPrevPos=iCurPos;

            char cEntityChar=oE.CheckForEntity(bHTML,ref iCurPos,iDataLength);

            // zero means no entity was recognised: carry on with the next byte
            if(cEntityChar==0)
            {
                if(iCurPos<iDataLength)
                    cPeek=bHTML[iCurPos++];
                else
                    break;

                continue;
            }
            else
            {
                // okay we have got an entity, our hope of not having to copy stuff into variable
                // is over, we have to continue in a slower fashion :(
                // but thankfully this should happen very rarely, so, annoying to code, but
                // most codepaths will run very fast!
                int iPreEntLen=iPrevPos-iValueStartOffset-1;

                // 14/05/08 need to clear text - it contains attribute name text
                sText.Clear();

                // copy the part of the value that preceded the entity
                if(iPreEntLen>0)
                {
                    Array.Copy(bHTML,iValueStartOffset,sText.bBuffer,0,iPreEntLen);
                    sText.iBufPos=iPreEntLen;
                }

                // we have to skip now to next byte, since
                // some converted chars might well be control chars like >
                oChunk.bEntities=true;

                // test the decoded entity itself (cChar only holds a stale byte from
                // name scanning); this matches the check done for later entities below
                if(cEntityChar==(byte)'<')
                    oChunk.bLtEntity=true;

                sText.Append(cEntityChar);

                if(iCurPos<iDataLength)
                    cPeek=bHTML[iCurPos++];
                else
                {
                    goto AttributeValueEnd;
                }

                // okay, we continue here using in effect new inside loop as we might have more entities here
                // attribute value parsing from between two quotes
                while(cPeek!=0)
                {
                    // check whether we have got possible entity (can be anything starting with &)
                    if(cPeek==38)
                    {
                        char cNewEntityChar=oE.CheckForEntity(bHTML,ref iCurPos,iDataLength);

                        // non-zero means an entity was recognised
                        if(cNewEntityChar!=0)
                        {
                            if(cNewEntityChar==(byte)'<')
                                oChunk.bLtEntity=true;

                            sText.Append(cNewEntityChar);

                            if(iCurPos<iDataLength)
                                cPeek=bHTML[iCurPos++];
                            else
                                goto AttributeValueEnd;

                            continue;
                        }
                    }

                    // check if is end of quotes
                    if(cPeek==cQuotes)
                    {
                        // ok we finished scanning it: add param with value and then go back to param name parsing
                        oChunk.AddParam(sAttrName,sText.SetToString(),cQuotes);

                        if(iCurPos<iDataLength)
                            cPeek=bHTML[iCurPos];
                        else
                            // data ended right after the closing quote: the attribute is already
                            // registered, so finish without adding it a second time
                            goto ReturnChunk;

                        goto AttributeParsing;
                    }

                    sText.bBuffer[sText.iBufPos++]=cPeek;

                    if(iCurPos<iDataLength)
                        cPeek=bHTML[iCurPos++];
                    else
                        break;
                }

                // value was not terminated by a quote: register what was collected
                oChunk.AddParam(sAttrName,sText.SetToString(),cQuotes);
                goto ReturnChunk;
            }
        }

        // check if is end of quotes
        if(cPeek==cQuotes)
        {
            // ok we finished scanning it: add param with value and then go back to param name parsing
            oChunk.AddParam(sAttrName,GetString(iValueStartOffset,iCurPos-iValueStartOffset-1),cQuotes);

            if(iCurPos<iDataLength)
                cPeek=bHTML[iCurPos];
            else
            {
                // data ended right after the closing quote: the attribute is already
                // registered, so finish without adding it a second time
                goto ReturnChunk;
            }

            goto AttributeParsing;
        }

        if(iCurPos<iDataLength)
            cPeek=bHTML[iCurPos++];
        else
        {
            break;
        }
    }

AttributeValueEnd:

    // ok we are done, add outstanding attribute
    int iLen=iCurPos-iValueStartOffset-1;
    if(iLen>0)
        oChunk.AddParam(sAttrName,GetString(iValueStartOffset,iLen),cQuotes);
    else
        oChunk.AddParam(sAttrName,"",cQuotes);

ReturnChunk:

    if(oChunk.bClosure)
    {
        oChunk.oType=HTMLchunkType.CloseTag;
    }
    else
        oChunk.oType=HTMLchunkType.OpenTag;

    return oChunk;
}
/// <summary>
/// Releases the references held by this object; does nothing when called again
/// </summary>
/// <param name="bDisposing">Not used by this implementation</param>
private void Dispose(bool bDisposing)
{
    // already released - nothing left to do
    if(bDisposed)
    {
        return;
    }

    bDisposed=true;

    // only the references are dropped here, none of the objects is disposed by this method
    bHTML=null;
    oChunk=null;
    sText=null;
    oE=null;
    oP=null;
}
/// <summary>
/// Handles META tags that set page encoding
/// </summary>
/// <param name="oP">HTML parser object that is used for parsing</param>
/// <param name="oChunk">Parsed chunk that should contain tag META</param>
/// <param name="bEncodingSet">Your own flag that shows whether encoding was already set or not, if set
/// once then it should not be changed - this is the logic applied by major browsers. It is switched to
/// true when this call successfully sets the encoding.</param>
/// <returns>True if this was a META tag with http-equiv of content-type (or content-category), even when
/// the encoding itself was not changed; false otherwise</returns>
public static bool HandleMetaEncoding(HTMLparser oP,HTMLchunk oChunk,ref bool bEncodingSet)
{
    // cheap length and first-char checks come first so that most tags are rejected without a string compare
    if(oChunk.sTag.Length!=4 || oChunk.sTag[0]!='m' || oChunk.sTag!="meta")
        return false;

    // if we do not use hashmode already then we call conversion explicitly
    // this is slow, but METAs are very rare so performance penalty is low
    if(!oChunk.bHashMode)
        oChunk.ConvertParamsToHash();

    string sKey=oChunk.oParams["http-equiv"] as string;

    if(sKey==null)
        return false;

    // ordinal case-insensitive comparison: no lower-cased copy is allocated and the result
    // does not depend on the current culture.
    // "content-category" is a rare case (appears to work in IE) reported to exist in some pages by Martin Bächtold
    bool bContentTypeMeta=
        string.Equals(sKey,"content-type",System.StringComparison.OrdinalIgnoreCase) ||
        string.Equals(sKey,"content-category",System.StringComparison.OrdinalIgnoreCase);

    if(!bContentTypeMeta)
        return false;

    // we might have charset here that may hint at necessity to decode page
    // check for possible encoding change

    // once encoding is set it should not be changed, but you can be damn
    // sure there are web pages out there that do that!!!
    if(!bEncodingSet)
    {
        string sData=oChunk.oParams["content"] as string;

        // it is possible we have broken META tag without Content part
        if(sData!=null)
        {
            if(oP.SetEncoding(sData))
            {
                // encoding was changed: here the caller needs to reencode any text found so far,
                // most likely it will be just TITLE, the rest can be ignored anyway
                bEncodingSet=true;
            }
            else
            {
                // failed to set encoding - most likely encoding string
                // was incorrect or your machine lacks codepages or something
                // else - might be good idea to put warning message here
            }
        }
    }

    return true;
}
/// <summary>
/// Internally parses the tag that starts at the current position (the point where
/// the opening angle bracket was found) and returns the resulting chunk
/// </summary>
/// <returns>Chunk for the parsed tag, or the chunk produced by script parsing when an opening script tag was found</returns>
HTMLchunk GetNextTag()
{
    oChunk=oTP.ParseTag(ref iCurPos);

    // for backwards compatibility mark closed tags with params as open
    bool bClosedTagWithParams=
        oChunk.iParams>0 &&
        bAutoMarkClosedTagsWithParamsAsOpen &&
        oChunk.oType==HTMLchunkType.CloseTag;

    if(bClosedTagWithParams)
    {
        oChunk.oType=HTMLchunkType.OpenTag;
    }

    // check for start of script: length and first char are tested before the full string compare
    string sTagName=oChunk.sTag;
    bool bScriptTag=sTagName.Length==6 && sTagName[0]=='s' && sTagName=="script";

    if(bScriptTag && !oChunk.bClosure)
    {
        // opening script tag: its content is handled by the dedicated script parser
        oChunk.oType=HTMLchunkType.Script;
        oChunk=oTP.ParseScript(ref iCurPos);
        return oChunk;
    }

    // any other tag: record how many bytes it covered and keep the raw HTML if requested
    oChunk.iChunkLength=iCurPos-oChunk.iChunkOffset;

    if(bKeepRawHTML)
    {
        oChunk.oHTML=oEnc.GetString(bHTML,oChunk.iChunkOffset,oChunk.iChunkLength);
    }

    return oChunk;
}
/// <summary>
/// Disposes the helper objects owned by this object and drops the reference to
/// the HTML data; does nothing when called again
/// </summary>
/// <param name="bDisposing">Not used by this implementation</param>
private void Dispose(bool bDisposing)
{
    // already disposed - nothing left to do
    if(bDisposed)
    {
        return;
    }

    bDisposed=true;

    // each helper is disposed only if it exists, then its reference is cleared

    if(oChunk!=null)
    {
        oChunk.Dispose();
        oChunk=null;
    }

    if(sText!=null)
    {
        sText.Dispose();
        sText=null;
    }

    // the data buffer is only released, there is nothing to dispose on it
    bHTML=null;

    if(oE!=null)
    {
        oE.Dispose();
        oE=null;
    }

    if(oTP!=null)
    {
        oTP.Dispose();
        oTP=null;
    }
}
/// <summary>
/// Sets oHTML variable in a chunk to the raw HTML that was parsed for that chunk.
/// </summary>
/// <param name="oChunk">Chunk returned by ParseNext function, it must belong to the same HTMLparser that
/// was initiated with the same HTML data that this chunk belongs to</param>
public void SetRawHTML(HTMLchunk oChunk)
{
    // the chunk remembers where in the data it started and how many bytes it covered
    int iOffset=oChunk.iChunkOffset;
    int iLength=oChunk.iChunkLength;

    // note: this really should have been byte array assigned rather than string
    // it would be more correct originality-wise
    oChunk.oHTML=oEnc.GetString(bHTML,iOffset,iLength);
}