private void Dispose(bool bDisposing)
{
    // A repeated call must be a no-op: everything below was already released.
    if (bDisposed)
    {
        return;
    }

    // Flip the flag before releasing anything so later calls return immediately.
    bDisposed = true;

    // Release the owned helpers in a fixed order and clear each reference afterwards.
    if (oChunk != null)
    {
        oChunk.Dispose();
        oChunk = null;
    }

    if (sText != null)
    {
        sText.Dispose();
        sText = null;
    }

    // The HTML buffer is only referenced here, so dropping the reference is all that is needed.
    bHTML = null;

    if (oE != null)
    {
        oE.Dispose();
        oE = null;
    }

    if (oTP != null)
    {
        oTP.Dispose();
        oTP = null;
    }
}
/// <summary>
/// Parses the tag that begins at the current position (the point where '&lt;' was found)
/// and returns the chunk describing it.
/// </summary>
/// <returns>
/// Chunk for the parsed tag; for an opening script tag, the chunk returned by the script parser
/// </returns>
HTMLchunk GetNextTag()
{
    oChunk = oTP.ParseTag(ref iCurPos);

    // Backwards compatibility: a closed tag that carries params is reported as an open tag.
    bool bReportAsOpen = oChunk.iParams > 0
        && bAutoMarkClosedTagsWithParamsAsOpen
        && oChunk.oType == HTMLchunkType.CloseTag;

    if (bReportAsOpen)
    {
        oChunk.oType = HTMLchunkType.OpenTag;
    }

    // Cheap length and first-character tests run before the full comparison with "script".
    string sTagName = oChunk.sTag;
    bool bIsScriptTag = sTagName.Length == 6 && sTagName[0] == 's' && sTagName == "script";

    // An opening script tag hands over to the script parser, whose chunk is returned as-is.
    if (bIsScriptTag && !oChunk.bClosure)
    {
        oChunk.oType = HTMLchunkType.Script;
        oChunk = oTP.ParseScript(ref iCurPos);
        return oChunk;
    }

    // The chunk spans from its recorded offset up to the position the parser stopped at.
    oChunk.iChunkLength = iCurPos - oChunk.iChunkOffset;

    if (bKeepRawHTML)
    {
        oChunk.oHTML = oEnc.GetString(bHTML, oChunk.iChunkOffset, oChunk.iChunkLength);
    }

    return oChunk;
}
/// <summary>
/// Fills the chunk's oHTML field with the raw HTML that was parsed for that chunk, decoded
/// from this parser's HTML buffer using the parser's current encoding.
/// </summary>
/// <param name="oChunk">Chunk returned by ParseNext function, it must belong to the same HTMLparser that
/// was initiated with the same HTML data that this chunk belongs to</param>
public void SetRawHTML(HTMLchunk oChunk)
{
    // note: assigning the raw byte range instead of a decoded string would be more
    // faithful to the original data, but oHTML is a string
    int iOffset = oChunk.iChunkOffset;
    int iLength = oChunk.iChunkLength;

    oChunk.oHTML = oEnc.GetString(bHTML, iOffset, iLength);
}
/// <summary>
/// Handles META tags that set page encoding
/// </summary>
/// <param name="oP">HTML parser object that is used for parsing</param>
/// <param name="oChunk">Parsed chunk that should contain tag META</param>
/// <param name="bEncodingSet">Your own flag that shows whether encoding was already set or not, if set
/// once then it should not be changed - this is the logic applied by major browsers</param>
/// <returns>True if this was META tag setting Encoding, false otherwise</returns>
public static bool HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
{
    // cheap length and first-character tests run before the full comparison with "meta"
    if (oChunk.sTag.Length != 4 || oChunk.sTag[0] != 'm' || oChunk.sTag != "meta")
    {
        return false;
    }

    // if we do not use hashmode already then we call conversion explicitly
    // this is slow, but METAs are very rare so performance penalty is low
    if (!oChunk.bHashMode)
    {
        oChunk.ConvertParamsToHash();
    }

    string sKey = oChunk.oParams["http-equiv"] as string;
    if (sKey == null)
    {
        return false;
    }

    // Ordinal, case-insensitive comparison: these are protocol tokens, so the match must not
    // depend on the current culture, and no lower-cased copy of the key has to be allocated.
    bool bContentType = string.Equals(sKey, "content-type", System.StringComparison.OrdinalIgnoreCase);

    // rare case (appears to work in IE) reported to exist in some pages by Martin Bächtold
    bool bContentCategory = string.Equals(sKey, "content-category", System.StringComparison.OrdinalIgnoreCase);

    if (!bContentType && !bContentCategory)
    {
        return false;
    }

    // we might have charset here that may hint at necessity to decode page;
    // once encoding is set it should not be changed, even though some pages try to do that
    if (!bEncodingSet)
    {
        // it is possible we have broken META tag without Content part
        string sData = oChunk.oParams["content"] as string;

        if (sData != null && oP.SetEncoding(sData))
        {
            // here the caller may need to re-encode any text found so far
            // (most likely just TITLE, the rest can be ignored anyway)
            bEncodingSet = true;
        }

        // otherwise setting the encoding failed - most likely the encoding string was
        // incorrect or the machine lacks the code page - a warning could be raised here
    }

    // the tag was an encoding-related META even when the encoding itself was left untouched
    return true;
}