/// <summary>
/// Pure parsing benchmark: drains the parser with no per-token processing at all.
/// </summary>
/// <param name="oP">Parser object</param>
void BenchMarkParse(HTMLparser oP)
{
    // HTMLparser recycles one HTMLchunk instance, so the token must never be
    // kept beyond the current iteration anyway.
    for (HTMLchunk oToken = oP.ParseNext(); oToken != null; oToken = oP.ParseNext())
    {
        switch (oToken.oType)
        {
            case HTMLchunkType.OpenTag:    // <a href="">
            case HTMLchunkType.CloseTag:   // </a>
            case HTMLchunkType.Text:       // plain text
            case HTMLchunkType.Comment:    // <!-- ... -->
                // intentionally empty: only raw parsing speed is measured
                break;
        }
    }
}
/// <summary>
/// Builds a fully configured HTMLparser instance for high-performance parsing
/// with raw-HTML retention.
/// </summary>
internal static HTMLparser GetInstance()
{
    var parser = new HTMLparser();

    // Array param mode (sParams/sValues with the count in iParams) instead of
    // the default hashtable mode (oParams) -- noticeably faster.
    parser.SetChunkHashMode(false);

    // Keep the original HTML of each chunk: costs some performance, but makes
    // exact reconstruction of the markup possible.
    parser.bKeepRawHTML = true;

    // Decode entities so strings hold the final representation of the data;
    // note that such strings must be entity-encoded again before being put
    // back into output HTML.
    parser.bDecodeEntities = true;

    // Mini-entities mode keeps most entities as-is (handy when HTML must be
    // re-created after parsing).
    parser.bDecodeMiniEntities = true;

    // The mini-entity table only matters while full decoding is off.
    if (!parser.bDecodeEntities && parser.bDecodeMiniEntities)
        parser.InitMiniEntities();

    // For comments and SCRIPT tags set oHTML to the data BETWEEN the tags
    // rather than the complete raw HTML (needs the auto-keep flags below).
    parser.bAutoExtractBetweenTagsOnly = true;
    parser.bAutoKeepComments = true;
    parser.bAutoKeepScripts = true;

    // Collapse whitespace before a tag into a single " "; set to false only
    // when the exact inter-tag whitespace matters (slower).
    parser.bCompressWhiteSpaceBeforeTag = true;

    // Do NOT re-interpret self-closed tags with params (<x ... />) as open
    // tags; the legacy default of true is unsuitable for XML-style input.
    parser.bAutoMarkClosedTagsWithParamsAsOpen = false;

    return parser;
}
/// <summary>
/// Starts parsing of the bundled tests/majestic12.html document.
/// </summary>
/// <param name="iParseTimes">Number of times to parse document (useful for benchmarking)</param>
void Start(int iParseTimes)
{
    string sFileName = Path.Combine(Directory.GetCurrentDirectory(), "tests" + Path.DirectorySeparatorChar + "majestic12.html");

    if (!File.Exists(sFileName))
    {
        Console.WriteLine("Could not find file in current directory to parse - expected it to be here: " + sFileName);
        return;
    }

    HTMLparser oP = new HTMLparser();

    // Array param mode (sParams/sValues/iParams) is faster than the default
    // hashtable mode where params land in HTMLchunk.oParams.
    oP.SetChunkHashMode(false);

    // Raw per-chunk HTML is only needed when reconstructing markup.
    oP.bKeepRawHTML = false;

    // load HTML from file
    oP.LoadFromFile(sFileName);

    DateTime oStart = DateTime.Now;

    for (int i = 0; i < iParseTimes; i++)
    {
        // benchmark silently on repeated runs, print chunks on a single run
        if (iParseTimes > 1)
        {
            BenchMarkParse(oP);
        }
        else
        {
            ParseAndPrint(oP);
        }

        oP.Reset();
    }

    // total wall-clock parsing time in milliseconds
    int iMSecs = (int)((DateTime.Now.Ticks - oStart.Ticks) / TimeSpan.TicksPerMillisecond);

    if (iMSecs > 0 && iParseTimes > 0)
    {
        // FIX: use floating-point arithmetic here -- the previous integer
        // divisions reported "0 secs" for any sub-second total and truncated
        // the per-parse millisecond figure.
        Console.WriteLine("Parsed {0} time(s), total time {1:0.00} secs, approximately {2:0.00} ms per full parse.",
            iParseTimes, iMSecs / 1000.0, (double)iMSecs / iParseTimes);
    }

    oP.Close();
}
/// <summary>
/// Releases object references held by this instance; safe to call repeatedly.
/// </summary>
/// <param name="bDisposing">Standard dispose-pattern flag; not used by this implementation</param>
private void Dispose(bool bDisposing)
{
    // idempotent: only the first call does any work
    if (bDisposed)
        return;

    bDisposed = true;

    // drop references so the GC can reclaim the buffers and helper objects
    bHTML = null;
    oChunk = null;
    sText = null;
    oE = null;
    oP = null;
}
/// <summary>
/// Inits tag parser with shared state owned by the main parser.
/// </summary>
/// <param name="p_oP">Owning parser instance</param>
/// <param name="p_oChunk">Chunk object re-used for returning parsed tokens</param>
/// <param name="p_sText">Shared dynamic string buffer</param>
/// <param name="p_bHTML">Raw HTML bytes to parse</param>
/// <param name="p_iDataLength">Number of valid bytes in p_bHTML</param>
/// <param name="p_oE">Entities handler</param>
/// <param name="p_oHE">Heuristics handler</param>
internal void Init(HTMLparser p_oP, HTMLchunk p_oChunk, DynaString p_sText, byte[] p_bHTML, int p_iDataLength, HTMLentities p_oE, HTMLheuristics p_oHE)
{
    oP = p_oP;
    oChunk = p_oChunk;
    sText = p_sText;
    bHTML = p_bHTML;
    iDataLength = p_iDataLength;

    // we don't want to be too close to end of data when dealing with heuristics
    iMaxHeuDataLength = iDataLength - MIN_DATA_SIZE_FOR_HEURISTICS;

    oE = p_oE;
    oHE = p_oHE;
}
/// <summary>
/// Applies a META-declared page encoding, logging when the charset could not be set.
/// </summary>
/// <param name="oP">Parser whose encoding may be switched</param>
/// <param name="oChunk">Chunk that may be an encoding-setting META tag</param>
/// <param name="bEncodingSet">True once an encoding has been applied; never overridden afterwards</param>
private void HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
{
    // First encoding wins: major browsers keep the initially detected charset,
    // so once it is set we never try to replace it.
    if (bEncodingSet)
        return;

    bool bWasEncodingMeta = HTMLparser.HandleMetaEncoding(oP, oChunk, ref bEncodingSet);

    // the META declared a charset but the parser failed to apply it
    if (bWasEncodingMeta && !bEncodingSet)
        Console.WriteLine("Failed to set encoding from META: {0}", oChunk.GenerateHTML());
}
/// <summary>
/// Test fixture setup: disposes any previous parser and creates a fresh one
/// with entity decoding enabled.
/// </summary>
public void CreateParser()
{
    // release the parser left over from a previous run, if any
    if (oP != null)
    {
        oP.Close();
        oP = null;
    }

    oP = new HTMLparser { bDecodeEntities = true };

    // dummy assertion
    Assert.IsNotNull(oP);
}
/// <summary>
/// Resolves the real content URL for known domains: loads the domain table from
/// the embedded "fcc.dcc" resource, fetches and parses the page, and for
/// supported domains rewrites <paramref name="pu"/> to the actual media URL.
/// </summary>
/// <param name="pu">Page URL; replaced in place when a content URL is found</param>
static void getCDU(ref string pu)
{
    var dcc = new Dictionary<string, string>();
    string dm = "";

    var assembly = System.Reflection.Assembly.GetExecutingAssembly();

    // FIX: the reader (and the underlying resource stream) was never disposed.
    using (var sr = new StreamReader(assembly.GetManifestResourceStream("fcc.dcc"), Encoding.UTF8))
    {
        while (!sr.EndOfStream)
        {
            string s = sr.ReadLine();

            // FIX: tolerate malformed resource lines -- a missing TAB used to
            // crash Substring with a negative index.
            int p = s.IndexOf('\t');
            if (p < 0)
                continue;

            // FIX: indexer instead of Add -- a duplicated domain entry used to
            // throw ArgumentException (last entry now wins).
            dcc[s.Substring(0, p)] = s.Substring(p + 1);
        }
    }

    // pick the first known domain mentioned in the URL
    foreach (var k in dcc.Keys)
    {
        if (pu.IndexOf(k) > -1)
        {
            dm = k;
            break;
        }
    }

    // unknown domain: leave the URL untouched
    if (dm == "")
    {
        return;
    }

    // download the page and feed it to the shared Majestic12 parser
    byte[] by = getPUbytes(pu, dm, dcc[dm]);
    getM12(by);

    string v;
    int o = 0;

    switch (dm)
    {
        case "m24.ru":
            // https://www.m24.ru/news/proisshestviya/20012020/104276
            v = getScrByID("type", "application/ld+json");
            if (v != null)
            {
                // extract the value of "contentUrl": "..." from the JSON-LD blob
                o = v.IndexOf("contentUrl");
                if (o > -1)
                {
                    // FIX: guard against a missing trailing comma, which used to
                    // produce a negative Substring length and throw
                    int iEnd = v.IndexOf(",", o + 14);
                    if (iEnd > o + 14)
                        pu = v.Substring(o + 14, iEnd - o - 15);
                }
            }
            break;
    }

    by = null;
    parser = null;
    return;
}
/// <summary>
/// Handles META tags that set page encoding
/// </summary>
/// <param name="oP">Parser whose encoding may be switched</param>
/// <param name="oChunk">Chunk that may be an encoding-setting META tag</param>
/// <param name="bEncodingSet">Set to true once an encoding has been applied</param>
void HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
{
    // if encoding already set then we should not be trying to set new one
    // this is the logic that major browsers follow - the first Encoding is assumed to be
    // the correct one
    if (bEncodingSet)
    {
        return;
    }

    // the static helper returns true when the chunk was an encoding-setting META
    if (HTMLparser.HandleMetaEncoding(oP, oChunk, ref bEncodingSet))
    {
        // META declared a charset but applying it failed (bad charset string
        // or missing codepage on this machine)
        if (!bEncodingSet)
        {
            Console.WriteLine("Failed to set encoding from META: {0}", oChunk.GenerateHTML());
        }
    }
}
/// <summary>
/// Creates and configures the shared Majestic12 parser and points it at the
/// given raw HTML bytes.
/// </summary>
/// <param name="by">Raw HTML bytes to parse</param>
static void getM12(byte[] by)
{
    parser = new HTMLparser();

    // array-based params -- faster than the hashtable mode
    parser.SetChunkHashMode(false);
    parser.bKeepRawHTML = false;

    // decode entities fully; the mini-entities flag only matters when full
    // decoding is off, hence the (never-true here) guard below
    parser.bDecodeEntities = true;
    parser.bDecodeMiniEntities = true;
    if (!parser.bDecodeEntities && parser.bDecodeMiniEntities)
        parser.InitMiniEntities();

    // keep comments/scripts, with oHTML set to the data between their tags
    parser.bAutoKeepComments = true;
    parser.bAutoKeepScripts = true;
    parser.bAutoExtractBetweenTagsOnly = true;

    // collapse whitespace before tags into a single space
    parser.bCompressWhiteSpaceBeforeTag = true;

    // keep self-closed tags with params reported as closed, not open
    parser.bAutoMarkClosedTagsWithParamsAsOpen = false;

    parser.Init(by);
}
/// <summary>
/// Parses the given HTML string and returns hard copies of every non-empty chunk.
/// </summary>
/// <param name="str">HTML to parse</param>
/// <returns>Array of cloned chunks; whitespace-only chunks are skipped</returns>
private HTMLchunk[] htmlParse(string str)
{
    // FIX: typed List<T> instead of ArrayList -- no casts, no reflection-based
    // ToArray(typeof(...)) call.
    var ret = new List<HTMLchunk>();

    // init parser
    Majestic12.HTMLparser parser = new Majestic12.HTMLparser();

    // keep raw html because we need to reconstruct it
    parser.bKeepRawHTML = true;

    // keep text... this is for parsing just tags
    parser.bTextMode = true;

    // initialize to parse the string
    parser.Init(str);

    // the parser recycles one HTMLchunk instance, so each kept chunk must be cloned
    Majestic12.HTMLchunk chunk;
    while ((chunk = parser.ParseNext()) != null)
    {
        // discard empty blocks for performance increase
        if (chunk.oHTML.Trim() != "")
        {
            // hard copy the chunk; the strings themselves are immutable so the
            // references can be shared directly (the old String.Copy calls were
            // redundant and String.Copy is obsolete in modern .NET)
            HTMLchunk clone = new HTMLchunk(false);
            clone.oHTML = chunk.oHTML;
            clone.oType = chunk.oType;
            clone.sTag = chunk.sTag;
            ret.Add(clone);
        }
    }

    parser.CleanUp();

    return ret.ToArray();
}
/// <summary>
/// Parses HTML by chunk, prints parsed data on screen and waits for ENTER to go to next chunk
/// </summary>
/// <param name="oP">Parser object</param>
void ParseAndPrint(HTMLparser oP)
{
    if (bReadLineDelay)
        Console.WriteLine("Parsing HTML, will print each parsed chunk, press ENTER after each to continue");

    // the parser recycles a single HTMLchunk instance -- never keep a
    // reference to it beyond the current iteration
    HTMLchunk oChunk = null;

    // Track whether a charset was applied: text found BEFORE the META charset
    // (typically TITLE) would have to be re-encoded once the charset is known.
    // NOTE: a charset may also arrive via the HTTP Content-Type header, which
    // browsers treat as more authoritative -- handle it the same way there.
    bool bEncodingSet = false;

    // debug: oP.SetEncoding(System.Text.Encoding.GetEncoding("iso-8859-1"));

    // META handling plus param printing, shared by the open/close/script cases
    // (replaces the old cross-case "goto PrintParams" label).
    void PrintParams()
    {
        // META tags can change the document encoding
        if (oChunk.sTag.Length == 4 && oChunk.sTag == "meta")
            HandleMetaEncoding(oP, oChunk, ref bEncodingSet);

        // oChunk.GenerateParamsHTML() would do all of this in one call -- the
        // long form below demonstrates access to individual param values
        if (oChunk.bHashMode)
        {
            // hashtable mode: params are in oParams
            if (oChunk.oParams.Count > 0)
            {
                foreach (string sParam in oChunk.oParams.Keys)
                {
                    string sValue = oChunk.oParams[sParam].ToString();

                    if (sValue.Length > 0)
                        Console.Write(" {0}='{1}'", sParam, sValue);
                    else
                        Console.Write(" {0}", sParam);
                }
            }
        }
        else
        {
            // array mode: less convenient, but saves a LOT of CPU ticks
            if (oChunk.iParams > 0)
            {
                for (int i = 0; i < oChunk.iParams; i++)
                {
                    // re-use exactly the same single/double quote char (or
                    // space) that the page itself used for this param
                    switch (oChunk.cParamChars[i])
                    {
                        case (byte)' ':
                            if (oChunk.sValues[i].Length == 0)
                                Console.Write(" {0}", oChunk.sParams[i]);
                            else
                                Console.Write(" {0}={1}", oChunk.sParams[i], oChunk.sValues[i]);
                            break;

                        default:
                            Console.Write(" {0}={1}{2}{1}", oChunk.sParams[i], (char)oChunk.cParamChars[i], oChunk.sValues[i]);
                            break;
                    }
                }
            }
        }
    }

    // parse until ParseNext returns null, indicating the end of the document
    while ((oChunk = oP.ParseNext()) != null)
    {
        switch (oChunk.oType)
        {
            // open tag, ie <a href="">
            case HTMLchunkType.OpenTag:
                Console.Write("Open tag: " + oChunk.sTag);
                PrintParams();
                break;

            // close tag, ie </a>
            case HTMLchunkType.CloseTag:
                Console.Write("Closed tag: " + oChunk.sTag);

                if (oChunk.iParams > 0)
                    PrintParams();
                break;

            // data between <script></script> tags; scripts are not finalised
            // automatically, so pull the raw HTML explicitly when needed
            case HTMLchunkType.Script:
                if (!oP.bAutoKeepScripts && !oP.bKeepRawHTML)
                    oP.SetRawHTML(oChunk);

                if (oChunk.oHTML.Length > 0)
                    Console.Write("Script: " + oChunk.oHTML);
                else
                    Console.Write("Script: [ignored for performance reasons]");

                if (oChunk.iParams > 0)
                    PrintParams();
                break;

            // HTML comment (<!-- ... -->); not finalised automatically either,
            // since comments can be long and building the string is often waste
            case HTMLchunkType.Comment:
                if (oP.bKeepRawHTML || oP.bAutoKeepComments)
                    Console.Write("Comment: " + oChunk.oHTML);
                else
                    Console.Write("Comment: [ignored for performance reasons]");
                break;

            // normal text
            case HTMLchunkType.Text:
                // skip pure whitespace that we are not really interested in;
                // this deliberately bypasses the per-chunk pause below
                if (oP.bCompressWhiteSpaceBeforeTag && oChunk.oHTML.Trim().Length == 0 && bReadLineDelay)
                    continue;

                Console.Write("Text: '{0}'", oChunk.oHTML);
                break;
        }

        if (bReadLineDelay)
            Console.ReadLine();
        else
            Console.WriteLine("");
    }
}
/// <summary>
/// Parses HTML by chunk, prints parsed data on screen and waits for ENTER to go to next chunk
/// </summary>
/// <param name="oP">Parser object</param>
void ParseAndPrint(HTMLparser oP)
{
    if (bReadLineDelay)
        Console.WriteLine("Parsing HTML, will print each parsed chunk, press ENTER after each to continue");

    // HTMLparser recycles a single HTMLchunk token -- never keep a reference
    // to it beyond the current loop iteration
    for (HTMLchunk oChunk = oP.ParseNext(); oChunk != null; oChunk = oP.ParseNext())
    {
        switch (oChunk.oType)
        {
            // open tag, ie <a href="">
            case HTMLchunkType.OpenTag:
                Console.Write("Open tag: " + oChunk.sTag);

                if (oChunk.bHashMode)
                {
                    // hashtable mode: params live in oParams (convenient, but
                    // slower to parse)
                    if (oChunk.oParams.Count > 0)
                    {
                        foreach (string sName in oChunk.oParams.Keys)
                        {
                            string sVal = oChunk.oParams[sName].ToString();

                            if (sVal.Length > 0)
                                Console.Write(" {0}='{1}'", sName, sVal);
                            else
                                Console.Write(" {0}", sName);
                        }
                    }
                }
                else if (oChunk.iParams > 0)
                {
                    // array mode (sParams/sValues): saves a LOT of CPU ticks
                    // when only a few params are actually needed
                    for (int iIdx = 0; iIdx < oChunk.iParams; iIdx++)
                    {
                        if (oChunk.sValues[iIdx].Length > 0)
                            Console.Write(" {0}='{1}'", oChunk.sParams[iIdx], oChunk.sValues[iIdx]);
                        else
                            Console.Write(" {0}", oChunk.sParams[iIdx]);
                    }
                }
                break;

            // close tag, ie </a>
            case HTMLchunkType.CloseTag:
                Console.Write("Closed tag: " + oChunk.sTag);
                break;

            // normal text
            case HTMLchunkType.Text:
                Console.Write("Text: '{0}'", oChunk.oHTML);
                break;

            // HTML comment (<!-- ... -->)
            case HTMLchunkType.Comment:
                // comments are not finalised by default for performance
                // reasons; no need to finalise when raw HTML is kept anyway
                if (!oP.bKeepRawHTML)
                    oChunk.Finalise();

                Console.Write("Comment: " + oChunk.oHTML);
                break;
        }

        if (bReadLineDelay)
            Console.ReadLine();
        else
            Console.WriteLine("");
    }
}
/// <summary>
/// Handles META tags that set page encoding
/// </summary>
/// <param name="oP">HTML parser object that is used for parsing</param>
/// <param name="oChunk">Parsed chunk that should contain tag META</param>
/// <param name="bEncodingSet">Your own flag that shows whether encoding was already set or not, if set
/// once then it should not be changed - this is the logic applied by major browsers</param>
/// <returns>True if this was META tag setting Encoding, false otherwise</returns>
public static bool HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
{
    // quick rejection: cheap length/first-char checks before the full compare
    if (oChunk.sTag.Length != 4 || oChunk.sTag[0] != 'm' || oChunk.sTag != "meta")
    {
        return false;
    }

    // if we do not use hashmode already then we call conversion explicitly
    // this is slow, but METAs are very rare so performance penalty is low
    if (!oChunk.bHashMode)
    {
        oChunk.ConvertParamsToHash();
    }

    string sKey = oChunk.oParams["http-equiv"] as string;

    if (sKey == null)
    {
        return false;
    }

    // FIX (resolves the old FIXIT): compare case-insensitively with Ordinal
    // semantics instead of allocating via ToLower() -- ToLower is also
    // culture-sensitive and failed to match under e.g. the Turkish locale.
    // "content-category" is a rare variant (appears to work in IE) reported
    // to exist in some pages by Martin Bächtold.
    if (string.Equals(sKey, "content-type", StringComparison.OrdinalIgnoreCase) ||
        string.Equals(sKey, "content-category", StringComparison.OrdinalIgnoreCase))
    {
        // we might have charset here that may hint at necessity to decode page;
        // once encoding is set it should not be changed, but you can be damn
        // sure there are web pages out there that do that!!!
        if (!bEncodingSet)
        {
            // it is possible we have broken META tag without Content part
            string sData = oChunk.oParams["content"] as string;

            if (sData != null)
            {
                if (oP.SetEncoding(sData))
                {
                    // the caller should now re-encode any text found so far --
                    // most likely just TITLE, the rest can be ignored anyway
                    bEncodingSet = true;
                }
                // else: failed to set encoding - most likely the encoding string
                // was incorrect or this machine lacks the codepage; a warning
                // message could be emitted here
            }
        }

        return true;
    }

    return false;
}
/// <summary>
/// Parses HTML by chunk, prints parsed data on screen and waits for ENTER to go to next chunk
/// </summary>
/// <param name="oP">Parser object</param>
void ParseAndPrint(HTMLparser oP)
{
    // bReadLineDelay=false;
    if (bReadLineDelay)
    {
        Console.WriteLine("Parsing HTML, will print each parsed chunk, press ENTER after each to continue");
    }

    // parser will return us tokens called HTMLchunk -- warning DO NOT destroy it until end of parsing
    // because HTMLparser re-uses this object
    HTMLchunk oChunk = null;

    // NOTE: bear in mind that when you deal with content which uses non-Latin chars, then you
    // need to ensure that correct encoding is set, this often set in HTML itself, but sometimes
    // only in HTTP headers for a given page - some pages use BOTH, but browsers seem to
    // consider HTTP header setting as more important, so it is best to behave in similar way.
    // See below for code that deals with META based charset setting, similarly you need to call
    // it here if charset is set in Content-Type header

    // we will track whether encoding was set or not here, this is important
    // because we may have to do re-encoding of text found BEFORE META tag, this typically
    // happens for TITLE tags only - if we had no encoding set and then had it set, then
    // we need to reencode it, highly annoying, but having garbage in title is even more annoying
    bool bEncodingSet = false;

    // debug: oP.SetEncoding(System.Text.Encoding.GetEncoding("iso-8859-1"));

    // we parse until returned oChunk is null indicating we reached end of parsing
    while ((oChunk = oP.ParseNext()) != null)
    {
        switch (oChunk.oType)
        {
            // matched open tag, ie <a href="">
            case HTMLchunkType.OpenTag:
                Console.Write("Open tag: " + oChunk.sTag);

                // in order to set correct encoding we need to keep an eye on META tags
                // that hit us on what the encoding should be used, note here
                // that some webpages have TITLE set BEFORE meta-tags, which means you will
                // have to re-encode it in order to get correct representation of text

                // shared param-printing section: the CloseTag and Script cases
                // jump back here via "goto PrintParams" when they carry params
                PrintParams:
                if (oChunk.sTag.Length == 4 && oChunk.sTag == "meta")
                {
                    HandleMetaEncoding(oP, oChunk, ref bEncodingSet);
                }
                ;

                // commented out call to code that will do the job for you - long code below
                // is left to demonstrate how to access individual param values
                // Console.WriteLine("{0}",oChunk.GenerateParamsHTML());
                if (oChunk.bHashMode)
                {
                    // hashtable mode: params live in oParams
                    if (oChunk.oParams.Count > 0)
                    {
                        foreach (string sParam in oChunk.oParams.Keys)
                        {
                            string sValue = oChunk.oParams[sParam].ToString();

                            if (sValue.Length > 0)
                            {
                                Console.Write(" {0}='{1}'", sParam, sValue);
                            }
                            else
                            {
                                Console.Write(" {0}", sParam);
                            }
                        }
                    }
                }
                else
                {
                    // this is alternative method of getting params -- it may look less convinient
                    // but it saves a LOT of CPU ticks while parsing. It makes sense when you only need
                    // params for a few
                    if (oChunk.iParams > 0)
                    {
                        for (int i = 0; i < oChunk.iParams; i++)
                        {
                            // here we can use exactly the same single/double quotes as they
                            // were used on params
                            switch (oChunk.cParamChars[i])
                            {
                                case (byte)' ':
                                    if (oChunk.sValues[i].Length == 0)
                                    {
                                        Console.Write(" {0}", oChunk.sParams[i]);
                                    }
                                    else
                                    {
                                        Console.Write(" {0}={1}", oChunk.sParams[i], oChunk.sValues[i]);
                                    }
                                    break;

                                default:
                                    Console.Write(" {0}={1}{2}{1}", oChunk.sParams[i], (char)oChunk.cParamChars[i], oChunk.sValues[i]);
                                    break;
                            }
                        }
                    }
                }
                break;

            // matched close tag, ie </a>
            case HTMLchunkType.CloseTag:
                //Console.Write(oChunk.GenerateHTML());
                Console.Write("Closed tag: " + oChunk.sTag);

                if (oChunk.iParams > 0)
                {
                    goto PrintParams;
                }
                break;

            // NOTE: you have to call finalisation because it is not done for Scripts or comments
            // Matched data between <script></script> tags
            case HTMLchunkType.Script:
                if (!oP.bAutoKeepScripts && !oP.bKeepRawHTML)
                {
                    oP.SetRawHTML(oChunk);
                }

                if (oChunk.oHTML.Length > 0)
                {
                    Console.Write("Script: " + oChunk.oHTML);
                }
                else
                {
                    Console.Write("Script: [ignored for performance reasons]");
                }

                if (oChunk.iParams > 0)
                {
                    goto PrintParams;
                }
                break;

            // NOTE: you have to call finalisation because it is not done for Scripts or comments
            // matched HTML comment, that's stuff between <!-- and -->
            case HTMLchunkType.Comment:
                //Console.WriteLine("{0}",oChunk.GenerateHTML());
                if (oP.bKeepRawHTML || oP.bAutoKeepComments)
                {
                    // by default we won't finalise automatically as comments are often
                    // very lenghty and it is costly to create long strings when they are not
                    // needed, ie: during indexing of text
                    Console.Write("Comment: " + oChunk.oHTML);
                }
                else
                {
                    // Even if raw HTML by default was not taken you can get it anyway by
                    // uncommenting next line
                    //oP.SetRawHTML(oChunk);
                    Console.Write("Comment: [ignored for performance reasons]");
                }
                break;

            // matched normal text
            case HTMLchunkType.Text:
                // skip pure whitespace that we are not really interested in;
                // note this also bypasses the per-chunk ReadLine/WriteLine below
                if (oP.bCompressWhiteSpaceBeforeTag && oChunk.oHTML.Trim().Length == 0 && bReadLineDelay)
                {
                    continue;
                }

                Console.Write("Text: '{0}'", oChunk.oHTML);
                break;
        }
        ;

        if (bReadLineDelay)
        {
            Console.ReadLine();
        }
        else
        {
            Console.WriteLine("");
        }
    }
}
/// <summary>
/// Starts parsing
/// </summary>
/// <param name="iParseTimes">Number of times to parse document (useful for benchmarking)</param>
/// <param name="sFileName">HTML file to parse; also looked up under ./tests/ when not found as given</param>
void Start(int iParseTimes, string sFileName)
{
    if (!File.Exists(sFileName))
    {
        // fall back to the bundled tests directory
        sFileName = Path.Combine(Directory.GetCurrentDirectory(), "tests" + Path.DirectorySeparatorChar + sFileName);

        if (!File.Exists(sFileName))
        {
            Console.WriteLine("Could not find file in current directory to parse - expected it to be here: " + sFileName);
            return;
        }
    }

    var oParser = new HTMLparser();

    // array param mode (sParams/sValues/iParams) -- faster than the default
    // hashtable mode where params land in HTMLchunk.oParams
    oParser.SetChunkHashMode(false);

    // raw chunk HTML is only needed when reconstructing markup
    oParser.bKeepRawHTML = false;

    // decode entities so strings carry the final representation of the data;
    // re-encode entities before embedding such strings into output HTML
    oParser.bDecodeEntities = true;

    // mini-entities mode keeps most entities as-is; it only matters when full
    // decoding is off, hence the guard below
    oParser.bDecodeMiniEntities = true;

    if (!oParser.bDecodeEntities && oParser.bDecodeMiniEntities)
        oParser.InitMiniEntities();

    // for comments and SCRIPT tags keep only data BETWEEN the tags (requires
    // the two auto-keep flags below)
    oParser.bAutoExtractBetweenTagsOnly = true;
    oParser.bAutoKeepComments = true;
    oParser.bAutoKeepScripts = true;

    // compress whitespace before tags to one " "; false would return exact
    // whitespace at the cost of extra CPU cycles
    oParser.bCompressWhiteSpaceBeforeTag = true;

    // do not re-interpret self-closed tags with params as open tags
    oParser.bAutoMarkClosedTagsWithParamsAsOpen = false;

    // load HTML from file; oParser.Init(byte[]) is the in-memory alternative
    oParser.LoadFromFile(sFileName);

    DateTime dtStarted = DateTime.Now;

    for (int iRun = 0; iRun < iParseTimes; iRun++)
    {
        // silent drain for benchmarks, verbose printing for a single run
        if (iParseTimes > 1)
            BenchMarkParse(oParser);
        else
            ParseAndPrint(oParser);

        oParser.Reset();
    }

    // elapsed wall-clock time in milliseconds
    int iMSecs = (int)((DateTime.Now.Ticks - dtStarted.Ticks) / TimeSpan.TicksPerMillisecond);

    if (iMSecs > 0 && iParseTimes > 0)
    {
        Console.Error.WriteLine("Parsed {0} time(s), total time {1:0.00} secs, ~{2:0.00} ms per full parse.", iParseTimes, iMSecs * 1.0 / 1000, iMSecs * 1.0 / iParseTimes);
    }

    oParser.Close();
}
/// <summary>
/// Parses the given HTML string and returns hard copies of every non-empty chunk.
/// </summary>
/// <param name="str">HTML to parse</param>
/// <returns>Array of cloned chunks; whitespace-only chunks are discarded</returns>
private HTMLchunk[] htmlParse(string str)
{
    // collected hard copies of every non-empty chunk
    ArrayList ret = new ArrayList();

    Majestic12.HTMLparser parser = new Majestic12.HTMLparser();

    // keep raw html because we need to reconstruct it
    parser.bKeepRawHTML = true;

    // keep text... this is for parsing just tags
    parser.bTextMode = true;

    // initialize to parse the string
    parser.Init(str);

    // the parser recycles one chunk object, hence the explicit cloning below
    for (Majestic12.HTMLchunk chunk = parser.ParseNext(); chunk != null; chunk = parser.ParseNext())
    {
        // discard empty blocks for performance increase
        if (chunk.oHTML.Trim() == "")
            continue;

        // hard copy the chunk
        HTMLchunk clone = new HTMLchunk(false);
        clone.oHTML = String.Copy(chunk.oHTML);
        clone.oType = chunk.oType;
        clone.sTag = String.Copy(chunk.sTag);
        ret.Add(clone);
    }

    parser.CleanUp();

    // return as a strongly typed array
    return (HTMLchunk[])ret.ToArray(typeof(HTMLchunk));
}
/// <summary>
/// Releases object references held by this instance; safe to call multiple times.
/// </summary>
/// <param name="bDisposing">Standard dispose-pattern flag; not used by this implementation</param>
private void Dispose(bool bDisposing)
{
    // only run once
    if(!bDisposed)
    {
        bDisposed=true;

        // null out references so the GC can reclaim buffers and helper objects
        bHTML=null;
        oChunk=null;
        sText=null;
        oE=null;
        oP=null;
    }
}
/// <summary>
/// Parse for benchmarking purposes -- its pure test of HTML parsing object, no extra processing done here
/// </summary>
/// <param name="oP">Parser object</param>
void BenchMarkParse(HTMLparser oP)
{
    // parser will return us tokens called HTMLchunk -- warning DO NOT destroy it until end of parsing
    // because HTMLparser re-uses this object
    HTMLchunk oChunk=null;

    // we parse until returned oChunk is null indicating we reached end of parsing
    while((oChunk=oP.ParseNext())!=null)
    {
        // all cases are intentionally empty: only raw parsing speed is measured
        switch(oChunk.oType)
        {
            // matched open tag, ie <a href="">
            case HTMLchunkType.OpenTag:
                break;

            // matched close tag, ie </a>
            case HTMLchunkType.CloseTag:
                break;

            // matched normal text
            case HTMLchunkType.Text:
                break;

            // matched HTML comment, that's stuff between <!-- and -->
            case HTMLchunkType.Comment:
                break;
        };
    }
}
/// <summary>
/// Parses an HTML string into an HtmlNode tree via the Majestic12 parser.
/// </summary>
/// <param name="html">HTML markup; any doctype is stripped first (Majestic12 does not support it)</param>
/// <returns>Tree rendered by the HtmlBuilder</returns>
public HtmlNode Parse(string html)
{
    // Majestic12 doesn't support doctype
    html = dedoctype.Replace(html, "");

    var builder = new HtmlBuilder();

    var parser = new HTMLparser();
    parser.bDecodeEntities = false;
    parser.SetChunkHashMode(true);  // params arrive in chunk.oParams
    parser.Init(html);

    // FIX: the attribute-copying loop was duplicated in the OpenTag and the
    // self-closing CloseTag branches -- extracted into one shared helper.
    Dictionary<string, string> ReadAttributes(HTMLchunk c)
    {
        var attributes = new Dictionary<string, string>();

        if (c.iParams != 0)
        {
            foreach (string name in c.oParams.Keys)
            {
                attributes.Add(name, (string)c.oParams[name]);
            }
        }

        return attributes;
    }

    var chunk = parser.ParseNext();

    while (chunk != null)
    {
        switch (chunk.oType)
        {
            case HTMLchunkType.OpenTag:
                // if something goes wrong - ignore it
                if (chunk.sTag != "")
                {
                    builder.OpenTag(chunk.sTag, ReadAttributes(chunk));
                }
                break;

            case HTMLchunkType.Comment:
                builder.AddComment(chunk.oHTML);
                break;

            case HTMLchunkType.CloseTag:
                if (chunk.bEndClosure)
                {
                    // self-closing tag (<x/>): emit a matching open/close pair
                    builder.OpenTag(chunk.sTag, ReadAttributes(chunk));
                    builder.CloseTag(chunk.sTag);
                }
                else
                {
                    builder.CloseTag(chunk.sTag);
                }
                break;

            case HTMLchunkType.Script:
                builder.AddScript(chunk.oHTML);
                break;

            case HTMLchunkType.Text:
                builder.AddText(chunk.oHTML);
                break;

            default:
                break;
        }

        chunk = parser.ParseNext();
    }

    return builder.Render();
}
/// <summary>
/// Inits tag parser with shared state owned by the main parser.
/// </summary>
/// <param name="p_oP">Owning parser instance</param>
/// <param name="p_oChunk">Chunk object re-used for returning parsed tokens</param>
/// <param name="p_sText">Shared dynamic string buffer</param>
/// <param name="p_bHTML">Raw HTML bytes to parse</param>
/// <param name="p_iDataLength">Number of valid bytes in p_bHTML</param>
/// <param name="p_oE">Entities handler</param>
/// <param name="p_oHE">Heuristics handler</param>
internal void Init(HTMLparser p_oP, HTMLchunk p_oChunk, DynaString p_sText, byte[] p_bHTML, int p_iDataLength, HTMLentities p_oE, HTMLheuristics p_oHE)
{
    oP = p_oP;
    oE = p_oE;
    oHE = p_oHE;
    oChunk = p_oChunk;
    sText = p_sText;

    bHTML = p_bHTML;
    iDataLength = p_iDataLength;

    // stay clear of the data tail when heuristics look ahead
    iMaxHeuDataLength = p_iDataLength - MIN_DATA_SIZE_FOR_HEURISTICS;
}
/// <summary>
/// Starts parsing
/// </summary>
/// <param name="iParseTimes">Number of times to parse document (useful for benchmarking)</param>
/// <param name="sFileName">HTML file to parse; if not found as given it is also looked up under the "tests" sub-directory of the current directory</param>
void Start(int iParseTimes, string sFileName)
{
    if (!File.Exists(sFileName))
    {
        sFileName = Path.Combine(Directory.GetCurrentDirectory(), "tests" + Path.DirectorySeparatorChar + sFileName);

        if (!File.Exists(sFileName))
        {
            Console.WriteLine("Could not find file in current directory to parse - expected it to be here: " + sFileName);
            return;
        }
    }

    HTMLparser oP = new HTMLparser();

    // This is optional, but if you want high performance then you may
    // want to set chunk hash mode to FALSE. This would result in tag params
    // being added to string arrays in HTMLchunk object called sParams and sValues, with number
    // of actual params being in iParams. See code below for details.
    //
    // When TRUE (and its default) tag params will be added to hashtable HTMLchunk (object).oParams
    oP.SetChunkHashMode(false);

    // if you set this to true then original parsed HTML for given chunk will be kept -
    // this will reduce performance somewhat, but may be desireable in some cases where
    // reconstruction of HTML may be necessary
    oP.bKeepRawHTML = false;

    // if set to true (it is false by default), then entities will be decoded: this is essential
    // if you want to get strings that contain final representation of the data in HTML, however
    // you should be aware that if you want to use such strings into output HTML string then you will
    // need to do Entity encoding or same string may fail later
    oP.bDecodeEntities = true;

    // we have option to keep most entities as is - only replace stuff like
    // this is called Mini Entities mode - it is handy when HTML will need
    // to be re-created after it was parsed, though in this case really
    // entities should not be parsed at all
    oP.bDecodeMiniEntities = true;

    if (!oP.bDecodeEntities && oP.bDecodeMiniEntities)
        oP.InitMiniEntities();

    // if set to true, then in case of Comments and SCRIPT tags the data set to oHTML will be
    // extracted BETWEEN those tags, rather than include complete RAW HTML that includes tags too
    // this only works if auto extraction is enabled
    oP.bAutoExtractBetweenTagsOnly = true;

    // if true then comments will be extracted automatically
    oP.bAutoKeepComments = true;

    // if true then scripts will be extracted automatically:
    oP.bAutoKeepScripts = true;

    // if this option is true then whitespace before start of tag will be compressed to single
    // space character in string: " ", if false then full whitespace before tag will be returned (slower)
    // you may only want to set it to false if you want exact whitespace between tags, otherwise it is just
    // a waste of CPU cycles
    oP.bCompressWhiteSpaceBeforeTag = true;

    // if true (default) then tags with attributes marked as CLOSED (/ at the end) will be automatically
    // forced to be considered as open tags - this is no good for XML parsing, but I keep it for backwards
    // compatibility for my stuff as it makes it easier to avoid checking for same tag which is both closed
    // or open
    oP.bAutoMarkClosedTagsWithParamsAsOpen = false;

    // load HTML from file
    oP.LoadFromFile(sFileName);

    // alternatively you can set HTML to be parsed as follows (bHTML is byte[] array containing data):
    // oP.Init(bHTML);

    // Stopwatch is monotonic: unlike DateTime.Now tick arithmetic it is not
    // affected by system clock adjustments and has much better resolution.
    var oTimer = System.Diagnostics.Stopwatch.StartNew();

    for (int i = 0; i < iParseTimes; i++)
    {
        if (iParseTimes > 1)
            BenchMarkParse(oP);
        else
            ParseAndPrint(oP);

        oP.Reset();
    }

    oTimer.Stop();

    // number of milliseconds we were parsing
    int iMSecs = (int)oTimer.ElapsedMilliseconds;

    if (iMSecs > 0 && iParseTimes > 0)
    {
        Console.Error.WriteLine("Parsed {0} time(s), total time {1:0.00} secs, ~{2:0.00} ms per full parse.", iParseTimes, iMSecs * 1.0 / 1000, iMSecs * 1.0 / iParseTimes);
    }

    oP.Close();
}
/// <summary>
/// Async callback for a discovery HTTP request. Looks for an XRDS document
/// (served directly, referenced via the X-XRDS-Location header, or advertised
/// in HTML META/LINK tags) and reports the outcome through the supplied callback.
/// </summary>
/// <param name="result">Async state: object[] { HttpWebRequest, Action&lt;ProviderDiscoveryData&gt; }</param>
static void GotDiscoveryPage(IAsyncResult result)
{
    var stateObjects = (object[])result.AsyncState;
    var request = (HttpWebRequest)stateObjects[0];
    var callback = (Action<ProviderDiscoveryData>)stateObjects[1];

    HttpWebResponse response;
    try
    {
        response = (HttpWebResponse)request.EndGetResponse(result);
    }
    catch (Exception ex)
    {
        // Signal a failure.
        callback(new ProviderDiscoveryData { Success = false, FailureReason = ex });
        return;
    }

    // Let's take a look at this response.
    // Do we have an XRDS document on our hands?
    // HTTP media types are case-insensitive, so compare accordingly.
    if (response.ContentType.StartsWith("application/xrds+xml", StringComparison.OrdinalIgnoreCase))
    {
        // We do. Get the contents and send them off for processing. Then we're done here.
        // using ensures the reader (and underlying stream) is disposed even on error.
        string xrdsData;
        using (var reader = new StreamReader(response.GetResponseStream()))
        {
            xrdsData = reader.ReadToEnd();
        }
        response.Close();
        ProcessXrds(xrdsData, callback);
        return;
    }

    // Look for a telling header.
    if (!string.IsNullOrEmpty(response.Headers["X-XRDS-Location"]))
    {
        // We know where to look. Create a new request to get that document,
        // and point its callback right back to this function.
        var newRequest = CreateDiscoveryWebRequest(response.Headers["X-XRDS-Location"]);
        response.Close();
        if (newRequest == null)
        {
            // Signal a failure.
            callback(new ProviderDiscoveryData { Success = false });
            return;
        }
        newRequest.BeginGetResponse(GotDiscoveryPage, new object[] { newRequest, callback });
        return;
    }

    // So much for keeping it simple. Now we've got to parse HTML to figure out
    // something about OpenID at this URL.
    string htmlData;
    using (var reader2 = new StreamReader(response.GetResponseStream()))
    {
        htmlData = reader2.ReadToEnd();
    }
    response.Close();

    // Initialize the HTML parser.
    var parser = new HTMLparser();
    parser.SetChunkHashMode(false);
    parser.bDecodeEntities = true;
    parser.Init(htmlData);

    // Go though every chunk and look for useful tags.
    HTMLchunk chunk;
    string xrdsPointer = null;
    string openid2Provider = null;
    string openid2OpLocal = null;
    while ((chunk = parser.ParseNextTag()) != null)
    {
        if (chunk.oType != HTMLchunkType.OpenTag && chunk.oType != HTMLchunkType.CloseTag)
            continue;
        if (chunk.sTag != "meta" && chunk.sTag != "link")
            continue;

        // Convert the params to a dictionary, with keys being lowercase.
        // ToLowerInvariant avoids culture surprises (e.g. the Turkish 'I').
        var dict = new Dictionary<string, string>();
        for (var i = 0; i < chunk.iParams; i++)
            dict[chunk.sParams[i].ToLowerInvariant().Trim()] = chunk.sValues[i];

        // Do we have a META tag?
        if (chunk.sTag == "meta")
        {
            // Do we have an XRDS pointer? Header names are protocol tokens,
            // so an ordinal (not culture-sensitive) comparison is correct.
            if (dict.ContainsKey("http-equiv") && dict.ContainsKey("content")
                && dict["http-equiv"].Equals("X-XRDS-Location", StringComparison.OrdinalIgnoreCase))
                xrdsPointer = dict["content"];
        }
        else if (chunk.sTag == "link")
        {
            if (dict.ContainsKey("rel") && dict.ContainsKey("href"))
            {
                // There are certain RELs we care about.
                if (dict["rel"].Contains("openid2.provider"))
                    openid2Provider = dict["href"];
                else if (dict["rel"].Contains("openid2.local_id"))
                    openid2OpLocal = dict["href"];
            }
        }
    }

    // Do we have needed LINKs?
    if (openid2Provider != null)
    {
        // Yes we do! Signal success.
        callback(new ProviderDiscoveryData
        {
            Success = true,
            DiscoveredClaimedIdentifier = true,
            ProviderUri = openid2Provider,
            OpLocalIdentity = openid2OpLocal
        });
        return;
    }

    // Do we have an XRDS pointer?
    if (xrdsPointer != null)
    {
        // Yes we do! Retrieve that and point back to this function.
        var newRequest = CreateDiscoveryWebRequest(xrdsPointer);
        if (newRequest == null)
        {
            // Signal a failure.
            callback(new ProviderDiscoveryData { Success = false });
            return;
        }
        newRequest.BeginGetResponse(GotDiscoveryPage, new object[] { newRequest, callback });
        return;
    }

    // We got nothing :(
    callback(new ProviderDiscoveryData { Success = false, FailureReason = new Exception("Could not find OpenID endpoint.") });
}
/// <summary>
/// Creates a wrapper around an existing Majestic12 parser instance.
/// </summary>
/// <param name="parser">Underlying HTML parser; must not be null.</param>
/// <exception cref="ArgumentNullException">Thrown when parser is null.</exception>
public HtmlParser(HTMLparser parser)
{
    // Fail fast here rather than with a NullReferenceException on first use.
    if (parser == null)
        throw new ArgumentNullException("parser");

    Parser = parser;
}