public SetChunkHashMode ( bool bHashMode ) : void | ||
bHashMode | bool | If true, the tag's params are kept in the chunk's hashtable (slower); otherwise they are kept in arrays (sParams/sValues) |
return | void |
/// <summary>
/// Creates a new <c>HTMLparser</c> and configures it with the options set below.
/// </summary>
/// <returns>The configured parser instance.</returns>
internal static HTMLparser GetInstance()
{
    HTMLparser parser = new HTMLparser();

    // Optional, but recommended for high performance: with chunk hash mode set to FALSE,
    // tag params are added to the string arrays sParams and sValues of the HTMLchunk object,
    // with the number of actual params in iParams.
    //
    // When TRUE (the default), tag params are added to the hashtable HTMLchunk.oParams.
    parser.SetChunkHashMode(false);

    // Keep the original parsed HTML for each chunk. This reduces performance somewhat,
    // but may be desirable where reconstruction of the HTML is necessary.
    parser.bKeepRawHTML = true;

    // Decode entities (false by default). This is essential to get strings that contain the
    // final representation of the data in the HTML. Be aware that such strings need entity
    // encoding again before they are put back into an output HTML string.
    parser.bDecodeEntities = true;

    // Mini Entities mode keeps most entities as is and replaces only a small subset.
    // It is handy when the HTML will be re-created after parsing, though in that case
    // entities should really not be parsed at all.
    parser.bDecodeMiniEntities = true;

    bool miniEntitiesOnly = !parser.bDecodeEntities && parser.bDecodeMiniEntities;
    if (miniEntitiesOnly)
    {
        parser.InitMiniEntities();
    }

    // For comments and SCRIPT tags, set oHTML to the data BETWEEN the tags rather than the
    // complete raw HTML including the tags. This only works if auto extraction is enabled.
    parser.bAutoExtractBetweenTagsOnly = true;

    // Extract comments automatically.
    parser.bAutoKeepComments = true;

    // Extract scripts automatically.
    parser.bAutoKeepScripts = true;

    // Compress whitespace before the start of a tag to a single space character " ".
    // If false, the full whitespace before the tag is returned (slower); only worth it when
    // the exact whitespace between tags is needed.
    parser.bCompressWhiteSpaceBeforeTag = true;

    // If true (default), tags with attributes marked as CLOSED (/ at the end) are automatically
    // forced to be considered open tags. That is no good for XML parsing; the option exists for
    // backwards compatibility, as it avoids checking for a tag that is both closed and open.
    parser.bAutoMarkClosedTagsWithParamsAsOpen = false;

    return parser;
}
/// <summary>
/// Starts parsing: loads "tests/majestic12.html" from the current directory, parses it the
/// requested number of times and prints timing information.
/// </summary>
/// <param name="iParseTimes">Number of times to parse document (useful for benchmarking)</param>
void Start(int iParseTimes)
{
    string sFileName = Path.Combine(Directory.GetCurrentDirectory(), "tests" + Path.DirectorySeparatorChar + "majestic12.html");

    if (!File.Exists(sFileName))
    {
        Console.WriteLine("Could not find file in current directory to parse - expected it to be here: " + sFileName);
        return;
    }

    HTMLparser oP = new HTMLparser();

    try
    {
        // This is optional, but if you want high performance then you may
        // want to set chunk hash mode to FALSE. This would result in tag params
        // being added to string arrays in HTMLchunk object called sParams and sValues, with number
        // of actual params being in iParams.
        //
        // When TRUE (and its default) tag params will be added to hashtable HTMLchunk (object).oParams
        oP.SetChunkHashMode(false);

        // if you set this to true then original parsed HTML for given chunk will be kept -
        // this will reduce performance somewhat, but may be desirable in some cases where
        // reconstruction of HTML may be necessary
        oP.bKeepRawHTML = false;

        // load HTML from file
        oP.LoadFromFile(sFileName);

        // Stopwatch is a monotonic timer; wall-clock DateTime.Now can jump (DST, clock
        // adjustments) and distort the measurement.
        System.Diagnostics.Stopwatch oTimer = System.Diagnostics.Stopwatch.StartNew();

        for (int i = 0; i < iParseTimes; i++)
        {
            // more than one pass means we are benchmarking, so do not print anything
            if (iParseTimes > 1)
            {
                BenchMarkParse(oP);
            }
            else
            {
                ParseAndPrint(oP);
            }

            oP.Reset();
        }

        // number of whole milliseconds we were parsing
        int iMSecs = (int)oTimer.ElapsedMilliseconds;

        if (iMSecs > 0 && iParseTimes > 0)
        {
            // Floating-point division: integer division used to report e.g. "0 secs" for a
            // 900 ms run and dropped the fractional part of the per-parse time.
            Console.WriteLine("Parsed {0} time(s), total time {1:0.00} secs, approximately {2:0.00} ms per full parse.",
                              iParseTimes, iMSecs / 1000.0, iMSecs / (double)iParseTimes);
        }
    }
    finally
    {
        // always release the parser, even if loading or parsing throws
        oP.Close();
    }
}
/// <summary>
/// Replaces <c>parser</c> with a freshly configured <c>HTMLparser</c> and initialises it
/// with the given bytes.
/// </summary>
/// <param name="by">Data handed to <c>HTMLparser.Init</c>.</param>
static void getM12(byte[] by)
{
    // Publish the new instance first, then configure it through a local alias.
    HTMLparser p = new HTMLparser();
    parser = p;

    // Tag params go to the sParams/sValues arrays instead of the hashtable.
    p.SetChunkHashMode(false);
    p.bKeepRawHTML = false;

    // Entity handling.
    p.bDecodeEntities = true;
    p.bDecodeMiniEntities = true;

    bool miniEntitiesOnly = !p.bDecodeEntities && p.bDecodeMiniEntities;
    if (miniEntitiesOnly)
        p.InitMiniEntities();

    // Automatic extraction of comments and scripts (content between the tags only).
    p.bAutoExtractBetweenTagsOnly = true;
    p.bAutoKeepComments = true;
    p.bAutoKeepScripts = true;

    // Whitespace and closed-tag handling.
    p.bCompressWhiteSpaceBeforeTag = true;
    p.bAutoMarkClosedTagsWithParamsAsOpen = false;

    p.Init(by);
}
/// <summary>
/// Async callback for a discovery web request. Depending on the response it either
/// processes an XRDS document, follows an X-XRDS-Location pointer (HTTP header or META tag)
/// with a new request that calls back into this method, or reports the OpenID 2 provider
/// found in the page's LINK tags.
/// </summary>
/// <param name="result">
/// Async result whose <c>AsyncState</c> is an <c>object[]</c> holding the
/// <c>HttpWebRequest</c> at index 0 and the completion callback at index 1.
/// </param>
static void GotDiscoveryPage(IAsyncResult result)
{
    var stateObjects = (object[])result.AsyncState;
    var request = (HttpWebRequest)stateObjects[0];
    var callback = (Action<ProviderDiscoveryData>)stateObjects[1];

    bool isXrdsDocument = false;
    string xrdsLocationHeader = null;
    string body = null;

    try
    {
        // Everything needed from the response is pulled out inside the using block so the
        // connection is released even when reading fails.
        using (var response = (HttpWebResponse)request.EndGetResponse(result))
        {
            // Do we have an XRDS document on our hands? Media types are not linguistic text,
            // so compare ordinally and ignore case.
            isXrdsDocument = response.ContentType.StartsWith("application/xrds+xml", StringComparison.OrdinalIgnoreCase);

            // Look for a telling header.
            xrdsLocationHeader = response.Headers["X-XRDS-Location"];

            // The body is needed for an XRDS document, or for HTML when no header points elsewhere.
            if (isXrdsDocument || string.IsNullOrEmpty(xrdsLocationHeader))
            {
                using (var reader = new StreamReader(response.GetResponseStream()))
                {
                    body = reader.ReadToEnd();
                }
            }
        }
    }
    catch (Exception ex)
    {
        // Signal a failure. This also covers errors while reading the body, which previously
        // escaped this callback without the caller ever being notified.
        callback(new ProviderDiscoveryData { Success = false, FailureReason = ex });
        return;
    }

    if (isXrdsDocument)
    {
        // Send the contents off for processing. Then we're done here.
        ProcessXrds(body, callback);
        return;
    }

    if (!string.IsNullOrEmpty(xrdsLocationHeader))
    {
        // We know where to look.
        RequestXrdsDocument(xrdsLocationHeader, callback);
        return;
    }

    // So much for keeping it simple. Now we've got to parse HTML to figure out something about OpenID at this URL.
    var parser = new HTMLparser();
    parser.SetChunkHashMode(false);
    parser.bDecodeEntities = true;
    parser.Init(body);

    // Go through every chunk and look for useful tags.
    HTMLchunk chunk;
    string xrdsPointer = null;
    string openid2Provider = null;
    string openid2OpLocal = null;

    while ((chunk = parser.ParseNextTag()) != null)
    {
        if (chunk.oType != HTMLchunkType.OpenTag && chunk.oType != HTMLchunkType.CloseTag)
            continue;
        if (chunk.sTag != "meta" && chunk.sTag != "link")
            continue;

        // Convert the params to a dictionary, with keys being lowercase.
        // Invariant lower-casing: a culture-sensitive ToLower() turns "HTTP-EQUIV" into
        // "http-equıv" under a Turkish culture, which would never match the lookups below.
        var dict = new Dictionary<string, string>();
        for (var i = 0; i < chunk.iParams; i++)
            dict[chunk.sParams[i].ToLowerInvariant().Trim()] = chunk.sValues[i];

        if (chunk.sTag == "meta")
        {
            // Do we have an XRDS pointer?
            string httpEquiv;
            string content;
            if (dict.TryGetValue("http-equiv", out httpEquiv)
                && dict.TryGetValue("content", out content)
                && string.Equals(httpEquiv, "X-XRDS-Location", StringComparison.OrdinalIgnoreCase))
            {
                xrdsPointer = content;
            }
        }
        else
        {
            // LINK tag: there are certain RELs we care about.
            string rel;
            string href;
            if (dict.TryGetValue("rel", out rel) && dict.TryGetValue("href", out href) && rel != null)
            {
                if (rel.Contains("openid2.provider"))
                    openid2Provider = href;
                else if (rel.Contains("openid2.local_id"))
                    openid2OpLocal = href;
            }
        }
    }

    // Do we have needed LINKs?
    if (openid2Provider != null)
    {
        // Yes we do! Signal success.
        callback(new ProviderDiscoveryData
        {
            Success = true,
            DiscoveredClaimedIdentifier = true,
            ProviderUri = openid2Provider,
            OpLocalIdentity = openid2OpLocal
        });
        return;
    }

    // Do we have an XRDS pointer?
    if (xrdsPointer != null)
    {
        RequestXrdsDocument(xrdsPointer, callback);
        return;
    }

    // We got nothing :(
    callback(new ProviderDiscoveryData { Success = false, FailureReason = new Exception("Could not find OpenID endpoint.") });
}

/// <summary>
/// Starts a new discovery request for <paramref name="location"/> whose response is handled
/// by <see cref="GotDiscoveryPage"/>; reports failure if no request could be created.
/// </summary>
private static void RequestXrdsDocument(string location, Action<ProviderDiscoveryData> callback)
{
    var newRequest = CreateDiscoveryWebRequest(location);
    if (newRequest == null)
    {
        // Signal a failure.
        callback(new ProviderDiscoveryData { Success = false });
        return;
    }

    newRequest.BeginGetResponse(GotDiscoveryPage, new object[] { newRequest, callback });
}
/// <summary>
/// Starts parsing
/// </summary>
/// <param name="iParseTimes">Number of times to parse document (useful for benchmarking)</param>
/// <param name="sFileName">File to parse; if it does not exist as given, it is looked up in the "tests" directory under the current directory</param>
void Start(int iParseTimes, string sFileName)
{
    if (!File.Exists(sFileName))
    {
        // fall back to the "tests" directory under the current directory
        sFileName = Path.Combine(Directory.GetCurrentDirectory(), "tests" + Path.DirectorySeparatorChar + sFileName);

        if (!File.Exists(sFileName))
        {
            Console.WriteLine("Could not find file in current directory to parse - expected it to be here: " + sFileName);
            return;
        }
    }

    HTMLparser oP = new HTMLparser();

    try
    {
        // This is optional, but if you want high performance then you may
        // want to set chunk hash mode to FALSE. This would result in tag params
        // being added to string arrays in HTMLchunk object called sParams and sValues, with number
        // of actual params being in iParams.
        //
        // When TRUE (and its default) tag params will be added to hashtable HTMLchunk (object).oParams
        oP.SetChunkHashMode(false);

        // if you set this to true then original parsed HTML for given chunk will be kept -
        // this will reduce performance somewhat, but may be desirable in some cases where
        // reconstruction of HTML may be necessary
        oP.bKeepRawHTML = false;

        // if set to true (it is false by default), then entities will be decoded: this is essential
        // if you want to get strings that contain final representation of the data in HTML, however
        // you should be aware that if you want to use such strings into output HTML string then you will
        // need to do Entity encoding or same string may fail later
        oP.bDecodeEntities = true;

        // we have option to keep most entities as is and replace only a small subset -
        // this is called Mini Entities mode - it is handy when HTML will need
        // to be re-created after it was parsed, though in this case really
        // entities should not be parsed at all
        oP.bDecodeMiniEntities = true;

        if (!oP.bDecodeEntities && oP.bDecodeMiniEntities)
        {
            oP.InitMiniEntities();
        }

        // if set to true, then in case of Comments and SCRIPT tags the data set to oHTML will be
        // extracted BETWEEN those tags, rather than include complete RAW HTML that includes tags too
        // this only works if auto extraction is enabled
        oP.bAutoExtractBetweenTagsOnly = true;

        // if true then comments will be extracted automatically
        oP.bAutoKeepComments = true;

        // if true then scripts will be extracted automatically
        oP.bAutoKeepScripts = true;

        // if this option is true then whitespace before start of tag will be compressed to single
        // space character in string: " ", if false then full whitespace before tag will be returned (slower)
        // you may only want to set it to false if you want exact whitespace between tags, otherwise it is just
        // a waste of CPU cycles
        oP.bCompressWhiteSpaceBeforeTag = true;

        // if true (default) then tags with attributes marked as CLOSED (/ at the end) will be automatically
        // forced to be considered as open tags - this is no good for XML parsing, but it is kept for backwards
        // compatibility as it makes it easier to avoid checking for same tag which is both closed or open
        oP.bAutoMarkClosedTagsWithParamsAsOpen = false;

        // load HTML from file
        oP.LoadFromFile(sFileName);

        // alternatively you can set HTML to be parsed as follows (bHTML is byte[] array containing data):
        // oP.Init(bHTML);

        // Stopwatch is a monotonic timer; wall-clock DateTime.Now can jump (DST, clock
        // adjustments) and distort the measurement.
        System.Diagnostics.Stopwatch oTimer = System.Diagnostics.Stopwatch.StartNew();

        for (int i = 0; i < iParseTimes; i++)
        {
            // more than one pass means we are benchmarking, so do not print anything
            if (iParseTimes > 1)
            {
                BenchMarkParse(oP);
            }
            else
            {
                ParseAndPrint(oP);
            }

            oP.Reset();
        }

        // number of whole milliseconds we were parsing
        int iMSecs = (int)oTimer.ElapsedMilliseconds;

        if (iMSecs > 0 && iParseTimes > 0)
        {
            Console.Error.WriteLine("Parsed {0} time(s), total time {1:0.00} secs, ~{2:0.00} ms per full parse.",
                                    iParseTimes, iMSecs * 1.0 / 1000, iMSecs * 1.0 / iParseTimes);
        }
    }
    finally
    {
        // always release the parser, even if loading or parsing throws
        oP.Close();
    }
}
/// <summary>
/// Starts parsing
/// </summary>
/// <param name="iParseTimes">Number of times to parse document (useful for benchmarking)</param>
/// <param name="sFileName">File to parse; if it does not exist as given, it is looked up in the "tests" directory under the current directory</param>
void Start(int iParseTimes, string sFileName)
{
    if (!File.Exists(sFileName))
    {
        // fall back to the "tests" directory under the current directory
        sFileName = Path.Combine(Directory.GetCurrentDirectory(), "tests" + Path.DirectorySeparatorChar + sFileName);

        if (!File.Exists(sFileName))
        {
            Console.WriteLine("Could not find file in current directory to parse - expected it to be here: " + sFileName);
            return;
        }
    }

    HTMLparser oP = new HTMLparser();

    try
    {
        // This is optional, but if you want high performance then you may
        // want to set chunk hash mode to FALSE. This would result in tag params
        // being added to string arrays in HTMLchunk object called sParams and sValues, with number
        // of actual params being in iParams.
        //
        // When TRUE (and its default) tag params will be added to hashtable HTMLchunk (object).oParams
        oP.SetChunkHashMode(false);

        // if you set this to true then original parsed HTML for given chunk will be kept -
        // this will reduce performance somewhat, but may be desirable in some cases where
        // reconstruction of HTML may be necessary
        oP.bKeepRawHTML = false;

        // if set to true (it is false by default), then entities will be decoded: this is essential
        // if you want to get strings that contain final representation of the data in HTML, however
        // you should be aware that if you want to use such strings into output HTML string then you will
        // need to do Entity encoding or same string may fail later
        oP.bDecodeEntities = true;

        // we have option to keep most entities as is and replace only a small subset -
        // this is called Mini Entities mode - it is handy when HTML will need
        // to be re-created after it was parsed, though in this case really
        // entities should not be parsed at all
        oP.bDecodeMiniEntities = true;

        if (!oP.bDecodeEntities && oP.bDecodeMiniEntities)
        {
            oP.InitMiniEntities();
        }

        // if set to true, then in case of Comments and SCRIPT tags the data set to oHTML will be
        // extracted BETWEEN those tags, rather than include complete RAW HTML that includes tags too
        // this only works if auto extraction is enabled
        oP.bAutoExtractBetweenTagsOnly = true;

        // if true then comments will be extracted automatically
        oP.bAutoKeepComments = true;

        // if true then scripts will be extracted automatically
        oP.bAutoKeepScripts = true;

        // if this option is true then whitespace before start of tag will be compressed to single
        // space character in string: " ", if false then full whitespace before tag will be returned (slower)
        // you may only want to set it to false if you want exact whitespace between tags, otherwise it is just
        // a waste of CPU cycles
        oP.bCompressWhiteSpaceBeforeTag = true;

        // if true (default) then tags with attributes marked as CLOSED (/ at the end) will be automatically
        // forced to be considered as open tags - this is no good for XML parsing, but it is kept for backwards
        // compatibility as it makes it easier to avoid checking for same tag which is both closed or open
        oP.bAutoMarkClosedTagsWithParamsAsOpen = false;

        // load HTML from file
        oP.LoadFromFile(sFileName);

        // alternatively you can set HTML to be parsed as follows (bHTML is byte[] array containing data):
        // oP.Init(bHTML);

        // Stopwatch is a monotonic timer; wall-clock DateTime.Now can jump (DST, clock
        // adjustments) and distort the measurement.
        System.Diagnostics.Stopwatch oTimer = System.Diagnostics.Stopwatch.StartNew();

        for (int i = 0; i < iParseTimes; i++)
        {
            // more than one pass means we are benchmarking, so do not print anything
            if (iParseTimes > 1)
            {
                BenchMarkParse(oP);
            }
            else
            {
                ParseAndPrint(oP);
            }

            oP.Reset();
        }

        // number of whole milliseconds we were parsing
        int iMSecs = (int)oTimer.ElapsedMilliseconds;

        if (iMSecs > 0 && iParseTimes > 0)
        {
            Console.Error.WriteLine("Parsed {0} time(s), total time {1:0.00} secs, ~{2:0.00} ms per full parse.",
                                    iParseTimes, iMSecs * 1.0 / 1000, iMSecs * 1.0 / iParseTimes);
        }
    }
    finally
    {
        // always release the parser, even if loading or parsing throws
        oP.Close();
    }
}
/// <summary>
/// Parses an HTML string chunk by chunk and forwards every chunk to an <c>HtmlBuilder</c>.
/// </summary>
/// <param name="html">HTML markup; every match of <c>dedoctype</c> is removed before parsing.</param>
/// <returns>The result of <c>HtmlBuilder.Render()</c> after all chunks were processed.</returns>
public HtmlNode Parse(string html)
{
    // Majestic12 doesn't support doctype
    html = dedoctype.Replace(html, "");

    var builder = new HtmlBuilder();

    var parser = new HTMLparser();
    parser.bDecodeEntities = false;
    parser.SetChunkHashMode(true);
    parser.Init(html);

    for (var chunk = parser.ParseNext(); chunk != null; chunk = parser.ParseNext())
    {
        switch (chunk.oType)
        {
            case HTMLchunkType.OpenTag:
                // if something goes wrong - ignore it
                if (chunk.sTag == "")
                {
                    break;
                }

                var openAttributes = CollectAttributes(chunk);
                builder.OpenTag(chunk.sTag, openAttributes);
                break;

            case HTMLchunkType.CloseTag:
                // A self-closed tag is reported as a close tag carrying its own params:
                // open it first, then close it.
                if (chunk.bEndClosure)
                {
                    var closedAttributes = CollectAttributes(chunk);
                    builder.OpenTag(chunk.sTag, closedAttributes);
                }

                builder.CloseTag(chunk.sTag);
                break;

            case HTMLchunkType.Comment:
                builder.AddComment(chunk.oHTML);
                break;

            case HTMLchunkType.Script:
                builder.AddScript(chunk.oHTML);
                break;

            case HTMLchunkType.Text:
                builder.AddText(chunk.oHTML);
                break;

            default:
                break;
        }
    }

    return builder.Render();
}

/// <summary>
/// Copies the params of a chunk from its <c>oParams</c> table into a new dictionary.
/// Returns an empty dictionary when the chunk reports no params.
/// </summary>
private static Dictionary<string, string> CollectAttributes(HTMLchunk chunk)
{
    var attributes = new Dictionary<string, string>();

    if (chunk.iParams == 0)
    {
        return attributes;
    }

    foreach (string name in chunk.oParams.Keys)
    {
        attributes.Add(name, (string)chunk.oParams[name]);
    }

    return attributes;
}