SetChunkHashMode() public method

Sets the chunk parameter hash mode.
public SetChunkHashMode ( bool bHashMode ) : void
bHashMode bool If true, a tag's params are kept in the chunk's oParams hashtable (slower); otherwise they are kept in the string arrays sParams/sValues, with the count in iParams.
return void
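
For illustration, here is a minimal sketch (not part of the library's documentation) of what the mode changes when you read a parsed tag's attributes. It only uses the HTMLchunk members that appear in the examples below, and oChunk stands for a chunk the parser returned for an open tag:

// Hash mode ON (the default): attributes are stored in the oParams hashtable
foreach (string sName in oChunk.oParams.Keys)
    Console.WriteLine("{0} = {1}", sName, (string)oChunk.oParams[sName]);

// Hash mode OFF (faster): attributes are stored in the parallel string arrays
// sParams/sValues, with the number of valid entries in iParams
for (int i = 0; i < oChunk.iParams; i++)
    Console.WriteLine("{0} = {1}", oChunk.sParams[i], oChunk.sValues[i]);

Example 1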
        internal static HTMLparser GetInstance()
        {
            HTMLparser oP = new HTMLparser();

            // This is optional, but if you want high performance then you may
            // want to set chunk hash mode to FALSE. This would result in tag params
            // being added to string arrays in HTMLchunk object called sParams and sValues, with number
            // of actual params being in iParams. See code below for details.
            //
            // When TRUE (the default), tag params will be added to the oParams hashtable of the HTMLchunk object
            oP.SetChunkHashMode(false);

            // if you set this to true then the original parsed HTML for a given chunk will be kept -
            // this will reduce performance somewhat, but may be desirable in cases where
            // reconstruction of the HTML is necessary
            oP.bKeepRawHTML = true;

            // if set to true (it is false by default), then entities will be decoded: this is essential
            // if you want strings that contain the final representation of the data in the HTML; however,
            // be aware that if you want to put such strings back into output HTML you will
            // need to entity-encode them again, otherwise the resulting HTML may be invalid
            oP.bDecodeEntities = true;

            // there is also an option to keep most entities as is and only replace a few like &nbsp;
            // this is called Mini Entities mode - it is handy when the HTML will need
            // to be re-created after parsing, though in that case entities really
            // should not be decoded at all
            oP.bDecodeMiniEntities = true;

            if (!oP.bDecodeEntities && oP.bDecodeMiniEntities)
                oP.InitMiniEntities();

            // if set to true, then for Comments and SCRIPT tags the data set to oHTML will be
            // extracted from BETWEEN those tags, rather than being the complete raw HTML including the tags;
            // this only works if auto extraction is enabled
            oP.bAutoExtractBetweenTagsOnly = true;

            // if true then comments will be extracted automatically
            oP.bAutoKeepComments = true;

            // if true then scripts will be extracted automatically:
            oP.bAutoKeepScripts = true;

            // if this option is true then whitespace before the start of a tag will be compressed to a single
            // space character (" "); if false then the full whitespace before the tag will be returned (slower)
            // only set this to false if you need the exact whitespace between tags, otherwise it is just
            // a waste of CPU cycles
            oP.bCompressWhiteSpaceBeforeTag = true;

            // if true (the default) then tags with attributes that are marked as CLOSED (/ at the end) will be
            // automatically treated as open tags - this is not suitable for XML parsing, but is kept for backwards
            // compatibility as it avoids having to check for the same tag appearing both closed
            // and open
            oP.bAutoMarkClosedTagsWithParamsAsOpen = false;

            return oP;
        }
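
GetInstance() only configures the parser; it does not load or parse anything. A possible usage sketch (the file name is hypothetical; LoadFromFile, ParseNext, Reset and Close are the calls used in the examples below):

HTMLparser oP = GetInstance();
oP.LoadFromFile("page.html");   // hypothetical input file

HTMLchunk oChunk;
while ((oChunk = oP.ParseNext()) != null)
{
    // hash mode was set to false in GetInstance(), so attribute names and values
    // are in oChunk.sParams/oChunk.sValues, with the count in oChunk.iParams
    if (oChunk.oType == HTMLchunkType.OpenTag)
        Console.WriteLine("<{0}> with {1} param(s)", oChunk.sTag, oChunk.iParams);
}

oP.Reset();   // the parser can be reused for another document after Reset()
oP.Close();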
Example 2
        /// <summary>
        /// Starts parsing
        /// </summary>
        /// <param name="iParseTimes">Number of times to parse document (useful for benchmarking)</param>
        void Start(int iParseTimes)
        {
            string sFileName = Path.Combine(Directory.GetCurrentDirectory(), "tests" + Path.DirectorySeparatorChar + "majestic12.html");

            if (!File.Exists(sFileName))
            {
                Console.WriteLine("Could not find file in current directory to parse - expected it to be here: " + sFileName);
                return;
            }

            HTMLparser oP = new HTMLparser();

            // This is optional, but if you want high performance then you may
            // want to set chunk hash mode to FALSE. This would result in tag params
            // being added to string arrays in HTMLchunk object called sParams and sValues, with number
            // of actual params being in iParams. See code below for details.
            //
            // When TRUE (the default), tag params will be added to the oParams hashtable of the HTMLchunk object
            oP.SetChunkHashMode(false);

            // if you set this to true then the original parsed HTML for a given chunk will be kept -
            // this will reduce performance somewhat, but may be desirable in cases where
            // reconstruction of the HTML is necessary
            oP.bKeepRawHTML = false;

            // load HTML from file
            oP.LoadFromFile(sFileName);

            DateTime oStart = DateTime.Now;

            for (int i = 0; i < iParseTimes; i++)
            {
                if (iParseTimes > 1)
                {
                    BenchMarkParse(oP);
                }
                else
                {
                    ParseAndPrint(oP);
                }


                oP.Reset();
            }

            // calculate number of milliseconds we were parsing
            int iMSecs = (int)((DateTime.Now.Ticks - oStart.Ticks) / TimeSpan.TicksPerMillisecond);

            if (iMSecs > 0 && iParseTimes > 0)
            {
                Console.WriteLine("Parsed {0} time(s), total time {1} secs, approximately {2} ms per full parse.", iParseTimes, iMSecs / 1000, iMSecs / iParseTimes);
            }

            oP.Close();
        }
Example 3
 static void getM12(byte[] by)
 {
     parser = new HTMLparser();
     parser.SetChunkHashMode(false);
     parser.bKeepRawHTML        = false;
     parser.bDecodeEntities     = true;
     parser.bDecodeMiniEntities = true;
     if (!parser.bDecodeEntities && parser.bDecodeMiniEntities)
     {
         parser.InitMiniEntities();
     }
     parser.bAutoExtractBetweenTagsOnly         = true;
     parser.bAutoKeepComments                   = true;
     parser.bAutoKeepScripts                    = true;
     parser.bCompressWhiteSpaceBeforeTag        = true;
     parser.bAutoMarkClosedTagsWithParamsAsOpen = false;
     parser.Init(by);
 }
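
getM12() stops after parser.Init(by), so the actual parsing loop has to follow elsewhere. A rough sketch of such a loop (using the ParseNext/oHTML pattern from the examples below): because bAutoKeepComments, bAutoKeepScripts and bAutoExtractBetweenTagsOnly were enabled above, comment and script chunks arrive with only the text between the tags in oHTML:

 HTMLchunk chunk;
 while ((chunk = parser.ParseNext()) != null)
 {
     // bAutoExtractBetweenTagsOnly is true, so oHTML holds the content between
     // the comment/script tags rather than the raw HTML including the tags
     if (chunk.oType == HTMLchunkType.Comment || chunk.oType == HTMLchunkType.Script)
         Console.WriteLine(chunk.oHTML);
 }
 parser.Close();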
Example 4
        static void GotDiscoveryPage(IAsyncResult result)
        {
            var stateObjects = (object[])result.AsyncState;
            var request = (HttpWebRequest)stateObjects[0];
            var callback = (Action<ProviderDiscoveryData>)stateObjects[1];

            HttpWebResponse response;

            try
            {
                response = (HttpWebResponse)request.EndGetResponse(result);
            }
            catch (Exception ex)
            {
                // Signal a failure.
                callback(new ProviderDiscoveryData { Success = false, FailureReason = ex });
                return;
            }

            // Let's take a look at this response.

            // Do we have an XRDS document on our hands?
            if (response.ContentType.StartsWith("application/xrds+xml"))
            {
                // We do. Get the contents and send them off for processing. Then we're done here.
                var reader = new StreamReader(response.GetResponseStream());
                var xrdsData = reader.ReadToEnd();
                response.Close();
                ProcessXrds(xrdsData, callback);
                return;
            }

            // Look for a telling header.
            if (!string.IsNullOrEmpty(response.Headers["X-XRDS-Location"]))
            {
                // We know where to look. Create a new request to get that document, and point its callback right back to this function.
                var newRequest = CreateDiscoveryWebRequest(response.Headers["X-XRDS-Location"]);
                response.Close();
                if (newRequest == null)
                {
                    // Signal a failure.
                    callback(new ProviderDiscoveryData { Success = false });
                    return;
                }
                newRequest.BeginGetResponse(GotDiscoveryPage, new object[] { newRequest, callback });
                return;
            }

            // So much for keeping it simple. Now we've got to parse HTML to figure out something about OpenID at this URL.
            // Read the HTML.
            var reader2 = new StreamReader(response.GetResponseStream());
            var htmlData = reader2.ReadToEnd();
            response.Close();

            // Initialize the HTML parser.
            var parser = new HTMLparser();
            parser.SetChunkHashMode(false);
            parser.bDecodeEntities = true;
            parser.Init(htmlData);

            // Go though every chunk and look for useful tags.
            HTMLchunk chunk;
            string xrdsPointer = null;
            string openid2Provider = null;
            string openid2OpLocal = null;
            while ((chunk = parser.ParseNextTag()) != null)
            {
                if (chunk.oType != HTMLchunkType.OpenTag && chunk.oType != HTMLchunkType.CloseTag) continue;

                if (chunk.sTag != "meta" && chunk.sTag != "link") continue;

                // Convert the params to a dictionary, with keys being lowercase.
                var dict = new Dictionary<string, string>();
                for (var i = 0; i < chunk.iParams; i++)
                    dict[chunk.sParams[i].ToLower().Trim()] = chunk.sValues[i];

                // Do we have a META tag?
                if (chunk.sTag == "meta")
                {
                    // Do we have an XRDS pointer?
                    if (dict.ContainsKey("http-equiv") && dict.ContainsKey("content") &&
                        dict["http-equiv"].Equals("X-XRDS-Location", StringComparison.CurrentCultureIgnoreCase))
                        xrdsPointer = dict["content"];
                }
                else if (chunk.sTag == "link")
                {
                    if (dict.ContainsKey("rel") && dict.ContainsKey("href"))
                    {
                        // There are certain RELs we care about.
                        if (dict["rel"].Contains("openid2.provider")) openid2Provider = dict["href"];
                        else if (dict["rel"].Contains("openid2.local_id")) openid2OpLocal = dict["href"];
                    }
                }
            }

            // Do we have needed LINKs?
            if (openid2Provider != null)
            {
                // Yes we do! Signal success.
                callback(new ProviderDiscoveryData { Success = true, DiscoveredClaimedIdentifier = true, ProviderUri = openid2Provider, OpLocalIdentity = openid2OpLocal });
                return;
            }

            // Do we have an XRDS pointer?
            if (xrdsPointer != null)
            {
                // Yes we do! Retrieve that and point back to this function.
                var newRequest = CreateDiscoveryWebRequest(xrdsPointer);
                if (newRequest == null)
                {
                    // Signal a failure.
                    callback(new ProviderDiscoveryData { Success = false });
                    return;
                }
                newRequest.BeginGetResponse(GotDiscoveryPage, new object[] { newRequest, callback });
                return;
            }

            // We got nothing :(
            callback(new ProviderDiscoveryData { Success = false, FailureReason = new Exception("Could not find OpenID endpoint.") });
        }
Example 5
        /// <summary>
        /// Starts parsing
        /// </summary>
        /// <param name="iParseTimes">Number of times to parse document (useful for benchmarking)</param>
        void Start(int iParseTimes, string sFileName)
        {
            if (!File.Exists(sFileName))
            {
                sFileName = Path.Combine(Directory.GetCurrentDirectory(), "tests" + Path.DirectorySeparatorChar + sFileName);

                if (!File.Exists(sFileName))
                {
                    Console.WriteLine("Could not find file in current directory to parse - expected it to be here: " + sFileName);
                    return;
                }
            }

            HTMLparser oP = new HTMLparser();

            // This is optional, but if you want high performance then you may
            // want to set chunk hash mode to FALSE. This would result in tag params
            // being added to string arrays in HTMLchunk object called sParams and sValues, with number
            // of actual params being in iParams. See code below for details.
            //
            // When TRUE (the default), tag params will be added to the oParams hashtable of the HTMLchunk object
            oP.SetChunkHashMode(false);

            // if you set this to true then the original parsed HTML for a given chunk will be kept -
            // this will reduce performance somewhat, but may be desirable in cases where
            // reconstruction of the HTML is necessary
            oP.bKeepRawHTML = false;

            // if set to true (it is false by default), then entities will be decoded: this is essential
            // if you want strings that contain the final representation of the data in the HTML; however,
            // be aware that if you want to put such strings back into output HTML you will
            // need to entity-encode them again, otherwise the resulting HTML may be invalid
            oP.bDecodeEntities = true;

            // there is also an option to keep most entities as is and only replace a few like &nbsp;
            // this is called Mini Entities mode - it is handy when the HTML will need
            // to be re-created after parsing, though in that case entities really
            // should not be decoded at all
            oP.bDecodeMiniEntities = true;

            if (!oP.bDecodeEntities && oP.bDecodeMiniEntities)
            {
                oP.InitMiniEntities();
            }

            // if set to true, then for Comments and SCRIPT tags the data set to oHTML will be
            // extracted from BETWEEN those tags, rather than being the complete raw HTML including the tags;
            // this only works if auto extraction is enabled
            oP.bAutoExtractBetweenTagsOnly = true;

            // if true then comments will be extracted automatically
            oP.bAutoKeepComments = true;

            // if true then scripts will be extracted automatically:
            oP.bAutoKeepScripts = true;

            // if this option is true then whitespace before the start of a tag will be compressed to a single
            // space character (" "); if false then the full whitespace before the tag will be returned (slower)
            // only set this to false if you need the exact whitespace between tags, otherwise it is just
            // a waste of CPU cycles
            oP.bCompressWhiteSpaceBeforeTag = true;

            // if true (the default) then tags with attributes that are marked as CLOSED (/ at the end) will be
            // automatically treated as open tags - this is not suitable for XML parsing, but is kept for backwards
            // compatibility as it avoids having to check for the same tag appearing both closed
            // and open
            oP.bAutoMarkClosedTagsWithParamsAsOpen = false;

            // load HTML from file
            oP.LoadFromFile(sFileName);

            // alternatively you can set HTML to be parsed as follows (bHTML is byte[] array containing data):
            // oP.Init(bHTML);

            DateTime oStart = DateTime.Now;

            for (int i = 0; i < iParseTimes; i++)
            {
                if (iParseTimes > 1)
                {
                    BenchMarkParse(oP);
                }
                else
                {
                    ParseAndPrint(oP);
                }

                oP.Reset();
            }

            // calculate number of milliseconds we were parsing
            int iMSecs = (int)((DateTime.Now.Ticks - oStart.Ticks) / TimeSpan.TicksPerMillisecond);

            if (iMSecs > 0 && iParseTimes > 0)
            {
                Console.Error.WriteLine("Parsed {0} time(s), total time {1:0.00} secs, ~{2:0.00} ms per full parse.", iParseTimes, iMSecs * 1.0 / 1000, iMSecs * 1.0 / iParseTimes);
            }

            oP.Close();
        }
Example 6
        public HtmlNode Parse(string html)
        {
            // Majestic12 doesn't support doctype
            html = dedoctype.Replace(html, "");
            var builder = new HtmlBuilder();

            var parser = new HTMLparser();
            parser.bDecodeEntities = false;
            parser.SetChunkHashMode(true);

            parser.Init(html);
            var chunk = parser.ParseNext();
            while (chunk != null)
            {
                switch (chunk.oType)
                {
                    case HTMLchunkType.OpenTag:
                        // if something goes wrong - ignore it
                        if (chunk.sTag != "")
                        {
                            var attributes = new Dictionary<string, string>();
                            if (chunk.iParams != 0)
                            {
                                foreach (string name in chunk.oParams.Keys)
                                {
                                    attributes.Add(name, (string)chunk.oParams[name]);
                                }
                            }
                            builder.OpenTag(chunk.sTag, attributes);
                        }
                        break;
                    case HTMLchunkType.Comment:
                        builder.AddComment(chunk.oHTML);
                        break;
                    case HTMLchunkType.CloseTag:
                        if (chunk.bEndClosure)
                        {
                            var attr = new Dictionary<string, string>();
                            if (chunk.iParams != 0)
                            {
                                foreach (string name in chunk.oParams.Keys)
                                {
                                    attr.Add(name, (string)chunk.oParams[name]);
                                }
                            }
                            builder.OpenTag(chunk.sTag, attr);
                            builder.CloseTag(chunk.sTag);
                        }
                        else
                        {
                            builder.CloseTag(chunk.sTag);
                        }
                        break;
                    case HTMLchunkType.Script:
                        builder.AddScript(chunk.oHTML);
                        break;
                    case HTMLchunkType.Text:
                        builder.AddText(chunk.oHTML);
                        break;
                    default:
                        break;
                }
                chunk = parser.ParseNext();
            }
            return builder.Render();
        }