Allows parsing HTML by splitting it into small tokens (HTMLchunks) such as tags, text, comments, etc. Do NOT create multiple instances of this class - REUSE a single instance. Do NOT call the same instance from multiple threads - it is NOT thread safe.
Inheritance: IDisposable
Exemple #1
0
        /// <summary>
        /// Benchmark-only parse loop: drains every chunk from the parser and merely classifies it by
        /// type, so that nothing but the HTML parsing object itself is exercised
        /// </summary>
        /// <param name="oP">Parser object to pull chunks from</param>
        void BenchMarkParse(HTMLparser oP)
        {
            // WARNING: HTMLparser re-uses the chunk object it returns, so do not destroy it
            // before parsing has finished.
            // ParseNext() returning null indicates that the end of parsing was reached
            for (HTMLchunk oCurrent = oP.ParseNext(); oCurrent != null; oCurrent = oP.ParseNext())
            {
                // every recognised chunk type is deliberately a no-op in the benchmark
                switch (oCurrent.oType)
                {
                    case HTMLchunkType.OpenTag:     // open tag, ie <a href="">
                    case HTMLchunkType.CloseTag:    // close tag, ie </a>
                    case HTMLchunkType.Text:        // normal text
                    case HTMLchunkType.Comment:     // HTML comment, stuff between <!-- and -->
                        break;
                }
            }
        }
        /// <summary>
        /// Creates a new parser and applies the option set used by this code base
        /// </summary>
        /// <returns>Freshly created and configured HTMLparser</returns>
        internal static HTMLparser GetInstance()
        {
            HTMLparser oParser = new HTMLparser();

            // Chunk hash mode (TRUE by default) puts tag params into the hashtable HTMLchunk.oParams.
            // Switching it to FALSE is optional but faster: params then land in the string arrays
            // sParams and sValues of HTMLchunk, with the number of actual params held in iParams.
            oParser.SetChunkHashMode(false);

            // Keep the original parsed HTML of each chunk: costs some performance, but may be
            // desirable where the HTML has to be reconstructed later
            oParser.bKeepRawHTML = true;

            // Decode entities (false by default): essential to obtain strings holding the final
            // representation of the data in HTML. Be aware that such strings need Entity encoding
            // again before being placed into output HTML, or the same string may fail later
            oParser.bDecodeEntities = true;

            // Mini Entities mode keeps most entities as is and only replaces stuff like &nbsp; -
            // handy when HTML must be re-created after parsing, though in that case entities
            // really should not be parsed at all
            oParser.bDecodeMiniEntities = true;

            // mini entities are only initialised when full entity decoding is off; as long as
            // bDecodeEntities reads back the value assigned above, this guard does not fire
            if (!oParser.bDecodeEntities && oParser.bDecodeMiniEntities)
            {
                oParser.InitMiniEntities();
            }

            // For Comments and SCRIPT tags, put only the data BETWEEN the tags into oHTML instead of
            // the complete RAW HTML including the tags - this only works if auto extraction is enabled
            oParser.bAutoExtractBetweenTagsOnly = true;

            // extract comments automatically
            oParser.bAutoKeepComments = true;

            // extract scripts automatically
            oParser.bAutoKeepScripts = true;

            // Compress whitespace before the start of a tag into a single " " character.
            // Setting this to false returns the full whitespace (slower) - only worth it when the
            // exact whitespace between tags is needed, otherwise it is a waste of CPU cycles
            oParser.bCompressWhiteSpaceBeforeTag = true;

            // When true (the default), tags with attributes that are marked as CLOSED (/ at the end)
            // are forced to be treated as open tags - no good for XML parsing, it exists for
            // backwards compatibility so callers need not check whether the same tag is closed or open
            oParser.bAutoMarkClosedTagsWithParamsAsOpen = false;

            return oParser;
        }
Exemple #3
0
        /// <summary>
        /// Loads the sample document "tests/majestic12.html" from the current directory, parses it
        /// the requested number of times and prints timing information to the console
        /// </summary>
        /// <param name="iParseTimes">Number of times to parse document (useful for benchmarking):
        /// more than one pass runs the silent benchmark loop, a single pass prints every chunk</param>
        void Start(int iParseTimes)
        {
            string sFileName = Path.Combine(Directory.GetCurrentDirectory(), "tests" + Path.DirectorySeparatorChar + "majestic12.html");

            if (!File.Exists(sFileName))
            {
                Console.WriteLine("Could not find file in current directory to parse - expected it to be here: " + sFileName);
                return;
            }

            HTMLparser oP = new HTMLparser();

            try
            {
                // This is optional, but if you want high performance then you may
                // want to set chunk hash mode to FALSE. This would result in tag params
                // being added to string arrays in HTMLchunk object called sParams and sValues, with number
                // of actual params being in iParams.
                //
                // When TRUE (and its default) tag params will be added to hashtable HTMLchunk (object).oParams
                oP.SetChunkHashMode(false);

                // if you set this to true then original parsed HTML for given chunk will be kept -
                // this will reduce performance somewhat, but may be desirable in some cases where
                // reconstruction of HTML may be necessary
                oP.bKeepRawHTML = false;

                // load HTML from file
                oP.LoadFromFile(sFileName);

                // Stopwatch is monotonic and high resolution, unlike DateTime.Now which follows the
                // wall clock and can jump when the system time is adjusted
                System.Diagnostics.Stopwatch oTimer = System.Diagnostics.Stopwatch.StartNew();

                // the mode does not change between passes, so decide it once
                bool bBenchmark = iParseTimes > 1;

                for (int i = 0; i < iParseTimes; i++)
                {
                    if (bBenchmark)
                    {
                        BenchMarkParse(oP);
                    }
                    else
                    {
                        ParseAndPrint(oP);
                    }

                    // rewind the parser so that the next pass starts from the beginning
                    oP.Reset();
                }

                oTimer.Stop();

                // fractional values: integer division used to report "0 secs" / "0 ms" for fast runs
                double dMSecs = oTimer.Elapsed.TotalMilliseconds;

                if (iParseTimes > 0)
                {
                    Console.WriteLine("Parsed {0} time(s), total time {1} secs, approximately {2} ms per full parse.", iParseTimes, Math.Round(dMSecs / 1000.0, 3), Math.Round(dMSecs / iParseTimes, 3));
                }
            }
            finally
            {
                // release the parser even when loading or parsing throws
                oP.Close();
            }
        }
        /// <summary>
        /// Drops the references held by this object; only the first call has any effect
        /// </summary>
        /// <param name="bDisposing">Not used by this implementation</param>
        private void Dispose(bool bDisposing)
        {
            // already disposed - nothing left to release
            if (bDisposed)
            {
                return;
            }

            bDisposed = true;

            // NOTE(review): if Init(...) below belongs to the same class, it also assigns oHE,
            // which is not cleared here - confirm whether that is intentional
            bHTML = null;
            oChunk = null;
            sText = null;
            oE = null;
            oP = null;
        }
        /// <summary>
        /// Inits tag parser by storing the collaborating objects and the data buffer it will work on
        /// </summary>
        /// <param name="p_oP">Owning HTML parser</param>
        /// <param name="p_oChunk">Chunk object stored for later use by this tag parser</param>
        /// <param name="p_sText">Dynamic string buffer stored for later use by this tag parser</param>
        /// <param name="p_bHTML">Byte array with the HTML data</param>
        /// <param name="p_iDataLength">Length of the data (presumably the number of valid bytes in
        /// p_bHTML - TODO confirm against caller)</param>
        /// <param name="p_oE">Entities helper object</param>
        /// <param name="p_oHE">Heuristics helper object</param>
        internal void Init(HTMLparser p_oP, HTMLchunk p_oChunk, DynaString p_sText, byte[] p_bHTML, int p_iDataLength, HTMLentities p_oE, HTMLheuristics p_oHE)
        {
            // collaborating objects
            oP = p_oP;
            oChunk = p_oChunk;
            sText = p_sText;
            oE = p_oE;
            oHE = p_oHE;

            // data buffer and its length
            bHTML = p_bHTML;
            iDataLength = p_iDataLength;

            // heuristics must stay away from the very end of the data, hence the reduced limit
            iMaxHeuDataLength = iDataLength - MIN_DATA_SIZE_FOR_HEURISTICS;
        }
        /// <summary>
        /// Passes a chunk to HTMLparser.HandleMetaEncoding and reports on the console when a META
        /// encoding tag was recognised but the encoding could not be set
        /// </summary>
        /// <param name="oP">HTML parser object that is used for parsing</param>
        /// <param name="oChunk">Parsed chunk to inspect</param>
        /// <param name="bEncodingSet">Caller's flag telling whether encoding was already set</param>
        private void HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
        {
            // Major browsers assume that the first Encoding is the correct one, so once an
            // encoding is set no attempt is made to set a new one
            if (!bEncodingSet)
            {
                bool bMetaHandled = HTMLparser.HandleMetaEncoding(oP, oChunk, ref bEncodingSet);

                // the tag was handled, yet the flag is still unset: setting the encoding failed
                if (bMetaHandled && !bEncodingSet)
                {
                    Console.WriteLine("Failed to set encoding from META: {0}", oChunk.GenerateHTML());
                }
            }
        }
Exemple #7
0
		/// <summary>
		/// Closes the current parser (if there is one) and replaces it with a new instance that
		/// has entity decoding switched on
		/// </summary>
		public void CreateParser()
		{
			// get rid of the parser left over from a previous call
			if (oP != null)
			{
				oP.Close();
				oP = null;
			}

			HTMLparser oFresh = new HTMLparser();
			oFresh.bDecodeEntities = true;
			oP = oFresh;

			// dummy assertion
			Assert.IsNotNull(oP);
		}
Exemple #8
0
        /// <summary>
        /// Looks the given URL up in the embedded "fcc.dcc" table (one "key TAB value" entry per line)
        /// and, for a supported site, replaces <paramref name="pu"/> with the value of "contentUrl"
        /// found in the page's "application/ld+json" script
        /// </summary>
        /// <param name="pu">URL to examine; overwritten only when a content URL was extracted</param>
        static void getCDU(ref string pu)
        {
            var dcc = new Dictionary<string, string>();
            var assembly = System.Reflection.Assembly.GetExecutingAssembly();

            // the reader (and the resource stream underneath it) is released even if a line fails to parse
            using (var sr = new StreamReader(assembly.GetManifestResourceStream("fcc.dcc"), Encoding.UTF8))
            {
                while (!sr.EndOfStream)
                {
                    string s = sr.ReadLine();
                    int p = s.IndexOf('\t');

                    // blank or malformed lines (no TAB separator) are skipped instead of crashing
                    if (p < 0)
                    {
                        continue;
                    }

                    dcc.Add(s.Substring(0, p), s.Substring(p + 1));
                }
            }

            // first table key that occurs anywhere in the URL selects the site
            string dm = "";
            foreach (var k in dcc.Keys)
            {
                if (pu.IndexOf(k, System.StringComparison.Ordinal) > -1)
                {
                    dm = k;
                    break;
                }
            }

            if (dm == "")
            {
                return;
            }

            try
            {
                // NOTE(review): presumably getPUbytes fetches the page bytes and getM12 loads them
                // into the shared static parser - confirm against those helpers
                byte[] by = getPUbytes(pu, dm, dcc[dm]);
                getM12(by);

                switch (dm)
                {
                case "m24.ru":
                    //https://www.m24.ru/news/proisshestviya/20012020/104276
                    string v = getScrByID("type", "application/ld+json");
                    if (v != null)
                    {
                        int o = v.IndexOf("contentUrl", System.StringComparison.Ordinal);
                        if (o > -1)
                        {
                            // the value starts 14 chars after the start of "contentUrl" and runs
                            // up to, but not including, the char just before the next comma
                            int start = o + 14;
                            int comma = start <= v.Length ? v.IndexOf(',', start) : -1;
                            int length = comma - start - 1;

                            // leave the URL untouched when the JSON is truncated or shaped differently
                            if (comma > -1 && length >= 0)
                            {
                                pu = v.Substring(start, length);
                            }
                        }
                    }
                    break;
                }
            }
            finally
            {
                // always let go of the shared parser, also when extraction throws
                parser = null;
            }
        }
        /// <summary>
        /// Handles META tags that set page encoding: delegates to HTMLparser.HandleMetaEncoding and
        /// writes a console message when a recognised META tag failed to set the encoding
        /// </summary>
        /// <param name="oP">HTML parser object that is used for parsing</param>
        /// <param name="oChunk">Chunk</param>
        /// <param name="bEncodingSet">Caller's flag telling whether encoding was already set</param>
        void HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
        {
            // the first Encoding is assumed to be the correct one (this is the logic that major
            // browsers follow), so nothing is attempted once an encoding is in place
            if (bEncodingSet)
                return;

            bool bWasEncodingMeta = HTMLparser.HandleMetaEncoding(oP, oChunk, ref bEncodingSet);

            // either not an encoding META tag at all, or the encoding was applied successfully
            if (!bWasEncodingMeta || bEncodingSet)
                return;

            Console.WriteLine("Failed to set encoding from META: {0}", oChunk.GenerateHTML());
        }
Exemple #10
0
 /// <summary>
 /// Creates a new HTMLparser, stores it in the shared <c>parser</c> member, configures it and
 /// initialises it with the given bytes
 /// </summary>
 /// <param name="by">Data handed to the parser's Init</param>
 static void getM12(byte[] by)
 {
     // publish the new instance first, then configure that very same object through a local alias
     HTMLparser oFresh = new HTMLparser();
     parser = oFresh;

     oFresh.SetChunkHashMode(false);
     oFresh.bKeepRawHTML = false;
     oFresh.bDecodeEntities = true;
     oFresh.bDecodeMiniEntities = true;

     // mini entities are only initialised when full entity decoding is off; as long as
     // bDecodeEntities reads back the value assigned above, this guard does not fire
     if (!oFresh.bDecodeEntities && oFresh.bDecodeMiniEntities)
         oFresh.InitMiniEntities();

     oFresh.bAutoExtractBetweenTagsOnly = true;
     oFresh.bAutoKeepComments = true;
     oFresh.bAutoKeepScripts = true;
     oFresh.bCompressWhiteSpaceBeforeTag = true;
     oFresh.bAutoMarkClosedTagsWithParamsAsOpen = false;

     oFresh.Init(by);
 }
Exemple #11
0
        /// <summary>
        /// Parses the given HTML string and returns an independent copy of every chunk whose raw
        /// HTML is not empty or pure whitespace
        /// </summary>
        /// <param name="str">HTML to parse</param>
        /// <returns>Array of copied chunks (oHTML, oType and sTag are filled in), in parse order</returns>
        private HTMLchunk[] htmlParse(string str)
        {
            //return value
            System.Collections.Generic.List<HTMLchunk> ret = new System.Collections.Generic.List<HTMLchunk>();

            //init parser
            Majestic12.HTMLparser parser = new Majestic12.HTMLparser();

            try
            {
                //keep raw html because we need to reconstruct it
                parser.bKeepRawHTML = true;
                //keep text... this is for parsing just tags
                parser.bTextMode = true;
                //initialize to parse the string
                parser.Init(str);

                Majestic12.HTMLchunk chunk;
                // we parse until returned chunk is null indicating we reached end of parsing
                while ((chunk = parser.ParseNext()) != null)
                {
                    //discard empty blocks for performance increase
                    if (chunk.oHTML == null || chunk.oHTML.Trim().Length == 0)
                    {
                        continue;
                    }

                    // The values must be copied into a new chunk of our own, since the chunk
                    // returned by ParseNext() is not ours to keep across iterations.
                    // Strings are immutable, so sharing the references is a safe copy; the obsolete
                    // String.Copy is not needed and would throw on a null sTag
                    HTMLchunk clone = new HTMLchunk(false);
                    clone.oHTML = chunk.oHTML;
                    clone.oType = chunk.oType;
                    clone.sTag = chunk.sTag;

                    ret.Add(clone);
                }
            }
            finally
            {
                // release parser resources even when parsing throws
                parser.CleanUp();
            }

            //return chunk array
            return ret.ToArray();
        }
Exemple #12
0
        /// <summary>
        /// Parses HTML by chunk and prints each parsed chunk (tag with its params, script, comment or
        /// text) on screen; when bReadLineDelay is set it waits for ENTER before going to the next
        /// chunk, otherwise it just ends the line. META tags are passed to HandleMetaEncoding
        /// </summary>
        /// <param name="oP">Parser object</param>
        void ParseAndPrint(HTMLparser oP)
        {
            //	bReadLineDelay=false;
            if(bReadLineDelay)
                Console.WriteLine("Parsing HTML, will print each parsed chunk, press ENTER after each to continue");

            // parser will return us tokens called HTMLchunk -- warning DO NOT destroy it until end of parsing
            // because HTMLparser re-uses this object
            HTMLchunk oChunk=null;

            // NOTE: bear in mind that when you deal with content which uses non-Latin chars, then you
            // need to ensure that correct encoding is set, this is often set in HTML itself, but sometimes
            // only in HTTP headers for a given page - some pages use BOTH, but browsers seem to
            // consider HTTP header setting as more important, so it is best to behave in similar way.

            // See below for code that deals with META based charset setting, similarly you need to call
            // it here if charset is set in Content-Type header

            // we will track whether encoding was set or not here, this is important
            // because we may have to do re-encoding of text found BEFORE META tag, this typically
            // happens for TITLE tags only - if we had no encoding set and then had it set, then
            // we need to reencode it, highly annoying, but having garbage in title is even more annoying
            bool bEncodingSet=false;

            // debug: forces an initial encoding; bEncodingSet stays false, so a later META tag
            // is still passed to HandleMetaEncoding
            oP.SetEncoding(System.Text.Encoding.GetEncoding("iso-8859-1"));

            // we parse until returned oChunk is null indicating we reached end of parsing
            while((oChunk=oP.ParseNext())!=null)
            {
                switch(oChunk.oType)
                {
                        // matched open tag, ie <a href="">
                    case HTMLchunkType.OpenTag:
                        Console.Write("Open tag: "+oChunk.sTag);

                        // in order to set correct encoding we need to keep an eye on META tags
                        // that hint to us what encoding should be used, note here
                        // that some webpages have TITLE set BEFORE meta-tags, which means you will
                        // have to re-encode it in order to get correct representation of text

            // shared tail of this case: the CloseTag and Script cases below jump here with goto
            // when their chunk has params, so everything from this label down to the break
            // runs for them as well
            PrintParams:

                        if(oChunk.sTag.Length==4 && oChunk.sTag=="meta")
                        {
                            HandleMetaEncoding(oP,oChunk,ref bEncodingSet);
                        };

                        // commented out call to code that will do the job for you - long code below
                        // is left to demonstrate how to access individual param values
                        // Console.WriteLine("{0}",oChunk.GenerateParamsHTML());

                        if(oChunk.bHashMode)
                        {
                            // hash mode: params and their values are kept in oChunk.oParams
                            if(oChunk.oParams.Count>0)
                            {
                                foreach(string sParam in oChunk.oParams.Keys)
                                {
                                    string sValue=oChunk.oParams[sParam].ToString();

                                    // params with an empty value are printed as a bare name
                                    if(sValue.Length>0)
                                        Console.Write(" {0}='{1}'",sParam,sValue);
                                    else
                                        Console.Write(" {0}",sParam);
                                }

                            }
                        }
                        else
                        {
                            // this is alternative method of getting params -- it may look less convenient
                            // but it saves a LOT of CPU ticks while parsing. It makes sense when you only need
                            // params for a few
                            if(oChunk.iParams>0)
                            {
                                for(int i=0; i<oChunk.iParams; i++)
                                {
                                    // here we can use exactly the same single/double quotes as they
                                    // were used on params

                                    switch(oChunk.cParamChars[i])
                                    {
                                        // a space in cParamChars: the value is printed without quotes
                                        case (byte)' ':
                                            if(oChunk.sValues[i].Length==0)
                                                Console.Write(" {0}",oChunk.sParams[i]);
                                            else
                                                Console.Write(" {0}={1}",oChunk.sParams[i],oChunk.sValues[i]);
                                            break;

                                        // anything else: the stored char is printed on both sides of the value
                                        default:
                                            Console.Write(" {0}={1}{2}{1}",oChunk.sParams[i],(char)oChunk.cParamChars[i],oChunk.sValues[i]);
                                            break;
                                    }

                                }

                            }

                        }

                        break;

                        // matched close tag, ie </a>
                    case HTMLchunkType.CloseTag:
                        //Console.Write(oChunk.GenerateHTML());

                        Console.Write("Closed tag: "+oChunk.sTag);

                        // params found on a close tag are printed by the shared code in the OpenTag case
                        if(oChunk.iParams>0)
                            goto PrintParams;

                        break;

                    // NOTE: you have to call finalisation because it is not done for Scripts or comments
                    // Matched data between <script></script> tags
                    case HTMLchunkType.Script:

                        // raw HTML is requested explicitly only when neither option has kept it already
                        if(!oP.bAutoKeepScripts && !oP.bKeepRawHTML)
                            oP.SetRawHTML(oChunk);

                        if(oChunk.oHTML.Length>0)
                            Console.Write("Script: "+oChunk.oHTML);
                        else
                            Console.Write("Script: [ignored for performance reasons]");

                        // params of the script tag are printed by the shared code in the OpenTag case
                        if(oChunk.iParams>0)
                            goto PrintParams;

                        break;

                    // NOTE: you have to call finalisation because it is not done for Scripts or comments
                    // matched HTML comment, that's stuff between <!-- and -->
                    case HTMLchunkType.Comment:

                        //Console.WriteLine("{0}",oChunk.GenerateHTML());

                        if(oP.bKeepRawHTML || oP.bAutoKeepComments)
                        {
                            // by default we won't finalise automatically as comments are often
                            // very lengthy and it is costly to create long strings when they are not
                            // needed, ie: during indexing of text
                            Console.Write("Comment: "+oChunk.oHTML);
                        }
                        else
                        {
                            // Even if raw HTML by default was not taken you can get it anyway by
                            // uncommenting next line
                            //oP.SetRawHTML(oChunk);

                            Console.Write("Comment: [ignored for performance reasons]");
                        }
                        break;

                    // matched normal text
                    case HTMLchunkType.Text:

                        // skip pure whitespace that we are not really interested in; the continue
                        // also skips the ReadLine wait at the bottom of the loop, so the user is
                        // not asked to press ENTER for an empty chunk
                        if(oP.bCompressWhiteSpaceBeforeTag && oChunk.oHTML.Trim().Length==0 && bReadLineDelay)
                            continue;

                        Console.Write("Text: '{0}'",oChunk.oHTML);
                        break;

                };

                // finish the chunk: wait for ENTER in interactive mode, otherwise just end the line
                if(bReadLineDelay)
                    Console.ReadLine();
                else
                    Console.WriteLine("");
            }
        }
Exemple #13
0
        /// <summary>
        /// Walks through all chunks of the HTML and prints a description of each one on screen.
        /// With bReadLineDelay set it waits for ENTER after every chunk, otherwise it ends the line
        /// </summary>
        /// <param name="oP">Parser object</param>
        void ParseAndPrint(HTMLparser oP)
        {
            if (bReadLineDelay)
                Console.WriteLine("Parsing HTML, will print each parsed chunk, press ENTER after each to continue");

            // WARNING: HTMLparser re-uses the chunk object it returns, so do not destroy it until
            // the end of parsing. ParseNext() returning null means the end of parsing was reached
            for (HTMLchunk oCurrent = oP.ParseNext(); oCurrent != null; oCurrent = oP.ParseNext())
            {
                HTMLchunkType eType = oCurrent.oType;

                if (eType == HTMLchunkType.OpenTag)
                {
                    // open tag, ie <a href="">, followed by its params and their values
                    Console.Write("Open tag: " + oCurrent.sTag);

                    if (oCurrent.bHashMode)
                    {
                        // In hash mode param/values are kept in the Hashtable oChunk.oParams; this
                        // makes parsing slower, so for the highest performance set HashMode to false
                        if (oCurrent.oParams.Count > 0)
                        {
                            foreach (string sName in oCurrent.oParams.Keys)
                            {
                                string sParamValue = oCurrent.oParams[sName].ToString();

                                // a param with an empty value is printed as a bare name
                                if (sParamValue.Length == 0)
                                    Console.Write(" {0}", sName);
                                else
                                    Console.Write(" {0}='{1}'", sName, sParamValue);
                            }
                        }
                    }
                    else
                    {
                        // Array based alternative: may look less convenient, but it saves a LOT of
                        // CPU ticks while parsing, which pays off when params are needed for a few tags only
                        for (int iIndex = 0; iIndex < oCurrent.iParams; iIndex++)
                        {
                            if (oCurrent.sValues[iIndex].Length == 0)
                                Console.Write(" {0}", oCurrent.sParams[iIndex]);
                            else
                                Console.Write(" {0}='{1}'", oCurrent.sParams[iIndex], oCurrent.sValues[iIndex]);
                        }
                    }
                }
                else if (eType == HTMLchunkType.CloseTag)
                {
                    // close tag, ie </a>
                    Console.Write("Closed tag: " + oCurrent.sTag);
                }
                else if (eType == HTMLchunkType.Text)
                {
                    // normal text
                    Console.Write("Text: '{0}'", oCurrent.oHTML);
                }
                else if (eType == HTMLchunkType.Comment)
                {
                    // HTML comment, that's stuff between <!-- and -->
                    // By default comments are not finalised for performance reasons, so the chunk is
                    // finalised here - unless the parser keeps raw HTML, in which case it is not needed
                    if (!oP.bKeepRawHTML)
                        oCurrent.Finalise();

                    Console.Write("Comment: " + oCurrent.oHTML);
                }

                // finish the chunk: wait for ENTER in interactive mode, otherwise just end the line
                if (bReadLineDelay)
                    Console.ReadLine();
                else
                    Console.WriteLine("");
            }
        }
Exemple #14
0
        /// <summary>
        /// Handles META tags that set page encoding
        /// </summary>
        /// <param name="oP">HTML parser object that is used for parsing</param>
        /// <param name="oChunk">Parsed chunk that should contain tag META; if it was not parsed in hash
        /// mode its params are converted to the hash first</param>
        /// <param name="bEncodingSet">Your own flag that shows whether encoding was already set or not, if set
        /// once then it should not be changed - this is the logic applied by major browsers. It is switched
        /// to true when this call sets the encoding successfully</param>
        /// <returns>True if this was META tag setting Encoding, false otherwise</returns>
        public static bool HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
        {
            // anything but a META tag is of no interest (a missing tag name is treated the same way)
            if (oChunk.sTag != "meta")
            {
                return false;
            }

            // if we do not use hashmode already then we call conversion explicitly
            // this is slow, but METAs are very rare so performance penalty is low
            if (!oChunk.bHashMode)
            {
                oChunk.ConvertParamsToHash();
            }

            string sKey = oChunk.oParams["http-equiv"] as string;

            if (sKey == null)
            {
                return false;
            }

            // Ordinal case-insensitive comparison: no lower-cased copy of the string is allocated
            // and the result does not depend on the current culture.
            // "content-category" is a rare case (appears to work in IE) reported to exist in some
            // pages by Martin Bächtold
            bool bEncodingMeta =
                string.Equals(sKey, "content-type", System.StringComparison.OrdinalIgnoreCase) ||
                string.Equals(sKey, "content-category", System.StringComparison.OrdinalIgnoreCase);

            if (!bEncodingMeta)
            {
                return false;
            }

            // we might have charset here that may hint at necessity to decode page.
            // Once encoding is set it should not be changed, but you can be sure there are
            // web pages out there that do that!
            if (!bEncodingSet)
            {
                string sData = oChunk.oParams["content"] as string;

                // it is possible we have broken META tag without Content part
                if (sData != null && oP.SetEncoding(sData))
                {
                    // encoding changed: the caller needs to re-encode any text found so far,
                    // most likely it will be just TITLE, the rest can be ignored anyway
                    bEncodingSet = true;
                }

                // otherwise setting the encoding failed - most likely the encoding string was incorrect
                // or the machine lacks codepages; the caller can detect this because the method
                // returns true while bEncodingSet is still false
            }

            return true;
        }
        /// <summary>
        /// Parses HTML by chunk, prints parsed data on screen and waits for ENTER to go to next chunk
        /// </summary>
        /// <param name="oP">Parser object</param>
        void ParseAndPrint(HTMLparser oP)
        {
            //	bReadLineDelay=false;
            if (bReadLineDelay)
            {
                Console.WriteLine("Parsing HTML, will print each parsed chunk, press ENTER after each to continue");
            }

            // parser will return us tokens called HTMLchunk -- warning DO NOT destroy it until end of parsing
            // because HTMLparser re-uses this object
            HTMLchunk oChunk = null;

            // NOTE: bear in mind that when you deal with content which uses non-Latin chars, then you
            // need to ensure that correct encoding is set, this often set in HTML itself, but sometimes
            // only in HTTP headers for a given page - some pages use BOTH, but browsers seem to
            // consider HTTP header setting as more important, so it is best to behave in similar way.

            // See below for code that deals with META based charset setting, similarly you need to call
            // it here if charset is set in Content-Type header

            // we will track whether encoding was set or not here, this is important
            // because we may have to do re-encoding of text found BEFORE META tag, this typically
            // happens for TITLE tags only - if we had no encoding set and then had it set, then
            // we need to reencode it, highly annoying, but having garbage in title is even more annoying
            bool bEncodingSet = false;

            // debug:
            oP.SetEncoding(System.Text.Encoding.GetEncoding("iso-8859-1"));

            // we parse until returned oChunk is null indicating we reached end of parsing
            while ((oChunk = oP.ParseNext()) != null)
            {
                switch (oChunk.oType)
                {
                // matched open tag, ie <a href="">
                case HTMLchunkType.OpenTag:
                    Console.Write("Open tag: " + oChunk.sTag);

                    // in order to set correct encoding we need to keep an eye on META tags
                    // that hit us on what the encoding should be used, note here
                    // that some webpages have TITLE set BEFORE meta-tags, which means you will
                    // have to re-encode it in order to get correct representation of text

PrintParams:

                    if (oChunk.sTag.Length == 4 && oChunk.sTag == "meta")
                    {
                        HandleMetaEncoding(oP, oChunk, ref bEncodingSet);
                    }
                    ;

                    // commented out call to code that will do the job for you - long code below
                    // is left to demonstrate how to access individual param values
                    // Console.WriteLine("{0}",oChunk.GenerateParamsHTML());



                    if (oChunk.bHashMode)
                    {
                        if (oChunk.oParams.Count > 0)
                        {
                            foreach (string sParam in oChunk.oParams.Keys)
                            {
                                string sValue = oChunk.oParams[sParam].ToString();

                                if (sValue.Length > 0)
                                {
                                    Console.Write(" {0}='{1}'", sParam, sValue);
                                }
                                else
                                {
                                    Console.Write(" {0}", sParam);
                                }
                            }
                        }
                    }
                    else
                    {
                        // this is alternative method of getting params -- it may look less convinient
                        // but it saves a LOT of CPU ticks while parsing. It makes sense when you only need
                        // params for a few
                        if (oChunk.iParams > 0)
                        {
                            for (int i = 0; i < oChunk.iParams; i++)
                            {
                                // here we can use exactly the same single/double quotes as they
                                // were used on params

                                switch (oChunk.cParamChars[i])
                                {
                                case (byte)' ':
                                    if (oChunk.sValues[i].Length == 0)
                                    {
                                        Console.Write(" {0}", oChunk.sParams[i]);
                                    }
                                    else
                                    {
                                        Console.Write(" {0}={1}", oChunk.sParams[i], oChunk.sValues[i]);
                                    }
                                    break;

                                default:
                                    Console.Write(" {0}={1}{2}{1}", oChunk.sParams[i], (char)oChunk.cParamChars[i], oChunk.sValues[i]);
                                    break;
                                }
                            }
                        }
                    }

                    break;

                // matched close tag, ie </a>
                case HTMLchunkType.CloseTag:
                    //Console.Write(oChunk.GenerateHTML());

                    Console.Write("Closed tag: " + oChunk.sTag);

                    if (oChunk.iParams > 0)
                    {
                        goto PrintParams;
                    }

                    break;

                // NOTE: you have to call finalisation because it is not done for Scripts or comments
                // Matched data between <script></script> tags
                case HTMLchunkType.Script:

                    if (!oP.bAutoKeepScripts && !oP.bKeepRawHTML)
                    {
                        oP.SetRawHTML(oChunk);
                    }

                    if (oChunk.oHTML.Length > 0)
                    {
                        Console.Write("Script: " + oChunk.oHTML);
                    }
                    else
                    {
                        Console.Write("Script: [ignored for performance reasons]");
                    }

                    if (oChunk.iParams > 0)
                    {
                        goto PrintParams;
                    }

                    break;

                // NOTE: you have to call finalisation because it is not done for Scripts or comments
                // matched HTML comment, that's stuff between <!-- and -->
                case HTMLchunkType.Comment:

                    //Console.WriteLine("{0}",oChunk.GenerateHTML());

                    if (oP.bKeepRawHTML || oP.bAutoKeepComments)
                    {
                        // by default we won't finalise automatically as comments are often
                        // very lenghty and it is costly to create long strings when they are not
                        // needed, ie: during indexing of text
                        Console.Write("Comment: " + oChunk.oHTML);
                    }
                    else
                    {
                        // Even if raw HTML by default was not taken you can get it anyway by
                        // uncommenting next line
                        //oP.SetRawHTML(oChunk);

                        Console.Write("Comment: [ignored for performance reasons]");
                    }
                    break;

                // matched normal text
                case HTMLchunkType.Text:

                    // skip pure whitespace that we are not really interested in
                    if (oP.bCompressWhiteSpaceBeforeTag && oChunk.oHTML.Trim().Length == 0 && bReadLineDelay)
                    {
                        continue;
                    }

                    Console.Write("Text: '{0}'", oChunk.oHTML);
                    break;
                }
                ;

                if (bReadLineDelay)
                {
                    Console.ReadLine();
                }
                else
                {
                    Console.WriteLine("");
                }
            }
        }
        /// <summary>
        /// Loads an HTML file into a freshly configured parser and parses it the requested number
        /// of times, then reports the elapsed time on standard error. A single pass prints every
        /// chunk through ParseAndPrint; more than one pass runs the silent BenchMarkParse loop.
        /// </summary>
        /// <param name="iParseTimes">Number of times to parse document (useful for benchmarking)</param>
        /// <param name="sFileName">File to parse; when it does not exist as given it is looked up
        /// in the "tests" sub-directory of the current directory</param>
        void Start(int iParseTimes, string sFileName)
        {
            if (!File.Exists(sFileName))
            {
                sFileName = Path.Combine(Directory.GetCurrentDirectory(), "tests" + Path.DirectorySeparatorChar + sFileName);

                if (!File.Exists(sFileName))
                {
                    Console.WriteLine("Could not find file in current directory to parse - expected it to be here: " + sFileName);
                    return;
                }
            }

            HTMLparser oP = new HTMLparser();

            // everything after construction runs inside try/finally so the parser is closed
            // even when loading or parsing throws
            try
            {
                // This is optional, but if you want high performance then you may
                // want to set chunk hash mode to FALSE. Tag params are then added to the string
                // arrays sParams and sValues of HTMLchunk, with the number of params in iParams.
                //
                // When TRUE (the default) tag params are added to the hashtable HTMLchunk.oParams
                oP.SetChunkHashMode(false);

                // if set to true then the original parsed HTML for a given chunk is kept -
                // this reduces performance somewhat, but may be desirable where
                // reconstruction of HTML is necessary
                oP.bKeepRawHTML = false;

                // if set to true (it is false by default) entities are decoded: essential to get
                // the final representation of the data, but such strings must be entity-encoded
                // again before they are written back into HTML output
                oP.bDecodeEntities = true;

                // Mini Entities mode keeps most entities as is and only replaces stuff like &nbsp;
                // handy when the HTML has to be re-created after parsing
                oP.bDecodeMiniEntities = true;

                // only relevant when full entity decoding is off; with the settings above
                // this branch is not taken
                if (!oP.bDecodeEntities && oP.bDecodeMiniEntities)
                {
                    oP.InitMiniEntities();
                }

                // if true, for Comments and SCRIPT tags oHTML receives only the data BETWEEN the
                // tags rather than the complete raw HTML; only works with auto extraction enabled
                oP.bAutoExtractBetweenTagsOnly = true;

                // if true then comments will be extracted automatically
                oP.bAutoKeepComments = true;

                // if true then scripts will be extracted automatically
                oP.bAutoKeepScripts = true;

                // if true, whitespace before the start of a tag is compressed to a single " ";
                // set to false only when exact whitespace between tags is needed (slower)
                oP.bCompressWhiteSpaceBeforeTag = true;

                // if true (default) tags with attributes marked as CLOSED (/ at the end) are forced
                // to be treated as open tags - no good for XML parsing, kept for backwards compatibility
                oP.bAutoMarkClosedTagsWithParamsAsOpen = false;

                // load HTML from file
                oP.LoadFromFile(sFileName);

                // alternatively you can set HTML to be parsed as follows (bHTML is byte[] array containing data):
                // oP.Init(bHTML);

                // Stopwatch is monotonic, unlike DateTime.Now which shifts with clock adjustments
                System.Diagnostics.Stopwatch oTimer = System.Diagnostics.Stopwatch.StartNew();

                for (int i = 0; i < iParseTimes; i++)
                {
                    if (iParseTimes > 1)
                    {
                        BenchMarkParse(oP);
                    }
                    else
                    {
                        ParseAndPrint(oP);
                    }

                    // rewind so the next pass starts from the beginning of the data
                    oP.Reset();
                }

                oTimer.Stop();

                // whole milliseconds spent parsing
                int iMSecs = (int)oTimer.ElapsedMilliseconds;

                if (iMSecs > 0 && iParseTimes > 0)
                {
                    Console.Error.WriteLine("Parsed {0} time(s), total time {1:0.00} secs, ~{2:0.00} ms per full parse.", iParseTimes, iMSecs * 1.0 / 1000, iMSecs * 1.0 / iParseTimes);
                }
            }
            finally
            {
                oP.Close();
            }
        }
Exemple #17
0
        /// <summary>
        /// Parses an HTML string and returns detached copies of every chunk whose raw HTML
        /// is not pure whitespace.
        /// </summary>
        /// <param name="str">HTML text to parse</param>
        /// <returns>Copies (type, tag and raw HTML only) of the non-blank chunks, in the order
        /// the parser produced them</returns>
        private HTMLchunk[] htmlParse(string str)
        {
            // typed list: no boxing of the element type and no cast when building the result array
            System.Collections.Generic.List<HTMLchunk> ret = new System.Collections.Generic.List<HTMLchunk>();

            //init parser
            Majestic12.HTMLparser parser = new Majestic12.HTMLparser();

            try
            {
                //keep raw html because we need to reconstruct it
                parser.bKeepRawHTML = true;
                //keep text... this is for parsing just tags
                parser.bTextMode = true;
                //initialize to parse the string
                parser.Init(str);

                Majestic12.HTMLchunk chunk;

                // we parse until returned chunk is null indicating we reached end of parsing
                while ((chunk = parser.ParseNext()) != null)
                {
                    //discard empty blocks for performance increase
                    if (chunk.oHTML.Trim() != "")
                    {
                        // a new chunk object is created for every result entry; the strings are
                        // assigned directly because .NET strings are immutable, which makes the
                        // obsolete String.Copy unnecessary (it also throws on null)
                        HTMLchunk clone = new HTMLchunk(false);
                        clone.oHTML = chunk.oHTML;
                        clone.oType = chunk.oType;
                        clone.sTag = chunk.sTag;

                        ret.Add(clone);
                    }
                }
            }
            finally
            {
                // release parser state even when parsing throws
                parser.CleanUp();
            }

            return ret.ToArray();
        }
Exemple #18
0
        /// <summary>
        /// Drops the references held by this object. Only the first call does any work;
        /// later calls return immediately.
        /// </summary>
        /// <param name="bDisposing">Not used by this implementation</param>
        private void Dispose(bool bDisposing)
        {
            // already released - nothing left to do
            if(bDisposed)
                return;

            bDisposed=true;

            // let go of everything we were handed so it can be collected
            oP=null;
            oE=null;
            oChunk=null;
            sText=null;
            bHTML=null;
        }
Exemple #19
0
        /// <summary>
        /// Benchmark loop: pulls chunks from the parser until it returns null and does nothing
        /// with them, so only the parser itself is being measured.
        /// </summary>
        /// <param name="oP">Parser object</param>
        void BenchMarkParse(HTMLparser oP)
        {
            // the parser hands back the same HTMLchunk object on every call (it re-uses it),
            // so the token must not be destroyed before parsing has finished;
            // a null token signals the end of parsing
            for(HTMLchunk oToken=oP.ParseNext(); oToken!=null; oToken=oP.ParseNext())
            {
                switch(oToken.oType)
                {
                    case HTMLchunkType.OpenTag:     // open tag, ie <a href="">
                    case HTMLchunkType.CloseTag:    // close tag, ie </a>
                    case HTMLchunkType.Text:        // normal text
                    case HTMLchunkType.Comment:     // HTML comment, stuff between <!-- and -->
                        // intentionally no processing
                        break;
                }
            }
        }
Exemple #20
0
        /// <summary>
        /// Parses an HTML string chunk by chunk and feeds every chunk into an HtmlBuilder,
        /// returning whatever the builder renders.
        /// </summary>
        /// <param name="html">HTML text to parse; doctype matches are removed via dedoctype first</param>
        /// <returns>The result of HtmlBuilder.Render() after all chunks were processed</returns>
        public HtmlNode Parse(string html)
        {
            // Majestic12 doesn't support doctype
            html = dedoctype.Replace(html, "");
            var builder = new HtmlBuilder();

            // HTMLparser is IDisposable; the using block releases it even if the builder throws
            using (var parser = new HTMLparser())
            {
                parser.bDecodeEntities = false;
                // hash mode: tag params are exposed through chunk.oParams
                parser.SetChunkHashMode(true);
                parser.Init(html);

                HTMLchunk chunk;
                while ((chunk = parser.ParseNext()) != null)
                {
                    switch (chunk.oType)
                    {
                        case HTMLchunkType.OpenTag:
                            // if something goes wrong - ignore it
                            if (chunk.sTag != "")
                            {
                                builder.OpenTag(chunk.sTag, CollectAttributes(chunk));
                            }
                            break;
                        case HTMLchunkType.Comment:
                            builder.AddComment(chunk.oHTML);
                            break;
                        case HTMLchunkType.CloseTag:
                            // a close chunk flagged with bEndClosure is emitted as an open tag
                            // (with its attributes) immediately followed by the close tag
                            if (chunk.bEndClosure)
                            {
                                builder.OpenTag(chunk.sTag, CollectAttributes(chunk));
                            }
                            builder.CloseTag(chunk.sTag);
                            break;
                        case HTMLchunkType.Script:
                            builder.AddScript(chunk.oHTML);
                            break;
                        case HTMLchunkType.Text:
                            builder.AddText(chunk.oHTML);
                            break;
                        default:
                            break;
                    }
                }
            }

            return builder.Render();
        }

        /// <summary>
        /// Copies the params of a chunk (hash mode) into a new string dictionary.
        /// </summary>
        /// <param name="chunk">Chunk whose oParams entries are copied</param>
        /// <returns>A new dictionary; empty when the chunk reports no params</returns>
        private static Dictionary<string, string> CollectAttributes(HTMLchunk chunk)
        {
            var attributes = new Dictionary<string, string>();
            if (chunk.iParams != 0)
            {
                foreach (string name in chunk.oParams.Keys)
                {
                    attributes.Add(name, (string)chunk.oParams[name]);
                }
            }
            return attributes;
        }
Exemple #21
0
        /// <summary>
        /// Inits tag parser: stores the supplied objects and data buffer and derives the
        /// heuristics limit from the data length.
        /// </summary>
        /// <param name="p_oP">Parser reference to keep</param>
        /// <param name="p_oChunk">Chunk object to keep</param>
        /// <param name="p_sText">Text buffer to keep</param>
        /// <param name="p_bHTML">Byte array holding the HTML data</param>
        /// <param name="p_iDataLength">Length of the data; the heuristics limit is computed from it</param>
        /// <param name="p_oE">Entities object to keep</param>
        /// <param name="p_oHE">Heuristics object to keep</param>
        internal void Init(HTMLparser p_oP,HTMLchunk p_oChunk,DynaString p_sText,byte[] p_bHTML,int p_iDataLength,HTMLentities p_oE,HTMLheuristics p_oHE)
        {
            // collaborating objects
            oP=p_oP;
            oE=p_oE;
            oHE=p_oHE;

            // chunk and text buffer handed in by the caller
            oChunk=p_oChunk;
            sText=p_sText;

            // data buffer and its length
            bHTML=p_bHTML;
            iDataLength=p_iDataLength;

            // heuristics must stay MIN_DATA_SIZE_FOR_HEURISTICS short of the end of data
            iMaxHeuDataLength=iDataLength-MIN_DATA_SIZE_FOR_HEURISTICS;
        }
Exemple #22
0
        /// <summary>
        /// Starts parsing: loads the file into a newly configured parser, parses it the requested
        /// number of times and writes the timing to standard error. One pass goes through
        /// ParseAndPrint, several passes go through BenchMarkParse.
        /// </summary>
        /// <param name="iParseTimes">Number of times to parse document (useful for benchmarking)</param>
        /// <param name="sFileName">File to parse; falls back to the same name inside the "tests"
        /// sub-directory of the current directory when not found as given</param>
        void Start(int iParseTimes,string sFileName)
        {
            if(!File.Exists(sFileName))
            {
                sFileName=Path.Combine(Directory.GetCurrentDirectory(),"tests"+Path.DirectorySeparatorChar+sFileName);

                if(!File.Exists(sFileName))
                {
                    Console.WriteLine("Could not find file in current directory to parse - expected it to be here: "+sFileName);
                    return;
                }
            }

            HTMLparser oP=new HTMLparser();

            // try/finally guarantees the parser is closed when loading or parsing fails
            try
            {
                // Optional: FALSE chunk hash mode is faster - tag params then land in the string
                // arrays sParams/sValues of HTMLchunk with the count in iParams.
                // When TRUE (the default) tag params go into the hashtable HTMLchunk.oParams
                oP.SetChunkHashMode(false);

                // true keeps the original parsed HTML of each chunk - somewhat slower, but useful
                // where HTML has to be reconstructed
                oP.bKeepRawHTML=false;

                // true (false by default) decodes entities so strings hold the final representation
                // of the data; such strings need entity encoding again before going into output HTML
                oP.bDecodeEntities=true;

                // Mini Entities mode: most entities stay as is, only stuff like &nbsp; is replaced
                oP.bDecodeMiniEntities=true;

                // mini entities only need initialising when full decoding is off;
                // not taken with the settings above
                if(!oP.bDecodeEntities && oP.bDecodeMiniEntities)
                {
                    oP.InitMiniEntities();
                }

                // true: for Comments and SCRIPT tags oHTML gets only the data BETWEEN the tags;
                // this only works if auto extraction is enabled
                oP.bAutoExtractBetweenTagsOnly=true;

                // if true then comments will be extracted automatically
                oP.bAutoKeepComments=true;

                // if true then scripts will be extracted automatically
                oP.bAutoKeepScripts=true;

                // true compresses whitespace before a tag to a single " "; false returns the
                // full whitespace (slower) and is only worth it when exact whitespace matters
                oP.bCompressWhiteSpaceBeforeTag=true;

                // true (default) forces tags with attributes marked as CLOSED (/ at the end) to be
                // treated as open tags - no good for XML parsing, kept for backwards compatibility
                oP.bAutoMarkClosedTagsWithParamsAsOpen=false;

                // load HTML from file
                oP.LoadFromFile(sFileName);

                // alternatively you can set HTML to be parsed as follows (bHTML is byte[] array containing data):
                // oP.Init(bHTML);

                // monotonic timer; DateTime.Now differences are distorted by clock adjustments
                System.Diagnostics.Stopwatch oWatch=System.Diagnostics.Stopwatch.StartNew();

                for(int i=0; i<iParseTimes; i++)
                {
                    if(iParseTimes>1)
                    {
                        BenchMarkParse(oP);
                    }
                    else
                    {
                        ParseAndPrint(oP);
                    }

                    // rewind for the next pass
                    oP.Reset();
                }

                oWatch.Stop();

                // whole milliseconds we were parsing
                int iMSecs=(int)oWatch.ElapsedMilliseconds;

                if(iMSecs>0 && iParseTimes>0)
                {
                    Console.Error.WriteLine("Parsed {0} time(s), total time {1:0.00} secs, ~{2:0.00} ms per full parse.",iParseTimes,iMSecs*1.0/1000,iMSecs*1.0/iParseTimes);
                }
            }
            finally
            {
                oP.Close();
            }
        }
Exemple #23
0
        /// <summary>
        /// Async callback for a discovery request. Depending on the response it either hands an
        /// XRDS document to ProcessXrds, follows an X-XRDS-Location header, or scans the HTML for
        /// OpenID 2 link tags and an X-XRDS-Location meta tag. The outcome is reported through
        /// the callback carried in the async state.
        /// </summary>
        /// <param name="result">Async result whose AsyncState is an object[] holding the
        /// HttpWebRequest at index 0 and the Action&lt;ProviderDiscoveryData&gt; callback at index 1</param>
        static void GotDiscoveryPage(IAsyncResult result)
        {
            var stateObjects = (object[])result.AsyncState;
            var request = (HttpWebRequest)stateObjects[0];
            var callback = (Action<ProviderDiscoveryData>)stateObjects[1];

            bool isXrdsDocument = false;
            string xrdsLocationHeader = null;
            string body = null;

            // Everything that touches the network happens inside this try so that a failure while
            // reading is reported through the callback instead of escaping the async callback,
            // and so the response is closed on every path.
            try
            {
                var response = (HttpWebResponse)request.EndGetResponse(result);
                try
                {
                    // Media types are case-insensitive protocol tokens: compare ordinally.
                    isXrdsDocument = (response.ContentType ?? string.Empty)
                        .StartsWith("application/xrds+xml", StringComparison.OrdinalIgnoreCase);
                    xrdsLocationHeader = response.Headers["X-XRDS-Location"];

                    // The body is only needed for an XRDS document or when there is no
                    // location header to follow.
                    if (isXrdsDocument || string.IsNullOrEmpty(xrdsLocationHeader))
                    {
                        using (var reader = new StreamReader(response.GetResponseStream()))
                        {
                            body = reader.ReadToEnd();
                        }
                    }
                }
                finally
                {
                    response.Close();
                }
            }
            catch (Exception ex)
            {
                // Signal a failure.
                callback(new ProviderDiscoveryData { Success = false, FailureReason = ex });
                return;
            }

            // Do we have an XRDS document on our hands? Send it off for processing.
            if (isXrdsDocument)
            {
                ProcessXrds(body, callback);
                return;
            }

            // Look for a telling header: it says where the XRDS document lives.
            if (!string.IsNullOrEmpty(xrdsLocationHeader))
            {
                FollowXrdsLocation(xrdsLocationHeader, callback);
                return;
            }

            // So much for keeping it simple. Parse the HTML to figure out something about OpenID at this URL.
            string xrdsPointer = null;
            string openid2Provider = null;
            string openid2OpLocal = null;

            // HTMLparser is IDisposable; release it once the scan is done.
            using (var parser = new HTMLparser())
            {
                parser.SetChunkHashMode(false);
                parser.bDecodeEntities = true;
                parser.Init(body);

                // Go through every tag chunk and look for useful tags.
                HTMLchunk chunk;
                while ((chunk = parser.ParseNextTag()) != null)
                {
                    if (chunk.oType != HTMLchunkType.OpenTag && chunk.oType != HTMLchunkType.CloseTag) continue;

                    bool isMeta = chunk.sTag == "meta";
                    if (!isMeta && chunk.sTag != "link") continue;

                    // Convert the params to a dictionary, with keys being lowercase.
                    // ToLowerInvariant keeps the keys stable regardless of the current culture.
                    var attributes = new Dictionary<string, string>();
                    for (var i = 0; i < chunk.iParams; i++)
                        attributes[chunk.sParams[i].ToLowerInvariant().Trim()] = chunk.sValues[i];

                    if (isMeta)
                    {
                        // Do we have an XRDS pointer?
                        string httpEquiv, content;
                        if (attributes.TryGetValue("http-equiv", out httpEquiv) &&
                            attributes.TryGetValue("content", out content) &&
                            string.Equals(httpEquiv, "X-XRDS-Location", StringComparison.OrdinalIgnoreCase))
                            xrdsPointer = content;
                    }
                    else
                    {
                        string rel, href;
                        if (attributes.TryGetValue("rel", out rel) && rel != null &&
                            attributes.TryGetValue("href", out href))
                        {
                            // There are certain RELs we care about.
                            if (rel.Contains("openid2.provider")) openid2Provider = href;
                            else if (rel.Contains("openid2.local_id")) openid2OpLocal = href;
                        }
                    }
                }
            }

            // Do we have needed LINKs?
            if (openid2Provider != null)
            {
                // Yes we do! Signal success.
                callback(new ProviderDiscoveryData { Success = true, DiscoveredClaimedIdentifier = true, ProviderUri = openid2Provider, OpLocalIdentity = openid2OpLocal });
                return;
            }

            // Do we have an XRDS pointer? Retrieve that and point back to this function.
            if (xrdsPointer != null)
            {
                FollowXrdsLocation(xrdsPointer, callback);
                return;
            }

            // We got nothing :(
            callback(new ProviderDiscoveryData { Success = false, FailureReason = new Exception("Could not find OpenID endpoint.") });
        }

        /// <summary>
        /// Starts a follow-up discovery request for the given location and routes its response
        /// back to GotDiscoveryPage; signals failure through the callback when no request
        /// could be created.
        /// </summary>
        /// <param name="location">Location to request next</param>
        /// <param name="callback">Callback that receives the final discovery outcome</param>
        private static void FollowXrdsLocation(string location, Action<ProviderDiscoveryData> callback)
        {
            var newRequest = CreateDiscoveryWebRequest(location);
            if (newRequest == null)
            {
                // Signal a failure.
                callback(new ProviderDiscoveryData { Success = false });
                return;
            }
            newRequest.BeginGetResponse(GotDiscoveryPage, new object[] { newRequest, callback });
        }
Exemple #24
0
 /// <summary>
 /// Creates the wrapper around an existing HTMLparser instance.
 /// </summary>
 /// <param name="parser">Parser to store in <c>Parser</c>; it is not validated here</param>
 public HtmlParser(HTMLparser parser)
 {
     this.Parser = parser;
 }