ParseNext() public method

Parses the next chunk and returns it.
public ParseNext ( ) : HTMLchunk
return HTMLchunk
Esempio n. 1
0
        /// <summary>
        /// Benchmark driver: pulls every chunk out of the parser and deliberately does nothing with it,
        /// so that only the parsing work itself is exercised
        /// </summary>
        /// <param name="oP">Parser object to drain</param>
        void BenchMarkParse(HTMLparser oP)
        {
            // Per the original author's note the parser re-uses the chunk object it hands back,
            // so the chunk must not be destroyed before parsing is over.
            // ParseNext() returning null marks the end of the input.
            for (HTMLchunk oChunk = oP.ParseNext(); oChunk != null; oChunk = oP.ParseNext())
            {
                switch (oChunk.oType)
                {
                    case HTMLchunkType.OpenTag:     // open tag, ie <a href="">
                    case HTMLchunkType.CloseTag:    // close tag, ie </a>
                    case HTMLchunkType.Text:        // normal text
                    case HTMLchunkType.Comment:     // HTML comment, stuff between <!-- and -->
                        // intentionally nothing to do - this is a pure parsing benchmark
                        break;
                }
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Runs the shared <c>parser</c> to the end of its input and returns data taken from the
        /// last script chunk that matches the given criteria.
        /// </summary>
        /// <param name="pn">
        /// Name of the script tag parameter to match. When it is the empty string the script
        /// content is searched for <paramref name="pv"/> instead.
        /// </param>
        /// <param name="pv">
        /// Expected parameter value (non-empty <paramref name="pn"/>), or the marker text to look
        /// for inside the script content (empty <paramref name="pn"/>).
        /// </param>
        /// <returns>
        /// With an empty <paramref name="pn"/>: the script text that follows the first occurrence of
        /// <paramref name="pv"/>. Otherwise: the whole oHTML of the script whose parameter equals
        /// <paramref name="pv"/>. Null when no script chunk matched.
        /// </returns>
        static string getScrByID(string pn, string pv)
        {
            string rv = null;
            bool searchContent = (pn == "");
            HTMLchunk m12chunk;

            // The loop never stops early, so a later matching script overwrites an earlier one
            // and the parser is always fully consumed when this method returns.
            while ((m12chunk = parser.ParseNext()) != null)
            {
                // only script chunks are of interest, every other chunk type is skipped
                if (m12chunk.oType != HTMLchunkType.Script)
                {
                    continue;
                }

                if (searchContent)
                {
                    // Ordinal search: the marker is machine text, and a culture-sensitive IndexOf
                    // can miss or mis-locate it (for example "\n" inside "\r\n" on ICU-based runtimes).
                    int o = m12chunk.oHTML.IndexOf(pv, System.StringComparison.Ordinal);
                    if (o > -1)
                    {
                        rv = m12chunk.oHTML.Substring(o + pv.Length);
                    }
                }
                else if (m12chunk.GetParamValue(pn) == pv)
                {
                    rv = m12chunk.oHTML;
                }
            }

            return rv;
        }
Esempio n. 3
0
        /// <summary>
        /// Splits an HTML string into parser chunks and returns a separate copy of every chunk
        /// whose raw HTML is not blank.
        /// </summary>
        /// <param name="str">HTML text handed to the parser</param>
        /// <returns>
        /// Copies of the non-blank chunks in parse order; only oHTML, oType and sTag are carried over.
        /// </returns>
        private HTMLchunk[] htmlParse(string str)
        {
            // typed list instead of ArrayList: no cast is needed when building the result array
            System.Collections.Generic.List<HTMLchunk> ret = new System.Collections.Generic.List<HTMLchunk>();

            //init parser
            Majestic12.HTMLparser parser = new Majestic12.HTMLparser();

            //keep raw html because we need to reconstruct it
            parser.bKeepRawHTML = true;
            // text mode switched on as in the original setup ("this is for parsing just tags")
            parser.bTextMode = true;
            //initialize to parse the string
            parser.Init(str);

            try
            {
                Majestic12.HTMLchunk chunk;

                // we parse until returned chunk is null indicating we reached end of parsing
                while ((chunk = parser.ParseNext()) != null)
                {
                    // discard blank blocks; a missing oHTML is treated as blank instead of crashing
                    if (chunk.oHTML == null || chunk.oHTML.Trim().Length == 0)
                    {
                        continue;
                    }

                    // A fresh HTMLchunk is stored rather than the parser's own object
                    // (presumably because the parser re-uses its chunk instance - confirm).
                    // Strings are immutable, so plain assignment is already a safe copy and,
                    // unlike the obsolete String.Copy, it does not throw when sTag is null.
                    HTMLchunk clone = new HTMLchunk(false);
                    clone.oHTML = chunk.oHTML;
                    clone.oType = chunk.oType;
                    clone.sTag  = chunk.sTag;

                    ret.Add(clone);
                }
            }
            finally
            {
                // release parser resources even when parsing throws
                parser.CleanUp();
            }

            //return chunk array
            return ret.ToArray();
        }
Esempio n. 4
0
        /// <summary>
        /// Parses the given HTML and collects an independent copy of each chunk that contains
        /// something other than whitespace.
        /// </summary>
        /// <param name="str">HTML source to parse</param>
        /// <returns>
        /// Array of copied chunks (oHTML, oType and sTag only) in the order the parser produced them.
        /// </returns>
        private HTMLchunk[] htmlParse(string str)
        {
            // generic list replaces the untyped ArrayList of the original
            System.Collections.Generic.List<HTMLchunk> chunks = new System.Collections.Generic.List<HTMLchunk>();

            // parser setup: raw HTML is kept because the caller needs to reconstruct it,
            // text mode is enabled exactly as the original code did
            Majestic12.HTMLparser parser = new Majestic12.HTMLparser();
            parser.bKeepRawHTML = true;
            parser.bTextMode = true;
            parser.Init(str);

            try
            {
                // ParseNext() returns null once the end of the input is reached
                for (Majestic12.HTMLchunk chunk = parser.ParseNext(); chunk != null; chunk = parser.ParseNext())
                {
                    string raw = chunk.oHTML;

                    // skip empty / whitespace-only blocks; a null oHTML counts as empty
                    // rather than raising a NullReferenceException
                    if (raw == null || raw.Trim().Length == 0)
                    {
                        continue;
                    }

                    // Store a new HTMLchunk instead of the parser's instance (presumably the
                    // parser recycles that object - confirm). String.Copy is obsolete and
                    // unnecessary for immutable strings, and it throws on a null sTag.
                    HTMLchunk clone = new HTMLchunk(false);
                    clone.oHTML = raw;
                    clone.oType = chunk.oType;
                    clone.sTag = chunk.sTag;

                    chunks.Add(clone);
                }
            }
            finally
            {
                // always clean the parser up, including on failure
                parser.CleanUp();
            }

            return chunks.ToArray();
        }
Esempio n. 5
0
        /// <summary>
        /// Walks through the parser output chunk by chunk and writes a description of every chunk
        /// to the console; when bReadLineDelay is set it waits for ENTER after each chunk
        /// </summary>
        /// <param name="oP">Parser object</param>
        void ParseAndPrint(HTMLparser oP)
        {
            if (bReadLineDelay)
            {
                Console.WriteLine("Parsing HTML, will print each parsed chunk, press ENTER after each to continue");
            }

            // Per the original author's note the parser re-uses the chunk object it returns,
            // so it must not be destroyed until parsing is finished.
            // A null chunk means the end of parsing has been reached.
            for (HTMLchunk oChunk = oP.ParseNext(); oChunk != null; oChunk = oP.ParseNext())
            {
                HTMLchunkType eType = oChunk.oType;

                if (eType == HTMLchunkType.OpenTag)
                {
                    // open tag, ie <a href="">
                    Console.Write("Open tag: " + oChunk.sTag);

                    if (oChunk.bHashMode)
                    {
                        // hash mode: params and their values are read from oChunk.oParams
                        foreach (string sParam in oChunk.oParams.Keys)
                        {
                            string sValue = oChunk.oParams[sParam].ToString();

                            if (sValue.Length == 0)
                            {
                                Console.Write(" {0}", sParam);
                            }
                            else
                            {
                                Console.Write(" {0}='{1}'", sParam, sValue);
                            }
                        }
                    }
                    else
                    {
                        // array mode: the first iParams entries of sParams / sValues hold the data
                        int iParam = 0;
                        while (iParam < oChunk.iParams)
                        {
                            if (oChunk.sValues[iParam].Length == 0)
                            {
                                Console.Write(" {0}", oChunk.sParams[iParam]);
                            }
                            else
                            {
                                Console.Write(" {0}='{1}'", oChunk.sParams[iParam], oChunk.sValues[iParam]);
                            }

                            iParam++;
                        }
                    }
                }
                else if (eType == HTMLchunkType.CloseTag)
                {
                    // close tag, ie </a>
                    Console.Write("Closed tag: " + oChunk.sTag);
                }
                else if (eType == HTMLchunkType.Text)
                {
                    // normal text
                    Console.Write("Text: '{0}'", oChunk.oHTML);
                }
                else if (eType == HTMLchunkType.Comment)
                {
                    // HTML comment, stuff between <!-- and -->.
                    // The chunk is finalised here unless the parser was told to keep raw HTML.
                    if (!oP.bKeepRawHTML)
                    {
                        oChunk.Finalise();
                    }

                    Console.Write("Comment: " + oChunk.oHTML);
                }

                // either pause for ENTER or just terminate the current output line
                if (bReadLineDelay)
                {
                    Console.ReadLine();
                }
                else
                {
                    Console.WriteLine("");
                }
            }
        }
Esempio n. 6
0
        /// <summary>
        /// Prints every chunk produced by the parser to the console, including tag params, and
        /// waits for ENTER after each chunk when bReadLineDelay is set. META open tags (and any
        /// close tag or script chunk that carries params) are passed to HandleMetaEncoding.
        /// </summary>
        /// <param name="oP">Parser object</param>
        void ParseAndPrint(HTMLparser oP)
        {
            if (bReadLineDelay)
            {
                Console.WriteLine("Parsing HTML, will print each parsed chunk, press ENTER after each to continue");
            }

            // Tracks whether an encoding has been set already; it is handed by reference to
            // HandleMetaEncoding. The original notes explain that text seen BEFORE a META tag
            // (typically TITLE) may need re-encoding once the charset becomes known.
            bool bEncodingSet = false;

            // debug: start off with iso-8859-1
            oP.SetEncoding(System.Text.Encoding.GetEncoding("iso-8859-1"));

            // Per the original author's note the parser re-uses the chunk object it returns,
            // so it must not be destroyed until parsing is finished.
            while (true)
            {
                HTMLchunk oChunk = oP.ParseNext();

                // null chunk - end of parsing reached
                if (oChunk == null)
                {
                    break;
                }

                // set by the cases below when the params of this chunk have to be printed
                bool bPrintParams = false;

                switch (oChunk.oType)
                {
                    // open tag, ie <a href="">: params are always printed
                    case HTMLchunkType.OpenTag:
                        Console.Write("Open tag: " + oChunk.sTag);
                        bPrintParams = true;
                        break;

                    // close tag, ie </a>: params are printed only when there are any
                    case HTMLchunkType.CloseTag:
                        Console.Write("Closed tag: " + oChunk.sTag);
                        bPrintParams = oChunk.iParams > 0;
                        break;

                    // data between <script></script> tags
                    case HTMLchunkType.Script:
                        // raw HTML is requested explicitly when the parser does not keep it by itself
                        if (!(oP.bAutoKeepScripts || oP.bKeepRawHTML))
                        {
                            oP.SetRawHTML(oChunk);
                        }

                        if (oChunk.oHTML.Length > 0)
                        {
                            Console.Write("Script: " + oChunk.oHTML);
                        }
                        else
                        {
                            Console.Write("Script: [ignored for performance reasons]");
                        }

                        bPrintParams = oChunk.iParams > 0;
                        break;

                    // HTML comment, stuff between <!-- and -->
                    case HTMLchunkType.Comment:
                        if (oP.bKeepRawHTML || oP.bAutoKeepComments)
                        {
                            Console.Write("Comment: " + oChunk.oHTML);
                        }
                        else
                        {
                            // comment text was not kept; oP.SetRawHTML(oChunk) could be called
                            // here to obtain it anyway
                            Console.Write("Comment: [ignored for performance reasons]");
                        }
                        break;

                    // normal text
                    case HTMLchunkType.Text:
                        // pure whitespace is skipped entirely (no output, no ENTER prompt)
                        if (oP.bCompressWhiteSpaceBeforeTag && oChunk.oHTML.Trim().Length == 0 && bReadLineDelay)
                        {
                            continue;
                        }

                        Console.Write("Text: '{0}'", oChunk.oHTML);
                        break;
                }

                if (bPrintParams)
                {
                    // META tags hint at the encoding to use, so they go to the encoding handler first
                    if (oChunk.sTag.Length == 4 && oChunk.sTag == "meta")
                    {
                        HandleMetaEncoding(oP, oChunk, ref bEncodingSet);
                    }

                    // oChunk.GenerateParamsHTML() would do the printing below in one call; the
                    // long form is kept to demonstrate access to individual param values
                    if (oChunk.bHashMode)
                    {
                        // hash mode: params and values come from oChunk.oParams
                        foreach (string sParam in oChunk.oParams.Keys)
                        {
                            string sValue = oChunk.oParams[sParam].ToString();

                            if (sValue.Length == 0)
                            {
                                Console.Write(" {0}", sParam);
                            }
                            else
                            {
                                Console.Write(" {0}='{1}'", sParam, sValue);
                            }
                        }
                    }
                    else
                    {
                        // array mode: cParamChars holds the quote char that was used for each
                        // value, a space standing for "no quotes"
                        for (int i = 0; i < oChunk.iParams; i++)
                        {
                            if (oChunk.cParamChars[i] != (byte)' ')
                            {
                                Console.Write(" {0}={1}{2}{1}", oChunk.sParams[i], (char)oChunk.cParamChars[i], oChunk.sValues[i]);
                            }
                            else if (oChunk.sValues[i].Length == 0)
                            {
                                Console.Write(" {0}", oChunk.sParams[i]);
                            }
                            else
                            {
                                Console.Write(" {0}={1}", oChunk.sParams[i], oChunk.sValues[i]);
                            }
                        }
                    }
                }

                // either pause for ENTER or just terminate the current output line
                if (bReadLineDelay)
                {
                    Console.ReadLine();
                }
                else
                {
                    Console.WriteLine("");
                }
            }
        }
Esempio n. 7
0
        /// <summary>
        /// Parses HTML chunk by chunk and prints a description of each chunk (with its params) on
        /// screen; if bReadLineDelay is set, ENTER has to be pressed to advance to the next chunk.
        /// Chunks tagged "meta" whose params get printed are handed to HandleMetaEncoding.
        /// </summary>
        /// <param name="oP">Parser object</param>
        void ParseAndPrint(HTMLparser oP)
        {
            if(bReadLineDelay)
            {
                Console.WriteLine("Parsing HTML, will print each parsed chunk, press ENTER after each to continue");
            }

            // Remembers whether encoding was set; passed by ref to HandleMetaEncoding. According
            // to the original notes, text found before the META tag (usually TITLE) may have to
            // be re-encoded after the charset is discovered.
            bool bEncodingSet=false;

            // debug: default encoding
            oP.SetEncoding(System.Text.Encoding.GetEncoding("iso-8859-1"));

            // The original author warns that the parser re-uses the returned chunk object, so
            // it must stay alive until the end of parsing. Null chunk ends the loop.
            HTMLchunk oChunk;
            while((oChunk=oP.ParseNext())!=null)
            {
                HTMLchunkType eType=oChunk.oType;

                // true when this chunk's params must be printed after its headline
                bool bWithParams=false;

                if(eType==HTMLchunkType.OpenTag)
                {
                    // open tag, ie <a href="">, always followed by its params
                    Console.Write("Open tag: "+oChunk.sTag);
                    bWithParams=true;
                }
                else if(eType==HTMLchunkType.CloseTag)
                {
                    // close tag, ie </a>, params printed only if present
                    Console.Write("Closed tag: "+oChunk.sTag);
                    bWithParams=(oChunk.iParams>0);
                }
                else if(eType==HTMLchunkType.Script)
                {
                    // data between <script></script>; fetch raw HTML unless parser keeps it already
                    bool bAlreadyKept=oP.bAutoKeepScripts || oP.bKeepRawHTML;
                    if(!bAlreadyKept)
                    {
                        oP.SetRawHTML(oChunk);
                    }

                    string sScriptLine=(oChunk.oHTML.Length>0)
                        ? "Script: "+oChunk.oHTML
                        : "Script: [ignored for performance reasons]";
                    Console.Write(sScriptLine);

                    bWithParams=(oChunk.iParams>0);
                }
                else if(eType==HTMLchunkType.Comment)
                {
                    // HTML comment, stuff between <!-- and -->
                    if(oP.bKeepRawHTML || oP.bAutoKeepComments)
                    {
                        Console.Write("Comment: "+oChunk.oHTML);
                    }
                    else
                    {
                        // text not kept; calling oP.SetRawHTML(oChunk) here would retrieve it
                        Console.Write("Comment: [ignored for performance reasons]");
                    }
                }
                else if(eType==HTMLchunkType.Text)
                {
                    // whitespace-only text is skipped completely - nothing printed, no prompt
                    if(oP.bCompressWhiteSpaceBeforeTag && oChunk.oHTML.Trim().Length==0 && bReadLineDelay)
                    {
                        continue;
                    }

                    Console.Write("Text: '{0}'",oChunk.oHTML);
                }

                if(bWithParams)
                {
                    // keep an eye on META tags as they tell which encoding should be used
                    if(oChunk.sTag.Length==4 && oChunk.sTag=="meta")
                    {
                        HandleMetaEncoding(oP,oChunk,ref bEncodingSet);
                    }

                    // oChunk.GenerateParamsHTML() could replace the code below; the long form
                    // shows how individual param values are accessed
                    if(!oChunk.bHashMode)
                    {
                        // array mode: the quote char recorded in cParamChars is reproduced
                        // around each value, a space means the value had no quotes
                        int iParam=0;
                        while(iParam<oChunk.iParams)
                        {
                            if(oChunk.cParamChars[iParam]==(byte)' ')
                            {
                                if(oChunk.sValues[iParam].Length==0)
                                {
                                    Console.Write(" {0}",oChunk.sParams[iParam]);
                                }
                                else
                                {
                                    Console.Write(" {0}={1}",oChunk.sParams[iParam],oChunk.sValues[iParam]);
                                }
                            }
                            else
                            {
                                Console.Write(" {0}={1}{2}{1}",oChunk.sParams[iParam],(char)oChunk.cParamChars[iParam],oChunk.sValues[iParam]);
                            }

                            iParam++;
                        }
                    }
                    else
                    {
                        // hash mode: iterate over the keys of oChunk.oParams
                        foreach(string sParam in oChunk.oParams.Keys)
                        {
                            string sValue=oChunk.oParams[sParam].ToString();

                            if(sValue.Length>0)
                            {
                                Console.Write(" {0}='{1}'",sParam,sValue);
                            }
                            else
                            {
                                Console.Write(" {0}",sParam);
                            }
                        }
                    }
                }

                // wait for ENTER, or just finish the current output line
                if(bReadLineDelay)
                {
                    Console.ReadLine();
                }
                else
                {
                    Console.WriteLine("");
                }
            }
        }
Esempio n. 8
0
        /// <summary>
        /// Runs the parser over its whole input without processing the chunks in any way - used
        /// to measure the HTML parsing object on its own
        /// </summary>
        /// <param name="oP">Parser object</param>
        void BenchMarkParse(HTMLparser oP)
        {
            // The original author warns that HTMLparser re-uses the chunk object it returns,
            // therefore it must not be destroyed until the end of parsing.
            while(true)
            {
                HTMLchunk oChunk=oP.ParseNext();

                // null chunk indicates we reached end of parsing
                if(oChunk==null)
                {
                    return;
                }

                switch(oChunk.oType)
                {
                    // open tag (ie <a href="">), close tag (ie </a>), normal text and HTML
                    // comment (between <!-- and -->) are recognised, none of them is processed
                    case HTMLchunkType.OpenTag:
                    case HTMLchunkType.CloseTag:
                    case HTMLchunkType.Text:
                    case HTMLchunkType.Comment:
                        break;
                }
            }
        }
Esempio n. 9
0
        /// <summary>
        /// Feeds the chunks produced by the Majestic12 parser into an HtmlBuilder and returns
        /// what the builder renders.
        /// </summary>
        /// <param name="html">HTML text to parse</param>
        /// <returns>Result of <c>builder.Render()</c> after all chunks were processed</returns>
        public HtmlNode Parse(string html)
        {
            // The original author notes that Majestic12 doesn't support doctype, so whatever
            // dedoctype matches is removed from the input first.
            html = dedoctype.Replace(html, "");
            var builder = new HtmlBuilder();

            var parser = new HTMLparser();
            parser.bDecodeEntities = false;
            parser.SetChunkHashMode(true);
            parser.Init(html);

            // ParseNext() yields null once the input is exhausted
            for (var chunk = parser.ParseNext(); chunk != null; chunk = parser.ParseNext())
            {
                switch (chunk.oType)
                {
                    case HTMLchunkType.OpenTag:
                        // an empty tag name means something went wrong - ignore the chunk
                        if (chunk.sTag != "")
                        {
                            var openAttributes = CollectChunkAttributes(chunk);
                            builder.OpenTag(chunk.sTag, openAttributes);
                        }
                        break;

                    case HTMLchunkType.CloseTag:
                        // a chunk flagged with bEndClosure is reported to the builder as an
                        // open tag immediately followed by its close tag
                        if (chunk.bEndClosure)
                        {
                            var closureAttributes = CollectChunkAttributes(chunk);
                            builder.OpenTag(chunk.sTag, closureAttributes);
                        }
                        builder.CloseTag(chunk.sTag);
                        break;

                    case HTMLchunkType.Comment:
                        builder.AddComment(chunk.oHTML);
                        break;

                    case HTMLchunkType.Script:
                        builder.AddScript(chunk.oHTML);
                        break;

                    case HTMLchunkType.Text:
                        builder.AddText(chunk.oHTML);
                        break;

                    default:
                        break;
                }
            }

            return builder.Render();
        }

        /// <summary>
        /// Copies the params of a chunk from its oParams table into a new string dictionary.
        /// </summary>
        /// <param name="chunk">Chunk whose params are read</param>
        /// <returns>New dictionary; it stays empty when chunk.iParams is 0</returns>
        private static Dictionary<string, string> CollectChunkAttributes(HTMLchunk chunk)
        {
            var attributes = new Dictionary<string, string>();

            // oParams is only consulted when the chunk reports at least one param
            if (chunk.iParams == 0)
            {
                return attributes;
            }

            foreach (string name in chunk.oParams.Keys)
            {
                attributes.Add(name, (string)chunk.oParams[name]);
            }

            return attributes;
        }