LoadFromFile() public method

Loads the HTML to be parsed from the given file.
public LoadFromFile ( string sFileName ) : void
sFileName string Full filename
return void
Ejemplo n.º 1
0
        /// <summary>
        /// Parses the sample document "tests/majestic12.html" (looked up under the current
        /// directory) the requested number of times and prints timing statistics to the console.
        /// </summary>
        /// <param name="iParseTimes">Number of times to parse document (useful for benchmarking).
        /// When greater than 1 every pass goes through BenchMarkParse, otherwise ParseAndPrint is used.</param>
        void Start(int iParseTimes)
        {
            string sFileName = Path.Combine(Directory.GetCurrentDirectory(), "tests" + Path.DirectorySeparatorChar + "majestic12.html");

            if (!File.Exists(sFileName))
            {
                Console.WriteLine("Could not find file in current directory to parse - expected it to be here: " + sFileName);
                return;
            }

            HTMLparser oP = new HTMLparser();

            try
            {
                // This is optional, but if you want high performance then you may
                // want to set chunk hash mode to FALSE. This would result in tag params
                // being added to string arrays in HTMLchunk object called sParams and sValues, with number
                // of actual params being in iParams.
                //
                // When TRUE (and its default) tag params will be added to hashtable HTMLchunk (object).oParams
                oP.SetChunkHashMode(false);

                // if you set this to true then original parsed HTML for given chunk will be kept -
                // this will reduce performance somewhat, but may be desirable in some cases where
                // reconstruction of HTML may be necessary
                oP.bKeepRawHTML = false;

                // load HTML from file
                oP.LoadFromFile(sFileName);

                // Stopwatch is monotonic, unlike DateTime.Now which shifts with wall-clock adjustments;
                // fully qualified so no extra using directive is needed
                System.Diagnostics.Stopwatch oTimer = System.Diagnostics.Stopwatch.StartNew();

                // a single pass prints the parsed chunks, multiple passes are treated as a benchmark
                bool bBenchmark = iParseTimes > 1;

                for (int i = 0; i < iParseTimes; i++)
                {
                    if (bBenchmark)
                    {
                        BenchMarkParse(oP);
                    }
                    else
                    {
                        ParseAndPrint(oP);
                    }

                    // rewind the parser so that the next pass starts over
                    oP.Reset();
                }

                oTimer.Stop();

                // number of whole milliseconds we were parsing
                long iMSecs = oTimer.ElapsedMilliseconds;

                if (iMSecs > 0 && iParseTimes > 0)
                {
                    // floating point division: integer division used to truncate both figures
                    // (e.g. a 900 ms run was reported as "0 secs")
                    Console.WriteLine("Parsed {0} time(s), total time {1:0.00} secs, approximately {2:0.00} ms per full parse.", iParseTimes, iMSecs / 1000.0, (double)iMSecs / iParseTimes);
                }
            }
            finally
            {
                // close the parser even if loading or parsing threw
                oP.Close();
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Configures an HTML parser, loads the given file into it and parses it the requested
        /// number of times, reporting timing statistics on the standard error stream.
        /// </summary>
        /// <param name="iParseTimes">Number of times to parse document (useful for benchmarking).
        /// When greater than 1 every pass goes through BenchMarkParse, otherwise ParseAndPrint is used.</param>
        /// <param name="sFileName">File to parse. If it does not exist as given, it is looked up
        /// in the "tests" subdirectory of the current directory.</param>
        void Start(int iParseTimes, string sFileName)
        {
            if (!File.Exists(sFileName))
            {
                // fall back to <current directory>/tests/<sFileName>
                sFileName = Path.Combine(Directory.GetCurrentDirectory(), "tests" + Path.DirectorySeparatorChar + sFileName);

                if (!File.Exists(sFileName))
                {
                    Console.WriteLine("Could not find file in current directory to parse - expected it to be here: " + sFileName);
                    return;
                }
            }

            HTMLparser oP = new HTMLparser();

            try
            {
                // This is optional, but if you want high performance then you may
                // want to set chunk hash mode to FALSE. This would result in tag params
                // being added to string arrays in HTMLchunk object called sParams and sValues, with number
                // of actual params being in iParams.
                //
                // When TRUE (and its default) tag params will be added to hashtable HTMLchunk (object).oParams
                oP.SetChunkHashMode(false);

                // if you set this to true then original parsed HTML for given chunk will be kept -
                // this will reduce performance somewhat, but may be desirable in some cases where
                // reconstruction of HTML may be necessary
                oP.bKeepRawHTML = false;

                // if set to true (it is false by default), then entities will be decoded: this is essential
                // if you want to get strings that contain final representation of the data in HTML, however
                // you should be aware that if you want to use such strings into output HTML string then you will
                // need to do Entity encoding or same string may fail later
                oP.bDecodeEntities = true;

                // we have option to keep most entities as is - only replace stuff like &nbsp;
                // this is called Mini Entities mode - it is handy when HTML will need
                // to be re-created after it was parsed, though in this case really
                // entities should not be parsed at all
                oP.bDecodeMiniEntities = true;

                // with bDecodeEntities set to true above this call is skipped; it only runs when the
                // flags above are edited so that full decoding is off and mini entities mode is on
                if (!oP.bDecodeEntities && oP.bDecodeMiniEntities)
                {
                    oP.InitMiniEntities();
                }

                // if set to true, then in case of Comments and SCRIPT tags the data set to oHTML will be
                // extracted BETWEEN those tags, rather than include complete RAW HTML that includes tags too
                // this only works if auto extraction is enabled
                oP.bAutoExtractBetweenTagsOnly = true;

                // if true then comments will be extracted automatically
                oP.bAutoKeepComments = true;

                // if true then scripts will be extracted automatically:
                oP.bAutoKeepScripts = true;

                // if this option is true then whitespace before start of tag will be compressed to single
                // space character in string: " ", if false then full whitespace before tag will be returned (slower)
                // you may only want to set it to false if you want exact whitespace between tags, otherwise it is just
                // a waste of CPU cycles
                oP.bCompressWhiteSpaceBeforeTag = true;

                // if true (default) then tags with attributes marked as CLOSED (/ at the end) will be automatically
                // forced to be considered as open tags - this is no good for XML parsing, but I keep it for backwards
                // compatibility for my stuff as it makes it easier to avoid checking for same tag which is both closed
                // or open
                oP.bAutoMarkClosedTagsWithParamsAsOpen = false;

                // load HTML from file
                oP.LoadFromFile(sFileName);

                // alternatively you can set HTML to be parsed as follows (bHTML is byte[] array containing data):
                // oP.Init(bHTML);

                // Stopwatch is monotonic, unlike DateTime.Now which shifts with wall-clock adjustments;
                // fully qualified so no extra using directive is needed
                System.Diagnostics.Stopwatch oTimer = System.Diagnostics.Stopwatch.StartNew();

                for (int i = 0; i < iParseTimes; i++)
                {
                    // a single pass prints the parsed chunks, multiple passes are treated as a benchmark
                    if (iParseTimes > 1)
                    {
                        BenchMarkParse(oP);
                    }
                    else
                    {
                        ParseAndPrint(oP);
                    }

                    // rewind the parser so that the next pass starts over
                    oP.Reset();
                }

                oTimer.Stop();

                // number of whole milliseconds we were parsing
                long iMSecs = oTimer.ElapsedMilliseconds;

                if (iMSecs > 0 && iParseTimes > 0)
                {
                    Console.Error.WriteLine("Parsed {0} time(s), total time {1:0.00} secs, ~{2:0.00} ms per full parse.", iParseTimes, iMSecs * 1.0 / 1000, iMSecs * 1.0 / iParseTimes);
                }
            }
            finally
            {
                // close the parser even if loading or parsing threw
                oP.Close();
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Sets up an HTML parser with the demo options, feeds it the requested file and runs
        /// the parse loop, writing a timing summary to the standard error stream at the end.
        /// </summary>
        /// <param name="iParseTimes">Number of times to parse document (useful for benchmarking).
        /// A value above 1 selects BenchMarkParse for every pass; otherwise ParseAndPrint is called.</param>
        /// <param name="sFileName">Name of the file to parse. When no such file exists, the name
        /// is retried relative to the "tests" folder below the current directory.</param>
        void Start(int iParseTimes,string sFileName)
        {
            if(!File.Exists(sFileName))
            {
                // second attempt: <current directory>/tests/<sFileName>
                sFileName=Path.Combine(Directory.GetCurrentDirectory(),"tests"+Path.DirectorySeparatorChar+sFileName);

                if(!File.Exists(sFileName))
                {
                    Console.WriteLine("Could not find file in current directory to parse - expected it to be here: "+sFileName);
                    return;
                }
            }

            HTMLparser oP=new HTMLparser();

            try
            {
                // This is optional, but if you want high performance then you may
                // want to set chunk hash mode to FALSE. This would result in tag params
                // being added to string arrays in HTMLchunk object called sParams and sValues, with number
                // of actual params being in iParams.
                //
                // When TRUE (and its default) tag params will be added to hashtable HTMLchunk (object).oParams
                oP.SetChunkHashMode(false);

                // if you set this to true then original parsed HTML for given chunk will be kept -
                // this will reduce performance somewhat, but may be desirable in some cases where
                // reconstruction of HTML may be necessary
                oP.bKeepRawHTML=false;

                // if set to true (it is false by default), then entities will be decoded: this is essential
                // if you want to get strings that contain final representation of the data in HTML, however
                // you should be aware that if you want to use such strings into output HTML string then you will
                // need to do Entity encoding or same string may fail later
                oP.bDecodeEntities=true;

                // we have option to keep most entities as is - only replace stuff like &nbsp;
                // this is called Mini Entities mode - it is handy when HTML will need
                // to be re-created after it was parsed, though in this case really
                // entities should not be parsed at all
                oP.bDecodeMiniEntities=true;

                // not reached with the two flags as set above (full decoding is on); the check only
                // matters once bDecodeEntities is switched off while mini entities mode stays on
                if(!oP.bDecodeEntities && oP.bDecodeMiniEntities)
                {
                    oP.InitMiniEntities();
                }

                // if set to true, then in case of Comments and SCRIPT tags the data set to oHTML will be
                // extracted BETWEEN those tags, rather than include complete RAW HTML that includes tags too
                // this only works if auto extraction is enabled
                oP.bAutoExtractBetweenTagsOnly=true;

                // if true then comments will be extracted automatically
                oP.bAutoKeepComments=true;

                // if true then scripts will be extracted automatically:
                oP.bAutoKeepScripts=true;

                // if this option is true then whitespace before start of tag will be compressed to single
                // space character in string: " ", if false then full whitespace before tag will be returned (slower)
                // you may only want to set it to false if you want exact whitespace between tags, otherwise it is just
                // a waste of CPU cycles
                oP.bCompressWhiteSpaceBeforeTag=true;

                // if true (default) then tags with attributes marked as CLOSED (/ at the end) will be automatically
                // forced to be considered as open tags - this is no good for XML parsing, but I keep it for backwards
                // compatibility for my stuff as it makes it easier to avoid checking for same tag which is both closed
                // or open
                oP.bAutoMarkClosedTagsWithParamsAsOpen=false;

                // load HTML from file
                oP.LoadFromFile(sFileName);

                // alternatively you can set HTML to be parsed as follows (bHTML is byte[] array containing data):
                // oP.Init(bHTML);

                // measure with a Stopwatch rather than DateTime.Now, whose readings move with
                // wall-clock changes; the type is fully qualified to avoid a new using directive
                System.Diagnostics.Stopwatch oWatch=new System.Diagnostics.Stopwatch();
                oWatch.Start();

                for(int i=0; i<iParseTimes; i++)
                {
                    // more than one requested pass means we are benchmarking, so nothing is printed
                    if(iParseTimes>1)
                    {
                        BenchMarkParse(oP);
                    }
                    else
                    {
                        ParseAndPrint(oP);
                    }

                    // put the parser back to the start of the document for the next pass
                    oP.Reset();
                }

                oWatch.Stop();

                // whole milliseconds spent in the parse loop
                long iMSecs=oWatch.ElapsedMilliseconds;

                if(iMSecs>0 && iParseTimes>0)
                {
                    Console.Error.WriteLine("Parsed {0} time(s), total time {1:0.00} secs, ~{2:0.00} ms per full parse.",iParseTimes,iMSecs*1.0/1000,iMSecs*1.0/iParseTimes);
                }
            }
            finally
            {
                // make sure the parser is closed on every exit path, including exceptions
                oP.Close();
            }
        }