A StreamTokenizer similar to Java's. It breaks an input stream (read from a TextReader) into Tokens based on various settings. The settings are stored in the Settings property, a StreamTokenizerSettings instance.

The tokenizer is configurable: you can modify the Settings.CharTypes[] array to specify which characters belong to which type, along with other settings such as whether or not to look for comments.
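
For instance, here is a minimal sketch of marking the underscore as a Word character so that identifiers like my_var come back as single tokens. The CharTypeBits.Word flag name and the byte element type are assumptions; check the StreamTokenizerSettings source for the actual declarations:

    StreamTokenizer tokenizer = new StreamTokenizer();
    // Assumed flag name (CharTypeBits.Word) and element type (byte);
    // adjust to match the actual CharTypes declaration.
    tokenizer.Settings.CharTypes['_'] |= (byte)CharTypeBits.Word;
    List<Token> tokens = new List<Token>();
    tokenizer.TokenizeString("my_var = 1", tokens);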

WARNING: This is not internationalized. All characters beyond the 7-bit ASCII range (decimal 127) are treated as Word characters.
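
For example, since 'é' (decimal 233) is beyond the 7-bit range, it is treated as a Word character, so a string like "café" tokenizes as a single Word token. A quick sketch:

    StreamTokenizer tokenizer = new StreamTokenizer();
    List<Token> tokens = new List<Token>();
    tokenizer.TokenizeString("café", tokens);
    // Expect a single Word token containing "café", since 'é'
    // is treated as a Word character.
    foreach (Token t in tokens) Console.WriteLine("t = {0}", t);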

There are two main ways to use this: 1) parse the entire stream at once and get a List of Tokens (see the Tokenize* methods), or 2) call NextToken() repeatedly. The tokenizer reads from a TextReader, which you can set directly, and it also provides convenience methods for parsing files and strings. NextToken() returns an Eof token when the end of the input is reached.

Here's an example of the NextToken() style of use:

    StreamTokenizer tokenizer = new StreamTokenizer();
    tokenizer.Settings.GrabWhitespace = true;
    tokenizer.Verbosity = VerbosityLevel.Debug; // just for debugging
    tokenizer.TextReader = File.OpenText(fileName);
    Token token;
    while (tokenizer.NextToken(out token)) log.Info("Token = '{0}'", token);

Here's an example of the Tokenize... style of use:

    StreamTokenizer tokenizer = new StreamTokenizer("some string");
    List<Token> tokens = new List<Token>();
    if (!tokenizer.Tokenize(tokens))
    {
        // error handling
    }
    foreach (Token t in tokens) Console.WriteLine("t = {0}", t);

Comment delimiters are hardcoded (// and /*); they are not affected by the character type table.
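
Here's a hedged sketch of pulling comments out as tokens; the GrabComments property name is an assumption patterned after GrabWhitespace, so verify it against StreamTokenizerSettings:

    StreamTokenizer tokenizer = new StreamTokenizer();
    tokenizer.Settings.GrabComments = true; // assumed property name
    List<Token> tokens = new List<Token>();
    tokenizer.TokenizeString("x = 1; // trailing comment", tokens);
    foreach (Token t in tokens)
        Console.WriteLine("{0}: '{1}'", t.GetType().Name, t);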

The tokenizer sets line numbers in the tokens it produces. These are normally the line on which the token starts. There is one known caveat: when the GrabWhitespace setting is true and a whitespace token contains a newline, that token's line number is set to the following line rather than the line on which the token started.
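
The sketch below illustrates both the normal case and the caveat; the Token.LineNumber property name is an assumption:

    StreamTokenizer tokenizer = new StreamTokenizer();
    tokenizer.Settings.GrabWhitespace = true;
    tokenizer.TextReader = new StringReader("a\nb");
    Token token;
    while (tokenizer.NextToken(out token))
    {
        // The whitespace token containing the '\n' reports the
        // following line (2), per the caveat above; 'a' reports
        // line 1 and 'b' reports line 2.
        Console.WriteLine("line {0}: '{1}'", token.LineNumber, token);
    }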

Example #1
        /// <summary>
        /// Use the supplied tokenizer to tokenize the specified stream
        /// and time it.
        /// </summary>
        /// <param name="tokenizer"></param>
        /// <param name="stream"></param>
        /// <returns>Total milliseconds per parse.</returns>
        protected static double SpeedTestParse(StreamTokenizer tokenizer,
            Stream stream)
        {
            GC.Collect();
            List<Token> tokens = new List<Token>();
            DateTime start = System.DateTime.UtcNow;
            int cycles = 100;
            for (int i = 0; i < cycles; i++)
            {
                tokenizer.TokenizeStream(stream, tokens);
                stream.Position = 0;
            }
            TimeSpan duration = System.DateTime.UtcNow - start;

            return (duration.TotalMilliseconds / cycles);
        }
Example #2
        /// <summary>
        /// Speed test.  This tests the speed of the parse.
        /// </summary>
        /// <returns>bool - true for ran, false for failed to run.</returns>
        public static bool SpeedTest()
        {
            Logger log = new Logger("SpeedTest");
            log.Verbosity = VerbosityLevel.Debug;
            log.Info("Starting...");
            Random rand = new Random(0);

            // setup tokenizer
            StreamTokenizer tokenizer = new StreamTokenizer();
            tokenizer.Settings.ParseNumbers = true;

            int nTokens = 1024;
            MemoryStream ms;
            StreamWriter writer;

            // int
            ms = new MemoryStream();
            writer = new StreamWriter(ms);
            for (int i = 0; i < nTokens; i++)
            {
                writer.WriteLine("{0}", (int)(rand.NextDouble() * 256));
            }
            writer.Flush();
            ms.Position = 0;

            Console.WriteLine("Parse {0} integers took {1:f2} ms", nTokens,
                SpeedTestParse(tokenizer, ms));

            // float
            ms = new MemoryStream();
            writer = new StreamWriter(ms);
            for (int i = 0; i < nTokens; i++)
            {
                writer.WriteLine("{0:f9}", rand.NextDouble() * 10);
            }
            writer.Flush();
            ms.Position = 0;

            Console.WriteLine("Parse {0} floats took {1:f2} ms", nTokens,
                SpeedTestParse(tokenizer, ms));

            // exponential
            ms = new MemoryStream();
            writer = new StreamWriter(ms);
            for (int i = 0; i < nTokens; i++)
            {
                writer.WriteLine("{0:e9}", rand.NextDouble() * 1000);
            }
            writer.Flush();
            ms.Position = 0;

            Console.WriteLine("Parse {0} exponential floats took {1:f2} ms", nTokens,
                SpeedTestParse(tokenizer, ms));

            // words
            ms = new MemoryStream();
            writer = new StreamWriter(ms);
            for (int i = 0; i < nTokens; i++)
            {
                writer.WriteLine("foo ");
            }
            writer.Flush();
            ms.Position = 0;

            Console.WriteLine("Parse {0} words took {1:f2} ms", nTokens,
                SpeedTestParse(tokenizer, ms));

            // hex
            ms = new MemoryStream();
            writer = new StreamWriter(ms);
            for (int i = 0; i < nTokens; i++)
            {
                writer.WriteLine("0x{0:x}", (int)(rand.NextDouble() * 256));
            }
            writer.Flush();
            ms.Position = 0;

            Console.WriteLine("Parse {0} hex numbers took {1:f2} ms", nTokens,
                SpeedTestParse(tokenizer, ms));

            //			Console.WriteLine("Buffer to parse is:");
            //			Console.WriteLine("{0}", Encoding.ASCII.GetString(ms.GetBuffer()));

            return (true);
        }
Example #3
        // ---------------------------------------------------------------------
        #region TestSelf
        // ---------------------------------------------------------------------
        /// <summary>
        /// Simple self test.  See StreamTokenizerTestCase for full
        /// tests.
        /// </summary>
        /// <returns>bool - true for success, false for failure.</returns>
        public static bool TestSelf()
        {
            Logger log = new Logger("testSelf");
            log.Verbosity = VerbosityLevel.Debug;
            log.Info("Starting...");
            string testString;
            List<Token> tokens;

            // setup tokenizer
            StreamTokenizer tokenizer = new StreamTokenizer();
            tokenizer.Settings.SetupForCodeParse();
            tokenizer.Verbosity = VerbosityLevel.Debug;

            //
            // try string parse
            //
            log.Write("--------------------------------------------------------\n");
            log.Info("string parse:");
            log.Write("--------------------------------------------------------\n");
            tokens = new List<Token>();
            testString = "-1.2ej";
            tokenizer.Settings.DoUntermCheck = false;
            tokenizer.Settings.GrabWhitespace = false;

            if (!tokenizer.TokenizeString(testString, tokens))
            {
                log.Error("Unable to parse into token vector.");
                return (false);
            }

            foreach (Token t in tokens) log.Info("Token = '{0}'", t.ToString());
            tokens = new List<Token>();

            //
            // try NextToken style
            //
            //			log.Write("--------------------------------------------------------\n");
            //			log.Info("NextToken use");
            //			log.Write("--------------------------------------------------------\n");
            //string fileName = "st-testSelf.tmp";
            //testString = "this is a simple string";
            //tokenizer.TextReader = new StringReader(testString);
            //tokenizer.TextReader = File.OpenText(fileName);
            //Token token;
            //while (tokenizer.NextToken(out token)) log.Info("Token = '{0}'", token);

            //
            // try TokenizeFile
            //
            log.Write("--------------------------------------------------------\n");
            log.Info("Tokenize missing file");
            log.Write("--------------------------------------------------------\n");
            string nonExistentFileName = "ThisFile better not exist";
            bool caughtIt = false;
            try
            {
                tokenizer.TokenizeFile(nonExistentFileName);
            }
            catch (FileNotFoundException e)
            {
                log.Info("Correctly caught exception: {0}: {1}", e.GetType().ToString(), e.Message);
                caughtIt = true;
            }
            if (!caughtIt)
            {
                log.Error("Didn't get a file not found exception from TokenizeFile.");
                return (false);
            }

            //
            // test line numbers in tokens
            //

            // done
            log.Info("Done.");
            return (true);
        }