示例#1
0
		/// <summary>
		/// Simple self test of some ParseUtil methods: FindMatch, followed by
		/// several BuildArray variants (shorts, terminator-delimited floats,
		/// high-precision floats, maxLength-limited chars and hex ushorts).
		/// Results are reported through a Logger; the first failed check
		/// aborts the test.
		/// </summary>
		/// <returns>bool - true for all passed, false otherwise</returns>
		public static bool TestSelf()
		{
			Logger log = new Logger("ParseUtil: TestSelf");
			log.Info("Starting...");

			StreamTokenizer tokenizer = new StreamTokenizer();
			tokenizer.Verbosity = VerbosityLevel.Warn;

			//
			// FindMatch: starting at the opening '{' (index 0), the matching
			// '}' is expected at token index 4; the quoted '}' must not count.
			//
			ArrayList alist = new ArrayList();
			tokenizer.TokenizeString("{ [ ] '}' }", alist);
			foreach (Token t in alist)
			{
				log.Debug("Token = {0}", t);
			}

			Token[] tarray = (Token[])alist.ToArray(typeof(Token));
			int i = 0;
			if (!FindMatch(tarray, ref i, '{'))
			{
				log.Error("FindMatch failed to match { char");
				return false;
			}

			if (i != 4)
			{
				log.Error("FindMatch got the wrong answer {0}", i);
				return false;
			}
			log.Info("FindMatch worked.");

			//
			// try BuildArray
			//
			ArrayList tokens = new ArrayList();
			tokenizer.TokenizeString("1 2 3 4 5", tokens);
			foreach (Token t in tokens)
			{
				log.Debug("Token = {0}", t);
			}

			i = 0;
			Int16[] shorts = (short[])ParseUtil.BuildArray(tokens, ref i, typeof(Int16), null,
				-1, log);

			if (shorts == null)
			{
				log.Error("Unable to BuildArray of shorts.");
				return false;
			}

			log.Info("Parsed shorts:");
			foreach (Int16 s in shorts)
			{
				log.Write("{0}, ", s);
			}
			log.WriteLine("");

			//
			// try BuildArray of floats, char terminated
			//
			tokens.Clear();
			tokenizer.TokenizeString("1 2 ; 3 4", tokens);
			foreach (Token t in tokens)
			{
				log.Debug("Token = {0}", t);
			}

			i = 0;
			Single[] floats = (float[])ParseUtil.BuildArray(tokens, ref i, typeof(Single),
				new CharToken(';'), -1, log);

			if (floats == null)
			{
				log.Error("Unable to BuildArray of floats.");
				return false;
			}

			log.Info("Parsed floats:");
			foreach (float f in floats)
			{
				log.Write("{0}, ", f);
			}
			log.WriteLine("");

			// The index is expected to stop on the ';' terminator (token 2).
			if (i != 2)
			{
				// Fixed: the format string has a {0} placeholder, so the
				// offending index has to be passed as an argument.
				log.Error("BuildArray left i = {0} which is incorrect", i);
				return false;
			}

			//
			// try BuildArray on high-precision floats
			//
			tokens.Clear();
			float f1 = 1.23456f;
			float f2 = 2.34567f;
			tokenizer.TokenizeString(String.Format("{0:f6} {1:f6}", f1, f2), tokens);
			foreach (Token t in tokens)
			{
				log.Debug("Token = {0}", t);
			}

			i = 0;
			floats = (float[])ParseUtil.BuildArray(tokens, ref i, typeof(Single),
				null, -1, log);

			if (floats == null)
			{
				log.Error("Unable to BuildArray of floats.");
				return false;
			}

			log.Info("Parsed floats:");
			foreach (float f in floats)
			{
				log.Write("{0}, ", f);
			}
			log.WriteLine("");

			// Exact comparison is intentional: the value is expected to
			// round-trip through its six-decimal text form unchanged.
			if (floats[0] != f1)
			{
				log.Error("BuildArray produced float {0:f6} instead of {1:f6}",
					floats[0], f1);
				return false;
			}

			//
			// try BuildArray of chars, maxLength terminated
			//
			log.Info("Chars, terminated by maxLength");
			tokens.Clear();
			tokenizer.TokenizeString("12 2 ; 3 4", tokens);
			foreach (Token t in tokens)
			{
				log.Debug("Token = {0}", t);
			}

			i = 0;
			char[] chars = (char[])ParseUtil.BuildArray(tokens, ref i, typeof(Char),
				null, 3, log);

			if (chars == null)
			{
				log.Error("Unable to BuildArray of chars.");
				return false;
			}

			log.Info("Parsed chars:");
			foreach (char f in chars)
			{
				log.Write("{0}, ", f);
			}
			log.WriteLine("");

			if (i != 4)
			{
				log.Error("BuildArray left i = {0} which is incorrect", i);
				return false;
			}

			//
			// try BuildArray of hex numbers
			//
			log.Info("Hex numbers");
			tokens.Clear();
			tokenizer.Settings.ParseHexNumbers = true;
			tokenizer.TokenizeString("0xfff, 0xffe", tokens);
			foreach (Token t in tokens)
			{
				log.Debug("Token = {0}", t);
			}

			i = 0;
			ushort[] ushorts = (ushort[])ParseUtil.BuildArray(tokens, ref i, typeof(ushort),
				null, 3, log);

			if (ushorts == null)
			{
				log.Error("Unable to BuildArray of ushorts.");
				return false;
			}

			log.Info("Parsed ushorts:");
			foreach (ushort us in ushorts)
			{
				log.Write("{0}, ", us);
			}
			log.WriteLine("");

			// The final index is deliberately not verified for the hex case
			// (the check was disabled in the original test):
//			if (i != 4)
//			{
//				log.Error("BuildArray left i = {0} which is incorrect", i);
//				return(false);
//			}

			log.Info("All PASSED");
			return true;
		}
示例#2
0
		/// <summary> Initializes the StreamTokenizer used for reading the ARFF file.
		/// The character-type table is reset and then rebuilt step by step; the
		/// calls are order-dependent because later calls re-classify characters
		/// that fall inside ranges set by earlier ones.
		/// </summary>
		/// <param name="tokenizer">the stream tokenizer
		/// </param>
		protected internal virtual void  initTokenizer(StreamTokenizer tokenizer)
		{
			
			// Start from a clean character-type table.
			tokenizer.Settings.ResetCharTypeTable();
			// Characters 0 through ' ' (control characters and space) are whitespace.
			tokenizer.Settings.WhitespaceChars(0,(int) ' ');
			// Everything from the first character after space up to U+00FF is a word
			// character; this range includes '?' and the digits.
			tokenizer.Settings.WordChars((int)' ' + 1, (int) '\u00FF');
			// A comma is re-classified as whitespace, so it only separates values.
			tokenizer.Settings.WhitespaceChars((int)',',(int) ',');
			// '%' is registered as the comment character.
			tokenizer.Settings.CommentChar('%');
			// Both double and single quotes are registered as quote characters.
			tokenizer.Settings.QuoteChar('"');
			tokenizer.Settings.QuoteChar('\'');
			// Braces are ordinary characters; the readers in this class compare
			// token.StringValue against "{" and "}" to find them.
			tokenizer.Settings.OrdinaryChar('{');
			tokenizer.Settings.OrdinaryChar('}');
			// NOTE(review): presumably makes the tokenizer report end-of-line as a
			// token (the readers in this class test for EolToken) - confirm against
			// the StreamTokenizer settings documentation.
			tokenizer.Settings.GrabEol=true;
		}
示例#3
0
		/// <summary> Reads and skips all tokens before the next end of line token.
		/// The terminating token is pushed back onto the tokenizer so that the
		/// caller can still consume it.
		/// 
		/// The scan also stops at an end-of-file token: a final line that is not
		/// terminated by a newline would otherwise keep this loop reading
		/// forever. The end-of-file token is pushed back as well, so the caller's
		/// own end-of-line check decides how to report it.
		/// </summary>
		/// <param name="tokenizer">the stream tokenizer
		/// </param>
		protected internal virtual void  readTillEOL(StreamTokenizer tokenizer)
		{
			Token token;
			do
			{
				tokenizer.NextToken(out token);
			}
			while (!(token is EolToken) && !(token is EofToken));

			// Leave the terminator (EOL or EOF) for the caller.
			tokenizer.PushBack(token);
		}
示例#4
0
		/// <summary> Reads the next token and verifies that it terminates the
		/// current line. An end-of-line token is always accepted; an end-of-file
		/// token is accepted only when <paramref name="endOfFileOk"/> is true.
		/// Anything else is reported through errms.
		/// </summary>
		/// <param name="tokenizer">the stream tokenizer
		/// </param>
		/// <param name="token">receives the token that was read
		/// </param>
		/// <param name="endOfFileOk">true if end of file may stand in for end of line
		/// </param>
		/// <exception cref="IOException">if it doesn't find an end of line
		/// </exception>
		protected internal virtual void  getLastToken(StreamTokenizer tokenizer,out Token token , bool endOfFileOk)
		{
			tokenizer.NextToken(out token);

			bool lineEnded = token is EolToken;
			bool fileEndedAcceptably = endOfFileOk && (token is EofToken);

			if (!lineEnded && !fileEndedAcceptably)
			{
				errms(tokenizer, "end of line expected");
			}
		}
示例#5
0
		/// <summary> Gets the next token, reporting a premature end of line or
		/// end of file through errms.
		/// 
		/// The Java original re-typed quote tokens as words and a lone "?" as a
		/// '?' character at this point. This port performs no such re-typing:
		/// the token is handed back exactly as the tokenizer produced it.
		/// </summary>
		/// <param name="tokenizer">the stream tokenizer
		/// </param>
		/// <param name="token">receives the token that was read
		/// </param>
		/// <exception cref="IOException">if it finds a premature end of line
		/// or end of file
		/// </exception>
		protected internal virtual void  getNextToken(StreamTokenizer tokenizer,out Token token)
		{
			tokenizer.NextToken(out token);

			bool hitEndOfLine = token is EolToken;
			if (hitEndOfLine)
			{
				errms(tokenizer, "premature end of line");
			}

			bool hitEndOfFile = token is EofToken;
			if (hitEndOfFile)
			{
				errms(tokenizer, "premature end of file");
			}
		}
示例#6
0
        /// <summary>
        /// Speed test.  Generates five in-memory inputs (integers, fixed-point
        /// floats, exponential floats, words and hex numbers), times the parse
        /// of each with SpeedTestParse and prints the results to the console.
        /// </summary>
        /// <returns>bool - true for ran, false for failed to run.</returns>
        public static bool SpeedTest()
        {
            const int tokenCount = 1024;

            Logger log = new Logger("SpeedTest");
            log.Verbosity = VerbosityLevel.Debug;
            log.Info("Starting...");

            // Fixed seed so every run parses the same data.
            Random rand = new Random(0);

            // setup tokenizer
            StreamTokenizer tokenizer = new StreamTokenizer();
            tokenizer.Settings.ParseNumbers = true;

            MemoryStream buffer;
            StreamWriter output;
            double elapsed;

            // integers in [0, 256)
            buffer = new MemoryStream();
            output = new StreamWriter(buffer);
            for (int remaining = tokenCount; remaining > 0; remaining--)
            {
                int value = (int)(rand.NextDouble() * 256);
                output.WriteLine("{0}", value);
            }
            output.Flush();
            buffer.Position = 0;
            elapsed = SpeedTestParse(tokenizer, buffer);
            Console.WriteLine("Parse {0} integers took {1:f2} ms", tokenCount, elapsed);

            // fixed-point floats in [0, 10)
            buffer = new MemoryStream();
            output = new StreamWriter(buffer);
            for (int remaining = tokenCount; remaining > 0; remaining--)
            {
                double value = rand.NextDouble() * 10;
                output.WriteLine("{0:f9}", value);
            }
            output.Flush();
            buffer.Position = 0;
            elapsed = SpeedTestParse(tokenizer, buffer);
            Console.WriteLine("Parse {0} floats took {1:f2} ms", tokenCount, elapsed);

            // exponential-notation floats in [0, 1000)
            buffer = new MemoryStream();
            output = new StreamWriter(buffer);
            for (int remaining = tokenCount; remaining > 0; remaining--)
            {
                double value = rand.NextDouble() * 1000;
                output.WriteLine("{0:e9}", value);
            }
            output.Flush();
            buffer.Position = 0;
            elapsed = SpeedTestParse(tokenizer, buffer);
            Console.WriteLine("Parse {0} exponential floats took {1:f2} ms", tokenCount, elapsed);

            // words
            buffer = new MemoryStream();
            output = new StreamWriter(buffer);
            for (int remaining = tokenCount; remaining > 0; remaining--)
            {
                output.WriteLine("foo ");
            }
            output.Flush();
            buffer.Position = 0;
            elapsed = SpeedTestParse(tokenizer, buffer);
            Console.WriteLine("Parse {0} words took {1:f2} ms", tokenCount, elapsed);

            // hex numbers in [0x0, 0x100)
            buffer = new MemoryStream();
            output = new StreamWriter(buffer);
            for (int remaining = tokenCount; remaining > 0; remaining--)
            {
                int value = (int)(rand.NextDouble() * 256);
                output.WriteLine("0x{0:x}", value);
            }
            output.Flush();
            buffer.Position = 0;
            elapsed = SpeedTestParse(tokenizer, buffer);
            Console.WriteLine("Parse {0} hex numbers took {1:f2} ms", tokenCount, elapsed);

            return true;
        }
示例#7
0
		/// <summary> Reads a single instance from the reader and appends it
		/// to the dataset. A fresh tokenizer is created over the reader and
		/// configured with initTokenizer; the instance is then read with
		/// getInstance without checking for an end of line after it.
		/// 
		/// </summary>
		/// <param name="reader">the reader 
		/// </param>
		/// <returns> false if end of file has been reached
		/// </returns>
		/// <exception cref="IOException">if the information is not read 
		/// successfully
		/// </exception>
		// Converted from Java: the original parameter type was java.io.Reader.
		public virtual bool readInstance(System.IO.StreamReader reader)
		{
			StreamTokenizer instanceTokenizer = new StreamTokenizer(reader);
			initTokenizer(instanceTokenizer);

			bool instanceRead = getInstance(instanceTokenizer, false);
			return instanceRead;
		}
示例#8
0
		/// <summary> Reads and stores header of an ARFF file: the relation
		/// declaration, every attribute declaration (numeric, string, date or
		/// nominal) and the keyword that starts the data section. On success
		/// m_RelationName, m_Attributes and the sparse-instance buffers
		/// (m_ValueBuffer, m_IndicesBuffer) are set.
		/// 
		/// Keywords are compared case-insensitively using ordinal rules, so the
		/// result does not depend on the current culture.
		/// </summary>
		/// <param name="tokenizer">the stream tokenizer
		/// </param>
		/// <exception cref="IOException">if the information is not read 
		/// successfully
		/// </exception>
		protected internal virtual void  readHeader(StreamTokenizer tokenizer)
		{
			System.String attributeName;
			FastVector attributeValues;
			Token token = null;

			// Get name of relation.
			getFirstToken(tokenizer, out token);
			if ((token != null) && (token is EofToken))
			{
				errms(tokenizer, "premature end of file");
			}
			if (System.String.Equals(ARFF_RELATION, token.StringValue, System.StringComparison.OrdinalIgnoreCase))
			{
				getNextToken(tokenizer, out token);
				m_RelationName = token.StringValue;
				getLastToken(tokenizer, out token, false);
			}
			else
			{
				errms(tokenizer, "keyword " + ARFF_RELATION + " expected");
			}

			// Create vectors to hold information temporarily.
			m_Attributes = new FastVector();

			// Get attribute declarations.
			getFirstToken(tokenizer, out token);
			if ((token != null) && (token is EofToken))
			{
				errms(tokenizer, "premature end of file");
			}

			while (System.String.Equals(Attribute.ARFF_ATTRIBUTE, token.StringValue, System.StringComparison.OrdinalIgnoreCase))
			{
				// Get attribute name.
				getNextToken(tokenizer, out token);
				attributeName = token.StringValue;
				getNextToken(tokenizer, out token);

				// A word token names a built-in type; anything else must open
				// a nominal enumeration.
				if ((token != null) && (token is WordToken))
				{
					System.String typeName = token.StringValue;

					// Attribute is real, integer, or string.
					if (System.String.Equals(typeName, Attribute.ARFF_ATTRIBUTE_REAL, System.StringComparison.OrdinalIgnoreCase)
						|| System.String.Equals(typeName, Attribute.ARFF_ATTRIBUTE_INTEGER, System.StringComparison.OrdinalIgnoreCase)
						|| System.String.Equals(typeName, Attribute.ARFF_ATTRIBUTE_NUMERIC, System.StringComparison.OrdinalIgnoreCase))
					{
						m_Attributes.addElement(new Attribute(attributeName, numAttributes()));
						readTillEOL(tokenizer);
					}
					else if (System.String.Equals(typeName, Attribute.ARFF_ATTRIBUTE_STRING, System.StringComparison.OrdinalIgnoreCase))
					{
						m_Attributes.addElement(new Attribute(attributeName, (FastVector) null, numAttributes()));
						readTillEOL(tokenizer);
					}
					else if (System.String.Equals(typeName, Attribute.ARFF_ATTRIBUTE_DATE, System.StringComparison.OrdinalIgnoreCase))
					{
						System.String format = null;
						tokenizer.NextToken(out token);

						// The date format is optional. An end-of-line token right
						// after the type keyword means "no format given"; it is
						// pushed back for the end-of-line check below. (This used to
						// test for end of file, which rejected format-less dates.)
						if ((token != null) && (!(token is EolToken)))
						{
							if ((!(token is WordToken)) && (token.StringValue != "'") && (token.StringValue != "\""))
							{
								errms(tokenizer, "not a valid date format");
							}
							format = token.StringValue;
							readTillEOL(tokenizer);
						}
						else
						{
							tokenizer.PushBack(token);
						}
						m_Attributes.addElement(new Attribute(attributeName, format, numAttributes()));
					}
					else
					{
						errms(tokenizer, "no valid attribute type or invalid " + "enumeration");
					}
				}
				else
				{
					// Attribute is nominal.
					attributeValues = new FastVector();
					tokenizer.PushBack(token);

					// Get values for nominal attribute.
					tokenizer.NextToken(out token);
					if (token.StringValue != "{")
					{
						errms(tokenizer, "{ expected at beginning of enumeration");
					}
					tokenizer.NextToken(out token);
					while (token.StringValue != "}")
					{
						// The enumeration has to be closed on the same line. End of
						// file is treated the same way, so a truncated file is
						// reported instead of being read forever.
						if ((token is EolToken) || (token is EofToken))
						{
							errms(tokenizer, "} expected at end of enumeration");
						}
						else
						{
							attributeValues.addElement(token.StringValue);
						}

						tokenizer.NextToken(out token);
					}
					if (attributeValues.size() == 0)
					{
						errms(tokenizer, "no nominal values found");
					}
					m_Attributes.addElement(new Attribute(attributeName, attributeValues, numAttributes()));
				}

				// Every declaration must end its line; then move on to the next
				// non-empty line.
				getLastToken(tokenizer, out token, false);
				getFirstToken(tokenizer, out token);
				if (token is EofToken)
				{
					errms(tokenizer, "premature end of file");
				}
			}

			// Check if data part follows. We can't easily check for EOL.
			if (!System.String.Equals(ARFF_DATA, token.StringValue, System.StringComparison.OrdinalIgnoreCase))
			{
				errms(tokenizer, "keyword " + ARFF_DATA + " expected");
			}

			// Check if any attributes have been declared.
			if (m_Attributes.size() == 0)
			{
				errms(tokenizer, "no attributes declared");
			}

			// Allocate buffers in case sparse instances have to be read
			m_ValueBuffer = new double[numAttributes()];
			m_IndicesBuffer = new int[numAttributes()];
		}
示例#9
0
        /// <summary> Reports a read error by throwing an IOException whose
        /// message is the given text followed by ", read " and the tokenizer's
        /// string representation.
        /// </summary>
        /// <param name="tokenizer">the stream tokenizer
        /// </param>
        /// <param name="theMsg">the error message to be thrown
        /// </param>
        /// <exception cref="IOException">always
        /// </exception>
        protected internal virtual void errms(StreamTokenizer tokenizer, System.String theMsg)
        {
            System.String tokenizerState = tokenizer.ToString();
            System.String fullMessage = theMsg + ", read " + tokenizerState;
            throw new System.IO.IOException(fullMessage);
        }
示例#10
0
		/// <summary> Reads a single sparse instance (index/value pairs up to a
		/// closing '}') using the tokenizer and appends it to the dataset as a
		/// SparseInstance with weight 1.
		/// 
		/// Indices must be strictly increasing and lie in
		/// [0, numAttributes()). A value of "?" marks a missing value. Indices
		/// and numeric values are parsed with the invariant culture, so '.' is
		/// always the decimal separator regardless of the machine's settings.
		/// 
		/// NOTE(review): nothing in this method treats a token as the opening
		/// '{' - every token before '}' is parsed as an index - so the caller
		/// must already have consumed the brace; confirm against the caller.
		/// </summary>
		/// <param name="tokenizer">the tokenizer to be used
		/// </param>
		/// <param name="flag">if method should test for carriage return after 
		/// each instance
		/// </param>
		/// <returns> always true; errors are reported through errms
		/// </returns>
		/// <exception cref="IOException">if the information is not read 
		/// successfully
		/// </exception>
		protected internal virtual bool getInstanceSparse(StreamTokenizer tokenizer, bool flag)
		{
			int valIndex, numValues = 0, maxIndex = - 1;
			Token token;

			// Get values
			while (true)
			{
				// Get index
				getIndex(tokenizer, out token);
				if ((token is CharToken) && (token.StringValue == "}"))
				{
					break;
				}

				// Is index valid?
				try
				{
					m_IndicesBuffer[numValues] = System.Int32.Parse(token.StringValue, System.Globalization.CultureInfo.InvariantCulture);
				}
				catch (System.FormatException e)
				{
					errms(tokenizer, "index number expected" + " " + e.ToString());
				}
				if (m_IndicesBuffer[numValues] <= maxIndex)
				{
					errms(tokenizer, "indices have to be ordered");
				}
				if ((m_IndicesBuffer[numValues] < 0) || (m_IndicesBuffer[numValues] >= numAttributes()))
				{
					errms(tokenizer, "index out of bounds");
				}
				maxIndex = m_IndicesBuffer[numValues];

				// Get value
				getNextToken(tokenizer, out token);

				// Check if value is missing. Only the text is tested, exactly as
				// getInstanceFull does: the tokenizer set up by initTokenizer
				// classifies '?' as a word character, so requiring a CharToken
				// here meant a missing value was never recognised.
				if ((token != null) && (token.StringValue == "?"))
				{
					m_ValueBuffer[numValues] = Instance.missingValue();
				}
				else
				{
					// Check if token is valid.
					if (!(token is WordToken))
					{
						errms(tokenizer, "not a valid value");
					}
					switch (attribute(m_IndicesBuffer[numValues]).type())
					{
						case Attribute.NOMINAL:
							// Check if value appears in header.
							valIndex = attribute(m_IndicesBuffer[numValues]).indexOfValue(token.StringValue);
							if (valIndex == - 1)
							{
								errms(tokenizer, "nominal value not declared in header");
							}
							m_ValueBuffer[numValues] = (double) valIndex;
							break;

						case Attribute.NUMERIC:
							// Check if value is really a number.
							try
							{
								m_ValueBuffer[numValues] = System.Double.Parse(token.StringValue, System.Globalization.CultureInfo.InvariantCulture);
							}
							catch (System.FormatException e)
							{
								errms(tokenizer, "number expected" + " " + e.ToString());
							}
							break;

						case Attribute.STRING:
							m_ValueBuffer[numValues] = attribute(m_IndicesBuffer[numValues]).addStringValue(token.StringValue);
							break;

						case Attribute.DATE:
							try
							{
								m_ValueBuffer[numValues] = attribute(m_IndicesBuffer[numValues]).parseDate(token.StringValue);
							}
							// Converted from Java: java.text.ParseException became System.FormatException.
							catch (System.FormatException e)
							{
								errms(tokenizer, "unparseable date: " + token.StringValue + " " + e.ToString());
							}
							break;

						default:
							errms(tokenizer, "unknown attribute type in column " + m_IndicesBuffer[numValues]);
							break;
					}
				}
				numValues++;
			}

			if (flag)
			{
				getLastToken(tokenizer, out token, true);
			}

			// Add instance to dataset; only the filled part of the shared
			// buffers is copied.
			double[] tempValues = new double[numValues];
			int[] tempIndices = new int[numValues];
			Array.Copy(m_ValueBuffer, 0, tempValues, 0, numValues);
			Array.Copy(m_IndicesBuffer, 0, tempIndices, 0, numValues);
			add(new SparseInstance(1, tempValues, tempIndices, numAttributes()));
			return true;
		}
示例#11
0
		/// <summary> Reads a single dense instance (one value per attribute, in
		/// attribute order) using the tokenizer and appends it to the dataset
		/// as an Instance with weight 1.
		/// 
		/// A value of "?" marks a missing value. Numeric values are parsed with
		/// the invariant culture, so '.' is always the decimal separator
		/// regardless of the machine's regional settings.
		/// </summary>
		/// <param name="tokenizer">the tokenizer to be used
		/// </param>
		/// <param name="flag">if method should test for carriage return after 
		/// each instance
		/// </param>
		/// <returns> always true; errors are reported through errms
		/// </returns>
		/// <exception cref="IOException">if the information is not read 
		/// successfully
		/// </exception>
		protected internal virtual bool getInstanceFull(StreamTokenizer tokenizer, bool flag)
		{
			double[] instance = new double[numAttributes()];
			int index;
			Token token = null;

			// Get values for all attributes.
			for (int i = 0; i < numAttributes(); i++)
			{
				// Every value, including the first, is read here.
				getNextToken(tokenizer, out token);

				// A null token leaves the slot at its default of 0.
				if (token == null)
				{
					continue;
				}

				// Check if value is missing.
				if (token.StringValue == "?")
				{
					instance[i] = Instance.missingValue();
					continue;
				}

				// Check if token is valid.
				if (!(token is WordToken))
				{
					errms(tokenizer, "not a valid value");
				}
				switch (attribute(i).type())
				{
					case Attribute.NOMINAL:
						// Check if value appears in header.
						index = attribute(i).indexOfValue(token.StringValue);
						if (index == -1)
						{
							errms(tokenizer, "nominal value not declared in header");
						}
						instance[i] = (double)index;
						break;

					case Attribute.NUMERIC:
						// Check if value is really a number. The invariant culture
						// is used so that "1.5" is not misread on machines whose
						// regional settings use ',' as the decimal separator.
						try
						{
							instance[i] = System.Double.Parse(token.StringValue, System.Globalization.CultureInfo.InvariantCulture);
						}
						catch (System.FormatException e)
						{
							errms(tokenizer, "number expected" + " " + e.ToString());
						}
						break;

					case Attribute.STRING:
						instance[i] = attribute(i).addStringValue(token.StringValue);
						break;

					case Attribute.DATE:
						try
						{
							instance[i] = attribute(i).parseDate(token.StringValue);
						}
						// Converted from Java: java.text.ParseException became System.FormatException.
						catch (System.FormatException e)
						{
							errms(tokenizer, "unparseable date: " + token.StringValue + " " + e.ToString());
						}
						break;

					default:
						errms(tokenizer, "unknown attribute type in column " + i);
						break;
				}
			}

			if (flag)
			{
				getLastToken(tokenizer, out token, true);
			}

			// Add instance to dataset
			add(new Instance(1, instance));
			return true;
		}
示例#12
0
		/// <summary> Reads the header of an ARFF file from a reader and 
		/// reserves space for the given number of instances. No instances
		/// are read. Lets the class index be undefined (negative).
		/// 
		/// </summary>
		/// <param name="reader">the reader
		/// </param>
		/// <param name="capacity">the capacity
		/// </param>
		/// <exception cref="ArgumentException">if the capacity is negative.
		/// </exception>
		/// <exception cref="IOException">if the header is not read successfully
		/// or there is a problem with the reader.
		/// </exception>
		//@ requires capacity >= 0;
		//@ ensures classIndex() == -1;
		// Converted from Java: the original parameter type was java.io.Reader.
		public Instances(System.IO.StreamReader reader, int capacity)
		{
			// Reject a negative capacity before touching the reader.
			if (capacity < 0)
			{
				throw new System.ArgumentException("Capacity has to be positive!");
			}

			StreamTokenizer headerTokenizer = new StreamTokenizer(reader);
			initTokenizer(headerTokenizer);
			readHeader(headerTokenizer);

			m_ClassIndex = -1;
			m_Instances = new FastVector(capacity);
		}
示例#13
0
		/// <summary> Reads a single instance using the tokenizer and appends it
		/// to the dataset. The first token of the next non-empty line decides
		/// the format: '{' starts a sparse instance, anything else a dense one.
		/// 
		/// </summary>
		/// <param name="tokenizer">the tokenizer to be used
		/// </param>
		/// <param name="flag">if method should test for carriage return after 
		/// each instance
		/// </param>
		/// <returns> false if end of file has been reached
		/// </returns>
		/// <exception cref="IOException">if the information is not read 
		/// successfully
		/// </exception>
		protected internal virtual bool getInstance(StreamTokenizer tokenizer, bool flag)
		{
			// Check if any attributes have been declared.
			if (m_Attributes.size() == 0)
			{
				errms(tokenizer, "no header information available");
			}

			// Look at the first token of the next non-empty line.
			Token token;
			getFirstToken(tokenizer, out token);

			// Check if end of file reached.
			if (token is EofToken)
			{
				// As before, the end-of-file token is handed back to the tokenizer.
				tokenizer.PushBack(token);
				return false;
			}

			// Parse instance
			if ((token is CharToken) && (token.StringValue == "{"))
			{
				// The opening brace stays consumed: getInstanceSparse parses every
				// token up to '}' as an attribute index, so pushing '{' back made
				// each sparse instance fail with "index number expected".
				return getInstanceSparse(tokenizer, flag);
			}

			// Dense format: getInstanceFull reads every value itself, including
			// the first, so the token inspected above has to be handed back.
			tokenizer.PushBack(token);
			return getInstanceFull(tokenizer, flag);
		}
示例#14
0
		/// <summary> Reads a complete ARFF file from a reader: the header first,
		/// then instances until getInstance reports the end of the file. Space
		/// for 1000 instances is reserved up front and compactify() is called
		/// once reading is done. Lets the index of the class attribute be
		/// undefined (negative).
		/// 
		/// </summary>
		/// <param name="reader">the reader
		/// </param>
		/// <exception cref="IOException">if the ARFF file is not read 
		/// successfully
		/// </exception>
		// Converted from Java: the original parameter type was java.io.Reader.
		public Instances(System.IO.StreamReader reader)
		{
			StreamTokenizer fileTokenizer = new StreamTokenizer(reader);
			initTokenizer(fileTokenizer);
			readHeader(fileTokenizer);

			m_ClassIndex = -1;
			m_Instances = new FastVector(1000);

			// Each call appends one instance; stop at end of file.
			bool moreInstances;
			do
			{
				moreInstances = getInstance(fileTokenizer, true);
			}
			while (moreInstances);

			compactify();
		}
示例#15
0
        // ---------------------------------------------------------------------
        #region TestSelf
        // ---------------------------------------------------------------------
        /// <summary>
        /// Simple self test: tokenizes a short string and checks that
        /// tokenizing a missing file raises FileNotFoundException.
        /// See StreamTokenizerTestCase for full tests.
        /// </summary>
        /// <returns>bool - true for success, false for failure.</returns>
        public static bool TestSelf()
        {
            string rule = "--------------------------------------------------------\n";

            Logger log = new Logger("testSelf");
            log.Verbosity = VerbosityLevel.Debug;
            log.Info("Starting...");

            // setup tokenizer
            StreamTokenizer tokenizer = new StreamTokenizer();
            tokenizer.Settings.SetupForCodeParse();
            tokenizer.Verbosity = VerbosityLevel.Debug;

            //
            // try string parse
            //
            log.Write(rule);
            log.Info("string parse:");
            log.Write(rule);

            ArrayList parsedTokens = new ArrayList();
            string input = "-1.2ej";
            tokenizer.Settings.DoUntermCheck = false;
            tokenizer.Settings.GrabWhitespace = false;

            bool parsed = tokenizer.TokenizeString(input, parsedTokens);
            if (!parsed)
            {
                log.Error("Unable to parse into token vector.");
                return false;
            }

            foreach (Token parsedToken in parsedTokens)
            {
                log.Info("Token = '{0}'", parsedToken);
            }

            // (A NextToken-style walk over a reader is not exercised here.)

            //
            // try TokenizeFile on a file that must not exist
            //
            log.Write(rule);
            log.Info("Tokenize missing file");
            log.Write(rule);

            string missingFile = "ThisFile better not exist";
            bool sawFileNotFound = false;
            try
            {
                tokenizer.TokenizeFile(missingFile);
            }
            catch (FileNotFoundException notFound)
            {
                log.Info("Correctly caught exception: {0}: {1}", notFound.GetType(), notFound.Message);
                sawFileNotFound = true;
            }

            if (!sawFileNotFound)
            {
                log.Error("Didn't get a file not found exception from TokenizeFile.");
                return false;
            }

            // (Line numbers in tokens are not tested yet.)

            // done
            log.Info("Done.");
            return true;
        }
示例#16
0
		/// <summary> Gets the next token, skipping empty lines: end-of-line
		/// tokens are discarded until a token of any other kind is read.
		/// 
		/// The Java original also re-typed quote tokens as words and a lone
		/// "?" as a '?' character here. This port does not: the token is
		/// returned exactly as the tokenizer produced it.
		/// </summary>
		/// <param name="tokenizer">the stream tokenizer
		/// </param>
		/// <param name="token">receives the first token that is not an end of line
		/// </param>
		/// <exception cref="IOException">if reading the next token fails
		/// </exception>
		protected internal virtual void  getFirstToken(StreamTokenizer tokenizer,out Token token)
		{
			do
			{
				tokenizer.NextToken(out token);
			}
			while (token is EolToken);
		}
示例#17
0
        /// <summary>
        /// Repeatedly tokenizes the given stream with the supplied tokenizer
        /// and reports the average time of one pass.
        /// </summary>
        /// <param name="tokenizer">Tokenizer whose TokenizeStream call is being timed.</param>
        /// <param name="stream">Stream to tokenize. Its Position is set back to 0
        /// after every pass.</param>
        /// <returns>Average elapsed milliseconds per pass.</returns>
        protected static double SpeedTestParse(StreamTokenizer tokenizer,
            Stream stream)
        {
            const int passes = 100;

            // Force a collection before the clock starts.
            GC.Collect();

            // The same list is handed to every pass.
            ArrayList sink = new ArrayList();
            DateTime began = HighResClock.Now;

            int pass = 0;
            while (pass < passes)
            {
                tokenizer.TokenizeStream(stream, sink);
                stream.Position = 0; // rewind so the next pass sees the same input
                pass++;
            }

            TimeSpan elapsed = HighResClock.Now - began;
            return elapsed.TotalMilliseconds / passes;
        }
示例#18
0
		/// <summary> Reads the next token and reports an error through errms when
		/// that token marks a premature end of line or end of file.
		/// 
		/// </summary>
		/// <param name="tokenizer">the stream tokenizer
		/// </param>
		/// <param name="token">receives the token that was read
		/// </param>
		/// <exception cref="IOException">if it finds a premature end of line
		/// (presumably raised by errms, which is not visible here - confirm)
		/// </exception>
		protected internal virtual void  getIndex(StreamTokenizer tokenizer, out Token token)
		{
			tokenizer.NextToken(out token);

			// Classify the token once, then report whichever condition applies.
			bool lineEnded = token is EolToken;
			bool fileEnded = token is EofToken;

			if (lineEnded)
			{
				errms(tokenizer, "premature end of line");
			}

			if (fileEnded)
			{
				errms(tokenizer, "premature end of file");
			}
		}
示例#19
0
		/// <summary> Loads a cost matrix in the old format from a reader. Adapted from code once sitting 
		/// in Instances.java
		/// <p/>
		/// The input is read as a sequence of entries, each made of three numbers:
		/// the index of the first class, the index of the second class and a
		/// positive weight. '%' is registered as the tokenizer's comment character
		/// and empty lines are skipped. Every entry is stored with setXmlElement
		/// after initialize() has been called.
		/// 
		/// </summary>
		/// <param name="reader">the reader to get the values from.
		/// </param>
		/// <exception cref="Exception">if the matrix cannot be read correctly: a
		/// non-numeric token, a non-integral or out-of-range class index, an entry on
		/// the diagonal, a non-positive weight, or a premature end of line or file.
		/// </exception>
		//UPGRADE_ISSUE: Class hierarchy differences between 'java.io.Reader' and 'System.IO.StreamReader' may cause compilation errors. "ms-help://MS.VSCC.v80/dv_commoner/local/redirect.htm?index='!DefaultContextWindowIndex'&keyword='jlca1186'"
		public virtual void  readOldFormat(System.IO.StreamReader reader)
		{
			
			StreamTokenizer tokenizer;
			Token currentToken;
			double firstIndex, secondIndex, weight;
			int firstClass, secondClass;
			
			tokenizer = new StreamTokenizer(reader);
			
			initialize();
			
			tokenizer.Settings.CommentChar('%');
			tokenizer.Settings.GrabEol=true;
			tokenizer.NextToken(out currentToken);
			while (!(currentToken is EofToken))
			{
				
				// Skip empty lines and the end-of-line marker that follows an entry.
				// The tokenizer has to be advanced before looping again; a bare
				// "continue" would re-test the very same EolToken forever.
				if (currentToken is EolToken)
				{
					tokenizer.NextToken(out currentToken);
					continue;
				}
				
				// Get index of first class.
				if (!((currentToken is FloatToken) || (currentToken is IntToken)))
				{
					throw new System.Exception("Only numbers and comments allowed in cost file!");
				}
				// NOTE(review): parsing uses the current culture, mirroring however
				// the token produced StringValue - confirm against the tokenizer.
				firstIndex = Convert.ToDouble(currentToken.StringValue);
				// Truncate and compare with the parsed value: the two only agree
				// when the number in the file is integral.
				firstClass = (int) firstIndex;
				if (!Utils.eq((double) firstClass, firstIndex))
				{
					throw new System.Exception("First number in line has to be index of a class!");
				}
				if (firstClass < 0 || firstClass >= size())
				{
					throw new System.Exception("Class index out of range!");
				}
				
				// Get index of second class.
				tokenizer.NextToken(out currentToken);
				if (currentToken is EofToken)
				{
					throw new System.Exception("Premature end of file!");
				}
				if (currentToken is EolToken)
				{
					throw new System.Exception("Premature end of line!");
				}
				if (!((currentToken is IntToken) || (currentToken is FloatToken)))
				{
					throw new System.Exception("Only numbers and comments allowed in cost file!");
				}
				secondIndex = Convert.ToDouble(currentToken.StringValue);
				secondClass = (int) secondIndex;
				if (!Utils.eq((double) secondClass, secondIndex))
				{
					throw new System.Exception("Second number in line has to be index of a class!");
				}
				if (secondClass < 0 || secondClass >= size())
				{
					throw new System.Exception("Class index out of range!");
				}
				// Entries on the diagonal are rejected outright.
				if (secondClass == firstClass)
				{
					throw new System.Exception("Diagonal of cost matrix non-zero!");
				}
				
				// Get cost factor.
				tokenizer.NextToken(out currentToken);
				if (currentToken is EofToken)
				{
					throw new System.Exception("Premature end of file!");
				}
				if (currentToken is EolToken)
				{
					throw new System.Exception("Premature end of line!");
				}
				if (!((currentToken is IntToken) || (currentToken is FloatToken)))
				{
					throw new System.Exception("Only numbers and comments allowed in cost file!");
				}
				weight = Convert.ToDouble(currentToken.StringValue);
				if (!Utils.gr(weight, 0))
				{
					throw new System.Exception("Only positive weights allowed!");
				}
				setXmlElement(firstClass, secondClass, weight);
				
				// Move on to whatever follows the entry (normally the end-of-line
				// marker, which the check at the top of the loop then skips).
				tokenizer.NextToken(out currentToken);
			}
		}
示例#20
0
		/// <summary> Read a matrix from a stream.  The format is the same the print method,
		/// so printed matrices can be read back in (provided they were printed using
		/// US Locale).  XmlElements are separated by
		/// whitespace, all the elements for each row appear on a single line,
		/// the last row is followed by a blank line.
		/// <p/>
		/// Note: This format differs from the one that can be read via the
		/// Matrix(Reader) constructor! For that format, the write(Writer) method
		/// is used (from the original weka.core.Matrix class).
		/// <p/>
		/// Numbers are parsed with the invariant culture, so the result does not
		/// depend on the culture of the current thread.
		/// 
		/// </summary>
		/// <param name="input">the input stream.
		/// </param>
		/// <returns>a Matrix built from the rows that were read; the first row
		/// determines the number of columns.
		/// </returns>
		/// <exception cref="System.IO.IOException">if the input ends before any
		/// element is found, or if a row is longer or shorter than the first row.
		/// </exception>
		/// <seealso cref="Matrix(Reader)">
		/// </seealso>
		/// <seealso cref="write(Writer)">
		/// </seealso>
		public static Matrix read(System.IO.StreamReader input)
		{
			StreamTokenizer tokenizer = new StreamTokenizer(input);
			System.IFormatProvider numberFormat = System.Globalization.CultureInfo.InvariantCulture;
			
			// The strategy inherited from the Java original is to have the
			// tokenizer return only whitespace delimited words, EOL's and EOF's
			// and to let Double.Parse deal with the words (which also covers
			// scientific notation).
			// NOTE(review): presumably marking every character as a word character
			// keeps the tokenizer from producing number tokens - confirm.
			tokenizer.Settings.SetDefaults();
			tokenizer.Settings.WordChars(0, 255);
			tokenizer.Settings.WhitespaceChars(0,(int) ' ');
			tokenizer.Settings.GrabEol=true;
			System.Collections.ArrayList v = new System.Collections.ArrayList(10);
			
			// Ignore initial empty lines. The token has to be re-read inside the
			// loop, otherwise a leading blank line would spin forever.
			Token token;
			tokenizer.NextToken(out token);
			while (token is EolToken)
			{
				tokenizer.NextToken(out token);
			}
			if (token is EofToken)
				throw new System.IO.IOException("Unexpected EOF on matrix read.");
			
			// Read & store 1st row; its length fixes the number of columns.
			do
			{
				v.Add(System.Double.Parse(token.StringValue, numberFormat));
				tokenizer.NextToken(out token);
			}
			while (token is WordToken);
			
			int n = v.Count; // Now we've got the number of columns!
			double[] row = new double[n];
			for (int j = 0; j < n; j++)
			{
				// extract the elements of the 1st row.
				row[j] = ((System.Double) v[j]);
			}
			v.Clear();
			v.Add(row); // Start storing rows instead of columns.
			
			// Read the remaining rows. At the top of every iteration "token" is
			// the token that terminated the previous row, so it is advanced first
			// to reach the start of the next line. Anything other than a word
			// there (blank line, end of file) ends the matrix.
			while (!(token is EofToken))
			{
				tokenizer.NextToken(out token);
				if (!(token is WordToken))
				{
					break;
				}
				
				v.Add(row = new double[n]);
				int col = 0;
				do 
				{
					if (col >= n)
						throw new System.IO.IOException("Row " + v.Count + " is too long.");
					row[col++] = System.Double.Parse(token.StringValue, numberFormat);
					tokenizer.NextToken(out token);
				}
				while (token is WordToken);
				
				if (col < n)
					throw new System.IO.IOException("Row " + v.Count + " is too short.");
			}
			int m = v.Count; // Now we've got the number of rows.
			double[][] A = new double[m][];
			v.CopyTo(A); // copy the rows out of the list
			return new Matrix(A);
		}