/// <summary>
/// Simple test of some ParseUtil methods. Exercises FindMatch and several
/// BuildArray behaviors: default termination, char-token termination,
/// maxLength termination, high-precision floats and hex input.
/// </summary>
/// <returns>bool - true for all passed, false otherwise</returns>
public static bool TestSelf()
{
    Logger log = new Logger("ParseUtil: TestSelf");
    log.Info("Starting...");
    StreamTokenizer tokenizer = new StreamTokenizer();
    tokenizer.Verbosity = VerbosityLevel.Warn;

    // FindMatch: the quoted '}' (index 3) must not terminate the search,
    // so the match for the leading '{' is the token at index 4.
    ArrayList alist = new ArrayList();
    tokenizer.TokenizeString("{ [ ] '}' }", alist);
    foreach (Token t in alist) log.Debug("Token = {0}", t);
    Token[] tarray = (Token[])alist.ToArray(typeof(Token));
    int i = 0;
    if (!FindMatch(tarray, ref i, '{'))
    {
        log.Error("FindMatch failed to match { char");
        return (false);
    }
    if (i != 4)
    {
        log.Error("FindMatch got the wrong answer {0}", i);
        return (false);
    }
    else log.Info("FindMatch worked.");

    //
    // try BuildArray
    //
    ArrayList tokens = new ArrayList();
    tokenizer.TokenizeString("1 2 3 4 5", tokens);
    foreach (Token t in tokens) log.Debug("Token = {0}", t);
    i = 0;
    Int16[] shorts = (short[])ParseUtil.BuildArray(tokens, ref i, typeof(Int16), null, -1, log);
    if (shorts == null)
    {
        log.Error("Unable to BuildArray of shorts.");
        return (false);
    }
    log.Info("Parsed shorts:");
    foreach (Int16 s in shorts)
    {
        log.Write("{0}, ", s);
    }
    log.WriteLine("");

    //
    // try BuildArray of floats, char terminated
    //
    tokens.Clear();
    tokenizer.TokenizeString("1 2 ; 3 4", tokens);
    foreach (Token t in tokens) log.Debug("Token = {0}", t);
    i = 0;
    Single[] floats = (float[])ParseUtil.BuildArray(tokens, ref i, typeof(Single), new CharToken(';'), -1, log);
    if (floats == null)
    {
        log.Error("Unable to BuildArray of floats.");
        return (false);
    }
    log.Info("Parsed floats:");
    foreach (float f in floats)
    {
        log.Write("{0}, ", f);
    }
    log.WriteLine("");
    if (i != 2)
    {
        // BUGFIX: the format string referenced {0} but no argument was
        // supplied, which broke this diagnostic; pass i explicitly.
        log.Error("BuildArray left i = {0} which is incorrect", i);
        return (false);
    }

    //
    // try BuildArray on high-precision floats
    //
    tokens.Clear();
    float f1 = 1.23456f;
    float f2 = 2.34567f;
    tokenizer.TokenizeString(String.Format("{0:f6} {1:f6}", f1, f2), tokens);
    foreach (Token t in tokens) log.Debug("Token = {0}", t);
    i = 0;
    floats = (float[])ParseUtil.BuildArray(tokens, ref i, typeof(Single), null, -1, log);
    if (floats == null)
    {
        log.Error("Unable to BuildArray of floats.");
        return (false);
    }
    log.Info("Parsed floats:");
    foreach (float f in floats)
    {
        log.Write("{0}, ", f);
    }
    log.WriteLine("");
    if (floats[0] != f1)
    {
        log.Error("BuildArray produced float {0:f6} instead of {1:f6}", floats[0], f1);
        return (false);
    }

    //
    // try BuildArray of chars, maxLength terminated
    //
    log.Info("Chars, terminated by maxLength");
    tokens.Clear();
    tokenizer.TokenizeString("12 2 ; 3 4", tokens);
    foreach (Token t in tokens) log.Debug("Token = {0}", t);
    i = 0;
    char[] chars = (char[])ParseUtil.BuildArray(tokens, ref i, typeof(Char), null, 3, log);
    if (chars == null)
    {
        log.Error("Unable to BuildArray of chars.");
        return (false);
    }
    log.Info("Parsed chars:");
    foreach (char f in chars)
    {
        log.Write("{0}, ", f);
    }
    log.WriteLine("");
    if (i != 4)
    {
        log.Error("BuildArray left i = {0} which is incorrect", i);
        return (false);
    }

    //
    // try BuildArray of hex numbers
    //
    log.Info("Hex numbers");
    tokens.Clear();
    tokenizer.Settings.ParseHexNumbers = true;
    tokenizer.TokenizeString("0xfff, 0xffe", tokens);
    foreach (Token t in tokens) log.Debug("Token = {0}", t);
    i = 0;
    ushort[] ushorts = (ushort[])ParseUtil.BuildArray(tokens, ref i, typeof(ushort), null, 3, log);
    if (ushorts == null)
    {
        log.Error("Unable to BuildArray of ushorts.");
        return (false);
    }
    log.Info("Parsed ushorts:");
    foreach (ushort us in ushorts)
    {
        log.Write("{0}, ", us);
    }
    log.WriteLine("");

    log.Info("All PASSED");
    return (true);
}
/// <summary>
/// Initializes the StreamTokenizer used for reading the ARFF file.
/// Rebuilds the character-type table for ARFF syntax: control chars and
/// commas are whitespace, '%' starts a comment, both quote styles are
/// honored, '{' and '}' are reported as single tokens (sparse instances),
/// and end-of-line tokens are significant (one instance per line).
/// </summary>
/// <param name="tokenizer">the stream tokenizer</param>
protected internal virtual void initTokenizer(StreamTokenizer tokenizer)
{
    // Start from a clean character-type table before layering ARFF rules.
    tokenizer.Settings.ResetCharTypeTable();
    // Everything at or below ' ' is whitespace...
    tokenizer.Settings.WhitespaceChars(0, (int)' ');
    // ...and every printable/Latin-1 char above it is part of a word.
    tokenizer.Settings.WordChars((int)' ' + 1, (int)'\u00FF');
    // Commas separate ARFF values, so re-class them as whitespace
    // (this intentionally overrides the WordChars range above).
    tokenizer.Settings.WhitespaceChars((int)',', (int)',');
    // '%' begins a comment running to end of line.
    tokenizer.Settings.CommentChar('%');
    // ARFF allows both double- and single-quoted strings.
    tokenizer.Settings.QuoteChar('"');
    tokenizer.Settings.QuoteChar('\'');
    // Braces delimit sparse-format instances; emit them as their own tokens.
    tokenizer.Settings.OrdinaryChar('{');
    tokenizer.Settings.OrdinaryChar('}');
    // Report EOL tokens so the reader can detect end-of-instance.
    tokenizer.Settings.GrabEol = true;
}
/// <summary>
/// Reads and skips all tokens up to (but not including) the next
/// end-of-line token; the EOL itself is pushed back for the caller.
/// </summary>
/// <param name="tokenizer">the stream tokenizer</param>
protected internal virtual void readTillEOL(StreamTokenizer tokenizer)
{
    Token current;
    do
    {
        tokenizer.NextToken(out current);
    }
    while (!(current is EolToken));
    // Leave the EOL token in the stream so the caller can consume it.
    tokenizer.PushBack(current);
}
/// <summary>
/// Gets the next token and checks that it is an end of line
/// (or, when <paramref name="endOfFileOk"/> is true, an end of file).
/// </summary>
/// <param name="tokenizer">the stream tokenizer</param>
/// <param name="token">receives the token read</param>
/// <param name="endOfFileOk">true if an EOF token is also acceptable</param>
/// <exception cref="IOException">if it doesn't find an end of line</exception>
protected internal virtual void getLastToken(StreamTokenizer tokenizer, out Token token, bool endOfFileOk)
{
    tokenizer.NextToken(out token);
    // Acceptable terminators: an EOL, or an EOF when the caller allows it.
    bool acceptable = (token is EolToken) || ((token is EofToken) && endOfFileOk);
    if (!acceptable)
    {
        errms(tokenizer, "end of line expected");
    }
}
/// <summary>
/// Gets the next token, checking for a premature end of line or end of file.
/// </summary>
/// <param name="tokenizer">the stream tokenizer</param>
/// <param name="token">receives the token read</param>
/// <exception cref="IOException">if it finds a premature end of line or file</exception>
protected internal virtual void getNextToken(StreamTokenizer tokenizer, out Token token)
{
    tokenizer.NextToken(out token);
    if (token is EolToken)
    {
        errms(tokenizer, "premature end of line");
    }
    if (token is EofToken)
    {
        errms(tokenizer, "premature end of file");
    }
    // CLEANUP: the Java original re-tagged quote characters as words and '?'
    // words as the missing-value marker here. In this port those branches had
    // empty bodies (the re-tagging lines were commented out), so the dead
    // else-if chain was removed; callers test token.StringValue directly.
}
/// <summary>
/// Speed test. This tests the speed of the parse: writes batches of
/// machine-generated tokens (integers, fixed-point floats, exponential
/// floats, words, hex numbers) into a MemoryStream and times how long
/// SpeedTestParse takes to tokenize each batch.
/// </summary>
/// <returns>bool - true for ran, false for failed to run.</returns>
public static bool SpeedTest()
{
    Logger log = new Logger("SpeedTest");
    log.Verbosity = VerbosityLevel.Debug;
    log.Info("Starting...");
    // Fixed seed so every run times the same token stream.
    Random rand = new Random(0);

    // setup tokenizer
    StreamTokenizer tokenizer = new StreamTokenizer();
    tokenizer.Settings.ParseNumbers = true;

    int nTokens = 1024;
    MemoryStream ms;
    StreamWriter writer;

    // int
    ms = new MemoryStream();
    writer = new StreamWriter(ms);
    for (int i = 0; i < nTokens; i++)
    {
        writer.WriteLine("{0}", (int)(rand.NextDouble() * 256));
    }
    writer.Flush();
    // Rewind so the parser reads from the start of the buffer.
    ms.Position = 0;
    Console.WriteLine("Parse {0} integers took {1:f2} ms", nTokens, SpeedTestParse(tokenizer, ms));

    // float
    ms = new MemoryStream();
    writer = new StreamWriter(ms);
    // NOTE(review): this reset is redundant — a fresh MemoryStream is already
    // at position 0 before any writes. Harmless, kept as-is.
    ms.Position = 0;
    for (int i = 0; i < nTokens; i++)
    {
        writer.WriteLine("{0:f9}", rand.NextDouble() * 10);
    }
    writer.Flush();
    ms.Position = 0;
    Console.WriteLine("Parse {0} floats took {1:f2} ms", nTokens, SpeedTestParse(tokenizer, ms));

    // exponential
    ms = new MemoryStream();
    writer = new StreamWriter(ms);
    // NOTE(review): redundant reset — see note in the float section.
    ms.Position = 0;
    for (int i = 0; i < nTokens; i++)
    {
        writer.WriteLine("{0:e9}", rand.NextDouble() * 1000);
    }
    writer.Flush();
    ms.Position = 0;
    Console.WriteLine("Parse {0} exponential floats took {1:f2} ms", nTokens, SpeedTestParse(tokenizer, ms));

    // words
    ms = new MemoryStream();
    writer = new StreamWriter(ms);
    for (int i = 0; i < nTokens; i++)
    {
        writer.WriteLine("foo ");
    }
    writer.Flush();
    ms.Position = 0;
    Console.WriteLine("Parse {0} words took {1:f2} ms", nTokens, SpeedTestParse(tokenizer, ms));

    // hex
    ms = new MemoryStream();
    writer = new StreamWriter(ms);
    for (int i = 0; i < nTokens; i++)
    {
        writer.WriteLine("0x{0:x}", (int)(rand.NextDouble() * 256));
    }
    writer.Flush();
    ms.Position = 0;
    Console.WriteLine("Parse {0} hex numbers took {1:f2} ms", nTokens, SpeedTestParse(tokenizer, ms));

    // Console.WriteLine("Buffer to parse is:");
    // Dumps the last batch (hex numbers) for manual inspection.
    Console.WriteLine("{0}", Encoding.ASCII.GetString(ms.GetBuffer()));
    return (true);
}
/// <summary>
/// Reads a single instance from the reader and appends it to the dataset,
/// automatically expanding the dataset if necessary. Does not check for a
/// carriage return at the end of the line.
/// </summary>
/// <param name="reader">the reader</param>
/// <returns>false if end of file has been reached</returns>
/// <exception cref="IOException">if the information is not read successfully</exception>
public virtual bool readInstance(System.IO.StreamReader reader)
{
    // Build a fresh ARFF-configured tokenizer over the reader, then delegate.
    StreamTokenizer arffTokenizer = new StreamTokenizer(reader);
    initTokenizer(arffTokenizer);
    bool appended = getInstance(arffTokenizer, false);
    return appended;
}
/// <summary>
/// Reads and stores the header of an ARFF file: the @relation name followed
/// by @attribute declarations (numeric/real/integer, string, date, or a
/// nominal enumeration in braces), stopping at the @data keyword.
/// </summary>
/// <param name="tokenizer">the stream tokenizer</param>
/// <exception cref="IOException">if the information is not read successfully</exception>
protected internal virtual void readHeader(StreamTokenizer tokenizer)
{
    System.String attributeName;
    FastVector attributeValues;
    Token token = null;

    // Get name of relation.
    getFirstToken(tokenizer, out token);
    if ((token != null) && (token is EofToken))
    {
        errms(tokenizer, "premature end of file");
    }
    if (ARFF_RELATION.ToUpper().Equals(token.StringValue.ToUpper()))
    {
        getNextToken(tokenizer, out token);
        m_RelationName = token.StringValue;
        getLastToken(tokenizer, out token, false);
    }
    else
    {
        errms(tokenizer, "keyword " + ARFF_RELATION + " expected");
    }

    // Create vectors to hold information temporarily.
    m_Attributes = new FastVector();

    // Get attribute declarations.
    getFirstToken(tokenizer, out token);
    if ((token != null) && (token is EofToken))
    {
        errms(tokenizer, "premature end of file");
    }

    while (Attribute.ARFF_ATTRIBUTE.ToUpper().Equals(token.StringValue.ToUpper()))
    {
        // Get attribute name.
        getNextToken(tokenizer, out token);
        attributeName = token.StringValue;
        getNextToken(tokenizer, out token);

        // A word token means a typed attribute; anything else starts a
        // nominal enumeration.
        if ((token != null) && (token is WordToken))
        {
            // Attribute is real, integer, or string.
            if (token.StringValue.ToUpper().Equals(Attribute.ARFF_ATTRIBUTE_REAL.ToUpper()) ||
                token.StringValue.ToUpper().Equals(Attribute.ARFF_ATTRIBUTE_INTEGER.ToUpper()) ||
                token.StringValue.ToUpper().Equals(Attribute.ARFF_ATTRIBUTE_NUMERIC.ToUpper()))
            {
                m_Attributes.addElement(new Attribute(attributeName, numAttributes()));
                readTillEOL(tokenizer);
            }
            else if (token.StringValue.ToUpper().Equals(Attribute.ARFF_ATTRIBUTE_STRING.ToUpper()))
            {
                m_Attributes.addElement(new Attribute(attributeName, (FastVector)null, numAttributes()));
                readTillEOL(tokenizer);
            }
            else if (token.StringValue.ToUpper().Equals(Attribute.ARFF_ATTRIBUTE_DATE.ToUpper()))
            {
                System.String format = null;
                tokenizer.NextToken(out token);
                // BUGFIX: the Java original tested nextToken() != TT_EOL here;
                // this port tested for EofToken, which sent end-of-line tokens
                // into the format-parsing branch and raised a spurious
                // "not a valid date format" for date attributes that have no
                // explicit format string.
                if ((token != null) && (!(token is EolToken)))
                {
                    if ((token != null) && (!(token is WordToken)) &&
                        (token.StringValue != "'") && (token.StringValue != "\""))
                    {
                        errms(tokenizer, "not a valid date format");
                    }
                    format = token.StringValue;
                    readTillEOL(tokenizer);
                }
                else
                {
                    tokenizer.PushBack(token);
                }
                m_Attributes.addElement(new Attribute(attributeName, format, numAttributes()));
            }
            else
            {
                errms(tokenizer, "no valid attribute type or invalid " + "enumeration");
            }
        }
        else
        {
            // Attribute is nominal.
            attributeValues = new FastVector();
            tokenizer.PushBack(token);

            // Get values for nominal attribute.
            tokenizer.NextToken(out token);
            if (token.StringValue != "{")
            {
                errms(tokenizer, "{ expected at beginning of enumeration");
            }
            tokenizer.NextToken(out token);
            while (token.StringValue != "}")
            {
                if (token is EolToken)
                {
                    errms(tokenizer, "} expected at end of enumeration");
                }
                else
                {
                    attributeValues.addElement(token.StringValue);
                }
                tokenizer.NextToken(out token);
            }
            if (attributeValues.size() == 0)
            {
                errms(tokenizer, "no nominal values found");
            }
            m_Attributes.addElement(new Attribute(attributeName, attributeValues, numAttributes()));
        }
        getLastToken(tokenizer, out token, false);
        getFirstToken(tokenizer, out token);
        if (token is EofToken)
            errms(tokenizer, "premature end of file");
    }

    // Check if data part follows. We can't easily check for EOL.
    if (!ARFF_DATA.ToUpper().Equals(token.StringValue.ToUpper()))
    {
        errms(tokenizer, "keyword " + ARFF_DATA + " expected");
    }

    // Check if any attributes have been declared.
    if (m_Attributes.size() == 0)
    {
        errms(tokenizer, "no attributes declared");
    }

    // Allocate buffers in case sparse instances have to be read
    m_ValueBuffer = new double[numAttributes()];
    m_IndicesBuffer = new int[numAttributes()];
}
/// <summary>
/// Throws an IOException describing a parse failure, appending the
/// tokenizer's current state to the supplied message.
/// </summary>
/// <param name="tokenizer">the stream tokenizer at the point of failure</param>
/// <param name="theMsg">the error description</param>
/// <exception cref="IOException">always thrown</exception>
protected internal virtual void errms(StreamTokenizer tokenizer, System.String theMsg)
{
    string details = theMsg + ", read " + tokenizer.ToString();
    throw new System.IO.IOException(details);
}
/// <summary>
/// Reads a single sparse-format instance ("{index value, ...}") using the
/// tokenizer and appends it to the dataset, automatically expanding the
/// dataset if necessary.
/// </summary>
/// <param name="tokenizer">the tokenizer to be used</param>
/// <param name="flag">if method should test for carriage return after each instance</param>
/// <returns>false if end of file has been reached</returns>
/// <exception cref="IOException">if the information is not read successfully</exception>
protected internal virtual bool getInstanceSparse(StreamTokenizer tokenizer, bool flag)
{
    int valIndex, numValues = 0, maxIndex = -1;
    Token token;

    // Get values
    do
    {
        // Get index
        getIndex(tokenizer, out token);
        if ((token is CharToken) && (token.StringValue == "}"))
        {
            break;
        }

        // Is index valid?
        try
        {
            // BUGFIX: parse with the invariant culture — ARFF is a
            // machine-written format and must not depend on the OS locale.
            m_IndicesBuffer[numValues] = System.Int32.Parse(token.StringValue, System.Globalization.CultureInfo.InvariantCulture);
        }
        catch (System.FormatException e)
        {
            errms(tokenizer, "index number expected" + " " + e.ToString());
        }
        if (m_IndicesBuffer[numValues] <= maxIndex)
        {
            errms(tokenizer, "indices have to be ordered");
        }
        if ((m_IndicesBuffer[numValues] < 0) || (m_IndicesBuffer[numValues] >= numAttributes()))
        {
            errms(tokenizer, "index out of bounds");
        }
        maxIndex = m_IndicesBuffer[numValues];

        // Get value;
        getNextToken(tokenizer, out token);

        // Check if value is missing.
        if ((token is CharToken) && (token.StringValue == "?"))
        {
            m_ValueBuffer[numValues] = Instance.missingValue();
        }
        else
        {
            // Check if token is valid.
            if (!(token is WordToken))
            {
                errms(tokenizer, "not a valid value");
            }
            switch (attribute(m_IndicesBuffer[numValues]).type())
            {
                case Attribute.NOMINAL:
                    // Check if value appears in header.
                    valIndex = attribute(m_IndicesBuffer[numValues]).indexOfValue(token.StringValue);
                    if (valIndex == -1)
                    {
                        errms(tokenizer, "nominal value not declared in header");
                    }
                    m_ValueBuffer[numValues] = (double)valIndex;
                    break;

                case Attribute.NUMERIC:
                    // Check if value is really a number.
                    try
                    {
                        // BUGFIX: invariant culture — ARFF always uses '.' as
                        // the decimal separator regardless of OS locale.
                        m_ValueBuffer[numValues] = System.Double.Parse(token.StringValue, System.Globalization.CultureInfo.InvariantCulture);
                    }
                    catch (System.FormatException e)
                    {
                        errms(tokenizer, "number expected" + " " + e.ToString());
                    }
                    break;

                case Attribute.STRING:
                    m_ValueBuffer[numValues] = attribute(m_IndicesBuffer[numValues]).addStringValue(token.StringValue);
                    break;

                case Attribute.DATE:
                    try
                    {
                        m_ValueBuffer[numValues] = attribute(m_IndicesBuffer[numValues]).parseDate(token.StringValue);
                    }
                    catch (System.FormatException e)
                    {
                        errms(tokenizer, "unparseable date: " + token.StringValue + " " + e.ToString());
                    }
                    break;

                default:
                    errms(tokenizer, "unknown attribute type in column " + m_IndicesBuffer[numValues]);
                    break;
            }
        }
        numValues++;
    }
    while (true);

    if (flag)
    {
        getLastToken(tokenizer, out token, true);
    }

    // Add instance to dataset: copy only the populated prefix of the buffers.
    double[] tempValues = new double[numValues];
    int[] tempIndices = new int[numValues];
    Array.Copy(m_ValueBuffer, 0, tempValues, 0, numValues);
    Array.Copy(m_IndicesBuffer, 0, tempIndices, 0, numValues);
    add(new SparseInstance(1, tempValues, tempIndices, numAttributes()));
    return true;
}
/// <summary>
/// Reads a single dense-format instance (one value per declared attribute)
/// using the tokenizer and appends it to the dataset, automatically
/// expanding the dataset if necessary.
/// </summary>
/// <param name="tokenizer">the tokenizer to be used</param>
/// <param name="flag">if method should test for carriage return after each instance</param>
/// <returns>false if end of file has been reached</returns>
/// <exception cref="IOException">if the information is not read successfully</exception>
protected internal virtual bool getInstanceFull(StreamTokenizer tokenizer, bool flag)
{
    double[] instance = new double[numAttributes()];
    int index;
    Token token = null;

    // Get values for all attributes.
    for (int i = 0; i < numAttributes(); i++)
    {
        // Get next token
        getNextToken(tokenizer, out token);

        // NOTE(review): a null token leaves instance[i] at 0.0 — presumably
        // NextToken never yields null after a successful getNextToken; verify.
        if (token != null)
        {
            // Check if value is missing.
            if (token.StringValue == "?")
            {
                instance[i] = Instance.missingValue();
            }
            else
            {
                // Check if token is valid.
                if (!(token is WordToken))
                {
                    errms(tokenizer, "not a valid value");
                }
                switch (attribute(i).type())
                {
                    case Attribute.NOMINAL:
                        // Check if value appears in header.
                        index = attribute(i).indexOfValue(token.StringValue);
                        if (index == -1)
                        {
                            errms(tokenizer, "nominal value not declared in header");
                        }
                        instance[i] = (double)index;
                        break;

                    case Attribute.NUMERIC:
                        // Check if value is really a number.
                        try
                        {
                            // BUGFIX: invariant culture — ARFF always uses '.'
                            // as the decimal separator regardless of OS locale.
                            instance[i] = System.Double.Parse(token.StringValue, System.Globalization.CultureInfo.InvariantCulture);
                        }
                        catch (System.FormatException e)
                        {
                            errms(tokenizer, "number expected" + " " + e.ToString());
                        }
                        break;

                    case Attribute.STRING:
                        instance[i] = attribute(i).addStringValue(token.StringValue);
                        break;

                    case Attribute.DATE:
                        try
                        {
                            instance[i] = attribute(i).parseDate(token.StringValue);
                        }
                        catch (System.FormatException e)
                        {
                            errms(tokenizer, "unparseable date: " + token.StringValue + " " + e.ToString());
                        }
                        break;

                    default:
                        errms(tokenizer, "unknown attribute type in column " + i);
                        break;
                }
            }
        }
    }

    if (flag)
    {
        getLastToken(tokenizer, out token, true);
    }

    // Add instance to dataset
    add(new Instance(1, instance));
    return true;
}
/// <summary>
/// Reads the header of an ARFF file from a reader and reserves space for
/// the given number of instances. Leaves the class index undefined (negative).
/// </summary>
/// <param name="reader">the reader</param>
/// <param name="capacity">the capacity</param>
/// <exception cref="ArgumentException">if the capacity is negative</exception>
/// <exception cref="IOException">if there is a problem with the reader</exception>
public Instances(System.IO.StreamReader reader, int capacity)
{
    // Guard first: reject a negative capacity before touching the stream.
    if (capacity < 0)
    {
        throw new System.ArgumentException("Capacity has to be positive!");
    }
    StreamTokenizer headerTokenizer = new StreamTokenizer(reader);
    initTokenizer(headerTokenizer);
    readHeader(headerTokenizer);
    m_ClassIndex = -1;
    m_Instances = new FastVector(capacity);
}
/// <summary>
/// Reads a single instance using the tokenizer and appends it to the
/// dataset, dispatching to the sparse or dense reader based on whether the
/// next line starts with '{'.
/// </summary>
/// <param name="tokenizer">the tokenizer to be used</param>
/// <param name="flag">if method should test for carriage return after each instance</param>
/// <returns>false if end of file has been reached</returns>
/// <exception cref="IOException">if the information is not read successfully</exception>
protected internal virtual bool getInstance(StreamTokenizer tokenizer, bool flag)
{
    // A header must have been read before any instances.
    if (m_Attributes.size() == 0)
    {
        errms(tokenizer, "no header information available");
    }

    // Peek at the first token of the line, then push it back.
    Token peeked;
    getFirstToken(tokenizer, out peeked);
    tokenizer.PushBack(peeked);
    if (peeked is EofToken)
    {
        return false;
    }

    // '{' introduces the sparse instance format.
    bool sparse = (peeked is CharToken) && (peeked.StringValue == "{");
    return sparse ? getInstanceSparse(tokenizer, flag) : getInstanceFull(tokenizer, flag);
}
/// <summary>
/// Reads an ARFF file from a reader, assigning a weight of one to each
/// instance. Leaves the index of the class attribute undefined (negative).
/// </summary>
/// <param name="reader">the reader</param>
/// <exception cref="IOException">if the ARFF file is not read successfully</exception>
public Instances(System.IO.StreamReader reader)
{
    StreamTokenizer arffTokenizer = new StreamTokenizer(reader);
    initTokenizer(arffTokenizer);
    readHeader(arffTokenizer);
    m_ClassIndex = -1;
    m_Instances = new FastVector(1000);
    // Pull instances until getInstance reports end of file.
    bool more = getInstance(arffTokenizer, true);
    while (more)
    {
        more = getInstance(arffTokenizer, true);
    }
    compactify();
}
// ---------------------------------------------------------------------
#region TestSelf
// ---------------------------------------------------------------------

/// <summary>
/// Simple self test. See StreamTokenizerTestCase for full tests.
/// Checks a small code-parse of a string and that TokenizeFile throws
/// FileNotFoundException for a missing file.
/// </summary>
/// <returns>bool - true for success, false for failure.</returns>
public static bool TestSelf()
{
    Logger log = new Logger("testSelf");
    log.Verbosity = VerbosityLevel.Debug;
    log.Info("Starting...");

    // setup tokenizer for code parsing
    StreamTokenizer tokenizer = new StreamTokenizer();
    tokenizer.Settings.SetupForCodeParse();
    tokenizer.Verbosity = VerbosityLevel.Debug;

    //
    // string parse
    //
    log.Write("--------------------------------------------------------\n");
    log.Info("string parse:");
    log.Write("--------------------------------------------------------\n");
    ArrayList tokenList = new ArrayList();
    string testString = "-1.2ej";
    tokenizer.Settings.DoUntermCheck = false;
    tokenizer.Settings.GrabWhitespace = false;
    if (!tokenizer.TokenizeString(testString, tokenList))
    {
        log.Error("Unable to parse into token vector.");
        return (false);
    }
    foreach (Token t in tokenList) log.Info("Token = '{0}'", t);
    tokenList = new ArrayList();

    //
    // TokenizeFile on a missing file must raise FileNotFoundException
    //
    log.Write("--------------------------------------------------------\n");
    log.Info("Tokenize missing file");
    log.Write("--------------------------------------------------------\n");
    string nonExistentFileName = "ThisFile better not exist";
    bool caughtIt = false;
    try
    {
        tokenizer.TokenizeFile(nonExistentFileName);
    }
    catch (FileNotFoundException e)
    {
        log.Info("Correctly caught exception: {0}: {1}", e.GetType(), e.Message);
        caughtIt = true;
    }
    if (!caughtIt)
    {
        log.Error("Didn't get a file not found exception from TokenizeFile.");
        return (false);
    }

    // done
    log.Info("Done.");
    return (true);
}
/// <summary>
/// Gets the next token, skipping empty lines (end-of-line tokens).
/// </summary>
/// <param name="tokenizer">the stream tokenizer</param>
/// <param name="token">receives the first non-EOL token</param>
/// <exception cref="IOException">if reading the next token fails</exception>
protected internal virtual void getFirstToken(StreamTokenizer tokenizer, out Token token)
{
    // Keep reading until something other than an end-of-line appears.
    do
    {
        tokenizer.NextToken(out token);
    }
    while (token is EolToken);
}
/// <summary>
/// Use the supplied tokenizer to tokenize the specified stream repeatedly
/// and time it.
/// </summary>
/// <param name="tokenizer">the tokenizer to benchmark</param>
/// <param name="stream">the stream to parse; rewound after each pass</param>
/// <returns>Total milliseconds per parse.</returns>
protected static double SpeedTestParse(StreamTokenizer tokenizer, Stream stream)
{
    // Collect up front so GC pauses don't pollute the timing.
    GC.Collect();
    ArrayList sink = new ArrayList();
    const int iterations = 100;
    DateTime t0 = HighResClock.Now;
    for (int pass = 0; pass < iterations; pass++)
    {
        tokenizer.TokenizeStream(stream, sink);
        stream.Position = 0;
    }
    TimeSpan elapsed = HighResClock.Now - t0;
    return (elapsed.TotalMilliseconds / iterations);
}
/// <summary>
/// Gets an index token, checking for a premature end of line or file.
/// </summary>
/// <param name="tokenizer">the stream tokenizer</param>
/// <param name="token">receives the token read</param>
/// <exception cref="IOException">if it finds a premature end of line or file</exception>
protected internal virtual void getIndex(StreamTokenizer tokenizer, out Token token)
{
    tokenizer.NextToken(out token);
    // An index may not be interrupted by end of line or end of file.
    if (token is EolToken || token is EofToken)
    {
        string what = (token is EolToken) ? "line" : "file";
        errms(tokenizer, "premature end of " + what);
    }
}
/// <summary>
/// Loads a cost matrix in the old format from a reader. Adapted from code
/// once sitting in Instances.java. Each data line holds three numbers:
/// first class index, second class index, and a positive weight.
/// </summary>
/// <param name="reader">the reader to get the values from.</param>
/// <exception cref="Exception">if the matrix cannot be read correctly.</exception>
public virtual void readOldFormat(System.IO.StreamReader reader)
{
    StreamTokenizer tokenizer;
    Token currentToken;
    double firstIndex, secondIndex, weight;

    tokenizer = new StreamTokenizer(reader);
    initialize();

    tokenizer.Settings.CommentChar('%');
    tokenizer.Settings.GrabEol = true;
    tokenizer.NextToken(out currentToken);
    while (!(currentToken is EofToken))
    {
        // Skip empty lines.
        if (currentToken is EolToken)
        {
            // BUGFIX: the bare 'continue' here never consumed the EOL token,
            // so any empty line spun this loop forever. Advance first.
            tokenizer.NextToken(out currentToken);
            continue;
        }

        // Get index of first class.
        if (!((currentToken is FloatToken) || (currentToken is IntToken)))
        {
            throw new System.Exception("Only numbers and comments allowed " + "in cost file!");
        }
        // BUGFIX: parse with the invariant culture; cost files are
        // machine-written and must not depend on the OS locale.
        firstIndex = Convert.ToDouble(currentToken.StringValue, System.Globalization.CultureInfo.InvariantCulture);
        // BUGFIX: the Java original compared (double)(int)firstIndex against
        // firstIndex to require a whole number; the port dropped the (int)
        // cast, turning the check into a tautology.
        if (!Utils.eq((double)(int)firstIndex, firstIndex))
        {
            throw new System.Exception("First number in line has to be " + "index of a class!");
        }
        if ((int)firstIndex >= size())
        {
            throw new System.Exception("Class index out of range!");
        }

        // Get index of second class.
        tokenizer.NextToken(out currentToken);
        if (currentToken is EofToken)
        {
            throw new System.Exception("Premature end of file!");
        }
        if (currentToken is EolToken)
        {
            throw new System.Exception("Premature end of line!");
        }
        if (!((currentToken is IntToken) || (currentToken is FloatToken)))
        {
            throw new System.Exception("Only numbers and comments allowed " + "in cost file!");
        }
        secondIndex = Convert.ToDouble(currentToken.StringValue, System.Globalization.CultureInfo.InvariantCulture);
        // BUGFIX: same whole-number check as for firstIndex (see above).
        if (!Utils.eq((double)(int)secondIndex, secondIndex))
        {
            throw new System.Exception("Second number in line has to be " + "index of a class!");
        }
        if ((int)secondIndex >= size())
        {
            throw new System.Exception("Class index out of range!");
        }
        if ((int)secondIndex == (int)firstIndex)
        {
            throw new System.Exception("Diagonal of cost matrix non-zero!");
        }

        // Get cost factor.
        tokenizer.NextToken(out currentToken);
        if (currentToken is EofToken)
        {
            throw new System.Exception("Premature end of file!");
        }
        if (currentToken is EolToken)
        {
            throw new System.Exception("Premature end of line!");
        }
        if (!((currentToken is IntToken) || (currentToken is FloatToken)))
        {
            throw new System.Exception("Only numbers and comments allowed " + "in cost file!");
        }
        weight = Convert.ToDouble(currentToken.StringValue, System.Globalization.CultureInfo.InvariantCulture);
        if (!Utils.gr(weight, 0))
        {
            throw new System.Exception("Only positive weights allowed!");
        }
        setXmlElement((int)firstIndex, (int)secondIndex, weight);

        tokenizer.NextToken(out currentToken);
    }
}
/// <summary> Read a matrix from a stream. The format is the same the print method,
/// so printed matrices can be read back in (provided they were printed using
/// US Locale). XmlElements are separated by
/// whitespace, all the elements for each row appear on a single line,
/// the last row is followed by a blank line.
/// <p/>
/// Note: This format differs from the one that can be read via the
/// Matrix(Reader) constructor! For that format, the write(Writer) method
/// is used (from the original weka.core.Matrix class).
///
/// </summary>
/// <param name="input">the input stream.
/// </param>
/// <seealso cref="Matrix(Reader)">
/// </seealso>
/// <seealso cref="write(Writer)">
/// </seealso>
public static Matrix read(System.IO.StreamReader input)
{
    StreamTokenizer tokenizer = new StreamTokenizer(input);

    // Although StreamTokenizer will parse numbers, it doesn't recognize
    // scientific notation (E or D); however, Double.Parse does.
    // The strategy here is to disable StreamTokenizer's number parsing.
    // We'll only get whitespace delimited words, EOL's and EOF's.
    // These words should all be numbers, for Double.Parse to parse.
    tokenizer.Settings.SetDefaults();
    tokenizer.Settings.WordChars(0, 255);
    tokenizer.Settings.WhitespaceChars(0, (int)' ');
    tokenizer.Settings.GrabEol = true;

    System.Collections.ArrayList v = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));

    // Ignore initial empty lines.
    Token token;
    tokenizer.NextToken(out token);
    // BUGFIX: this loop had an empty body (a lone ';'), so a leading EOL
    // token spun forever; the tokenizer must advance inside the loop.
    while (token is EolToken)
    {
        tokenizer.NextToken(out token);
    }
    if (token is EofToken)
        throw new System.IO.IOException("Unexpected EOF on matrix read.");

    do
    {
        // Read & store 1st row.
        // BUGFIX: parse with the invariant culture — the header comment
        // already requires US-Locale output, so reading must match.
        v.Add(System.Double.Parse(token.StringValue, System.Globalization.CultureInfo.InvariantCulture));
        tokenizer.NextToken(out token);
    }
    while (token is WordToken);

    int n = v.Count; // Now we've got the number of columns!
    double[] row = new double[n];
    for (int j = 0; j < n; j++)
        // extract the elements of the 1st row.
        row[j] = ((System.Double)v[j]);
    v.Clear();
    v.Add(row); // Start storing rows instead of columns.

    tokenizer.NextToken(out token);
    while (token is WordToken)
    {
        // While non-empty lines
        v.Add(row = new double[n]);
        int j = 0;
        do
        {
            if (j >= n)
                throw new System.IO.IOException("Row " + v.Count + " is too long.");
            row[j++] = System.Double.Parse(token.StringValue, System.Globalization.CultureInfo.InvariantCulture);
            tokenizer.NextToken(out token);
        }
        while (token is WordToken);
        if (j < n)
            throw new System.IO.IOException("Row " + v.Count + " is too short.");
    }

    int m = v.Count; // Now we've got the number of rows.
    double[][] A = new double[m][];
    v.CopyTo(A); // copy the rows out of the vector
    return new Matrix(A);
}