/// <summary>
/// Writes a short string through a <c>DataOutputStream</c> and checks that the
/// length reported by the stream wrapper equals the length of the underlying
/// byte sink. Calls <c>fail</c> when the two lengths differ.
/// </summary>
public void TestWriteUTF()
{
    ByteArrayOutputStream sink = new ByteArrayOutputStream();
    DataOutputStream writer = new DataOutputStream(sink);

    // Original note: 15 (presumably 13 single-byte characters plus a
    // 2-byte length prefix -- confirm against the instance WriteUTF).
    writer.WriteUTF("Hello, World!");
    writer.Flush();

    bool lengthsDiffer = sink.Length != writer.Length;
    if (lengthsDiffer)
    {
        fail("Miscounted bytes in DataOutputStream.");
    }
}
/// <summary>
/// Encodes a random string with <c>WriteUTF</c>, overwrites some of the encoded
/// bytes with random values (always including the last two bytes), and then
/// asks <c>ReadUTF</c> to decode the damaged data. The decoded value is
/// discarded; any exception raised by the write or the read propagates to the
/// caller.
/// </summary>
private static void WriteAndReadAString()
{
    // Build a string whose UTF-8 encoding is quite possibly
    // longer than 65535 bytes.
    int charCount = Random().nextInt(A_NUMBER_NEAR_65535) + 1;
    MemoryStream encoded = new MemoryStream();
    StringBuilder text = new StringBuilder();
    int appended = 0;
    while (appended < charCount)
    {
        text.append((char)Random().Next());
        appended++;
    }
    string payload = text.toString();

    DataOutputStream writer = new DataOutputStream(encoded);
    writer.WriteUTF(payload);

    // Damage the encoded form at random positions to produce malformed characters.
    byte[] raw = encoded.ToArray();
    int rawLength = raw.Length;
    int damageCount = Random().nextInt(MAX_CORRUPTIONS_PER_CYCLE);
    for (int remaining = damageCount; remaining > 0; remaining--)
    {
        int target = Random().nextInt(rawLength);
        raw[target] = (byte)Random().Next();
    }

    // Always mangle the final two bytes (last one first) so that the data is
    // likely to end in a partial character.
    for (int back = 1; back <= 2; back++)
    {
        raw[rawLength - back] = (byte)Random().Next();
    }

    // Attempt to decode the damaged bytes back into a string.
    MemoryStream damaged = new MemoryStream(raw);
    DataInputStream reader = new DataInputStream(damaged);
    reader.ReadUTF();
}
/// <summary>
/// Encodes <paramref name="str"/> as a 2-byte big-endian length followed by the
/// encoded characters, and writes the result to <paramref name="out"/> with a
/// single <c>Write</c> call.
/// <para>
/// Each UTF-16 code unit is encoded on its own: U+0001..U+007F take one byte,
/// code units above U+07FF take three bytes, and everything else (including
/// U+0000) takes two bytes.
/// </para>
/// </summary>
/// <param name="str">The string to encode.</param>
/// <param name="out">
/// The destination. When it is a <c>DataOutputStream</c>, that instance's
/// <c>bytearr</c> scratch buffer is reused (and grown if it is too small).
/// </param>
/// <returns>The total number of bytes written: encoded length plus the 2-byte prefix.</returns>
/// <exception cref="FormatException">The encoded form would exceed 65535 bytes.</exception>
internal static int WriteUTF(string str, IDataOutput @out)
{
    int charCount = str.Length;

    // Pass 1: work out how many bytes the encoded characters need.
    int encodedLength = 0;
    foreach (char unit in str)
    {
        if (unit >= 0x0001 && unit <= 0x007F)
        {
            encodedLength += 1;
        }
        else if (unit > 0x07FF)
        {
            encodedLength += 3;
        }
        else
        {
            encodedLength += 2;
        }
    }

    // The length prefix is only two bytes wide.
    if (encodedLength > 65535)
    {
        throw new FormatException(
            "encoded string too long: " + encodedLength + " bytes");
    }

    int totalLength = encodedLength + 2;

    // Reuse the stream's scratch buffer when there is one; otherwise allocate
    // a buffer of exactly the required size.
    byte[] buffer;
    DataOutputStream stream = @out as DataOutputStream;
    if (stream == null)
    {
        buffer = new byte[totalLength];
    }
    else
    {
        if (stream.bytearr == null || stream.bytearr.Length < totalLength)
        {
            // Over-allocate so that later, somewhat longer strings still fit.
            stream.bytearr = new byte[(encodedLength * 2) + 2];
        }
        buffer = stream.bytearr;
    }

    // Length prefix, high byte first.
    int pos = 0;
    buffer[pos++] = (byte)((encodedLength >> 8) & 0xFF);
    buffer[pos++] = (byte)(encodedLength & 0xFF);

    // Pass 2: emit the encoded characters.
    for (int index = 0; index < charCount; index++)
    {
        int unit = str[index];

        if (unit >= 0x0001 && unit <= 0x007F)
        {
            buffer[pos++] = (byte)unit;
            continue;
        }

        if (unit > 0x07FF)
        {
            buffer[pos++] = (byte)(0xE0 | ((unit >> 12) & 0x0F));
            buffer[pos++] = (byte)(0x80 | ((unit >> 6) & 0x3F));
            buffer[pos++] = (byte)(0x80 | (unit & 0x3F));
            continue;
        }

        buffer[pos++] = (byte)(0xC0 | ((unit >> 6) & 0x1F));
        buffer[pos++] = (byte)(0x80 | (unit & 0x3F));
    }

    @out.Write(buffer, 0, totalLength);
    return totalLength;
}
/// <summary>
/// Entry point to the Compile application.
/// <para>
/// This program takes any number of arguments: the first is the name of the
/// desired stemming algorithm to use (a list is available in the package
/// description), all of the rest should be the path or paths to a file or
/// files containing a stemmer table to compile.
/// </para>
/// <para>
/// The first argument is upper-cased and then read left to right as: an
/// optional <c>-</c> (sets <c>backward</c>), an optional <c>0</c> (also store
/// each stem itself with the patch command <c>"-a"</c>), an optional <c>M</c>
/// (sets <c>multi</c>), followed by any number of reduction flags
/// <c>G</c>, <c>L</c>, <c>E</c>, <c>2</c>, <c>1</c> that are applied in order.
/// Unknown flags are skipped.
/// </para>
/// <para>
/// For every input file <c>X</c> the compiled trie is written to <c>X.out</c>,
/// preceded by the upper-cased first argument. An existing <c>X.out</c> is
/// replaced.
/// </para>
/// </summary>
/// <param name="args">the command line arguments</param>
public static void Main(string[] args)
{
    if (args.Length < 1)
    {
        return;
    }

    // Strings are immutable: the upper-cased copy must be captured, otherwise
    // the call has no effect and lower-case flags are silently ignored.
    string spec = args[0].ToUpperInvariant();

    // Each leading marker is optional; the bounds checks keep short arguments
    // such as "" or "-" from indexing past the end of the string.
    int qq = 0;
    backward = qq < spec.Length && spec[qq] == '-';
    if (backward)
    {
        qq++;
    }

    bool storeorig = qq < spec.Length && spec[qq] == '0';
    if (storeorig)
    {
        qq++;
    }

    multi = qq < spec.Length && spec[qq] == 'M';
    if (multi)
    {
        qq++;
    }

    // LUCENENET TODO: the Java original read the charset from the
    // "egothor.stemmer.charset" system property (default "UTF-8"); is that any
    // different than Encoding.UTF8?

    // Whatever follows the markers is the ordered list of reduction flags.
    string optimizer = spec.Substring(qq);

    for (int i = 1; i < args.Length; i++)
    {
        Diff diff = new Diff();
        AllocTrie();

        Console.WriteLine(args[i]);
        using (TextReader @in = new StreamReader(
            new FileStream(args[i], FileMode.Open, FileAccess.Read), Encoding.UTF8))
        {
            string line;
            while ((line = @in.ReadLine()) != null)
            {
                try
                {
                    line = line.ToLowerInvariant();
                    StringTokenizer st = new StringTokenizer(line);

                    // The first token on a line is the stem; the rest are its word forms.
                    string stem = st.NextToken();
                    if (storeorig)
                    {
                        trie.Add(stem, "-a");
                    }

                    while (st.HasMoreTokens())
                    {
                        string token = st.NextToken();
                        if (!token.Equals(stem))
                        {
                            trie.Add(token, diff.Exec(token, stem));
                        }
                    }
                }
                catch (InvalidOperationException)
                {
                    // no base token (stem) on a line
                }
            }
        }

        Optimizer o = new Optimizer();
        Optimizer2 o2 = new Optimizer2();
        Lift l = new Lift(true);
        Lift e = new Lift(false);
        Gener g = new Gener();

        foreach (char flag in optimizer)
        {
            string prefix;
            switch (flag)
            {
                case 'G':
                    trie = trie.Reduce(g);
                    prefix = "G: ";
                    break;
                case 'L':
                    trie = trie.Reduce(l);
                    prefix = "L: ";
                    break;
                case 'E':
                    trie = trie.Reduce(e);
                    prefix = "E: ";
                    break;
                case '2':
                    trie = trie.Reduce(o2);
                    prefix = "2: ";
                    break;
                case '1':
                    trie = trie.Reduce(o);
                    prefix = "1: ";
                    break;
                default:
                    // Unknown flag: nothing to reduce and nothing to report.
                    continue;
            }

            trie.PrintInfo(Console.Out, prefix + " ");
        }

        // FileMode.Create truncates an existing file. OpenOrCreate would keep
        // the old contents and leave stale trailing bytes whenever the new
        // output is shorter than the previous one.
        using (DataOutputStream os = new DataOutputStream(
            new FileStream(args[i] + ".out", FileMode.Create, FileAccess.Write)))
        {
            os.WriteUTF(spec);
            trie.Store(os);
        }
    }
}