/// <summary>
/// Writes sequence files holding a varying number of records and verifies that,
/// for several randomly chosen split counts, every key is read back from
/// exactly one split.
/// </summary>
/// <remarks>
/// The random seed is drawn fresh on each run and is not logged, so a failing
/// run cannot be replayed as written.
/// </remarks>
/// <exception cref="System.Exception"/>
public virtual void TestFormat()
{
    JobConf job = new JobConf(conf);
    FileSystem fs = FileSystem.GetLocal(conf);
    Path dir = new Path(Runtime.GetProperty("test.build.data", ".") + "/mapred");
    Path file = new Path(dir, "test.seq");
    Reporter reporter = Reporter.Null;

    int seed = new Random().Next();
    Random random = new Random(seed);

    // Remove the test directory before writing into it.
    fs.Delete(dir, true);
    FileInputFormat.SetInputPaths(job, dir);

    // for a variety of lengths
    for (int length = 0; length < MaxLength; length += random.Next(MaxLength / 10) + 1)
    {
        // create a file with length entries; keys are 0..length-1
        SequenceFile.Writer writer = SequenceFile.CreateWriter(
            fs, conf, file, typeof(IntWritable), typeof(BytesWritable));
        try
        {
            for (int i = 0; i < length; i++)
            {
                IntWritable key = new IntWritable(i);
                byte[] data = new byte[random.Next(10)];
                random.NextBytes(data);
                BytesWritable value = new BytesWritable(data);
                writer.Append(key, value);
            }
        }
        finally
        {
            writer.Close();
        }

        // try splitting the file in a variety of sizes
        InputFormat<IntWritable, BytesWritable> format =
            new SequenceFileInputFormat<IntWritable, BytesWritable>();
        IntWritable readKey = new IntWritable();
        BytesWritable readValue = new BytesWritable();
        for (int round = 0; round < 3; round++)
        {
            int numSplits = random.Next(MaxLength / (SequenceFile.SyncInterval / 20)) + 1;
            InputSplit[] splits = format.GetSplits(job, numSplits);

            // check each split: one bit per key that has been seen so far
            BitSet bits = new BitSet(length);
            for (int j = 0; j < splits.Length; j++)
            {
                RecordReader<IntWritable, BytesWritable> reader =
                    format.GetRecordReader(splits[j], job, reporter);
                try
                {
                    while (reader.Next(readKey, readValue))
                    {
                        // NUnit takes the condition first and the message last.
                        NUnit.Framework.Assert.IsFalse(
                            bits.Get(readKey.Get()), "Key in multiple partitions.");
                        bits.Set(readKey.Get());
                    }
                }
                finally
                {
                    reader.Close();
                }
            }

            // Every key written must have been seen in some split.
            NUnit.Framework.Assert.AreEqual(
                length, bits.Cardinality(), "Some keys in no partition.");
        }
    }
}
// NOTE(review): the "attempt" value used below (a random task attempt id for
// testing) is not declared in this method; presumably it is a field of the
// enclosing test class.
/// <summary>
/// Writes pseudo-random (int, double) records through
/// <c>SequenceFileAsBinaryOutputFormat</c> as raw bytes, then reads them back
/// with <c>SequenceFileInputFormat</c> and checks that every record matches
/// the sequence regenerated from the same seed.
/// </summary>
/// <exception cref="System.IO.IOException"/>
public virtual void TestBinary()
{
    JobConf job = new JobConf();
    FileSystem fs = FileSystem.GetLocal(job);
    Path dir = new Path(
        new Path(
            new Path(Runtime.GetProperty("test.build.data", ".")),
            FileOutputCommitter.TempDirName),
        "_" + attempt);
    Path file = new Path(dir, "testbinary.seq");

    // Remember the seed so the same sequence can be regenerated when reading.
    Random r = new Random();
    long seed = r.NextLong();
    r.SetSeed(seed);

    fs.Delete(dir, true);
    if (!fs.Mkdirs(dir))
    {
        Fail("Failed to create output directory");
    }

    job.Set(JobContext.TaskAttemptId, attempt);
    FileOutputFormat.SetOutputPath(job, dir.GetParent().GetParent());
    FileOutputFormat.SetWorkOutputPath(job, dir);
    SequenceFileAsBinaryOutputFormat.SetSequenceFileOutputKeyClass(job, typeof(IntWritable));
    SequenceFileAsBinaryOutputFormat.SetSequenceFileOutputValueClass(job, typeof(DoubleWritable));
    SequenceFileAsBinaryOutputFormat.SetCompressOutput(job, true);
    SequenceFileAsBinaryOutputFormat.SetOutputCompressionType(job, SequenceFile.CompressionType.Block);

    BytesWritable bkey = new BytesWritable();
    BytesWritable bval = new BytesWritable();
    RecordWriter<BytesWritable, BytesWritable> writer =
        new SequenceFileAsBinaryOutputFormat().GetRecordWriter(fs, job, file.ToString(), Reporter.Null);
    IntWritable iwritable = new IntWritable();
    DoubleWritable dwritable = new DoubleWritable();
    DataOutputBuffer outbuf = new DataOutputBuffer();

    Log.Info("Creating data by SequenceFileAsBinaryOutputFormat");
    try
    {
        for (int i = 0; i < Records; ++i)
        {
            // Serialize the key into the scratch buffer and hand its bytes to bkey.
            iwritable = new IntWritable(r.Next());
            iwritable.Write(outbuf);
            bkey.Set(outbuf.GetData(), 0, outbuf.GetLength());
            outbuf.Reset();

            // Same for the value.
            dwritable = new DoubleWritable(r.NextDouble());
            dwritable.Write(outbuf);
            bval.Set(outbuf.GetData(), 0, outbuf.GetLength());
            outbuf.Reset();

            writer.Write(bkey, bval);
        }
    }
    finally
    {
        writer.Close(Reporter.Null);
    }

    InputFormat<IntWritable, DoubleWritable> iformat =
        new SequenceFileInputFormat<IntWritable, DoubleWritable>();
    int count = 0;

    // Rewind the generator so it replays the values that were written.
    r.SetSeed(seed);
    int numSplits = 3;
    SequenceFileInputFormat.AddInputPath(job, file);

    Log.Info("Reading data by SequenceFileInputFormat");
    foreach (InputSplit split in iformat.GetSplits(job, numSplits))
    {
        RecordReader<IntWritable, DoubleWritable> reader =
            iformat.GetRecordReader(split, job, Reporter.Null);
        try
        {
            while (reader.Next(iwritable, dwritable))
            {
                int sourceInt = r.Next();
                double sourceDouble = r.NextDouble();

                // NUnit takes expected/actual (or the condition) first and the message last.
                NUnit.Framework.Assert.AreEqual(
                    sourceInt,
                    iwritable.Get(),
                    "Keys don't match: " + "*" + iwritable.Get() + ":" + sourceInt + "*");

                // System.Double has no static Compare; CompareTo returns 0 for equal values.
                NUnit.Framework.Assert.IsTrue(
                    dwritable.Get().CompareTo(sourceDouble) == 0,
                    "Vals don't match: " + "*" + dwritable.Get() + ":" + sourceDouble + "*");
                ++count;
            }
        }
        finally
        {
            reader.Close();
        }
    }

    NUnit.Framework.Assert.AreEqual(Records, count, "Some records not found");
}
/// <summary>
/// Checks the host ordering returned by <c>GetSplitHosts</c> for a file made of
/// three blocks, first when the block locations carry rack topology paths and
/// then when they do not.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestGetSplitHosts()
{
    int replicationFactor = 3;
    int block1Size = 100;
    int block2Size = 150;
    int block3Size = 75;
    int fileSize = block1Size + block2Size + block3Size;
    NetworkTopology clusterMap = new NetworkTopology();

    // Replica hosts, "host:port" names and racks for each of the three blocks.
    string[] firstHosts = { "host1", "host2", "host3" };
    string[] firstNames = { "host1:100", "host2:100", "host3:100" };
    string[] firstRacks = { "/rack1/", "/rack1/", "/rack2/" };

    string[] secondHosts = { "host4", "host5", "host6" };
    string[] secondNames = { "host4:100", "host5:100", "host6:100" };
    string[] secondRacks = { "/rack2/", "/rack3/", "/rack3/" };

    string[] thirdHosts = { "host1", "host7", "host8" };
    string[] thirdNames = { "host1:100", "host7:100", "host8:100" };
    string[] thirdRacks = { "/rack1/", "/rack4/", "/rack4/" };

    // A topology path is the rack prefix followed by the "host:port" name.
    string[] firstPaths = new string[replicationFactor];
    string[] secondPaths = new string[replicationFactor];
    string[] thirdPaths = new string[replicationFactor];
    for (int replica = 0; replica != replicationFactor; ++replica)
    {
        firstPaths[replica] = firstRacks[replica] + firstNames[replica];
        secondPaths[replica] = secondRacks[replica] + secondNames[replica];
        thirdPaths[replica] = thirdRacks[replica] + thirdNames[replica];
    }

    int secondOffset = block1Size;
    int thirdOffset = block1Size + block2Size;

    BlockLocation[] bs = new BlockLocation[]
    {
        new BlockLocation(firstNames, firstHosts, firstPaths, 0, block1Size),
        new BlockLocation(secondNames, secondHosts, secondPaths, secondOffset, block2Size),
        new BlockLocation(thirdNames, thirdHosts, thirdPaths, thirdOffset, block3Size)
    };

    SequenceFileInputFormat<string, string> sif = new SequenceFileInputFormat<string, string>();
    string[] hosts = sif.GetSplitHosts(bs, 0, fileSize, clusterMap);

    // Summing the size of every block that has a replica on a rack gives:
    //   rack1: 100 + 75  = 175
    //   rack2: 100 + 150 = 250  (NOTE(review): the original comment said 275 - confirm)
    //   rack3: 150
    // So the rack2 hosts, host4 and host3, are expected first even though
    // neither of them makes the highest individual contribution.
    NUnit.Framework.Assert.IsTrue(hosts.Length == replicationFactor);
    NUnit.Framework.Assert.IsTrue(Sharpen.Runtime.EqualsIgnoreCase(hosts[0], "host4"));
    NUnit.Framework.Assert.IsTrue(Sharpen.Runtime.EqualsIgnoreCase(hosts[1], "host3"));
    NUnit.Framework.Assert.IsTrue(Sharpen.Runtime.EqualsIgnoreCase(hosts[2], "host1"));

    // Rebuild the same blocks without any topology paths.
    bs[0] = new BlockLocation(firstNames, firstHosts, 0, block1Size);
    bs[1] = new BlockLocation(secondNames, secondHosts, secondOffset, block2Size);
    bs[2] = new BlockLocation(thirdNames, thirdHosts, thirdOffset, block3Size);
    hosts = sif.GetSplitHosts(bs, 0, fileSize, clusterMap);

    // host1 holds replicas of blocks 1 and 3 (100 + 75), the largest total of
    // any single host, so it is expected at the front of the list.
    NUnit.Framework.Assert.IsTrue(hosts.Length == replicationFactor);
    NUnit.Framework.Assert.IsTrue(Sharpen.Runtime.EqualsIgnoreCase(hosts[0], "host1"));
}