/// <summary>
/// Use the input splits to take samples of the input and generate sample keys.
/// </summary>
/// <remarks>
/// Use the input splits to take samples of the input and generate sample keys.
/// By default reads 100,000 keys from 10 locations in the input, sorts them and
/// picks N-1 keys to generate N equally sized partitions.
/// </remarks>
/// <param name="job">the job to sample</param>
/// <param name="partFile">where to write the output file to</param>
/// <exception cref="System.Exception">if something goes wrong</exception>
public static void WritePartitionFile(JobContext job, Path partFile)
{
    long t1 = Runtime.CurrentTimeMillis();
    Configuration conf = job.GetConfiguration();
    TeraInputFormat inFormat = new TeraInputFormat();
    TeraInputFormat.TextSampler sampler = new TeraInputFormat.TextSampler();
    int partitions = job.GetNumReduceTasks();
    long sampleSize = conf.GetLong(SampleSize, 100000);
    IList<InputSplit> splits = inFormat.GetSplits(job);
    long t2 = Runtime.CurrentTimeMillis();
    System.Console.Out.WriteLine("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.Min(conf.GetInt(NumPartitions, 10), splits.Count);
    System.Console.Out.WriteLine("Sampling " + samples + " splits of " + splits.Count);
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.Count / samples;
    Sharpen.Thread[] samplerReader = new Sharpen.Thread[samples];
    TeraInputFormat.SamplerThreadGroup threadGroup = new TeraInputFormat.SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i)
    {
        int idx = i;
        samplerReader[i] = new _Thread_140(job, inFormat, splits, sampleStep, idx, sampler, recordsPerSample, threadGroup, "Sampler Reader " + idx);
        samplerReader[i].Start();
    }
    FileSystem outFs = partFile.GetFileSystem(conf);
    DataOutputStream writer = outFs.Create(partFile, true, 64 * 1024, (short)10, outFs.GetDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++)
    {
        try
        {
            samplerReader[i].Join();
        }
        catch (Exception)
        {
            // ignore interruption while waiting for the sampler thread
        }
        // propagate any failure recorded by a sampler thread; this check must
        // sit outside the catch block above so it is not silently swallowed
        if (threadGroup.GetThrowable() != null)
        {
            throw threadGroup.GetThrowable();
        }
    }
    foreach (Text split in sampler.CreatePartitions(partitions))
    {
        split.Write(writer);
    }
    writer.Close();
    long t3 = Runtime.CurrentTimeMillis();
    System.Console.Out.WriteLine("Computing partitions took " + (t3 - t2) + "ms");
}
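// The cut-point selection in TextSampler.CreatePartitions follows the classic
// sample-sort recipe: sort the sampled keys, then take the key at each
// proportional boundary i * count / N, yielding N-1 cut keys for N partitions.
// Below is a minimal standalone sketch of that idea, using plain strings in
// place of Hadoop Text; SelectCutPoints is a hypothetical helper written for
// illustration, not part of TeraInputFormat.
using System;
using System.Collections.Generic;

static class CutPointSketch
{
    // hypothetical helper: pick N-1 cut keys splitting the sampled key
    // space into N roughly equal ranges
    static List<string> SelectCutPoints(List<string> samples, int partitions)
    {
        samples.Sort(StringComparer.Ordinal);
        var cuts = new List<string>();
        for (int i = 1; i < partitions; i++)
        {
            // key at the proportional boundary i * |samples| / N
            cuts.Add(samples[(int)((long)i * samples.Count / partitions)]);
        }
        return cuts;
    }

    static void Main()
    {
        var samples = new List<string> { "pear", "fig", "kiwi", "apple", "plum", "date", "lime", "mango" };
        // prints fig, lime, pear: three cut keys defining four key ranges
        foreach (string cut in SelectCutPoints(samples, 4))
        {
            Console.WriteLine(cut);
        }
    }
}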
/// <exception cref="System.Exception"/>
public virtual int Run(string[] args)
{
    Log.Info("starting");
    Job job = Job.GetInstance(GetConf());
    Path inputDir = new Path(args[0]);
    Path outputDir = new Path(args[1]);
    bool useSimplePartitioner = GetUseSimplePartitioner(job);
    TeraInputFormat.SetInputPaths(job, inputDir);
    FileOutputFormat.SetOutputPath(job, outputDir);
    job.SetJobName("TeraSort");
    job.SetJarByClass(typeof(TeraSort));
    job.SetOutputKeyClass(typeof(Text));
    job.SetOutputValueClass(typeof(Text));
    job.SetInputFormatClass(typeof(TeraInputFormat));
    job.SetOutputFormatClass(typeof(TeraOutputFormat));
    if (useSimplePartitioner)
    {
        job.SetPartitionerClass(typeof(TeraSort.SimplePartitioner));
    }
    else
    {
        long start = Runtime.CurrentTimeMillis();
        Path partitionFile = new Path(outputDir, TeraInputFormat.PartitionFilename);
        URI partitionUri = new URI(partitionFile.ToString() + "#" + TeraInputFormat.PartitionFilename);
        try
        {
            TeraInputFormat.WritePartitionFile(job, partitionFile);
        }
        catch (Exception e)
        {
            Log.Error(e.Message);
            return -1;
        }
        job.AddCacheFile(partitionUri);
        long end = Runtime.CurrentTimeMillis();
        System.Console.Out.WriteLine("Spent " + (end - start) + "ms computing partitions.");
        job.SetPartitionerClass(typeof(TeraSort.TotalOrderPartitioner));
    }
    job.GetConfiguration().SetInt("dfs.replication", GetOutputReplication(job));
    TeraOutputFormat.SetFinalSync(job, true);
    int ret = job.WaitForCompletion(true) ? 0 : 1;
    Log.Info("done");
    return ret;
}
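// The "#" fragment on partitionUri asks the distributed cache to expose the
// partition file in each task's working directory under the fixed name
// TeraInputFormat.PartitionFilename, which is where TotalOrderPartitioner
// looks for it. Assuming this port keeps Hadoop's Tool/ToolRunner entry-point
// convention (an assumption about this codebase; upstream TeraSort's main
// delegates this way), a driver would call Run roughly as follows:
public static void Main(string[] args)
{
    // args[0] = input directory, args[1] = output directory
    int res = ToolRunner.Run(new Configuration(), new TeraSort(), args);
    System.Environment.Exit(res);
}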
public _Thread_140(JobContext job, TeraInputFormat inFormat, IList<InputSplit> splits, int sampleStep, int idx, TeraInputFormat.TextSampler sampler, long recordsPerSample, ThreadGroup baseArg1, string baseArg2)
    : base(baseArg1, baseArg2)
{
    this.job = job;
    this.inFormat = inFormat;
    this.splits = splits;
    this.sampleStep = sampleStep;
    this.idx = idx;
    this.sampler = sampler;
    this.recordsPerSample = recordsPerSample;
    // run as a daemon so a stuck sampler reader never blocks JVM shutdown
    this.SetDaemon(true);
}
/// <exception cref="System.Exception"/>
public virtual int Run(string[] args)
{
    Job job = Job.GetInstance(GetConf());
    if (args.Length != 2)
    {
        Usage();
        return 2;
    }
    TeraInputFormat.SetInputPaths(job, new Path(args[0]));
    FileOutputFormat.SetOutputPath(job, new Path(args[1]));
    job.SetJobName("TeraSum");
    job.SetJarByClass(typeof(TeraChecksum));
    job.SetMapperClass(typeof(TeraChecksum.ChecksumMapper));
    job.SetReducerClass(typeof(TeraChecksum.ChecksumReducer));
    job.SetOutputKeyClass(typeof(NullWritable));
    job.SetOutputValueClass(typeof(Unsigned16));
    // force a single reducer
    job.SetNumReduceTasks(1);
    job.SetInputFormatClass(typeof(TeraInputFormat));
    return job.WaitForCompletion(true) ? 0 : 1;
}
/// <exception cref="System.Exception"/>
public virtual int Run(string[] args)
{
    Job job = Job.GetInstance(GetConf());
    if (args.Length != 2)
    {
        Usage();
        return 1;
    }
    TeraInputFormat.SetInputPaths(job, new Path(args[0]));
    FileOutputFormat.SetOutputPath(job, new Path(args[1]));
    job.SetJobName("TeraValidate");
    job.SetJarByClass(typeof(TeraValidate));
    job.SetMapperClass(typeof(TeraValidate.ValidateMapper));
    job.SetReducerClass(typeof(TeraValidate.ValidateReducer));
    job.SetOutputKeyClass(typeof(Text));
    job.SetOutputValueClass(typeof(Text));
    // force a single reducer
    job.SetNumReduceTasks(1);
    // force a single split per file, so each mapper reads its file's keys in order
    FileInputFormat.SetMinInputSplitSize(job, long.MaxValue);
    job.SetInputFormatClass(typeof(TeraInputFormat));
    return job.WaitForCompletion(true) ? 0 : 1;
}