예제 #1
0
            /// <summary>From each split sampled, take the first numSamples / numSplits records.</summary>
            /// <remarks>
            /// At most <c>maxSplitsSampled</c> splits are read, picked at a fixed stride across the
            /// split array. The record quota is cumulative: while reading split <c>i</c> the loop stops
            /// once the running total reaches <c>(i + 1) * samplesPerSplit</c>, so quota left unused by
            /// a short split is available to the splits that follow.
            /// </remarks>
            /// <param name="inf">input format that supplies the splits and their record readers</param>
            /// <param name="job">job configuration handed to the input format</param>
            /// <returns>the collected keys as an array</returns>
            /// <exception cref="System.IO.IOException"/>
            public virtual K[] GetSample(InputFormat <K, V> inf, JobConf job)
            {
                // ArrayList::toArray doesn't preserve type
                InputSplit[] splits          = inf.GetSplits(job, job.GetNumMapTasks());
                AList <K>    samples         = new AList <K>(numSamples);
                int          splitsToSample  = Math.Min(maxSplitsSampled, splits.Length);
                // NOTE(review): both divisions throw DivideByZeroException when splitsToSample is 0
                // (no splits, or maxSplitsSampled == 0); behavior left unchanged here.
                int          splitStep       = splits.Length / splitsToSample;
                int          samplesPerSplit = numSamples / splitsToSample;
                long         records         = 0;

                for (int i = 0; i < splitsToSample; ++i)
                {
                    RecordReader <K, V> reader = inf.GetRecordReader(splits[i * splitStep], job, Reporter
                                                                     .Null);
                    try
                    {
                        K key   = reader.CreateKey();
                        V value = reader.CreateValue();
                        while (reader.Next(key, value))
                        {
                            samples.AddItem(key);
                            // a new key instance is requested for every stored sample
                            key = reader.CreateKey();
                            ++records;
                            if ((i + 1) * samplesPerSplit <= records)
                            {
                                break;
                            }
                        }
                    }
                    finally
                    {
                        // release the reader even when reading fails part-way through a split
                        reader.Close();
                    }
                }
                return((K[])Sharpen.Collections.ToArray(samples));
            }
예제 #2
0
            /// <summary>
            /// Randomize the split order, then take the specified number of keys from
            /// each split sampled, where each key is selected with the specified
            /// probability and possibly replaced by a subsequently selected key when
            /// the quota of keys from that split is satisfied.
            /// </summary>
            /// <remarks>
            /// Once <c>numSamples</c> keys have been collected, each further selected key overwrites a
            /// randomly chosen existing sample and <c>freq</c> is scaled down by
            /// <c>(numSamples - 1) / numSamples</c>. <c>freq</c> is not a local of this method, so the
            /// reduced value remains in effect after the call returns.
            /// </remarks>
            /// <param name="inf">input format that supplies the splits and their record readers</param>
            /// <param name="job">job configuration handed to the input format</param>
            /// <returns>the collected keys as an array</returns>
            /// <exception cref="System.IO.IOException"/>
            public virtual K[] GetSample(InputFormat <K, V> inf, JobConf job)
            {
                // ArrayList::toArray doesn't preserve type
                InputSplit[] splits         = inf.GetSplits(job, job.GetNumMapTasks());
                AList <K>    samples        = new AList <K>(numSamples);
                int          splitsToSample = Math.Min(maxSplitsSampled, splits.Length);
                Random       r    = new Random();
                long         seed = r.NextLong();

                // re-seed with a value we can log, so the seed of a run is visible in debug output
                r.SetSeed(seed);
                Log.Debug("seed: " + seed);
                // shuffle splits
                for (int i = 0; i < splits.Length; ++i)
                {
                    InputSplit tmp = splits[i];
                    int        j   = r.Next(splits.Length);
                    splits[i] = splits[j];
                    splits[j] = tmp;
                }
                // our target rate is in terms of the maximum number of sample splits,
                // but we accept the possibility of sampling additional splits to hit
                // the target sample keyset
                for (int i_1 = 0; i_1 < splitsToSample || (i_1 < splits.Length && samples.Count <
                                                           numSamples); ++i_1)
                {
                    RecordReader <K, V> reader = inf.GetRecordReader(splits[i_1], job, Reporter.Null);
                    try
                    {
                        K key   = reader.CreateKey();
                        V value = reader.CreateValue();
                        while (reader.Next(key, value))
                        {
                            if (r.NextDouble() <= freq)
                            {
                                if (samples.Count < numSamples)
                                {
                                    samples.AddItem(key);
                                }
                                else
                                {
                                    // When exceeding the maximum number of samples, replace a
                                    // random element with this one, then adjust the frequency
                                    // to reflect the possibility of existing elements being
                                    // pushed out.
                                    // Next(numSamples) has an exclusive upper bound, so the index
                                    // is always a valid position and needs no extra range check.
                                    int ind = r.Next(numSamples);
                                    samples.Set(ind, key);
                                    freq *= (numSamples - 1) / (double)numSamples;
                                }
                                // only a key that was stored needs replacing with a new instance
                                key = reader.CreateKey();
                            }
                        }
                    }
                    finally
                    {
                        // release the reader even when reading fails part-way through a split
                        reader.Close();
                    }
                }
                return((K[])Sharpen.Collections.ToArray(samples));
            }
예제 #3
0
 /// <summary>
 /// Wraps <paramref name="rr"/>, allocating the head key/value pair and the comparator
 /// before making the first call to <c>Next()</c>.
 /// </summary>
 /// <param name="id">identifier stored on this reader</param>
 /// <param name="rr">the record reader being wrapped</param>
 /// <param name="cmpcl">comparator type to instantiate, or null to look one up from the key type</param>
 /// <param name="conf">configuration to use; a new one is created when null</param>
 /// <exception cref="System.IO.IOException"/>
 internal WrappedRecordReader(int id, RecordReader <K, U> rr, Type cmpcl, Configuration
                              conf)
 {
     this.id   = id;
     this.rr   = rr;
     this.conf = conf ?? new Configuration();

     // head key/value instances obtained from the wrapped reader
     khead = rr.CreateKey();
     vhead = rr.CreateValue();

     try
     {
         // an explicit comparator type wins; otherwise fall back to the key type's comparator
         cmp = (cmpcl != null)
                   ? System.Activator.CreateInstance(cmpcl)
                   : WritableComparator.Get(khead.GetType(), this.conf);
     }
     catch (InstantiationException instantiationFailure)
     {
         throw (IOException)Sharpen.Extensions.InitCause(new IOException(), instantiationFailure);
     }
     catch (MemberAccessException accessFailure)
     {
         throw (IOException)Sharpen.Extensions.InitCause(new IOException(), accessFailure);
     }

     vjoin = new StreamBackedIterator <U>();
     // invoked last, once every field above has been assigned
     Next();
 }
예제 #4
0
        /// <summary>
        /// Reads <paramref name="split"/> through <paramref name="format"/> and gathers the value
        /// of each record into a list.
        /// </summary>
        /// <param name="format">input format used to open the split</param>
        /// <param name="split">the split to read</param>
        /// <param name="job">job configuration passed to the record reader</param>
        /// <returns>the values in the order they were read</returns>
        /// <exception cref="System.IO.IOException"/>
        private static IList <Text> ReadSplit(KeyValueTextInputFormat format, InputSplit split
                                              , JobConf job)
        {
            IList <Text> values = new AList <Text>();
            RecordReader <Text, Text> splitReader = null;

            try
            {
                splitReader = format.GetRecordReader(split, job, voidReporter);
                Text currentKey = splitReader.CreateKey();
                // every stored value gets its own instance; a new one is requested after each record
                for (Text currentValue = splitReader.CreateValue();
                     splitReader.Next(currentKey, currentValue);
                     currentValue = (Text)splitReader.CreateValue())
                {
                    values.AddItem(currentValue);
                }
            }
            finally
            {
                // the reader stays null if GetRecordReader itself failed
                if (splitReader != null)
                {
                    splitReader.Close();
                }
            }
            return values;
        }
예제 #5
0
        /// <summary>test DBInputFormat class.</summary>
        /// <remarks>
        /// Checks that <c>GetSplits</c> with three requested maps yields three splits, the first of
        /// length 5, and runs a few basic checks against a freshly opened record reader.
        /// </remarks>
        /// <exception cref="System.Exception"/>
        public virtual void TestDBInputFormat()
        {
            JobConf configuration = new JobConf();

            SetupDriver(configuration);
            DBInputFormat <DBInputFormat.NullDBWritable> format = new DBInputFormat <DBInputFormat.NullDBWritable
                                                                                     >();

            // NOTE(review): SetConf is called twice; kept as-is in case the repetition is
            // intentional - confirm and drop the duplicate if it is not.
            format.SetConf(configuration);
            format.SetConf(configuration);
            DBInputFormat.DBInputSplit splitter = new DBInputFormat.DBInputSplit(1, 10);
            Reporter reporter = Org.Mockito.Mockito.Mock <Reporter>();
            RecordReader <LongWritable, DBInputFormat.NullDBWritable> reader = format.GetRecordReader
                                                                                   (splitter, configuration, reporter);

            try
            {
                configuration.SetInt(MRJobConfig.NumMaps, 3);
                InputSplit[] lSplits = format.GetSplits(configuration, 3);
                // assert the count first so a wrong split count fails as an assertion
                // rather than as an index-out-of-range on lSplits[0]
                NUnit.Framework.Assert.AreEqual(3, lSplits.Length);
                NUnit.Framework.Assert.AreEqual(5, lSplits[0].GetLength());
                // test reader .Some simple tests
                NUnit.Framework.Assert.AreEqual(typeof(LongWritable), reader.CreateKey().GetType(
                                                    ));
                NUnit.Framework.Assert.AreEqual(0, reader.GetPos());
                NUnit.Framework.Assert.AreEqual(0, reader.GetProgress(), 0.001);
            }
            finally
            {
                // close the reader even when one of the assertions above fails
                reader.Close();
            }
        }
예제 #6
0
        /// <summary>
        /// Reads every record of the benchmark file and reports how long the read loop took.
        /// </summary>
        /// <remarks>
        /// The file is the entry named by the <c>test.filebench.name</c> setting inside the first
        /// configured input path, read as a single split covering the whole file. Only the
        /// <c>Next</c> loop is timed; opening the reader and allocating key/value are excluded.
        /// </remarks>
        /// <param name="conf">job configuration providing the input format, input path and file name</param>
        /// <returns>elapsed time of the read loop, in milliseconds</returns>
        /// <exception cref="System.IO.IOException"/>
        internal static long ReadBench(JobConf conf)
        {
            // InputFormat instantiation
            InputFormat  inf = conf.GetInputFormat();
            string       fn  = conf.Get("test.filebench.name", string.Empty);
            Path         pin = new Path(FileInputFormat.GetInputPaths(conf)[0], fn);
            FileStatus   @in = pin.GetFileSystem(conf).GetFileStatus(pin);
            RecordReader rr  = inf.GetRecordReader(new FileSplit(pin, 0, @in.GetLen(), (string
                                                                                        [])null), conf, Reporter.Null);

            try
            {
                object key = rr.CreateKey();
                object val = rr.CreateValue();
                // `new DateTime()` is DateTime.MinValue, not the current time, so two of them
                // always differ by zero; a Stopwatch measures the elapsed time instead.
                System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();
                while (rr.Next(key, val))
                {
                }
                timer.Stop();
                return(timer.ElapsedMilliseconds);
            }
            finally
            {
                rr.Close();
            }
        }
예제 #7
0
        /// <summary>Run the map task.</summary>
        /// <remarks>
        /// Creates the <c>Application</c>, sends <c>RunMap</c> over its downlink and, when the job
        /// uses a Java record reader, forwards each input record with <c>MapItem</c>. Any exception
        /// raised while feeding records or waiting for completion is passed to
        /// <c>Application.Abort</c>; <c>Cleanup</c> always runs.
        /// </remarks>
        /// <param name="input">the set of inputs</param>
        /// <param name="output">the object to collect the outputs of the map</param>
        /// <param name="reporter">the object to update with status</param>
        /// <exception cref="System.IO.IOException"/>
        public override void Run(RecordReader <K1, V1> input, OutputCollector <K2, V2> output
                                 , Reporter reporter)
        {
            Application <K1, V1, K2, V2> app = null;

            try
            {
                // the input is handed to the application only when neither the record reader
                // nor the mapper is a Java one; otherwise null is passed
                RecordReader <FloatWritable, NullWritable> passedInput = null;
                if (!Submitter.GetIsJavaRecordReader(job) && !Submitter.GetIsJavaMapper(job))
                {
                    passedInput = (RecordReader <FloatWritable, NullWritable>)input;
                }
                Type outputKeyType   = (Type)job.GetOutputKeyClass();
                Type outputValueType = (Type)job.GetOutputValueClass();
                app = new Application <K1, V1, K2, V2>(job, passedInput, output, reporter,
                                                       outputKeyType, outputValueType);
            }
            catch (Exception startupFailure)
            {
                throw new RuntimeException("interrupted", startupFailure);
            }

            DownwardProtocol <K1, V1> toChild = app.GetDownlink();
            bool javaReader = Submitter.GetIsJavaRecordReader(job);

            toChild.RunMap(reporter.GetInputSplit(), job.GetNumReduceTasks(), javaReader);
            bool flushPerRecord = job.GetBoolean(MRJobConfig.SkipRecords, false);

            try
            {
                if (javaReader)
                {
                    // one key and one value instance serve every record
                    K1 sharedKey   = input.CreateKey();
                    V1 sharedValue = input.CreateValue();
                    toChild.SetInputTypes(sharedKey.GetType().FullName, sharedValue.GetType().FullName);
                    while (input.Next(sharedKey, sharedValue))
                    {
                        toChild.MapItem(sharedKey, sharedValue);
                        if (!flushPerRecord)
                        {
                            continue;
                        }
                        // in skip mode, flush after each record so that records around a bad
                        // one are not left sitting in the buffer
                        toChild.Flush();
                    }
                    toChild.EndOfInput();
                }
                app.WaitForFinish();
            }
            catch (Exception failure)
            {
                app.Abort(failure);
            }
            finally
            {
                app.Cleanup();
            }
        }
        /// <summary>
        /// Reads <paramref name="split"/> through <paramref name="format"/> and gathers the value
        /// of each record into a list.
        /// </summary>
        /// <param name="format">input format used to open the split</param>
        /// <param name="split">the split to read</param>
        /// <param name="jobConf">job configuration passed to the record reader</param>
        /// <returns>the values in the order they were read</returns>
        /// <exception cref="System.IO.IOException"/>
        private static IList <Text> ReadSplit(TextInputFormat format, InputSplit split, JobConf
                                              jobConf)
        {
            IList <Text> result = new AList <Text>();
            RecordReader <LongWritable, Text> reader = format.GetRecordReader(split, jobConf,
                                                                              voidReporter);

            try
            {
                LongWritable key   = reader.CreateKey();
                Text         value = reader.CreateValue();
                while (reader.Next(key, value))
                {
                    result.AddItem(value);
                    // request a new instance for the next record; presumably Next() fills the
                    // object it is given, which would otherwise overwrite the stored entry
                    value = reader.CreateValue();
                }
            }
            finally
            {
                // close the reader even when reading fails
                reader.Close();
            }
            return(result);
        }
예제 #9
0
        // A reporter that does nothing
        /// <summary>
        /// Splits the job input with <c>NLineInputFormat</c> and verifies every split except the
        /// last: no locations, a <c>LineRecordReader</c> with <c>LongWritable</c> keys and
        /// <c>Text</c> values, and exactly <paramref name="expectedN"/> lines.
        /// </summary>
        /// <param name="job">job configuration describing the input</param>
        /// <param name="expectedN">number of lines each checked split must contain</param>
        /// <exception cref="System.IO.IOException"/>
        internal virtual void CheckFormat(JobConf job, int expectedN)
        {
            NLineInputFormat format = new NLineInputFormat();

            format.Configure(job);
            // the split-count argument is passed as 1 (the original names it "ignored")
            InputSplit[] splits = format.GetSplits(job, 1);

            // check all splits except last one
            int lastIndex = splits.Length - 1;
            for (int splitIndex = 0; splitIndex < lastIndex; splitIndex++)
            {
                InputSplit current = splits[splitIndex];
                NUnit.Framework.Assert.AreEqual("There are no split locations", 0,
                                                current.GetLocations().Length);

                RecordReader <LongWritable, Text> lineReader = format.GetRecordReader(current, job,
                                                                                      voidReporter);
                NUnit.Framework.Assert.AreEqual("reader class is LineRecordReader.",
                                                typeof(LineRecordReader), lineReader.GetType());

                LongWritable lineKey = lineReader.CreateKey();
                NUnit.Framework.Assert.AreEqual("Key class is LongWritable.", typeof(LongWritable),
                                                lineKey.GetType());

                Text lineValue = lineReader.CreateValue();
                NUnit.Framework.Assert.AreEqual("Value class is Text.", typeof(Text),
                                                lineValue.GetType());

                int linesRead = 0;
                try
                {
                    while (lineReader.Next(lineKey, lineValue))
                    {
                        linesRead++;
                    }
                }
                finally
                {
                    lineReader.Close();
                }
                NUnit.Framework.Assert.AreEqual("number of lines in split is " + expectedN, expectedN,
                                                linesRead);
            }
        }
예제 #10
0
        /// <summary>
        /// Reads <paramref name="split"/> through <paramref name="format"/> and returns each
        /// record's bytes converted to a string.
        /// </summary>
        /// <param name="format">input format used to open the split</param>
        /// <param name="split">the split to read</param>
        /// <param name="job">job configuration passed to the record reader</param>
        /// <returns>one string per record, in the order read</returns>
        /// <exception cref="System.IO.IOException"/>
        private static IList <string> ReadSplit(FixedLengthInputFormat format, InputSplit
                                                split, JobConf job)
        {
            IList <string> records = new AList <string>();
            RecordReader <LongWritable, BytesWritable> recordReader = format.GetRecordReader(split,
                                                                                             job, voidReporter);
            LongWritable  offsetKey   = recordReader.CreateKey();
            BytesWritable recordBytes = recordReader.CreateValue();

            try
            {
                while (recordReader.Next(offsetKey, recordBytes))
                {
                    // convert only the first GetLength() bytes of the buffer
                    string text = Sharpen.Runtime.GetStringForBytes(recordBytes.GetBytes(), 0,
                                                                    recordBytes.GetLength());
                    records.AddItem(text);
                }
            }
            finally
            {
                recordReader.Close();
            }
            return records;
        }
예제 #11
0
 /// <summary>
 /// Feeds every record from <paramref name="input"/> to the mapper, then closes the mapper.
 /// </summary>
 /// <param name="input">source of key/value records</param>
 /// <param name="output">collector handed to each <c>Map</c> call</param>
 /// <param name="reporter">reporter handed to each <c>Map</c> call and used for the counter</param>
 /// <exception cref="System.IO.IOException"/>
 public virtual void Run(RecordReader <K1, V1> input, OutputCollector <K2, V2> output
                         , Reporter reporter)
 {
     try
     {
         // a single key instance and a single value instance serve all records
         K1 sharedKey   = input.CreateKey();
         V1 sharedValue = input.CreateValue();
         while (input.Next(sharedKey, sharedValue))
         {
             mapper.Map(sharedKey, sharedValue, output, reporter);
             if (!incrProcCount)
             {
                 continue;
             }
             // count the record as processed
             reporter.IncrCounter(SkipBadRecords.CounterGroup,
                                  SkipBadRecords.CounterMapProcessedRecords, 1);
         }
     }
     finally
     {
         mapper.Close();
     }
 }
예제 #12
0
        /// <summary>
        /// Writes sequence files with a range of entry counts, splits each one three times with a
        /// random requested split count, and checks through <c>SequenceFileAsTextInputFormat</c>
        /// that no key shows up in two splits and that every key is read.
        /// </summary>
        /// <exception cref="System.Exception"/>
        public virtual void TestFormat()
        {
            JobConf    jobConf      = new JobConf(conf);
            FileSystem localFs      = FileSystem.GetLocal(conf);
            Path       dataDir      = new Path(Runtime.GetProperty("test.build.data", ".") + "/mapred");
            Path       seqFile      = new Path(dataDir, "test.seq");
            Reporter   nullReporter = Reporter.Null;
            int        seed         = new Random().Next();
            Random     rng          = new Random(seed);

            localFs.Delete(dataDir, true);
            FileInputFormat.SetInputPaths(jobConf, dataDir);

            // walk through a variety of entry counts, advancing by a random step each round
            int entryCount = 0;
            while (entryCount < MaxLength)
            {
                // create a file holding entryCount entries
                SequenceFile.Writer writer = SequenceFile.CreateWriter(localFs, conf, seqFile,
                                                                       typeof(IntWritable), typeof(LongWritable));
                try
                {
                    for (int n = 0; n < entryCount; n++)
                    {
                        IntWritable  entryKey   = new IntWritable(n);
                        LongWritable entryValue = new LongWritable(10 * n);
                        writer.Append(entryKey, entryValue);
                    }
                }
                finally
                {
                    writer.Close();
                }

                // split the file in a variety of sizes
                InputFormat <Text, Text> format = new SequenceFileAsTextInputFormat();
                for (int attempt = 0; attempt < 3; attempt++)
                {
                    int requestedSplits = rng.Next(MaxLength / (SequenceFile.SyncInterval / 20)) + 1;
                    InputSplit[] splits = format.GetSplits(jobConf, requestedSplits);

                    // one bit per key that has been read from any split
                    BitSet seenKeys = new BitSet(entryCount);
                    foreach (InputSplit split in splits)
                    {
                        RecordReader <Text, Text> reader = format.GetRecordReader(split, jobConf,
                                                                                  nullReporter);
                        NUnit.Framework.Assert.AreEqual("reader class is SequenceFileAsTextRecordReader.",
                                                        typeof(SequenceFileAsTextRecordReader), reader.GetType());
                        Text textValue = reader.CreateValue();
                        Text textKey   = reader.CreateKey();
                        try
                        {
                            int readCount = 0;
                            while (reader.Next(textKey, textValue))
                            {
                                int keyNumber = System.Convert.ToInt32(textKey.ToString());
                                NUnit.Framework.Assert.IsFalse("Key in multiple partitions.",
                                                               seenKeys.Get(keyNumber));
                                seenKeys.Set(keyNumber);
                                readCount++;
                            }
                        }
                        finally
                        {
                            reader.Close();
                        }
                    }
                    NUnit.Framework.Assert.AreEqual("Some keys in no partition.", entryCount,
                                                    seenKeys.Cardinality());
                }

                entryCount += rng.Next(MaxLength / 10) + 1;
            }
        }
예제 #13
0
 /// <summary>
 /// Dispatches each input record to the executor service as a separate mapper invocation,
 /// then shuts the executor down and waits for the outstanding invocations to finish.
 /// </summary>
 /// <remarks>
 /// <c>CheckForExceptionsFromProcessingThreads</c> is called after every dispatch, on every
 /// wait iteration and once more after termination. The mapper is closed in all cases.
 /// </remarks>
 /// <param name="input">source of key/value records</param>
 /// <param name="output">collector passed to each mapper invocation</param>
 /// <param name="reporter">reporter passed to each mapper invocation</param>
 /// <exception cref="System.IO.IOException"/>
 public virtual void Run(RecordReader <K1, V1> input, OutputCollector <K2, V2> output
                         , Reporter reporter)
 {
     try
     {
         // allocate key & value instances these objects will not be reused
         // because execution of Mapper.map is not serialized.
         K1 key   = input.CreateKey();
         V1 value = input.CreateValue();
         while (input.Next(key, value))
         {
             executorService.Execute(new MultithreadedMapRunner.MapperInvokeRunable(this, key,
                                                                                    value, output, reporter));
             CheckForExceptionsFromProcessingThreads();
             // Allocate new key & value instances as mapper is running in parallel
             key   = input.CreateKey();
             value = input.CreateValue();
         }
         if (Log.IsDebugEnabled())
         {
             Log.Debug("Finished dispatching all Mappper.map calls, job " + job.GetJobName());
         }
         // Graceful shutdown of the thread pool; it lets all scheduled
         // Runnables run to completion.
         executorService.Shutdown();
         try
         {
             // Now waiting for all Runnables to end, polling in 100 ms intervals.
             while (!executorService.AwaitTermination(100, TimeUnit.Milliseconds))
             {
                 if (Log.IsDebugEnabled())
                 {
                     Log.Debug("Awaiting all running Mappper.map calls to finish, job " + job.GetJobName
                                   ());
                 }
                 // NOTE: while Mapper.map dispatching has concluded there are still
                 // map calls in progress and exceptions would be thrown.
                 CheckForExceptionsFromProcessingThreads();
             }
             // NOTE: it could be that a map call has had an exception after the
             // call for awaitTermination() returning true. An edge case, but it
             // could happen.
             CheckForExceptionsFromProcessingThreads();
         }
         catch (IOException ioEx)
         {
             // Forcing a shutdown of all threads of the thread pool and rethrowing
             // the IOException
             executorService.ShutdownNow();
             throw;
         }
         catch (Exception iEx)
         {
             // any other exception raised while waiting is wrapped in a RuntimeException
             throw new RuntimeException(iEx);
         }
     }
     finally
     {
         // the mapper is closed whether or not dispatching or waiting failed
         mapper.Close();
     }
 }
예제 #14
0
        /// <summary>
        /// Writes tab-separated text files with a range of line counts, splits each one three
        /// times with a random requested split count, and checks through
        /// <c>KeyValueTextInputFormat</c> that no value shows up in two splits and that every
        /// value is read.
        /// </summary>
        /// <exception cref="System.Exception"/>
        public virtual void TestFormat()
        {
            JobConf jobConf  = new JobConf();
            Path    textFile = new Path(workDir, "test.txt");
            // A reporter that does nothing
            Reporter nullReporter = Reporter.Null;
            int      seed         = new Random().Next();

            Log.Info("seed = " + seed);
            Random rng = new Random(seed);

            localFs.Delete(workDir, true);
            FileInputFormat.SetInputPaths(jobConf, workDir);

            // walk through a variety of line counts, advancing by a random step each round
            int lineCount = 0;
            while (lineCount < MaxLength)
            {
                Log.Debug("creating; entries = " + lineCount);
                // create a file with lineCount lines of "<2n>\t<n>\n"
                TextWriter writer = new OutputStreamWriter(localFs.Create(textFile));
                try
                {
                    for (int n = 0; n < lineCount; n++)
                    {
                        writer.Write(Sharpen.Extensions.ToString(n * 2));
                        writer.Write("\t");
                        writer.Write(Sharpen.Extensions.ToString(n));
                        writer.Write("\n");
                    }
                }
                finally
                {
                    writer.Close();
                }

                // split the file in a variety of sizes
                KeyValueTextInputFormat format = new KeyValueTextInputFormat();
                format.Configure(jobConf);
                for (int attempt = 0; attempt < 3; attempt++)
                {
                    int requestedSplits = rng.Next(MaxLength / 20) + 1;
                    Log.Debug("splitting: requesting = " + requestedSplits);
                    InputSplit[] splits = format.GetSplits(jobConf, requestedSplits);
                    Log.Debug("splitting: got =        " + splits.Length);

                    // one bit per value that has been read from any split
                    BitSet seenValues = new BitSet(lineCount);
                    for (int j = 0; j < splits.Length; j++)
                    {
                        Log.Debug("split[" + j + "]= " + splits[j]);
                        RecordReader <Text, Text> reader = format.GetRecordReader(splits[j], jobConf,
                                                                                  nullReporter);
                        NUnit.Framework.Assert.AreEqual("reader class is KeyValueLineRecordReader.",
                                                        typeof(KeyValueLineRecordReader), reader.GetType());
                        Text textKey   = reader.CreateKey();
                        Text textValue = reader.CreateValue();
                        NUnit.Framework.Assert.AreEqual("Key class is Text.", typeof(Text), textKey.GetType());
                        NUnit.Framework.Assert.AreEqual("Value class is Text.", typeof(Text), textValue.GetType());
                        try
                        {
                            int readCount = 0;
                            while (reader.Next(textKey, textValue))
                            {
                                int number = System.Convert.ToInt32(textValue.ToString());
                                Log.Debug("read " + number);
                                bool alreadySeen = seenValues.Get(number);
                                if (alreadySeen)
                                {
                                    // log where the duplicate was found before the assertion fails
                                    Log.Warn("conflict with " + number + " in split " + j + " at position "
                                             + reader.GetPos());
                                }
                                NUnit.Framework.Assert.IsFalse("Key in multiple partitions.",
                                                               seenValues.Get(number));
                                seenValues.Set(number);
                                readCount++;
                            }
                            Log.Debug("splits[" + j + "]=" + splits[j] + " count=" + readCount);
                        }
                        finally
                        {
                            reader.Close();
                        }
                    }
                    NUnit.Framework.Assert.AreEqual("Some keys in no partition.", lineCount,
                                                    seenValues.Cardinality());
                }

                lineCount += rng.Next(MaxLength / 10) + 1;
            }
        }
 /// <summary>Forwards to <c>delegate_.CreateKey()</c> and returns its result.</summary>
 public virtual K CreateKey()
 {
     K delegatedKey = delegate_.CreateKey();
     return delegatedKey;
 }
예제 #16
0
 /// <summary>Forwards to <c>curReader.CreateKey()</c> and returns its result.</summary>
 public virtual K CreateKey()
 {
     K currentReaderKey = curReader.CreateKey();
     return currentReaderKey;
 }
예제 #17
0
 /// <summary>Asks the proxied record reader <c>rr</c> for a key.</summary>
 /// <returns>whatever <c>rr.CreateKey()</c> returns</returns>
 public virtual K CreateKey()
 {
     K proxiedKey = rr.CreateKey();
     return proxiedKey;
 }