/// <summary>
/// Creates a vertex writer that is bound to exactly one output port.
/// </summary>
/// <param name="denv">Vertex environment; supplies the native handle and the write buffer size.</param>
/// <param name="writerFactory">Factory used to create the record writer for the port.</param>
/// <param name="portNum">The single output port this writer is attached to.</param>
public DryadLinqVertexWriter(VertexEnv denv, DryadLinqFactory<T> writerFactory, UInt32 portNum)
{
    m_dvertexEnv = denv;
    m_nativeHandle = denv.NativeHandle;
    m_startPort = portNum;
    m_numberOfOutputs = 1;
    m_writerFactory = writerFactory;

    // Build the one record writer first, then wrap it in a one-element array.
    Int32 writeBufferSize = m_dvertexEnv.GetWriteBuffSize();
    DryadLinqRecordWriter<T> singleWriter =
        writerFactory.MakeWriter(m_nativeHandle, portNum, writeBufferSize);

    DryadLinqRecordWriter<T>[] writerArray = new DryadLinqRecordWriter<T>[1];
    writerArray[0] = singleWriter;
    m_writers = writerArray;
}
/// <summary>
/// Creates a vertex reader that is bound to exactly one input port.
/// </summary>
/// <param name="denv">Vertex environment; supplies the native handle.</param>
/// <param name="readerFactory">Factory used to create the record reader for the port.</param>
/// <param name="portNum">The single input port this reader is attached to.</param>
public DryadLinqVertexReader(VertexEnv denv, DryadLinqFactory<T> readerFactory, UInt32 portNum)
{
    m_dvertexEnv = denv;
    m_nativeHandle = denv.NativeHandle;
    m_readerFactory = readerFactory;
    m_startPort = portNum;
    m_numberOfInputs = 1;

    // With a single input there is nothing to reorder: the port
    // permutation is just the one entry 0.
    m_keepInputPortOrder = false;
    UInt32[] permutation = new UInt32[1];
    permutation[0] = 0;
    m_portPermArray = permutation;

    DryadLinqRecordReader<T> singleReader = readerFactory.MakeReader(m_nativeHandle, portNum);
    DryadLinqRecordReader<T>[] readerArray = new DryadLinqRecordReader<T>[1];
    readerArray[0] = singleReader;
    m_readers = readerArray;

    m_isUsed = false;
}
/// <summary>
/// Creates a vertex writer that covers the output ports
/// <paramref name="startPort"/> (inclusive) up to <paramref name="endPort"/> (exclusive),
/// with one record writer per port.
/// </summary>
/// <param name="denv">Vertex environment; supplies the native handle and the write buffer size.</param>
/// <param name="writerFactory">Factory used to create each per-port record writer.</param>
/// <param name="startPort">First output port of the range.</param>
/// <param name="endPort">Port one past the last output port of the range.</param>
public DryadLinqVertexWriter(VertexEnv denv, DryadLinqFactory<T> writerFactory, UInt32 startPort, UInt32 endPort)
{
    m_dvertexEnv = denv;
    m_nativeHandle = denv.NativeHandle;
    m_startPort = startPort;
    m_numberOfOutputs = endPort - startPort;
    m_writerFactory = writerFactory;
    m_writers = new DryadLinqRecordWriter<T>[m_numberOfOutputs];

    // Every writer shares the same buffer size; slot k serves port (k + startPort).
    Int32 writeBufferSize = m_dvertexEnv.GetWriteBuffSize();
    UInt32 slot = 0;
    while (slot < m_numberOfOutputs)
    {
        UInt32 port = slot + startPort;
        m_writers[slot] = writerFactory.MakeWriter(m_nativeHandle, port, writeBufferSize);
        slot++;
    }
}
/// <summary>
/// Creates a vertex reader that covers the input ports
/// <paramref name="startPort"/> (inclusive) up to <paramref name="endPort"/> (exclusive),
/// with one record reader per port.
/// </summary>
/// <param name="denv">Vertex environment; supplies the native handle.</param>
/// <param name="readerFactory">Factory used to create each per-port record reader.</param>
/// <param name="startPort">First input port of the range.</param>
/// <param name="endPort">Port one past the last input port of the range.</param>
/// <param name="keepInputPortOrder">
/// When true the port permutation array is left as the identity mapping;
/// when false it is shuffled using a generator seeded with the current process id.
/// </param>
public DryadLinqVertexReader(VertexEnv denv, DryadLinqFactory<T> readerFactory, UInt32 startPort, UInt32 endPort, bool keepInputPortOrder)
{
    this.m_dvertexEnv = denv;
    this.m_nativeHandle = denv.NativeHandle;
    this.m_readerFactory = readerFactory;
    this.m_startPort = startPort;
    this.m_numberOfInputs = endPort - startPort;
    this.m_keepInputPortOrder = keepInputPortOrder;

    // Start from the identity permutation 0, 1, ..., NumberOfInputs - 1.
    this.m_portPermArray = new UInt32[this.NumberOfInputs];
    for (UInt32 i = 0; i < this.NumberOfInputs; i++)
    {
        this.m_portPermArray[i] = i;
    }

    if (!keepInputPortOrder)
    {
        // Process is IDisposable; it is only needed long enough to read the id,
        // so release it immediately instead of leaving it to the finalizer.
        int seed;
        using (System.Diagnostics.Process currentProcess = System.Diagnostics.Process.GetCurrentProcess())
        {
            seed = currentProcess.Id;
        }
        Random rdm = new Random(seed);

        // Fisher-Yates shuffle working from the back: on each step pick a random
        // index in [0, max) and swap it into position max - 1, then shrink max.
        Int32 max = (Int32)this.NumberOfInputs;
        for (UInt32 i = 1; i < this.NumberOfInputs; i++)
        {
            int idx = rdm.Next(max);
            UInt32 n = this.m_portPermArray[max - 1];
            this.m_portPermArray[max - 1] = this.m_portPermArray[idx];
            this.m_portPermArray[idx] = n;
            max--;
        }
    }

    // Reader i is always attached to port startPort + i; only the
    // permutation array is affected by the shuffle above.
    this.m_readers = new DryadLinqRecordReader<T>[this.NumberOfInputs];
    for (UInt32 i = 0; i < this.NumberOfInputs; i++)
    {
        this.m_readers[i] = this.m_readerFactory.MakeReader(this.m_nativeHandle, startPort + i);
    }
    this.m_isUsed = false;
}
/// <summary>
/// Lazily produces a sample of the keys of <paramref name="source"/>.
/// Each record is selected with probability SAMPLE_RATE. If the whole source
/// yields fewer than 10 selected records, the key of every record is emitted instead.
/// </summary>
/// <typeparam name="T">Record type of the source sequence.</typeparam>
/// <typeparam name="K">Key type produced by <paramref name="keySelector"/>.</typeparam>
/// <param name="source">The records to sample.</param>
/// <param name="keySelector">Extracts the key from a record.</param>
/// <param name="denv">Vertex environment; its native handle is used to look up the vertex id that seeds the random generator.</param>
/// <returns>The sampled keys (or all keys when sampling produced too few).</returns>
public static IEnumerable<K> Phase1Sampling<T, K>(IEnumerable<T> source, Func<T, K> keySelector, VertexEnv denv)
{
    // Minimum number of samples required before switching to pure streaming sampling.
    const int MinSampleCount = 10;

    // note: vertexID is constant for each repetition of a specific vertex (eg in fail-and-retry scenarios)
    // this is very good as it ensures the sampling is idempotent w.r.t. retries.
    long vertexID = DryadLinqNative.GetVertexId(denv.NativeHandle);
    int seed = unchecked((int)(vertexID));
    long nEmitted = 0;
    Random rdm = new Random(seed);

    List<K> allSoFar = new List<K>();
    List<K> samples = new List<K>();

    // The enumerator is IDisposable: the using block guarantees it is released both
    // on normal completion and when the consumer abandons this iterator early.
    using (IEnumerator<T> sourceEnumerator = source.GetEnumerator())
    {
        // try to collect MinSampleCount samples, but keep all the records just in case
        while (sourceEnumerator.MoveNext())
        {
            T elem = sourceEnumerator.Current;
            K key = keySelector(elem);
            allSoFar.Add(key);
            if (rdm.NextDouble() < SAMPLE_RATE)
            {
                samples.Add(key);
                if (samples.Count >= MinSampleCount)
                {
                    break;
                }
            }
        }

        if (samples.Count >= MinSampleCount)
        {
            // we have lots of samples.. emit them and continue sampling
            allSoFar = null; // not needed.
            foreach (K key in samples)
            {
                yield return key;
                nEmitted++;
            }

            // Stream the remainder: the key selector only runs for selected records.
            while (sourceEnumerator.MoveNext())
            {
                T elem = sourceEnumerator.Current;
                if (rdm.NextDouble() < SAMPLE_RATE)
                {
                    yield return keySelector(elem);
                    nEmitted++;
                }
            }
        }
        else
        {
            // sampling didn't produce much, so emit all the records instead.
            DryadLinqLog.AddInfo("Sampling produced only {0} records. Emitting all records instead.", samples.Count);
            Debug.Assert(sourceEnumerator.MoveNext() == false, "The source enumerator wasn't finished");
            samples = null; // the samples list is not needed.
            foreach (K key in allSoFar)
            {
                yield return key;
                nEmitted++;
            }
        }
    }

    DryadLinqLog.AddInfo("Stage1 sampling: num keys emitted = {0}", nEmitted);
}