/// <summary>
/// Builds a query that yields the integers 1..200, generated as two runs of
/// 100 consecutive values (1..100 and 101..200).
/// </summary>
/// <param name="context">Context the query is created on.</param>
/// <returns>A query over the 200 generated integers.</returns>
public static IQueryable<int> GetGroupByReduceDataSet(DryadLinqContext context)
{
    // The one-element array is only a seed to start the query from; the Apply
    // lambda ignores its input and emits the ids {0, 1}.
    return context.FromEnumerable(new int[1])
                  .Apply(seed => Enumerable.Range(0, 2))
                  // Hash the ids into 2 partitions.
                  .HashPartition(id => id, 2)
                  // Id 0 expands to 1..100, id 1 expands to 101..200.
                  .SelectMany(id => Enumerable.Range(id * 100 + 1, 100));
}
/// <summary>
/// Builds a query that yields three runs of 1000 consecutive integers:
/// 1..1000, 20001..21000 and 40001..41000 (3000 values in total).
/// </summary>
/// <remarks>
/// The input is deliberately large: the sampler needs enough data to pick
/// some up, and a few thousand values should suffice.
/// </remarks>
/// <param name="context">Context the query is created on.</param>
/// <returns>A query over the 3000 generated integers.</returns>
public static IQueryable<int> GetRangePartitionDataSet(DryadLinqContext context)
{
    // The one-element array is only a seed to start the query from; the Apply
    // lambda ignores its input and emits the ids {0, 1, 2}.
    return context.FromEnumerable(new int[1])
                  .Apply(seed => Enumerable.Range(0, 3))
                  // Hash the ids into 3 partitions.
                  .HashPartition(id => id, 3)
                  // Each id expands to 1000 values starting at id * 20000 + 1.
                  .SelectMany(id => Enumerable.Range(id * 20000 + 1, 1000));
}
/// <summary>
/// Word-count sample: splits every input line on spaces, counts the
/// occurrences of each word and emits one "word: count" line per word.
/// </summary>
/// <remarks>
/// When the <c>local</c> symbol is defined the job runs on the local computer
/// over a single in-memory line and the result is printed to the console.
/// Otherwise it runs against an Azure cluster, reading from and writing to
/// Azure storage; the placeholder strings must be filled in first.
/// </remarks>
public static void WordCountExample()
{
#if local
    // This constructor overload runs the computation locally with a single worker.
    var config = new DryadLinqContext(1);

    // Any IEnumerable source can be turned into an input with FromEnumerable.
    var input = config.FromEnumerable(
        new LineRecord[] { new LineRecord("This is a dummy line for a short job") });
#else
    string clusterName = "Replace with your HDInsight 3.0 cluster name";

    // To use the davinci.txt sample below, pick the cluster's default storage
    // account and container, which automatically include the sample text.
    string accountName = "Replace with a storage account name";
    string containerName = "Replace with a storage container name";

    // This constructor overload creates an Azure-based computation.
    var config = new DryadLinqContext(clusterName);
    config.JobFriendlyName = "DryadLINQ Sample Wordcount";

    // Plain text files are read as LineRecord.
    var inputUri = AzureUtils.ToAzureUri(accountName, containerName, "example/data/gutenberg/davinci.txt");
    var input = config.FromStore<LineRecord>(inputUri);
#endif

    // line -> words -> groups of equal words -> (word, count) -> "word: count"
    var toOutput = input
        .SelectMany(record => record.Line.Split(' '))
        .GroupBy(word => word)
        .Select(sameWord => new KeyValuePair<string, int>(sameWord.Key, sameWord.Count()))
        .Select(pair => new LineRecord(String.Format("{0}: {1}", pair.Key, pair.Value)));

#if local
    // Any collection computed by the query can be materialized back at the
    // client, not just the final one. For large collections this is expensive!
    foreach (LineRecord line in toOutput)
    {
        Console.WriteLine(line.Line);
    }
#else
    // Passing 'true' to ToStore means the output is over-written when the job
    // is run more than once.
    var outputUri = AzureUtils.ToAzureUri(accountName, containerName, "wc-out.txt");
    toOutput.ToStore(outputUri, true).SubmitAndWait();
#endif
}
/// <summary>
/// Counts how often each space-separated word occurs in the input and
/// produces one "word: count" record per distinct word.
/// </summary>
/// <remarks>
/// Built with the <c>local</c> symbol, the sample runs on this machine against
/// one hard-coded line and prints the counts. Built without it, the sample
/// targets an Azure cluster and storage account whose names must replace the
/// placeholder strings below.
/// </remarks>
public static void WordCountExample()
{
#if local
    // Local execution with a single worker.
    var config = new DryadLinqContext(1);

    // FromEnumerable accepts any IEnumerable source as query input.
    var sampleLines = new LineRecord[]
    {
        new LineRecord("This is a dummy line for a short job")
    };
    var input = config.FromEnumerable(sampleLines);
#else
    string clusterName = "Replace with your HDInsight 3.0 cluster name";

    // The davinci.txt sample lives in the cluster's default storage account
    // and container, so select those to use the input path below.
    string accountName = "Replace with a storage account name";
    string containerName = "Replace with a storage container name";

    // Azure-based execution.
    var config = new DryadLinqContext(clusterName);
    config.JobFriendlyName = "DryadLINQ Sample Wordcount";

    // Plain text is read as LineRecord.
    var input = config.FromStore<LineRecord>(
        AzureUtils.ToAzureUri(accountName, containerName, "example/data/gutenberg/davinci.txt"));
#endif

    var tokens = input.SelectMany(record => record.Line.Split(' '));
    var byWord = tokens.GroupBy(token => token);
    var tallies = byWord.Select(bucket => new KeyValuePair<string, int>(bucket.Key, bucket.Count()));
    var toOutput = tallies.Select(tally => new LineRecord(String.Format("{0}: {1}", tally.Key, tally.Value)));

#if local
    // Enumerating a query materializes it at the client; this works for any
    // intermediate collection too, but is expensive for large ones.
    foreach (LineRecord outputLine in toOutput)
    {
        Console.WriteLine(outputLine.Line);
    }
#else
    // The 'true' argument lets a repeated run over-write the previous output.
    toOutput
        .ToStore(AzureUtils.ToAzureUri(accountName, containerName, "wc-out.txt"), true)
        .SubmitAndWait();
#endif
}
/// <summary>
/// Builds a query that yields the integers 0..11, generated as three runs of
/// four consecutive values (0..3, 4..7 and 8..11).
/// </summary>
/// <param name="context">Context the query is created on.</param>
/// <returns>A query over the 12 generated integers.</returns>
public static IQueryable<int> GetSimpleFileSets(DryadLinqContext context)
{
    // The one-element array is only a seed to start the query from; the Apply
    // lambda ignores its input and emits the ids {0, 1, 2}.
    return context.FromEnumerable(new int[1])
                  .Apply(seed => Enumerable.Range(0, 3))
                  // Hash the ids into 3 partitions.
                  .HashPartition(id => id, 3)
                  // Each id expands to the 4 values starting at id * 4.
                  .SelectMany(id => Enumerable.Range(id * 4, 4));
}
/// <summary>
/// Word-count sample: splits every input line on spaces, counts the
/// occurrences of each word and emits one "word: count" line per word.
/// </summary>
/// <remarks>
/// The execution target is selected at compile time:
/// <c>local</c> runs on the local computer with a single worker,
/// <c>azure</c> (without <c>local</c>) runs on an Azure cluster and writes to Azure storage,
/// and with neither symbol the job is submitted to a YARN cluster.
/// The local and YARN variants use a single in-memory line as input and print
/// the result to the console. Defining both symbols does not compile, because
/// the output step then refers to variables declared only by the azure setup.
/// </remarks>
public static void WordCountExample()
{
#if local
    // This constructor overload runs the computation locally with a single worker.
    var config = new DryadLinqContext(1);

    // Any IEnumerable source can be turned into an input with FromEnumerable.
    var lines = new LineRecord[] { new LineRecord("This is a dummy line for a short job") };
    var input = config.FromEnumerable(lines);
#elif azure
    string clusterName = "Replace with your HDInsight 3.1 cluster name";

    // To use the davinci.txt sample below, pick the cluster's default storage
    // account and container, which automatically include the sample text.
    string accountName = "Replace with a storage account name";
    string containerName = "Replace with a storage container name";

    // This constructor overload creates an Azure-based computation.
    var config = new DryadLinqContext(clusterName);
    config.JobFriendlyName = "DryadLINQ Sample Wordcount";

    // Plain text files are read as LineRecord.
    var input = config.FromStore<LineRecord>(
        Utils.ToAzureUri(accountName, containerName, "example/data/gutenberg/davinci.txt"));
#else
    // To use a YARN cluster, fill in the user name, the resource node machine
    // name and port, and the name node and HDFS port (-1 selects the default
    // HDFS port).
    string user = "******";
    string resourceNode = "Replace with the name of the computer your resource node is running on";
    int rmPort = 8088;
    string nameNode = "Replace with the name of the computer your name node is running on";
    int hdfsPort = -1;

    // YARN queue to submit the job on; null selects the default queue.
    string queue = null;

    // Number of worker containers to start for the DryadLINQ job.
    int numberOfWorkers = 2;

    // Memory requested for the job manager container: 8GB should be enough for
    // even the largest jobs, and 2GB will normally suffice.
    int amMemoryMB = 2000;

    // Memory requested for each worker container; how much is needed depends
    // on the code being run.
    int workerMemoryMB = 8000;

    // This constructor overload runs the computation on the YARN cluster described above.
    var cluster = new DryadLinqYarnCluster(
        user, numberOfWorkers, amMemoryMB, workerMemoryMB, queue, resourceNode, rmPort, nameNode, hdfsPort);
    var config = new DryadLinqContext(cluster);

    // Any IEnumerable source can be turned into an input with FromEnumerable.
    var lines = new LineRecord[] { new LineRecord("This is a dummy line for a short job") };
    var input = config.FromEnumerable(lines);
#endif

    // line -> words -> groups of equal words -> (word, count) -> "word: count"
    var toOutput = input
        .SelectMany(record => record.Line.Split(' '))
        .GroupBy(word => word)
        .Select(sameWord => new KeyValuePair<string, int>(sameWord.Key, sameWord.Count()))
        .Select(pair => new LineRecord(String.Format("{0}: {1}", pair.Key, pair.Value)));

#if azure
    // Passing 'true' to ToStore means the output is over-written when the job
    // is run more than once.
    toOutput
        .ToStore(Utils.ToAzureUri(accountName, containerName, "wc-out.txt"), true)
        .SubmitAndWait();
#else
    // Any collection computed by the query can be materialized back at the
    // client, not just the final one. For large collections this is expensive!
    foreach (LineRecord line in toOutput)
    {
        Console.WriteLine(line.Line);
    }
#endif
}
/// <summary>
/// Test: writes a plain in-memory array to the store through ToStore, submits
/// the job, waits for it and verifies that the output file exists.
/// </summary>
/// <param name="context">
/// Context used to build and submit the query. Its LocalDebug flag is set to
/// false by this test.
/// </param>
/// <returns>
/// true when the job completed and the output file was found; false when the
/// output check failed or any exception was thrown (the message is logged).
/// </returns>
public static bool PlainEnumerableAsDryadQueryToStoreSubmit(DryadLinqContext context)
{
    const string testName = "PlainEnumerableAsDryadQueryToStoreSubmit";
    TestLog.TestStart(testName);

    bool passed;
    try
    {
        context.LocalDebug = false;

        string outFile = "unittest/output/PlainEnumerableAsDryadQueryToStoreSubmit";
        var outputUri = AzureUtils.ToAzureUri(Config.accountName, Config.containerName, outFile);

        // NOTE(review): no overwrite flag is passed to ToStore; confirm that a
        // rerun against an already existing output is handled as intended.
        int[] plainData = { 5, 6, 7 };
        var storeQuery = context.FromEnumerable(plainData).ToStore(outputUri);

        DryadLinqJobInfo job = storeQuery.Submit();
        job.Wait();

        passed = Validate.outFileExists(outFile);
    }
    catch (Exception error)
    {
        // Any failure while building, submitting or validating fails the test.
        TestLog.Message("Error: " + error.Message);
        passed = false;
    }

    TestLog.LogResult(new TestResult(testName, context, passed));
    return passed;
}