public void Execute(string[] args)
{
    // a computation manages an instance of Naiad
    using (var computation = NewComputation.FromArgs(ref args))
    {
        // define a graph input from a filename and some transformations.
        var edgeStrings = new[] { args[1] }.AsNaiadStream(computation)
                                           .SelectMany(x => ReadLines(x))
                                           .Select(x => x.Split())
                                           .Select(x => x[0].PairWith(x[1]));

        // define reachability roots from a second filename.
        var rootStrings = new[] { args[2] }.AsNaiadStream(computation)
                                           .SelectMany(x => ReadLines(x));

        // convert (string, string) -> edge and string -> node.
        Stream<Edge, Epoch> edges;  // will eventually hold stream of edges
        Stream<Node, Epoch> roots;  // will eventually hold stream of roots

        // an autorenamer context is used to consistently rename identifiers.
        using (var renamer = new AutoRenamer<string>())
        {
            var tempEdges = edgeStrings.RenameUsing(renamer, x => x.First)            // use the first string to find a name
                                       .Select(x => x.node.WithValue(x.value.Second)) // discard the first string
                                       .RenameUsing(renamer, x => x.value)            // use the second string to find a name
                                       .Select(x => new Edge(x.value.node, x.node));  // discard the second string and form an edge

            var tempRoots = rootStrings.RenameUsing(renamer, x => x)  // use the string itself to find a name
                                       .Select(x => x.node);          // discard the string and keep the node

            // FinishRenaming only after all RenameUsing
            edges = tempEdges.FinishRenaming(renamer);
            roots = tempRoots.FinishRenaming(renamer);
        }

        // iteratively expand the reachable set as pairs (node, isReachable).
        var limit = roots.Select(x => x.WithValue(true))
                         .IterateAndAccumulate(
                             (lc, x) => x.TransmitAlong(lc.EnterLoop(edges))      // transmit (node, true) values along edges
                                         .StateMachine((bool b, bool s) => true), // any received value sets the state to true
                             x => x.node.index,  // partitioning information
                             Int32.MaxValue,     // the number of iterations
                             "Reachability")     // a nice descriptive name
                         .Concat(roots.Select(x => x.WithValue(true)))  // add the original trusted nodes
                         .NodeAggregate((a, b) => true)                 // aggregate, for the originals
                         .Where(x => x.value);                          // keep only the nodes marked reachable

        // print the results onto the screen (or write to file, as appropriate)
        limit.Select(x => x.node.index)
             .Subscribe(x => Console.WriteLine(x.Count()));

        // start the computation and wait until it finishes
        computation.Activate();
        computation.Join();
    }
}
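The listing above relies on a ReadLines helper that is not shown in this section. Below is a minimal sketch of what it presumably does, judging only from how Execute uses it (a filename in, that file's lines out); the actual helper in the source may differ, and the sketch assumes the usual System.Collections.Generic import.

// Assumed helper (not part of the original listing): lazily enumerate
// the lines of a text file, matching how Execute consumes it above.
static IEnumerable<string> ReadLines(string filename)
{
    using (var reader = System.IO.File.OpenText(filename))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
            yield return line;
    }
}

Under this reading, args[1] names an edge file with one whitespace-separated pair of node names per line, and args[2] names a file with one root node name per line.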
static void ExecuteNaiad(string[] args, string dataDir, string uriBase)
{
    string ukFile = Path.Combine(dataDir, @"uk-2007-05");
    string twitterFile = Path.Combine(dataDir, @"twitter_rv.bin");
    string livejournalFile = Path.Combine(dataDir, @"livejournal.bin");

    var configuration = Configuration.FromArgs(ref args);
    var algorithm = args[1];
    var dataset = args[2];

    #region file partitioning

    if (algorithm == "partition" && dataset == "twitter")
    {
        var stopwatch = System.Diagnostics.Stopwatch.StartNew();
        using (var computation = NewComputation.FromConfig(configuration))
        {
            int parts = Int32.Parse(args[3]);
            var format = Path.Combine(dataDir, @"twitter-part-{0}-of-" + (parts * parts).ToString());

            computation.LoadGraph(twitterFile)
                       .Partition(parts, parts)
                       .WriteBinaryToFiles(format);

            computation.Activate();
            computation.Join();
        }
        Console.WriteLine(stopwatch.Elapsed);
    }

    if (algorithm == "repartition" && dataset == "twitter")
    {
        var stopwatch = System.Diagnostics.Stopwatch.StartNew();
        using (var computation = NewComputation.FromConfig(configuration))
        {
            int parts = Int32.Parse(args[3]);

            computation.ReadHdfsBinaryCollection<Edge>(new Uri(uriBase + "twitter-10"))
                       .Partition(parts, parts)
                       .WriteHdfsBinary(new Uri(uriBase + "twitter-" + parts),
                                        1024 * 1024, -1L, 100L * 1024L * 1024L * 1024L);

            computation.Activate();
            computation.Join();
        }
        Console.WriteLine(stopwatch.Elapsed);
    }

    if (algorithm == "compact" && dataset == "twitter")
    {
        using (var computation = NewComputation.FromConfig(configuration))
        {
            var edges = System.IO.File.OpenRead(twitterFile)
                                      .ReadEdges()
                                      .AsNaiadStream(computation);

            // an autorenamer context is used to consistently rename node identifiers.
            using (var renamer = new AutoRenamer<Int32>())
            {
                var newEdges = edges.RenameUsing(renamer, edge => edge.source)
                                    .Select(x => new Edge(x.node, x.value.target))
                                    .RenameUsing(renamer, edge => edge.target)
                                    .Select(x => new Edge(x.value.source, x.node));

                edges = newEdges.FinishRenaming(renamer);
            }

            computation.Activate();
            computation.Join();
        }
    }

    #endregion

    #region page rank

    if (algorithm == "pagerank" && dataset == "twitter")
    {
        var stopwatch = System.Diagnostics.Stopwatch.StartNew();
        using (var computation = NewComputation.FromConfig(configuration))
        {
            // log each frontier change, forcing a full garbage collection as it does
            computation.OnFrontierChange += (x, y) =>
            {
                Console.WriteLine(System.DateTime.Now + "\t" + string.Join(", ", y.NewFrontier));
                System.GC.GetTotalMemory(true);
            };

            var edges = System.IO.File.OpenRead(twitterFile)
                                      .ReadEdges()
                                      .AsNaiadStream(computation);

            edges.PageRank(20, "twitter").Subscribe();

            computation.Activate();
            computation.Join();
        }
        Console.WriteLine(stopwatch.Elapsed);
    }

    if (algorithm == "pagerank" && dataset == "livejournal")
    {
        var stopwatch = System.Diagnostics.Stopwatch.StartNew();
        using (var computation = NewComputation.FromConfig(configuration))
        {
            computation.OnFrontierChange += (x, y) =>
            {
                Console.WriteLine(System.DateTime.Now + "\t" + string.Join(", ", y.NewFrontier));
            };

            var edges = System.IO.File.OpenRead(livejournalFile)
                                      .ReadEdges()
                                      .AsNaiadStream(computation);

            edges.PageRank(20, "livejournal").Subscribe();

            computation.Activate();
            computation.Join();
        }
        Console.WriteLine(stopwatch.Elapsed);
    }

    #endregion

    #region connected components

    if (algorithm == "connectedcomponents" && dataset == "uk-2007-05")
    {
        var stopwatch = System.Diagnostics.Stopwatch.StartNew();
        using (var computation = NewComputation.FromConfig(configuration))
        {
            var format = Path.Combine(dataDir, @"uk-2007-05-part-{0}-of-{1}");

            var extraInput = new[] { string.Format(format, 3, 4) }.AsNaiadStream(computation)
                                                                  .PartitionBy(x => 3)
                                                                  .ReadGraph();

            computation.LoadGraph(format, 3, 4)
                       .UnionFind(106000000)
                       .PartitionBy(x => 3)
                       .Concat(extraInput)
                       .UnionFind(106000000);

            computation.Activate();
            computation.Join();
        }
        Console.WriteLine(stopwatch.Elapsed);
    }

    if (algorithm == "connectedcomponents" && dataset == "twitter")
    {
        using (var hdfs = new Microsoft.Research.Peloponnese.Hdfs.HdfsInstance(new Uri(uriBase)))
        {
            // HDFS needs to be initialized from the main thread before distributed use
            bool exists = hdfs.IsFileExists("/dummy");
        }

        var readWatch = System.Diagnostics.Stopwatch.StartNew();
        using (var controller = NewController.FromConfig(configuration))
        {
            using (var readComputation = controller.NewComputation())
            {
                int parts = (args.Length > 4) ? Int32.Parse(args[4]) : 1;
                int machines = (args.Length > 5) ? Int32.Parse(args[5]) : 1;
                int another = (args.Length > 6) ? Int32.Parse(args[6]) : 1;

                var format = new Uri(uriBase + "twitter-40");
                var collection = readComputation.ReadHdfsBinaryCollection<Edge>(format);

                // phase 1: read the edges once and group them as requested,
                // caching the result in an inter-graph sink for reuse below
                Stream<int[], Epoch> readStuff = null;
                switch (args[3])
                {
                    case "sp":
                        readStuff = collection.GroupEdgesSingleProcess(parts, parts);
                        break;

                    case "pp":
                        readStuff = collection.GroupEdgesPartsPerProcess(parts, parts, 16);
                        break;

                    case "op":
                        readStuff = collection.GroupEdgesOnePerProcess(parts, parts, 16);
                        break;

                    case "hp":
                        readStuff = collection.GroupEdgesHierarchyPerProcess(parts, machines, 16);
                        break;

                    case "hhp":
                        readStuff = collection.GroupEdgesHierarchyPerProcess(parts, machines * another, 16);
                        break;

                    default:
                        throw new ApplicationException("Grouping type must be sp, pp, op, hp or hhp");
                }

                var sink = new InterGraphDataSink<int[]>(readStuff);

                readComputation.Activate();
                readComputation.Join();

                Console.WriteLine("Reading done: " + readWatch.Elapsed);

                // phase 2: run connected components twenty times over the cached edges
                for (int i = 0; i < 20; ++i)
                {
                    var stopwatch = System.Diagnostics.Stopwatch.StartNew();
                    using (var computation = controller.NewComputation())
                    {
                        var firstStage = computation.NewInput(sink.NewDataSource())
                                                    .ReformatInts();

                        if (parts * machines * another > 1)
                        {
                            firstStage = firstStage.UnionFindStruct(65000000,
                                                                    parts * machines * another,
                                                                    machines * another);
                        }

                        switch (args[3])
                        {
                            case "sp":
                                firstStage.PartitionBy(x => parts * parts)
                                          .UnionFind(65000000);
                                break;

                            case "pp":
                                firstStage.PartitionBy(x => 16 * parts)
                                          .UnionFind(65000000);
                                break;

                            case "op":
                                firstStage.PartitionBy(x => 16 * (parts * parts))
                                          .UnionFind(65000000);
                                break;

                            case "hp":
                                if (parts * parts < 16)
                                {
                                    firstStage.PartitionBy(x => 16 * x.destination + (parts * parts))
                                              .UnionFindStruct(65000000, 0, 0)
                                              .PartitionBy(x => 16 * (machines * machines))
                                              .UnionFind(65000000);
                                }
                                else
                                {
                                    firstStage.PartitionBy(x => 16 * (x.destination + (machines * machines)))
                                              .UnionFindStruct(65000000, 0, 0)
                                              .PartitionBy(x => 16 * ((machines * machines) + (machines * machines)))
                                              .UnionFind(65000000);
                                }
                                break;

                            case "hhp":
                                firstStage.PartitionBy(x => 16 * ((x.destination / (machines * machines)) + (machines * machines * another * another)) + (x.destination % (machines * machines)))
                                          .UnionFindStruct(65000000, -machines * another, another)
                                          .PartitionBy(x => 16 * (x.destination + (another * another) + (machines * machines * another * another)))
                                          .UnionFindStruct(65000000, -another, 1)
                                          .PartitionBy(x => 16 * ((another * another) + (another * another) + (machines * machines * another * another)))
                                          .UnionFind(65000000);
                                break;

                            default:
                                throw new ApplicationException("Grouping type must be sp, pp, op, hp or hhp");
                        }

                        computation.Activate();
                        computation.Join();
                    }
                    Console.WriteLine(stopwatch.Elapsed);
                }
            }
            controller.Join();
        }
    }

    if (algorithm == "hashtablecc" && dataset == "twitter")
    {
        using (var hdfs = new Microsoft.Research.Peloponnese.Hdfs.HdfsInstance(new Uri(uriBase)))
        {
            // HDFS needs to be initialized from the main thread before distributed use
            bool exists = hdfs.IsFileExists("/dummy");
        }

        var readWatch = System.Diagnostics.Stopwatch.StartNew();
        using (var controller = NewController.FromConfig(configuration))
        {
            using (var readComputation = controller.NewComputation())
            {
                int parts = (args.Length > 4) ? Int32.Parse(args[4]) : 1;
                int machines = (args.Length > 5) ? Int32.Parse(args[5]) : 1;
                int another = (args.Length > 6) ? Int32.Parse(args[6]) : 1;

                var format = new Uri(uriBase + "twitter-40");
                var collection = readComputation.ReadHdfsBinaryCollection<Edge>(format);

                // the read phase is identical to the "connectedcomponents" variant above
                Stream<int[], Epoch> readStuff = null;
                switch (args[3])
                {
                    case "sp":
                        readStuff = collection.GroupEdgesSingleProcess(parts, parts);
                        break;

                    case "pp":
                        readStuff = collection.GroupEdgesPartsPerProcess(parts, parts, 16);
                        break;

                    case "op":
                        readStuff = collection.GroupEdgesOnePerProcess(parts, parts, 16);
                        break;

                    case "hp":
                        readStuff = collection.GroupEdgesHierarchyPerProcess(parts, machines, 16);
                        break;

                    case "hhp":
                        readStuff = collection.GroupEdgesHierarchyPerProcess(parts, machines * another, 16);
                        break;

                    default:
                        throw new ApplicationException("Grouping type must be sp, pp, op, hp or hhp");
                }

                var sink = new InterGraphDataSink<int[]>(readStuff);

                readComputation.Activate();
                readComputation.Join();

                Console.WriteLine("Reading done: " + readWatch.Elapsed);

                for (int i = 0; i < 20; ++i)
                {
                    var stopwatch = System.Diagnostics.Stopwatch.StartNew();
                    using (var computation = controller.NewComputation())
                    {
                        // as above, but the first union-find stage always uses the
                        // hash-table implementation
                        var firstStage = computation.NewInput(sink.NewDataSource())
                                                    .ReformatInts()
                                                    .UnionFindHashTable(65000000,
                                                                        parts * machines * another,
                                                                        machines * another);

                        switch (args[3])
                        {
                            case "sp":
                                firstStage.PartitionBy(x => parts * parts)
                                          .UnionFind(65000000);
                                break;

                            case "pp":
                                firstStage.PartitionBy(x => 16 * parts)
                                          .UnionFind(65000000);
                                break;

                            case "op":
                                firstStage.PartitionBy(x => 16 * (parts * parts))
                                          .UnionFind(65000000);
                                break;

                            case "hp":
                                if (parts * parts < 16)
                                {
                                    firstStage.PartitionBy(x => 16 * x.destination + (parts * parts))
                                              .UnionFindStruct(65000000, 0, 0)
                                              .PartitionBy(x => 16 * (machines * machines))
                                              .UnionFind(65000000);
                                }
                                else
                                {
                                    firstStage.PartitionBy(x => 16 * (x.destination + (machines * machines)))
                                              .UnionFindStruct(65000000, 0, 0)
                                              .PartitionBy(x => 16 * ((machines * machines) + (machines * machines)))
                                              .UnionFind(65000000);
                                }
                                break;

                            case "hhp":
                                firstStage.PartitionBy(x => 16 * ((x.destination / (machines * machines)) + (machines * machines * another * another)) + (x.destination % (machines * machines)))
                                          .UnionFindStruct(65000000, -machines * another, another)
                                          .PartitionBy(x => 16 * (x.destination + (another * another) + (machines * machines * another * another)))
                                          .UnionFindStruct(65000000, -another, 1)
                                          .PartitionBy(x => 16 * ((another * another) + (another * another) + (machines * machines * another * another)))
                                          .UnionFind(65000000);
                                break;

                            default:
                                throw new ApplicationException("Grouping type must be sp, pp, op, hp or hhp");
                        }

                        computation.Activate();
                        computation.Join();
                    }
                    Console.WriteLine(stopwatch.Elapsed);
                }
            }
            controller.Join();
        }
    }

    if (algorithm == "hashtableonlycc" && dataset == "twitter")
    {
        using (var hdfs = new Microsoft.Research.Peloponnese.Hdfs.HdfsInstance(new Uri(uriBase)))
        {
            // HDFS needs to be initialized from the main thread before distributed use
            bool exists = hdfs.IsFileExists("/dummy");
        }

        var readWatch = System.Diagnostics.Stopwatch.StartNew();
        using (var controller = NewController.FromConfig(configuration))
        {
            using (var readComputation = controller.NewComputation())
            {
                int parts = (args.Length > 4) ? Int32.Parse(args[4]) : 1;
                int machines = (args.Length > 5) ? Int32.Parse(args[5]) : 1;
                int another = (args.Length > 6) ? Int32.Parse(args[6]) : 1;

                var format = new Uri(uriBase + "twitter-40");
                var collection = readComputation.ReadHdfsBinaryCollection<Edge>(format);

                // the read phase is identical to the two variants above
                Stream<int[], Epoch> readStuff = null;
                switch (args[3])
                {
                    case "sp":
                        readStuff = collection.GroupEdgesSingleProcess(parts, parts);
                        break;

                    case "pp":
                        readStuff = collection.GroupEdgesPartsPerProcess(parts, parts, 16);
                        break;

                    case "op":
                        readStuff = collection.GroupEdgesOnePerProcess(parts, parts, 16);
                        break;

                    case "hp":
                        readStuff = collection.GroupEdgesHierarchyPerProcess(parts, machines, 16);
                        break;

                    case "hhp":
                        readStuff = collection.GroupEdgesHierarchyPerProcess(parts, machines * another, 16);
                        break;

                    default:
                        throw new ApplicationException("Grouping type must be sp, pp, op, hp or hhp");
                }

                var sink = new InterGraphDataSink<int[]>(readStuff);

                readComputation.Activate();
                readComputation.Join();

                Console.WriteLine("Reading done: " + readWatch.Elapsed);

                for (int i = 0; i < 20; ++i)
                {
                    var stopwatch = System.Diagnostics.Stopwatch.StartNew();
                    using (var computation = controller.NewComputation())
                    {
                        // as above, but every union-find stage uses the hash-table implementation
                        var firstStage = computation.NewInput(sink.NewDataSource())
                                                    .ReformatInts();

                        if (parts * machines * another > 1)
                        {
                            firstStage = firstStage.UnionFindHashTable(65000000,
                                                                       parts * machines * another,
                                                                       machines * another);
                        }

                        switch (args[3])
                        {
                            case "sp":
                                firstStage.PartitionBy(x => parts * parts)
                                          .UnionFindHashTable(65000000);
                                break;

                            case "pp":
                                firstStage.PartitionBy(x => 16 * parts)
                                          .UnionFindHashTable(65000000);
                                break;

                            case "op":
                                firstStage.PartitionBy(x => 16 * (parts * parts))
                                          .UnionFindHashTable(65000000);
                                break;

                            case "hp":
                                if (parts * parts < 16)
                                {
                                    firstStage.PartitionBy(x => 16 * x.destination + (parts * parts))
                                              .UnionFindHashTable(65000000, 0, 0)
                                              .PartitionBy(x => 16 * (machines * machines))
                                              .UnionFindHashTable(65000000);
                                }
                                else
                                {
                                    firstStage.PartitionBy(x => 16 * (x.destination + (machines * machines)))
                                              .UnionFindHashTable(65000000, 0, 0)
                                              .PartitionBy(x => 16 * ((machines * machines) + (machines * machines)))
                                              .UnionFindHashTable(65000000);
                                }
                                break;

                            case "hhp":
                                firstStage.PartitionBy(x => 16 * ((x.destination / (machines * machines)) + (machines * machines * another * another)) + (x.destination % (machines * machines)))
                                          .UnionFindHashTable(65000000, -machines * another, another)
                                          .PartitionBy(x => 16 * (x.destination + (another * another) + (machines * machines * another * another)))
                                          .UnionFindHashTable(65000000, -another, 1)
                                          .PartitionBy(x => 16 * ((another * another) + (another * another) + (machines * machines * another * another)))
                                          .UnionFindHashTable(65000000);
                                break;

                            default:
                                throw new ApplicationException("Grouping type must be sp, pp, op, hp or hhp");
                        }

                        computation.Activate();
                        computation.Join();
                    }
                    Console.WriteLine(stopwatch.Elapsed);
                }
            }
            controller.Join();
        }
    }

    if (algorithm == "connectedcomponents" && dataset == "livejournal")
    {
        var stopwatch = System.Diagnostics.Stopwatch.StartNew();
        using (var computation = NewComputation.FromConfig(configuration))
        {
            var edges = System.IO.File.OpenRead(livejournalFile)
                                      .ReadEdges()
                                      .AsNaiadStream(computation);

            edges.UnionFind(5000000)
                 .PartitionBy(x => 0)
                 .UnionFind(5000000);

            computation.Activate();
            computation.Join();
        }
        Console.WriteLine(stopwatch.Elapsed);
    }

    #endregion
}
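For reference, a hedged sketch of how ExecuteNaiad might be driven from an entry point; the method name, data directory, HDFS URI, and argument values below are illustrative assumptions, not taken from the source. After Configuration.FromArgs strips Naiad's own flags, the code reads args[1] as the algorithm, args[2] as the dataset, and args[3] onward as per-algorithm parameters (partition counts, the grouping type sp/pp/op/hp/hhp, machine counts), so args[0] is presumably consumed by an outer dispatcher.

// Hypothetical driver: paths, URI, and argument values are made up for illustration.
static void Main(string[] args)
{
    // e.g. args = { "graphs", "pagerank", "twitter" }
    //  or  args = { "graphs", "connectedcomponents", "twitter", "hp", "4", "8" }
    ExecuteNaiad(args, @"D:\data", "hdfs://namenode:9000/user/naiad/");
}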