Ejemplo n.º 1
0
        /// <summary>
        /// Builds the index from <paramref name="relation"/> by running a dedicated computation
        /// on <paramref name="controller"/> to completion and capturing its output in <c>result</c>.
        /// </summary>
        /// <param name="relation">Sink whose data source feeds the computation.</param>
        /// <param name="controller">Controller used to create the computation.</param>
        /// <param name="keySelector">Extracts the key from each record.</param>
        /// <param name="valueSelector">Extracts the value from each record.</param>
        public TypedKeyIndex(InterGraphDataSink <TRecord> relation, Controller controller, Func <TRecord, TKey> keySelector, Func <TRecord, TValue> valueSelector)
        {
            using (var computation = controller.NewComputation())
            {
                // Read the relation back in as the input of this computation.
                var records = computation.NewInput(relation.NewDataSource());

                // Turn every record into a (key, value) pair.
                var keyedValues = records.Select(record => keySelector(record).PairWith(valueSelector(record)));

                // Hand the pairs to an IndexBuilder stage; the key's hash code is supplied as the
                // third argument (presumably the partitioning function -- confirm against NewUnaryStage).
                var fragments = keyedValues.NewUnaryStage(
                    (index, stage) => new IndexBuilder(index, stage),
                    pair => pair.First.GetHashCode(),
                    null,
                    "IndexBuilder");

                result = new InterGraphDataSink<Fragment>(fragments);

                // Run the computation and block until it has finished.
                computation.Activate();
                computation.Join();
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Builds the index from <paramref name="relation"/> by running a dedicated computation
        /// on <paramref name="controller"/> to completion. The resulting hash-set fragments are
        /// captured in <c>IndexFragmentStream</c> and the summed fragment sizes are stored in <c>count</c>.
        /// </summary>
        /// <param name="relation">Sink whose data source feeds the computation.</param>
        /// <param name="controller">Controller used to create the computation.</param>
        /// <param name="valueSelector">Extracts the indexed value from each record.</param>
        public EmptyKeyIndex(InterGraphDataSink <TRecord> relation, Controller controller, Func <TRecord, TValue> valueSelector)
        {
            using (var computation = controller.NewComputation())
            {
                // Project every record to its value and feed the values to a HashSetBuilder stage.
                var values = computation.NewInput(relation.NewDataSource()).Select(valueSelector);
                var fragments = values.NewUnaryStage(
                    (index, stage) => new HashSetBuilder(index, stage),
                    value => value.GetHashCode(),
                    null,
                    "HashSetBuilder");

                IndexFragmentStream = new InterGraphDataSink<HashSet<TValue>>(fragments);

                // Written by the subscription callback below, read once the computation has joined.
                long observedTotal = 0L;

                // Sum the sizes of all fragments under a single constant key.
                var fragmentSizes = fragments.Select(set => set.Count);
                var grandTotal = fragmentSizes.Aggregate(size => true, size => (long)size, (left, right) => left + right, (key, sum) => sum);

                // Emit one (vertex id, total) pair per placement entry of the builder stage and
                // partition the pairs by that vertex id before subscribing.
                var perVertexTotals = grandTotal.SelectMany(sum => fragments.ForStage.Placement.Select(place => place.VertexId.PairWith(sum)));

                perVertexTotals
                    .PartitionBy(pair => pair.First)
                    .Subscribe((a, b, records) => { observedTotal = records.Single().Second; });

                // Run the computation and block until it has finished.
                computation.Activate();
                computation.Join();

                count = observedTotal;
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Runs a computation on <paramref name="controller"/> that reads <paramref name="source"/>,
        /// applies <paramref name="transformation"/> to the resulting stream, and captures the
        /// output in a new sink. The computation is run to completion before returning.
        /// </summary>
        /// <typeparam name="TRecord">Record type held by <paramref name="source"/>.</typeparam>
        /// <typeparam name="TOutput">Record type produced by <paramref name="transformation"/>.</typeparam>
        /// <param name="controller">Controller used to create the computation.</param>
        /// <param name="source">Sink whose data source feeds the computation.</param>
        /// <param name="transformation">Dataflow fragment applied to the input stream.</param>
        /// <returns>A sink wrapping the transformed stream.</returns>
        public static InterGraphDataSink <TOutput> NewInterGraphStream <TRecord, TOutput>(this Controller controller, InterGraphDataSink <TRecord> source, Func <Stream <TRecord, Epoch>, Stream <TOutput, Epoch> > transformation)
        {
            using (var computation = controller.NewComputation())
            {
                var input = computation.NewInput(source.NewDataSource());
                var transformed = transformation(input);
                var sink = new InterGraphDataSink<TOutput>(transformed);

                // Run the computation and block until it has finished.
                computation.Activate();
                computation.Join();

                // The computation is still disposed before control returns to the caller.
                return sink;
            }
        }
Ejemplo n.º 4
0
 /// <summary>
 /// Joins <paramref name="stream"/> against the fragments of <c>Index</c>, which are brought in
 /// as a new input of the computation that <paramref name="stream"/> belongs to.
 /// </summary>
 /// <param name="stream">Stream of records to join.</param>
 /// <param name="keyFunc">Maps each input record to an integer key.</param>
 /// <param name="action">Callback passed through to <c>IndexJoin</c>; it receives a fragment, an array of inputs, an int and an output buffer.</param>
 /// <returns>The stream returned by <c>IndexJoin</c>.</returns>
 private Stream <TOutput, Epoch> FragmentJoin <TInput, TOutput>(Stream <TInput, Epoch> stream, Func <TInput, int> keyFunc, Action <Fragment, TInput[], int, VertexOutputBufferPerTime <TOutput, Epoch> > action)
 {
     var owningComputation = stream.ForStage.Computation;
     var indexFragments = owningComputation.NewInput(Index.NewDataSource());

     return indexFragments.IndexJoin<Fragment, TInput, int, TOutput>(stream, keyFunc, action);
 }
Ejemplo n.º 5
0
        /// <summary>
        /// Dispatches to one of several graph benchmarks according to the command line.
        /// After the configuration has been parsed out of <paramref name="args"/>, <c>args[1]</c>
        /// selects the algorithm and <c>args[2]</c> selects the dataset; a combination that matches
        /// none of the branches below does nothing.
        /// </summary>
        /// <param name="args">
        /// Command-line arguments. <c>args[3]</c> is the partition count for "partition" and
        /// "repartition", and the grouping type (sp, pp, op, hp or hhp) for the twitter
        /// connected-components variants, which also read optional integers from
        /// <c>args[4]</c>, <c>args[5]</c> and <c>args[6]</c> (each defaulting to 1).
        /// </param>
        /// <param name="dataDir">Directory holding the local input files and partition outputs.</param>
        /// <param name="uriBase">Base URI prepended to the HDFS dataset names.</param>
        /// <exception cref="ApplicationException">
        /// Thrown by the twitter connected-components variants when <c>args[3]</c> is not one of
        /// sp, pp, op, hp or hhp.
        /// </exception>
        static void ExecuteNaiad(string[] args, string dataDir, string uriBase)
        {
            string ukFile          = Path.Combine(dataDir, @"uk-2007-05");
            string twitterFile     = Path.Combine(dataDir, @"twitter_rv.bin");
            string livejournalFile = Path.Combine(dataDir, @"livejournal.bin");

            // NOTE(review): args is passed by ref, so FromArgs presumably strips the options it
            // consumes -- confirm which positional arguments remain before relying on the indices below.
            var configuration = Configuration.FromArgs(ref args);

            var algorithm = args[1];
            var dataset   = args[2];

            #region file partitioning
            // Splits the local twitter file into parts * parts binary files.
            if (algorithm == "partition" && dataset == "twitter")
            {
                var stopwatch = System.Diagnostics.Stopwatch.StartNew();

                using (var computation = NewComputation.FromConfig(configuration))
                {
                    int parts  = Int32.Parse(args[3]);
                    var format = Path.Combine(dataDir, @"twitter-part-{0}-of-" + (parts * parts).ToString());

                    computation.LoadGraph(twitterFile)
                    .Partition(parts, parts)
                    .WriteBinaryToFiles(format);

                    computation.Activate();
                    computation.Join();
                }

                Console.WriteLine(stopwatch.Elapsed);
            }

            // Reads the "twitter-10" HDFS collection and writes it back re-partitioned as "twitter-<parts>".
            if (algorithm == "repartition" && dataset == "twitter")
            {
                var stopwatch = System.Diagnostics.Stopwatch.StartNew();

                using (var computation = NewComputation.FromConfig(configuration))
                {
                    int parts = Int32.Parse(args[3]);

                    computation.ReadHdfsBinaryCollection <Edge>(new Uri(uriBase + "twitter-10"))
                    .Partition(parts, parts)
                    .WriteHdfsBinary(new Uri(uriBase + "twitter-" + parts), 1024 * 1024, -1L, 100L * 1024L * 1024L * 1024L);

                    computation.Activate();
                    computation.Join();
                }

                Console.WriteLine(stopwatch.Elapsed);
            }

            // Renames both endpoints of every twitter edge through a shared AutoRenamer.
            if (algorithm == "compact" && dataset == "twitter")
            {
                using (var computation = NewComputation.FromConfig(configuration))
                {
                    var edges = System.IO.File.OpenRead(twitterFile)
                                .ReadEdges()
                                .AsNaiadStream(computation);

                    using (var renamer = new AutoRenamer <Int32>())
                    {
                        // Rename the source first, then the target, rebuilding the edge after each step.
                        var newEdges = edges.RenameUsing(renamer, edge => edge.source)
                                       .Select(x => new Edge(x.node, x.value.target))
                                       .RenameUsing(renamer, edge => edge.target)
                                       .Select(x => new Edge(x.value.source, x.node));

                        edges = newEdges.FinishRenaming(renamer);
                    }

                    computation.Activate();
                    computation.Join();
                }
            }
            #endregion

            #region page rank
            if (algorithm == "pagerank" && dataset == "twitter")
            {
                var stopwatch = System.Diagnostics.Stopwatch.StartNew();

                using (var computation = NewComputation.FromConfig(configuration))
                {
                    // Logs every frontier change; GetTotalMemory(true) additionally forces a collection each time.
                    computation.OnFrontierChange += (x, y) => { Console.WriteLine(System.DateTime.Now + "\t" + string.Join(", ", y.NewFrontier)); System.GC.GetTotalMemory(true); };

                    var edges = System.IO.File.OpenRead(twitterFile)
                                .ReadEdges()
                                .AsNaiadStream(computation);

                    edges.PageRank(20, "twitter").Subscribe();

                    computation.Activate();
                    computation.Join();
                }

                Console.WriteLine(stopwatch.Elapsed);
            }

            if (algorithm == "pagerank" && dataset == "livejournal")
            {
                var stopwatch = System.Diagnostics.Stopwatch.StartNew();

                using (var computation = NewComputation.FromConfig(configuration))
                {
                    // Logs every frontier change.
                    computation.OnFrontierChange += (x, y) => { Console.WriteLine(System.DateTime.Now + "\t" + string.Join(", ", y.NewFrontier)); };

                    var edges = System.IO.File.OpenRead(livejournalFile)
                                .ReadEdges()
                                .AsNaiadStream(computation);

                    edges.PageRank(20, "livejournal").Subscribe();

                    computation.Activate();
                    computation.Join();
                }

                Console.WriteLine(stopwatch.Elapsed);
            }
            #endregion

            #region connected components
            if (algorithm == "connectedcomponents" && dataset == "uk-2007-05")
            {
                var stopwatch = System.Diagnostics.Stopwatch.StartNew();

                using (var computation = NewComputation.FromConfig(configuration))
                {
                    var format = Path.Combine(dataDir, @"uk-2007-05-part-{0}-of-{1}");

                    // The file name for part 3 of 4 is read as a separate input, partitioned to 3.
                    var extraInput = new[] { string.Format(format, 3, 4) }.AsNaiadStream(computation)
                    .PartitionBy(x => 3)
                    .ReadGraph();

                    // Union-find over the loaded graph, then a second union-find over its output
                    // (also partitioned to 3) concatenated with the extra input.
                    computation.LoadGraph(format, 3, 4)
                    .UnionFind(106000000)
                    .PartitionBy(x => 3)
                    .Concat(extraInput)
                    .UnionFind(106000000);

                    computation.Activate();
                    computation.Join();
                }

                Console.WriteLine(stopwatch.Elapsed);
            }

            // Twitter connected components: UnionFindStruct for the first stage, UnionFind for the last.
            if (algorithm == "connectedcomponents" && dataset == "twitter")
            {
                using (Microsoft.Research.Peloponnese.Hdfs.HdfsInstance hdfs = new Microsoft.Research.Peloponnese.Hdfs.HdfsInstance(new Uri(uriBase)))
                {
                    // HDFS needs to be initialized from the main thread before distributed use;
                    // the call is made only for that side effect, so its result is discarded.
                    hdfs.IsFileExists("/dummy");
                }

                var readWatch = System.Diagnostics.Stopwatch.StartNew();

                using (var controller = NewController.FromConfig(configuration))
                {
                    using (var readComputation = controller.NewComputation())
                    {
                        int parts      = (args.Length > 4) ? Int32.Parse(args[4]) : 1;
                        int machines   = (args.Length > 5) ? Int32.Parse(args[5]) : 1;
                        int another    = (args.Length > 6) ? Int32.Parse(args[6]) : 1;
                        var format     = new Uri(uriBase + "twitter-40");
                        var collection = readComputation
                                         .ReadHdfsBinaryCollection <Edge>(format);

                        Stream <int[], Epoch> readStuff = null;

                        // Choose how the edges are grouped while reading, based on the grouping type.
                        switch (args[3])
                        {
                        case "sp":
                            readStuff = collection.GroupEdgesSingleProcess(parts, parts);
                            break;

                        case "pp":
                            readStuff = collection.GroupEdgesPartsPerProcess(parts, parts, 16);
                            break;

                        case "op":
                            readStuff = collection.GroupEdgesOnePerProcess(parts, parts, 16);
                            break;

                        case "hp":
                            readStuff = collection.GroupEdgesHierarchyPerProcess(parts, machines, 16);
                            break;

                        case "hhp":
                            readStuff = collection.GroupEdgesHierarchyPerProcess(parts, machines * another, 16);
                            break;

                        default:
                            throw new ApplicationException("Grouping type must be sp, pp, op, hp or hhp");
                        }

                        // Capture the grouped edges so each timed run below can re-read them.
                        var sink = new InterGraphDataSink <int[]>(readStuff);

                        readComputation.Activate();
                        readComputation.Join();

                        Console.WriteLine("Reading done: " + readWatch.Elapsed);

                        // Run the computation 20 times over the captured sink, printing each run's elapsed time.
                        for (int i = 0; i < 20; ++i)
                        {
                            var stopwatch = System.Diagnostics.Stopwatch.StartNew();

                            using (var computation = controller.NewComputation())
                            {
                                var firstStage = computation.NewInput(sink.NewDataSource())
                                                 .ReformatInts();

                                // The initial union-find stage is skipped when all three factors are 1.
                                if (parts * machines * another > 1)
                                {
                                    firstStage = firstStage
                                                 .UnionFindStruct(65000000, parts * machines * another, machines * another);
                                }

                                switch (args[3])
                                {
                                case "sp":
                                    firstStage
                                    .PartitionBy(x => parts * parts)
                                    .UnionFind(65000000);
                                    break;

                                case "pp":
                                    firstStage
                                    .PartitionBy(x => 16 * parts)
                                    .UnionFind(65000000);
                                    break;

                                case "op":
                                    firstStage
                                    .PartitionBy(x => 16 * (parts * parts))
                                    .UnionFind(65000000);
                                    break;

                                case "hp":
                                    if (parts * parts < 16)
                                    {
                                        firstStage
                                        .PartitionBy(x => 16 * x.destination + (parts * parts))
                                        .UnionFindStruct(65000000, 0, 0)
                                        .PartitionBy(x => 16 * (machines * machines))
                                        .UnionFind(65000000);
                                    }
                                    else
                                    {
                                        firstStage
                                        .PartitionBy(x => 16 * (x.destination + (machines * machines)))
                                        .UnionFindStruct(65000000, 0, 0)
                                        .PartitionBy(x => 16 * ((machines * machines) + (machines * machines)))
                                        .UnionFind(65000000);
                                    }
                                    break;

                                case "hhp":
                                    firstStage
                                    .PartitionBy(x => 16 * ((x.destination / (machines * machines)) + (machines * machines * another * another)) + (x.destination % (machines * machines)))
                                    .UnionFindStruct(65000000, -machines * another, another)
                                    .PartitionBy(x => 16 * (x.destination + (another * another) + (machines * machines * another * another)))
                                    .UnionFindStruct(65000000, -another, 1)
                                    .PartitionBy(x => 16 * ((another * another) + (another * another) + (machines * machines * another * another)))
                                    .UnionFind(65000000);
                                    break;

                                default:
                                    throw new ApplicationException("Grouping type must be sp, pp, op, hp or hhp");
                                }

                                computation.Activate();
                                computation.Join();
                            }

                            Console.WriteLine(stopwatch.Elapsed);
                        }
                    }

                    controller.Join();
                }
            }

            // Twitter connected components: UnionFindHashTable for the first stage (always applied),
            // with the same later stages as the "connectedcomponents" variant.
            if (algorithm == "hashtablecc" && dataset == "twitter")
            {
                using (Microsoft.Research.Peloponnese.Hdfs.HdfsInstance hdfs = new Microsoft.Research.Peloponnese.Hdfs.HdfsInstance(new Uri(uriBase)))
                {
                    // HDFS needs to be initialized from the main thread before distributed use;
                    // the call is made only for that side effect, so its result is discarded.
                    hdfs.IsFileExists("/dummy");
                }

                var readWatch = System.Diagnostics.Stopwatch.StartNew();

                using (var controller = NewController.FromConfig(configuration))
                {
                    using (var readComputation = controller.NewComputation())
                    {
                        int parts      = (args.Length > 4) ? Int32.Parse(args[4]) : 1;
                        int machines   = (args.Length > 5) ? Int32.Parse(args[5]) : 1;
                        int another    = (args.Length > 6) ? Int32.Parse(args[6]) : 1;
                        var format     = new Uri(uriBase + "twitter-40");
                        var collection = readComputation
                                         .ReadHdfsBinaryCollection <Edge>(format);

                        Stream <int[], Epoch> readStuff = null;

                        // Choose how the edges are grouped while reading, based on the grouping type.
                        switch (args[3])
                        {
                        case "sp":
                            readStuff = collection.GroupEdgesSingleProcess(parts, parts);
                            break;

                        case "pp":
                            readStuff = collection.GroupEdgesPartsPerProcess(parts, parts, 16);
                            break;

                        case "op":
                            readStuff = collection.GroupEdgesOnePerProcess(parts, parts, 16);
                            break;

                        case "hp":
                            readStuff = collection.GroupEdgesHierarchyPerProcess(parts, machines, 16);
                            break;

                        case "hhp":
                            readStuff = collection.GroupEdgesHierarchyPerProcess(parts, machines * another, 16);
                            break;

                        default:
                            throw new ApplicationException("Grouping type must be sp, pp, op, hp or hhp");
                        }

                        // Capture the grouped edges so each timed run below can re-read them.
                        var sink = new InterGraphDataSink <int[]>(readStuff);

                        readComputation.Activate();
                        readComputation.Join();

                        Console.WriteLine("Reading done: " + readWatch.Elapsed);

                        // Run the computation 20 times over the captured sink, printing each run's elapsed time.
                        for (int i = 0; i < 20; ++i)
                        {
                            var stopwatch = System.Diagnostics.Stopwatch.StartNew();

                            using (var computation = controller.NewComputation())
                            {
                                var firstStage = computation.NewInput(sink.NewDataSource())
                                                 .ReformatInts()
                                                 .UnionFindHashTable(65000000, parts * machines * another, machines * another);

                                switch (args[3])
                                {
                                case "sp":
                                    firstStage
                                    .PartitionBy(x => parts * parts)
                                    .UnionFind(65000000);
                                    break;

                                case "pp":
                                    firstStage
                                    .PartitionBy(x => 16 * parts)
                                    .UnionFind(65000000);
                                    break;

                                case "op":
                                    firstStage
                                    .PartitionBy(x => 16 * (parts * parts))
                                    .UnionFind(65000000);
                                    break;

                                case "hp":
                                    if (parts * parts < 16)
                                    {
                                        firstStage
                                        .PartitionBy(x => 16 * x.destination + (parts * parts))
                                        .UnionFindStruct(65000000, 0, 0)
                                        .PartitionBy(x => 16 * (machines * machines))
                                        .UnionFind(65000000);
                                    }
                                    else
                                    {
                                        firstStage
                                        .PartitionBy(x => 16 * (x.destination + (machines * machines)))
                                        .UnionFindStruct(65000000, 0, 0)
                                        .PartitionBy(x => 16 * ((machines * machines) + (machines * machines)))
                                        .UnionFind(65000000);
                                    }
                                    break;

                                case "hhp":
                                    firstStage
                                    .PartitionBy(x => 16 * ((x.destination / (machines * machines)) + (machines * machines * another * another)) + (x.destination % (machines * machines)))
                                    .UnionFindStruct(65000000, -machines * another, another)
                                    .PartitionBy(x => 16 * (x.destination + (another * another) + (machines * machines * another * another)))
                                    .UnionFindStruct(65000000, -another, 1)
                                    .PartitionBy(x => 16 * ((another * another) + (another * another) + (machines * machines * another * another)))
                                    .UnionFind(65000000);
                                    break;

                                default:
                                    throw new ApplicationException("Grouping type must be sp, pp, op, hp or hhp");
                                }

                                computation.Activate();
                                computation.Join();
                            }

                            Console.WriteLine(stopwatch.Elapsed);
                        }
                    }

                    controller.Join();
                }
            }

            // Twitter connected components: UnionFindHashTable is used for every stage.
            if (algorithm == "hashtableonlycc" && dataset == "twitter")
            {
                using (Microsoft.Research.Peloponnese.Hdfs.HdfsInstance hdfs = new Microsoft.Research.Peloponnese.Hdfs.HdfsInstance(new Uri(uriBase)))
                {
                    // HDFS needs to be initialized from the main thread before distributed use;
                    // the call is made only for that side effect, so its result is discarded.
                    hdfs.IsFileExists("/dummy");
                }

                var readWatch = System.Diagnostics.Stopwatch.StartNew();

                using (var controller = NewController.FromConfig(configuration))
                {
                    using (var readComputation = controller.NewComputation())
                    {
                        int parts      = (args.Length > 4) ? Int32.Parse(args[4]) : 1;
                        int machines   = (args.Length > 5) ? Int32.Parse(args[5]) : 1;
                        int another    = (args.Length > 6) ? Int32.Parse(args[6]) : 1;
                        var format     = new Uri(uriBase + "twitter-40");
                        var collection = readComputation
                                         .ReadHdfsBinaryCollection <Edge>(format);

                        Stream <int[], Epoch> readStuff = null;

                        // Choose how the edges are grouped while reading, based on the grouping type.
                        switch (args[3])
                        {
                        case "sp":
                            readStuff = collection.GroupEdgesSingleProcess(parts, parts);
                            break;

                        case "pp":
                            readStuff = collection.GroupEdgesPartsPerProcess(parts, parts, 16);
                            break;

                        case "op":
                            readStuff = collection.GroupEdgesOnePerProcess(parts, parts, 16);
                            break;

                        case "hp":
                            readStuff = collection.GroupEdgesHierarchyPerProcess(parts, machines, 16);
                            break;

                        case "hhp":
                            readStuff = collection.GroupEdgesHierarchyPerProcess(parts, machines * another, 16);
                            break;

                        default:
                            throw new ApplicationException("Grouping type must be sp, pp, op, hp or hhp");
                        }

                        // Capture the grouped edges so each timed run below can re-read them.
                        var sink = new InterGraphDataSink <int[]>(readStuff);

                        readComputation.Activate();
                        readComputation.Join();

                        Console.WriteLine("Reading done: " + readWatch.Elapsed);

                        // Run the computation 20 times over the captured sink, printing each run's elapsed time.
                        for (int i = 0; i < 20; ++i)
                        {
                            var stopwatch = System.Diagnostics.Stopwatch.StartNew();

                            using (var computation = controller.NewComputation())
                            {
                                var firstStage = computation.NewInput(sink.NewDataSource())
                                                 .ReformatInts();

                                // The initial union-find stage is skipped when all three factors are 1.
                                if (parts * machines * another > 1)
                                {
                                    firstStage = firstStage
                                                 .UnionFindHashTable(65000000, parts * machines * another, machines * another);
                                }

                                switch (args[3])
                                {
                                case "sp":
                                    firstStage
                                    .PartitionBy(x => parts * parts)
                                    .UnionFindHashTable(65000000);
                                    break;

                                case "pp":
                                    firstStage
                                    .PartitionBy(x => 16 * parts)
                                    .UnionFindHashTable(65000000);
                                    break;

                                case "op":
                                    firstStage
                                    .PartitionBy(x => 16 * (parts * parts))
                                    .UnionFindHashTable(65000000);
                                    break;

                                case "hp":
                                    if (parts * parts < 16)
                                    {
                                        firstStage
                                        .PartitionBy(x => 16 * x.destination + (parts * parts))
                                        .UnionFindHashTable(65000000, 0, 0)
                                        .PartitionBy(x => 16 * (machines * machines))
                                        .UnionFindHashTable(65000000);
                                    }
                                    else
                                    {
                                        firstStage
                                        .PartitionBy(x => 16 * (x.destination + (machines * machines)))
                                        .UnionFindHashTable(65000000, 0, 0)
                                        .PartitionBy(x => 16 * ((machines * machines) + (machines * machines)))
                                        .UnionFindHashTable(65000000);
                                    }
                                    break;

                                case "hhp":
                                    firstStage
                                    .PartitionBy(x => 16 * ((x.destination / (machines * machines)) + (machines * machines * another * another)) + (x.destination % (machines * machines)))
                                    .UnionFindHashTable(65000000, -machines * another, another)
                                    .PartitionBy(x => 16 * (x.destination + (another * another) + (machines * machines * another * another)))
                                    .UnionFindHashTable(65000000, -another, 1)
                                    .PartitionBy(x => 16 * ((another * another) + (another * another) + (machines * machines * another * another)))
                                    .UnionFindHashTable(65000000);
                                    break;

                                default:
                                    throw new ApplicationException("Grouping type must be sp, pp, op, hp or hhp");
                                }

                                computation.Activate();
                                computation.Join();
                            }

                            Console.WriteLine(stopwatch.Elapsed);
                        }
                    }

                    controller.Join();
                }
            }


            if (algorithm == "connectedcomponents" && dataset == "livejournal")
            {
                var stopwatch = System.Diagnostics.Stopwatch.StartNew();

                using (var computation = NewComputation.FromConfig(configuration))
                {
                    var edges = System.IO.File.OpenRead(livejournalFile)
                                .ReadEdges()
                                .AsNaiadStream(computation);

                    // Union-find, then a second union-find over its output with everything partitioned to 0.
                    edges.UnionFind(5000000)
                    .PartitionBy(x => 0)
                    .UnionFind(5000000);

                    computation.Activate();
                    computation.Join();
                }

                Console.WriteLine(stopwatch.Elapsed);
            }
            #endregion
        }