/// <summary>
/// Program entry point. Parses the command line, connects to the cluster, loads the
/// training (and optional test) dataset, trains a linear model with distributed SGD
/// and, when an output path was supplied, writes the learned weights and bias to it.
/// </summary>
/// <param name="args">Raw command-line arguments, handed to <c>Arguments.Parse</c>.</param>
static void Main(string[] args)
{
    Arguments arg;
    if ((arg = Arguments.Parse(args)) == null)
    {
        // Parsing produced no arguments object; nothing to run.
        return;
    }

    var sw = System.Diagnostics.Stopwatch.StartNew();
    Console.Write("Initializing/Connecting to cluster... ");
    Logger.ParseArgs(new string[] { "-log", "DistributedSGD.log" });
    Prajna.Core.Environment.Init();
    var cluster = new Cluster(arg.Cluster);
    Console.WriteLine($"done. (took: {sw.Elapsed})");

    sw.Restart();
    Console.Write("Loading dataset(s)... ");
    // A non-positive example count on the command line means "no limit".
    DSet<Example> trainSet = LoadDSet(cluster, arg.TrainFile, arg.NumPartitions, arg.NumTrain > 0 ? arg.NumTrain : int.MaxValue, 0);
    DSet<Example> testSet = arg.TestFile == null
        ? null
        : LoadDSet(cluster, arg.TestFile, arg.NumPartitions, arg.NumTest > 0 ? arg.NumTest : int.MaxValue, 0);

    // Dimension of a dataset = largest feature index seen in any example, plus one.
    Func<DSet<Example>, int> getDimension =
        ds => ds.Fold((max, ex) => Math.Max(ex.Features.Indices.Max(), max), Math.Max, 0) + 1;
    // Train and test sets share one dimension: the larger of the two.
    var dimension = Math.Max(getDimension(trainSet), arg.TestFile == null ? 0 : getDimension(testSet));
    Console.WriteLine($"done. (took: {sw.Elapsed})");

    SetDimension(trainSet, dimension);
    Console.WriteLine($"Train Count: {trainSet.Count()}");
    float trainPrior = GetPrior(trainSet);
    Console.WriteLine($"Train Prior: {trainPrior}");

    if (arg.TestFile != null)
    {
        SetDimension(testSet, dimension);
        Console.WriteLine($"Test Count: {testSet.Count()}");
        float testPrior = GetPrior(testSet);
        Console.WriteLine($"Test Prior: {testPrior}");
    }

    ILossFunction loss;
    // Invariant lower-casing keeps the match independent of the current culture
    // (e.g. "HINGE" lower-cases to a dotless i under Turkish culture rules).
    switch (arg.Loss.ToLowerInvariant())
    {
        case "logistic":
            loss = new LogisticLoss();
            break;
        case "hinge":
            loss = new HingeLoss();
            break;
        default:
            Console.WriteLine($"Unrecognized loss function: {arg.Loss}. Supported values: Hinge, Logistic.");
            return;
    }

    if (arg.ModelOut != null)
    {
        // GetDirectoryName yields an empty string for a bare file name (output goes to the
        // current directory) and null for a root path; only a named directory is checked.
        string modelDirectory = Path.GetDirectoryName(arg.ModelOut);
        if (!string.IsNullOrEmpty(modelDirectory) && !Directory.Exists(modelDirectory))
        {
            Console.WriteLine($"Directory {modelDirectory} not found.");
            return;
        }
    }

    // Train with the loss function selected above rather than a hard-coded one.
    var model = new LinearModel(loss, arg.LearningRate, arg.L2, arg.L1);
    var initialParams = new WeightsAndBias(new float[dimension], 0.0f);
    WeightsAndBias finalParams = RunSGD(trainSet, testSet, model, initialParams, DistributedSGD<WeightsAndBias>.Instance, arg.NumEpochs);

    if (arg.ModelOut != null)
    {
        Console.WriteLine();
        Console.Write("Done training. Saving model... ");
        sw.Restart();
        // Plain-text model file: dimension, then one weight per line, then the bias.
        using (var writer = new StreamWriter(arg.ModelOut))
        {
            writer.WriteLine("Dimension:");
            writer.WriteLine(finalParams.Weights.Length);
            writer.WriteLine("Weights:");
            foreach (var w in finalParams.Weights)
            {
                writer.WriteLine(w);
            }
            writer.WriteLine("Bias:");
            writer.WriteLine(finalParams.Bias);
        }
        Console.WriteLine($"done. (took: {sw.Elapsed})");
    }
}
/// <summary>
/// Measures how often the model's prediction equals the example's label.
/// </summary>
/// <param name="dataset">Examples to evaluate.</param>
/// <param name="model">Model whose <c>Predict</c> is called for every example.</param>
/// <param name="curParams">Parameters passed to <c>Predict</c> on each call.</param>
/// <returns>Count of matching predictions divided by <c>dataset.Count()</c>.</returns>
public float GetAccuracy(DSet<Example> dataset, ISGDModel<Params> model, Params curParams)
{
    // First lambda tallies matches within a fold; second lambda sums the partial tallies.
    int correct = dataset.Fold(
        (tally, example) => tally + (model.Predict(curParams, example) == example.Label ? 1 : 0),
        (left, right) => left + right,
        0);

    float correctAsFloat = correct;
    return correctAsFloat / dataset.Count();
}
/// <summary>
/// Computes the "prior" of a dataset: the share of its examples whose label equals 1.0f.
/// </summary>
/// <param name="examples">Dataset whose labels are inspected.</param>
/// <returns>Number of examples labelled 1.0f divided by <c>examples.Count()</c>.</returns>
private static float GetPrior(DSet<Example> examples)
{
    // First lambda counts positives within a fold; second lambda adds the partial counts.
    long positives = examples.Fold(
        (seen, example) => seen + (example.Label == 1.0f ? 1 : 0),
        (left, right) => left + right,
        0);

    float positivesAsFloat = positives;
    return positivesAsFloat / examples.Count();
}