Example #1
0
 /// <summary>
 /// Split the dataset into the train set and test set according to the given fraction.
 /// Respects the <paramref name="stratificationColumn"/> if provided.
 /// </summary>
 /// <typeparam name="T">The tuple describing the data schema.</typeparam>
 /// <param name="catalog">The training catalog.</param>
 /// <param name="data">The dataset to split.</param>
 /// <param name="testFraction">The fraction of data to go into the test set.</param>
 /// <param name="stratificationColumn">Optional selector for the column to use as a stratification column. If two examples share the same value of the <paramref name="stratificationColumn"/>
 /// (if provided), they are guaranteed to appear in the same subset (train or test). Use this to make sure there is no label leakage from train to the test set.
 /// If this optional parameter is not provided, a stratification column will be generated, and its values will be random numbers.</param>
 /// <param name="seed">Optional parameter used in combination with the <paramref name="stratificationColumn"/>.
 /// If the <paramref name="stratificationColumn"/> is not provided, the random numbers generated to create it will use this seed.
 /// If the seed is also not provided, the default value will be used.</param>
 /// <returns>A pair of datasets, for the train and test set.</returns>
 public static (DataView <T> trainSet, DataView <T> testSet) TrainTestSplit <T>(this TrainCatalogBase catalog,
                                                                                DataView <T> data, double testFraction = 0.1, Func <T, PipelineColumn> stratificationColumn = null, uint?seed = null)
 {
 /// <summary>
 /// Retrieves the host environment associated with the given catalog.
 /// </summary>
 /// <param name="catalog">The catalog to read the environment from; it is passed through
 /// <c>Contracts.CheckRef</c> before its <c>Environment</c> member is accessed.</param>
 /// <returns>The value of the <c>Environment</c> member of the checked catalog.</returns>
 public static IHostEnvironment GetEnvironment(TrainCatalogBase catalog)
 {
     // Run the reference check first so the member access only happens on the checked value.
     var checkedCatalog = Contracts.CheckRef(catalog, nameof(catalog));
     return checkedCatalog.Environment;
 }