/// <summary>
/// Builds a copy of <paramref name="dataTable"/> in which every column marked as categorical
/// is replaced by one-hot indicator columns.
/// </summary>
/// <remarks>
/// A column is treated as categorical when its entry in <paramref name="columnTypes"/> is
/// <c>typeof(object)</c>. Each indicator column is named <c>{originalColumnName}_{classLabel}</c>,
/// where the class labels come from a <c>LabelEncoder</c> fitted on that column.
/// All column additions and removals are applied to the returned copy.
/// </remarks>
/// <param name="dataTable">Table whose categorical columns should be encoded.</param>
/// <param name="columnTypes">
/// Per-column type markers, indexed in the same order as <c>dataTable.Columns</c>.
/// Must contain at least one entry per column.
/// </param>
/// <param name="dropFirst">
/// When <c>true</c>, the indicator column for the first class (index 0) of each categorical
/// column is not emitted.
/// </param>
/// <returns>A new table with the categorical columns replaced by their indicator columns.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="dataTable"/> or <paramref name="columnTypes"/> is <c>null</c>.
/// </exception>
/// <exception cref="ArgumentException">
/// <paramref name="columnTypes"/> has fewer entries than the table has columns.
/// </exception>
public static DataTable OneHot(this DataTable dataTable, Type[] columnTypes, bool dropFirst = false)
{
    if (dataTable == null)
    {
        throw new ArgumentNullException(nameof(dataTable));
    }

    if (columnTypes == null)
    {
        throw new ArgumentNullException(nameof(columnTypes));
    }

    int columnCount = dataTable.Columns.Count;
    if (columnTypes.Length < columnCount)
    {
        // Fail up front instead of with an IndexOutOfRangeException half-way through the encoding.
        throw new ArgumentException(
            $"Expected at least {columnCount} column types but got {columnTypes.Length}.",
            nameof(columnTypes));
    }

    // Work on a copy (schema + rows) so that columns are added to / removed from the result only.
    DataTable result = new DataTable();
    result.Merge(dataTable);

    int firstClassIndex = dropFirst ? 1 : 0;

    // Pass 1: append the indicator columns for every categorical column.
    for (int i = 0; i < columnCount; i++)
    {
        if (columnTypes[i] != typeof(object))
        {
            continue;
        }

        DataColumn column = dataTable.Columns[i];
        string columnName = column.ColumnName;

        LabelEncoder le = new LabelEncoder();
        var values = le.FitTransform(column);
        var oneHotValues = Matrix.OneHot(values);
        var classes = le.Classes;

        for (int j = firstClassIndex; j < classes.Length; j++)
        {
            result.SetColumn($"{columnName}_{classes[j]}", Matrix.GetColumn(oneHotValues, j));
        }
    }

    // Pass 2: drop the original categorical columns. Kept as a separate pass so that the
    // order of SetColumn/DropColumn calls on the result matches the original implementation.
    for (int i = 0; i < columnCount; i++)
    {
        if (columnTypes[i] == typeof(object))
        {
            result.DropColumn(dataTable.Columns[i].ColumnName);
        }
    }

    return result;
}
/// <summary>
/// Produces the train/test feature matrices, the encoded class labels and the label names,
/// either by loading them from the cache or by building them from the KDD data files.
/// </summary>
/// <remarks>
/// When <paramref name="forcePrepare"/> is <c>false</c> and <c>Cache.IsCacheExists</c> is <c>true</c>,
/// all five outputs are loaded from the cache files. Otherwise the data is rebuilt from
/// <c>KDDTrain+.arff</c>, <c>KDDTrain+.txt</c> and <c>KDDTest+.txt</c> located under
/// <c>Settings.PathToData</c>, and the results are written back to the cache.
/// The bracketed step numbers in the comments are kept from the original code.
/// </remarks>
/// <param name="x_train">Training feature rows after one-hot encoding.</param>
/// <param name="x_test">Test feature rows after one-hot encoding.</param>
/// <param name="y_train">Numeric class codes for the training rows.</param>
/// <param name="y_test">Numeric class codes for the test rows.</param>
/// <param name="y_labels">Class names as reported by the label encoder (<c>Classes</c>).</param>
/// <param name="forcePrepare">When <c>true</c>, the cache is ignored and the data is rebuilt.</param>
/// <exception cref="InvalidDataException">
/// <c>KDDTrain+.arff</c> ends before a data row is found.
/// </exception>
public static void PrepareData(out double[][] x_train, out double[][] x_test, out int[] y_train, out int[] y_test, out string[] y_labels, bool forcePrepare = false)
{
    if (!forcePrepare && Cache.IsCacheExists)
    {
        Cache.LoadFromCache("x_train.txt", out x_train);
        Cache.LoadFromCache("x_test.txt", out x_test);
        Cache.LoadFromCache("y_train.txt", out y_train);
        Cache.LoadFromCache("y_test.txt", out y_test);
        Cache.LoadFromCache("y_labels.txt", out y_labels);
        return;
    }

    //[1] Read the column names and infer the column types from KDDTrain+.arff
    string[] data_columns;
    Type[] data_columns_types;
    ReadArffSchema(Path.Combine(Settings.PathToData, "KDDTrain+.arff"), out data_columns, out data_columns_types);

    // The .txt files carry one more column than the ARFF header declares; label it "difficulty".
    var data_columns_ext = new string[data_columns.Length + 1];
    Array.Copy(data_columns, data_columns_ext, data_columns.Length);
    data_columns_ext[data_columns_ext.Length - 1] = "difficulty";

    //[2] Read the train data from KDDTrain+.txt
    DataTable df_train = LoadKddTable(Path.Combine(Settings.PathToData, "KDDTrain+.txt"), data_columns_types, data_columns_ext);

    //[3] Read the test data from KDDTest+.txt (same steps as for the train data)
    DataTable df_test = LoadKddTable(Path.Combine(Settings.PathToData, "KDDTest+.txt"), data_columns_types, data_columns_ext);

    //[6] Find the classes that occur in both the train and the test dataset
    var train_set = new HashSet<string>(df_train.Columns["class"].ToArray<string>());
    var test_set = new HashSet<string>(df_test.Columns["class"].ToArray<string>());
    var common_values = train_set.Intersect(test_set).ToArray<string>();

    //[14] Remove the rows whose class is not present in both datasets
    df_train.RemoveRows(x => !common_values.Contains(x["class"]), acceptChanges: true);
    df_test.RemoveRows(x => !common_values.Contains(x["class"]), acceptChanges: true);

    //[15] Map the string classes to numeric codes; the encoder is fitted on the train set only
    LabelEncoder le = new LabelEncoder();
    y_train = le.FitTransform(df_train.Columns["class"]);
    y_test = le.Transform(df_test.Columns["class"]);
    y_labels = le.Classes;

    //[16] One-hot encode the categorical columns.
    // A marker column records which dataset each row came from, so the merged table can be split again.
    df_train.SetColumn("train", 1, acceptChanges: true);
    df_test.SetColumn("train", 0, acceptChanges: true);
    var df_full = Utils.ConcatDataTables(df_train, df_test);
    df_full.DropColumn("class", acceptChanges: true);

    // Re-infer the column types from the first merged row; they drive which columns get encoded.
    Utils.InferentTypes(df_full.Rows[0].ItemArray, out data_columns_types);
    var df_full_encoded = df_full.OneHot(data_columns_types, dropFirst: true);

    // Split the merged table back into train and test rows using the marker column...
    var X_train_encoded = df_full_encoded.AsEnumerable().Where(x => x["train"].ToString() == "1").CopyToDataTable();
    var X_test_encoded = df_full_encoded.AsEnumerable().Where(x => x["train"].ToString() == "0").CopyToDataTable();

    // ...and remove the marker column from all three tables.
    df_full_encoded.DropColumn("train", acceptChanges: true);
    X_train_encoded.DropColumn("train", acceptChanges: true);
    X_test_encoded.DropColumn("train", acceptChanges: true);

    // Convert the DataTables to jagged double arrays (the original author notes this
    // dramatically speeds up learning).
    x_train = X_train_encoded.ToJagged();
    x_test = X_test_encoded.ToJagged();

    Cache.SaveToCache(df_full_encoded, "df_full_encoded.txt");
    Cache.SaveToCache(x_train, "x_train.txt");
    Cache.SaveToCache(x_test, "x_test.txt");
    Cache.SaveToCache(y_train, "y_train.txt");
    Cache.SaveToCache(y_test, "y_test.txt");
    Cache.SaveToCache(le.Classes, "y_labels.txt");
}

// Reads the attribute names from an ARFF header and infers the column types from the first data row.
private static void ReadArffSchema(string arffPath, out string[] columnNames, out Type[] columnTypes)
{
    var names = new List<string>();

    using (var reader = new StreamReader(File.OpenRead(arffPath)))
    {
        reader.ReadLine(); // skip first line

        // Collect the second token of every consecutive "@attribute" line, with quotes stripped.
        // ReadLine returns null at end of file, so guard before calling StartsWith.
        string line;
        while ((line = reader.ReadLine()) != null && line.StartsWith("@attribute", StringComparison.Ordinal))
        {
            names.Add(line.Split()[1].Replace("'", ""));
        }

        // The loop consumed the line following the attribute list; the next line is taken as the first data row.
        string firstDataRow = reader.ReadLine();
        if (firstDataRow == null)
        {
            throw new InvalidDataException($"'{arffPath}' ended before a data row was found.");
        }

        Utils.InferentTypes(firstDataRow.Split(','), out columnTypes);
    }

    columnNames = names.ToArray();
}

// Loads a header-less KDD .txt file, applies the column types and headers, and drops the "difficulty" column.
private static DataTable LoadKddTable(string path, Type[] columnTypes, string[] headers)
{
    // NOTE(review): CsvReader is not disposed here (nor was it originally); if it is IDisposable,
    // consider wrapping it in a using block - confirm against the library.
    var reader = new CsvReader(path, hasHeaders: false);
    DataTable table = reader.ToTable();        // per the original comment: all columns are strings with empty headers
    table = table.ChangeTypes(columnTypes);    // convert the columns to the inferred types
    table.AssignHeaders(headers);              // add headers
    table.Columns.Remove("difficulty");
    return table;
}