/// <summary>
/// Encodes the values of a categorical column as integer class indexes and stores them
/// in a new column named <paramref name="colName"/> + "_cvalues".
/// </summary>
/// <param name="df">Data frame that contains the column to encode.</param>
/// <param name="colName">Name of the column to encode.</param>
/// <param name="encodedOnly">
/// When true only the encoded column is returned; otherwise the data frame produced by
/// AddColumns (original columns plus the encoded one) is returned.
/// </param>
/// <returns>The resulting data frame together with the distinct class labels.</returns>
private static (DataFrame, string[]) BinaryEncoding2(this DataFrame df, string colName, bool encodedOnly = false)
{
    var source = df[colName];

    // Distinct string forms of all values that are not DataFrame.NAN.
    var classValues = source
        .Where(x => DataFrame.NAN != x)
        .Select(x => x.ToString())
        .Distinct()
        .ToList();

    var encodedName = colName + "_cvalues";
    var encodedValues = new List<object>();
    foreach (var value in source)
    {
        int ordinal = classValues.IndexOf(value.ToString());

        // The class at index 0 is written as -1; any other value keeps its index.
        // NOTE(review): IndexOf also yields -1 for a value that is not in classValues,
        // which is indistinguishable from the first class - confirm this is intended.
        encodedValues.Add(ordinal == 0 ? -1 : ordinal);
    }

    var columns = new Dictionary<string, List<object>>
    {
        { encodedName, encodedValues },
    };
    var newDf = df.AddColumns(columns);

    if (!encodedOnly)
    {
        return (newDf, classValues.ToArray());
    }

    return (newDf[new string[] { encodedName }], classValues.ToArray());
}
/// <summary>
/// Verifies that AddColumns throws, with the expected message, when some of the
/// supplied column names are already present in the data frame.
/// </summary>
public void AddSameColumns_Test01()
{
    // Seven initial columns, ten rows each.
    var initialColumns = new Dictionary<string, List<object>>
    {
        { "col1", new List<object> { 1, 11, 21, 31, 41, 51, 61, 71, 81, 91 } },
        { "col2", new List<object> { 2, 12, 22, 32, 42, 52, 62, 72, 82, 92 } },
        { "col3", new List<object> { 3, 13, 23, 33, 43, 53, 63, 73, 83, 93 } },
        { "col4", new List<object> { 4, 14, 24, 34, 44, 54, 64, 74, 84, 94 } },
        { "col5", new List<object> { 5, 15, 25, 35, 45, 55, 65, 75, 85, 95 } },
        { "col6", new List<object> { 6, 16, 26, 36, 46, 56, 66, 76, 86, 96 } },
        { "col7", new List<object> { 7, 17, 27, 37, 47, 57, 67, 77, 87, 97 } },
    };
    var df = new DataFrame(initialColumns);

    // Three columns to add: "col5" and "col2" clash with existing names, "col10" is new.
    var clashingColumns = new Dictionary<string, List<object>>
    {
        { "col5", new List<object> { 8, 18, 28, 38, 48, 58, 68, 78, 88, 98 } },
        { "col2", new List<object> { 9, 19, 29, 39, 49, 59, 69, 79, 89, 99 } },
        { "col10", new List<object> { 10, 20, 30, 40, 50, 60, 70, 80, 90, 100 } },
    };

    // Adding columns whose names already exist must fail and report the duplicates.
    var thrown = Assert.ThrowsAny<System.Exception>(() => df.AddColumns(clashingColumns));

    Assert.Equal("Column(s) 'col2, col5' already exist(s) in the data frame.", thrown.Message);
}
/// <summary>
/// One-hot encodes a column with ML.NET and returns the encoded values as data frame
/// columns, one column per slot name reported by the transformed schema.
/// </summary>
/// <param name="df">Data frame that contains the column to encode.</param>
/// <param name="mlContext">ML.NET context used to load, fit and transform the data.</param>
/// <param name="colName">Name of the column to encode.</param>
/// <param name="encodedOnly">
/// When true a new data frame holding only the encoded columns is returned; otherwise
/// the encoded columns are added to <paramref name="df"/> through AddColumns.
/// </param>
/// <returns>The data frame containing the encoded columns.</returns>
public static DataFrame EncodeColumn(this DataFrame df, MLContext mlContext, string colName, bool encodedOnly = false)
{
    // Feed the string form of every column value to ML.NET as a CategoryColumn row.
    var categoryRows = df[colName].Select(x => new CategoryColumn() { Classes = x.ToString() });
    IDataView data = mlContext.Data.LoadFromEnumerable<CategoryColumn>(categoryRows);

    var transformer = mlContext.Transforms.Categorical
        .OneHotEncoding(nameof(CategoryColumn.Classes))
        .Fit(data);
    var transformed = transformer.Transform(data);

    // The slot names annotation of the encoded column supplies the new column names.
    VBuffer<ReadOnlyMemory<char>> slotNames = default;
    transformed.Schema[nameof(CategoryColumn.Classes)].GetSlotNames(ref slotNames);

    var encodedRows = mlContext.Data.CreateEnumerable<EncodedColumn>(transformed, true);
    var columnNames = slotNames.DenseValues().Select(x => x.ToString()).ToList();

    // Transpose the encoded rows into one value list per slot name.
    var columns = new Dictionary<string, List<object>>();
    foreach (var row in encodedRows)
    {
        for (int slot = 0; slot < columnNames.Count; slot++)
        {
            // Columns are created on first use, so no rows means no columns.
            if (!columns.TryGetValue(columnNames[slot], out var columnValues))
            {
                columnValues = new List<object>();
                columns.Add(columnNames[slot], columnValues);
            }

            columnValues.Add((object)row.Classes[slot]);
        }
    }

    return encodedOnly ? new DataFrame(columns) : df.AddColumns(columns);
}
/// <summary>
/// Dummy-encodes a categorical column: every distinct class except the last one gets
/// its own 0/1 indicator column named after the class.
/// </summary>
/// <param name="df">Data frame that contains the column to encode.</param>
/// <param name="colName">Name of the column to encode.</param>
/// <param name="encodedOnly">
/// When true a new data frame holding only the indicator columns is returned; otherwise
/// the indicator columns are added to <paramref name="df"/> through AddColumns.
/// </param>
/// <returns>
/// The resulting data frame together with all distinct class labels (including the
/// last class, which has no indicator column).
/// </returns>
private static (DataFrame, string[]) DummyEncodeColumn(this DataFrame df, string colName, bool encodedOnly = false)
{
    var colVector = df[colName];

    // Distinct string forms of all values that are not DataFrame.NAN.
    var classValues = colVector
        .Where(x => DataFrame.NAN != x)
        .Select(x => x.ToString())
        .Distinct()
        .ToArray();

    // The last class is dropped. SkipLast is lazily evaluated, so the result is
    // materialized once here; using it directly as the inner loop bound would
    // re-enumerate the sequence on every iteration for every row.
    var dummyClasses = classValues.SkipLast(1).ToArray();

    // One (initially empty) indicator column per dummy class.
    var dict = new Dictionary<string, List<object>>();
    foreach (var c in dummyClasses)
    {
        dict.Add(c, new List<object>());
    }

    // For every row write 1 into the column of the matching class and 0 into the others;
    // a row that belongs to the dropped last class therefore gets 0 in every column.
    foreach (var cValue in colVector)
    {
        for (int i = 0; i < dummyClasses.Length; i++)
        {
            dict[dummyClasses[i]].Add(cValue.ToString() == dummyClasses[i] ? 1 : 0);
        }
    }

    if (encodedOnly)
    {
        return (new DataFrame(dict), classValues);
    }

    return (df.AddColumns(dict), classValues);
}
/// <summary>
/// Converts a categorical column to key values using ML.NET's one-hot encoding
/// estimator configured with <c>OutputKind.Key</c>, and adds them as a new column named
/// <paramref name="colName"/> + "_cvalues".
/// </summary>
/// <param name="df">Data frame that contains the column to convert.</param>
/// <param name="mlContext">ML.NET context used to load, fit and transform the data.</param>
/// <param name="colName">Name of the column to convert.</param>
/// <returns>
/// The data frame produced by AddColumns, i.e. the original columns plus the key column.
/// AddColumns returns a new data frame and leaves the receiver unchanged, so the result
/// must be returned to the caller; otherwise the computed column would be lost.
/// </returns>
public static DataFrame CategoryToKey(this DataFrame df, MLContext mlContext, string colName)
{
    var colVector = df[colName];

    // Feed the string form of every column value to ML.NET as a CategoryColumn row.
    IDataView data = mlContext.Data.LoadFromEnumerable<CategoryColumn>(
        colVector.Select(x => new CategoryColumn() { Classes = x.ToString() }));

    var fitData = mlContext.Transforms.Categorical
        .OneHotEncoding(nameof(CategoryColumn.Classes), outputKind: OneHotEncodingEstimator.OutputKind.Key)
        .Fit(data);
    var transData = fitData.Transform(data);
    var convertedData = mlContext.Data.CreateEnumerable<CategoryValues>(transData, true);

    // Collect the key assigned to every row.
    var colValues = new List<object>();
    foreach (var r in convertedData)
    {
        colValues.Add(r.Classes);
    }

    var dict = new Dictionary<string, List<object>>
    {
        { colName + "_cvalues", colValues },
    };

    return df.AddColumns(dict);
}
/// <summary>
/// Verifies that AddColumns returns a data frame extended with the new columns while
/// the data frame it was called on keeps its original column count.
/// </summary>
public void AddColumns_Test01()
{
    // Seven initial columns, ten rows each.
    var initialColumns = new Dictionary<string, List<object>>
    {
        { "col1", new List<object> { 1, 11, 21, 31, 41, 51, 61, 71, 81, 91 } },
        { "col2", new List<object> { 2, 12, 22, 32, 42, 52, 62, 72, 82, 92 } },
        { "col3", new List<object> { 3, 13, 23, 33, 43, 53, 63, 73, 83, 93 } },
        { "col4", new List<object> { 4, 14, 24, 34, 44, 54, 64, 74, 84, 94 } },
        { "col5", new List<object> { 5, 15, 25, 35, 45, 55, 65, 75, 85, 95 } },
        { "col6", new List<object> { 6, 16, 26, 36, 46, 56, 66, 76, 86, 96 } },
        { "col7", new List<object> { 7, 17, 27, 37, 47, 57, 67, 77, 87, 97 } },
    };
    var df = new DataFrame(initialColumns);

    // Three additional columns with names that are not yet in the data frame.
    var extraColumns = new Dictionary<string, List<object>>
    {
        { "col8", new List<object> { 8, 18, 28, 38, 48, 58, 68, 78, 88, 98 } },
        { "col9", new List<object> { 9, 19, 29, 39, 49, 59, 69, 79, 89, 99 } },
        { "col10", new List<object> { 10, 20, 30, 40, 50, 60, 70, 80, 90, 100 } },
    };

    var newDf = df.AddColumns(extraColumns);

    // The source frame still has seven columns; the returned frame has ten.
    Assert.Equal(7, df.ColCount());
    Assert.Equal(10, newDf.ColCount());

    // Every entry of Values must equal its position + 1.
    for (int position = 0; position < newDf.Values.Count; position++)
    {
        Assert.Equal(position + 1, newDf.Values[position]);
    }
}