Ejemplo n.º 1
0
        /// <summary>
        /// Encodes a categorical column into one numeric column named <c>colName + "_cvalues"</c>.
        /// </summary>
        /// <remarks>
        /// The distinct values of the column (excluding <c>DataFrame.NAN</c>) are collected as strings.
        /// Each row is then written as the zero-based position of its text in that list, except that
        /// position 0 is written as -1. Rows whose text is not in the list also yield -1, because
        /// <c>IndexOf</c> returns -1 when nothing matches.
        /// </remarks>
        /// <param name="df">Data frame holding the column.</param>
        /// <param name="colName">Name of the column to encode.</param>
        /// <param name="encodedOnly">
        /// When true, only the encoded column is selected from the extended frame;
        /// otherwise the extended frame is returned as a whole.
        /// </param>
        /// <returns>The resulting data frame together with the distinct class labels.</returns>
        private static (DataFrame, string[]) BinaryEncoding2(this DataFrame df, string colName, bool encodedOnly = false)
        {
            var source = df[colName];

            //distinct class labels, missing values left out
            var labels = source.Where(x => DataFrame.NAN != x).Select(x => x.ToString()).Distinct().ToList();

            //one code per row: label position, with position 0 remapped to -1
            var codes = new List<object>();
            foreach (var item in source)
            {
                int position = labels.IndexOf(item.ToString());
                codes.Add(position == 0 ? -1 : position);
            }

            //append the encoded column to a copy of the frame
            var encodedName = colName + "_cvalues";
            var extended    = df.AddColumns(new Dictionary<string, List<object>> { { encodedName, codes } });

            var result = encodedOnly ? extended[new string[] { encodedName }] : extended;
            return (result, labels.ToArray());
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Adding columns whose names already exist in the frame must throw, and the
        /// message must list the clashing column names.
        /// </summary>
        public void AddSameColumns_Test01()
        {
            //frame with seven columns of ten values each
            var initial = new Dictionary<string, List<object>>
            {
                ["col1"] = new List<object> { 1, 11, 21, 31, 41, 51, 61, 71, 81, 91 },
                ["col2"] = new List<object> { 2, 12, 22, 32, 42, 52, 62, 72, 82, 92 },
                ["col3"] = new List<object> { 3, 13, 23, 33, 43, 53, 63, 73, 83, 93 },
                ["col4"] = new List<object> { 4, 14, 24, 34, 44, 54, 64, 74, 84, 94 },
                ["col5"] = new List<object> { 5, 15, 25, 35, 45, 55, 65, 75, 85, 95 },
                ["col6"] = new List<object> { 6, 16, 26, 36, 46, 56, 66, 76, 86, 96 },
                ["col7"] = new List<object> { 7, 17, 27, 37, 47, 57, 67, 77, 87, 97 },
            };
            var df = new DataFrame(initial);

            //three columns to append; "col5" and "col2" reuse names already in the frame
            var clashing = new Dictionary<string, List<object>>
            {
                ["col5"]  = new List<object> { 8, 18, 28, 38, 48, 58, 68, 78, 88, 98 },
                ["col2"]  = new List<object> { 9, 19, 29, 39, 49, 59, 69, 79, 89, 99 },
                ["col10"] = new List<object> { 10, 20, 30, 40, 50, 60, 70, 80, 90, 100 },
            };

            //the duplicate names must be rejected
            var exception = Assert.ThrowsAny<System.Exception>(() => df.AddColumns(clashing));

            Assert.Equal("Column(s) 'col2, col5' already exist(s) in the data frame.", exception.Message);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// One-hot encodes a column with ML.NET and returns the encoded columns,
        /// either on their own or appended to the data frame.
        /// </summary>
        /// <param name="df">Data frame holding the column.</param>
        /// <param name="mlContext">ML.NET context used to load, fit and transform the data.</param>
        /// <param name="colName">Name of the column to encode.</param>
        /// <param name="encodedOnly">
        /// When true, a new frame is built from the encoded columns only;
        /// otherwise the encoded columns are added to <paramref name="df"/> via <c>AddColumns</c>.
        /// </param>
        /// <returns>The data frame produced as described for <paramref name="encodedOnly"/>.</returns>
        public static DataFrame EncodeColumn(this DataFrame df, MLContext mlContext, string colName, bool encodedOnly = false)
        {
            //wrap the text of every cell in a CategoryColumn row so it can be loaded as an IDataView
            var rows = df[colName].Select(x => new CategoryColumn()
            {
                Classes = x.ToString()
            });
            IDataView input = mlContext.Data.LoadFromEnumerable<CategoryColumn>(rows);

            var encoder = mlContext.Transforms.Categorical.OneHotEncoding(nameof(CategoryColumn.Classes)).Fit(input);
            var encoded = encoder.Transform(input);

            //slot names of the encoded column become the names of the new columns
            VBuffer<ReadOnlyMemory<char>> slotNames = default;
            encoded.Schema[nameof(CategoryColumn.Classes)].GetSlotNames(ref slotNames);

            var encodedRows = mlContext.Data.CreateEnumerable<EncodedColumn>(encoded, true);
            var columnNames = slotNames.DenseValues().Select(x => x.ToString()).ToList();
            var columns     = new Dictionary<string, List<object>>();

            //pivot the encoded rows into one list per slot name; lists are created on first use
            foreach (var row in encodedRows)
            {
                for (int slot = 0; slot < columnNames.Count; slot++)
                {
                    var name = columnNames[slot];
                    if (!columns.TryGetValue(name, out var cells))
                    {
                        cells = new List<object>();
                        columns.Add(name, cells);
                    }
                    cells.Add((object)row.Classes[slot]);
                }
            }

            return encodedOnly ? new DataFrame(columns) : df.AddColumns(columns);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Dummy-encodes a categorical column: every distinct class except the last one
        /// gets its own column holding 1 where the row belongs to that class and 0 otherwise.
        /// </summary>
        /// <remarks>
        /// Rows whose text matches none of the dummy classes (for example the omitted last class)
        /// receive 0 in every generated column.
        /// </remarks>
        /// <param name="df">Data frame holding the column.</param>
        /// <param name="colName">Name of the column to encode.</param>
        /// <param name="encodedOnly">
        /// When true, a new frame is built from the dummy columns only;
        /// otherwise the dummy columns are added to <paramref name="df"/> via <c>AddColumns</c>.
        /// </param>
        /// <returns>
        /// The resulting data frame and all distinct class labels, including the last one
        /// that has no column of its own.
        /// </returns>
        private static (DataFrame, string[]) DummyEncodeColumn(this DataFrame df, string colName, bool encodedOnly = false)
        {
            var colVector   = df[colName];
            var classValues = colVector.Where(x => DataFrame.NAN != x).Select(x => x.ToString()).Distinct().ToArray();

            //materialize once: SkipLast is deferred, so counting it inside the loops
            //would re-enumerate the query for every class of every row
            var dummyClasses = classValues.SkipLast(1).ToArray();

            //define encoded columns; keep the lists in an array to avoid a dictionary lookup per cell
            var dict    = new Dictionary<string, List<object>>();
            var columns = new List<object>[dummyClasses.Length];
            for (int i = 0; i < dummyClasses.Length; i++)
            {
                columns[i] = new List<object>();
                dict.Add(dummyClasses[i], columns[i]);
            }

            //encode values
            foreach (var cValue in colVector)
            {
                //text of the row is the same for every class, so compute it once
                var text = cValue.ToString();
                for (int i = 0; i < columns.Length; i++)
                {
                    columns[i].Add(text == dummyClasses[i] ? 1 : 0);
                }
            }

            if (encodedOnly)
            {
                var newDf = new DataFrame(dict);
                return (newDf, classValues);
            }
            else
            {
                var newDf = df.AddColumns(dict);
                return (newDf, classValues);
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Converts the values of a categorical column to ML.NET key values and passes them,
        /// as a column named <c>colName + "_cvalues"</c>, to <c>AddColumns</c>.
        /// </summary>
        /// <param name="df">Data frame holding the column.</param>
        /// <param name="mlContext">ML.NET context used to load, fit and transform the data.</param>
        /// <param name="colName">Name of the column to convert.</param>
        public static void CategoryToKey(this DataFrame df, MLContext mlContext, string colName)
        {
            //wrap the text of every cell in a CategoryColumn row so it can be loaded as an IDataView
            var rows = df[colName].Select(x => new CategoryColumn()
            {
                Classes = x.ToString()
            });
            IDataView input = mlContext.Data.LoadFromEnumerable<CategoryColumn>(rows);

            //fit the categorical estimator with key output and apply it
            var keyEncoder = mlContext.Transforms.Categorical
                                      .OneHotEncoding(nameof(CategoryColumn.Classes), outputKind: OneHotEncodingEstimator.OutputKind.Key)
                                      .Fit(input);
            var keyed = keyEncoder.Transform(input);

            //collect the key of every row
            var keys = mlContext.Data.CreateEnumerable<CategoryValues>(keyed, true)
                                .Select(row => (object)row.Classes)
                                .ToList();

            //NOTE(review): the value returned by AddColumns is discarded. If AddColumns builds a new
            //frame instead of modifying df in place, this method leaves df unchanged - confirm.
            df.AddColumns(new Dictionary<string, List<object>> { { colName + "_cvalues", keys } });
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Adding three new columns must return a ten-column frame, leave the original
        /// frame at seven columns, and expose the values 1, 2, 3, ... in order through <c>Values</c>.
        /// </summary>
        public void AddColumns_Test01()
        {
            //frame with seven columns of ten values each
            var initial = new Dictionary<string, List<object>>
            {
                ["col1"] = new List<object> { 1, 11, 21, 31, 41, 51, 61, 71, 81, 91 },
                ["col2"] = new List<object> { 2, 12, 22, 32, 42, 52, 62, 72, 82, 92 },
                ["col3"] = new List<object> { 3, 13, 23, 33, 43, 53, 63, 73, 83, 93 },
                ["col4"] = new List<object> { 4, 14, 24, 34, 44, 54, 64, 74, 84, 94 },
                ["col5"] = new List<object> { 5, 15, 25, 35, 45, 55, 65, 75, 85, 95 },
                ["col6"] = new List<object> { 6, 16, 26, 36, 46, 56, 66, 76, 86, 96 },
                ["col7"] = new List<object> { 7, 17, 27, 37, 47, 57, 67, 77, 87, 97 },
            };
            var df = new DataFrame(initial);

            //three columns with names not yet present in the frame
            var extra = new Dictionary<string, List<object>>
            {
                ["col8"]  = new List<object> { 8, 18, 28, 38, 48, 58, 68, 78, 88, 98 },
                ["col9"]  = new List<object> { 9, 19, 29, 39, 49, 59, 69, 79, 89, 99 },
                ["col10"] = new List<object> { 10, 20, 30, 40, 50, 60, 70, 80, 90, 100 },
            };

            var newDf = df.AddColumns(extra);

            //the source frame keeps its shape; the result carries all ten columns
            Assert.Equal(7, df.ColCount());
            Assert.Equal(10, newDf.ColCount());

            //cell at position idx must hold idx + 1
            var cells = newDf.Values;
            for (int idx = 0, expected = 1; idx < cells.Count; idx++, expected++)
            {
                Assert.Equal(expected, cells[idx]);
            }
        }