コード例 #1
0
                protected override void VerifyView(IDataView view)
                {
                    Host.AssertValue(view);

                    // The view must contain exactly one column, and that column must be a vector.
                    var viewSchema = view.Schema;
                    Host.CheckDecode(viewSchema.ColumnCount == 1);

                    var transposedType = viewSchema.GetColumnType(0);
                    Host.CheckDecode(transposedType.IsVector);
                    // There is no way to encode a zero-length vector type per se when the
                    // original dataset had no rows, so an "unknown" vector count is accepted
                    // as meaning a zero-row dataset.
                    Host.CheckDecode(transposedType.ValueCount == _parent._header.RowCount);

                    // This came from a binary IDV, so it must carry an actual "row" count —
                    // though here each "row" is really a slot of the transposed column.
                    var maybeRowCount = view.GetRowCount();
                    Host.Assert(maybeRowCount.HasValue);
                    long slotCount = maybeRowCount.Value;

                    // There must be one "row" per "slot" on the column this is a transpose of.
                    var parentColumnType = _parent.Schema.GetColumnType(_col);
                    Host.CheckDecode(parentColumnType.ValueCount == slotCount);

                    // The item types must agree between the transposed view and the source column.
                    Host.CheckDecode(parentColumnType.ItemType.Equals(transposedType.ItemType));
                }
コード例 #2
0
        private AppendRowsDataView(IHostEnvironment env, Schema schema, IDataView[] sources)
        {
            Contracts.CheckValue(env, nameof(env));
            _host = env.Register(RegistrationName);

            _host.AssertValueOrNull(schema);
            _host.AssertValue(sources);
            _host.Assert(sources.Length >= 2);

            _sources = sources;
            // When no schema is supplied, borrow the schema of the first source.
            _schema = schema ?? _sources[0].Schema;

            CheckSchemaConsistency();

            // Shuffling is only supported when every source can shuffle AND reports a
            // known, non-negative row count that fits in an int. Short-circuiting keeps
            // GetRowCount from being called on a source that cannot shuffle.
            _canShuffle = true;
            _counts = new int[_sources.Length];
            for (int index = 0; index < _sources.Length; index++)
            {
                IDataView source = _sources[index];
                long? rowCount = null;
                if (!source.CanShuffle ||
                    (rowCount = source.GetRowCount()) == null ||
                    rowCount < 0 || rowCount > int.MaxValue)
                {
                    _canShuffle = false;
                    _counts = null;
                    break;
                }
                _counts[index] = (int)rowCount;
            }
        }
コード例 #3
0
        static void Main(string[] args)
        {
            Console.WriteLine("Training time series analysis");

            // Step 1. Create an ML context shared by all ML.NET operations.
            var ctx = new MLContext();

            // Integrated-security connection to the local kaggle_wallmart database.
            // (A previously dead OLE DB-style connection string was removed.)
            string connectionString = "Server=localhost;Database=kaggle_wallmart;Integrated Security=True";

            // Daily total sales for one item, joined to the calendar and ordered by date.
            string Query = @"
                SELECT 
                      CAST(X.[Value] AS REAL) AS [TotalSales],
                      CAST(Y.date AS DATE) AS [SalesDate],
	                  CAST(year(Y.date) AS REAL) As [Year]
                  FROM [dbo].[RAW_Train_Eval] AS X
                  INNER JOIN [dbo].RAW_Calendar AS Y ON Y.d=X.dCode
                  where Id='HOBBIES_1_278_CA_1_evaluation' 
                  order by 2

            ";

            Console.WriteLine("Connecting to the database...");

            Console.WriteLine("Loading data...");
            DatabaseSource dbSource = new DatabaseSource(SqlClientFactory.Instance, connectionString, Query);
            DatabaseLoader loader = ctx.Data.CreateDatabaseLoader<ModelInput>();
            IDataView dataView = loader.Load(dbSource);

            Console.WriteLine($"Loaded {dataView.GetRowCount()} rows...");

            // Train on years before 2016; validate on 2016 and later.
            IDataView trainingData = ctx.Data.FilterRowsByColumn(dataView, "Year", upperBound: 2016);
            IDataView ValidationData = ctx.Data.FilterRowsByColumn(dataView, "Year", lowerBound: 2016);

            // SSA forecaster: 7-day window (weekly seasonality), 30-day horizon, 95% bounds.
            var forecastingPipeline = ctx.Forecasting.ForecastBySsa(
                outputColumnName: "ForecastedSales",
                inputColumnName: "TotalSales",
                windowSize: 7,
                seriesLength: 60,
                trainSize: 300,
                horizon: 30,
                confidenceLevel: 0.95f,
                confidenceLowerBoundColumn: "LowerBoundSales",
                confidenceUpperBoundColumn: "UpperBoundSales");
            SsaForecastingTransformer forecaster = forecastingPipeline.Fit(trainingData);

            Evaluate(ValidationData, forecaster, ctx);
            var forecastEngine = forecaster.CreateTimeSeriesEngine<ModelInput, ModelOutput>(ctx);

            // NOTE(review): the first checkpoint looks like a leftover scratch copy of the
            // model — confirm it is still needed before removing.
            forecastEngine.CheckPoint(ctx, "c:\\temp\\Model.zip");
            forecastEngine.CheckPoint(ctx, "C:\\Temp\\WallMartModels\\evaluation\\Model_HOBBIES_1_278_CA_1_evaluation.zip");

            Forecast(ValidationData, 7, forecastEngine, ctx);
            Console.WriteLine("Training time series analysis completed");
        }
コード例 #4
0
ファイル: Program.cs プロジェクト: nimaeru05/MLNETSeries
        static void Main(string[] args)
        {
            // Data prep: read every tab-separated file in the Data folder and collect
            // rows that contain a parsable timestamp and value.
            var TrainData  = new List<DiabetesRecord>();
            var DataFolder = GetAbsolutePath("../../../Data/");
            var Files      = Directory.GetFiles(DataFolder, "*");

            foreach (var filePath in Files)
            {
                foreach (var line in File.ReadAllLines(filePath))
                {
                    var cols = line.Split('\t');
                    // Guard against malformed lines that lack the expected
                    // date / time / code / value columns (fixes a potential
                    // IndexOutOfRangeException on cols[3]).
                    if (cols.Length < 4)
                        continue;

                    var DateStr = $"{cols[0]} {cols[1]}";
                    float dataValue = 0;
                    // Leaves 0 when the value is unparsable.
                    // NOTE(review): parses with the current culture — confirm intended.
                    float.TryParse(cols[3], out dataValue);

                    // Make sure this line contains plausible time-series data.
                    if (DateStr.Length >= 15)
                    {
                        // Parse the combined date/time string; a MinValue year means failure.
                        DateTime.TryParse(DateStr, out DateTime dt);
                        if (dt.Year > DateTime.MinValue.Year)
                        {
                            TrainData.Add(new DiabetesRecord()
                            {
                                TimeStamp = dt, Code = cols[2], Data = dataValue
                            });
                        }
                    }
                }
            }

            // Only observe data with codes 48 and 57-64.
            HashSet<string> CodeIn = new HashSet<string> { "48", "57", "58", "59", "60", "61", "62", "63", "64" };
            TrainData = TrainData.Where(x => CodeIn.Contains(x.Code)).OrderBy(a => a.TimeStamp).ToList();
            Console.WriteLine($"Total data : {TrainData.Count}");

            // Create MLContext
            mlContext = new MLContext();

            // Load the in-memory records as an IDataView.
            IDataView data = mlContext.Data.LoadFromEnumerable<DiabetesRecord>(TrainData);

            // Use the actual number of records when known; fall back to 36 otherwise.
            var RowCount = data.GetRowCount();
            int size     = RowCount.HasValue ? Convert.ToInt32(RowCount.Value) : 36;

            // STEP 1: create estimator to detect spikes.
            DetectSpike(size, data);
            // To detect persistent change in the pattern.
            DetectChangepoint(10, data); // set 10 datapoints per sliding window

            Console.WriteLine("=============== End of process, hit any key to finish ===============");

            Console.ReadLine();
        }
コード例 #5
0
        public void TestDataFrameFromIDataView_SelectColumns()
        {
            // Build a 10-row frame, view it as an IDataView, and project two columns back.
            DataFrame source = DataFrameTests.MakeDataFrameWithAllColumnTypes(10, withNulls: false);
            IDataView asDataView = source;
            DataFrame projected = asDataView.ToDataFrame("Int", "Double");

            // Row count and the two selected columns must survive the round trip intact.
            Assert.Equal(asDataView.GetRowCount(), projected.Rows.Count);
            Assert.Equal(2, projected.Columns.Count);
            Assert.True(source.Columns["Int"].ElementwiseEquals(projected.Columns["Int"]).All());
            Assert.True(source.Columns["Double"].ElementwiseEquals(projected.Columns["Double"]).All());
        }
コード例 #6
0
        /// <summary>
        /// The InferPipelines methods are just public portals to the internal function that handle different
        /// types of data being passed in: training IDataView, path to training file, or train and test files.
        /// </summary>
        public static AutoMlMlState InferPipelines(IHostEnvironment env, PipelineOptimizerBase autoMlEngine,
                                                   IDataView trainData, IDataView testData, int numTransformLevels, int batchSize, SupportedMetric metric,
                                                   out PipelinePattern bestPipeline, ITerminator terminator, MacroUtils.TrainerKinds trainerKind)
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckValue(trainData, nameof(trainData));
            env.CheckValue(testData, nameof(testData));

            // Fall back to 1000 rows when the training view cannot report a count.
            int rowCount = (int)(trainData.GetRowCount(false) ?? 1000);

            var state = new AutoMlMlState(env, metric, autoMlEngine, terminator, trainerKind, trainData, testData);
            bestPipeline = state.InferPipelines(numTransformLevels, batchSize, rowCount);
            return state;
        }
コード例 #7
0
 // Returns the row count of the underlying pipe, initializing it on first use.
 public virtual long?GetRowCount()
 {
     _host.CheckValue(_sourceCtx, "_sourceCtx");
     if (!IsInitialized())
     {
         // Double-checked locking: re-test inside the lock so that only one
         // thread performs the delayed initialization.
         lock (_lock)
             if (!IsInitialized())
             {
                 DelayedInitialisationLockFree();
             }
     }
     // After initialization the pipe must exist; delegate the count to it.
     _host.CheckValue(_sourcePipe, "_sourcePipe");
     return(_sourcePipe.GetRowCount());
 }
コード例 #8
0
        public void TestDataFrameFromIDataView()
        {
            DataFrame source = DataFrameTests.MakeDataFrameWithAllColumnTypes(10, withNulls: false);

            // Chars come back as uint16 through IDataView, which would pit a
            // CharDataFrameColumn against a UInt16DataFrameColumn and fail the asserts.
            source.Columns.Remove("Char");

            IDataView asDataView = source;
            DataFrame roundTripped = asDataView.ToDataFrame();

            // Shape must survive the round trip...
            Assert.Equal(asDataView.GetRowCount(), roundTripped.Rows.Count);
            Assert.Equal(asDataView.Schema.Count, roundTripped.Columns.Count);

            // ...and so must every element of every column.
            for (int col = 0; col < source.Columns.Count; col++)
            {
                Assert.True(source.Columns[col].ElementwiseEquals(roundTripped.Columns[col]).All());
            }
        }
コード例 #9
0
                protected override void VerifyView(IDataView view)
                {
                    Host.AssertValue(view);

                    // This came from a binary IDV, so it must have an actual row count.
                    var maybeCount = view.GetRowCount();
                    Host.Assert(maybeCount.HasValue);
                    long rows = maybeCount.Value;

                    // Either the view holds schema information only (zero rows), or it is
                    // the double-stored hybrid dataview with data kept both row-wise and
                    // column-wise, in which case the counts must agree.
                    Host.CheckDecode(rows == 0 || _parent._header.RowCount == rows);

                    // The column count must match the header exactly.
                    Host.CheckDecode(view.Schema.ColumnCount == _parent._header.ColumnCount);
                }
コード例 #10
0
        /// <summary>
        /// Handles specific cases DataViewUtils does not handle.
        /// </summary>
        /// <param name="view">IDataView</param>
        /// <param name="predicate">column selector (null for none active)</param>
        /// <returns>number of rows</returns>
        public static long ComputeRowCount(IDataView view, Func <int, bool> predicate = null)
        {
            // Prefer the count the view can report without scanning.
            var known = view.GetRowCount();
            if (known.HasValue)
                return known.Value;

            // Otherwise count rows by cursoring through with no active columns
            // (unless a selector was supplied).
            long total = 0;
            using (var cursor = view.GetRowCursor(predicate ?? (i => false)))
            {
                while (cursor.MoveNext())
                    total++;
            }
            return total;
        }
コード例 #11
0
        /// <summary>
        /// Handles specific cases DataViewUtils does not handle.
        /// </summary>
        /// <param name="view">IDataView</param>
        /// <param name="columnsNeeded">columns the cursor should activate (null for none)</param>
        /// <returns>number of rows</returns>
        public static long ComputeRowCount(IDataView view, IEnumerable <DataViewSchema.Column> columnsNeeded = null)
        {
            // Prefer the count the view can report without scanning.
            var known = view.GetRowCount();
            if (known.HasValue)
                return known.Value;

            // Fall back to walking every row with a cursor.
            long total = 0;
            using (var cursor = view.GetRowCursor(columnsNeeded))
            {
                while (cursor.MoveNext())
                    total++;
            }
            return total;
        }
コード例 #12
0
        private FastForestRegressionModelParameters FitModel(IEnumerable <IRunResult> previousRuns)
        {
            // Flatten the run history into parallel label/feature arrays.
            Single[]   labels      = new Single[previousRuns.Count()];
            Single[][] featureRows = new Single[previousRuns.Count()][];

            int row = 0;
            foreach (RunResult run in previousRuns)
            {
                featureRows[row] = SweeperProbabilityUtils.ParameterSetAsFloatArray(_host, _sweepParameters, run.ParameterSet, true);
                labels[row] = (float)run.MetricValue;
                row++;
            }

            // Build an in-memory data view with one row per evaluated run.
            var builder = new ArrayDataViewBuilder(_host);
            builder.AddColumn(DefaultColumnNames.Label, NumberDataViewType.Single, labels);
            builder.AddColumn(DefaultColumnNames.Features, NumberDataViewType.Single, featureRows);

            IDataView view = builder.GetDataView();
            _host.Assert(view.GetRowCount() == labels.Length, "This data view will have as many rows as there have been evaluations");

            using (IChannel ch = _host.Start("Single training"))
            {
                // Configure and train a random forest regressor on the run history,
                // then return its model.
                var trainer = new FastForestRegressionTrainer(_host,
                    new FastForestRegressionTrainer.Options
                    {
                        FeatureFraction            = _args.SplitRatio,
                        NumberOfTrees              = _args.NumOfTrees,
                        MinimumExampleCountPerLeaf = _args.NMinForSplit,
                        LabelColumnName            = DefaultColumnNames.Label,
                        FeatureColumnName          = DefaultColumnNames.Features,
                    });
                return trainer.Fit(view).Model;
            }
        }
コード例 #13
0
        private FastForestRegressionPredictor FitModel(IEnumerable <IRunResult> previousRuns)
        {
            // Flatten the run history into parallel label/feature arrays.
            Single[]   labels      = new Single[previousRuns.Count()];
            Single[][] featureRows = new Single[previousRuns.Count()][];

            int row = 0;
            foreach (RunResult run in previousRuns)
            {
                featureRows[row] = SweeperProbabilityUtils.ParameterSetAsFloatArray(_host, _sweepParameters, run.ParameterSet, true);
                labels[row] = (Float)run.MetricValue;
                row++;
            }

            // One data view row per evaluated configuration.
            var builder = new ArrayDataViewBuilder(_host);
            builder.AddColumn("Label", NumberType.Float, labels);
            builder.AddColumn("Features", NumberType.Float, featureRows);

            IDataView view = builder.GetDataView();
            _host.Assert(view.GetRowCount() == labels.Length, "This data view will have as many rows as there have been evaluations");

            RoleMappedData data = TrainUtils.CreateExamples(view, "Label", "Features");

            using (IChannel ch = _host.Start("Single training"))
            {
                // Set relevant random forest arguments, then train.
                var args = new FastForestRegression.Arguments
                {
                    FeatureFraction     = _args.SplitRatio,
                    NumTrees            = _args.NumOfTrees,
                    MinDocumentsInLeafs = _args.NMinForSplit
                };

                var trainer = new FastForestRegression(_host, args);
                trainer.Train(data);
                FastForestRegressionPredictor predictor = trainer.CreatePredictor();

                // Return random forest predictor.
                ch.Done();
                return predictor;
            }
        }
コード例 #14
0
        // Sometimes GetRowCount does not really return the number of rows in the
        // associated IDataView; the reliable fallback is to truly iterate through
        // all rows via a RowCursor.
        private static long GetRowCount(IDataView inputData, params VectorWhiteningEstimator.ColumnOptions[] columns)
        {
            long? knownCount = inputData.GetRowCount();
            if (knownCount.HasValue)
                return knownCount.Value;

            // Count at most the largest number of rows any column will consume.
            long counted = 0;
            int rowLimit = columns.Max(c => c.MaximumNumberOfRows);
            using (var cursor = inputData.GetRowCursor())
            {
                while (counted < rowLimit && cursor.MoveNext())
                    counted++;
            }
            return counted;
        }
コード例 #15
0
        // Sometimes GetRowCount does not really return the number of rows in the
        // associated IDataView; the reliable fallback is to truly iterate through
        // all rows via a RowCursor.
        private static long GetRowCount(IDataView inputData, params ColumnInfo[] columns)
        {
            long? knownCount = inputData.GetRowCount();
            if (knownCount.HasValue)
                return knownCount.Value;

            // Count at most the largest number of rows any column will consume,
            // cursoring with no active columns.
            long counted = 0;
            int rowLimit = columns.Max(c => c.MaxRow);
            using (var cursor = inputData.GetRowCursor(col => false))
            {
                while (counted < rowLimit && cursor.MoveNext())
                    counted++;
            }
            return counted;
        }
コード例 #16
0
ファイル: SmacSweeper.cs プロジェクト: srsaggam/AutoMLDotNet
        private FastForestRegressionModelParameters FitModel(IEnumerable <IRunResult> previousRuns)
        {
            // Convert the run history into parallel label/feature arrays.
            Single[]   labels      = new Single[previousRuns.Count()];
            Single[][] featureRows = new Single[previousRuns.Count()][];

            int row = 0;
            foreach (RunResult run in previousRuns)
            {
                featureRows[row] = SweeperProbabilityUtils.ParameterSetAsFloatArray(_sweepParameters, run.ParameterSet, true);
                labels[row] = (Float)run.MetricValue;
                row++;
            }

            // Build the training view: one row per evaluation.
            var builder = new ArrayDataViewBuilder(_context);
            builder.AddColumn(DefaultColumnNames.Label, NumberType.Float, labels);
            builder.AddColumn(DefaultColumnNames.Features, NumberType.Float, featureRows);

            IDataView data = builder.GetDataView();
            AutoMlUtils.Assert(data.GetRowCount() == labels.Length, "This data view will have as many rows as there have been evaluations");

            // Set relevant random forest arguments and train.
            var trainer = new FastForestRegression(_context, DefaultColumnNames.Label, DefaultColumnNames.Features, advancedSettings: s =>
            {
                s.FeatureFraction     = _args.SplitRatio;
                s.NumTrees            = _args.NumOfTrees;
                s.MinDocumentsInLeafs = _args.NMinForSplit;
            });

            // Return random forest predictor.
            return trainer.Train(data).Model;
        }
コード例 #17
0
        /// <summary>
        /// This builds the <see cref="TermMap"/> instances per column. A map comes from an
        /// explicit term list, from a terms data file, or is trained by scanning the
        /// training data, in that order of precedence.
        /// </summary>
        /// <param name="env">The host environment.</param>
        /// <param name="ch">Channel used for warnings and assertions.</param>
        /// <param name="infos">Per-column source information.</param>
        /// <param name="args">Transform-level arguments.</param>
        /// <param name="column">Per-column argument overrides.</param>
        /// <param name="trainingData">Data scanned for columns that must be auto-trained.</param>
        /// <returns>One trained <see cref="TermMap"/> per entry of <paramref name="infos"/>.</returns>
        private static TermMap[] Train(IHostEnvironment env, IChannel ch, ColInfo[] infos,
                                       ArgumentsBase args, ColumnBase[] column, IDataView trainingData)
        {
            Contracts.AssertValue(env);
            env.AssertValue(ch);
            ch.AssertValue(infos);
            ch.AssertValue(args);
            ch.AssertValue(column);
            ch.AssertValue(trainingData);

            // An explicit term list takes precedence over any data-file arguments.
            if ((args.Term != null || !string.IsNullOrEmpty(args.Terms)) &&
                (!string.IsNullOrWhiteSpace(args.DataFile) || args.Loader.IsGood() ||
                 !string.IsNullOrWhiteSpace(args.TermsColumn)))
            {
                ch.Warning("Explicit term list specified. Data file arguments will be ignored");
            }

            if (!Enum.IsDefined(typeof(SortOrder), args.Sort))
            {
                throw ch.ExceptUserArg(nameof(args.Sort), "Undefined sorting criteria '{0}' detected", args.Sort);
            }

            // Caches the map loaded from the data file so the file is read at most once.
            TermMap termsFromFile = null;
            var     termMap       = new TermMap[infos.Length];

            // lims[iinfo] holds the max term count for columns that need auto-training;
            // toTrain collects the distinct source column indices that must be scanned.
            int[]         lims         = new int[infos.Length];
            int           trainsNeeded = 0;
            HashSet <int> toTrain      = null;

            for (int iinfo = 0; iinfo < infos.Length; iinfo++)
            {
                // First check whether we have a terms argument, and handle it appropriately.
                var terms      = new DvText(column[iinfo].Terms);
                var termsArray = column[iinfo].Term;
                if (!terms.HasChars && termsArray == null)
                {
                    // No per-column terms: fall back to the transform-level arguments.
                    terms      = new DvText(args.Terms);
                    termsArray = args.Term;
                }

                terms = terms.Trim();
                if (terms.HasChars || (termsArray != null && termsArray.Length > 0))
                {
                    // We have terms! Pass it in.
                    var sortOrder = column[iinfo].Sort ?? args.Sort;
                    if (!Enum.IsDefined(typeof(SortOrder), sortOrder))
                    {
                        throw ch.ExceptUserArg(nameof(args.Sort), "Undefined sorting criteria '{0}' detected for column '{1}'", sortOrder, infos[iinfo].Name);
                    }

                    var bldr = Builder.Create(infos[iinfo].TypeSrc, sortOrder);
                    if (terms.HasChars)
                    {
                        bldr.ParseAddTermArg(ref terms, ch);
                    }
                    else
                    {
                        bldr.ParseAddTermArg(termsArray, ch);
                    }
                    termMap[iinfo] = bldr.Finish();
                }
                else if (!string.IsNullOrWhiteSpace(args.DataFile))
                {
                    // First column using this file.
                    if (termsFromFile == null)
                    {
                        var bldr = Builder.Create(infos[iinfo].TypeSrc, column[iinfo].Sort ?? args.Sort);
                        termsFromFile = CreateFileTermMap(env, ch, args, bldr);
                    }
                    if (!termsFromFile.ItemType.Equals(infos[iinfo].TypeSrc.ItemType))
                    {
                        // We have no current plans to support re-interpretation based on different column
                        // type, not only because it's unclear what realistic customer use-cases for such
                        // a complicated feature would be, and also because it's difficult to see how we
                        // can logically reconcile "reinterpretation" for different types with the resulting
                        // data view having an actual type.
                        throw ch.ExceptUserArg(nameof(args.DataFile), "Data file terms loaded as type '{0}' but mismatches column '{1}' item type '{2}'",
                                               termsFromFile.ItemType, infos[iinfo].Name, infos[iinfo].TypeSrc.ItemType);
                    }
                    termMap[iinfo] = termsFromFile;
                }
                else
                {
                    // Auto train this column. Leave the term map null for now, but set the lim appropriately.
                    lims[iinfo] = column[iinfo].MaxNumTerms ?? args.MaxNumTerms;
                    ch.CheckUserArg(lims[iinfo] > 0, nameof(Column.MaxNumTerms), "Must be positive");
                    Utils.Add(ref toTrain, infos[iinfo].Source);
                    ++trainsNeeded;
                }
            }

            ch.Assert((Utils.Size(toTrain) == 0) == (trainsNeeded == 0));
            ch.Assert(Utils.Size(toTrain) <= trainsNeeded);
            if (trainsNeeded > 0)
            {
                Trainer[] trainer     = new Trainer[trainsNeeded];
                int[]     trainerInfo = new int[trainsNeeded];
                // Open the cursor, then instantiate the trainers.
                int itrainer;
                using (var cursor = trainingData.GetRowCursor(toTrain.Contains))
                    using (var pch = env.StartProgressChannel("Building term dictionary"))
                    {
                        long   rowCur   = 0;
                        // Row count is used for progress reporting only; NaN when unknown.
                        double rowCount = trainingData.GetRowCount(true) ?? double.NaN;
                        var    header   = new ProgressHeader(new[] { "Total Terms" }, new[] { "examples" });

                        itrainer = 0;
                        for (int iinfo = 0; iinfo < infos.Length; ++iinfo)
                        {
                            // Columns that already have a map (explicit or file terms) are skipped.
                            if (termMap[iinfo] != null)
                            {
                                continue;
                            }
                            var bldr = Builder.Create(infos[iinfo].TypeSrc, column[iinfo].Sort ?? args.Sort);
                            trainerInfo[itrainer] = iinfo;
                            trainer[itrainer++]   = Trainer.Create(cursor, infos[iinfo].Source, false, lims[iinfo], bldr);
                        }
                        ch.Assert(itrainer == trainer.Length);
                        pch.SetHeader(header,
                                      e =>
                        {
                            e.SetProgress(0, rowCur, rowCount);
                            // Purely feedback for the user. That the other thread might be
                            // working in the background is not a problem.
                            e.SetMetric(0, trainer.Sum(t => t.Count));
                        });

                        // The [0,tmin) trainers are finished.
                        int tmin = 0;
                        // We might exit early if all trainers reach their maximum.
                        while (tmin < trainer.Length && cursor.MoveNext())
                        {
                            rowCur++;
                            for (int t = tmin; t < trainer.Length; ++t)
                            {
                                // ProcessRow returning false means this trainer is full;
                                // swap it into the finished [0,tmin) prefix.
                                if (!trainer[t].ProcessRow())
                                {
                                    Utils.Swap(ref trainerInfo[t], ref trainerInfo[tmin]);
                                    Utils.Swap(ref trainer[t], ref trainer[tmin++]);
                                }
                            }
                        }

                        pch.Checkpoint(trainer.Sum(t => t.Count), rowCur);
                    }
                for (itrainer = 0; itrainer < trainer.Length; ++itrainer)
                {
                    int iinfo = trainerInfo[itrainer];
                    ch.Assert(termMap[iinfo] == null);
                    if (trainer[itrainer].Count == 0)
                    {
                        ch.Warning("Term map for output column '{0}' contains no entries.", infos[iinfo].Name);
                    }
                    termMap[iinfo] = trainer[itrainer].Finish();
                    // Allow the intermediate structures in the trainer and builder to be released as we iterate
                    // over the columns, as the Finish operation can potentially result in the allocation of
                    // additional structures.
                    trainer[itrainer] = null;
                }
                ch.Assert(termMap.All(tm => tm != null));
                ch.Assert(termMap.Zip(infos, (tm, info) => tm.ItemType.Equals(info.TypeSrc.ItemType)).All(x => x));
            }

            return(termMap);
        }
コード例 #18
0
 // Forward the row-count query to the wrapped view.
 public long?GetRowCount(bool lazy = true) => View.GetRowCount(lazy);
コード例 #19
0
 // Forward the row-count query to the wrapped source view.
 public long?GetRowCount()
 {
     return _source.GetRowCount();
 }
コード例 #20
0
 // Forward the row-count query to the wrapped view.
 public long?GetRowCount() => View.GetRowCount();
コード例 #21
0
 // The merged view carries the authoritative row count.
 public long?GetRowCount()
 {
     _host.AssertValue(_input, "_input");
     return _mergedView.GetRowCount();
 }
コード例 #22
0
        /// <summary>
        /// Returns the feature selection scores for each slot of each column.
        /// </summary>
        /// <param name="env">The host environment.</param>
        /// <param name="input">The input dataview.</param>
        /// <param name="columns">The columns for which to compute the feature selection scores.</param>
        /// <param name="colSizes">Outputs an array containing the vector sizes of the input columns</param>
        /// <returns>A list of scores, one array per requested column.</returns>
        public static long[][] Train(IHostEnvironment env, IDataView input, string[] columns, out int[] colSizes)
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckValue(input, nameof(input));
            env.CheckParam(Utils.Size(columns) > 0, nameof(columns));

            // Resolve each requested column: record its index, type, and slot count,
            // and mark it active for the cursor. Variable-size vectors are rejected.
            var schema      = input.Schema;
            var size        = columns.Length;
            var activeInput = new bool[schema.ColumnCount];
            var colSrcs     = new int[size];
            var colTypes    = new ColumnType[size];

            colSizes = new int[size];
            for (int i = 0; i < size; i++)
            {
                int colSrc;
                var colName = columns[i];
                if (!schema.TryGetColumnIndex(colName, out colSrc))
                {
                    throw env.ExceptUserArg(nameof(CountFeatureSelectionTransform.Arguments.Column), "Source column '{0}' not found", colName);
                }

                var colType = schema.GetColumnType(colSrc);
                if (colType.IsVector && !colType.IsKnownSizeVector)
                {
                    throw env.ExceptUserArg(nameof(CountFeatureSelectionTransform.Arguments.Column), "Variable length column '{0}' is not allowed", colName);
                }

                activeInput[colSrc] = true;
                colSrcs[i]          = colSrc;
                colTypes[i]         = colType;
                colSizes[i]         = colType.ValueCount;
            }

            // Row count is only used for progress reporting; NaN when unknown.
            var    aggregators = new CountAggregator[size];
            long   rowCur      = 0;
            double rowCount    = input.GetRowCount(true) ?? double.NaN;

            using (var pch = env.StartProgressChannel("Aggregating counts"))
                using (var cursor = input.GetRowCursor(col => activeInput[col]))
                {
                    var header = new ProgressHeader(new[] { "rows" });
                    pch.SetHeader(header, e => { e.SetProgress(0, rowCur, rowCount); });
                    // One aggregator per column: vector or scalar as appropriate.
                    for (int i = 0; i < size; i++)
                    {
                        if (colTypes[i].IsVector)
                        {
                            aggregators[i] = GetVecAggregator(cursor, colTypes[i], colSrcs[i]);
                        }
                        else
                        {
                            aggregators[i] = GetOneAggregator(cursor, colTypes[i], colSrcs[i]);
                        }
                    }

                    // Single pass over the data: every aggregator sees every row.
                    while (cursor.MoveNext())
                    {
                        for (int i = 0; i < size; i++)
                        {
                            aggregators[i].ProcessValue();
                        }
                        rowCur++;
                    }
                    pch.Checkpoint(rowCur);
                }
            return(aggregators.Select(a => a.Count).ToArray());
        }
コード例 #23
0
        /// <summary>
        /// Scans the training data once to build, per source column, the dictionary
        /// (<c>SequencePool</c>) of observed n-grams, and — for columns that need IDF
        /// weighting — the per-n-gram document frequencies, converted to inverse
        /// document frequencies before returning.
        /// </summary>
        /// <param name="args">Transform arguments; per-column settings override the global ones.</param>
        /// <param name="trainingData">The data to scan; only the transform's source columns are read.</param>
        /// <param name="invDocFreqs">For each column, log(totalDocs / docFreq) per n-gram id;
        /// entries stay null for columns that do not require IDF.</param>
        private SequencePool[] Train(Arguments args, IDataView trainingData, out double[][] invDocFreqs)
        {
            // Contains the maximum number of grams to store in the dictionary, for each level of ngrams,
            // from 1 (in position 0) up to ngramLength (in position ngramLength-1)
            var lims = new int[Infos.Length][];

            for (int iinfo = 0; iinfo < Infos.Length; iinfo++)
            {
                var all         = args.Column[iinfo].AllLengths ?? args.AllLengths;
                var ngramLength = _exes[iinfo].NgramLength;
                var maxNumTerms = Utils.Size(args.Column[iinfo].MaxNumTerms) > 0 ? args.Column[iinfo].MaxNumTerms : args.MaxNumTerms;
                if (!all)
                {
                    // Only full-length n-grams are kept: a single budget is applied at
                    // position ngramLength-1, and shorter lengths stay at zero.
                    Host.CheckUserArg(Utils.Size(maxNumTerms) == 0 ||
                                      Utils.Size(maxNumTerms) == 1 && maxNumTerms[0] > 0, nameof(args.MaxNumTerms));
                    lims[iinfo] = new int[ngramLength];
                    lims[iinfo][ngramLength - 1] = Utils.Size(maxNumTerms) == 0 ? Arguments.DefaultMaxTerms : maxNumTerms[0];
                }
                else
                {
                    // All n-gram lengths are kept: one budget per length, extending the
                    // last user-specified value (or the default) to unspecified lengths.
                    Host.CheckUserArg(Utils.Size(maxNumTerms) <= ngramLength, nameof(args.MaxNumTerms));
                    Host.CheckUserArg(Utils.Size(maxNumTerms) == 0 || maxNumTerms.All(i => i >= 0) && maxNumTerms[maxNumTerms.Length - 1] > 0, nameof(args.MaxNumTerms));
                    var extend = Utils.Size(maxNumTerms) == 0 ? Arguments.DefaultMaxTerms : maxNumTerms[maxNumTerms.Length - 1];
                    lims[iinfo] = Utils.BuildArray(ngramLength,
                                                   i => i < Utils.Size(maxNumTerms) ? maxNumTerms[i] : extend);
                }
            }

            var helpers = new NgramBufferBuilder[Infos.Length];
            var getters = new ValueGetter <VBuffer <uint> > [Infos.Length];
            var src     = new VBuffer <uint> [Infos.Length];

            // Keep track of how many grams are in the pool for each value of n. Position
            // i in _counts counts how many (i+1)-grams are in the pool for column iinfo.
            var counts    = new int[Infos.Length][];
            var ngramMaps = new SequencePool[Infos.Length];

            bool[] activeInput = new bool[trainingData.Schema.ColumnCount];
            foreach (var info in Infos)
            {
                activeInput[info.Source] = true;
            }
            using (var cursor = trainingData.GetRowCursor(col => activeInput[col]))
                using (var pch = Host.StartProgressChannel("Building n-gram dictionary"))
                {
                    for (int iinfo = 0; iinfo < Infos.Length; iinfo++)
                    {
                        Host.Assert(Infos[iinfo].TypeSrc.IsVector && Infos[iinfo].TypeSrc.ItemType.IsKey);
                        var ngramLength = _exes[iinfo].NgramLength;
                        var skipLength  = _exes[iinfo].SkipLength;

                        getters[iinfo]   = RowCursorUtils.GetVecGetterAs <uint>(NumberType.U4, cursor, Infos[iinfo].Source);
                        src[iinfo]       = default(VBuffer <uint>);
                        counts[iinfo]    = new int[ngramLength];
                        ngramMaps[iinfo] = new SequencePool();

                        // Note: GetNgramIdFinderAdd will control how many ngrams of a specific length will
                        // be added (using lims[iinfo]), therefore we set slotLim to the maximum
                        helpers[iinfo] = new NgramBufferBuilder(ngramLength, skipLength, Utils.ArrayMaxSize,
                                                                GetNgramIdFinderAdd(counts[iinfo], lims[iinfo], ngramMaps[iinfo], _exes[iinfo].RequireIdf(), Host));
                    }

                    // infoFull[iinfo] marks a column whose pools no longer accept new grams;
                    // cInfoFull counts such columns so the scan can stop once all are full.
                    // NOTE(review): neither is set to true in the visible code — presumably
                    // updated elsewhere or intentionally always scanning all rows; confirm.
                    int    cInfoFull = 0;
                    bool[] infoFull  = new bool[Infos.Length];

                    invDocFreqs = new double[Infos.Length][];

                    long   totalDocs = 0;
                    Double rowCount  = trainingData.GetRowCount() ?? Double.NaN;
                    var    buffers   = new VBuffer <float> [Infos.Length];
                    pch.SetHeader(new ProgressHeader(new[] { "Total n-grams" }, new[] { "documents" }),
                                  e => e.SetProgress(0, totalDocs, rowCount));
                    // Single pass: each row ("document") contributes its n-grams to every column's pool.
                    while (cInfoFull < Infos.Length && cursor.MoveNext())
                    {
                        totalDocs++;
                        for (int iinfo = 0; iinfo < Infos.Length; iinfo++)
                        {
                            getters[iinfo](ref src[iinfo]);
                            // A key count of zero means unbounded; treat any uint value as valid.
                            var keyCount = (uint)Infos[iinfo].TypeSrc.ItemType.KeyCount;
                            if (keyCount == 0)
                            {
                                keyCount = uint.MaxValue;
                            }
                            if (!infoFull[iinfo])
                            {
                                if (_exes[iinfo].RequireIdf())
                                {
                                    helpers[iinfo].Reset();
                                }

                                helpers[iinfo].AddNgrams(in src[iinfo], 0, keyCount);
                                if (_exes[iinfo].RequireIdf())
                                {
                                    // Accumulate document frequency: each n-gram id present in
                                    // this document bumps its count by exactly one.
                                    int totalNgrams = counts[iinfo].Sum();
                                    Utils.EnsureSize(ref invDocFreqs[iinfo], totalNgrams);
                                    helpers[iinfo].GetResult(ref buffers[iinfo]);
                                    foreach (var pair in buffers[iinfo].Items())
                                    {
                                        if (pair.Value >= 1)
                                        {
                                            invDocFreqs[iinfo][pair.Key] += 1;
                                        }
                                    }
                                }
                            }
                            AssertValid(counts[iinfo], lims[iinfo], ngramMaps[iinfo]);
                        }
                    }

                    pch.Checkpoint(counts.Sum(c => c.Sum()), totalDocs);
                    // Convert raw document frequencies into inverse document frequencies:
                    // log(totalDocs / docFreq). Zero entries (never-seen ids) are left at zero.
                    for (int iinfo = 0; iinfo < Infos.Length; iinfo++)
                    {
                        for (int i = 0; i < Utils.Size(invDocFreqs[iinfo]); i++)
                        {
                            if (invDocFreqs[iinfo][i] != 0)
                            {
                                invDocFreqs[iinfo][i] = Math.Log(totalDocs / invDocFreqs[iinfo][i]);
                            }
                        }
                    }

                    // Record which n-gram lengths actually produced at least one entry.
                    for (int iinfo = 0; iinfo < Infos.Length; iinfo++)
                    {
                        AssertValid(counts[iinfo], lims[iinfo], ngramMaps[iinfo]);

                        int ngramLength = _exes[iinfo].NgramLength;
                        for (int i = 0; i < ngramLength; i++)
                        {
                            _exes[iinfo].NonEmptyLevels[i] = counts[iinfo][i] > 0;
                        }
                    }

                    return(ngramMaps);
                }
        }
コード例 #24
0
        /// <summary>
        /// Writes the selected columns of <paramref name="data"/> as delimited text,
        /// optionally preceded by a schema comment and a header line.
        /// </summary>
        private void WriteDataCore(IChannel ch, TextWriter writer, IDataView data,
                                   out string argsLoader, out long count, out int min, out int max, params int[] cols)
        {
            _host.AssertValue(ch);
            ch.AssertValue(writer);
            ch.AssertValue(data);
            ch.AssertNonEmpty(cols);

            // Mark the requested columns active; each must exist and have a writable item type.
            var active = new bool[data.Schema.ColumnCount];
            foreach (var col in cols)
            {
                ch.Check(0 <= col && col < active.Length);
                ch.Check(data.Schema.GetColumnType(col).ItemType.RawKind != 0);
                active[col] = true;
            }

            // A header is emitted only when requested and at least one column can supply
            // names: any scalar column, or a known-size vector with matching text slot names.
            bool hasHeader = false;
            if (_outputHeader)
            {
                foreach (var col in cols)
                {
                    var type = data.Schema.GetColumnType(col);
                    if (!type.IsVector)
                    {
                        hasHeader = true;
                        break;
                    }
                    if (!type.IsKnownSizeVector)
                        continue;
                    var typeNames = data.Schema.GetMetadataTypeOrNull(MetadataUtils.Kinds.SlotNames, col);
                    if (typeNames != null && typeNames.VectorSize == type.VectorSize && typeNames.ItemType.IsText)
                    {
                        hasHeader = true;
                        break;
                    }
                }
            }

            using (var cursor = data.GetRowCursor(i => active[i]))
            {
                var pipes = new ValueWriter[cols.Length];
                for (int i = 0; i < cols.Length; i++)
                    pipes[i] = ValueWriter.Create(cursor, cols[i], _sepChar);

                // REVIEW: This should be outside the cursor creation.
                argsLoader = CreateLoaderArguments(data.Schema, pipes, hasHeader, ch);
                if (_outputSchema)
                    WriteSchemaAsComment(writer, argsLoader);

                double rowCount = data.GetRowCount(true) ?? double.NaN;
                using (var pch = _silent ? null : _host.StartProgressChannel("TextSaver: saving data"))
                {
                    long rows = 0;
                    var state = new State(this, writer, pipes, hasHeader);
                    pch?.SetHeader(new ProgressHeader(new[] { "rows" }), e => e.SetProgress(0, rows, rowCount));
                    state.Run(cursor, ref rows, out min, out max);
                    count = rows;
                    pch?.Checkpoint(rows);
                }
            }
        }
コード例 #25
0
 /// <summary>Forwards the (possibly lazy) row count request to the wrapped input view.</summary>
 public long? GetRowCount(bool lazy = true) => _input.GetRowCount(lazy);
コード例 #26
0
 /// <summary>Returns the row count of the underlying source view, if available.</summary>
 public long? GetRowCount()
 {
     // Fix: the assert message previously said "_input", but the member actually
     // validated (and read below) is _source; nameof keeps the message in sync.
     _host.AssertValue(_source, nameof(_source));
     return _source.GetRowCount();
 }
コード例 #27
0
 /// <summary>Forwards the row count request to the source view.</summary>
 public long? GetRowCount() => _source.GetRowCount();
コード例 #28
0
 /// <summary>Forwards the (possibly lazy) row count request to the source view.</summary>
 public long? GetRowCount(bool lazy = true) => _source.GetRowCount(lazy);
コード例 #29
0
        /// <summary>
        /// Marshals the contents of <paramref name="view"/> to the native data sink in
        /// <paramref name="penv"/>: first the column block (names, kinds, key cardinalities),
        /// then key-value metadata, then every row via the setter callbacks. Key-typed
        /// columns are remapped to signed kinds so -1 can represent a missing value on the
        /// Python side. No-op when the environment supplies no data sink.
        /// </summary>
        /// <param name="ch">Channel for asserts and error reporting.</param>
        /// <param name="penv">Native environment block holding the sink callbacks.</param>
        /// <param name="view">The data view to transfer.</param>
        /// <param name="infos">Optional per-column metadata controlling vector expansion and slot names.</param>
        private static unsafe void SendViewToNative(IChannel ch, EnvironmentBlock *penv, IDataView view, Dictionary <string, ColumnMetadataInfo> infos = null)
        {
            Contracts.AssertValue(ch);
            Contracts.Assert(penv != null);
            Contracts.AssertValue(view);
            Contracts.AssertValueOrNull(infos);
            if (penv->dataSink == null)
            {
                // Environment doesn't want any data!
                return;
            }

            var dataSink = MarshalDelegate <DataSink>(penv->dataSink);

            var schema        = view.Schema;
            var colIndices    = new List <int>();
            var kindList      = new List <DataKind>();
            var keyCardList   = new List <int>();
            var nameUtf8Bytes = new List <Byte>();
            var nameIndices   = new List <int>();

            var expandCols = new HashSet <int>();
            var allNames   = new HashSet <string>();

            // First pass over the schema: decide, per visible column, the output kind,
            // key cardinality, and the flat (possibly per-slot) output names.
            for (int col = 0; col < schema.Count; col++)
            {
                if (schema[col].IsHidden)
                {
                    continue;
                }

                var fullType = schema[col].Type;
                var itemType = fullType.ItemType;
                var name     = schema[col].Name;

                DataKind kind = itemType.RawKind;
                int      keyCard;

                if (fullType.ValueCount == 0)
                {
                    throw ch.ExceptNotSupp("Column has variable length vector: " +
                                           name + ". Not supported in python. Drop column before sending to Python");
                }

                if (itemType.IsKey)
                {
                    // Key types are returned as their signed counterparts in Python, so that -1 can be the missing value.
                    // For U1 and U2 kinds, we convert to a larger type to prevent overflow. For U4 and U8 kinds, we convert
                    // to I4 if the key count is known (since KeyCount is an I4), and to I8 otherwise.
                    switch (kind)
                    {
                    case DataKind.U1:
                        kind = DataKind.I2;
                        break;

                    case DataKind.U2:
                        kind = DataKind.I4;
                        break;

                    case DataKind.U4:
                        // We convert known-cardinality U4 key types to I4.
                        kind = itemType.KeyCount > 0 ? DataKind.I4 : DataKind.I8;
                        break;

                    case DataKind.U8:
                        // We convert known-cardinality U8 key types to I4, unknown-cardinality to I8.
                        kind = itemType.KeyCount > 0 ? DataKind.I4 : DataKind.I8;
                        break;
                    }

                    // A key cardinality of -1 signals "no key-value metadata" to the sink.
                    keyCard = itemType.KeyCount;
                    if (!schema[col].HasKeyValues(keyCard))
                    {
                        keyCard = -1;
                    }
                }
                else if (itemType.IsStandardScalar())
                {
                    switch (itemType.RawKind)
                    {
                    default:
                        throw Contracts.Except("Data type {0} not handled", itemType.RawKind);

                    case DataKind.I1:
                    case DataKind.I2:
                    case DataKind.I4:
                    case DataKind.I8:
                    case DataKind.U1:
                    case DataKind.U2:
                    case DataKind.U4:
                    case DataKind.U8:
                    case DataKind.R4:
                    case DataKind.R8:
                    case DataKind.BL:
                    case DataKind.TX:
                        break;
                    }
                    keyCard = -1;
                }
                else
                {
                    throw Contracts.Except("Data type {0} not handled", itemType.RawKind);
                }

                // Expanded vector columns contribute one output name per slot, taken from
                // the caller-provided slot names, schema slot-name metadata, or "name.i".
                int nSlots;
                ColumnMetadataInfo info;
                if (infos != null && infos.TryGetValue(name, out info) && info.Expand)
                {
                    expandCols.Add(col);
                    Contracts.Assert(fullType.IsKnownSizeVector);
                    nSlots = fullType.VectorSize;
                    if (info.SlotNames != null)
                    {
                        Contracts.Assert(info.SlotNames.Length == nSlots);
                        for (int i = 0; i < nSlots; i++)
                        {
                            AddUniqueName(info.SlotNames[i], allNames, nameIndices, nameUtf8Bytes);
                        }
                    }
                    else if (schema[col].HasSlotNames(nSlots))
                    {
                        var romNames = default(VBuffer <ReadOnlyMemory <char> >);
                        schema[col].Metadata.GetValue(MetadataUtils.Kinds.SlotNames, ref romNames);
                        foreach (var kvp in romNames.Items(true))
                        {
                            // REVIEW: Add the proper number of zeros to the slot index to make them sort in the right order.
                            var slotName = name + "." +
                                           (!kvp.Value.IsEmpty ? kvp.Value.ToString() : kvp.Key.ToString(CultureInfo.InvariantCulture));
                            AddUniqueName(slotName, allNames, nameIndices, nameUtf8Bytes);
                        }
                    }
                    else
                    {
                        for (int i = 0; i < nSlots; i++)
                        {
                            AddUniqueName(name + "." + i, allNames, nameIndices, nameUtf8Bytes);
                        }
                    }
                }
                else
                {
                    nSlots = 1;
                    AddUniqueName(name, allNames, nameIndices, nameUtf8Bytes);
                }

                colIndices.Add(col);
                // Kind and cardinality are replicated per output slot.
                for (int i = 0; i < nSlots; i++)
                {
                    kindList.Add(kind);
                    keyCardList.Add(keyCard);
                }
            }

            ch.Assert(allNames.Count == kindList.Count);
            ch.Assert(allNames.Count == keyCardList.Count);
            ch.Assert(allNames.Count == nameIndices.Count);

            var kinds     = kindList.ToArray();
            var keyCards  = keyCardList.ToArray();
            var nameBytes = nameUtf8Bytes.ToArray();
            var names     = new byte *[allNames.Count];

            // Pin the managed arrays so their addresses can be handed to native code for
            // the duration of the transfer.
            fixed(DataKind *prgkind = kinds)
            fixed(byte *prgbNames = nameBytes)
            fixed(byte **prgname  = names)
            fixed(int *prgkeyCard = keyCards)
            {
                // Each name pointer is an offset into the single UTF-8 byte buffer.
                for (int iid = 0; iid < names.Length; iid++)
                {
                    names[iid] = prgbNames + nameIndices[iid];
                }

                DataViewBlock block;

                block.ccol     = allNames.Count;
                block.crow     = view.GetRowCount() ?? 0;
                block.names    = (sbyte **)prgname;
                block.kinds    = prgkind;
                block.keyCards = prgkeyCard;

                dataSink(penv, &block, out var setters, out var keyValueSetter);

                if (setters == null)
                {
                    // REVIEW: What should we do?
                    return;
                }
                ch.Assert(keyValueSetter != null);
                var kvSet = MarshalDelegate <KeyValueSetter>(keyValueSetter);

                using (var cursor = view.GetRowCursor(colIndices.Contains))
                {
                    // Push key-value metadata first, then build one buffer filler per column.
                    var fillers  = new BufferFillerBase[colIndices.Count];
                    var pyColumn = 0;
                    var keyIndex = 0;
                    for (int i = 0; i < colIndices.Count; i++)
                    {
                        var type = schema[colIndices[i]].Type;
                        if (type.ItemType.IsKey && schema[colIndices[i]].HasKeyValues(type.ItemType.KeyCount))
                        {
                            ch.Assert(schema[colIndices[i]].HasKeyValues(type.ItemType.KeyCount));
                            var keyValues = default(VBuffer <ReadOnlyMemory <char> >);
                            schema[colIndices[i]].Metadata.GetValue(MetadataUtils.Kinds.KeyValues, ref keyValues);
                            for (int slot = 0; slot < type.ValueCount; slot++)
                            {
                                foreach (var kvp in keyValues.Items())
                                {
                                    if (kvp.Value.IsEmpty)
                                    {
                                        // Empty key name: pass a null pointer of length zero.
                                        kvSet(penv, keyIndex, kvp.Key, null, 0);
                                    }
                                    else
                                    {
                                        byte[] bt = Encoding.UTF8.GetBytes(kvp.Value.ToString());

                                        fixed(byte *pt = bt)
                                        kvSet(penv, keyIndex, kvp.Key, (sbyte *)pt, bt.Length);
                                    }
                                }
                                keyIndex++;
                            }
                        }
                        fillers[i] = BufferFillerBase.Create(penv, cursor, pyColumn, colIndices[i], kinds[pyColumn], type, setters[pyColumn]);
                        // Vector columns occupy one native column per slot.
                        pyColumn  += type.IsVector ? type.VectorSize : 1;
                    }
                    for (int crow = 0; ; crow++)
                    {
                        // Advance to the next row.
                        if (!cursor.MoveNext())
                        {
                            break;
                        }

                        // Fill values for the current row.
                        for (int i = 0; i < fillers.Length; i++)
                        {
                            fillers[i].Set();
                        }
                    }
                }
            }
        }
 /// <summary>Forwards the row count request to the wrapped input view.</summary>
 public long? GetRowCount() => _input.GetRowCount();