示例#1
0
        /// <summary>
        /// Creates the estimator after validating the environment and every entry of <paramref name="columns"/>.
        /// </summary>
        /// <param name="env">The host environment; a host named after this estimator is registered on it.</param>
        /// <param name="columns">The column options. Must be non-empty; each entry needs a non-blank expression,
        /// a non-blank name, and an input column list whose size is between one and five.</param>
        internal ExpressionEstimator(IHostEnvironment env, params ColumnOptions[] columns)
        {
            Contracts.CheckValue(env, nameof(env));
            _host = env.Register(nameof(ExpressionEstimator));
            _host.CheckNonEmpty(columns, nameof(columns));

            // Each rule is evaluated over all columns before the next rule is considered.
            bool everyColumnHasExpression = !columns.Any(c => string.IsNullOrWhiteSpace(c.Expression));
            _host.Check(everyColumnHasExpression);

            bool everyColumnHasName = !columns.Any(c => string.IsNullOrWhiteSpace(c.Name));
            _host.Check(everyColumnHasName);

            bool everyColumnHasInputs = !columns.Any(c => Utils.Size(c.InputColumnNames) <= 0);
            _host.Check(everyColumnHasInputs);

            // At most five inputs per column are accepted.
            bool inputCountsWithinLimit = !columns.Any(c => Utils.Size(c.InputColumnNames) > 5);
            _host.CheckParam(inputCountsWithinLimit, nameof(ColumnOptions.InputColumnNames), "maximum number of inputs exceeded");

            _columns = columns;
        }
        /// <summary>
        /// Validates the arguments and stores the host, the model and the training schema.
        /// </summary>
        /// <param name="host">The host; used for all subsequent argument checks.</param>
        /// <param name="model">The model; must be non-null and implement <see cref="IPredictor"/>.</param>
        /// <param name="trainSchema">The schema of the training data; must be non-null.</param>
        private protected PredictionTransformerBase(IHost host, TModel model, DataViewSchema trainSchema)
        {
            Contracts.CheckValue(host, nameof(host));
            Host = host;

            // Validate everything first (in the same order as the parameters are consumed),
            // then store the values.
            Host.CheckValue(model, nameof(model));
            bool modelIsPredictor = model is IPredictor;
            Host.CheckParam(modelIsPredictor, nameof(model));
            Host.CheckValue(trainSchema, nameof(trainSchema));

            Model = model;
            TrainSchema = trainSchema;
        }
        /// <summary>
        /// Builds a loader over an already opened Parquet stream. Only the schema is read from the
        /// stream here (zero rows are requested).
        /// </summary>
        /// <param name="args">Loader arguments; <c>TreatBigIntegersAsDates</c> and <c>ColumnChunkReadSize</c> are read.</param>
        /// <param name="host">The host used for validation and for the initialization channel.</param>
        /// <param name="stream">The Parquet stream; must be readable, seekable and positioned at offset 0.</param>
        /// <exception cref="InvalidDataException">Thrown when reading the schema from the stream fails.</exception>
        private ParquetLoader(Arguments args, IHost host, Stream stream)
        {
            Contracts.AssertValue(host, nameof(host));
            _host = host;

            _host.CheckValue(args, nameof(args));
            _host.CheckValue(stream, nameof(stream));
            _host.CheckParam(stream.CanRead, nameof(stream), "input stream must be readable");
            _host.CheckParam(stream.CanSeek, nameof(stream), "input stream must be seekable");
            _host.CheckParam(stream.Position == 0, nameof(stream), "input stream must be at head");

            using (var channel = _host.Start("Initializing host"))
            {
                _parquetStream = stream;

                var options = new ParquetOptions();
                options.TreatByteArrayAsString = true;
                options.TreatBigIntegersAsDates = args.TreatBigIntegersAsDates;
                _parquetOptions = options;

                try
                {
                    // Count = 0 asks for no rows: only the schema of the data set is of interest here.
                    var schemaOnly = new ReaderOptions { Count = 0, Offset = 0 };
                    _schemaDataSet = ParquetReader.Read(stream, _parquetOptions, schemaOnly);
                }
                catch (Exception readFailure)
                {
                    // Any failure while reading is reported as invalid data, keeping the cause as inner exception.
                    throw new InvalidDataException("Cannot read Parquet file", readFailure);
                }

                _columnChunkReadSize = args.ColumnChunkReadSize;
                InitColumns(channel, out _columnsLoaded);
                Schema = CreateSchema(_host, _columnsLoaded);
            }
        }
        /// <summary>
        /// Initializes a new instance of <see cref="ColumnConcatenatingEstimator"/>.
        /// </summary>
        /// <param name="env">The local instance of <see cref="IHostEnvironment"/>.</param>
        /// <param name="outputColumn">The name of the resulting column; must be non-empty.</param>
        /// <param name="inputColumns">The columns to concatenate together; the array must be non-null and
        /// must not contain null or empty names.</param>
        public ColumnConcatenatingEstimator(IHostEnvironment env, string outputColumn, params string[] inputColumns)
        {
            Contracts.CheckValue(env, nameof(env));

            // Register under the exact type name. The previous literal carried a stray trailing space,
            // which leaked into the host name.
            _host = env.Register(nameof(ColumnConcatenatingEstimator));

            _host.CheckNonEmpty(outputColumn, nameof(outputColumn));
            _host.CheckValue(inputColumns, nameof(inputColumns));
            _host.CheckParam(!inputColumns.Any(string.IsNullOrEmpty), nameof(inputColumns),
                             "Contained some null or empty items");

            _name   = outputColumn;
            _source = inputColumns;
        }
示例#5
0
        /// <summary>
        /// Initializes a new instance of <see cref="ConcatEstimator"/>.
        /// </summary>
        /// <param name="env">The host environment; must be non-null.</param>
        /// <param name="name">The name of the output column; must be non-empty.</param>
        /// <param name="source">The input column names; must be non-empty and free of null or empty entries.</param>
        public ConcatEstimator(IHostEnvironment env, string name, params string[] source)
        {
            Contracts.CheckValue(env, nameof(env));
            _host = env.Register(nameof(ConcatEstimator));

            _host.CheckNonEmpty(name, nameof(name));
            _host.CheckNonEmpty(source, nameof(source));

            bool allNamesUsable = source.All(item => !string.IsNullOrEmpty(item));
            _host.CheckParam(allNamesUsable, nameof(source), "Contained some null or empty items");

            _source = source;
            _name = name;
        }
        /// <summary>
        /// Stores the host and the feature, label and weight column descriptions.
        /// </summary>
        /// <param name="host">The host; must be non-null.</param>
        /// <param name="feature">The feature column; its <c>IsValid</c> flag must be set.</param>
        /// <param name="label">The label column; stored as given, without validation.</param>
        /// <param name="weight">The weight column; stored as given, without validation.</param>
        private protected TrainerEstimatorBase(IHost host,
                                               SchemaShape.Column feature,
                                               SchemaShape.Column label,
                                               SchemaShape.Column weight = default)
        {
            Contracts.CheckValue(host, nameof(host));
            Host = host;

            // Only the feature column is checked here.
            bool featureInitialized = feature.IsValid;
            Host.CheckParam(featureInitialized, nameof(feature), "not initialized properly");

            WeightColumn = weight;
            LabelColumn = label;
            FeatureColumn = feature;
        }
示例#7
0
        /// <summary>
        /// Creates a text featurizing estimator over the given input columns.
        /// </summary>
        /// <param name="env">The host environment; must be non-null.</param>
        /// <param name="name">The name of the output column; must be non-empty.</param>
        /// <param name="source">The input column names; must be non-null, contain at least one entry, and
        /// contain no null or whitespace-only entries. The sequence is enumerated exactly once.</param>
        /// <param name="options">Optional settings; when null, a default <see cref="Options"/> instance is used.</param>
        internal TextFeaturizingEstimator(IHostEnvironment env, string name, IEnumerable<string> source, Options options = null)
        {
            Contracts.CheckValue(env, nameof(env));
            _host = env.Register(nameof(TextFeaturizingEstimator));
            _host.CheckValue(source, nameof(source));

            // Materialize once: the sequence used to be enumerated three times (two Any() calls plus
            // ToArray()), which is wasteful for lazy sequences and incorrect for one-shot ones.
            string[] inputColumns = source.ToArray();
            _host.CheckParam(inputColumns.Length > 0, nameof(source));
            _host.CheckParam(!inputColumns.Any(string.IsNullOrWhiteSpace), nameof(source));
            _host.CheckNonEmpty(name, nameof(name));
            _host.CheckValueOrNull(options);

            _inputColumns = inputColumns;
            OutputColumn = name;

            // Fall back to defaults only when the caller supplied no options.
            OptionalSettings = options ?? new Options();

            _dictionary = null;
            _wordFeatureExtractor = OptionalSettings.WordFeatureExtractorFactory;
            _charFeatureExtractor = OptionalSettings.CharFeatureExtractorFactory;
        }
示例#8
0
        /// <summary>
        /// Trains a model from the given context, optionally starting from the context's initial predictor.
        /// </summary>
        /// <param name="context">The training context; must be non-null. Its initial predictor, when present,
        /// must be a <see cref="FieldAwareFactorizationMachineModelParameters"/>.</param>
        /// <returns>The model parameters produced by <c>TrainCore</c>.</returns>
        private FieldAwareFactorizationMachineModelParameters Train(TrainContext context)
        {
            _host.CheckValue(context, nameof(context));

            var warmStart = context.InitialPredictor as FieldAwareFactorizationMachineModelParameters;

            // Either no initial predictor was supplied, or the cast above must have succeeded.
            bool warmStartUsable = context.InitialPredictor == null || warmStart != null;
            _host.CheckParam(warmStartUsable, nameof(context),
                             $"Initial predictor should have been {nameof(FieldAwareFactorizationMachineModelParameters)}");

            using (var channel = _host.Start("Training"))
            using (var progress = _host.StartProgressChannel("Training"))
            {
                var trained = TrainCore(channel, progress, context.TrainingSet, context.ValidationSet, warmStart);
                return trained;
            }
        }
示例#9
0
        /// <summary>
        /// Creates a text featurizing estimator over the given input columns.
        /// </summary>
        /// <param name="env">The host environment; must be non-null.</param>
        /// <param name="inputColumns">The input column names; must be non-null, contain at least one entry, and
        /// contain no null or whitespace-only entries. The sequence is enumerated exactly once.</param>
        /// <param name="outputColumn">The name of the output column; must be non-empty.</param>
        /// <param name="advancedSettings">Optional delegate invoked on a freshly created <see cref="Settings"/>
        /// instance so the caller can adjust it.</param>
        public TextFeaturizingEstimator(IHostEnvironment env, IEnumerable <string> inputColumns, string outputColumn,
                                        Action <Settings> advancedSettings = null)
        {
            Contracts.CheckValue(env, nameof(env));
            _host = env.Register(nameof(TextFeaturizingEstimator));
            _host.CheckValue(inputColumns, nameof(inputColumns));

            // Materialize once: the sequence used to be enumerated three times (two Any() calls plus
            // ToArray()), which is wasteful for lazy sequences and incorrect for one-shot ones.
            string[] columns = inputColumns.ToArray();
            _host.CheckParam(columns.Length > 0, nameof(inputColumns));
            _host.CheckParam(!columns.Any(string.IsNullOrWhiteSpace), nameof(inputColumns));
            _host.CheckNonEmpty(outputColumn, nameof(outputColumn));
            _host.CheckValueOrNull(advancedSettings);

            _inputColumns = columns;
            OutputColumn  = outputColumn;

            // Start from default settings and let the caller customize them, if a delegate was given.
            AdvancedSettings = new Settings();
            advancedSettings?.Invoke(AdvancedSettings);

            _dictionary           = null;
            _wordFeatureExtractor = new NgramExtractingTransformer.NgramExtractorArguments();
            _charFeatureExtractor = new NgramExtractingTransformer.NgramExtractorArguments()
            {
                NgramLength = 3, AllLengths = false
            };
        }
        /// <summary>
        /// Creates the featurizer and derives the feature count from the number of label bins.
        /// </summary>
        /// <param name="env">The host environment; must be non-null.</param>
        /// <param name="priorCoef">The prior coefficients; stored as given, without validation.</param>
        /// <param name="laplaceScale">The Laplace scales; stored as given, without validation.</param>
        /// <param name="labelBinCount">The number of label bins; must be greater than 1 and small enough that
        /// the derived feature count (at most <c>2 * labelBinCount + 1</c>) fits in an <see cref="int"/>.</param>
        /// <param name="countTable">The count table; only asserted to be non-null.</param>
        public CountTargetEncodingFeaturizer(IHostEnvironment env, float[] priorCoef, float[] laplaceScale, long labelBinCount, MultiCountTableBase countTable)
        {
            Contracts.CheckValue(env, nameof(env));
            _host = env.Register(RegistrationName);
            _host.CheckParam(labelBinCount > 1, nameof(labelBinCount), "Must be greater than 1");

            // The value is narrowed to int below and NumFeatures can reach 2 * labelBinCount + 1.
            // Reject values that would otherwise be silently truncated or overflow.
            const long maxLabelBinCount = (int.MaxValue - 1) / 2;
            _host.CheckParam(labelBinCount <= maxLabelBinCount, nameof(labelBinCount),
                             "Too large: the feature count would not fit in a 32-bit integer");

            _labelBinCount = (int)labelBinCount;
            // With exactly two bins a single log-odds value is used; otherwise one per bin.
            _logOddsCount  = _labelBinCount == 2 ? 1 : _labelBinCount;
            NumFeatures    = _labelBinCount + _logOddsCount + 1;

            PriorCoef = priorCoef;

            LaplaceScale = laplaceScale;

            _host.AssertValue(countTable);
            _countTables = countTable;
        }
示例#11
0
        /// <summary>
        /// Writes the selected columns of <paramref name="data"/> to <paramref name="stream"/> using a
        /// fetch / compress / write pipeline: fetching runs on the calling thread, compression on one
        /// background thread per processor, and writing on a single background thread.
        /// </summary>
        /// <param name="stream">The destination stream; must be writable, seekable and positioned at offset 0.</param>
        /// <param name="data">The data view to save; must be non-null.</param>
        /// <param name="colIndices">The column indices to consider; may be null. Passed to
        /// <c>GetActiveColumns</c> together with the schema to determine the columns actually written.</param>
        public void SaveData(Stream stream, IDataView data, params int[] colIndices)
        {
            _host.CheckValue(stream, nameof(stream));
            _host.CheckValue(data, nameof(data));
            _host.CheckValueOrNull(colIndices);
            _host.CheckParam(stream.CanWrite, nameof(stream), "cannot save to non-writable stream");
            _host.CheckParam(stream.CanSeek, nameof(stream), "cannot save to non-seekable stream");
            _host.CheckParam(stream.Position == 0, nameof(stream), "stream must be positioned at head of stream");

            using (IChannel ch = _host.Start("Saving"))
                using (ExceptionMarshaller exMarshaller = new ExceptionMarshaller())
                {
                    // Bounded queues (capacity 16) connecting the pipeline stages:
                    // fetch -> toCompress -> compression workers -> toWrite -> write worker.
                    var toWrite       = new BlockingCollection <Block>(16);
                    var toCompress    = new BlockingCollection <Block>(16);
                    var activeColumns = GetActiveColumns(data.Schema, colIndices);
                    int rowsPerBlock  = RowsPerBlockHeuristic(data, activeColumns);
                    ch.Assert(rowsPerBlock > 0);
                    Stopwatch sw = new Stopwatch();

                    // Set up the compression and write workers that consume the input information first.
                    // Compression workers are only started when there is at least one active column.
                    Task compressionTask = null;
                    if (activeColumns.Length > 0)
                    {
                        // The waiter is only created when deterministic block order was requested.
                        OrderedWaiter waiter             = _deterministicBlockOrder ? new OrderedWaiter() : null;
                        Thread[]      compressionThreads = new Thread[Environment.ProcessorCount];
                        for (int i = 0; i < compressionThreads.Length; ++i)
                        {
                            compressionThreads[i] = Utils.CreateBackgroundThread(
                                () => CompressionWorker(toCompress, toWrite, activeColumns.Length, waiter, exMarshaller));
                            compressionThreads[i].Start();
                        }
                        // This task completes once every compression thread has finished.
                        compressionTask = new Task(() =>
                        {
                            foreach (Thread t in compressionThreads)
                            {
                                t.Join();
                            }
                        });
                        compressionTask.Start();
                    }

                    // While there is an advantage to putting the IO into a separate thread, there is not an
                    // advantage to having more than one worker.
                    Thread writeThread = Utils.CreateBackgroundThread(
                        () => WriteWorker(stream, toWrite, activeColumns, data.Schema, rowsPerBlock, _host, exMarshaller));
                    writeThread.Start();
                    sw.Start();

                    // REVIEW: For now the fetch worker just works in the main thread. If it's
                    // a fairly large view though, it may be advantageous to consider breaking up the
                    // fetch/write operations on the pipes, somehow.
                    // Despite running in the main thread for now, the fetch worker follows the same
                    // pattern of utilizing exMarshaller.
                    // No progress channel is created when running silently (pch is null in that case).
                    using (var pch = _silent ? null : _host.StartProgressChannel("BinarySaver"))
                    {
                        FetchWorker(toCompress, data, activeColumns, rowsPerBlock, sw, ch, pch, exMarshaller);
                    }

                    // Without compression workers, the fetch worker is expected to have completed toCompress itself.
                    _host.Assert(compressionTask != null || toCompress.IsCompleted);
                    if (compressionTask != null)
                    {
                        compressionTask.Wait();
                    }
                    // All compression is done, so no further blocks will be queued for the writer.
                    toWrite.CompleteAdding();

                    writeThread.Join();
                    // NOTE(review): presumably rethrows an exception captured by one of the workers — confirm
                    // against ExceptionMarshaller.
                    exMarshaller.ThrowIfSet(ch);
                    if (!_silent)
                    {
                        ch.Info("Wrote {0} rows across {1} columns in {2}", _rowCount, activeColumns.Length, sw.Elapsed);
                    }
                    // When we dispose the exception marshaller, this will set the cancellation token when we internally
                    // dispose the cancellation token source, so one way or another those threads are being cancelled, even
                    // if an exception is thrown in the main body of this function.
                }
        }
示例#12
0
 /// <summary>
 /// Verifies that entry <paramref name="i"/> of the model arrays refers to a model: both arrays must be
 /// non-null, the model pointer at that index must be non-null, and the length at that index must be positive.
 /// </summary>
 /// <param name="host">The host used to raise the parameter check failure.</param>
 /// <param name="ppModelBin">Array of pointers to model binaries.</param>
 /// <param name="pllModelBinLen">Array of model binary lengths.</param>
 /// <param name="i">The index of the entry to check.</param>
 private static void CheckModel(IHost host, byte **ppModelBin, long *pllModelBinLen, int i)
 {
     // The conditions are chained so that nothing is dereferenced once an earlier condition has failed.
     bool modelPresent = ppModelBin != null && ppModelBin[i] != null;
     bool lengthPresent = modelPresent && pllModelBinLen != null && pllModelBinLen[i] > 0;

     host.CheckParam(lengthPresent, "pModelBin", "Model is missing");
 }
示例#13
0
        /// <summary>
        /// Finalizes the PFA program for the requested columns. This call sets
        /// <see cref="PfaContext.OutputType"/> to an output type matching the columns that are actually emitted.
        /// </summary>
        /// <remarks>
        /// Requested columns for which no PFA type or no token is available are skipped. When exactly one column
        /// remains, its type and token are used directly as the output; otherwise the output is a record named
        /// "DataOutput" with one field per remaining column.
        /// </remarks>
        /// <param name="schema">The schema corresponding to what we are outputting.</param>
        /// <param name="toOutput">The columns to output; entries must be non-null and present in
        /// <paramref name="schema"/>.</param>
        /// <returns>Returns a complete PFA program, where the output will correspond to the subset
        /// of columns from <paramref name="schema"/>.</returns>
        public JObject Finalize(DataViewSchema schema, params string[] toOutput)
        {
            _host.CheckValue(schema, nameof(schema));
            _host.CheckValue(toOutput, nameof(toOutput));

            // Type and token of the most recently emitted column; used as-is when only one column is emitted.
            JToken mostRecentType = null;
            string mostRecentToken = null;

            var outputRecord = new JObject { ["type"] = "record", ["name"] = "DataOutput" };
            var recordValues = new JObject();
            var fields = new JArray();
            var usedFieldNames = new HashSet <string>();

            foreach (var columnName in toOutput)
            {
                _host.CheckParam(columnName != null, nameof(toOutput), "Null values in array");

                int columnIndex;
                if (!schema.TryGetColumnIndex(columnName, out columnIndex))
                {
                    throw _host.ExceptParam(nameof(toOutput), $"Requested column '{columnName}' not in schema");
                }

                // A column is skipped when it has no PFA type, or when it has a type but no token.
                // The token is only looked up once a type has been found.
                JToken columnType = PfaTypeOrNullForColumn(schema, columnIndex);
                string columnToken = columnType == null ? null : TokenOrNullForName(columnName);
                if (columnToken == null)
                {
                    continue;
                }

                // We can write it out.
                mostRecentType = columnType;
                mostRecentToken = columnToken;

                // Pick a field name that does not collide with the ones already used.
                string fieldName = ModelUtils.CreateNameCore(columnName, usedFieldNames.Contains);
                usedFieldNames.Add(fieldName);
                fields.Add(new JObject { ["name"] = fieldName, ["type"] = columnType });

                recordValues[fieldName] = columnToken;
            }
            outputRecord["fields"] = fields;

            _host.Check(fields.Count >= 1, "Pipeline produced no outputs for the PFA conversion");

            if (fields.Count == 1)
            {
                // Single output: expose it directly rather than wrapping it in a record.
                Pfa.OutputType = mostRecentType;
                Pfa.Final = mostRecentToken;
            }
            else
            {
                // Several outputs: construct a "DataOutput" record holding all of them.
                var construction = new JObject { ["type"] = "DataOutput", ["new"] = recordValues };
                Pfa.OutputType = outputRecord;
                Pfa.Final = construction;
            }

            return Pfa.Finalize();
        }
 /// <summary>
 /// Adds an output variable to the list of outputs.
 /// </summary>
 /// <param name="type">The type of the variable; must be non-null.</param>
 /// <param name="variableName">The name of the variable; it must already be defined.</param>
 /// <param name="dim">Optional dimensions, forwarded to <c>OnnxUtils.GetModelArgs</c>.</param>
 public void AddOutputVariable(DataViewType type, string variableName, List <long> dim = null)
 {
     _host.CheckValue(type, nameof(type));

     bool isDefined = IsVariableDefined(variableName);
     _host.CheckParam(isDefined, nameof(variableName));

     var outputArgs = OnnxUtils.GetModelArgs(type, variableName, dim);
     _outputs.Add(outputArgs);
 }