/// <summary>
/// Creates the estimator after validating every supplied column description.
/// </summary>
/// <param name="env">Environment used to register the host for this estimator.</param>
/// <param name="columns">Column descriptions; each needs a non-blank expression, a non-blank
/// name and between one and five input column names.</param>
internal ExpressionEstimator(IHostEnvironment env, params ColumnOptions[] columns)
{
    Contracts.CheckValue(env, nameof(env));
    _host = env.Register(nameof(ExpressionEstimator));
    _host.CheckNonEmpty(columns, nameof(columns));

    // Each rule is verified in its own pass over the columns, so the first rule that is
    // violated anywhere in the array is the one that gets reported.
    bool everyExpressionSet = !columns.Any(c => string.IsNullOrWhiteSpace(c.Expression));
    _host.Check(everyExpressionSet);

    bool everyNameSet = !columns.Any(c => string.IsNullOrWhiteSpace(c.Name));
    _host.Check(everyNameSet);

    bool everyColumnHasInput = !columns.Any(c => Utils.Size(c.InputColumnNames) <= 0);
    _host.Check(everyColumnHasInput);

    bool inputCountWithinLimit = !columns.Any(c => Utils.Size(c.InputColumnNames) > 5);
    _host.CheckParam(inputCountWithinLimit, nameof(ColumnOptions.InputColumnNames), "maximum number of inputs exceeded");

    _columns = columns;
}
/// <summary>
/// Stores the host, the model and the training schema after validating each of them.
/// </summary>
/// <param name="host">Host used for all subsequent contract checks.</param>
/// <param name="model">The model; it must also implement <see cref="IPredictor"/>.</param>
/// <param name="trainSchema">Schema of the data the model was trained on.</param>
private protected PredictionTransformerBase(IHost host, TModel model, DataViewSchema trainSchema)
{
    Contracts.CheckValue(host, nameof(host));
    Host = host;

    Host.CheckValue(model, nameof(model));
    bool modelIsPredictor = model is IPredictor;
    Host.CheckParam(modelIsPredictor, nameof(model));
    Model = model;

    Host.CheckValue(trainSchema, nameof(trainSchema));
    TrainSchema = trainSchema;
}
/// <summary>
/// Builds a loader over <paramref name="stream"/>: validates the stream, reads the Parquet
/// schema (no rows) and derives the loader's columns and <c>Schema</c> from it.
/// </summary>
/// <param name="args">Loader arguments; supplies <c>TreatBigIntegersAsDates</c> and <c>ColumnChunkReadSize</c>.</param>
/// <param name="host">Host used for contract checks and for the initialization channel.</param>
/// <param name="stream">Readable, seekable stream positioned at offset 0.</param>
private ParquetLoader(Arguments args, IHost host, Stream stream)
{
    Contracts.AssertValue(host, nameof(host));
    _host = host;
    _host.CheckValue(args, nameof(args));
    _host.CheckValue(stream, nameof(stream));

    // The capability checks stay in this order so that Position is only queried
    // on a stream already known to be seekable.
    _host.CheckParam(stream.CanRead, nameof(stream), "input stream must be readable");
    _host.CheckParam(stream.CanSeek, nameof(stream), "input stream must be seekable");
    _host.CheckParam(stream.Position == 0, nameof(stream), "input stream must be at head");

    using (var initChannel = _host.Start("Initializing host"))
    {
        _parquetStream = stream;

        var parquetOptions = new ParquetOptions();
        parquetOptions.TreatByteArrayAsString = true;
        parquetOptions.TreatBigIntegersAsDates = args.TreatBigIntegersAsDates;
        _parquetOptions = parquetOptions;

        try
        {
            // Only the schema is needed here, so request zero rows.
            var schemaOnly = new ReaderOptions();
            schemaOnly.Count = 0;
            schemaOnly.Offset = 0;
            _schemaDataSet = ParquetReader.Read(stream, _parquetOptions, schemaOnly);
        }
        catch (Exception ex)
        {
            throw new InvalidDataException("Cannot read Parquet file", ex);
        }

        _columnChunkReadSize = args.ColumnChunkReadSize;
        InitColumns(initChannel, out _columnsLoaded);
        Schema = CreateSchema(_host, _columnsLoaded);
    }
}
/// <summary>
/// Initializes a new instance of <see cref="ColumnConcatenatingEstimator"/>
/// </summary>
/// <param name="env">The local instance of <see cref="IHostEnvironment"/>.</param>
/// <param name="outputColumn">The name of the resulting column. Must be non-empty.</param>
/// <param name="inputColumns">The columns to concatenate together. Must be non-null and must not
/// contain null or empty names.</param>
public ColumnConcatenatingEstimator(IHostEnvironment env, string outputColumn, params string[] inputColumns)
{
    Contracts.CheckValue(env, nameof(env));

    // Register under the exact type name. A hand-written literal is easy to get subtly wrong
    // (for example with stray trailing whitespace), which then leaks into the host name.
    _host = env.Register(nameof(ColumnConcatenatingEstimator));

    _host.CheckNonEmpty(outputColumn, nameof(outputColumn));
    _host.CheckValue(inputColumns, nameof(inputColumns));
    _host.CheckParam(!inputColumns.Any(r => string.IsNullOrEmpty(r)), nameof(inputColumns),
        "Contained some null or empty items");

    _name = outputColumn;
    _source = inputColumns;
}
/// <summary>
/// Creates an estimator that concatenates the <paramref name="source"/> columns into the
/// column called <paramref name="name"/>.
/// </summary>
/// <param name="env">Environment used to register the host for this estimator.</param>
/// <param name="name">Name of the output column; must be non-empty.</param>
/// <param name="source">Names of the input columns; the array must be non-empty and must not
/// contain null or empty entries.</param>
public ConcatEstimator(IHostEnvironment env, string name, params string[] source)
{
    Contracts.CheckValue(env, nameof(env));
    _host = env.Register("ConcatEstimator");

    _host.CheckNonEmpty(name, nameof(name));
    _host.CheckNonEmpty(source, nameof(source));

    bool everySourceNamed = source.All(columnName => !string.IsNullOrEmpty(columnName));
    _host.CheckParam(everySourceNamed, nameof(source), "Contained some null or empty items");

    _name = name;
    _source = source;
}
/// <summary>
/// Records the host and the feature, label and weight column descriptions for the trainer.
/// </summary>
/// <param name="host">Host used for contract checks.</param>
/// <param name="feature">Feature column; its <c>IsValid</c> flag must be set.</param>
/// <param name="label">Label column; stored without validation.</param>
/// <param name="weight">Optional weight column; stored without validation.</param>
private protected TrainerEstimatorBase(IHost host, SchemaShape.Column feature, SchemaShape.Column label, SchemaShape.Column weight = default)
{
    Contracts.CheckValue(host, nameof(host));
    Host = host;

    // Only the feature column is validated; label and weight are accepted as given.
    bool featureInitialized = feature.IsValid;
    Host.CheckParam(featureInitialized, nameof(feature), "not initialized properly");

    FeatureColumn = feature;
    LabelColumn = label;
    WeightColumn = weight;
}
/// <summary>
/// Creates a text featurizing estimator that reads the <paramref name="source"/> columns and
/// writes the column called <paramref name="name"/>.
/// </summary>
/// <param name="env">Environment used to register the host for this estimator.</param>
/// <param name="name">Name of the output column; must be non-empty.</param>
/// <param name="source">Names of the input columns; must be non-null, contain at least one entry,
/// and contain no null or whitespace-only entries.</param>
/// <param name="options">Optional settings; a default <c>Options</c> instance is used when null.</param>
internal TextFeaturizingEstimator(IHostEnvironment env, string name, IEnumerable<string> source, Options options = null)
{
    Contracts.CheckValue(env, nameof(env));
    _host = env.Register(nameof(TextFeaturizingEstimator));
    _host.CheckValue(source, nameof(source));

    // Materialize the sequence exactly once. Validating the enumerable and then calling
    // ToArray() separately would enumerate it three times, which is wasteful and, for a lazy
    // or changing sequence, could validate different contents than the ones stored.
    string[] inputColumns = source.ToArray();
    _host.CheckParam(inputColumns.Length > 0, nameof(source));
    _host.CheckParam(!inputColumns.Any(string.IsNullOrWhiteSpace), nameof(source));
    _host.CheckNonEmpty(name, nameof(name));
    _host.CheckValueOrNull(options);

    _inputColumns = inputColumns;
    OutputColumn = name;
    OptionalSettings = options ?? new Options();

    _dictionary = null;
    _wordFeatureExtractor = OptionalSettings.WordFeatureExtractorFactory;
    _charFeatureExtractor = OptionalSettings.CharFeatureExtractorFactory;
}
/// <summary>
/// Validates the training context and runs <c>TrainCore</c> inside a "Training" channel and
/// progress channel.
/// </summary>
/// <param name="context">Training context supplying the training set, validation set and an
/// optional initial predictor. A non-null initial predictor must be a
/// <see cref="FieldAwareFactorizationMachineModelParameters"/>.</param>
/// <returns>The model parameters returned by <c>TrainCore</c>.</returns>
private FieldAwareFactorizationMachineModelParameters Train(TrainContext context)
{
    _host.CheckValue(context, nameof(context));

    var initPredictor = context.InitialPredictor as FieldAwareFactorizationMachineModelParameters;

    // A predictor was supplied but the cast failed: it has the wrong type.
    bool wrongPredictorType = context.InitialPredictor != null && initPredictor == null;
    _host.CheckParam(!wrongPredictorType, nameof(context),
        $"Initial predictor should have been {nameof(FieldAwareFactorizationMachineModelParameters)}");

    using (var trainingChannel = _host.Start("Training"))
    {
        using (var progressChannel = _host.StartProgressChannel("Training"))
        {
            var trained = TrainCore(trainingChannel, progressChannel, context.TrainingSet, context.ValidationSet, initPredictor);
            return trained;
        }
    }
}
/// <summary>
/// Creates a text featurizing estimator that reads <paramref name="inputColumns"/> and writes
/// <paramref name="outputColumn"/>.
/// </summary>
/// <param name="env">Environment used to register the host for this estimator.</param>
/// <param name="inputColumns">Names of the input columns; must be non-null, contain at least one
/// entry, and contain no null or whitespace-only entries.</param>
/// <param name="outputColumn">Name of the output column; must be non-empty.</param>
/// <param name="advancedSettings">Optional callback invoked on a freshly created
/// <c>Settings</c> instance so the caller can adjust it.</param>
public TextFeaturizingEstimator(IHostEnvironment env, IEnumerable<string> inputColumns, string outputColumn, Action<Settings> advancedSettings = null)
{
    Contracts.CheckValue(env, nameof(env));
    _host = env.Register(nameof(TextFeaturizingEstimator));
    _host.CheckValue(inputColumns, nameof(inputColumns));

    // Materialize the sequence exactly once. Validating the enumerable and then calling
    // ToArray() separately would enumerate it three times, which is wasteful and, for a lazy
    // or changing sequence, could validate different contents than the ones stored.
    string[] columnNames = inputColumns.ToArray();
    _host.CheckParam(columnNames.Length > 0, nameof(inputColumns));
    _host.CheckParam(!columnNames.Any(string.IsNullOrWhiteSpace), nameof(inputColumns));
    _host.CheckNonEmpty(outputColumn, nameof(outputColumn));
    _host.CheckValueOrNull(advancedSettings);

    _inputColumns = columnNames;
    OutputColumn = outputColumn;

    AdvancedSettings = new Settings();
    advancedSettings?.Invoke(AdvancedSettings);

    _dictionary = null;
    _wordFeatureExtractor = new NgramExtractingTransformer.NgramExtractorArguments();
    _charFeatureExtractor = new NgramExtractingTransformer.NgramExtractorArguments() { NgramLength = 3, AllLengths = false };
}
/// <summary>
/// Creates the featurizer and derives its feature count from <paramref name="labelBinCount"/>.
/// </summary>
/// <param name="env">Environment used to register the host for this featurizer.</param>
/// <param name="priorCoef">Stored as <c>PriorCoef</c>; not validated here.</param>
/// <param name="laplaceScale">Stored as <c>LaplaceScale</c>; not validated here.</param>
/// <param name="labelBinCount">Number of label bins. Must be greater than 1 and small enough
/// that the derived feature count fits in an <see cref="int"/>.</param>
/// <param name="countTable">The count table; asserted to be non-null.</param>
public CountTargetEncodingFeaturizer(IHostEnvironment env, float[] priorCoef, float[] laplaceScale, long labelBinCount, MultiCountTableBase countTable)
{
    // NumFeatures is at most 2 * labelBinCount + 1, so this is the largest bin count for which
    // neither the narrowing cast below nor the feature-count sum can overflow.
    const long maxLabelBinCount = (int.MaxValue - 1) / 2;

    Contracts.CheckValue(env, nameof(env));
    _host = env.Register(RegistrationName);
    _host.CheckParam(labelBinCount > 1, nameof(labelBinCount), "Must be greater than 1");

    // Without this guard the (int) cast silently truncates large values and the featurizer
    // ends up with a meaningless, possibly negative, bin count.
    _host.CheckParam(labelBinCount <= maxLabelBinCount, nameof(labelBinCount),
        "Too large: the derived feature count must fit in a 32-bit integer");

    _labelBinCount = (int)labelBinCount;

    // Two bins need a single log-odds value; otherwise one per bin.
    _logOddsCount = _labelBinCount == 2 ? 1 : _labelBinCount;
    NumFeatures = _labelBinCount + _logOddsCount + 1;

    PriorCoef = priorCoef;
    LaplaceScale = laplaceScale;

    _host.AssertValue(countTable);
    _countTables = countTable;
}
/// <summary>
/// Saves columns of <paramref name="data"/> to <paramref name="stream"/> using three cooperating
/// stages: a fetch stage on the calling thread, a pool of compression threads, and a single
/// writer thread. The stages hand blocks to each other through two bounded queues.
/// </summary>
/// <param name="stream">Destination stream. Must be writable, seekable and positioned at offset 0.</param>
/// <param name="data">The data view to save.</param>
/// <param name="colIndices">Column indices passed to <c>GetActiveColumns</c> to select what is
/// written; may be null.</param>
public void SaveData(Stream stream, IDataView data, params int[] colIndices)
{
    _host.CheckValue(stream, nameof(stream));
    _host.CheckValue(data, nameof(data));
    _host.CheckValueOrNull(colIndices);
    _host.CheckParam(stream.CanWrite, nameof(stream), "cannot save to non-writable stream");
    _host.CheckParam(stream.CanSeek, nameof(stream), "cannot save to non-seekable stream");
    _host.CheckParam(stream.Position == 0, nameof(stream), "stream must be positioned at head of stream");

    using (IChannel ch = _host.Start("Saving"))
    using (ExceptionMarshaller exMarshaller = new ExceptionMarshaller())
    {
        // Bounded hand-off queues between the stages (capacity 16 blocks each):
        // fetch -> toCompress -> compression workers -> toWrite -> writer.
        var toWrite = new BlockingCollection <Block>(16);
        var toCompress = new BlockingCollection <Block>(16);
        var activeColumns = GetActiveColumns(data.Schema, colIndices);
        int rowsPerBlock = RowsPerBlockHeuristic(data, activeColumns);
        ch.Assert(rowsPerBlock > 0);
        Stopwatch sw = new Stopwatch();

        // Set up the compression and write workers that consume the input information first.
        Task compressionTask = null;
        if (activeColumns.Length > 0)
        {
            // The waiter is only created when deterministic block order was requested;
            // otherwise the compression workers receive null.
            OrderedWaiter waiter = _deterministicBlockOrder ? new OrderedWaiter() : null;
            // One compression thread per logical processor.
            Thread[] compressionThreads = new Thread[Environment.ProcessorCount];
            for (int i = 0; i < compressionThreads.Length; ++i)
            {
                compressionThreads[i] = Utils.CreateBackgroundThread(
                    () => CompressionWorker(toCompress, toWrite, activeColumns.Length, waiter, exMarshaller));
                compressionThreads[i].Start();
            }
            // This task completes once every compression thread has exited.
            compressionTask = new Task(() =>
            {
                foreach (Thread t in compressionThreads)
                {
                    t.Join();
                }
            });
            compressionTask.Start();
        }

        // While there is an advantage to putting the IO into a separate thread, there is not an
        // advantage to having more than one worker.
        Thread writeThread = Utils.CreateBackgroundThread(
            () => WriteWorker(stream, toWrite, activeColumns, data.Schema, rowsPerBlock, _host, exMarshaller));
        writeThread.Start();
        sw.Start();

        // REVIEW: For now the fetch worker just works in the main thread. If it's
        // a fairly large view though, it may be advantageous to consider breaking up the
        // fetchwrite operations on the pipes, somehow.
        // Despite running in the main thread for now, the fetch worker follows the same
        // pattern of utilizing exMarshaller.
        // No progress channel is created when running silently; FetchWorker then receives null.
        using (var pch = _silent ? null : _host.StartProgressChannel("BinarySaver"))
        {
            FetchWorker(toCompress, data, activeColumns, rowsPerBlock, sw, ch, pch, exMarshaller);
        }

        // With no compression task (no active columns) nothing else drains toCompress, so it
        // must already be completed at this point.
        // NOTE(review): presumably FetchWorker marks toCompress as complete; confirm there.
        _host.Assert(compressionTask != null || toCompress.IsCompleted);
        if (compressionTask != null)
        {
            compressionTask.Wait();
        }
        // All compression threads have exited, so no more blocks will be queued for writing.
        toWrite.CompleteAdding();
        writeThread.Join();
        // NOTE(review): presumably rethrows an exception captured from one of the workers; confirm
        // against ExceptionMarshaller.
        exMarshaller.ThrowIfSet(ch);
        if (!_silent)
        {
            ch.Info("Wrote {0} rows across {1} columns in {2}", _rowCount, activeColumns.Length, sw.Elapsed);
        }
        // When we dispose the exception marshaller, this will set the cancellation token when we internally
        // dispose the cancellation token source, so one way or another those threads are being cancelled, even
        // if an exception is thrown in the main body of this function.
    }
}
/// <summary>
/// Verifies that model number <paramref name="i"/> is present: both arrays are non-null, the
/// model pointer at that index is non-null, and its recorded length is positive.
/// </summary>
/// <param name="host">Host used to raise the parameter check failure.</param>
/// <param name="ppModelBin">Array of model pointers.</param>
/// <param name="pllModelBinLen">Array of model lengths.</param>
/// <param name="i">Index of the model to check.</param>
private static void CheckModel(IHost host, byte **ppModelBin, long *pllModelBinLen, int i)
{
    // Evaluated left to right with short-circuiting, so neither array is indexed
    // before it is known to be non-null.
    bool modelPresent = ppModelBin != null && ppModelBin[i] != null;
    bool lengthPresent = modelPresent && pllModelBinLen != null && pllModelBinLen[i] > 0;

    // NOTE(review): the reported parameter name is the literal "pModelBin", which matches
    // neither parameter of this method - confirm it is intentional.
    host.CheckParam(lengthPresent, "pModelBin", "Model is missing");
}
/// <summary>
/// Produces the complete PFA program and sets <see cref="PfaContext.OutputType"/> according
/// to the columns that could actually be emitted.
/// </summary>
/// <param name="schema">The schema corresponding to what we are outputting</param>
/// <param name="toOutput">The columns to output. Every entry must be non-null and present in
/// <paramref name="schema"/>; columns without a PFA type or token are skipped.</param>
/// <returns>Returns a complete PFA program, where the output will correspond to the subset
/// of columns from <paramref name="schema"/>. A single emitted column is output directly;
/// several are wrapped in a "DataOutput" record.</returns>
public JObject Finalize(DataViewSchema schema, params string[] toOutput)
{
    _host.CheckValue(schema, nameof(schema));
    _host.CheckValue(toOutput, nameof(toOutput));

    // Remembered for the single-column case, where the column is output without a record.
    JToken latestType = null;
    string latestToken = null;

    var recordType = new JObject
    {
        ["type"] = "record",
        ["name"] = "DataOutput"
    };
    var recordValues = new JObject();
    var fields = new JArray();
    var usedFieldNames = new HashSet<string>();

    foreach (var columnName in toOutput)
    {
        _host.CheckParam(columnName != null, nameof(toOutput), "Null values in array");

        int columnIndex;
        if (!schema.TryGetColumnIndex(columnName, out columnIndex))
        {
            throw _host.ExceptParam(nameof(toOutput), $"Requested column '{columnName}' not in schema");
        }

        // The token is only looked up when the column has a PFA type; a column lacking
        // either one is not available and is silently skipped.
        JToken columnType = PfaTypeOrNullForColumn(schema, columnIndex);
        string columnToken = columnType == null ? null : TokenOrNullForName(columnName);
        if (columnToken == null)
        {
            continue;
        }

        // We can write it out.
        latestType = columnType;
        latestToken = columnToken;

        string fieldName = ModelUtils.CreateNameCore(columnName, usedFieldNames.Contains);
        usedFieldNames.Add(fieldName);

        fields.Add(new JObject
        {
            ["name"] = fieldName,
            ["type"] = columnType
        });
        recordValues[fieldName] = columnToken;
    }

    recordType["fields"] = fields;
    _host.Check(fields.Count >= 1, "Pipeline produced no outputs for the PFA conversion");

    if (fields.Count == 1)
    {
        Pfa.OutputType = latestType;
        Pfa.Final = latestToken;
    }
    else
    {
        var recordExpression = new JObject
        {
            ["type"] = "DataOutput",
            ["new"] = recordValues
        };
        Pfa.OutputType = recordType;
        Pfa.Final = recordExpression;
    }

    return Pfa.Finalize();
}
/// <summary>
/// Adds an output variable to the list.
/// </summary>
/// <param name="type">Data view type of the variable; must be non-null.</param>
/// <param name="variableName">Name of the variable; <c>IsVariableDefined</c> must return true for it.</param>
/// <param name="dim">Optional dimensions, forwarded unchanged to <c>OnnxUtils.GetModelArgs</c>.</param>
public void AddOutputVariable(DataViewType type, string variableName, List <long> dim = null)
{
    _host.CheckValue(type, nameof(type));

    bool alreadyDefined = IsVariableDefined(variableName);
    _host.CheckParam(alreadyDefined, nameof(variableName));

    var outputArgs = OnnxUtils.GetModelArgs(type, variableName, dim);
    _outputs.Add(outputArgs);
}