protected override void VerifyView(IDataView view)
{
    Host.AssertValue(view);
    // This must have precisely one column, of type vector.
    var schema = view.Schema;
    Host.CheckDecode(schema.ColumnCount == 1);
    var ttype = schema.GetColumnType(0);
    Host.CheckDecode(ttype.IsVector);
    // We have no way to encode a type of zero length vectors per se in the case
    // when there are no rows in the original dataset, but accept that if the vector
    // count is "unknown" then it's really a zero-row dataset.
    Host.CheckDecode(ttype.ValueCount == _parent._header.RowCount);
    // This came from a binary IDV, so it must have an actual "row" count,
    // though the row count here is really more like a "slot" count.
    var rowCountNull = view.GetRowCount();
    Host.Assert(rowCountNull.HasValue);
    long rowCount = rowCountNull.Value;
    // There must be one "row" per "slot" on the column this is a transpose of.
    // Check that.
    var type = _parent.Schema.GetColumnType(_col);
    Host.CheckDecode(type.ValueCount == rowCount);
    // The item types should be the same.
    Host.CheckDecode(type.ItemType.Equals(ttype.ItemType));
}
private AppendRowsDataView(IHostEnvironment env, Schema schema, IDataView[] sources)
{
    Contracts.CheckValue(env, nameof(env));
    _host = env.Register(RegistrationName);

    _host.AssertValueOrNull(schema);
    _host.AssertValue(sources);
    _host.Assert(sources.Length >= 2);

    _sources = sources;
    _schema = schema ?? _sources[0].Schema;
    CheckSchemaConsistency();

    _canShuffle = true;
    _counts = new int[_sources.Length];
    for (int i = 0; i < _sources.Length; i++)
    {
        IDataView dv = _sources[i];
        if (!dv.CanShuffle)
        {
            _canShuffle = false;
            _counts = null;
            break;
        }
        long? count = dv.GetRowCount();
        if (count == null || count < 0 || count > int.MaxValue)
        {
            _canShuffle = false;
            _counts = null;
            break;
        }
        _counts[i] = (int)count;
    }
}
static void Main(string[] args) { Console.WriteLine("Training time series analysis"); //Step 1. Create a ML Context var ctx = new MLContext(); string connectionString = "Data Source=localhost;Initial Catalog=kaggle_wallmart;Provider=SQLNCLI11.1;Integrated Security=SSPI;Auto Translate=False;"; connectionString = "Server=localhost;Database=kaggle_wallmart;Integrated Security=True"; string Query = @" SELECT CAST(X.[Value] AS REAL) AS [TotalSales], CAST(Y.date AS DATE) AS [SalesDate], CAST(year(Y.date) AS REAL) As [Year] FROM [dbo].[RAW_Train_Eval] AS X INNER JOIN [dbo].RAW_Calendar AS Y ON Y.d=X.dCode where Id='HOBBIES_1_278_CA_1_evaluation' order by 2 "; Console.WriteLine("Connecting to the database..."); //dbChecks dbchecks = new dbChecks(); //dbchecks.ExecuteQuery(connectionString, Query); System.Data.SqlClient.SqlClientFactory newFactory = SqlClientFactory.Instance; Console.WriteLine("Loading data..."); DatabaseSource dbSource = new DatabaseSource(SqlClientFactory.Instance, connectionString, Query); DatabaseLoader loader = ctx.Data.CreateDatabaseLoader <ModelInput>(); IDataView dataView = loader.Load(dbSource); Console.WriteLine($"Loaded {dataView.GetRowCount()} rows..."); IDataView trainingData = ctx.Data.FilterRowsByColumn(dataView, "Year", upperBound: 2016); IDataView ValidationData = ctx.Data.FilterRowsByColumn(dataView, "Year", lowerBound: 2016); var forecastingPipeline = ctx.Forecasting.ForecastBySsa( outputColumnName: "ForecastedSales", inputColumnName: "TotalSales", windowSize: 7, seriesLength: 60, trainSize: 300, horizon: 30, confidenceLevel: 0.95f, confidenceLowerBoundColumn: "LowerBoundSales", confidenceUpperBoundColumn: "UpperBoundSales"); SsaForecastingTransformer forecaster = forecastingPipeline.Fit(trainingData); Evaluate(ValidationData, forecaster, ctx); var forecastEngine = forecaster.CreateTimeSeriesEngine <ModelInput, ModelOutput>(ctx); forecastEngine.CheckPoint(ctx, "c:\\temp\\Model.zip"); forecastEngine.CheckPoint(ctx, "C:\\Temp\\WallMartModels\\evaluation\\Model_HOBBIES_1_278_CA_1_evaluation.zip"); Forecast(ValidationData, 7, forecastEngine, ctx); Console.WriteLine("Training time series analysis completed"); }
static void Main(string[] args)
{
    // Data prep.
    var trainData = new List<DiabetesRecord>();
    var dataFolder = GetAbsolutePath("../../../Data/");
    var files = Directory.GetFiles(dataFolder, "*");
    foreach (var filePath in files)
    {
        foreach (var line in File.ReadAllLines(filePath))
        {
            var cols = line.Split('\t');
            var dateStr = $"{cols[0]} {cols[1]}"; //Convert.ToDateTime($"{cols[0]} {cols[1]}", new CultureInfo("id-ID"));
            var len = dateStr.Length;
            float dataValue = 0;
            float.TryParse(cols[3], out dataValue);
            // Make sure this line can be processed / contains correct time-series data.
            if (len >= 15)
            {
                // Parse the date string to a DateTime.
                DateTime.TryParse(dateStr, out DateTime dt);
                if (dt.Year > DateTime.MinValue.Year)
                {
                    trainData.Add(new DiabetesRecord() { TimeStamp = dt, Code = cols[2], Data = dataValue });
                }
            }
        }
    }

    // Only observe data with codes 48 and 57-64.
    HashSet<string> codeIn = new HashSet<string> { "48", "57", "58", "59", "60", "61", "62", "63", "64" };
    trainData = trainData.Where(x => codeIn.Contains(x.Code)).OrderBy(a => a.TimeStamp).ToList();
    Console.WriteLine($"Total data : {trainData.Count}");

    // Create MLContext.
    MLContext mlContext = new MLContext();

    // Load data.
    IDataView data = mlContext.Data.LoadFromEnumerable<DiabetesRecord>(trainData);

    // Use the dataset's row count, falling back to 36 if the view cannot report it.
    var rowCount = data.GetRowCount();
    int size = rowCount.HasValue ? Convert.ToInt32(rowCount.Value) : 36;

    // STEP 1: Create estimators.
    DetectSpike(size, data);
    DetectChangepoint(10, data); // To detect persistent change in the pattern; set 10 data points per sliding window.

    Console.WriteLine("=============== End of process, hit any key to finish ===============");
    Console.ReadLine();
}
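DiabetesRecord is likewise used but never defined in this snippet. A minimal sketch containing just the three members the loop assigns (hypothetical; the original class may carry more fields or attributes):

// Hypothetical record type matching the assignments above.
public class DiabetesRecord
{
    public DateTime TimeStamp { get; set; }
    public string Code { get; set; }
    public float Data { get; set; }
}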
public void TestDataFrameFromIDataView_SelectColumns()
{
    DataFrame df = DataFrameTests.MakeDataFrameWithAllColumnTypes(10, withNulls: false);
    IDataView dfAsIDataView = df;
    DataFrame newDf = dfAsIDataView.ToDataFrame("Int", "Double");
    Assert.Equal(dfAsIDataView.GetRowCount(), newDf.Rows.Count);
    Assert.Equal(2, newDf.Columns.Count);
    Assert.True(df.Columns["Int"].ElementwiseEquals(newDf.Columns["Int"]).All());
    Assert.True(df.Columns["Double"].ElementwiseEquals(newDf.Columns["Double"]).All());
}
/// <summary> /// The InferPipelines methods are just public portals to the internal function that handle different /// types of data being passed in: training IDataView, path to training file, or train and test files. /// </summary> public static AutoMlMlState InferPipelines(IHostEnvironment env, PipelineOptimizerBase autoMlEngine, IDataView trainData, IDataView testData, int numTransformLevels, int batchSize, SupportedMetric metric, out PipelinePattern bestPipeline, ITerminator terminator, MacroUtils.TrainerKinds trainerKind) { Contracts.CheckValue(env, nameof(env)); env.CheckValue(trainData, nameof(trainData)); env.CheckValue(testData, nameof(testData)); int numOfRows = (int)(trainData.GetRowCount(false) ?? 1000); AutoMlMlState amls = new AutoMlMlState(env, metric, autoMlEngine, terminator, trainerKind, trainData, testData); bestPipeline = amls.InferPipelines(numTransformLevels, batchSize, numOfRows); return(amls); }
public virtual long? GetRowCount()
{
    _host.CheckValue(_sourceCtx, "_sourceCtx");
    if (!IsInitialized())
    {
        lock (_lock)
        {
            if (!IsInitialized())
            {
                DelayedInitialisationLockFree();
            }
        }
    }
    _host.CheckValue(_sourcePipe, "_sourcePipe");
    return _sourcePipe.GetRowCount();
}
public void TestDataFrameFromIDataView()
{
    DataFrame df = DataFrameTests.MakeDataFrameWithAllColumnTypes(10, withNulls: false);
    // Chars are returned as uint16 by IDataView, so we would end up comparing a
    // CharDataFrameColumn to a UInt16DataFrameColumn and failing the asserts.
    df.Columns.Remove("Char");
    IDataView dfAsIDataView = df;
    DataFrame newDf = dfAsIDataView.ToDataFrame();
    Assert.Equal(dfAsIDataView.GetRowCount(), newDf.Rows.Count);
    Assert.Equal(dfAsIDataView.Schema.Count, newDf.Columns.Count);
    for (int i = 0; i < df.Columns.Count; i++)
    {
        Assert.True(df.Columns[i].ElementwiseEquals(newDf.Columns[i]).All());
    }
}
protected override void VerifyView(IDataView view)
{
    Host.AssertValue(view);
    var rowCountNull = view.GetRowCount();
    // This came from a binary IDV, so it must have an actual row count.
    Host.Assert(rowCountNull.HasValue);
    long rowCount = rowCountNull.Value;
    // Either we are holding only the schema information and have no rows,
    // or we have the double-stored hybrid dataview with data stored both
    // row-wise and column-wise.
    Host.CheckDecode(rowCount == 0 || _parent._header.RowCount == rowCount);
    var schema = view.Schema;
    Host.CheckDecode(schema.ColumnCount == _parent._header.ColumnCount);
}
/// <summary> /// Handles specific cases DataViewUtils does not handle. /// </summary> /// <param name="view">IDataView</param> /// <param name="predicate">column selector (null for all)</param> /// <returns>number of rows</returns> public static long ComputeRowCount(IDataView view, Func <int, bool> predicate = null) { var res = view.GetRowCount(); if (res.HasValue) { return(res.Value); } long lres = 0; using (var cur = view.GetRowCursor(predicate == null ? i => false : predicate)) { while (cur.MoveNext()) { ++lres; } } return(lres); }
/// <summary> /// Handles specific cases DataViewUtils does not handle. /// </summary> /// <param name="view">IDataView</param> /// <param name="predicate">column selector (null for all)</param> /// <returns>number of rows</returns> public static long ComputeRowCount(IDataView view, IEnumerable <DataViewSchema.Column> columnsNeeded = null) { var res = view.GetRowCount(); if (res.HasValue) { return(res.Value); } long lres = 0; using (var cur = view.GetRowCursor(columnsNeeded)) { while (cur.MoveNext()) { ++lres; } } return(lres); }
private FastForestRegressionModelParameters FitModel(IEnumerable <IRunResult> previousRuns) { Single[] targets = new Single[previousRuns.Count()]; Single[][] features = new Single[previousRuns.Count()][]; int i = 0; foreach (RunResult r in previousRuns) { features[i] = SweeperProbabilityUtils.ParameterSetAsFloatArray(_host, _sweepParameters, r.ParameterSet, true); targets[i] = (float)r.MetricValue; i++; } ArrayDataViewBuilder dvBuilder = new ArrayDataViewBuilder(_host); dvBuilder.AddColumn(DefaultColumnNames.Label, NumberDataViewType.Single, targets); dvBuilder.AddColumn(DefaultColumnNames.Features, NumberDataViewType.Single, features); IDataView view = dvBuilder.GetDataView(); _host.Assert(view.GetRowCount() == targets.Length, "This data view will have as many rows as there have been evaluations"); using (IChannel ch = _host.Start("Single training")) { // Set relevant random forest arguments. // Train random forest. var trainer = new FastForestRegressionTrainer(_host, new FastForestRegressionTrainer.Options { FeatureFraction = _args.SplitRatio, NumberOfTrees = _args.NumOfTrees, MinimumExampleCountPerLeaf = _args.NMinForSplit, LabelColumnName = DefaultColumnNames.Label, FeatureColumnName = DefaultColumnNames.Features, }); var predictor = trainer.Fit(view); // Return random forest predictor. return(predictor.Model); } }
private FastForestRegressionPredictor FitModel(IEnumerable <IRunResult> previousRuns) { Single[] targets = new Single[previousRuns.Count()]; Single[][] features = new Single[previousRuns.Count()][]; int i = 0; foreach (RunResult r in previousRuns) { features[i] = SweeperProbabilityUtils.ParameterSetAsFloatArray(_host, _sweepParameters, r.ParameterSet, true); targets[i] = (Float)r.MetricValue; i++; } ArrayDataViewBuilder dvBuilder = new ArrayDataViewBuilder(_host); dvBuilder.AddColumn("Label", NumberType.Float, targets); dvBuilder.AddColumn("Features", NumberType.Float, features); IDataView view = dvBuilder.GetDataView(); _host.Assert(view.GetRowCount() == targets.Length, "This data view will have as many rows as there have been evaluations"); RoleMappedData data = TrainUtils.CreateExamples(view, "Label", "Features"); using (IChannel ch = _host.Start("Single training")) { // Set relevant random forest arguments. FastForestRegression.Arguments args = new FastForestRegression.Arguments(); args.FeatureFraction = _args.SplitRatio; args.NumTrees = _args.NumOfTrees; args.MinDocumentsInLeafs = _args.NMinForSplit; // Train random forest. FastForestRegression trainer = new FastForestRegression(_host, args); trainer.Train(data); FastForestRegressionPredictor predictor = trainer.CreatePredictor(); // Return random forest predictor. ch.Done(); return(predictor); } }
// Sometimes GetRowCount doesn't really return the number of rows in the associated IDataView.
// A more reliable solution is to truly iterate through all rows via a RowCursor.
private static long GetRowCount(IDataView inputData, params VectorWhiteningEstimator.ColumnOptions[] columns)
{
    long? rows = inputData.GetRowCount();
    if (rows != null)
    {
        return rows.GetValueOrDefault();
    }
    // Scan at most as many rows as any column will consume.
    int maxRows = columns.Max(i => i.MaximumNumberOfRows);
    long r = 0;
    using (var cursor = inputData.GetRowCursor())
    {
        while (r < maxRows && cursor.MoveNext())
        {
            r++;
        }
    }
    return r;
}
// Sometimes GetRowCount doesn't really return the number of rows in the associated IDataView.
// A more reliable solution is to truly iterate through all rows via a RowCursor.
private static long GetRowCount(IDataView inputData, params ColumnInfo[] columns)
{
    long? rows = inputData.GetRowCount();
    if (rows != null)
    {
        return rows.GetValueOrDefault();
    }
    // Scan at most as many rows as any column will consume; no columns need to be active.
    int maxRows = columns.Max(i => i.MaxRow);
    long r = 0;
    using (var cursor = inputData.GetRowCursor(col => false))
    {
        while (r < maxRows && cursor.MoveNext())
        {
            r++;
        }
    }
    return r;
}
private FastForestRegressionModelParameters FitModel(IEnumerable <IRunResult> previousRuns) { Single[] targets = new Single[previousRuns.Count()]; Single[][] features = new Single[previousRuns.Count()][]; int i = 0; foreach (RunResult r in previousRuns) { features[i] = SweeperProbabilityUtils.ParameterSetAsFloatArray(_sweepParameters, r.ParameterSet, true); targets[i] = (Float)r.MetricValue; i++; } ArrayDataViewBuilder dvBuilder = new ArrayDataViewBuilder(_context); dvBuilder.AddColumn(DefaultColumnNames.Label, NumberType.Float, targets); dvBuilder.AddColumn(DefaultColumnNames.Features, NumberType.Float, features); IDataView data = dvBuilder.GetDataView(); AutoMlUtils.Assert(data.GetRowCount() == targets.Length, "This data view will have as many rows as there have been evaluations"); // Set relevant random forest arguments. // Train random forest. var trainer = new FastForestRegression(_context, DefaultColumnNames.Label, DefaultColumnNames.Features, advancedSettings: s => { s.FeatureFraction = _args.SplitRatio; s.NumTrees = _args.NumOfTrees; s.MinDocumentsInLeafs = _args.NMinForSplit; }); var predictor = trainer.Train(data).Model; // Return random forest predictor. return(predictor); }
/// <summary> /// This builds the <see cref="TermMap"/> instances per column. /// </summary> private static TermMap[] Train(IHostEnvironment env, IChannel ch, ColInfo[] infos, ArgumentsBase args, ColumnBase[] column, IDataView trainingData) { Contracts.AssertValue(env); env.AssertValue(ch); ch.AssertValue(infos); ch.AssertValue(args); ch.AssertValue(column); ch.AssertValue(trainingData); if ((args.Term != null || !string.IsNullOrEmpty(args.Terms)) && (!string.IsNullOrWhiteSpace(args.DataFile) || args.Loader.IsGood() || !string.IsNullOrWhiteSpace(args.TermsColumn))) { ch.Warning("Explicit term list specified. Data file arguments will be ignored"); } if (!Enum.IsDefined(typeof(SortOrder), args.Sort)) { throw ch.ExceptUserArg(nameof(args.Sort), "Undefined sorting criteria '{0}' detected", args.Sort); } TermMap termsFromFile = null; var termMap = new TermMap[infos.Length]; int[] lims = new int[infos.Length]; int trainsNeeded = 0; HashSet <int> toTrain = null; for (int iinfo = 0; iinfo < infos.Length; iinfo++) { // First check whether we have a terms argument, and handle it appropriately. var terms = new DvText(column[iinfo].Terms); var termsArray = column[iinfo].Term; if (!terms.HasChars && termsArray == null) { terms = new DvText(args.Terms); termsArray = args.Term; } terms = terms.Trim(); if (terms.HasChars || (termsArray != null && termsArray.Length > 0)) { // We have terms! Pass it in. var sortOrder = column[iinfo].Sort ?? args.Sort; if (!Enum.IsDefined(typeof(SortOrder), sortOrder)) { throw ch.ExceptUserArg(nameof(args.Sort), "Undefined sorting criteria '{0}' detected for column '{1}'", sortOrder, infos[iinfo].Name); } var bldr = Builder.Create(infos[iinfo].TypeSrc, sortOrder); if (terms.HasChars) { bldr.ParseAddTermArg(ref terms, ch); } else { bldr.ParseAddTermArg(termsArray, ch); } termMap[iinfo] = bldr.Finish(); } else if (!string.IsNullOrWhiteSpace(args.DataFile)) { // First column using this file. if (termsFromFile == null) { var bldr = Builder.Create(infos[iinfo].TypeSrc, column[iinfo].Sort ?? args.Sort); termsFromFile = CreateFileTermMap(env, ch, args, bldr); } if (!termsFromFile.ItemType.Equals(infos[iinfo].TypeSrc.ItemType)) { // We have no current plans to support re-interpretation based on different column // type, not only because it's unclear what realistic customer use-cases for such // a complicated feature would be, and also because it's difficult to see how we // can logically reconcile "reinterpretation" for different types with the resulting // data view having an actual type. throw ch.ExceptUserArg(nameof(args.DataFile), "Data file terms loaded as type '{0}' but mismatches column '{1}' item type '{2}'", termsFromFile.ItemType, infos[iinfo].Name, infos[iinfo].TypeSrc.ItemType); } termMap[iinfo] = termsFromFile; } else { // Auto train this column. Leave the term map null for now, but set the lim appropriately. lims[iinfo] = column[iinfo].MaxNumTerms ?? args.MaxNumTerms; ch.CheckUserArg(lims[iinfo] > 0, nameof(Column.MaxNumTerms), "Must be positive"); Utils.Add(ref toTrain, infos[iinfo].Source); ++trainsNeeded; } } ch.Assert((Utils.Size(toTrain) == 0) == (trainsNeeded == 0)); ch.Assert(Utils.Size(toTrain) <= trainsNeeded); if (trainsNeeded > 0) { Trainer[] trainer = new Trainer[trainsNeeded]; int[] trainerInfo = new int[trainsNeeded]; // Open the cursor, then instantiate the trainers. 
int itrainer; using (var cursor = trainingData.GetRowCursor(toTrain.Contains)) using (var pch = env.StartProgressChannel("Building term dictionary")) { long rowCur = 0; double rowCount = trainingData.GetRowCount(true) ?? double.NaN; var header = new ProgressHeader(new[] { "Total Terms" }, new[] { "examples" }); itrainer = 0; for (int iinfo = 0; iinfo < infos.Length; ++iinfo) { if (termMap[iinfo] != null) { continue; } var bldr = Builder.Create(infos[iinfo].TypeSrc, column[iinfo].Sort ?? args.Sort); trainerInfo[itrainer] = iinfo; trainer[itrainer++] = Trainer.Create(cursor, infos[iinfo].Source, false, lims[iinfo], bldr); } ch.Assert(itrainer == trainer.Length); pch.SetHeader(header, e => { e.SetProgress(0, rowCur, rowCount); // Purely feedback for the user. That the other thread might be // working in the background is not a problem. e.SetMetric(0, trainer.Sum(t => t.Count)); }); // The [0,tmin) trainers are finished. int tmin = 0; // We might exit early if all trainers reach their maximum. while (tmin < trainer.Length && cursor.MoveNext()) { rowCur++; for (int t = tmin; t < trainer.Length; ++t) { if (!trainer[t].ProcessRow()) { Utils.Swap(ref trainerInfo[t], ref trainerInfo[tmin]); Utils.Swap(ref trainer[t], ref trainer[tmin++]); } } } pch.Checkpoint(trainer.Sum(t => t.Count), rowCur); } for (itrainer = 0; itrainer < trainer.Length; ++itrainer) { int iinfo = trainerInfo[itrainer]; ch.Assert(termMap[iinfo] == null); if (trainer[itrainer].Count == 0) { ch.Warning("Term map for output column '{0}' contains no entries.", infos[iinfo].Name); } termMap[iinfo] = trainer[itrainer].Finish(); // Allow the intermediate structures in the trainer and builder to be released as we iterate // over the columns, as the Finish operation can potentially result in the allocation of // additional structures. trainer[itrainer] = null; } ch.Assert(termMap.All(tm => tm != null)); ch.Assert(termMap.Zip(infos, (tm, info) => tm.ItemType.Equals(info.TypeSrc.ItemType)).All(x => x)); } return(termMap); }
public long? GetRowCount(bool lazy = true)
{
    return View.GetRowCount(lazy);
}
public long?GetRowCount() => _source.GetRowCount();
public long? GetRowCount()
{
    return View.GetRowCount();
}
public long? GetRowCount()
{
    _host.AssertValue(_input, "_input");
    return _mergedView.GetRowCount();
}
/// <summary> /// Returns the feature selection scores for each slot of each column. /// </summary> /// <param name="env">The host environment.</param> /// <param name="input">The input dataview.</param> /// <param name="columns">The columns for which to compute the feature selection scores.</param> /// <param name="colSizes">Outputs an array containing the vector sizes of the input columns</param> /// <returns>A list of scores.</returns> public static long[][] Train(IHostEnvironment env, IDataView input, string[] columns, out int[] colSizes) { Contracts.CheckValue(env, nameof(env)); env.CheckValue(input, nameof(input)); env.CheckParam(Utils.Size(columns) > 0, nameof(columns)); var schema = input.Schema; var size = columns.Length; var activeInput = new bool[schema.ColumnCount]; var colSrcs = new int[size]; var colTypes = new ColumnType[size]; colSizes = new int[size]; for (int i = 0; i < size; i++) { int colSrc; var colName = columns[i]; if (!schema.TryGetColumnIndex(colName, out colSrc)) { throw env.ExceptUserArg(nameof(CountFeatureSelectionTransform.Arguments.Column), "Source column '{0}' not found", colName); } var colType = schema.GetColumnType(colSrc); if (colType.IsVector && !colType.IsKnownSizeVector) { throw env.ExceptUserArg(nameof(CountFeatureSelectionTransform.Arguments.Column), "Variable length column '{0}' is not allowed", colName); } activeInput[colSrc] = true; colSrcs[i] = colSrc; colTypes[i] = colType; colSizes[i] = colType.ValueCount; } var aggregators = new CountAggregator[size]; long rowCur = 0; double rowCount = input.GetRowCount(true) ?? double.NaN; using (var pch = env.StartProgressChannel("Aggregating counts")) using (var cursor = input.GetRowCursor(col => activeInput[col])) { var header = new ProgressHeader(new[] { "rows" }); pch.SetHeader(header, e => { e.SetProgress(0, rowCur, rowCount); }); for (int i = 0; i < size; i++) { if (colTypes[i].IsVector) { aggregators[i] = GetVecAggregator(cursor, colTypes[i], colSrcs[i]); } else { aggregators[i] = GetOneAggregator(cursor, colTypes[i], colSrcs[i]); } } while (cursor.MoveNext()) { for (int i = 0; i < size; i++) { aggregators[i].ProcessValue(); } rowCur++; } pch.Checkpoint(rowCur); } return(aggregators.Select(a => a.Count).ToArray()); }
private SequencePool[] Train(Arguments args, IDataView trainingData, out double[][] invDocFreqs)
{
    // Contains the maximum number of grams to store in the dictionary, for each level of ngrams,
    // from 1 (in position 0) up to ngramLength (in position ngramLength-1).
    var lims = new int[Infos.Length][];
    for (int iinfo = 0; iinfo < Infos.Length; iinfo++)
    {
        var all = args.Column[iinfo].AllLengths ?? args.AllLengths;
        var ngramLength = _exes[iinfo].NgramLength;
        var maxNumTerms = Utils.Size(args.Column[iinfo].MaxNumTerms) > 0 ? args.Column[iinfo].MaxNumTerms : args.MaxNumTerms;
        if (!all)
        {
            Host.CheckUserArg(Utils.Size(maxNumTerms) == 0 ||
                Utils.Size(maxNumTerms) == 1 && maxNumTerms[0] > 0, nameof(args.MaxNumTerms));
            lims[iinfo] = new int[ngramLength];
            lims[iinfo][ngramLength - 1] = Utils.Size(maxNumTerms) == 0 ? Arguments.DefaultMaxTerms : maxNumTerms[0];
        }
        else
        {
            Host.CheckUserArg(Utils.Size(maxNumTerms) <= ngramLength, nameof(args.MaxNumTerms));
            Host.CheckUserArg(Utils.Size(maxNumTerms) == 0 ||
                maxNumTerms.All(i => i >= 0) && maxNumTerms[maxNumTerms.Length - 1] > 0, nameof(args.MaxNumTerms));
            var extend = Utils.Size(maxNumTerms) == 0 ? Arguments.DefaultMaxTerms : maxNumTerms[maxNumTerms.Length - 1];
            lims[iinfo] = Utils.BuildArray(ngramLength, i => i < Utils.Size(maxNumTerms) ? maxNumTerms[i] : extend);
        }
    }

    var helpers = new NgramBufferBuilder[Infos.Length];
    var getters = new ValueGetter<VBuffer<uint>>[Infos.Length];
    var src = new VBuffer<uint>[Infos.Length];

    // Keep track of how many grams are in the pool for each value of n. Position
    // i in counts counts how many (i+1)-grams are in the pool for column iinfo.
    var counts = new int[Infos.Length][];
    var ngramMaps = new SequencePool[Infos.Length];

    bool[] activeInput = new bool[trainingData.Schema.ColumnCount];
    foreach (var info in Infos)
    {
        activeInput[info.Source] = true;
    }
    using (var cursor = trainingData.GetRowCursor(col => activeInput[col]))
    using (var pch = Host.StartProgressChannel("Building n-gram dictionary"))
    {
        for (int iinfo = 0; iinfo < Infos.Length; iinfo++)
        {
            Host.Assert(Infos[iinfo].TypeSrc.IsVector && Infos[iinfo].TypeSrc.ItemType.IsKey);
            var ngramLength = _exes[iinfo].NgramLength;
            var skipLength = _exes[iinfo].SkipLength;

            getters[iinfo] = RowCursorUtils.GetVecGetterAs<uint>(NumberType.U4, cursor, Infos[iinfo].Source);
            src[iinfo] = default(VBuffer<uint>);
            counts[iinfo] = new int[ngramLength];
            ngramMaps[iinfo] = new SequencePool();

            // Note: GetNgramIdFinderAdd will control how many ngrams of a specific length will
            // be added (using lims[iinfo]), therefore we set slotLim to the maximum.
            helpers[iinfo] = new NgramBufferBuilder(ngramLength, skipLength, Utils.ArrayMaxSize,
                GetNgramIdFinderAdd(counts[iinfo], lims[iinfo], ngramMaps[iinfo], _exes[iinfo].RequireIdf(), Host));
        }

        int cInfoFull = 0;
        bool[] infoFull = new bool[Infos.Length];
        invDocFreqs = new double[Infos.Length][];

        long totalDocs = 0;
        Double rowCount = trainingData.GetRowCount() ?? Double.NaN;
        var buffers = new VBuffer<float>[Infos.Length];
        pch.SetHeader(new ProgressHeader(new[] { "Total n-grams" }, new[] { "documents" }),
            e => e.SetProgress(0, totalDocs, rowCount));
        while (cInfoFull < Infos.Length && cursor.MoveNext())
        {
            totalDocs++;
            for (int iinfo = 0; iinfo < Infos.Length; iinfo++)
            {
                getters[iinfo](ref src[iinfo]);
                var keyCount = (uint)Infos[iinfo].TypeSrc.ItemType.KeyCount;
                if (keyCount == 0)
                {
                    keyCount = uint.MaxValue;
                }
                if (!infoFull[iinfo])
                {
                    if (_exes[iinfo].RequireIdf())
                    {
                        helpers[iinfo].Reset();
                    }
                    helpers[iinfo].AddNgrams(in src[iinfo], 0, keyCount);
                    if (_exes[iinfo].RequireIdf())
                    {
                        int totalNgrams = counts[iinfo].Sum();
                        Utils.EnsureSize(ref invDocFreqs[iinfo], totalNgrams);
                        helpers[iinfo].GetResult(ref buffers[iinfo]);
                        foreach (var pair in buffers[iinfo].Items())
                        {
                            if (pair.Value >= 1)
                            {
                                invDocFreqs[iinfo][pair.Key] += 1;
                            }
                        }
                    }
                }
                AssertValid(counts[iinfo], lims[iinfo], ngramMaps[iinfo]);
            }
        }
        pch.Checkpoint(counts.Sum(c => c.Sum()), totalDocs);
        for (int iinfo = 0; iinfo < Infos.Length; iinfo++)
        {
            for (int i = 0; i < Utils.Size(invDocFreqs[iinfo]); i++)
            {
                if (invDocFreqs[iinfo][i] != 0)
                {
                    invDocFreqs[iinfo][i] = Math.Log(totalDocs / invDocFreqs[iinfo][i]);
                }
            }
        }
        for (int iinfo = 0; iinfo < Infos.Length; iinfo++)
        {
            AssertValid(counts[iinfo], lims[iinfo], ngramMaps[iinfo]);
            int ngramLength = _exes[iinfo].NgramLength;
            for (int i = 0; i < ngramLength; i++)
            {
                _exes[iinfo].NonEmptyLevels[i] = counts[iinfo][i] > 0;
            }
        }
        return ngramMaps;
    }
}
private void WriteDataCore(IChannel ch, TextWriter writer, IDataView data,
    out string argsLoader, out long count, out int min, out int max, params int[] cols)
{
    _host.AssertValue(ch);
    ch.AssertValue(writer);
    ch.AssertValue(data);
    ch.AssertNonEmpty(cols);

    // Determine the active columns and whether there is header information.
    bool[] active = new bool[data.Schema.ColumnCount];
    for (int i = 0; i < cols.Length; i++)
    {
        ch.Check(0 <= cols[i] && cols[i] < active.Length);
        ch.Check(data.Schema.GetColumnType(cols[i]).ItemType.RawKind != 0);
        active[cols[i]] = true;
    }

    bool hasHeader = false;
    if (_outputHeader)
    {
        for (int i = 0; i < cols.Length; i++)
        {
            if (hasHeader)
            {
                continue;
            }
            var type = data.Schema.GetColumnType(cols[i]);
            if (!type.IsVector)
            {
                hasHeader = true;
                continue;
            }
            if (!type.IsKnownSizeVector)
            {
                continue;
            }
            var typeNames = data.Schema.GetMetadataTypeOrNull(MetadataUtils.Kinds.SlotNames, cols[i]);
            if (typeNames != null && typeNames.VectorSize == type.VectorSize && typeNames.ItemType.IsText)
            {
                hasHeader = true;
            }
        }
    }

    using (var cursor = data.GetRowCursor(i => active[i]))
    {
        var pipes = new ValueWriter[cols.Length];
        for (int i = 0; i < cols.Length; i++)
        {
            pipes[i] = ValueWriter.Create(cursor, cols[i], _sepChar);
        }

        // REVIEW: This should be outside the cursor creation.
        string header = CreateLoaderArguments(data.Schema, pipes, hasHeader, ch);
        argsLoader = header;
        if (_outputSchema)
        {
            WriteSchemaAsComment(writer, header);
        }

        double rowCount = data.GetRowCount(true) ?? double.NaN;
        using (var pch = !_silent ? _host.StartProgressChannel("TextSaver: saving data") : null)
        {
            long stateCount = 0;
            var state = new State(this, writer, pipes, hasHeader);
            if (pch != null)
            {
                pch.SetHeader(new ProgressHeader(new[] { "rows" }), e => e.SetProgress(0, stateCount, rowCount));
            }
            state.Run(cursor, ref stateCount, out min, out max);
            count = stateCount;
            if (pch != null)
            {
                pch.Checkpoint(stateCount);
            }
        }
    }
}
public long? GetRowCount(bool lazy = true)
{
    return _input.GetRowCount(lazy);
}
public long? GetRowCount()
{
    _host.AssertValue(_source, "_source");
    return _source.GetRowCount();
}
public long? GetRowCount()
{
    return _source.GetRowCount();
}
public long? GetRowCount(bool lazy = true)
{
    return _source.GetRowCount(lazy);
}
private static unsafe void SendViewToNative(IChannel ch, EnvironmentBlock* penv, IDataView view,
    Dictionary<string, ColumnMetadataInfo> infos = null)
{
    Contracts.AssertValue(ch);
    Contracts.Assert(penv != null);
    Contracts.AssertValue(view);
    Contracts.AssertValueOrNull(infos);
    if (penv->dataSink == null)
    {
        // Environment doesn't want any data!
        return;
    }

    var dataSink = MarshalDelegate<DataSink>(penv->dataSink);

    var schema = view.Schema;
    var colIndices = new List<int>();
    var kindList = new List<DataKind>();
    var keyCardList = new List<int>();
    var nameUtf8Bytes = new List<Byte>();
    var nameIndices = new List<int>();

    var expandCols = new HashSet<int>();
    var allNames = new HashSet<string>();

    for (int col = 0; col < schema.Count; col++)
    {
        if (schema[col].IsHidden)
        {
            continue;
        }

        var fullType = schema[col].Type;
        var itemType = fullType.ItemType;
        var name = schema[col].Name;

        DataKind kind = itemType.RawKind;
        int keyCard;

        if (fullType.ValueCount == 0)
        {
            throw ch.ExceptNotSupp("Column has variable length vector: " +
                name + ". Not supported in python. Drop column before sending to Python");
        }

        if (itemType.IsKey)
        {
            // Key types are returned as their signed counterparts in Python, so that -1 can be the missing value.
            // For U1 and U2 kinds, we convert to a larger type to prevent overflow. For U4 and U8 kinds, we convert
            // to I4 if the key count is known (since KeyCount is an I4), and to I8 otherwise.
            switch (kind)
            {
            case DataKind.U1:
                kind = DataKind.I2;
                break;
            case DataKind.U2:
                kind = DataKind.I4;
                break;
            case DataKind.U4:
                // We convert known-cardinality U4 key types to I4.
                kind = itemType.KeyCount > 0 ? DataKind.I4 : DataKind.I8;
                break;
            case DataKind.U8:
                // We convert known-cardinality U8 key types to I4.
                kind = itemType.KeyCount > 0 ? DataKind.I4 : DataKind.I8;
                break;
            }

            keyCard = itemType.KeyCount;
            if (!schema[col].HasKeyValues(keyCard))
            {
                keyCard = -1;
            }
        }
        else if (itemType.IsStandardScalar())
        {
            switch (itemType.RawKind)
            {
            default:
                throw Contracts.Except("Data type {0} not handled", itemType.RawKind);
            case DataKind.I1:
            case DataKind.I2:
            case DataKind.I4:
            case DataKind.I8:
            case DataKind.U1:
            case DataKind.U2:
            case DataKind.U4:
            case DataKind.U8:
            case DataKind.R4:
            case DataKind.R8:
            case DataKind.BL:
            case DataKind.TX:
                break;
            }
            keyCard = -1;
        }
        else
        {
            throw Contracts.Except("Data type {0} not handled", itemType.RawKind);
        }

        int nSlots;
        ColumnMetadataInfo info;
        if (infos != null && infos.TryGetValue(name, out info) && info.Expand)
        {
            expandCols.Add(col);
            Contracts.Assert(fullType.IsKnownSizeVector);
            nSlots = fullType.VectorSize;
            if (info.SlotNames != null)
            {
                Contracts.Assert(info.SlotNames.Length == nSlots);
                for (int i = 0; i < nSlots; i++)
                {
                    AddUniqueName(info.SlotNames[i], allNames, nameIndices, nameUtf8Bytes);
                }
            }
            else if (schema[col].HasSlotNames(nSlots))
            {
                var romNames = default(VBuffer<ReadOnlyMemory<char>>);
                schema[col].Metadata.GetValue(MetadataUtils.Kinds.SlotNames, ref romNames);
                foreach (var kvp in romNames.Items(true))
                {
                    // REVIEW: Add the proper number of zeros to the slot index to make them sort in the right order.
                    var slotName = name + "." +
                        (!kvp.Value.IsEmpty ? kvp.Value.ToString() : kvp.Key.ToString(CultureInfo.InvariantCulture));
                    AddUniqueName(slotName, allNames, nameIndices, nameUtf8Bytes);
                }
            }
            else
            {
                for (int i = 0; i < nSlots; i++)
                {
                    AddUniqueName(name + "." + i, allNames, nameIndices, nameUtf8Bytes);
                }
            }
        }
        else
        {
            nSlots = 1;
            AddUniqueName(name, allNames, nameIndices, nameUtf8Bytes);
        }

        colIndices.Add(col);
        for (int i = 0; i < nSlots; i++)
        {
            kindList.Add(kind);
            keyCardList.Add(keyCard);
        }
    }

    ch.Assert(allNames.Count == kindList.Count);
    ch.Assert(allNames.Count == keyCardList.Count);
    ch.Assert(allNames.Count == nameIndices.Count);

    var kinds = kindList.ToArray();
    var keyCards = keyCardList.ToArray();
    var nameBytes = nameUtf8Bytes.ToArray();
    var names = new byte*[allNames.Count];

    fixed (DataKind* prgkind = kinds)
    fixed (byte* prgbNames = nameBytes)
    fixed (byte** prgname = names)
    fixed (int* prgkeyCard = keyCards)
    {
        for (int iid = 0; iid < names.Length; iid++)
        {
            names[iid] = prgbNames + nameIndices[iid];
        }

        DataViewBlock block;
        block.ccol = allNames.Count;
        block.crow = view.GetRowCount() ?? 0;
        block.names = (sbyte**)prgname;
        block.kinds = prgkind;
        block.keyCards = prgkeyCard;

        dataSink(penv, &block, out var setters, out var keyValueSetter);

        if (setters == null)
        {
            // REVIEW: What should we do?
            return;
        }
        ch.Assert(keyValueSetter != null);
        var kvSet = MarshalDelegate<KeyValueSetter>(keyValueSetter);
        using (var cursor = view.GetRowCursor(colIndices.Contains))
        {
            var fillers = new BufferFillerBase[colIndices.Count];
            var pyColumn = 0;
            var keyIndex = 0;
            for (int i = 0; i < colIndices.Count; i++)
            {
                var type = schema[colIndices[i]].Type;
                if (type.ItemType.IsKey && schema[colIndices[i]].HasKeyValues(type.ItemType.KeyCount))
                {
                    ch.Assert(schema[colIndices[i]].HasKeyValues(type.ItemType.KeyCount));
                    var keyValues = default(VBuffer<ReadOnlyMemory<char>>);
                    schema[colIndices[i]].Metadata.GetValue(MetadataUtils.Kinds.KeyValues, ref keyValues);
                    for (int slot = 0; slot < type.ValueCount; slot++)
                    {
                        foreach (var kvp in keyValues.Items())
                        {
                            if (kvp.Value.IsEmpty)
                            {
                                kvSet(penv, keyIndex, kvp.Key, null, 0);
                            }
                            else
                            {
                                byte[] bt = Encoding.UTF8.GetBytes(kvp.Value.ToString());
                                fixed (byte* pt = bt)
                                    kvSet(penv, keyIndex, kvp.Key, (sbyte*)pt, bt.Length);
                            }
                        }
                        keyIndex++;
                    }
                }
                fillers[i] = BufferFillerBase.Create(penv, cursor, pyColumn, colIndices[i], kinds[pyColumn], type, setters[pyColumn]);
                pyColumn += type.IsVector ? type.VectorSize : 1;
            }
            for (int crow = 0; ; crow++)
            {
                // Advance to the next row.
                if (!cursor.MoveNext())
                {
                    break;
                }

                // Fill values for the current row.
                for (int i = 0; i < fillers.Length; i++)
                {
                    fillers[i].Set();
                }
            }
        }
    }
}
public long? GetRowCount()
{
    return _input.GetRowCount();
}
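Many of the one-line implementations above are pass-through views whose GetRowCount simply forwards to a wrapped source. A self-contained sketch of that pattern against ML.NET's current IDataView interface; the PassThroughDataView class itself is hypothetical:

using System;
using System.Collections.Generic;
using Microsoft.ML;

// Hypothetical pass-through view: every member, including GetRowCount,
// delegates to the wrapped source.
public sealed class PassThroughDataView : IDataView
{
    private readonly IDataView _source;

    public PassThroughDataView(IDataView source)
        => _source = source ?? throw new ArgumentNullException(nameof(source));

    public bool CanShuffle => _source.CanShuffle;

    public DataViewSchema Schema => _source.Schema;

    // The delegation pattern shown throughout the examples above; the result
    // may be null when the source cannot cheaply determine its row count.
    public long? GetRowCount() => _source.GetRowCount();

    public DataViewRowCursor GetRowCursor(IEnumerable<DataViewSchema.Column> columnsNeeded, Random rand = null)
        => _source.GetRowCursor(columnsNeeded, rand);

    public DataViewRowCursor[] GetRowCursorSet(IEnumerable<DataViewSchema.Column> columnsNeeded, int n, Random rand = null)
        => _source.GetRowCursorSet(columnsNeeded, n, rand);
}

A wrapper that changes row membership (filtering, sampling, appending) should instead compute its own count, or return null when the count is unknown, which is why callers such as ComputeRowCount above always handle the null case.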