/// <summary>
/// Builds a <see cref="StaticSchemaShape"/> from the names and member types enumerated for a
/// parameter, typically one taken from a user specified delegate.
/// </summary>
/// <typeparam name="TShape">The static shape type.</typeparam>
/// <param name="info">The method parameter to inspect; its type is expected to be
/// <typeparamref name="TShape"/>.</param>
/// <returns>A new instance holding the enumerated names and member types.</returns>
public static StaticSchemaShape Make<TShape>(ParameterInfo info)
{
    Contracts.AssertValue(info);
    return new StaticSchemaShape(StaticPipeInternalUtils.GetNamesTypes<TShape, PipelineColumn>(info));
}
/// <summary>
/// Appends to this estimator the estimator obtained by analyzing a delegate that maps the current
/// output shape to a new output shape.
/// </summary>
/// <typeparam name="TNewOutShape">The shape type returned by <paramref name="mapper"/>.</typeparam>
/// <param name="mapper">The delegate describing the mapping; must not be null.</param>
/// <returns>A new estimator with the same input shape as this one and the new output shape.</returns>
public Estimator<TInShape, TNewOutShape, ITransformer> Append<[IsShape] TNewOutShape>(Func<TOutShape, TNewOutShape> mapper)
{
    Contracts.CheckValue(mapper, nameof(mapper));

    using (var channel = Env.Start(nameof(Append)))
    {
        var mapperMethod = mapper.Method;

        // Build a placeholder instance of the current output shape to run the delegate against.
        var analysisInput = StaticPipeInternalUtils.MakeAnalysisInstance<TOutShape>(out var fakeReconciler);
        KeyValuePair<string, PipelineColumn>[] namedInputs =
            StaticPipeInternalUtils.GetNamesValues(analysisInput, mapperMethod.GetParameters()[0]);

        // At the outset, the only columns carrying names are the inputs.
        var namesByInputColumn = new Dictionary<PipelineColumn, string>();
        foreach (var pair in namedInputs)
        {
            namesByInputColumn[pair.Value] = pair.Key;
        }

        // Yields the name recorded for an input column, or null when the column has none.
        string LookupInputName(PipelineColumn column)
            => namesByInputColumn.TryGetValue(column, out var name) ? name : null;

        var readerEst = StaticPipeUtils.GeneralFunctionAnalyzer(
            Env, channel, analysisInput, fakeReconciler, mapper, out var tailEstimator, LookupInputName);

        // The analysis is asserted to return null here; the estimator arrives through the out parameter.
        channel.Assert(readerEst == null);
        channel.AssertValue(tailEstimator);

        var appended = AsDynamic.Append(tailEstimator);
        var newOutShape = StaticSchemaShape.Make<TNewOutShape>(mapperMethod.ReturnParameter);
        return new Estimator<TInShape, TNewOutShape, ITransformer>(Env, appended, _inShape, newOutShape);
    }
}
// Invokes a user delegate on an analysis input, walks the dependency graph of the columns it returns,
// and reconciles that graph into estimators.
//
// Outline:
//   1. Call mapper(input) and collect the named output columns.
//   2. Walk from the outputs through each column's Dependencies, recording who depends on whom.
//   3. Treat the columns whose ReconcilerObj is baseReconciler as base inputs, seed their names from
//      inputNameFunction, and rename any input whose name collides with a differently-valued output.
//   4. Reconcile the base inputs through baseReconciler, then repeatedly reconcile groups of
//      dependency-free columns that share a reconciler until nothing is left.
//   5. Copy columns so that each output ends up under the name requested by the delegate.
//
// Parameters:
//   env               - Environment handed to the reconcilers and the column copying estimators.
//   ch                - Channel used for checks, assertions, traces and exceptions.
//   input             - The instance passed to mapper.
//   baseReconciler    - Reconciler identifying the base input columns; its Reconcile result is the
//                       starting point of the returned value.
//   mapper            - The delegate to analyze; must not be null.
//   estimator         - On return, the chain of copy estimators and per-group estimators, or null
//                       when none were needed.
//   inputNameFunction - Returns the already assigned name of a base input column, or null if it has none.
//
// Returns the value produced by baseReconciler.Reconcile with each subsequently reconciled estimator
// appended; null if that Reconcile call returned null.
//
// Fails through ch.CheckParam when the delegate yields a null column or a base input has dependencies,
// and through ch.Except when columns from another source are detected or dependencies stay unresolved.
GeneralFunctionAnalyzer<TIn, TDelegateInput, TOutShape>(
    IHostEnvironment env,
    IChannel ch,
    TDelegateInput input,
    LoaderReconciler<TIn> baseReconciler,
    Func<TDelegateInput, TOutShape> mapper,
    out IEstimator<ITransformer> estimator,
    Func<PipelineColumn, string> inputNameFunction)
{
    Contracts.CheckValue(mapper, nameof(mapper));

    var method = mapper.Method;
    var output = mapper(input);

    KeyValuePair<string, PipelineColumn>[] outPairs = StaticPipeInternalUtils.GetNamesValues(output, method.ReturnParameter);

    // Map where the key depends on the set of things in the value. The value contains the yet unresolved dependencies.
    var keyDependsOn = new Dictionary<PipelineColumn, HashSet<PipelineColumn>>();
    // Map where the set of things in the value depend on the key.
    var dependsOnKey = new Dictionary<PipelineColumn, HashSet<PipelineColumn>>();
    // The columns detected with zero dependencies. Kept as a list: the grouping below takes the
    // first group in list order.
    var zeroDependencies = new List<PipelineColumn>();

    // First we build up the two structures above, using a queue and visiting from the outputs up.
    var toVisit = new Queue<PipelineColumn>(outPairs.Select(p => p.Value));
    while (toVisit.Count > 0)
    {
        var col = toVisit.Dequeue();
        ch.CheckParam(col != null, nameof(mapper), "The delegate seems to have null columns returned somewhere in the pipe.");
        if (keyDependsOn.ContainsKey(col))
        {
            continue; // Already visited.
        }

        var dependsOn = new HashSet<PipelineColumn>();
        foreach (var dep in col.Dependencies ?? Enumerable.Empty<PipelineColumn>())
        {
            dependsOn.Add(dep);
            if (!dependsOnKey.TryGetValue(dep, out var dependsOnDep))
            {
                // First time this dependency is seen: register it and schedule it for a visit.
                dependsOnKey[dep] = dependsOnDep = new HashSet<PipelineColumn>();
                toVisit.Enqueue(dep);
            }
            dependsOnDep.Add(col);
        }

        keyDependsOn[col] = dependsOn;
        if (dependsOn.Count == 0)
        {
            zeroDependencies.Add(col);
        }
    }

    // Get the base input columns.
    var baseInputs = keyDependsOn.Select(p => p.Key).Where(col => col.ReconcilerObj == baseReconciler).ToArray();

    // The columns that utilize the base reconciler should have no dependencies. This could only happen if
    // the caller of this function has introduced a situation whereby they are claiming they can reconcile
    // to a data-loader object but still have input data dependencies, which does not make sense and
    // indicates that there is a bug in that component code. Unfortunately we can only detect that condition,
    // not determine exactly how it arose, but we can still do so to indicate to the user that there is a
    // problem somewhere in the stack.
    ch.CheckParam(baseInputs.All(col => keyDependsOn[col].Count == 0), nameof(input),
        "Bug detected where column producing object was yielding columns with dependencies.");

    // This holds the mappings of columns to names and back. Note that while the same column could be used on
    // the *output*, for example, you could hypothetically have `(a: r.Foo, b: r.Foo)`, we treat that as the last thing
    // that is done.
    var nameMap = new BidirectionalDictionary<string, PipelineColumn>();

    // Check to see if we have any set of initial names. This is important in the case where we are mapping
    // in an input data view.
    foreach (var col in baseInputs)
    {
        string inputName = inputNameFunction(col);
        if (inputName != null)
        {
            ch.Assert(!nameMap.ContainsKey(col));
            ch.Assert(!nameMap.ContainsKey(inputName));
            nameMap[col] = inputName;

            ch.Trace($"Using input with name {inputName}.");
        }
    }

    estimator = null;
    var toCopy = new List<(string dst, string src)>();

    int tempNum = 0;
    // For all outputs, get potential name collisions with used inputs. Resolve by assigning the input a temporary name.
    foreach (var p in outPairs)
    {
        // If the name for the output is already used by one of the inputs, and this output column does not
        // happen to have the same name, then we need to rename that input to keep it available.
        if (nameMap.TryGetValue(p.Key, out var inputCol) && p.Value != inputCol)
        {
            ch.Assert(baseInputs.Contains(inputCol));
            string tempName = $"#Temp_{tempNum++}";
            ch.Trace($"Input/output name collision: Renaming '{p.Key}' to '{tempName}'.");
            toCopy.Add((tempName, p.Key));
            nameMap[tempName] = nameMap[p.Key];
            ch.Assert(!nameMap.ContainsKey(p.Key));
        }

        // If we already have a name for this output column, maybe it is used elsewhere. (This can happen when
        // the only thing done with an input is we rename it, or output it twice, or something like this.) In
        // this case it is most appropriate to delay renaming till after all other processing has been done in
        // that case. But otherwise we may as well just take the name.
        if (!nameMap.ContainsKey(p.Value))
        {
            nameMap[p.Key] = p.Value;
        }
    }

    // If any renamings were necessary, create the CopyColumns estimator.
    if (toCopy.Count > 0)
    {
        estimator = new ColumnCopyingEstimator(env, toCopy.ToArray());
    }

    // First clear the inputs from zero-dependencies yet to be resolved.
    foreach (var col in baseInputs)
    {
        ch.Assert(zeroDependencies.Contains(col));
        ch.Assert(col.ReconcilerObj == baseReconciler);

        zeroDependencies.Remove(col); // Make more efficient...
        if (!dependsOnKey.TryGetValue(col, out var depends))
        {
            continue;
        }

        // If any of these base inputs do not have names because, for example, they do not directly appear
        // in the outputs and otherwise do not have names, assign them a name.
        // NOTE(review): every other temporary name in this method uses the "#Temp_" prefix; confirm
        // whether the missing '#' here is intentional.
        if (!nameMap.ContainsKey(col))
        {
            nameMap[col] = $"Temp_{tempNum++}";
        }

        foreach (var depender in depends)
        {
            var dependencies = keyDependsOn[depender];
            ch.Assert(dependencies.Contains(col));
            dependencies.Remove(col);
            if (dependencies.Count == 0)
            {
                zeroDependencies.Add(depender);
            }
        }
        dependsOnKey.Remove(col);
    }

    // Call the reconciler to get the base loader estimator.
    var loaderEstimator = baseReconciler.Reconcile(env, baseInputs, nameMap.AsOther(baseInputs));
    ch.AssertValueOrNull(loaderEstimator);

    // Next we iteratively find those columns with zero dependencies, "create" them, and if anything depends on
    // these add them to the collection of zero dependencies, etc. etc.
    while (zeroDependencies.Count > 0)
    {
        // All columns with the same reconciler can be transformed together.

        // Note that the following policy of just taking the first group is not optimal. So for example, we
        // could have three columns, (a, b, c). If we had the output (a.X(), b.X() c.Y().X()), then maybe we'd
        // reconcile a.X() and b.X() together, then reconcile c.Y(), then reconcile c.Y().X() alone. Whereas, we
        // could have reconciled c.Y() first, then reconciled a.X(), b.X(), and c.Y().X() together.
        var group = zeroDependencies.GroupBy(p => p.ReconcilerObj).First();

        // Beyond that first group that *might* be a data loader reconciler, all subsequent operations will
        // be on where the data is already loaded and so accept data as an input, that is, they should produce
        // an estimator. If this is not the case something seriously wonky is going on, most probably that the
        // user tried to use a column from another source. If this is detected we can produce a sensible error
        // message to tell them not to do this.
        if (!(group.Key is EstimatorReconciler rec))
        {
            throw ch.Except("Columns from multiple sources were detected. " +
                "Did the caller use a " + nameof(PipelineColumn) + " from another delegate?");
        }

        PipelineColumn[] cols = group.ToArray();

        // All dependencies should, by this time, have names. Dependencies may be null (it is guarded
        // everywhere else in this method), so guard it here too or the assertion itself would throw.
        ch.Assert(cols.SelectMany(c => c.Dependencies ?? Enumerable.Empty<PipelineColumn>())
            .All(dep => nameMap.ContainsKey(dep)));

        foreach (var newCol in cols)
        {
            if (!nameMap.ContainsKey(newCol))
            {
                nameMap[newCol] = $"#Temp_{tempNum++}";
            }
        }

        var localInputNames = nameMap.AsOther(
            cols.SelectMany(c => c.Dependencies ?? Enumerable.Empty<PipelineColumn>()));
        var localOutputNames = nameMap.AsOther(cols);
        // Every name currently in the map except the ones this group is about to produce.
        var usedNames = new HashSet<string>(nameMap.Keys1.Except(localOutputNames.Values));

        var localEstimator = rec.Reconcile(env, cols, localInputNames, localOutputNames, usedNames);
        // The loader chain only grows when a base loader estimator exists; the plain estimator chain
        // starts with this estimator when nothing precedes it.
        loaderEstimator = loaderEstimator?.Append(localEstimator);
        estimator = estimator?.Append(localEstimator) ?? localEstimator;

        foreach (var newCol in cols)
        {
            zeroDependencies.Remove(newCol); // Make more efficient!!

            // Finally, we find all columns that depend on this one. If this happened to be the last pending
            // dependency, then we add it to the list.
            if (dependsOnKey.TryGetValue(newCol, out var depends))
            {
                foreach (var depender in depends)
                {
                    var dependencies = keyDependsOn[depender];
                    ch.Assert(dependencies.Contains(newCol));
                    dependencies.Remove(newCol);
                    if (dependencies.Count == 0)
                    {
                        zeroDependencies.Add(depender);
                    }
                }
                dependsOnKey.Remove(newCol);
            }
        }
    }

    if (keyDependsOn.Any(p => p.Value.Count > 0))
    {
        // This might happen if the user does something incredibly strange, like, say, take some prior
        // lambda, assign a column to a local variable, then re-use it downstream in a different lambda.
        // The user would have to go to some extraordinary effort to do that, but nonetheless we want to
        // fail with a semi-sensible error message.
        throw ch.Except("There were some leftover columns with unresolved dependencies. " +
            "Did the caller use a " + nameof(PipelineColumn) + " from another delegate?");
    }

    // Now do the final renaming, if any is necessary.
    toCopy.Clear();
    foreach (var p in outPairs)
    {
        // Any output whose working name differs from the name requested by the delegate is copied
        // from its working name (source) to the requested name (destination).
        ch.Assert(nameMap.ContainsKey(p.Value));
        string currentName = nameMap[p.Value];
        if (currentName != p.Key)
        {
            ch.Trace($"Will copy '{currentName}' to '{p.Key}'");
            toCopy.Add((p.Key, currentName));
        }
    }

    // If any final renamings were necessary, insert the appropriate CopyColumns transform.
    if (toCopy.Count > 0)
    {
        var copyEstimator = new ColumnCopyingEstimator(env, toCopy.ToArray());
        estimator = estimator == null ? copyEstimator : estimator.Append(copyEstimator);
    }

    // NOTE(review): this trace names LoaderEstimatorAnalyzerHelper rather than this method; confirm
    // whether that is intended.
    ch.Trace($"Exiting {nameof(LoaderEstimatorAnalyzerHelper)}");

    return loaderEstimator;
}