/// <summary>
/// Fills <paramref name="dst"/> with one name per emitted node of every tree in the ensemble,
/// formatted as "Tree{t:000}Node{l:000}". Each tree contributes NumLeaves - 1 names.
/// </summary>
/// <param name="col">Column index required by the getter signature; not read by this method.</param>
/// <param name="dst">Receives the names. Its existing arrays are reused when large enough.</param>
private void GetPathSlotNames(int col, ref VBuffer<DvText> dst)
{
    var numTrees = _ensemble.NumTrees;
    // Every tree emits (NumLeaves - 1) names below, hence total leaves minus the number of trees.
    var totalNodeCount = _totalLeafCount - numTrees;
    var names = dst.Values;
    if (Utils.Size(names) < totalNodeCount)
    {
        names = new DvText[totalNodeCount];
    }

    int i = 0;
    int t = 0;
    foreach (var tree in _ensemble.GetTrees())
    {
        // Read the leaf count once per tree instead of on every loop iteration.
        var numNodes = tree.NumLeaves - 1;
        for (int l = 0; l < numNodes; l++)
        {
            names[i++] = new DvText(string.Format("Tree{0:000}Node{1:000}", t, l));
        }
        t++;
    }

    _host.Assert(i == totalNodeCount);
    dst = new VBuffer<DvText>(totalNodeCount, names, dst.Indices);
}
/// <summary>
/// Sets up the label and score getters for the upcoming pass and, when this is the main pass,
/// the getter that supplies a name for each example.
/// </summary>
/// <param name="row">The row the getters are bound to.</param>
/// <param name="schema">Role-mapped schema providing the label column and the unique score column.</param>
public override void InitializeNextPass(IRow row, RoleMappedSchema schema)
{
    // Non-streaming mode permits PassNum 0 or 1 here; streaming mode only PassNum 0.
    Host.Assert(!_streaming && PassNum < 2 || PassNum < 1);
    Host.AssertValue(schema.Label);

    var scoreColumn = schema.GetUniqueColumn(MetadataUtils.Const.ScoreValueKind.Score);
    _labelGetter = RowCursorUtils.GetLabelGetter(row, schema.Label.Index);
    _scoreGetter = row.GetGetter<float>(scoreColumn.Index);
    Host.AssertValue(_labelGetter);
    Host.AssertValue(_scoreGetter);

    if (!IsMainPass())
    {
        return;
    }

    Host.Assert(_topExamples.Count == 0);
    if (_nameIndex >= 0)
    {
        _nameGetter = row.GetGetter<DvText>(_nameIndex);
        return;
    }

    // No name column was specified: name each example by a running counter starting at zero.
    int nextRowId = 0;
    _nameGetter = (ref DvText dst) => dst = new DvText((nextRowId++).ToString());
}
/// <summary>
/// Parses a slot map string into groups of source slot indices. Groups are separated by ';'
/// and the indices within a group by ','.
/// </summary>
/// <param name="slotMapString">The textual slot map, e.g. "0,1;2".</param>
/// <param name="srcSlotCount">Number of source slots; every index must lie in [0, srcSlotCount).</param>
/// <returns>One array of slot indices per group, in the order the groups appear.</returns>
/// <remarks>
/// Throws (via Host.Except) when an index is not an integer in range or when a group repeats an index.
/// </remarks>
private int[][] CompileSlotMap(string slotMapString, int srcSlotCount)
{
    var parts = new DvText(slotMapString).Split(new[] { ';' }).ToArray();
    var slotMap = new int[parts.Length][];
    for (int i = 0; i < slotMap.Length; i++)
    {
        var slotIndices = parts[i].Split(new[] { ',' }).ToArray();
        var slots = new int[slotIndices.Length];
        slotMap[i] = slots;
        for (int j = 0; j < slots.Length; j++)
        {
            int index;
            // The slot map is machine-readable configuration, so parse it with the invariant
            // culture rather than whatever culture the current thread happens to use.
            bool parsed = int.TryParse(
                slotIndices[j].ToString(),
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture,
                out index);
            if (!parsed || index < 0 || index >= srcSlotCount)
            {
                throw Host.Except("Unexpected slot index '{1}' in group {0}. Expected 0 to {2}", i, slotIndices[j], srcSlotCount - 1);
            }
            slots[j] = index;
        }

        // Checked after the whole group is parsed so that invalid-index errors take precedence.
        if (slots.Distinct().Count() < slots.Length)
        {
            throw Host.Except("Group '{0}' has duplicate slot indices", parts[i]);
        }
    }

    return slotMap;
}
/// <summary>
/// Splits <paramref name="txt"/> on the given separators and appends every non-empty, trimmed
/// piece to <paramref name="terms"/>.
/// </summary>
/// <param name="txt">The text to split.</param>
/// <param name="separators">One or more separator characters; must be non-empty.</param>
/// <param name="terms">List that receives the resulting terms; existing entries are kept.</param>
private void AddTerms(DvText txt, char[] separators, List<DvText> terms)
{
    Host.AssertNonEmpty(separators);

    // With a single separator the char overload of SplitOne is used; otherwise the char[] overload.
    bool hasManySeparators = separators.Length > 1;
    char onlySeparator = separators[0];

    var remaining = txt;
    while (remaining.HasChars)
    {
        DvText piece;
        if (hasManySeparators)
        {
            remaining.SplitOne(separators, out piece, out remaining);
        }
        else
        {
            remaining.SplitOne(onlySeparator, out piece, out remaining);
        }

        piece = piece.Trim();
        if (!piece.HasChars)
        {
            continue;
        }
        terms.Add(piece);
    }
}
/// <summary>
/// Builds a getter for a scalar text source column that splits the text into terms and
/// returns them as a dense vector.
/// </summary>
/// <param name="input">Row to read the source text from.</param>
/// <param name="iinfo">Index of the column info; its source type must be text.</param>
/// <returns>A getter producing one vector slot per non-empty term.</returns>
private ValueGetter<VBuffer<DvText>> MakeGetterOne(IRow input, int iinfo)
{
    Host.AssertValue(input);
    Host.Assert(Infos[iinfo].TypeSrc.IsText);

    var getSrc = GetSrcGetter<DvText>(input, iinfo);
    var separators = _exes[iinfo].Separators;

    // Scratch state captured by the getter and reused across calls.
    var text = default(DvText);
    var tokens = new List<DvText>();

    return (ref VBuffer<DvText> dst) =>
    {
        getSrc(ref text);
        tokens.Clear();
        AddTerms(text, separators, tokens);

        int tokenCount = tokens.Count;
        var buffer = dst.Values;
        if (tokenCount > 0)
        {
            // Reuse the destination's array when it is large enough.
            if (Utils.Size(buffer) < tokenCount)
            {
                buffer = new DvText[tokenCount];
            }
            tokens.CopyTo(buffer);
        }

        dst = new VBuffer<DvText>(tokenCount, buffer, dst.Indices);
    };
}
/// <summary>
/// Writes a textual form of an n-gram into <paramref name="sb"/>: the terms of its first
/// <paramref name="count"/> keys joined by "|", with "*" standing in for any key outside
/// the range 1..<paramref name="keyCount"/>.
/// </summary>
/// <param name="ngram">Key values (1-based) of the n-gram's items.</param>
/// <param name="count">How many entries of <paramref name="ngram"/> to use.</param>
/// <param name="sb">Builder that is cleared and then filled with the result.</param>
/// <param name="keyCount">Number of valid keys; must be positive.</param>
/// <param name="termGetter">Fetches the term text for a zero-based key index.</param>
private void ComposeNgramString(uint[] ngram, int count, StringBuilder sb, int keyCount, TermGetter termGetter)
{
    Host.AssertValue(sb);
    Host.AssertValue(ngram);
    Host.Assert(keyCount > 0);

    sb.Clear();
    var term = default(DvText);
    for (int pos = 0; pos < count; pos++)
    {
        if (pos > 0)
        {
            sb.Append("|");
        }

        uint key = ngram[pos];
        bool isValidKey = key > 0 && key <= keyCount;
        if (!isValidKey)
        {
            sb.Append("*");
            continue;
        }

        // Keys are 1-based; the term getter is called with a 0-based index.
        termGetter((int)key - 1, ref term);
        term.AddToStringBuilder(sb);
    }
}
/// <summary>
/// Hashes a text value into the range 1..mask+1. Text without characters maps to 0.
/// </summary>
/// <param name="seed">Base hash seed.</param>
/// <param name="value">Text to hash; it is trimmed before hashing.</param>
/// <param name="i">Index mixed into the seed through a Murmur round.</param>
/// <param name="mask">Bit mask applied to the hash; mask + 1 must be a power of two.</param>
/// <returns>0 when <paramref name="value"/> has no characters, otherwise the masked hash plus one.</returns>
private static uint HashCore(uint seed, ref DvText value, int i, uint mask)
{
    Contracts.Assert(Utils.IsPowerOfTwo(mask + 1));

    if (value.HasChars)
    {
        var trimmed = value.Trim();
        var mixedSeed = Hashing.MurmurRound(seed, (uint)i);
        return (trimmed.Hash(mixedSeed) & mask) + 1;
    }

    return 0;
}
/// <summary>
/// Produces the output slot names for column <paramref name="iinfo"/>. Each output slot is named
/// by joining the names of the source slots mapped to it with "+". When the source column has
/// no usable slot names, "Column[index]" is used for each source slot instead.
/// </summary>
/// <param name="iinfo">Index of the column info.</param>
/// <param name="dst">Receives the names. Its existing arrays are reused when large enough.</param>
private void GetSlotNames(int iinfo, ref VBuffer<DvText> dst)
{
    Host.Assert(0 <= iinfo && iinfo < Infos.Length);
    var ex = _exes[iinfo];
    Host.AssertValue(ex.SlotMap);

    var info = Infos[iinfo];
    int outputCount = ex.OutputValueCount;
    var names = dst.Values;
    if (Utils.Size(names) < outputCount)
    {
        names = new DvText[outputCount];
    }

    var schema = Source.Schema;
    var srcColumnName = schema.GetColumnName(info.Source);

    // Source slot names are only used when they exist, are dense, and cover every source value.
    var srcSlotNames = default(VBuffer<DvText>);
    bool haveSrcNames = schema.HasSlotNames(info.Source, info.TypeSrc.VectorSize);
    if (haveSrcNames)
    {
        schema.GetMetadata(MetadataUtils.Kinds.SlotNames, info.Source, ref srcSlotNames);
        haveSrcNames = srcSlotNames.IsDense && srcSlotNames.Length == info.TypeSrc.ValueCount;
    }

    var nameBuilder = new StringBuilder();
    for (int slot = 0; slot < outputCount; slot++)
    {
        nameBuilder.Clear();
        foreach (var srcSlot in ex.SlotMap[slot])
        {
            if (nameBuilder.Length > 0)
            {
                nameBuilder.Append("+");
            }

            if (haveSrcNames)
            {
                nameBuilder.Append(srcSlotNames.Values[srcSlot]);
            }
            else
            {
                nameBuilder.AppendFormat("{0}[{1}]", srcColumnName, srcSlot);
            }
        }
        names[slot] = new DvText(nameBuilder.ToString());
    }

    dst = new VBuffer<DvText>(outputCount, names, dst.Indices);
}
/// <summary>
/// Determines the default scorer for a schema bound mapper. The text-valued ScoreColumnKind
/// metadata of the mapper's first output column is looked up; when it is present, non-empty and
/// names a loadable class whose type is an <see cref="IDataScorerTransform"/>, that component is
/// used. In every other case the GenericScorer is used.
/// </summary>
/// <param name="mapper">The schema bound mapper to get the default scorer for.</param>
/// <param name="suffix">An optional suffix to append to the default column names.</param>
/// <returns>A factory that creates the chosen scorer.</returns>
public static TScorerFactory GetScorerComponent(ISchemaBoundMapper mapper, string suffix = null)
{
    Contracts.AssertValue(mapper);

    ComponentCatalog.LoadableClassInfo info = null;
    DvText scoreKind = default;
    var outputSchema = mapper.OutputSchema;
    bool hasScoreKind = outputSchema.ColumnCount > 0
        && outputSchema.TryGetMetadata(TextType.Instance, MetadataUtils.Kinds.ScoreColumnKind, 0, ref scoreKind)
        && scoreKind.HasChars;
    if (hasScoreKind)
    {
        var candidate = ComponentCatalog.GetLoadableClassInfo<SignatureDataScorer>(scoreKind.ToString());
        // Only accept the candidate when it really is a data scorer transform.
        if (candidate != null && typeof(IDataScorerTransform).IsAssignableFrom(candidate.Type))
        {
            info = candidate;
        }
    }

    Func<IHostEnvironment, IDataView, ISchemaBoundMapper, RoleMappedSchema, IDataScorerTransform> factoryFunc;
    if (info != null)
    {
        factoryFunc = (env, data, innerMapper, trainSchema) =>
        {
            object args = info.CreateArguments();
            // The suffix can only be applied when the arguments derive from the scorer base.
            if (args is ScorerArgumentsBase scorerArgs)
            {
                scorerArgs.Suffix = suffix;
            }

            var extra = new object[] { data, innerMapper, trainSchema };
            return (IDataScorerTransform)info.CreateInstance(env, args, extra);
        };
    }
    else
    {
        factoryFunc = (env, data, innerMapper, trainSchema) =>
            new GenericScorer(env, new GenericScorer.Arguments() { Suffix = suffix }, data, innerMapper, trainSchema);
    }

    return ComponentFactoryUtils.CreateFromFunction(factoryFunc);
}
/// <summary>
/// Gets the score value kind of a column. The column whose index equals the base class's
/// column count is reported as Probability; every other column defers to the base class.
/// </summary>
/// <param name="col">Column index, in [0, ColumnCount).</param>
/// <param name="dst">Receives the score value kind.</param>
protected override void GetScoreValueKind(int col, ref DvText dst)
{
    Contracts.Assert(0 <= col && col < ColumnCount);

    if (col != base.ColumnCount)
    {
        base.GetScoreValueKind(col, ref dst);
        return;
    }

    dst = new DvText(MetadataUtils.Const.ScoreValueKind.Probability);
}
/// <summary>
/// Wraps a text getter in a getter that yields the hash of the text.
/// </summary>
/// <param name="getSrc">Getter for the source text.</param>
/// <param name="seed">Hash seed passed to HashCore.</param>
/// <param name="mask">Hash mask passed to HashCore.</param>
/// <returns>A getter that reads the source text and stores its hash in the destination.</returns>
private ValueGetter<uint> ComposeGetterOneCore(ValueGetter<DvText> getSrc, uint seed, uint mask)
{
    // Scratch value captured by the getter and reused across calls.
    var text = default(DvText);

    ValueGetter<uint> getter = (ref uint dst) =>
    {
        // REVIEW: Should we verify that the key value is within the advertised KeyCount?
        // Values greater than KeyCount should be treated as zeros.
        getSrc(ref text);
        dst = HashCore(seed, ref text, mask);
    };
    return getter;
}
/// <summary>
/// Fills <paramref name="slotNames"/> with the names "@1", "@2", ... up to the unweighted
/// counters' truncation level.
/// </summary>
/// <param name="slotNames">Receives the names. Its existing arrays are reused when large enough.</param>
public void GetSlotNames(ref VBuffer<DvText> slotNames)
{
    int count = UnweightedCounters.TruncationLevel;
    var values = slotNames.Values;
    if (Utils.Size(values) < count)
    {
        values = new DvText[count];
    }

    for (int i = 0; i < count; i++)
    {
        values[i] = new DvText(string.Format("@{0}", i + 1));
    }

    // Pass the incoming Indices array through so the caller's buffer is kept for reuse
    // rather than dropped.
    slotNames = new VBuffer<DvText>(count, values, slotNames.Indices);
}
/// <summary>
/// Fills <paramref name="slotNames"/> with the names "(Label_0)", "(Label_1)", ... one per slot
/// up to <c>_size</c>.
/// </summary>
/// <param name="slotNames">Receives the names. Its existing arrays are reused when large enough.</param>
public void GetSlotNames(ref VBuffer<DvText> slotNames)
{
    var values = slotNames.Values;
    if (Utils.Size(values) < _size)
    {
        values = new DvText[_size];
    }

    for (int i = 0; i < _size; i++)
    {
        values[i] = new DvText(string.Format("(Label_{0})", i));
    }

    // Pass the incoming Indices array through so the caller's buffer is kept for reuse
    // rather than dropped.
    slotNames = new VBuffer<DvText>(_size, values, slotNames.Indices);
}
/// <summary>
/// Returns the indices of the columns whose metadata of the specified kind is of text type and
/// equal to the specified value. Columns without that metadata, or whose metadata is not text,
/// are skipped.
/// </summary>
/// <param name="schema">The schema whose columns are examined.</param>
/// <param name="metadataKind">The kind of metadata to look up on each column.</param>
/// <param name="value">The text value the metadata must equal.</param>
/// <returns>A lazily evaluated sequence of matching column indices, in increasing order.</returns>
public static IEnumerable<int> GetColumnSet(this ISchema schema, string metadataKind, string value)
{
    for (int col = 0; col < schema.ColumnCount; col++)
    {
        var metadataType = schema.GetMetadataTypeOrNull(metadataKind, col);
        if (metadataType == null || !metadataType.IsText)
        {
            continue;
        }

        var text = default(DvText);
        schema.GetMetadata(metadataKind, col, ref text);
        if (!text.EqualsStr(value))
        {
            continue;
        }

        yield return col;
    }
}
/// <summary>
/// Looks up the vector for a word and copies it into <paramref name="wordVector"/>.
/// </summary>
/// <param name="word">The word to look up.</param>
/// <param name="wordVector">Destination array that receives <c>Dimension</c> values on success.</param>
/// <returns>True when the word was found in the pool; false when it is NA or unknown.</returns>
public bool GetWordVector(ref DvText word, float[] wordVector)
{
    if (!word.IsNA)
    {
        // Query the pool with the underlying buffer and character range of the word.
        string buffer = word.GetRawUnderlyingBufferInfo(out int start, out int limit);
        NormStr entry = _pool.Get(buffer, start, limit);
        if (entry == null)
        {
            return false;
        }

        // The vector of the word with a given id starts at offset id * Dimension.
        _wordVectors.CopyTo(entry.Id * Dimension, wordVector, Dimension);
        return true;
    }

    return false;
}
/// <summary>
/// Converts a non-empty set of pairs into text and empties the set. A single pair yields the
/// stringified value itself (or <c>DvText.Empty</c> when that is empty); several pairs yield
/// "{v1,v2,...}" with the values in increasing <c>Order</c>.
/// </summary>
/// <param name="sb">Scratch builder for the multi-value case; created on demand and left empty.</param>
/// <param name="temp">Scratch builder the stringify mapper writes each value into.</param>
/// <param name="cbuffer">Scratch character buffer handed to InvertHashUtils.AppendToEnd.</param>
/// <param name="buffer">Scratch array the pairs are copied into; grown when needed.</param>
/// <param name="pairs">The pairs to textify; must be non-empty. Cleared by this call.</param>
/// <returns>The textual representation of the pairs.</returns>
private DvText Textify(ref StringBuilder sb, ref StringBuilder temp, ref char[] cbuffer, ref Pair[] buffer, HashSet<Pair> pairs)
{
    Contracts.AssertValueOrNull(sb);
    Contracts.AssertValueOrNull(temp);
    Contracts.AssertValueOrNull(cbuffer);
    Contracts.AssertValueOrNull(buffer);
    Contracts.Assert(Utils.Size(pairs) > 0);
    int count = pairs.Count;

    // Move the pairs into the scratch array so they can be sorted, and empty the set.
    Utils.EnsureSize(ref buffer, count);
    pairs.CopyTo(buffer);
    pairs.Clear();

    // Optimize the one value case, where we don't have to use the string builder.
    if (count == 1)
    {
        var value = buffer[0].Value;
        _stringifyMapper(ref value, ref temp);
        return Utils.Size(temp) > 0 ? new DvText(temp.ToString()) : DvText.Empty;
    }

    // Keep things in the same order they were inserted, by sorting on order.
    // CompareTo is used instead of subtracting the orders, since a difference can overflow
    // and then report the wrong sign.
    Array.Sort(buffer, 0, count, Comparer<Pair>.Create((x, y) => x.Order.CompareTo(y.Order)));

    if (sb == null)
    {
        sb = new StringBuilder();
    }
    Contracts.Assert(sb.Length == 0);

    // The more general collision case.
    sb.Append('{');
    for (int i = 0; i < count; ++i)
    {
        var pair = buffer[i];
        if (i > 0)
        {
            sb.Append(',');
        }
        var value = pair.Value;
        _stringifyMapper(ref value, ref temp);
        InvertHashUtils.AppendToEnd(temp, sb, ref cbuffer);
    }
    sb.Append('}');

    var retval = new DvText(sb.ToString());
    // Leave the shared builder empty for the next call.
    sb.Clear();
    return retval;
}
/// <summary>
/// Creates a slot names getter that names slot i "<paramref name="prefix"/> (quantile_i)",
/// where quantile_i is the i-th entry of <c>_quantiles</c>.
/// </summary>
/// <param name="prefix">Text placed in front of each quantile name.</param>
/// <returns>A metadata getter producing <c>_scoreSize</c> slot names.</returns>
private MetadataUtils.MetadataGetter<VBuffer<DvText>> CreateSlotNamesGetter(string prefix)
{
    return (int col, ref VBuffer<DvText> dst) =>
    {
        var values = dst.Values;
        if (Utils.Size(values) < _scoreSize)
        {
            values = new DvText[_scoreSize];
        }

        for (int i = 0; i < _scoreSize; i++)
        {
            values[i] = new DvText(string.Format("{0} ({1})", prefix, _quantiles[i]));
        }

        // Pass the incoming Indices array through so the caller's buffer is kept for reuse
        // rather than dropped.
        dst = new VBuffer<DvText>(_scoreSize, values, dst.Indices);
    };
}
/// <summary>
/// Fills <paramref name="dst"/> with the names "Metric@1", "Metric@2", ... up to the truncation
/// level, where the metric name is chosen from the column index (NDCG, DCG, or otherwise MaxDCG).
/// </summary>
/// <param name="iinfo">Index of the output column, in [0, InfoCount).</param>
/// <param name="dst">Receives the names. Its existing arrays are reused when large enough.</param>
private void SlotNamesGetter(int iinfo, ref VBuffer<DvText> dst)
{
    Contracts.Assert(0 <= iinfo && iinfo < InfoCount);

    var values = dst.Values;
    if (Utils.Size(values) < _truncationLevel)
    {
        values = new DvText[_truncationLevel];
    }

    // The metric name depends only on the column, so pick it once instead of per slot.
    var metricName = iinfo == NdcgCol ? Ndcg : iinfo == DcgCol ? Dcg : MaxDcg;
    for (int i = 0; i < _truncationLevel; i++)
    {
        values[i] = new DvText(string.Format("{0}@{1}", metricName, i + 1));
    }

    // Pass the incoming Indices array through so the caller's buffer is kept for reuse
    // rather than dropped.
    dst = new VBuffer<DvText>(_truncationLevel, values, dst.Indices);
}
// REVIEW: Figure out how to avoid having the column name in each slot name.
/// <summary>
/// Creates a slot names getter that names slot i (1-based) "#i <paramref name="suffix"/>".
/// </summary>
/// <param name="numTopClusters">Number of slot names to produce.</param>
/// <param name="suffix">Text placed after the slot number in each name.</param>
/// <returns>A metadata getter producing <paramref name="numTopClusters"/> slot names.</returns>
private MetadataUtils.MetadataGetter<VBuffer<DvText>> CreateSlotNamesGetter(int numTopClusters, string suffix)
{
    return (int col, ref VBuffer<DvText> dst) =>
    {
        var values = dst.Values;
        if (Utils.Size(values) < numTopClusters)
        {
            values = new DvText[numTopClusters];
        }

        // Displayed numbering starts at one; array positions start at zero.
        for (int i = 1; i <= numTopClusters; i++)
        {
            values[i - 1] = new DvText(string.Format("#{0} {1}", i, suffix));
        }

        // Pass the incoming Indices array through so the caller's buffer is kept for reuse
        // rather than dropped.
        dst = new VBuffer<DvText>(numTopClusters, values, dst.Indices);
    };
}
/// <summary>
/// Gets the mapping from T into a StringBuilder representation, using various heuristics.
/// This StringBuilder representation will be a component of the composed KeyValues for the
/// hash outputs.
/// </summary>
/// <param name="schema">Schema containing the column.</param>
/// <param name="col">Index of the column; the raw type of its item type must be T.</param>
/// <returns>A mapper that writes the textual form of a value into a StringBuilder.</returns>
public static ValueMapper<T, StringBuilder> GetSimpleMapper<T>(ISchema schema, int col)
{
    Contracts.AssertValue(schema);
    Contracts.Assert(0 <= col && col < schema.ColumnCount);
    var type = schema.GetColumnType(col).ItemType;
    Contracts.Assert(type.RawType == typeof(T));
    var conv = Conversion.Conversions.Instance;

    // First: if not key, then get the standard string conversion.
    if (!type.IsKey)
    {
        return conv.GetStringConversion<T>(type);
    }

    // Fallback for keys without key names: just use the key value itself, subject to
    // offsetting by the min.
    if (!schema.HasKeyNames(col, type.KeyCount))
    {
        return conv.GetKeyStringConversion<T>(type.AsKey);
    }

    // Preferred for keys: utilize the KeyValues metadata for that key, since it has one and it is text.
    // REVIEW: Non-textual KeyValues are certainly possible. Should we handle them?
    // Get the key names.
    var keyValues = default(VBuffer<DvText>);
    schema.GetMetadata(MetadataUtils.Kinds.KeyValues, col, ref keyValues);
    var keyName = default(DvText);

    // REVIEW: We could optimize for identity, but it's probably not worthwhile.
    bool identity;
    var keyMapper = conv.GetStandardConversion<T, uint>(type, NumberType.U4, out identity);
    return (ref T src, ref StringBuilder dst) =>
    {
        ClearDst(ref dst);
        uint key = 0;
        keyMapper(ref src, ref key);
        // A key of zero leaves the destination cleared; other keys are 1-based into keyValues.
        if (key != 0)
        {
            keyValues.GetItemOrDefault((int)(key - 1), ref keyName);
            keyName.AddToStringBuilder(dst);
        }
    };
}
/// <summary>
/// Caches the column values for future getter calls. Each column takes either the entry of
/// <paramref name="values"/> selected by its source index or, for the file path column, the
/// path with backslashes replaced by forward slashes. Other columns are left untouched.
/// </summary>
/// <param name="path">The file path.</param>
/// <param name="values">The values that columns may draw from by source index.</param>
private void UpdateColumnValues(string path, List<string> values)
{
    for (int i = 0; i < _colValues.Length; i++)
    {
        var source = _parent._srcDirIndex[i];

        bool hasDirectValue = source >= 0 && source < values.Count;
        if (hasDirectValue)
        {
            _colValues[i] = new DvText(values[source]);
            continue;
        }

        if (source != FilePathColIndex)
        {
            continue;
        }

        // Force Unix path for consistency.
        _colValues[i] = new DvText(path.Replace(@"\", @"/"));
    }
}
/// <summary>
/// Fills <paramref name="dst"/> with one name per tree of the ensemble, formatted as "Tree{t:000}".
/// </summary>
/// <param name="col">Column index required by the getter signature; not read by this method.</param>
/// <param name="dst">Receives the names. Its existing arrays are reused when large enough.</param>
private void GetTreeSlotNames(int col, ref VBuffer<DvText> dst)
{
    var treeCount = _ensemble.NumTrees;

    var buffer = dst.Values;
    if (Utils.Size(buffer) < treeCount)
    {
        buffer = new DvText[treeCount];
    }

    int treeIndex = 0;
    while (treeIndex < treeCount)
    {
        buffer[treeIndex] = new DvText(string.Format("Tree{0:000}", treeIndex));
        treeIndex++;
    }

    dst = new VBuffer<DvText>(treeCount, buffer, dst.Indices);
}
/// <summary>
/// Copies the stored slot names into <paramref name="dst"/>.
/// </summary>
/// <param name="iinfo">Index of the column info; must be zero.</param>
/// <param name="dst">Receives the names. Its existing arrays are reused when large enough.</param>
private void GetSlotNames(int iinfo, ref VBuffer<DvText> dst)
{
    Contracts.Assert(iinfo == 0);
    Contracts.Assert(Utils.Size(_slotNames) > 0);

    int nameCount = Utils.Size(_slotNames);
    var buffer = dst.Values;
    if (Utils.Size(buffer) < nameCount)
    {
        buffer = new DvText[nameCount];
    }

    int next = 0;
    foreach (var slotName in _slotNames)
    {
        buffer[next++] = new DvText(slotName);
    }

    dst = new VBuffer<DvText>(nameCount, buffer, dst.Indices);
}
/// <summary>
/// Creates a getter that produces the slot names "<paramref name="prefix"/>@1",
/// "<paramref name="prefix"/>@2", ... up to the unweighted counters' truncation level.
/// </summary>
/// <param name="prefix">Text placed before the "@position" part of each name.</param>
/// <returns>A getter filling a vector with the slot names.</returns>
public ValueGetter<VBuffer<DvText>> GetGroupSummarySlotNames(string prefix)
{
    return (ref VBuffer<DvText> dst) =>
    {
        int count = UnweightedCounters.TruncationLevel;
        var values = dst.Values;
        if (Utils.Size(values) < count)
        {
            values = new DvText[count];
        }

        for (int i = 0; i < count; i++)
        {
            values[i] = new DvText(string.Format("{0}@{1}", prefix, i + 1));
        }

        // Pass the incoming Indices array through so the caller's buffer is kept for reuse
        // rather than dropped.
        dst = new VBuffer<DvText>(count, values, dst.Indices);
    };
}
/// <summary>
/// Deserializing constructor: reads the score size and the quantile names from the model context.
/// </summary>
/// <param name="env">The host environment.</param>
/// <param name="ctx">The model load context to read from.</param>
/// <param name="schema">The input schema, whose column types are checked.</param>
private QuantileRegressionPerInstanceEvaluator(IHostEnvironment env, ModelLoadContext ctx, ISchema schema)
    : base(env, ctx, schema)
{
    CheckInputColumnTypes(schema);

    // *** Binary format ***
    // base
    // int: _scoreSize
    // int[]: Ids of the quantile names

    _scoreSize = ctx.Reader.ReadInt32();
    Host.CheckDecode(_scoreSize > 0);

    var quantileNames = new DvText[_scoreSize];
    for (int q = 0; q < quantileNames.Length; q++)
    {
        quantileNames[q] = new DvText(ctx.LoadNonEmptyString());
    }
    _quantiles = quantileNames;

    _outputType = new VectorType(NumberType.R8, _scoreSize);
}
/// <summary>
/// Converts a string to a value of type T using the standard text conversion for the item type
/// of <paramref name="dstType"/>. A null or empty string yields default(T).
/// </summary>
/// <param name="srcStr">The string to convert.</param>
/// <param name="dstType">The destination column type; its item type selects the conversion.</param>
/// <param name="isNA">Predicate that reports whether a converted value is NA.</param>
/// <returns>The converted value, boxed.</returns>
/// <remarks>Throws (via Contracts.Except) when the converted value is reported as NA.</remarks>
private object GetSpecifiedValue<T>(string srcStr, ColumnType dstType, RefPredicate<T> isNA)
{
    var result = default(T);
    if (string.IsNullOrEmpty(srcStr))
    {
        return result;
    }

    // Handles converting input strings to correct types.
    var text = new DvText(srcStr);
    bool identity;
    var parse = Conversions.Instance.GetStandardConversion<DvText, T>(TextType.Instance, dstType.ItemType, out identity);
    parse(ref text, ref result);

    // Make sure that the text can legitimately be converted to dstType, throw error otherwise.
    if (isNA(ref result))
    {
        throw Contracts.Except("No conversion of '{0}' to '{1}'", srcStr, dstType.ItemType);
    }

    return result;
}
/// <summary>
/// Produces the slot names for the n-gram output column <paramref name="iinfo"/>: one name per
/// n-gram in the column's pool, composed from the key values of the source column's unigrams.
/// </summary>
/// <param name="iinfo">Index of the column info.</param>
/// <param name="dst">Receives the names. Its existing arrays are reused when large enough.</param>
private void GetSlotNames(int iinfo, ref VBuffer<DvText> dst)
{
    Host.Assert(0 <= iinfo && iinfo < Infos.Length);
    Host.Assert(_slotNamesTypes[iinfo] != null);

    var keyCount = Infos[iinfo].TypeSrc.ItemType.KeyCount;
    Host.Assert(Source.Schema.HasKeyNames(Infos[iinfo].Source, keyCount));

    var unigramNames = new VBuffer<DvText>();

    // Get the key values of the unigrams.
    Source.Schema.GetMetadata(MetadataUtils.Kinds.KeyValues, Infos[iinfo].Source, ref unigramNames);
    Host.Check(unigramNames.Length == keyCount);

    var pool = _ngramMaps[iinfo];
    var ngramCount = pool.Count;
    var values = dst.Values;
    if (Utils.Size(values) < ngramCount)
    {
        // Every slot is overwritten below, so a fresh array suffices; Array.Resize would
        // needlessly copy the old contents first.
        values = new DvText[ngramCount];
    }

    StringBuilder sb = new StringBuilder();
    uint[] ngram = new uint[_exes[iinfo].NgramLength];
    for (int slot = 0; slot < ngramCount; slot++)
    {
        var n = pool.GetById(slot, ref ngram);
        Host.Assert(n >= 0);

        // Get the unigrams composing the current ngram.
        ComposeNgramString(ngram, n, sb, keyCount, unigramNames.GetItemOrDefault);
        values[slot] = new DvText(sb.ToString());
    }

    dst = new VBuffer<DvText>(ngramCount, values, dst.Indices);
}
/// <summary>
/// Builds the metadata vector holding the textual form of the value set recorded for each slot.
/// The result is sparse when at most half of the slots have an entry, and dense otherwise.
/// </summary>
/// <returns>A vector of length <c>_slots</c> with one text per populated slot.</returns>
public VBuffer<DvText> GetMetadata()
{
    int count = _slotToValueSet.Count;
    Contracts.Assert(count <= _slots);

    // Scratch objects shared by all Textify calls below; Textify allocates them on demand.
    StringBuilder sb = null;
    StringBuilder temp = null;
    Pair[] pairs = null;
    char[] cbuffer = null;

    if (count > _slots / 2)
    {
        // Dense
        var denseValues = new DvText[_slots];
        foreach (var entry in _slotToValueSet)
        {
            Contracts.Assert(0 <= entry.Key && entry.Key < _slots);
            denseValues[entry.Key] = Textify(ref sb, ref temp, ref cbuffer, ref pairs, entry.Value);
        }
        return new VBuffer<DvText>(denseValues.Length, denseValues);
    }

    // Sparse
    var indices = new int[count];
    var values = new DvText[count];
    int filled = 0;
    foreach (var entry in _slotToValueSet)
    {
        Contracts.Assert(0 <= entry.Key && entry.Key < _slots);
        indices[filled] = entry.Key;
        values[filled] = Textify(ref sb, ref temp, ref cbuffer, ref pairs, entry.Value);
        filled++;
    }
    Contracts.Assert(filled == count);

    // Order the entries by slot index, moving each value along with its index.
    Array.Sort(indices, values);
    return new VBuffer<DvText>((int)_slots, count, values, indices);
}
/// <summary>
/// Creates a getter that returns the column names of the given dictionaries as a vector.
/// </summary>
/// <param name="dictionaries">The dictionaries whose <c>ColName</c> values are returned.</param>
/// <returns>The getter, or null when <paramref name="dictionaries"/> is null or empty.</returns>
protected ValueGetter<VBuffer<DvText>> GetKeyValueGetter(AggregatorDictionaryBase[] dictionaries)
{
    if (Utils.Size(dictionaries) == 0)
    {
        return null;
    }

    return (ref VBuffer<DvText> dst) =>
    {
        int count = dictionaries.Length;
        var names = dst.Values;
        if (Utils.Size(names) < count)
        {
            names = new DvText[count];
        }

        int next = 0;
        foreach (var dictionary in dictionaries)
        {
            names[next++] = new DvText(dictionary.ColName);
        }

        dst = new VBuffer<DvText>(count, names, dst.Indices);
    };
}
/// <summary>
/// Builds a getter for a vector-of-text source column that splits every stored text value into
/// terms and returns all the terms as one dense vector.
/// </summary>
/// <param name="input">Row to read the source vector from.</param>
/// <param name="iinfo">Index of the column info; its source type must be a vector of text.</param>
/// <returns>A getter producing one vector slot per non-empty term.</returns>
private ValueGetter<VBuffer<DvText>> MakeGetterVec(IRow input, int iinfo)
{
    Host.AssertValue(input);
    Host.Assert(Infos[iinfo].TypeSrc.IsVector);
    Host.Assert(Infos[iinfo].TypeSrc.ItemType.IsText);

    int cv = Infos[iinfo].TypeSrc.VectorSize;
    Contracts.Assert(cv >= 0);

    var getSrc = GetSrcGetter<VBuffer<DvText>>(input, iinfo);
    var separators = _exes[iinfo].Separators;

    // Scratch state captured by the getter and reused across calls.
    var source = default(VBuffer<DvText>);
    var tokens = new List<DvText>();

    return (ref VBuffer<DvText> dst) =>
    {
        getSrc(ref source);
        tokens.Clear();

        // Only the first Count entries of Values are visited.
        int storedCount = source.Count;
        for (int i = 0; i < storedCount; i++)
        {
            AddTerms(source.Values[i], separators, tokens);
        }

        int tokenCount = tokens.Count;
        var buffer = dst.Values;
        if (tokenCount > 0)
        {
            // Reuse the destination's array when it is large enough.
            if (Utils.Size(buffer) < tokenCount)
            {
                buffer = new DvText[tokenCount];
            }
            tokens.CopyTo(buffer);
        }

        dst = new VBuffer<DvText>(tokenCount, buffer, dst.Indices);
    };
}