/// <summary>
/// Deserialization constructor. Reads, for every column in <c>Infos</c>, its <see cref="ColInfoEx"/>,
/// its n-gram <see cref="SequencePool"/> and (for models written at or after
/// <c>VerTfIdfSupported</c>) its inverse document frequency array, then initializes the
/// column types and slot-name metadata.
/// </summary>
/// <param name="host">Host forwarded to the base constructor.</param>
/// <param name="ctx">Model load context positioned at this transform's payload.</param>
/// <param name="input">Input data view forwarded to the base constructor.</param>
private NgramCountingTransformer(IHost host, ModelLoadContext ctx, IDataView input)
    : base(host, ctx, input, TestType)
{
    Host.AssertValue(ctx);

    // *** Binary format ***
    // <prefix handled in static Create method>
    // <base>
    // for each column
    //   ColInfoEx
    //   the ngram SequencePool
    //   the ngram inverse document frequencies (only when written by a version >= VerTfIdfSupported)
    int columnCount = Infos.Length;
    bool hasIdf = ctx.Header.ModelVerWritten >= VerTfIdfSupported;

    _exes = new ColInfoEx[columnCount];
    _ngramMaps = new SequencePool[columnCount];
    _invDocFreqs = new double[columnCount][];

    for (int col = 0; col < columnCount; col++)
    {
        _exes[col] = new ColInfoEx(ctx, hasIdf);
        _ngramMaps[col] = new SequencePool(ctx.Reader);

        if (!hasIdf)
        {
            continue;
        }

        var frequencies = ctx.Reader.ReadDoubleArray();
        _invDocFreqs[col] = frequencies;

        // Every stored frequency must be non-negative; Utils.Size tolerates a null array.
        int frequencyCount = Utils.Size(frequencies);
        for (int k = 0; k < frequencyCount; k++)
        {
            Host.CheckDecode(frequencies[k] >= 0);
        }
    }

    InitColumnTypeAndMetadata(out _types, out _slotNamesTypes);
}
/// <summary>
/// Switches the active pool type. When <paramref name="t"/> differs from the current
/// <c>poolType</c>, records the new type and replaces <c>pool</c> with a fresh
/// <see cref="SequencePool"/> or <see cref="ConcurrentPool"/>; any other pool type only
/// updates <c>poolType</c>. Does nothing when the type is unchanged.
/// </summary>
/// <param name="t">The pool type to switch to.</param>
public void WithPool(PoolType t)
{
    bool alreadySelected = t == poolType;
    if (!alreadySelected)
    {
        poolType = t;

        if (t == PoolType.Sequence)
        {
            pool = new SequencePool();
        }
        else if (t == PoolType.Concurrent)
        {
            pool = new ConcurrentPool();
        }
    }
}
/// <summary>
/// Asserts that the per-level n-gram counts are consistent with their limits and with the pool:
/// each count lies in <c>[0, lims[i]]</c>, and the counts add up to <c>pool.Count</c>.
/// </summary>
/// <param name="counts">Number of n-grams recorded for each level; indexed in parallel with <paramref name="lims"/>.</param>
/// <param name="lims">Maximum number of n-grams allowed for each level.</param>
/// <param name="pool">The pool whose <c>Count</c> must equal the sum of <paramref name="counts"/>.</param>
private void AssertValid(int[] counts, int[] lims, SequencePool pool)
{
    // The former 'countFull' tally was incremented but never read, so it has been removed.
    int count = 0;
    for (int i = 0; i < lims.Length; i++)
    {
        Host.Assert(counts[i] >= 0);
        Host.Assert(counts[i] <= lims[i]);
        count += counts[i];
    }

    Host.Assert(count == pool.Count);
}
/// <summary>
/// Initializes the instance fields to their starting values: the three call ids, the second call
/// delay, the previous scale, two sequence pools, the delay, and the empty position, object and
/// line lists.
/// </summary>
public AI5042()
{
    // Same values as the original hexadecimal literals 0xc23, 0xc25, 0xc27.
    callids = new List<int> { 3107, 3109, 3111 };

    // 0x2710
    calldelay2 = 10000;
    prev_scale = 1f;

    mSeqPool = new SequencePool();
    mSeqPool2 = new SequencePool();

    delay = 0.25f;

    poslist = new List<Vector3>();
    prevs = new List<GameObject>();
    mLines = new List<BulletRedLineCtrl>();
}
/// <summary>
/// Builds an <see cref="NgramIdFinder"/> that adds n-grams to <paramref name="pool"/> while the
/// per-length limit in <paramref name="lims"/> has not been reached, keeping
/// <paramref name="counts"/> up to date.
/// </summary>
/// <param name="counts">Per-length counters of n-grams added so far; updated by the returned delegate.</param>
/// <param name="lims">Per-length maximum number of n-grams; index <c>n - 1</c> holds the limit for n-grams of length n.</param>
/// <param name="pool">The pool that receives new n-grams.</param>
/// <param name="requireIdf">
/// When true, the delegate returns the id of the n-gram (looked up in the pool when it was not just
/// added) and leaves <c>more</c> untouched; when false, it returns -1 and sets <c>more</c>.
/// </param>
/// <param name="host">Host used for assertions.</param>
/// <returns>The id-finder delegate described above.</returns>
private static NgramIdFinder GetNgramIdFinderAdd(int[] counts, int[] lims, SequencePool pool, bool requireIdf, IHost host)
{
    Contracts.AssertValue(host);
    host.Assert(Utils.Size(lims) > 0);
    host.Assert(Utils.Size(lims) == Utils.Size(counts));

    // Levels whose limit is non-positive can never accept an n-gram, so they start out "full".
    int numFull = lims.Count(limit => limit <= 0);
    int ngramLength = lims.Length;

    NgramIdFinder finder = (uint[] ngram, int lim, int icol, ref bool more) =>
    {
        host.Assert(0 < lim && lim <= Utils.Size(ngram));
        host.Assert(lim <= Utils.Size(counts));
        host.Assert(lim <= Utils.Size(lims));
        host.Assert(icol == 0);

        int level = lim - 1;
        int slot = -1;

        // Only try to add while this level still has room; count the level as full
        // the moment its counter reaches the limit.
        if (counts[level] < lims[level])
        {
            if (pool.TryAdd(ngram, 0, lim, out slot))
            {
                counts[level]++;
                if (counts[level] >= lims[level])
                {
                    numFull++;
                }
            }
        }

        // Note: 'slot' is either the id of the added ngram or -1. In case it is -1, find its id.
        // Note: 'more' controls whether more ngrams/skip-grams should be processed in the current
        //       row. For IDF, as we are interested in counting the occurrence of ngrams/skip-
        //       grams, more should not be updated.
        if (requireIdf)
        {
            if (slot != -1)
            {
                return slot;
            }

            return pool.Get(ngram, 0, lim);
        }

        more = numFull < ngramLength;
        return -1;
    };

    return finder;
}
/// <summary>
/// Initializes a new instance of the <see cref="JsonWriter"/> struct whose <c>Writer</c> is a
/// <see cref="BufferWriter"/> created from the given pool and initial array.
/// </summary>
/// <param name="sequencePool">The pool from which to draw an <see cref="IBufferWriter{T}"/> if required.</param>
/// <param name="array">An array to start with so we can avoid accessing the <paramref name="sequencePool"/> if possible.</param>
internal JsonWriter(SequencePool sequencePool, byte[] array)
    : this()
{
    this.Writer = new BufferWriter(sequencePool, array);
}
/// <summary>
/// Scans <paramref name="trainingData"/> and builds, for every column in <c>Infos</c>, the
/// <see cref="SequencePool"/> of n-grams, honoring the per-length limits derived from
/// <paramref name="args"/>. For columns whose <c>RequireIdf()</c> is true it also counts the
/// documents containing each n-gram and converts each nonzero count to
/// <c>log(totalDocs / count)</c>. Finally records in <c>NonEmptyLevels</c> which n-gram
/// lengths received at least one entry.
/// </summary>
/// <param name="args">Transform arguments; supplies <c>AllLengths</c> and <c>MaxNumTerms</c>, globally and per column.</param>
/// <param name="trainingData">The data to build the dictionaries from.</param>
/// <param name="invDocFreqs">Receives one array per column; entries stay null for columns that never required IDF.</param>
/// <returns>One n-gram pool per column.</returns>
private SequencePool[] Train(Arguments args, IDataView trainingData, out double[][] invDocFreqs)
{
    int columnCount = Infos.Length;

    // limits[icol][n - 1] is the maximum number of n-grams of length n to store in the
    // dictionary of column icol, for n from 1 up to that column's ngramLength.
    var limits = new int[columnCount][];
    for (int icol = 0; icol < columnCount; icol++)
    {
        var allLengths = args.Column[icol].AllLengths ?? args.AllLengths;
        var ngramLength = _exes[icol].NgramLength;

        // A non-empty per-column setting wins over the global one.
        var maxTerms = args.Column[icol].MaxNumTerms;
        if (!(Utils.Size(maxTerms) > 0))
        {
            maxTerms = args.MaxNumTerms;
        }

        int termCount = Utils.Size(maxTerms);
        if (allLengths)
        {
            Host.CheckUserArg(termCount <= ngramLength, nameof(args.MaxNumTerms));
            Host.CheckUserArg(termCount == 0 || maxTerms.All(i => i >= 0) && maxTerms[maxTerms.Length - 1] > 0, nameof(args.MaxNumTerms));

            // Lengths beyond the supplied values reuse the last supplied value (or the default).
            int fill;
            if (termCount == 0)
            {
                fill = Arguments.DefaultMaxTerms;
            }
            else
            {
                fill = maxTerms[maxTerms.Length - 1];
            }

            limits[icol] = Utils.BuildArray(ngramLength, i => i < termCount ? maxTerms[i] : fill);
        }
        else
        {
            Host.CheckUserArg(termCount == 0 || termCount == 1 && maxTerms[0] > 0, nameof(args.MaxNumTerms));

            // Only the longest length gets a non-zero limit.
            limits[icol] = new int[ngramLength];
            if (termCount == 0)
            {
                limits[icol][ngramLength - 1] = Arguments.DefaultMaxTerms;
            }
            else
            {
                limits[icol][ngramLength - 1] = maxTerms[0];
            }
        }
    }

    var builders = new NgramBufferBuilder[columnCount];
    var getters = new ValueGetter<VBuffer<uint>>[columnCount];
    var src = new VBuffer<uint>[columnCount];

    // Keep track of how many grams are in the pool for each value of n. Position
    // i in counts[icol] counts how many (i+1)-grams are in the pool for column icol.
    var counts = new int[columnCount][];
    var ngramMaps = new SequencePool[columnCount];

    // Activate only the source columns this transform reads.
    var activeInput = new bool[trainingData.Schema.ColumnCount];
    for (int icol = 0; icol < columnCount; icol++)
    {
        activeInput[Infos[icol].Source] = true;
    }

    using (var cursor = trainingData.GetRowCursor(col => activeInput[col]))
    using (var pch = Host.StartProgressChannel("Building n-gram dictionary"))
    {
        for (int icol = 0; icol < columnCount; icol++)
        {
            Host.Assert(Infos[icol].TypeSrc.IsVector && Infos[icol].TypeSrc.ItemType.IsKey);

            var ngramLength = _exes[icol].NgramLength;
            var skipLength = _exes[icol].SkipLength;

            getters[icol] = RowCursorUtils.GetVecGetterAs<uint>(NumberType.U4, cursor, Infos[icol].Source);
            src[icol] = default(VBuffer<uint>);
            counts[icol] = new int[ngramLength];
            ngramMaps[icol] = new SequencePool();

            // Note: GetNgramIdFinderAdd will control how many ngrams of a specific length will
            // be added (using limits[icol]), therefore we set slotLim to the maximum
            var idFinder = GetNgramIdFinderAdd(counts[icol], limits[icol], ngramMaps[icol], _exes[icol].RequireIdf(), Host);
            builders[icol] = new NgramBufferBuilder(ngramLength, skipLength, Utils.ArrayMaxSize, idFinder);
        }

        // NOTE(review): neither fullColumns nor columnFull is updated anywhere in this method,
        // so as written the scan always runs to the end of the cursor.
        int fullColumns = 0;
        var columnFull = new bool[columnCount];

        invDocFreqs = new double[columnCount][];
        long docCount = 0;
        double rowCount = trainingData.GetRowCount() ?? Double.NaN;
        var buffers = new VBuffer<float>[columnCount];

        pch.SetHeader(
            new ProgressHeader(new[] { "Total n-grams" }, new[] { "documents" }),
            e => e.SetProgress(0, docCount, rowCount));

        while (fullColumns < columnCount && cursor.MoveNext())
        {
            docCount++;

            for (int icol = 0; icol < columnCount; icol++)
            {
                getters[icol](ref src[icol]);

                uint keyCount = (uint)Infos[icol].TypeSrc.ItemType.KeyCount;
                if (keyCount == 0)
                {
                    keyCount = uint.MaxValue;
                }

                if (!columnFull[icol])
                {
                    // For IDF the builder is reset for each row, so the result read below
                    // reflects only the current document.
                    if (_exes[icol].RequireIdf())
                    {
                        builders[icol].Reset();
                    }

                    builders[icol].AddNgrams(in src[icol], 0, keyCount);

                    if (_exes[icol].RequireIdf())
                    {
                        int totalNgrams = counts[icol].Sum();
                        Utils.EnsureSize(ref invDocFreqs[icol], totalNgrams);
                        builders[icol].GetResult(ref buffers[icol]);

                        // Count this document once for every n-gram with a value of at least 1.
                        foreach (var pair in buffers[icol].Items())
                        {
                            if (!(pair.Value >= 1))
                            {
                                continue;
                            }

                            invDocFreqs[icol][pair.Key] += 1;
                        }
                    }
                }

                AssertValid(counts[icol], limits[icol], ngramMaps[icol]);
            }
        }

        pch.Checkpoint(counts.Sum(c => c.Sum()), docCount);

        // Turn each nonzero document count into log(docCount / count).
        for (int icol = 0; icol < columnCount; icol++)
        {
            double[] docFreqs = invDocFreqs[icol];
            int freqCount = Utils.Size(docFreqs);
            for (int slot = 0; slot < freqCount; slot++)
            {
                double docsWithNgram = docFreqs[slot];
                if (docsWithNgram != 0)
                {
                    docFreqs[slot] = Math.Log(docCount / docsWithNgram);
                }
            }
        }

        // Record which n-gram lengths ended up with at least one entry.
        for (int icol = 0; icol < columnCount; icol++)
        {
            AssertValid(counts[icol], limits[icol], ngramMaps[icol]);

            int ngramLength = _exes[icol].NgramLength;
            for (int level = 0; level < ngramLength; level++)
            {
                _exes[icol].NonEmptyLevels[level] = counts[icol][level] > 0;
            }
        }

        return ngramMaps;
    }
}
/// <summary>
/// Runs the base initialization, then creates a fresh <see cref="SequencePool"/> for <c>m_pool</c>.
/// </summary>
protected override void OnInit()
{
    base.OnInit();

    m_pool = new SequencePool();
}