/// <summary>
/// Builds a bag of ngram counts from text that has already been tokenized. An ngram is a
/// sequence of consecutive words; a dictionary of ngrams is built and each ngram's id in that
/// dictionary is used as its index in the bag.
///
/// Unlike <see cref="WordBagEstimatorExtensions.ToBagofWords"/>, which tokenizes text internally,
/// <see cref="ToNgrams"/> expects its input to be tokenized already.
/// </summary>
/// <param name="input">The column to apply to.</param>
/// <param name="ngramLength">Ngram length.</param>
/// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
/// <param name="allLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
/// <param name="maxNumTerms">Maximum number of ngrams to store in the dictionary.</param>
/// <param name="weighting">Statistical measure used to evaluate how important a word is to a document in a corpus.</param>
/// <returns>A pipeline column wrapping the requested ngram-counting configuration.</returns>
public static Vector<float> ToNgrams<TKey>(
    this VarVector<Key<TKey, string>> input,
    int ngramLength = 1,
    int skipLength = 0,
    bool allLengths = true,
    int maxNumTerms = 10000000,
    NgramCountingEstimator.WeightingCriteria weighting = NgramCountingEstimator.WeightingCriteria.Tf)
{
    // All arguments are forwarded unchanged; the column object carries the configuration.
    return new OutPipelineColumn(input, ngramLength, skipLength, allLengths, maxNumTerms, weighting);
}
/// <summary>
/// Initializes the transform settings from a column description.
/// </summary>
/// <param name="info">Column description supplying the ngram length, skip length and weighting.</param>
public TransformInfo(ColumnInfo info)
{
    Weighting = info.Weighting;
    SkipLength = info.SkipLength;
    NgramLength = info.NgramLength;

    // One flag per ngram length; all start out false.
    NonEmptyLevels = new bool[info.NgramLength];
}
/// <summary>
/// Builds a bag of ngram counts from a text column. An ngram is a sequence of consecutive
/// words; a dictionary of ngrams is built and each ngram's id in that dictionary is used as
/// its index in the bag.
/// </summary>
/// <param name="input">The column to apply to.</param>
/// <param name="ngramLength">Ngram length.</param>
/// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
/// <param name="allLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
/// <param name="maxNumTerms">Maximum number of ngrams to store in the dictionary.</param>
/// <param name="weighting">Statistical measure used to evaluate how important a word is to a document in a corpus.</param>
/// <returns>A pipeline column wrapping the requested bag-of-words configuration.</returns>
public static Vector<float> ToBagofWords(
    this Scalar<string> input,
    int ngramLength = 1,
    int skipLength = 0,
    bool allLengths = true,
    int maxNumTerms = 10000000,
    NgramCountingEstimator.WeightingCriteria weighting = NgramCountingEstimator.WeightingCriteria.Tf)
{
    // All arguments are forwarded unchanged; the column object carries the configuration.
    return new OutPipelineColumn(input, ngramLength, skipLength, allLengths, maxNumTerms, weighting);
}
/// <summary>
/// Describes how the transformer handles one input/output column pair.
/// The single <paramref name="maxNumTerms"/> value is wrapped in a one-element array and
/// passed to the constructor overload that accepts per-length limits.
/// </summary>
/// <param name="input">Name of input column.</param>
/// <param name="output">Name of output column.</param>
/// <param name="ngramLength">Maximum ngram length.</param>
/// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
/// <param name="allLengths">Whether to store all ngram lengths up to <paramref name="ngramLength"/>, or only <paramref name="ngramLength"/>.</param>
/// <param name="weighting">The weighting criteria.</param>
/// <param name="maxNumTerms">Maximum number of ngrams to store in the dictionary.</param>
public ColumnInfo(
    string input,
    string output,
    int ngramLength = NgramCountingEstimator.Defaults.NgramLength,
    int skipLength = NgramCountingEstimator.Defaults.SkipLength,
    bool allLengths = NgramCountingEstimator.Defaults.AllLength,
    NgramCountingEstimator.WeightingCriteria weighting = NgramCountingEstimator.Defaults.Weighting,
    int maxNumTerms = NgramCountingEstimator.Defaults.MaxNumTerms)
    : this(input, output, ngramLength, skipLength, allLengths, weighting, new[] { maxNumTerms })
{
}
/// <summary>
/// Captures the ngram-counting settings in private fields for later use by this reconciler.
/// </summary>
/// <param name="ngramLength">Ngram length.</param>
/// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
/// <param name="allLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
/// <param name="maxNumTerms">Maximum number of ngrams to store in the dictionary.</param>
/// <param name="weighting">The weighting criteria.</param>
public Reconciler(
    int ngramLength,
    int skipLength,
    bool allLengths,
    int maxNumTerms,
    NgramCountingEstimator.WeightingCriteria weighting)
{
    // Plain capture of the arguments; no validation happens here.
    _weighting = weighting;
    _maxNumTerms = maxNumTerms;
    _allLengths = allLengths;
    _skipLength = skipLength;
    _ngramLength = ngramLength;
}
/// <summary>
/// Creates the output column: a new <see cref="Reconciler"/> holding the ngram settings is
/// handed to the base constructor together with <paramref name="input"/>, and the input
/// column is also kept in <c>Input</c>.
/// </summary>
/// <param name="input">The column this output is derived from.</param>
/// <param name="ngramLength">Ngram length.</param>
/// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
/// <param name="allLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
/// <param name="maxNumTerms">Maximum number of ngrams to store in the dictionary.</param>
/// <param name="weighting">The weighting criteria.</param>
public OutPipelineColumn(
    PipelineColumn input,
    int ngramLength,
    int skipLength,
    bool allLengths,
    int maxNumTerms,
    NgramCountingEstimator.WeightingCriteria weighting)
    : base(
        new Reconciler(ngramLength, skipLength, allLengths, maxNumTerms, weighting),
        input)
{
    Input = input;
}
/// <summary>
/// Reads the transform settings from a serialized model, validating each value as it is read.
/// </summary>
/// <param name="ctx">Load context whose reader supplies the serialized values.</param>
/// <param name="readWeighting">
/// Whether the stream contains a weighting criteria value. When false, nothing is read for it
/// and <c>Weighting</c> is left as it was before this constructor body ran.
/// </param>
public TransformInfo(ModelLoadContext ctx, bool readWeighting)
{
    Contracts.AssertValue(ctx);

    // *** Binary format ***
    // int: NgramLength
    // int: SkipLength
    // int: Weighting Criteria (if readWeighting == true)
    // bool[NgramLength]: NonEmptyLevels

    var reader = ctx.Reader;
    var maxLength = NgramBufferBuilder.MaxSkipNgramLength;

    // Each value is validated immediately after it is read, before the next read happens.
    NgramLength = reader.ReadInt32();
    Contracts.CheckDecode(NgramLength > 0 && NgramLength <= maxLength);

    SkipLength = reader.ReadInt32();
    Contracts.CheckDecode(SkipLength >= 0 && SkipLength <= maxLength);

    // Combined bound, written as a subtraction rather than a sum of the two lengths.
    Contracts.CheckDecode(NgramLength <= maxLength - SkipLength);

    if (readWeighting)
    {
        Weighting = (NgramCountingEstimator.WeightingCriteria)reader.ReadInt32();
    }

    // Validated whether or not a value was read from the stream.
    Contracts.CheckDecode(Enum.IsDefined(typeof(NgramCountingEstimator.WeightingCriteria), Weighting));

    NonEmptyLevels = reader.ReadBoolArray(NgramLength);
}
/// <summary>
/// Builds a column description, validating the ngram parameters and expanding
/// <paramref name="maxNumTerms"/> into one dictionary-size limit per ngram length.
/// </summary>
/// <param name="input">Name of input column.</param>
/// <param name="output">Name of output column.</param>
/// <param name="ngramLength">
/// Maximum ngram length. Must be positive and at most <c>NgramBufferBuilder.MaxSkipNgramLength</c>.
/// </param>
/// <param name="skipLength">
/// Maximum number of tokens to skip when constructing an ngram. Must be non-negative, and
/// together with <paramref name="ngramLength"/> must not exceed
/// <c>NgramBufferBuilder.MaxSkipNgramLength</c>.
/// </param>
/// <param name="allLengths">Whether to store all ngram lengths up to <paramref name="ngramLength"/>, or only <paramref name="ngramLength"/>.</param>
/// <param name="weighting">The weighting criteria.</param>
/// <param name="maxNumTerms">
/// Dictionary-size limits. When <paramref name="allLengths"/> is false, this must have size zero
/// or hold exactly one positive value, which becomes the limit for ngrams of length
/// <paramref name="ngramLength"/> (all shorter lengths get a limit of zero). When
/// <paramref name="allLengths"/> is true, it may hold up to <paramref name="ngramLength"/>
/// non-negative values whose last entry is positive; lengths without an explicit entry reuse the
/// last entry. In both modes a size of zero selects <c>NgramCountingEstimator.Defaults.MaxNumTerms</c>.
/// </param>
internal ColumnInfo(string input, string output, int ngramLength, int skipLength, bool allLengths, NgramCountingEstimator.WeightingCriteria weighting, int[] maxNumTerms)
{
    Input = input;
    Output = output;

    NgramLength = ngramLength;
    Contracts.CheckUserArg(0 < NgramLength && NgramLength <= NgramBufferBuilder.MaxSkipNgramLength, nameof(ngramLength));

    // Reject negative skip lengths up front; the model-loading constructor of TransformInfo
    // enforces the same lower bound on the value it reads.
    Contracts.CheckUserArg(skipLength >= 0, nameof(skipLength));
    SkipLength = skipLength;

    // Compare by subtraction rather than NgramLength + SkipLength: the sum can overflow int for a
    // very large skipLength and wrap to a negative number, which would slip past the limit.
    if (NgramLength > NgramBufferBuilder.MaxSkipNgramLength - SkipLength)
    {
        throw Contracts.ExceptUserArg(nameof(skipLength),
            $"The sum of skipLength and ngramLength must be less than or equal to {NgramBufferBuilder.MaxSkipNgramLength}");
    }

    AllLengths = allLengths;
    Weighting = weighting;

    int suppliedCount = Utils.Size(maxNumTerms);
    int[] limits;
    if (!AllLengths)
    {
        // Only ngrams of exactly ngramLength are kept, so at most one limit makes sense.
        Contracts.CheckUserArg(suppliedCount == 0 || suppliedCount == 1 && maxNumTerms[0] > 0, nameof(maxNumTerms));

        // Every shorter length keeps the array's initial limit of zero.
        limits = new int[ngramLength];
        limits[ngramLength - 1] = suppliedCount == 0 ? NgramCountingEstimator.Defaults.MaxNumTerms : maxNumTerms[0];
    }
    else
    {
        Contracts.CheckUserArg(suppliedCount <= ngramLength, nameof(maxNumTerms));
        Contracts.CheckUserArg(suppliedCount == 0 || maxNumTerms.All(i => i >= 0) && maxNumTerms[maxNumTerms.Length - 1] > 0, nameof(maxNumTerms));

        // Lengths beyond the supplied entries reuse the last supplied limit (or the default).
        var extend = suppliedCount == 0 ? NgramCountingEstimator.Defaults.MaxNumTerms : maxNumTerms[maxNumTerms.Length - 1];
        limits = Utils.BuildArray(ngramLength, i => i < suppliedCount ? maxNumTerms[i] : extend);
    }

    Limits = ImmutableArray.Create(limits);
}