/// <summary>
/// Turns already-tokenized text into a bag of ngram counts, an ngram being a sequence of consecutive words.
/// A dictionary of ngrams is built, and each ngram's id in that dictionary is its index in the bag.
///
/// Unlike <see cref="WordBagEstimatorStaticExtensions.ToBagofWords"/>, which tokenizes text internally,
/// <see cref="ToNgrams"/> expects its input to be tokenized already.
/// </summary>
/// <param name="input">The column to apply to.</param>
/// <param name="ngramLength">Ngram length.</param>
/// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
/// <param name="allLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
/// <param name="maxNumTerms">Maximum number of ngrams to store in the dictionary.</param>
/// <param name="weighting">Statistical measure used to evaluate how important a word is to a document in a corpus.</param>
/// <returns>A new <c>OutPipelineColumn</c> wrapping <paramref name="input"/> and the supplied settings.</returns>
public static Vector<float> ToNgrams<TKey>(
    this VarVector<Key<TKey, string>> input,
    int ngramLength = 1,
    int skipLength = 0,
    bool allLengths = true,
    int maxNumTerms = 10000000,
    NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf)
{
    return new OutPipelineColumn(input, ngramLength, skipLength, allLengths, maxNumTerms, weighting);
}
/// <summary>
/// Initializes the transform info by copying the ngram settings out of the given column options.
/// </summary>
/// <param name="info">Column options supplying the ngram length, skip length and weighting.</param>
public TransformInfo(NgramExtractingEstimator.ColumnOptions info)
{
    Weighting = info.Weighting;
    SkipLength = info.SkipLength;
    NgramLength = info.NgramLength;

    // One entry per ngram length up to NgramLength; a fresh bool[] starts out all false.
    NonEmptyLevels = new bool[NgramLength];
}
/// <summary>
/// Initializes the transform info by copying the ngram settings out of the given column description.
/// </summary>
/// <param name="info">Column description supplying the ngram length, skip length and weighting.</param>
public TransformInfo(ColumnInfo info)
{
    Weighting = info.Weighting;
    SkipLength = info.SkipLength;
    NgramLength = info.NgramLength;

    // One entry per ngram length up to NgramLength; a fresh bool[] starts out all false.
    NonEmptyLevels = new bool[NgramLength];
}
/// <summary>
/// Turns a text column into a bag of ngram counts, an ngram being a sequence of consecutive words.
/// A dictionary of ngrams is built, and each ngram's id in that dictionary is its index in the bag.
/// </summary>
/// <param name="input">The column to apply to.</param>
/// <param name="ngramLength">Ngram length.</param>
/// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
/// <param name="allLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
/// <param name="maxNumTerms">Maximum number of ngrams to store in the dictionary.</param>
/// <param name="weighting">Statistical measure used to evaluate how important a word is to a document in a corpus.</param>
/// <returns>A new <c>OutPipelineColumn</c> wrapping <paramref name="input"/> and the supplied settings.</returns>
public static Vector<float> ToBagofWords(
    this Scalar<string> input,
    int ngramLength = 1,
    int skipLength = 0,
    bool allLengths = true,
    int maxNumTerms = 10000000,
    NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf)
{
    return new OutPipelineColumn(input, ngramLength, skipLength, allLengths, maxNumTerms, weighting);
}
/// <summary>
/// Turns already-tokenized text into a bag of n-gram counts, an n-gram being a sequence of consecutive words.
/// A dictionary of n-grams is built, and each n-gram's id in that dictionary is its index in the bag.
///
/// Unlike <see cref="WordBagEstimatorStaticExtensions.ProduceWordBags"/>, which tokenizes text internally,
/// <see cref="ProduceNgrams"/> expects its input to be tokenized already.
/// </summary>
/// <param name="input">The column to apply to.</param>
/// <param name="ngramLength">Ngram length.</param>
/// <param name="skipLength">Maximum number of tokens to skip when constructing an n-gram.</param>
/// <param name="useAllLengths">Whether to include all n-gram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
/// <param name="maximumNgramsCount">Maximum number of n-grams to store in the dictionary.</param>
/// <param name="weighting">Statistical measure used to evaluate how important a word is to a document in a corpus.</param>
/// <returns>A new <c>OutPipelineColumn</c> wrapping <paramref name="input"/> and the supplied settings.</returns>
public static Vector<float> ProduceNgrams<TKey>(
    this VarVector<Key<TKey, string>> input,
    int ngramLength = 1,
    int skipLength = 0,
    bool useAllLengths = true,
    int maximumNgramsCount = 10000000,
    NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf)
{
    return new OutPipelineColumn(input, ngramLength, skipLength, useAllLengths, maximumNgramsCount, weighting);
}
/// <summary>
/// Captures the ngram extraction settings in the reconciler's private fields.
/// </summary>
/// <param name="ngramLength">Ngram length.</param>
/// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
/// <param name="allLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
/// <param name="maxNumTerms">Maximum number of ngrams to store in the dictionary.</param>
/// <param name="weighting">The weighting criteria.</param>
public Reconciler(
    int ngramLength,
    int skipLength,
    bool allLengths,
    int maxNumTerms,
    NgramExtractingEstimator.WeightingCriteria weighting)
{
    _weighting = weighting;
    _maxNumTerms = maxNumTerms;
    _allLengths = allLengths;
    _skipLength = skipLength;
    _ngramLength = ngramLength;
}
/// <summary>
/// Creates the options with their default values: an ngram length of 1, and every other setting
/// taken from <c>NgramExtractingEstimator.Defaults</c>.
/// </summary>
public Options()
{
    // The ngram length is fixed at 1 here rather than read from the shared defaults.
    NgramLength = 1;

    Weighting = NgramExtractingEstimator.Defaults.Weighting;
    UseAllLengths = NgramExtractingEstimator.Defaults.UseAllLengths;
    SkipLength = NgramExtractingEstimator.Defaults.SkipLength;

    // A single-element array holding the default dictionary size.
    MaximumNgramsCount = new int[]
    {
        NgramExtractingEstimator.Defaults.MaximumNgramsCount
    };
}
/// <summary>
/// Describes how the transformer handles one column pair.
/// </summary>
/// <param name="input">Name of input column.</param>
/// <param name="output">Name of output column.</param>
/// <param name="ngramLength">Maximum ngram length.</param>
/// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
/// <param name="allLengths">Whether to store all ngram lengths up to <paramref name="ngramLength"/>, or only <paramref name="ngramLength"/>.</param>
/// <param name="weighting">The weighting criteria.</param>
/// <param name="maxNumTerms">Maximum number of ngrams to store in the dictionary.</param>
public ColumnInfo(
    string input,
    string output,
    int ngramLength = NgramExtractingEstimator.Defaults.NgramLength,
    int skipLength = NgramExtractingEstimator.Defaults.SkipLength,
    bool allLengths = NgramExtractingEstimator.Defaults.AllLengths,
    NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.Defaults.Weighting,
    int maxNumTerms = NgramExtractingEstimator.Defaults.MaxNumTerms)
    // The single limit is forwarded as a one-element array to the array-based overload.
    : this(input, output, ngramLength, skipLength, allLengths, weighting, new[] { maxNumTerms })
{
}
/// <summary>
/// Describes how the transformer handles one column pair.
/// </summary>
/// <param name="name">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
/// <param name="inputColumnName">Name of column to transform. If set to <see langword="null"/>, the value of the <paramref name="name"/> will be used as source.</param>
/// <param name="ngramLength">Maximum ngram length.</param>
/// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
/// <param name="allLengths">Whether to store all ngram lengths up to <paramref name="ngramLength"/>, or only <paramref name="ngramLength"/>.</param>
/// <param name="weighting">The weighting criteria.</param>
/// <param name="maxNumTerms">Maximum number of ngrams to store in the dictionary.</param>
public ColumnInfo(
    string name,
    string inputColumnName = null,
    int ngramLength = NgramExtractingEstimator.Defaults.NgramLength,
    int skipLength = NgramExtractingEstimator.Defaults.SkipLength,
    bool allLengths = NgramExtractingEstimator.Defaults.AllLengths,
    NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.Defaults.Weighting,
    int maxNumTerms = NgramExtractingEstimator.Defaults.MaxNumTerms)
    // The single limit is forwarded as a one-element array; a null input column name falls back to the output name.
    : this(name, ngramLength, skipLength, allLengths, weighting, new[] { maxNumTerms }, inputColumnName ?? name)
{
}
/// <summary>
/// Creates the output column, handing the base class a new <c>Reconciler</c> built from the ngram
/// settings together with the input column it depends on.
/// </summary>
/// <param name="input">The column to apply to; also stored in <c>Input</c>.</param>
/// <param name="ngramLength">Ngram length.</param>
/// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
/// <param name="allLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
/// <param name="maxNumTerms">Maximum number of ngrams to store in the dictionary.</param>
/// <param name="weighting">The weighting criteria.</param>
public OutPipelineColumn(
    PipelineColumn input,
    int ngramLength,
    int skipLength,
    bool allLengths,
    int maxNumTerms,
    NgramExtractingEstimator.WeightingCriteria weighting)
    : base(new Reconciler(ngramLength, skipLength, allLengths, maxNumTerms, weighting), input)
{
    this.Input = input;
}
/// <summary>
/// Reads the transform info back from a model load context, validating each value as it is decoded.
/// </summary>
/// <param name="ctx">The load context whose reader supplies the serialized values.</param>
/// <param name="readWeighting">
/// Whether a weighting criteria value is present in the stream. When <see langword="false"/>,
/// nothing is read for it and <c>Weighting</c> is left untouched by this constructor.
/// </param>
public TransformInfo(ModelLoadContext ctx, bool readWeighting)
{
    Contracts.AssertValue(ctx);

    // *** Binary format ***
    // int: NgramLength
    // int: SkipLength
    // int: Weighting Criteria (if readWeighting == true)
    // bool[NgramLength]: NonEmptyLevels

    var reader = ctx.Reader;
    var maxLength = NgramBufferBuilder.MaxSkipNgramLength;

    NgramLength = reader.ReadInt32();
    Contracts.CheckDecode(NgramLength > 0 && NgramLength <= maxLength);

    SkipLength = reader.ReadInt32();
    Contracts.CheckDecode(SkipLength >= 0 && SkipLength <= maxLength);

    // Together the ngram length and the skip length must also fit within the maximum.
    Contracts.CheckDecode(NgramLength <= maxLength - SkipLength);

    if (readWeighting)
    {
        Weighting = (NgramExtractingEstimator.WeightingCriteria)reader.ReadInt32();
    }

    // Validated whether or not a value was just read from the stream.
    Contracts.CheckDecode(Enum.IsDefined(typeof(NgramExtractingEstimator.WeightingCriteria), Weighting));

    NonEmptyLevels = reader.ReadBoolArray(NgramLength);
}
/// <summary>
/// Describes how the transformer handles one column pair, validating the ngram settings and
/// computing the per-length dictionary limits stored in <c>Limits</c>.
/// </summary>
/// <param name="name">Name of the output column.</param>
/// <param name="ngramLength">Maximum ngram length. Must be positive and no greater than <c>NgramBufferBuilder.MaxSkipNgramLength</c>.</param>
/// <param name="skipLength">
/// Maximum number of tokens to skip when constructing an ngram. Must be non-negative, and its sum with
/// <paramref name="ngramLength"/> must not exceed <c>NgramBufferBuilder.MaxSkipNgramLength</c>.
/// </param>
/// <param name="allLengths">Whether to store all ngram lengths up to <paramref name="ngramLength"/>, or only <paramref name="ngramLength"/>.</param>
/// <param name="weighting">The weighting criteria.</param>
/// <param name="maxNumTerms">
/// Dictionary size limits. May be null or empty, in which case <c>NgramExtractingEstimator.Defaults.MaxNumTerms</c> is used.
/// When <paramref name="allLengths"/> is <see langword="false"/>, at most one positive value is accepted and it applies to
/// <paramref name="ngramLength"/> only. Otherwise up to <paramref name="ngramLength"/> non-negative values are accepted,
/// the last of which must be positive and is reused for any remaining lengths.
/// </param>
/// <param name="inputColumnName">Name of the column to transform. If <see langword="null"/>, <paramref name="name"/> is used.</param>
internal ColumnInfo(string name, int ngramLength, int skipLength, bool allLengths, NgramExtractingEstimator.WeightingCriteria weighting, int[] maxNumTerms, string inputColumnName = null)
{
    Name = name;
    InputColumnName = inputColumnName ?? name;

    NgramLength = ngramLength;
    Contracts.CheckUserArg(0 < NgramLength && NgramLength <= NgramBufferBuilder.MaxSkipNgramLength, nameof(ngramLength));

    SkipLength = skipLength;
    // The model-load path rejects a negative skip length, so reject it at construction too.
    if (SkipLength < 0)
    {
        throw Contracts.ExceptUserArg(nameof(skipLength), "skipLength must be greater than or equal to zero");
    }
    // Compare against the remaining headroom rather than summing the two values: NgramLength is
    // already known to lie in (0, MaxSkipNgramLength], so the subtraction cannot overflow, whereas
    // NgramLength + SkipLength could wrap to a negative number for a very large skipLength.
    if (SkipLength > NgramBufferBuilder.MaxSkipNgramLength - NgramLength)
    {
        throw Contracts.ExceptUserArg(nameof(skipLength),
            $"The sum of skipLength and ngramLength must be less than or equal to {NgramBufferBuilder.MaxSkipNgramLength}");
    }

    AllLengths = allLengths;
    Weighting = weighting;

    int suppliedCount = Utils.Size(maxNumTerms);
    int[] limits;
    if (!AllLengths)
    {
        // Only the full ngram length is kept, so at most one (positive) limit makes sense.
        Contracts.CheckUserArg(suppliedCount == 0 || suppliedCount == 1 && maxNumTerms[0] > 0, nameof(maxNumTerms));

        // Every shorter length keeps a limit of zero.
        limits = new int[ngramLength];
        limits[ngramLength - 1] = suppliedCount == 0 ? NgramExtractingEstimator.Defaults.MaxNumTerms : maxNumTerms[0];
    }
    else
    {
        Contracts.CheckUserArg(suppliedCount <= ngramLength, nameof(maxNumTerms));
        Contracts.CheckUserArg(suppliedCount == 0 || maxNumTerms.All(i => i >= 0) && maxNumTerms[maxNumTerms.Length - 1] > 0, nameof(maxNumTerms));

        // Lengths without an explicit limit reuse the last supplied one (or the default when none was supplied).
        var extend = suppliedCount == 0 ? NgramExtractingEstimator.Defaults.MaxNumTerms : maxNumTerms[maxNumTerms.Length - 1];
        limits = Utils.BuildArray(ngramLength, i => i < suppliedCount ? maxNumTerms[i] : extend);
    }

    Limits = ImmutableArray.Create(limits);
}