/// <summary>
 /// Builds a bag of ngram counts from text that has already been tokenized, where an ngram is a
 /// sequence of consecutive words. Every ngram is entered into a dictionary, and the id it receives
 /// there is used as its index in the bag.
 ///
 /// Unlike <see cref="WordBagEstimatorStaticExtensions.ToBagofWords"/>, which tokenizes text internally,
 /// <see cref="ToNgrams"/> takes tokenized text as its input.
 /// </summary>
 /// <param name="input">The column to apply to.</param>
 /// <param name="ngramLength">Ngram length.</param>
 /// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
 /// <param name="allLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
 /// <param name="maxNumTerms">Maximum number of ngrams to store in the dictionary.</param>
 /// <param name="weighting">Statistical measure used to evaluate how important a word is to a document in a corpus.</param>
 /// <returns>A new output pipeline column constructed from <paramref name="input"/> and the supplied settings.</returns>
 public static Vector<float> ToNgrams<TKey>(
     this VarVector<Key<TKey, string>> input,
     int ngramLength = 1,
     int skipLength = 0,
     bool allLengths = true,
     int maxNumTerms = 10000000,
     NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf)
 {
     return new OutPipelineColumn(input, ngramLength, skipLength, allLengths, maxNumTerms, weighting);
 }
 /// <summary>
 /// Initializes the transform info from the supplied column options.
 /// </summary>
 /// <param name="info">Options that supply the ngram length, skip length and weighting.</param>
 public TransformInfo(NgramExtractingEstimator.ColumnOptions info)
 {
     var length = info.NgramLength;

     NgramLength = length;
     // One flag per ngram length; every flag starts out false.
     NonEmptyLevels = new bool[length];

     SkipLength = info.SkipLength;
     Weighting = info.Weighting;
 }
Ejemplo n.º 3
0
 /// <summary>
 /// Initializes the transform info from the supplied column description.
 /// </summary>
 /// <param name="info">Column description that supplies the ngram length, skip length and weighting.</param>
 public TransformInfo(ColumnInfo info)
 {
     var length = info.NgramLength;

     NgramLength = length;
     // One flag per ngram length; every flag starts out false.
     NonEmptyLevels = new bool[length];

     SkipLength = info.SkipLength;
     Weighting = info.Weighting;
 }
 /// <summary>
 /// Builds a bag of ngram counts from a given text, where an ngram is a sequence of consecutive words.
 /// Every ngram is entered into a dictionary, and the id it receives there is used as its index in the bag.
 /// </summary>
 /// <param name="input">The column to apply to.</param>
 /// <param name="ngramLength">Ngram length.</param>
 /// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
 /// <param name="allLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
 /// <param name="maxNumTerms">Maximum number of ngrams to store in the dictionary.</param>
 /// <param name="weighting">Statistical measure used to evaluate how important a word is to a document in a corpus.</param>
 /// <returns>A new output pipeline column constructed from <paramref name="input"/> and the supplied settings.</returns>
 public static Vector<float> ToBagofWords(
     this Scalar<string> input,
     int ngramLength = 1,
     int skipLength = 0,
     bool allLengths = true,
     int maxNumTerms = 10000000,
     NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf)
 {
     return new OutPipelineColumn(input, ngramLength, skipLength, allLengths, maxNumTerms, weighting);
 }
Ejemplo n.º 5
0
 /// <summary>
 /// Builds a bag of n-gram counts from text that has already been tokenized, where an n-gram is a
 /// sequence of consecutive words. Every n-gram is entered into a dictionary, and the id it receives
 /// there is used as its index in the bag.
 ///
 /// Unlike <see cref="WordBagEstimatorStaticExtensions.ProduceWordBags"/>, which tokenizes text internally,
 /// <see cref="ProduceNgrams"/> takes tokenized text as its input.
 /// </summary>
 /// <param name="input">The column to apply to.</param>
 /// <param name="ngramLength">Ngram length.</param>
 /// <param name="skipLength">Maximum number of tokens to skip when constructing an n-gram.</param>
 /// <param name="useAllLengths">Whether to include all n-gram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
 /// <param name="maximumNgramsCount">Maximum number of n-grams to store in the dictionary.</param>
 /// <param name="weighting">Statistical measure used to evaluate how important a word is to a document in a corpus.</param>
 /// <returns>A new output pipeline column constructed from <paramref name="input"/> and the supplied settings.</returns>
 public static Vector<float> ProduceNgrams<TKey>(
     this VarVector<Key<TKey, string>> input,
     int ngramLength = 1,
     int skipLength = 0,
     bool useAllLengths = true,
     int maximumNgramsCount = 10000000,
     NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf)
 {
     return new OutPipelineColumn(input, ngramLength, skipLength, useAllLengths, maximumNgramsCount, weighting);
 }
 /// <summary>
 /// Creates a reconciler that stores the supplied ngram extraction settings.
 /// </summary>
 /// <param name="ngramLength">Ngram length.</param>
 /// <param name="skipLength">Skip length to store.</param>
 /// <param name="allLengths">All-lengths flag to store.</param>
 /// <param name="maxNumTerms">Maximum number of terms to store.</param>
 /// <param name="weighting">Weighting criteria to store.</param>
 public Reconciler(
     int ngramLength,
     int skipLength,
     bool allLengths,
     int maxNumTerms,
     NgramExtractingEstimator.WeightingCriteria weighting)
 {
     // Plain capture of the arguments; no validation happens here.
     _weighting = weighting;
     _maxNumTerms = maxNumTerms;
     _allLengths = allLengths;
     _skipLength = skipLength;
     _ngramLength = ngramLength;
 }
Ejemplo n.º 7
0
 /// <summary>
 /// Creates an options instance populated with default values.
 /// </summary>
 public Options()
 {
     // The ngram length is fixed at 1 here rather than taken from NgramExtractingEstimator.Defaults.
     NgramLength = 1;

     // Everything else comes from the shared ngram extraction defaults.
     Weighting = NgramExtractingEstimator.Defaults.Weighting;
     UseAllLengths = NgramExtractingEstimator.Defaults.UseAllLengths;
     SkipLength = NgramExtractingEstimator.Defaults.SkipLength;

     // A single-element array holding the default maximum count.
     MaximumNgramsCount = new int[]
     {
         NgramExtractingEstimator.Defaults.MaximumNgramsCount
     };
 }
Ejemplo n.º 8
0
 /// <summary>
 /// Describes how the transformer handles one column pair.
 /// Forwards to the overload that takes an array of limits, wrapping
 /// <paramref name="maxNumTerms"/> in a single-element array.
 /// </summary>
 /// <param name="input">Name of input column.</param>
 /// <param name="output">Name of output column.</param>
 /// <param name="ngramLength">Maximum ngram length.</param>
 /// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
 /// <param name="allLengths">Whether to store all ngram lengths up to <paramref name="ngramLength"/>, or only <paramref name="ngramLength"/>.</param>
 /// <param name="weighting">The weighting criteria.</param>
 /// <param name="maxNumTerms">Maximum number of ngrams to store in the dictionary.</param>
 public ColumnInfo(
     string input,
     string output,
     int ngramLength = NgramExtractingEstimator.Defaults.NgramLength,
     int skipLength = NgramExtractingEstimator.Defaults.SkipLength,
     bool allLengths = NgramExtractingEstimator.Defaults.AllLengths,
     NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.Defaults.Weighting,
     int maxNumTerms = NgramExtractingEstimator.Defaults.MaxNumTerms)
     : this(input, output, ngramLength, skipLength, allLengths, weighting, new[] { maxNumTerms })
 {
 }
Ejemplo n.º 9
0
 /// <summary>
 /// Describes how the transformer handles one column pair.
 /// Forwards to the overload that takes an array of limits, wrapping
 /// <paramref name="maxNumTerms"/> in a single-element array.
 /// </summary>
 /// <param name="name">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
 /// <param name="inputColumnName">Name of column to transform. If set to <see langword="null"/>, the value of the <paramref name="name"/> will be used as source.</param>
 /// <param name="ngramLength">Maximum ngram length.</param>
 /// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
 /// <param name="allLengths">Whether to store all ngram lengths up to <paramref name="ngramLength"/>, or only <paramref name="ngramLength"/>.</param>
 /// <param name="weighting">The weighting criteria.</param>
 /// <param name="maxNumTerms">Maximum number of ngrams to store in the dictionary.</param>
 public ColumnInfo(
     string name,
     string inputColumnName = null,
     int ngramLength = NgramExtractingEstimator.Defaults.NgramLength,
     int skipLength = NgramExtractingEstimator.Defaults.SkipLength,
     bool allLengths = NgramExtractingEstimator.Defaults.AllLengths,
     NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.Defaults.Weighting,
     int maxNumTerms = NgramExtractingEstimator.Defaults.MaxNumTerms)
     : this(
         name,
         ngramLength,
         skipLength,
         allLengths,
         weighting,
         new[] { maxNumTerms },
         inputColumnName ?? name) // fall back to the output name when no input name is given
 {
 }
Ejemplo n.º 10
0
 /// <summary>
 /// Creates an output column for <paramref name="input"/>, passing a new <see cref="Reconciler"/>
 /// built from the supplied settings to the base constructor.
 /// </summary>
 /// <param name="input">The input pipeline column; also stored in <c>Input</c>.</param>
 /// <param name="ngramLength">Ngram length passed to the reconciler.</param>
 /// <param name="skipLength">Skip length passed to the reconciler.</param>
 /// <param name="allLengths">All-lengths flag passed to the reconciler.</param>
 /// <param name="maxNumTerms">Maximum number of terms passed to the reconciler.</param>
 /// <param name="weighting">Weighting criteria passed to the reconciler.</param>
 public OutPipelineColumn(
     PipelineColumn input,
     int ngramLength,
     int skipLength,
     bool allLengths,
     int maxNumTerms,
     NgramExtractingEstimator.WeightingCriteria weighting)
     : base(
         new Reconciler(ngramLength, skipLength, allLengths, maxNumTerms, weighting),
         input)
 {
     Input = input;
 }
            /// <summary>
            /// Reads the transform info from the binary reader of <paramref name="ctx"/>, validating
            /// each value as it is decoded.
            /// </summary>
            /// <param name="ctx">Model load context whose reader supplies the serialized values.</param>
            /// <param name="readWeighting">
            /// Whether the stream contains a weighting criteria value. When <see langword="false"/>,
            /// nothing is read for it and the current value of <c>Weighting</c> is validated as is.
            /// </param>
            public TransformInfo(ModelLoadContext ctx, bool readWeighting)
            {
                Contracts.AssertValue(ctx);

                // *** Binary format ***
                // int: NgramLength
                // int: SkipLength
                // int: Weighting Criteria (if readWeighting == true)
                // bool[NgramLength]: NonEmptyLevels

                var reader = ctx.Reader;

                int ngramLength = reader.ReadInt32();
                Contracts.CheckDecode(ngramLength > 0 && ngramLength <= NgramBufferBuilder.MaxSkipNgramLength);
                NgramLength = ngramLength;

                int skipLength = reader.ReadInt32();
                Contracts.CheckDecode(skipLength >= 0 && skipLength <= NgramBufferBuilder.MaxSkipNgramLength);
                // The ngram length and the skip length together must fit within the maximum.
                Contracts.CheckDecode(ngramLength <= NgramBufferBuilder.MaxSkipNgramLength - skipLength);
                SkipLength = skipLength;

                if (readWeighting)
                {
                    Weighting = (NgramExtractingEstimator.WeightingCriteria)reader.ReadInt32();
                }

                // Validated in both cases, whether or not a value was just read.
                Contracts.CheckDecode(Enum.IsDefined(typeof(NgramExtractingEstimator.WeightingCriteria), Weighting));

                NonEmptyLevels = reader.ReadBoolArray(ngramLength);
            }
Ejemplo n.º 12
0
            /// <summary>
            /// Describes how the transformer handles one column pair, with an ngram limit for each ngram length.
            /// </summary>
            /// <param name="name">Name of the output column.</param>
            /// <param name="ngramLength">Ngram length; must be positive and at most <c>NgramBufferBuilder.MaxSkipNgramLength</c>.</param>
            /// <param name="skipLength">
            /// Skip length; must be non-negative, and together with <paramref name="ngramLength"/> must not
            /// exceed <c>NgramBufferBuilder.MaxSkipNgramLength</c>.
            /// </param>
            /// <param name="allLengths">Whether limits apply to every ngram length up to <paramref name="ngramLength"/>, or only to <paramref name="ngramLength"/> itself.</param>
            /// <param name="weighting">The weighting criteria.</param>
            /// <param name="maxNumTerms">
            /// Limits per ngram length. May be null or empty, in which case the default limit is used.
            /// When <paramref name="allLengths"/> is false, at most one positive value is allowed.
            /// When it is true, at most <paramref name="ngramLength"/> non-negative values are allowed,
            /// the last of which must be positive and is repeated for any remaining lengths.
            /// </param>
            /// <param name="inputColumnName">Name of the input column; <paramref name="name"/> is used when this is null.</param>
            internal ColumnInfo(string name,
                                int ngramLength,
                                int skipLength,
                                bool allLengths,
                                NgramExtractingEstimator.WeightingCriteria weighting,
                                int[] maxNumTerms,
                                string inputColumnName = null)
            {
                Name            = name;
                InputColumnName = inputColumnName ?? name;

                Contracts.CheckUserArg(0 < ngramLength && ngramLength <= NgramBufferBuilder.MaxSkipNgramLength, nameof(ngramLength));
                NgramLength = ngramLength;

                // A negative skip length is rejected when a model is loaded, so reject it at construction too.
                Contracts.CheckUserArg(skipLength >= 0, nameof(skipLength));

                // Compare against the remaining budget rather than summing the two values:
                // ngramLength + skipLength can overflow for a very large skipLength and slip past the check.
                if (skipLength > NgramBufferBuilder.MaxSkipNgramLength - ngramLength)
                {
                    throw Contracts.ExceptUserArg(nameof(skipLength),
                                                  $"The sum of skipLength and ngramLength must be less than or equal to {NgramBufferBuilder.MaxSkipNgramLength}");
                }
                SkipLength = skipLength;

                AllLengths = allLengths;
                Weighting  = weighting;

                // Utils.Size is also used on a possibly-null array in the checks below.
                int supplied = Utils.Size(maxNumTerms);
                int[] limits;

                if (!allLengths)
                {
                    Contracts.CheckUserArg(supplied == 0 || (supplied == 1 && maxNumTerms[0] > 0), nameof(maxNumTerms));

                    // Only the slot for the full ngram length gets a limit; the other slots stay at zero.
                    limits = new int[ngramLength];
                    limits[ngramLength - 1] = supplied == 0 ? NgramExtractingEstimator.Defaults.MaxNumTerms : maxNumTerms[0];
                }
                else
                {
                    Contracts.CheckUserArg(supplied <= ngramLength, nameof(maxNumTerms));
                    Contracts.CheckUserArg(supplied == 0 || (maxNumTerms.All(i => i >= 0) && maxNumTerms[supplied - 1] > 0), nameof(maxNumTerms));

                    // Lengths without an explicit limit reuse the last supplied limit, or the default if none was given.
                    var extend = supplied == 0 ? NgramExtractingEstimator.Defaults.MaxNumTerms : maxNumTerms[supplied - 1];
                    limits = Utils.BuildArray(ngramLength, i => i < supplied ? maxNumTerms[i] : extend);
                }

                Limits = ImmutableArray.Create(limits);
            }