예제 #1
0
 /// <summary>
 /// Produces a bag of ngram counts from text that has already been tokenized. An ngram is a
 /// sequence of consecutive words. A dictionary of ngrams is built, and the id of each ngram in
 /// that dictionary is used as its index in the bag.
 ///
 /// <see cref="ToNgrams"/> differs from <see cref="WordBagEstimatorExtensions.ToBagofWords"/> in that
 /// <see cref="ToNgrams"/> takes tokenized text as input, while
 /// <see cref="WordBagEstimatorExtensions.ToBagofWords"/> tokenizes the text internally.
 /// </summary>
 /// <param name="input">The column to apply to.</param>
 /// <param name="ngramLength">Ngram length.</param>
 /// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
 /// <param name="allLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
 /// <param name="maxNumTerms">Maximum number of ngrams to store in the dictionary.</param>
 /// <param name="weighting">Statistical measure used to evaluate how important a word is to a document in a corpus.</param>
 /// <returns>A new output pipeline column built from <paramref name="input"/> and the given settings.</returns>
 public static Vector<float> ToNgrams<TKey>(
     this VarVector<Key<TKey, string>> input,
     int ngramLength = 1,
     int skipLength = 0,
     bool allLengths = true,
     int maxNumTerms = 10000000,
     NgramCountingEstimator.WeightingCriteria weighting = NgramCountingEstimator.WeightingCriteria.Tf)
 {
     return new OutPipelineColumn(input, ngramLength, skipLength, allLengths, maxNumTerms, weighting);
 }
 /// <summary>
 /// Initializes the transform info from a column description by copying its ngram length,
 /// skip length and weighting, and allocating one non-empty-level flag per ngram length.
 /// </summary>
 /// <param name="info">Column description whose settings are copied.</param>
 public TransformInfo(ColumnInfo info)
 {
     NgramLength = info.NgramLength;
     SkipLength = info.SkipLength;
     Weighting = info.Weighting;

     // Sized by the ngram length just copied; every flag starts out false.
     NonEmptyLevels = new bool[NgramLength];
 }
예제 #3
0
 /// <summary>
 /// Produces a bag of ngram counts from a given text. An ngram is a sequence of consecutive words.
 /// A dictionary of ngrams is built, and the id of each ngram in that dictionary is used as its
 /// index in the bag.
 /// </summary>
 /// <param name="input">The column to apply to.</param>
 /// <param name="ngramLength">Ngram length.</param>
 /// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
 /// <param name="allLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
 /// <param name="maxNumTerms">Maximum number of ngrams to store in the dictionary.</param>
 /// <param name="weighting">Statistical measure used to evaluate how important a word is to a document in a corpus.</param>
 /// <returns>A new output pipeline column built from <paramref name="input"/> and the given settings.</returns>
 public static Vector<float> ToBagofWords(
     this Scalar<string> input,
     int ngramLength = 1,
     int skipLength = 0,
     bool allLengths = true,
     int maxNumTerms = 10000000,
     NgramCountingEstimator.WeightingCriteria weighting = NgramCountingEstimator.WeightingCriteria.Tf)
 {
     return new OutPipelineColumn(input, ngramLength, skipLength, allLengths, maxNumTerms, weighting);
 }
 /// <summary>
 /// Describes how the transformer handles one input/output column pair, using a single
 /// dictionary size limit. The limit is wrapped in a one-element array and forwarded to the
 /// constructor overload that accepts an array of limits.
 /// </summary>
 /// <param name="input">Name of input column.</param>
 /// <param name="output">Name of output column.</param>
 /// <param name="ngramLength">Maximum ngram length.</param>
 /// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
 /// <param name="allLengths">Whether to store all ngram lengths up to <paramref name="ngramLength"/>, or only <paramref name="ngramLength"/>.</param>
 /// <param name="weighting">The weighting criteria.</param>
 /// <param name="maxNumTerms">Maximum number of ngrams to store in the dictionary.</param>
 public ColumnInfo(
     string input,
     string output,
     int ngramLength = NgramCountingEstimator.Defaults.NgramLength,
     int skipLength = NgramCountingEstimator.Defaults.SkipLength,
     bool allLengths = NgramCountingEstimator.Defaults.AllLength,
     NgramCountingEstimator.WeightingCriteria weighting = NgramCountingEstimator.Defaults.Weighting,
     int maxNumTerms = NgramCountingEstimator.Defaults.MaxNumTerms)
     : this(input, output, ngramLength, skipLength, allLengths, weighting, new[] { maxNumTerms })
 {
 }
예제 #5
0
 /// <summary>
 /// Creates a reconciler that remembers the ngram settings it was given.
 /// Each argument is stored unchanged in the corresponding private field.
 /// </summary>
 /// <param name="ngramLength">Ngram length to store.</param>
 /// <param name="skipLength">Skip length to store.</param>
 /// <param name="allLengths">All-lengths flag to store.</param>
 /// <param name="maxNumTerms">Dictionary size limit to store.</param>
 /// <param name="weighting">Weighting criteria to store.</param>
 public Reconciler(
     int ngramLength,
     int skipLength,
     bool allLengths,
     int maxNumTerms,
     NgramCountingEstimator.WeightingCriteria weighting)
 {
     _ngramLength = ngramLength;
     _skipLength = skipLength;
     _allLengths = allLengths;
     _maxNumTerms = maxNumTerms;
     _weighting = weighting;
 }
예제 #6
0
 /// <summary>
 /// Creates the output column for an ngram pipeline step. A new <c>Reconciler</c> carrying the
 /// ngram settings is passed to the base constructor together with <paramref name="input"/>,
 /// and the input column is also kept in <c>Input</c>.
 /// </summary>
 /// <param name="input">The pipeline column this output is computed from.</param>
 /// <param name="ngramLength">Ngram length handed to the reconciler.</param>
 /// <param name="skipLength">Skip length handed to the reconciler.</param>
 /// <param name="allLengths">All-lengths flag handed to the reconciler.</param>
 /// <param name="maxNumTerms">Dictionary size limit handed to the reconciler.</param>
 /// <param name="weighting">Weighting criteria handed to the reconciler.</param>
 public OutPipelineColumn(
     PipelineColumn input,
     int ngramLength, int skipLength, bool allLengths, int maxNumTerms,
     NgramCountingEstimator.WeightingCriteria weighting)
     : base(
         new Reconciler(ngramLength, skipLength, allLengths, maxNumTerms, weighting),
         input)
 {
     Input = input;
 }
            /// <summary>
            /// Reads a transform info from a model load context, validating each value as it is read.
            /// Reads happen in the order given by the binary format comment in the body.
            /// </summary>
            /// <param name="ctx">Load context whose reader supplies the serialized values.</param>
            /// <param name="readWeighting">
            /// When true, the weighting criteria is read from the stream; when false, nothing is read
            /// for it and <c>Weighting</c> is not assigned by this constructor.
            /// </param>
            public TransformInfo(ModelLoadContext ctx, bool readWeighting)
            {
                Contracts.AssertValue(ctx);

                // *** Binary format ***
                // int: NgramLength
                // int: SkipLength
                // int: Weighting Criteria (if readWeighting == true)
                // bool[NgramLength]: NonEmptyLevels

                NgramLength = ctx.Reader.ReadInt32();
                // Ngram length must lie in (0, MaxSkipNgramLength].
                Contracts.CheckDecode(0 < NgramLength && NgramLength <= NgramBufferBuilder.MaxSkipNgramLength);
                SkipLength = ctx.Reader.ReadInt32();
                // Skip length must lie in [0, MaxSkipNgramLength].
                Contracts.CheckDecode(0 <= SkipLength && SkipLength <= NgramBufferBuilder.MaxSkipNgramLength);
                // Equivalent to NgramLength + SkipLength <= MaxSkipNgramLength, expressed as a subtraction.
                Contracts.CheckDecode(NgramLength <= NgramBufferBuilder.MaxSkipNgramLength - SkipLength);

                if (readWeighting)
                {
                    Weighting = (NgramCountingEstimator.WeightingCriteria)ctx.Reader.ReadInt32();
                }
                // Runs whether or not the weighting was read, so an unassigned Weighting is validated too.
                Contracts.CheckDecode(Enum.IsDefined(typeof(NgramCountingEstimator.WeightingCriteria), Weighting));
                // One flag per ngram length, as described in the binary format above.
                NonEmptyLevels = ctx.Reader.ReadBoolArray(NgramLength);
            }
            /// <summary>
            /// Describes how the transformer handles one input/output column pair, with a separate
            /// dictionary size limit per ngram length.
            /// </summary>
            /// <param name="input">Name of input column.</param>
            /// <param name="output">Name of output column.</param>
            /// <param name="ngramLength">Maximum ngram length; must be in (0, <c>NgramBufferBuilder.MaxSkipNgramLength</c>].</param>
            /// <param name="skipLength">
            /// Maximum number of tokens to skip when constructing an ngram; must be non-negative, and
            /// its sum with <paramref name="ngramLength"/> must not exceed <c>NgramBufferBuilder.MaxSkipNgramLength</c>.
            /// </param>
            /// <param name="allLengths">Whether to store all ngram lengths up to <paramref name="ngramLength"/>, or only <paramref name="ngramLength"/>.</param>
            /// <param name="weighting">The weighting criteria.</param>
            /// <param name="maxNumTerms">
            /// Dictionary size limits. May be null or empty, in which case the default limit is used.
            /// When <paramref name="allLengths"/> is false it may hold at most one positive value.
            /// When <paramref name="allLengths"/> is true it may hold up to <paramref name="ngramLength"/>
            /// non-negative values whose last entry is positive; that last entry is reused for any
            /// remaining ngram lengths.
            /// </param>
            internal ColumnInfo(string input, string output,
                                int ngramLength,
                                int skipLength,
                                bool allLengths,
                                NgramCountingEstimator.WeightingCriteria weighting,
                                int[] maxNumTerms)
            {
                Input = input;
                Output = output;
                NgramLength = ngramLength;
                Contracts.CheckUserArg(0 < NgramLength && NgramLength <= NgramBufferBuilder.MaxSkipNgramLength, nameof(ngramLength));
                SkipLength = skipLength;
                // Reject negative skip lengths up front: the model loading path requires
                // 0 <= SkipLength, so accepting one here would produce an unloadable model.
                Contracts.CheckUserArg(0 <= SkipLength, nameof(skipLength));
                // Compare by subtraction rather than NgramLength + SkipLength, which could overflow
                // for a very large skipLength and slip past the bound. With SkipLength >= 0 and
                // MaxSkipNgramLength >= NgramLength > 0 the subtraction itself cannot overflow.
                if (NgramLength > NgramBufferBuilder.MaxSkipNgramLength - SkipLength)
                {
                    throw Contracts.ExceptUserArg(nameof(skipLength),
                                                  $"The sum of skipLength and ngramLength must be less than or equal to {NgramBufferBuilder.MaxSkipNgramLength}");
                }
                AllLengths = allLengths;
                Weighting = weighting;

                var termCount = Utils.Size(maxNumTerms);
                int[] limits;

                if (!AllLengths)
                {
                    // Only the longest ngram length is kept, so at most one limit makes sense.
                    Contracts.CheckUserArg(termCount == 0 ||
                                           termCount == 1 && maxNumTerms[0] > 0, nameof(maxNumTerms));
                    // Every shorter length keeps a limit of zero; only the last slot is populated.
                    limits = new int[ngramLength];
                    limits[ngramLength - 1] = termCount == 0 ? NgramCountingEstimator.Defaults.MaxNumTerms : maxNumTerms[0];
                }
                else
                {
                    Contracts.CheckUserArg(termCount <= ngramLength, nameof(maxNumTerms));
                    Contracts.CheckUserArg(termCount == 0 || maxNumTerms.All(i => i >= 0) && maxNumTerms[maxNumTerms.Length - 1] > 0, nameof(maxNumTerms));
                    // Lengths without an explicit limit reuse the last supplied one (or the default).
                    var extend = termCount == 0 ? NgramCountingEstimator.Defaults.MaxNumTerms : maxNumTerms[maxNumTerms.Length - 1];
                    limits = Utils.BuildArray(ngramLength, i => i < termCount ? maxNumTerms[i] : extend);
                }
                Limits = ImmutableArray.Create(limits);
            }