示例#1
0
 /// <summary>
 /// Builds a bag of ngram counts from text that has already been tokenized, where an ngram is a
 /// sequence of consecutive words. A dictionary of ngrams is built, and the id of each ngram in
 /// that dictionary serves as its index in the bag.
 /// </summary>
 /// <remarks>
 /// <see cref="ToNgrams"/> differs from <see cref="WordBagEstimatorExtensions.ToBagofWords"/> in that
 /// <see cref="ToNgrams"/> takes tokenized text as input, whereas
 /// <see cref="WordBagEstimatorExtensions.ToBagofWords"/> tokenizes the text internally.
 /// </remarks>
 /// <typeparam name="TKey">The first type argument of the <c>Key</c> items held by <paramref name="input"/>.</typeparam>
 /// <param name="input">The column to apply to.</param>
 /// <param name="ngramLength">Ngram length.</param>
 /// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
 /// <param name="allLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
 /// <param name="maxNumTerms">Maximum number of ngrams to store in the dictionary.</param>
 /// <param name="weighting">Statistical measure used to evaluate how important a word is to a document in a corpus.</param>
 /// <returns>A new <c>OutPipelineColumn</c> constructed from <paramref name="input"/> and the supplied settings.</returns>
 public static Vector<float> ToNgrams<TKey>(
     this VarVector<Key<TKey, string>> input,
     int ngramLength = 1,
     int skipLength = 0,
     bool allLengths = true,
     int maxNumTerms = 10000000,
     NgramTransform.WeightingCriteria weighting = NgramTransform.WeightingCriteria.Tf)
 {
     // All arguments are forwarded unchanged; the column type itself carries the configuration.
     Vector<float> ngramCounts = new OutPipelineColumn(
         input, ngramLength, skipLength, allLengths, maxNumTerms, weighting);
     return ngramCounts;
 }
示例#2
0
 /// <summary>
 /// Builds a bag of ngram counts from a given text, where an ngram is a sequence of consecutive words.
 /// A dictionary of ngrams is built, and the id of each ngram in that dictionary serves as its
 /// index in the bag.
 /// </summary>
 /// <param name="input">The column to apply to.</param>
 /// <param name="ngramLength">Ngram length.</param>
 /// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
 /// <param name="allLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
 /// <param name="maxNumTerms">Maximum number of ngrams to store in the dictionary.</param>
 /// <param name="weighting">Statistical measure used to evaluate how important a word is to a document in a corpus.</param>
 /// <returns>A new <c>OutPipelineColumn</c> constructed from <paramref name="input"/> and the supplied settings.</returns>
 public static Vector<float> ToBagofWords(
     this Scalar<string> input,
     int ngramLength = 1,
     int skipLength = 0,
     bool allLengths = true,
     int maxNumTerms = 10000000,
     NgramTransform.WeightingCriteria weighting = NgramTransform.WeightingCriteria.Tf)
 {
     // All arguments are forwarded unchanged; the column type itself carries the configuration.
     Vector<float> wordBag = new OutPipelineColumn(
         input, ngramLength, skipLength, allLengths, maxNumTerms, weighting);
     return wordBag;
 }
示例#3
0
 /// <summary>
 /// Creates a reconciler that remembers the supplied ngram settings in its private fields.
 /// </summary>
 /// <param name="ngramLength">Ngram length.</param>
 /// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
 /// <param name="allLengths">Whether to include all ngram lengths up to <paramref name="ngramLength"/> or only <paramref name="ngramLength"/>.</param>
 /// <param name="maxNumTerms">Maximum number of ngrams to store in the dictionary.</param>
 /// <param name="weighting">Statistical measure used to evaluate how important a word is to a document in a corpus.</param>
 public Reconciler(
     int ngramLength,
     int skipLength,
     bool allLengths,
     int maxNumTerms,
     NgramTransform.WeightingCriteria weighting)
 {
     // One deconstructing assignment; fields are still written left to right, one per argument.
     (_ngramLength, _skipLength, _allLengths, _maxNumTerms, _weighting) =
         (ngramLength, skipLength, allLengths, maxNumTerms, weighting);
 }
示例#4
0
 /// <summary>
 /// Creates the output column. A new <see cref="Reconciler"/> built from the given settings is passed
 /// to the base constructor together with <paramref name="input"/>, and <paramref name="input"/> is
 /// also kept in <c>Input</c>.
 /// </summary>
 /// <param name="input">The column this output column is derived from.</param>
 /// <param name="ngramLength">Ngram length, forwarded to the reconciler.</param>
 /// <param name="skipLength">Skip length, forwarded to the reconciler.</param>
 /// <param name="allLengths">All-lengths flag, forwarded to the reconciler.</param>
 /// <param name="maxNumTerms">Maximum number of terms, forwarded to the reconciler.</param>
 /// <param name="weighting">Weighting criteria, forwarded to the reconciler.</param>
 public OutPipelineColumn(
     PipelineColumn input, int ngramLength, int skipLength,
     bool allLengths, int maxNumTerms, NgramTransform.WeightingCriteria weighting)
     : base(new Reconciler(ngramLength, skipLength, allLengths, maxNumTerms, weighting), input)
     => Input = input;