/// <summary>
/// Builds a bag of ngram counts from text that has already been tokenized, where an ngram is a
/// sequence of consecutive words. A dictionary of ngrams is constructed, and each ngram's id in
/// that dictionary serves as its index in the bag.
///
/// Unlike <see cref="WordBagEstimatorExtensions.ToBagofWords"/>, which tokenizes the text internally,
/// <see cref="ToNgrams"/> expects its input to be tokenized text.
/// </summary>
/// <typeparam name="TKey">The key type argument of the input column's key values.</typeparam>
/// <param name="input">The column to apply to.</param>
/// <param name="ngramLength">Ngram length.</param>
/// <param name="skipLength">Maximum number of tokens that may be skipped while constructing an ngram.</param>
/// <param name="allLengths">
/// <c>true</c> to include every ngram length up to <paramref name="ngramLength"/>;
/// <c>false</c> to include only <paramref name="ngramLength"/>.
/// </param>
/// <param name="maxNumTerms">Maximum number of ngrams to store in the dictionary.</param>
/// <param name="weighting">Statistical measure used to evaluate how important a word is to a document in a corpus.</param>
/// <returns>The output vector column produced for <paramref name="input"/>.</returns>
public static Vector<float> ToNgrams<TKey>(
    this VarVector<Key<TKey, string>> input,
    int ngramLength = 1,
    int skipLength = 0,
    bool allLengths = true,
    int maxNumTerms = 10000000,
    NgramTransform.WeightingCriteria weighting = NgramTransform.WeightingCriteria.Tf)
{
    // All settings are forwarded unchanged; the output column carries them to its reconciler.
    return new OutPipelineColumn(
        input,
        ngramLength,
        skipLength,
        allLengths,
        maxNumTerms,
        weighting);
}
/// <summary>
/// Builds a bag of ngram counts from a given text, where an ngram is a sequence of consecutive words.
/// A dictionary of ngrams is constructed, and each ngram's id in that dictionary serves as its
/// index in the bag.
/// </summary>
/// <param name="input">The column to apply to.</param>
/// <param name="ngramLength">Ngram length.</param>
/// <param name="skipLength">Maximum number of tokens that may be skipped while constructing an ngram.</param>
/// <param name="allLengths">
/// <c>true</c> to include every ngram length up to <paramref name="ngramLength"/>;
/// <c>false</c> to include only <paramref name="ngramLength"/>.
/// </param>
/// <param name="maxNumTerms">Maximum number of ngrams to store in the dictionary.</param>
/// <param name="weighting">Statistical measure used to evaluate how important a word is to a document in a corpus.</param>
/// <returns>The output vector column produced for <paramref name="input"/>.</returns>
public static Vector<float> ToBagofWords(
    this Scalar<string> input,
    int ngramLength = 1,
    int skipLength = 0,
    bool allLengths = true,
    int maxNumTerms = 10000000,
    NgramTransform.WeightingCriteria weighting = NgramTransform.WeightingCriteria.Tf)
{
    // All settings are forwarded unchanged; the output column carries them to its reconciler.
    return new OutPipelineColumn(
        input,
        ngramLength,
        skipLength,
        allLengths,
        maxNumTerms,
        weighting);
}
/// <summary>
/// Creates a reconciler that remembers the ngram settings supplied by the caller.
/// Each argument is stored as-is in the corresponding private field; no validation is done here.
/// </summary>
/// <param name="ngramLength">Ngram length.</param>
/// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
/// <param name="allLengths">Whether all ngram lengths up to <paramref name="ngramLength"/> are included.</param>
/// <param name="maxNumTerms">Maximum number of ngrams to store in the dictionary.</param>
/// <param name="weighting">The weighting criteria to apply.</param>
public Reconciler(
    int ngramLength,
    int skipLength,
    bool allLengths,
    int maxNumTerms,
    NgramTransform.WeightingCriteria weighting)
{
    _ngramLength = ngramLength;
    _skipLength = skipLength;
    _allLengths = allLengths;
    _maxNumTerms = maxNumTerms;
    _weighting = weighting;
}
/// <summary>
/// Creates the output column for <paramref name="input"/>. A new <see cref="Reconciler"/> configured
/// with the given ngram settings is handed to the base constructor together with the input column,
/// and the input column is also kept in <see cref="Input"/>.
/// </summary>
/// <param name="input">The column this output column is derived from.</param>
/// <param name="ngramLength">Ngram length.</param>
/// <param name="skipLength">Maximum number of tokens to skip when constructing an ngram.</param>
/// <param name="allLengths">Whether all ngram lengths up to <paramref name="ngramLength"/> are included.</param>
/// <param name="maxNumTerms">Maximum number of ngrams to store in the dictionary.</param>
/// <param name="weighting">The weighting criteria to apply.</param>
public OutPipelineColumn(
    PipelineColumn input,
    int ngramLength,
    int skipLength,
    bool allLengths,
    int maxNumTerms,
    NgramTransform.WeightingCriteria weighting)
    : base(
        new Reconciler(ngramLength, skipLength, allLengths, maxNumTerms, weighting),
        input)
{
    Input = input;
}