Пример #1
0
        /// <summary>
        /// Creates a shingle filter that reads its tokens from <paramref name="input"/>, with every setting supplied explicitly by the caller.
        /// </summary>
        /// <param name="input">stream from which to construct the matrix. Must not be null.</param>
        /// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
        /// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
        /// <param name="spacerCharacter">character to use between texts of the token parts in a shingle. null for none.</param>
        /// <param name="ignoringSinglePrefixOrSuffixShingle">if true, shingles that only contain permutations of the first or the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.</param>
        /// <param name="settingsCodec">codec used to read input token weight and matrix positioning.</param>
        /// <exception cref="ArgumentNullException">if <paramref name="input"/> is null.</exception>
        public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Char? spacerCharacter, bool ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec)
        {
            // Fail fast with a descriptive exception. Without this guard a null stream only
            // surfaces as a NullReferenceException further down, after the filter has already
            // been partially initialised.
            if (input == null)
            {
                throw new ArgumentNullException("input");
            }

            _input             = input;
            MinimumShingleSize = minimumShingleSize;
            MaximumShingleSize = maximumShingleSize;
            SpacerCharacter    = spacerCharacter;
            IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
            _settingsCodec = settingsCodec;

            // Attributes of this filter itself.
            // ReSharper disable DoNotCallOverridableMethodsInConstructor
            _termAtt    = AddAttribute <ITermAttribute>();
            _posIncrAtt = AddAttribute <IPositionIncrementAttribute>();
            _payloadAtt = AddAttribute <IPayloadAttribute>();
            _offsetAtt  = AddAttribute <IOffsetAttribute>();
            _typeAtt    = AddAttribute <ITypeAttribute>();
            _flagsAtt   = AddAttribute <IFlagsAttribute>();
            // ReSharper restore DoNotCallOverridableMethodsInConstructor

            // The matching attributes on the wrapped input stream.
            _inTermAtt    = input.AddAttribute <ITermAttribute>();
            _inPosIncrAtt = input.AddAttribute <IPositionIncrementAttribute>();
            _inPayloadAtt = input.AddAttribute <IPayloadAttribute>();
            _inOffsetAtt  = input.AddAttribute <IOffsetAttribute>();
            _inTypeAtt    = input.AddAttribute <ITypeAttribute>();
            _inFlagsAtt   = input.AddAttribute <IFlagsAttribute>();
        }
Пример #2
0
        /// <summary>
        /// Creates a shingle filter based on a user defined matrix.
        ///
        /// The filter will delete columns from the input matrix! You will not be able to reset the filter if you used this constructor.
        /// todo: don't touch the matrix! use a bool, set the input stream to null or something, and keep track of where in the matrix we are at.
        ///
        /// </summary>
        /// <param name="matrix">the input basis for creating shingles. Does not need to contain any information until ShingleMatrixFilter.IncrementToken() is called the first time.</param>
        /// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
        /// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
        /// <param name="spacerCharacter">character to use between texts of the token parts in a shingle. null for none.</param>
        /// <param name="ignoringSinglePrefixOrSuffixShingle">if true, shingles that only contain permutations of the first or the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.</param>
        /// <param name="settingsCodec">codec used to read input token weight and matrix positioning.</param>
        public ShingleMatrixFilter(Matrix.Matrix matrix, int minimumShingleSize, int maximumShingleSize, Char? spacerCharacter, bool ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec)
        {
            // spacerCharacter is nullable so that "no spacer" can actually be requested, as the
            // parameter documentation states and as the TokenStream-based constructor allows.
            // Existing callers passing a plain Char still compile via the implicit conversion.
            Matrix             = matrix;
            MinimumShingleSize = minimumShingleSize;
            MaximumShingleSize = maximumShingleSize;
            SpacerCharacter    = spacerCharacter;
            IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
            _settingsCodec = settingsCodec;

            // Attributes of this filter itself.
            // ReSharper disable DoNotCallOverridableMethodsInConstructor
            _termAtt    = AddAttribute <ITermAttribute>();
            _posIncrAtt = AddAttribute <IPositionIncrementAttribute>();
            _payloadAtt = AddAttribute <IPayloadAttribute>();
            _offsetAtt  = AddAttribute <IOffsetAttribute>();
            _typeAtt    = AddAttribute <ITypeAttribute>();
            _flagsAtt   = AddAttribute <IFlagsAttribute>();
            // ReSharper restore DoNotCallOverridableMethodsInConstructor

            // set the input to be an empty token stream, we already have the data.
            _input = new EmptyTokenStream();

            // The matching attributes on the (empty) input stream, so the input-side fields
            // are populated the same way as in the TokenStream-based constructor.
            _inTermAtt    = _input.AddAttribute <ITermAttribute>();
            _inPosIncrAtt = _input.AddAttribute <IPositionIncrementAttribute>();
            _inPayloadAtt = _input.AddAttribute <IPayloadAttribute>();
            _inOffsetAtt  = _input.AddAttribute <IOffsetAttribute>();
            _inTypeAtt    = _input.AddAttribute <ITypeAttribute>();
            _inFlagsAtt   = _input.AddAttribute <IFlagsAttribute>();
        }