A column-focused matrix in three dimensions:
 Token[column][row][z-axis] { {{hello}, {greetings, and, salutations}}, {{world}, {earth}, {tellus}} }; 
TODO: consider row groups, to indicate that a shingle is only to contain permutations of texts from that same row group.
Пример #1
0
        /// <summary>
        /// Creates a shingle filter based on a user-defined matrix.
        ///
        /// The filter /will/ delete columns from the input matrix! You will not be able to reset the filter if you used this constructor.
        /// TODO: don't touch the matrix! Use a bool, set the input stream to null or something, and keep track of where in the matrix we are.
        ///
        /// </summary>
        /// <param name="matrix">the input matrix used as the basis for creating shingles. Does not need to contain any information until ShingleMatrixFilter.IncrementToken() is called the first time.</param>
        /// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
        /// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
        /// <param name="spacerCharacter">character to use between the texts of the token parts in a shingle. The original note said "null for none", but this overload declares a non-nullable Char, so null cannot be passed here. NOTE(review): confirm how "no spacer" is meant to be expressed.</param>
        /// <param name="ignoringSinglePrefixOrSuffixShingle">if true, shingles that only contain permutations of the first or the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.</param>
        /// <param name="settingsCodec">codec used to read input token weight and matrix positioning.</param>
        public ShingleMatrixFilter(Matrix.Matrix matrix, int minimumShingleSize, int maximumShingleSize, Char spacerCharacter, bool ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec)
        {
            // Store the caller-supplied configuration as given; nothing is validated here.
            Matrix             = matrix;
            MinimumShingleSize = minimumShingleSize;
            MaximumShingleSize = maximumShingleSize;
            SpacerCharacter    = spacerCharacter;
            IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
            _settingsCodec = settingsCodec;

            // Register the output attributes on this filter and keep a reference to each.
            // ReSharper disable DoNotCallOverridableMethodsInConstructor
            _termAtt    = AddAttribute <ITermAttribute>();
            _posIncrAtt = AddAttribute <IPositionIncrementAttribute>();
            _payloadAtt = AddAttribute <IPayloadAttribute>();
            _offsetAtt  = AddAttribute <IOffsetAttribute>();
            _typeAtt    = AddAttribute <ITypeAttribute>();
            _flagsAtt   = AddAttribute <IFlagsAttribute>();
            // ReSharper restore DoNotCallOverridableMethodsInConstructor

            // set the input to be an empty token stream, we already have the data.
            _input = new EmptyTokenStream();

            // Register the matching attribute set on the placeholder input and
            // keep the references in the _in* fields.
            _inTermAtt    = _input.AddAttribute <ITermAttribute>();
            _inPosIncrAtt = _input.AddAttribute <IPositionIncrementAttribute>();
            _inPayloadAtt = _input.AddAttribute <IPayloadAttribute>();
            _inOffsetAtt  = _input.AddAttribute <IOffsetAttribute>();
            _inTypeAtt    = _input.AddAttribute <ITypeAttribute>();
            _inFlagsAtt   = _input.AddAttribute <IFlagsAttribute>();
        }
Пример #2
0
        /// <summary>
        /// Moves this filter to its next shingle and publishes it through the term,
        /// position increment, flags, offset, type and payload attributes.
        /// </summary>
        /// <returns>
        /// <c>true</c> when a shingle was copied into the attributes;
        /// <c>false</c> when <c>ProduceNextToken</c> returned null.
        /// </returns>
        public override sealed bool IncrementToken()
        {
            if (Matrix == null)
            {
                Matrix = new Matrix.Matrix();

                // Pre-fill the matrix: keep reading columns until it holds
                // MaximumShingleSize of them or ReadColumn() returns false.
                while (Matrix.Columns.Count < MaximumShingleSize)
                {
                    if (!ReadColumn())
                    {
                        break;
                    }
                }
            }

            // Iterate rather than recurse: with a large matrix, recursive calls
            // to the producer would require a multi gigabyte sized stack.
            // The producer is asked again for as long as it hands back the
            // _requestNextToken marker.
            Token shingle = ProduceNextToken(_reusableToken);
            while (shingle == _requestNextToken)
            {
                shingle = ProduceNextToken(_reusableToken);
            }

            if (shingle == null)
            {
                return false;
            }

            ClearAttributes();

            // Copy the produced token into this filter's attributes.
            _termAtt.SetTermBuffer(shingle.TermBuffer(), 0, shingle.TermLength());
            _posIncrAtt.PositionIncrement = shingle.PositionIncrement;
            _flagsAtt.Flags = shingle.Flags;
            _offsetAtt.SetOffset(shingle.StartOffset, shingle.EndOffset);
            _typeAtt.Type = shingle.Type;
            _payloadAtt.Payload = shingle.Payload;

            return true;
        }
        /// <summary>
        /// Moves this filter to its next shingle and publishes it through the term,
        /// position increment, flags, offset, type and payload attributes.
        /// </summary>
        /// <returns>
        /// <c>true</c> when a shingle was copied into the attributes;
        /// <c>false</c> when <c>ProduceNextToken</c> returned null.
        /// </returns>
        public override sealed bool IncrementToken()
        {
            if (Matrix == null)
            {
                Matrix = new Matrix.Matrix();

                // Pre-fill the matrix: keep reading columns until it holds
                // MaximumShingleSize of them or ReadColumn() returns false.
                while (Matrix.Columns.Count < MaximumShingleSize)
                {
                    if (!ReadColumn())
                    {
                        break;
                    }
                }
            }

            // Iterate rather than recurse: with a large matrix, recursive calls
            // to the producer would require a multi gigabyte sized stack.
            // The producer is asked again for as long as it hands back the
            // _requestNextToken marker.
            Token shingle = ProduceNextToken(_reusableToken);
            while (shingle == _requestNextToken)
            {
                shingle = ProduceNextToken(_reusableToken);
            }

            if (shingle == null)
            {
                return false;
            }

            ClearAttributes();

            // Copy the produced token into this filter's attributes.
            _termAtt.SetTermBuffer(shingle.TermBuffer(), 0, shingle.TermLength());
            _posIncrAtt.SetPositionIncrement(shingle.GetPositionIncrement());
            _flagsAtt.SetFlags(shingle.GetFlags());
            _offsetAtt.SetOffset(shingle.StartOffset(), shingle.EndOffset());
            _typeAtt.SetType(shingle.Type());
            _payloadAtt.SetPayload(shingle.GetPayload());

            return true;
        }
        /// <summary>
        /// Builds a matrix by hand, including one column that holds a single-token row
        /// and a three-token row, runs it through a ShingleMatrixFilter configured for
        /// shingles of 2 to 4 tokens, and checks every emitted shingle in order.
        /// </summary>
        public void TestMatrix()
        {
            // some other tests set this to null.
            // set it here in case tests are run out of the usual order.
            ShingleMatrixFilter.DefaultSettingsCodec = new SimpleThreeDimensionalTokenSettingsCodec();

            const char spacer = '_';
            const int minimumShingleSize = 2;
            const int maximumShingleSize = 4;

            var matrix = new Matrix.Matrix();

            // One single-token column per leading word.
            foreach (var leadingWord in new[] { "no", "surprise", "to", "see", "england", "manager" })
            {
                new Column(TokenFactory(leadingWord, 1), matrix);
            }

            // sven göran eriksson is a multi token synonym to svennis:
            // both go into the same column, each on its own row.
            var synonymColumn = new Column(matrix);

            var singleTokenRow = new Row(synonymColumn);
            singleTokenRow.Tokens.AddLast(TokenFactory("svennis", 1));

            var multiTokenRow = new Row(synonymColumn);
            foreach (var namePart in new[] { "sven", "göran", "eriksson" })
            {
                multiTokenRow.Tokens.AddLast(TokenFactory(namePart, 1));
            }

            // One single-token column per trailing word.
            foreach (var trailingWord in new[] { "in", "the", "croud" })
            {
                new Column(TokenFactory(trailingWord, 1), matrix);
            }

            TokenStream ts = new ShingleMatrixFilter(matrix, minimumShingleSize, maximumShingleSize, spacer, true,
                                                     new SimpleThreeDimensionalTokenSettingsCodec());

            // Expected weight for a shingle of 2, 3 and 4 tokens respectively.
            var expectedWeightBySize = new[] { 1.4142135f, 1.7320508f, 2.0f };

            // Expected shingles, in the exact order the filter must emit them.
            var expectedShingles = new[]
            {
                "no_surprise",
                "no_surprise_to",
                "no_surprise_to_see",
                "surprise_to",
                "surprise_to_see",
                "surprise_to_see_england",
                "to_see",
                "to_see_england",
                "to_see_england_manager",
                "see_england",
                "see_england_manager",
                "see_england_manager_svennis",
                "england_manager",
                "england_manager_svennis",
                "england_manager_svennis_in",
                "manager_svennis",
                "manager_svennis_in",
                "manager_svennis_in_the",
                "svennis_in",
                "svennis_in_the",
                "svennis_in_the_croud",
                "in_the",
                "in_the_croud",
                "the_croud",
                "see_england_manager_sven",
                "england_manager_sven",
                "england_manager_sven_göran",
                "manager_sven",
                "manager_sven_göran",
                "manager_sven_göran_eriksson",
                "sven_göran",
                "sven_göran_eriksson",
                "sven_göran_eriksson_in",
                "göran_eriksson",
                "göran_eriksson_in",
                "göran_eriksson_in_the",
                "eriksson_in",
                "eriksson_in_the",
                "eriksson_in_the_croud"
            };

            foreach (var expectedShingle in expectedShingles)
            {
                // The number of spacer-separated parts selects the expected weight.
                var tokenCount = expectedShingle.Split(spacer).Length;
                var expectedWeight = expectedWeightBySize[tokenCount - minimumShingleSize];

                // Every shingle is expected with position increment 1 and offsets 0..0.
                AssertNext(ts, expectedShingle, 1, expectedWeight, 0, 0);
            }

            Assert.IsFalse(ts.IncrementToken());
        }