Token[column][row][z-axis] { {{hello}, {greetings, and, salutations}}, {{world}, {earth}, {tellus}} };
/// <summary>
/// Creates a shingle filter based on a user defined matrix.
///
/// The filter /will/ delete columns from the input matrix! You will not be able to reset the filter if you used this constructor.
/// todo: don't touch the matrix! use a bool, set the input stream to null or something, and keep track of where in the matrix we are at.
///
/// The constructor stores the supplied matrix and settings, registers the term, position increment,
/// payload, offset, type and flags attributes on this filter, and then installs an
/// <c>EmptyTokenStream</c> as the input, registering the same six attribute kinds on it.
/// </summary>
/// <param name="matrix">the input base for creating shingles. Does not need to contain any information until ShingleMatrixFilter.IncrementToken() is called the first time.</param>
/// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
/// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
/// <param name="spacerCharacter">character to use between texts of the token parts in a shingle.
/// NOTE(review): earlier documentation said "null for none", but this parameter is a non-nullable Char,
/// so null cannot be passed through this overload - confirm how "no spacer" is meant to be expressed.</param>
/// <param name="ignoringSinglePrefixOrSuffixShingle">if true, shingles that only contain permutations of the first or the last column will not be produced as shingles.
/// Useful when adding boundary marker tokens such as '^' and '$'.</param>
/// <param name="settingsCodec">codec used to read input token weight and matrix positioning.</param>
public ShingleMatrixFilter(Matrix.Matrix matrix, int minimumShingleSize, int maximumShingleSize, Char spacerCharacter, bool ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec)
{
    // Keep the caller's matrix and shingle settings as given; nothing is validated or copied here.
    Matrix = matrix;
    MinimumShingleSize = minimumShingleSize;
    MaximumShingleSize = maximumShingleSize;
    SpacerCharacter = spacerCharacter;
    IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
    _settingsCodec = settingsCodec;

    // Output attributes of this filter. The ReSharper warning is suppressed on purpose
    // for these AddAttribute calls made from the constructor.
    // ReSharper disable DoNotCallOverridableMethodsInConstructor
    _termAtt = AddAttribute <ITermAttribute>();
    _posIncrAtt = AddAttribute <IPositionIncrementAttribute>();
    _payloadAtt = AddAttribute <IPayloadAttribute>();
    _offsetAtt = AddAttribute <IOffsetAttribute>();
    _typeAtt = AddAttribute <ITypeAttribute>();
    _flagsAtt = AddAttribute <IFlagsAttribute>();
    // ReSharper restore DoNotCallOverridableMethodsInConstructor

    // set the input to be an empty token stream, we already have the data.
    _input = new EmptyTokenStream();

    // Register the same attribute kinds on the (empty) input and keep references to them.
    _inTermAtt = _input.AddAttribute <ITermAttribute>();
    _inPosIncrAtt = _input.AddAttribute <IPositionIncrementAttribute>();
    _inPayloadAtt = _input.AddAttribute <IPayloadAttribute>();
    _inOffsetAtt = _input.AddAttribute <IOffsetAttribute>();
    _inTypeAtt = _input.AddAttribute <ITypeAttribute>();
    _inFlagsAtt = _input.AddAttribute <IFlagsAttribute>();
}
/// <summary>
/// Advances to the next shingle and copies it into this filter's attributes.
/// </summary>
/// <returns>
/// true when a shingle was copied into the attributes; false when
/// ProduceNextToken returned null.
/// </returns>
public override sealed bool IncrementToken()
{
    // No matrix yet: create one and read columns into it until it holds
    // MaximumShingleSize columns or ReadColumn reports there is nothing more.
    if (Matrix == null)
    {
        Matrix = new Matrix.Matrix();
        while (Matrix.Columns.Count < MaximumShingleSize)
        {
            if (!ReadColumn())
            {
                break;
            }
        }
    }

    // Keep asking for a token while the "request next" sentinel comes back.
    // This is done iteratively on purpose: recursing instead would, for a
    // large matrix, need a stack of enormous (multi gigabyte) size.
    Token shingle = ProduceNextToken(_reusableToken);
    while (shingle == _requestNextToken)
    {
        shingle = ProduceNextToken(_reusableToken);
    }

    if (shingle == null)
    {
        return false;
    }

    // Publish the produced shingle through the attribute instances.
    ClearAttributes();
    _termAtt.SetTermBuffer(shingle.TermBuffer(), 0, shingle.TermLength());
    _posIncrAtt.PositionIncrement = shingle.PositionIncrement;
    _flagsAtt.Flags = shingle.Flags;
    _offsetAtt.SetOffset(shingle.StartOffset, shingle.EndOffset);
    _typeAtt.Type = shingle.Type;
    _payloadAtt.Payload = shingle.Payload;
    return true;
}
/// <summary>
/// Advances to the next shingle and copies it into this filter's attributes,
/// using the Get/Set style accessors of the token and attribute types.
/// </summary>
/// <returns>
/// true when a shingle was copied into the attributes; false when
/// ProduceNextToken returned null.
/// </returns>
public override sealed bool IncrementToken()
{
    // No matrix yet: create one and read columns into it until it holds
    // MaximumShingleSize columns or ReadColumn reports there is nothing more.
    if (Matrix == null)
    {
        Matrix = new Matrix.Matrix();
        while (Matrix.Columns.Count < MaximumShingleSize)
        {
            if (!ReadColumn())
            {
                break;
            }
        }
    }

    // Keep asking for a token while the "request next" sentinel comes back.
    // This is done iteratively on purpose: recursing instead would, for a
    // large matrix, need a stack of enormous (multi gigabyte) size.
    Token shingle = ProduceNextToken(_reusableToken);
    while (shingle == _requestNextToken)
    {
        shingle = ProduceNextToken(_reusableToken);
    }

    if (shingle == null)
    {
        return false;
    }

    // Publish the produced shingle through the attribute instances.
    ClearAttributes();
    _termAtt.SetTermBuffer(shingle.TermBuffer(), 0, shingle.TermLength());
    _posIncrAtt.SetPositionIncrement(shingle.GetPositionIncrement());
    _flagsAtt.SetFlags(shingle.GetFlags());
    _offsetAtt.SetOffset(shingle.StartOffset(), shingle.EndOffset());
    _typeAtt.SetType(shingle.Type());
    _payloadAtt.SetPayload(shingle.GetPayload());
    return true;
}
/// <summary>
/// Builds a matrix by hand (including one column that holds both a single-token
/// row and a three-token row), runs it through a ShingleMatrixFilter configured
/// for shingles of 2 to 4 tokens joined by '_', and checks every emitted shingle
/// in order, followed by end of stream.
/// </summary>
public void TestMatrix()
{
    // Other tests set the default codec to null; assign it here so this test
    // does not depend on the order in which tests are run.
    ShingleMatrixFilter.DefaultSettingsCodec = new SimpleThreeDimensionalTokenSettingsCodec();

    var matrix = new Matrix.Matrix();

    // Each Column is constructed against the matrix; the instances themselves
    // are not needed afterwards.
    foreach (var term in new[] { "no", "surprise", "to", "see", "england", "manager" })
    {
        new Column(TokenFactory(term, 1), matrix);
    }

    // "sven göran eriksson" is a multi token synonym to "svennis":
    // both are rows of the same column.
    var synonymColumn = new Column(matrix);

    var singleTokenRow = new Row(synonymColumn);
    singleTokenRow.Tokens.AddLast(TokenFactory("svennis", 1));

    var multiTokenRow = new Row(synonymColumn);
    foreach (var term in new[] { "sven", "göran", "eriksson" })
    {
        multiTokenRow.Tokens.AddLast(TokenFactory(term, 1));
    }

    foreach (var term in new[] { "in", "the", "croud" })
    {
        new Column(TokenFactory(term, 1), matrix);
    }

    TokenStream ts = new ShingleMatrixFilter(matrix, 2, 4, '_', true, new SimpleThreeDimensionalTokenSettingsCodec());

    // The expected values below were originally produced by dumping the stream
    // (term, position increment, decoded payload weight, start and end offset)
    // as ready-made assertion lines.

    // Expected weights for shingles of two, three and four tokens
    // (numerically the square roots of 2, 3 and 4).
    const float twoTokens = 1.4142135f;
    const float threeTokens = 1.7320508f;
    const float fourTokens = 2.0f;

    var expectedShingles = new[]
    {
        new { Term = "no_surprise", Weight = twoTokens },
        new { Term = "no_surprise_to", Weight = threeTokens },
        new { Term = "no_surprise_to_see", Weight = fourTokens },
        new { Term = "surprise_to", Weight = twoTokens },
        new { Term = "surprise_to_see", Weight = threeTokens },
        new { Term = "surprise_to_see_england", Weight = fourTokens },
        new { Term = "to_see", Weight = twoTokens },
        new { Term = "to_see_england", Weight = threeTokens },
        new { Term = "to_see_england_manager", Weight = fourTokens },
        new { Term = "see_england", Weight = twoTokens },
        new { Term = "see_england_manager", Weight = threeTokens },
        new { Term = "see_england_manager_svennis", Weight = fourTokens },
        new { Term = "england_manager", Weight = twoTokens },
        new { Term = "england_manager_svennis", Weight = threeTokens },
        new { Term = "england_manager_svennis_in", Weight = fourTokens },
        new { Term = "manager_svennis", Weight = twoTokens },
        new { Term = "manager_svennis_in", Weight = threeTokens },
        new { Term = "manager_svennis_in_the", Weight = fourTokens },
        new { Term = "svennis_in", Weight = twoTokens },
        new { Term = "svennis_in_the", Weight = threeTokens },
        new { Term = "svennis_in_the_croud", Weight = fourTokens },
        new { Term = "in_the", Weight = twoTokens },
        new { Term = "in_the_croud", Weight = threeTokens },
        new { Term = "the_croud", Weight = twoTokens },
        new { Term = "see_england_manager_sven", Weight = fourTokens },
        new { Term = "england_manager_sven", Weight = threeTokens },
        new { Term = "england_manager_sven_göran", Weight = fourTokens },
        new { Term = "manager_sven", Weight = twoTokens },
        new { Term = "manager_sven_göran", Weight = threeTokens },
        new { Term = "manager_sven_göran_eriksson", Weight = fourTokens },
        new { Term = "sven_göran", Weight = twoTokens },
        new { Term = "sven_göran_eriksson", Weight = threeTokens },
        new { Term = "sven_göran_eriksson_in", Weight = fourTokens },
        new { Term = "göran_eriksson", Weight = twoTokens },
        new { Term = "göran_eriksson_in", Weight = threeTokens },
        new { Term = "göran_eriksson_in_the", Weight = fourTokens },
        new { Term = "eriksson_in", Weight = twoTokens },
        new { Term = "eriksson_in_the", Weight = threeTokens },
        new { Term = "eriksson_in_the_croud", Weight = fourTokens },
    };

    // Every shingle is expected with position increment 1 and offsets 0..0.
    foreach (var expected in expectedShingles)
    {
        AssertNext(ts, expected.Term, 1, expected.Weight, 0, 0);
    }

    // Nothing may follow the last expected shingle.
    Assert.IsFalse(ts.IncrementToken());
}