Пример #1
0
 /// <summary>
 /// Exercises the Token constructor overloads used below (default, offsets,
 /// offsets + flags, offsets + type) and checks the term text, type, flags
 /// and ToString() output after the term buffer is filled with "hello".
 /// </summary>
 public virtual void TestCtor()
 {
     Token t = new Token();
     char[] content = "hello".ToCharArray();
     t.SetTermBuffer(content, 0, content.Length);
     // This assertion is about array identity: the token is expected to hold
     // its own array rather than the caller's. AreNotSame states that directly,
     // whereas AreNotEqual may compare the arrays' contents instead.
     Assert.AreNotSame(content, t.TermBuffer());
     Assert.AreEqual("hello", t.Term);
     Assert.AreEqual("word", t.Type);
     Assert.AreEqual(0, t.Flags);

     // Start/end offsets only.
     t = new Token(6, 22);
     t.SetTermBuffer(content, 0, content.Length);
     Assert.AreEqual("hello", t.Term);
     Assert.AreEqual("(hello,6,22)", t.ToString());
     Assert.AreEqual("word", t.Type);
     Assert.AreEqual(0, t.Flags);

     // Offsets plus flags.
     t = new Token(6, 22, 7);
     t.SetTermBuffer(content, 0, content.Length);
     Assert.AreEqual("hello", t.Term);
     Assert.AreEqual("(hello,6,22)", t.ToString());
     Assert.AreEqual(7, t.Flags);

     // Offsets plus an explicit type, which shows up in ToString().
     t = new Token(6, 22, "junk");
     t.SetTermBuffer(content, 0, content.Length);
     Assert.AreEqual("hello", t.Term);
     Assert.AreEqual("(hello,6,22,type=junk)", t.ToString());
     Assert.AreEqual(0, t.Flags);
 }
Пример #2
0
        /// <summary>
        /// Exercises the Token constructor overloads used below (default, offsets,
        /// offsets + flags, offsets + type) and checks the term text, type, flags
        /// and ToString() output after the term buffer is filled with "hello".
        /// </summary>
        public virtual void TestCtor()
        {
            Token t = new Token();

            char[] content = "hello".ToCharArray();
            t.SetTermBuffer(content, 0, content.Length);
            // This assertion is about array identity: the token is expected to hold
            // its own array rather than the caller's. AreNotSame states that directly,
            // whereas AreNotEqual may compare the arrays' contents instead.
            Assert.AreNotSame(content, t.TermBuffer());
            Assert.AreEqual("hello", t.Term);
            Assert.AreEqual("word", t.Type);
            Assert.AreEqual(0, t.Flags);

            // Start/end offsets only.
            t = new Token(6, 22);
            t.SetTermBuffer(content, 0, content.Length);
            Assert.AreEqual("hello", t.Term);
            Assert.AreEqual("(hello,6,22)", t.ToString());
            Assert.AreEqual("word", t.Type);
            Assert.AreEqual(0, t.Flags);

            // Offsets plus flags.
            t = new Token(6, 22, 7);
            t.SetTermBuffer(content, 0, content.Length);
            Assert.AreEqual("hello", t.Term);
            Assert.AreEqual("(hello,6,22)", t.ToString());
            Assert.AreEqual(7, t.Flags);

            // Offsets plus an explicit type, which shows up in ToString().
            t = new Token(6, 22, "junk");
            t.SetTermBuffer(content, 0, content.Length);
            Assert.AreEqual("hello", t.Term);
            Assert.AreEqual("(hello,6,22,type=junk)", t.ToString());
            Assert.AreEqual(0, t.Flags);
        }
Пример #3
0
        /// <summary>
        /// Checks that the string and char[] overloads of SetTermBuffer can be mixed
        /// on one token, and that edits to the array returned by TermBuffer() are
        /// reflected by Term.
        /// </summary>
        public virtual void TestMixedStringArray()
        {
            Token t = new Token("hello", 0, 5);

            // Assert.AreEqual takes (expected, actual); keeping that order makes
            // failure messages report the right value as "expected".
            Assert.AreEqual(5, t.TermLength());
            Assert.AreEqual("hello", t.Term);
            t.SetTermBuffer("hello2");
            Assert.AreEqual(6, t.TermLength());
            Assert.AreEqual("hello2", t.Term);
            t.SetTermBuffer("hello3".ToCharArray(), 0, 6);
            Assert.AreEqual("hello3", t.Term);

            // Change a character through the returned buffer and make sure
            // Term reflects the change.
            char[] buffer = t.TermBuffer();
            buffer[1] = 'o';
            Assert.AreEqual("hollo3", t.Term);
        }
Пример #4
0
		/// <summary>
		/// Checks Token.ToString() after the term is set from a char array and
		/// again after it is replaced through SetTermText; the offsets (0,5)
		/// given to the constructor are expected to stay unchanged in the output.
		/// </summary>
		public virtual void TestToString()
		{
			Token token = new Token("", 0, 5);
			char[] letters = "aloha".ToCharArray();

			token.SetTermBuffer(letters, 0, 5);
			string afterBuffer = token.ToString();
			Assert.AreEqual("(aloha,0,5)", afterBuffer);

			token.SetTermText("hi there");
			string afterText = token.ToString();
			Assert.AreEqual("(hi there,0,5)", afterText);
		}
Пример #5
0
        /// <summary>
        /// Checks Token.ToString() after the term is set from a char array and
        /// again after it is replaced through SetTermText; the offsets (0,5)
        /// given to the constructor are expected to stay unchanged in the output.
        /// </summary>
        public virtual void TestToString()
        {
            var token = new Token("", 0, 5);
            char[] aloha = "aloha".ToCharArray();

            // Term supplied as characters.
            token.SetTermBuffer(aloha, 0, 5);
            Assert.AreEqual("(aloha,0,5)", token.ToString());

            // Term replaced as text.
            token.SetTermText("hi there");
            Assert.AreEqual("(hi there,0,5)", token.ToString());
        }
Пример #6
0
 /// <summary>
 /// Requests every buffer size from 0 to 1999 via ResizeTermBuffer and checks
 /// that the buffer is at least that large and the term text stays "hello".
 /// </summary>
 public virtual void TestResize()
 {
     char[] chars = "hello".ToCharArray();
     Token token = new Token();
     token.SetTermBuffer(chars, 0, chars.Length);

     int requestedSize = 0;
     while (requestedSize < 2000)
     {
         token.ResizeTermBuffer(requestedSize);
         Assert.IsTrue(requestedSize <= token.TermBuffer().Length);
         Assert.AreEqual("hello", token.Term);
         requestedSize++;
     }
 }
Пример #7
0
 /// <summary>
 /// Pulls the next token from the wrapped input and runs the stemmer over its
 /// term characters. When Stem returns true, the term is replaced with the
 /// stemmer's result buffer.
 /// </summary>
 /// <param name="result">Token instance handed to the input stream for reuse.</param>
 /// <returns>The token returned by the input, or null when the input returns null.</returns>
 public override Token Next(Token result)
 {
     result = input.Next(result);
     if (result == null)
     {
         return null;
     }

     if (stemmer.Stem(result.TermBuffer(), 0, result.termLength))
     {
         char[] stemmed = stemmer.GetResultBuffer();
         result.SetTermBuffer(stemmed, 0, stemmer.GetResultLength());
     }
     return result;
 }
Пример #8
0
        /// <summary>
        /// Requests every buffer size from 0 to 1999 via ResizeTermBuffer and checks
        /// that the buffer is at least that large and the term text stays "hello".
        /// </summary>
        public virtual void TestResize()
        {
            var token = new Token();
            char[] hello = "hello".ToCharArray();
            token.SetTermBuffer(hello, 0, hello.Length);

            for (int size = 0; size < 2000; size++)
            {
                token.ResizeTermBuffer(size);

                int capacity = token.TermBuffer().Length;
                Assert.IsTrue(size <= capacity);
                Assert.AreEqual("hello", token.Term);
            }
        }
Пример #9
0
 /// <summary>
 /// Pulls the next token from the wrapped input and runs the stemmer over its
 /// term characters. When Stem returns true, the term is replaced with the
 /// stemmer's result buffer.
 /// </summary>
 /// <param name="result">Token instance handed to the input stream for reuse.</param>
 /// <returns>The token returned by the input, or null when the input returns null.</returns>
 public override Token Next(Token result)
 {
     result = input.Next(result);

     // End of stream: nothing to stem.
     if (result == null)
     {
         return null;
     }

     if (stemmer.Stem(result.TermBuffer(), 0, result.termLength))
     {
         result.SetTermBuffer(stemmer.GetResultBuffer(), 0, stemmer.GetResultLength());
     }

     return result;
 }
Пример #10
0
        /// <summary>
        /// Pulls the next token from the wrapped input and runs the stemmer over its
        /// term characters. When Stem returns true, the term is replaced with the
        /// stemmer's result buffer.
        /// </summary>
        /// <param name="reusableToken">Token instance handed to the input stream for reuse;
        /// checked with a debug assertion to be non-null.</param>
        /// <returns>The token returned by the input, or null when the input returns null.</returns>
        public override Token Next(/* in */ Token reusableToken)
        {
            System.Diagnostics.Debug.Assert(reusableToken != null);

            Token current = input.Next(reusableToken);

            // The short-circuit keeps the stemmer away from a null token.
            if (current != null && stemmer.Stem(current.TermBuffer(), 0, current.TermLength()))
            {
                current.SetTermBuffer(stemmer.GetResultBuffer(), 0, stemmer.GetResultLength());
            }

            return current;
        }
Пример #11
0
        /// <summary>
        /// Emits one lower-cased, single-character token per call, taken from
        /// <c>ioBuffer</c>. The buffer is filled from <c>input</c> while
        /// <c>start</c> is still 0; afterwards <c>start</c> walks through it.
        /// </summary>
        /// <param name="token">Token to clear and fill with the next character.</param>
        /// <returns>The filled token, or null when the read returns no characters or
        /// all characters read have been consumed.</returns>
        public override Token Next(Token token)
        {
            token.Clear();
            if (start == 0)
            {
                // Nothing consumed yet: (re)fill the buffer from the reader.
                length = input.Read((System.Char[])ioBuffer, 0, ioBuffer.Length);
                if (length <= 0)
                {
                    return null;
                }
            }

            if (start == length)
            {
                return null;
            }

            token.SetTermBuffer(ioBuffer, start, 1);
            start++;

            // Lower-case with the invariant culture so the produced terms do not
            // depend on the thread's current culture (e.g. the Turkish dotless i).
            token.termBuffer[0] = System.Char.ToLowerInvariant(token.termBuffer[0]);
            return token;
        }
Пример #12
0
		/// <summary>
		/// Checks that SetTermText and the char[] overload of SetTermBuffer can be
		/// mixed on one token, and that edits to the array returned by TermBuffer()
		/// are reflected by TermText().
		/// </summary>
		public virtual void TestMixedStringArray()
		{
			Token t = new Token("hello", 0, 5);
			// Assert.AreEqual takes (expected, actual); keeping that order makes
			// failure messages report the right value as "expected".
			Assert.AreEqual("hello", t.TermText());
			Assert.AreEqual(5, t.TermLength());
			Assert.AreEqual("hello", new System.String(t.TermBuffer(), 0, 5));
			t.SetTermText("hello2");
			Assert.AreEqual(6, t.TermLength());
			Assert.AreEqual("hello2", new System.String(t.TermBuffer(), 0, 6));
			t.SetTermBuffer("hello3".ToCharArray(), 0, 6);
			Assert.AreEqual("hello3", t.TermText());

			// Make sure if we get the buffer and change a character
			// that TermText() reflects the change
			char[] buffer = t.TermBuffer();
			buffer[1] = 'o';
			Assert.AreEqual("hollo3", t.TermText());
		}
Пример #13
0
        /// <summary>
        /// Checks Token.Equals: two tokens filled with "hello" from separate arrays
        /// compare equal, and neither equals a token filled with "hello2".
        /// </summary>
        public virtual void TestTermBufferEquals()
        {
            var first = new Token();
            char[] firstChars = "hello".ToCharArray();
            first.SetTermBuffer(firstChars, 0, 5);

            var sameText = new Token();
            char[] sameTextChars = "hello".ToCharArray();
            sameText.SetTermBuffer(sameTextChars, 0, 5);

            var otherText = new Token();
            char[] otherTextChars = "hello2".ToCharArray();
            otherText.SetTermBuffer(otherTextChars, 0, 6);

            Assert.IsTrue(first.Equals(sameText));
            Assert.IsFalse(first.Equals(otherText));
            Assert.IsFalse(otherText.Equals(sameText));
        }
Пример #14
0
        /// <summary>
        /// Checks that a cloned token has the same term but its own term buffer,
        /// and that a payload is cloned to an equal but distinct instance.
        /// </summary>
        public virtual void TestClone()
        {
            char[] hello = "hello".ToCharArray();
            var original = new Token(0, 5);
            original.SetTermBuffer(hello, 0, 5);
            char[] originalBuffer = original.TermBuffer();

            // Term text matches; the backing array must not be shared.
            var clone = (Token)TestSimpleAttributeImpls.AssertCloneIsEqual(original);
            Assert.AreEqual(original.Term, clone.Term);
            Assert.AreNotSame(originalBuffer, clone.TermBuffer());

            // Same check for the payload: equal value, different instance.
            var payload = new Payload(new byte[] { 1, 2, 3, 4 });
            original.Payload = payload;
            clone = (Token)TestSimpleAttributeImpls.AssertCloneIsEqual(original);
            Assert.AreEqual(payload, clone.Payload);
            Assert.AreNotSame(payload, clone.Payload);
        }
Пример #15
0
        /// <summary>
        /// Checks that SetTermText and the char[] overload of SetTermBuffer can be
        /// mixed on one token, and that edits to the array returned by TermBuffer()
        /// are reflected by TermText().
        /// </summary>
        public virtual void TestMixedStringArray()
        {
            Token t = new Token("hello", 0, 5);

            // Assert.AreEqual takes (expected, actual); keeping that order makes
            // failure messages report the right value as "expected".
            Assert.AreEqual("hello", t.TermText());
            Assert.AreEqual(5, t.TermLength());
            Assert.AreEqual("hello", t.Term());
            t.SetTermText("hello2");
            Assert.AreEqual(6, t.TermLength());
            Assert.AreEqual("hello2", t.Term());
            t.SetTermBuffer("hello3".ToCharArray(), 0, 6);
            Assert.AreEqual("hello3", t.TermText());

            // Make sure if we get the buffer and change a character
            // that TermText() reflects the change
            char[] buffer = t.TermBuffer();
            buffer[1] = 'o';
            Assert.AreEqual("hollo3", t.TermText());
        }
Пример #16
0
 /// <summary>
 /// Creates a copy of this token. The copy starts as a memberwise clone; the
 /// term buffer is then re-set from this token's characters and the payload
 /// is replaced by its own clone, so neither reference stays shared.
 /// </summary>
 /// <returns>The cloned Token, typed as System.Object.</returns>
 /// <exception cref="System.SystemException">Wraps any exception raised while cloning.</exception>
 public virtual System.Object Clone()
 {
     try
     {
         Token copy = (Token)MemberwiseClone();

         if (termBuffer != null)
         {
             // Drop the shared array reference before copying the characters in.
             copy.termBuffer = null;
             copy.SetTermBuffer(termBuffer, 0, termLength);
         }

         if (payload != null)
         {
             Payload payloadCopy = (Payload)payload.Clone();
             copy.SetPayload(payloadCopy);
         }

         return copy;
     }
     catch (System.Exception cause)
     {
         // shouldn't happen
         throw new System.SystemException("", cause);
     }
 }
Пример #17
0
        /// <summary>
        /// Emits one lower-cased, single-character token per call, taken from
        /// <c>ioBuffer</c>. The buffer is filled from <c>input</c> while
        /// <c>start</c> is still 0; afterwards <c>start</c> walks through it.
        /// </summary>
        /// <param name="token">Token to clear and fill with the next character.</param>
        /// <returns>The filled token, or null when the read returns no characters or
        /// all characters read have been consumed.</returns>
        public override Token Next(Token token)
        {
            token.Clear();
            if (start == 0)
            {
                // Nothing consumed yet: (re)fill the buffer from the reader.
                length = input.Read((System.Char[])ioBuffer, 0, ioBuffer.Length);
                if (length <= 0)
                {
                    return(null);
                }
            }

            if (start == length)
            {
                return(null);
            }
            token.SetTermBuffer(ioBuffer, start, 1);

            start++;
            // Lower-case with the invariant culture so the produced terms do not
            // depend on the thread's current culture (e.g. the Turkish dotless i).
            token.termBuffer[0] = System.Char.ToLowerInvariant(token.termBuffer[0]);
            return(token);
        }
        /// <summary>
        /// Pulls the next token from the wrapped input and, if any term character
        /// lies in the range U+00C0..U+0178, calls RemoveAccents and replaces the
        /// term with the first <c>outputPos</c> characters of <c>output</c>.
        /// </summary>
        /// <param name="result">Token instance handed to the input stream for reuse.</param>
        /// <returns>The token returned by the input, or null when the input returns null.</returns>
        public override Token Next(Token result)
        {
            result = input.Next(result);
            if (result == null)
            {
                return null;
            }

            char[] chars = result.TermBuffer();
            int count = result.TermLength();

            // Scan for the first character that needs rewriting; if there is
            // none, the token is returned as-is.
            int index = 0;
            while (index < count)
            {
                char current = chars[index];
                if (current >= '\u00c0' && current <= '\u0178')
                {
                    RemoveAccents(chars, count);
                    result.SetTermBuffer(output, 0, outputPos);
                    break;
                }
                index++;
            }

            return result;
        }
 /// <summary>
 /// Builds a Token with the given offsets, term text and position increment,
 /// then stores <paramref name="weight"/> on it through
 /// ShingleMatrixFilter.DefaultSettingsCodec.
 /// </summary>
 /// <param name="text">Term text for the token.</param>
 /// <param name="posIncr">Position increment to assign.</param>
 /// <param name="weight">Weight passed to the codec's SetWeight.</param>
 /// <param name="startOffset">First constructor argument of the token.</param>
 /// <param name="endOffset">Second constructor argument of the token.</param>
 /// <returns>The newly created token.</returns>
 private Token TokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset)
 {
     var created = new Token(startOffset, endOffset);

     created.SetTermBuffer(text);
     created.PositionIncrement = posIncr;

     ShingleMatrixFilter.DefaultSettingsCodec.SetWeight(created, weight);

     return created;
 }
Пример #20
0
        /// <summary>
        /// Checks term-buffer growth: a term doubled 20 times (through the char[]
        /// overload and both string overloads of SetTermBuffer) and a term grown one
        /// character at a time up to 20000, asserting the final term length and the
        /// final buffer capacity in each case.
        /// </summary>
        public virtual void TestGrow()
        {
            // Doubling growth, term supplied as a char array.
            var token = new Token();
            var text = new System.Text.StringBuilder("ab");
            for (int pass = 0; pass < 20; pass++)
            {
                string current = text.ToString();
                char[] chars = current.ToCharArray();
                token.SetTermBuffer(chars, 0, chars.Length);
                Assert.AreEqual(text.Length, token.TermLength());
                Assert.AreEqual(current, token.Term);
                text.Append(current);
            }
            Assert.AreEqual(1048576, token.TermLength());
            Assert.AreEqual(1048576, token.TermBuffer().Length);

            // Doubling growth, term supplied as a string with offset and length.
            token = new Token();
            text = new System.Text.StringBuilder("ab");
            for (int pass = 0; pass < 20; pass++)
            {
                string current = text.ToString();
                token.SetTermBuffer(current, 0, current.Length);
                Assert.AreEqual(current.Length, token.TermLength());
                Assert.AreEqual(current, token.Term);
                text.Append(current);
            }
            Assert.AreEqual(1048576, token.TermLength());
            Assert.AreEqual(1048576, token.TermBuffer().Length);

            // Doubling growth, term supplied as a whole string.
            token = new Token();
            text = new System.Text.StringBuilder("ab");
            for (int pass = 0; pass < 20; pass++)
            {
                string current = text.ToString();
                token.SetTermBuffer(current);
                Assert.AreEqual(current.Length, token.TermLength());
                Assert.AreEqual(current, token.Term);
                text.Append(current);
            }
            Assert.AreEqual(1048576, token.TermLength());
            Assert.AreEqual(1048576, token.TermBuffer().Length);

            // Slow growth to a long term, one character per pass.
            token = new Token();
            text = new System.Text.StringBuilder("a");
            for (int pass = 0; pass < 20000; pass++)
            {
                string current = text.ToString();
                token.SetTermBuffer(current);
                Assert.AreEqual(current.Length, token.TermLength());
                Assert.AreEqual(current, token.Term);
                text.Append("a");
            }
            Assert.AreEqual(20000, token.TermLength());
            Assert.AreEqual(32768, token.TermBuffer().Length);

            // Slow growth again on a fresh token (same scenario repeated).
            token = new Token();
            text = new System.Text.StringBuilder("a");
            for (int pass = 0; pass < 20000; pass++)
            {
                string current = text.ToString();
                token.SetTermBuffer(current);
                Assert.AreEqual(current.Length, token.TermLength());
                Assert.AreEqual(current, token.Term);
                text.Append("a");
            }
            Assert.AreEqual(20000, token.TermLength());
            Assert.AreEqual(32768, token.TermBuffer().Length);
        }
 /// <summary>
 /// Builds a Token from two offset values and sets its term text.
 /// </summary>
 /// <param name="term">Term text for the token.</param>
 /// <param name="start">First constructor argument of the token.</param>
 /// <param name="offset">Second constructor argument of the token
 /// (NOTE(review): other call sites here pass an end offset in this position — confirm).</param>
 /// <returns>The newly created token.</returns>
 private static Token CreateToken(String term, int start, int offset)
 {
     Token created = new Token(start, offset);

     created.SetTermBuffer(term);

     return created;
 }
Пример #22
0
 /// <summary>
 /// Records the current token (read from <c>offsetAtt</c> and <c>termAtt</c>)
 /// and its score in this group, widening the group's offsets and, for
 /// positively scored tokens, the matched-offset range. Does nothing once
 /// MAX_NUM_TOKENS_PER_GROUP tokens have been stored.
 /// </summary>
 /// <param name="score">Score of the current token; added to <c>tot</c> for the first
 /// token unconditionally and for later tokens only when positive.</param>
 protected internal void AddToken(float score)
 {
     if (NumTokens >= MAX_NUM_TOKENS_PER_GROUP)
     {
         return;
     }

     int tokenStart = offsetAtt.StartOffset;
     int tokenEnd = offsetAtt.EndOffset;

     if (NumTokens == 0)
     {
         // First token: it defines both the group range and the match range.
         MatchStartOffset = tokenStart;
         startOffset = tokenStart;
         MatchEndOffset = tokenEnd;
         endOffset = tokenEnd;
         tot += score;
     }
     else
     {
         startOffset = Math.Min(startOffset, tokenStart);
         endOffset = Math.Max(endOffset, tokenEnd);

         if (score > 0)
         {
             if (tot == 0)
             {
                 // No scored token so far: the match range starts here.
                 MatchStartOffset = offsetAtt.StartOffset;
                 MatchEndOffset = offsetAtt.EndOffset;
             }
             else
             {
                 MatchStartOffset = Math.Min(MatchStartOffset, tokenStart);
                 MatchEndOffset = Math.Max(MatchEndOffset, tokenEnd);
             }
             tot += score;
         }
     }

     var stored = new Token(tokenStart, tokenEnd);
     stored.SetTermBuffer(termAtt.Term);
     tokens[NumTokens] = stored;
     scores[NumTokens] = score;
     NumTokens++;
 }
 /// <summary>
 /// Advances the Suffix stream and copies its current attribute values
 /// (term, position increment, flags, offsets, type, payload) into
 /// <paramref name="token"/>.
 /// </summary>
 /// <param name="token">Token to fill.</param>
 /// <returns>The filled token, or null when Suffix.IncrementToken() returns false.</returns>
 private Token GetNextSuffixInputToken(Token token)
 {
     if (Suffix.IncrementToken())
     {
         token.SetTermBuffer(_termAtt.TermBuffer(), 0, _termAtt.TermLength());
         token.SetPositionIncrement(_posIncrAtt.GetPositionIncrement());
         token.SetFlags(_flagsAtt.GetFlags());
         token.SetOffset(_offsetAtt.StartOffset(), _offsetAtt.EndOffset());
         token.SetType(_typeAtt.Type());
         token.SetPayload(_payloadAtt.GetPayload());
         return token;
     }

     return null;
 }
Пример #24
0
 /// <summary>
 /// Forwards the call unchanged to <c>delegate_Renamed.SetTermBuffer</c>.
 /// </summary>
 /// <param name="buffer">Source character array.</param>
 /// <param name="offset">Offset argument passed through to the delegate.</param>
 /// <param name="length">Length argument passed through to the delegate.</param>
 public void SetTermBuffer(char[] buffer, int offset, int length)
 {
     // Pure pass-through; no state is kept on this wrapper.
     delegate_Renamed.SetTermBuffer(buffer, offset, length);
 }
        /// <summary>
        /// Builds a Token with the given offsets, term text and position increment.
        /// </summary>
        /// <param name="text">Term text for the token.</param>
        /// <param name="posIncr">Position increment to assign.</param>
        /// <param name="startOffset">First constructor argument of the token.</param>
        /// <param name="endOffset">Second constructor argument of the token.</param>
        /// <returns>The newly created token.</returns>
        private static Token TokenFactory(String text, int posIncr, int startOffset, int endOffset)
        {
            Token created = new Token(startOffset, endOffset);
            created.SetTermBuffer(text);
            created.SetPositionIncrement(posIncr);
            return created;
        }
Пример #26
0
        /// <summary>
        /// Low level api.
        /// Returns a token stream or null if no offset info available in index.
        /// This can be used to feed the highlighter with a pre-parsed token stream 
        /// 
        /// In my tests the speeds to recreate 1000 token streams using this method are:
        /// - with TermVector offset only data stored - 420  milliseconds 
        /// - with TermVector offset AND position data stored - 271 milliseconds
        ///  (nb timings for TermVector with position data are based on a tokenizer with contiguous
        ///  positions - no overlaps or gaps)
        /// The cost of not using TermPositionVector to store
        /// pre-parsed content and using an analyzer to re-parse the original content: 
        /// - reanalyzing the original content - 980 milliseconds
        /// 
        /// The re-analyze timings will typically vary depending on -
        ///     1) The complexity of the analyzer code (timings above were using a 
        ///        stemmer/lowercaser/stopword combo)
        ///  2) The  number of other fields (Lucene reads ALL fields off the disk 
        ///     when accessing just one document field - can cost dear!)
        ///  3) Use of compression on field storage - could be faster due to compression (less disk IO)
        ///     or slower (more CPU burn) depending on the content.
        /// </summary>
        /// <param name="tpv">Term vector supplying the terms, frequencies, offsets and (optionally) positions.</param>
        /// <param name="tokenPositionsGuaranteedContiguous">true if the token position numbers have no overlaps or gaps. If looking
        /// to eke out the last drops of performance, set to true. If in doubt, set to false.</param>
        /// <returns>A stream over the reconstructed tokens, or null if any term has no offset information.</returns>
        public static TokenStream GetTokenStream(TermPositionVector tpv, bool tokenPositionsGuaranteedContiguous)
        {
            //code to reconstruct the original sequence of Tokens
            String[] terms = tpv.GetTerms();
            int[] freq = tpv.GetTermFrequencies();

            int totalTokens = freq.Sum();

            var tokensInOriginalOrder = new Token[totalTokens];
            List<Token> unsortedTokens = null;
            for (int t = 0; t < freq.Length; t++)
            {
                TermVectorOffsetInfo[] offsets = tpv.GetOffsets(t);
                if (offsets == null)
                {
                    return null;
                }

                int[] pos = null;
                if (tokenPositionsGuaranteedContiguous)
                {
                    //try get the token position info to speed up assembly of tokens into sorted sequence
                    pos = tpv.GetTermPositions(t);
                }
                if (pos == null)
                {
                    //tokens NOT stored with positions or not guaranteed contiguous - must add to list and sort later
                    if (unsortedTokens == null)
                    {
                        unsortedTokens = new List<Token>();
                    }

                    foreach (TermVectorOffsetInfo offsetInfo in offsets)
                    {
                        var token = new Token(offsetInfo.StartOffset, offsetInfo.EndOffset);
                        token.SetTermBuffer(terms[t]);
                        unsortedTokens.Add(token);
                    }
                }
                else
                {
                    //We have positions stored and a guarantee that the token position information is contiguous

                    // This may be fast BUT won't work if Tokenizers are used which create >1 token in the same
                    // position or create jumps in position numbers - this code would fail under those circumstances

                    //tokens stored with positions - can use this to index straight into sorted array
                    for (int tp = 0; tp < pos.Length; tp++)
                    {
                        var token = new Token(terms[t], offsets[tp].StartOffset, offsets[tp].EndOffset);
                        tokensInOriginalOrder[pos[tp]] = token;
                    }
                }
            }
            //If the field has been stored without position data we must perform a sort
            if (unsortedTokens != null)
            {
                tokensInOriginalOrder = unsortedTokens.ToArray();
                // Order by start offset, then by end offset. The comparison must be
                // consistent (Compare(a, b) == -Compare(b, a)) for Array.Sort to give a
                // well-defined order; comparing one token's start against the other's
                // end in only one direction does not satisfy that for overlapping tokens.
                Array.Sort(tokensInOriginalOrder, (left, right) =>
                                                      {
                                                          int byStart = left.StartOffset.CompareTo(right.StartOffset);
                                                          if (byStart != 0)
                                                              return byStart;
                                                          return left.EndOffset.CompareTo(right.EndOffset);
                                                      });
            }
            return new StoredTokenStream(tokensInOriginalOrder);
        }
				/// <summary>
				/// Returns tokens from the wrapped input, each followed by a buffered
				/// "synonym" token with term "b", position increment 0 and no payload.
				/// A token whose first term character is a digit gets that digit as its
				/// position increment, and the very first token gets a one-byte payload.
				/// </summary>
				/// <param name="result">Token instance handed to the input stream for reuse.</param>
				/// <returns>The buffered synonym if one is pending, otherwise the next input
				/// token, or null when the input returns null.</returns>
				public override Token Next(Token result)
				{
					// A synonym prepared on the previous call goes out first.
					if (buffered != null)
					{
						Token pending = buffered;
						buffered = null;
						return pending;
					}

					Token current = input.Next(result);
					if (current == null)
					{
						return null;
					}

					char leading = current.TermBuffer()[0];
					if (System.Char.IsDigit(leading))
					{
						current.SetPositionIncrement(leading - '0');
					}

					if (first)
					{
						// set payload on first position only
						current.SetPayload(new Payload(new byte[]{100}));
						first = false;
					}

					// index a "synonym" for every token
					buffered = (Token) current.Clone();
					buffered.SetPayload(null);
					buffered.SetPositionIncrement(0);
					buffered.SetTermBuffer(new char[]{'b'}, 0, 1);

					return current;
				}
        /// <summary>
        /// Builds a Token with the given offsets, term text and position increment,
        /// then stores the weight and the token positioner on it through
        /// ShingleMatrixFilter.DefaultSettingsCodec.
        /// </summary>
        /// <param name="text">Term text for the token.</param>
        /// <param name="posIncr">Position increment to assign.</param>
        /// <param name="weight">Weight passed to the codec's SetWeight.</param>
        /// <param name="startOffset">First constructor argument of the token.</param>
        /// <param name="endOffset">Second constructor argument of the token.</param>
        /// <param name="positioner">Positioner passed to the codec's SetTokenPositioner.</param>
        /// <returns>The newly created token.</returns>
        private static Token TokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset,
                                          TokenPositioner positioner)
        {
            Token created = new Token(startOffset, endOffset);
            created.SetTermBuffer(text);
            created.SetPositionIncrement(posIncr);

            // Codec-managed settings are applied after the basic token fields.
            ShingleMatrixFilter.DefaultSettingsCodec.SetWeight(created, weight);
            ShingleMatrixFilter.DefaultSettingsCodec.SetTokenPositioner(created, positioner);
            return created;
        }
Пример #29
0
 /// <summary>
 /// Checks copying a token via TestSimpleAttributeImpls.AssertCopyIsEqual:
 /// an empty token copies to an empty term, a filled token copies its term
 /// into a separate buffer, and a payload is copied to an equal but distinct
 /// instance.
 /// </summary>
 public virtual void TestCopyTo()
 {
     // Empty token: both source and copy report an empty term.
     var source = new Token();
     var copied = (Token) TestSimpleAttributeImpls.AssertCopyIsEqual(source);
     Assert.AreEqual("", source.Term);
     Assert.AreEqual("", copied.Term);

     // Filled token: same term, separate backing array.
     source = new Token(0, 5);
     char[] hello = "hello".ToCharArray();
     source.SetTermBuffer(hello, 0, 5);
     char[] sourceBuffer = source.TermBuffer();
     copied = (Token) TestSimpleAttributeImpls.AssertCopyIsEqual(source);
     Assert.AreEqual(source.Term, copied.Term);
     Assert.AreNotSame(sourceBuffer, copied.TermBuffer());

     // Payload: equal value, different instance.
     var payload = new Payload(new byte[]{1, 2, 3, 4});
     source.Payload = payload;
     copied = (Token) TestSimpleAttributeImpls.AssertCopyIsEqual(source);
     Assert.AreEqual(payload, copied.Payload);
     Assert.AreNotSame(payload, copied.Payload);
 }
        /// <summary>
        /// Advances the <c>_input</c> stream and copies its current attribute values
        /// (term, position increment, flags, offsets, type, payload) into
        /// <paramref name="token"/>.
        /// </summary>
        /// <param name="token">Token to fill.</param>
        /// <returns>The filled token, or null when _input.IncrementToken() returns false.</returns>
        private Token GetNextInputToken(Token token)
        {
            if (_input.IncrementToken())
            {
                token.SetTermBuffer(_inTermAtt.TermBuffer(), 0, _inTermAtt.TermLength());
                token.SetPositionIncrement(_inPosIncrAtt.GetPositionIncrement());
                token.SetFlags(_inFlagsAtt.GetFlags());
                token.SetOffset(_inOffsetAtt.StartOffset(), _inOffsetAtt.EndOffset());
                token.SetType(_inTypeAtt.Type());
                token.SetPayload(_inPayloadAtt.GetPayload());
                return token;
            }

            return null;
        }
 /// <summary>
 /// Builds a Token with the given offsets, term text and position increment.
 /// </summary>
 /// <param name="text">Term text for the token.</param>
 /// <param name="posIncr">Position increment to assign.</param>
 /// <param name="startOffset">First constructor argument of the token.</param>
 /// <param name="endOffset">Second constructor argument of the token.</param>
 /// <returns>The newly created token.</returns>
 private Token TokenFactory(String text, int posIncr, int startOffset, int endOffset)
 {
     var created = new Token(startOffset, endOffset);

     created.SetTermBuffer(text);
     created.PositionIncrement = posIncr;

     return created;
 }
Пример #32
0
 /// <summary>
 /// Checks term-buffer growth: a term doubled 20 times (through the char[]
 /// overload and both string overloads of SetTermBuffer) and a term grown one
 /// character at a time up to 20000, asserting the final term length and the
 /// final buffer capacity in each case.
 /// </summary>
 public virtual void TestGrow()
 {
     // Doubling growth, term supplied as a char array.
     var token = new Token();
     var text = new System.Text.StringBuilder("ab");
     for (int pass = 0; pass < 20; pass++)
     {
         string current = text.ToString();
         char[] chars = current.ToCharArray();
         token.SetTermBuffer(chars, 0, chars.Length);
         Assert.AreEqual(text.Length, token.TermLength());
         Assert.AreEqual(current, token.Term);
         text.Append(current);
     }
     Assert.AreEqual(1048576, token.TermLength());
     Assert.AreEqual(1179654, token.TermBuffer().Length);

     // Doubling growth, term supplied as a string with offset and length.
     token = new Token();
     text = new System.Text.StringBuilder("ab");
     for (int pass = 0; pass < 20; pass++)
     {
         string current = text.ToString();
         token.SetTermBuffer(current, 0, current.Length);
         Assert.AreEqual(current.Length, token.TermLength());
         Assert.AreEqual(current, token.Term);
         text.Append(current);
     }
     Assert.AreEqual(1048576, token.TermLength());
     Assert.AreEqual(1179654, token.TermBuffer().Length);

     // Doubling growth, term supplied as a whole string.
     token = new Token();
     text = new System.Text.StringBuilder("ab");
     for (int pass = 0; pass < 20; pass++)
     {
         string current = text.ToString();
         token.SetTermBuffer(current);
         Assert.AreEqual(current.Length, token.TermLength());
         Assert.AreEqual(current, token.Term);
         text.Append(current);
     }
     Assert.AreEqual(1048576, token.TermLength());
     Assert.AreEqual(1179654, token.TermBuffer().Length);

     // Slow growth to a long term, one character per pass.
     token = new Token();
     text = new System.Text.StringBuilder("a");
     for (int pass = 0; pass < 20000; pass++)
     {
         string current = text.ToString();
         token.SetTermBuffer(current);
         Assert.AreEqual(current.Length, token.TermLength());
         Assert.AreEqual(current, token.Term);
         text.Append("a");
     }
     Assert.AreEqual(20000, token.TermLength());
     Assert.AreEqual(20167, token.TermBuffer().Length);

     // Slow growth again on a fresh token (same scenario repeated).
     token = new Token();
     text = new System.Text.StringBuilder("a");
     for (int pass = 0; pass < 20000; pass++)
     {
         string current = text.ToString();
         token.SetTermBuffer(current);
         Assert.AreEqual(current.Length, token.TermLength());
         Assert.AreEqual(current, token.Term);
         text.Append("a");
     }
     Assert.AreEqual(20000, token.TermLength());
     Assert.AreEqual(20167, token.TermBuffer().Length);
 }
Пример #33
0
 /// <summary>
 /// Checks Token.Equals: two tokens filled with "hello" from separate arrays
 /// compare equal, and neither equals a token filled with "hello2".
 /// </summary>
 public virtual void TestTermBufferEquals()
 {
     var first = new Token();
     char[] firstChars = "hello".ToCharArray();
     first.SetTermBuffer(firstChars, 0, 5);

     var sameText = new Token();
     char[] sameTextChars = "hello".ToCharArray();
     sameText.SetTermBuffer(sameTextChars, 0, 5);

     var otherText = new Token();
     char[] otherTextChars = "hello2".ToCharArray();
     otherText.SetTermBuffer(otherTextChars, 0, 6);

     Assert.IsTrue(first.Equals(sameText));
     Assert.IsFalse(first.Equals(otherText));
     Assert.IsFalse(otherText.Equals(sameText));
 }
Пример #34
0
        /// <summary>
        /// Emits each token of <c>realStream</c> and then, for terms that have an
        /// entry in <c>synonyms</c>, one extra token per comma-separated expansion
        /// with the same offsets and a position increment of 0.
        /// </summary>
        /// <returns>false when the underlying stream is exhausted; true otherwise.</returns>
        public override bool IncrementToken()
        {
            if (currentRealToken != null)
            {
                // In the middle of an expansion list: emit the next synonym on top
                // of the real token (same offsets, no position advance).
                String expansion = st.NextToken();
                ClearAttributes();
                termAtt.SetTermBuffer(expansion);
                offsetAtt.SetOffset(currentRealToken.StartOffset, currentRealToken.EndOffset);
                posIncrAtt.PositionIncrement = 0;

                if (!st.HasMoreTokens())
                {
                    // Expansion list finished; the next call reads a real token.
                    currentRealToken = null;
                    st = null;
                }
                return true;
            }

            if (!realStream.IncrementToken())
            {
                return false;
            }

            // Copy the real token's attributes onto this stream's attributes.
            ClearAttributes();
            termAtt.SetTermBuffer(realTermAtt.Term);
            offsetAtt.SetOffset(realOffsetAtt.StartOffset, realOffsetAtt.EndOffset);
            posIncrAtt.PositionIncrement = realPosIncrAtt.PositionIncrement;

            // NOTE(review): this relies on the synonyms indexer returning null for an
            // unknown term — confirm against the collection type declared elsewhere.
            String expansions = synonyms[realTermAtt.Term];
            if (expansions != null)
            {
                st = new Tokenizer(expansions, ",");
                if (st.HasMoreTokens())
                {
                    // Remember the real token so the synonyms can reuse its offsets.
                    currentRealToken = new Token(realOffsetAtt.StartOffset, realOffsetAtt.EndOffset);
                    currentRealToken.SetTermBuffer(realTermAtt.Term);
                }
            }

            return true;
        }
        /// <summary>
        /// Performs one step of shingle production over the current matrix permutation.
        /// This method exists in order to avoid recursive calls to the method,
        /// as the complexity of a fairly small matrix would then easily require
        /// a gigabyte-sized stack per thread.
        /// </summary>
        /// <param name="reusableToken">Token that receives the shingle text when a new shingle is produced.</param>
        /// <returns>null if exhausted, the <c>_requestNextToken</c> instance if one more call is required for an answer,
        /// or the <paramref name="reusableToken"/> parameter filled with the new shingle. In the
        /// ignored single prefix/suffix case the result of <c>Next()</c> is returned instead.</returns>
        private Token ProduceNextToken(Token reusableToken)
        {
            if (_currentPermuationTokens != null)
            {
                // Try the next longer shingle starting at the current start offset.
                _currentShingleLength++;

                if (_currentShingleLength + _currentPermutationTokensStartOffset <= _currentPermuationTokens.Count
                    && _currentShingleLength <= MaximumShingleSize)
                {
                    // it is possible to create at least one more shingle of the current matrix permutation

                    // Skip a one-token shingle taken from the first or last column when
                    // single prefix/suffix shingles are being ignored.
                    if (IsIgnoringSinglePrefixOrSuffixShingle && 
                        _currentShingleLength == 1 && 
                        (_currentPermutationRows[_currentPermutationTokensStartOffset].Column.IsFirst || _currentPermutationRows[_currentPermutationTokensStartOffset].Column.IsLast))
                    {
                        return Next();
                    }

                    var termLength = 0;

                    var shingle = new EquatableList<Token>();

                    // Collect the tokens of this shingle and add up their term lengths.
                    for (int i = 0; i < _currentShingleLength; i++)
                    {
                        var shingleToken = _currentPermuationTokens[i + _currentPermutationTokensStartOffset];
                        termLength += shingleToken.TermLength();
                        shingle.Add(shingleToken);
                    }
                    // One spacer between each pair of adjacent tokens.
                    if (SpacerCharacter != null)
                        termLength += _currentShingleLength - 1;

                    // only produce shingles that have not already been created
                    if (!_shinglesSeen.Add(shingle))
                        return _requestNextToken;

                    // shingle token factory
                    var sb = new StringBuilder(termLength + 10); // paranormal ability to foresee the future. ;)
                    foreach (var shingleToken in shingle)
                    {
                        if (SpacerCharacter != null &&  sb.Length > 0)
                            sb.Append(SpacerCharacter);

                        sb.Append(shingleToken.TermBuffer(), 0, shingleToken.TermLength());
                    }

                    reusableToken.SetTermBuffer(sb.ToString());
                    UpdateToken(reusableToken, shingle, _currentPermutationTokensStartOffset, _currentPermutationRows,
                                _currentPermuationTokens);

                    return reusableToken;
                }

                // it is NOT possible to create one more shingle of the current matrix permutation
                if (_currentPermutationTokensStartOffset < _currentPermuationTokens.Count - 1)
                {
                    // reset shingle size and move one step to the right in the current tokens permutation
                    _currentPermutationTokensStartOffset++;
                    _currentShingleLength = MinimumShingleSize - 1;
                    return _requestNextToken;
                }


                // todo does this ever occur?
                if (_permutations == null)
                    return null;

                if (!_permutations.HasNext())
                {
                    // load more data (if available) to the matrix

                    // don't really care, we just read it.
                    if (_input != null)
                        ReadColumn();

                    // get rid of resources

                    // delete the first column in the matrix
                    var deletedColumn = Matrix.Columns[0];
                    Matrix.Columns.RemoveAt(0);

                    // remove all shingles seen that include any of the tokens from the deleted column.
                    var deletedColumnTokens = deletedColumn.Rows.SelectMany(row => row.Tokens).ToList();
                    
                    // I'm a little concerned about this part of the code, because the unit tests currently
                    // don't cover this scenario. (I put a break point here, and ran the unit tests in debug mode
                    // and this code block was never hit... I also changed it significantly from the Java version
                    // to use RemoveWhere and LINQ.)
                    //
                    // TODO: Write a unit test to cover this and make sure this is a good port! -thoward

                    // linq version
                    _shinglesSeen.RemoveWhere(
                        shingle => (shingle.Find(deletedColumnTokens.Contains) != default(Token)));

                    //// initial conversion
                    //var shinglesSeenIterator = _shinglesSeen.ToList();
                    //foreach (var shingle in shinglesSeenIterator)
                    //{
                    //    foreach (var deletedColumnToken in deletedColumnTokens)
                    //    {
                    //        if (shingle.Contains(deletedColumnToken))
                    //        {
                    //            _shinglesSeen.Remove(shingle);
                    //            break;
                    //        }
                    //    }
                    //}

                    // exhausted
                    if (Matrix.Columns.Count < MinimumShingleSize)
                        return null;

                    // create permutations of the matrix as it now looks
                    _permutations = Matrix.PermutationIterator();
                }

                NextTokensPermutation();
                return _requestNextToken;
            }

            // No permutation loaded yet: create the iterator on first use.
            if (_permutations == null)
                _permutations = Matrix.PermutationIterator();

            if (!_permutations.HasNext())
                return null;

            NextTokensPermutation();

            return _requestNextToken;
        }
Пример #36
0
 /// <summary>
 /// Checks that the string and char[] overloads of SetTermBuffer can be mixed
 /// on one token, and that edits to the array returned by TermBuffer() are
 /// reflected by Term.
 /// </summary>
 public virtual void TestMixedStringArray()
 {
     Token t = new Token("hello", 0, 5);
     // Assert.AreEqual takes (expected, actual); keeping that order makes
     // failure messages report the right value as "expected".
     Assert.AreEqual(5, t.TermLength());
     Assert.AreEqual("hello", t.Term);
     t.SetTermBuffer("hello2");
     Assert.AreEqual(6, t.TermLength());
     Assert.AreEqual("hello2", t.Term);
     t.SetTermBuffer("hello3".ToCharArray(), 0, 6);
     Assert.AreEqual("hello3", t.Term);

     // Change a character through the returned buffer and make sure
     // Term reflects the change.
     char[] buffer = t.TermBuffer();
     buffer[1] = 'o';
     Assert.AreEqual("hollo3", t.Term);
 }