Reset() public method

Resets this stream to the beginning. This is an optional operation, so subclasses may or may not implement this method. Reset() is not needed for the standard indexing process. However, if the tokens of a TokenStream are intended to be consumed more than once, it is necessary to implement Reset(). Note that if your TokenStream caches tokens and feeds them back again after a reset, it is imperative that you clone the tokens when you store them away (on the first pass) as well as when you return them (on future passes after Reset()).
public virtual void Reset()
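
The cloning requirement above is satisfied most easily with attribute-state capture: CaptureState() hands back a clone of the current attribute values, and RestoreState() copies a stored clone back into the stream's attributes. The following is a minimal sketch, not taken from the examples below; the class name ReplayingFilter and its fields are hypothetical, and it assumes a Lucene.NET version (as in several of the examples) where TokenFilter exposes a protected input field and AttributeSource provides CaptureState()/RestoreState().

        using System.Collections.Generic;
        using Lucene.Net.Analysis;
        using Lucene.Net.Util;

        // Hypothetical filter: caches cloned token states on the first pass and
        // replays those clones on every pass after Reset().
        public sealed class ReplayingFilter : TokenFilter
        {
            private readonly List<AttributeSource.State> cache = new List<AttributeSource.State>();
            private bool firstPassDone = false; // true once the cache holds the whole stream
            private int  replayPos     = 0;

            public ReplayingFilter(TokenStream input) : base(input)
            {
            }

            public override bool IncrementToken()
            {
                if (!firstPassDone)
                {
                    if (input.IncrementToken())
                    {
                        cache.Add(CaptureState()); // store a clone, never the live attributes
                        return true;
                    }
                    firstPassDone = true;
                    return false;
                }
                if (replayPos < cache.Count)
                {
                    // Copy the stored clone back into the attributes; consumers
                    // never see (and so cannot mutate) the cached state itself.
                    RestoreState(cache[replayPos++]);
                    return true;
                }
                return false;
            }

            public override void Reset()
            {
                base.Reset();  // also resets the wrapped stream (compare Examples #14 and #18)
                replayPos = 0; // passes after the first replay the cached clones
            }
        }

A consumer makes a second pass by calling Reset() and iterating IncrementToken() again; every token it then sees is restored from a clone, so nothing a downstream filter does to the attributes can corrupt the cache.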
Example #1
        protected internal virtual void ToDotFile(Analyzer a, string inputText, string localFileName)
        {
            // FileMode.Create: the dot file is (re)created rather than requiring an existing file
            StreamWriter w  = new StreamWriter(new FileStream(localFileName, FileMode.Create), IOUtils.CHARSET_UTF_8);
            // StringReader: inputText holds the text itself, not a file path
            TokenStream  ts = a.TokenStream("field", new StringReader(inputText));

            ts.Reset();
            new TokenStreamToDot(inputText, ts, w).ToDot();
            w.Close();
        }
Example #2
        protected internal virtual string ToDot(Analyzer a, string inputText)
        {
            StringWriter sw = new StringWriter();
            TokenStream  ts = a.TokenStream("field", new StringReader(inputText));

            ts.Reset();
            new TokenStreamToDot(inputText, ts, sw).ToDot();
            return sw.ToString();
        }
Example #3
        public virtual void AssertThreadSafe(Analyzer analyzer)
        {
            int numTestPoints = 100;
            int numThreads    = TestUtil.NextInt(Random(), 3, 5);
            Dictionary <string, BytesRef> map = new Dictionary <string, BytesRef>();

            // create a map<String,SortKey> up front.
            // then with multiple threads, generate sort keys for all the keys in the map
            // and ensure they are the same as the ones we produced in serial fashion.

            for (int i = 0; i < numTestPoints; i++)
            {
                string      term           = TestUtil.RandomSimpleString(Random());
                IOException priorException = null;
                TokenStream ts             = analyzer.TokenStream("fake", new StringReader(term));
                try
                {
                    ITermToBytesRefAttribute termAtt = ts.AddAttribute <ITermToBytesRefAttribute>();
                    BytesRef bytes = termAtt.BytesRef;
                    ts.Reset();
                    Assert.IsTrue(ts.IncrementToken());
                    termAtt.FillBytesRef();
                    // ensure we make a copy of the actual bytes too
                    map[term] = BytesRef.DeepCopyOf(bytes);
                    Assert.IsFalse(ts.IncrementToken());
                    ts.End();
                }
                catch (IOException e)
                {
                    priorException = e;
                }
                finally
                {
                    IOUtils.CloseWhileHandlingException(priorException, ts);
                }
            }

            ThreadClass[] threads = new ThreadClass[numThreads];
            for (int i = 0; i < numThreads; i++)
            {
                threads[i] = new ThreadAnonymousInnerClassHelper(this, analyzer, map);
            }
            for (int i = 0; i < numThreads; i++)
            {
                threads[i].Start();
            }
            for (int i = 0; i < numThreads; i++)
            {
                threads[i].Join();
            }
        }
Example #4
        public void TestLUCENE_3042()
        {
            String testString = "t";

            Analyzer analyzer = new MockAnalyzer(Random);

            using (TokenStream stream = analyzer.GetTokenStream("dummy", testString))
            {
                stream.Reset();
                while (stream.IncrementToken())
                {
                    // consume
                }
                stream.End();
            }

            AssertAnalyzesTo(analyzer, testString, new String[] { "t" });
        }
Example #5
        public void TestForwardOffsets()
        {
            int num = AtLeast(1000);

            for (int i = 0; i < num; i++)
            {
                String         s          = TestUtil.RandomHtmlishString(Random, 20);
                StringReader   reader     = new StringReader(s);
                MockCharFilter charfilter = new MockCharFilter(reader, 2);
                MockAnalyzer   analyzer   = new MockAnalyzer(Random);
                using TokenStream ts = analyzer.GetTokenStream("bogus", charfilter);
                ts.Reset();
                while (ts.IncrementToken())
                {
                    // consume
                }
                ts.End();
            }
        }
Example #7
 public override void Run()
 {
     try
     {
         foreach (var mapping in this.map)
         {
             string      term           = mapping.Key;
             BytesRef    expected       = mapping.Value;
             Exception   priorException = null; // LUCENENET: No need to cast to IOException
             TokenStream ts             = this.analyzer.GetTokenStream("fake", new StringReader(term));
             try
             {
                 ITermToBytesRefAttribute termAtt = ts.AddAttribute <ITermToBytesRefAttribute>();
                 BytesRef bytes = termAtt.BytesRef;
                 ts.Reset();
                 Assert.IsTrue(ts.IncrementToken());
                 termAtt.FillBytesRef();
                 Assert.AreEqual(expected, bytes);
                 Assert.IsFalse(ts.IncrementToken());
                 ts.End();
             }
              catch (Exception e) when (e.IsIOException())
             {
                 priorException = e;
             }
             finally
             {
                 IOUtils.DisposeWhileHandlingException(priorException, ts);
             }
         }
     }
      catch (Exception e) when (e.IsIOException())
     {
         throw RuntimeException.Create(e);
     }
 }
Example #9
 public override void Run()
 {
     try
     {
         foreach (KeyValuePair <string, BytesRef> mapping in Map)
         {
             string      term           = mapping.Key;
             BytesRef    expected       = mapping.Value;
             IOException priorException = null;
              TokenStream ts             = Analyzer.TokenStream("fake", new StringReader(term));
             try
             {
                 ITermToBytesRefAttribute termAtt = ts.AddAttribute <ITermToBytesRefAttribute>();
                 BytesRef bytes = termAtt.BytesRef;
                 ts.Reset();
                 Assert.IsTrue(ts.IncrementToken());
                 termAtt.FillBytesRef();
                 Assert.AreEqual(expected, bytes);
                 Assert.IsFalse(ts.IncrementToken());
                 ts.End();
             }
             catch (IOException e)
             {
                 priorException = e;
             }
             finally
             {
                 IOUtils.CloseWhileHandlingException(priorException, ts);
             }
         }
     }
      catch (IOException)
      {
          throw; // rethrow unchanged so the worker thread fails the test
      }
 }
Example #10
        public static void AssertTokenStreamContents(TokenStream ts, System.String[] output, int[] startOffsets, int[] endOffsets, System.String[] types, int[] posIncrements, int? finalOffset)
        {
            Assert.IsNotNull(output);
            ICheckClearAttributesAttribute checkClearAtt = ts.AddAttribute<ICheckClearAttributesAttribute>();

            Assert.IsTrue(ts.HasAttribute<ITermAttribute>(), "has no TermAttribute");
            ITermAttribute termAtt = ts.GetAttribute<ITermAttribute>();

            IOffsetAttribute offsetAtt = null;
            if (startOffsets != null || endOffsets != null || finalOffset != null)
            {
                Assert.IsTrue(ts.HasAttribute<IOffsetAttribute>(), "has no OffsetAttribute");
                offsetAtt = ts.GetAttribute<IOffsetAttribute>();
            }
    
            ITypeAttribute typeAtt = null;
            if (types != null)
            {
                Assert.IsTrue(ts.HasAttribute<ITypeAttribute>(), "has no TypeAttribute");
                typeAtt = ts.GetAttribute<ITypeAttribute>();
            }
            
            IPositionIncrementAttribute posIncrAtt = null;
            if (posIncrements != null)
            {
                Assert.IsTrue(ts.HasAttribute<IPositionIncrementAttribute>(), "has no PositionIncrementAttribute");
                posIncrAtt = ts.GetAttribute<IPositionIncrementAttribute>();
            }

            ts.Reset();
            for (int i = 0; i < output.Length; i++)
            {
                // extra safety to enforce, that the state is not preserved and also assign bogus values
                ts.ClearAttributes();
                termAtt.SetTermBuffer("bogusTerm");
                if (offsetAtt != null) offsetAtt.SetOffset(14584724, 24683243);
                if (typeAtt != null) typeAtt.Type = "bogusType";
                if (posIncrAtt != null) posIncrAtt.PositionIncrement = 45987657;

                checkClearAtt.GetAndResetClearCalled(); // reset it, because we called clearAttribute() before
                Assert.IsTrue(ts.IncrementToken(), "token " + i + " does not exist");
                Assert.IsTrue(checkClearAtt.GetAndResetClearCalled(), "clearAttributes() was not called correctly in TokenStream chain");

                Assert.AreEqual(output[i], termAtt.Term, "term " + i);
                if (startOffsets != null)
                    Assert.AreEqual(startOffsets[i], offsetAtt.StartOffset, "startOffset " + i);
                if (endOffsets != null)
                    Assert.AreEqual(endOffsets[i], offsetAtt.EndOffset, "endOffset " + i);
                if (types != null)
                    Assert.AreEqual(types[i], typeAtt.Type, "type " + i);
                if (posIncrements != null)
                    Assert.AreEqual(posIncrements[i], posIncrAtt.PositionIncrement, "posIncrement " + i);
            }
            Assert.IsFalse(ts.IncrementToken(), "end of stream");
            ts.End();
            if (finalOffset.HasValue)
                Assert.AreEqual(finalOffset, offsetAtt.EndOffset, "finalOffset ");
            ts.Close();
        }
Example #11
		private static void  ConsumeStreamVeryOldAPI(TokenStream stream)
		{
			stream.Reset();
			
			Token token;
			int i = 0;
			while ((token = stream.Next()) != null)
			{
				System.String term = token.Term();
				Payload p = token.GetPayload();
				if (p != null && p.GetData().Length == 1 && p.GetData()[0] == PartOfSpeechAnnotatingFilter.PROPER_NOUN_ANNOTATION)
				{
					Assert.IsTrue("tokenstream".Equals(term), "only TokenStream is a proper noun");
				}
				else
				{
					Assert.IsFalse("tokenstream".Equals(term), "all other tokens (if this test fails, the special POSToken subclass is not correctly passed through the chain)");
				}
				Assert.AreEqual(results[i], term);
				i++;
			}
		}
Example #12
		private static void  ConsumeStreamNewAPI(TokenStream stream)
		{
			stream.Reset();
			PayloadAttribute payloadAtt = (PayloadAttribute) stream.AddAttribute(typeof(PayloadAttribute));
			TermAttribute termAtt = (TermAttribute) stream.AddAttribute(typeof(TermAttribute));
			
			int i = 0;
			while (stream.IncrementToken())
			{
				System.String term = termAtt.Term();
				Payload p = payloadAtt.GetPayload();
				if (p != null && p.GetData().Length == 1 && p.GetData()[0] == PartOfSpeechAnnotatingFilter.PROPER_NOUN_ANNOTATION)
				{
					Assert.IsTrue("tokenstream".Equals(term), "only TokenStream is a proper noun");
				}
				else
				{
					Assert.IsFalse("tokenstream".Equals(term), "all other tokens (if this test fails, the special POSToken subclass is not correctly passed through the chain)");
				}
				Assert.AreEqual(results[i], term);
				i++;
			}
		}
Example #13
        public virtual void ToDot()
        {
            @in.Reset();
            WriteHeader();

            // TODO: is there some way to tell dot that it should
            // make the "main path" a straight line and have the
            // non-sausage arcs not affect node placement...

            int pos        = -1;
            int lastEndPos = -1;

            while (@in.IncrementToken())
            {
                bool isFirst = pos == -1;
                int  posInc  = PosIncAtt.PositionIncrement;
                if (isFirst && posInc == 0)
                {
                    // TODO: hmm are TS's still allowed to do this...?
                    Console.Error.WriteLine("WARNING: first posInc was 0; correcting to 1");
                    posInc = 1;
                }

                if (posInc > 0)
                {
                    // New node:
                    pos += posInc;
                    WriteNode(pos, Convert.ToString(pos));
                }

                if (posInc > 1)
                {
                    // Gap!
                    WriteArc(lastEndPos, pos, null, "dotted");
                }

                if (isFirst)
                {
                    WriteNode(-1, null);
                    WriteArc(-1, pos, null, null);
                }

                string arcLabel = TermAtt.ToString();
                if (OffsetAtt != null)
                {
                    int startOffset = OffsetAtt.StartOffset;
                    int endOffset   = OffsetAtt.EndOffset;
                    //System.out.println("start=" + startOffset + " end=" + endOffset + " len=" + inputText.length());
                    if (InputText != null)
                    {
                        arcLabel += " / " + InputText.Substring(startOffset, endOffset - startOffset);
                    }
                    else
                    {
                        arcLabel += " / " + startOffset + "-" + endOffset;
                    }
                }

                WriteArc(pos, pos + PosLengthAtt.PositionLength, arcLabel, null);
                lastEndPos = pos + PosLengthAtt.PositionLength;
            }

            @in.End();

            if (lastEndPos != -1)
            {
                // TODO: should we output any final text (from end
                // offsets) on this arc...?
                WriteNode(-2, null);
                WriteArc(lastEndPos, -2, null, null);
            }

            WriteTrailer();
        }
Example #14
 /// <summary>Reset the filter as well as the input TokenStream. </summary>
 public override void  Reset()
 {
     input.Reset();
 }
Example #15
        /// <summary>
        /// Pulls the graph (including the <see cref="IPositionLengthAttribute"/>)
        /// from the provided <see cref="TokenStream"/>, and creates the corresponding
        /// automaton where arcs are bytes (or Unicode code points
        /// if unicodeArcs = true) from each term.
        /// </summary>
        public virtual Automaton ToAutomaton(TokenStream @in)
        {
            var  a             = new Automaton();
            bool deterministic = true;

            var posIncAtt    = @in.AddAttribute <IPositionIncrementAttribute>();
            var posLengthAtt = @in.AddAttribute <IPositionLengthAttribute>();
            var offsetAtt    = @in.AddAttribute <IOffsetAttribute>();
            var termBytesAtt = @in.AddAttribute <ITermToBytesRefAttribute>();

            BytesRef term = termBytesAtt.BytesRef;

            @in.Reset();

            // Only temporarily holds states ahead of our current
            // position:

            RollingBuffer <Position> positions = new Positions();

            int      pos       = -1;
            Position posData   = null;
            int      maxOffset = 0;

            while (@in.IncrementToken())
            {
                int posInc = posIncAtt.PositionIncrement;
                if (!preservePositionIncrements && posInc > 1)
                {
                    posInc = 1;
                }
                Debug.Assert(pos > -1 || posInc > 0);

                if (posInc > 0)
                {
                    // New node:
                    pos += posInc;

                    posData = positions.Get(pos);
                    Debug.Assert(posData.leaving == null);

                    if (posData.arriving == null)
                    {
                        // No token ever arrived to this position
                        if (pos == 0)
                        {
                            // OK: this is the first token
                            posData.leaving = a.GetInitialState();
                        }
                        else
                        {
                            // this means there's a hole (eg, StopFilter
                            // does this):
                            posData.leaving = new State();
                            AddHoles(a.GetInitialState(), positions, pos);
                        }
                    }
                    else
                    {
                        posData.leaving = new State();
                        posData.arriving.AddTransition(new Transition(POS_SEP, posData.leaving));
                        if (posInc > 1)
                        {
                            // A token spanned over a hole; add holes
                            // "under" it:
                            AddHoles(a.GetInitialState(), positions, pos);
                        }
                    }
                    positions.FreeBefore(pos);
                }
                else
                {
                    // note: this isn't necessarily true. its just that we aren't surely det.
                    // we could optimize this further (e.g. buffer and sort synonyms at a position)
                    // but thats probably overkill. this is cheap and dirty
                    deterministic = false;
                }

                int endPos = pos + posLengthAtt.PositionLength;

                termBytesAtt.FillBytesRef();
                BytesRef termUTF8    = ChangeToken(term);
                int[]    termUnicode = null;
                Position endPosData  = positions.Get(endPos);
                if (endPosData.arriving == null)
                {
                    endPosData.arriving = new State();
                }

                State state   = posData.leaving;
                int   termLen = termUTF8.Length;
                if (unicodeArcs)
                {
                    string utf16 = termUTF8.Utf8ToString();
                    termUnicode = new int[utf16.CodePointCount(0, utf16.Length)];
                    termLen     = termUnicode.Length;
                    for (int cp, i = 0, j = 0; i < utf16.Length; i += Character.CharCount(cp))
                    {
                        termUnicode[j++] = cp = Character.CodePointAt(utf16, i);
                    }
                }
                else
                {
                    termLen = termUTF8.Length;
                }

                for (int byteIDX = 0; byteIDX < termLen; byteIDX++)
                {
                    State nextState = byteIDX == termLen - 1 ? endPosData.arriving : new State();
                    int   c;
                    if (unicodeArcs)
                    {
                        c = termUnicode[byteIDX];
                    }
                    else
                    {
                        c = termUTF8.Bytes[termUTF8.Offset + byteIDX] & 0xff;
                    }
                    state.AddTransition(new Transition(c, nextState));
                    state = nextState;
                }

                maxOffset = Math.Max(maxOffset, offsetAtt.EndOffset);
            }

            @in.End();
            State endState = null;

            if (offsetAtt.EndOffset > maxOffset)
            {
                endState        = new State();
                endState.Accept = true;
            }

            pos++;
            while (pos <= positions.MaxPos)
            {
                posData = positions.Get(pos);
                if (posData.arriving != null)
                {
                    if (endState != null)
                    {
                        posData.arriving.AddTransition(new Transition(POS_SEP, endState));
                    }
                    else
                    {
                        posData.arriving.Accept = true;
                    }
                }
                pos++;
            }

            //toDot(a);
            a.IsDeterministic = deterministic;
            return a;
        }
Example #16
        public static void AssertTokenStreamContents(TokenStream ts, System.String[] output, int[] startOffsets, int[] endOffsets, System.String[] types, int[] posIncrements, int? finalOffset)
        {
            Assert.IsNotNull(output);
            CheckClearAttributesAttribute checkClearAtt = (CheckClearAttributesAttribute)ts.AddAttribute(typeof(CheckClearAttributesAttribute));

            Assert.IsTrue(ts.HasAttribute(typeof(TermAttribute)), "has no TermAttribute");
            TermAttribute termAtt = (TermAttribute)ts.GetAttribute(typeof(TermAttribute));

            OffsetAttribute offsetAtt = null;

            if (startOffsets != null || endOffsets != null || finalOffset != null)
            {
                Assert.IsTrue(ts.HasAttribute(typeof(OffsetAttribute)), "has no OffsetAttribute");
                offsetAtt = (OffsetAttribute)ts.GetAttribute(typeof(OffsetAttribute));
            }

            TypeAttribute typeAtt = null;

            if (types != null)
            {
                Assert.IsTrue(ts.HasAttribute(typeof(TypeAttribute)), "has no TypeAttribute");
                typeAtt = (TypeAttribute)ts.GetAttribute(typeof(TypeAttribute));
            }

            PositionIncrementAttribute posIncrAtt = null;

            if (posIncrements != null)
            {
                Assert.IsTrue(ts.HasAttribute(typeof(PositionIncrementAttribute)), "has no PositionIncrementAttribute");
                posIncrAtt = (PositionIncrementAttribute)ts.GetAttribute(typeof(PositionIncrementAttribute));
            }

            ts.Reset();
            for (int i = 0; i < output.Length; i++)
            {
                // extra safety to enforce, that the state is not preserved and also assign bogus values
                ts.ClearAttributes();
                termAtt.SetTermBuffer("bogusTerm");
                if (offsetAtt != null)
                {
                    offsetAtt.SetOffset(14584724, 24683243);
                }
                if (typeAtt != null)
                {
                    typeAtt.SetType("bogusType");
                }
                if (posIncrAtt != null)
                {
                    posIncrAtt.SetPositionIncrement(45987657);
                }

                checkClearAtt.GetAndResetClearCalled(); // reset it, because we called clearAttribute() before
                Assert.IsTrue(ts.IncrementToken(), "token " + i + " does not exist");
                Assert.IsTrue(checkClearAtt.GetAndResetClearCalled(), "clearAttributes() was not called correctly in TokenStream chain");

                Assert.AreEqual(output[i], termAtt.Term(), "term " + i);
                if (startOffsets != null)
                {
                    Assert.AreEqual(startOffsets[i], offsetAtt.StartOffset(), "startOffset " + i);
                }
                if (endOffsets != null)
                {
                    Assert.AreEqual(endOffsets[i], offsetAtt.EndOffset(), "endOffset " + i);
                }
                if (types != null)
                {
                    Assert.AreEqual(types[i], typeAtt.Type(), "type " + i);
                }
                if (posIncrements != null)
                {
                    Assert.AreEqual(posIncrements[i], posIncrAtt.GetPositionIncrement(), "posIncrement " + i);
                }
            }
            Assert.IsFalse(ts.IncrementToken(), "end of stream");
            ts.End();
            if (finalOffset.HasValue)
            {
                Assert.AreEqual(finalOffset, offsetAtt.EndOffset(), "finalOffset ");
            }
            ts.Close();
        }
Example #17
        // offsetsAreCorrect also validates:
        //   - graph offsets are correct (all tokens leaving from
        //     pos X have the same startOffset; all tokens
        //     arriving to pos Y have the same endOffset)
        //   - offsets only move forwards (startOffset >=
        //     lastStartOffset)
        public static void AssertTokenStreamContents(TokenStream ts, string[] output, int[] startOffsets, int[] endOffsets, string[] types, int[] posIncrements, int[] posLengths, int? finalOffset, int? finalPosInc, bool[] keywordAtts, bool offsetsAreCorrect)
        {
            // LUCENENET: Bug fix: NUnit throws an exception when something fails. 
            // This causes Dispose() to be skipped and it pollutes other tests indicating false negatives.
            // Added this try-finally block to fix this.
            try
            {

                Assert.IsNotNull(output);
                var checkClearAtt = ts.AddAttribute<ICheckClearAttributesAttribute>();

                ICharTermAttribute termAtt = null;
                if (output.Length > 0)
                {
                    Assert.IsTrue(ts.HasAttribute<ICharTermAttribute>(), "has no CharTermAttribute");
                    termAtt = ts.GetAttribute<ICharTermAttribute>();
                }

                IOffsetAttribute offsetAtt = null;
                if (startOffsets != null || endOffsets != null || finalOffset != null)
                {
                    Assert.IsTrue(ts.HasAttribute<IOffsetAttribute>(), "has no OffsetAttribute");
                    offsetAtt = ts.GetAttribute<IOffsetAttribute>();
                }

                ITypeAttribute typeAtt = null;
                if (types != null)
                {
                    Assert.IsTrue(ts.HasAttribute<ITypeAttribute>(), "has no TypeAttribute");
                    typeAtt = ts.GetAttribute<ITypeAttribute>();
                }

                IPositionIncrementAttribute posIncrAtt = null;
                if (posIncrements != null || finalPosInc != null)
                {
                    Assert.IsTrue(ts.HasAttribute<IPositionIncrementAttribute>(), "has no PositionIncrementAttribute");
                    posIncrAtt = ts.GetAttribute<IPositionIncrementAttribute>();
                }

                IPositionLengthAttribute posLengthAtt = null;
                if (posLengths != null)
                {
                    Assert.IsTrue(ts.HasAttribute<IPositionLengthAttribute>(), "has no PositionLengthAttribute");
                    posLengthAtt = ts.GetAttribute<IPositionLengthAttribute>();
                }

                IKeywordAttribute keywordAtt = null;
                if (keywordAtts != null)
                {
                    Assert.IsTrue(ts.HasAttribute<IKeywordAttribute>(), "has no KeywordAttribute");
                    keywordAtt = ts.GetAttribute<IKeywordAttribute>();
                }

                // Maps position to the start/end offset:
                IDictionary<int?, int?> posToStartOffset = new Dictionary<int?, int?>();
                IDictionary<int?, int?> posToEndOffset = new Dictionary<int?, int?>();

                ts.Reset();
                int pos = -1;
                int lastStartOffset = 0;
                for (int i = 0; i < output.Length; i++)
                {
                    // extra safety to enforce, that the state is not preserved and also assign bogus values
                    ts.ClearAttributes();
                    termAtt.SetEmpty().Append("bogusTerm");
                    if (offsetAtt != null)
                    {
                        offsetAtt.SetOffset(14584724, 24683243);
                    }
                    if (typeAtt != null)
                    {
                        typeAtt.Type = "bogusType";
                    }
                    if (posIncrAtt != null)
                    {
                        posIncrAtt.PositionIncrement = 45987657;
                    }
                    if (posLengthAtt != null)
                    {
                        posLengthAtt.PositionLength = 45987653;
                    }
                    if (keywordAtt != null)
                    {
                        keywordAtt.Keyword = (i & 1) == 0;
                    }

                    bool reset = checkClearAtt.AndResetClearCalled; // reset it, because we called clearAttribute() before
                    Assert.IsTrue(ts.IncrementToken(), "token " + i + " does not exist");
                    Assert.IsTrue(reset, "ClearAttributes() was not called correctly in TokenStream chain");

                    Assert.AreEqual(output[i], termAtt.ToString(), "term " + i + ", output[i] = " + output[i] + ", termAtt = " + termAtt.ToString());
                    if (startOffsets != null)
                    {
                        Assert.AreEqual(startOffsets[i], offsetAtt.StartOffset(), "startOffset " + i);
                    }
                    if (endOffsets != null)
                    {
                        Assert.AreEqual(endOffsets[i], offsetAtt.EndOffset(), "endOffset " + i);
                    }
                    if (types != null)
                    {
                        Assert.AreEqual(types[i], typeAtt.Type, "type " + i);
                    }
                    if (posIncrements != null)
                    {
                        Assert.AreEqual(posIncrements[i], posIncrAtt.PositionIncrement, "posIncrement " + i);
                    }
                    if (posLengths != null)
                    {
                        Assert.AreEqual(posLengths[i], posLengthAtt.PositionLength, "posLength " + i);
                    }
                    if (keywordAtts != null)
                    {
                        Assert.AreEqual(keywordAtts[i], keywordAtt.Keyword, "keywordAtt " + i);
                    }

                    // we can enforce some basic things about a few attributes even if the caller doesn't check:
                    if (offsetAtt != null)
                    {
                        int startOffset = offsetAtt.StartOffset();
                        int endOffset = offsetAtt.EndOffset();
                        if (finalOffset != null)
                        {
                            Assert.IsTrue(startOffset <= (int)finalOffset, "startOffset must be <= finalOffset");
                            Assert.IsTrue(endOffset <= (int)finalOffset, "endOffset must be <= finalOffset: got endOffset=" + endOffset + " vs finalOffset=" + (int)finalOffset);
                        }

                        if (offsetsAreCorrect)
                        {
                            Assert.IsTrue(offsetAtt.StartOffset() >= lastStartOffset, "offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset);
                            lastStartOffset = offsetAtt.StartOffset();
                        }

                        if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null)
                        {
                            // Validate offset consistency in the graph, ie
                            // all tokens leaving from a certain pos have the
                            // same startOffset, and all tokens arriving to a
                            // certain pos have the same endOffset:
                            int posInc = posIncrAtt.PositionIncrement;
                            pos += posInc;

                            int posLength = posLengthAtt.PositionLength;

                            if (!posToStartOffset.ContainsKey(pos))
                            {
                                // First time we've seen a token leaving from this position:
                                posToStartOffset[pos] = startOffset;
                                //System.out.println("  + s " + pos + " -> " + startOffset);
                            }
                            else
                            {
                                // We've seen a token leaving from this position
                                // before; verify the startOffset is the same:
                                //System.out.println("  + vs " + pos + " -> " + startOffset);
                                Assert.AreEqual((int)posToStartOffset[pos], startOffset, "pos=" + pos + " posLen=" + posLength + " token=" + termAtt);
                            }

                            int endPos = pos + posLength;

                            if (!posToEndOffset.ContainsKey(endPos))
                            {
                                // First time we've seen a token arriving to this position:
                                posToEndOffset[endPos] = endOffset;
                                //System.out.println("  + e " + endPos + " -> " + endOffset);
                            }
                            else
                            {
                                // We've seen a token arriving to this position
                                // before; verify the endOffset is the same:
                                //System.out.println("  + ve " + endPos + " -> " + endOffset);
                                Assert.AreEqual((int)posToEndOffset[endPos], endOffset, "pos=" + pos + " posLen=" + posLength + " token=" + termAtt);
                            }
                        }
                    }
                    if (posIncrAtt != null)
                    {
                        if (i == 0)
                        {
                            Assert.IsTrue(posIncrAtt.PositionIncrement >= 1, "first posIncrement must be >= 1");
                        }
                        else
                        {
                            Assert.IsTrue(posIncrAtt.PositionIncrement >= 0, "posIncrement must be >= 0");
                        }
                    }
                    if (posLengthAtt != null)
                    {
                        Assert.IsTrue(posLengthAtt.PositionLength >= 1, "posLength must be >= 1");
                    }
                }

                if (ts.IncrementToken())
                {
                    Assert.Fail("TokenStream has more tokens than expected (expected count=" + output.Length + "); extra token=" + termAtt);
                }

                // repeat our extra safety checks for End()
                ts.ClearAttributes();
                if (termAtt != null)
                {
                    termAtt.SetEmpty().Append("bogusTerm");
                }
                if (offsetAtt != null)
                {
                    offsetAtt.SetOffset(14584724, 24683243);
                }
                if (typeAtt != null)
                {
                    typeAtt.Type = "bogusType";
                }
                if (posIncrAtt != null)
                {
                    posIncrAtt.PositionIncrement = 45987657;
                }
                if (posLengthAtt != null)
                {
                    posLengthAtt.PositionLength = 45987653;
                }

                var reset_ = checkClearAtt.AndResetClearCalled; // reset it, because we called clearAttribute() before

                ts.End();
                Assert.IsTrue(checkClearAtt.AndResetClearCalled, "super.End()/ClearAttributes() was not called correctly in End()");

                if (finalOffset != null)
                {
                    Assert.AreEqual((int)finalOffset, offsetAtt.EndOffset(), "finalOffset");
                }
                if (offsetAtt != null)
                {
                    Assert.IsTrue(offsetAtt.EndOffset() >= 0, "finalOffset must be >= 0");
                }
                if (finalPosInc != null)
                {
                    Assert.AreEqual((int)finalPosInc, posIncrAtt.PositionIncrement, "finalPosInc");
                }

                ts.Dispose();
            }
            catch (Exception)
            {
                //ts.Reset();
                ts.ClearAttributes();
                ts.End();
                ts.Dispose();
                throw;
            }
        }
Example #18
 /// <summary>
 /// Reset the filter as well as the input TokenStream.
 /// </summary>
 public override void Reset()
 {
     base.Reset();
     input.Reset();
 }
Example #19
        // offsetsAreCorrect also validates:
        //   - graph offsets are correct (all tokens leaving from
        //     pos X have the same startOffset; all tokens
        //     arriving to pos Y have the same endOffset)
        //   - offsets only move forwards (startOffset >=
        //     lastStartOffset)
        public static void AssertTokenStreamContents(TokenStream ts, string[] output, int[] startOffsets, int[] endOffsets, string[] types, int[] posIncrements, int[] posLengths, int? finalOffset, int? finalPosInc, bool[] keywordAtts, bool offsetsAreCorrect)
        {
            Assert.IsNotNull(output);
            var checkClearAtt = ts.AddAttribute <ICheckClearAttributesAttribute>();

            ICharTermAttribute termAtt = null;

            if (output.Length > 0)
            {
                Assert.IsTrue(ts.HasAttribute <ICharTermAttribute>(), "has no CharTermAttribute");
                termAtt = ts.GetAttribute <ICharTermAttribute>();
            }

            IOffsetAttribute offsetAtt = null;

            if (startOffsets != null || endOffsets != null || finalOffset != null)
            {
                Assert.IsTrue(ts.HasAttribute <IOffsetAttribute>(), "has no OffsetAttribute");
                offsetAtt = ts.GetAttribute <IOffsetAttribute>();
            }

            ITypeAttribute typeAtt = null;

            if (types != null)
            {
                Assert.IsTrue(ts.HasAttribute <ITypeAttribute>(), "has no TypeAttribute");
                typeAtt = ts.GetAttribute <ITypeAttribute>();
            }

            IPositionIncrementAttribute posIncrAtt = null;

            if (posIncrements != null || finalPosInc != null)
            {
                Assert.IsTrue(ts.HasAttribute <IPositionIncrementAttribute>(), "has no PositionIncrementAttribute");
                posIncrAtt = ts.GetAttribute <IPositionIncrementAttribute>();
            }

            IPositionLengthAttribute posLengthAtt = null;

            if (posLengths != null)
            {
                Assert.IsTrue(ts.HasAttribute <IPositionLengthAttribute>(), "has no PositionLengthAttribute");
                posLengthAtt = ts.GetAttribute <IPositionLengthAttribute>();
            }

            IKeywordAttribute keywordAtt = null;

            if (keywordAtts != null)
            {
                Assert.IsTrue(ts.HasAttribute <IKeywordAttribute>(), "has no KeywordAttribute");
                keywordAtt = ts.GetAttribute <IKeywordAttribute>();
            }

            // Maps position to the start/end offset:
            IDictionary <int?, int?> posToStartOffset = new Dictionary <int?, int?>();
            IDictionary <int?, int?> posToEndOffset   = new Dictionary <int?, int?>();

            ts.Reset();
            int pos             = -1;
            int lastStartOffset = 0;

            for (int i = 0; i < output.Length; i++)
            {
                // extra safety to enforce, that the state is not preserved and also assign bogus values
                ts.ClearAttributes();
                termAtt.SetEmpty().Append("bogusTerm");
                if (offsetAtt != null)
                {
                    offsetAtt.SetOffset(14584724, 24683243);
                }
                if (typeAtt != null)
                {
                    typeAtt.Type = "bogusType";
                }
                if (posIncrAtt != null)
                {
                    posIncrAtt.PositionIncrement = 45987657;
                }
                if (posLengthAtt != null)
                {
                    posLengthAtt.PositionLength = 45987653;
                }
                if (keywordAtt != null)
                {
                    keywordAtt.Keyword = (i & 1) == 0;
                }

                bool reset = checkClearAtt.AndResetClearCalled; // reset it, because we called clearAttribute() before
                Assert.IsTrue(ts.IncrementToken(), "token " + i + " does not exist");
                Assert.IsTrue(reset, "ClearAttributes() was not called correctly in TokenStream chain");

                Assert.AreEqual(output[i], termAtt.ToString(), "term " + i + ", output[i] = " + output[i] + ", termAtt = " + termAtt.ToString());
                if (startOffsets != null)
                {
                    Assert.AreEqual(startOffsets[i], offsetAtt.StartOffset(), "startOffset " + i);
                }
                if (endOffsets != null)
                {
                    Assert.AreEqual(endOffsets[i], offsetAtt.EndOffset(), "endOffset " + i);
                }
                if (types != null)
                {
                    Assert.AreEqual(types[i], typeAtt.Type, "type " + i);
                }
                if (posIncrements != null)
                {
                    Assert.AreEqual(posIncrements[i], posIncrAtt.PositionIncrement, "posIncrement " + i);
                }
                if (posLengths != null)
                {
                    Assert.AreEqual(posLengths[i], posLengthAtt.PositionLength, "posLength " + i);
                }
                if (keywordAtts != null)
                {
                    Assert.AreEqual(keywordAtts[i], keywordAtt.Keyword, "keywordAtt " + i);
                }

                // we can enforce some basic things about a few attributes even if the caller doesn't check:
                if (offsetAtt != null)
                {
                    int startOffset = offsetAtt.StartOffset();
                    int endOffset   = offsetAtt.EndOffset();
                    if (finalOffset != null)
                    {
                        Assert.IsTrue(startOffset <= (int)finalOffset, "startOffset must be <= finalOffset");
                        Assert.IsTrue(endOffset <= (int)finalOffset, "endOffset must be <= finalOffset: got endOffset=" + endOffset + " vs finalOffset=" + (int)finalOffset);
                    }

                    if (offsetsAreCorrect)
                    {
                        Assert.IsTrue(offsetAtt.StartOffset() >= lastStartOffset, "offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset);
                        lastStartOffset = offsetAtt.StartOffset();
                    }

                    if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null)
                    {
                        // Validate offset consistency in the graph, ie
                        // all tokens leaving from a certain pos have the
                        // same startOffset, and all tokens arriving to a
                        // certain pos have the same endOffset:
                        int posInc = posIncrAtt.PositionIncrement;
                        pos += posInc;

                        int posLength = posLengthAtt.PositionLength;

                        if (!posToStartOffset.ContainsKey(pos))
                        {
                            // First time we've seen a token leaving from this position:
                            posToStartOffset[pos] = startOffset;
                            //System.out.println("  + s " + pos + " -> " + startOffset);
                        }
                        else
                        {
                            // We've seen a token leaving from this position
                            // before; verify the startOffset is the same:
                            //System.out.println("  + vs " + pos + " -> " + startOffset);
                            Assert.AreEqual((int)posToStartOffset[pos], startOffset, "pos=" + pos + " posLen=" + posLength + " token=" + termAtt);
                        }

                        int endPos = pos + posLength;

                        if (!posToEndOffset.ContainsKey(endPos))
                        {
                            // First time we've seen a token arriving to this position:
                            posToEndOffset[endPos] = endOffset;
                            //System.out.println("  + e " + endPos + " -> " + endOffset);
                        }
                        else
                        {
                            // We've seen a token arriving to this position
                            // before; verify the endOffset is the same:
                            //System.out.println("  + ve " + endPos + " -> " + endOffset);
                            Assert.AreEqual((int)posToEndOffset[endPos], endOffset, "pos=" + pos + " posLen=" + posLength + " token=" + termAtt);
                        }
                    }
                }
                if (posIncrAtt != null)
                {
                    if (i == 0)
                    {
                        Assert.IsTrue(posIncrAtt.PositionIncrement >= 1, "first posIncrement must be >= 1");
                    }
                    else
                    {
                        Assert.IsTrue(posIncrAtt.PositionIncrement >= 0, "posIncrement must be >= 0");
                    }
                }
                if (posLengthAtt != null)
                {
                    Assert.IsTrue(posLengthAtt.PositionLength >= 1, "posLength must be >= 1");
                }
            }

            if (ts.IncrementToken())
            {
                Assert.Fail("TokenStream has more tokens than expected (expected count=" + output.Length + "); extra token=" + termAtt);
            }

            // repeat our extra safety checks for End()
            ts.ClearAttributes();
            if (termAtt != null)
            {
                termAtt.SetEmpty().Append("bogusTerm");
            }
            if (offsetAtt != null)
            {
                offsetAtt.SetOffset(14584724, 24683243);
            }
            if (typeAtt != null)
            {
                typeAtt.Type = "bogusType";
            }
            if (posIncrAtt != null)
            {
                posIncrAtt.PositionIncrement = 45987657;
            }
            if (posLengthAtt != null)
            {
                posLengthAtt.PositionLength = 45987653;
            }

            var reset_ = checkClearAtt.AndResetClearCalled; // reset it, because we called clearAttribute() before

            ts.End();
            Assert.IsTrue(checkClearAtt.AndResetClearCalled, "super.End()/ClearAttributes() was not called correctly in End()");

            if (finalOffset != null)
            {
                Assert.AreEqual((int)finalOffset, offsetAtt.EndOffset(), "finalOffset");
            }
            if (offsetAtt != null)
            {
                Assert.IsTrue(offsetAtt.EndOffset() >= 0, "finalOffset must be >= 0");
            }
            if (finalPosInc != null)
            {
                Assert.AreEqual((int)finalPosInc, posIncrAtt.PositionIncrement, "finalPosInc");
            }

            ts.Dispose();
        }