public MyTokenStream(TestTermVectorsReader enclosingInstance)
 {
     InitBlock(enclosingInstance);
     termAtt    = (TermAttribute)AddAttribute(typeof(TermAttribute));
     posIncrAtt = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
     offsetAtt  = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
 }
        public virtual void  TestStopListPositions()
        {
            bool defaultEnable = StopFilter.GetEnablePositionIncrementsDefault();

            StopFilter.SetEnablePositionIncrementsDefault(true);
            try
            {
                System.Collections.Hashtable stopWordsSet = new System.Collections.Hashtable();
                stopWordsSet.Add("good", "good");
                stopWordsSet.Add("test", "test");
                stopWordsSet.Add("analyzer", "analyzer");
                StopAnalyzer           newStop = new StopAnalyzer(stopWordsSet);
                System.IO.StringReader reader  = new System.IO.StringReader("This is a good test of the english stop analyzer with positions");
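                 // Remaining tokens are: this, is, a, of, the, english, stop, with, positions.
                 // Each increment is 1 plus the number of stop words removed immediately before
                 // that token: "of" follows the removed "good test" (3), "with" follows the
                 // removed "analyzer" (2).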
                int[]       expectedIncr       = new int[] { 1, 1, 1, 3, 1, 1, 1, 2, 1 };
                TokenStream stream             = newStop.TokenStream("test", reader);
                Assert.IsNotNull(stream);
                int           i       = 0;
                TermAttribute termAtt = (TermAttribute)stream.GetAttribute(typeof(TermAttribute));
                PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute)stream.AddAttribute(typeof(PositionIncrementAttribute));

                while (stream.IncrementToken())
                {
                    System.String text = termAtt.Term();
                    Assert.IsFalse(stopWordsSet.Contains(text));
                    Assert.AreEqual(expectedIncr[i++], posIncrAtt.GetPositionIncrement());
                }
            }
            finally
            {
                StopFilter.SetEnablePositionIncrementsDefault(defaultEnable);
            }
        }
 public TestFilter(TestMultiAnalyzer enclosingInstance, TokenStream in_Renamed) : base(in_Renamed)
 {
     InitBlock(enclosingInstance);
     termAtt    = (TermAttribute)AddAttribute(typeof(TermAttribute));
     posIncrAtt = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
     offsetAtt  = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
     typeAtt    = (TypeAttribute)AddAttribute(typeof(TypeAttribute));
 }
Example #4
 public PayloadFilter(TokenStream input, System.String fieldName) : base(input)
 {
     this.fieldName = fieldName;
     pos            = 0;
     i           = 0;
     posIncrAttr = (PositionIncrementAttribute)input.AddAttribute(typeof(PositionIncrementAttribute));
     payloadAttr = (PayloadAttribute)input.AddAttribute(typeof(PayloadAttribute));
     termAttr    = (TermAttribute)input.AddAttribute(typeof(TermAttribute));
 }
Example #5
 public PayloadFilter(TestPayloadSpans enclosingInstance, TokenStream input, System.String fieldName) : base(input)
 {
     InitBlock(enclosingInstance);
     this.fieldName = fieldName;
     pos            = 0;
     SupportClass.CollectionsHelper.AddIfNotContains(entities, "xx");
     SupportClass.CollectionsHelper.AddIfNotContains(entities, "one");
     SupportClass.CollectionsHelper.AddIfNotContains(nopayload, "nopayload");
     SupportClass.CollectionsHelper.AddIfNotContains(nopayload, "np");
     termAtt    = (TermAttribute)AddAttribute(typeof(TermAttribute));
     posIncrAtt = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
     payloadAtt = (PayloadAttribute)AddAttribute(typeof(PayloadAttribute));
 }
        private void  DoTestStopPositons(StopFilter stpf, bool enableIcrements)
        {
            Log("---> test with enable-increments-" + (enableIcrements?"enabled":"disabled"));
            stpf.SetEnablePositionIncrements(enableIcrements);
            TermAttribute termAtt = (TermAttribute)stpf.GetAttribute(typeof(TermAttribute));
            PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute)stpf.GetAttribute(typeof(PositionIncrementAttribute));

            for (int i = 0; i < 20; i += 3)
            {
                Assert.IsTrue(stpf.IncrementToken());
                Log("Token " + i + ": " + stpf);
                System.String w = English.IntToEnglish(i).Trim();
                Assert.AreEqual(w, termAtt.Term(), "expecting token " + i + " to be " + w);
                Assert.AreEqual(enableIcrements ? (i == 0 ? 1 : 3) : 1, posIncrAtt.GetPositionIncrement(), "unexpected position increment for token " + i);
            }
            Assert.IsFalse(stpf.IncrementToken());
        }
        public virtual void  TestStopList()
        {
            System.Collections.Hashtable stopWordsSet = new System.Collections.Hashtable();
            stopWordsSet.Add("good", "good");
            stopWordsSet.Add("test", "test");
            stopWordsSet.Add("analyzer", "analyzer");
            StopAnalyzer newStop = new StopAnalyzer(stopWordsSet);

            System.IO.StringReader reader = new System.IO.StringReader("This is a good test of the english stop analyzer");
            TokenStream            stream = newStop.TokenStream("test", reader);

            Assert.IsNotNull(stream);
            TermAttribute termAtt = (TermAttribute)stream.GetAttribute(typeof(TermAttribute));
            PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute)stream.AddAttribute(typeof(PositionIncrementAttribute));

            while (stream.IncrementToken())
            {
                System.String text = termAtt.Term();
                Assert.IsFalse(stopWordsSet.Contains(text));
                Assert.AreEqual(1, posIncrAtt.GetPositionIncrement());                 // by default the stop filter does not apply position increments.
            }
        }
 private void Init(System.IO.TextReader input, bool replaceInvalidAcronym)
 {
     this.replaceInvalidAcronym = replaceInvalidAcronym;
     this.input = input;
     termAtt = (TermAttribute) AddAttribute(typeof(TermAttribute));
     offsetAtt = (OffsetAttribute) AddAttribute(typeof(OffsetAttribute));
     posIncrAtt = (PositionIncrementAttribute) AddAttribute(typeof(PositionIncrementAttribute));
     typeAtt = (TypeAttribute) AddAttribute(typeof(TypeAttribute));
 }
			public TestPosIncrementFilter(TestMultiAnalyzer enclosingInstance, TokenStream in_Renamed):base(in_Renamed)
			{
				InitBlock(enclosingInstance);
				termAtt = (TermAttribute) AddAttribute(typeof(TermAttribute));
				posIncrAtt = (PositionIncrementAttribute) AddAttribute(typeof(PositionIncrementAttribute));
			}
Example #11
 private void InitBlock()
 {
     termAtt = (TermAttribute) AddAttribute(typeof(TermAttribute));
     typeAtt = (TypeAttribute) AddAttribute(typeof(TypeAttribute));
     posIncrAtt = (PositionIncrementAttribute) AddAttribute(typeof(PositionIncrementAttribute));
 }
Example #12
 public void  Init()
 {
     termAtt    = (TermAttribute)AddAttribute(typeof(TermAttribute));
     posIncrAtt = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
 }
			public PayloadFilter(TestPayloadSpans enclosingInstance, TokenStream input, System.String fieldName):base(input)
			{
				InitBlock(enclosingInstance);
				this.fieldName = fieldName;
				pos = 0;
				Support.CollectionsHelper.AddIfNotContains(entities, "xx");
				Support.CollectionsHelper.AddIfNotContains(entities, "one");
				Support.CollectionsHelper.AddIfNotContains(nopayload, "nopayload");
				Support.CollectionsHelper.AddIfNotContains(nopayload, "np");
				termAtt = (TermAttribute) AddAttribute(typeof(TermAttribute));
				posIncrAtt = (PositionIncrementAttribute) AddAttribute(typeof(PositionIncrementAttribute));
				payloadAtt = (PayloadAttribute) AddAttribute(typeof(PayloadAttribute));
			}
        public override void  ProcessFields(Fieldable[] fields, int count)
        {
            fieldState.Reset(docState.doc.GetBoost());

            int maxFieldLength = docState.maxFieldLength;

            bool doInvert = consumer.Start(fields, count);

            for (int i = 0; i < count; i++)
            {
                Fieldable field = fields[i];

                // TODO FI: this should be "genericized" to querying
                // consumer if it wants to see this particular field
                // tokenized.
                if (field.IsIndexed() && doInvert)
                {
                    bool anyToken;

                    if (fieldState.length > 0)
                    {
                        fieldState.position += docState.analyzer.GetPositionIncrementGap(fieldInfo.name);
                    }

                    if (!field.IsTokenized())
                    {
                        // un-tokenized field
                        System.String stringValue = field.StringValue();
                        int           valueLength = stringValue.Length;
                        perThread.singleTokenTokenStream.Reinit(stringValue, 0, valueLength);
                        fieldState.attributeSource = perThread.singleTokenTokenStream;
                        consumer.Start(field);

                        bool success = false;
                        try
                        {
                            consumer.Add();
                            success = true;
                        }
                        finally
                        {
                            if (!success)
                            {
                                docState.docWriter.SetAborting();
                            }
                        }
                        fieldState.offset += valueLength;
                        fieldState.length++;
                        fieldState.position++;
                        anyToken = valueLength > 0;
                    }
                    else
                    {
                        // tokenized field
                        TokenStream stream;
                        TokenStream streamValue = field.TokenStreamValue();

                        if (streamValue != null)
                        {
                            stream = streamValue;
                        }
                        else
                        {
                            // the field does not have a TokenStream,
                            // so we have to obtain one from the analyzer
                            System.IO.TextReader reader;                             // find or make Reader
                            System.IO.TextReader readerValue = field.ReaderValue();

                            if (readerValue != null)
                            {
                                reader = readerValue;
                            }
                            else
                            {
                                System.String stringValue = field.StringValue();
                                if (stringValue == null)
                                {
                                    throw new System.ArgumentException("field must have either TokenStream, String or Reader value");
                                }
                                perThread.stringReader.Init(stringValue);
                                reader = perThread.stringReader;
                            }

                            // Tokenize field and add to postingTable
                            stream = docState.analyzer.ReusableTokenStream(fieldInfo.name, reader);
                        }

                        // reset the TokenStream to the first token
                        stream.Reset();

                        int startLength = fieldState.length;

                        // deprecated
                        bool allowMinus1Position = docState.allowMinus1Position;

                        try
                        {
                            int offsetEnd = fieldState.offset - 1;

                            bool hasMoreTokens = stream.IncrementToken();

                            fieldState.attributeSource = stream;

                            OffsetAttribute            offsetAttribute  = (OffsetAttribute)fieldState.attributeSource.AddAttribute(typeof(OffsetAttribute));
                            PositionIncrementAttribute posIncrAttribute = (PositionIncrementAttribute)fieldState.attributeSource.AddAttribute(typeof(PositionIncrementAttribute));

                            consumer.Start(field);

                            for (; ;)
                            {
                                // If we hit an exception in stream.next below
                                // (which is fairly common, eg if analyzer
                                // chokes on a given document), then it's
                                // non-aborting and (above) this one document
                                // will be marked as deleted, but still
                                // consume a docID

                                if (!hasMoreTokens)
                                {
                                    break;
                                }

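                                // Advance by the increment, then step back one so consumer.Add()
                                // sees this token's 0-based position; the fieldState.position++
                                // after consumer.Add() below restores it, so the next token's
                                // increment is applied relative to this token.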
                                int posIncr = posIncrAttribute.GetPositionIncrement();
                                fieldState.position += posIncr;
                                if (allowMinus1Position || fieldState.position > 0)
                                {
                                    fieldState.position--;
                                }

                                if (posIncr == 0)
                                {
                                    fieldState.numOverlap++;
                                }

                                bool success = false;
                                try
                                {
                                    // If we hit an exception in here, we abort
                                    // all buffered documents since the last
                                    // flush, on the likelihood that the
                                    // internal state of the consumer is now
                                    // corrupt and should not be flushed to a
                                    // new segment:
                                    consumer.Add();
                                    success = true;
                                }
                                finally
                                {
                                    if (!success)
                                    {
                                        docState.docWriter.SetAborting();
                                    }
                                }
                                fieldState.position++;
                                offsetEnd = fieldState.offset + offsetAttribute.EndOffset();
                                if (++fieldState.length >= maxFieldLength)
                                {
                                    if (docState.infoStream != null)
                                    {
                                        docState.infoStream.WriteLine("maxFieldLength " + maxFieldLength + " reached for field " + fieldInfo.name + ", ignoring following tokens");
                                    }
                                    break;
                                }

                                hasMoreTokens = stream.IncrementToken();
                            }
                            // trigger streams to perform end-of-stream operations
                            stream.End();

                            fieldState.offset += offsetAttribute.EndOffset();
                            anyToken           = fieldState.length > startLength;
                        }
                        finally
                        {
                            stream.Close();
                        }
                    }

                    if (anyToken)
                    {
                        fieldState.offset += docState.analyzer.GetOffsetGap(field);
                    }
                    fieldState.boost *= field.GetBoost();
                }

                // LUCENE-2387: don't hang onto the field, so GC can
                // reclaim
                fields[i] = null;
            }

            consumer.Finish();
            endConsumer.Finish();
        }
        /// <summary> Not an explicit test, just useful to print out some info on performance.</summary>
        /// <throws>  Exception </throws>
        public virtual void  Performance()
        {
            int[] tokCount  = new int[] { 100, 500, 1000, 2000, 5000, 10000 };
            int[] modCounts = new int[] { 1, 2, 5, 10, 20, 50, 100, 200, 500 };
            for (int k = 0; k < tokCount.Length; k++)
            {
                System.Text.StringBuilder buffer = new System.Text.StringBuilder();
                System.Console.Out.WriteLine("-----Tokens: " + tokCount[k] + "-----");
                for (int i = 0; i < tokCount[k]; i++)
                {
                    buffer.Append(English.IntToEnglish(i).ToUpper()).Append(' ');
                }
                //make sure we produce the same tokens
                TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))));
                TokenStream        sink      = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, 100));
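                // ModuloSinkFilter and ModuloTokenFilter are helpers defined elsewhere in this
                // test; they pass through only every Nth token (N = the modulo count), so the
                // sink sees a subset of the tokens the tee stream produces.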
                teeStream.ConsumeAllTokens();
                TokenStream   stream  = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), 100);
                TermAttribute tfTok   = (TermAttribute)stream.AddAttribute(typeof(TermAttribute));
                TermAttribute sinkTok = (TermAttribute)sink.AddAttribute(typeof(TermAttribute));
                for (int i = 0; stream.IncrementToken(); i++)
                {
                    Assert.IsTrue(sink.IncrementToken());
                    Assert.IsTrue(tfTok.Equals(sinkTok), tfTok + " is not equal to " + sinkTok + " at token: " + i);
                }

                //simulate two fields, each being analyzed once, for 20 documents
                for (int j = 0; j < modCounts.Length; j++)
                {
                    int  tfPos = 0;
                    long start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
                    for (int i = 0; i < 20; i++)
                    {
                        stream = new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString())));
                        PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute)stream.GetAttribute(typeof(PositionIncrementAttribute));
                        while (stream.IncrementToken())
                        {
                            tfPos += posIncrAtt.GetPositionIncrement();
                        }
                        stream     = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), modCounts[j]);
                        posIncrAtt = (PositionIncrementAttribute)stream.GetAttribute(typeof(PositionIncrementAttribute));
                        while (stream.IncrementToken())
                        {
                            tfPos += posIncrAtt.GetPositionIncrement();
                        }
                    }
                    long finish = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
                    System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");
                    int sinkPos = 0;
                    //simulate one field with one sink
                    start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
                    for (int i = 0; i < 20; i++)
                    {
                        teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))));
                        sink      = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, modCounts[j]));
                        PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute)teeStream.GetAttribute(typeof(PositionIncrementAttribute));
                        while (teeStream.IncrementToken())
                        {
                            sinkPos += posIncrAtt.GetPositionIncrement();
                        }
                        //System.out.println("Modulo--------");
                        posIncrAtt = (PositionIncrementAttribute)sink.GetAttribute(typeof(PositionIncrementAttribute));
                        while (sink.IncrementToken())
                        {
                            sinkPos += posIncrAtt.GetPositionIncrement();
                        }
                    }
                    finish = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
                    System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
                    Assert.IsTrue(sinkPos == tfPos, sinkPos + " does not equal: " + tfPos);
                }
                System.Console.Out.WriteLine("- End Tokens: " + tokCount[k] + "-----");
            }
        }
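The snippets above only read PositionIncrementAttribute. For contrast, the sketch below shows a filter that writes it: it drops every other token and folds the dropped increments into the next token it emits, so positions stay intact for phrase and span queries. This is an illustrative, hypothetical class (SkipEveryOtherFilter is not part of Lucene.Net), written against the same Lucene.Net 2.9-era TokenFilter/attribute API used throughout this page.

// Namespaces assume the Lucene.Net 2.9-era layout used by these examples.
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;

public class SkipEveryOtherFilter : TokenFilter
{
    private readonly PositionIncrementAttribute posIncrAtt;
    private bool keep = true;

    public SkipEveryOtherFilter(TokenStream input) : base(input)
    {
        posIncrAtt = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
    }

    public override bool IncrementToken()
    {
        int skippedIncrement = 0;
        while (input.IncrementToken())
        {
            if (keep)
            {
                keep = false;
                // Fold the increments of the dropped tokens into this one so
                // downstream consumers still see the original positions.
                posIncrAtt.SetPositionIncrement(posIncrAtt.GetPositionIncrement() + skippedIncrement);
                return true;
            }
            // Drop this token, but remember how far it would have advanced the position.
            skippedIncrement += posIncrAtt.GetPositionIncrement();
            keep = true;
        }
        return false;
    }
}

Folding skipped increments into the next emitted token is the same convention StopFilter follows when position increments are enabled, which is why the expectedIncr array in TestStopListPositions above contains the values 3 and 2.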