		/// <summary>Returns the next token in the stream, or null at EOS.
		/// <p>Removes <tt>'s</tt> from the end of words.
		/// <p>Removes dots from acronyms.
		/// </summary>
		public override Token Next(Token result)
		{
			Token t = input.Next(result);
			
			if (t == null)
				return null;
			
			char[] buffer = t.TermBuffer();
			int bufferLength = t.TermLength();
			System.String type = t.Type();
			
			if (type == APOSTROPHE_TYPE && bufferLength >= 2 && buffer[bufferLength - 2] == '\'' && (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S'))
			{
				// Strip last 2 characters off
				t.SetTermLength(bufferLength - 2);
			}
			else if (type == ACRONYM_TYPE)
			{
				// remove dots
				int upto = 0;
				for (int i = 0; i < bufferLength; i++)
				{
					char c = buffer[i];
					if (c != '.')
						buffer[upto++] = c;
				}
				t.SetTermLength(upto);
			}
			
			return t;
		}
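For context, a minimal consumer sketch (an assumption, not part of the original example) showing how a stream wrapping a filter like this might be drained with the reusable-token pattern used elsewhere on this page; `stream` is a hypothetical stand-in for any TokenStream built on the pre-2.9 Next(Token) API:

		// Hedged sketch: 'stream' is a hypothetical TokenStream (e.g. a StandardFilter chain).
		// Uses only the pre-2.9 API calls shown in these examples.
		Token reusableToken = new Token();
		for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
		{
			// Term text lives in a shared char[] buffer; copy out only what is needed.
			System.String term = new System.String(nextToken.TermBuffer(), 0, nextToken.TermLength());
			System.Console.Out.WriteLine(term + " [" + nextToken.Type() + "]");
		}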
Example #2
        public virtual void TestToStringAndMultiAttributeImplementations()
        {
            AttributeSource src     = new AttributeSource();
            TermAttribute   termAtt = (TermAttribute)src.AddAttribute(typeof(TermAttribute));
            TypeAttribute   typeAtt = (TypeAttribute)src.AddAttribute(typeof(TypeAttribute));

            termAtt.SetTermBuffer("TestTerm");
            typeAtt.SetType("TestType");
            Assert.AreEqual("(" + termAtt.ToString() + "," + typeAtt.ToString() + ")", src.ToString(), "Attributes should appear in original order");
            System.Collections.Generic.IEnumerator<AttributeImpl> it = src.GetAttributeImplsIterator().GetEnumerator();
            Assert.IsTrue(it.MoveNext(), "Iterator should have 2 attributes left");
            Assert.AreSame(termAtt, it.Current, "First AttributeImpl from iterator should be termAtt");
            Assert.IsTrue(it.MoveNext(), "Iterator should have 1 attributes left");
            Assert.AreSame(typeAtt, it.Current, "Second AttributeImpl from iterator should be typeAtt");
            Assert.IsFalse(it.MoveNext(), "Iterator should have 0 attributes left");

            src = new AttributeSource();
            src.AddAttributeImpl(new Token());
            // this should not add a new attribute as Token implements TermAttribute, too
            termAtt = (TermAttribute)src.AddAttribute(typeof(TermAttribute));
            Assert.IsTrue(termAtt is Token, "TermAttribute should be implemented by Token");
            // get the Token attribute and check, that it is the only one
            it = src.GetAttributeImplsIterator().GetEnumerator();
            Assert.IsTrue(it.MoveNext());
            Token tok = (Token)it.Current;

            Assert.IsFalse(it.MoveNext(), "There should be only one attribute implementation instance");

            termAtt.SetTermBuffer("TestTerm");
            Assert.AreEqual("(" + tok.ToString() + ")", src.ToString(), "Token should only printed once");
        }
Example #3
 public override Token Next()
 {
     if (inPhrase)
     {
         inPhrase = false;
         return(new Token("phrase2", savedStart, savedEnd));
     }
     else
     {
         for (Token token = input.Next(); token != null; token = input.Next())
         {
             if (token.TermText().Equals("phrase"))
             {
                 inPhrase   = true;
                 savedStart = token.StartOffset();
                 savedEnd   = token.EndOffset();
                 return(new Token("phrase1", savedStart, savedEnd));
             }
             else if (!token.TermText().Equals("stop"))
             {
                 return(token);
             }
         }
     }
     return(null);
 }
        /// <summary>Returns the next token in the stream, or null at EOS.
        /// <p>Removes <tt>'s</tt> from the end of words.
        /// <p>Removes dots from acronyms.
        /// </summary>
        public override Token Next(/* in */ Token reusableToken)
        {
            System.Diagnostics.Debug.Assert(reusableToken != null);
            Token nextToken = input.Next(reusableToken);

            if (nextToken == null)
                return null;

            char[] buffer = nextToken.TermBuffer();
            int bufferLength = nextToken.TermLength();
            System.String type = nextToken.Type();

            if (type == APOSTROPHE_TYPE &&
                bufferLength >= 2 &&
                buffer[bufferLength - 2] == '\'' &&
                (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S'))
            {
                // Strip last 2 characters off
                nextToken.SetTermLength(bufferLength - 2);
            }
            else if (type == ACRONYM_TYPE)
            {
                // remove dots
                int upto = 0;
                for (int i = 0; i < bufferLength; i++)
                {
                    char c = buffer[i];
                    if (c != '.')
                        buffer[upto++] = c;
                }
                nextToken.SetTermLength(upto);
            }

            return nextToken;
        }
Example #5
        internal static void  Test(System.IO.TextReader reader, bool verbose, long bytes)
        {
            Analyzer    analyzer = new SimpleAnalyzer();
            TokenStream stream   = analyzer.TokenStream(null, reader);

            System.DateTime start = System.DateTime.Now;

            int count = 0;

            for (Token t = stream.Next(); t != null; t = stream.Next())
            {
                if (verbose)
                {
                    System.Console.Out.WriteLine("Text=" + t.TermText() + " start=" + t.StartOffset() + " end=" + t.EndOffset());
                }
                count++;
            }

            System.DateTime end = System.DateTime.Now;

            long time = (end.Ticks - start.Ticks) / System.TimeSpan.TicksPerMillisecond; // elapsed milliseconds, not raw ticks

            System.Console.Out.WriteLine(time + " milliseconds to extract " + count + " tokens");
            System.Console.Out.WriteLine((time * 1000.0) / count + " microseconds/token");
            System.Console.Out.WriteLine((bytes * 1000.0 * 60.0 * 60.0) / (time * 1000000.0) + " megabytes/hour");
        }
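A hypothetical driver for the benchmark above; the sample text and byte count are placeholders, not values from the original test:

        // Hypothetical invocation of Test(...) above; input text and byte count are illustrative.
        System.String sampleText = "the quick brown fox jumps over the lazy dog";
        using (System.IO.TextReader reader = new System.IO.StringReader(sampleText))
        {
            Test(reader, true /* verbose */, sampleText.Length /* bytes */);
        }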
Example #6
 internal virtual void  AddToken(Token token, float score)
 {
     if (numTokens < MAX_NUM_TOKENS_PER_GROUP)
     {
         if (numTokens == 0)
         {
             startOffset = matchStartOffset = token.StartOffset();
             endOffset   = matchEndOffset = token.EndOffset();
             tot        += score;
         }
         else
         {
             startOffset = Math.Min(startOffset, token.StartOffset());
             endOffset   = Math.Max(endOffset, token.EndOffset());
             if (score > 0)
             {
                 if (tot == 0)
                 {
                     matchStartOffset = token.StartOffset();
                     matchEndOffset   = token.EndOffset();
                 }
                 else
                 {
                     matchStartOffset = Math.Min(matchStartOffset, token.StartOffset());
                     matchEndOffset   = Math.Max(matchEndOffset, token.EndOffset());
                 }
                 tot += score;
             }
         }
         tokens[numTokens] = token;
         scores[numTokens] = score;
         numTokens++;
     }
 }
Example #7
		internal virtual void  AddToken(Token token, float score)
		{
			if (numTokens < MAX_NUM_TOKENS_PER_GROUP)
			{
				if (numTokens == 0)
				{
					startOffset = matchStartOffset = token.StartOffset();
					endOffset = matchEndOffset = token.EndOffset();
					tot += score;
				}
				else
				{
					startOffset = Math.Min(startOffset, token.StartOffset());
					endOffset = Math.Max(endOffset, token.EndOffset());
					if (score > 0)
					{
						if (tot == 0)
						{
							matchStartOffset = token.StartOffset();
							matchEndOffset = token.EndOffset();
						}
						else
						{
							matchStartOffset = Math.Min(matchStartOffset, token.StartOffset());
							matchEndOffset = Math.Max(matchEndOffset, token.EndOffset());
						}
						tot += score;
					}
				}
				tokens[numTokens] = token;
				scores[numTokens] = score;
				numTokens++;
			}
		}
		/* (non-Javadoc)
		* @see Lucene.Net.Highlight.TextFragmenter#isNewFragment(org.apache.lucene.analysis.Token)
		*/
		public virtual bool IsNewFragment(Token token)
		{
			bool isNewFrag = token.EndOffset() >= (fragmentSize * currentNumFrags);
			if (isNewFrag)
			{
				currentNumFrags++;
			}
			return isNewFrag;
		}
				public override Token Next()
				{
					if (i == TOKENS.Length)
						return null;
					Token t = new Token(TOKENS[i], i, i);
					t.SetPositionIncrement(INCREMENTS[i]);
					i++;
					return t;
				}
        /* (non-Javadoc)
         * @see Lucene.Net.Highlight.TextFragmenter#isNewFragment(org.apache.lucene.analysis.Token)
         */
        public virtual bool IsNewFragment(Token token)
        {
            bool isNewFrag = token.EndOffset() >= (fragmentSize * currentNumFrags);

            if (isNewFrag)
            {
                currentNumFrags++;
            }
            return(isNewFrag);
        }
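To make the boundary condition concrete, an illustrative pair of tokens (assumed values, with fragmentSize taken as 20 and currentNumFrags already at 1):

        // Illustrative only: with fragmentSize = 20 and currentNumFrags = 1, the condition
        // EndOffset() >= fragmentSize * currentNumFrags first becomes true at offset 20.
        Token inside   = new Token("alpha", 0, 5);   // EndOffset() == 5  -> IsNewFragment == false
        Token boundary = new Token("bravo", 18, 23); // EndOffset() == 23 -> IsNewFragment == true, currentNumFrags -> 2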
Example #11
                public override Token Next()
                {
                    if (i == TOKENS.Length)
                    {
                        return(null);
                    }
                    Token t = new Token(TOKENS[i], i, i);

                    t.SetPositionIncrement(INCREMENTS[i]);
                    i++;
                    return(t);
                }
        /*
         * (non-Javadoc)
         *
         * @see Lucene.Net.Analysis.TokenStream#next()
         */
        public override Token Next(/* in */ Token reusableToken)
        {
            System.Diagnostics.Debug.Assert(reusableToken != null);
            int posIncr = 1;

            while (true)
            {
                int tokenType = scanner.GetNextToken();

                if (tokenType == StandardTokenizerImpl.YYEOF)
                {
                    return(null);
                }

                if (scanner.Yylength() <= maxTokenLength)
                {
                    reusableToken.Clear();
                    reusableToken.SetPositionIncrement(posIncr);
                    scanner.GetText(reusableToken);
                    int start = scanner.Yychar();
                    reusableToken.SetStartOffset(start);
                    reusableToken.SetEndOffset(start + reusableToken.TermLength());
                    // This 'if' should be removed in the next release. For now, it converts
                    // invalid acronyms to HOST. When removed, only the 'else' part should
                    // remain.
                    if (tokenType == StandardTokenizerImpl.ACRONYM_DEP)
                    {
                        if (replaceInvalidAcronym)
                        {
                            reusableToken.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
                            reusableToken.SetTermLength(reusableToken.TermLength() - 1);                             // remove extra '.'
                        }
                        else
                        {
                            reusableToken.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
                        }
                    }
                    else
                    {
                        reusableToken.SetType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
                    }
                    return(reusableToken);
                }
                // When we skip a too-long term, we still increment the
                // position increment
                else
                {
                    posIncr++;
                }
            }
        }
        internal override void addTerm(Token t, RawPostingList p0)
        {
            System.Diagnostics.Debug.Assert(docState.TestPoint("FreqProxTermsWriterPerField.addTerm start"));

            FreqProxTermsWriter.PostingList p = (FreqProxTermsWriter.PostingList)p0;

            System.Diagnostics.Debug.Assert(omitTf || p.docFreq > 0);

            if (omitTf)
            {
                if (docState.docID != p.lastDocID)
                {
                    System.Diagnostics.Debug.Assert(docState.docID > p.lastDocID);
                    termsHashPerField.writeVInt(0, p.lastDocCode);
                    p.lastDocCode = docState.docID - p.lastDocID;
                    p.lastDocID   = docState.docID;
                }
            }
            else
            {
                if (docState.docID != p.lastDocID)
                {
                    System.Diagnostics.Debug.Assert(docState.docID > p.lastDocID);
                    // Term not yet seen in the current doc but previously
                    // seen in other doc(s) since the last flush

                    // Now that we know doc freq for previous doc,
                    // write it & lastDocCode
                    if (1 == p.docFreq)
                    {
                        termsHashPerField.writeVInt(0, p.lastDocCode | 1);
                    }
                    else
                    {
                        termsHashPerField.writeVInt(0, p.lastDocCode);
                        termsHashPerField.writeVInt(0, p.docFreq);
                    }
                    p.docFreq     = 1;
                    p.lastDocCode = (docState.docID - p.lastDocID) << 1;
                    p.lastDocID   = docState.docID;
                    writeProx(t, p, fieldState.position);
                }
                else
                {
                    p.docFreq++;
                    writeProx(t, p, fieldState.position - p.lastPosition);
                }
            }
        }
Example #14
            public virtual int Compare(System.Object o1, System.Object o2)
            {
                Token t1 = (Token)o1;
                Token t2 = (Token)o2;

                if (t1.StartOffset() > t2.StartOffset())
                {
                    return(1);
                }
                if (t1.StartOffset() < t2.StartOffset())
                {
                    return(-1);
                }
                return(0);
            }
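For illustration, a hedged sketch of how a comparator like this restores start-offset order; 'comparer' is a hypothetical IComparer instance of the class defining Compare above:

            // Illustrative: Array.Sort with an IComparer built from the Compare method above
            // puts reconstructed tokens back into start-offset order.
            Token[] toSort = new Token[] { new Token("later", 5, 10), new Token("first", 0, 4) };
            System.Array.Sort(toSort, comparer); // 'comparer' is a hypothetical IComparer instance
            // After the sort: toSort[0] is "first" (offset 0), toSort[1] is "later" (offset 5).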
Example #15
        public virtual void  TestIncrementingPositions()
        {
            Analyzer    analyzer = new WhitespaceAnalyzer();
            TokenStream ts       = analyzer.TokenStream("Field", new System.IO.StringReader("one two three four five"));

            while (true)
            {
                Token token = ts.Next();
                if (token == null)
                {
                    break;
                }
                Assert.AreEqual(1, token.GetPositionIncrement(), token.TermText());
            }
        }
        internal override void addTerm(Token t, RawPostingList p0)
        {
            System.Diagnostics.Debug.Assert(docState.TestPoint("FreqProxTermsWriterPerField.addTerm start"));

            FreqProxTermsWriter.PostingList p = (FreqProxTermsWriter.PostingList)p0;

            System.Diagnostics.Debug.Assert(omitTf || p.docFreq > 0);

            if (omitTf)
            {
                if (docState.docID != p.lastDocID)
                {
                    System.Diagnostics.Debug.Assert(docState.docID > p.lastDocID);
                    termsHashPerField.writeVInt(0, p.lastDocCode);
                    p.lastDocCode = docState.docID - p.lastDocID;
                    p.lastDocID = docState.docID;
                }
            }
            else
            {
                if (docState.docID != p.lastDocID)
                {
                    System.Diagnostics.Debug.Assert(docState.docID > p.lastDocID);
                    // Term not yet seen in the current doc but previously
                    // seen in other doc(s) since the last flush

                    // Now that we know doc freq for previous doc,
                    // write it & lastDocCode
                    if (1 == p.docFreq)
                        termsHashPerField.writeVInt(0, p.lastDocCode | 1);
                    else
                    {
                        termsHashPerField.writeVInt(0, p.lastDocCode);
                        termsHashPerField.writeVInt(0, p.docFreq);
                    }
                    p.docFreq = 1;
                    p.lastDocCode = (docState.docID - p.lastDocID) << 1;
                    p.lastDocID = docState.docID;
                    writeProx(t, p, fieldState.position);
                }
                else
                {
                    p.docFreq++;
                    writeProx(t, p, fieldState.position - p.lastPosition);
                }
            }
        }
        internal void writeProx(Token t, FreqProxTermsWriter.PostingList p, int proxCode)
        {
            Payload payload = t.GetPayload();

            if (payload != null && payload.length > 0)
            {
                termsHashPerField.writeVInt(1, (proxCode << 1) | 1);
                termsHashPerField.writeVInt(1, payload.length);
                termsHashPerField.writeBytes(1, payload.data, payload.offset, payload.length);
                hasPayloads = true;
            }
            else
            {
                termsHashPerField.writeVInt(1, proxCode << 1);
            }
            p.lastPosition = fieldState.position;
        }
 internal override void newTerm(Token t, RawPostingList p0)
 {
     // First time we're seeing this term since the last
     // flush
     System.Diagnostics.Debug.Assert(docState.TestPoint("FreqProxTermsWriterPerField.newTerm start"));
     FreqProxTermsWriter.PostingList p = (FreqProxTermsWriter.PostingList)p0;
     p.lastDocID = docState.docID;
     if (omitTf)
     {
         p.lastDocCode = docState.docID;
     }
     else
     {
         p.lastDocCode = docState.docID << 1;
         p.docFreq     = 1;
         writeProx(t, p, fieldState.position);
     }
 }
        public virtual void TestMixupDocs()
        {
            Directory dir = NewDirectory();
            IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()));
            iwc.SetMergePolicy(NewLogMergePolicy());
            RandomIndexWriter writer = new RandomIndexWriter(Random(), dir, iwc);
            Document doc = new Document();
            FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
            customType.StoreTermVectors = true;
            customType.StoreTermVectorPositions = true;
            customType.StoreTermVectorPayloads = true;
            customType.StoreTermVectorOffsets = Random().NextBoolean();
            Field field = new Field("field", "", customType);
            TokenStream ts = new MockTokenizer(new StringReader("here we go"), MockTokenizer.WHITESPACE, true);
            Assert.IsFalse(ts.HasAttribute<IPayloadAttribute>());
            field.TokenStream = ts;
            doc.Add(field);
            writer.AddDocument(doc);

            Token withPayload = new Token("withPayload", 0, 11);
            withPayload.Payload = new BytesRef("test");
            ts = new CannedTokenStream(withPayload);
            Assert.IsTrue(ts.HasAttribute<IPayloadAttribute>());
            field.TokenStream = ts;
            writer.AddDocument(doc);

            ts = new MockTokenizer(new StringReader("another"), MockTokenizer.WHITESPACE, true);
            Assert.IsFalse(ts.HasAttribute<IPayloadAttribute>());
            field.TokenStream = ts;
            writer.AddDocument(doc);

            DirectoryReader reader = writer.Reader;
            Terms terms = reader.GetTermVector(1, "field");
            Debug.Assert(terms != null);
            TermsEnum termsEnum = terms.Iterator(null);
            Assert.IsTrue(termsEnum.SeekExact(new BytesRef("withPayload")));
            DocsAndPositionsEnum de = termsEnum.DocsAndPositions(null, null);
            Assert.AreEqual(0, de.NextDoc());
            Assert.AreEqual(0, de.NextPosition());
            Assert.AreEqual(new BytesRef("test"), de.Payload);
            writer.Dispose();
            reader.Dispose();
            dir.Dispose();
        }
Example #20
		/// <summary>Returns the next input Token, after being stemmed </summary>
        public override Token Next()
		{
			Token token = input.Next();
			if (token == null)
				return null;
			stemmer.SetCurrent(token.TermText());
			try
			{
				stemMethod.Invoke(stemmer, (System.Object[]) EMPTY_ARGS);
			}
			catch (System.Exception e)
			{
				throw new System.SystemException(e.ToString());
			}
			
			Token newToken = new Token(stemmer.GetCurrent(), token.StartOffset(), token.EndOffset(), token.Type());
			newToken.SetPositionIncrement(token.GetPositionIncrement());
			return newToken;
		}
        /* (non-Javadoc)
         * @see Lucene.Net.Highlight.FragmentScorer#scoreToken(org.apache.lucene.analysis.Token)
         */
        public virtual float GetTokenScore(Token token)
        {
            System.String termText = token.TermText();

            WeightedTerm queryTerm = (WeightedTerm)termsToFind[termText];

            if (queryTerm == null)
            {
                //not a query term - return
                return(0);
            }
            //found a query term - is it unique in this doc?
            if (!uniqueTermsInFragment.Contains(termText))
            {
                totalScore += queryTerm.GetWeight();
                uniqueTermsInFragment.Add(termText, termText);
            }
            return(queryTerm.GetWeight());
        }
Example #22
        /// <summary>
        /// Returns the next input Token, after being stemmed.
        /// </summary>
        /// <returns>
        /// The stemmed form of the token.
        /// </returns>
        /// <throws>IOException</throws>
        public override Token Next()
        {
            Token token = input.Next();

            if (token == null)
            {
                return(null);
            }
            else
            {
                string str = stemmer.stem(token.TermText());
                // Only allocate a new Token when the stemmer actually changed the text.
                if (!str.Equals(token.TermText()))
                {
                    return(new Token(str, token.StartOffset(), token.EndOffset(), token.Type()));
                }
                return(token);
            }
        }
Example #23
        /// <summary>Returns the next token in the stream, or null at EOS.
        /// <p>Removes <tt>'s</tt> from the end of words.
        /// <p>Removes dots from acronyms.
        /// </summary>
        public override Token Next(/* in */ Token reusableToken)
        {
            System.Diagnostics.Debug.Assert(reusableToken != null);
            Token nextToken = input.Next(reusableToken);

            if (nextToken == null)
            {
                return(null);
            }

            char[] buffer       = nextToken.TermBuffer();
            int    bufferLength = nextToken.TermLength();

            System.String type = nextToken.Type();

            if (type == APOSTROPHE_TYPE &&
                bufferLength >= 2 &&
                buffer[bufferLength - 2] == '\'' &&
                (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S'))
            {
                // Strip last 2 characters off
                nextToken.SetTermLength(bufferLength - 2);
            }
            else if (type == ACRONYM_TYPE)
            {
                // remove dots
                int upto = 0;
                for (int i = 0; i < bufferLength; i++)
                {
                    char c = buffer[i];
                    if (c != '.')
                    {
                        buffer[upto++] = c;
                    }
                }
                nextToken.SetTermLength(upto);
            }

            return(nextToken);
        }
Example #24
        public virtual void  TestUnicode()
        {
            RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian);

            inWords = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(
                        dataDir.FullName + @"Analysis\RU\testUnicode.txt").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("Unicode"));

            sampleUnicode = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(
                        dataDir.FullName + @"Analysis\RU\resUnicode.htm").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("Unicode"));

            TokenStream in_Renamed = ra.TokenStream("all", inWords);

            RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleUnicode, RussianCharsets.UnicodeRussian);

            for (; ;)
            {
                Token token = in_Renamed.Next();

                if (token == null)
                {
                    break;
                }

                Token sampleToken = sample.Next();
                Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "Unicode");
            }

            inWords.Close();
            sampleUnicode.Close();
        }
Example #25
        public virtual void  TestKOI8()
        {
            //System.out.println(new java.util.Date());
            RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8);

            // KOI8
            inWordsKOI8 = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(
                        dataDir.FullName + @"Analysis\RU\testKOI8.txt").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("iso-8859-1"));

            sampleKOI8 = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\resKOI8.htm").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("iso-8859-1"));

            TokenStream            in_Renamed = ra.TokenStream("all", inWordsKOI8);
            RussianLetterTokenizer sample     = new RussianLetterTokenizer(sampleKOI8, RussianCharsets.KOI8);

            for (; ;)
            {
                Token token = in_Renamed.Next();

                if (token == null)
                {
                    break;
                }

                Token sampleToken = sample.Next();
                Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "KOI8");
            }

            inWordsKOI8.Close();
            sampleKOI8.Close();
        }
Example #26
        internal override void addTerm(Token t, RawPostingList p0)
        {
            System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.addTerm start"));

            TermVectorsTermsWriter.PostingList p = (TermVectorsTermsWriter.PostingList)p0;
            p.freq++;

            if (doVectorOffsets)
            {
                int startOffset = fieldState.offset + t.StartOffset();
                int endOffset   = fieldState.offset + t.EndOffset();
                termsHashPerField.writeVInt(1, startOffset - p.lastOffset);
                termsHashPerField.writeVInt(1, endOffset - startOffset);
                p.lastOffset = endOffset;
            }

            if (doVectorPositions)
            {
                termsHashPerField.writeVInt(0, fieldState.position - p.lastPosition);
                p.lastPosition = fieldState.position;
            }
        }
Example #27
        public virtual void  Test1251()
        {
            // 1251
            inWords1251 = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(
                        dataDir.FullName + @"Analysis\RU\test1251.txt").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("iso-8859-1"));

            sample1251 = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(
                        dataDir.FullName + @"Analysis\RU\res1251.htm").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("iso-8859-1"));

            RussianAnalyzer        ra         = new RussianAnalyzer(RussianCharsets.CP1251);
            TokenStream            in_Renamed = ra.TokenStream("", inWords1251);
            RussianLetterTokenizer sample     = new RussianLetterTokenizer(sample1251, RussianCharsets.CP1251);

            for (; ;)
            {
                Token token = in_Renamed.Next();

                if (token == null)
                {
                    break;
                }

                Token sampleToken = sample.Next();
                Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "1251");
            }

            inWords1251.Close();
            sample1251.Close();
        }
        internal override void addTerm(Token t, RawPostingList p0)
        {
            System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.addTerm start"));

            TermVectorsTermsWriter.PostingList p = (TermVectorsTermsWriter.PostingList)p0;
            p.freq++;

            if (doVectorOffsets)
            {
                int startOffset = fieldState.offset + t.StartOffset();
                int endOffset = fieldState.offset + t.EndOffset();
                termsHashPerField.writeVInt(1, startOffset - p.lastOffset);
                termsHashPerField.writeVInt(1, endOffset - startOffset);
                p.lastOffset = endOffset;
            }

            if (doVectorPositions)
            {
                termsHashPerField.writeVInt(0, fieldState.position - p.lastPosition);
                p.lastPosition = fieldState.position;
            }
        }
 public QueryTermVector(System.String queryString, Analyzer analyzer)
 {
     if (analyzer != null)
     {
         TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(queryString));
         if (stream != null)
         {
             System.Collections.ArrayList terms = new System.Collections.ArrayList();
             try
             {
                 Token reusableToken = new Token();
                 for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
                 {
                     terms.Add(nextToken.Term());
                 }
                 ProcessTerms((System.String[]) terms.ToArray(typeof(System.String)));
             }
             catch (System.IO.IOException)
             {
             }
         }
     }
 }
Example #30
 public QueryTermVector(System.String queryString, Analyzer analyzer)
 {
     if (analyzer != null)
     {
         TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(queryString));
         if (stream != null)
         {
             Token next = null;
             System.Collections.ArrayList terms = new System.Collections.ArrayList();
             try
             {
                 while ((next = stream.Next()) != null)
                 {
                     terms.Add(next.TermText());
                 }
                 ProcessTerms((System.String[])terms.ToArray(typeof(System.String)));
             }
             catch (System.IO.IOException)
             {
             }
         }
     }
 }
 public QueryTermVector(System.String queryString, Analyzer analyzer)
 {
     if (analyzer != null)
     {
         TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(queryString));
         if (stream != null)
         {
             System.Collections.ArrayList terms = new System.Collections.ArrayList();
             try
             {
                 Token reusableToken = new Token();
                 for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
                 {
                     terms.Add(nextToken.Term());
                 }
                 ProcessTerms((System.String[])terms.ToArray(typeof(System.String)));
             }
             catch (System.IO.IOException)
             {
             }
         }
     }
 }
        /// <summary>Returns the next input Token, after being stemmed </summary>
        public override Token Next()
        {
            Token token = input.Next();

            if (token == null)
            {
                return(null);
            }
            stemmer.SetCurrent(token.TermText());
            try
            {
                stemMethod.Invoke(stemmer, (System.Object[])EMPTY_ARGS);
            }
            catch (System.Exception e)
            {
                throw new System.SystemException(e.ToString());
            }

            Token newToken = new Token(stemmer.GetCurrent(), token.StartOffset(), token.EndOffset(), token.Type());

            newToken.SetPositionIncrement(token.GetPositionIncrement());
            return(newToken);
        }
Example #33
        /// <summary>Returns the next token in the stream, or null at EOS.
        /// <p>Removes <tt>'s</tt> from the end of words.
        /// <p>Removes dots from acronyms.
        /// </summary>
        public override Token Next(Token result)
        {
            Token t = input.Next(result);

            if (t == null)
            {
                return(null);
            }

            char[] buffer       = t.TermBuffer();
            int    bufferLength = t.TermLength();

            System.String type = t.Type();

            if (type == APOSTROPHE_TYPE && bufferLength >= 2 && buffer[bufferLength - 2] == '\'' && (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S'))
            {
                // Strip last 2 characters off
                t.SetTermLength(bufferLength - 2);
            }
            else if (type == ACRONYM_TYPE)
            {
                // remove dots
                int upto = 0;
                for (int i = 0; i < bufferLength; i++)
                {
                    char c = buffer[i];
                    if (c != '.')
                    {
                        buffer[upto++] = c;
                    }
                }
                t.SetTermLength(upto);
            }

            return(t);
        }
		/// <summary> Fills Lucene token with the current token text.</summary>
		internal void  GetText(Token t)
		{
			t.SetTermBuffer(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
		}
 /// <deprecated> Will be removed in Lucene 3.0. This method is final, as it should
 /// not be overridden. Delegates to the backwards compatibility layer. 
 /// </deprecated>
 public override Token Next(Token reusableToken)
 {
     return base.Next(reusableToken);
 }
 public virtual void TestLegalbutVeryLargeOffsets()
 {
     Directory dir = NewDirectory();
     IndexWriter iw = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, null));
     Document doc = new Document();
     Token t1 = new Token("foo", 0, int.MaxValue - 500);
     if (Random().NextBoolean())
     {
         t1.Payload = new BytesRef("test");
     }
     Token t2 = new Token("foo", int.MaxValue - 500, int.MaxValue);
     TokenStream tokenStream = new CannedTokenStream(new Token[] { t1, t2 });
     FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
     ft.IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
     // store some term vectors for the checkindex cross-check
     ft.StoreTermVectors = true;
     ft.StoreTermVectorPositions = true;
     ft.StoreTermVectorOffsets = true;
     Field field = new Field("foo", tokenStream, ft);
     doc.Add(field);
     iw.AddDocument(doc);
     iw.Dispose();
     dir.Dispose();
 }
		public override Token Next()
		{
			if (currentRealToken == null)
			{
				Token nextRealToken = realStream.Next();
				if (nextRealToken == null)
				{
					return null;
				}
				System.String expansions = (System.String) synonyms[nextRealToken.TermText()];
				if (expansions == null)
				{
					return nextRealToken;
				}
				st = new Tokenizer(expansions, ",");
				if (st.HasMoreTokens())
				{
					currentRealToken = nextRealToken;
				}
				return currentRealToken;
			}
			else
			{
				System.String nextExpandedValue = st.NextToken();
				Token expandedToken = new Token(nextExpandedValue, currentRealToken.StartOffset(), currentRealToken.EndOffset());
				expandedToken.SetPositionIncrement(0);
				if (!st.HasMoreTokens())
				{
					currentRealToken = null;
					st = null;
				}
				return expandedToken;
			}
		}
        // TODO: more tests with other possibilities
        private void CheckTokens(Token[] tokens)
        {
            Directory dir = NewDirectory();
            RandomIndexWriter riw = new RandomIndexWriter(Random(), dir, Iwc);
            bool success = false;
            try
            {
                FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
                ft.IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
                // store some term vectors for the checkindex cross-check
                ft.StoreTermVectors = true;
                ft.StoreTermVectorPositions = true;
                ft.StoreTermVectorOffsets = true;

                Document doc = new Document();
                doc.Add(new Field("body", new CannedTokenStream(tokens), ft));
                riw.AddDocument(doc);
                success = true;
            }
            finally
            {
                if (success)
                {
                    IOUtils.Close(riw, dir);
                }
                else
                {
                    IOUtils.CloseWhileHandlingException(riw, dir);
                }
            }
        }
 /// <summary> Fills Lucene token with the current token text.</summary>
 internal void  GetText(Token t)
 {
     t.SetTermBuffer(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
 }
 // Called once per inverted token
 internal abstract void add(Token token);
Example #41
 public virtual bool IsNewFragment(Token token)
 {
     return(false);
 }
Example #42
 internal abstract void addTerm(Token t, RawPostingList p);
Example #43
 internal abstract void skippingLongTerm(Token t);
		/*
		* (non-Javadoc)
		*
		* @see Lucene.Net.Analysis.TokenStream#next()
		*/
		public override Token Next(Token result)
		{
			int posIncr = 1;
			
			while (true)
			{
				int tokenType = scanner.GetNextToken();
				
				if (tokenType == StandardTokenizerImpl.YYEOF)
				{
					return null;
				}
				
				if (scanner.Yylength() <= maxTokenLength)
				{
					result.Clear();
					result.SetPositionIncrement(posIncr);
					scanner.GetText(result);
					int start = scanner.Yychar();
					result.SetStartOffset(start);
					result.SetEndOffset(start + result.TermLength());
					// This 'if' should be removed in the next release. For now, it converts
					// invalid acronyms to HOST. When removed, only the 'else' part should
					// remain.
					if (tokenType == StandardTokenizerImpl.ACRONYM_DEP)
					{
						if (replaceInvalidAcronym)
						{
							result.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
							result.SetTermLength(result.TermLength() - 1); // remove extra '.'
						}
						else
						{
							result.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
						}
					}
					else
					{
						result.SetType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
					}
					return result;
				}
				// When we skip a too-long term, we still increment the
				// position increment
				else
					posIncr++;
			}
		}
 private Term[] TapTerms(Token[] tap)
 {
     Term[] terms = new Term[tap.Length];
     for (int i = 0; i < terms.Length; i++)
     {
         terms[i] = new Term("field", tap[i].ToString());
     }
     return terms;
 }
Example #46
		/// <summary> Low level api.
		/// Returns a token stream or null if no offset info available in index.
		/// This can be used to feed the highlighter with a pre-parsed token stream 
		/// 
		/// In my tests the speeds to recreate 1000 token streams using this method are:
		/// - with TermVector offset only data stored - 420  milliseconds 
		/// - with TermVector offset AND position data stored - 271 milliseconds
		/// (nb timings for TermVector with position data are based on a tokenizer with contiguous
		/// positions - no overlaps or gaps)
		/// The cost of not using TermPositionVector to store
		/// pre-parsed content and using an analyzer to re-parse the original content: 
		/// - reanalyzing the original content - 980 milliseconds
		/// 
		/// The re-analyze timings will typically vary depending on -
		/// 1) The complexity of the analyzer code (timings above were using a 
		/// stemmer/lowercaser/stopword combo)
		/// 2) The number of other fields (Lucene reads ALL fields off the disk 
		/// when accessing just one document field - can cost dear!)
		/// 3) Use of compression on field storage - could be faster because of compression (less disk IO)
		/// or slower (more CPU burn) depending on the content.
		/// 
		/// </summary>
		/// <param name="tpv"></param>
		/// <param name="tokenPositionsGuaranteedContiguous">true if the token position numbers have no overlaps or gaps. If looking
		/// to eke out the last drops of performance, set to true. If in doubt, set to false.
		/// </param>
		public static TokenStream GetTokenStream(TermPositionVector tpv, bool tokenPositionsGuaranteedContiguous)
		{
			//an object used to iterate across an array of tokens
			//code to reconstruct the original sequence of Tokens
			System.String[] terms = tpv.GetTerms();
			int[] freq = tpv.GetTermFrequencies();
			int totalTokens = 0;
			for (int t = 0; t < freq.Length; t++)
			{
				totalTokens += freq[t];
			}
			Token[] tokensInOriginalOrder = new Token[totalTokens];
			System.Collections.ArrayList unsortedTokens = null;
			for (int t = 0; t < freq.Length; t++)
			{
				TermVectorOffsetInfo[] offsets = tpv.GetOffsets(t);
				if (offsets == null)
				{
					return null;
				}
				
				int[] pos = null;
				if (tokenPositionsGuaranteedContiguous)
				{
					//try get the token position info to speed up assembly of tokens into sorted sequence
					pos = tpv.GetTermPositions(t);
				}
				if (pos == null)
				{
					//tokens NOT stored with positions or not guaranteed contiguous - must add to list and sort later
					if (unsortedTokens == null)
					{
						unsortedTokens = new System.Collections.ArrayList();
					}
					for (int tp = 0; tp < offsets.Length; tp++)
					{
						unsortedTokens.Add(new Token(terms[t], offsets[tp].GetStartOffset(), offsets[tp].GetEndOffset()));
					}
				}
				else
				{
					//We have positions stored and a guarantee that the token position information is contiguous
					
					// This may be fast BUT wont work if Tokenizers used which create >1 token in same position or
					// creates jumps in position numbers - this code would fail under those circumstances
					
					//tokens stored with positions - can use this to index straight into sorted array
					for (int tp = 0; tp < pos.Length; tp++)
					{
						tokensInOriginalOrder[pos[tp]] = new Token(terms[t], offsets[tp].GetStartOffset(), offsets[tp].GetEndOffset());
					}
				}
			}
			//If the field has been stored without position data we must perform a sort        
			if (unsortedTokens != null)
			{
				tokensInOriginalOrder = (Token[]) unsortedTokens.ToArray(typeof(Token));
				Array.Sort(tokensInOriginalOrder, new AnonymousClassComparator());
			}
			return new StoredTokenStream(tokensInOriginalOrder);
		}
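A hedged usage sketch for the method above: pulling a term vector off an IndexReader and replaying the reconstructed stream. The reader, document id and field name are assumptions made for illustration; GetTermFreqVector follows the old Lucene.Net IndexReader API.

		// Sketch (assumed old Lucene.Net API): rebuild a TokenStream from a stored term vector.
		// 'reader', docId and "contents" are illustrative placeholders.
		TermFreqVector tfv = reader.GetTermFreqVector(docId, "contents");
		if (tfv is TermPositionVector)
		{
			TokenStream stream = GetTokenStream((TermPositionVector) tfv, false);
			for (Token t = stream.Next(); t != null; t = stream.Next())
			{
				System.Console.Out.WriteLine(t.TermText() + " " + t.StartOffset() + "-" + t.EndOffset());
			}
		}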
Example #47
			internal StoredTokenStream(Token[] tokens)
			{
				this.tokens = tokens;
			}
        // Secondary entry point (for 2nd & subsequent TermsHash),
        // because token text has already been "interned" into
        // textStart, so we hash by textStart
        public void add(Token token, int textStart)
        {
            int code = textStart;

            int hashPos = code & postingsHashMask;

            System.Diagnostics.Debug.Assert(!postingsCompacted);

            // Locate RawPostingList in hash
            p = postingsHash[hashPos];

            if (p != null && p.textStart != textStart)
            {
                // Conflict: keep searching different locations in
                // the hash table.
                int inc = ((code >> 8) + code) | 1;
                do
                {
                    code   += inc;
                    hashPos = code & postingsHashMask;
                    p       = postingsHash[hashPos];
                } while (p != null && p.textStart != textStart);
            }

            if (p == null)
            {
                // First time we are seeing this token since we last
                // flushed the hash.

                // Refill?
                if (0 == perThread.freePostingsCount)
                {
                    perThread.morePostings();
                }

                // Pull next free RawPostingList from free list
                p = perThread.freePostings[--perThread.freePostingsCount];
                System.Diagnostics.Debug.Assert(p != null);

                p.textStart = textStart;

                System.Diagnostics.Debug.Assert(postingsHash[hashPos] == null);
                postingsHash[hashPos] = p;
                numPostings++;

                if (numPostings == postingsHashHalfSize)
                {
                    rehashPostings(2 * postingsHashSize);
                }

                // Init stream slices
                if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
                {
                    intPool.nextBuffer();
                }

                if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt * ByteBlockPool.FIRST_LEVEL_SIZE)
                {
                    bytePool.NextBuffer();
                }

                intUptos         = intPool.buffer;
                intUptoStart     = intPool.intUpto;
                intPool.intUpto += streamCount;

                p.intStart = intUptoStart + intPool.intOffset;

                for (int i = 0; i < streamCount; i++)
                {
                    int upto = bytePool.NewSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
                    intUptos[intUptoStart + i] = upto + bytePool.byteOffset;
                }
                p.byteStart = intUptos[intUptoStart];

                consumer.newTerm(token, p);
            }
            else
            {
                intUptos     = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
                intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
                consumer.addTerm(token, p);
            }
        }
Example #49
		internal virtual bool IsDistinct(Token token)
		{
			return token.StartOffset() >= endOffset;
		}
        // Primary entry point (for first TermsHash)
        internal override void add(Token token)
        {
            System.Diagnostics.Debug.Assert(!postingsCompacted);

            // We are first in the chain so we must "intern" the
            // term text into textStart address

            // Get the text of this term.
            char[] tokenText    = token.TermBuffer();
            int    tokenTextLen = token.TermLength();

            // Compute hashcode & replace any invalid UTF16 sequences
            int downto = tokenTextLen;
            int code   = 0;

            while (downto > 0)
            {
                char ch = tokenText[--downto];

                if (ch >= UnicodeUtil.UNI_SUR_LOW_START && ch <= UnicodeUtil.UNI_SUR_LOW_END)
                {
                    if (0 == downto)
                    {
                        // Unpaired
                        ch = tokenText[downto] = (char)UnicodeUtil.UNI_REPLACEMENT_CHAR;
                    }
                    else
                    {
                        char ch2 = tokenText[downto - 1];
                        if (ch2 >= UnicodeUtil.UNI_SUR_HIGH_START && ch2 <= UnicodeUtil.UNI_SUR_HIGH_END)
                        {
                            // OK: high followed by low.  This is a valid
                            // surrogate pair.
                            code = ((code * 31) + ch) * 31 + ch2;
                            downto--;
                            continue;
                        }
                        else
                        {
                            // Unpaired
                            ch = tokenText[downto] = (char)UnicodeUtil.UNI_REPLACEMENT_CHAR;
                        }
                    }
                }
                else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END)
                {
                    // Unpaired
                    ch = tokenText[downto] = (char)UnicodeUtil.UNI_REPLACEMENT_CHAR;
                }

                code = (code * 31) + ch;
            }

            int hashPos = code & postingsHashMask;

            // Locate RawPostingList in hash
            p = postingsHash[hashPos];

            if (p != null && !postingEquals(tokenText, tokenTextLen))
            {
                // Conflict: keep searching different locations in
                // the hash table.
                int inc = ((code >> 8) + code) | 1;
                do
                {
                    code   += inc;
                    hashPos = code & postingsHashMask;
                    p       = postingsHash[hashPos];
                } while (p != null && !postingEquals(tokenText, tokenTextLen));
            }

            if (p == null)
            {
                // First time we are seeing this token since we last
                // flushed the hash.
                int textLen1 = 1 + tokenTextLen;
                if (textLen1 + charPool.charUpto > DocumentsWriter.CHAR_BLOCK_SIZE)
                {
                    if (textLen1 > DocumentsWriter.CHAR_BLOCK_SIZE)
                    {
                        // Just skip this term, to remain as robust as
                        // possible during indexing.  A TokenFilter
                        // can be inserted into the analyzer chain if
                        // other behavior is wanted (pruning the term
                        // to a prefix, throwing an exception, etc).

                        if (docState.maxTermPrefix == null)
                        {
                            docState.maxTermPrefix = new System.String(tokenText, 0, 30);
                        }

                        consumer.skippingLongTerm(token);
                        return;
                    }
                    charPool.nextBuffer();
                }

                // Refill?
                if (0 == perThread.freePostingsCount)
                {
                    perThread.morePostings();
                }

                // Pull next free RawPostingList from free list
                p = perThread.freePostings[--perThread.freePostingsCount];
                System.Diagnostics.Debug.Assert(p != null);

                char[] text     = charPool.buffer;
                int    textUpto = charPool.charUpto;
                p.textStart        = textUpto + charPool.charOffset;
                charPool.charUpto += textLen1;
                System.Array.Copy(tokenText, 0, text, textUpto, tokenTextLen);
                text[textUpto + tokenTextLen] = (char)0xffff;

                System.Diagnostics.Debug.Assert(postingsHash[hashPos] == null);
                postingsHash[hashPos] = p;
                numPostings++;

                if (numPostings == postingsHashHalfSize)
                {
                    rehashPostings(2 * postingsHashSize);
                }

                // Init stream slices
                if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
                {
                    intPool.nextBuffer();
                }

                if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt * ByteBlockPool.FIRST_LEVEL_SIZE)
                {
                    bytePool.NextBuffer();
                }

                intUptos         = intPool.buffer;
                intUptoStart     = intPool.intUpto;
                intPool.intUpto += streamCount;

                p.intStart = intUptoStart + intPool.intOffset;

                for (int i = 0; i < streamCount; i++)
                {
                    int upto = bytePool.NewSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
                    intUptos[intUptoStart + i] = upto + bytePool.byteOffset;
                }
                p.byteStart = intUptos[intUptoStart];

                consumer.newTerm(token, p);
            }
            else
            {
                intUptos     = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
                intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
                consumer.addTerm(token, p);
            }

            if (doNextCall)
            {
                nextPerField.add(token, p.textStart);
            }
        }
 private Token MakeToken(string text, int posIncr, int startOffset, int endOffset)
 {
     Token t = new Token();
     t.Append(text);
     t.PositionIncrement = posIncr;
     t.SetOffset(startOffset, endOffset);
     return t;
 }
 private static Token MakeToken(string text, int posIncr)
 {
     Token t = new Token();
     t.Append(text);
     t.PositionIncrement = posIncr;
     return t;
 }
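A hypothetical combination of the helpers above with CannedTokenStream (used elsewhere on this page) to build a small test stream:

     // Hypothetical use of MakeToken(...) with CannedTokenStream: three tokens, with the
     // last two stacked at the same position via a zero position increment.
     TokenStream ts = new CannedTokenStream(new Token[]
     {
         MakeToken("a", 1, 0, 1),
         MakeToken("b", 1, 2, 3),
         MakeToken("c", 0, 2, 3)  // posIncr 0: occupies the same position as "b"
     });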
 public float GetTokenScore(Token token)
 {
     return 0;
 }
        public virtual void TestZeroPosIncr()
        {
            Directory dir = new RAMDirectory();
            Token[] tokens = new Token[3];
            tokens[0] = new Token();
            tokens[0].Append("a");
            tokens[0].PositionIncrement = 1;
            tokens[1] = new Token();
            tokens[1].Append("b");
            tokens[1].PositionIncrement = 0;
            tokens[2] = new Token();
            tokens[2].Append("c");
            tokens[2].PositionIncrement = 0;

            RandomIndexWriter writer = new RandomIndexWriter(Random(), dir);
            Document doc = new Document();
            doc.Add(new TextField("field", new CannedTokenStream(tokens)));
            writer.AddDocument(doc);
            doc = new Document();
            doc.Add(new TextField("field", new CannedTokenStream(tokens)));
            writer.AddDocument(doc);
            IndexReader r = writer.Reader;
            writer.Dispose();
            IndexSearcher s = NewSearcher(r);
            MultiPhraseQuery mpq = new MultiPhraseQuery();
            //mpq.setSlop(1);

            // NOTE: not great that if we do the else clause here we
            // get different scores!  MultiPhraseQuery counts that
            // phrase as occurring twice per doc (it should be 1, I
            // think?).  this is because MultipleTermPositions is able to
            // return the same position more than once (0, in this
            // case):
            if (true)
            {
                mpq.Add(new Term[] { new Term("field", "b"), new Term("field", "c") }, 0);
                mpq.Add(new Term[] { new Term("field", "a") }, 0);
            }
            else
            {
                mpq.Add(new Term[] { new Term("field", "a") }, 0);
                mpq.Add(new Term[] { new Term("field", "b"), new Term("field", "c") }, 0);
            }
            TopDocs hits = s.Search(mpq, 2);
            Assert.AreEqual(2, hits.TotalHits);
            Assert.AreEqual(hits.ScoreDocs[0].Score, hits.ScoreDocs[1].Score, 1e-5);
            /*
            for(int hit=0;hit<hits.TotalHits;hit++) {
              ScoreDoc sd = hits.ScoreDocs[hit];
              System.out.println("  hit doc=" + sd.Doc + " score=" + sd.Score);
            }
            */
            r.Dispose();
            dir.Dispose();
        }
			private void  InitBlock(HighlighterTest enclosingInstance)
			{
				this.enclosingInstance = enclosingInstance;
				lst = new System.Collections.ArrayList();
				Token t;
				t = new Token("hi", 0, 2);
				lst.Add(t);
				t = new Token("hispeed", 0, 8);
				lst.Add(t);
				t = new Token("speed", 3, 8);
				t.SetPositionIncrement(0);
				lst.Add(t);
				t = new Token("10", 8, 10);
				lst.Add(t);
				t = new Token("foo", 11, 14);
				lst.Add(t);
				iter = lst.GetEnumerator();
			}
Example #56
 public override Token Next(Token reusableToken)
 {
     return(base.Next(reusableToken));
 }
Example #57
		/* (non-Javadoc)
		* @see Lucene.Net.Highlight.FragmentScorer#scoreToken(org.apache.lucene.analysis.Token)
		*/
		public virtual float GetTokenScore(Token token)
		{
			System.String termText = token.TermText();
			
			WeightedTerm queryTerm = (WeightedTerm) termsToFind[termText];
			if (queryTerm == null)
			{
				//not a query term - return
				return 0;
			}
			//found a query term - is it unique in this doc?
			if (!uniqueTermsInFragment.Contains(termText))
			{
				totalScore += queryTerm.GetWeight();
				uniqueTermsInFragment.Add(termText, termText);
			}
			return queryTerm.GetWeight();
		}
		public RepeatingTokenStream(System.String val)
		{
			t = new Token(val, 0, val.Length);
		}
			public override Token Next(Token result)
			{
				if (this.fieldName.Equals("crash") && count++ >= 4)
					throw new System.IO.IOException("I'm experiencing problems");
				return input.Next(result);
			}
 public void TestNegativePositions()
 {
     SinkTokenizer tokens = new SinkTokenizer();
     Token t = new Token();
     t.SetTermText("a");
     t.SetPositionIncrement(0);
     tokens.Add(t);
     t.SetTermText("b");
     t.SetPositionIncrement(1);
     tokens.Add(t);
     t.SetTermText("c");
     tokens.Add(t);
     MockRAMDirectory dir = new MockRAMDirectory();
     IndexWriter w = new IndexWriter(dir, false, new WhitespaceAnalyzer(), true);
     Document doc = new Document();
     doc.Add(new Field("field", tokens));
     w.AddDocument(doc);
     w.Close();
     IndexSearcher s = new IndexSearcher(dir);
     PhraseQuery pq = new PhraseQuery();
     pq.Add(new Term("field", "a"));
     pq.Add(new Term("field", "b"));
     pq.Add(new Term("field", "c"));
     Hits hits = s.Search(pq);
     Assert.AreEqual(1, hits.Length());
     Query q = new SpanTermQuery(new Term("field", "a"));
     hits = s.Search(q);
     Assert.AreEqual(1, hits.Length());
     TermPositions tps = s.GetIndexReader().TermPositions(new Term("field", "a"));
     Assert.IsTrue(tps.Next());
     Assert.AreEqual(1, tps.Freq());
     Assert.AreEqual(-1, tps.NextPosition());
     Assert.IsTrue(_TestUtil.CheckIndex(dir));
     s.Close();
     dir.Close();
 }