Example #1
 // Expands the token "phrase" into "phrase1"/"phrase2" and discards "stop" tokens.
 public override Token Next()
 {
     if (inPhrase)
     {
         inPhrase = false;
         return(new Token("phrase2", savedStart, savedEnd));
     }
     else
     {
         for (Token token = input.Next(); token != null; token = input.Next())
         {
             if (token.TermText().Equals("phrase"))
             {
                 inPhrase   = true;
                 savedStart = token.StartOffset();
                 savedEnd   = token.EndOffset();
                 return(new Token("phrase1", savedStart, savedEnd));
             }
             else if (!token.TermText().Equals("stop"))
             {
                 return(token);
             }
         }
     }
     return(null);
 }
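 A minimal sketch of the enclosing filter this method belongs to (the class name is assumed; the fields match the ones the method uses):

 public class PhraseExpandingTestFilter : TokenFilter
 {
     // State carried across calls to Next(): when "phrase" is seen,
     // "phrase1" is returned immediately and "phrase2" on the next call.
     private bool inPhrase   = false;
     private int  savedStart = 0;
     private int  savedEnd   = 0;

     public PhraseExpandingTestFilter(TokenStream input) : base(input)
     {
     }

     // ... the Next() override shown above goes here ...
 }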
Example #2
        internal static void  Test(System.IO.TextReader reader, bool verbose, long bytes)
        {
            Analyzer    analyzer = new SimpleAnalyzer();
            TokenStream stream   = analyzer.TokenStream(null, reader);

            System.DateTime start = System.DateTime.Now;

            int count = 0;

            for (Token t = stream.Next(); t != null; t = stream.Next())
            {
                if (verbose)
                {
                    System.Console.Out.WriteLine("Text=" + t.TermText() + " start=" + t.StartOffset() + " end=" + t.EndOffset());
                }
                count++;
            }

            System.DateTime end = System.DateTime.Now;

            // DateTime.Ticks is in 100 ns units; convert the elapsed delta to milliseconds.
            long time = (end.Ticks - start.Ticks) / System.TimeSpan.TicksPerMillisecond;

            System.Console.Out.WriteLine(time + " milliseconds to extract " + count + " tokens");
            System.Console.Out.WriteLine((time * 1000.0) / count + " microseconds/token");
            System.Console.Out.WriteLine((bytes * 1000.0 * 60.0 * 60.0) / (time * 1000000.0) + " megabytes/hour");
        }
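        A possible invocation of the benchmark above (the enclosing class name AnalysisTest is an assumption; the byte count is only used for the throughput figure):

        string text = "some sample text to tokenize";
        AnalysisTest.Test(new System.IO.StringReader(text), true, text.Length);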
Example #3
        /// <summary>
        /// Returns the next, stemmed, input Token.
        /// </summary>
        /// <returns>
        ///  The stemmed form of a token.
        /// </returns>
        /// <throws>IOException</throws>
        public override Token Next()
        {
            Token token = input.Next();

            if (token == null)
            {
                return(null);
            }
            else
            {
                string str = stemmer.stem(token.TermText());

                // Only allocate a new Token when stemming actually changed the text.
                if (!str.Equals(token.TermText()))
                {
                    return(new Token(str, token.StartOffset(), token.EndOffset(), token.Type()));
                }
                return(token);
            }
        }
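        A minimal sketch of the kind of filter class this method sits in (field type and class name are assumptions; Lucene.Net's PorterStemFilter has this shape):

        public sealed class PorterStemFilter : TokenFilter
        {
            private PorterStemmer stemmer = new PorterStemmer();

            public PorterStemFilter(TokenStream input) : base(input)
            {
            }

            // ... the Next() override shown above goes here ...
        }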
Example #4
        public virtual void  TestIncrementingPositions()
        {
            Analyzer    analyzer = new WhitespaceAnalyzer();
            TokenStream ts       = analyzer.TokenStream("Field", new System.IO.StringReader("one two three four five"));

            while (true)
            {
                Token token = ts.Next();
                if (token == null)
                {
                    break;
                }
                Assert.AreEqual(1, token.GetPositionIncrement(), token.TermText());
            }
        }
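        The test relies on WhitespaceAnalyzer emitting a position increment of 1 for every token. A hedged sketch of a filter that produces increments greater than 1, encoding the holes left by dropped tokens (all names here are illustrative, not from the original test):

        public class GapStopFilter : TokenFilter
        {
            public GapStopFilter(TokenStream input) : base(input)
            {
            }

            public override Token Next()
            {
                int skipped = 0;
                for (Token t = input.Next(); t != null; t = input.Next())
                {
                    if (t.TermText().Equals("stop"))
                    {
                        skipped++;      // remember the hole left by the dropped token
                        continue;
                    }
                    // Widen the increment so downstream consumers still see the gap.
                    t.SetPositionIncrement(t.GetPositionIncrement() + skipped);
                    return t;
                }
                return(null);
            }
        }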
Example #6
        public virtual void  TestKOI8()
        {
            RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8);

            // KOI8
            // The KOI8-encoded files are read byte-transparently via iso-8859-1;
            // RussianLetterTokenizer then interprets the char codes as KOI8.
            inWordsKOI8 = new System.IO.StreamReader(
                new System.IO.FileStream(
                    System.IO.Path.Combine(dataDir.FullName, @"Analysis\RU\testKOI8.txt"),
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("iso-8859-1"));

            sampleKOI8 = new System.IO.StreamReader(
                new System.IO.FileStream(
                    System.IO.Path.Combine(dataDir.FullName, @"Analysis\RU\resKOI8.htm"),
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("iso-8859-1"));

            TokenStream            in_Renamed = ra.TokenStream("all", inWordsKOI8);
            RussianLetterTokenizer sample     = new RussianLetterTokenizer(sampleKOI8, RussianCharsets.KOI8);

            for (; ;)
            {
                Token token = in_Renamed.Next();

                if (token == null)
                {
                    break;
                }

                Token sampleToken = sample.Next();
                Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "KOI8");
            }

            inWordsKOI8.Close();
            sampleKOI8.Close();
        }
Example #7
        public virtual void  TestUnicode()
        {
            RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian);

            inWords = new System.IO.StreamReader(
                new System.IO.FileStream(
                    System.IO.Path.Combine(dataDir.FullName, @"Analysis\RU\testUnicode.txt"),
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("Unicode"));

            sampleUnicode = new System.IO.StreamReader(
                new System.IO.FileStream(
                    System.IO.Path.Combine(dataDir.FullName, @"Analysis\RU\resUnicode.htm"),
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("Unicode"));

            TokenStream in_Renamed = ra.TokenStream("all", inWords);

            RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleUnicode, RussianCharsets.UnicodeRussian);

            for (; ;)
            {
                Token token = in_Renamed.Next();

                if (token == null)
                {
                    break;
                }

                Token sampleToken = sample.Next();
                Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "Unicode");
            }

            inWords.Close();
            sampleUnicode.Close();
        }
Example #8
        public virtual void  Test1251()
        {
            // 1251
            inWords1251 = new System.IO.StreamReader(
                new System.IO.FileStream(
                    System.IO.Path.Combine(dataDir.FullName, @"Analysis\RU\test1251.txt"),
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("iso-8859-1"));

            sample1251 = new System.IO.StreamReader(
                new System.IO.FileStream(
                    System.IO.Path.Combine(dataDir.FullName, @"Analysis\RU\res1251.htm"),
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("iso-8859-1"));

            RussianAnalyzer        ra         = new RussianAnalyzer(RussianCharsets.CP1251);
            TokenStream            in_Renamed = ra.TokenStream("", inWords1251);
            RussianLetterTokenizer sample     = new RussianLetterTokenizer(sample1251, RussianCharsets.CP1251);

            for (; ;)
            {
                Token token = in_Renamed.Next();

                if (token == null)
                {
                    break;
                }

                Token sampleToken = sample.Next();
                Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "1251");
            }

            inWords1251.Close();
            sample1251.Close();
        }
Example #9
        /// <summary>Returns the next input Token, after being stemmed </summary>
        public override Token Next()
        {
            Token token = input.Next();

            if (token == null)
            {
                return(null);
            }
            // Stem in place: SetCurrent/GetCurrent wrap the stemmer's buffer,
            // and the concrete Stem() overload is invoked via reflection.
            stemmer.SetCurrent(token.TermText());
            try
            {
                stemMethod.Invoke(stemmer, (System.Object[])EMPTY_ARGS);
            }
            catch (System.Exception e)
            {
                throw new System.SystemException(e.ToString());
            }

            Token newToken = new Token(stemmer.GetCurrent(), token.StartOffset(), token.EndOffset(), token.Type());

            newToken.SetPositionIncrement(token.GetPositionIncrement());
            return(newToken);
        }
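        A sketch (assumed, modeled on the shape of Lucene.Net's SnowballFilter) of how the stemmer and stemMethod fields used above might be initialized:

        private SF.Snowball.SnowballProgram stemmer;
        private System.Reflection.MethodInfo stemMethod;
        private static readonly System.Object[] EMPTY_ARGS = new System.Object[0];

        public SnowballFilter(TokenStream input, System.String name) : base(input)
        {
            try
            {
                // e.g. name = "English" -> SF.Snowball.Ext.EnglishStemmer (assumed naming scheme)
                System.Type stemClass = System.Type.GetType("SF.Snowball.Ext." + name + "Stemmer", true);
                stemmer    = (SF.Snowball.SnowballProgram) System.Activator.CreateInstance(stemClass);
                stemMethod = stemClass.GetMethod("Stem", System.Type.EmptyTypes);
            }
            catch (System.Exception e)
            {
                throw new System.SystemException(e.ToString());
            }
        }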
Example #10
 public QueryTermVector(System.String queryString, Analyzer analyzer)
 {
     if (analyzer != null)
     {
         TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(queryString));
         if (stream != null)
         {
             Token next = null;
             System.Collections.ArrayList terms = new System.Collections.ArrayList();
             try
             {
                 while ((next = stream.Next()) != null)
                 {
                     terms.Add(next.TermText());
                 }
                 ProcessTerms((System.String[])terms.ToArray(typeof(System.String)));
             }
              catch (System.IO.IOException)
              {
                  // ignore: treat an analysis failure as "no terms"
              }
         }
     }
 }
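 A possible usage sketch (GetTerms/GetTermFrequencies come from Lucene.Net's TermFreqVector interface; the analyzer choice is illustrative):

 QueryTermVector qtv = new QueryTermVector("foo bar foo baz", new SimpleAnalyzer());
 System.String[] terms = qtv.GetTerms();
 int[]           freqs = qtv.GetTermFrequencies();
 for (int i = 0; i < terms.Length; i++)
 {
     System.Console.Out.WriteLine(terms[i] + " x " + freqs[i]);
 }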
Example #11
		/* (non-Javadoc)
		* @see Lucene.Net.Highlight.FragmentScorer#scoreToken(org.apache.lucene.analysis.Token)
		*/
		public virtual float GetTokenScore(Token token)
		{
			System.String termText = token.TermText();
			
			WeightedTerm queryTerm = (WeightedTerm) termsToFind[termText];
			if (queryTerm == null)
			{
				//not a query term - return
				return 0;
			}
			//found a query term - is it unique in this doc?
			if (!uniqueTermsInFragment.Contains(termText))
			{
				totalScore += queryTerm.GetWeight();
				uniqueTermsInFragment.Add(termText, termText);
			}
			return queryTerm.GetWeight();
		}
Example #12
        // Tokenizes the fields of a document into Postings.
        private void  InvertDocument(Document doc)
        {
            foreach (Field field in doc.Fields())
            {
                System.String fieldName   = field.Name();
                int           fieldNumber = fieldInfos.FieldNumber(fieldName);

                int length   = fieldLengths[fieldNumber];               // length of field
                int position = fieldPositions[fieldNumber];             // position in field
                if (length > 0)
                {
                    position += analyzer.GetPositionIncrementGap(fieldName);
                }
                int offset = fieldOffsets[fieldNumber];                 // character offset in field

                if (field.IsIndexed())
                {
                    if (!field.IsTokenized())
                    {
                        // un-tokenized field
                        System.String stringValue = field.StringValue();
                        if (field.IsStoreOffsetWithTermVector())
                        {
                            AddPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.Length));
                        }
                        else
                        {
                            AddPosition(fieldName, stringValue, position++, null);
                        }
                        offset += stringValue.Length;
                        length++;
                    }
                    else
                    {
                        System.IO.TextReader reader;                         // find or make Reader
                        if (field.ReaderValue() != null)
                        {
                            reader = field.ReaderValue();
                        }
                        else if (field.StringValue() != null)
                        {
                            reader = new System.IO.StringReader(field.StringValue());
                        }
                        else
                        {
                            throw new System.ArgumentException("field must have either String or Reader value");
                        }

                        // Tokenize field and add to postingTable
                        TokenStream stream = analyzer.TokenStream(fieldName, reader);
                        try
                        {
                            Token lastToken = null;
                            for (Token t = stream.Next(); t != null; t = stream.Next())
                            {
                                // An increment of n must advance the position by n;
                                // the -1 compensates for the position++ passed to AddPosition below.
                                position += (t.GetPositionIncrement() - 1);

                                if (field.IsStoreOffsetWithTermVector())
                                {
                                    AddPosition(fieldName, t.TermText(), position++, new TermVectorOffsetInfo(offset + t.StartOffset(), offset + t.EndOffset()));
                                }
                                else
                                {
                                    AddPosition(fieldName, t.TermText(), position++, null);
                                }

                                lastToken = t;
                                if (++length > maxFieldLength)
                                {
                                    if (infoStream != null)
                                    {
                                        infoStream.WriteLine("maxFieldLength " + maxFieldLength + " reached, ignoring following tokens");
                                    }
                                    break;
                                }
                            }

                            if (lastToken != null)
                            {
                                offset += lastToken.EndOffset() + 1;
                            }
                        }
                        finally
                        {
                            stream.Close();
                        }
                    }

                    fieldLengths[fieldNumber]   = length;                   // save field length
                    fieldPositions[fieldNumber] = position;                 // save field position
                    fieldBoosts[fieldNumber]   *= field.GetBoost();
                    fieldOffsets[fieldNumber]   = offset;
                }
            }
        }
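        The position gap applied near the top of InvertDocument comes from the analyzer. A hedged sketch (hypothetical subclass) of overriding it so that multiple values of the same field do not get adjacent positions:

        public class GapAnalyzer : WhitespaceAnalyzer
        {
            public override int GetPositionIncrementGap(System.String fieldName)
            {
                // Leave 10 empty positions between successive values of a field,
                // preventing phrase matches from spanning value boundaries.
                return 10;
            }
        }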