Example #1
    /// <summary> Build a filter that removes words that are too long or too
    /// short from the text.
    /// </summary>
    public LengthFilter(TokenStream in_Renamed, int min, int max)
        : base(in_Renamed)
    {
        this.min = min;
        this.max = max;
        termAtt = AddAttribute<ITermAttribute>();
    }
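For context, a minimal usage sketch, assuming the Lucene.Net 3.x API these snippets come from (StandardTokenizer, ITermAttribute and Version are the assumed types); it tokenizes a sentence and keeps only terms of 3 to 10 characters:

    using System;
    using System.IO;
    using Lucene.Net.Analysis;
    using Lucene.Net.Analysis.Standard;
    using Lucene.Net.Analysis.Tokenattributes;
    using Version = Lucene.Net.Util.Version;

    // Tokenize, then keep only terms between 3 and 10 characters long.
    TokenStream stream = new StandardTokenizer(Version.LUCENE_30,
        new StringReader("a moderately extraordinarily long sentence"));
    stream = new LengthFilter(stream, 3, 10);

    ITermAttribute term = stream.GetAttribute<ITermAttribute>();
    while (stream.IncrementToken())
        Console.WriteLine(term.Term); // moderately, long, sentence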
Example #2
    /// <summary> Construct a token stream filtering the given input.
    /// If <c>stopWords</c> is an instance of <see cref="CharArraySet" /> (true if
    /// <c>makeStopSet()</c> was used to construct the set) it will be directly used
    /// and <c>ignoreCase</c> will be ignored since <c>CharArraySet</c>
    /// directly controls case sensitivity.
    /// <p/>
    /// If <c>stopWords</c> is not an instance of <see cref="CharArraySet" />,
    /// a new CharArraySet will be constructed and <c>ignoreCase</c> will be
    /// used to specify the case sensitivity of that set.
    /// </summary>
    /// <param name="enablePositionIncrements">true if token positions should record the removed stop words</param>
    /// <param name="input">Input TokenStream</param>
    /// <param name="stopWords">A Set of strings or char[] or any other ToString()-able set representing the stopwords</param>
    /// <param name="ignoreCase">if true, all words are lower cased first</param>
    public StopFilter(bool enablePositionIncrements, TokenStream input, ISet<string> stopWords, bool ignoreCase)
        : base(input)
    {
        if (stopWords is CharArraySet)
        {
            this.stopWords = (CharArraySet) stopWords;
        }
        else
        {
            this.stopWords = new CharArraySet(stopWords.Count, ignoreCase);
            this.stopWords.AddAll(stopWords);
        }
        this.enablePositionIncrements = enablePositionIncrements;
        termAtt = AddAttribute<ITermAttribute>();
        posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    }
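A minimal sketch of the second branch, assuming WhitespaceTokenizer from the same API: a plain HashSet<string> is not a CharArraySet, so with ignoreCase = true the filter builds its own case-insensitive set:

    using System.Collections.Generic;
    using System.IO;
    using Lucene.Net.Analysis;

    ISet<string> stop = new HashSet<string> { "the", "and", "of" };
    TokenStream ts = new WhitespaceTokenizer(new StringReader("The quick AND lazy fox"));
    // ignoreCase = true: the internally built CharArraySet matches
    // "The" and "AND" regardless of casing, so both are removed
    ts = new StopFilter(true, ts, stop, true);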
Example #3
    public ASCIIFoldingFilter(TokenStream input)
        : base(input)
    {
        termAtt = AddAttribute<ITermAttribute>();
    }
Example #4
    public PorterStemFilter(TokenStream in_Renamed)
        : base(in_Renamed)
    {
        stemmer = new PorterStemmer();
        termAtt = AddAttribute<ITermAttribute>();
    }
Example #5
    public LowerCaseFilter(TokenStream @in)
        : base(@in)
    {
        termAtt = AddAttribute<ITermAttribute>();
    }
Example #6
    public ISOLatin1AccentFilter(TokenStream input)
        : base(input)
    {
        termAtt = AddAttribute<ITermAttribute>();
    }
Example #7
    /// <summary> Constructs a filter which removes words from the input
    /// TokenStream that are named in the Set.
    /// </summary>
    /// <param name="enablePositionIncrements">true if token positions should record the removed stop words</param>
    /// <param name="in">Input stream</param>
    /// <param name="stopWords">A Set of strings or char[] or any other ToString()-able set representing the stopwords</param>
    /// <seealso cref="MakeStopSet(string[])"/>
    public StopFilter(bool enablePositionIncrements, TokenStream @in, ISet<string> stopWords)
        : this(enablePositionIncrements, @in, stopWords, false)
    { }
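And a sketch of the CharArraySet path through this overload, assuming StopFilter.MakeStopSet from the same API: the set it returns is a CharArraySet, so it is used directly and ignoreCase stays false:

    using System.IO;
    using Lucene.Net.Analysis;

    var stop = StopFilter.MakeStopSet("the", "and", "of"); // a CharArraySet under the hood
    TokenStream ts = new WhitespaceTokenizer(new StringReader("the quick and lazy fox"));
    ts = new StopFilter(true, ts, stop); // this overload passes ignoreCase = false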
Example #8
    /// <summary> Instantiates a new TeeSinkTokenFilter.</summary>
    public TeeSinkTokenFilter(TokenStream input)
        : base(input)
    {
    }
Example #9
 /// <summary> Create a tokenized and indexed field that is not stored, optionally with 
 /// storing term vectors.  This is useful for pre-analyzed fields.
 /// The TokenStream is read only when the Document is added to the index,
 /// i.e. you may not close the TokenStream until <see cref="IndexWriter.AddDocument(Document)" />
 /// has been called.
 /// 
 /// </summary>
 /// <param name="name">The name of the field
 /// </param>
 /// <param name="tokenStream">The TokenStream with the content
 /// </param>
 /// <param name="termVector">Whether term vector should be stored
 /// </param>
 /// <throws>  ArgumentNullException if name or tokenStream is <c>null</c> </throws>
 public Field(System.String name, TokenStream tokenStream, TermVector termVector)
 {
     if (name == null)
         throw new System.ArgumentNullException("name", "name cannot be null");
     if (tokenStream == null)
         throw new System.ArgumentNullException("tokenStream", "tokenStream cannot be null");
     
     this.internalName = StringHelper.Intern(name); // field names are interned
     this.fieldsData = null;
     this.tokenStream = tokenStream;
     
     this.internalIsStored = false;
     
     this.internalIsIndexed = true;
     this.internalIsTokenized = true;
     
     this.internalIsBinary = false;
     
     SetStoreTermVector(termVector);
 }
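A hedged indexing sketch for this constructor; writer (an IndexWriter) and analyzer are assumed to exist already, and Document and Field come from Lucene.Net.Documents:

    using System.IO;
    using Lucene.Net.Analysis;
    using Lucene.Net.Documents;

    var doc = new Document();
    TokenStream ts = analyzer.TokenStream("content", new StringReader("pre-analyzed text"));
    // not stored, indexed and tokenized, term vectors recorded
    doc.Add(new Field("content", ts, Field.TermVector.YES));
    writer.AddDocument(doc); // the TokenStream is only read (and may be closed) here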
Example #10
 /// <summary> Create a tokenized and indexed field that is not stored. Term vectors will
 /// not be stored. This is useful for pre-analyzed fields.
 /// The TokenStream is read only when the Document is added to the index,
 /// i.e. you may not close the TokenStream until <see cref="IndexWriter.AddDocument(Document)" />
 /// has been called.
 /// 
 /// </summary>
 /// <param name="name">The name of the field
 /// </param>
 /// <param name="tokenStream">The TokenStream with the content
 /// </param>
 /// <throws>  ArgumentNullException if name or tokenStream is <c>null</c> </throws>
 public Field(System.String name, TokenStream tokenStream)
     : this(name, tokenStream, TermVector.NO)
 {
 }
Example #11
 /// <summary>Expert: sets the token stream to be used for indexing and causes isIndexed() and isTokenized() to return true.
 /// May be combined with stored values from stringValue() or GetBinaryValue() 
 /// </summary>
 public void  SetTokenStream(TokenStream tokenStream)
 {
     this.internalIsIndexed = true;
     this.internalIsTokenized = true;
     this.tokenStream = tokenStream;
 }
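A short sketch of the combination described in the summary, assuming KeywordTokenizer and LowerCaseFilter from the same API (the field name, value and the existing doc are illustrative); the stored string remains stored while indexing uses the custom stream:

    using System.IO;
    using Lucene.Net.Analysis;
    using Lucene.Net.Documents;

    var f = new Field("title", "Mixed Case Title", Field.Store.YES, Field.Index.NO);
    // after this call the field reports itself as indexed and tokenized,
    // while the stored value set above is left untouched
    f.SetTokenStream(new LowerCaseFilter(new KeywordTokenizer(new StringReader("Mixed Case Title"))));
    doc.Add(f); // doc is an existing Document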