Example #1
 /**
  *
  * @param input Token stream to be filtered.
  * @param charTypeTable Table mapping characters to their types, used when classifying and splitting subwords.
  * @param generateWordParts If 1, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
  * @param generateNumberParts If 1, causes number subwords to be generated: "500-42" => "500" "42"
  * @param catenateWords If 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
  * @param catenateNumbers If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
  * @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
  * @param splitOnCaseChange If 1, causes "PowerShot" to be two tokens ("Power-Shot" remains two parts regardless)
  * @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
  * @param splitOnNumerics If 1, causes "j2se" to be three tokens: "j" "2" "se"
  * @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
  * @param protWords If not null, the set of tokens to protect from being delimited
  */
 public WordDelimiterFilter(TokenStream input, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal, int splitOnNumerics, int stemEnglishPossessive, CharArraySet protWords) : base(input)
 {
     this.generateWordParts     = generateWordParts;
     this.generateNumberParts   = generateNumberParts;
     this.catenateWords         = catenateWords;
     this.catenateNumbers       = catenateNumbers;
     this.catenateAll           = catenateAll;
     this.splitOnCaseChange     = splitOnCaseChange;
     this.preserveOriginal      = preserveOriginal;
     this.charTypeTable         = charTypeTable;
     this.splitOnNumerics       = splitOnNumerics;
     this.stemEnglishPossessive = stemEnglishPossessive;
     this.protWords             = protWords;
 }
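A minimal usage sketch of the filter built by this constructor (not part of the original source): it uses the shorter overload shown in Example #5 below and assumes a WhitespaceTokenizer from the same Lucene.Net-style port; the flag values are purely illustrative.

 // Usage sketch (assumed API): WhitespaceTokenizer and the flag choices below are
 // illustrative only; the overload used is the one shown in Example #5.
 TokenStream source = new WhitespaceTokenizer(new System.IO.StringReader("PowerShot wi-fi 500-42"));
 TokenStream tokens = new WordDelimiterFilter(
     source,
     1,    // generateWordParts:   "PowerShot" => "Power", "Shot"
     1,    // generateNumberParts: "500-42"    => "500", "42"
     0,    // catenateWords
     0,    // catenateNumbers
     0,    // catenateAll
     1,    // splitOnCaseChange
     0,    // preserveOriginal:    do not also emit the original, unsplit token
     1,    // splitOnNumerics:     "j2se" => "j", "2", "se"
     null  // protWords:           no tokens protected from delimiting
 );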
Example #2
        public void inform(ResourceLoader loader)
        {
            string wordFiles = (string)args.get(PROTECTED_TOKENS);

            if (wordFiles != null)
            {
                try
                {
                    // If the value names a single existing file, load it directly
                    if (System.IO.File.Exists(wordFiles))
                    {
                        List /*<String>*/ wlist = loader.getLines(wordFiles);
                        //This cast is safe in Lucene
                        protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
                    }
                    else
                    {
                        // Otherwise treat the value as a comma-separated list of file names
                        List /*<String>*/ files = StrUtils.splitFileNames(wordFiles);
                        for (var iter = files.iterator(); iter.hasNext();)
                        {
                            string            file  = (string)iter.next();
                            List /*<String>*/ wlist = loader.getLines(file.Trim());
                            if (protectedWords == null)
                            {
                                protectedWords = new CharArraySet(wlist, false);
                            }
                            else
                            {
                                protectedWords.addAll(wlist);
                            }
                        }
                    }
                }
                catch (IOException e)
                {
                    throw new System.ApplicationException("Unexpected exception", e);
                }
            }
        }
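The PROTECTED_TOKENS value handled above can be either a single file path or a comma-separated list of files. A sketch of the two shapes follows (not from the original source; the dictionary type and the "protected" key name are assumptions about how the factory's args map is populated):

        // Illustrative only: two possible shapes of the PROTECTED_TOKENS argument.
        var args = new System.Collections.Generic.Dictionary<string, string>
        {
            // a) a single file that exists on disk: its lines are loaded directly
            //    { "protected", "protwords.txt" }
            // b) a comma-separated list: each name is trimmed, loaded, and merged
            //    into one CharArraySet
            { "protected", "protwords.txt, acronyms.txt" }
        };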
Example #3
 public SnowballPorterFilter(TokenStream source, SnowballProgram stemmer, CharArraySet protWords) : base(source)
 {
     this.protWords = protWords;
     this.stemmer   = stemmer;
     this.termAtt   = (TermAttribute)addAttribute(typeof(TermAttribute));
 }
Example #4
 public EnglishPorterFilter(TokenStream source, CharArraySet protWords) :
     base(source, new org.tartarus.snowball.ext.EnglishStemmer(), protWords)
 {
 }
Example #5
 public WordDelimiterFilter(TokenStream input, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal, int splitOnNumerics, CharArraySet protWords) :
     this(input, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, 1, protWords)
 {
 }