/**
 * Creates a WordDelimiterFilter with an explicit character type table.
 * Each flag parameter is an int used as a boolean: 1 enables the behavior, 0 disables it.
 *
 * @param input Token stream to be filtered.
 * @param charTypeTable Table used to classify each character (assumed indexed by char value; TODO confirm against defaultWordDelimTable).
 * @param generateWordParts If 1, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
 * @param generateNumberParts If 1, causes number subwords to be generated: "500-42" => "500" "42"
 * @param catenateWords If 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
 * @param catenateNumbers If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
 * @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
 * @param splitOnCaseChange If 1, causes "PowerShot" to be two tokens ("Power-Shot" remains two parts regardless)
 * @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
 * @param splitOnNumerics If 1, causes "j2se" to be three tokens: "j" "2" "se"
 * @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
 * @param protWords If not null, the set of tokens to protect from being delimited.
 */
public WordDelimiterFilter(TokenStream input, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal, int splitOnNumerics, int stemEnglishPossessive, CharArraySet protWords)
    : base(input)
{
    // Straight parameter-to-field copies; no validation is performed here.
    this.generateWordParts = generateWordParts;
    this.generateNumberParts = generateNumberParts;
    this.catenateWords = catenateWords;
    this.catenateNumbers = catenateNumbers;
    this.catenateAll = catenateAll;
    this.splitOnCaseChange = splitOnCaseChange;
    this.preserveOriginal = preserveOriginal;
    this.charTypeTable = charTypeTable;
    this.splitOnNumerics = splitOnNumerics;
    this.stemEnglishPossessive = stemEnglishPossessive;
    this.protWords = protWords;
}
/**
 * Loads the protected-word set from the file name(s) stored under the
 * PROTECTED_TOKENS key in this factory's init args, using the given loader.
 * If the key is absent, does nothing and protectedWords is left unchanged.
 * NOTE(review): this is translated-from-Java code; File/List/iterator here are
 * Java-style shims, not System.IO/System.Collections types.
 */
public void inform(ResourceLoader loader)
{
    string wordFiles = (string)args.get(PROTECTED_TOKENS);
    if (wordFiles != null)
    {
        try
        {
            // First, try to treat the whole arg value as a single existing file path.
            File protectedWordFiles = new File(wordFiles);
            if (protectedWordFiles.exists())
            {
                List /*<String>*/ wlist = loader.getLines(wordFiles); //This cast is safe in Lucene
                // Case-sensitive set (second arg false = do not ignore case).
                protectedWords = new CharArraySet(wlist, false);//No need to go through StopFilter as before, since it just uses a List internally
            }
            else
            {
                // Otherwise interpret the value as a delimited list of file names
                // and merge the lines of every file into one set.
                List /*<String>*/ files = StrUtils.splitFileNames(wordFiles);
                for (var iter = files.iterator(); iter.hasNext();)
                {
                    string file = (string)iter.next();
                    List /*<String>*/ wlist = loader.getLines(file.Trim());
                    if (protectedWords == null)
                    {
                        // First file seen: create the set.
                        protectedWords = new CharArraySet(wlist, false);
                    }
                    else
                    {
                        // Subsequent files: accumulate into the existing set.
                        protectedWords.addAll(wlist);
                    }
                }
            }
        }
        catch (IOException e)
        {
            // Wrap I/O failures; callers are not expected to recover from a bad word file.
            throw new System.ApplicationException("Unexpected exception", e);
        }
    }
}
/**
 * Creates a filter that stems tokens from the given stream with the supplied
 * Snowball stemmer, skipping any token contained in protWords.
 *
 * @param source Token stream whose terms will be stemmed.
 * @param stemmer Snowball stemmer implementation to apply.
 * @param protWords Tokens to exclude from stemming; may be null.
 */
public SnowballPorterFilter(TokenStream source, SnowballProgram stemmer, CharArraySet protWords)
    : base(source)
{
    this.stemmer = stemmer;
    this.protWords = protWords;
    // Register/fetch the term attribute so incrementToken can read and rewrite terms.
    this.termAtt = (TermAttribute)addAttribute(typeof(TermAttribute));
}
/**
 * English-specific stemming filter: pure convenience wrapper that delegates to
 * SnowballPorterFilter with a freshly constructed Snowball EnglishStemmer.
 *
 * @param source Token stream whose terms will be stemmed.
 * @param protWords Tokens to exclude from stemming; passed through to the base class unchanged.
 */
public EnglishPorterFilter(TokenStream source, CharArraySet protWords) : base(source, new org.tartarus.snowball.ext.EnglishStemmer(), protWords)
{
}
/**
 * Convenience constructor: delegates to the full constructor using the
 * built-in defaultWordDelimTable for character classification and with
 * stemEnglishPossessive hard-wired to 1 (enabled).
 *
 * @param input Token stream to be filtered.
 * @param generateWordParts If 1, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
 * @param generateNumberParts If 1, causes number subwords to be generated: "500-42" => "500" "42"
 * @param catenateWords If 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
 * @param catenateNumbers If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
 * @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
 * @param splitOnCaseChange If 1, causes "PowerShot" to be two tokens ("Power-Shot" remains two parts regardless)
 * @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
 * @param splitOnNumerics If 1, causes "j2se" to be three tokens: "j" "2" "se"
 * @param protWords If not null, the set of tokens to protect from being delimited.
 */
public WordDelimiterFilter(TokenStream input, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal, int splitOnNumerics, CharArraySet protWords)
    : this(input, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, 1, protWords)
{
}