/**
 * Builds a PTBTokenizer from the traditional boolean switches and a custom
 * LexedTokenFactory. The three flags are translated into the
 * comma-separated options String that is handed to PTBLexer.
 * According to the original documentation, when newline tokens are
 * requested they come back as Words whose text is the value of
 * {@code PTBLexer.cr}.
 *
 * @param r The Reader to read tokens from
 * @param tokenizeNLs If true, ",tokenizeNLs" is added to the options so that
 *          newlines are returned as separate tokens (otherwise they
 *          normally disappear as whitespace)
 * @param invertible If true, ",invertible" is added to the options; this is
 *          documented as producing CoreLabels that carry the text before
 *          and after each token plus the character offsets
 * @param suppressEscaping If true, the options start with
 *          "ptb3Escaping=false" (traditional Penn Treebank normalizations
 *          off); otherwise with "ptb3Escaping=true" (all of them on)
 * @param tokenFactory The LexedTokenFactory used to create tokens from
 *          the text
 */
private PTBTokenizer(TextReader r, bool tokenizeNLs, bool invertible, bool suppressEscaping, LexedTokenFactory<T> tokenFactory)
{
    // "ptb3Escaping=true" turns on all the historical PTB normalizations.
    string lexerOptions = suppressEscaping ? "ptb3Escaping=false" : "ptb3Escaping=true";

    if (tokenizeNLs)
    {
        lexerOptions += ",tokenizeNLs";
    }

    if (invertible)
    {
        lexerOptions += ",invertible";
    }

    lexer = new PTBLexer(r, tokenFactory, lexerOptions);
}
/**
 * Creates a WhitespaceTokenizerFactory configured from an options String.
 * The String is parsed with StringUtils.stringToProperties and the
 * "tokenizeNLs" entry is read through PropertiesUtils.getBool, with
 * false passed as the fallback value.
 *
 * @param factory The LexedTokenFactory stored on this factory
 * @param options The options String to parse
 */
public WhitespaceTokenizerFactory(LexedTokenFactory<T> factory, String options)
{
    Dictionary<String, String> parsedOptions = StringUtils.stringToProperties(options);

    this.factory = factory;
    this.tokenizeNLs = PropertiesUtils.getBool(parsedOptions, "tokenizeNLs", false);
}
/**
 * Constructs a new WhitespaceTokenizer.
 *
 * @param factory The LexedTokenFactory passed on to the WhitespaceLexer
 * @param r The Reader that is its source. May be null, in which case no
 *          lexer is created.
 * @param eolIsSignificant Whether eol tokens should be returned.
 */
public WhitespaceTokenizer(LexedTokenFactory<T> factory, TextReader r, bool eolIsSignificant)
{
    this.eolIsSignificant = eolIsSignificant;

    // The null guard is perhaps currently needed for LexicalizedParser, which
    // passes in a null arg while doing type-checking for sentence escaping,
    // and StreamTokenizer barfs on that. It maybe shouldn't live here.
    if (r == null)
    {
        return;
    }

    lexer = new WhitespaceLexer(r, factory);
}
// Constructors

/**
 * Historical constructor: translates the traditional boolean switches into
 * the comma-separated options String stored on this factory.
 *
 * @param tokenizeNLs If true, ",tokenizeNLs" is added to the options
 * @param invertible If true, ",invertible" is added to the options
 * @param suppressEscaping If true, the options start with
 *          "ptb3Escaping=false"; otherwise with "ptb3Escaping=true"
 *          (all the historical PTB normalizations turned on)
 * @param factory The LexedTokenFactory stored on this factory
 */
private PTBTokenizerFactory(bool tokenizeNLs, bool invertible, bool suppressEscaping, LexedTokenFactory<T> factory)
{
    StringBuilder spec = new StringBuilder();

    // "ptb3Escaping=true" turns on all the historical PTB normalizations.
    spec.Append(suppressEscaping ? "ptb3Escaping=false" : "ptb3Escaping=true");

    if (tokenizeNLs)
    {
        spec.Append(",tokenizeNLs");
    }

    if (invertible)
    {
        spec.Append(",invertible");
    }

    this.factory = factory;
    this.options = spec.ToString();
}
/**
 * Creates a WhitespaceTokenizerFactory from an explicit newline setting.
 *
 * @param factory The LexedTokenFactory stored on this factory
 * @param tokenizeNLs The value stored as this factory's tokenizeNLs setting
 */
public WhitespaceTokenizerFactory(LexedTokenFactory<T> factory, bool tokenizeNLs)
{
    this.tokenizeNLs = tokenizeNLs;
    this.factory = factory;
}
/**
 * Creates a WhitespaceTokenizerFactory with tokenizeNLs set to false by
 * delegating to the (factory, bool) constructor.
 *
 * @param factory The LexedTokenFactory stored on this factory
 */
public WhitespaceTokenizerFactory(LexedTokenFactory<T> factory)
    : this(factory, tokenizeNLs: false)
{
}
/**
 * Makes a factory for PTBTokenizers by storing the token factory and the
 * options String unchanged.
 *
 * @param tokenFactory A factory for the token type that the tokenizer will return
 * @param options Options to the tokenizer (see the class documentation for details)
 */
private PTBTokenizerFactory(LexedTokenFactory<T> tokenFactory, String options)
{
    this.options = options;
    this.factory = tokenFactory;
}
/**
 * Builds a new PTBTokenizerFactory that uses the LexedTokenFactory and
 * options passed in.
 *
 * NOTE(review): the leftover marker below shows that the Java original
 * constrained T to HasWord; this signature does not enforce that.
 *
 * @param tokenFactory The LexedTokenFactory
 * @param options A String of options
 * @return A PTBTokenizerFactory parameterized by the token type of the
 *         LexedTokenFactory
 */
public static /*<T extends HasWord>*/ PTBTokenizerFactory<T> newPTBTokenizerFactory(LexedTokenFactory<T> tokenFactory, String options)
{
    PTBTokenizerFactory<T> created = new PTBTokenizerFactory<T>(tokenFactory, options);
    return created;
}
/**
 * Gets a TokenizerFactory that does Penn Treebank tokenization.
 * This is now the recommended factory method to use.
 *
 * @param factory A TokenFactory that determines what form of token is returned by the Tokenizer
 * @param options A String specifying options (see the class javadoc for details)
 * @param <T> The type of the tokens built by the LexedTokenFactory; must be a HasWord
 * @return A TokenizerFactory (a new PTBTokenizerFactory) that does Penn Treebank tokenization
 */
public static TokenizerFactory<T> factory<T>(LexedTokenFactory<T> factory, String options)
    where T : HasWord
{
    TokenizerFactory<T> ptbFactory = new PTBTokenizerFactory<T>(factory, options);
    return ptbFactory;
}
/**
 * Constructs a new PTBTokenizer with a custom LexedTokenFactory.
 * The reader, token factory and options String are passed straight through
 * to a new PTBLexer. Many options for tokenization and what is returned
 * can be set via the options String; see the class documentation for
 * details. This is the new recommended constructor!
 *
 * @param r The Reader to read tokens from.
 * @param tokenFactory The LexedTokenFactory to use to create
 *          tokens from the text.
 * @param options Options to the lexer. See the extensive documentation
 *          in the class javadoc. Per that documentation the String may
 *          be null or empty, which means that all traditional PTB
 *          normalizations are done. You can pass in
 *          "ptb3Escaping=false" and have no normalizations done (that
 *          is, the behavior of the old suppressEscaping=true option).
 */
public PTBTokenizer(TextReader r, LexedTokenFactory<T> tokenFactory, String options)
{
    PTBLexer ptbLexer = new PTBLexer(r, tokenFactory, options);
    lexer = ptbLexer;
}
/* user code: */

/**
 * Creates a WhitespaceLexer over the given reader and records the token
 * factory. Construction is chained to the single-argument (reader)
 * constructor before the factory is stored.
 * <br>
 * Background on the whitespace conventions used by this lexer:
 * see http://www.w3.org/TR/newline on Web newline chars: NEL, LS, PS;
 * see http://unicode.org/reports/tr13/tr13-9.html and
 * http://www.unicode.org/unicode/reports/tr18/#Line_Boundaries
 * for Unicode conventions, including other separators (vertical tab and
 * form feed).
 * <br>
 * The zero width joiner/non-joiner (U+200C, U+200D) are not interpreted
 * as white spaces.
 * <br>
 * No longer %standalone. See WhitespaceTokenizer for a main method.
 *
 * @param r The Reader to lex
 * @param tf The LexedTokenFactory stored as this lexer's token factory
 */
public WhitespaceLexer(TextReader r, LexedTokenFactory<object> tf)
    : this(r)
{
    tokenFactory = tf;
}
/* user code: */

/**
 * Creates a WhitespaceLexer over the given reader and records the token
 * factory. Construction is chained to the single-argument (reader)
 * constructor before the factory is stored.
 * <br>
 * Background on the whitespace conventions used by this lexer:
 * see http://www.w3.org/TR/newline on Web newline chars: NEL, LS, PS;
 * see http://unicode.org/reports/tr13/tr13-9.html and
 * http://www.unicode.org/unicode/reports/tr18/#Line_Boundaries
 * for Unicode conventions, including other separators (vertical tab and
 * form feed).
 * <br>
 * The zero width joiner/non-joiner (U+200C, U+200D) are not interpreted
 * as white spaces.
 * <br>
 * No longer %standalone. See WhitespaceTokenizer for a main method.
 *
 * @param r The Reader to lex
 * @param tf The LexedTokenFactory stored as this lexer's token factory
 */
public WhitespaceLexer(TextReader r, LexedTokenFactory<object> tf)
    : this(r)
{
    tokenFactory = tf;
}