/// <summary>
/// Constructs a new PTBTokenizer from the traditional boolean switches plus a
/// custom token factory.
/// </summary>
/// <remarks>
/// This constructor translates the traditional bool options of PTBTokenizer
/// into the newer comma-separated options string and passes that string to
/// the <c>PTBLexer</c>. The string always begins with an explicit
/// <c>ptb3Escaping=true</c> or <c>ptb3Escaping=false</c> entry, optionally
/// followed by <c>,tokenizeNLs</c> and then <c>,invertible</c>.
/// If newline tokens are requested, CRs come back as Words whose text is the
/// value of <c>PTBLexer.cr</c>.
/// </remarks>
/// <param name="r">The reader to read tokens from.</param>
/// <param name="tokenizeNLs">
/// Whether to return newlines as separate tokens (otherwise they normally
/// disappear as whitespace).
/// </param>
/// <param name="invertible">
/// If true, produces CoreLabels which carry the string before and after each
/// token and the character offsets.
/// </param>
/// <param name="suppressEscaping">
/// If true, all the traditional Penn Treebank normalizations are turned off;
/// otherwise they all happen.
/// </param>
/// <param name="tokenFactory">
/// The token factory used to create tokens from the text.
/// </param>
private PTBTokenizer(TextReader r, bool tokenizeNLs, bool invertible, bool suppressEscaping, LexedTokenFactory<T> tokenFactory)
{
    // The escaping entry is always emitted; "true" turns on all the
    // historical PTB normalizations.
    string escaping = suppressEscaping ? "ptb3Escaping=false" : "ptb3Escaping=true";

    // Each optional flag contributes its own comma-prefixed entry, in this order.
    string newlines = tokenizeNLs ? ",tokenizeNLs" : string.Empty;
    string offsets = invertible ? ",invertible" : string.Empty;

    lexer = new PTBLexer(r, tokenFactory, escaping + newlines + offsets);
}
/// <summary>
/// Constructs a new PTBTokenizer with a custom token factory and an options
/// string. This is the recommended constructor.
/// </summary>
/// <remarks>
/// Many aspects of tokenization, and of what is returned, can be set via the
/// options string; see the class documentation for the supported options.
/// The reader, factory and options are passed unchanged to a new
/// <c>PTBLexer</c>.
/// </remarks>
/// <param name="r">The reader to read tokens from.</param>
/// <param name="tokenFactory">
/// The token factory used to create tokens from the text.
/// </param>
/// <param name="options">
/// Options to the lexer. May be null or empty, which means that all
/// traditional PTB normalizations are done. Passing
/// <c>ptb3Escaping=false</c> turns all normalizations off (the behavior of
/// the old <c>suppressEscaping=true</c> option).
/// </param>
public PTBTokenizer(TextReader r, LexedTokenFactory<T> tokenFactory, String options)
{
    lexer = new PTBLexer(
        r,
        tokenFactory,
        options);
}
/// <summary>
/// Constructs a new PTBTokenizer with a custom token factory and an options
/// string. This is the recommended constructor.
/// </summary>
/// <remarks>
/// Many aspects of tokenization, and of what is returned, can be set via the
/// options string; see the class documentation for the supported options.
/// The reader, factory and options are passed unchanged to a new
/// <c>PTBLexer</c>.
/// </remarks>
/// <param name="r">The reader to read tokens from.</param>
/// <param name="tokenFactory">
/// The token factory used to create tokens from the text.
/// </param>
/// <param name="options">
/// Options to the lexer; see the class documentation. May be null or empty,
/// which means that all traditional PTB normalizations are done. Passing
/// <c>ptb3Escaping=false</c> turns all normalizations off (the behavior of
/// the old <c>suppressEscaping=true</c> option).
/// </param>
public PTBTokenizer(Reader r, ILexedTokenFactory<T> tokenFactory, string options)
{
    lexer = new PTBLexer(
        r,
        tokenFactory,
        options);
}