Exemple #1
0
        /// <summary>
        /// Constructs a new PTBTokenizer that optionally returns carriage returns
        /// as their own token, and has a custom LexedTokenFactory.
        /// </summary>
        /// <remarks>
        /// If asked for, CRs come back as Words whose text is
        /// the value of <c>PTBLexer.cr</c>.  This constructor translates
        /// between the traditional boolean options of PTBTokenizer and the new
        /// options String.
        /// </remarks>
        /// <param name="r">The Reader to read tokens from</param>
        /// <param name="tokenizeNLs">
        /// Whether to return newlines as separate tokens
        /// (otherwise they normally disappear as whitespace)
        /// </param>
        /// <param name="invertible">
        /// If set to true, then will produce CoreLabels which
        /// will have fields for the string before and after, and the
        /// character offsets
        /// </param>
        /// <param name="suppressEscaping">
        /// If true, all the traditional Penn Treebank
        /// normalizations are turned off.  Otherwise, they all happen.
        /// </param>
        /// <param name="tokenFactory">
        /// The LexedTokenFactory to use to create
        /// tokens from the text.
        /// </param>
        private PTBTokenizer(TextReader r,
                             bool tokenizeNLs,
                             bool invertible,
                             bool suppressEscaping,
                             LexedTokenFactory <T> tokenFactory)
        {
            // Translate the legacy boolean flags into the comma-separated
            // options string that is passed to PTBLexer.  The escaping option
            // is always written out explicitly and always comes first; "true"
            // means: turn on all the historical PTB normalizations.
            string lexerOptions = suppressEscaping
                ? "ptb3Escaping=false"
                : "ptb3Escaping=true";

            // The remaining flags are only appended when they are switched on.
            if (tokenizeNLs)
            {
                lexerOptions += ",tokenizeNLs";
            }

            if (invertible)
            {
                lexerOptions += ",invertible";
            }

            lexer = new PTBLexer(r, tokenFactory, lexerOptions);
        }
Exemple #2
0
 /// <summary>
 /// Constructs a new PTBTokenizer with a custom LexedTokenFactory.
 /// </summary>
 /// <remarks>
 /// Many options for tokenization and what is returned can be set via
 /// the options String. See the class documentation for details on
 /// the options String.  This is the new recommended constructor!
 /// </remarks>
 /// <param name="r">The Reader to read tokens from.</param>
 /// <param name="tokenFactory">
 /// The LexedTokenFactory to use to create
 /// tokens from the text.
 /// </param>
 /// <param name="options">
 /// Options to the lexer.  See the extensive description of the options
 /// in the class documentation.  The String may be null or empty,
 /// which means that all traditional PTB normalizations are
 /// done.  You can pass in "ptb3Escaping=false" and have no
 /// normalizations done (that is, the behavior of the old
 /// suppressEscaping=true option).
 /// </param>
 public PTBTokenizer(TextReader r, LexedTokenFactory <T> tokenFactory, String options)
 {
     // The options string is handed to the lexer unchanged, together with
     // the reader and the token factory; no interpretation happens here.
     var configuredLexer = new PTBLexer(r, tokenFactory, options);
     lexer = configuredLexer;
 }
 /// <summary>Constructs a new PTBTokenizer with a custom LexedTokenFactory.</summary>
 /// <remarks>
 /// Constructs a new PTBTokenizer with a custom LexedTokenFactory.
 /// Many options for tokenization and what is returned can be set via
 /// the options String. See the class documentation for details on
 /// the options String.  This is the new recommended constructor!
 /// </remarks>
 /// <param name="r">The Reader to read tokens from.</param>
 /// <param name="tokenFactory">
 /// The LexedTokenFactory to use to create
 /// tokens from the text.
 /// </param>
 /// <param name="options">
 /// Options to the lexer.  See the extensive description of the options
 /// in the class documentation.  The String may be null or empty,
 /// which means that all traditional PTB normalizations are
 /// done.  You can pass in "ptb3Escaping=false" and have no
 /// normalizations done (that is, the behavior of the old
 /// suppressEscaping=true option).
 /// </param>
 public PTBTokenizer(
     Reader r,
     ILexedTokenFactory <T> tokenFactory,
     string options)
 {
     // Build the lexer from the caller's reader, token factory and raw
     // options string, then store it as this tokenizer's lexer.
     var configuredLexer = new PTBLexer(r, tokenFactory, options);
     lexer = configuredLexer;
 }