コード例 #1
0
        /**
         * Legacy constructor that translates the traditional bool switches of
         * PTBTokenizer into the comma-separated options String and creates
         * the PTBLexer with it.
         *
         * The options String always starts with "ptb3Escaping=false" or
         * "ptb3Escaping=true"; ",tokenizeNLs" and ",invertible" are appended
         * (in that order) when the matching flag is set.
         *
         * @param r The Reader to read tokens from
         * @param tokenizeNLs If true, the "tokenizeNLs" option is passed to the
         *         lexer (newlines are then meant to come back as separate
         *         tokens instead of disappearing as whitespace)
         * @param invertible If true, the "invertible" option is passed to the
         *         lexer (tokens are then meant to carry the text before and
         *         after them plus character offsets)
         * @param suppressEscaping If true, "ptb3Escaping=false" is passed, i.e.
         *         the traditional Penn Treebank normalizations are turned off;
         *         otherwise "ptb3Escaping=true" is passed
         * @param tokenFactory The LexedTokenFactory handed to the lexer to
         *         create tokens from the text
         */
        private PTBTokenizer(TextReader r,
                             bool tokenizeNLs,
                             bool invertible,
                             bool suppressEscaping,
                             LexedTokenFactory <T> tokenFactory)
        {
            // The escaping setting always comes first, so every later flag
            // can be appended with a leading comma.
            string lexerOptions = suppressEscaping
                                  ? "ptb3Escaping=false"
                                  : "ptb3Escaping=true"; // i.e., all the historical PTB normalizations

            if (tokenizeNLs)
            {
                lexerOptions += ",tokenizeNLs";
            }
            if (invertible)
            {
                lexerOptions += ",invertible";
            }

            lexer = new PTBLexer(r, tokenFactory, lexerOptions);
        }
コード例 #2
0
            /**
             * Creates a factory configured from an options String.
             * The String is parsed with StringUtils.stringToProperties and the
             * "tokenizeNLs" property is read as a bool, defaulting to false
             * when it is absent.
             *
             * @param factory The LexedTokenFactory stored on this factory
             * @param options The options String; only "tokenizeNLs" is read here
             */
            public WhitespaceTokenizerFactory(LexedTokenFactory <T> factory,
                                              String options)
            {
                Dictionary <String, String> parsedOptions = StringUtils.stringToProperties(options);

                this.factory     = factory;
                this.tokenizeNLs = PropertiesUtils.getBool(parsedOptions, "tokenizeNLs", false);
            }
コード例 #3
0
 /**
  * Constructs a new WhitespaceTokenizer.
  * A WhitespaceLexer is only created when a Reader is supplied; with a
  * null Reader the lexer is left unset.
  *
  * @param factory The LexedTokenFactory handed to the WhitespaceLexer
  * @param r The Reader that is its source; may be null
  * @param eolIsSignificant Whether eol tokens should be returned
  */
 public WhitespaceTokenizer(LexedTokenFactory <T> factory,
                            TextReader r, bool eolIsSignificant)
 {
     this.eolIsSignificant = eolIsSignificant;

     // The null guard is perhaps currently needed for LexicalizedParser, which
     // passes in a null Reader while doing type-checking for sentence escaping,
     // and StreamTokenizer barfs on that.  It arguably does not belong here.
     if (r == null)
     {
         return;
     }

     lexer = new WhitespaceLexer(r, factory);
 }
コード例 #4
0
            // Constructors

            // This one is historical
            /**
             * Builds a factory from the traditional bool switches by
             * translating them into the comma-separated options String.
             *
             * The String always starts with "ptb3Escaping=false" or
             * "ptb3Escaping=true"; ",tokenizeNLs" and ",invertible" are
             * appended (in that order) when the matching flag is set.
             *
             * @param tokenizeNLs If true, adds the "tokenizeNLs" option
             * @param invertible If true, adds the "invertible" option
             * @param suppressEscaping If true, "ptb3Escaping=false" is used;
             *         otherwise "ptb3Escaping=true"
             * @param factory The LexedTokenFactory stored on this factory
             */
            private PTBTokenizerFactory(bool tokenizeNLs, bool invertible, bool suppressEscaping, LexedTokenFactory <T> factory)
            {
                // The escaping setting always comes first, so every later
                // flag can be appended with a leading comma.
                string optionString = suppressEscaping
                                      ? "ptb3Escaping=false"
                                      : "ptb3Escaping=true"; // i.e., all the historical PTB normalizations

                if (tokenizeNLs)
                {
                    optionString += ",tokenizeNLs";
                }
                if (invertible)
                {
                    optionString += ",invertible";
                }

                this.factory = factory;
                this.options = optionString;
            }
コード例 #5
0
 /**
  * Creates a factory from an explicit token factory and newline flag.
  * Both arguments are stored unchanged.
  *
  * @param factory The LexedTokenFactory stored on this factory
  * @param tokenizeNLs The value stored as this factory's tokenizeNLs setting
  */
 public WhitespaceTokenizerFactory(LexedTokenFactory <T> factory,
                                   bool tokenizeNLs)
 {
     this.tokenizeNLs = tokenizeNLs;
     this.factory     = factory;
 }
コード例 #6
0
 /**
  * Creates a factory by delegating to the two-argument constructor with
  * false as the second argument.
  *
  * @param factory The LexedTokenFactory passed through to that constructor
  */
 public WhitespaceTokenizerFactory(LexedTokenFactory <T> factory)
     : this(factory, false)
 {
     // Nothing else to do: all initialization happens in the delegate.
 }
コード例 #7
0
 /**
  * Creates a PTBTokenizerFactory from a token factory and an options
  * String.  Both arguments are stored unchanged.
  *
  * @param tokenFactory A factory for the token type that the tokenizer will return
  * @param options Options to the tokenizer (see the class documentation for details)
  */
 private PTBTokenizerFactory(LexedTokenFactory <T> tokenFactory, String options)
 {
     this.options = options;
     this.factory = tokenFactory;
 }
コード例 #8
0
 /**
  * Creates a new PTBTokenizerFactory that uses the LexedTokenFactory and
  * options passed in.
  *
  * @param tokenFactory The LexedTokenFactory given to the new factory
  * @param options A String of options given to the new factory
  * @return A PTBTokenizerFactory parameterized by the token type of the
  *         LexedTokenFactory
  */
 public static /*<T extends HasWord>*/ PTBTokenizerFactory <T> newPTBTokenizerFactory(LexedTokenFactory <T> tokenFactory, String options)
 {
     PTBTokenizerFactory <T> tokenizerFactory = new PTBTokenizerFactory <T>(tokenFactory, options);

     return tokenizerFactory;
 }
コード例 #9
0
 /**
  * Get a TokenizerFactory that does Penn Treebank tokenization.
  * The returned object is a PTBTokenizerFactory built from the two
  * arguments.  This is now the recommended factory method to use.
  *
  * @param factory A token factory that determines what form of token is returned by the Tokenizer
  * @param options A String specifying options (see the class documentation for details)
  * @param <T> The type of the tokens built by the LexedTokenFactory
  * @return A TokenizerFactory that does Penn Treebank tokenization
  */
 public static /*<T extends HasWord>*/ TokenizerFactory <T> factory <T>(LexedTokenFactory <T> factory, String options) where T : HasWord
 {
     TokenizerFactory <T> tokenizerFactory = new PTBTokenizerFactory <T>(factory, options);

     return tokenizerFactory;
 }
コード例 #10
0
 /**
  * Constructs a new PTBTokenizer with a custom LexedTokenFactory by
  * creating a PTBLexer from the three arguments.
  * Tokenization behavior and what is returned are controlled through
  * the options String; see the class documentation for details.
  * This is the new recommended constructor!
  *
  * @param r The Reader to read tokens from.
  * @param tokenFactory The LexedTokenFactory to use to create
  *         tokens from the text.
  * @param options Options to the lexer, passed through unchanged.  Per the
  *         class documentation, the String may be null or empty, which
  *         means that all traditional PTB normalizations are done, and
  *         "ptb3Escaping=false" turns all normalizations off (the
  *         behavior of the old suppressEscaping=true option).
  */
 public PTBTokenizer(TextReader r,
                     LexedTokenFactory <T> tokenFactory,
                     String options)
 {
     this.lexer = new PTBLexer(r, tokenFactory, options);
 }
コード例 #11
0
        /* user code: */

/**
 * See: http://www.w3.org/TR/newline on Web newline chars: NEL, LS, PS.
 * See: http://unicode.org/reports/tr13/tr13-9.html and
 * http://www.unicode.org/unicode/reports/tr18/#Line_Boundaries
 * for Unicode conventions,
 * including other separators (vertical tab and form feed).
 * <br>
 * We do not interpret the zero width joiner/non-joiner (U+200C,
 * U+200D) as white spaces.
 * <br>
 * No longer %standalone.  See WhitespaceTokenizer for a main method.
 */

        // Creates a lexer over the given reader by delegating to the
        // single-argument constructor, then stores the supplied token factory.
        public WhitespaceLexer(TextReader r, LexedTokenFactory <object> tf)
            : this(r)
        {
            this.tokenFactory = tf;
        }
コード例 #12
0
ファイル: WhitespaceLexer.cs プロジェクト: gblosser/OpenNlp
  /* user code: */
/**
 * See: http://www.w3.org/TR/newline on Web newline chars: NEL, LS, PS.
   See: http://unicode.org/reports/tr13/tr13-9.html and
   http://www.unicode.org/unicode/reports/tr18/#Line_Boundaries
   for Unicode conventions,
   including other separators (vertical tab and form feed).
   <br>
   We do not interpret the zero width joiner/non-joiner (U+200C,
   U+200D) as white spaces.
   <br>
   No longer %standalone.  See WhitespaceTokenizer for a main method.
 */

  // Creates a lexer over the given reader by delegating to the
  // single-argument constructor, then stores the supplied token factory.
  public WhitespaceLexer(TextReader r, LexedTokenFactory<object> tf)
    : this(r)
  {
    this.tokenFactory = tf;
  }