Example #1
            public WhitespaceTokenizerFactory(ILexedTokenFactory<T> factory, string options)
            {
                this.factory = factory;
                Properties prop = StringUtils.StringToProperties(options);

                this.tokenizeNLs = PropertiesUtils.GetBool(prop, "tokenizeNLs", false);
            }
            private PTBTokenizerFactory(bool tokenizeNLs, bool invertible, bool suppressEscaping, ILexedTokenFactory<T> factory)
            {
                // Constructors
                // This one is historical
                this.factory = factory;
                StringBuilder optionsSB = new StringBuilder();

                if (suppressEscaping)
                {
                    optionsSB.Append("ptb3Escaping=false");
                }
                else
                {
                    optionsSB.Append("ptb3Escaping=true");
                }
                // i.e., turn on all the historical PTB normalizations
                if (tokenizeNLs)
                {
                    optionsSB.Append(",tokenizeNLs");
                }
                if (invertible)
                {
                    optionsSB.Append(",invertible");
                }
                this.options = optionsSB.ToString();
            }
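The first constructor above (WhitespaceTokenizerFactory) drives its only flag off the options string: StringUtils.StringToProperties parses the comma-separated options and PropertiesUtils.GetBool reads tokenizeNLs with a default of false. A short sketch of that parsing path, using only the calls shown above; treating a bare "tokenizeNLs" key as "true" is an assumption carried over from the Java original.

 // Sketch of the options parsing used in Example #1.
 Properties p1 = StringUtils.StringToProperties("tokenizeNLs=true");
 bool nls1 = PropertiesUtils.GetBool(p1, "tokenizeNLs", false);   // true
 Properties p2 = StringUtils.StringToProperties("tokenizeNLs=false");
 bool nls2 = PropertiesUtils.GetBool(p2, "tokenizeNLs", false);   // false
 Properties p3 = StringUtils.StringToProperties("tokenizeNLs");   // bare flag, assumed to parse as "true"
 bool nls3 = PropertiesUtils.GetBool(p3, "tokenizeNLs", false);   // true, under that assumption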
Example #3
 /// <summary>Constructs a new WhitespaceTokenizer.</summary>
 /// <param name="factory">The LexedTokenFactory to use to create tokens from the text.</param>
 /// <param name="r">The Reader that is its source.</param>
 /// <param name="eolIsSignificant">Whether eol tokens should be returned.</param>
 public WhitespaceTokenizer(ILexedTokenFactory factory, Reader r, bool eolIsSignificant)
 {
     this.eolIsSignificant = eolIsSignificant;
     // The conditional below is perhaps currently needed in LexicalizedParser, since
     // it passes in a null arg while doing type-checking for sentence escaping
     // but StreamTokenizer barfs on that.  But maybe shouldn't be here.
     if (r != null)
     {
         lexer = new WhitespaceLexer(r, factory);
     }
 }
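A hedged usage sketch for this constructor. The generic argument, WordTokenFactory, and java.io.StringReader are assumptions about this port, modeled on the Java original and not confirmed by this listing; the point is the effect of eolIsSignificant.

 // Hypothetical usage; WhitespaceTokenizer<Word>, WordTokenFactory and StringReader are assumed.
 var keepEols = new WhitespaceTokenizer<Word>(new WordTokenFactory(), new StringReader("a b\nc"), true);
 // eolIsSignificant == true: the newline between "b" and "c" comes back as its own token.
 var dropEols = new WhitespaceTokenizer<Word>(new WordTokenFactory(), new StringReader("a b\nc"), false);
 // eolIsSignificant == false: newlines disappear as ordinary whitespace; only "a", "b", "c" are produced.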
Example #4
 /// <summary>Constructs a new FrenchTokenizer.</summary>
 /// <param name="r">The Reader to read tokens from.</param>
 /// <param name="tf">The LexedTokenFactory to use to create tokens from the text.</param>
 /// <param name="lexerProperties">Options to the underlying lexer.</param>
 /// <param name="splitCompounds">Whether to split compound words.</param>
 /// <param name="splitContractions">Whether to split contractions.</param>
 public FrenchTokenizer(Reader r, ILexedTokenFactory<T> tf, Properties lexerProperties, bool splitCompounds, bool splitContractions)
 {
     // The underlying JFlex lexer
     // Internal fields for compound splitting
     // Produces the tokenization for parsing used by Green, de Marneffe, and Manning (2011)
     lexer = new FrenchLexer(r, tf, lexerProperties);
     this.splitCompounds    = splitCompounds;
     this.splitContractions = splitContractions;
     if (splitCompounds || splitContractions)
     {
         compoundBuffer = Generics.NewLinkedList();
     }
 }
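A construction sketch for the FrenchTokenizer constructor above. CoreLabel, CoreLabelTokenFactory, java.util.Properties and java.io.StringReader are assumptions about this port (they mirror the Java original); an empty Properties stands in for default lexer options.

 // Hypothetical usage; everything except the constructor signature itself is an assumption.
 var french = new FrenchTokenizer<CoreLabel>(new StringReader("l'auteur du porte-monnaie"),
                                             new CoreLabelTokenFactory(),
                                             new Properties(),
                                             splitCompounds: true,
                                             splitContractions: true);
 // With either split flag set, the constructor allocates compoundBuffer for the split-off pieces.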
Example #5
 /// <summary>Constructs a new SpanishTokenizer.</summary>
 /// <param name="r">The Reader to read tokens from.</param>
 /// <param name="tf">The LexedTokenFactory to use to create tokens from the text.</param>
 /// <param name="lexerProperties">Options to the underlying lexer.</param>
 /// <param name="splitCompounds">Whether to split compound words.</param>
 /// <param name="splitVerbs">Whether to split verbs with attached pronouns.</param>
 /// <param name="splitContractions">Whether to split contractions.</param>
 public SpanishTokenizer(Reader r, ILexedTokenFactory<T> tf, Properties lexerProperties, bool splitCompounds, bool splitVerbs, bool splitContractions)
 {
     // The underlying JFlex lexer
     // Internal fields for compound splitting
     // Produces the tokenization for parsing used by AnCora (fixed)
     lexer = new SpanishLexer(r, tf, lexerProperties);
     this.splitCompounds    = splitCompounds;
     this.splitVerbs        = splitVerbs;
     this.splitContractions = splitContractions;
     this.splitAny          = (splitCompounds || splitVerbs || splitContractions);
     if (splitAny)
     {
         compoundBuffer = Generics.NewArrayList(4);
     }
     if (splitVerbs)
     {
         verbStripper = SpanishVerbStripper.GetInstance();
     }
 }
 private ArabicTokenizerFactory(ILexedTokenFactory<T> factory)
 {
     this.factory = factory;
 }
 public ArabicTokenizer(Reader r, ILexedTokenFactory<T> tf, Properties lexerProperties)
 {
     lexer = new ArabicLexer(r, tf, lexerProperties);
 }
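The SpanishTokenizer constructor in Example #5 folds its three split flags into splitAny, allocates compoundBuffer only when some splitting is requested, and fetches the SpanishVerbStripper singleton only when verb splitting is on. A hedged construction sketch (CoreLabel, CoreLabelTokenFactory, Properties and StringReader are assumptions about this port):

 // Hypothetical usage; with all three flags set, splitAny is true, so compoundBuffer
 // is allocated and SpanishVerbStripper.GetInstance() is called, exactly as in the constructor above.
 var spanish = new SpanishTokenizer<CoreLabel>(new StringReader("dámelo"),
                                               new CoreLabelTokenFactory(),
                                               new Properties(),
                                               splitCompounds: true,
                                               splitVerbs: true,
                                               splitContractions: true);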
Example #8
 public WhitespaceTokenizerFactory(ILexedTokenFactory<T> factory, bool tokenizeNLs)
 {
     this.factory     = factory;
     this.tokenizeNLs = tokenizeNLs;
 }
Example #9
 /// <summary>Make a factory for SpanishTokenizers, default options</summary>
 private SpanishTokenizerFactory(ILexedTokenFactory<T> factory)
 {
     // Constructors
     this.factory = factory;
 }
Example #10
 public WhitespaceTokenizerFactory(ILexedTokenFactory<T> factory)
     : this(factory, false)
 {
 }
Example #11
 /// <summary>Make a factory for SpanishTokenizers, options passed in</summary>
 private SpanishTokenizerFactory(ILexedTokenFactory<T> factory, string options)
 {
     this.factory = factory;
     SetOptions(options);
 }
Example #12
 /// <summary>Makes a factory for SpanishTokenizers that return T objects and use the options passed in.</summary>
 /// <param name="factory">a factory for the token type that the tokenizer will return</param>
 /// <param name="options">a String of options, separated by commas</param>
 /// <returns>A TokenizerFactory that returns the right token types</returns>
 public static SpanishTokenizer.SpanishTokenizerFactory<T> NewSpanishTokenizerFactory<T>(ILexedTokenFactory<T> factory, string options)
     where T : IHasWord
 {
     return new SpanishTokenizer.SpanishTokenizerFactory<T>(factory, options);
 }
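A hedged call-site sketch for the factory method above. The class that hosts NewSpanishTokenizerFactory is not shown in this listing, and CoreLabelTokenFactory, the "tokenizeNLs" option value, and a GetTokenizer(Reader) method on the returned factory are all assumptions modeled on the Java API.

 // Hypothetical usage; the hosting class, helper types and GetTokenizer(Reader) are assumed.
 var spanishFactory = NewSpanishTokenizerFactory<CoreLabel>(new CoreLabelTokenFactory(), "tokenizeNLs");
 var spanishTokenizer = spanishFactory.GetTokenizer(new StringReader("el niño corre"));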
Example #13
 public static ITokenizerFactory<T> Factory<T>(ILexedTokenFactory<T> factory, string options)
     where T : IHasWord
 {
     return new FrenchTokenizer.FrenchTokenizerFactory<T>(factory, options);
 }
Example #14
 private FrenchTokenizerFactory(ILexedTokenFactory<T> factory)
 {
     this.factory = factory;
 }
 /// <summary>
 /// Constructs a new PTBTokenizer that uses the LexedTokenFactory and
 /// options passed in.
 /// </summary>
 /// <param name="tokenFactory">The LexedTokenFactory</param>
 /// <param name="options">A String of options</param>
 /// <returns>
 /// A TokenizerFactory that returns objects of the type of the
 /// LexedTokenFactory
 /// </returns>
 public static PTBTokenizer.PTBTokenizerFactory<T> NewPTBTokenizerFactory<T>(ILexedTokenFactory<T> tokenFactory, string options)
     where T : IHasWord
 {
     return new PTBTokenizer.PTBTokenizerFactory<T>(tokenFactory, options);
 }
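A hedged call-site sketch for NewPTBTokenizerFactory. As above, the hosting class, CoreLabel/CoreLabelTokenFactory and GetTokenizer(Reader) are assumptions; "invertible=true" is one of the documented options and makes the resulting CoreLabels carry before/after text and character offsets.

 // Hypothetical usage; qualification, helper types and GetTokenizer(Reader) are assumed.
 var ptbFactory = NewPTBTokenizerFactory<CoreLabel>(new CoreLabelTokenFactory(), "invertible=true");
 var invertibleTokenizer = ptbFactory.GetTokenizer(new StringReader("I can't do that."));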
 /// <summary>Constructs a new PTBTokenizer with a custom LexedTokenFactory.</summary>
 /// <remarks>
 /// Constructs a new PTBTokenizer with a custom LexedTokenFactory.
 /// Many options for tokenization and what is returned can be set via
 /// the options String. See the class documentation for details on
 /// the options String.  This is the new recommended constructor!
 /// </remarks>
 /// <param name="r">The Reader to read tokens from.</param>
 /// <param name="tokenFactory">
 /// The LexedTokenFactory to use to create
 /// tokens from the text.
 /// </param>
 /// <param name="options">
 /// Options to the lexer.  See the extensive documentation
 /// in the class javadoc.  The String may be null or empty,
 /// which means that all traditional PTB normalizations are
 /// done.  You can pass in "ptb3Escaping=false" and have no
 /// normalizations done (that is, the behavior of the old
 /// suppressEscaping=true option).
 /// </param>
 public PTBTokenizer(Reader r, ILexedTokenFactory<T> tokenFactory, string options)
 {
     lexer = new PTBLexer(r, tokenFactory, options);
 }
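A direct-construction sketch for the constructor above. PTBTokenizer<CoreLabel>, CoreLabelTokenFactory and StringReader are assumptions about this port; the option strings are the ones the documentation describes.

 // Hypothetical usage of the constructor above.
 string text = "A sample sentence (with parentheses).";
 // Empty options: all traditional PTB normalizations are applied.
 var ptbDefault = new PTBTokenizer<CoreLabel>(new StringReader(text), new CoreLabelTokenFactory(), "");
 // ptb3Escaping=false: no normalizations, i.e. the behavior of the old suppressEscaping=true option.
 var ptbRaw = new PTBTokenizer<CoreLabel>(new StringReader(text), new CoreLabelTokenFactory(), "ptb3Escaping=false");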
        /// <summary>
        /// Constructs a new PTBTokenizer that optionally returns carriage returns
        /// as their own token, and has a custom LexedTokenFactory.
        /// </summary>
        /// <remarks>
        /// Constructs a new PTBTokenizer that optionally returns carriage returns
        /// as their own token, and has a custom LexedTokenFactory.
        /// If asked for, CRs come back as Words whose text is
        /// the value of
        /// <c>PTBLexer.cr</c>
        /// .  This constructor translates
        /// between the traditional boolean options of PTBTokenizer and the new
        /// options String.
        /// </remarks>
        /// <param name="r">The Reader to read tokens from</param>
        /// <param name="tokenizeNLs">
        /// Whether to return newlines as separate tokens
        /// (otherwise they normally disappear as whitespace)
        /// </param>
        /// <param name="invertible">
        /// if set to true, then will produce CoreLabels which
        /// will have fields for the string before and after, and the
        /// character offsets
        /// </param>
        /// <param name="suppressEscaping">
        /// If true, all the traditional Penn Treebank
        /// normalizations are turned off.  Otherwise, they all happen.
        /// </param>
        /// <param name="tokenFactory">
        /// The LexedTokenFactory to use to create
        /// tokens from the text.
        /// </param>
        private PTBTokenizer(Reader r, bool tokenizeNLs, bool invertible, bool suppressEscaping, ILexedTokenFactory<T> tokenFactory)
        {
            StringBuilder options = new StringBuilder();

            if (suppressEscaping)
            {
                options.Append("ptb3Escaping=false");
            }
            else
            {
                options.Append("ptb3Escaping=true");
            }
            // i.e., turn on all the historical PTB normalizations
            if (tokenizeNLs)
            {
                options.Append(",tokenizeNLs");
            }
            if (invertible)
            {
                options.Append(",invertible");
            }
            lexer = new PTBLexer(r, tokenFactory, options.ToString());
        }
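The boolean parameters documented above are translated into the options string mechanically; a worked mapping, read directly off the constructor body:

        // tokenizeNLs=false, invertible=false, suppressEscaping=false  ->  "ptb3Escaping=true"
        // tokenizeNLs=true,  invertible=false, suppressEscaping=false  ->  "ptb3Escaping=true,tokenizeNLs"
        // tokenizeNLs=true,  invertible=true,  suppressEscaping=true   ->  "ptb3Escaping=false,tokenizeNLs,invertible"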
Example #18
 private FrenchTokenizerFactory(ILexedTokenFactory<T> factory, string options)
     : this(factory)
 {
     SetOptions(options);
 }
 /// <summary>Make a factory for PTBTokenizers.</summary>
 /// <param name="tokenFactory">A factory for the token type that the tokenizer will return</param>
 /// <param name="options">Options to the tokenizer (see the class documentation for details)</param>
 private PTBTokenizerFactory(ILexedTokenFactory<T> tokenFactory, string options)
 {
     this.factory = tokenFactory;
     this.options = options;
 }
 /// <summary>See: http://www.w3.org/TR/newline on Web newline chars: NEL, LS, PS.</summary>
 /// <remarks>
 /// See: http://www.w3.org/TR/newline on Web newline chars: NEL, LS, PS.
 /// See: http://unicode.org/reports/tr13/tr13-9.html and
 /// http://www.unicode.org/unicode/reports/tr18/#Line_Boundaries
 /// for Unicode conventions,
 /// including other separators (vertical tab and form feed).
 /// <br />
 /// We do not interpret the zero width joiner/non-joiner (U+200C,
 /// U+200D) as white spaces.
 /// <br />
 /// No longer %standalone.  See WhitespaceTokenizer for a main method.
 /// </remarks>
 public WhitespaceLexer(Reader r, ILexedTokenFactory<object> tf)
     : this(r)
 {
     /* user code: */
     this.tokenFactory = tf;
 }
Example #21
 public static ITokenizerFactory<T> Factory<T>(ILexedTokenFactory<T> factory)
     where T : IHasWord
 {
     return new SpanishTokenizer.SpanishTokenizerFactory<T>(factory, AncoraOptions);
 }