/// <summary>
/// Creates a factory whose newline handling is read from an options string.
/// </summary>
/// <param name="factory">Token factory stored on this tokenizer factory.</param>
/// <param name="options">
/// Options text parsed with <c>StringUtils.StringToProperties</c>; only the
/// <c>tokenizeNLs</c> key is consulted here, defaulting to <c>false</c>.
/// </param>
public WhitespaceTokenizerFactory(ILexedTokenFactory <T> factory, string options)
{
    this.factory = factory;

    Properties parsedOptions = StringUtils.StringToProperties(options);
    this.tokenizeNLs = PropertiesUtils.GetBool(parsedOptions, "tokenizeNLs", false);
}
/// <summary>
/// Historical constructor: translates the boolean switches into the
/// comma-separated options string stored on this factory.
/// </summary>
/// <param name="tokenizeNLs">When true, <c>,tokenizeNLs</c> is appended to the options.</param>
/// <param name="invertible">When true, <c>,invertible</c> is appended to the options.</param>
/// <param name="suppressEscaping">
/// Selects <c>ptb3Escaping=false</c> when true and <c>ptb3Escaping=true</c> otherwise.
/// </param>
/// <param name="factory">Token factory stored on this tokenizer factory.</param>
private PTBTokenizerFactory(bool tokenizeNLs, bool invertible, bool suppressEscaping, ILexedTokenFactory <T> factory)
{
    this.factory = factory;

    // ptb3Escaping=true means: turn on all the historical PTB normalizations.
    StringBuilder optionText = new StringBuilder(suppressEscaping ? "ptb3Escaping=false" : "ptb3Escaping=true");
    if (tokenizeNLs)
    {
        optionText.Append(",tokenizeNLs");
    }
    if (invertible)
    {
        optionText.Append(",invertible");
    }

    this.options = optionText.ToString();
}
/// <summary>Constructs a new WhitespaceTokenizer.</summary>
/// <param name="factory">Token factory handed to the underlying <c>WhitespaceLexer</c>.</param>
/// <param name="r">The Reader that is its source; may be null, in which case no lexer is created.</param>
/// <param name="eolIsSignificant">Whether eol tokens should be returned.</param>
public WhitespaceTokenizer(ILexedTokenFactory factory, Reader r, bool eolIsSignificant)
{
    this.eolIsSignificant = eolIsSignificant;

    // The null check is perhaps currently needed for LexicalizedParser, which
    // passes in a null reader while doing type-checking for sentence escaping.
    // It arguably should not live here.
    if (r == null)
    {
        return;
    }

    lexer = new WhitespaceLexer(r, factory);
}
/// <summary>
/// Builds a FrenchTokenizer on top of a <c>FrenchLexer</c> and records the
/// splitting switches.
/// </summary>
/// <param name="r">Reader passed to the lexer.</param>
/// <param name="tf">Token factory passed to the lexer.</param>
/// <param name="lexerProperties">Properties passed to the lexer.</param>
/// <param name="splitCompounds">Stored as the compound-splitting switch.</param>
/// <param name="splitContractions">Stored as the contraction-splitting switch.</param>
public FrenchTokenizer(Reader r, ILexedTokenFactory <T> tf, Properties lexerProperties, bool splitCompounds, bool splitContractions)
{
    // The underlying JFlex lexer.
    lexer = new FrenchLexer(r, tf, lexerProperties);

    this.splitCompounds = splitCompounds;
    this.splitContractions = splitContractions;

    // The buffer is only allocated when at least one kind of splitting is on.
    bool needsBuffer = splitCompounds || splitContractions;
    if (needsBuffer)
    {
        compoundBuffer = Generics.NewLinkedList();
    }
}
/// <summary>
/// Builds a SpanishTokenizer on top of a <c>SpanishLexer</c> and records the
/// splitting switches.
/// </summary>
/// <param name="r">Reader passed to the lexer.</param>
/// <param name="tf">Token factory passed to the lexer.</param>
/// <param name="lexerProperties">Properties passed to the lexer.</param>
/// <param name="splitCompounds">Stored as the compound-splitting switch.</param>
/// <param name="splitVerbs">
/// Stored as the verb-splitting switch; when true a <c>SpanishVerbStripper</c>
/// instance is also obtained.
/// </param>
/// <param name="splitContractions">Stored as the contraction-splitting switch.</param>
public SpanishTokenizer(Reader r, ILexedTokenFactory <T> tf, Properties lexerProperties, bool splitCompounds, bool splitVerbs, bool splitContractions)
{
    // The underlying JFlex lexer.
    lexer = new SpanishLexer(r, tf, lexerProperties);

    this.splitCompounds = splitCompounds;
    this.splitVerbs = splitVerbs;
    this.splitContractions = splitContractions;

    // True when any of the three splitting modes is enabled.
    bool anySplitting = splitCompounds || splitVerbs || splitContractions;
    this.splitAny = anySplitting;

    if (anySplitting)
    {
        compoundBuffer = Generics.NewArrayList(4);
    }

    if (splitVerbs)
    {
        verbStripper = SpanishVerbStripper.GetInstance();
    }
}
/// <summary>Creates a factory for ArabicTokenizers that keeps the supplied token factory.</summary>
/// <param name="factory">Token factory stored on this tokenizer factory.</param>
private ArabicTokenizerFactory(ILexedTokenFactory <T> factory)
{
    this.factory = factory;
}
/// <summary>Builds an ArabicTokenizer backed by a new <c>ArabicLexer</c>.</summary>
/// <param name="r">Reader passed to the lexer.</param>
/// <param name="tf">Token factory passed to the lexer.</param>
/// <param name="lexerProperties">Properties passed to the lexer.</param>
public ArabicTokenizer(Reader r, ILexedTokenFactory <T> tf, Properties lexerProperties)
{
    lexer = new ArabicLexer(r, tf, lexerProperties);
}
/// <summary>Creates a factory with an explicit newline-tokenization switch.</summary>
/// <param name="factory">Token factory stored on this tokenizer factory.</param>
/// <param name="tokenizeNLs">Value stored in the <c>tokenizeNLs</c> field.</param>
public WhitespaceTokenizerFactory(ILexedTokenFactory <T> factory, bool tokenizeNLs)
{
    this.tokenizeNLs = tokenizeNLs;
    this.factory = factory;
}
/// <summary>Make a factory for SpanishTokenizers, default options.</summary>
/// <param name="factory">Token factory stored on this tokenizer factory.</param>
private SpanishTokenizerFactory(ILexedTokenFactory <T> factory)
{
    // No options are applied here; only the token factory is recorded.
    this.factory = factory;
}
/// <summary>
/// Creates a factory with newline tokenization turned off, by delegating to
/// the (factory, bool) constructor.
/// </summary>
/// <param name="factory">Token factory forwarded to the two-argument constructor.</param>
public WhitespaceTokenizerFactory(ILexedTokenFactory <T> factory)
    : this(factory, tokenizeNLs: false)
{
}
/// <summary>Make a factory for SpanishTokenizers, options passed in.</summary>
/// <param name="factory">Token factory stored on this tokenizer factory.</param>
/// <param name="options">Options string handed to <c>SetOptions</c>.</param>
private SpanishTokenizerFactory(ILexedTokenFactory <T> factory, string options)
    : this(factory)
{
    // The single-argument constructor records the token factory; only the
    // options remain to be applied here.
    SetOptions(options);
}
/// <summary>
/// Creates a SpanishTokenizerFactory that produces T tokens and uses the
/// options passed in.
/// </summary>
/// <param name="factory">A factory for the token type that the tokenizer will return.</param>
/// <param name="options">A String of options, separated by commas.</param>
/// <returns>A TokenizerFactory that returns the right token types.</returns>
public static SpanishTokenizer.SpanishTokenizerFactory <T> NewSpanishTokenizerFactory <T>(ILexedTokenFactory <T> factory, string options) where T : IHasWord
{
    SpanishTokenizer.SpanishTokenizerFactory<T> tokenizerFactory =
        new SpanishTokenizer.SpanishTokenizerFactory<T>(factory, options);
    return tokenizerFactory;
}
/// <summary>
/// Creates a <c>FrenchTokenizerFactory</c> for the given token factory and
/// options, exposed through the <c>ITokenizerFactory</c> interface.
/// </summary>
/// <param name="factory">Token factory forwarded to the FrenchTokenizerFactory constructor.</param>
/// <param name="options">Options string forwarded to the FrenchTokenizerFactory constructor.</param>
/// <returns>The newly created tokenizer factory.</returns>
public static ITokenizerFactory <T> Factory <T>(ILexedTokenFactory <T> factory, string options) where T : IHasWord
{
    ITokenizerFactory<T> tokenizerFactory =
        new FrenchTokenizer.FrenchTokenizerFactory<T>(factory, options);
    return tokenizerFactory;
}
/// <summary>Creates a factory for FrenchTokenizers that keeps the supplied token factory.</summary>
/// <param name="factory">Token factory stored on this tokenizer factory.</param>
private FrenchTokenizerFactory(ILexedTokenFactory <T> factory)
{
    this.factory = factory;
}
/// <summary>
/// Creates a PTBTokenizerFactory that uses the LexedTokenFactory and options
/// passed in.
/// </summary>
/// <param name="tokenFactory">The LexedTokenFactory.</param>
/// <param name="options">A String of options.</param>
/// <returns>
/// A TokenizerFactory that returns objects of the type of the
/// LexedTokenFactory.
/// </returns>
public static PTBTokenizer.PTBTokenizerFactory <T> NewPTBTokenizerFactory <T>(ILexedTokenFactory <T> tokenFactory, string options) where T : IHasWord
{
    PTBTokenizer.PTBTokenizerFactory<T> tokenizerFactory =
        new PTBTokenizer.PTBTokenizerFactory<T>(tokenFactory, options);
    return tokenizerFactory;
}
/// <summary>Constructs a new PTBTokenizer with a custom LexedTokenFactory.</summary>
/// <remarks>
/// Tokenization behavior and what is returned are controlled through the
/// options String; see the class documentation for the available options.
/// This is the recommended constructor.
/// </remarks>
/// <param name="r">The Reader to read tokens from.</param>
/// <param name="tokenFactory">The LexedTokenFactory to use to create tokens from the text.</param>
/// <param name="options">
/// Options to the lexer. The String may be null or empty, which means that
/// all traditional PTB normalizations are done. Passing
/// "ptb3Escaping=false" turns the normalizations off (the behavior of the
/// old suppressEscaping=true option).
/// </param>
public PTBTokenizer(Reader r, ILexedTokenFactory <T> tokenFactory, string options)
{
    // The options string is handed to the lexer unchanged.
    lexer = new PTBLexer(r, tokenFactory, options);
}
/// <summary>
/// Constructs a new PTBTokenizer that optionally returns carriage returns
/// as their own token, and has a custom LexedTokenFactory.
/// </summary>
/// <remarks>
/// This constructor translates between the traditional boolean options of
/// PTBTokenizer and the new options String. If asked for, CRs come back as
/// Words whose text is the value of <c>PTBLexer.cr</c>.
/// </remarks>
/// <param name="r">The Reader to read tokens from.</param>
/// <param name="tokenizeNLs">
/// Whether to return newlines as separate tokens (otherwise they normally
/// disappear as whitespace).
/// </param>
/// <param name="invertible">
/// If set to true, then will produce CoreLabels which will have fields for
/// the string before and after, and the character offsets.
/// </param>
/// <param name="suppressEscaping">
/// If true, all the traditional Penn Treebank normalizations are turned
/// off. Otherwise, they all happen.
/// </param>
/// <param name="tokenFactory">The LexedTokenFactory to use to create tokens from the text.</param>
private PTBTokenizer(Reader r, bool tokenizeNLs, bool invertible, bool suppressEscaping, ILexedTokenFactory <T> tokenFactory)
{
    // ptb3Escaping=true means: turn on all the historical PTB normalizations.
    string optionText = suppressEscaping ? "ptb3Escaping=false" : "ptb3Escaping=true";
    if (tokenizeNLs)
    {
        optionText += ",tokenizeNLs";
    }
    if (invertible)
    {
        optionText += ",invertible";
    }

    lexer = new PTBLexer(r, tokenFactory, optionText);
}
/// <summary>
/// Creates a factory for FrenchTokenizers and applies the given options.
/// </summary>
/// <param name="factory">Token factory forwarded to the single-argument constructor.</param>
/// <param name="options">Options string handed to <c>SetOptions</c>.</param>
private FrenchTokenizerFactory(ILexedTokenFactory <T> factory, string options)
    : this(factory)
{
    SetOptions(options);
}
/// <summary>Make a factory for PTBTokenizers.</summary>
/// <param name="tokenFactory">A factory for the token type that the tokenizer will return.</param>
/// <param name="options">Options to the tokenizer (see the class documentation for details).</param>
private PTBTokenizerFactory(ILexedTokenFactory <T> tokenFactory, string options)
{
    // Both values are stored as given; the options string is not parsed here.
    this.options = options;
    this.factory = tokenFactory;
}
/// <summary>
/// Creates a WhitespaceLexer over the given reader and records the token
/// factory it should use.
/// </summary>
/// <remarks>
/// Notes carried over from the lexer specification:
/// see http://www.w3.org/TR/newline on Web newline chars (NEL, LS, PS), and
/// http://unicode.org/reports/tr13/tr13-9.html and
/// http://www.unicode.org/unicode/reports/tr18/#Line_Boundaries
/// for Unicode conventions, including other separators (vertical tab and
/// form feed).
/// <br />
/// The zero width joiner/non-joiner (U+200C, U+200D) are not interpreted as
/// white spaces.
/// <br />
/// No longer %standalone. See WhitespaceTokenizer for a main method.
/// </remarks>
/// <param name="r">Reader forwarded to the single-argument constructor.</param>
/// <param name="tf">Token factory stored in the <c>tokenFactory</c> field.</param>
public WhitespaceLexer(Reader r, ILexedTokenFactory <object> tf)
    : this(r)
{
    // User code from the lexer specification: remember the token factory.
    this.tokenFactory = tf;
}
/// <summary>
/// Creates a <c>SpanishTokenizerFactory</c> for the given token factory,
/// configured with <c>AncoraOptions</c>.
/// </summary>
/// <param name="factory">Token factory forwarded to the SpanishTokenizerFactory constructor.</param>
/// <returns>The newly created tokenizer factory.</returns>
public static ITokenizerFactory <T> Factory <T>(ILexedTokenFactory <T> factory) where T : IHasWord
{
    ITokenizerFactory<T> tokenizerFactory =
        new SpanishTokenizer.SpanishTokenizerFactory<T>(factory, AncoraOptions);
    return tokenizerFactory;
}