/// <summary>
/// Return a tokenizer factory which might be suitable for tokenizing text that will be used
/// with this Treebank/Language pair.
/// </summary>
/// <remarks>
/// This override delegates to <c>SpanishTokenizer.Factory</c>, handing it a newly created
/// <c>CoreLabelTokenFactory</c> and the fixed option string
/// <c>invertible,ptb3Escaping=true,splitAll=true</c>.
/// NOTE(review): the inherited description speaks of not tokenizing carriage returns
/// (i.e., treating them as white space) and of the AbstractTreebankLanguagePack default
/// returning a factory for
/// <see cref="Edu.Stanford.Nlp.Process.WhitespaceTokenizer{T}"/>
/// ; neither is visible in this override, so confirm against the base class and the
/// Spanish tokenizer before relying on it.
/// </remarks>
/// <returns>The tokenizer factory produced by <c>SpanishTokenizer.Factory</c></returns>
public override ITokenizerFactory<IHasWord> GetTokenizerFactory()
{
    CoreLabelTokenFactory tokenFactory = new CoreLabelTokenFactory();
    const string tokenizerOptions = "invertible,ptb3Escaping=true,splitAll=true";
    return SpanishTokenizer.Factory(tokenFactory, tokenizerOptions);
}
/// <summary>
/// initFactory returns the right type of TokenizerFactory based on the options in the properties file
/// and the type.
/// </summary>
/// <remarks>
/// The option string is read from the <c>tokenize.options</c> property; when that property is
/// absent, <c>type.GetDefaultOptions()</c> is used instead. A non-null
/// <paramref name="extraOptions"/> is then prepended, with a comma inserted between the two
/// parts unless <paramref name="extraOptions"/> already ends with one.
/// When adding a new Tokenizer, modify TokenizerType.getTokenizerType() to retrieve
/// your tokenizer from the properties file, and then add a case in the switch structure here to
/// instantiate the new Tokenizer type.
/// </remarks>
/// <param name="type">the TokenizerType</param>
/// <param name="props">the properties file</param>
/// <param name="extraOptions">extra things that should be passed into the tokenizer constructor; may be null</param>
/// <returns>
/// The factory for the requested type, or <c>null</c> for the Arabic and Chinese types.
/// </returns>
/// <exception cref="System.ArgumentException">
/// Thrown when <paramref name="type"/> is not one of the values handled by the switch.
/// </exception>
private static ITokenizerFactory<CoreLabel> InitFactory(TokenizerAnnotator.TokenizerType type, Properties props, string extraOptions)
{
    string options = props.GetProperty("tokenize.options", null);

    // set it to the equivalent of both extraOptions and options
    // TODO: maybe we should always have getDefaultOptions() and
    // expect the user to turn off default options. That would
    // require all options to have negated options, but
    // currently there are some which don't have that
    if (options == null)
    {
        options = type.GetDefaultOptions();
    }

    if (extraOptions != null)
    {
        // The option string is machine-readable, so the trailing-comma check is ordinal
        // rather than culture-sensitive.
        string separator = extraOptions.EndsWith(",", System.StringComparison.Ordinal) ? string.Empty : ",";
        options = extraOptions + separator + options;
    }

    switch (type)
    {
        case TokenizerAnnotator.TokenizerType.Arabic:
        case TokenizerAnnotator.TokenizerType.Chinese:
        {
            // No factory is built for these types; the caller receives null.
            // NOTE(review): presumably tokenization for them is handled elsewhere - confirm.
            return null;
        }

        case TokenizerAnnotator.TokenizerType.Spanish:
        {
            return SpanishTokenizer.Factory(new CoreLabelTokenFactory(), options);
        }

        case TokenizerAnnotator.TokenizerType.French:
        {
            return FrenchTokenizer.Factory(new CoreLabelTokenFactory(), options);
        }

        case TokenizerAnnotator.TokenizerType.Whitespace:
        {
            // A property counts as true only when its value is "true" ignoring case (the
            // semantics of Java's Boolean.valueOf(String)); anything else, including null,
            // is false. The second property is only consulted when the first is not true.
            bool eolIsSignificant =
                string.Equals(props.GetProperty(EolProperty, "false"), "true", System.StringComparison.OrdinalIgnoreCase)
                || string.Equals(props.GetProperty(StanfordCoreNLP.NewlineSplitterProperty, "false"), "true", System.StringComparison.OrdinalIgnoreCase);
            return new WhitespaceTokenizer.WhitespaceTokenizerFactory<CoreLabel>(new CoreLabelTokenFactory(), eolIsSignificant);
        }

        case TokenizerAnnotator.TokenizerType.English:
        case TokenizerAnnotator.TokenizerType.German:
        {
            return PTBTokenizer.Factory(new CoreLabelTokenFactory(), options);
        }

        case TokenizerAnnotator.TokenizerType.Unspecified:
        {
            log.Info("No tokenizer type provided. Defaulting to PTBTokenizer.");
            return PTBTokenizer.Factory(new CoreLabelTokenFactory(), options);
        }

        default:
        {
            throw new ArgumentException("No valid tokenizer type provided.\n" + "Use -tokenize.language, -tokenize.class, or -tokenize.whitespace \n" + "to specify a tokenizer.");
        }
    }
}