/// <summary>
 /// Supplies the tokenizer factory to use for text that will be processed with this
 /// Treebank/Language pair.
 /// </summary>
 /// <remarks>
 /// This override delegates to <c>SpanishTokenizer.Factory</c>, handing it a new
 /// <c>CoreLabelTokenFactory</c> and the fixed option string
 /// <c>invertible,ptb3Escaping=true,splitAll=true</c>.
 /// The inherited contract describes a tokenizer that treats carriage returns as white
 /// space; whether that holds here depends on <c>SpanishTokenizer</c>, which is not
 /// visible from this method.
 /// </remarks>
 /// <returns>The factory produced by <c>SpanishTokenizer.Factory</c>.</returns>
 public override ITokenizerFactory<IHasWord> GetTokenizerFactory()
 {
     // Options are passed to the Spanish tokenizer verbatim.
     const string tokenizerOptions = "invertible,ptb3Escaping=true,splitAll=true";
     CoreLabelTokenFactory tokenFactory = new CoreLabelTokenFactory();

     return SpanishTokenizer.Factory(tokenFactory, tokenizerOptions);
 }
示例#2
0
        /// <summary>
        /// Returns the right kind of tokenizer factory for the given tokenizer type, configured
        /// from the options in the properties.
        /// </summary>
        /// <remarks>
        /// The option string is read from the <c>tokenize.options</c> property; when that property
        /// is absent, <c>type.GetDefaultOptions()</c> is used instead. A non-null
        /// <paramref name="extraOptions"/> is prepended to the option string, separated by exactly
        /// one comma. The Whitespace tokenizer does not receive the option string; it is configured
        /// only from the end-of-line properties.
        /// When adding a new tokenizer, modify TokenizerType.getTokenizerType() to retrieve
        /// your tokenizer from the properties file, and then add a case in the switch structure here
        /// to instantiate the new tokenizer type.
        /// </remarks>
        /// <param name="type">the TokenizerType</param>
        /// <param name="props">the properties file</param>
        /// <param name="extraOptions">
        /// extra things that should be passed into the tokenizer constructor; may be null
        /// </param>
        /// <returns>
        /// The tokenizer factory for <paramref name="type"/>, or <c>null</c> for the Arabic and
        /// Chinese types, for which no factory is created here.
        /// </returns>
        /// <exception cref="System.ArgumentException">
        /// Thrown when <paramref name="type"/> is not one of the types handled by the switch.
        /// </exception>
        private static ITokenizerFactory<CoreLabel> InitFactory(TokenizerAnnotator.TokenizerType type, Properties props, string extraOptions)
        {
            // set it to the equivalent of both extraOptions and options
            // TODO: maybe we should always have getDefaultOptions() and
            // expect the user to turn off default options.  That would
            // require all options to have negated options, but
            // currently there are some which don't have that
            string options = props.GetProperty("tokenize.options", null) ?? type.GetDefaultOptions();

            if (extraOptions != null)
            {
                // Ordinal comparison: the separator check is about the literal ',' character and
                // must not vary with the current culture.
                string separator = extraOptions.EndsWith(",", StringComparison.Ordinal) ? string.Empty : ",";
                options = extraOptions + separator + options;
            }

            switch (type)
            {
                case TokenizerAnnotator.TokenizerType.Arabic:
                case TokenizerAnnotator.TokenizerType.Chinese:
                {
                    // No factory is built here for these types.
                    return null;
                }

                case TokenizerAnnotator.TokenizerType.Spanish:
                {
                    return SpanishTokenizer.Factory(new CoreLabelTokenFactory(), options);
                }

                case TokenizerAnnotator.TokenizerType.French:
                {
                    return FrenchTokenizer.Factory(new CoreLabelTokenFactory(), options);
                }

                case TokenizerAnnotator.TokenizerType.Whitespace:
                {
                    // bool.ValueOf is not a .NET API (it mirrors Java's Boolean.valueOf). A property
                    // counts as set only when its value is "true", ignoring case; any other value,
                    // including null, is treated as false and never throws.
                    string eolSetting = props.GetProperty(EolProperty, "false");
                    bool eolIsSignificant = string.Equals(eolSetting, "true", StringComparison.OrdinalIgnoreCase);
                    if (!eolIsSignificant)
                    {
                        // The newline-splitter property is consulted only when the EOL property is not set.
                        string splitterSetting = props.GetProperty(StanfordCoreNLP.NewlineSplitterProperty, "false");
                        eolIsSignificant = string.Equals(splitterSetting, "true", StringComparison.OrdinalIgnoreCase);
                    }
                    return new WhitespaceTokenizer.WhitespaceTokenizerFactory<CoreLabel>(new CoreLabelTokenFactory(), eolIsSignificant);
                }

                case TokenizerAnnotator.TokenizerType.English:
                case TokenizerAnnotator.TokenizerType.German:
                {
                    return PTBTokenizer.Factory(new CoreLabelTokenFactory(), options);
                }

                case TokenizerAnnotator.TokenizerType.Unspecified:
                {
                    log.Info("No tokenizer type provided. Defaulting to PTBTokenizer.");
                    return PTBTokenizer.Factory(new CoreLabelTokenFactory(), options);
                }

                default:
                {
                    throw new ArgumentException("No valid tokenizer type provided.\n" + "Use -tokenize.language, -tokenize.class, or -tokenize.whitespace \n" + "to specify a tokenizer.");
                }
            }
        }