/// <summary>Works out which tokenizer the supplied properties are asking for.</summary>
/// <remarks>
/// The properties are consulted in this order of precedence:
/// <c>tokenize.whitespace</c> (when true), then <c>tokenize.class</c>, then
/// <c>tokenize.language</c> (read with a default of <c>"en"</c>).
/// Class and language names are upper-cased before being looked up.
/// </remarks>
/// <param name="props">Properties to find tokenizer options in</param>
/// <returns>An element of the TokenizerType enum indicating the tokenizer to use</returns>
/// <exception cref="System.ArgumentException">
/// Thrown when the lookup for <c>tokenize.class</c> or <c>tokenize.language</c> yields null.
/// </exception>
public static TokenizerAnnotator.TokenizerType GetTokenizerType(Properties props)
{
    // All three settings are read up front, in the same order as before.
    string className = props.GetProperty("tokenize.class", null);
    bool useWhitespace = bool.ValueOf(props.GetProperty("tokenize.whitespace", "false"));
    string languageName = props.GetProperty("tokenize.language", "en");

    // An explicit whitespace request overrides everything else.
    if (useWhitespace)
    {
        return TokenizerAnnotator.TokenizerType.Whitespace;
    }

    // Next preference: an explicitly named tokenizer class.
    // NOTE(review): if the map's indexer throws on a missing key, the null check
    // below is never reached for unknown names - confirm the map type.
    if (className != null)
    {
        TokenizerAnnotator.TokenizerType byClass = TokenizerAnnotator.TokenizerType.classToTokenizerMap[className.ToUpper()];
        if (byClass != null)
        {
            return byClass;
        }
        throw new ArgumentException("TokenizerAnnotator: unknown tokenize.class property " + className);
    }

    // Nothing usable was configured.
    if (languageName == null)
    {
        return TokenizerAnnotator.TokenizerType.Unspecified;
    }

    // Last preference: the language name.
    TokenizerAnnotator.TokenizerType byLanguage = TokenizerAnnotator.TokenizerType.nameToTokenizerMap[languageName.ToUpper()];
    if (byLanguage != null)
    {
        return byLanguage;
    }
    throw new ArgumentException("TokenizerAnnotator: unknown tokenize.language property " + languageName);
}
/// <summary>
/// Builds a tokenizer annotator from a property set, choosing between a segmenter
/// annotator and a tokenizer factory.
/// </summary>
/// <remarks>
/// When <c>tokenize.language</c> is set and <c>LanguageInfo.IsSegmenterLanguage</c> accepts it,
/// a segmenter annotator (Arabic or Chinese) is created; otherwise no segmenter is used.
/// In either case the tokenizer type is resolved from the properties and handed to
/// <c>InitFactory</c>.
/// </remarks>
/// <param name="verbose">
/// Value passed to <c>PropertiesUtils.GetBool</c> together with the <c>tokenize.verbose</c> key
/// (presumably the fallback used when the property is absent - confirm against PropertiesUtils).
/// </param>
/// <param name="props">Configuration properties; null is treated as an empty property set.</param>
/// <param name="options">Extra tokenizer options forwarded to <c>InitFactory</c>.</param>
/// <exception cref="System.ArgumentException">
/// Thrown when the configured language is reported as a segmenter language but is
/// neither Arabic nor Chinese.
/// </exception>
public TokenizerAnnotator(bool verbose, Properties props, string options)
{
    if (props == null)
    {
        props = new Properties();
    }

    // Check if segmenting must be done. The language property is read once and
    // parsed once instead of being looked up again for every comparison.
    string languageName = props.GetProperty("tokenize.language");
    if (languageName != null && LanguageInfo.IsSegmenterLanguage(languageName))
    {
        useSegmenter = true;
        var language = LanguageInfo.GetLanguageFromString(languageName);
        if (language == LanguageInfo.HumanLanguage.Arabic)
        {
            segmenterAnnotator = new ArabicSegmenterAnnotator("segment", props);
        }
        else if (language == LanguageInfo.HumanLanguage.Chinese)
        {
            segmenterAnnotator = new ChineseSegmenterAnnotator("segment", props);
        }
        else
        {
            // A specific exception type instead of bare System.Exception; callers that
            // catch Exception still catch this, and it matches the sibling methods.
            throw new ArgumentException("No segmenter implemented for: " + language);
        }
    }
    else
    {
        useSegmenter = false;
        segmenterAnnotator = null;
    }

    Verbose = PropertiesUtils.GetBool(props, "tokenize.verbose", verbose);
    TokenizerAnnotator.TokenizerType type = TokenizerAnnotator.TokenizerType.GetTokenizerType(props);
    factory = InitFactory(type, props, options);
}
/// <summary>
/// Creates the TokenizerFactory that matches the given tokenizer type and the
/// options found in the properties.
/// </summary>
/// <remarks>
/// When adding a new Tokenizer, modify TokenizerType.getTokenizerType() to retrieve
/// your tokenizer from the properties file, and then add a case to the switch
/// structure here to instantiate the new Tokenizer type.
/// Arabic and Chinese yield a null factory.
/// </remarks>
/// <param name="type">the TokenizerType</param>
/// <param name="props">the properties file</param>
/// <param name="extraOptions">extra things that should be passed into the tokenizer constructor</param>
/// <returns>The factory for the requested type, or null for Arabic and Chinese.</returns>
/// <exception cref="System.ArgumentException">Thrown when the type matches none of the handled cases.</exception>
private static ITokenizerFactory<CoreLabel> InitFactory(TokenizerAnnotator.TokenizerType type, Properties props, string extraOptions)
{
    // Start from the user-supplied options, falling back to the type's defaults.
    // TODO: maybe we should always apply getDefaultOptions() and expect the user to
    // turn off default options. That would require every option to have a negated
    // form, and currently some do not.
    string configured = props.GetProperty("tokenize.options", null);
    string options = configured ?? type.GetDefaultOptions();

    // Prepend the extra options, inserting a comma only when one is not already there.
    if (extraOptions != null)
    {
        string separator = extraOptions.EndsWith(",") ? string.Empty : ",";
        options = extraOptions + separator + options;
    }

    switch (type)
    {
        case TokenizerAnnotator.TokenizerType.Arabic:
        case TokenizerAnnotator.TokenizerType.Chinese:
        {
            // No tokenizer factory is built for these types.
            return null;
        }

        case TokenizerAnnotator.TokenizerType.Spanish:
        {
            return SpanishTokenizer.Factory(new CoreLabelTokenFactory(), options);
        }

        case TokenizerAnnotator.TokenizerType.French:
        {
            return FrenchTokenizer.Factory(new CoreLabelTokenFactory(), options);
        }

        case TokenizerAnnotator.TokenizerType.Whitespace:
        {
            // End-of-line is significant if either property says so; the second
            // property is only consulted when the first one is false.
            bool eolIsSignificant = bool.ValueOf(props.GetProperty(EolProperty, "false"))
                || bool.ValueOf(props.GetProperty(StanfordCoreNLP.NewlineSplitterProperty, "false"));
            return new WhitespaceTokenizer.WhitespaceTokenizerFactory<CoreLabel>(new CoreLabelTokenFactory(), eolIsSignificant);
        }

        case TokenizerAnnotator.TokenizerType.English:
        case TokenizerAnnotator.TokenizerType.German:
        {
            return PTBTokenizer.Factory(new CoreLabelTokenFactory(), options);
        }

        case TokenizerAnnotator.TokenizerType.Unspecified:
        {
            log.Info("No tokenizer type provided. Defaulting to PTBTokenizer.");
            return PTBTokenizer.Factory(new CoreLabelTokenFactory(), options);
        }

        default:
        {
            throw new ArgumentException("No valid tokenizer type provided.\n" + "Use -tokenize.language, -tokenize.class, or -tokenize.whitespace \n" + "to specify a tokenizer.");
        }
    }
}
/// <summary>
/// Builds a tokenizer annotator for the given tokenizer type by delegating to the
/// (bool, string) constructor overload with the type's string form.
/// </summary>
/// <param name="verbose">Forwarded unchanged to the delegated constructor.</param>
/// <param name="lang">Tokenizer type; its <c>ToString()</c> value is what gets forwarded.</param>
public TokenizerAnnotator(bool verbose, TokenizerAnnotator.TokenizerType lang)
    : this(verbose, lang.ToString())
{
}