예제 #1
0
            /// <summary>Get TokenizerType based on what's in the properties.</summary>
            /// <remarks>
            /// Precedence, highest first: <c>tokenize.whitespace</c> (enabled only when its value is
            /// "true", ignoring case), then <c>tokenize.class</c>, then <c>tokenize.language</c>
            /// (which defaults to "en"). Class and language names are upper-cased with the invariant
            /// culture before being looked up, so the lookup does not depend on the current locale.
            /// </remarks>
            /// <param name="props">Properties to find tokenizer options in</param>
            /// <returns>An element of the TokenizerType enum indicating the tokenizer to use</returns>
            /// <exception cref="System.ArgumentException">
            /// The value of <c>tokenize.class</c> or <c>tokenize.language</c> has no entry in the
            /// corresponding lookup map.
            /// </exception>
            public static TokenizerAnnotator.TokenizerType GetTokenizerType(Properties props)
            {
                string tokClass = props.GetProperty("tokenize.class", null);

                // Anything other than "true" (case-insensitive) counts as false; a malformed value never throws.
                bool whitespace = string.Equals(props.GetProperty("tokenize.whitespace", "false"), "true", StringComparison.OrdinalIgnoreCase);
                string language = props.GetProperty("tokenize.language", "en");

                if (whitespace)
                {
                    return TokenizerAnnotator.TokenizerType.Whitespace;
                }

                if (tokClass != null)
                {
                    // NOTE(review): assumes classToTokenizerMap is an IDictionary-style map exposing TryGetValue - confirm.
                    // TryGetValue is used because a plain indexer lookup throws KeyNotFoundException on a miss,
                    // which would bypass the descriptive ArgumentException below.
                    TokenizerAnnotator.TokenizerType byClass;
                    bool classKnown = TokenizerAnnotator.TokenizerType.classToTokenizerMap.TryGetValue(tokClass.ToUpperInvariant(), out byClass);
                    if (!classKnown || byClass == null)
                    {
                        throw new ArgumentException("TokenizerAnnotator: unknown tokenize.class property " + tokClass);
                    }
                    return byClass;
                }

                if (language != null)
                {
                    // NOTE(review): same assumption about nameToTokenizerMap as for classToTokenizerMap above.
                    TokenizerAnnotator.TokenizerType byLanguage;
                    bool languageKnown = TokenizerAnnotator.TokenizerType.nameToTokenizerMap.TryGetValue(language.ToUpperInvariant(), out byLanguage);
                    if (!languageKnown || byLanguage == null)
                    {
                        throw new ArgumentException("TokenizerAnnotator: unknown tokenize.language property " + language);
                    }
                    return byLanguage;
                }

                return TokenizerAnnotator.TokenizerType.Unspecified;
            }
예제 #2
0
 /// <summary>
 /// Builds a TokenizerAnnotator from a property set, choosing between a segmenter annotator
 /// and a tokenizer factory based on the <c>tokenize.language</c> property.
 /// </summary>
 /// <param name="verbose">Default for the <c>tokenize.verbose</c> property when it is not set.</param>
 /// <param name="props">Configuration properties; an empty set is used when null.</param>
 /// <param name="options">Extra tokenizer options passed on to <c>InitFactory</c>.</param>
 /// <exception cref="System.Exception">
 /// The language is reported as a segmenter language but is neither Arabic nor Chinese.
 /// </exception>
 public TokenizerAnnotator(bool verbose, Properties props, string options)
 {
     props = props ?? new Properties();

     // Decide whether segmenting must be done: only when a language is set and it is a segmenter language.
     string languageName = props.GetProperty("tokenize.language");
     bool needsSegmenter = languageName != null && LanguageInfo.IsSegmenterLanguage(languageName);

     if (!needsSegmenter)
     {
         useSegmenter       = false;
         segmenterAnnotator = null;
     }
     else
     {
         useSegmenter = true;
         var segmenterLanguage = LanguageInfo.GetLanguageFromString(languageName);

         if (segmenterLanguage == LanguageInfo.HumanLanguage.Arabic)
         {
             segmenterAnnotator = new ArabicSegmenterAnnotator("segment", props);
         }
         else if (segmenterLanguage == LanguageInfo.HumanLanguage.Chinese)
         {
             segmenterAnnotator = new ChineseSegmenterAnnotator("segment", props);
         }
         else
         {
             segmenterAnnotator = null;
             throw new Exception("No segmenter implemented for: " + segmenterLanguage);
         }
     }

     Verbose = PropertiesUtils.GetBool(props, "tokenize.verbose", verbose);

     TokenizerAnnotator.TokenizerType tokenizerType = TokenizerAnnotator.TokenizerType.GetTokenizerType(props);
     factory = InitFactory(tokenizerType, props, options);
 }
예제 #3
0
        /// <summary>
        /// initFactory returns the right type of TokenizerFactory based on the options in the properties file
        /// and the type.
        /// </summary>
        /// <remarks>
        /// When adding a new Tokenizer, modify TokenizerType.getTokenizerType() to retrieve
        /// your tokenizer from the properties file, and then add a case to the switch structure here to
        /// instantiate the new Tokenizer type.
        /// <para>
        /// The option string is the <c>tokenize.options</c> property, or the type's default options when
        /// that property is absent; <paramref name="extraOptions"/>, if given, is prepended to it with a
        /// single separating comma.
        /// </para>
        /// </remarks>
        /// <param name="type">the TokenizerType</param>
        /// <param name="props">the properties file</param>
        /// <param name="extraOptions">extra things that should be passed into the tokenizer constructor</param>
        /// <returns>
        /// The tokenizer factory for <paramref name="type"/>, or null for the Arabic and Chinese types,
        /// for which no factory is created here.
        /// </returns>
        /// <exception cref="System.ArgumentException">The type matches none of the handled tokenizer types.</exception>
        private static ITokenizerFactory <CoreLabel> InitFactory(TokenizerAnnotator.TokenizerType type, Properties props, string extraOptions)
        {
            // set it to the equivalent of both extraOptions and options
            // TODO: maybe we should always have getDefaultOptions() and
            // expect the user to turn off default options.  That would
            // require all options to have negated options, but
            // currently there are some which don't have that
            string options = props.GetProperty("tokenize.options", null);
            if (options == null)
            {
                options = type.GetDefaultOptions();
            }

            if (extraOptions != null)
            {
                // Ordinal comparison: the separator is a literal ',' and must not be matched culture-sensitively.
                bool alreadySeparated = extraOptions.EndsWith(",", StringComparison.Ordinal);
                options = alreadySeparated ? extraOptions + options : extraOptions + ',' + options;
            }

            switch (type)
            {
            case TokenizerAnnotator.TokenizerType.Arabic:
            case TokenizerAnnotator.TokenizerType.Chinese:
            {
                // NOTE(review): presumably these languages are handled by a segmenter annotator instead - confirm.
                return null;
            }

            case TokenizerAnnotator.TokenizerType.Spanish:
            {
                return SpanishTokenizer.Factory(new CoreLabelTokenFactory(), options);
            }

            case TokenizerAnnotator.TokenizerType.French:
            {
                return FrenchTokenizer.Factory(new CoreLabelTokenFactory(), options);
            }

            case TokenizerAnnotator.TokenizerType.Whitespace:
            {
                // A property enables the flag only when its value is "true" (case-insensitive);
                // any other value, including malformed ones, is treated as false rather than throwing.
                bool eolIsSignificant =
                    string.Equals(props.GetProperty(EolProperty, "false"), "true", StringComparison.OrdinalIgnoreCase) ||
                    string.Equals(props.GetProperty(StanfordCoreNLP.NewlineSplitterProperty, "false"), "true", StringComparison.OrdinalIgnoreCase);
                return new WhitespaceTokenizer.WhitespaceTokenizerFactory <CoreLabel>(new CoreLabelTokenFactory(), eolIsSignificant);
            }

            case TokenizerAnnotator.TokenizerType.English:
            case TokenizerAnnotator.TokenizerType.German:
            {
                return PTBTokenizer.Factory(new CoreLabelTokenFactory(), options);
            }

            case TokenizerAnnotator.TokenizerType.Unspecified:
            {
                log.Info("No tokenizer type provided. Defaulting to PTBTokenizer.");
                return PTBTokenizer.Factory(new CoreLabelTokenFactory(), options);
            }

            default:
            {
                throw new ArgumentException("No valid tokenizer type provided.\n" + "Use -tokenize.language, -tokenize.class, or -tokenize.whitespace \n" + "to specify a tokenizer.");
            }
            }
        }
예제 #4
0
 /// <summary>
 /// Creates a TokenizerAnnotator for the given tokenizer type by delegating to the
 /// (bool, string) constructor overload, passing <c>lang.ToString()</c> as the string argument.
 /// </summary>
 /// <param name="verbose">Forwarded unchanged to the delegated constructor.</param>
 /// <param name="lang">
 /// Tokenizer type whose string form is forwarded.
 /// NOTE(review): presumably the delegated overload interprets the string as a language name - confirm.
 /// </param>
 public TokenizerAnnotator(bool verbose, TokenizerAnnotator.TokenizerType lang)
     : this(verbose, lang.ToString())
 {
 }