Example No. 1
        public void RunSimpleTraining()
        {
            IBMModel1.Helpers.IBMModel1 model = new IBMModel1.Helpers.IBMModel1();

            // Parallel training corpus; in a real run this would be populated with
            // French/English sentence pairs before Train is called.
            List<SimpleTrainingRecord> training_corpus = new List<SimpleTrainingRecord>();

            EnglishTokenizer tokenizer_output = new EnglishTokenizer();
            FrenchTokenizer  tokenizer_input  = new FrenchTokenizer();


            SimpleTrainingMethod.Train(model, training_corpus);


            string sentence_input  = "la maison bleue";
            string sentence_output = "the blue house";

            string[] input_lang  = tokenizer_input.Tokenize(sentence_input);
            string[] output_lang = tokenizer_output.Tokenize(sentence_output);
            int[]    alignment   = model.GetAlignment(input_lang, output_lang);
            Dictionary<int, string> output_mapping = new Dictionary<int, string>();
            int m_input_len = input_lang.Length;

            // Map each input (French) position to the output (English) word it is aligned to.
            // Assumes GetAlignment returns zero-based indices into output_lang; subtract one
            // instead if index 0 denotes the IBM Model 1 NULL word.
            for (int j = 0; j < m_input_len; ++j)
            {
                int a_j = alignment[j];
                output_mapping[a_j] = output_lang[a_j];
            }
            List<int> output_sentence_index_list = output_mapping.Keys.ToList();

            output_sentence_index_list.Sort();

            string[] predicted_output_lang = new string[output_sentence_index_list.Count];
            for (int i = 0; i < predicted_output_lang.Length; ++i)
            {
                predicted_output_lang[i] = output_mapping[output_sentence_index_list[i]];
            }

            Console.WriteLine("Original French Sentence: {0}", sentence_input);
            Console.WriteLine("Predicted English Translation: {0}", string.Join(" ", predicted_output_lang));
            Console.ReadLine();
        }
Example No. 2
 //wsg2011: The distributed treebank is encoded in ISO8859_1, but
 //FrenchTreebankParserParams is currently configured to read UTF-8,
 //PTB-style trees that have been extracted from the XML files.
 //The raw treebank uses "PONCT". Change to LDC convention.
 public override ITokenizerFactory<IHasWord> GetTokenizerFactory()
 {
     return FrenchTokenizer.FtbFactory();
 }
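
For orientation, here is a minimal usage sketch for the factory returned above. It assumes the C# port mirrors the Java TokenizerFactory API, so the GetTokenizer(reader) call, the enumerator-style iteration, and the Word() accessor are assumptions rather than calls verified against this port, and the sample sentence is arbitrary.

     // Hypothetical usage sketch; the member names below mirror the Java Stanford
     // CoreNLP API and are assumptions about this C# port, not verified calls.
     ITokenizerFactory<IHasWord> factory = FrenchTokenizer.FtbFactory();
     var tokenizer = factory.GetTokenizer(new StringReader("Le chat dort sur le canapé."));
     while (tokenizer.MoveNext())      // assumed IEnumerator-style iteration
     {
         Console.WriteLine(tokenizer.Current.Word());
     }
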
Example No. 3
        /// <summary>
        /// initFactory returns the right type of TokenizerFactory based on the options in the properties file
        /// and the type.
        /// </summary>
        /// <remarks>
        /// initFactory returns the right type of TokenizerFactory based on the options in the properties file
        /// and the type. When adding a new Tokenizer, modify TokenizerType.getTokenizerType() to retrieve
        /// your tokenizer from the properties file, and then add a case to the switch structure here to
        /// instantiate the new Tokenizer type (see the usage sketch after this method).
        /// </remarks>
        /// <param name="type">the TokenizerType</param>
        /// <param name="props">the properties file</param>
        /// <param name="extraOptions">extra things that should be passed into the tokenizer constructor</param>
        /// <exception cref="System.ArgumentException"/>
        private static ITokenizerFactory<CoreLabel> InitFactory(TokenizerAnnotator.TokenizerType type, Properties props, string extraOptions)
        {
            ITokenizerFactory<CoreLabel> factory;
            string options = props.GetProperty("tokenize.options", null);

            // set it to the equivalent of both extraOptions and options
            // TODO: maybe we should always have getDefaultOptions() and
            // expect the user to turn off default options.  That would
            // require all options to have negated options, but
            // currently there are some which don't have that
            if (options == null)
            {
                options = type.GetDefaultOptions();
            }
            if (extraOptions != null)
            {
                if (extraOptions.EndsWith(","))
                {
                    options = extraOptions + options;
                }
                else
                {
                    options = extraOptions + ',' + options;
                }
            }
            switch (type)
            {
            case TokenizerAnnotator.TokenizerType.Arabic:
            case TokenizerAnnotator.TokenizerType.Chinese:
            {
                // No factory is created here; these languages are handled by dedicated segmenters.
                factory = null;
                break;
            }

            case TokenizerAnnotator.TokenizerType.Spanish:
            {
                factory = SpanishTokenizer.Factory(new CoreLabelTokenFactory(), options);
                break;
            }

            case TokenizerAnnotator.TokenizerType.French:
            {
                factory = FrenchTokenizer.Factory(new CoreLabelTokenFactory(), options);
                break;
            }

            case TokenizerAnnotator.TokenizerType.Whitespace:
            {
                bool eolIsSignificant = bool.Parse(props.GetProperty(EolProperty, "false"));
                eolIsSignificant = eolIsSignificant || bool.Parse(props.GetProperty(StanfordCoreNLP.NewlineSplitterProperty, "false"));
                factory = new WhitespaceTokenizer.WhitespaceTokenizerFactory<CoreLabel>(new CoreLabelTokenFactory(), eolIsSignificant);
                break;
            }

            case TokenizerAnnotator.TokenizerType.English:
            case TokenizerAnnotator.TokenizerType.German:
            {
                factory = PTBTokenizer.Factory(new CoreLabelTokenFactory(), options);
                break;
            }

            case TokenizerAnnotator.TokenizerType.Unspecified:
            {
                log.Info("No tokenizer type provided. Defaulting to PTBTokenizer.");
                factory = PTBTokenizer.Factory(new CoreLabelTokenFactory(), options);
                break;
            }

            default:
            {
                throw new ArgumentException("No valid tokenizer type provided.\n" + "Use -tokenize.language, -tokenize.class, or -tokenize.whitespace \n" + "to specify a tokenizer.");
            }
            }
            return factory;
        }
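
To show how the method above fits together, here is a small illustrative call. It assumes a Java-style Properties object with a SetProperty method, as used elsewhere in this port; the option string is an example value, not a default taken from the source. Since InitFactory is private, a call like this would live inside TokenizerAnnotator itself.

            // Hypothetical call site; SetProperty and the option value are assumptions.
            Properties props = new Properties();
            props.SetProperty("tokenize.options", "tokenizeNLs=false");
            // The French case of the switch above selects FrenchTokenizer.Factory(...).
            ITokenizerFactory<CoreLabel> frenchFactory =
                InitFactory(TokenizerAnnotator.TokenizerType.French, props, null);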