/// <summary>
/// Demo routine: trains an IBM Model 1 instance, aligns one hard-coded
/// French/English sentence pair, and prints the English words selected by the
/// alignment, ordered by their index in the English token array.
/// </summary>
/// <remarks>
/// Waits on <c>Console.ReadLine()</c> before returning.
/// </remarks>
public void RunSimpleTraining()
{
    IBMModel1.Helpers.IBMModel1 model = new IBMModel1.Helpers.IBMModel1();
    List<SimpleTrainingRecord> corpus = new List<SimpleTrainingRecord>();

    EnglishTokenizer englishTokenizer = new EnglishTokenizer();
    FrenchTokenizer frenchTokenizer = new FrenchTokenizer();

    // NOTE(review): the corpus is still empty when it is handed to the trainer —
    // confirm that this is intended.
    SimpleTrainingMethod.Train(model, corpus);

    const string frenchSentence = "la maison bleue";
    const string englishSentence = "the blue house";

    string[] frenchTokens = frenchTokenizer.Tokenize(frenchSentence);
    string[] englishTokens = englishTokenizer.Tokenize(englishSentence);
    int[] alignment = model.GetAlignment(frenchTokens, englishTokens);

    // Maps an English token index to the English word found at that index.
    Dictionary<int, string> wordsByEnglishIndex = new Dictionary<int, string>();

    // The offset starts at 1 and grows by one per French token, so the index
    // used is alignment[j] + j + 1.
    // NOTE(review): the reason for this growing offset is not visible here —
    // verify it against the contract of GetAlignment.
    for (int j = 0, offset = 1; j < frenchTokens.Length; ++j, ++offset)
    {
        int englishIndex = alignment[j] + offset;
        wordsByEnglishIndex[englishIndex] = englishTokens[englishIndex];
    }

    // Emit the selected words in ascending order of their English index.
    string[] predictedEnglishTokens = wordsByEnglishIndex
        .OrderBy(entry => entry.Key)
        .Select(entry => entry.Value)
        .ToArray();

    Console.WriteLine("Original French Sentence: {0}", frenchSentence);
    Console.WriteLine("Predicted English Translation: {0}", string.Join(" ", predictedEnglishTokens));
    Console.ReadLine();
}
// wsg2011: The distributed treebank is encoded in ISO8859_1, but
// the current FrenchTreebankParserParams is configured to
// read UTF-8, PTB-style trees that have been extracted from the XML
// files.
// The raw treebank uses "PONCT". Change to LDC convention.

/// <summary>
/// Returns the tokenizer factory obtained from <c>FrenchTokenizer.FtbFactory()</c>.
/// </summary>
/// <returns>the factory produced by <c>FrenchTokenizer.FtbFactory()</c></returns>
public override ITokenizerFactory<IHasWord> GetTokenizerFactory()
{
    return FrenchTokenizer.FtbFactory();
}
/// <summary>
/// initFactory returns the right type of TokenizerFactory based on the options in the properties file
/// and the type.
/// </summary>
/// <remarks>
/// The tokenizer options come from the <c>tokenize.options</c> property; when that property is
/// not set, the default options of <paramref name="type"/> are used. If
/// <paramref name="extraOptions"/> is given, it is prepended (comma-separated) to those options.
/// When adding a new Tokenizer, modify TokenizerType.getTokenizerType() to retrieve
/// your tokenizer from the properties file, and then add a case in the switch structure here to
/// instantiate the new Tokenizer type.
/// </remarks>
/// <param name="type">the TokenizerType</param>
/// <param name="props">the properties file</param>
/// <param name="extraOptions">extra things that should be passed into the tokenizer constructor</param>
/// <returns>
/// the factory for the requested type; <c>null</c> for the Arabic and Chinese types
/// </returns>
/// <exception cref="System.ArgumentException">
/// if <paramref name="type"/> is not one of the tokenizer types handled by the switch
/// </exception>
private static ITokenizerFactory<CoreLabel> InitFactory(TokenizerAnnotator.TokenizerType type, Properties props, string extraOptions)
{
    ITokenizerFactory<CoreLabel> factory;
    string options = props.GetProperty("tokenize.options", null);

    // set it to the equivalent of both extraOptions and options
    // TODO: maybe we should always have getDefaultOptions() and
    // expect the user to turn off default options. That would
    // require all options to have negated options, but
    // currently there are some which don't have that
    if (options == null)
    {
        options = type.GetDefaultOptions();
    }
    if (extraOptions != null)
    {
        // Ordinal comparison: the separator is a literal comma, so no culture rules apply.
        string separator = extraOptions.EndsWith(",", System.StringComparison.Ordinal) ? string.Empty : ",";
        options = extraOptions + separator + options;
    }

    switch (type)
    {
        case TokenizerAnnotator.TokenizerType.Arabic:
        case TokenizerAnnotator.TokenizerType.Chinese:
        {
            // No factory is created for these types; callers receive null.
            // NOTE(review): presumably these languages are tokenized elsewhere — confirm against callers.
            factory = null;
            break;
        }

        case TokenizerAnnotator.TokenizerType.Spanish:
        {
            factory = SpanishTokenizer.Factory(new CoreLabelTokenFactory(), options);
            break;
        }

        case TokenizerAnnotator.TokenizerType.French:
        {
            factory = FrenchTokenizer.Factory(new CoreLabelTokenFactory(), options);
            break;
        }

        case TokenizerAnnotator.TokenizerType.Whitespace:
        {
            // A property counts as enabled only when its text equals "true" (ignoring case);
            // null or any other text means false and never throws.
            bool eolIsSignificant = string.Equals(
                props.GetProperty(EolProperty, "false"), "true", System.StringComparison.OrdinalIgnoreCase);
            eolIsSignificant = eolIsSignificant || string.Equals(
                props.GetProperty(StanfordCoreNLP.NewlineSplitterProperty, "false"), "true", System.StringComparison.OrdinalIgnoreCase);
            factory = new WhitespaceTokenizer.WhitespaceTokenizerFactory<CoreLabel>(new CoreLabelTokenFactory(), eolIsSignificant);
            break;
        }

        case TokenizerAnnotator.TokenizerType.English:
        case TokenizerAnnotator.TokenizerType.German:
        {
            factory = PTBTokenizer.Factory(new CoreLabelTokenFactory(), options);
            break;
        }

        case TokenizerAnnotator.TokenizerType.Unspecified:
        {
            log.Info("No tokenizer type provided. Defaulting to PTBTokenizer.");
            factory = PTBTokenizer.Factory(new CoreLabelTokenFactory(), options);
            break;
        }

        default:
        {
            throw new ArgumentException("No valid tokenizer type provided.\n" + "Use -tokenize.language, -tokenize.class, or -tokenize.whitespace \n" + "to specify a tokenizer.");
        }
    }
    return factory;
}