/// <summary>
/// Determines whether the specified train parameters are valid.
/// </summary>
/// <param name="trainParams">The train parameters.</param>
/// <returns><c>true</c> if the specified train parameters are valid; otherwise, <c>false</c>.</returns>
/// <remarks>
/// The parameters are rejected when their own <c>IsValid</c> check fails, when the configured
/// algorithm is neither a built-in trainer nor resolvable through <c>GetTrainerType</c>, or when
/// a data indexer is configured that is not one of the two recognized values.
/// </remarks>
public static bool IsValid(TrainingParameters trainParams) {
    if (!trainParams.IsValid()) {
        return false;
    }

    var algorithmName = trainParams.Get(Parameters.Algorithm);

    // A missing algorithm must never be used as a dictionary key (a null key lookup throws on
    // Dictionary); in that case the decision is left entirely to GetTrainerType.
    var isBuiltIn = algorithmName != null && builtInTrainers.ContainsKey(algorithmName);
    if (!isBuiltIn && GetTrainerType(trainParams) == null) {
        return false;
    }

    var dataIndexer = trainParams.Get(Parameters.DataIndexer);
    if (dataIndexer != null) {
        switch (dataIndexer) {
            case Parameters.DataIndexers.OnePass:
            case Parameters.DataIndexers.TwoPass:
                break;
            default:
                // Any other indexer name is not recognized.
                return false;
        }
    }

    return true;
}
/// <summary>
/// Creates and initializes the event trainer selected by the training parameters.
/// </summary>
/// <param name="parameters">The machine learnable parameters.</param>
/// <param name="reportMap">The report map handed to the trainer's <c>Init</c> call.</param>
/// <param name="monitor">
/// An evaluation monitor that can be used to listen to the messages during the training or to cancel the
/// training operation. This argument can be a <c>null</c> value.
/// </param>
/// <returns>
/// The initialized <see cref="IEventTrainer" />; a GIS trainer when no algorithm is configured, or
/// <c>null</c> when the configured algorithm is not an event model trainer.
/// </returns>
/// <exception cref="System.InvalidOperationException">
/// Unable to retrieve the trainer from the training parameters.
/// or
/// The trainer type does not expose a constructor taking a single monitor argument.
/// </exception>
public static IEventTrainer GetEventTrainer(TrainingParameters parameters, Dictionary<string, string> reportMap, Monitor monitor) {
    var algorithm = parameters.Get(Parameters.Algorithm);

    // No algorithm configured: use GIS.
    if (algorithm == null) {
        AbstractEventTrainer gis = new GIS(monitor);
        gis.Init(parameters, reportMap);
        return gis;
    }

    // Anything that does not resolve to an event model trainer is not handled here.
    var trainerType = GetTrainerType(parameters);
    if (trainerType != TrainerType.EventModelTrainer) {
        return null;
    }

    var type = GetTrainer(algorithm);
    if (type == null) {
        throw new InvalidOperationException("Unable to retrieve the trainer from the training parameters.");
    }

    // The trainer is built through its single-argument (monitor) constructor.
    var ctor = type.GetConstructor(new[] { typeof(Monitor) });
    if (ctor == null) {
        throw new InvalidOperationException("The constructor of the trainer must have a standard constructor.");
    }

    var instance = (IEventTrainer)ctor.Invoke(new object[] { monitor });
    instance.Init(parameters, reportMap);
    return instance;
}
/// <summary>
/// Gets the sequence model trainer.
/// </summary>
/// <param name="parameters">The machine learnable parameters.</param>
/// <param name="reportMap">The report map handed to the trainer's <c>Init</c> call.</param>
/// <param name="monitor">
/// An evaluation monitor that can be used to listen to the messages during the training or to cancel the
/// training operation. This argument can be a <c>null</c> value.
/// </param>
/// <returns>The initialized <see cref="ISequenceTrainer"/> trainer object.</returns>
/// <exception cref="System.InvalidOperationException">Trainer type couldn't be determined!</exception>
public static ISequenceTrainer GetSequenceModelTrainer(TrainingParameters parameters, Dictionary<string, string> reportMap, Monitor monitor) {
    var trainerType = parameters.Get(Parameters.Algorithm);

    ISequenceTrainer trainer = null;
    if (trainerType != null) {
        // Built-in registrations take precedence over custom ones, and only one trainer
        // instance is ever created for a given algorithm name.
        if (builtInTrainers.ContainsKey(trainerType)) {
            trainer = CreateBuiltinTrainer<ISequenceTrainer>(trainerType, monitor);
        } else if (customTrainers.ContainsKey(trainerType)) {
            trainer = CreateCustomTrainer<ISequenceTrainer>(trainerType, monitor);
        }
    }

    // Covers both a missing algorithm and an algorithm that is not registered anywhere.
    if (trainer == null) {
        throw new InvalidOperationException("Trainer type couldn't be determined!");
    }

    trainer.Init(parameters, reportMap);
    return trainer;
}
/// <summary>
/// Trains a name finder model using the trainer selected by the training parameters.
/// </summary>
/// <param name="languageCode">The language of the training data.</param>
/// <param name="type">Overrides the type parameter in the provided samples. This value can be null.</param>
/// <param name="samples">The training samples.</param>
/// <param name="parameters">The machine learning train parameters.</param>
/// <param name="factory">The name finder factory.</param>
/// <param name="monitor">
/// An evaluation monitor that can be used to listen to the messages during the training or to cancel the
/// training operation. This argument can be a <c>null</c> value.
/// </param>
/// <returns>The newly trained <see cref="TokenNameFinderModel"/>.</returns>
/// <exception cref="System.InvalidOperationException">The resolved trainer type is not one of the handled kinds.</exception>
public static TokenNameFinderModel Train(string languageCode, string type, IObjectStream<NameSample> samples, TrainingParameters parameters, TokenNameFinderFactory factory, Monitor monitor) {
    var beamSize = parameters.Get(Parameters.BeamSize, DefaultBeamSize);
    var manifestInfoEntries = new Dictionary<string, string>();
    var trainerType = TrainerFactory.GetTrainerType(parameters);

    IMaxentModel meModel = null;
    ML.Model.ISequenceClassificationModel<string> scModel = null;

    if (trainerType == TrainerType.EventModelTrainer) {
        var contextGenerator = factory.CreateContextGenerator();
        var sequenceCodec = factory.CreateSequenceCodec();
        var events = new NameFinderEventStream(samples, type, contextGenerator, sequenceCodec);

        meModel = TrainerFactory.GetEventTrainer(parameters, manifestInfoEntries, monitor).Train(events);
    } else if (trainerType == TrainerType.EventModelSequenceTrainer) {
        var sequences = new NameSampleSequenceStream(samples, factory.CreateContextGenerator());

        meModel = TrainerFactory.GetEventModelSequenceTrainer(parameters, manifestInfoEntries, monitor).Train(sequences);
    } else if (trainerType == TrainerType.SequenceTrainer) {
        var sequences = new NameSampleSequenceStream(samples, factory.CreateContextGenerator());

        scModel = TrainerFactory.GetSequenceModelTrainer(parameters, manifestInfoEntries, monitor).Train(sequences);
    } else {
        // Also reached when no trainer type could be resolved at all.
        throw new InvalidOperationException("Unexpected trainer type!");
    }

    // A sequence classification model, when produced, takes priority; it needs no beam size.
    if (scModel == null) {
        return new TokenNameFinderModel(
            languageCode,
            meModel,
            beamSize,
            factory.FeatureGenerator,
            factory.Resources,
            manifestInfoEntries,
            factory.SequenceCodec);
    }

    return new TokenNameFinderModel(
        languageCode,
        scModel,
        factory.FeatureGenerator,
        factory.Resources,
        manifestInfoEntries,
        factory.SequenceCodec);
}
/// <summary>
/// Reads a string parameter from the train parameters, falling back to a default value.
/// </summary>
/// <param name="key">The param key.</param>
/// <param name="defaultValue">The value used when the train parameters hold no value for <paramref name="key"/>.</param>
/// <returns>The configured value or the <paramref name="defaultValue"/>.</returns>
/// <remarks>When a report map is available, the effective value is also stored in it under the same key.</remarks>
protected string GetStringParam(string key, string defaultValue) {
    var configured = trainParams.Get(key);
    var effective = configured != null ? configured : defaultValue;

    if (reportMap == null) {
        return effective;
    }

    reportMap[key] = effective;
    return effective;
}
/// <summary>
/// Trains a lemmatizer model using the trainer selected by the training parameters.
/// </summary>
/// <param name="languageCode">The language code.</param>
/// <param name="samples">The data samples.</param>
/// <param name="parameters">The machine learnable parameters.</param>
/// <param name="factory">The lemmatizer factory.</param>
/// <param name="monitor">
/// An evaluation monitor that can be used to listen to the messages during the training or to cancel the
/// training operation. This argument can be a <c>null</c> value.
/// </param>
/// <returns>The trained <see cref="LemmatizerModel" /> object.</returns>
/// <exception cref="System.InvalidOperationException">The trainer was not specified.</exception>
/// <exception cref="System.NotSupportedException">Trainer type is not supported.</exception>
public static LemmatizerModel Train(string languageCode, IObjectStream<LemmaSample> samples, TrainingParameters parameters, LemmatizerFactory factory, Monitor monitor) {
    var manifestInfoEntries = new Dictionary<string, string>();
    var beamSize = parameters.Get(Parameters.BeamSize, DefaultBeamSize);
    var contextGenerator = factory.GetContextGenerator();

    var trainerType = TrainerFactory.GetTrainerType(parameters);
    if (!trainerType.HasValue) {
        throw new InvalidOperationException("The trainer was not specified.");
    }

    IMaxentModel model = null;
    ML.Model.ISequenceClassificationModel<string> seqModel = null;

    // The trainer type is known to have a value at this point.
    switch (trainerType.Value) {
        case TrainerType.EventModelTrainer: {
            var events = new LemmaSampleEventStream(samples, contextGenerator);
            var eventTrainer = TrainerFactory.GetEventTrainer(parameters, manifestInfoEntries, monitor);
            model = eventTrainer.Train(events);
            break;
        }
        case TrainerType.EventModelSequenceTrainer: {
            var sequences = new LemmaSampleSequenceStream(samples, contextGenerator);
            var eventSequenceTrainer = TrainerFactory.GetEventModelSequenceTrainer(parameters, manifestInfoEntries, monitor);
            model = eventSequenceTrainer.Train(sequences);
            break;
        }
        case TrainerType.SequenceTrainer: {
            var sequences = new LemmaSampleSequenceStream(samples, contextGenerator);
            var sequenceTrainer = TrainerFactory.GetSequenceModelTrainer(parameters, manifestInfoEntries, monitor);
            seqModel = sequenceTrainer.Train(sequences);
            break;
        }
        default:
            throw new NotSupportedException("Trainer type is not supported.");
    }

    // Only the maxent-based model carries a beam size.
    if (model != null) {
        return new LemmatizerModel(languageCode, model, beamSize, manifestInfoEntries, factory);
    }

    return new LemmatizerModel(languageCode, seqModel, manifestInfoEntries, factory);
}
/// <summary>
/// Trains a chunker model using the trainer selected by the training parameters.
/// </summary>
/// <param name="languageCode">The language code.</param>
/// <param name="samples">The data samples.</param>
/// <param name="parameters">The machine learnable parameters.</param>
/// <param name="factory">The chunker factory.</param>
/// <param name="monitor">
/// An evaluation monitor that can be used to listen to the messages during the training or to cancel the
/// training operation. This argument can be a <c>null</c> value.
/// </param>
/// <returns>The trained <see cref="ChunkerModel"/> object.</returns>
/// <exception cref="System.InvalidOperationException">The trainer was not specified.</exception>
/// <exception cref="System.NotSupportedException">Trainer type is not supported.</exception>
public static ChunkerModel Train(string languageCode, IObjectStream<ChunkSample> samples, TrainingParameters parameters, ChunkerFactory factory, Monitor monitor) {
    var trainerType = TrainerFactory.GetTrainerType(parameters);
    if (!trainerType.HasValue) {
        throw new InvalidOperationException("The trainer was not specified.");
    }

    var manifestInfoEntries = new Dictionary<string, string>();

    IMaxentModel chunkerModel = null;
    ML.Model.ISequenceClassificationModel<string> seqChunkerModel = null;

    if (trainerType == TrainerType.SequenceTrainer) {
        var sequenceTrainer = TrainerFactory.GetSequenceModelTrainer(parameters, manifestInfoEntries, monitor);

        // TODO: This will probably cause issue, since the feature generator uses the outcomes array
        var sequences = new ChunkSampleSequenceStream(samples, factory.GetContextGenerator());
        seqChunkerModel = sequenceTrainer.Train(sequences);
    } else if (trainerType == TrainerType.EventModelTrainer) {
        var events = new ChunkerEventStream(samples, factory.GetContextGenerator());
        var eventTrainer = TrainerFactory.GetEventTrainer(parameters, manifestInfoEntries, monitor);
        chunkerModel = eventTrainer.Train(events);
    } else {
        throw new NotSupportedException("Trainer type is not supported.");
    }

    var beamSize = parameters.Get(Parameters.BeamSize, DefaultBeamSize);

    // Only the maxent-based model carries a beam size.
    if (chunkerModel != null) {
        return new ChunkerModel(languageCode, chunkerModel, beamSize, manifestInfoEntries, factory);
    }

    return new ChunkerModel(languageCode, seqChunkerModel, manifestInfoEntries, factory);
}
/// <summary>
/// Gets the type of the trainer from the <see cref="TrainingParameters"/> object.
/// </summary>
/// <param name="trainParams">The train parameters.</param>
/// <returns>
/// A nullable <see cref="TrainerType"/> value. When no algorithm is configured the result is
/// <see cref="TrainerType.EventModelTrainer"/>; otherwise it is whatever the type-based
/// <c>GetTrainerType</c> overload reports for the registered trainer type.
/// </returns>
public static TrainerType? GetTrainerType(TrainingParameters trainParams) {
    var algorithm = trainParams.Get(Parameters.Algorithm);
    if (algorithm == null) {
        return TrainerType.EventModelTrainer;
    }

    // Single lookup per table: built-in trainers are consulted first, custom ones second.
    // When neither table knows the algorithm, TryGetValue leaves trainerType as null.
    Type trainerType;
    if (!builtInTrainers.TryGetValue(algorithm, out trainerType)) {
        customTrainers.TryGetValue(algorithm, out trainerType);
    }

    return GetTrainerType(trainerType);
}
/// <summary>
/// Creates and initializes the event model sequence trainer selected by the training parameters.
/// </summary>
/// <param name="parameters">The machine learnable parameters.</param>
/// <param name="reportMap">The report map handed to the trainer's <c>Init</c> call.</param>
/// <param name="monitor">
/// An evaluation monitor that can be used to listen to the messages during the training or to cancel the
/// training operation. This argument can be a <c>null</c> value.
/// </param>
/// <returns>The initialized <see cref="IEventModelSequenceTrainer"/> trainer object.</returns>
/// <exception cref="System.InvalidOperationException">Trainer type couldn't be determined!</exception>
public static IEventModelSequenceTrainer GetEventModelSequenceTrainer(TrainingParameters parameters, Dictionary<string, string> reportMap, Monitor monitor) {
    var algorithm = parameters.Get(Parameters.Algorithm);
    if (string.IsNullOrEmpty(algorithm)) {
        throw new InvalidOperationException("Trainer type couldn't be determined!");
    }

    IEventModelSequenceTrainer trainer;
    if (builtInTrainers.ContainsKey(algorithm)) {
        trainer = CreateBuiltinTrainer<IEventModelSequenceTrainer>(algorithm, monitor);
    } else if (customTrainers.ContainsKey(algorithm)) {
        // Custom trainers are instantiated reflectively with the monitor as constructor argument.
        var customType = customTrainers[algorithm];
        trainer = (IEventModelSequenceTrainer)Activator.CreateInstance(customType, monitor);
    } else {
        throw new InvalidOperationException("Trainer type couldn't be determined!");
    }

    trainer.Init(parameters, reportMap);
    return trainer;
}
/// <summary>
/// Gets the event trainer.
/// </summary>
/// <param name="parameters">The machine learnable parameters.</param>
/// <param name="reportMap">The report map handed to the trainer's <c>Init</c> call.</param>
/// <param name="monitor">
/// An evaluation monitor that can be used to listen to the messages during the training or to cancel the
/// training operation. This argument can be a <c>null</c> value.
/// </param>
/// <returns>
/// The initialized <see cref="IEventTrainer"/>; a GIS trainer when no algorithm is configured, or
/// <c>null</c> when the configured algorithm is not an event model trainer.
/// </returns>
/// <exception cref="System.InvalidOperationException">
/// Unable to retrieve the trainer from the training parameters.
/// </exception>
public static IEventTrainer GetEventTrainer(TrainingParameters parameters, Dictionary<string, string> reportMap, Monitor monitor) {
    var algorithm = parameters.Get(Parameters.Algorithm);

    // No algorithm configured: use GIS.
    if (algorithm == null) {
        AbstractEventTrainer trainer = new GIS(monitor);
        trainer.Init(parameters, reportMap);
        return trainer;
    }

    var trainerType = GetTrainerType(parameters);
    if (trainerType.HasValue && trainerType.Value == TrainerType.EventModelTrainer) {
        var type = GetTrainer(algorithm);

        // Fail with a descriptive error instead of handing a null type to the activator.
        if (type == null) {
            throw new InvalidOperationException("Unable to retrieve the trainer from the training parameters.");
        }

        var trainer = (IEventTrainer)Activator.CreateInstance(type, monitor);
        trainer.Init(parameters, reportMap);
        return trainer;
    }

    // The configured algorithm is not an event model trainer.
    return null;
}
/// <summary>
/// Creates a n-gram dictionary from the specified data stream using the specified head rule and specified cut-off.
/// </summary>
/// <param name="data">The data stream of parses.</param>
/// <param name="rules">The head rules for the parses.</param>
/// <param name="parameters">Can contain a cutoff, the minimum number of entries required for the n-gram to be saved as part of the dictionary.</param>
/// <returns>A dictionary object.</returns>
/// <remarks>
/// For every parse read from <paramref name="data"/> three groups of n-grams are added to the model:
/// the covered text of the tag nodes (added with sizes 1..1), the head words of the initial chunks
/// (sizes 2..3), and windows of up to five head words collected around each emulated reduction.
/// </remarks>
public static Dic BuildDictionary(IObjectStream <Parse> data, AbstractHeadRules rules, TrainingParameters parameters) {
    // Cutoff is read from the "dict" namespace of the parameters; 5 is presumably the fallback
    // when none is configured (NOTE(review): confirm against TrainingParameters.Get).
    var cutoff = parameters.Get("dict", Parameters.Cutoff, 5);
    var dict = new NGramModel();
    Parse p;
    // Consume the stream until Read() signals the end with null.
    while ((p = data.Read()) != null) {
        p.UpdateHeads(rules);
        var pWords = p.GetTagNodes();
        var words = new string[pWords.Length];
        //add all uni-grams
        for (var wi = 0; wi < words.Length; wi++) {
            words[wi] = pWords[wi].CoveredText;
        }
        dict.Add(new StringList(words), 1, 1);
        //add tri-grams and bi-grams for initial sequence
        var chunks = CollapsePunctuation(AbstractParserEventStream.GetInitialChunks(p), rules.PunctuationTags);
        var cWords = new string[chunks.Length];
        for (var wi = 0; wi < cWords.Length; wi++) {
            cWords[wi] = chunks[wi].Head.CoveredText;
        }
        dict.Add(new StringList(cWords), 2, 3);
        //emulate reductions to produce additional n-grams
        var ci = 0;
        while (ci < chunks.Length) {
            /*
             * if (chunks[ci].Parent == null) {
             * chunks[ci].Show();
             * }
             */
            if (LastChild(chunks[ci], chunks[ci].Parent, rules.PunctuationTags)) {
                //perform reduce
                // Walk left while the chunks share the parent of chunks[ci]; the loop overshoots by
                // one position, which the increment below corrects.
                var reduceStart = ci;
                while (reduceStart >= 0 && Equals(chunks[reduceStart].Parent, chunks[ci].Parent)) {
                    reduceStart--;
                }
                reduceStart++;
                // ReduceChunks receives ci by reference and returns the replacement chunk array;
                // ci is then explicitly repositioned to reduceStart.
                chunks = ParserEventStream.ReduceChunks(chunks, ref ci, chunks[ci].Parent);
                ci = reduceStart;
                if (chunks.Length != 0) {
                    // Collect the head words of up to two chunks on either side of ci (bounds permitting).
                    var window = new string[5];
                    var wi = 0;
                    if (ci - 2 >= 0) {
                        window[wi++] = chunks[ci - 2].Head.CoveredText;
                    }
                    if (ci - 1 >= 0) {
                        window[wi++] = chunks[ci - 1].Head.CoveredText;
                    }
                    window[wi++] = chunks[ci].Head.CoveredText;
                    if (ci + 1 < chunks.Length) {
                        window[wi++] = chunks[ci + 1].Head.CoveredText;
                    }
                    if (ci + 2 < chunks.Length) {
                        window[wi++] = chunks[ci + 2].Head.CoveredText;
                    }
                    // Shrink the window to the number of entries actually filled.
                    if (wi < 5) {
                        var subWindow = new string[wi];
                        for (var swi = 0; swi < wi; swi++) {
                            subWindow[swi] = window[swi];
                        }
                        window = subWindow;
                    }
                    // Windows of three or more words are added with sizes 2..3, two-word windows
                    // with size 2 only; a single-word window adds nothing.
                    if (window.Length >= 3) {
                        dict.Add(new StringList(window), 2, 3);
                    } else if (window.Length == 2) {
                        dict.Add(new StringList(window), 2, 2);
                    }
                }
                ci = reduceStart - 1; //ci will be incremented at end of loop
            }
            ci++;
        }
    }
    // Apply the cutoff as the first argument with no upper limit (int.MaxValue) as the second.
    dict.CutOff(cutoff, int.MaxValue);
    return(dict.ToDictionary(true));
}
/// <summary>
/// Trains a name finder model using the trainer selected by the training parameters.
/// </summary>
/// <param name="languageCode">The language of the training data.</param>
/// <param name="type">Overrides the type parameter in the provided samples. This value can be null.</param>
/// <param name="samples">The training samples.</param>
/// <param name="parameters">The machine learning train parameters.</param>
/// <param name="factory">The name finder factory; it is also passed on to the resulting model.</param>
/// <param name="monitor">
/// An evaluation monitor that can be used to listen to the messages during the training or to cancel the
/// training operation. This argument can be a <c>null</c> value.
/// </param>
/// <returns>The newly trained <see cref="TokenNameFinderModel"/>.</returns>
/// <exception cref="System.InvalidOperationException">The resolved trainer type is not one of the handled kinds.</exception>
public static TokenNameFinderModel Train(string languageCode, string type, IObjectStream<NameSample> samples, TrainingParameters parameters, TokenNameFinderFactory factory, Monitor monitor) {
    var beamSize = parameters.Get(Parameters.BeamSize, DefaultBeamSize);
    var manifestInfoEntries = new Dictionary<string, string>();

    IMaxentModel meModel = null;
    ML.Model.ISequenceClassificationModel<string> seqModel = null;

    var trainerType = TrainerFactory.GetTrainerType(parameters);
    switch (trainerType) {
        case TrainerType.SequenceTrainer: {
            var sequences = new NameSampleSequenceStream(samples, factory.CreateContextGenerator());
            var sequenceTrainer = TrainerFactory.GetSequenceModelTrainer(parameters, manifestInfoEntries, monitor);
            seqModel = sequenceTrainer.Train(sequences);
            break;
        }
        case TrainerType.EventModelSequenceTrainer: {
            var sequences = new NameSampleSequenceStream(samples, factory.CreateContextGenerator());
            var eventSequenceTrainer = TrainerFactory.GetEventModelSequenceTrainer(parameters, manifestInfoEntries, monitor);
            meModel = eventSequenceTrainer.Train(sequences);
            break;
        }
        case TrainerType.EventModelTrainer: {
            var events = new NameFinderEventStream(
                samples,
                type,
                factory.CreateContextGenerator(),
                factory.CreateSequenceCodec());
            var eventTrainer = TrainerFactory.GetEventTrainer(parameters, manifestInfoEntries, monitor);
            meModel = eventTrainer.Train(events);
            break;
        }
        default:
            // Also reached when no trainer type could be resolved at all.
            throw new InvalidOperationException("Unexpected trainer type!");
    }

    // A sequence classification model, when produced, takes priority; it needs no beam size.
    return seqModel != null
        ? new TokenNameFinderModel(
            languageCode,
            seqModel,
            factory.FeatureGenerator,
            factory.Resources,
            manifestInfoEntries,
            factory.SequenceCodec,
            factory)
        : new TokenNameFinderModel(
            languageCode,
            meModel,
            beamSize,
            factory.FeatureGenerator,
            factory.Resources,
            manifestInfoEntries,
            factory.SequenceCodec,
            factory);
}
/// <summary>
/// Builds an n-gram dictionary from a stream of parses, using the given head rules and cut-off.
/// </summary>
/// <param name="data">The data stream of parses.</param>
/// <param name="rules">The head rules for the parses.</param>
/// <param name="parameters">Can contain a cutoff, the minimum number of entries required for the n-gram to be saved as part of the dictionary.</param>
/// <returns>A dictionary object.</returns>
public static Dic BuildDictionary(IObjectStream<Parse> data, AbstractHeadRules rules, TrainingParameters parameters) {
    var cutoff = parameters.Get("dict", Parameters.Cutoff, 5);
    var ngrams = new NGramModel();

    for (var parse = data.Read(); parse != null; parse = data.Read()) {
        parse.UpdateHeads(rules);

        // Uni-grams: covered text of every tag node.
        var tagNodes = parse.GetTagNodes();
        var tokens = new string[tagNodes.Length];
        for (var i = 0; i < tokens.Length; i++) {
            tokens[i] = tagNodes[i].CoveredText;
        }
        ngrams.Add(new StringList(tokens), 1, 1);

        // Bi-grams and tri-grams over the head words of the initial chunk sequence.
        var chunks = CollapsePunctuation(AbstractParserEventStream.GetInitialChunks(parse), rules.PunctuationTags);
        var heads = new string[chunks.Length];
        for (var i = 0; i < heads.Length; i++) {
            heads[i] = chunks[i].Head.CoveredText;
        }
        ngrams.Add(new StringList(heads), 2, 3);

        // Emulate reductions to produce additional n-grams.
        var ci = 0;
        while (ci < chunks.Length) {
            if (!LastChild(chunks[ci], chunks[ci].Parent, rules.PunctuationTags)) {
                ci++;
                continue;
            }

            // Perform the reduce: find the first chunk sharing the parent of chunks[ci].
            var reduceStart = ci;
            while (reduceStart >= 0 && Equals(chunks[reduceStart].Parent, chunks[ci].Parent)) {
                reduceStart--;
            }
            reduceStart++;

            chunks = ParserEventStream.ReduceChunks(chunks, ref ci, chunks[ci].Parent);

            // Scanning resumes at the start of the reduction once the window has been recorded.
            ci = reduceStart;

            if (chunks.Length != 0) {
                // Gather the head words at offsets -2..+2 around ci; negative offsets are only
                // bounded below, positive offsets only above, and the centre is always taken.
                var window = new string[5];
                var filled = 0;
                for (var offset = -2; offset <= 2; offset++) {
                    var position = ci + offset;

                    bool include;
                    if (offset < 0) {
                        include = position >= 0;
                    } else if (offset > 0) {
                        include = position < chunks.Length;
                    } else {
                        include = true;
                    }

                    if (include) {
                        window[filled++] = chunks[position].Head.CoveredText;
                    }
                }

                // Trim the window down to the entries that were actually filled.
                if (filled < 5) {
                    var trimmed = new string[filled];
                    for (var k = 0; k < filled; k++) {
                        trimmed[k] = window[k];
                    }
                    window = trimmed;
                }

                if (window.Length >= 3) {
                    ngrams.Add(new StringList(window), 2, 3);
                } else if (window.Length == 2) {
                    ngrams.Add(new StringList(window), 2, 2);
                }
            }
        }
    }

    ngrams.CutOff(cutoff, int.MaxValue);
    return ngrams.ToDictionary(true);
}
/// <summary>
/// Trains a chunker model using the trainer selected by the training parameters.
/// </summary>
/// <param name="languageCode">The language code.</param>
/// <param name="samples">The data samples.</param>
/// <param name="parameters">The machine learnable parameters.</param>
/// <param name="factory">The chunker factory.</param>
/// <param name="monitor">
/// An evaluation monitor that can be used to listen to the messages during the training or to cancel the
/// training operation. This argument can be a <c>null</c> value.
/// </param>
/// <returns>The trained <see cref="ChunkerModel"/> object.</returns>
/// <exception cref="System.InvalidOperationException">The trainer was not specified.</exception>
/// <exception cref="System.NotSupportedException">Trainer type is not supported.</exception>
public static ChunkerModel Train(string languageCode, IObjectStream<ChunkSample> samples, TrainingParameters parameters, ChunkerFactory factory, Monitor monitor) {
    var resolvedType = TrainerFactory.GetTrainerType(parameters);
    if (!resolvedType.HasValue) {
        throw new InvalidOperationException("The trainer was not specified.");
    }

    var manifestInfoEntries = new Dictionary<string, string>();

    IMaxentModel maxentModel = null;
    ML.Model.ISequenceClassificationModel<string> sequenceModel = null;

    // The trainer type is known to have a value at this point.
    switch (resolvedType.Value) {
        case TrainerType.EventModelTrainer: {
            var events = new ChunkerEventStream(samples, factory.GetContextGenerator());
            var eventTrainer = TrainerFactory.GetEventTrainer(parameters, manifestInfoEntries, monitor);
            maxentModel = eventTrainer.Train(events);
            break;
        }
        case TrainerType.SequenceTrainer: {
            var sequenceTrainer = TrainerFactory.GetSequenceModelTrainer(parameters, manifestInfoEntries, monitor);

            // TODO: This will probably cause issue, since the feature generator uses the outcomes array
            var sequences = new ChunkSampleSequenceStream(samples, factory.GetContextGenerator());
            sequenceModel = sequenceTrainer.Train(sequences);
            break;
        }
        default:
            throw new NotSupportedException("Trainer type is not supported.");
    }

    var beamSize = parameters.Get(Parameters.BeamSize, DefaultBeamSize);

    // Only the maxent-based model carries a beam size.
    if (maxentModel == null) {
        return new ChunkerModel(languageCode, sequenceModel, manifestInfoEntries, factory);
    }

    return new ChunkerModel(languageCode, maxentModel, beamSize, manifestInfoEntries, factory);
}