Ejemplo n.º 1
0
        /// <summary>
        /// Initializes a new instance of the <see cref="ParserEventStream"/> class and creates the
        /// build, attach and check context generators.
        /// </summary>
        /// <param name="d">The stream of parse samples; forwarded to the base constructor.</param>
        /// <param name="rules">The head rules; forwarded to the base constructor.</param>
        /// <param name="type">The kind of events requested; forwarded to the base constructor.</param>
        public ParserEventStream(
            IObjectStream<Parse> d,
            AbstractHeadRules rules,
            ParserEventTypeEnum type)
            : base(d, rules, type)
        {
            // All three generators are created regardless of the requested event type.
            // The attach and check generators are both built from the Punctuation member.
            buildContextGenerator = new BuildContextGenerator();
            attachContextGenerator = new AttachContextGenerator(Punctuation);
            checkContextGenerator = new CheckContextGenerator(Punctuation);
        }
Ejemplo n.º 2
0
 /// <summary>
 /// Initializes a new instance of the <see cref="NameSampleSequenceStream"/> class.
 /// </summary>
 /// <param name="psi">The stream of name samples.</param>
 /// <param name="pcg">The name context generator.</param>
 /// <param name="useOutcomes">
 /// Flag stored on the instance. Presumably it controls whether outcomes are used when
 /// sequences are produced; its consumer is not part of this constructor.
 /// </param>
 /// <param name="seqCodec">The sequence codec.</param>
 /// <remarks>The arguments are stored as given; no validation is performed here.</remarks>
 public NameSampleSequenceStream(
     IObjectStream<NameSample> psi,
     INameContextGenerator pcg,
     bool useOutcomes,
     ISequenceCodec<string> seqCodec)
 {
     // Plain field assignments, listed in parameter order.
     this.psi = psi;
     this.pcg = pcg;
     this.useOutcomes = useOutcomes;
     this.seqCodec = seqCodec;
 }
Ejemplo n.º 3
0
        /// <summary>
        /// Initializes a new instance of the <see cref="OnePassDataIndexer"/> class from an event stream,
        /// a cutoff value, a sort flag and an evaluation monitor.
        /// </summary>
        /// <param name="eventStream">The event stream to index.</param>
        /// <param name="cutoff">The cutoff value.</param>
        /// <param name="sort">If set to <c>true</c>, the events will be sorted during the indexing.</param>
        /// <param name="monitor">The evaluation monitor; forwarded to the base constructor.</param>
        public OnePassDataIndexer(
            IObjectStream<Event> eventStream,
            int cutoff,
            bool sort,
            Monitor monitor)
            : base(monitor)
        {
            // Only stores the configuration; no indexing work is done in this constructor.
            EventStream = eventStream;
            Cutoff = cutoff;
            Sort = sort;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Initializes a new instance of the <see cref="AdChunkSampleStream"/> class that reads
        /// AD sentences from the given line stream.
        /// </summary>
        /// <param name="lineStream">The line stream.</param>
        /// <param name="safeParse">If set to <c>true</c>, the invalid AD sentences will be skipped.</param>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="lineStream"/> is <c>null</c>.
        /// </exception>
        public AdChunkSampleStream(IObjectStream<string> lineStream, bool safeParse)
            : this()
        {
            if (lineStream == null)
            {
                throw new ArgumentNullException(nameof(lineStream));
            }

            // The line stream is wrapped so that samples are read sentence by sentence.
            adSentenceStream = new AdSentenceStream(lineStream, safeParse);
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Initializes a new instance of the <see cref="AbstractEventStream{T}"/> class
        /// with the stream of samples it reads from.
        /// </summary>
        /// <param name="samples">The sample stream.</param>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="samples"/> is <c>null</c>.
        /// </exception>
        protected AbstractEventStream(IObjectStream<T> samples)
        {
            if (samples == null)
            {
                throw new ArgumentNullException(nameof(samples));
            }

            this.samples = samples;
        }
Ejemplo n.º 6
0
 /// <summary>
 /// Initializes a new instance of the <see cref="FrequencyTableSequence"/> class.
 /// </summary>
 /// <param name="object">The stream object; forwarded to the base constructor.</param>
 /// <param name="name">
 /// Optional table name. When <c>null</c>, the name set up by the base constructor is kept.
 /// </param>
 public FrequencyTableSequence(IObjectStream @object, string name = null)
     : base(@object)
 {
     // Only an explicitly supplied name replaces whatever the base class assigned.
     if (name != null)
     {
         Name = name;
     }

     GenerateAuto = new GenerateAutoTable();
 }
Ejemplo n.º 7
0
 /// <summary>
 /// Trains a name finder model with the given parameters.
 /// </summary>
 /// <param name="languageCode">The language of the training data.</param>
 /// <param name="type">Overrides the type parameter in the provided samples. This value can be null.</param>
 /// <param name="samples">The training samples.</param>
 /// <param name="parameters">The machine learning train parameters.</param>
 /// <param name="factory">The name finder factory.</param>
 /// <returns>The newly trained <see cref="TokenNameFinderModel"/>.</returns>
 /// <remarks>
 /// Convenience overload: delegates to the six-argument <c>Train</c> overload,
 /// passing <c>null</c> as the last argument.
 /// </remarks>
 public static TokenNameFinderModel Train(
     string languageCode,
     string type,
     IObjectStream<NameSample> samples,
     TrainingParameters parameters,
     TokenNameFinderFactory factory)
 {
     return Train(languageCode, type, samples, parameters, factory, null);
 }
Ejemplo n.º 8
0
        /// <summary>
        /// Initializes a new instance of the <see cref="TokenSampleStream"/> class.
        /// </summary>
        /// <param name="samples">The stream of sample strings; forwarded to the base constructor.</param>
        /// <param name="separatorChars">The separator chars.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="separatorChars"/> is <c>null</c>.
        /// </exception>
        public TokenSampleStream(IObjectStream<string> samples, string separatorChars)
            : base(samples)
        {
            if (separatorChars == null)
            {
                throw new ArgumentNullException(nameof(separatorChars));
            }

            this.separatorChars = separatorChars;
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Initializes a new instance of the <see cref="RealBasicEventStream"/> class
        /// over the given stream of strings.
        /// </summary>
        /// <param name="objectStream">The object stream.</param>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="objectStream"/> is <c>null</c>.
        /// </exception>
        public RealBasicEventStream(IObjectStream<string> objectStream)
        {
            if (objectStream == null)
            {
                throw new ArgumentNullException(nameof(objectStream));
            }

            this.objectStream = objectStream;
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Parameterized constructor.
        /// </summary>
        /// <param name="object">The sequence object.</param>
        /// <param name="name">
        /// The table name. When <c>null</c>, the name is obtained from <c>GetTitle</c>
        /// using the object and the resolved table type.
        /// </param>
        public TableSequence(IObjectStream @object, string name = null)
            : this()
        {
            // TypeTable must be resolved first: the fallback title below depends on it.
            TypeTable = GetTypeTable(@object);
            Name = name ?? GetTitle(@object, TypeTable);

            Data = new DataTable();
            Text = "0";
        }
Ejemplo n.º 11
0
 /// <summary>
 /// Initializes a new instance of the <see cref="DelayTableSequence"/> class.
 /// </summary>
 /// <param name="object">The stream object; forwarded to the base constructor.</param>
 /// <param name="name">
 /// Optional table name. When <c>null</c>, the name set up by the base constructor is kept.
 /// </param>
 public DelayTableSequence(IObjectStream @object, string name = null)
     : base(@object)
 {
     // Only an explicitly supplied name replaces whatever the base class assigned.
     if (name != null)
     {
         Name = name;
     }

     DelayAuto = new DelayAutoTable();
 }
Ejemplo n.º 12
0
 /// <summary>
 /// Trains a parser model with the given parameters.
 /// </summary>
 /// <param name="languageCode">The language code.</param>
 /// <param name="samples">The data samples.</param>
 /// <param name="rules">The head rules.</param>
 /// <param name="iterations">The number of training iterations.</param>
 /// <param name="cutoff">The minimum number of times a feature must be seen.</param>
 /// <returns>The trained <see cref="ParserModel" /> object.</returns>
 /// <remarks>
 /// Convenience overload: delegates to the six-argument <c>Train</c> overload,
 /// passing <c>null</c> as its first argument.
 /// </remarks>
 public static ParserModel Train(
     string languageCode,
     IObjectStream<Parse> samples,
     AbstractHeadRules rules,
     int iterations,
     int cutoff)
 {
     return Train(null, languageCode, samples, rules, iterations, cutoff);
 }
Ejemplo n.º 13
0
        /// <summary>
        /// Initializes a new instance of the <see cref="ChunkSampleSequenceStream" /> class
        /// from a chunk sample stream and a chunker context generator.
        /// </summary>
        /// <param name="samples">The chunk samples.</param>
        /// <param name="contextGenerator">The chunker context generator.</param>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="samples"/> is <c>null</c>.
        /// or
        /// <paramref name="contextGenerator"/> is <c>null</c>.
        /// </exception>
        public ChunkSampleSequenceStream(
            IObjectStream<ChunkSample> samples,
            IChunkerContextGenerator contextGenerator)
        {
            // Arguments are validated in declaration order, so a null sample stream is reported first.
            if (samples == null)
            {
                throw new ArgumentNullException(nameof(samples));
            }

            if (contextGenerator == null)
            {
                throw new ArgumentNullException(nameof(contextGenerator));
            }

            this.samples = samples;
            this.contextGenerator = contextGenerator;
        }
Ejemplo n.º 14
0
        /// <summary>
        /// Trains a document categorizer model with the given parameters.
        /// </summary>
        /// <param name="languageCode">The language code.</param>
        /// <param name="samples">The data samples.</param>
        /// <param name="parameters">The machine learnable parameters.</param>
        /// <param name="factory">The document categorizer factory; its feature generators are used to build the event stream.</param>
        /// <param name="monitor">
        /// An evaluation monitor that can be used to listen to the messages during the training, or to cancel the training operation.
        /// This argument can be a <c>null</c> value.
        /// </param>
        /// <returns>The trained <see cref="DocumentCategorizerModel"/> model.</returns>
        public static DocumentCategorizerModel Train(
            string languageCode,
            IObjectStream<DocumentSample> samples,
            TrainingParameters parameters,
            DocumentCategorizerFactory factory,
            Monitor monitor)
        {
            // The same dictionary is handed to the trainer factory and then to the resulting model.
            var manifest = new Dictionary<string, string>();

            var events = new DocumentCategorizerEventStream(samples, factory.FeatureGenerators);
            var eventTrainer = TrainerFactory.GetEventTrainer(parameters, manifest, monitor);
            var trainedModel = eventTrainer.Train(events);

            return new DocumentCategorizerModel(languageCode, trainedModel, manifest, factory);
        }
Ejemplo n.º 15
0
        /// <summary>
        /// Initializes a new instance of the <see cref="PtbTokenSampleStream"/> class from a line stream,
        /// a detokenizer and an evaluation monitor.
        /// </summary>
        /// <param name="language">The language.</param>
        /// <param name="lineStream">The line stream.</param>
        /// <param name="detokenizer">The detokenizer.</param>
        /// <param name="monitor">The monitor.</param>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="detokenizer"/> is <c>null</c>.
        /// </exception>
        /// <remarks>
        /// The base constructor receives a new <c>PtbStreamReader</c> built from
        /// <paramref name="language"/>, <paramref name="lineStream"/>, <c>false</c> and <paramref name="monitor"/>.
        /// That reader is created before the detokenizer is validated.
        /// </remarks>
        public PtbTokenSampleStream(
            string language,
            IObjectStream<string> lineStream,
            IDetokenizer detokenizer,
            Monitor monitor)
            : base(new PtbStreamReader(language, lineStream, false, monitor))
        {
            if (detokenizer == null)
            {
                throw new ArgumentNullException(nameof(detokenizer));
            }

            this.detokenizer = detokenizer;
        }
        /// <summary>
        /// Initializes a new instance of the <see cref="AdContractionNameSampleStream" /> class from a
        /// <paramref name="lineStream" /> object, with an execution monitor attached.
        /// </summary>
        /// <param name="monitor">The execution monitor.</param>
        /// <param name="lineStream">The line stream; forwarded to the two-argument constructor.</param>
        /// <param name="safeParse">If set to <c>true</c>, the invalid AD sentences will be skipped.</param>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="monitor"/> is <c>null</c>.
        /// </exception>
        /// <remarks>
        /// The chained constructor runs first, so any validation it performs on
        /// <paramref name="lineStream"/> happens before the monitor check.
        /// </remarks>
        public AdContractionNameSampleStream(
            Monitor monitor,
            IObjectStream<string> lineStream,
            bool safeParse)
            : this(lineStream, safeParse)
        {
            if (monitor == null)
            {
                throw new ArgumentNullException(nameof(monitor));
            }

            this.monitor = monitor;
        }
Ejemplo n.º 17
0
        /// <summary>
        /// Initializes a new instance of the <see cref="AdNameSampleStream" /> class from a <paramref name="lineStream" /> object.
        /// </summary>
        /// <param name="lineStream">The line stream.</param>
        /// <param name="splitHyphenatedTokens">If set to <c>true</c>, hyphenated tokens will be separated: "carros-monstro" &gt; "carros" Hyphen "monstro".</param>
        /// <param name="safeParse">If set to <c>true</c>, the invalid data in the file will be skipped.</param>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="lineStream"/> is <c>null</c>.
        /// </exception>
        public AdNameSampleStream(
            IObjectStream<string> lineStream,
            bool splitHyphenatedTokens,
            bool safeParse)
        {
            if (lineStream == null)
            {
                throw new ArgumentNullException(nameof(lineStream));
            }

            // The line stream is wrapped in a sentence stream; safeParse is handed to that wrapper.
            adSentenceStream = new AdSentenceStream(lineStream, safeParse);
            this.splitHyphenatedTokens = splitHyphenatedTokens;
        }
Ejemplo n.º 18
0
        /// <summary>
        /// Returns the sequence table registered under the given name and, when it exists,
        /// records <paramref name="object"/> in the object list kept for that table.
        /// </summary>
        /// <param name="object">The object to associate with the table.</param>
        /// <param name="nameTable">The key used to look the table up.</param>
        /// <returns>The table found for <paramref name="nameTable"/>, or the lookup's default value when there is none.</returns>
        internal TableSequence GetTable(IObjectStream @object, string nameTable)
        {
            TableSequence found;

            if (!_loadTables.TryGetValue(nameTable, out found))
            {
                // Nothing registered under this name: hand back what the failed lookup produced.
                return found;
            }

            // The object list is keyed by the table's own Name, not by the lookup key.
            _loadTableObjects[found.Name].Add(@object);
            return found;
        }
Ejemplo n.º 19
0
        /// <summary>
        /// Initializes a new instance of the <see cref="AdChunkSampleStream"/> class with an execution monitor attached.
        /// </summary>
        /// <param name="monitor">The execution monitor.</param>
        /// <param name="lineStream">The line stream; forwarded to the two-argument constructor.</param>
        /// <param name="safeParse">If set to <c>true</c>, the invalid AD sentences will be skipped.</param>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="monitor"/> is <c>null</c>.
        /// </exception>
        /// <remarks>
        /// The chained constructor runs first, so any validation it performs on
        /// <paramref name="lineStream"/> happens before the monitor check.
        /// </remarks>
        public AdChunkSampleStream(
            Monitor monitor,
            IObjectStream<string> lineStream,
            bool safeParse)
            : this(lineStream, safeParse)
        {
            if (monitor == null)
            {
                throw new ArgumentNullException(nameof(monitor));
            }

            this.monitor = monitor;
        }
Ejemplo n.º 20
0
 /// <summary>
 /// Initializes a new instance of the <see cref="SetObjectCommand"/> class. It captures the object
 /// currently at the given position of the stream and the object that the stream builds from <paramref name="text"/>.
 /// </summary>
 /// <param name="document">The document; forwarded to the base constructor.</param>
 /// <param name="stream">The stream that holds the object collections.</param>
 /// <param name="typeCollection">The collection within the stream.</param>
 /// <param name="index">The position within the collection.</param>
 /// <param name="text">The text passed to <c>GetObject</c> to obtain the new object.</param>
 public SetObjectCommand(
     Document document,
     IStream stream,
     eTypeObjectCollection typeCollection,
     int index,
     string text)
     : base(document)
 {
     _oldObject = stream[typeCollection][index];
     _newObject = stream.GetObject(typeCollection, index, text);
     _newObject.CorrectionSequence(this);

     // A non-default object placed in the last event position of the 1D collection
     // queues an extra command that inserts a column right after it.
     bool needsTrailingColumn = _newObject.Type != eTypeObjectStream.Default
                                && typeCollection == eTypeObjectCollection._1D
                                && stream.EventCount == index + 1;

     if (needsTrailingColumn)
     {
         Add(new InsertColumnCommand(document, index + 1));
     }
 }
Ejemplo n.º 21
0
        /// <summary>
        /// Initializes a new instance of the <see cref="AdNameSampleStream"/> class from a
        /// <paramref name="lineStream"/> object, with an execution monitor attached.
        /// </summary>
        /// <param name="monitor">The execution monitor.</param>
        /// <param name="lineStream">The line stream; forwarded to the three-argument constructor.</param>
        /// <param name="splitHyphenatedTokens">If set to <c>true</c>, hyphenated tokens will be separated: "carros-monstro" &gt; "carros" Hyphen "monstro".</param>
        /// <param name="safeParse">If set to <c>true</c>, the invalid data in the file will be skipped.</param>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="monitor"/> is <c>null</c>.
        /// </exception>
        /// <remarks>
        /// The chained constructor runs first, so any validation it performs on
        /// <paramref name="lineStream"/> happens before the monitor check.
        /// </remarks>
        public AdNameSampleStream(
            Monitor monitor,
            IObjectStream<string> lineStream,
            bool splitHyphenatedTokens,
            bool safeParse)
            : this(lineStream, splitHyphenatedTokens, safeParse)
        {
            if (monitor == null)
            {
                throw new ArgumentNullException(nameof(monitor));
            }

            this.monitor = monitor;
        }
Ejemplo n.º 22
0
        /// <summary>
        /// Initializes a new instance of the <see cref="AdTokenSampleStream"/> class from a
        /// <paramref name="lineStream"/> object, with an evaluation monitor attached.
        /// </summary>
        /// <param name="monitor">The evaluation monitor.</param>
        /// <param name="lineStream">The line stream; forwarded to the four-argument constructor.</param>
        /// <param name="detokenizer">The detokenizer used to create the samples.</param>
        /// <param name="splitHyphenatedTokens">If set to <c>true</c>, hyphenated tokens will be separated: "carros-monstro" &gt; "carros" Hyphen "monstro".</param>
        /// <param name="safeParse">If set to <c>true</c>, the invalid data in the file will be skipped.</param>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="monitor"/> is <c>null</c>.
        /// </exception>
        /// <remarks>
        /// The chained constructor runs first, so any validation it performs on the other
        /// arguments happens before the monitor check.
        /// </remarks>
        public AdTokenSampleStream(
            Monitor monitor,
            IObjectStream<string> lineStream,
            IDetokenizer detokenizer,
            bool splitHyphenatedTokens,
            bool safeParse)
            : this(lineStream, detokenizer, splitHyphenatedTokens, safeParse)
        {
            if (monitor == null)
            {
                throw new ArgumentNullException(nameof(monitor));
            }

            this.monitor = monitor;
        }
Ejemplo n.º 23
0
 /// <summary>
 /// Initializes a new instance of the <see cref="SetObjectCommand"/> class. It captures the object
 /// currently at the given position of the stream and the object that the stream builds for the
 /// requested object type and data.
 /// </summary>
 /// <param name="document">The document; forwarded to the base constructor.</param>
 /// <param name="element">The stream that holds the object collections.</param>
 /// <param name="typeCollection">The collection within the stream.</param>
 /// <param name="index">The position within the collection.</param>
 /// <param name="typeObject">The object type passed to <c>GetObject</c>.</param>
 /// <param name="data">Optional data passed to <c>GetObject</c>; defaults to <c>null</c>.</param>
 public SetObjectCommand(
     Document document,
     IStream element,
     eTypeObjectCollection typeCollection,
     int index,
     eTypeObjectStream typeObject,
     object data = null)
     : base(document)
 {
     _oldObject = element[typeCollection][index];
     _newObject = element.GetObject(typeCollection, index, typeObject, data);
     _newObject.CorrectionSequence(this);

     // A non-default object placed in the last event position of the 1D collection
     // queues an extra command that inserts a column right after it.
     bool needsTrailingColumn = _newObject.Type != eTypeObjectStream.Default
                                && typeCollection == eTypeObjectCollection._1D
                                && element.EventCount == index + 1;

     if (needsTrailingColumn)
     {
         Add(new InsertColumnCommand(document, index + 1));
     }
 }
Ejemplo n.º 24
0
        /// <summary>
        /// Trains a naive Bayes model from the given events, using the default training
        /// parameters with the cutoff overridden.
        /// </summary>
        /// <param name="samples">The event stream to train on.</param>
        /// <param name="cutoff">The cutoff value written into the training parameters; defaults to 1.</param>
        /// <returns>The model returned by the trainer.</returns>
        internal static NaiveBayesModel TrainModel(IObjectStream<Event> samples, int cutoff = 1)
        {
            var trainingParameters = TrainingParameters.DefaultParameters();

            // The cutoff is stored as text, formatted with the invariant culture.
            trainingParameters.Set(Parameters.Cutoff, cutoff.ToString(CultureInfo.InvariantCulture));

            var bayesTrainer = new NaiveBayesTrainer();

            // The second Init argument is passed as null.
            bayesTrainer.Init(trainingParameters, null);

            return bayesTrainer.Train(samples);
        }
Ejemplo n.º 25
0
        /// <summary>
        /// Initializes a new instance of the <see cref="NameFinderEventStream"/> class.
        /// </summary>
        /// <param name="dataStream">The stream of name samples; forwarded to the base constructor.</param>
        /// <param name="type">Currently unused: the assignment that would consume it is commented out (see the TODO below).</param>
        /// <param name="contextGenerator">
        /// The name context generator. A window feature generator (8 tokens on each side) wrapping the
        /// additional-context feature generator is added to it.
        /// </param>
        /// <param name="codec">The sequence codec. When <c>null</c>, a new <c>BioCodec</c> is used.</param>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="contextGenerator"/> is <c>null</c>.
        /// </exception>
        public NameFinderEventStream(IObjectStream<NameSample> dataStream, string type, INameContextGenerator contextGenerator, ISequenceCodec<string> codec)
            : base(dataStream)
        {
            // Fail with a descriptive exception instead of a NullReferenceException
            // when the generator is dereferenced below.
            if (contextGenerator == null)
            {
                throw new ArgumentNullException(nameof(contextGenerator));
            }

            this.codec = codec ?? new BioCodec();

            additionalContextFeatureGenerator = new AdditionalContextFeatureGenerator();

            this.contextGenerator = contextGenerator;
            this.contextGenerator.AddFeatureGenerator(new WindowFeatureGenerator(additionalContextFeatureGenerator, 8, 8));

            // TODO: How to make the type really do something?!
            // Type = type ?? "default";
        }
Ejemplo n.º 26
0
        /// <summary>
        /// Trains a model for the <see cref="TokenizerME"/>.
        /// </summary>
        /// <param name="samples">The samples used for the training.</param>
        /// <param name="factory">
        /// A <see cref="TokenizerFactory"/> to get resources from: its alphanumeric optimization flag,
        /// alphanumeric pattern and context generator are used to build the event stream.
        /// </param>
        /// <param name="parameters">The machine learning train parameters.</param>
        /// <param name="monitor">
        /// An evaluation monitor that can be used to listen to the messages during the training, or to cancel the training operation.
        /// This argument can be a <c>null</c> value.
        /// </param>
        /// <returns>The trained <see cref="TokenizerModel"/>.</returns>
        public static TokenizerModel Train(
            IObjectStream<TokenSample> samples,
            TokenizerFactory factory,
            TrainingParameters parameters,
            Monitor monitor)
        {
            // The same dictionary is handed to the trainer factory and then to the resulting model.
            var manifest = new Dictionary<string, string>();

            var events = new TokSpanEventStream(
                samples,
                factory.UseAlphaNumericOptimization,
                factory.AlphaNumericPattern,
                factory.ContextGenerator);

            var eventTrainer = TrainerFactory.GetEventTrainer(parameters, manifest, monitor);
            var trainedModel = eventTrainer.Train(events);

            return new TokenizerModel(trainedModel, manifest, factory);
        }
Ejemplo n.º 27
0
        /// <summary>
        /// Evaluates the samples with a given number of partitions.
        /// </summary>
        /// <param name="samples">The samples to train and test.</param>
        /// <param name="partitions">The number of folds.</param>
        /// <remarks>
        /// Each partition produced by the cross-validation partitioner is passed to <c>Process</c>,
        /// and the measure it returns is merged into <c>FMeasure</c>.
        /// </remarks>
        public void Evaluate(IObjectStream<T> samples, int partitions)
        {
            var folds = new CrossValidationPartitioner<T>(samples, partitions);

            while (folds.HasNext)
            {
                var trainingStream = folds.Next();

                FMeasure.MergeInto(Process(trainingStream));
            }
        }
Ejemplo n.º 28
0
        /// <summary>
        /// Populates a tag dictionary with the word/tag pairs observed in the given samples.
        /// </summary>
        /// <param name="samples">The POS samples; read until <c>Read()</c> returns <c>null</c>.</param>
        /// <param name="dictionary">
        /// The dictionary to update. Tags it already holds for a word are kept as candidates,
        /// seeded with a count equal to <paramref name="cutoff"/>.
        /// </param>
        /// <param name="caseSensitive">If <c>false</c>, words are lower-cased (invariant culture) before counting.</param>
        /// <param name="cutoff">The minimum count a tag needs in order to be written to the dictionary for a word.</param>
        /// <remarks>
        /// Tokens in which <c>StringPattern</c> recognizes a digit are ignored. The dictionary is only
        /// written to after all samples have been read, and only for words with at least one tag
        /// reaching the cutoff.
        /// </remarks>
        public static void PopulatePOSDictionary(IObjectStream<POSSample> samples, IMutableTagDictionary dictionary, bool caseSensitive, int cutoff)
        {
            var newEntries = new Dictionary<string, Dictionary<string, int>>();
            POSSample sample;

            while ((sample = samples.Read()) != null)
            {
                for (int i = 0; i < sample.Sentence.Length; i++)
                {
                    string token = sample.Sentence[i];

                    if (StringPattern.Recognize(token).ContainsDigit)
                    {
                        continue;
                    }

                    string word = caseSensitive ? token : token.ToLowerInvariant();

                    Dictionary<string, int> tagCounts;
                    if (!newEntries.TryGetValue(word, out tagCounts))
                    {
                        tagCounts = new Dictionary<string, int>();
                        newEntries.Add(word, tagCounts);

                        // Seed the tags already known for this word with the cutoff so they survive
                        // the final filter. Done once per word: the dictionary is not modified until
                        // the loop below, so repeating the lookup would add nothing.
                        var dicTags = dictionary.GetTags(word);
                        if (dicTags != null)
                        {
                            foreach (var tag in dicTags)
                            {
                                if (!tagCounts.ContainsKey(tag))
                                {
                                    tagCounts.Add(tag, cutoff);
                                }
                            }
                        }
                    }

                    // Count this occurrence of the sample's tag for the word.
                    string sampleTag = sample.Tags[i];
                    int seen;
                    tagCounts[sampleTag] = tagCounts.TryGetValue(sampleTag, out seen) ? seen + 1 : 1;
                }
            }

            foreach (var wordEntry in newEntries)
            {
                var tagsForWord = wordEntry.Value
                    .Where(entry => entry.Value >= cutoff)
                    .Select(entry => entry.Key)
                    .ToArray();

                if (tagsForWord.Length > 0)
                {
                    dictionary.Put(wordEntry.Key, tagsForWord);
                }
            }
        }
Ejemplo n.º 29
0
        /// <summary>
        /// Trains a parser model with the given parameters.
        /// </summary>
        /// <param name="monitor">
        /// An evaluation monitor that can be used to listen to the messages during the training, or to cancel the training operation.
        /// This argument can be a <c>null</c> value.
        /// </param>
        /// <param name="languageCode">The language code.</param>
        /// <param name="samples">The data samples. The stream is reset between the individual training passes.</param>
        /// <param name="rules">The head rules.</param>
        /// <param name="parameters">
        /// The machine learnable parameters. The "build", "tagger", "chunker" and "check" namespaces
        /// are used for the respective sub-models.
        /// </param>
        /// <returns>The trained <see cref="ParserModel" /> object.</returns>
        public static ParserModel Train(
            Monitor monitor,
            string languageCode,
            IObjectStream<Parse> samples,
            AbstractHeadRules rules,
            TrainingParameters parameters)
        {
            var dictionary = BuildDictionary(samples, rules, parameters);

            // Every pass consumes the sample stream, so it is rewound before the next one.
            samples.Reset();

            var manifest = new Dictionary<string, string>();

            // Build model.
            var buildEvents = new ParserEventStream(samples, rules, ParserEventTypeEnum.Build, dictionary);
            var buildReport = new Dictionary<string, string>();
            var buildTrainer = TrainerFactory.GetEventTrainer(parameters.GetNamespace("build"), buildReport, monitor);

            var buildModel = buildTrainer.Train(buildEvents);

            MergeReportIntoManifest(manifest, buildReport, "build");

            samples.Reset();

            // POS tagger model.
            // NOTE(review): the monitor is not forwarded to the tagger and chunker training calls - confirm this is intended.
            var posModel = POSTaggerME.Train(
                languageCode,
                new PosSampleStream(samples),
                parameters.GetNamespace("tagger"),
                new POSTaggerFactory());

            samples.Reset();

            // Chunker model.
            var chunkModel = ChunkerME.Train(
                languageCode,
                new ChunkSampleStream(samples),
                parameters.GetNamespace("chunker"),
                new ChunkerFactory());

            samples.Reset();

            // Check model.
            var checkEvents = new ParserEventStream(samples, rules, ParserEventTypeEnum.Check);
            var checkReport = new Dictionary<string, string>();
            var checkTrainer = TrainerFactory.GetEventTrainer(parameters.GetNamespace("check"), checkReport, monitor);

            var checkModel = checkTrainer.Train(checkEvents);

            MergeReportIntoManifest(manifest, checkReport, "check");

            return new ParserModel(
                languageCode,
                buildModel,
                checkModel,
                posModel,
                chunkModel,
                rules,
                ParserType.Chunking,
                manifest);
        }
Ejemplo n.º 30
0
        /// <summary>
        /// Trains a lemmatizer model with the given parameters.
        /// </summary>
        /// <param name="languageCode">The language code.</param>
        /// <param name="samples">The data samples.</param>
        /// <param name="parameters">The machine learnable parameters; they also determine the trainer type and the beam size.</param>
        /// <param name="factory">The lemmatizer factory; supplies the context generator.</param>
        /// <param name="monitor">
        /// An evaluation monitor that can be used to listen to the messages during the training, or to cancel the training
        /// operation.
        /// This argument can be a <c>null</c> value.
        /// </param>
        /// <returns>The trained <see cref="LemmatizerModel" /> object.</returns>
        /// <exception cref="System.InvalidOperationException">The trainer was not specified.</exception>
        /// <exception cref="System.NotSupportedException">Trainer type is not supported.</exception>
        public static LemmatizerModel Train(
            string languageCode,
            IObjectStream<LemmaSample> samples,
            TrainingParameters parameters,
            LemmatizerFactory factory,
            Monitor monitor)
        {
            var manifest = new Dictionary<string, string>();
            var beamSize = parameters.Get(Parameters.BeamSize, DefaultBeamSize);
            var contextGenerator = factory.GetContextGenerator();

            var trainerType = TrainerFactory.GetTrainerType(parameters);

            if (!trainerType.HasValue)
            {
                throw new InvalidOperationException("The trainer was not specified.");
            }

            // At most one of these is set by the switch below. The two event-based trainers
            // produce a maxent model; the sequence trainer produces a sequence classification model.
            IMaxentModel maxentModel = null;
            ML.Model.ISequenceClassificationModel<string> sequenceModel = null;

            switch (trainerType)
            {
                case TrainerType.EventModelTrainer:
                    var eventStream = new LemmaSampleEventStream(samples, contextGenerator);
                    var eventTrainer = TrainerFactory.GetEventTrainer(parameters, manifest, monitor);

                    maxentModel = eventTrainer.Train(eventStream);
                    break;

                case TrainerType.EventModelSequenceTrainer:
                    var eventSequenceStream = new LemmaSampleSequenceStream(samples, contextGenerator);
                    var eventSequenceTrainer = TrainerFactory.GetEventModelSequenceTrainer(parameters, manifest, monitor);

                    maxentModel = eventSequenceTrainer.Train(eventSequenceStream);
                    break;

                case TrainerType.SequenceTrainer:
                    var sequenceStream = new LemmaSampleSequenceStream(samples, contextGenerator);
                    var sequenceTrainer = TrainerFactory.GetSequenceModelTrainer(parameters, manifest, monitor);

                    sequenceModel = sequenceTrainer.Train(sequenceStream);
                    break;

                default:
                    throw new NotSupportedException("Trainer type is not supported.");
            }

            // The beam size is only passed along with a maxent model.
            if (maxentModel != null)
            {
                return new LemmatizerModel(languageCode, maxentModel, beamSize, manifest, factory);
            }

            return new LemmatizerModel(languageCode, sequenceModel, manifest, factory);
        }
Ejemplo n.º 31
0
        /// <summary>
        /// Creates an event stream based on the specified data stream of the specified type using the specified head rules.
        /// </summary>
        /// <param name="samples">A 1-parse-per-line Penn Treebank Style parse.</param>
        /// <param name="rules">The head rules.</param>
        /// <param name="eType">The type of events desired (tag, chunk, build, or check).</param>
        /// <param name="dictionary">A tri-gram dictionary to reduce feature generation.</param>
        /// <remarks>
        /// A context generator is created here only for the build and check event types.
        /// The dictionary is passed to the build generator; the check generator is created without it.
        /// </remarks>
        public ParserEventStream(
            IObjectStream<Parse> samples,
            AbstractHeadRules rules,
            ParserEventTypeEnum eType,
            Dictionary.Dictionary dictionary)
            : base(samples, rules, eType, dictionary)
        {
            if (eType == ParserEventTypeEnum.Build)
            {
                bcg = new BuildContextGenerator(dictionary);
            }
            else if (eType == ParserEventTypeEnum.Check)
            {
                kcg = new CheckContextGenerator();
            }
        }
Ejemplo n.º 32
0
        /// <summary>
        /// Initializes a new instance of the <see cref="DocumentCategorizerEventStream"/> class with the given feature generators.
        /// </summary>
        /// <param name="samples">The samples; forwarded to the base constructor.</param>
        /// <param name="featureGenerators">The feature generators; at least one is required.</param>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="featureGenerators"/> is <c>null</c>.
        /// </exception>
        /// <exception cref="System.ArgumentOutOfRangeException">
        /// <paramref name="featureGenerators"/> is empty.
        /// </exception>
        public DocumentCategorizerEventStream(
            IObjectStream<DocumentSample> samples,
            params IFeatureGenerator[] featureGenerators)
            : base(samples)
        {
            if (featureGenerators == null)
            {
                throw new ArgumentNullException(nameof(featureGenerators));
            }

            if (featureGenerators.Length == 0)
            {
                throw new ArgumentOutOfRangeException(nameof(featureGenerators));
            }

            cg = new DocumentCategorizerContextGenerator(featureGenerators);
        }
Ejemplo n.º 33
0
        /// <summary>
        /// Creates a new data indexer for the given event stream, chosen by <c>DataIndexerName</c>.
        /// </summary>
        /// <param name="events">The event stream.</param>
        /// <returns>
        /// A <c>OnePassDataIndexer</c> or a <c>TwoPassDataIndexer</c>, built with the current
        /// <c>Cutoff</c>, <c>IsSortAndMerge</c> and <c>Monitor</c> values.
        /// </returns>
        /// <exception cref="System.InvalidOperationException">
        /// <c>DataIndexerName</c> is neither the one-pass nor the two-pass indexer name.
        /// </exception>
        public IDataIndexer GetDataIndexer(IObjectStream<Event> events)
        {
            switch (DataIndexerName)
            {
                case Parameters.DataIndexers.OnePass:
                    return new OnePassDataIndexer(events, Cutoff, IsSortAndMerge, Monitor);

                case Parameters.DataIndexers.TwoPass:
                    return new TwoPassDataIndexer(events, Cutoff, IsSortAndMerge, Monitor);

                default:
                    throw new InvalidOperationException("Unexpected data indexer name: " + DataIndexerName);
            }
        }
Ejemplo n.º 34
0
        /// <summary>
        /// Trains a sentence detection model with the given parameters.
        /// </summary>
        /// <param name="languageCode">The language code.</param>
        /// <param name="samples">The data samples.</param>
        /// <param name="factory">The sentence detector factory; supplies the context generator and the end-of-sentence scanner.</param>
        /// <param name="parameters">The machine learnable parameters.</param>
        /// <param name="monitor">
        /// An evaluation monitor that can be used to listen to the messages during the training, or to cancel the training operation.
        /// This argument can be a <c>null</c> value.
        /// </param>
        /// <returns>The trained <see cref="SentenceModel"/> object.</returns>
        public static SentenceModel Train(
            string languageCode,
            IObjectStream<SentenceSample> samples,
            SentenceDetectorFactory factory,
            TrainingParameters parameters,
            Monitor monitor)
        {
            // The same dictionary is handed to the trainer factory and then to the resulting model.
            var manifest = new Dictionary<string, string>();

            // TODO: Fix the EventStream to throw exceptions when training goes wrong
            var events = new SentenceEventStream(
                samples,
                factory.GetContextGenerator(),
                factory.GetEndOfSentenceScanner());

            var eventTrainer = TrainerFactory.GetEventTrainer(parameters, manifest, monitor);
            var trainedModel = eventTrainer.Train(events);

            return new SentenceModel(languageCode, trainedModel, manifest, factory);
        }
Ejemplo n.º 35
0
        /// <summary>
        /// Initializes a new instance of the <see cref="BioNLP2004NameSampleStream"/> class
        /// that reads its input line by line as UTF-8 text.
        /// </summary>
        /// <param name="inputStream">The input stream.</param>
        /// <param name="types">The types value; stored as given.</param>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="inputStream"/> is <c>null</c>.
        /// </exception>
        /// <exception cref="System.ArgumentException">The input stream was not readable.</exception>
        public BioNLP2004NameSampleStream(Stream inputStream, int types)
        {
            if (inputStream == null)
            {
                throw new ArgumentNullException(nameof(inputStream));
            }

            if (!inputStream.CanRead)
            {
                throw new ArgumentException("The input stream was not readable.", nameof(inputStream));
            }

            lineStream = new PlainTextByLineStream(inputStream, Encoding.UTF8);
            this.types = types;
        }
Ejemplo n.º 36
0
        /// <summary>
        /// Evaluates the specified chunk samples using cross validation.
        /// </summary>
        /// <param name="samples">The chunk samples to be evaluated.</param>
        /// <param name="partitions">The partitions (folds).</param>
        /// <remarks>
        /// For every fold a chunker model is trained on the training stream, evaluated against the
        /// fold's test sample stream, and the resulting measure is merged into <c>FMeasure</c>.
        /// </remarks>
        public void Evaluate(IObjectStream<ChunkSample> samples, int partitions)
        {
            var folds = new CrossValidationPartitioner<ChunkSample>(samples, partitions);

            while (folds.HasNext)
            {
                var trainingStream = folds.Next();

                var foldModel = ChunkerME.Train(languageCode, trainingStream, parameters, chunkerFactory);

                var foldEvaluator = new ChunkerEvaluator(new ChunkerME(foldModel), listeners);

                // The test stream is requested only after training on this fold has finished.
                foldEvaluator.Evaluate(trainingStream.GetTestSampleStream());

                FMeasure.MergeInto(foldEvaluator.FMeasure);
            }
        }
Ejemplo n.º 37
0
        /// <summary>
        /// Trains a tree-insert parser model: builds the dictionary, then trains the POS tagger,
        /// the chunker and the build, check and attach models from the same sample stream.
        /// </summary>
        /// <param name="languageCode">The language code.</param>
        /// <param name="samples">The data samples. The stream is reset between the training stages.</param>
        /// <param name="rules">The head rules.</param>
        /// <param name="parameters">
        /// The machine learnable parameters. The <c>tagger</c>, <c>chunker</c>, <c>build</c>, <c>check</c>
        /// and <c>attach</c> namespaces are handed to the respective trainers.
        /// </param>
        /// <param name="monitor">
        /// An evaluation monitor that can be used to listen to the messages during the training or to cancel the training operation.
        /// This argument can be a <c>null</c> value.
        /// </param>
        /// <returns>The trained <see cref="ParserModel"/> object.</returns>
        /// <exception cref="System.NotSupportedException">Trainer type is not supported.</exception>
        public static ParserModel Train(
            string languageCode,
            IObjectStream<Parse> samples,
            AbstractHeadRules rules,
            TrainingParameters parameters,
            Monitor monitor) {

            var manifest = new Dictionary<string, string>();

#if DEBUG
            System.Diagnostics.Debug.Print("Building dictionary");
#endif

            var dict = BuildDictionary(samples, rules, parameters);

            // The shared sample stream is reset before each stage consumes it.
            samples.Reset();

            // Stage: POS tagger.
            var taggerModel = POSTaggerME.Train(
                languageCode,
                new PosSampleStream(samples),
                parameters.GetNamespace("tagger"),
                new POSTaggerFactory(),
                monitor);

            samples.Reset();

            // Stage: chunker.
            var chunkerModel = ChunkerME.Train(
                languageCode,
                new ChunkSampleStream(samples),
                parameters.GetNamespace("chunker"),
                new ParserChunkerFactory(),
                monitor);

            samples.Reset();

            // Stage: build model (the only event stream that receives the dictionary).
#if DEBUG
            System.Diagnostics.Debug.Print("Training builder");
#endif

            var buildEvents = new ParserEventStream(samples, rules, ParserEventTypeEnum.Build, dict);
            var buildReport = new Dictionary<string, string>();
            var buildModel = TrainerFactory
                .GetEventTrainer(parameters.GetNamespace("build"), buildReport, monitor)
                .Train(buildEvents);

            Chunking.Parser.MergeReportIntoManifest(manifest, buildReport, "build");

            samples.Reset();

            // Stage: check model.
#if DEBUG
            System.Diagnostics.Debug.Print("Training checker");
#endif
            var checkEvents = new ParserEventStream(samples, rules, ParserEventTypeEnum.Check);
            var checkReport = new Dictionary<string, string>();
            var checkModel = TrainerFactory
                .GetEventTrainer(parameters.GetNamespace("check"), checkReport, monitor)
                .Train(checkEvents);

            Chunking.Parser.MergeReportIntoManifest(manifest, checkReport, "check");

            samples.Reset();

            // Stage: attach model.
#if DEBUG
            System.Diagnostics.Debug.Print("Training attacher");
#endif
            var attachEventStream = new ParserEventStream(samples, rules, ParserEventTypeEnum.Attach);
            var attachReport = new Dictionary<string, string>();
            var attachModel = TrainerFactory
                .GetEventTrainer(parameters.GetNamespace("attach"), attachReport, monitor)
                .Train(attachEventStream);

            Chunking.Parser.MergeReportIntoManifest(manifest, attachReport, "attach");

            return new ParserModel(
                languageCode,
                buildModel,
                checkModel,
                attachModel,
                taggerModel,
                chunkerModel,
                rules,
                ParserType.TreeInsert,
                manifest);
        }
Ejemplo n.º 38
0
        /// <summary>
        /// Trains a parser model with the given parameters, without an evaluation monitor.
        /// </summary>
        /// <remarks>
        /// Convenience overload: forwards all arguments to the five-argument <c>Train</c> overload
        /// and passes <c>null</c> as its last argument.
        /// </remarks>
        /// <param name="languageCode">The language code.</param>
        /// <param name="samples">The data samples.</param>
        /// <param name="rules">The head rules.</param>
        /// <param name="parameters">The machine learnable parameters.</param>
        /// <returns>The trained <see cref="ParserModel"/> object.</returns>
        /// <exception cref="System.NotSupportedException">Trainer type is not supported.</exception>
        public static ParserModel Train(
            string languageCode,
            IObjectStream<Parse> samples,
            AbstractHeadRules rules,
            TrainingParameters parameters) {

            return Train(languageCode, samples, rules, parameters, null);
        }
Ejemplo n.º 39
0
        /// <summary>
        /// Trains a Part of Speech model, selecting the training strategy from the trainer type
        /// configured in <paramref name="parameters"/>.
        /// </summary>
        /// <param name="languageCode">The language code.</param>
        /// <param name="samples">The data samples.</param>
        /// <param name="parameters">The machine learnable parameters.</param>
        /// <param name="factory">The POS tagger factory; it supplies the context generator and is stored in the model.</param>
        /// <param name="monitor">
        /// An evaluation monitor that can be used to listen to the messages during the training or to cancel the training operation.
        /// This argument can be a <c>null</c> value.
        /// </param>
        /// <returns>The trained <see cref="POSModel"/> object.</returns>
        /// <exception cref="System.NotSupportedException">Trainer type is not supported.</exception>
        public static POSModel Train(string languageCode, IObjectStream<POSSample> samples, TrainingParameters parameters, POSTaggerFactory factory, Monitor monitor) {

            var generator = factory.GetPOSContextGenerator();
            var manifest = new Dictionary<string, string>();

            switch (TrainerFactory.GetTrainerType(parameters)) {
                case TrainerType.EventModelTrainer: {
                    var events = new POSSampleEventStream(samples, generator);
                    var eventTrainer = TrainerFactory.GetEventTrainer(parameters, manifest, monitor);

                    return new POSModel(languageCode, eventTrainer.Train(events), manifest, factory);
                }
                case TrainerType.EventModelSequenceTrainer: {
                    var sequences = new POSSampleSequenceStream(samples, generator);
                    var eventSequenceTrainer = TrainerFactory.GetEventModelSequenceTrainer(parameters, manifest, monitor);

                    return new POSModel(languageCode, eventSequenceTrainer.Train(sequences), manifest, factory);
                }
                case TrainerType.SequenceTrainer: {
                    var sequenceTrainer = TrainerFactory.GetSequenceModelTrainer(parameters, manifest, monitor);

                    // TODO: This will probably cause issue, since the feature generator uses the outcomes array
                    var sequences = new POSSampleSequenceStream(samples, generator);

                    return new POSModel(languageCode, sequenceTrainer.Train(sequences), manifest, factory);
                }
                default:
                    throw new NotSupportedException("Trainer type is not supported.");
            }
        }
Ejemplo n.º 40
0
 /// <summary>
 /// Initializes a new instance of the <see cref="NameSampleSequenceStream"/> class from a feature generator.
 /// </summary>
 /// <remarks>
 /// The feature generator is wrapped in a <see cref="DefaultNameContextGenerator"/> and
 /// <c>true</c> is passed as the <c>useOutcomes</c> argument of the chained constructor.
 /// </remarks>
 /// <param name="psi">The sample stream.</param>
 /// <param name="featureGen">The feature generator.</param>
 public NameSampleSequenceStream(IObjectStream<NameSample> psi, IAdaptiveFeatureGenerator featureGen)
     : this(psi, new DefaultNameContextGenerator(featureGen), true) {}
Ejemplo n.º 41
0
 /// <summary>
 /// Initializes a new instance of the <see cref="TwoPassDataIndexer"/> class using an event stream and a cutoff value.
 /// </summary>
 /// <remarks>
 /// Chains to the four-argument constructor, passing <c>true</c> as its third argument (sort)
 /// and <c>null</c> as its fourth argument.
 /// </remarks>
 /// <param name="eventStream">An event stream which contains a list of all the events seen in the training data.</param>
 /// <param name="cutoff">The minimum number of times a predicate must have been observed in order to be included in the model.</param>
 public TwoPassDataIndexer(IObjectStream<Event> eventStream, int cutoff) : this(eventStream, cutoff, true, null) { }
Ejemplo n.º 42
0
 /// <summary>
 /// Initializes a new instance of the <see cref="NameSampleSequenceStream"/> class from a context generator.
 /// </summary>
 /// <remarks>
 /// Chains to the three-argument constructor, passing <c>true</c> as the <c>useOutcomes</c> argument.
 /// </remarks>
 /// <param name="psi">The sample stream.</param>
 /// <param name="pcg">The context generator.</param>
 public NameSampleSequenceStream(IObjectStream<NameSample> psi, INameContextGenerator pcg)
     : this(psi, pcg, true) {}
Ejemplo n.º 43
0
 /// <summary>
 /// Trains a model for the <see cref="TokenizerME"/> without an evaluation monitor.
 /// </summary>
 /// <remarks>
 /// Convenience overload: forwards to the four-argument <c>Train</c> overload, passing <c>null</c> as its last argument.
 /// </remarks>
 /// <param name="samples">The samples used for the training.</param>
 /// <param name="factory">A <see cref="TokenizerFactory"/> to get resources from.</param>
 /// <param name="parameters">The machine learning train parameters.</param>
 /// <returns>The trained <see cref="TokenizerModel"/>.</returns>
 public static TokenizerModel Train(IObjectStream<TokenSample> samples, TokenizerFactory factory, TrainingParameters parameters) {
     return Train(samples, factory, parameters, null);
 }
Ejemplo n.º 44
0
        /// <summary>
        /// Trains a document categorizer model with the given parameters, without an evaluation monitor.
        /// </summary>
        /// <remarks>
        /// Convenience overload: forwards to the five-argument <c>Train</c> overload, passing <c>null</c> as its last argument.
        /// </remarks>
        /// <param name="languageCode">The language code.</param>
        /// <param name="samples">The data samples.</param>
        /// <param name="parameters">The machine learnable parameters.</param>
        /// <param name="factory">The document categorizer factory.</param>
        /// <returns>The trained <see cref="DocumentCategorizerModel"/> model.</returns>
        public static DocumentCategorizerModel Train(
            string languageCode,
            IObjectStream<DocumentSample> samples,
            TrainingParameters parameters,
            DocumentCategorizerFactory factory) {

            return Train(languageCode, samples, parameters, factory, null);
        }
Ejemplo n.º 45
0
        /// <summary>
        /// Trains a parser model with the given parameters: builds the dictionary, then trains the
        /// build model, the POS tagger, the chunker and the check model from the same sample stream.
        /// </summary>
        /// <param name="monitor">
        /// An evaluation monitor that can be used to listen to the messages during the training or to cancel the training operation.
        /// It is handed to every training stage. This argument can be a <c>null</c> value.
        /// </param>
        /// <param name="languageCode">The language code.</param>
        /// <param name="samples">The data samples. The stream is reset between the training stages.</param>
        /// <param name="rules">The head rules.</param>
        /// <param name="parameters">
        /// The machine learnable parameters. The <c>build</c>, <c>tagger</c>, <c>chunker</c> and <c>check</c>
        /// namespaces are handed to the respective trainers. When the <c>tagger</c> namespace has no beam size,
        /// a beam size of <c>10</c> is used for the tagger.
        /// </param>
        /// <returns>The trained <see cref="ParserModel" /> object.</returns>
        public static ParserModel Train(
            Monitor monitor,
            string languageCode,
            IObjectStream<Parse> samples,
            AbstractHeadRules rules,
            TrainingParameters parameters) {

            var dict = BuildDictionary(samples, rules, parameters);

            samples.Reset();

            var manifestInfoEntries = new Dictionary<string, string>();

            // build
            var bes = new ParserEventStream(samples, rules, ParserEventTypeEnum.Build, dict);
            var buildReportMap = new Dictionary<string, string>();
            var buildTrainer = TrainerFactory.GetEventTrainer(parameters.GetNamespace("build"), buildReportMap, monitor);

            var buildModel = buildTrainer.Train(bes);

            MergeReportIntoManifest(manifestInfoEntries, buildReportMap, "build");

            samples.Reset();

            // tag
            var posTaggerParams = parameters.GetNamespace("tagger");
            if (!posTaggerParams.Contains(Parameters.BeamSize))
                posTaggerParams.Set(Parameters.BeamSize, "10");

            // Train with the very object that received the beam size default; extracting the
            // namespace a second time could hand the trainer a copy without that default.
            var posModel = POSTaggerME.Train(
                languageCode,
                new PosSampleStream(samples),
                posTaggerParams,
                new POSTaggerFactory(),
                monitor);

            samples.Reset();

            // chunk
            var chunkModel = ChunkerME.Train(
                languageCode,
                new ChunkSampleStream(samples),
                parameters.GetNamespace("chunker"),
                new ParserChunkerFactory(),
                monitor);

            samples.Reset();

            // check
            var kes = new ParserEventStream(samples, rules, ParserEventTypeEnum.Check);
            var checkReportMap = new Dictionary<string, string>();
            var checkTrainer = TrainerFactory.GetEventTrainer(parameters.GetNamespace("check"), checkReportMap, monitor);

            var checkModel = checkTrainer.Train(kes);
            MergeReportIntoManifest(manifestInfoEntries, checkReportMap, "check");

            return new ParserModel(languageCode, buildModel, checkModel, posModel, chunkModel, rules, manifestInfoEntries);
        }
Ejemplo n.º 46
0
        /// <summary>
        /// Trains a model for the <see cref="TokenizerME"/> from token samples.
        /// </summary>
        /// <param name="samples">The samples used for the training.</param>
        /// <param name="factory">
        /// A <see cref="TokenizerFactory"/> to get resources from; its alphanumeric settings and
        /// context generator configure the event stream.
        /// </param>
        /// <param name="parameters">The machine learning train parameters.</param>
        /// <param name="monitor">
        /// An evaluation monitor that can be used to listen to the messages during the training or to cancel the training operation.
        /// This argument can be a <c>null</c> value.
        /// </param>
        /// <returns>The trained <see cref="TokenizerModel"/>.</returns>
        public static TokenizerModel Train(IObjectStream<TokenSample> samples, TokenizerFactory factory, TrainingParameters parameters, Monitor monitor) {
            var manifest = new Dictionary<string, string>();

            var events = new TokSpanEventStream(
                samples,
                factory.UseAlphaNumericOptimization,
                factory.AlphaNumericPattern,
                factory.ContextGenerator);

            var trainedModel = TrainerFactory
                .GetEventTrainer(parameters, manifest, monitor)
                .Train(events);

            return new TokenizerModel(trainedModel, manifest, factory);
        }
Ejemplo n.º 47
0
 /// <summary>
 /// Trains a name finder model using the default type.
 /// </summary>
 /// <remarks>
 /// Convenience overload: forwards to the overload that takes a type argument, passing <c>DefaultType</c> for it.
 /// </remarks>
 /// <param name="languageCode">The language of the training data.</param>
 /// <param name="samples">The training samples.</param>
 /// <param name="parameters">The machine learning train parameters.</param>
 /// <param name="factory">The name finder factory.</param>
 /// <param name="monitor">
 /// An evaluation monitor that can be used to listen to the messages during the training or to cancel the training operation.
 /// This argument can be a <c>null</c> value.</param>
 /// <returns>The newly trained <see cref="TokenNameFinderModel"/>.</returns>
 public static TokenNameFinderModel Train(string languageCode, IObjectStream<NameSample> samples, TrainingParameters parameters, TokenNameFinderFactory factory, Monitor monitor) {
     return Train(languageCode, DefaultType, samples, parameters, factory, monitor);
 }
Ejemplo n.º 48
0
        /// <summary>
        /// Trains a name finder model with the given parameters, without an evaluation monitor.
        /// </summary>
        /// <remarks>
        /// Convenience overload: forwards to the six-argument <c>Train</c> overload, passing <c>null</c> as its last argument.
        /// </remarks>
        /// <param name="languageCode">The language of the training data.</param>
        /// <param name="type">Overrides the type parameter in the provided samples. This value can be null.</param>
        /// <param name="samples">The training samples.</param>
        /// <param name="parameters">The machine learning train parameters.</param>
        /// <param name="factory">The name finder factory.</param>
        /// <returns>The newly trained <see cref="TokenNameFinderModel"/>.</returns>
        public static TokenNameFinderModel Train(
            string languageCode,
            string type,
            IObjectStream<NameSample> samples,
            TrainingParameters parameters,
            TokenNameFinderFactory factory) {

            return Train(languageCode, type, samples, parameters, factory, null);
        }
Ejemplo n.º 49
0
 /// <summary>
 /// Initializes a new instance of the <see cref="NameSampleSequenceStream"/> class using a new <see cref="BioCodec"/> as sequence codec.
 /// </summary>
 /// <remarks>
 /// Chains to the four-argument constructor, forwarding every argument unchanged.
 /// </remarks>
 /// <param name="psi">The sample stream.</param>
 /// <param name="pcg">The context generator.</param>
 /// <param name="useOutcomes">The use-outcomes flag stored by the chained constructor.</param>
 public NameSampleSequenceStream(IObjectStream<NameSample> psi, INameContextGenerator pcg, bool useOutcomes)
     : this(psi, pcg, useOutcomes, new BioCodec()) {}
Ejemplo n.º 50
0
        /// <summary>
        /// Trains a parser model from an iteration count and a cutoff, without an evaluation monitor.
        /// </summary>
        /// <remarks>
        /// Convenience overload: forwards to the six-argument <c>Train</c> overload, passing <c>null</c> as its last argument.
        /// </remarks>
        /// <param name="languageCode">The language code.</param>
        /// <param name="samples">The data samples.</param>
        /// <param name="rules">The head rules.</param>
        /// <param name="iterations">The number of training iterations.</param>
        /// <param name="cutoff">The min number of times a feature must be seen.</param>
        /// <returns>The trained <see cref="ParserModel"/> object.</returns>
        /// <exception cref="System.NotSupportedException">Trainer type is not supported.</exception>
        public static ParserModel Train(string languageCode, IObjectStream<Parse> samples, AbstractHeadRules rules,
            int iterations, int cutoff) {

            return Train(languageCode, samples, rules, iterations, cutoff, null);
        }
Ejemplo n.º 51
0
        /// <summary>
        /// Trains a parser model from an iteration count and a cutoff by translating them into
        /// namespaced training parameters and delegating to the parameter based overload.
        /// </summary>
        /// <param name="languageCode">The language code.</param>
        /// <param name="samples">The data samples.</param>
        /// <param name="rules">The head rules.</param>
        /// <param name="iterations">The number of training iterations; set for the tagger, chunker, check and build namespaces.</param>
        /// <param name="cutoff">The min number of times a feature must be seen; set for the dict, tagger, chunker, check and build namespaces.</param>
        /// <param name="monitor">
        /// An evaluation monitor that can be used to listen to the messages during the training or to cancel the training operation.
        /// This argument can be a <c>null</c> value.
        /// </param>
        /// <returns>The trained <see cref="ParserModel"/> object.</returns>
        /// <exception cref="System.NotSupportedException">Trainer type is not supported.</exception>
        public static ParserModel Train(string languageCode, IObjectStream<Parse> samples, AbstractHeadRules rules, int iterations, int cutoff, Monitor monitor) {

            var param = new TrainingParameters();

            // Numbers are stored as culture-invariant text.
            var cutoffText = cutoff.ToString(CultureInfo.InvariantCulture);
            var iterationsText = iterations.ToString(CultureInfo.InvariantCulture);

            // The dictionary only takes a cutoff.
            param.Set("dict", Parameters.Cutoff, cutoffText);

            // Every trained component takes both values.
            foreach (var component in new[] { "tagger", "chunker", "check", "build" }) {
                param.Set(component, Parameters.Cutoff, cutoffText);
                param.Set(component, Parameters.Iterations, iterationsText);
            }

            return Train(languageCode, samples, rules, param, monitor);
        }
Ejemplo n.º 52
0
        /// <summary>
        /// Reads all events from the stream and converts each one into a <see cref="ComparableEvent"/>
        /// made of an outcome id and the ids of its context predicates found in <paramref name="predicateIndex"/>.
        /// </summary>
        /// <remarks>
        /// Outcome ids are assigned in order of first appearance. Events whose context contains no indexed
        /// predicate are dropped (a warning is sent to the monitor, when there is one). As a side effect the
        /// <c>outcomeLabels</c> and <c>predLabels</c> fields are replaced with the labels of both indexes.
        /// </remarks>
        /// <param name="indexEventStream">The event stream to be indexed; it is read until it returns <c>null</c>.</param>
        /// <param name="predicateIndex">The predicate to id mapping.</param>
        /// <returns>The list of indexed events.</returns>
        private List<ComparableEvent> Index(
            IObjectStream<Event> indexEventStream,
            Dictionary<string, int> predicateIndex) {

            var map = new Dictionary<string, int>();
            var indexedContext = new List<int>();
            var eventsToCompare = new List<ComparableEvent>();

            Event ev;
            while ((ev = indexEventStream.Read()) != null) {

                if (Monitor != null && Monitor.Token.CanBeCanceled)
                    Monitor.Token.ThrowIfCancellationRequested();

                // Single lookup; an unseen outcome gets the next free id (ids are dense, so that is the current count).
                // The outcome is registered even when the event itself ends up being dropped below.
                int ocID;
                if (!map.TryGetValue(ev.Outcome, out ocID)) {
                    ocID = map.Count;
                    map[ev.Outcome] = ocID;
                }

                foreach (var pred in ev.Context) {
                    int predID;
                    if (predicateIndex.TryGetValue(pred, out predID)) {
                        indexedContext.Add(predID);
                    }
                }

                // drop events with no active features
                if (indexedContext.Count > 0) {
                    eventsToCompare.Add(new ComparableEvent(ocID, indexedContext.ToArray()));
                } else if (Monitor != null) {
                    Monitor.OnWarning(string.Format("Dropped event {0}:{1}", ev.Outcome, ev.Context.ToDisplay()));
                }

                // The scratch list is reused for the next event.
                indexedContext.Clear();
            }

            outcomeLabels = ToIndexedStringArray(map);
            predLabels = ToIndexedStringArray(predicateIndex);
            return eventsToCompare;
        }
Ejemplo n.º 53
0
        /// <summary>
        /// Trains a document categorizer model from document samples.
        /// </summary>
        /// <param name="languageCode">The language code.</param>
        /// <param name="samples">The data samples.</param>
        /// <param name="parameters">The machine learnable parameters.</param>
        /// <param name="factory">The document categorizer factory; its feature generators configure the event stream.</param>
        /// <param name="monitor">
        /// An evaluation monitor that can be used to listen to the messages during the training or to cancel the training operation.
        /// This argument can be a <c>null</c> value.
        /// </param>
        /// <returns>The trained <see cref="DocumentCategorizerModel"/> model.</returns>
        public static DocumentCategorizerModel Train(string languageCode, IObjectStream<DocumentSample> samples, TrainingParameters parameters, DocumentCategorizerFactory factory, Monitor monitor) {

            var manifest = new Dictionary<string, string>();

            var events = new DocumentCategorizerEventStream(samples, factory.FeatureGenerators);

            var trainedModel = TrainerFactory
                .GetEventTrainer(parameters, manifest, monitor)
                .Train(events);

            return new DocumentCategorizerModel(languageCode, trainedModel, manifest, factory);
        }
Ejemplo n.º 54
0
 /// <summary>
 /// Initializes a new instance of the <see cref="NameSampleSequenceStream"/> class from a feature generator.
 /// </summary>
 /// <remarks>
 /// The feature generator is wrapped in a <see cref="DefaultNameContextGenerator"/> before chaining
 /// to the constructor that takes a context generator.
 /// </remarks>
 /// <param name="psi">The sample stream.</param>
 /// <param name="featureGen">The feature generator.</param>
 /// <param name="useOutcomes">The use-outcomes flag forwarded to the chained constructor.</param>
 public NameSampleSequenceStream(IObjectStream<NameSample> psi, IAdaptiveFeatureGenerator featureGen,
     bool useOutcomes) : this(psi, new DefaultNameContextGenerator(featureGen), useOutcomes) {}
Ejemplo n.º 55
0
        /// <summary>
        /// Builds the NGram dictionary from the sentences of the given samples.
        /// </summary>
        /// <remarks>
        /// Every non-empty sentence is added to an <see cref="NGramModel"/> with both length arguments set to 1;
        /// the model is then cut off with <paramref name="cutoff"/> as lower and <see cref="int.MaxValue"/> as upper bound.
        /// </remarks>
        /// <param name="samples">The samples; the stream is read until it returns <c>null</c>.</param>
        /// <param name="cutoff">The cutoff.</param>
        /// <returns>The NGram dictionary.</returns>
        public static Dict BuildNGramDictionary(IObjectStream<POSSample> samples, int cutoff) {

            var ngrams = new NGramModel();

            for (var current = samples.Read(); current != null; current = samples.Read()) {
                // Empty sentences contribute nothing.
                if (current.Sentence.Length <= 0)
                    continue;

                ngrams.Add(new StringList(current.Sentence), 1, 1);
            }

            ngrams.CutOff(cutoff, int.MaxValue);

            return ngrams.ToDictionary();
        }
Ejemplo n.º 56
0
        /// <summary>
        /// Expands the given tag dictionary with the word/tag pairs observed in the samples.
        /// </summary>
        /// <remarks>
        /// Words containing a digit are ignored. For every other word the occurrences of each tag are counted;
        /// tags the dictionary already knows for the word start with a count equal to <paramref name="cutoff"/>.
        /// After all samples are read, each word is stored with the tags whose count is at least
        /// <paramref name="cutoff"/>; words without any such tag are not stored.
        /// </remarks>
        /// <param name="samples">The samples; the stream is read until it returns <c>null</c>.</param>
        /// <param name="dictionary">The tag dictionary to be queried and updated.</param>
        /// <param name="caseSensitive">if set to <c>false</c> the words are lower-cased (invariant culture) before being counted.</param>
        /// <param name="cutoff">The minimum count a word/tag pair needs in order to be stored.</param>
        public static void PopulatePOSDictionary(IObjectStream<POSSample> samples, IMutableTagDictionary dictionary, bool caseSensitive, int cutoff) {

            // word -> (tag -> number of occurrences)
            var newEntries = new Dictionary<string, Dictionary<string, int>>();
            POSSample sample;
            while ((sample = samples.Read()) != null) {

                for (int i = 0; i < sample.Sentence.Length; i++) {
                    var token = sample.Sentence[i];

                    // only store words
                    if (StringPattern.Recognize(token).ContainsDigit)
                        continue;

                    string word = caseSensitive ? token : token.ToLowerInvariant();

                    // Resolve the per-word counters once instead of on every access.
                    Dictionary<string, int> tagCounts;
                    if (!newEntries.TryGetValue(word, out tagCounts)) {
                        tagCounts = new Dictionary<string, int>();
                        newEntries.Add(word, tagCounts);
                    }

                    var dicTags = dictionary.GetTags(word);
                    if (dicTags != null) {
                        foreach (var tag in dicTags) {
                            // Tags already in the dictionary start at the cutoff, so they are always kept.
                            if (!tagCounts.ContainsKey(tag)) {
                                tagCounts.Add(tag, cutoff);
                            }
                        }
                    }

                    var sampleTag = sample.Tags[i];
                    int seen;
                    if (tagCounts.TryGetValue(sampleTag, out seen)) {
                        tagCounts[sampleTag] = seen + 1;
                    } else {
                        tagCounts.Add(sampleTag, 1);
                    }
                }
            }

            // Store the word/tag pairs that were seen often enough.
            foreach (var wordEntry in newEntries) {
                var tagsForWord = wordEntry.Value
                    .Where(entry => entry.Value >= cutoff)
                    .Select(entry => entry.Key)
                    .ToArray();

                if (tagsForWord.Length > 0)
                    dictionary.Put(wordEntry.Key, tagsForWord);
            }
        }
Ejemplo n.º 57
0
 /// <summary>
 /// Initializes a new instance of the <see cref="TwoPassDataIndexer"/> class using only an event stream.
 /// </summary>
 /// <remarks>
 /// Chains to the two-argument constructor with a cutoff of <c>0</c>.
 /// </remarks>
 /// <param name="eventStream">An event stream which contains a list of all the events seen in the training data.</param>
 public TwoPassDataIndexer(IObjectStream<Event> eventStream) : this(eventStream, 0) { }
Ejemplo n.º 58
0
 /// <summary>
 /// Trains a Part of Speech model with the given parameters, without an evaluation monitor.
 /// </summary>
 /// <remarks>
 /// Convenience overload: forwards to the five-argument <c>Train</c> overload, passing <c>null</c> as its last argument.
 /// </remarks>
 /// <param name="languageCode">The language code.</param>
 /// <param name="samples">The data samples.</param>
 /// <param name="parameters">The machine learnable parameters.</param>
 /// <param name="factory">The POS tagger factory.</param>
 /// <returns>The trained <see cref="POSModel"/> object.</returns>
 /// <exception cref="System.NotSupportedException">Trainer type is not supported.</exception>
 public static POSModel Train(string languageCode, IObjectStream<POSSample> samples,
     TrainingParameters parameters, POSTaggerFactory factory) {
     return Train(languageCode, samples, parameters, factory, null);
 }
Ejemplo n.º 59
0
        /// <summary>
        /// Initializes a new instance of the <see cref="TwoPassDataIndexer"/> class using an event stream,
        /// a cutoff value and a value that indicates if the events should be sorted.
        /// </summary>
        /// <remarks>
        /// Chains to the four-argument constructor, passing <c>null</c> as its last argument.
        /// </remarks>
        /// <param name="eventStream">An event stream which contains a list of all the events seen in the training data.</param>
        /// <param name="cutoff">The minimum number of times a predicate must have been observed in order to be included in the model.</param>
        /// <param name="sort">if set to <c>true</c> the events will be sorted.</param>
        public TwoPassDataIndexer(IObjectStream<Event> eventStream, int cutoff, bool sort)
            : this(eventStream, cutoff, sort, null) {

        }
Ejemplo n.º 60
0
        /// <summary>
        /// Trains a name finder model, selecting the training strategy from the trainer type
        /// configured in <paramref name="parameters"/>.
        /// </summary>
        /// <remarks>
        /// The event and event-sequence trainers produce a maxent model, which is stored together with the beam size
        /// read from <paramref name="parameters"/>; the sequence trainer produces a sequence classification model.
        /// </remarks>
        /// <param name="languageCode">The language of the training data.</param>
        /// <param name="type">
        /// Overrides the type parameter in the provided samples. This value can be null.
        /// It is only handed to the event stream of the event model trainer.
        /// </param>
        /// <param name="samples">The training samples.</param>
        /// <param name="parameters">The machine learning train parameters.</param>
        /// <param name="factory">The name finder factory.</param>
        /// <param name="monitor">
        /// An evaluation monitor that can be used to listen to the messages during the training or to cancel the training operation.
        /// This argument can be a <c>null</c> value.</param>
        /// <returns>The newly trained <see cref="TokenNameFinderModel"/>.</returns>
        /// <exception cref="System.InvalidOperationException">The trainer type is not one of the expected values.</exception>
        public static TokenNameFinderModel Train(string languageCode, string type, IObjectStream<NameSample> samples, TrainingParameters parameters, TokenNameFinderFactory factory, Monitor monitor) {
            var beamSize = parameters.Get(Parameters.BeamSize, DefaultBeamSize);
            var manifest = new Dictionary<string, string>();

            // Exactly one of these is assigned by the switch below.
            IMaxentModel maxentModel = null;
            ML.Model.ISequenceClassificationModel<string> sequenceModel = null;

            switch (TrainerFactory.GetTrainerType(parameters)) {
                case TrainerType.EventModelTrainer: {
                    var events = new NameFinderEventStream(
                        samples,
                        type,
                        factory.CreateContextGenerator(),
                        factory.CreateSequenceCodec());
                    var eventTrainer = TrainerFactory.GetEventTrainer(parameters, manifest, monitor);

                    maxentModel = eventTrainer.Train(events);
                    break;
                }
                case TrainerType.EventModelSequenceTrainer: {
                    var sequences = new NameSampleSequenceStream(samples, factory.CreateContextGenerator());
                    var eventSequenceTrainer = TrainerFactory.GetEventModelSequenceTrainer(parameters, manifest, monitor);

                    maxentModel = eventSequenceTrainer.Train(sequences);
                    break;
                }
                case TrainerType.SequenceTrainer: {
                    var sequences = new NameSampleSequenceStream(samples, factory.CreateContextGenerator());
                    var sequenceTrainer = TrainerFactory.GetSequenceModelTrainer(parameters, manifest, monitor);

                    sequenceModel = sequenceTrainer.Train(sequences);
                    break;
                }
                default:
                    throw new InvalidOperationException("Unexpected trainer type!");
            }

            // No sequence model means one of the maxent based trainers ran.
            if (sequenceModel == null) {
                return new TokenNameFinderModel(
                    languageCode,
                    maxentModel,
                    beamSize,
                    factory.FeatureGenerator,
                    factory.Resources,
                    manifest,
                    factory.SequenceCodec,
                    factory);
            }

            return new TokenNameFinderModel(
                languageCode,
                sequenceModel,
                factory.FeatureGenerator,
                factory.Resources,
                manifest,
                factory.SequenceCodec,
                factory);
        }