Пример #1
0
        /// <summary>
        /// Builds a GIS model from in-memory training samples, each one pairing an email
        /// with a flag that tells whether the email is invalid.
        /// </summary>
        /// <param name="emailsAndValidities">The training samples (email plus invalidity flag)</param>
        /// <param name="iterations">The iteration count handed to the trainer</param>
        /// <param name="cut">The cut value handed to the trainer</param>
        /// <returns>A GisModel created from the trainer after training has run</returns>
        public static GisModel TrainModel(IEnumerable<EmailAndValidity> emailsAndValidities, int iterations, int cut)
        {
            var gisTrainer = new GisTrainer();

            // The samples are exposed to the trainer through a dedicated event reader.
            gisTrainer.TrainModel(new InvalidEmailDetectionDataEventReader(emailsAndValidities), iterations, cut);

            return new GisModel(gisTrainer);
        }
Пример #2
0
        /// <summary>
        /// Trains a tokenizer model from input files that are well formatted for
        /// a token event reader.
        /// </summary>
        /// <param name="inputFiles">The collection of training input files</param>
        /// <param name="iterations">The number of iterations to run when training the model</param>
        /// <param name="cut">The minimum number of occurrences for statistical relevancy in the trained model</param>
        /// <param name="splitMarker">The character indicating a split in the files</param>
        /// <returns>The freshly trained GisModel</returns>
        public static GisModel Train(IEnumerable<string> inputFiles, int iterations, int cut, char splitMarker = '|')
        {
            var trainer = new GisTrainer(0.1);

            foreach (var inputFile in inputFiles)
            {
                // Dispose each reader so the underlying file handle is released.
                // Assumes TrainModel has finished reading the events by the time it returns.
                using (var dataReader = new StreamReader(inputFile))
                {
                    var eventReader = new TokenEventReader(dataReader, splitMarker);

                    trainer.TrainModel(iterations, new TwoPassDataIndexer(eventReader, cut));
                }
            }

            // NOTE(review): TrainModel runs once per file on the same trainer; whether
            // successive calls accumulate or replace earlier training is not visible here - confirm.
            return new GisModel(trainer);
        }
Пример #3
0
        /// <summary>
        /// Trains a model from several text files read together: all files are opened,
        /// combined through a MultipleFilesPlainTextByLineDataReader and fed to a
        /// SentenceDetectionEventReader that uses the given scanner.
        /// </summary>
        /// <param name="filePaths">The paths of the training files</param>
        /// <param name="iterations">The number of iterations passed to the trainer</param>
        /// <param name="cut">The cut value passed to the trainer</param>
        /// <param name="scanner">The end-of-sentence scanner handed to the event reader</param>
        /// <returns>The GisModel built from the trainer after training</returns>
        public static GisModel TrainModel(IEnumerable<string> filePaths, int iterations, int cut, IEndOfSentenceScanner scanner)
        {
            var trainer = new GisTrainer();
            var readers = new List<StreamReader>();

            try
            {
                // Opened inside the try so that readers which were already opened are
                // still released when a later file cannot be opened.
                foreach (var path in filePaths)
                {
                    readers.Add(new StreamReader(path));
                }

                // train the model
                ITrainingDataReader<string> dataReader = new MultipleFilesPlainTextByLineDataReader(readers);
                ITrainingEventReader eventReader = new SentenceDetectionEventReader(dataReader, scanner);

                trainer.TrainModel(eventReader, iterations, cut);

                // The model is created before the readers are disposed by the finally block.
                return new GisModel(trainer);
            }
            finally
            {
                // Release every file handle; disposing a StreamReader more than once is harmless.
                foreach (var reader in readers)
                {
                    reader.Dispose();
                }
            }
        }
Пример #4
0
        /// <summary>
        /// Trains a model from a set of plain-text files in which every line holds an
        /// email followed by a flag, for example:
        /// someone@example.com  0
        /// mqsldkqsmlqsmdklqs@sdlsqjd  1
        /// ...
        /// In this example the first line is a valid email and the second an invalid one.
        /// </summary>
        /// <param name="filePaths">The paths of the training files</param>
        /// <param name="iterations">The iteration count handed to the trainer</param>
        /// <param name="cut">The cut value handed to the trainer</param>
        /// <returns>A GisModel created from the trainer after all files were processed</returns>
        public static GisModel TrainModel(IEnumerable<string> filePaths, int iterations, int cut)
        {
            var gisTrainer = new GisTrainer();

            foreach (var path in filePaths)
            {
                // Each file is opened, used for one training call, then closed again.
                using (var fileReader = new StreamReader(path))
                {
                    ITrainingDataReader<string> lines = new PlainTextByLineDataReader(fileReader);
                    ITrainingEventReader events = new InvalidEmailDetectionEventReader(lines);

                    gisTrainer.TrainModel(events, iterations, cut);
                }
            }

            return new GisModel(gisTrainer);
        }
Пример #5
0
        /// <summary>
        /// Trains a model file by file: each file is read line by line and turned into
        /// training events by a SentenceDetectionEventReader using the given scanner.
        /// </summary>
        /// <param name="files">The paths of the training files</param>
        /// <param name="iterations">The iteration count handed to the trainer</param>
        /// <param name="cut">The cut value handed to the trainer</param>
        /// <param name="scanner">The end-of-sentence scanner handed to the event reader</param>
        /// <returns>A GisModel created from the trainer after all files were processed</returns>
        public static GisModel TrainModel(IEnumerable<string> files, int iterations, int cut, IEndOfSentenceScanner scanner)
        {
            var gisTrainer = new GisTrainer();

            foreach (var path in files)
            {
                // Each file is opened, used for one training call, then closed again.
                using (var fileReader = new StreamReader(path))
                {
                    ITrainingDataReader<string> lines = new PlainTextByLineDataReader(fileReader);
                    ITrainingEventReader events = new SentenceDetectionEventReader(lines, scanner);

                    gisTrainer.TrainModel(events, iterations, cut);
                }
            }

            return new GisModel(gisTrainer);
        }
Пример #6
0
        /// <summary>
        /// Trains a tokenizer model from input files that are well formatted for
        /// a token event reader. All files are read through a single
        /// MultipleFileTokenEventReader and used for one training call.
        /// </summary>
        /// <param name="inputFilePaths">The collection of training input files</param>
        /// <param name="iterations">The number of iterations to run when training the model</param>
        /// <param name="cut">The minimum number of occurrences for statistical relevancy in the trained model</param>
        /// <param name="splitMarker">The character indicating a split in the files</param>
        /// <param name="includeAllCapsExamples">Forwarded to the MultipleFileTokenEventReader; presumably asks it to also produce all-caps examples (confirm against the reader)</param>
        /// <returns>The freshly trained GisModel</returns>
        public static GisModel Train(IEnumerable<string> inputFilePaths, int iterations, int cut, char splitMarker = '|', bool includeAllCapsExamples = false)
        {
            var trainer = new GisTrainer(0.1);

            var dataReaders = new List<StreamReader>();

            try
            {
                // Opened inside the try so that readers which were already opened are
                // still released when a later file cannot be opened.
                foreach (var path in inputFilePaths)
                {
                    dataReaders.Add(new StreamReader(path));
                }

                // train the model
                var eventReader = new MultipleFileTokenEventReader(dataReaders, splitMarker, includeAllCapsExamples);

                trainer.TrainModel(iterations, new TwoPassDataIndexer(eventReader, cut));

                // The model is created before the readers are disposed by the finally block.
                return new GisModel(trainer);
            }
            finally
            {
                // Release every file handle; disposing a StreamReader more than once is harmless.
                foreach (var dataReader in dataReaders)
                {
                    dataReader.Dispose();
                }
            }
        }
Пример #7
0
	    /// <summary>
	    /// Trains a tokenizer model from input files that are well formatted for
	    /// a token event reader. All files are read through a single
	    /// MultipleFileTokenEventReader and used for one training call.
	    /// </summary>
	    /// <param name="inputFilePaths">The collection of training input files</param>
	    /// <param name="iterations">The number of iterations to run when training the model</param>
	    /// <param name="cut">The minimum number of occurrences for statistical relevancy in the trained model</param>
	    /// <param name="splitMarker">The character indicating a split in the files</param>
	    /// <param name="includeAllCapsExamples">Forwarded to the MultipleFileTokenEventReader; presumably asks it to also produce all-caps examples (confirm against the reader)</param>
	    /// <returns>The freshly trained GisModel</returns>
        public static GisModel Train(IEnumerable<string> inputFilePaths, int iterations, int cut, char splitMarker = '|', bool includeAllCapsExamples = false)
        {
            var trainer = new GisTrainer(0.1);

            var dataReaders = new List<StreamReader>();

            try
            {
                // Opened inside the try so that readers which were already opened are
                // still released when a later file cannot be opened.
                foreach (var path in inputFilePaths)
                {
                    dataReaders.Add(new StreamReader(path));
                }

                // train the model
                var eventReader = new MultipleFileTokenEventReader(dataReaders, splitMarker, includeAllCapsExamples);
                trainer.TrainModel(iterations, new TwoPassDataIndexer(eventReader, cut));

                // The model is created before the readers are disposed by the finally block.
                return new GisModel(trainer);
            }
            finally
            {
                // Release every file handle; disposing a StreamReader more than once is harmless.
                foreach (var dataReader in dataReaders)
                {
                    dataReader.Dispose();
                }
            }
        }
        /// <summary>
        /// Trains a tokenizer model from input files that are well formatted for
        /// a token event reader.
        /// </summary>
        /// <param name="inputFiles">The collection of training input files</param>
        /// <param name="iterations">The number of iterations to run when training the model</param>
        /// <param name="cut">The minimum number of occurrences for statistical relevancy in the trained model</param>
        /// <param name="splitMarker">The character indicating a split in the files</param>
        /// <returns>The freshly trained GisModel</returns>
        public static GisModel Train(IEnumerable<string> inputFiles, int iterations, int cut, char splitMarker = '|')
        {
            var trainer = new GisTrainer(0.1);
            foreach (var inputFile in inputFiles)
            {
                // Dispose each reader so the underlying file handle is released.
                // Assumes TrainModel has finished reading the events by the time it returns.
                using (var dataReader = new StreamReader(inputFile))
                {
                    var eventReader = new TokenEventReader(dataReader, splitMarker);

                    trainer.TrainModel(iterations, new TwoPassDataIndexer(eventReader, cut));
                }
            }

            // NOTE(review): TrainModel runs once per file on the same trainer; whether
            // successive calls accumulate or replace earlier training is not visible here - confirm.
            return new GisModel(trainer);
        }
        /// <summary>
        /// Trains a model from several text files read together: all files are opened,
        /// combined through a MultipleFilesPlainTextByLineDataReader and fed to a
        /// SentenceDetectionEventReader that uses the given scanner.
        /// </summary>
        /// <param name="filePaths">The paths of the training files</param>
        /// <param name="iterations">The number of iterations passed to the trainer</param>
        /// <param name="cut">The cut value passed to the trainer</param>
        /// <param name="scanner">The end-of-sentence scanner handed to the event reader</param>
        /// <returns>The GisModel built from the trainer after training</returns>
        public static GisModel TrainModel(IEnumerable<string> filePaths, int iterations, int cut, IEndOfSentenceScanner scanner)
        {
            var trainer = new GisTrainer();
            var readers = new List<StreamReader>();

            try
            {
                // Opened inside the try so that readers which were already opened are
                // still released when a later file cannot be opened.
                foreach (var path in filePaths)
                {
                    readers.Add(new StreamReader(path));
                }

                // train the model
                ITrainingDataReader<string> dataReader = new MultipleFilesPlainTextByLineDataReader(readers);
                ITrainingEventReader eventReader = new SentenceDetectionEventReader(dataReader, scanner);

                trainer.TrainModel(eventReader, iterations, cut);

                // The model is created before the readers are disposed by the finally block.
                return new GisModel(trainer);
            }
            finally
            {
                // Release every file handle; disposing a StreamReader more than once is harmless.
                foreach (var reader in readers)
                {
                    reader.Dispose();
                }
            }
        }
        /// <summary>
        /// Trains a model file by file: each file is read line by line and turned into
        /// training events by a SentenceDetectionEventReader using the given scanner.
        /// </summary>
        /// <param name="files">The paths of the training files</param>
        /// <param name="iterations">The iteration count handed to the trainer</param>
        /// <param name="cut">The cut value handed to the trainer</param>
        /// <param name="scanner">The end-of-sentence scanner handed to the event reader</param>
        /// <returns>A GisModel created from the trainer after all files were processed</returns>
        public static GisModel TrainModel(IEnumerable<string> files, int iterations, int cut, IEndOfSentenceScanner scanner)
        {
            var gisTrainer = new GisTrainer();

            foreach (var path in files)
            {
                // Each file is opened, used for one training call, then closed again.
                using (var fileReader = new StreamReader(path))
                {
                    ITrainingDataReader<string> lines = new PlainTextByLineDataReader(fileReader);
                    ITrainingEventReader events = new SentenceDetectionEventReader(lines, scanner);

                    gisTrainer.TrainModel(events, iterations, cut);
                }
            }

            return new GisModel(gisTrainer);
        }