/// <summary>
/// Builds a GIS model from in-memory email samples, each paired with a flag
/// telling whether the email is invalid.
/// </summary>
/// <param name="emailsAndValidities">The email samples together with their invalidity flag</param>
/// <param name="iterations">The number of training iterations, forwarded to the trainer</param>
/// <param name="cut">The cut value, forwarded to the trainer</param>
/// <returns>A GisModel constructed from the trainer after training has run</returns>
public static GisModel TrainModel(IEnumerable<EmailAndValidity> emailsAndValidities, int iterations, int cut)
{
    var gisTrainer = new GisTrainer();

    // The samples are exposed to the trainer through a dedicated event reader.
    gisTrainer.TrainModel(new InvalidEmailDetectionDataEventReader(emailsAndValidities), iterations, cut);

    return new GisModel(gisTrainer);
}
/// <summary>
/// Trains a tokenizer model from input files well formatted for
/// a token event reader.
/// </summary>
/// <remarks>
/// Each input file is opened, fed to the trainer and closed again before the
/// next file is processed.
/// NOTE(review): the trainer is invoked once per file; whether successive calls
/// accumulate or replace earlier results depends on GisTrainer - confirm.
/// </remarks>
/// <param name="inputFiles">The collection of training input files</param>
/// <param name="iterations">The number of iterations to run when training the model</param>
/// <param name="cut">The minimum number of occurrences for statistical relevancy in the trained model</param>
/// <param name="splitMarker">The character indicating a split in the files</param>
/// <returns>The freshly trained GisModel</returns>
public static GisModel Train(IEnumerable<string> inputFiles, int iterations, int cut, char splitMarker = '|')
{
    var trainer = new GisTrainer(0.1);

    foreach (var inputFile in inputFiles)
    {
        // Dispose the reader as soon as this file has been consumed so the
        // underlying file handle is released even if training throws.
        using (var dataReader = new StreamReader(inputFile))
        {
            var eventReader = new TokenEventReader(dataReader, splitMarker);
            trainer.TrainModel(iterations, new TwoPassDataIndexer(eventReader, cut));
        }
    }

    return new GisModel(trainer);
}
/// <summary>
/// Trains a sentence detection model from several plain text files that are
/// read together through a single multi-file data reader.
/// </summary>
/// <param name="filePaths">The paths of the training files</param>
/// <param name="iterations">The number of training iterations, forwarded to the trainer</param>
/// <param name="cut">The cut value, forwarded to the trainer</param>
/// <param name="scanner">The end-of-sentence scanner handed to the sentence detection event reader</param>
/// <returns>A GisModel constructed from the trainer after training has run</returns>
public static GisModel TrainModel(IEnumerable<string> filePaths, int iterations, int cut, IEndOfSentenceScanner scanner)
{
    var trainer = new GisTrainer();
    var readers = new List<StreamReader>();

    try
    {
        // Open the files inside the try block so that readers already opened
        // are still released if a later file cannot be opened.
        foreach (var path in filePaths)
        {
            readers.Add(new StreamReader(path));
        }

        // train the model
        ITrainingDataReader<string> dataReader = new MultipleFilesPlainTextByLineDataReader(readers);
        ITrainingEventReader eventReader = new SentenceDetectionEventReader(dataReader, scanner);
        trainer.TrainModel(eventReader, iterations, cut);
    }
    finally
    {
        // Release every file handle; disposing a StreamReader more than once is harmless.
        foreach (var reader in readers)
        {
            reader.Dispose();
        }
    }

    return new GisModel(trainer);
}
/// <summary>
/// Trains an invalid-email detection model from plain text files, read line by line.
/// Expected file layout (one sample per line):
/// someone@example.com 0
/// mqsldkqsmlqsmdklqs@sdlsqjd 1
/// ...
/// where the first sample is a valid email and the second an invalid one.
/// </summary>
/// <remarks>
/// NOTE(review): the trainer is invoked once per file; whether successive calls
/// accumulate or replace earlier results depends on GisTrainer - confirm.
/// </remarks>
/// <param name="filePaths">The paths of the training files</param>
/// <param name="iterations">The number of training iterations, forwarded to the trainer</param>
/// <param name="cut">The cut value, forwarded to the trainer</param>
/// <returns>A GisModel constructed from the trainer after all files were processed</returns>
public static GisModel TrainModel(IEnumerable<string> filePaths, int iterations, int cut)
{
    var gisTrainer = new GisTrainer();

    foreach (var path in filePaths)
    {
        // The file stays open only for the duration of its own training pass.
        using (var fileReader = new StreamReader(path))
        {
            ITrainingDataReader<string> lineReader = new PlainTextByLineDataReader(fileReader);
            ITrainingEventReader emailEvents = new InvalidEmailDetectionEventReader(lineReader);

            gisTrainer.TrainModel(emailEvents, iterations, cut);
        }
    }

    return new GisModel(gisTrainer);
}
/// <summary>
/// Trains a sentence detection model from plain text files, processing the
/// files one after the other and reading each of them line by line.
/// </summary>
/// <remarks>
/// NOTE(review): the trainer is invoked once per file; whether successive calls
/// accumulate or replace earlier results depends on GisTrainer - confirm.
/// </remarks>
/// <param name="files">The paths of the training files</param>
/// <param name="iterations">The number of training iterations, forwarded to the trainer</param>
/// <param name="cut">The cut value, forwarded to the trainer</param>
/// <param name="scanner">The end-of-sentence scanner handed to the sentence detection event reader</param>
/// <returns>A GisModel constructed from the trainer after all files were processed</returns>
public static GisModel TrainModel(IEnumerable<string> files, int iterations, int cut, IEndOfSentenceScanner scanner)
{
    var gisTrainer = new GisTrainer();

    foreach (var path in files)
    {
        // The file stays open only for the duration of its own training pass.
        using (var fileReader = new StreamReader(path))
        {
            ITrainingDataReader<string> lineReader = new PlainTextByLineDataReader(fileReader);
            ITrainingEventReader sentenceEvents = new SentenceDetectionEventReader(lineReader, scanner);

            gisTrainer.TrainModel(sentenceEvents, iterations, cut);
        }
    }

    return new GisModel(gisTrainer);
}
/// <summary>
/// Trains a tokenizer model from input files well formatted for
/// a token event reader. All files are read through a single
/// multi-file event reader and the trainer is run once.
/// </summary>
/// <param name="inputFilePaths">The collection of training input files</param>
/// <param name="iterations">The number of iterations to run when training the model</param>
/// <param name="cut">The minimum number of occurrences for statistical relevancy in the trained model</param>
/// <param name="splitMarker">The character indicating a split in the files</param>
/// <param name="includeAllCapsExamples">
/// Forwarded as-is to the multi-file token event reader; presumably controls whether
/// all-caps variants of the examples are included - confirm against that reader.
/// </param>
/// <returns>The freshly trained GisModel</returns>
public static GisModel Train(IEnumerable<string> inputFilePaths, int iterations, int cut, char splitMarker = '|', bool includeAllCapsExamples = false)
{
    var trainer = new GisTrainer(0.1);
    var dataReaders = new List<StreamReader>();

    try
    {
        // Open the files inside the try block so that readers already opened
        // are still released if a later file cannot be opened.
        foreach (var path in inputFilePaths)
        {
            dataReaders.Add(new StreamReader(path));
        }

        // train the model
        var eventReader = new MultipleFileTokenEventReader(dataReaders, splitMarker, includeAllCapsExamples);
        trainer.TrainModel(iterations, new TwoPassDataIndexer(eventReader, cut));
    }
    finally
    {
        // Release every file handle; disposing a StreamReader more than once is harmless.
        foreach (var dataReader in dataReaders)
        {
            dataReader.Dispose();
        }
    }

    return new GisModel(trainer);
}
/// <summary>
/// Trains a tokenizer model from input files well formatted for
/// a token event reader. All files are read through a single
/// multi-file event reader and the trainer is run once.
/// </summary>
/// <param name="inputFilePaths">The collection of training input files</param>
/// <param name="iterations">The number of iterations to run when training the model</param>
/// <param name="cut">The minimum number of occurrences for statistical relevancy in the trained model</param>
/// <param name="splitMarker">The character indicating a split in the files</param>
/// <param name="includeAllCapsExamples">
/// Forwarded as-is to the multi-file token event reader; presumably controls whether
/// all-caps variants of the examples are included - confirm against that reader.
/// </param>
/// <returns>The freshly trained GisModel</returns>
public static GisModel Train(IEnumerable<string> inputFilePaths, int iterations, int cut, char splitMarker = '|', bool includeAllCapsExamples = false)
{
    var trainer = new GisTrainer(0.1);
    var dataReaders = new List<StreamReader>();

    try
    {
        // Open the files inside the try block so that readers already opened
        // are still released if a later file cannot be opened.
        foreach (var path in inputFilePaths)
        {
            dataReaders.Add(new StreamReader(path));
        }

        // train the model
        var eventReader = new MultipleFileTokenEventReader(dataReaders, splitMarker, includeAllCapsExamples);
        trainer.TrainModel(iterations, new TwoPassDataIndexer(eventReader, cut));
    }
    finally
    {
        // Release every file handle; disposing a StreamReader more than once is harmless.
        foreach (var dataReader in dataReaders)
        {
            dataReader.Dispose();
        }
    }

    return new GisModel(trainer);
}
/// <summary>
/// Trains a tokenizer model from input files well formatted for
/// a token event reader.
/// </summary>
/// <remarks>
/// Each input file is opened, fed to the trainer and closed again before the
/// next file is processed.
/// NOTE(review): the trainer is invoked once per file; whether successive calls
/// accumulate or replace earlier results depends on GisTrainer - confirm.
/// </remarks>
/// <param name="inputFiles">The collection of training input files</param>
/// <param name="iterations">The number of iterations to run when training the model</param>
/// <param name="cut">The minimum number of occurrences for statistical relevancy in the trained model</param>
/// <param name="splitMarker">The character indicating a split in the files</param>
/// <returns>The freshly trained GisModel</returns>
public static GisModel Train(IEnumerable<string> inputFiles, int iterations, int cut, char splitMarker = '|')
{
    var trainer = new GisTrainer(0.1);

    foreach (var inputFile in inputFiles)
    {
        // Dispose the reader as soon as this file has been consumed so the
        // underlying file handle is released even if training throws.
        using (var dataReader = new StreamReader(inputFile))
        {
            var eventReader = new TokenEventReader(dataReader, splitMarker);
            trainer.TrainModel(iterations, new TwoPassDataIndexer(eventReader, cut));
        }
    }

    return new GisModel(trainer);
}
/// <summary>
/// Trains a sentence detection model from several plain text files that are
/// read together through a single multi-file data reader.
/// </summary>
/// <param name="filePaths">The paths of the training files</param>
/// <param name="iterations">The number of training iterations, forwarded to the trainer</param>
/// <param name="cut">The cut value, forwarded to the trainer</param>
/// <param name="scanner">The end-of-sentence scanner handed to the sentence detection event reader</param>
/// <returns>A GisModel constructed from the trainer after training has run</returns>
public static GisModel TrainModel(IEnumerable<string> filePaths, int iterations, int cut, IEndOfSentenceScanner scanner)
{
    var trainer = new GisTrainer();
    var readers = new List<StreamReader>();

    try
    {
        // Open the files inside the try block so that readers already opened
        // are still released if a later file cannot be opened.
        foreach (var path in filePaths)
        {
            readers.Add(new StreamReader(path));
        }

        // train the model
        ITrainingDataReader<string> dataReader = new MultipleFilesPlainTextByLineDataReader(readers);
        ITrainingEventReader eventReader = new SentenceDetectionEventReader(dataReader, scanner);
        trainer.TrainModel(eventReader, iterations, cut);
    }
    finally
    {
        // Release every file handle; disposing a StreamReader more than once is harmless.
        foreach (var reader in readers)
        {
            reader.Dispose();
        }
    }

    return new GisModel(trainer);
}
/// <summary>
/// Trains a sentence detection model from plain text files, processing the
/// files one after the other and reading each of them line by line.
/// </summary>
/// <remarks>
/// NOTE(review): the trainer is invoked once per file; whether successive calls
/// accumulate or replace earlier results depends on GisTrainer - confirm.
/// </remarks>
/// <param name="files">The paths of the training files</param>
/// <param name="iterations">The number of training iterations, forwarded to the trainer</param>
/// <param name="cut">The cut value, forwarded to the trainer</param>
/// <param name="scanner">The end-of-sentence scanner handed to the sentence detection event reader</param>
/// <returns>A GisModel constructed from the trainer after all files were processed</returns>
public static GisModel TrainModel(IEnumerable<string> files, int iterations, int cut, IEndOfSentenceScanner scanner)
{
    var gisTrainer = new GisTrainer();

    foreach (var path in files)
    {
        // The file stays open only for the duration of its own training pass.
        using (var fileReader = new StreamReader(path))
        {
            ITrainingDataReader<string> lineReader = new PlainTextByLineDataReader(fileReader);
            ITrainingEventReader sentenceEvents = new SentenceDetectionEventReader(lineReader, scanner);

            gisTrainer.TrainModel(sentenceEvents, iterations, cut);
        }
    }

    return new GisModel(gisTrainer);
}