/// <summary>
/// Creates a writer whose output stream is selected from the suffixes of the
/// target file name. A trailing ".gz" wraps the file stream in a
/// <c>GZIPOutputStream</c>; of the remaining name, ".bin" selects a
/// <c>DataOutputStream</c> and anything else a buffered text writer.
/// </summary>
/// <param name="model"> The model handed on to <c>init</c>. </param>
/// <param name="file"> The file the model is written to. </param>
public GenericModelWriter(AbstractModel model, Jfile file)
{
    string name = file.Name;
    bool isGzipped = name.EndsWith(".gz", StringComparison.Ordinal);

    FileOutputStream fileOut = new FileOutputStream(file);
    OutputStream sink = fileOut;
    if (isGzipped)
    {
        sink = new GZIPOutputStream(fileOut);
        // strip ".gz" so that the format suffix becomes the last one
        name = name.Substring(0, name.Length - 3);
    }

    if (name.EndsWith(".bin", StringComparison.Ordinal))
    {
        init(model, new DataOutputStream(sink));
        return;
    }

    // any other suffix is treated like ".txt"
    init(model, new BufferedWriter(new OutputStreamWriter(sink)));
}
/// <summary>
/// Creates a reader for the given file and picks the data reader from the
/// file name suffixes. A trailing ".gz" wraps the file stream in a
/// <c>GZIPInputStream</c>; of the remaining name, ".bin" selects a
/// <c>BinaryFileDataReader</c> and anything else a
/// <c>PlainTextFileDataReader</c>.
/// </summary>
/// <param name="f"> The file to read from. </param>
protected AbstractModelReader(Jfile f)
{
    string name = f.Name;
    bool isGzipped = name.EndsWith(".gz", StringComparison.Ordinal);

    FileInputStream fileIn = new FileInputStream(f);
    InputStream source = fileIn;
    if (isGzipped)
    {
        source = new GZIPInputStream(fileIn);
        // strip ".gz" so that the format suffix becomes the last one
        name = name.Substring(0, name.Length - 3);
    }

    if (name.EndsWith(".bin", StringComparison.Ordinal))
    {
        dataReader = new BinaryFileDataReader(source);
        return;
    }

    // any other suffix is treated like ".txt"
    dataReader = new PlainTextFileDataReader(source);
}
/// <summary>
/// Creates a writer for the given model and file and instantiates the
/// concrete perceptron model writer that matches the file name suffixes.
/// A trailing ".gz" selects gzip compression; of the remaining name, ".bin"
/// selects the binary writer and anything else the plain text writer.
/// </summary>
/// <param name="model"> The model which is to be persisted. </param>
/// <param name="f"> The file in which the model is to be stored. </param>
public SuffixSensitivePerceptronModelWriter(AbstractModel model, Jfile f) : base(model)
{
    string name = f.Name;
    bool isGzipped = name.EndsWith(".gz", StringComparison.Ordinal);

    FileOutputStream fileOut = new FileOutputStream(f);
    OutputStream sink;
    if (isGzipped)
    {
        sink = new GZIPOutputStream(fileOut);
        // strip ".gz" so that the format suffix becomes the last one
        name = name.Substring(0, name.Length - 3);
    }
    else
    {
        // the uncompressed stream is wrapped in a DataOutputStream up front
        sink = new DataOutputStream(fileOut);
    }

    bool isBinary = name.EndsWith(".bin", StringComparison.Ordinal);
    if (isBinary)
    {
        suffixAppropriateWriter = new BinaryPerceptronModelWriter(model, new DataOutputStream(sink));
    }
    else
    {
        // default is ".txt"
        suffixAppropriateWriter = new PlainTextPerceptronModelWriter(model, new BufferedWriter(new OutputStreamWriter(sink)));
    }
}
/// <summary>
/// Reads the complete feature generator descriptor file into memory.
/// </summary>
/// <param name="featureGenDescriptorFile"> The descriptor file, may be <c>null</c>. </param>
/// <returns> The bytes returned by <c>ModelUtil.read</c>, or <c>null</c> when no file was given. </returns>
/// <exception cref="TerminateToolException"> If reading the file raises an <c>IOException</c>. </exception>
internal static sbyte[] openFeatureGeneratorBytes(Jfile featureGenDescriptorFile)
{
    if (featureGenDescriptorFile == null)
    {
        return null;
    }

    InputStream descriptorIn = CmdLineUtil.openInFile(featureGenDescriptorFile);
    try
    {
        return ModelUtil.read(descriptorIn);
    }
    catch (IOException e)
    {
        throw new TerminateToolException(-1, "IO error while reading training data or indexing data: " + e.Message, e);
    }
    finally
    {
        try
        {
            descriptorIn.close();
        }
        catch (IOException)
        {
            // closing is best effort; a failure here is ignored on purpose
        }
    }
}
/// <summary>
/// Trains a sentence detector model from the sample stream and writes it to
/// the configured model file.
/// </summary>
/// <param name="format"> Passed on to the base implementation. </param>
/// <param name="args"> Passed on to the base implementation. </param>
/// <exception cref="TerminateToolException"> If sequence training is requested or an
/// <c>IOException</c> occurs during training. </exception>
public override void run(string format, string[] args)
{
    base.run(format, args);

    mlParams = CmdLineUtil.loadTrainingParameters(@params.Params, false);
    if (mlParams == null)
    {
        // no parameters file: fall back to iterations/cutoff from the command line
        mlParams = ModelUtil.createTrainingParameters(@params.Iterations.Value, @params.Cutoff.Value);
    }
    else if (TrainUtil.isSequenceTraining(mlParams.Settings))
    {
        throw new TerminateToolException(1, "Sequence training is not supported!");
    }

    Jfile targetFile = @params.Model;
    // fail before the training starts if the model cannot be written
    CmdLineUtil.checkOutputFile("sentence detector model", targetFile);

    char[] eosChars = @params.EosChars != null ? @params.EosChars.ToCharArray() : null;

    SentenceModel trainedModel;
    try
    {
        Dictionary abbreviations = loadDict(@params.AbbDict);
        SentenceDetectorFactory factory = SentenceDetectorFactory.create(@params.Factory, @params.Lang, true, abbreviations, eosChars);
        trainedModel = SentenceDetectorME.train(@params.Lang, sampleStream, factory, mlParams);
    }
    catch (IOException e)
    {
        throw new TerminateToolException(-1, "IO error while reading training data or indexing data: " + e.Message, e);
    }
    finally
    {
        try
        {
            sampleStream.close();
        }
        catch (IOException)
        {
            // closing is best effort; a failure here is ignored on purpose
        }
    }

    CmdLineUtil.writeModel("sentence detector", targetFile, trainedModel);
}
/// <summary>
/// Loads the abbreviation dictionary from the given file.
/// </summary>
/// <param name="f"> The dictionary file, may be <c>null</c>. </param>
/// <returns> The loaded dictionary, or <c>null</c> when no file was given. </returns>
internal static Dictionary loadDict(Jfile f)
{
    if (f == null)
    {
        return null;
    }

    CmdLineUtil.checkInputFile("abb dict", f);

    FileInputStream dictIn = new FileInputStream(f);
    try
    {
        // NOTE(review): assumes the Dictionary constructor consumes the stream
        // completely before returning - confirm against Dictionary.
        return new Dictionary(dictIn);
    }
    finally
    {
        // the stream used to be left open; release the file handle in every case
        try
        {
            dictIn.close();
        }
        catch (IOException)
        {
            // closing is best effort; a failure here is ignored on purpose
        }
    }
}
/// <summary>
/// Creates a binary writer for the given model that writes to the given
/// file. When the file name ends with ".gz" the data is additionally passed
/// through a <c>GZIPOutputStream</c>.
/// </summary>
/// <param name="model"> The model which is to be persisted. </param>
/// <param name="f"> The file in which the model is to be persisted. </param>
public BinaryGISModelWriter(AbstractModel model, Jfile f) : base(model)
{
    bool isGzipped = f.Name.EndsWith(".gz", StringComparison.Ordinal);
    FileOutputStream fileOut = new FileOutputStream(f);

    output = isGzipped
        ? new DataOutputStream(new GZIPOutputStream(fileOut))
        : new DataOutputStream(fileOut);
}
/// <summary>
/// Creates a plain text writer for the given model that writes to the given
/// file. When the file name ends with ".gz" the text is written through a
/// <c>GZIPOutputStream</c>, otherwise through a <c>FileWriter</c>.
/// </summary>
/// <param name="model"> The model which is to be persisted. </param>
/// <param name="f"> The file in which the model is to be persisted. </param>
public PlainTextGISModelWriter(AbstractModel model, Jfile f) : base(model)
{
    bool isGzipped = f.Name.EndsWith(".gz", StringComparison.Ordinal);
    if (!isGzipped)
    {
        output = new BufferedWriter(new FileWriter(f));
        return;
    }

    GZIPOutputStream gzipOut = new GZIPOutputStream(new FileOutputStream(f));
    output = new BufferedWriter(new OutputStreamWriter(gzipOut));
}
/// <summary>
/// Opens the given file for reading.
/// </summary>
/// <param name="file"> The file to open. </param>
/// <returns> A new <c>FileInputStream</c> for the file. </returns>
/// <exception cref="TerminateToolException"> If a <c>FileNotFoundException</c> is raised
/// while opening the file. </exception>
public static FileInputStream openInFile(Jfile file)
{
    FileInputStream opened;
    try
    {
        opened = new FileInputStream(file);
    }
    catch (FileNotFoundException e)
    {
        throw new TerminateToolException(-1, "File '" + file + "' cannot be found", e);
    }

    return opened;
}
/// <summary>
/// Creates a binary data reader for the given file. The file is read through
/// a buffered stream; when its name ends with ".gz" the data is additionally
/// passed through a <c>GZIPInputStream</c> that is buffered again.
/// </summary>
/// <param name="f"> The file to read from. </param>
public BinaryFileDataReader(Jfile f)
{
    bool isGzipped = f.Name.EndsWith(".gz", StringComparison.Ordinal);

    BufferedInputStream buffered = new BufferedInputStream(new FileInputStream(f));
    if (isGzipped)
    {
        buffered = new BufferedInputStream(new GZIPInputStream(buffered));
    }

    input = new DataInputStream(buffered);
}
/// <summary>
/// Verifies up front that an output file can be written.
/// <para>
/// Nothing happens when writing is possible; otherwise a
/// <seealso cref="TerminateToolException"/> carrying an explanatory message is thrown.
/// </para>
/// <para>
/// Producing the content of an output file (e.g. an ME model) can take a long
/// time, so callers should run this check before the computation starts in
/// order to fail fast.
/// </para>
/// </summary>
/// <param name="name"> human-friendly file name, for example perceptron model </param>
/// <param name="outFile"> the file to check </param>
/// <exception cref="TerminateToolException"> if the file is a directory, is not a normal
/// file, is not writable, or its parent directory is missing or not writable. </exception>
public static void checkOutputFile(string name, Jfile outFile)
{
    string problem = null;

    if (!outFile.exists())
    {
        // a new file has to be created: the parent directory must exist and be writable
        Jfile parent = outFile.AbsoluteFile.ParentFile;
        if (parent == null || !parent.exists())
        {
            problem = "The parent directory of the " + name + " file does not exist, " + "please create it first!";
        }
        else if (!parent.canWrite())
        {
            problem = "No permissions to create the " + name + " file!";
        }
    }
    else if (outFile.IsDirectory)
    {
        problem = "The " + name + " file is a directory!";
    }
    else if (!outFile.IsFile)
    {
        problem = "The " + name + " file is not a normal file!";
    }
    else if (!outFile.canWrite())
    {
        problem = "No permissions to write the " + name + " file!";
    }

    if (problem != null)
    {
        throw new TerminateToolException(-1, problem + " Path: " + outFile.AbsolutePath);
    }
}
/// <summary>
/// Indexes the events of the given stream in two passes: the events are first
/// counted and written to a temporary file, then read back from that file and
/// indexed. Progress messages are written to the console. An
/// <c>IOException</c> raised on the way is written to the error console and
/// not rethrown.
/// </summary>
/// <param name="eventStream"> The stream of events seen in the training data. </param>
/// <param name="cutoff"> The minimum number of times a predicate must have been
/// observed in order to be included in the model. </param>
/// <param name="sort"> Passed on to <c>sortAndMerge</c>; also selects which progress
/// message is printed before that call. </param>
public TwoPassDataIndexer(EventStream eventStream, int cutoff, bool sort)
{
    IDictionary<string, int?> predicateIds = new Dictionary<string, int?>();
    List<ComparableEvent> comparableEvents;

    Console.WriteLine("Indexing events using cutoff of " + cutoff + "\n");
    Console.Write("\tComputing event counts... ");

    try
    {
        // first pass: count the events while spooling them to a temporary file
        Jfile eventFile = Jfile.createTempFile("events", null);
        eventFile.deleteOnExit();
        FileOutputStream eventFileOut = new FileOutputStream(eventFile);
        Writer eventWriter = new BufferedWriter(new OutputStreamWriter(eventFileOut, "UTF8"));
        int eventCount = computeEventCounts(eventStream, eventWriter, predicateIds, cutoff);
        Console.WriteLine("done. " + eventCount + " events");

        // second pass: read the spooled events back and index them
        Console.Write("\tIndexing... ");
        FileEventStream spooledEvents = new FileEventStream(eventFile);
        try
        {
            comparableEvents = index(eventCount, spooledEvents, predicateIds);
        }
        finally
        {
            spooledEvents.close();
        }

        // done with predicates
        predicateIds = null;
        eventFile.delete();
        Console.WriteLine("done.");

        Console.Write(sort ? "Sorting and merging events... " : "Collecting events... ");
        sortAndMerge(comparableEvents, sort);
        Console.WriteLine("Done indexing.");
    }
    catch (IOException e)
    {
        Console.Error.WriteLine(e);
    }
}
/// <summary>
/// Writes a <seealso cref="BaseModel"/> to disk. Progress and errors are printed to the
/// error console to inform the user.
/// </summary>
/// <param name="modelName"> type of the model, name is used in messages. </param>
/// <param name="modelFile"> output file of the model </param>
/// <param name="model"> the model itself which should be written to disk </param>
/// <exception cref="TerminateToolException"> if the output file check fails or an
/// <c>IOException</c> occurs while the model is serialized. </exception>
public static void writeModel(string modelName, Jfile modelFile, BaseModel model)
{
    CmdLineUtil.checkOutputFile(modelName + " model", modelFile);

    Console.Error.Write("Writing " + modelName + " model ... ");

    // UTC so that a local clock shift during writing cannot distort the duration
    DateTime writeStarted = DateTime.UtcNow;

    FileOutputStream modelOut = null;
    try
    {
        modelOut = new FileOutputStream(modelFile);
        model.serialize(modelOut);
    }
    catch (IOException e)
    {
        Console.Error.WriteLine("failed");
        throw new TerminateToolException(-1, "Error during writing model file '" + modelFile + "'", e);
    }
    finally
    {
        if (modelOut != null)
        {
            try
            {
                modelOut.close();
            }
            catch (IOException e)
            {
                Console.Error.WriteLine("Failed to properly close model file '" + modelFile + "': " + e.Message);
            }
        }
    }

    // finish the "Writing ... " line; the duration was previously measured in
    // ticks and then discarded, leaving the progress line without a status
    double writeSeconds = (DateTime.UtcNow - writeStarted).TotalSeconds;
    Console.Error.Write("done ({0:F3}s)\n", writeSeconds);

    Console.Error.WriteLine();
    Console.Error.WriteLine("Wrote " + modelName + " model to");
    Console.Error.WriteLine("path: " + modelFile.AbsolutePath);
    Console.Error.WriteLine();
}
/// <summary>
/// Loads a model from the given file and reports progress and the loading
/// time on the error console.
/// </summary>
/// <param name="modelFile"> The file to load the model from. </param>
/// <returns> The model produced by <c>loadModel</c>. </returns>
/// <exception cref="TerminateToolException"> If the input file check fails, the model has
/// an invalid format, or an <c>IOException</c> occurs while loading. </exception>
public virtual T load(Jfile modelFile)
{
    long beginModelLoadingTime = DateTimeHelperClass.CurrentUnixTimeMillis();

    CmdLineUtil.checkInputFile(modelName + " model", modelFile);

    Console.Error.Write("Loading " + modelName + " model ... ");

    InputStream modelIn = new BufferedInputStream(CmdLineUtil.openInFile(modelFile), CmdLineUtil.IO_BUFFER_SIZE);

    T model;
    try
    {
        model = loadModel(modelIn);
    }
    catch (InvalidFormatException e)
    {
        Console.Error.WriteLine("failed");
        throw new TerminateToolException(-1, "Model has invalid format", e);
    }
    catch (IOException e)
    {
        Console.Error.WriteLine("failed");
        throw new TerminateToolException(-1, "IO error while loading model file '" + modelFile + "'", e);
    }
    finally
    {
        // modelIn is never null here: openInFile throws instead of returning null
        try
        {
            modelIn.close();
        }
        catch (IOException)
        {
            // closing is best effort; a failure here is ignored on purpose
        }
    }

    long modelLoadingDuration = DateTimeHelperClass.CurrentUnixTimeMillis() - beginModelLoadingTime;

    // milliseconds -> seconds with three decimals; replaces the leftover Java
    // System.err.printf call, which is not valid C#
    Console.Error.Write("done ({0:F3}s)\n", modelLoadingDuration / 1000d);

    return model;
}
/// <summary>
/// Checks that the given input file is usable.
/// <para>
/// To pass the check the file must:<br/>
/// - not be a directory<br/>
/// - exist<br/>
/// - be readable<br/>
/// </para>
/// </summary>
/// <param name="name"> the name which is used to refer to the file in an error message
/// </param>
/// <param name="inFile"> the particular file to check to qualify as an input file
/// </param>
/// <exception cref="TerminateToolException"> if one of the checks fails; the message names
/// the failed check and the absolute path of the file. </exception>
public static void checkInputFile(string name, Jfile inFile)
{
    // checks run in this order; the first failing one decides the message
    string problem =
        inFile.IsDirectory ? "The " + name + " file is a directory!"
        : !inFile.exists() ? "The " + name + " file does not exist!"
        : !inFile.canRead() ? "No permissions to read the " + name + " file!"
        : null;

    if (problem == null)
    {
        return;
    }

    throw new TerminateToolException(-1, problem + " Path: " + inFile.AbsolutePath);
}
/// <summary>
/// Loads an existing parser model, updates it with the samples read in the
/// given format and writes the result back to the same model file.
/// </summary>
/// <param name="format"> Name of the sample format used to look up the stream factory. </param>
/// <param name="args"> Command line arguments; filtered separately for the updater
/// parameters and for the stream factory parameters. </param>
/// <exception cref="TerminateToolException"> If an <c>IOException</c> occurs during the
/// update. </exception>
public sealed override void run(string format, string[] args)
{
    string[] updaterArgs = ArgumentParser.filter(args, typeof(ModelUpdaterParams));
    ModelUpdaterParams @params = validateAndParseParams<ModelUpdaterParams>(updaterArgs, typeof(ModelUpdaterParams));

    // load the model to be updated
    Jfile targetModelFile = @params.Model;
    ParserModelLoader loader = new ParserModelLoader();
    ParserModel existingModel = loader.load(targetModelFile);

    ObjectStreamFactory streamFactory = getStreamFactory(format);
    string[] factoryArgs = ArgumentParser.filter(args, streamFactory.Parameters);
    validateFactoryArgs(streamFactory, factoryArgs);
    ObjectStream<Parse> parseSamples = streamFactory.create<Parse>(factoryArgs);

    ParserModel retrainedModel;
    try
    {
        retrainedModel = trainAndUpdate(existingModel, parseSamples, @params);
    }
    catch (IOException e)
    {
        throw new TerminateToolException(-1, "IO error while reading training data or indexing data: " + e.Message, e);
    }
    finally
    {
        try
        {
            parseSamples.close();
        }
        catch (IOException)
        {
            // closing is best effort; a failure here is ignored on purpose
        }
    }

    CmdLineUtil.writeModel("parser", targetModelFile, retrainedModel);
}
/// <summary>
/// Constructor which takes a File and creates a reader for it by passing the
/// file on to the base-class constructor.
/// NOTE(review): the original documentation states that gzip is detected from
/// a ".gz" suffix; that detection is not visible in this constructor and is
/// presumably done by the base class - confirm.
/// </summary>
/// <param name="f">
///          The File in which the model is stored. </param>
public PlainTextGISModelReader(Jfile f) : base(f) { }
/// <summary>
/// Runs a cross validation of the POS tagger on the sample stream, prints the
/// word accuracy and optionally writes a fine-grained report to a file.
/// </summary>
/// <param name="format"> Passed on to the base implementation. </param>
/// <param name="args"> Passed on to the base implementation. </param>
/// <exception cref="TerminateToolException"> If the report file cannot be created or an
/// <c>IOException</c> occurs during evaluation. </exception>
public override void run(string format, string[] args)
{
    base.run(format, args);

    mlParams = CmdLineUtil.loadTrainingParameters(@params.Params, false);
    if (mlParams == null)
    {
        mlParams = ModelUtil.createTrainingParameters(@params.Iterations.Value, @params.Cutoff.Value);
    }

    POSTaggerEvaluationMonitor missclassifiedListener = null;
    if (@params.Misclassified.Value)
    {
        missclassifiedListener = new POSEvaluationErrorListener();
    }

    POSTaggerFineGrainedReportListener reportListener = null;
    Jfile reportFile = @params.ReportOutputFile;
    OutputStream reportOutputStream = null;
    if (reportFile != null)
    {
        CmdLineUtil.checkOutputFile("Report Output File", reportFile);
        try
        {
            reportOutputStream = new FileOutputStream(reportFile);
            reportListener = new POSTaggerFineGrainedReportListener(reportOutputStream);
        }
        catch (FileNotFoundException e)
        {
            // keep the original exception as the cause so the failure stays traceable
            throw new TerminateToolException(-1, "IO error while creating POS Tagger fine-grained report file: " + e.Message, e);
        }
    }

    POSTaggerCrossValidator validator;
    try
    {
        try
        {
            validator = new POSTaggerCrossValidator(@params.Lang, mlParams, @params.Dict, @params.Ngram, @params.TagDictCutoff, @params.Factory, missclassifiedListener, reportListener);

            validator.evaluate(sampleStream, @params.Folds.Value);
        }
        catch (IOException e)
        {
            throw new TerminateToolException(-1, "IO error while reading training data or indexing data: " + e.Message, e);
        }
        finally
        {
            try
            {
                sampleStream.close();
            }
            catch (IOException)
            {
                // closing is best effort; a failure here is ignored on purpose
            }
        }

        Console.WriteLine("done");

        if (reportListener != null)
        {
            Console.WriteLine("Writing fine-grained report to " + @params.ReportOutputFile.AbsolutePath);
            reportListener.writeReport();
        }
    }
    finally
    {
        // close the report file on success and on failure; it used to stay
        // open whenever the evaluation or the report writing threw
        if (reportOutputStream != null)
        {
            try
            {
                reportOutputStream.close();
            }
            catch (IOException)
            {
                // nothing to do
            }
        }
    }

    Console.WriteLine();

    Console.WriteLine("Accuracy: " + validator.WordAccuracy);
}
/// <summary>
/// Accepts files whose name ends with ".sgm", ignoring case.
/// </summary>
/// <param name="file"> The file to test. </param>
/// <returns> <c>true</c> if the file name has the ".sgm" suffix. </returns>
public virtual bool accept(Jfile file)
{
    // ordinal, case-insensitive comparison: independent of the current culture
    // and without allocating a lower-cased copy of the name
    return file.Name.EndsWith(".sgm", StringComparison.OrdinalIgnoreCase);
}
/// <summary>
/// Creates a real-value file event stream for the given file by passing the
/// file on to the base-class constructor.
/// </summary>
/// <param name="file"> The file handed to the base class. </param>
public RealValueFileEventStream(Jfile file) : base(file) { }
/// <summary>
/// A reader for GIS models which inspects the filename and invokes the
/// appropriate GISModelReader depending on the filename's suffixes.
///
/// <para>The following assumptions are made about suffixes:</para>
/// <list type="bullet">
/// <item><description>.gz --&gt; the file is gzipped (must be the last suffix)</description></item>
/// <item><description>.txt --&gt; the file is plain text</description></item>
/// <item><description>.bin --&gt; the file is binary</description></item>
/// </list>
/// NOTE(review): this constructor only forwards the file to the base class;
/// the suffix handling described above is presumably done there - confirm.
/// </summary>
/// <param name="f"> The file handed to the base-class constructor. </param>
/// <exception cref="IOException"> Presumably raised by the base constructor - confirm. </exception>
public PooledGISModelReader(Jfile f) : base(f) { }
/// <summary>
/// Creates a GIS model reader for the given file by passing the file on to
/// the base-class constructor.
/// </summary>
/// <param name="file"> The file handed to the base class. </param>
public GISModelReader(Jfile file) : base(file) { }
/// <summary>
/// Main method. Call as follows:
/// <para>
/// ModelTrainer [-real] [-perceptron] [-maxit n] [-cutoff n] [-sigma x] dataFile modelFile
/// </para>
/// Trains a maxent (default) or perceptron model from the data file and
/// persists it to the model file. Errors during training are printed to the
/// console.
/// </summary>
/// <param name="args"> Options followed by the data file name and the model file name. </param>
public static void Main(string[] args)
{
    int ai = 0;
    bool real = false;
    string type = "maxent";
    int maxit = 100;
    int cutoff = 1;
    double sigma = 1.0;

    if (args.Length == 0)
    {
        usage();
        // do not fall through into the argument parsing if usage() returns
        return;
    }

    // the bounds check keeps a command line made of options only from
    // running past the end of the array
    while (ai < args.Length && args[ai].StartsWith("-", StringComparison.Ordinal))
    {
        string option = args[ai];
        if (option.Equals("-real"))
        {
            real = true;
        }
        else if (option.Equals("-perceptron"))
        {
            type = "perceptron";
        }
        else if (option.Equals("-maxit") || option.Equals("-cutoff") || option.Equals("-sigma"))
        {
            // these options consume the following argument as their value
            if (ai + 1 >= args.Length)
            {
                Console.Error.WriteLine("Missing value for option: " + option);
                usage();
                return;
            }

            string value = args[++ai];
            if (option.Equals("-maxit"))
            {
                maxit = Convert.ToInt32(value);
            }
            else if (option.Equals("-cutoff"))
            {
                cutoff = Convert.ToInt32(value);
            }
            else
            {
                sigma = Convert.ToDouble(value);
            }
        }
        else
        {
            Console.Error.WriteLine("Unknown option: " + option);
            usage();
        }
        ai++;
    }

    // exactly two positional arguments must remain: data file and model file
    if (args.Length - ai < 2)
    {
        usage();
        return;
    }

    string dataFileName = args[ai++];
    string modelFileName = args[ai];

    try
    {
        FileReader datafr = new FileReader(new Jfile(dataFileName));
        EventStream es;
        if (!real)
        {
            es = new BasicEventStream(new PlainTextByLineDataStream(datafr), ",");
        }
        else
        {
            es = new RealBasicEventStream(new PlainTextByLineDataStream(datafr));
        }

        Jfile outputFile = new Jfile(modelFileName);

        AbstractModelWriter writer;

        AbstractModel model;
        if (type.Equals("maxent"))
        {
            GIS.SMOOTHING_OBSERVATION = SMOOTHING_OBSERVATION;

            if (!real)
            {
                model = GIS.trainModel(es, maxit, cutoff, sigma);
            }
            else
            {
                model = GIS.trainModel(maxit, new OnePassRealValueDataIndexer(es, cutoff), USE_SMOOTHING);
            }

            writer = new SuffixSensitiveGISModelWriter(model, outputFile);
        }
        else if (type.Equals("perceptron"))
        {
            model = (new PerceptronTrainer()).trainModel(maxit, new OnePassDataIndexer(es, cutoff), cutoff);

            writer = new SuffixSensitivePerceptronModelWriter(model, outputFile);
        }
        else
        {
            throw new InvalidOperationException("Unknown model type: " + type);
        }

        writer.persist();
    }
    catch (Exception e)
    {
        Console.Write("Unable to create model due to exception: ");
        // Exception.ToString() already contains type, message and stack trace,
        // so it is printed once instead of three times
        Console.WriteLine(e);
    }
}
/// <summary>
/// Creates an output stream that writes to the given file. The file is
/// created when it does not exist and truncated when it does, so no bytes of
/// a previous, longer file survive behind the newly written content.
/// </summary>
/// <param name="file"> The file to write to; opened by its absolute path. </param>
public FileOutputStream(Jfile file) : base(new FileStream(file.AbsolutePath, FileMode.Create))
{
}
/// <summary>
/// Creates a new file event stream that reads the given file through a
/// buffered reader using the "UTF8" encoding.
/// </summary>
/// <param name="file"> the file containing the events. </param>
/// <exception cref="IOException"> When the specified file can not be read. </exception>
public FileEventStream(Jfile file)
{
    FileInputStream fileIn = new FileInputStream(file);
    InputStreamReader decoder = new InputStreamReader(fileIn, "UTF8");
    reader = new BufferedReader(decoder);
}
/// <summary>
/// Constructor which takes a File and creates a reader for it by passing the
/// file on to the base-class constructor.
/// NOTE(review): the original documentation states that gzip is detected from
/// a ".gz" suffix; that detection is not visible in this constructor and is
/// presumably done by the base class - confirm.
/// </summary>
/// <param name="f"> The File in which the model is stored. </param>
public PlainTextPerceptronModelReader(Jfile f) : base(f) { }
/// <summary>
/// Creates an input stream for the given file. The absolute path is handed
/// to the base class, as <c>FileOutputStream</c> does, so that files outside
/// the current working directory are found as well.
/// NOTE(review): assumes the base constructor takes a file system path - confirm.
/// </summary>
/// <param name="jfile"> The file to read from. </param>
public FileInputStream(Jfile jfile) : base(jfile.AbsolutePath)
{
}
// TODO: Add param to train tree insert parser
/// <summary>
/// Trains a parser model (chunking or tree insert, depending on the
/// configured parser type) from the sample stream and writes it to the
/// configured model file.
/// </summary>
/// <param name="format"> Passed on to the base implementation. </param>
/// <param name="args"> Passed on to the base implementation. </param>
/// <exception cref="TerminateToolException"> If one of the per-component training
/// parameter sets is invalid or an <c>IOException</c> occurs during training. </exception>
public override void run(string format, string[] args)
{
    base.run(format, args);

    mlParams = CmdLineUtil.loadTrainingParameters(@params.Params, true);

    if (mlParams == null)
    {
        // no parameters file: fall back to iterations/cutoff from the command line
        mlParams = ModelUtil.createTrainingParameters(@params.Iterations.Value, @params.Cutoff.Value);
    }
    else
    {
        // settings namespace -> message used when that namespace is invalid,
        // checked in this order
        string[][] settingsChecks =
        {
            new string[] { "build", "Build training parameters are invalid!" },
            new string[] { "check", "Check training parameters are invalid!" },
            new string[] { "attach", "Attach training parameters are invalid!" },
            new string[] { "tagger", "Tagger training parameters are invalid!" },
            new string[] { "chunker", "Chunker training parameters are invalid!" },
        };

        foreach (string[] settingsCheck in settingsChecks)
        {
            if (!TrainUtil.isValid(mlParams.getSettings(settingsCheck[0])))
            {
                throw new TerminateToolException(1, settingsCheck[1]);
            }
        }
    }

    Jfile targetFile = @params.Model;
    // fail before the training starts if the model cannot be written
    CmdLineUtil.checkOutputFile("parser model", targetFile);

    ParserModel trainedModel;
    try
    {
        // TODO hard-coded language reference
        FileInputStream headRulesIn = new FileInputStream(@params.HeadRules);
        HeadRules rules = new opennlp.tools.parser.lang.en.HeadRules(new InputStreamReader(headRulesIn, @params.Encoding));

        var type = parseParserType(@params.ParserType);

        if (@params.Fun.Value)
        {
            Parse.useFunctionTags(true);
        }

        if (ParserType.CHUNKING == type)
        {
            trainedModel = Parser.train(@params.Lang, sampleStream, rules, mlParams);
        }
        else if (ParserType.TREEINSERT == type)
        {
            trainedModel = opennlp.tools.parser.treeinsert.Parser.train(@params.Lang, sampleStream, rules, mlParams);
        }
        else
        {
            throw new IllegalStateException();
        }
    }
    catch (IOException e)
    {
        throw new TerminateToolException(-1, "IO error while reading training data or indexing data: " + e.Message, e);
    }
    finally
    {
        try
        {
            sampleStream.close();
        }
        catch (IOException)
        {
            // closing is best effort; a failure here is ignored on purpose
        }
    }

    CmdLineUtil.writeModel("parser", targetFile, trainedModel);
}
/// <summary>
/// Creates a QN model reader for the given file by passing the file on to
/// the base-class constructor.
/// </summary>
/// <param name="file"> The file handed to the base class. </param>
public QNModelReader(Jfile file) : base(file) { }
/// <summary>
/// Creates a generic model reader for the given file by passing the file on
/// to the base-class constructor.
/// </summary>
/// <param name="f"> The file handed to the base class. </param>
public GenericModelReader(Jfile f) : base(f) { }