示例#1
0
        /// <summary>
        /// Ensures the user's copy of the customization base config is present and at least as new
        /// as the default config shipped with the application. The shipped file is copied over the
        /// user's copy when the user's copy is missing or has an older last-write time.
        /// </summary>
        private void CopyConfigs()
        {
            string configName = OpusCatMTEngineSettings.Default.CustomizationBaseConfig;

            var userConfig = new FileInfo(HelperFunctions.GetLocalAppDataPath(configName));
            // NOTE(review): configName is used as a relative path here, so it resolves against the
            // current working directory - confirm that this is always the executable directory.
            var shippedConfig = new FileInfo(configName);

            // Keep an existing user copy unless the shipped default has been written more recently.
            bool userConfigIsCurrent =
                userConfig.Exists && shippedConfig.LastWriteTime <= userConfig.LastWriteTime;
            if (userConfigIsCurrent)
            {
                return;
            }

            File.Copy(configName, userConfig.FullName, true);
        }
示例#2
0
        /// <summary>
        /// Prepares a fine-tuning run and starts the training process. The steps are: copy the base
        /// model into the custom model directory, store the optional post-customization batch, copy
        /// and preprocess the training files, optionally generate alignments, run an initial
        /// evaluation on the validation set, write the trainer config into the custom model
        /// directory, and finally call <c>StartTraining</c>.
        /// Progress is reported through <c>OnProgressChanged</c> before each major step.
        /// </summary>
        /// <remarks>
        /// This method blocks until the initial evaluation process has exited.
        /// </remarks>
        /// <returns>
        /// The process returned by <c>StartTraining</c>, or <c>null</c> when copying the model
        /// directory or saving the model config throws.
        /// </returns>
        public Process Customize()
        {
            this.OnProgressChanged(new ProgressChangedEventArgs(1, new MarianCustomizationStatus(CustomizationStep.Copying_model, null)));
            //First copy the model to new dir
            try
            {
                this.CopyModelDir(this.modelDir, this.customLabel);
                //Save model config as soon as the model dir exists
                this.customModel.SaveModelConfig();
            }
            catch (Exception ex)
            {
                Log.Information($"Customization failed: {ex.Message}");
                return null;
            }

            //Save the batch to translate after customization to a file (to be batch translated after successful exit)
            if (this.postCustomizationBatch != null && this.postCustomizationBatch.Count > 0)
            {
                FileInfo postCustomizationBatchFile = new FileInfo(Path.Combine(this.customDir.FullName, OpusCatMTEngineSettings.Default.PostFinetuneBatchName));
                using (var writer = postCustomizationBatchFile.CreateText())
                {
                    foreach (var sourceString in this.postCustomizationBatch)
                    {
                        writer.WriteLine(sourceString);
                    }
                }
            }

            this.OnProgressChanged(new ProgressChangedEventArgs(2, new MarianCustomizationStatus(CustomizationStep.Copying_training_files, null)));
            //Copy raw files to model dir
            this.customSource = this.customSource.CopyTo(Path.Combine(this.customDir.FullName, "custom.source"));
            this.customTarget = this.customTarget.CopyTo(Path.Combine(this.customDir.FullName, "custom.target"));

            this.OnProgressChanged(new ProgressChangedEventArgs(3, new MarianCustomizationStatus(CustomizationStep.Preprocessing_training_files, null)));
            //Preprocess input files
            this.PreprocessInput();

            //Exactly one decoder.yml is expected in the custom dir; Single() throws otherwise
            var decoderYaml  = this.customDir.GetFiles("decoder.yml").Single();
            var deserializer = new Deserializer();

            //Dispose the reader right after parsing so that no handle to decoder.yml stays open
            MarianDecoderConfig decoderSettings;
            using (var decoderReader = decoderYaml.OpenText())
            {
                decoderSettings = deserializer.Deserialize <MarianDecoderConfig>(decoderReader);
            }

            if (this.guidedAlignment)
            {
                //Generate alignments for fine-tuning corpus
                this.alignmentFile = new FileInfo(Path.Combine(this.customDir.FullName, "custom.alignments"));
                MarianHelper.GenerateAlignments(this.spSource, this.spTarget, this.alignmentFile, this.model.AlignmentPriorsFile);

                //Generate alignments for validation set (for evaluating fine-tuning effect on alignment)
                this.validAlignmentFile = new FileInfo(Path.Combine(this.customDir.FullName, "combined.alignments"));
                MarianHelper.GenerateAlignments(this.spValidSource, this.spValidTarget, this.validAlignmentFile, this.model.AlignmentPriorsFile);
            }

            this.OnProgressChanged(new ProgressChangedEventArgs(4, new MarianCustomizationStatus(CustomizationStep.Initial_evaluation, null)));
            //Do the initial evaluation
            var initialValidProcess = this.model.TranslateAndEvaluate(
                this.spValidSource,
                new FileInfo(Path.Combine(this.customDir.FullName, "valid.0.txt")),
                this.spValidTarget,
                OpusCatMTEngineSettings.Default.OODValidSetSize,
                this.sourceLanguage,
                this.targetLanguage,
                true
                );

            //Wait for the initial valid to finish before starting customization
            //(TODO: make sure this is not done on UI thread)
            initialValidProcess.WaitForExit();

            this.OnProgressChanged(new ProgressChangedEventArgs(6, new MarianCustomizationStatus(CustomizationStep.Finetuning, null)));

            //Use the initial translation time as basis for estimating the duration of validation file
            //translation
            this.trainingLog.EstimatedTranslationDuration = Convert.ToInt32((initialValidProcess.ExitTime - initialValidProcess.StartTime).TotalSeconds);

            MarianTrainerConfig trainingConfig;

            var baseCustomizeYmlPath =
                HelperFunctions.GetLocalAppDataPath(
                    OpusCatMTEngineSettings.Default.CustomizationBaseConfig);

            var processDir = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location);

            //Make sure there's a customization file.
            if (!File.Exists(baseCustomizeYmlPath))
            {
                File.Copy(
                    Path.Combine(processDir, OpusCatMTEngineSettings.Default.CustomizationBaseConfig),
                    baseCustomizeYmlPath);
            }

            //deserialize yaml file
            using (var reader = new StreamReader(baseCustomizeYmlPath))
            {
                trainingConfig = deserializer.Deserialize <MarianTrainerConfig>(reader);
            }

            trainingConfig.trainSets = new List <string>
            {
                this.spSource.FullName,
                this.spTarget.FullName
            };

            trainingConfig.ValidSets = new List <string>
            {
                this.spValidSource.FullName,
                this.spValidTarget.FullName
            };

            //NOTE(review): both entries use vocabs[0]; presumably source and target share one
            //vocabulary file - confirm this also holds for models with separate vocabularies.
            trainingConfig.vocabs = new List <string>
            {
                Path.Combine(this.customDir.FullName, decoderSettings.vocabs[0]),
                Path.Combine(this.customDir.FullName, decoderSettings.vocabs[0])
            };

            //Pick the validation script matching the segmentation method (none for other methods)
            string validScriptName;
            switch (this.segmentationMethod)
            {
                case ".bpe":
                    validScriptName = "ValidateBpe.bat";
                    break;

                case ".spm":
                    validScriptName = "ValidateSp.bat";
                    break;

                default:
                    validScriptName = null;
                    break;
            }

            if (validScriptName != null)
            {
                string validScriptPath = Path.Combine(this.customDir.FullName, validScriptName);
                //The path is wrapped in quotes in the config so that paths with spaces stay intact
                trainingConfig.validScriptPath = $"\"{validScriptPath}\"";
                //Overwrite so that a script already present in the custom dir does not abort the run
                File.Copy(Path.Combine(processDir, validScriptName), validScriptPath, true);
            }

            trainingConfig.validScriptArgs =
                new List <string>
            {
                this.spValidTarget.FullName,
                $"OOD{OpusCatMTEngineSettings.Default.OODValidSetSize}"
            };
            trainingConfig.validTranslationOutput = Path.Combine(this.customDir.FullName, "valid.{U}.txt");

            if (this.guidedAlignment)
            {
                trainingConfig.guidedAlignment = this.alignmentFile.FullName;
            }

            trainingConfig.validLog = Path.Combine(this.customDir.FullName, "valid.log");
            trainingConfig.log      = Path.Combine(this.customDir.FullName, "train.log");

            //Exactly one model entry is expected in decoder.yml; Single() throws otherwise
            trainingConfig.model = Path.Combine(this.customDir.FullName, decoderSettings.models.Single());

            //Null-valued settings are left out of the written trainer config
            var builder = new SerializerBuilder();

            builder.ConfigureDefaultValuesHandling(DefaultValuesHandling.OmitNull);
            var serializer = builder.Build();

            var configPath = Path.Combine(this.customDir.FullName, OpusCatMTEngineSettings.Default.CustomizationBaseConfig);

            using (var writer = File.CreateText(configPath))
            {
                serializer.Serialize(writer, trainingConfig, typeof(MarianTrainerConfig));
            }

            Process trainProcess = this.StartTraining();

            return trainProcess;
        }
        /// <summary>
        /// Click handler that opens the customization base config from the local app data
        /// location in Notepad.
        /// </summary>
        /// <param name="sender">The control that raised the event (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void OpenCustomSettingsInEditor_Click(object sender, RoutedEventArgs e)
        {
            var customizeYml = HelperFunctions.GetLocalAppDataPath(OpusCatMTEngineSettings.Default.CustomizationBaseConfig);

            // Quote the path so that a location containing spaces (e.g. in the user profile name)
            // is passed as a single argument. Disposing the returned Process object only releases
            // the local handle; it does not close the editor.
            Process.Start("notepad.exe", $"\"{customizeYml}\"")?.Dispose();
        }