Beispiel #1
0
        /// <summary>
        /// Preprocesses a language file line by line and then runs
        /// <c>Preprocessing\spm_encode.exe</c> on the preprocessed text.
        /// </summary>
        /// <param name="languageFile">Raw text file that is read one line at a time.</param>
        /// <param name="directory">Directory that receives both the intermediate and the encoded file.</param>
        /// <param name="languageCode">Language code forwarded to <c>MarianHelper.PreprocessLine</c>.</param>
        /// <param name="spmModel">Model file handed to spm_encode through <c>--model</c>.</param>
        /// <param name="includePlaceholderTags">Forwarded unchanged to <c>MarianHelper.PreprocessLine</c>.</param>
        /// <param name="includeTagPairs">Forwarded unchanged to <c>MarianHelper.PreprocessLine</c>.</param>
        /// <returns>The <c>sp_</c>-prefixed file that was named as spm_encode's <c>--output</c>.</returns>
        internal static FileInfo PreprocessLanguage(
            FileInfo languageFile,
            DirectoryInfo directory,
            string languageCode,
            FileInfo spmModel,
            bool includePlaceholderTags,
            bool includeTagPairs)
        {
            var outputDir        = directory.FullName;
            var intermediateFile = new FileInfo(Path.Combine(outputDir, $"preprocessed_{languageFile.Name}"));

            //Marian doesn't like spaces in names, so the encoded file name uses underscores instead
            var underscoredName = languageFile.Name.Replace(" ", "_");
            var encodedFile     = new FileInfo(Path.Combine(outputDir, $"sp_{underscoredName}"));

            using (var reader = languageFile.OpenText())
            {
                using (var writer = new StreamWriter(intermediateFile.FullName))
                {
                    //Every input line yields exactly one preprocessed output line
                    for (var rawLine = reader.ReadLine(); rawLine != null; rawLine = reader.ReadLine())
                    {
                        writer.WriteLine(
                            MarianHelper.PreprocessLine(rawLine, languageCode, includePlaceholderTags, includeTagPairs));
                    }
                }
            }

            var encoderArgs = $"\"{intermediateFile.FullName}\" --model \"{spmModel.FullName}\" --output \"{encodedFile.FullName}\"";
            var encoder     = MarianHelper.StartProcessInBackgroundWithRedirects("Preprocessing\\spm_encode.exe", encoderArgs);

            //Block until the encoder has exited before handing the output file back
            encoder.WaitForExit();

            return encodedFile;
        }
        /// <summary>
        /// Writes the input segments to a uniquely named file in the temp directory and,
        /// unless the input is already preprocessed, runs it through
        /// <c>MarianHelper.PreprocessLanguage</c> with the model directory's <c>source.spm</c>.
        /// </summary>
        /// <param name="input">Segments to write, one per line.</param>
        /// <param name="preprocessedInput">
        /// When true, the freshly written file is returned as is and no preprocessing is run.
        /// </param>
        /// <returns>The file that holds the (possibly preprocessed) source segments.</returns>
        internal FileInfo PreprocessInput(IEnumerable <string> input, Boolean preprocessedInput = false)
        {
            //A fresh GUID keeps concurrent batches from sharing a temp file; the extension is the source code
            var rawInputName = $"{Guid.NewGuid()}.{this.SourceCode}";
            var rawInputFile = new FileInfo(Path.Combine(Path.GetTempPath(), rawInputName));

            using (var writer = new StreamWriter(rawInputFile.FullName, true, Encoding.UTF8))
            {
                foreach (var segment in input)
                {
                    writer.WriteLine(segment);
                }
            }

            //Already preprocessed input needs no further work
            if (preprocessedInput)
            {
                return rawInputFile;
            }

            var sourceSpm = this.modelDir.GetFiles("source.spm").Single();

            return MarianHelper.PreprocessLanguage(
                rawInputFile,
                new DirectoryInfo(Path.GetTempPath()),
                this.SourceCode,
                sourceSpm,
                this.includePlaceholderTags,
                this.includeTagPairs);
        }
        /// <summary>
        /// Starts <c>TranslateBatchSentencePiece.bat</c> in the background to translate
        /// <paramref name="input"/>.
        /// </summary>
        /// <remarks>
        /// The <c>OutputReady</c> callback can be used to do different things with the translation
        /// output/input; the default (enabled with <paramref name="storeTranslations"/>) is to save
        /// it in the translation cache.
        /// </remarks>
        /// <param name="input">Source segments to translate.</param>
        /// <param name="spOutput">
        /// Output file; the raw result is written next to it with a <c>.transandalign</c> suffix.
        /// </param>
        /// <param name="preprocessedInput">Forwarded to <c>PreprocessInput</c>; true skips preprocessing.</param>
        /// <param name="storeTranslations">
        /// When true, subscribes <c>WriteToTranslationDb</c> to <c>OutputReady</c>.
        /// </param>
        /// <returns>The started batch translation process.</returns>
        internal Process BatchTranslate(
            IEnumerable <string> input,
            FileInfo spOutput,
            Boolean preprocessedInput = false,
            Boolean storeTranslations = false)
        {
            if (storeTranslations)
            {
                this.OutputReady += (x, y) => this.WriteToTranslationDb(x, y, input, spOutput);
            }

            Log.Information($"Starting batch translator for model {this.SystemName}.");

            var cmd = "TranslateBatchSentencePiece.bat";

            FileInfo spInput = this.PreprocessInput(input, preprocessedInput);

            //TODO: check the translation cache for translations beforehand, and only translate new
            //segments (also change translation cache to account for different decoder configs for
            //same systems, i.e. keep track of decoder settings)

            FileInfo transAndAlign = new FileInfo($"{spOutput.FullName}.transandalign");

            //Every path is quoted: the temp and model directories may contain spaces, which would
            //otherwise split a single path into several command-line arguments
            var args = $"\"{this.modelDir.FullName}\" \"{spInput.FullName}\" \"{transAndAlign.FullName}\" --log-level=info --quiet";

            EventHandler exitHandler = (x, y) => BatchProcess_Exited(transAndAlign, spOutput, x, y);

            var batchProcess = MarianHelper.StartProcessInBackgroundWithRedirects(cmd, args, exitHandler);

            return(batchProcess);
        }
        /// <summary>
        /// Preprocesses and segments the fine-tuning corpus and the combined validation set.
        /// </summary>
        /// <remarks>
        /// Sets <c>segmentationMethod</c>, <c>spSource</c>, <c>spTarget</c>, <c>spValidSource</c>
        /// and <c>spValidTarget</c>. Throws <see cref="InvalidOperationException"/> (from
        /// <c>Single</c>) unless the customization directory contains exactly one source and
        /// exactly one target segmentation model.
        /// </remarks>
        private void PreprocessInput()
        {
            //The dot is escaped so that it only matches a literal '.' before the model extension
            FileInfo sourceSegModel =
                this.customDir.GetFiles().Single(x => Regex.IsMatch(x.Name, @"source\.(spm|bpe)"));
            FileInfo targetSegModel =
                this.customDir.GetFiles().Single(x => Regex.IsMatch(x.Name, @"target\.(spm|bpe)"));

            //The extension of the source model is recorded as the segmentation method
            this.segmentationMethod = sourceSegModel.Extension;

            //A target language prefix is only passed on when the model has more than one target language
            var targetPrefix = this.model.TargetLanguages.Count > 1 ? this.targetLanguage.OriginalCode : null;

            this.spSource = MarianHelper.PreprocessLanguage(
                this.customSource,
                this.customDir,
                this.sourceLanguage.OriginalCode,
                sourceSegModel,
                this.includePlaceholderTags,
                this.includeTagPairs,
                targetPrefix);
            this.spTarget = MarianHelper.PreprocessLanguage(
                this.customTarget,
                this.customDir,
                this.targetLanguage.OriginalCode,
                targetSegModel,
                this.includePlaceholderTags,
                this.includeTagPairs);

            //concatenate the out-of-domain validation set with the in-domain validation set
            ParallelFilePair tatoebaValidFileInfos =
                HelperFunctions.GetTatoebaFileInfos(this.sourceLanguage.ShortestIsoCode, this.targetLanguage.ShortestIsoCode);

            //Fall back to a generated dummy set when no Tatoeba validation files are returned
            if (tatoebaValidFileInfos == null)
            {
                tatoebaValidFileInfos = HelperFunctions.GenerateDummyOODValidSet(this.customDir);
            }

            ParallelFilePair combinedValid = new ParallelFilePair(
                tatoebaValidFileInfos,
                new ParallelFilePair(this.inDomainValidationSource, this.inDomainValidationTarget),
                this.customDir.FullName,
                OpusCatMTEngineSettings.Default.OODValidSetSize);

            this.spValidSource = MarianHelper.PreprocessLanguage(
                combinedValid.Source,
                this.customDir,
                this.sourceLanguage.OriginalCode,
                sourceSegModel,
                this.includePlaceholderTags,
                this.includeTagPairs,
                targetPrefix);
            this.spValidTarget = MarianHelper.PreprocessLanguage(
                combinedValid.Target,
                this.customDir,
                this.targetLanguage.OriginalCode,
                targetSegModel,
                this.includePlaceholderTags,
                this.includeTagPairs);
        }
        /// <summary>
        /// Writes <paramref name="input"/> to a file, preprocesses it unless it is already
        /// preprocessed, and starts <c>TranslateBatchSentencePiece.bat</c> in the background.
        /// </summary>
        /// <param name="input">Source segments to translate.</param>
        /// <param name="spOutput">
        /// Output file; the raw result is written next to it with a <c>.transandalign</c> suffix.
        /// </param>
        /// <param name="preprocessedInput">When true, the input file is passed to the translator as is.</param>
        /// <param name="storeTranslations">
        /// When true, subscribes <c>WriteToTranslationDb</c> to <c>OutputReady</c>.
        /// </param>
        /// <returns>The started batch translation process.</returns>
        internal Process BatchTranslate(
            IEnumerable <string> input,
            FileInfo spOutput,
            Boolean preprocessedInput = false,
            Boolean storeTranslations = false)
        {
            Log.Information($"Starting batch translator for model {this.SystemName}.");

            var      srcFile = MarianHelper.LinesToFile(input, this.SourceCode);
            FileInfo spInput;

            if (!preprocessedInput)
            {
                //The dot is escaped so that it only matches a literal '.' before the model extension
                FileInfo sourceSegModel =
                    this.modelDir.GetFiles().Single(x => Regex.IsMatch(x.Name, @"source\.(spm|bpe)"));

                //The file holds source-language text, so it is preprocessed with the source
                //language code (the same code the file was written with above)
                spInput = MarianHelper.PreprocessLanguage(
                    srcFile,
                    new DirectoryInfo(Path.GetTempPath()),
                    this.SourceCode,
                    sourceSegModel,
                    this.includePlaceholderTags,
                    this.includeTagPairs);
            }
            else
            {
                spInput = srcFile;
            }

            //TODO: check the translation cache for translations beforehand, and only translate new
            //segments (also change translation cache to account for different decoder configs for
            //same systems, i.e. keep track of decoder settings)

            FileInfo transAndAlign = new FileInfo($"{spOutput.FullName}.transandalign");
            var      args          = $"\"{this.modelDir.FullName}\" \"{spInput.FullName}\" \"{transAndAlign.FullName}\" --log-level=info --quiet";

            if (storeTranslations)
            {
                this.OutputReady += (x, y) => this.WriteToTranslationDb(x, y, input, spInput, transAndAlign);
            }

            EventHandler exitHandler = (x, y) => BatchProcess_Exited(transAndAlign, spOutput, x, y);

            var cmd          = "TranslateBatchSentencePiece.bat";
            var batchProcess = MarianHelper.StartProcessInBackgroundWithRedirects(cmd, args, exitHandler);

            return(batchProcess);
        }
Beispiel #6
0
        /// <summary>
        /// Aligns a segmented parallel corpus in both directions with <c>Alignment\align.py</c>
        /// and then symmetrizes the two alignments with <c>Alignment\atools.exe</c>.
        /// </summary>
        /// <param name="spSource">Segmented source side of the corpus.</param>
        /// <param name="spTarget">Segmented target side of the corpus.</param>
        /// <param name="alignmentFile">
        /// Destination named for the symmetrized alignment; the forward and reverse alignments
        /// use the same path with <c>.fwd</c> and <c>.rev</c> appended.
        /// </param>
        /// <param name="priorsFile">Not referenced anywhere in this method body.</param>
        internal static void GenerateAlignments(FileInfo spSource, FileInfo spTarget, FileInfo alignmentFile, FileInfo priorsFile)
        {
            var forwardPath = alignmentFile.FullName + ".fwd";
            var reversePath = alignmentFile.FullName + ".rev";

            //Step 1: directional alignments
            var alignArgs = $"-s \"{spSource.FullName}\" -t \"{spTarget.FullName}\" -f \"{forwardPath}\" -r \"{reversePath}\"";
            Log.Information($"Aligning fine-tuning corpus with args {alignArgs}");
            MarianHelper.StartProcessInBackgroundWithRedirects("python Alignment\\align.py", alignArgs).WaitForExit();

            //Step 2: symmetrization; it only starts after the aligner has exited
            var symmetryArgs = $"-c grow-diag-final -i \"{forwardPath}\" -j \"{reversePath}\" > \"{alignmentFile.FullName}\"";
            Log.Information($"Symmetrisizing alignment with args {symmetryArgs}");
            MarianHelper.StartProcessInBackgroundWithRedirects("Alignment\\atools.exe", symmetryArgs).WaitForExit();
        }
Beispiel #7
0
        /// <summary>
        /// Preprocesses the fine-tuning corpus and the combined validation set with the
        /// <c>source.spm</c> and <c>target.spm</c> models found in the customization directory.
        /// </summary>
        /// <remarks>
        /// Sets <c>spSource</c>, <c>spTarget</c>, <c>spValidSource</c> and <c>spValidTarget</c>.
        /// NOTE(review): the result of <c>GetTatoebaFileInfos</c> is passed on without a null
        /// check; confirm whether it can return null here.
        /// </remarks>
        private void PreprocessInput()
        {
            FileInfo sourceModel = this.customDir.GetFiles("source.spm").Single();
            FileInfo targetModel = this.customDir.GetFiles("target.spm").Single();

            //Fine-tuning corpus
            this.spSource = MarianHelper.PreprocessLanguage(
                this.customSource,
                this.customDir,
                this.sourceCode,
                sourceModel,
                this.includePlaceholderTags,
                this.includeTagPairs);
            this.spTarget = MarianHelper.PreprocessLanguage(
                this.customTarget,
                this.customDir,
                this.targetCode,
                targetModel,
                this.includePlaceholderTags,
                this.includeTagPairs);

            //The out-of-domain validation set is concatenated with the in-domain validation set
            var outOfDomainValid = HelperFunctions.GetTatoebaFileInfos(this.sourceCode, this.targetCode);
            var inDomainValid    = new ParallelFilePair(this.inDomainValidationSource, this.inDomainValidationTarget);
            var mergedValid      = new ParallelFilePair(
                outOfDomainValid,
                inDomainValid,
                this.customDir.FullName,
                OpusCatMTEngineSettings.Default.OODValidSetSize);

            //Combined validation set
            this.spValidSource = MarianHelper.PreprocessLanguage(
                mergedValid.Source,
                this.customDir,
                this.sourceCode,
                sourceModel,
                this.includePlaceholderTags,
                this.includeTagPairs);
            this.spValidTarget = MarianHelper.PreprocessLanguage(
                mergedValid.Target,
                this.customDir,
                this.targetCode,
                targetModel,
                this.includePlaceholderTags,
                this.includeTagPairs);
        }
        /// <summary>
        /// Loads the training configuration from the customization directory, stores it in
        /// the training log and starts <c>marian.exe</c> in the background with that configuration.
        /// </summary>
        /// <returns>The started training process.</returns>
        private Process StartTraining()
        {
            var trainingConfigPath = Path.Combine(
                this.customDir.FullName,
                OpusCatMTEngineSettings.Default.CustomizationBaseConfig);

            var yamlDeserializer = new Deserializer();
            MarianTrainerConfig parsedConfig;

            using (var configReader = new StreamReader(trainingConfigPath))
            {
                parsedConfig = yamlDeserializer.Deserialize <MarianTrainerConfig>(configReader);
            }

            //Keep the configuration that training runs with in the training log
            this.trainingLog.TrainingConfig = parsedConfig;

            //The config path is quoted on the command line; logging runs at info level
            var marianArgs = $"--config \"{trainingConfigPath}\" --log-level=info";
            var marianExe  = Path.Combine(OpusCatMTEngineSettings.Default.MarianDir, "marian.exe");

            return MarianHelper.StartProcessInBackgroundWithRedirects(
                marianExe,
                marianArgs,
                this.MarianExitHandler,
                this.MarianProgressHandler);
        }
        /// <summary>
        /// Runs the whole fine-tuning pipeline: copies the base model, stores the optional
        /// post-customization batch, copies and preprocesses the training files, optionally
        /// generates alignments, runs the initial evaluation, writes the training
        /// configuration and finally starts training.
        /// </summary>
        /// <returns>
        /// The started training process, or null when copying the model directory or saving
        /// the model configuration fails.
        /// </returns>
        public Process Customize()
        {
            this.OnProgressChanged(new ProgressChangedEventArgs(1, new MarianCustomizationStatus(CustomizationStep.Copying_model, null)));
            //First copy the model to new dir
            try
            {
                this.CopyModelDir(this.modelDir, this.customLabel);
                //Save model config as soon as the model dir exists
                this.customModel.SaveModelConfig();
            }
            catch (Exception ex)
            {
                Log.Information($"Customization failed: {ex.Message}");
                return(null);
            }

            //Save the batch to translate after customization to a file (to be batch translated after successful exit)
            if (this.postCustomizationBatch != null && this.postCustomizationBatch.Count > 0)
            {
                FileInfo postCustomizationBatchFile = new FileInfo(Path.Combine(this.customDir.FullName, OpusCatMTEngineSettings.Default.PostFinetuneBatchName));
                using (var writer = postCustomizationBatchFile.CreateText())
                {
                    foreach (var sourceString in this.postCustomizationBatch)
                    {
                        writer.WriteLine(sourceString);
                    }
                }
            }

            this.OnProgressChanged(new ProgressChangedEventArgs(2, new MarianCustomizationStatus(CustomizationStep.Copying_training_files, null)));
            //Copy raw files to model dir
            this.customSource = this.customSource.CopyTo(Path.Combine(this.customDir.FullName, "custom.source"));
            this.customTarget = this.customTarget.CopyTo(Path.Combine(this.customDir.FullName, "custom.target"));

            this.OnProgressChanged(new ProgressChangedEventArgs(3, new MarianCustomizationStatus(CustomizationStep.Preprocessing_training_files, null)));
            //Preprocess input files
            this.PreprocessInput();

            var decoderYaml  = this.customDir.GetFiles("decoder.yml").Single();
            var deserializer = new Deserializer();

            //Dispose the reader once the decoder settings are parsed, so that no file handle
            //stays open on decoder.yml for the rest of the customization
            MarianDecoderConfig decoderSettings;
            using (var decoderReader = decoderYaml.OpenText())
            {
                decoderSettings = deserializer.Deserialize <MarianDecoderConfig>(decoderReader);
            }

            if (this.guidedAlignment)
            {
                //Generate alignments for fine-tuning corpus
                this.alignmentFile = new FileInfo(Path.Combine(this.customDir.FullName, "custom.alignments"));
                MarianHelper.GenerateAlignments(this.spSource, this.spTarget, this.alignmentFile, this.model.AlignmentPriorsFile);

                //Generate alignments for validation set (for evaluating fine-tuning effect on alignment)
                this.validAlignmentFile = new FileInfo(Path.Combine(this.customDir.FullName, "combined.alignments"));
                MarianHelper.GenerateAlignments(this.spValidSource, this.spValidTarget, this.validAlignmentFile, this.model.AlignmentPriorsFile);
            }

            this.OnProgressChanged(new ProgressChangedEventArgs(4, new MarianCustomizationStatus(CustomizationStep.Initial_evaluation, null)));
            //Do the initial evaluation
            var initialValidProcess = this.model.TranslateAndEvaluate(
                this.spValidSource,
                new FileInfo(Path.Combine(this.customDir.FullName, "valid.0.txt")),
                this.spValidTarget,
                OpusCatMTEngineSettings.Default.OODValidSetSize,
                this.sourceLanguage,
                this.targetLanguage,
                true
                );

            //Wait for the initial valid to finish before starting customization
            //(TODO: make sure this is not done on UI thread)
            initialValidProcess.WaitForExit();

            this.OnProgressChanged(new ProgressChangedEventArgs(6, new MarianCustomizationStatus(CustomizationStep.Finetuning, null)));

            //Use the initial translation time as basis for estimating the duration of validation file
            //translation
            this.trainingLog.EstimatedTranslationDuration = Convert.ToInt32((initialValidProcess.ExitTime - initialValidProcess.StartTime).TotalSeconds);

            MarianTrainerConfig trainingConfig;

            var baseCustomizeYmlPath =
                HelperFunctions.GetLocalAppDataPath(
                    OpusCatMTEngineSettings.Default.CustomizationBaseConfig);

            var processDir = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location);

            //Make sure there's a customization file: copy the one next to the executing assembly
            //if the local app data copy does not exist yet.
            if (!File.Exists(baseCustomizeYmlPath))
            {
                File.Copy(
                    Path.Combine(processDir, OpusCatMTEngineSettings.Default.CustomizationBaseConfig),
                    baseCustomizeYmlPath);
            }

            //deserialize yaml file
            using (var reader = new StreamReader(baseCustomizeYmlPath))
            {
                trainingConfig = deserializer.Deserialize <MarianTrainerConfig>(reader);
            }

            trainingConfig.trainSets = new List <string>
            {
                this.spSource.FullName,
                this.spTarget.FullName
            };

            trainingConfig.ValidSets = new List <string>
            {
                this.spValidSource.FullName,
                this.spValidTarget.FullName
            };

            //NOTE(review): both entries use vocabs[0]; presumably source and target share one
            //vocabulary - confirm that vocabs[1] is not intended for the second entry.
            trainingConfig.vocabs = new List <string>
            {
                Path.Combine(this.customDir.FullName, decoderSettings.vocabs[0]),
                Path.Combine(this.customDir.FullName, decoderSettings.vocabs[0])
            };

            //Copy the validation script that matches the segmentation method into the model dir
            switch (this.segmentationMethod)
            {
            case ".bpe":
                string validScriptPath = Path.Combine(this.customDir.FullName, "ValidateBpe.bat");
                trainingConfig.validScriptPath =
                    $"\"{validScriptPath}\"";
                File.Copy(
                    Path.Combine(processDir, "ValidateBpe.bat"), validScriptPath);
                break;

            case ".spm":
                validScriptPath = Path.Combine(this.customDir.FullName, "ValidateSp.bat");
                trainingConfig.validScriptPath =
                    $"\"{validScriptPath}\"";
                File.Copy(
                    Path.Combine(processDir, "ValidateSp.bat"), validScriptPath);
                break;

            default:
                break;
            }

            trainingConfig.validScriptArgs =
                new List <string> {
                $"{spValidTarget.FullName}",
                $"OOD{OpusCatMTEngineSettings.Default.OODValidSetSize.ToString()}"
            };
            trainingConfig.validTranslationOutput = Path.Combine(this.customDir.FullName, "valid.{U}.txt");

            if (this.guidedAlignment)
            {
                trainingConfig.guidedAlignment = this.alignmentFile.FullName;
            }

            trainingConfig.validLog = Path.Combine(this.customDir.FullName, "valid.log");
            trainingConfig.log      = Path.Combine(this.customDir.FullName, "train.log");

            trainingConfig.model = Path.Combine(this.customDir.FullName, decoderSettings.models.Single());

            var builder = new SerializerBuilder();

            builder.ConfigureDefaultValuesHandling(DefaultValuesHandling.OmitNull);
            var serializer = builder.Build();

            //Write the finished configuration into the model dir, where StartTraining reads it
            var configPath = Path.Combine(this.customDir.FullName, OpusCatMTEngineSettings.Default.CustomizationBaseConfig);

            using (var writer = File.CreateText(configPath))
            {
                serializer.Serialize(writer, trainingConfig, typeof(MarianTrainerConfig));
            }

            Process trainProcess = this.StartTraining();

            return(trainProcess);
        }
Beispiel #10
0
        /// <summary>
        /// Preprocesses a language file line by line, segments the result with the given
        /// SentencePiece (<c>.spm</c>) or BPE (<c>.bpe</c>) model and optionally prefixes
        /// every segmented line with a target language token.
        /// </summary>
        /// <param name="languageFile">Raw text file that is read one line at a time.</param>
        /// <param name="directory">Directory that receives all intermediate and output files.</param>
        /// <param name="languageCode">
        /// Language code forwarded to <c>MarianHelper.PreprocessLine</c> and to the BPE pipeline script.
        /// </param>
        /// <param name="segmentationModel">Segmentation model; its extension selects the segmentation tool.</param>
        /// <param name="includePlaceholderTags">Forwarded unchanged to <c>MarianHelper.PreprocessLine</c>.</param>
        /// <param name="includeTagPairs">Forwarded unchanged to <c>MarianHelper.PreprocessLine</c>.</param>
        /// <param name="targetLanguageToPrefix">
        /// When not null, each segmented line is written to a second file as
        /// <c>&gt;&gt;code&lt;&lt; line</c> and that file is returned instead.
        /// </param>
        /// <returns>The <c>seg_</c> file, or the <c>prefix_</c> file when a target prefix was requested.</returns>
        /// <exception cref="NotSupportedException">
        /// The segmentation model extension is neither <c>.spm</c> nor <c>.bpe</c>.
        /// </exception>
        internal static FileInfo PreprocessLanguage(
            FileInfo languageFile,
            DirectoryInfo directory,
            string languageCode,
            FileInfo segmentationModel,
            bool includePlaceholderTags,
            bool includeTagPairs,
            string targetLanguageToPrefix = null)
        {
            var preprocessedFile = new FileInfo(Path.Combine(directory.FullName, $"preprocessed_{languageFile.Name}"));

            using (var rawFile = languageFile.OpenText())
                using (var preprocessedWriter = new StreamWriter(preprocessedFile.FullName))
                {
                    String line;
                    while ((line = rawFile.ReadLine()) != null)
                    {
                        var preprocessedLine = MarianHelper.PreprocessLine(line, languageCode, includePlaceholderTags, includeTagPairs);
                        preprocessedWriter.WriteLine(preprocessedLine);
                    }
                }

            //Marian doesn't like spaces in names
            var segmentedFile = new FileInfo(Path.Combine(directory.FullName, $"seg_{languageFile.Name.Replace(" ", "_")}"));

            switch (segmentationModel.Extension)
            {
            case ".spm":
                var spArgs = $"\"{preprocessedFile.FullName}\" --model \"{segmentationModel.FullName}\" --output \"{segmentedFile.FullName}\"";
                var segmentationProcess = MarianHelper.StartProcessInBackgroundWithRedirects("Preprocessing\\spm_encode.exe", spArgs);
                segmentationProcess.WaitForExit();
                break;

            case ".bpe":
                //Truecasing is not used in any models, so this is a dummy tc model (empty). So it does not
                //matter if source.tcmodel is used for target language.
                var tcModelPath = Path.Combine(directory.FullName, "source.tcmodel");

                //Every path is quoted: the directory (and so the input and output file) may contain
                //spaces, which would otherwise break the "type" command and the output redirection
                var mosesProcess = MarianHelper.StartProcessInBackgroundWithRedirects(
                    $"type \"{preprocessedFile.FullName}\" | Preprocessing\\StartMosesBpePreprocessPipe.bat {languageCode} \"{tcModelPath}\" \"{segmentationModel.FullName}\" > \"{segmentedFile.FullName}\"");
                mosesProcess.WaitForExit();
                break;

            default:
                throw new NotSupportedException("No segmentation model found");
            }

            if (targetLanguageToPrefix == null)
            {
                return(segmentedFile);
            }

            var segmentedWithTargetPrefix = new FileInfo(Path.Combine(directory.FullName, $"prefix_{languageFile.Name.Replace(" ", "_")}"));

            using (var segFile = segmentedFile.OpenText())
                using (var prefixWriter = new StreamWriter(segmentedWithTargetPrefix.FullName))
                {
                    String line;
                    while ((line = segFile.ReadLine()) != null)
                    {
                        var prefixedLine = $">>{targetLanguageToPrefix}<< {line}";
                        prefixWriter.WriteLine(prefixedLine);
                    }
                }

            return(segmentedWithTargetPrefix);
        }