Beispiel #1
0
        /// <summary>
        /// Passes every line of <paramref name="languageFile"/> through
        /// <c>MarianHelper.PreprocessLine</c>, writes the result to a "preprocessed_" file in
        /// <paramref name="directory"/>, and then runs spm_encode.exe on that file with
        /// <paramref name="spmModel"/>.
        /// </summary>
        /// <param name="languageFile">Text file to preprocess, read line by line.</param>
        /// <param name="directory">Directory that receives both the intermediate and the encoded file.</param>
        /// <param name="languageCode">Forwarded unchanged to <c>MarianHelper.PreprocessLine</c>.</param>
        /// <param name="spmModel">Model file passed to spm_encode.exe via <c>--model</c>.</param>
        /// <param name="includePlaceholderTags">Forwarded unchanged to <c>MarianHelper.PreprocessLine</c>.</param>
        /// <param name="includeTagPairs">Forwarded unchanged to <c>MarianHelper.PreprocessLine</c>.</param>
        /// <returns>
        /// The "sp_" file named as the <c>--output</c> of spm_encode.exe. The method waits for the
        /// process to exit but does not inspect its exit code.
        /// </returns>
        internal static FileInfo PreprocessLanguage(
            FileInfo languageFile,
            DirectoryInfo directory,
            string languageCode,
            FileInfo spmModel,
            bool includePlaceholderTags,
            bool includeTagPairs)
        {
            string targetDirectory = directory.FullName;
            var preprocessedFile = new FileInfo(Path.Combine(targetDirectory, $"preprocessed_{languageFile.Name}"));

            // Marian doesn't like spaces in names, so the encoded file replaces them with underscores.
            string encodedFileName = $"sp_{languageFile.Name.Replace(" ", "_")}";
            var spFile = new FileInfo(Path.Combine(targetDirectory, encodedFileName));

            using (var reader = languageFile.OpenText())
            using (var writer = new StreamWriter(preprocessedFile.FullName))
            {
                for (string rawLine = reader.ReadLine(); rawLine != null; rawLine = reader.ReadLine())
                {
                    writer.WriteLine(
                        MarianHelper.PreprocessLine(rawLine, languageCode, includePlaceholderTags, includeTagPairs));
                }
            }

            // All three paths are quoted so that spaces in them survive the command line.
            var encodeArguments =
                $"\"{preprocessedFile.FullName}\"" +
                $" --model \"{spmModel.FullName}\"" +
                $" --output \"{spFile.FullName}\"";

            var encoder = MarianHelper.StartProcessInBackgroundWithRedirects("Preprocessing\\spm_encode.exe", encodeArguments);
            encoder.WaitForExit();

            return spFile;
        }
        /// <summary>
        /// Preprocesses <paramref name="input"/> via <c>PreprocessInput</c> and starts
        /// TranslateBatchSentencePiece.bat on the result as a background process.
        /// </summary>
        /// <param name="input">Segments to translate.</param>
        /// <param name="spOutput">
        /// Output file; the batch script is told to write to this path with ".transandalign"
        /// appended, and both files are handed to <c>BatchProcess_Exited</c> when the process exits.
        /// </param>
        /// <param name="preprocessedInput">Forwarded unchanged to <c>PreprocessInput</c>.</param>
        /// <param name="storeTranslations">
        /// When true, a handler calling <c>WriteToTranslationDb</c> is attached to <c>OutputReady</c>.
        /// </param>
        /// <returns>The started batch process; this method does not wait for it to exit.</returns>
        internal Process BatchTranslate(
            IEnumerable <string> input,
            FileInfo spOutput,
            Boolean preprocessedInput = false,
            Boolean storeTranslations = false)
        {
            if (storeTranslations)
            {
                // NOTE(review): every call with storeTranslations=true adds another handler and
                // nothing in this method removes it — confirm that accumulation is intended.
                this.OutputReady += (x, y) => this.WriteToTranslationDb(x, y, input, spOutput);
            }

            Log.Information($"Starting batch translator for model {this.SystemName}.");

            var cmd = "TranslateBatchSentencePiece.bat";

            FileInfo spInput = this.PreprocessInput(input, preprocessedInput);

            //TODO: check the translation cache for translations beforehand, and only translate new
            //segments (also change translation cache to account for different decoder configs for
            //same systems, i.e. keep track of decoder settings)

            FileInfo transAndAlign = new FileInfo($"{spOutput.FullName}.transandalign");

            // Each path is quoted: unquoted, a path containing a space would be split into
            // several command-line arguments and the script would receive the wrong files.
            var args = $"\"{this.modelDir.FullName}\" \"{spInput.FullName}\" \"{transAndAlign.FullName}\" --log-level=info --quiet";

            EventHandler exitHandler = (x, y) => BatchProcess_Exited(transAndAlign, spOutput, x, y);

            var batchProcess = MarianHelper.StartProcessInBackgroundWithRedirects(cmd, args, exitHandler);

            return batchProcess;
        }
        /// <summary>
        /// Writes <paramref name="input"/> to a file, segments that file unless the input is
        /// already preprocessed, and starts TranslateBatchSentencePiece.bat on it as a
        /// background process.
        /// </summary>
        /// <param name="input">Segments to translate.</param>
        /// <param name="spOutput">
        /// Output file; the batch script is told to write to this path with ".transandalign"
        /// appended, and both files are handed to <c>BatchProcess_Exited</c> when the process exits.
        /// </param>
        /// <param name="preprocessedInput">
        /// When true, the file written from <paramref name="input"/> is passed to the script as is.
        /// When false, it is first run through <c>MarianHelper.PreprocessLanguage</c> using the
        /// single file in the model directory whose name matches "source.(spm|bpe)".
        /// </param>
        /// <param name="storeTranslations">
        /// When true, a handler calling <c>WriteToTranslationDb</c> is attached to <c>OutputReady</c>.
        /// </param>
        /// <returns>The started batch process; this method does not wait for it to exit.</returns>
        internal Process BatchTranslate(
            IEnumerable <string> input,
            FileInfo spOutput,
            Boolean preprocessedInput = false,
            Boolean storeTranslations = false)
        {
            Log.Information($"Starting batch translator for model {this.SystemName}.");

            var rawSourceFile = MarianHelper.LinesToFile(input, this.SourceCode);

            // Start from the raw file and swap in the segmented one only when segmentation is needed.
            FileInfo spInput = rawSourceFile;
            if (!preprocessedInput)
            {
                // Single() throws unless exactly one matching segmentation model exists.
                FileInfo segmentationModel = this.modelDir
                    .GetFiles()
                    .Where(candidate => Regex.IsMatch(candidate.Name, "source.(spm|bpe)"))
                    .Single();

                var scratchDirectory = new DirectoryInfo(Path.GetTempPath());

                // NOTE(review): the source file is preprocessed with TargetCode, not SourceCode —
                // confirm this is intended.
                spInput = MarianHelper.PreprocessLanguage(
                    rawSourceFile,
                    scratchDirectory,
                    this.TargetCode,
                    segmentationModel,
                    this.includePlaceholderTags,
                    this.includeTagPairs);
            }

            //TODO: check the translation cache for translations beforehand, and only translate new
            //segments (also change translation cache to account for different decoder configs for
            //same systems, i.e. keep track of decoder settings)

            var transAndAlign = new FileInfo($"{spOutput.FullName}.transandalign");
            string scriptArguments =
                $"\"{this.modelDir.FullName}\" \"{spInput.FullName}\" \"{transAndAlign.FullName}\"" +
                " --log-level=info --quiet";

            if (storeTranslations)
            {
                this.OutputReady += (sender, eventArgs) =>
                    this.WriteToTranslationDb(sender, eventArgs, input, spInput, transAndAlign);
            }

            EventHandler onExit = (sender, eventArgs) => BatchProcess_Exited(transAndAlign, spOutput, sender, eventArgs);

            return MarianHelper.StartProcessInBackgroundWithRedirects(
                "TranslateBatchSentencePiece.bat",
                scriptArguments,
                onExit);
        }
Beispiel #4
0
        /// <summary>
        /// Aligns a segmented source/target pair in two steps: Alignment\align.py writes forward
        /// (".fwd") and reverse (".rev") alignments next to <paramref name="alignmentFile"/>, then
        /// Alignment\atools.exe combines them with grow-diag-final, with its output redirected
        /// to <paramref name="alignmentFile"/>.
        /// </summary>
        /// <param name="spSource">Source-side file passed to align.py with <c>-s</c>.</param>
        /// <param name="spTarget">Target-side file passed to align.py with <c>-t</c>.</param>
        /// <param name="alignmentFile">Destination of the symmetrized alignment.</param>
        /// <param name="priorsFile">Not used by this method.</param>
        /// <remarks>Each process is waited for in turn; exit codes are not inspected.</remarks>
        internal static void GenerateAlignments(FileInfo spSource, FileInfo spTarget, FileInfo alignmentFile, FileInfo priorsFile)
        {
            string forwardPath = $"{alignmentFile.FullName}.fwd";
            string reversePath = $"{alignmentFile.FullName}.rev";

            // Step 1: directional alignments.
            string alignerArguments =
                $"-s \"{spSource.FullName}\" -t \"{spTarget.FullName}\" -f \"{forwardPath}\" -r \"{reversePath}\"";
            Log.Information($"Aligning fine-tuning corpus with args {alignerArguments}");

            var aligner = MarianHelper.StartProcessInBackgroundWithRedirects("python Alignment\\align.py", alignerArguments);
            aligner.WaitForExit();

            // Step 2: symmetrize; the trailing "> file" is part of the argument string itself.
            string symmetrizerArguments =
                $"-c grow-diag-final -i \"{forwardPath}\" -j \"{reversePath}\" > \"{alignmentFile.FullName}\"";
            Log.Information($"Symmetrisizing alignment with args {symmetrizerArguments}");

            var symmetrizer = MarianHelper.StartProcessInBackgroundWithRedirects("Alignment\\atools.exe", symmetrizerArguments);
            symmetrizer.WaitForExit();
        }
        /// <summary>
        /// Loads the customization base config from the custom directory, records it on the
        /// training log, and starts marian.exe with that config file as a background process.
        /// </summary>
        /// <returns>The started marian.exe process; this method does not wait for it to exit.</returns>
        private Process StartTraining()
        {
            string baseConfigName = OpusCatMTEngineSettings.Default.CustomizationBaseConfig;
            string configPath = Path.Combine(this.customDir.FullName, baseConfigName);

            var configDeserializer = new Deserializer();
            MarianTrainerConfig loadedConfig;
            using (var configReader = new StreamReader(configPath))
            {
                loadedConfig = configDeserializer.Deserialize <MarianTrainerConfig>(configReader);
            }

            this.trainingLog.TrainingConfig = loadedConfig;

            // The path is quoted so that a custom directory containing spaces stays one argument.
            string marianArguments = $"--config \"{configPath}\" --log-level=info";
            string marianExecutable = Path.Combine(OpusCatMTEngineSettings.Default.MarianDir, "marian.exe");

            return MarianHelper.StartProcessInBackgroundWithRedirects(
                marianExecutable,
                marianArguments,
                this.MarianExitHandler,
                this.MarianProgressHandler);
        }
Beispiel #6
0
        /// <summary>
        /// Passes every line of <paramref name="languageFile"/> through
        /// <c>MarianHelper.PreprocessLine</c>, segments the result with the tool selected by the
        /// extension of <paramref name="segmentationModel"/>, and optionally prefixes every
        /// segmented line with a <c>&gt;&gt;lang&lt;&lt;</c> target-language token.
        /// </summary>
        /// <param name="languageFile">Text file to preprocess, read line by line.</param>
        /// <param name="directory">Directory that receives all intermediate and output files.</param>
        /// <param name="languageCode">
        /// Forwarded to <c>MarianHelper.PreprocessLine</c> and, for ".bpe" models, to the Moses/BPE
        /// batch pipeline.
        /// </param>
        /// <param name="segmentationModel">
        /// Segmentation model; ".spm" runs spm_encode.exe, ".bpe" runs
        /// StartMosesBpePreprocessPipe.bat. The extension match is case-sensitive.
        /// </param>
        /// <param name="includePlaceholderTags">Forwarded unchanged to <c>MarianHelper.PreprocessLine</c>.</param>
        /// <param name="includeTagPairs">Forwarded unchanged to <c>MarianHelper.PreprocessLine</c>.</param>
        /// <param name="targetLanguageToPrefix">
        /// When not null, each segmented line is written to a "prefix_" file as
        /// <c>&gt;&gt;{targetLanguageToPrefix}&lt;&lt; {line}</c> and that file is returned instead.
        /// </param>
        /// <returns>
        /// The "prefix_" file when <paramref name="targetLanguageToPrefix"/> is given, otherwise the
        /// "seg_" file. The segmentation process is waited for but its exit code is not inspected.
        /// </returns>
        /// <exception cref="NotSupportedException">
        /// The extension of <paramref name="segmentationModel"/> is neither ".spm" nor ".bpe".
        /// </exception>
        internal static FileInfo PreprocessLanguage(
            FileInfo languageFile,
            DirectoryInfo directory,
            string languageCode,
            FileInfo segmentationModel,
            bool includePlaceholderTags,
            bool includeTagPairs,
            string targetLanguageToPrefix = null)
        {
            var preprocessedFile = new FileInfo(Path.Combine(directory.FullName, $"preprocessed_{languageFile.Name}"));

            using (var rawFile = languageFile.OpenText())
            using (var preprocessedWriter = new StreamWriter(preprocessedFile.FullName))
            {
                String line;
                while ((line = rawFile.ReadLine()) != null)
                {
                    var preprocessedLine = MarianHelper.PreprocessLine(line, languageCode, includePlaceholderTags, includeTagPairs);
                    preprocessedWriter.WriteLine(preprocessedLine);
                }
            }

            //Marian doesn't like spaces in names
            var segmentedFile = new FileInfo(Path.Combine(directory.FullName, $"seg_{languageFile.Name.Replace(" ", "_")}"));

            switch (segmentationModel.Extension)
            {
                case ".spm":
                    var spArgs = $"\"{preprocessedFile.FullName}\" --model \"{segmentationModel.FullName}\" --output \"{segmentedFile.FullName}\"";
                    var segmentationProcess = MarianHelper.StartProcessInBackgroundWithRedirects("Preprocessing\\spm_encode.exe", spArgs);
                    segmentationProcess.WaitForExit();
                    break;

                case ".bpe":
                    //Truecasing is not used in any models, so this is a dummy tc model (empty). So it does not
                    //matter if source.tcmodel is used for target language.
                    var tcModelPath = Path.Combine(directory.FullName, "source.tcmodel");

                    // The input and output paths are quoted like the other arguments: the
                    // preprocessed file keeps any spaces from the original file name, and the
                    // directory itself may contain spaces.
                    var mosesProcess = MarianHelper.StartProcessInBackgroundWithRedirects(
                        $"type \"{preprocessedFile.FullName}\" | Preprocessing\\StartMosesBpePreprocessPipe.bat {languageCode} \"{tcModelPath}\" \"{segmentationModel.FullName}\" > \"{segmentedFile.FullName}\"");
                    mosesProcess.WaitForExit();
                    break;

                default:
                    throw new NotSupportedException("No segmentation model found");
            }

            if (targetLanguageToPrefix == null)
            {
                return segmentedFile;
            }

            var segmentedWithTargetPrefix = new FileInfo(Path.Combine(directory.FullName, $"prefix_{languageFile.Name.Replace(" ", "_")}"));

            using (var segFile = segmentedFile.OpenText())
            using (var prefixWriter = new StreamWriter(segmentedWithTargetPrefix.FullName))
            {
                String line;
                while ((line = segFile.ReadLine()) != null)
                {
                    var prefixedLine = $">>{targetLanguageToPrefix}<< {line}";
                    prefixWriter.WriteLine(prefixedLine);
                }
            }

            return segmentedWithTargetPrefix;
        }