Пример #1
0
        /// <summary>
        /// Runs every line of <paramref name="languageFile"/> through
        /// <c>MarianHelper.PreprocessLine</c>, writes the result to a
        /// <c>preprocessed_*</c> file in <paramref name="directory"/>, and then
        /// launches <c>Preprocessing\spm_encode.exe</c> on that file.
        /// </summary>
        /// <param name="languageFile">Raw text file to preprocess; read line by line.</param>
        /// <param name="directory">Directory that receives both the intermediate and the encoded file.</param>
        /// <param name="languageCode">Language code forwarded unchanged to <c>MarianHelper.PreprocessLine</c>.</param>
        /// <param name="spmModel">Model file passed to the encoder via <c>--model</c>.</param>
        /// <param name="includePlaceholderTags">Forwarded unchanged to <c>MarianHelper.PreprocessLine</c>.</param>
        /// <param name="includeTagPairs">Forwarded unchanged to <c>MarianHelper.PreprocessLine</c>.</param>
        /// <returns>
        /// A <see cref="FileInfo"/> for the <c>sp_*</c> path given to the encoder via <c>--output</c>.
        /// The encoder's result is not inspected here, so the file is not guaranteed to exist.
        /// </returns>
        internal static FileInfo PreprocessLanguage(
            FileInfo languageFile,
            DirectoryInfo directory,
            string languageCode,
            FileInfo spmModel,
            bool includePlaceholderTags,
            bool includeTagPairs)
        {
            var preprocessedPath = Path.Combine(directory.FullName, $"preprocessed_{languageFile.Name}");
            var preprocessedFile = new FileInfo(preprocessedPath);

            // Marian doesn't like spaces in names, so the encoder output name has them replaced.
            var encodedName = $"sp_{languageFile.Name.Replace(" ", "_")}";
            var spFile = new FileInfo(Path.Combine(directory.FullName, encodedName));

            // Both streams must be closed before the encoder is started, hence the
            // explicit using block rather than leaving disposal to the end of the method.
            using (var reader = languageFile.OpenText())
            using (var writer = new StreamWriter(preprocessedFile.FullName))
            {
                for (var rawLine = reader.ReadLine(); rawLine != null; rawLine = reader.ReadLine())
                {
                    writer.WriteLine(
                        MarianHelper.PreprocessLine(rawLine, languageCode, includePlaceholderTags, includeTagPairs));
                }
            }

            var encoderArguments = $"\"{preprocessedFile.FullName}\" --model \"{spmModel.FullName}\" --output \"{spFile.FullName}\"";
            var encoder = MarianHelper.StartProcessInBackgroundWithRedirects("Preprocessing\\spm_encode.exe", encoderArguments);

            // Block until the encoder has finished so the caller can use the output file.
            encoder.WaitForExit();

            return spFile;
        }
Пример #2
0
        /// <summary>
        /// Preprocesses <paramref name="languageFile"/> line by line with
        /// <c>MarianHelper.PreprocessLine</c>, segments the result with the tool selected by the
        /// extension of <paramref name="segmentationModel"/>, and optionally prepends a
        /// <c>&gt;&gt;lang&lt;&lt;</c> target-language token to every segmented line.
        /// </summary>
        /// <param name="languageFile">Raw text file to preprocess; read line by line.</param>
        /// <param name="directory">Directory that receives all intermediate and output files.</param>
        /// <param name="languageCode">
        /// Language code forwarded to <c>MarianHelper.PreprocessLine</c> and, for <c>.bpe</c> models,
        /// to <c>StartMosesBpePreprocessPipe.bat</c>.
        /// </param>
        /// <param name="segmentationModel">
        /// Segmentation model file. <c>.spm</c> runs <c>Preprocessing\spm_encode.exe</c>;
        /// <c>.bpe</c> runs <c>Preprocessing\StartMosesBpePreprocessPipe.bat</c>.
        /// The extension match is case-sensitive.
        /// </param>
        /// <param name="includePlaceholderTags">Forwarded unchanged to <c>MarianHelper.PreprocessLine</c>.</param>
        /// <param name="includeTagPairs">Forwarded unchanged to <c>MarianHelper.PreprocessLine</c>.</param>
        /// <param name="targetLanguageToPrefix">
        /// When not <c>null</c>, each segmented line is written to a separate <c>prefix_*</c> file
        /// as <c>&gt;&gt;{targetLanguageToPrefix}&lt;&lt; {line}</c> and that file is returned.
        /// </param>
        /// <returns>
        /// The <c>prefix_*</c> file when <paramref name="targetLanguageToPrefix"/> is given,
        /// otherwise the <c>seg_*</c> file targeted by the segmentation tool.
        /// </returns>
        /// <exception cref="NotSupportedException">
        /// The extension of <paramref name="segmentationModel"/> is neither <c>.spm</c> nor <c>.bpe</c>.
        /// </exception>
        internal static FileInfo PreprocessLanguage(
            FileInfo languageFile,
            DirectoryInfo directory,
            string languageCode,
            FileInfo segmentationModel,
            bool includePlaceholderTags,
            bool includeTagPairs,
            string targetLanguageToPrefix = null)
        {
            var preprocessedFile = new FileInfo(Path.Combine(directory.FullName, $"preprocessed_{languageFile.Name}"));

            // The writer must be closed before the segmentation tool reads the file,
            // so the streams are scoped to this block.
            using (var rawFile = languageFile.OpenText())
                using (var preprocessedWriter = new StreamWriter(preprocessedFile.FullName))
                {
                    string line;
                    while ((line = rawFile.ReadLine()) != null)
                    {
                        var preprocessedLine = MarianHelper.PreprocessLine(line, languageCode, includePlaceholderTags, includeTagPairs);
                        preprocessedWriter.WriteLine(preprocessedLine);
                    }
                }

            //Marian doesn't like spaces in names
            var segmentedFile = new FileInfo(Path.Combine(directory.FullName, $"seg_{languageFile.Name.Replace(" ", "_")}"));

            switch (segmentationModel.Extension)
            {
            case ".spm":
                var spArgs = $"\"{preprocessedFile.FullName}\" --model \"{segmentationModel.FullName}\" --output \"{segmentedFile.FullName}\"";
                var segmentationProcess = MarianHelper.StartProcessInBackgroundWithRedirects("Preprocessing\\spm_encode.exe", spArgs);
                segmentationProcess.WaitForExit();
                break;

            case ".bpe":
                //Truecasing is not used in any models, so this is a dummy tc model (empty). So it does not
                //matter if source.tcmodel is used for the target language.
                var tcModelPath = Path.Combine(directory.FullName, "source.tcmodel");

                // Every path is quoted: the preprocessed file keeps the original file name (which may
                // contain spaces), and the working directory itself may contain spaces as well.
                var mosesProcess = MarianHelper.StartProcessInBackgroundWithRedirects(
                    $"type \"{preprocessedFile.FullName}\" | Preprocessing\\StartMosesBpePreprocessPipe.bat {languageCode} \"{tcModelPath}\" \"{segmentationModel.FullName}\" > \"{segmentedFile.FullName}\"");
                mosesProcess.WaitForExit();
                break;

            default:
                throw new NotSupportedException("No segmentation model found");
            }

            if (targetLanguageToPrefix != null)
            {
                var segmentedWithTargetPrefix = new FileInfo(Path.Combine(directory.FullName, $"prefix_{languageFile.Name.Replace(" ", "_")}"));

                using (var segFile = segmentedFile.OpenText())
                    using (var prefixWriter = new StreamWriter(segmentedWithTargetPrefix.FullName))
                    {
                        string line;
                        while ((line = segFile.ReadLine()) != null)
                        {
                            var prefixedLine = $">>{targetLanguageToPrefix}<< {line}";
                            prefixWriter.WriteLine(prefixedLine);
                        }
                    }

                return segmentedWithTargetPrefix;
            }

            return segmentedFile;
        }