/// <summary>
/// Initializes a customizer from a base model, the custom model that is being produced,
/// and the parallel data and options used for the customization.
/// </summary>
/// <param name="model">Base model; its <c>InstallDir</c> becomes the model directory.</param>
/// <param name="customModel">Custom model; its <c>InstallDir</c> becomes the customization directory.</param>
/// <param name="inputPair">Parallel files whose source and target are stored as the customization data.</param>
/// <param name="indomainValidPair">Parallel files whose source and target are stored as the in-domain validation data.</param>
/// <param name="customLabel">Label stored for this customization.</param>
/// <param name="includePlaceholderTags">Stored as-is; forwarded to preprocessing elsewhere in the class.</param>
/// <param name="includeTagPairs">Stored as-is; forwarded to preprocessing elsewhere in the class.</param>
/// <param name="postCustomizationBatch">Accepted for interface compatibility; this constructor does not store it.</param>
/// <param name="sourceLanguage">Source language of the customization.</param>
/// <param name="targetLanguage">Target language of the customization.</param>
/// <param name="guidedAlignment">Guided alignment flag, stored as-is (defaults to <c>false</c>).</param>
public MarianCustomizer(
     MTModel model,
     MTModel customModel,
     ParallelFilePair inputPair,
     ParallelFilePair indomainValidPair,
     string customLabel,
     bool includePlaceholderTags,
     bool includeTagPairs,
     List<string> postCustomizationBatch,
     IsoLanguage sourceLanguage,
     IsoLanguage targetLanguage,
     bool guidedAlignment = false)
 {
     // Models and the directories they are installed in.
     this.model = model;
     this.modelDir = new DirectoryInfo(model.InstallDir);
     this.customModel = customModel;
     this.customDir = new DirectoryInfo(customModel.InstallDir);

     // Customization data.
     this.customSource = inputPair.Source;
     this.customTarget = inputPair.Target;

     // In-domain validation data.
     this.inDomainValidationSource = indomainValidPair.Source;
     this.inDomainValidationTarget = indomainValidPair.Target;

     // Language pair.
     this.sourceLanguage = sourceLanguage;
     this.targetLanguage = targetLanguage;

     // Plain option values.
     this.customLabel = customLabel;
     this.includePlaceholderTags = includePlaceholderTags;
     this.includeTagPairs = includeTagPairs;
     this.guidedAlignment = guidedAlignment;
 }
        /// <summary>
        /// Preprocesses the customization data and a combined validation set using the
        /// segmentation model files found in the customization directory.
        /// </summary>
        /// <remarks>
        /// The results are stored in <c>spSource</c>, <c>spTarget</c>, <c>spValidSource</c> and
        /// <c>spValidTarget</c>; the extension of the source segmentation model is stored in
        /// <c>segmentationMethod</c>.
        /// </remarks>
        /// <exception cref="InvalidOperationException">
        /// The customization directory does not contain exactly one source and exactly one
        /// target segmentation model file (<c>source.spm</c>/<c>source.bpe</c>,
        /// <c>target.spm</c>/<c>target.bpe</c>).
        /// </exception>
        private void PreprocessInput()
        {
            // The patterns are anchored and the dot is escaped so that only the exact file
            // names match. An unanchored "source.(spm|bpe)" would also accept names such as
            // "source.spm.vocab" or "source_spm", which makes Single() throw on an otherwise
            // valid directory or picks up the wrong extension below.
            FileInfo[] customDirFiles = this.customDir.GetFiles();
            FileInfo sourceSegModel =
                customDirFiles.Single(x => Regex.IsMatch(x.Name, @"^source\.(spm|bpe)$"));
            FileInfo targetSegModel =
                customDirFiles.Single(x => Regex.IsMatch(x.Name, @"^target\.(spm|bpe)$"));

            // FileInfo.Extension includes the leading dot (".spm" or ".bpe").
            this.segmentationMethod = sourceSegModel.Extension;

            // A target prefix is only passed when the base model has several target languages.
            var targetPrefix = this.model.TargetLanguages.Count > 1 ? this.targetLanguage.OriginalCode : null;

            this.spSource = MarianHelper.PreprocessLanguage(
                this.customSource,
                this.customDir,
                this.sourceLanguage.OriginalCode,
                sourceSegModel,
                this.includePlaceholderTags,
                this.includeTagPairs,
                targetPrefix);
            this.spTarget = MarianHelper.PreprocessLanguage(
                this.customTarget,
                this.customDir,
                this.targetLanguage.OriginalCode,
                targetSegModel,
                this.includePlaceholderTags,
                this.includeTagPairs);

            // Concatenate the out-of-domain validation set with the in-domain validation set.
            ParallelFilePair tatoebaValidFileInfos =
                HelperFunctions.GetTatoebaFileInfos(this.sourceLanguage.ShortestIsoCode, this.targetLanguage.ShortestIsoCode);

            // No out-of-domain set is available for this language pair: fall back to a dummy set.
            if (tatoebaValidFileInfos == null)
            {
                tatoebaValidFileInfos = HelperFunctions.GenerateDummyOODValidSet(this.customDir);
            }

            ParallelFilePair combinedValid = new ParallelFilePair(
                tatoebaValidFileInfos,
                new ParallelFilePair(this.inDomainValidationSource, this.inDomainValidationTarget),
                this.customDir.FullName,
                OpusCatMTEngineSettings.Default.OODValidSetSize);

            this.spValidSource = MarianHelper.PreprocessLanguage(
                combinedValid.Source,
                this.customDir,
                this.sourceLanguage.OriginalCode,
                sourceSegModel,
                this.includePlaceholderTags,
                this.includeTagPairs,
                targetPrefix);
            this.spValidTarget = MarianHelper.PreprocessLanguage(
                combinedValid.Target,
                this.customDir,
                this.targetLanguage.OriginalCode,
                targetSegModel,
                this.includePlaceholderTags,
                this.includeTagPairs);
        }
Exemple #3
0
        /// <summary>
        /// Splits a parallel file pair into two pairs (used for separating a validation set
        /// from a training set). The split is deterministic: every n-th line pair goes to the
        /// second pair, where n is the source line count divided by <paramref name="pair2Size"/>.
        /// </summary>
        /// <param name="filePair">The pair to split. Its files are only read.</param>
        /// <param name="pair2Size">
        /// Maximum number of line pairs written to the second pair. Must be positive.
        /// </param>
        /// <returns>
        /// <c>pair1</c> and <c>pair2</c>, written next to the input files as
        /// <c>split1.&lt;name&gt;</c> and <c>split2.&lt;name&gt;</c>. The second pair never
        /// contains more than <paramref name="pair2Size"/> line pairs; all other line pairs
        /// go to the first pair.
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="pair2Size"/> is zero or negative.
        /// </exception>
        internal static (ParallelFilePair pair1, ParallelFilePair pair2) SplitFilePair(ParallelFilePair filePair, int pair2Size)
        {
            if (pair2Size <= 0)
            {
                throw new ArgumentOutOfRangeException(
                    nameof(pair2Size),
                    pair2Size,
                    "The size of the second pair must be positive.");
            }

            // First need to get the line count of the file pair (counted on the source side).
            var lines = 0;

            using (var reader = filePair.Source.OpenText())
            {
                while (reader.ReadLine() != null)
                {
                    lines++;
                }
            }

            ParallelFilePair pair1 =
                new ParallelFilePair(
                    Path.Combine(filePair.Source.DirectoryName, $"split1.{filePair.Source.Name}"),
                    Path.Combine(filePair.Target.DirectoryName, $"split1.{filePair.Target.Name}"));
            ParallelFilePair pair2 =
                new ParallelFilePair(
                    Path.Combine(filePair.Source.DirectoryName, $"split2.{filePair.Source.Name}"),
                    Path.Combine(filePair.Target.DirectoryName, $"split2.{filePair.Target.Name}"));

            // Clamp the stride to 1: when the file has fewer lines than pair2Size the integer
            // division yields 0, which would make the modulo below throw.
            var nthLine = Math.Max(1, lines / pair2Size);

            var    processedLines = 0;
            var    pair2Lines     = 0;
            string sourceLine, targetLine;

            using (var sourcereader = filePair.Source.OpenText())
            using (var sourcewriter1 = pair1.Source.CreateText())
            using (var sourcewriter2 = pair2.Source.CreateText())
            using (var targetreader = filePair.Target.OpenText())
            using (var targetwriter1 = pair1.Target.CreateText())
            using (var targetwriter2 = pair2.Target.CreateText())
            {
                // Stops at the end of the shorter file, so both outputs stay parallel.
                while
                (((sourceLine = sourcereader.ReadLine()) != null) &&
                 ((targetLine = targetreader.ReadLine()) != null))
                {
                    // The cap keeps pair2 at the requested size; without it a stride of 1
                    // would send every line to pair2 and leave pair1 empty.
                    if (pair2Lines < pair2Size && processedLines % nthLine == 0)
                    {
                        sourcewriter2.WriteLine(sourceLine);
                        targetwriter2.WriteLine(targetLine);
                        pair2Lines++;
                    }
                    else
                    {
                        sourcewriter1.WriteLine(sourceLine);
                        targetwriter1.WriteLine(targetLine);
                    }

                    processedLines++;
                }
            }

            return (pair1, pair2);
        }
Exemple #4
0
 /// <summary>
 /// Creates a file pair by combining two existing file pairs. The source files of both
 /// pairs are combined into <c>combined.source</c> and the target files into
 /// <c>combined.target</c>, both located in <paramref name="combinedPath"/>.
 /// </summary>
 /// <param name="pair1">First pair to combine.</param>
 /// <param name="pair2">Second pair to combine.</param>
 /// <param name="combinedPath">Directory in which the combined files are placed.</param>
 /// <param name="pair1Lines">Value forwarded to <c>HelperFunctions.CombineFiles</c> for the first pair (default 1000).</param>
 /// <param name="pair2Lines">Value forwarded to <c>HelperFunctions.CombineFiles</c> for the second pair (default 1000).</param>
 public ParallelFilePair(ParallelFilePair pair1, ParallelFilePair pair2, string combinedPath, int pair1Lines = 1000, int pair2Lines = 1000)
 {
     string combinedSourcePath = Path.Combine(combinedPath, "combined.source");
     string combinedTargetPath = Path.Combine(combinedPath, "combined.target");

     // Source side first, then target side, with identical line limits for both.
     this.Source = HelperFunctions.CombineFiles(pair1.Source, pair2.Source, combinedSourcePath, pair1Lines, pair2Lines);
     this.Target = HelperFunctions.CombineFiles(pair1.Target, pair2.Target, combinedTargetPath, pair1Lines, pair2Lines);
 }
Exemple #5
0
        /// <summary>
        /// Preprocesses the customization data and a combined validation set using the
        /// <c>source.spm</c> and <c>target.spm</c> files found in the customization directory.
        /// The results are stored in <c>spSource</c>, <c>spTarget</c>, <c>spValidSource</c>
        /// and <c>spValidTarget</c>.
        /// </summary>
        private void PreprocessInput()
        {
            FileInfo sourceModelFile = this.customDir.GetFiles("source.spm").Single();
            FileInfo targetModelFile = this.customDir.GetFiles("target.spm").Single();

            // Customization data, source side and target side.
            this.spSource = MarianHelper.PreprocessLanguage(
                this.customSource,
                this.customDir,
                this.sourceCode,
                sourceModelFile,
                this.includePlaceholderTags,
                this.includeTagPairs);
            this.spTarget = MarianHelper.PreprocessLanguage(
                this.customTarget,
                this.customDir,
                this.targetCode,
                targetModelFile,
                this.includePlaceholderTags,
                this.includeTagPairs);

            // Concatenate the out-of-domain validation set with the in-domain validation set.
            // NOTE(review): the out-of-domain pair is passed on without a null check; confirm
            // that GetTatoebaFileInfos cannot return null for the supported language pairs.
            ParallelFilePair outOfDomainValid = HelperFunctions.GetTatoebaFileInfos(this.sourceCode, this.targetCode);
            ParallelFilePair inDomainValid =
                new ParallelFilePair(this.inDomainValidationSource, this.inDomainValidationTarget);
            ParallelFilePair mergedValid = new ParallelFilePair(
                outOfDomainValid,
                inDomainValid,
                this.customDir.FullName,
                OpusCatMTEngineSettings.Default.OODValidSetSize);

            // Combined validation data, source side and target side.
            this.spValidSource = MarianHelper.PreprocessLanguage(
                mergedValid.Source,
                this.customDir,
                this.sourceCode,
                sourceModelFile,
                this.includePlaceholderTags,
                this.includeTagPairs);
            this.spValidTarget = MarianHelper.PreprocessLanguage(
                mergedValid.Target,
                this.customDir,
                this.targetCode,
                targetModelFile,
                this.includePlaceholderTags,
                this.includeTagPairs);
        }