/// <summary>
/// Creates a customizer that holds the base model, the model being customized,
/// the training and in-domain validation file pairs, the language pair and the
/// tag-handling options.
/// </summary>
/// <param name="model">Base model; its InstallDir becomes <c>modelDir</c>.</param>
/// <param name="customModel">Model being customized; its InstallDir becomes <c>customDir</c>.</param>
/// <param name="inputPair">File pair whose Source/Target are stored as the customization source and target.</param>
/// <param name="indomainValidPair">File pair whose Source/Target are stored as the in-domain validation files.</param>
/// <param name="customLabel">Label stored for this customization.</param>
/// <param name="includePlaceholderTags">Stored flag, forwarded to preprocessing elsewhere in the class.</param>
/// <param name="includeTagPairs">Stored flag, forwarded to preprocessing elsewhere in the class.</param>
/// <param name="postCustomizationBatch">NOTE(review): not read anywhere in this constructor body - confirm whether it should be stored.</param>
/// <param name="sourceLanguage">Source language of the model pair.</param>
/// <param name="targetLanguage">Target language of the model pair.</param>
/// <param name="guidedAlignment">Stored flag; defaults to false.</param>
public MarianCustomizer(
    MTModel model,
    MTModel customModel,
    ParallelFilePair inputPair,
    ParallelFilePair indomainValidPair,
    string customLabel,
    bool includePlaceholderTags,
    bool includeTagPairs,
    List<string> postCustomizationBatch,
    IsoLanguage sourceLanguage,
    IsoLanguage targetLanguage,
    bool guidedAlignment = false)
{
    // Models and their installation directories.
    this.model = model;
    this.customModel = customModel;
    modelDir = new DirectoryInfo(model.InstallDir);
    customDir = new DirectoryInfo(this.customModel.InstallDir);

    // Training data for the customization.
    customSource = inputPair.Source;
    customTarget = inputPair.Target;

    // In-domain validation data.
    inDomainValidationSource = indomainValidPair.Source;
    inDomainValidationTarget = indomainValidPair.Target;

    // Language pair.
    this.sourceLanguage = sourceLanguage;
    this.targetLanguage = targetLanguage;

    // Labels and option flags.
    this.customLabel = customLabel;
    this.includePlaceholderTags = includePlaceholderTags;
    this.includeTagPairs = includeTagPairs;
    this.guidedAlignment = guidedAlignment;
}
/// <summary>
/// Preprocesses the customization training files and the validation files with
/// the segmentation models found in the custom model directory, storing the
/// results in spSource, spTarget, spValidSource and spValidTarget.
/// </summary>
/// <remarks>
/// Enumerable.Single throws InvalidOperationException unless exactly one source
/// and exactly one target segmentation model file is found in the directory.
/// </remarks>
private void PreprocessInput()
{
    // The dot is escaped so that it only matches a literal '.', i.e. names
    // containing "source.spm"/"source.bpe" (resp. "target.spm"/"target.bpe").
    // Unescaped, it matched any character and could produce a spurious second
    // hit that makes Single() throw.
    FileInfo sourceSegModel = this.customDir.GetFiles()
        .Single(x => Regex.IsMatch(x.Name, @"source\.(spm|bpe)"));
    FileInfo targetSegModel = this.customDir.GetFiles()
        .Single(x => Regex.IsMatch(x.Name, @"target\.(spm|bpe)"));

    // The extension of the source segmentation model identifies the method in use.
    this.segmentationMethod = sourceSegModel.Extension;

    // A target language prefix is passed to source-side preprocessing only when
    // the model has more than one target language.
    var targetPrefix = this.model.TargetLanguages.Count > 1 ? this.targetLanguage.OriginalCode : null;

    this.spSource = MarianHelper.PreprocessLanguage(
        this.customSource,
        this.customDir,
        this.sourceLanguage.OriginalCode,
        sourceSegModel,
        this.includePlaceholderTags,
        this.includeTagPairs,
        targetPrefix);

    this.spTarget = MarianHelper.PreprocessLanguage(
        this.customTarget,
        this.customDir,
        this.targetLanguage.OriginalCode,
        targetSegModel,
        this.includePlaceholderTags,
        this.includeTagPairs);

    //concatenate the out-of-domain validation set with the in-domain validation set
    ParallelFilePair tatoebaValidFileInfos = HelperFunctions.GetTatoebaFileInfos(
        this.sourceLanguage.ShortestIsoCode,
        this.targetLanguage.ShortestIsoCode);

    // Fall back to a generated dummy out-of-domain set when the lookup returns null.
    if (tatoebaValidFileInfos == null)
    {
        tatoebaValidFileInfos = HelperFunctions.GenerateDummyOODValidSet(this.customDir);
    }

    ParallelFilePair combinedValid = new ParallelFilePair(
        tatoebaValidFileInfos,
        new ParallelFilePair(this.inDomainValidationSource, this.inDomainValidationTarget),
        this.customDir.FullName,
        OpusCatMTEngineSettings.Default.OODValidSetSize);

    this.spValidSource = MarianHelper.PreprocessLanguage(
        combinedValid.Source,
        this.customDir,
        this.sourceLanguage.OriginalCode,
        sourceSegModel,
        this.includePlaceholderTags,
        this.includeTagPairs,
        targetPrefix);

    this.spValidTarget = MarianHelper.PreprocessLanguage(
        combinedValid.Target,
        this.customDir,
        this.targetLanguage.OriginalCode,
        targetSegModel,
        this.includePlaceholderTags,
        this.includeTagPairs);
}
//Split a file pair into two (used for separating a validation set from training set).
//The split is deterministic, not random: every nth line pair goes to pair2 until
//pair2 holds pair2Size line pairs; all other line pairs go to pair1.
//The output files are written next to the input files as split1.<name> and split2.<name>.
//If the input has at most pair2Size lines, every line pair ends up in pair2 and pair1 is empty.
//Throws ArgumentOutOfRangeException if pair2Size is not positive.
internal static (ParallelFilePair pair1, ParallelFilePair pair2) SplitFilePair(ParallelFilePair filePair, int pair2Size)
{
    if (pair2Size <= 0)
    {
        //Zero would divide by zero below; a negative size has no sensible meaning
        throw new System.ArgumentOutOfRangeException(
            nameof(pair2Size),
            pair2Size,
            "The size of the second split must be positive.");
    }

    //First need to get the linecount of the file pair (only the source side is counted)
    var lines = 0;
    using (var reader = filePair.Source.OpenText())
    {
        while (reader.ReadLine() != null)
        {
            lines++;
        }
    }

    ParallelFilePair pair1 = new ParallelFilePair(
        Path.Combine(filePair.Source.DirectoryName, $"split1.{filePair.Source.Name}"),
        Path.Combine(filePair.Target.DirectoryName, $"split1.{filePair.Target.Name}"));
    ParallelFilePair pair2 = new ParallelFilePair(
        Path.Combine(filePair.Source.DirectoryName, $"split2.{filePair.Source.Name}"),
        Path.Combine(filePair.Target.DirectoryName, $"split2.{filePair.Target.Name}"));

    //Integer division yields 0 when the file has fewer lines than pair2Size,
    //which would make the modulo below throw, so the stride is clamped to 1
    var nthLine = lines / pair2Size;
    if (nthLine < 1)
    {
        nthLine = 1;
    }

    var processedLines = 0;
    var pair2Lines = 0;
    string sourceLine, targetLine;
    using (var sourcereader = filePair.Source.OpenText())
    using (var sourcewriter1 = pair1.Source.CreateText())
    using (var sourcewriter2 = pair2.Source.CreateText())
    using (var targetreader = filePair.Target.OpenText())
    using (var targetwriter1 = pair1.Target.CreateText())
    using (var targetwriter2 = pair2.Target.CreateText())
    {
        //Stops at the end of the shorter file, so the outputs stay parallel
        while (((sourceLine = sourcereader.ReadLine()) != null) &&
               ((targetLine = targetreader.ReadLine()) != null))
        {
            //The cap keeps pair2 at pair2Size lines. Without it rounding in the
            //stride could send far more lines to pair2 (e.g. 1999 lines with
            //pair2Size 1000 gave a stride of 1 and left pair1 empty)
            if (pair2Lines < pair2Size && processedLines % nthLine == 0)
            {
                sourcewriter2.WriteLine(sourceLine);
                targetwriter2.WriteLine(targetLine);
                pair2Lines++;
            }
            else
            {
                sourcewriter1.WriteLine(sourceLine);
                targetwriter1.WriteLine(targetLine);
            }

            processedLines++;
        }
    }

    return (pair1, pair2);
}
//Build a file pair by combining two existing pairs: the source sides are combined
//into "combined.source" and the target sides into "combined.target", both placed
//under combinedPath. pair1Lines and pair2Lines are handed to
//HelperFunctions.CombineFiles unchanged (presumably the number of lines taken
//from each input - confirm against CombineFiles).
public ParallelFilePair(ParallelFilePair pair1, ParallelFilePair pair2, string combinedPath, int pair1Lines = 1000, int pair2Lines = 1000)
{
    //Source side
    var firstSource = pair1.Source;
    var secondSource = pair2.Source;
    var combinedSourcePath = Path.Combine(combinedPath, "combined.source");
    Source = HelperFunctions.CombineFiles(firstSource, secondSource, combinedSourcePath, pair1Lines, pair2Lines);

    //Target side
    var firstTarget = pair1.Target;
    var secondTarget = pair2.Target;
    var combinedTargetPath = Path.Combine(combinedPath, "combined.target");
    Target = HelperFunctions.CombineFiles(firstTarget, secondTarget, combinedTargetPath, pair1Lines, pair2Lines);
}
/// <summary>
/// Preprocesses the customization training files and the validation files with
/// the "source.spm" and "target.spm" models found in the custom model directory,
/// storing the results in spSource, spTarget, spValidSource and spValidTarget.
/// </summary>
/// <remarks>
/// Enumerable.Single throws InvalidOperationException unless the directory
/// contains exactly one "source.spm" and exactly one "target.spm".
/// </remarks>
private void PreprocessInput()
{
    var sourceSpm = this.customDir.GetFiles("source.spm").Single();
    var targetSpm = this.customDir.GetFiles("target.spm").Single();

    this.spSource = MarianHelper.PreprocessLanguage(
        this.customSource,
        this.customDir,
        this.sourceCode,
        sourceSpm,
        this.includePlaceholderTags,
        this.includeTagPairs);
    this.spTarget = MarianHelper.PreprocessLanguage(
        this.customTarget,
        this.customDir,
        this.targetCode,
        targetSpm,
        this.includePlaceholderTags,
        this.includeTagPairs);

    //concatenate the out-of-domain validation set with the in-domain validation set
    ParallelFilePair tatoebaValidFileInfos = HelperFunctions.GetTatoebaFileInfos(this.sourceCode, this.targetCode);

    // The combining constructor reads Source/Target of the pair it is given, so a
    // null result would crash with a NullReferenceException; use a generated
    // dummy out-of-domain set in that case.
    // NOTE(review): assumes GetTatoebaFileInfos signals "nothing available" with null - confirm.
    if (tatoebaValidFileInfos == null)
    {
        tatoebaValidFileInfos = HelperFunctions.GenerateDummyOODValidSet(this.customDir);
    }

    ParallelFilePair combinedValid = new ParallelFilePair(
        tatoebaValidFileInfos,
        new ParallelFilePair(this.inDomainValidationSource, this.inDomainValidationTarget),
        this.customDir.FullName,
        OpusCatMTEngineSettings.Default.OODValidSetSize);

    this.spValidSource = MarianHelper.PreprocessLanguage(
        combinedValid.Source,
        this.customDir,
        this.sourceCode,
        sourceSpm,
        this.includePlaceholderTags,
        this.includeTagPairs);
    this.spValidTarget = MarianHelper.PreprocessLanguage(
        combinedValid.Target,
        this.customDir,
        this.targetCode,
        targetSpm,
        this.includePlaceholderTags,
        this.includeTagPairs);
}