Пример #1
0
 /// <summary>
 /// Creates a customizer that remembers the base model, the custom model, their
 /// installation directories, the training and in-domain validation files, the
 /// language pair and the tag-handling / guided-alignment options.
 /// </summary>
 /// <param name="model">Base model; its InstallDir becomes the model directory.</param>
 /// <param name="customModel">Custom model; its InstallDir becomes the custom directory.</param>
 /// <param name="inputPair">Parallel files whose Source/Target are stored as the customization data.</param>
 /// <param name="indomainValidPair">Parallel files whose Source/Target are stored as the in-domain validation data.</param>
 /// <param name="customLabel">Label stored for this customization.</param>
 /// <param name="includePlaceholderTags">Stored as-is; consumed outside this constructor.</param>
 /// <param name="includeTagPairs">Stored as-is; consumed outside this constructor.</param>
 /// <param name="postCustomizationBatch">
 /// NOTE(review): not stored or used by this constructor - confirm whether that is intended.
 /// </param>
 /// <param name="sourceLanguage">Source language of the model.</param>
 /// <param name="targetLanguage">Target language of the model.</param>
 /// <param name="guidedAlignment">Guided-alignment flag, off by default.</param>
 public MarianCustomizer(
     MTModel model,
     MTModel customModel,
     ParallelFilePair inputPair,
     ParallelFilePair indomainValidPair,
     string customLabel,
     bool includePlaceholderTags,
     bool includeTagPairs,
     List<string> postCustomizationBatch,
     IsoLanguage sourceLanguage,
     IsoLanguage targetLanguage,
     bool guidedAlignment = false)
 {
     // Models and the directories they are installed in.
     this.model = model;
     this.customModel = customModel;
     this.modelDir = new DirectoryInfo(model.InstallDir);
     this.customDir = new DirectoryInfo(this.customModel.InstallDir);

     // Language pair.
     this.sourceLanguage = sourceLanguage;
     this.targetLanguage = targetLanguage;

     // Customization data and in-domain validation data.
     this.customSource = inputPair.Source;
     this.customTarget = inputPair.Target;
     this.inDomainValidationSource = indomainValidPair.Source;
     this.inDomainValidationTarget = indomainValidPair.Target;

     // Options.
     this.customLabel = customLabel;
     this.includePlaceholderTags = includePlaceholderTags;
     this.includeTagPairs = includeTagPairs;
     this.guidedAlignment = guidedAlignment;
 }
Пример #2
0
        /// <summary>
        /// Starts fine-tuning through the model manager, unless fine-tuning or a
        /// batch translation is already running.
        /// </summary>
        /// <param name="tokenCode">Token code checked with TokenCodeGenerator before anything else happens.</param>
        /// <param name="input">Segment pairs forwarded to StartCustomization as the input data.</param>
        /// <param name="validation">Segment pairs forwarded to StartCustomization as the validation data.</param>
        /// <param name="uniqueNewSegments">Segments forwarded unchanged to StartCustomization.</param>
        /// <param name="srcLangCode">Source language code, wrapped in an IsoLanguage.</param>
        /// <param name="trgLangCode">Target language code, wrapped in an IsoLanguage.</param>
        /// <param name="modelTag">Model tag forwarded to StartCustomization.</param>
        /// <param name="includePlaceholderTags">Forwarded unchanged to StartCustomization.</param>
        /// <param name="includeTagPairs">Forwarded unchanged to StartCustomization.</param>
        /// <returns>
        /// null when the token code is not valid, otherwise "fine-tuning started".
        /// </returns>
        /// <exception cref="FaultException">
        /// Thrown when fine-tuning or batch translation is already ongoing.
        /// </exception>
        public string Customize(
            string tokenCode,
            List<Tuple<string, string>> input,
            List<Tuple<string, string>> validation,
            List<string> uniqueNewSegments,
            string srcLangCode,
            string trgLangCode,
            string modelTag,
            bool includePlaceholderTags,
            bool includeTagPairs)
        {
            var tokenAccepted = TokenCodeGenerator.Instance.TokenCodeIsValid(tokenCode);
            if (!tokenAccepted)
            {
                return null;
            }

            var fromLanguage = new IsoLanguage(srcLangCode);
            var toLanguage = new IsoLanguage(trgLangCode);

            var engineBusy = this.ModelManager.FinetuningOngoing || this.ModelManager.BatchTranslationOngoing;
            if (engineBusy)
            {
                //TODO: need to queue up customization, i.e. save data for starting later
                throw new FaultException($"Batch translation or customization already in process in the MT engine");
            }

            this.ModelManager.StartCustomization(
                input,
                validation,
                uniqueNewSegments,
                fromLanguage,
                toLanguage,
                modelTag,
                includePlaceholderTags,
                includeTagPairs);

            return "fine-tuning started";
        }
Пример #3
0
        /// <summary>
        /// Creates a batch translator for the model stored in <paramref name="modelDir"/>.
        /// If the directory has no batch.yml, one is generated from decoder.yml with
        /// batch-specific settings (mini-batch, log file and alignment).
        /// </summary>
        /// <param name="modelDir">Path of the model directory.</param>
        /// <param name="sourceLang">Source language; its ShortestIsoCode becomes SourceCode.</param>
        /// <param name="targetLang">Target language; its ShortestIsoCode becomes TargetCode.</param>
        /// <param name="includePlaceholderTags">Stored as-is; consumed outside this constructor.</param>
        /// <param name="includeTagPairs">Stored as-is; consumed outside this constructor.</param>
        /// <exception cref="InvalidOperationException">
        /// Thrown by Single() when batch.yml is missing and the directory does not
        /// contain exactly one decoder.yml.
        /// </exception>
        public MarianBatchTranslator(
            string modelDir,
            IsoLanguage sourceLang,
            IsoLanguage targetLang,
            bool includePlaceholderTags,
            bool includeTagPairs)
        {
            this.SourceCode = sourceLang.ShortestIsoCode;
            this.TargetCode = targetLang.ShortestIsoCode;

            this.includePlaceholderTags = includePlaceholderTags;
            this.includeTagPairs        = includeTagPairs;
            this.modelDir   = new DirectoryInfo(modelDir);
            this.SystemName = $"{this.SourceCode}-{this.TargetCode}_" + this.modelDir.Name;

            //Check if batch.yml exists, if not create it from decoder.yml
            var batchYaml = this.modelDir.GetFiles("batch.yml");

            if (batchYaml.Length == 0)
            {
                var decoderYaml  = this.modelDir.GetFiles("decoder.yml").Single();
                var deserializer = new Deserializer();

                MarianDecoderConfig decoderSettings;

                // Dispose the reader as soon as the config is parsed so that
                // decoder.yml is not left open after construction.
                using (var reader = decoderYaml.OpenText())
                {
                    decoderSettings = deserializer.Deserialize<MarianDecoderConfig>(reader);
                }

                // Batch-specific overrides on top of the decoder configuration.
                decoderSettings.miniBatch = "16";
                decoderSettings.log       = Path.Combine(this.modelDir.FullName, "batch.log");
                decoderSettings.alignment = "hard";

                var serializer = new Serializer();
                var configPath = Path.Combine(this.modelDir.FullName, "batch.yml");
                using (var writer = File.CreateText(configPath))
                {
                    serializer.Serialize(writer, decoderSettings, typeof(MarianDecoderConfig));
                }
            }
        }
Пример #4
0
        /// <summary>
        /// Translates <paramref name="input"/> with the MT provider and returns the
        /// translated text as a UTF-8 encoded in-memory stream.
        /// </summary>
        /// <param name="tokenCode">Token code; it is not checked by this method.</param>
        /// <param name="input">Text to translate.</param>
        /// <param name="srcLangCode">Source language code, wrapped in an IsoLanguage.</param>
        /// <param name="trgLangCode">Target language code, wrapped in an IsoLanguage.</param>
        /// <param name="modelTag">Model tag forwarded to the MT provider.</param>
        /// <returns>A MemoryStream holding the UTF-8 bytes of the translation.</returns>
        public Stream TranslateStream(string tokenCode = "", string input = "", string srcLangCode = "", string trgLangCode = "", string modelTag = "")
        {
            var sourceLang = new IsoLanguage(srcLangCode);
            var targetLang = new IsoLanguage(trgLangCode);

            // Reading .Result waits for the provider's result before continuing.
            var translation = this.mtProvider.Translate(input, sourceLang, targetLang, modelTag).Result;

            // The raw stream is returned directly, so no response headers
            // (content type, Access-Control-Allow-Origin, ...) are set here.
            return new MemoryStream(Encoding.UTF8.GetBytes(translation.Translation));
        }
Пример #5
0
        /// <summary>
        /// Translates <paramref name="input"/> through the model manager and wraps
        /// the outcome in a Translation object.
        /// </summary>
        /// <param name="tokenCode">Token code; it is not checked by this method.</param>
        /// <param name="input">Text to translate.</param>
        /// <param name="srcLangCode">Source language code, wrapped in an IsoLanguage.</param>
        /// <param name="trgLangCode">Target language code, wrapped in an IsoLanguage.</param>
        /// <param name="modelTag">Model tag forwarded to the model manager.</param>
        /// <returns>A Translation built from the Result of the model manager call.</returns>
        public Translation TranslateJson(string tokenCode, string input, string srcLangCode, string trgLangCode, string modelTag)
        {
            var fromLanguage = new IsoLanguage(srcLangCode);
            var toLanguage = new IsoLanguage(trgLangCode);

            // Reading .Result waits for the model manager's outcome (presumably a task).
            var translated = this.ModelManager.Translate(input, fromLanguage, toLanguage, modelTag).Result;

            return new Translation(translated);
        }
Пример #6
0
        /// <summary>
        /// Adds an "Access-Control-Allow-Origin: *" header to the outgoing web
        /// response, translates <paramref name="input"/> through the model manager
        /// and returns the translated text wrapped in a Translation object.
        /// </summary>
        /// <param name="tokenCode">Token code; it is not checked by this method.</param>
        /// <param name="input">Text to translate.</param>
        /// <param name="srcLangCode">Source language code, wrapped in an IsoLanguage.</param>
        /// <param name="trgLangCode">Target language code, wrapped in an IsoLanguage.</param>
        /// <param name="modelTag">Model tag forwarded to the model manager.</param>
        /// <returns>A Translation built from the Translation member of the result.</returns>
        public Translation TranslateJson(string tokenCode, string input, string srcLangCode, string trgLangCode, string modelTag)
        {
            // The header is added before any translation work is started.
            WebOperationContext.Current.OutgoingResponse.Headers.Add("Access-Control-Allow-Origin: *");

            var fromLanguage = new IsoLanguage(srcLangCode);
            var toLanguage = new IsoLanguage(trgLangCode);

            var pending = this.ModelManager.Translate(input, fromLanguage, toLanguage, modelTag);

            // Reading .Result waits for the model manager's outcome.
            var translatedText = pending.Result.Translation;

            return new Translation(translatedText);
        }
Пример #7
0
        /// <summary>
        /// Translates a single string through the model manager after checking
        /// the caller's token code.
        /// </summary>
        /// <param name="tokenCode">Token code checked with TokenCodeGenerator.</param>
        /// <param name="input">The string to translate.</param>
        /// <param name="srcLangCode">The code of the source language.</param>
        /// <param name="trgLangCode">The code of the target language.</param>
        /// <param name="modelTag">Model tag forwarded to the model manager.</param>
        /// <returns>
        /// The translation of <paramref name="input"/>, or null when the token code is not valid.
        /// </returns>
        public string Translate(string tokenCode, string input, string srcLangCode, string trgLangCode, string modelTag)
        {
            var tokenAccepted = TokenCodeGenerator.Instance.TokenCodeIsValid(tokenCode);
            if (!tokenAccepted)
            {
                return null;
            }

            var fromLanguage = new IsoLanguage(srcLangCode);
            var toLanguage = new IsoLanguage(trgLangCode);

            // Reading .Result waits for the model manager's outcome.
            var pending = this.ModelManager.Translate(input, fromLanguage, toLanguage, modelTag);
            return pending.Result;
        }
Пример #8
0
        /// <summary>
        /// Asks the model manager for the status of the model identified by the
        /// language pair and model tag, after checking the caller's token code.
        /// </summary>
        /// <param name="tokenCode">Token code checked with TokenCodeGenerator.</param>
        /// <param name="sourceCode">Source language code, wrapped in an IsoLanguage.</param>
        /// <param name="targetCode">Target language code, wrapped in an IsoLanguage.</param>
        /// <param name="modelTag">Model tag forwarded to the model manager.</param>
        /// <returns>
        /// The status string from the model manager, or null when the token code is not valid.
        /// </returns>
        public string CheckModelStatus(string tokenCode, string sourceCode, string targetCode, string modelTag)
        {
            var tokenAccepted = TokenCodeGenerator.Instance.TokenCodeIsValid(tokenCode);
            if (!tokenAccepted)
            {
                return null;
            }

            var fromLanguage = new IsoLanguage(sourceCode);
            var toLanguage = new IsoLanguage(targetCode);

            var status = this.ModelManager.CheckModelStatus(fromLanguage, toLanguage, modelTag);
            return status;
        }
Пример #9
0
        /// <summary>
        /// Translates <paramref name="input"/> with the MT provider and returns an
        /// HTTP 200 response carrying the Translation, with an
        /// "Access-Control-Allow-Origin: *" header added to the response.
        /// </summary>
        /// <param name="tokenCode">Token code; it is not checked by this method.</param>
        /// <param name="input">Text to translate.</param>
        /// <param name="srcLangCode">Source language code, wrapped in an IsoLanguage.</param>
        /// <param name="trgLangCode">Target language code, wrapped in an IsoLanguage.</param>
        /// <param name="modelTag">Model tag forwarded to the MT provider.</param>
        /// <returns>The response message containing the translation.</returns>
        public HttpResponseMessage TranslateJson(string tokenCode = "", string input = "", string srcLangCode = "", string trgLangCode = "", string modelTag = "")
        {
            var fromLanguage = new IsoLanguage(srcLangCode);
            var toLanguage = new IsoLanguage(trgLangCode);

            var pending = this.mtProvider.Translate(input, fromLanguage, toLanguage, modelTag);

            // Reading .Result waits for the provider's outcome.
            var payload = new Translation(pending.Result.Translation);

            var reply = Request.CreateResponse<Translation>(HttpStatusCode.OK, payload);
            reply.Headers.Add("Access-Control-Allow-Origin", "*");

            return reply;
        }
Пример #10
0
        /// <summary>
        /// Extracts the segments of a TMX file into two line-aligned text files,
        /// one per language. The files are named after the TMX file with the
        /// language's ShortestIsoCode and ".txt" appended.
        /// </summary>
        /// <param name="tmxFile">Path of the TMX file to parse.</param>
        /// <param name="sourceLang">Language whose segments go to the source file.</param>
        /// <param name="targetLang">Language whose segments go to the target file.</param>
        /// <param name="includePlaceholderTags">Forwarded to TmxToTxtParser.FilterTextAndTags.</param>
        /// <param name="includeTagPairs">Forwarded to TmxToTxtParser.FilterTextAndTags.</param>
        /// <returns>
        /// The pair of written files, or null when the TMX file is not well-formed XML.
        /// </returns>
        public static ParallelFilePair ParseTmxToParallelFiles(
            string tmxFile,
            IsoLanguage sourceLang,
            IsoLanguage targetLang,
            bool includePlaceholderTags,
            bool includeTagPairs)
        {
            var sourceFile = new FileInfo($"{tmxFile}.{sourceLang.ShortestIsoCode}.txt");
            var targetFile = new FileInfo($"{tmxFile}.{targetLang.ShortestIsoCode}.txt");

            XDocument tmx;

            try
            {
                tmx = XDocument.Load(tmxFile);
            }
            catch (System.Xml.XmlException ex)
            {
                // Keep the parser's message so the broken spot in the file can be found.
                Log.Error($"{tmxFile} is not a valid tmx file: {ex.Message}");
                return(null);
            }

            var tus = tmx.Descendants("tu");

            using (var sourceWriter = sourceFile.CreateText())
            using (var targetWriter = targetFile.CreateText())
            {
                foreach (var tu in tus)
                {
                    var sourceSeg = tu.Descendants("seg").FirstOrDefault(x => SegmentHasLanguage(x, sourceLang));
                    var targetSeg = tu.Descendants("seg").FirstOrDefault(x => SegmentHasLanguage(x, targetLang));

                    // Only units that have both languages are written, which keeps
                    // the two output files aligned line by line.
                    if (sourceSeg != null && targetSeg != null)
                    {
                        var sourceText = TmxToTxtParser.FilterTextAndTags(sourceSeg, includePlaceholderTags, includeTagPairs);
                        sourceWriter.WriteLine(sourceText);
                        var targetText = TmxToTxtParser.FilterTextAndTags(targetSeg, includePlaceholderTags, includeTagPairs);
                        targetWriter.WriteLine(targetText);
                    }
                }
            }

            return(new ParallelFilePair(sourceFile, targetFile));
        }

        // Tells whether the parent element of a seg declares an xml:lang that is
        // compatible with the given language; a missing xml:lang counts as no match.
        private static bool SegmentHasLanguage(XElement seg, IsoLanguage language)
        {
            var langAttribute = seg.Parent?.Attribute(XNamespace.Xml + "lang");
            if (langAttribute == null)
            {
                return false;
            }

            // Language codes are identifiers, so lower-case them culture-independently.
            return language.IsCompatibleTmxLang(langAttribute.Value.ToLowerInvariant());
        }
Пример #11
0
        /// <summary>
        /// Translates <paramref name="input"/> through the model manager and returns
        /// the translation as a UTF-8 encoded in-memory stream. Before translating,
        /// the outgoing web response gets a plain-text UTF-8 content type and the
        /// Connection, Access-Control-Allow-Origin and (empty) Server headers.
        /// </summary>
        /// <param name="tokenCode">Token code; it is not checked by this method.</param>
        /// <param name="input">Text to translate.</param>
        /// <param name="srcLangCode">Source language code, wrapped in an IsoLanguage.</param>
        /// <param name="trgLangCode">Target language code, wrapped in an IsoLanguage.</param>
        /// <param name="modelTag">Model tag forwarded to the model manager.</param>
        /// <returns>A MemoryStream holding the UTF-8 bytes of the translation.</returns>
        public Stream TranslateStream(string tokenCode, string input, string srcLangCode, string trgLangCode, string modelTag)
        {
            var fromLanguage = new IsoLanguage(srcLangCode);
            var toLanguage = new IsoLanguage(trgLangCode);

            var outgoing = WebOperationContext.Current.OutgoingResponse;
            outgoing.ContentType = "text/plain; charset=utf-8";
            outgoing.Headers.Add("Connection: close");
            outgoing.Headers.Add("Access-Control-Allow-Origin: *");

            // Wordfast Anywhere (and probably other Wordfast versions) does not accept a
            // response carrying the default Server header, so it is replaced by an empty one.
            outgoing.Headers.Add(HttpResponseHeader.Server.ToString(), string.Empty);

            // Reading .Result waits for the model manager's outcome.
            var translatedText = this.ModelManager.Translate(input, fromLanguage, toLanguage, modelTag).Result;
            var utf8Bytes = Encoding.UTF8.GetBytes(translatedText);

            return new MemoryStream(utf8Bytes);
        }
Пример #12
0
        /// <summary>
        /// Translates several strings one after another through the model manager.
        /// NOTE: this is not currently used; for pretranslation PretranslateBatch is better.
        /// </summary>
        /// <param name="tokenCode">Token code checked with TokenCodeGenerator.</param>
        /// <param name="input">The strings to translate.</param>
        /// <param name="srcLangCode">The code of the source language.</param>
        /// <param name="trgLangCode">The code of the target language.</param>
        /// <param name="modelTag">Model tag forwarded to the model manager.</param>
        /// <returns>
        /// The translations in the same order as <paramref name="input"/>, or null
        /// when the token code is not valid.
        /// </returns>
        public List<string> BatchTranslate(string tokenCode, List<string> input, string srcLangCode, string trgLangCode, string modelTag)
        {
            var tokenAccepted = TokenCodeGenerator.Instance.TokenCodeIsValid(tokenCode);
            if (!tokenAccepted)
            {
                return null;
            }

            var fromLanguage = new IsoLanguage(srcLangCode);
            var toLanguage = new IsoLanguage(trgLangCode);

            var results = new List<string>();

            foreach (var segment in input)
            {
                // Each segment is waited for before the next one is requested.
                var translated = this.ModelManager.Translate(segment, fromLanguage, toLanguage, modelTag).Result;
                results.Add(translated);
            }

            return results;
        }
Пример #13
0
        /// <summary>
        /// Sends a batch of strings to the MT engine ahead of time, so that the
        /// translations are already available when they are requested later.
        /// The translation calls are started but their results are not read here.
        /// </summary>
        /// <param name="tokenCode">Token code checked with TokenCodeGenerator.</param>
        /// <param name="input">The strings to pre-order translations for.</param>
        /// <param name="srcLangCode">The code of the source language.</param>
        /// <param name="trgLangCode">The code of the target language.</param>
        /// <param name="modelTag">Model tag forwarded to the model manager.</param>
        /// <returns>
        /// An empty string when the token code is not valid, "input was empty" for an
        /// empty list, otherwise "preorder received".
        /// </returns>
        public string PreOrderBatch(string tokenCode, List<string> input, string srcLangCode, string trgLangCode, string modelTag)
        {
            var tokenAccepted = TokenCodeGenerator.Instance.TokenCodeIsValid(tokenCode);
            if (!tokenAccepted)
            {
                return "";
            }

            var fromLanguage = new IsoLanguage(srcLangCode);
            var toLanguage = new IsoLanguage(trgLangCode);

            if (input.Count == 0)
            {
                return "input was empty";
            }

            // Pre-ordering used to go through batch translation (PreTranslateBatch, guarded
            // by the batch-translation/customization-ongoing flags). That did not seem much
            // quicker than normal translation and only delivered all translations at once at
            // the end. With normal translation each sentence is ready as soon as it has been
            // translated (batch translation could do the same with an output line handler,
            // but that is not implemented yet). Batch translation should be much quicker once
            // the correct parameters have been tested; until then normal translation is used,
            // which is also more robust - one less thing to break.
            foreach (var segment in input)
            {
                // The returned value is deliberately ignored.
                this.ModelManager.Translate(segment, fromLanguage, toLanguage, modelTag);
            }

            return "preorder received";
        }
Пример #14
0
 // Static constructor: runs the ISO 639-3 and ISO 639-5 table parsers once for the type.
 // The table files are embedded as resources.
 static IsoLanguage()
 {
     ParseIso639_3();
     ParseIso639_5();
 }
Пример #15
0
 /// <summary>
 /// Tells whether <paramref name="lang"/> has the same ShortestIsoCode as this language.
 /// </summary>
 /// <param name="lang">The language to compare with.</param>
 /// <returns>true when both ShortestIsoCode values are equal.</returns>
 //TODO: add language group compatibility matching here
 internal bool IsCompatibleLanguage(IsoLanguage lang) =>
     this.ShortestIsoCode == lang.ShortestIsoCode;