Пример #1
0
 /// <summary>
 /// Initializes a new instance of the <see cref="ThotWordAlignmentModel"/> class on top of a newly created
 /// native word alignment model.
 /// </summary>
 public ThotWordAlignmentModel()
 {
     // The native handle has to exist first; both vocabularies are constructed from it.
     Handle = Thot.swAlignModel_create();

     // One vocabulary view per side: "true" selects the source side, "false" the target side.
     _sourceWords = new ThotWordVocabulary(Handle, true);
     _targetWords = new ThotWordVocabulary(Handle, false);

     // This instance created the native model, so it is flagged to close it later.
     _closeOnDispose = true;
 }
Пример #2
0
        /// <summary>
        /// Builds the translation model files that share the file prefix <paramref name="tmPrefix"/>.
        /// </summary>
        /// <remarks>
        /// The steps are: generate two single word alignment models (one per corpus direction), merge their
        /// best alignments, generate a phrase table from the merged alignments, filter that table, and finally
        /// write four small fixed-content parameter files next to it.
        /// </remarks>
        /// <param name="tmPrefix">Path prefix that every generated file name starts with.</param>
        /// <param name="reporter">Receives a step notification before each post-alignment stage.</param>
        private void TrainTranslationModel(string tmPrefix, ThotTrainProgressReporter reporter)
        {
            // All file names derive from the prefix; compute them once up front.
            string invSwmFilePrefix = tmPrefix + "_invswm";
            string swmFilePrefix = tmPrefix + "_swm";
            string mergedAlignmentFile = tmPrefix + ".A3.final";
            string phraseTableFile = tmPrefix + ".ttable";

            GenerateSingleWordAlignmentModel(invSwmFilePrefix, _sourcePreprocessor, _targetPreprocessor,
                                             _parallelCorpus, "direct", reporter);
            GenerateSingleWordAlignmentModel(swmFilePrefix, _targetPreprocessor, _sourcePreprocessor,
                                             _parallelCorpus.Invert(), "inverse", reporter);

            reporter.Step("Merging alignments");
            Thot.giza_symmetr1(swmFilePrefix + ".bestal", invSwmFilePrefix + ".bestal", mergedAlignmentFile, true);

            reporter.Step("Generating phrase table");
            Thot.phraseModel_generate(mergedAlignmentFile, 10, phraseTableFile);

            reporter.Step("Filtering phrase table");
            FilterPhraseTableNBest(phraseTableFile, 20);

            // Fixed-content parameter files written alongside the phrase table.
            File.WriteAllText(tmPrefix + ".lambda", "0.7 0.7");
            File.WriteAllText(tmPrefix + ".srcsegmlentable", "Uniform");
            File.WriteAllText(tmPrefix + ".trgcutstable", "0.999");
            File.WriteAllText(tmPrefix + ".trgsegmlentable", "Geometric");
        }
Пример #3
0
        /// <summary>
        /// Builds the translation model files that share the file prefix <paramref name="tmPrefix"/>.
        /// </summary>
        /// <remarks>
        /// Two word alignment models are generated (one per corpus direction). Their best alignments are then
        /// merged, turned into a phrase table, and the table is filtered; each of those three stages runs
        /// inside its own progress phase obtained from <paramref name="reporter"/>. Four small fixed-content
        /// parameter files are written at the end.
        /// </remarks>
        /// <param name="tmPrefix">Path prefix that every generated file name starts with.</param>
        /// <param name="reporter">Progress reporter that hands out one phase per stage.</param>
        private void TrainTranslationModel(string tmPrefix, ThotTrainProgressReporter reporter)
        {
            // All file names derive from the prefix; compute them once up front.
            string invSwmFilePrefix = tmPrefix + "_invswm";
            string swmFilePrefix = tmPrefix + "_swm";
            string mergedAlignmentFile = tmPrefix + ".A3.final";
            string phraseTableFile = tmPrefix + ".ttable";

            GenerateWordAlignmentModel(invSwmFilePrefix, _sourcePreprocessor, _targetPreprocessor, _parallelCorpus,
                                       reporter, false);
            GenerateWordAlignmentModel(swmFilePrefix, _targetPreprocessor, _sourcePreprocessor,
                                       _parallelCorpus.Invert(), reporter, true);

            // Each phase object is disposed as soon as its stage finishes.
            using (reporter.StartNextPhase())
            {
                Thot.giza_symmetr1(swmFilePrefix + ".bestal", invSwmFilePrefix + ".bestal", mergedAlignmentFile,
                                   true);
            }

            using (reporter.StartNextPhase())
            {
                Thot.phraseModel_generate(mergedAlignmentFile, 10, phraseTableFile);
            }

            using (reporter.StartNextPhase())
            {
                FilterPhraseTableNBest(phraseTableFile, 20);
            }

            // Fixed-content parameter files written alongside the phrase table.
            File.WriteAllText(tmPrefix + ".lambda", "0.7 0.7");
            File.WriteAllText(tmPrefix + ".srcsegmlentable", "Uniform");
            File.WriteAllText(tmPrefix + ".trgcutstable", "0.999");
            File.WriteAllText(tmPrefix + ".trgsegmlentable", "Geometric");
        }
Пример #4
0
        /// <summary>
        /// Asks the native word alignment model for the best alignment between two segments.
        /// </summary>
        /// <param name="sourceSegment">Source-side tokens.</param>
        /// <param name="targetSegment">Target-side tokens.</param>
        /// <returns>The alignment matrix converted from the native result.</returns>
        public WordAlignmentMatrix GetBestAlignment(IReadOnlyList<string> sourceSegment,
                                                    IReadOnlyList<string> targetSegment)
        {
            CheckDisposed();

            IntPtr nativeSourceSegment = IntPtr.Zero;
            IntPtr nativeTargetSegment = IntPtr.Zero;
            IntPtr nativeMatrix = IntPtr.Zero;

            // Row count the matrix was allocated with. Kept separately from iLen because iLen is passed to the
            // native call by reference and may come back changed; freeing must use the allocated size.
            // NOTE(review): assumes AllocNativeMatrix allocates one row per source token - confirm.
            uint allocatedRowCount = 0;

            try
            {
                // Allocate inside the try block so that a failure in a later allocation still releases the
                // buffers that were already created.
                nativeSourceSegment = Thot.ConvertStringsToNativeUtf8(sourceSegment);
                nativeTargetSegment = Thot.ConvertStringsToNativeUtf8(targetSegment);
                nativeMatrix = Thot.AllocNativeMatrix(sourceSegment.Count, targetSegment.Count);
                allocatedRowCount = (uint)sourceSegment.Count;

                uint iLen = (uint)sourceSegment.Count;
                uint jLen = (uint)targetSegment.Count;

                Thot.swAlignModel_getBestAlignment(Handle, nativeSourceSegment, nativeTargetSegment, nativeMatrix,
                                                   ref iLen, ref jLen);
                return Thot.ConvertNativeMatrixToWordAlignmentMatrix(nativeMatrix, iLen, jLen);
            }
            finally
            {
                if (nativeMatrix != IntPtr.Zero)
                {
                    Thot.FreeNativeMatrix(nativeMatrix, allocatedRowCount);
                }

                // Marshal.FreeHGlobal does nothing when handed IntPtr.Zero.
                Marshal.FreeHGlobal(nativeTargetSegment);
                Marshal.FreeHGlobal(nativeSourceSegment);
            }
        }
Пример #5
0
 /// <summary>
 /// Closes the native word alignment model, but only when this instance is flagged to close it on dispose.
 /// </summary>
 protected override void DisposeUnmanagedResources()
 {
     if (!_closeOnDispose)
     {
         // Nothing to release: this instance is not flagged to close the handle.
         return;
     }

     Thot.swAlignModel_close(Handle);
 }
Пример #6
0
        /// <summary>
        /// Adds a source/target segment pair to the native word alignment model, optionally together with an
        /// alignment hint.
        /// </summary>
        /// <param name="sourceSegment">Source-side tokens.</param>
        /// <param name="targetSegment">Target-side tokens.</param>
        /// <param name="hintMatrix">
        /// Optional alignment hint. When <c>null</c>, a null matrix with zero dimensions is passed to the native
        /// call.
        /// </param>
        public void AddSegmentPair(IReadOnlyList<string> sourceSegment, IReadOnlyList<string> targetSegment,
                                   WordAlignmentMatrix hintMatrix = null)
        {
            CheckDisposed();

            IntPtr nativeSourceSegment = IntPtr.Zero;
            IntPtr nativeTargetSegment = IntPtr.Zero;
            IntPtr nativeMatrix = IntPtr.Zero;
            uint iLen = 0, jLen = 0;

            try
            {
                // Allocate inside the try block so that a failure in a later allocation still releases the
                // buffers that were already created.
                nativeSourceSegment = Thot.ConvertStringsToNativeUtf8(sourceSegment);
                nativeTargetSegment = Thot.ConvertStringsToNativeUtf8(targetSegment);

                if (hintMatrix != null)
                {
                    // Record the dimensions before allocating so the cleanup below always knows the row count.
                    iLen = (uint)hintMatrix.RowCount;
                    jLen = (uint)hintMatrix.ColumnCount;
                    nativeMatrix = Thot.ConvertWordAlignmentMatrixToNativeMatrix(hintMatrix);
                }

                Thot.swAlignModel_addSentencePair(Handle, nativeSourceSegment, nativeTargetSegment, nativeMatrix, iLen,
                                                  jLen);
            }
            finally
            {
                if (nativeMatrix != IntPtr.Zero)
                {
                    Thot.FreeNativeMatrix(nativeMatrix, iLen);
                }

                // Marshal.FreeHGlobal does nothing when handed IntPtr.Zero.
                Marshal.FreeHGlobal(nativeTargetSegment);
                Marshal.FreeHGlobal(nativeSourceSegment);
            }
        }
Пример #7
0
        /// <summary>
        /// Translates a segment through the native n-best decoder entry point.
        /// </summary>
        /// <param name="n">The n-best size forwarded to the decoder.</param>
        /// <param name="segment">Tokens of the segment to translate.</param>
        /// <returns>The results built by <c>CreateResult</c> for the decoder output.</returns>
        public IEnumerable<TranslationResult> Translate(int n, IReadOnlyList<string> segment)
        {
            CheckDisposed();

            // The segment is passed twice: once as the decoder input and once as the source segment handed to
            // the result factory.
            IEnumerable<TranslationResult> nbestResults = Thot.DoTranslateNBest(
                _decoderHandle, Thot.decoder_translateNBest, n, segment, false, segment, CreateResult);
            return nbestResults;
        }
Пример #8
0
        /// <summary>
        /// Retrieves the decoder's word graph for a segment and converts it to a managed
        /// <see cref="WordGraph"/>.
        /// </summary>
        /// <param name="segment">Tokens of the segment to decode.</param>
        /// <returns>The word graph created from the native string form and its initial state score.</returns>
        public WordGraph GetWordGraph(IReadOnlyList<string> segment)
        {
            CheckDisposed();

            IntPtr nativeSegment = Thot.ConvertStringsToNativeUtf8(segment);
            IntPtr nativeGraph = IntPtr.Zero;
            IntPtr graphTextBuffer = IntPtr.Zero;

            try
            {
                nativeGraph = Thot.decoder_getWordGraph(_decoderHandle, nativeSegment);

                // First call with a null buffer reports the required length; the second call fills the buffer.
                uint textLength = Thot.wg_getString(nativeGraph, IntPtr.Zero, 0);
                graphTextBuffer = Marshal.AllocHGlobal((int)textLength);
                Thot.wg_getString(nativeGraph, graphTextBuffer, textLength);

                string graphText = Thot.ConvertNativeUtf8ToString(graphTextBuffer, textLength);
                double initialScore = Thot.wg_getInitialStateScore(nativeGraph);

                return CreateWordGraph(segment, graphText, initialScore);
            }
            finally
            {
                // Release in the same order as before: text buffer, native graph, then the input segment.
                if (graphTextBuffer != IntPtr.Zero)
                {
                    Marshal.FreeHGlobal(graphTextBuffer);
                }

                if (nativeGraph != IntPtr.Zero)
                {
                    Thot.wg_destroy(nativeGraph);
                }

                Marshal.FreeHGlobal(nativeSegment);
            }
        }
Пример #9
0
        /// <summary>
        /// Translates every segment of a corpus in parallel and returns the translations in corpus order.
        /// </summary>
        /// <remarks>
        /// The corpus index range is split into partitions. Each partition loads its own SMT model and decoder,
        /// translates its segments, and closes both handles again when it is done.
        /// </remarks>
        /// <param name="parameters">Parameters used to load the SMT model and the decoder.</param>
        /// <param name="sourceCorpus">Tokenized source segments.</param>
        /// <returns>One translation per source segment, at the same index.</returns>
        private static IEnumerable<IReadOnlyList<string>> GenerateTranslations(ThotSmtParameters parameters,
                                                                                IReadOnlyList<IReadOnlyList<string>> sourceCorpus)
        {
            int segmentCount = sourceCorpus.Count;
            var translations = new IReadOnlyList<string>[segmentCount];

            Parallel.ForEach(Partitioner.Create(0, segmentCount), chunk =>
            {
                IntPtr modelHandle = IntPtr.Zero;
                IntPtr decoder = IntPtr.Zero;
                try
                {
                    modelHandle = Thot.LoadSmtModel(parameters);
                    decoder = Thot.LoadDecoder(modelHandle, parameters);

                    // Each partition writes only to its own index range, so the shared array needs no locking.
                    for (int index = chunk.Item1; index < chunk.Item2; index++)
                    {
                        IReadOnlyList<string> source = sourceCorpus[index];
                        translations[index] = Thot.DoTranslate(decoder, Thot.decoder_translate, source, false,
                                                               source, (src, target, data) => target);
                    }
                }
                finally
                {
                    // Close the decoder before the model it was loaded from.
                    if (decoder != IntPtr.Zero)
                    {
                        Thot.decoder_close(decoder);
                    }

                    if (modelHandle != IntPtr.Zero)
                    {
                        Thot.smtModel_close(modelHandle);
                    }
                }
            });

            return translations;
        }
Пример #10
0
        /// <summary>
        /// Looks up the translation probability for a pair of vocabulary indices in the native model.
        /// </summary>
        /// <param name="sourceWordIndex">Index of the source word.</param>
        /// <param name="targetWordIndex">Index of the target word.</param>
        /// <returns>The probability reported by the native model.</returns>
        public double GetTranslationProbability(int sourceWordIndex, int targetWordIndex)
        {
            CheckDisposed();

            // The native API takes unsigned indices.
            uint nativeSourceIndex = (uint)sourceWordIndex;
            uint nativeTargetIndex = (uint)targetWordIndex;

            return Thot.swAlignModel_getTranslationProbabilityByIndex(Handle, nativeSourceIndex, nativeTargetIndex);
        }
Пример #11
0
        /// <summary>
        /// Gets the alignment probability from the HMM single word alignment model.
        /// </summary>
        /// <remarks>
        /// Use -1 for unaligned indices that occur before the first aligned index. Other unaligned indices are
        /// indicated by adding the source length to the previously aligned index.
        /// </remarks>
        /// <param name="sourceLen">Length of the source segment.</param>
        /// <param name="prevSourceIndex">Zero-based index of the previously aligned source position.</param>
        /// <param name="sourceIndex">Zero-based index of the current source position.</param>
        /// <returns>The probability reported by the native model.</returns>
        public double GetAlignmentProbability(int sourceLen, int prevSourceIndex, int sourceIndex)
        {
            CheckDisposed();

            // Thot position indices are 1-based, so shift both indices by one before the call.
            uint thotPrevPosition = (uint)(prevSourceIndex + 1);
            uint thotPosition = (uint)(sourceIndex + 1);
            uint thotSourceLength = (uint)sourceLen;

            return Thot.swAlignModel_getAlignmentProbability(Handle, thotPrevPosition, thotSourceLength,
                                                             thotPosition);
        }
Пример #12
0
        /// <summary>
        /// Builds a <see cref="TranslationInfo"/> from native translation data.
        /// </summary>
        /// <param name="sourceSegment">Source tokens; not used by this factory.</param>
        /// <param name="targetSegment">Target tokens stored in the result.</param>
        /// <param name="data">Native translation data handle the score components are read from.</param>
        /// <returns>The target segment paired with its score components.</returns>
        private static TranslationInfo CreateTranslationInfo(IReadOnlyList<string> sourceSegment,
                                                             IReadOnlyList<string> targetSegment, IntPtr data)
        {
            // Buffer of eight score components, filled in by the native call.
            double[] components = new double[8];
            uint componentCount = (uint)components.Length;

            Thot.tdata_getScoreComponents(data, components, componentCount);

            return new TranslationInfo(components, targetSegment);
        }
Пример #13
0
        /// <summary>
        /// Saves the native word alignment model under the file prefix this instance was given.
        /// </summary>
        /// <exception cref="InvalidOperationException">No file prefix is set for this instance.</exception>
        public void Save()
        {
            CheckDisposed();

            bool hasFilePrefix = !string.IsNullOrEmpty(_prefFileName);
            if (!hasFilePrefix)
            {
                throw new InvalidOperationException("This word alignment model cannot be saved.");
            }

            Thot.swAlignModel_save(Handle, _prefFileName);
        }
Пример #14
0
 /// <summary>
 /// Reads the word at <paramref name="index"/> into a native buffer, from the source or the target side
 /// depending on <c>_isSource</c>.
 /// </summary>
 /// <param name="index">Index of the word.</param>
 /// <param name="nativeWordStr">Native buffer that receives the word.</param>
 /// <param name="capacity">Capacity of <paramref name="nativeWordStr"/>.</param>
 /// <returns>The value returned by the native lookup.</returns>
 private uint GetWordNative(uint index, IntPtr nativeWordStr, uint capacity)
 {
     return _isSource
         ? Thot.swAlignModel_getSourceWord(_swAlignModelHandle, index, nativeWordStr, capacity)
         : Thot.swAlignModel_getTargetWord(_swAlignModelHandle, index, nativeWordStr, capacity);
 }
Пример #15
0
        /// <summary>
        /// Reads the word at <paramref name="index"/> and converts it to a managed string.
        /// </summary>
        /// <remarks>
        /// The previous implementation reallocated <paramref name="nativeWordStr"/> when the word did not fit.
        /// Because the pointer is passed by value, the caller never saw the reallocated address and was left
        /// holding a pointer that may already have been released, while <paramref name="capacity"/> claimed the
        /// larger size. Oversized words are now read through a temporary buffer owned by this method, so the
        /// caller's buffer and its capacity stay valid and unchanged.
        /// </remarks>
        /// <param name="index">Index of the word.</param>
        /// <param name="nativeWordStr">Caller-owned native scratch buffer; never reallocated or freed here.</param>
        /// <param name="capacity">Capacity of <paramref name="nativeWordStr"/>; left unmodified.</param>
        /// <returns>The word as a managed string.</returns>
        private string GetWord(uint index, IntPtr nativeWordStr, ref uint capacity)
        {
            uint len = GetWordNative(index, nativeWordStr, capacity);
            if (len <= capacity)
            {
                return Thot.ConvertNativeUtf8ToString(nativeWordStr, len);
            }

            // The word does not fit into the caller's buffer: fetch it again into a one-off buffer that is
            // exactly as large as the native side asked for.
            uint requiredCapacity = len;
            IntPtr overflowBuffer = Marshal.AllocHGlobal((IntPtr)requiredCapacity);
            try
            {
                len = GetWordNative(index, overflowBuffer, requiredCapacity);
                return Thot.ConvertNativeUtf8ToString(overflowBuffer, len);
            }
            finally
            {
                Marshal.FreeHGlobal(overflowBuffer);
            }
        }
Пример #16
0
        /// <summary>
        /// Initializes a new instance of the <see cref="ThotSmtModel"/> class: freezes the given parameters,
        /// loads the native SMT model from them, and wraps the model's two word alignment models.
        /// </summary>
        /// <param name="parameters">SMT parameters; frozen by this constructor.</param>
        public ThotSmtModel(ThotSmtParameters parameters)
        {
            Parameters = parameters;
            Parameters.Freeze();

            Handle = Thot.LoadSmtModel(Parameters);

            // Wrap the direct alignment model first, then the inverse one.
            var directHandle = Thot.smtModel_getSingleWordAlignmentModel(Handle);
            _directWordAlignmentModel = new ThotWordAlignmentModel(directHandle);

            var inverseHandle = Thot.smtModel_getInverseSingleWordAlignmentModel(Handle);
            _inverseWordAlignmentModel = new ThotWordAlignmentModel(inverseHandle);
        }
Пример #17
0
        /// <summary>
        /// Asks the native language model for the probability of a segment.
        /// </summary>
        /// <param name="segment">Tokens of the segment.</param>
        /// <returns>The sentence probability reported by the native model.</returns>
        public double GetSegmentProbability(IEnumerable<string> segment)
        {
            IntPtr nativeTokens = Thot.ConvertStringsToNativeUtf8(segment);
            double probability;

            try
            {
                probability = Thot.langModel_getSentenceProbability(_handle, nativeTokens);
            }
            finally
            {
                // The native token buffer is released whether or not the call succeeded.
                Marshal.FreeHGlobal(nativeTokens);
            }

            return probability;
        }
Пример #18
0
        /// <summary>
        /// Initializes a new instance of the <see cref="ThotWordAlignmentModel"/> class that is associated with
        /// a file prefix, either opening the existing model or creating a new one.
        /// </summary>
        /// <param name="prefFileName">File prefix of the model; the existence check looks at this prefix plus ".src".</param>
        /// <param name="createNew">
        /// <c>true</c> to create a new native model without checking for existing files; <c>false</c> to open
        /// the existing model.
        /// </param>
        /// <exception cref="FileNotFoundException">
        /// <paramref name="createNew"/> is <c>false</c> and the ".src" file for the prefix does not exist.
        /// </exception>
        public ThotWordAlignmentModel(string prefFileName, bool createNew = false)
        {
            string sourceFileName = prefFileName + ".src";
            if (!createNew && !File.Exists(sourceFileName))
            {
                // Include the file name so the failure can be diagnosed from the exception alone.
                throw new FileNotFoundException("The word alignment model configuration could not be found.",
                                                sourceFileName);
            }

            _prefFileName = prefFileName;

            // Past the guard above, createNew alone decides between create and open. Re-checking the file
            // system here could silently create an empty model if the file disappeared between the checks.
            Handle = createNew
                ? Thot.swAlignModel_create()
                : Thot.swAlignModel_open(_prefFileName);

            _sourceWords = new ThotWordVocabulary(Handle, true);
            _targetWords = new ThotWordVocabulary(Handle, false);
            _closeOnDispose = true;
        }
Пример #19
0
        /// <summary>
        /// Adds a source/target segment pair to the native word alignment model.
        /// </summary>
        /// <param name="sourceSegment">Source-side tokens.</param>
        /// <param name="targetSegment">Target-side tokens.</param>
        public void AddSegmentPair(IReadOnlyList<string> sourceSegment, IReadOnlyList<string> targetSegment)
        {
            CheckDisposed();

            IntPtr nativeSourceSegment = IntPtr.Zero;
            IntPtr nativeTargetSegment = IntPtr.Zero;

            try
            {
                // Allocate inside the try block: if converting the target segment throws, the source buffer
                // that already exists is still released below.
                nativeSourceSegment = Thot.ConvertStringsToNativeUtf8(sourceSegment);
                nativeTargetSegment = Thot.ConvertStringsToNativeUtf8(targetSegment);

                Thot.swAlignModel_addSentencePair(Handle, nativeSourceSegment, nativeTargetSegment);
            }
            finally
            {
                // Marshal.FreeHGlobal does nothing when handed IntPtr.Zero.
                Marshal.FreeHGlobal(nativeTargetSegment);
                Marshal.FreeHGlobal(nativeSourceSegment);
            }
        }
Пример #20
0
        /// <summary>
        /// Looks up the translation probability for a pair of words in the native model.
        /// </summary>
        /// <param name="sourceWord">Source word; <c>null</c> is sent to the native model as the literal "NULL".</param>
        /// <param name="targetWord">Target word; <c>null</c> is sent to the native model as the literal "NULL".</param>
        /// <returns>The probability reported by the native model.</returns>
        public double GetTranslationProbability(string sourceWord, string targetWord)
        {
            CheckDisposed();

            IntPtr nativeSourceWord = IntPtr.Zero;
            IntPtr nativeTargetWord = IntPtr.Zero;

            try
            {
                // Allocate inside the try block: if converting the target word throws, the source buffer that
                // already exists is still released below.
                nativeSourceWord = Thot.ConvertStringToNativeUtf8(sourceWord ?? "NULL");
                nativeTargetWord = Thot.ConvertStringToNativeUtf8(targetWord ?? "NULL");

                return Thot.swAlignModel_getTranslationProbability(Handle, nativeSourceWord, nativeTargetWord);
            }
            finally
            {
                // Marshal.FreeHGlobal does nothing when handed IntPtr.Zero.
                Marshal.FreeHGlobal(nativeTargetWord);
                Marshal.FreeHGlobal(nativeSourceWord);
            }
        }
Пример #21
0
            /// <summary>
            /// Saves via the base implementation and then reloads the owning SMT model's native handles.
            /// </summary>
            /// <remarks>
            /// Every engine's decoder handle and the SMT model handle are closed before <c>base.Save()</c> runs.
            /// Afterwards the model is reloaded with this object's <c>Parameters</c>, the two word alignment
            /// model handles are re-pointed at the reloaded model, and every engine loads a fresh decoder handle.
            /// NOTE(review): if <c>base.Save()</c> throws, the handles closed beforehand are not reloaded -
            /// confirm whether callers can recover from that state.
            /// </remarks>
            public override void Save()
            {
                // Close the decoders first, then the model they were loaded from.
                foreach (ThotSmtEngine engine in _smtModel._engines)
                {
                    engine.CloseHandle();
                }
                Thot.smtModel_close(_smtModel.Handle);

                base.Save();

                // Reload the model with the new parameters and refresh every handle that referred to the old one.
                _smtModel.Parameters = Parameters;
                _smtModel.Handle     = Thot.LoadSmtModel(_smtModel.Parameters);
                _smtModel._directWordAlignmentModel.Handle  = Thot.smtModel_getSingleWordAlignmentModel(_smtModel.Handle);
                _smtModel._inverseWordAlignmentModel.Handle =
                    Thot.smtModel_getInverseSingleWordAlignmentModel(_smtModel.Handle);
                foreach (ThotSmtEngine engine in _smtModel._engines)
                {
                    engine.LoadHandle();
                }
            }
Пример #22
0
        /// <summary>
        /// Produces an n-best list (of size <c>K</c>) for every segment of a corpus, in corpus order.
        /// </summary>
        /// <remarks>
        /// A single SMT model is loaded and shared by all partitions. Each partition of the index range loads
        /// its own decoder on top of that model and closes it when the partition is finished.
        /// </remarks>
        /// <param name="parameters">Parameters used to load the SMT model and the decoders.</param>
        /// <param name="sourceCorpus">Tokenized source segments.</param>
        /// <returns>One n-best list per source segment, at the same index.</returns>
        private IEnumerable<IList<TranslationInfo>> GetNBestLists(ThotSmtParameters parameters,
                                                                   IReadOnlyList<IReadOnlyList<string>> sourceCorpus)
        {
            IntPtr sharedModel = IntPtr.Zero;

            try
            {
                sharedModel = Thot.LoadSmtModel(parameters);

                int segmentCount = sourceCorpus.Count;
                var nbestPerSegment = new IList<TranslationInfo>[segmentCount];

                Parallel.ForEach(Partitioner.Create(0, segmentCount), chunk =>
                {
                    IntPtr decoder = IntPtr.Zero;
                    try
                    {
                        decoder = Thot.LoadDecoder(sharedModel, parameters);

                        // Each partition writes only to its own index range of the shared array.
                        for (int index = chunk.Item1; index < chunk.Item2; index++)
                        {
                            IReadOnlyList<string> source = sourceCorpus[index];
                            nbestPerSegment[index] = Thot.DoTranslateNBest(decoder, Thot.decoder_translateNBest, K,
                                                                           source, false, source,
                                                                           CreateTranslationInfo).ToArray();
                        }
                    }
                    finally
                    {
                        if (decoder != IntPtr.Zero)
                        {
                            Thot.decoder_close(decoder);
                        }
                    }
                });

                return nbestPerSegment;
            }
            finally
            {
                // The model is closed only after every partition has closed its decoder.
                if (sharedModel != IntPtr.Zero)
                {
                    Thot.smtModel_close(sharedModel);
                }
            }
        }
Пример #23
0
        /// <summary>
        /// Translates a segment through the native single-best decoder entry point.
        /// </summary>
        /// <param name="segment">Tokens of the segment to translate.</param>
        /// <returns>The result built by <c>CreateResult</c> for the decoder output.</returns>
        public TranslationResult Translate(IReadOnlyList<string> segment)
        {
            CheckDisposed();

            // The segment is passed twice: once as the decoder input and once as the source segment handed to
            // the result factory.
            TranslationResult translation = Thot.DoTranslate(_decoderHandle, Thot.decoder_translate, segment, false,
                                                             segment, CreateResult);
            return translation;
        }
Пример #24
0
 /// <summary>
 /// Initializes a new instance of the <see cref="ThotLanguageModel"/> class by opening the native language
 /// model for the given prefix.
 /// </summary>
 /// <param name="lmPrefix">Prefix passed to the native open call.</param>
 public ThotLanguageModel(string lmPrefix)
 {
     var openedModel = Thot.langModel_open(lmPrefix);
     _handle = openedModel;
 }
Пример #25
0
 /// <summary>
 /// Closes the native language model handle held by this instance.
 /// </summary>
 protected override void DisposeUnmanagedResources()
 {
     var modelHandle = _handle;
     Thot.langModel_close(modelHandle);
 }
Пример #26
0
        /// <summary>
        /// Marshals the tuning references and the accumulated n-best lists into native memory and asks the
        /// native weight updater to update <paramref name="curWeights"/>.
        /// </summary>
        /// <remarks>
        /// Every native block is recorded as soon as it is allocated and released in the <c>finally</c> block.
        /// Previously all allocations happened before the <c>try</c>, so an exception while the structures were
        /// being built leaked everything allocated up to that point.
        /// </remarks>
        /// <param name="weightUpdaterHandle">Handle of the native weight updater.</param>
        /// <param name="tuneTargetCorpus">Reference translations, one per tuning segment.</param>
        /// <param name="nbestLists">Candidate translations per tuning segment.</param>
        /// <param name="curWeights">Weight array handed to the native updater.</param>
        private static void UpdateWeights(IntPtr weightUpdaterHandle,
                                          IReadOnlyList<IReadOnlyList<string>> tuneTargetCorpus, HashSet<TranslationInfo>[] nbestLists,
                                          float[] curWeights)
        {
            int sizeOfPtr = Marshal.SizeOf<IntPtr>();
            int sizeOfDouble = Marshal.SizeOf<double>();

            // Every native block allocated below, so that cleanup does not depend on how far the setup got.
            var nativeBlocks = new List<IntPtr>();

            try
            {
                var nativeTuneTargetCorpus = new IntPtr[tuneTargetCorpus.Count];
                for (int i = 0; i < nativeTuneTargetCorpus.Length; i++)
                {
                    IntPtr nativeReference = Thot.ConvertStringsToNativeUtf8(tuneTargetCorpus[i]);
                    nativeBlocks.Add(nativeReference);
                    nativeTuneTargetCorpus[i] = nativeReference;
                }

                // Two parallel pointer arrays: one entry per tuning segment.
                IntPtr nativeNBestLists = Marshal.AllocHGlobal(nbestLists.Length * sizeOfPtr);
                nativeBlocks.Add(nativeNBestLists);
                IntPtr nativeScoreComps = Marshal.AllocHGlobal(nbestLists.Length * sizeOfPtr);
                nativeBlocks.Add(nativeScoreComps);
                var nativeNBestListLens = new uint[nbestLists.Length];

                for (int i = 0; i < nbestLists.Length; i++)
                {
                    IntPtr nativeNBestList = Marshal.AllocHGlobal(nbestLists[i].Count * sizeOfPtr);
                    nativeBlocks.Add(nativeNBestList);
                    IntPtr nativeListScoreComps = Marshal.AllocHGlobal(nbestLists[i].Count * sizeOfPtr);
                    nativeBlocks.Add(nativeListScoreComps);

                    int j = 0;
                    foreach (TranslationInfo ti in nbestLists[i])
                    {
                        IntPtr nativeCandidate = Thot.ConvertStringsToNativeUtf8(ti.Translation);
                        nativeBlocks.Add(nativeCandidate);
                        Marshal.WriteIntPtr(nativeNBestList, j * sizeOfPtr, nativeCandidate);

                        // All score components except the last one are passed to the native updater.
                        int compCount = ti.ScoreComponents.Length - 1;
                        IntPtr nativeTransScoreComps = Marshal.AllocHGlobal(compCount * sizeOfDouble);
                        nativeBlocks.Add(nativeTransScoreComps);
                        Marshal.Copy(ti.ScoreComponents, 0, nativeTransScoreComps, compCount);
                        Marshal.WriteIntPtr(nativeListScoreComps, j * sizeOfPtr, nativeTransScoreComps);
                        j++;
                    }

                    Marshal.WriteIntPtr(nativeNBestLists, i * sizeOfPtr, nativeNBestList);
                    Marshal.WriteIntPtr(nativeScoreComps, i * sizeOfPtr, nativeListScoreComps);
                    nativeNBestListLens[i] = (uint)nbestLists[i].Count;
                }

                // The weight count passed is one less than the array length, mirroring the score components.
                Thot.llWeightUpdater_updateClosedCorpus(weightUpdaterHandle, nativeTuneTargetCorpus, nativeNBestLists,
                                                        nativeScoreComps, nativeNBestListLens,
                                                        curWeights, (uint)nbestLists.Length, (uint)curWeights.Length - 1);
            }
            finally
            {
                // Release in reverse allocation order; the blocks are independent allocations.
                for (int i = nativeBlocks.Count - 1; i >= 0; i--)
                {
                    Marshal.FreeHGlobal(nativeBlocks[i]);
                }
            }
        }
Пример #27
0
        /// <summary>
        /// Iteratively tunes the model weights against a tuning corpus and returns the parameter set that
        /// achieved the best BLEU score.
        /// </summary>
        /// <remarks>
        /// Each iteration decodes the tuning source corpus with the current weights, scores the top candidate
        /// of every n-best list with BLEU, merges the new candidates into the accumulated n-best sets, and lets
        /// the native weight updater adjust the weights. The loop stops after <c>MaxIterations</c> iterations or
        /// once <c>IsTuningConverged</c> reports convergence.
        /// </remarks>
        /// <param name="parameters">Starting parameters; cloned for every iteration, never modified.</param>
        /// <param name="tuneSourceCorpus">Tokenized tuning source segments.</param>
        /// <param name="tuneTargetCorpus">Tokenized reference translations.</param>
        /// <param name="stats">Receives the best BLEU score in <c>TranslationModelBleu</c>.</param>
        /// <param name="progress">Receives one progress report per iteration.</param>
        /// <returns>The frozen parameters of the best-scoring iteration.</returns>
        public ThotSmtParameters Tune(ThotSmtParameters parameters,
                                      IReadOnlyList<IReadOnlyList<string>> tuneSourceCorpus,
                                      IReadOnlyList<IReadOnlyList<string>> tuneTargetCorpus, SmtBatchTrainStats stats,
                                      IProgress<ProgressStatus> progress)
        {
            IntPtr weightUpdaterHandle = Thot.llWeightUpdater_create();

            try
            {
                var iterQualities = new List<double>();
                double bestQuality = double.MinValue;
                ThotSmtParameters bestParameters = null;
                int iter = 0;
                HashSet<TranslationInfo>[] curNBestLists = null;
                float[] curWeights = parameters.ModelWeights.ToArray();

                while (true)
                {
                    progress.Report(new ProgressStatus(iter, MaxIterations));

                    ThotSmtParameters newParameters = parameters.Clone();
                    // Hand the snapshot its own copy of the weights. curWeights is updated in place by
                    // UpdateWeights below; sharing the array would let later iterations overwrite the weights
                    // of a parameter set that was already recorded as the best one.
                    newParameters.ModelWeights = curWeights.ToArray();
                    newParameters.Freeze();

                    IList<TranslationInfo>[] nbestLists = GetNBestLists(newParameters, tuneSourceCorpus).ToArray();
                    double quality = Evaluation.CalculateBleu(nbestLists.Select(nbl => nbl.First().Translation),
                                                              tuneTargetCorpus);
                    iterQualities.Add(quality);
                    if (quality > bestQuality)
                    {
                        bestQuality = quality;
                        bestParameters = newParameters;
                    }

                    iter++;
                    if (iter >= MaxIterations || IsTuningConverged(iterQualities))
                    {
                        break;
                    }

                    // Accumulate candidates across iterations so the updater sees every translation so far.
                    if (curNBestLists == null)
                    {
                        curNBestLists = nbestLists.Select(nbl => new HashSet<TranslationInfo>(nbl)).ToArray();
                    }
                    else
                    {
                        for (int i = 0; i < nbestLists.Length; i++)
                        {
                            curNBestLists[i].UnionWith(nbestLists[i]);
                        }
                    }

                    UpdateWeights(weightUpdaterHandle, tuneTargetCorpus, curNBestLists, curWeights);
                }

                // Stopped early through convergence: report completion explicitly.
                if (iter < MaxIterations)
                {
                    progress.Report(new ProgressStatus(1.0));
                }

                stats.TranslationModelBleu = bestQuality;
                return bestParameters;
            }
            finally
            {
                Thot.llWeightUpdater_close(weightUpdaterHandle);
            }
        }
Пример #28
0
 /// <summary>
 /// Closes this engine's native decoder handle.
 /// </summary>
 internal void CloseHandle()
 {
     var decoder = _decoderHandle;
     Thot.decoder_close(decoder);
 }
Пример #29
0
 /// <summary>
 /// Loads a native decoder from the owning SMT model's current handle and parameters and stores it as this
 /// engine's decoder handle.
 /// </summary>
 internal void LoadHandle()
 {
     var decoder = Thot.LoadDecoder(_smtModel.Handle, _smtModel.Parameters);
     _decoderHandle = decoder;
 }
Пример #30
0
        /// <summary>
        /// Runs the native training call on the word alignment model with an argument of 1.
        /// </summary>
        public void TrainingIteration()
        {
            CheckDisposed();

            // NOTE(review): the second argument is presumably the number of iterations - confirm against the
            // native API.
            const int trainArgument = 1;
            Thot.swAlignModel_train(Handle, trainArgument);
        }