/// <summary>
/// Initializes a new instance backed by a freshly created native word alignment model.
/// </summary>
public ThotWordAlignmentModel()
{
    Handle = Thot.swAlignModel_create();

    // Both vocabularies wrap the same native handle; the boolean selects the side
    // (true for the source words, false for the target words).
    _sourceWords = new ThotWordVocabulary(Handle, true);
    _targetWords = new ThotWordVocabulary(Handle, false);

    // This instance created the handle, so it must close it on dispose.
    _closeOnDispose = true;
}
/// <summary>
/// Trains the translation model whose files share the prefix <paramref name="tmPrefix"/>.
/// </summary>
/// <remarks>
/// Two word alignment models are generated (one over the corpus, one over the inverted corpus),
/// their best alignments are merged, a phrase table is generated and filtered, and finally the
/// fixed model parameter files are written.
/// </remarks>
/// <param name="tmPrefix">The path prefix of every translation model file.</param>
/// <param name="reporter">The progress reporter that is notified of each step.</param>
private void TrainTranslationModel(string tmPrefix, ThotTrainProgressReporter reporter)
{
    string invswmPrefix = tmPrefix + "_invswm";
    string swmPrefix = tmPrefix + "_swm";
    string mergedAlignmentsFile = tmPrefix + ".A3.final";
    string phraseTableFile = tmPrefix + ".ttable";

    GenerateSingleWordAlignmentModel(invswmPrefix, _sourcePreprocessor, _targetPreprocessor, _parallelCorpus,
        "direct", reporter);
    GenerateSingleWordAlignmentModel(swmPrefix, _targetPreprocessor, _sourcePreprocessor,
        _parallelCorpus.Invert(), "inverse", reporter);

    reporter.Step("Merging alignments");
    Thot.giza_symmetr1(swmPrefix + ".bestal", invswmPrefix + ".bestal", mergedAlignmentsFile, true);

    reporter.Step("Generating phrase table");
    Thot.phraseModel_generate(mergedAlignmentsFile, 10, phraseTableFile);

    reporter.Step("Filtering phrase table");
    FilterPhraseTableNBest(phraseTableFile, 20);

    // Fixed parameter files that accompany the phrase table.
    File.WriteAllText(tmPrefix + ".lambda", "0.7 0.7");
    File.WriteAllText(tmPrefix + ".srcsegmlentable", "Uniform");
    File.WriteAllText(tmPrefix + ".trgcutstable", "0.999");
    File.WriteAllText(tmPrefix + ".trgsegmlentable", "Geometric");
}
/// <summary>
/// Trains the translation model whose files share the prefix <paramref name="tmPrefix"/>,
/// reporting progress as a sequence of phases.
/// </summary>
/// <remarks>
/// Two word alignment models are generated (one over the corpus, one over the inverted corpus).
/// The alignment merge, phrase table generation and phrase table filtering each run inside their
/// own progress phase. The fixed model parameter files are written last.
/// </remarks>
/// <param name="tmPrefix">The path prefix of every translation model file.</param>
/// <param name="reporter">The progress reporter that supplies the phases.</param>
private void TrainTranslationModel(string tmPrefix, ThotTrainProgressReporter reporter)
{
    string invswmPrefix = tmPrefix + "_invswm";
    string swmPrefix = tmPrefix + "_swm";
    string mergedAlignmentsFile = tmPrefix + ".A3.final";
    string phraseTableFile = tmPrefix + ".ttable";

    GenerateWordAlignmentModel(invswmPrefix, _sourcePreprocessor, _targetPreprocessor, _parallelCorpus,
        reporter, false);
    GenerateWordAlignmentModel(swmPrefix, _targetPreprocessor, _sourcePreprocessor, _parallelCorpus.Invert(),
        reporter, true);

    // Each phase object is disposed as soon as its single operation has finished.
    using (PhaseProgress mergePhase = reporter.StartNextPhase())
    {
        Thot.giza_symmetr1(swmPrefix + ".bestal", invswmPrefix + ".bestal", mergedAlignmentsFile, true);
    }

    using (PhaseProgress generatePhase = reporter.StartNextPhase())
    {
        Thot.phraseModel_generate(mergedAlignmentsFile, 10, phraseTableFile);
    }

    using (PhaseProgress filterPhase = reporter.StartNextPhase())
    {
        FilterPhraseTableNBest(phraseTableFile, 20);
    }

    // Fixed parameter files that accompany the phrase table.
    File.WriteAllText(tmPrefix + ".lambda", "0.7 0.7");
    File.WriteAllText(tmPrefix + ".srcsegmlentable", "Uniform");
    File.WriteAllText(tmPrefix + ".trgcutstable", "0.999");
    File.WriteAllText(tmPrefix + ".trgsegmlentable", "Geometric");
}
/// <summary>
/// Asks the native model for the best word alignment between two segments.
/// </summary>
/// <param name="sourceSegment">The source segment words.</param>
/// <param name="targetSegment">The target segment words.</param>
/// <returns>The alignment matrix converted from the native result.</returns>
public WordAlignmentMatrix GetBestAlignment(IReadOnlyList<string> sourceSegment,
    IReadOnlyList<string> targetSegment)
{
    CheckDisposed();

    IntPtr nativeSourceSegment = IntPtr.Zero;
    IntPtr nativeTargetSegment = IntPtr.Zero;
    IntPtr nativeMatrix = IntPtr.Zero;
    // Row count the matrix was actually allocated with; the native call receives iLen by
    // reference and may change it, so it must not be used to free the matrix.
    uint allocatedRowCount = 0;
    try
    {
        // All native allocations happen inside the try block so that a failure in a later
        // conversion/allocation cannot leak the buffers that were already created.
        nativeSourceSegment = Thot.ConvertStringsToNativeUtf8(sourceSegment);
        nativeTargetSegment = Thot.ConvertStringsToNativeUtf8(targetSegment);
        nativeMatrix = Thot.AllocNativeMatrix(sourceSegment.Count, targetSegment.Count);
        allocatedRowCount = (uint)sourceSegment.Count;

        uint iLen = (uint)sourceSegment.Count;
        uint jLen = (uint)targetSegment.Count;
        Thot.swAlignModel_getBestAlignment(Handle, nativeSourceSegment, nativeTargetSegment, nativeMatrix,
            ref iLen, ref jLen);
        return Thot.ConvertNativeMatrixToWordAlignmentMatrix(nativeMatrix, iLen, jLen);
    }
    finally
    {
        if (nativeMatrix != IntPtr.Zero)
        {
            Thot.FreeNativeMatrix(nativeMatrix, allocatedRowCount);
        }
        // Marshal.FreeHGlobal is a no-op for IntPtr.Zero.
        Marshal.FreeHGlobal(nativeTargetSegment);
        Marshal.FreeHGlobal(nativeSourceSegment);
    }
}
/// <summary>
/// Closes the native word alignment model, but only when this instance is flagged to close it.
/// </summary>
protected override void DisposeUnmanagedResources()
{
    if (!_closeOnDispose)
    {
        return;
    }

    Thot.swAlignModel_close(Handle);
}
/// <summary>
/// Adds a sentence pair to the native model, optionally with an alignment hint.
/// </summary>
/// <param name="sourceSegment">The source segment words.</param>
/// <param name="targetSegment">The target segment words.</param>
/// <param name="hintMatrix">
/// An optional alignment matrix passed to the native call; when <c>null</c>, a null matrix with
/// zero dimensions is passed instead.
/// </param>
public void AddSegmentPair(IReadOnlyList<string> sourceSegment, IReadOnlyList<string> targetSegment,
    WordAlignmentMatrix hintMatrix = null)
{
    CheckDisposed();

    IntPtr nativeSourceSegment = IntPtr.Zero;
    IntPtr nativeTargetSegment = IntPtr.Zero;
    IntPtr nativeMatrix = IntPtr.Zero;
    uint iLen = 0, jLen = 0;
    try
    {
        // Allocate inside the try block so that a failure while converting a later argument
        // (e.g. a null target segment) does not leak the buffers created before it.
        nativeSourceSegment = Thot.ConvertStringsToNativeUtf8(sourceSegment);
        nativeTargetSegment = Thot.ConvertStringsToNativeUtf8(targetSegment);
        if (hintMatrix != null)
        {
            nativeMatrix = Thot.ConvertWordAlignmentMatrixToNativeMatrix(hintMatrix);
            iLen = (uint)hintMatrix.RowCount;
            jLen = (uint)hintMatrix.ColumnCount;
        }

        Thot.swAlignModel_addSentencePair(Handle, nativeSourceSegment, nativeTargetSegment, nativeMatrix,
            iLen, jLen);
    }
    finally
    {
        // When no matrix was created this is called with (IntPtr.Zero, 0), exactly as before.
        Thot.FreeNativeMatrix(nativeMatrix, iLen);
        // Marshal.FreeHGlobal is a no-op for IntPtr.Zero.
        Marshal.FreeHGlobal(nativeTargetSegment);
        Marshal.FreeHGlobal(nativeSourceSegment);
    }
}
/// <summary>
/// Translates a segment through <c>Thot.DoTranslateNBest</c> using this engine's decoder.
/// </summary>
/// <param name="n">The n-best count forwarded to the decoder.</param>
/// <param name="segment">The segment to translate.</param>
/// <returns>The results built by <c>CreateResult</c>.</returns>
public IEnumerable<TranslationResult> Translate(int n, IReadOnlyList<string> segment)
{
    CheckDisposed();

    IEnumerable<TranslationResult> nbestResults = Thot.DoTranslateNBest(_decoderHandle,
        Thot.decoder_translateNBest, n, segment, false, segment, CreateResult);
    return nbestResults;
}
/// <summary>
/// Retrieves the decoder's word graph for a segment.
/// </summary>
/// <param name="segment">The segment to decode.</param>
/// <returns>The word graph built from the native string form and initial state score.</returns>
public WordGraph GetWordGraph(IReadOnlyList<string> segment)
{
    CheckDisposed();

    IntPtr nativeSegment = Thot.ConvertStringsToNativeUtf8(segment);
    IntPtr nativeGraph = IntPtr.Zero;
    IntPtr graphTextBuffer = IntPtr.Zero;
    try
    {
        nativeGraph = Thot.decoder_getWordGraph(_decoderHandle, nativeSegment);

        // First call with a null buffer yields the required length; second call fills the buffer.
        uint textLength = Thot.wg_getString(nativeGraph, IntPtr.Zero, 0);
        graphTextBuffer = Marshal.AllocHGlobal((int)textLength);
        Thot.wg_getString(nativeGraph, graphTextBuffer, textLength);

        string graphText = Thot.ConvertNativeUtf8ToString(graphTextBuffer, textLength);
        double initialScore = Thot.wg_getInitialStateScore(nativeGraph);
        return CreateWordGraph(segment, graphText, initialScore);
    }
    finally
    {
        if (graphTextBuffer != IntPtr.Zero)
        {
            Marshal.FreeHGlobal(graphTextBuffer);
        }

        if (nativeGraph != IntPtr.Zero)
        {
            Thot.wg_destroy(nativeGraph);
        }

        Marshal.FreeHGlobal(nativeSegment);
    }
}
/// <summary>
/// Translates every segment of a corpus in parallel.
/// </summary>
/// <remarks>
/// The corpus is split into index ranges. Each range loads its own SMT model and decoder,
/// translates its segments, and closes both handles when done.
/// </remarks>
/// <param name="parameters">The parameters used to load the SMT model and decoder.</param>
/// <param name="sourceCorpus">The segments to translate.</param>
/// <returns>The translations, at the same indices as their source segments.</returns>
private static IEnumerable<IReadOnlyList<string>> GenerateTranslations(ThotSmtParameters parameters,
    IReadOnlyList<IReadOnlyList<string>> sourceCorpus)
{
    int segmentCount = sourceCorpus.Count;
    var translations = new IReadOnlyList<string>[segmentCount];

    Parallel.ForEach(Partitioner.Create(0, segmentCount), range =>
    {
        IntPtr modelHandle = IntPtr.Zero;
        IntPtr decoder = IntPtr.Zero;
        try
        {
            modelHandle = Thot.LoadSmtModel(parameters);
            decoder = Thot.LoadDecoder(modelHandle, parameters);

            int index = range.Item1;
            while (index < range.Item2)
            {
                IReadOnlyList<string> source = sourceCorpus[index];
                // The result factory keeps only the target segment.
                translations[index] = Thot.DoTranslate(decoder, Thot.decoder_translate, source, false, source,
                    (s, t, d) => t);
                index++;
            }
        }
        finally
        {
            // Close the decoder before the model it was loaded from.
            if (decoder != IntPtr.Zero)
            {
                Thot.decoder_close(decoder);
            }

            if (modelHandle != IntPtr.Zero)
            {
                Thot.smtModel_close(modelHandle);
            }
        }
    });

    return translations;
}
/// <summary>
/// Gets the translation probability for a pair of word indices from the native model.
/// </summary>
/// <param name="sourceWordIndex">The source word index (cast to an unsigned value).</param>
/// <param name="targetWordIndex">The target word index (cast to an unsigned value).</param>
/// <returns>The probability returned by the native model.</returns>
public double GetTranslationProbability(int sourceWordIndex, int targetWordIndex)
{
    CheckDisposed();

    uint nativeSourceIndex = (uint)sourceWordIndex;
    uint nativeTargetIndex = (uint)targetWordIndex;
    return Thot.swAlignModel_getTranslationProbabilityByIndex(Handle, nativeSourceIndex, nativeTargetIndex);
}
/// <summary>
/// Gets the alignment probability from the HMM single word alignment model. Use -1 for unaligned
/// indices that occur before the first aligned index. Other unaligned indices are indicated by
/// adding the source length to the previously aligned index.
/// </summary>
/// <param name="sourceLen">The length of the source segment.</param>
/// <param name="prevSourceIndex">The zero-based source index of the previous alignment.</param>
/// <param name="sourceIndex">The zero-based source index of the current alignment.</param>
/// <returns>The probability returned by the native model.</returns>
public double GetAlignmentProbability(int sourceLen, int prevSourceIndex, int sourceIndex)
{
    CheckDisposed();

    // Thot position indices are 1-based, so shift both specified indices by one.
    uint prevPosition = (uint)(prevSourceIndex + 1);
    uint position = (uint)(sourceIndex + 1);
    return Thot.swAlignModel_getAlignmentProbability(Handle, prevPosition, (uint)sourceLen, position);
}
/// <summary>
/// Builds a <see cref="TranslationInfo"/> from native translation data.
/// </summary>
/// <param name="sourceSegment">The source segment (not used by this factory).</param>
/// <param name="targetSegment">The translated segment.</param>
/// <param name="data">The native translation data handle.</param>
/// <returns>The score components paired with the target segment.</returns>
private static TranslationInfo CreateTranslationInfo(IReadOnlyList<string> sourceSegment,
    IReadOnlyList<string> targetSegment, IntPtr data)
{
    // The native call fills a fixed-size buffer of eight score components.
    double[] components = new double[8];
    uint componentCount = (uint)components.Length;
    Thot.tdata_getScoreComponents(data, components, componentCount);

    return new TranslationInfo(components, targetSegment);
}
/// <summary>
/// Saves the native model under the file prefix this instance was created with.
/// </summary>
/// <exception cref="InvalidOperationException">No file prefix is set for this model.</exception>
public void Save()
{
    CheckDisposed();

    bool hasPrefix = !string.IsNullOrEmpty(_prefFileName);
    if (!hasPrefix)
    {
        throw new InvalidOperationException("This word alignment model cannot be saved.");
    }

    Thot.swAlignModel_save(Handle, _prefFileName);
}
/// <summary>
/// Copies the word at <paramref name="index"/> into a native buffer, reading from the source or
/// target side of the model depending on <c>_isSource</c>.
/// </summary>
/// <param name="index">The word index.</param>
/// <param name="nativeWordStr">The native buffer that receives the word.</param>
/// <param name="capacity">The capacity of <paramref name="nativeWordStr"/>.</param>
/// <returns>The length reported by the native call.</returns>
private uint GetWordNative(uint index, IntPtr nativeWordStr, uint capacity)
{
    return _isSource
        ? Thot.swAlignModel_getSourceWord(_swAlignModelHandle, index, nativeWordStr, capacity)
        : Thot.swAlignModel_getTargetWord(_swAlignModelHandle, index, nativeWordStr, capacity);
}
/// <summary>
/// Gets the word at <paramref name="index"/> as a managed string.
/// </summary>
/// <remarks>
/// The caller's buffer is tried first. When the native call reports a length greater than
/// <paramref name="capacity"/>, the word is fetched again into a temporary buffer that is freed
/// before returning. The caller's buffer is never reallocated: <paramref name="nativeWordStr"/> is
/// passed by value, so reallocating it here would leave the caller holding a possibly stale
/// pointer and leak the new block.
/// </remarks>
/// <param name="index">The word index.</param>
/// <param name="nativeWordStr">A caller-owned native buffer used for the first attempt.</param>
/// <param name="capacity">
/// The capacity of <paramref name="nativeWordStr"/>. It is left unchanged because the caller's
/// buffer is not resized.
/// </param>
/// <returns>The word decoded from UTF-8.</returns>
private string GetWord(uint index, IntPtr nativeWordStr, ref uint capacity)
{
    uint len = GetWordNative(index, nativeWordStr, capacity);
    if (len <= capacity)
    {
        return Thot.ConvertNativeUtf8ToString(nativeWordStr, len);
    }

    // The word does not fit in the caller's buffer; use a private buffer of the reported size.
    IntPtr largerBuffer = Marshal.AllocHGlobal((IntPtr)len);
    try
    {
        uint actualLen = GetWordNative(index, largerBuffer, len);
        return Thot.ConvertNativeUtf8ToString(largerBuffer, actualLen);
    }
    finally
    {
        Marshal.FreeHGlobal(largerBuffer);
    }
}
/// <summary>
/// Initializes a new SMT model from the specified parameters.
/// </summary>
/// <remarks>
/// The parameters are frozen, the native SMT model is loaded, and its direct and inverse single
/// word alignment models are wrapped.
/// </remarks>
/// <param name="parameters">The model parameters; frozen by this constructor.</param>
public ThotSmtModel(ThotSmtParameters parameters)
{
    Parameters = parameters;
    Parameters.Freeze();

    Handle = Thot.LoadSmtModel(Parameters);

    // The alignment models are obtained from the SMT model's own handle.
    _directWordAlignmentModel =
        new ThotWordAlignmentModel(Thot.smtModel_getSingleWordAlignmentModel(Handle));
    _inverseWordAlignmentModel =
        new ThotWordAlignmentModel(Thot.smtModel_getInverseSingleWordAlignmentModel(Handle));
}
/// <summary>
/// Gets the sentence probability of a segment from the native language model.
/// </summary>
/// <param name="segment">The words of the segment.</param>
/// <returns>The value returned by the native language model.</returns>
public double GetSegmentProbability(IEnumerable<string> segment)
{
    IntPtr nativeWords = Thot.ConvertStringsToNativeUtf8(segment);
    try
    {
        double probability = Thot.langModel_getSentenceProbability(_handle, nativeWords);
        return probability;
    }
    finally
    {
        // The native copy of the segment is only needed for the duration of the call.
        Marshal.FreeHGlobal(nativeWords);
    }
}
/// <summary>
/// Initializes a new instance that is bound to the model files with the specified prefix.
/// </summary>
/// <param name="prefFileName">The file name prefix of the model.</param>
/// <param name="createNew">
/// <c>true</c> to create a new, empty native model; <c>false</c> to open the existing model, which
/// requires the <c>.src</c> file for the prefix to exist.
/// </param>
/// <exception cref="FileNotFoundException">
/// <paramref name="createNew"/> is <c>false</c> and the <c>.src</c> file does not exist.
/// </exception>
public ThotWordAlignmentModel(string prefFileName, bool createNew = false)
{
    string configFileName = prefFileName + ".src";
    if (!createNew && !File.Exists(configFileName))
    {
        throw new FileNotFoundException("The word alignment model configuration could not be found.",
            configFileName);
    }

    _prefFileName = prefFileName;
    // The existence check above already guarantees the file is present when createNew is false,
    // so it is not repeated here; re-checking could silently create an empty model if the file
    // disappeared in between.
    Handle = createNew ? Thot.swAlignModel_create() : Thot.swAlignModel_open(_prefFileName);
    _sourceWords = new ThotWordVocabulary(Handle, true);
    _targetWords = new ThotWordVocabulary(Handle, false);
    _closeOnDispose = true;
}
/// <summary>
/// Adds a sentence pair to the native model.
/// </summary>
/// <param name="sourceSegment">The source segment words.</param>
/// <param name="targetSegment">The target segment words.</param>
public void AddSegmentPair(IReadOnlyList<string> sourceSegment, IReadOnlyList<string> targetSegment)
{
    CheckDisposed();

    IntPtr nativeSourceSegment = IntPtr.Zero;
    IntPtr nativeTargetSegment = IntPtr.Zero;
    try
    {
        // Convert inside the try block so that a failure while converting the target segment
        // (e.g. a null argument) does not leak the already converted source segment.
        nativeSourceSegment = Thot.ConvertStringsToNativeUtf8(sourceSegment);
        nativeTargetSegment = Thot.ConvertStringsToNativeUtf8(targetSegment);
        Thot.swAlignModel_addSentencePair(Handle, nativeSourceSegment, nativeTargetSegment);
    }
    finally
    {
        // Marshal.FreeHGlobal is a no-op for IntPtr.Zero.
        Marshal.FreeHGlobal(nativeTargetSegment);
        Marshal.FreeHGlobal(nativeSourceSegment);
    }
}
/// <summary>
/// Gets the translation probability for a pair of words from the native model.
/// </summary>
/// <param name="sourceWord">The source word; <c>null</c> is passed to the model as "NULL".</param>
/// <param name="targetWord">The target word; <c>null</c> is passed to the model as "NULL".</param>
/// <returns>The probability returned by the native model.</returns>
public double GetTranslationProbability(string sourceWord, string targetWord)
{
    CheckDisposed();

    // A missing word is replaced by the literal token "NULL" before conversion.
    string effectiveSource = sourceWord == null ? "NULL" : sourceWord;
    string effectiveTarget = targetWord == null ? "NULL" : targetWord;

    IntPtr sourcePtr = Thot.ConvertStringToNativeUtf8(effectiveSource);
    IntPtr targetPtr = Thot.ConvertStringToNativeUtf8(effectiveTarget);
    try
    {
        double probability = Thot.swAlignModel_getTranslationProbability(Handle, sourcePtr, targetPtr);
        return probability;
    }
    finally
    {
        Marshal.FreeHGlobal(targetPtr);
        Marshal.FreeHGlobal(sourcePtr);
    }
}
/// <summary>
/// Saves the trained model and reloads the live SMT model and its engines from the saved state.
/// </summary>
/// <remarks>
/// The order matters: every engine's decoder and then the SMT model are closed before the base
/// save runs; afterwards the model is reloaded with the new parameters, the word alignment model
/// handles are re-pointed at it, and every engine loads a new decoder.
/// </remarks>
public override void Save()
{
    ThotSmtModel model = _smtModel;

    // Release the native decoders and model before the base class saves.
    foreach (ThotSmtEngine openEngine in model._engines)
    {
        openEngine.CloseHandle();
    }
    Thot.smtModel_close(model.Handle);

    base.Save();

    // Reload from the saved state using the parameters of this trainer.
    model.Parameters = Parameters;
    model.Handle = Thot.LoadSmtModel(model.Parameters);
    model._directWordAlignmentModel.Handle = Thot.smtModel_getSingleWordAlignmentModel(model.Handle);
    model._inverseWordAlignmentModel.Handle = Thot.smtModel_getInverseSingleWordAlignmentModel(model.Handle);

    foreach (ThotSmtEngine closedEngine in model._engines)
    {
        closedEngine.LoadHandle();
    }
}
/// <summary>
/// Produces an n-best list (size <c>K</c>) for every segment of a corpus in parallel.
/// </summary>
/// <remarks>
/// A single SMT model is loaded and shared by all index ranges; each range loads and closes its
/// own decoder. The model is closed once every range has finished.
/// </remarks>
/// <param name="parameters">The parameters used to load the SMT model and decoders.</param>
/// <param name="sourceCorpus">The segments to translate.</param>
/// <returns>The n-best lists, at the same indices as their source segments.</returns>
private IEnumerable<IList<TranslationInfo>> GetNBestLists(ThotSmtParameters parameters,
    IReadOnlyList<IReadOnlyList<string>> sourceCorpus)
{
    IntPtr modelHandle = IntPtr.Zero;
    try
    {
        modelHandle = Thot.LoadSmtModel(parameters);

        int segmentCount = sourceCorpus.Count;
        var nbestLists = new IList<TranslationInfo>[segmentCount];
        Parallel.ForEach(Partitioner.Create(0, segmentCount), range =>
        {
            IntPtr decoder = IntPtr.Zero;
            try
            {
                decoder = Thot.LoadDecoder(modelHandle, parameters);

                int index = range.Item1;
                while (index < range.Item2)
                {
                    IReadOnlyList<string> source = sourceCorpus[index];
                    // Materialize the list while the decoder is still open.
                    nbestLists[index] = Thot.DoTranslateNBest(decoder, Thot.decoder_translateNBest, K, source,
                        false, source, CreateTranslationInfo).ToArray();
                    index++;
                }
            }
            finally
            {
                if (decoder != IntPtr.Zero)
                {
                    Thot.decoder_close(decoder);
                }
            }
        });

        return nbestLists;
    }
    finally
    {
        if (modelHandle != IntPtr.Zero)
        {
            Thot.smtModel_close(modelHandle);
        }
    }
}
/// <summary>
/// Translates a segment through <c>Thot.DoTranslate</c> using this engine's decoder.
/// </summary>
/// <param name="segment">The segment to translate.</param>
/// <returns>The result built by <c>CreateResult</c>.</returns>
public TranslationResult Translate(IReadOnlyList<string> segment)
{
    CheckDisposed();

    TranslationResult translation = Thot.DoTranslate(_decoderHandle, Thot.decoder_translate, segment, false,
        segment, CreateResult);
    return translation;
}
/// <summary>
/// Initializes a new instance by opening the native language model with the specified prefix.
/// </summary>
/// <param name="lmPrefix">The language model prefix passed to the native open call.</param>
public ThotLanguageModel(string lmPrefix)
{
    var openedModel = Thot.langModel_open(lmPrefix);
    _handle = openedModel;
}
/// <summary>
/// Closes the native language model handle held by this instance.
/// </summary>
protected override void DisposeUnmanagedResources()
{
    var modelHandle = _handle;
    Thot.langModel_close(modelHandle);
}
/// <summary>
/// Marshals the tuning references, n-best lists and score components to native memory and asks
/// the native weight updater to update <paramref name="curWeights"/>.
/// </summary>
/// <remarks>
/// Every native block is recorded as soon as it is allocated and released in a single
/// <c>finally</c> block, so a failure part-way through marshalling does not leak the blocks that
/// were already allocated.
/// </remarks>
/// <param name="weightUpdaterHandle">The native weight updater handle.</param>
/// <param name="tuneTargetCorpus">The reference translations.</param>
/// <param name="nbestLists">The n-best translations, one set per entry.</param>
/// <param name="curWeights">The weight array passed to the native updater.</param>
private static void UpdateWeights(IntPtr weightUpdaterHandle,
    IReadOnlyList<IReadOnlyList<string>> tuneTargetCorpus, HashSet<TranslationInfo>[] nbestLists,
    float[] curWeights)
{
    int sizeOfPtr = Marshal.SizeOf<IntPtr>();
    int sizeOfDouble = Marshal.SizeOf<double>();

    // Every HGlobal block owned by this call; all of them are freed in the finally block.
    var allocations = new List<IntPtr>();
    try
    {
        var nativeTuneTargetCorpus = new IntPtr[tuneTargetCorpus.Count];
        for (int i = 0; i < nativeTuneTargetCorpus.Length; i++)
        {
            IntPtr nativeReference = Thot.ConvertStringsToNativeUtf8(tuneTargetCorpus[i]);
            allocations.Add(nativeReference);
            nativeTuneTargetCorpus[i] = nativeReference;
        }

        // Two parallel pointer tables: one entry per n-best list.
        IntPtr nativeNBestLists = Marshal.AllocHGlobal(nbestLists.Length * sizeOfPtr);
        allocations.Add(nativeNBestLists);
        IntPtr nativeScoreComps = Marshal.AllocHGlobal(nbestLists.Length * sizeOfPtr);
        allocations.Add(nativeScoreComps);
        var nativeNBestListLens = new uint[nbestLists.Length];

        for (int i = 0; i < nbestLists.Length; i++)
        {
            IntPtr nativeNBestList = Marshal.AllocHGlobal(nbestLists[i].Count * sizeOfPtr);
            allocations.Add(nativeNBestList);
            IntPtr nativeListScoreComps = Marshal.AllocHGlobal(nbestLists[i].Count * sizeOfPtr);
            allocations.Add(nativeListScoreComps);

            int j = 0;
            foreach (TranslationInfo ti in nbestLists[i])
            {
                IntPtr nativeSegment = Thot.ConvertStringsToNativeUtf8(ti.Translation);
                allocations.Add(nativeSegment);
                Marshal.WriteIntPtr(nativeNBestList, j * sizeOfPtr, nativeSegment);

                // All score components except the last one are passed to the updater.
                int scoreCompCount = ti.ScoreComponents.Length - 1;
                IntPtr nativeTransScoreComps = Marshal.AllocHGlobal(scoreCompCount * sizeOfDouble);
                allocations.Add(nativeTransScoreComps);
                Marshal.Copy(ti.ScoreComponents, 0, nativeTransScoreComps, scoreCompCount);
                Marshal.WriteIntPtr(nativeListScoreComps, j * sizeOfPtr, nativeTransScoreComps);
                j++;
            }

            Marshal.WriteIntPtr(nativeNBestLists, i * sizeOfPtr, nativeNBestList);
            Marshal.WriteIntPtr(nativeScoreComps, i * sizeOfPtr, nativeListScoreComps);
            nativeNBestListLens[i] = (uint)nbestLists[i].Count;
        }

        // The weight count excludes the last weight, matching the score components above.
        Thot.llWeightUpdater_updateClosedCorpus(weightUpdaterHandle, nativeTuneTargetCorpus, nativeNBestLists,
            nativeScoreComps, nativeNBestListLens, curWeights, (uint)nbestLists.Length,
            (uint)curWeights.Length - 1);
    }
    finally
    {
        foreach (IntPtr allocation in allocations)
        {
            Marshal.FreeHGlobal(allocation);
        }
    }
}
/// <summary>
/// Iteratively tunes the model weights against a tuning corpus and returns the parameters that
/// achieved the highest BLEU score.
/// </summary>
/// <remarks>
/// Each iteration translates the tuning source corpus with the current weights, scores the first
/// translation of every n-best list with BLEU, and, unless the iteration limit is reached or the
/// scores have converged, merges the new n-best lists into the accumulated ones and lets the
/// native weight updater adjust the weights.
/// </remarks>
/// <param name="parameters">The starting parameters; cloned for every iteration.</param>
/// <param name="tuneSourceCorpus">The source side of the tuning corpus.</param>
/// <param name="tuneTargetCorpus">The reference translations of the tuning corpus.</param>
/// <param name="stats">Receives the best BLEU score in <c>TranslationModelBleu</c>.</param>
/// <param name="progress">Receives a status report at the start of every iteration.</param>
/// <returns>The frozen parameters of the best-scoring iteration.</returns>
public ThotSmtParameters Tune(ThotSmtParameters parameters,
    IReadOnlyList<IReadOnlyList<string>> tuneSourceCorpus,
    IReadOnlyList<IReadOnlyList<string>> tuneTargetCorpus, SmtBatchTrainStats stats,
    IProgress<ProgressStatus> progress)
{
    IntPtr updater = Thot.llWeightUpdater_create();
    try
    {
        var qualityHistory = new List<double>();
        double topQuality = double.MinValue;
        ThotSmtParameters topParameters = null;
        int iteration = 0;
        HashSet<TranslationInfo>[] pooledNBestLists = null;
        // Updated in place by UpdateWeights and reused by every candidate.
        float[] weights = parameters.ModelWeights.ToArray();

        bool finished = false;
        while (!finished)
        {
            progress.Report(new ProgressStatus(iteration, MaxIterations));

            ThotSmtParameters candidate = parameters.Clone();
            candidate.ModelWeights = weights;
            candidate.Freeze();

            IList<TranslationInfo>[] nbestLists = GetNBestLists(candidate, tuneSourceCorpus).ToArray();
            double quality = Evaluation.CalculateBleu(nbestLists.Select(nbl => nbl.First().Translation),
                tuneTargetCorpus);
            qualityHistory.Add(quality);
            if (quality > topQuality)
            {
                topQuality = quality;
                topParameters = candidate;
            }

            iteration++;
            if (iteration >= MaxIterations || IsTuningConverged(qualityHistory))
            {
                finished = true;
            }
            else
            {
                // Accumulate the n-best lists across iterations before updating the weights.
                if (pooledNBestLists != null)
                {
                    for (int i = 0; i < nbestLists.Length; i++)
                    {
                        pooledNBestLists[i].UnionWith(nbestLists[i]);
                    }
                }
                else
                {
                    pooledNBestLists = nbestLists.Select(nbl => new HashSet<TranslationInfo>(nbl)).ToArray();
                }

                UpdateWeights(updater, tuneTargetCorpus, pooledNBestLists, weights);
            }
        }

        // Stopped early (converged): report completion explicitly.
        if (iteration < MaxIterations)
        {
            progress.Report(new ProgressStatus(1.0));
        }

        stats.TranslationModelBleu = topQuality;
        return topParameters;
    }
    finally
    {
        Thot.llWeightUpdater_close(updater);
    }
}
/// <summary>
/// Closes this engine's native decoder. The stored handle value is left as is.
/// </summary>
internal void CloseHandle()
{
    var decoder = _decoderHandle;
    Thot.decoder_close(decoder);
}
/// <summary>
/// Loads a native decoder for the owning SMT model's current handle and parameters and stores it
/// as this engine's decoder handle.
/// </summary>
internal void LoadHandle()
{
    var decoder = Thot.LoadDecoder(_smtModel.Handle, _smtModel.Parameters);
    _decoderHandle = decoder;
}
/// <summary>
/// Invokes native training on the model with an argument of 1 (presumably a single iteration,
/// as the method name suggests).
/// </summary>
public void TrainingIteration()
{
    CheckDisposed();

    const int iterationCount = 1;
    Thot.swAlignModel_train(Handle, iterationCount);
}