/// <summary>
/// Translate one sentence by matching it against the loaded script.
/// </summary>
/// <param name="sourceText">Source text to match.</param>
/// <param name="desLang">Destination language (not used by this matcher).</param>
/// <param name="srcLang">Source language (not used by this matcher).</param>
/// <returns>
/// The translated line for the best-matching script position; the original
/// Japanese line when no translation is stored; or a status string
/// ("No translation available" / "无匹配文本") when matching is impossible.
/// </returns>
#region Explanation
/*
 * An Approximated Viterbi Algorithm for sparse model
 * Author: jsc723
 *
 * Our problem can be modeled as an HMM and we can apply Viterbi algorithm to decode the best
 * match for each timestep.
 * Let T[i, t] store the probability at time t that the most likely sequence so far ends at i. (i.e. most likely seq at t = (x_1, x_2, ... , x_t=i)
 * Assume there are K possible sentences (states).
 *
 * We use the following transition model:
 * P(transition from state i to j) = (1-pTransitionSkip)* v    if j == i + 1
 *                                 = pTransitionSkip * v       otherwise
 * where (1-pTransitionSkip)*v + (K-1)*pTransitionSkip*v = 1
 * (Assume K >= 2)
 * So simply use pTransitionSkip and (1-pTransitionSkip) due to normalization
 *
 * Initial probabilities P(i) = 1.0 / K
 * which is same for all states, so we can use 1.0 due to normalization
 *
 * P(state = i | observation at t) = ComputeSimilarity(jp_text[i], sourceText)
 * see the implementation below for details
 *
 * A forward step in Viterbi algorithm at time t can be described as
 * for each state i = {1, 2, ..., K} do
 *   T[i, t] <- max(k)(T[i, t-1] * P(transition from state k to i) * P(state = i | observation at t))
 *
 * This requires O(K^2), but our K > 30000, so it will be too slow for our case.
 * So we need to approximate:
 * We only consider the case when two of {T[i, t-1], P(transition from state k to i), P(state = i | observation at t)} are large.
 * Let possibleCursors be a list of large T[i, t-1], and sum(possibleCursors) == 0.8
 * For each T[i, t-1] in possibleCursors, we consider
 *   T[i, t-1]*P(i to i+1)*P(i+1 | o_t)                                     [Case 1]
 *   (the "skip from a tracked cursor" case is covered by Case 2 more
 *    efficiently, so it is skipped here)
 * For all k that P(k | o_t) is large:
 *   max(T[*, t-1])*P(i to not i+1)*P(k | o_t)                              [Case 2]
 *   (the remaining sequential case T[k-1, t-1]*P(k-1 to k)*P(k | o_t) is
 *    either negligible — when k-1 is not tracked — or already covered by
 *    Case 1 — when it is — so it is skipped as well)
 *
 * The runtime is now O(MK) where M is the maximum size of possibleCursors and is a constant.
 */
#endregion
public string Translate(string sourceText, string desLang, string srcLang)
{
    //sourceText = addNoise(addNoise2(sourceText)); //The translator is able to find the correct match on hook mode under a high noise
    Console.WriteLine(String.Format("Input:{0}", sourceText));
    if (jp_text.Count == 0)
    {
        return ("No translation available");
    }
    // Bound the input length so similarity runs on a fixed-size prefix.
    if (sourceText.Length >= R_MAX_LEN)
    {
        sourceText = sourceText.Substring(0, R_MAX_LEN - 1);
    }
    // NOTE(review): with no tracked cursors the prior is 1.0 / pTransitionNext, which
    // exceeds 1 when pTransitionNext < 1; the later revision of this method uses
    // 1.0 / MAX_CURSOR here. Left unchanged because the softmax below normalizes,
    // so only the relative scale matters — confirm the intended constant.
    double pMostLikelyPrevCursor = possibleCursors.Count == 0 ?
        1.0 / pTransitionNext : possibleCursors.Max(i => i.Value);
    CursorPriorityQueue nextCursorsPQ = new CursorPriorityQueue(MAX_CURSOR);
    for (int i = 0; i < jp_text.Count; i++)
    {
        double s = ComputeSimilarity(sourceText, jp_text[i]);
        // [Case 1] sequential transition from a tracked cursor at i-1 to i.
        if (possibleCursors.TryGetValue(i - 1, out double pPrev))
        {
            double pSequential = pPrev * pTransitionNext * s;
            nextCursorsPQ.Add(i, pSequential);
        }
        // [Case 2] skip to i from the most likely previous cursor.
        double pSkip = pMostLikelyPrevCursor * pTransitionSkip * s;
        nextCursorsPQ.Add(i, pSkip);
    }
    // Softmax over the retained candidates.
    List<int> nextCursorsIdx = nextCursorsPQ.Indices().ToList();
    var z = nextCursorsPQ.Values();
    double z_sum = z.Sum();
    if (z_sum <= 0.0)
    {
        // Every candidate scored zero: normalizing would divide by zero and fill
        // possibleCursors with NaN, corrupting every subsequent call. Reset instead.
        possibleCursors.Clear();
        return ("无匹配文本");
    }
    var z_norm = z.Select(i => i / z_sum); // extra normalization keeps exp() well-scaled
    var z_exp = z_norm.Select(i => Math.Exp(SoftmaxCoeff * i));
    double sum_z_exp = z_exp.Sum();
    List<double> z_softmax = z_exp.Select(i => i / sum_z_exp).ToList();
    // Rebuild possibleCursors; a state reached via both cases keeps its best score.
    possibleCursors.Clear();
    for (int i = 0; i < z_softmax.Count; i++)
    {
        int j = nextCursorsIdx[i];
        if (possibleCursors.TryGetValue(j, out double existing))
        {
            possibleCursors[j] = Math.Max(existing, z_softmax[i]);
        }
        else
        {
            possibleCursors[j] = z_softmax[i];
        }
    }
    int maxI = 0;
    double maxP = 0.0;
    foreach (KeyValuePair<int, double> cursor in possibleCursors)
    {
        if (maxP < cursor.Value)
        {
            maxP = cursor.Value;
            maxI = cursor.Key;
        }
        //For debug
        Console.WriteLine(String.Format("{0}:{1}", cursor.Key, jp_text[cursor.Key]));
        Console.WriteLine(cursor.Value);
    }
    if (possibleCursors.Count == 0)
    {
        return ("无匹配文本");
    }
    Console.WriteLine(String.Format("[{0}:{1}]", maxI, jp_text[maxI]));
    Console.WriteLine("------");
    // Fall back to the Japanese line when no translation is stored for this index.
    return (cn_text[maxI] == "" ? jp_text[maxI] : cn_text[maxI]);
}
/// <summary>
/// Translate one sentence by matching it against the loaded script.
/// </summary>
/// <param name="sourceText">Source text to match.</param>
/// <param name="desLang">Destination language (not used by this matcher).</param>
/// <param name="srcLang">Source language (not used by this matcher).</param>
/// <returns>
/// The translated line for the best-matching script position; the original
/// Japanese line when no translation is stored; or a status string
/// ("补丁为空" / "无匹配文本") when matching is impossible or not confident.
/// </returns>
#region Explanation
/*
 * An Approximated Viterbi Algorithm for sparse model
 * Author: jsc723
 *
 * Our problem can be modeled as an HMM and we can apply Viterbi algorithm to decode the best
 * match for each timestep.
 * Let T[i, t] store the probability at time t that the most likely sequence so far ends at i. (i.e. most likely seq at t = (x_1, x_2, ... , x_t=i)
 * Assume there are K possible sentences (states).
 *
 * We use the following transition model:
 * P(transition from state i to j) = pNext * v       if j == i + 1                       // go to next sentence
 *                                 = pShortJump * v  if j != i + 1 and abs(j - i - 1) < 10 // jump to somewhere nearby
 *                                 = pLongJump * v   otherwise                           // jump to somewhere faraway (e.g. load game data)
 * where pNext*v + 20*pShortJump*v + (K-21)*pLongJump*v == 1, v is a normalization factor
 * (Assume K >> 20)
 * Notice that we can choose any (pNext, pShortJump, pLongJump) due to normalization.
 *
 * Initial probabilities P(i) = 1.0 / K
 * which is same for all states, so we can use 1.0 due to normalization
 *
 * P(state = i | observation at t) = ComputeSimilarity(jp_text[i], sourceText)
 * see the implementation below for details
 *
 * A forward step in Viterbi algorithm at time t can be described as
 * for each state i = {1, 2, ..., K} do
 *   T[i, t] <- max(k)(T[i, t-1] * P(transition from state k to i) * P(state = i | observation at t))
 *
 * This requires O(K^2), but our K > 30000, so it will be too slow for our case.
 * So we need to approximate:
 * We only consider the case when at least one of {T[i, t-1], P(transition from state k to i)} are large.
 * Let possibleCursors be a list of large T[i, t-1], and sum(possibleCursors) == 0.8,
 * which means I'm 80% sure the previous state is one of the states in possibleCursors.
 * For each T[i, t-1] in possibleCursors, we consider
 *   [Case 1] T[i, t-1]*P(i to i+1)*P(i+1 | o_t)                          // transition to the next line
 *   [Case 2] T[i, t-1]*P(i to somewhere u within [i-10, i+10])*P(u | o_t) // transition to somewhere nearby
 *
 * At this point, if we've already found a confident solution (indicated by some threshold), we can stop here.
 * Notice that the runtime is O(M), where M is the maximum size of possibleCursors and is a constant!
 *
 * If we don't find a confident solution after case 1 and 2, we have to search every sentence:
 * For all k that P(k | o_t) is large, consider:
 *   [Case 3] max(T[*, t-1])*P(i to k)*P(k | o_t)                          // a long jump from the most likely cursor
 *
 * The runtime is now O(MK). (linear)
 */
#endregion
public string Translate(string sourceText, string desLang, string srcLang)
{
    //sourceText = addNoise(addNoise2(sourceText)); //The translator is able to find the correct match on hook mode under a high noise
    Console.WriteLine(String.Format("Input:{0}", sourceText));
    if (jp_text.Count == 0)
    {
        return ("补丁为空");
    }
    // Bound the input length so similarity runs on a fixed-size prefix.
    if (sourceText.Length >= R_MAX_LEN)
    {
        sourceText = sourceText.Substring(0, R_MAX_LEN - 1);
    }
    double pMostLikelyPrevCursor = possibleCursors.Count == 0 ?
        1.0 / MAX_CURSOR : possibleCursors.Max(i => i.Value);
    CursorPriorityQueue nextCursorsPQ = new CursorPriorityQueue(MAX_CURSOR);
    var maxPSeq = 0.0;
    var maxPSkip = 0.0;
    if (possibleCursors.Count > 0)
    {
        // [Case 1] and [Case 2]: only look near each tracked cursor — O(M) work.
        foreach (KeyValuePair<int, double> cursor in possibleCursors)
        {
            int pivot = cursor.Key + 1; // expected next line after this cursor
            // NOTE(review): this window is half-open [pivot-D, pivot+D), so the farthest
            // forward candidate is one line closer than the farthest backward one —
            // confirm whether "<= pivot + shortJumpMaxDistance" was intended.
            for (int u = pivot - shortJumpMaxDistance; u < pivot + shortJumpMaxDistance; u++)
            {
                if (u >= 0 && u < jp_text.Count)
                {
                    double s = ComputeSimilarity(sourceText, jp_text[u]);
                    // [Case 1] when u is exactly the next line, else [Case 2] short jump.
                    double pTransition = u == pivot ?
                        pTransitionNext : pTransitionShortJump;
                    double pSequential = cursor.Value * pTransition * s;
                    maxPSeq = Math.Max(maxPSeq, pSequential);
                    nextCursorsPQ.Add(u, pSequential);
                }
            }
        }
    }
    // [Case 3] do a full search only if we are not confident after case 1 and 2.
    if (maxPSeq < pFullSearchThresh)
    {
        for (int i = 0; i < jp_text.Count; i++)
        {
            double s = ComputeSimilarity(sourceText, jp_text[i]);
            double pSkip = pMostLikelyPrevCursor * pTransitionLongJump * s;
            maxPSkip = Math.Max(maxPSkip, pSkip);
            nextCursorsPQ.Add(i, pSkip);
        }
    }
    Console.WriteLine("maxPSeq = {0}", maxPSeq);
    Console.WriteLine("maxPSkip = {0}", maxPSkip);
    // Softmax over the retained candidates.
    List<int> nextCursorsIdx = nextCursorsPQ.Indices().ToList();
    var z = nextCursorsPQ.Values();
    double z_sum = z.Sum();
    if (z_sum <= 0.0)
    {
        // Every candidate scored zero: normalizing would divide by zero and fill
        // possibleCursors with NaN, corrupting every subsequent call. Reset instead.
        possibleCursors.Clear();
        return ("无匹配文本");
    }
    var z_norm = z.Select(i => i / z_sum); // extra normalization keeps exp() well-scaled
    var z_exp = z_norm.Select(i => Math.Exp(SoftmaxCoeff * i));
    double sum_z_exp = z_exp.Sum();
    List<double> z_softmax = z_exp.Select(i => i / sum_z_exp).ToList();
    // Rebuild possibleCursors; a state reached via several cases keeps its best score.
    possibleCursors.Clear();
    for (int i = 0; i < z_softmax.Count; i++)
    {
        int j = nextCursorsIdx[i];
        if (possibleCursors.TryGetValue(j, out double existing))
        {
            possibleCursors[j] = Math.Max(existing, z_softmax[i]);
        }
        else
        {
            possibleCursors[j] = z_softmax[i];
        }
    }
    int maxI = 0;
    double maxP = 0.0;
    foreach (KeyValuePair<int, double> cursor in possibleCursors)
    {
        if (maxP < cursor.Value)
        {
            maxP = cursor.Value;
            maxI = cursor.Key;
        }
        //For debug
        Console.WriteLine(String.Format("{0}:{1}", cursor.Key, jp_text[cursor.Key]));
        Console.WriteLine(cursor.Value);
    }
    if (possibleCursors.Count == 0 || Math.Max(maxPSkip, maxPSeq) < minConfidenceThresh)
    {
        return ("无匹配文本");
    }
    Console.WriteLine(String.Format("[{0}:{1}]", maxI, jp_text[maxI]));
    Console.WriteLine("------");
    // Fall back to the Japanese line when no translation is stored for this index.
    return (cn_text[maxI] == "" ? jp_text[maxI] : cn_text[maxI]);
}