        /// <summary>
        /// Translate a single sentence.
        /// </summary>
        /// <param name="sourceText">source text</param>
        /// <param name="desLang">target language</param>
        /// <param name="srcLang">source language</param>
        /// <returns>The translated sentence; if the translation fails an empty result is returned, and the error can be retrieved via GetLastError</returns>
        #region Explanation

        /*
         * An Approximated Viterbi Algorithm for sparse model
         * Author: jsc723
         *
         * Our problem can be modeled as an HMM, and we can apply the Viterbi algorithm to decode the best
         * match for each timestep.
         * Let T[i, t] store the probability at time t that the most likely sequence so far ends at state i
         * (i.e. the most likely sequence at time t is (x_1, x_2, ... , x_t = i)).
         * Assume there are K possible sentences (states).
         *
         * We use the following transition model:
         *     P(transition from state i to j) = (1-pTransitionSkip) * v   if j == i + 1
         *                                     = pTransitionSkip * v       otherwise
         *                                     where (1-pTransitionSkip)*v + (K-1)*pTransitionSkip*v = 1 and v is a normalization factor
         *     (Assume K >= 2)
         *     Thanks to the normalization we can simply work with the unscaled weights (1-pTransitionSkip) and pTransitionSkip and drop v.
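         *
         *     (A small worked instance of this normalization, with made-up numbers: take K = 4 and
         *      pTransitionSkip = 0.1. Then (1-0.1)*v + (4-1)*0.1*v = 1 gives v = 1/1.2, so
         *      P(i to i+1) = 0.75 and P(i to any other j) = 1/12. Since Viterbi only compares products
         *      and the scores are re-normalized by a softmax later, the unscaled weights 0.9 and 0.1
         *      rank candidates identically, which is why v can be dropped.)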
         *
         *
         * Initial probabilities P(i) = 1.0 / K
         *     which is the same for all states, so we can use 1.0 due to normalization
         *
         * P(state = i | observation at t) = ComputeSimilarity(jp_text[i], sourceText)
         *     see the implementation below for details
         *
         * A forward step in Viterbi algorithm at time t can be described as
         * for each state i = {1, 2, ..., K} do
         *     T[i, t] <- max(k)(T[k, t-1] * P(transition from state k to i) * P(state = i | observation at t))
         *
         * This requires O(K^2) work per step, but K > 30000 in our case, so exact decoding is too slow.
         * So we need to approximate:
         *     We only consider candidates for which at least two of {T[i, t-1], P(transition from state k to i), P(state = i | observation at t)} are large.
         *     Let possibleCursors be the set of states with large T[i, t-1], normalized so that sum(possibleCursors) == 0.8.
         *     For each T[i, t-1] in possibleCursors, we consider
         *          T[i, t-1]*P(i to i+1)*P(i+1 | o_t)  [Case 1]
         *              and
         *          (
         *              for all k such that P(k | o_t) is large: T[i, t-1]*P(i to not i+1)*P(k | o_t),
         *              which is covered more efficiently by the next case, so it is skipped here
         *          )
         *     For all k such that P(k | o_t) is large:
         *          max(T[*, t-1])*P(i to not i+1)*P(k | o_t)  [Case 2]
         *              and
         *          (
         *              T[k-1, t-1]*P(k-1 to k)*P(k | o_t)
         *              where T[k-1, t-1] == 0.2 / (K - possibleCursors.Count) if k-1 is not in possibleCursors
         *                                == possibleCursors[k-1]              if k-1 is in possibleCursors
         *              However, if k-1 is not in possibleCursors, T[k-1, t-1] is extremely small, and if
         *              k-1 is in possibleCursors, it is already covered by the previous case, so we can simply skip this sub-case.
         *          )
         *
         * The runtime is now O(MK) where M is the maximum size of possibleCursors and is a constant.
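         *
         * (For scale: with K around 30000, an exact forward step needs on the order of
         *  K^2 = 9 * 10^8 transition evaluations per input line, while the approximation stays
         *  linear in K, i.e. a few tens of thousands of similarity evaluations per line.)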
         *
         */
        #endregion
        public string Translate(string sourceText, string desLang, string srcLang)
        {
            //sourceText = addNoise(addNoise2(sourceText)); // the translator can still find the correct match in hook mode even under heavy noise
            Console.WriteLine(String.Format("Input:{0}", sourceText));
            if (jp_text.Count == 0)
            {
                return("No translation available");
            }

            if (sourceText.Length >= R_MAX_LEN)
            {
                sourceText = sourceText.Substring(0, R_MAX_LEN - 1);
            }

            double pMostLikelyPrevCursor      = possibleCursors.Count == 0 ? 1.0 / pTransitionNext : possibleCursors.Max(i => i.Value);
            CursorPriorityQueue nextCursorsPQ = new CursorPriorityQueue(MAX_CURSOR);

            for (int i = 0; i < jp_text.Count; i++)
            {
                double s = ComputeSimilarity(sourceText, jp_text[i]);
                if (possibleCursors.ContainsKey(i - 1)) // [Case 1]
                {
                    double pSequantial = possibleCursors[i - 1] * pTransitionNext * s;
                    nextCursorsPQ.Add(i, pSequantial);
                }
                double pSkip = pMostLikelyPrevCursor * pTransitionSkip * s; //[Case 2]
                nextCursorsPQ.Add(i, pSkip);
            }
            //Softmax on next cursors
            List <int>    nextCursorsIdx = nextCursorsPQ.Indices().ToList();
            var           z         = nextCursorsPQ.Values();
            double        z_sum     = z.Sum();
            var           z_norm    = z.Select(i => i / z_sum); //take an extra normalization
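            // (Presumably the point of the extra normalization above: dividing by z_sum puts every value in
            // [0, 1], so the Exp() arguments below are bounded by SoftmaxCoeff regardless of how large or
            // small the raw candidate scores are, making the softmax temperature scale-independent.)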
            var           z_exp     = z_norm.Select(i => Math.Exp(SoftmaxCoeff * i));
            double        sum_z_exp = z_exp.Sum();
            List <double> z_softmax = z_exp.Select(i => i / sum_z_exp).ToList();

            possibleCursors.Clear();

            for (int i = 0; i < z_softmax.Count; i++)
            {
                int j = nextCursorsIdx[i];
                if (possibleCursors.ContainsKey(j))
                {
                    possibleCursors[j] = Math.Max(possibleCursors[j], z_softmax[i]);
                }
                else
                {
                    possibleCursors[j] = z_softmax[i];
                }
            }
            int    maxI = 0;
            double maxP = 0.0;

            foreach (int k in possibleCursors.Keys)
            {
                if (maxP < possibleCursors[k])
                {
                    maxP = possibleCursors[k];
                    maxI = k;
                }
                //For debug
                Console.WriteLine(String.Format("{0}:{1}", k, jp_text[k]));
                Console.WriteLine(possibleCursors[k]);
            }
            if (possibleCursors.Count == 0)
            {
                return("无匹配文本");
            }
            Console.WriteLine(String.Format("[{0}:{1}]", maxI, jp_text[maxI]));
            Console.WriteLine("------");
            return(cn_text[maxI] == "" ? jp_text[maxI] : cn_text[maxI]);
        }
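
        // A minimal usage sketch (hypothetical: the surrounding translator class, its construction, and how
        // jp_text / cn_text are populated are not shown here, so the names below are assumptions):
        //
        //     var translator = new Translator();                     // assumed to load jp_text and cn_text
        //     string line = translator.Translate("「……そうか」", "zh-CN", "ja");
        //     Console.WriteLine(line);   // matched Chinese line, or "无匹配文本" when no confident match is found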
Example #2
        /// <summary>
        /// Translate a single sentence.
        /// </summary>
        /// <param name="sourceText">source text</param>
        /// <param name="desLang">target language</param>
        /// <param name="srcLang">source language</param>
        /// <returns>The translated sentence; if the translation fails an empty result is returned, and the error can be retrieved via GetLastError</returns>
        #region Explanation

        /*
         * An Approximated Viterbi Algorithm for sparse model
         * Author: jsc723
         *
         * Our problem can be modeled as an HMM, and we can apply the Viterbi algorithm to decode the best
         * match for each timestep.
         * Let T[i, t] store the probability at time t that the most likely sequence so far ends at state i
         * (i.e. the most likely sequence at time t is (x_1, x_2, ... , x_t = i)).
         * Assume there are K possible sentences (states).
         *
         * We use the following transition model:
         *     P(transition from state i to j) = pNext * v       if j == i + 1  // go to next sentence
         *                                     = pShortJump * v  if j != i + 1 and abs(j - i - 1) < 10 // jump to somewhere nearby
         *                                     = pLongJump * v   otherwise  // jump to somewhere faraway (e.g. load game data)
         *          where pNext*v + 20*pShortJump*v + (K-21)*pLongJump*v == 1, v is a normalization factor
         *     (Assume K >> 20)
         *     Notice that we can choose any (pNext, pShortJump, pLongJump) due to normalization.
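         *
         *     (A worked instance with made-up numbers: pick pNext = 0.9, pShortJump = 0.05,
         *      pLongJump = 0.001 and K = 1000. Then 0.9*v + 20*0.05*v + 979*0.001*v = 2.879*v = 1,
         *      so v is roughly 0.347 and the actual probabilities are about 0.31 for the next line,
         *      0.017 per nearby line and 0.00035 per faraway line. Only the ratios 0.9 : 0.05 : 0.001
         *      matter to the decoder, so the code below keeps the unnormalized pTransitionNext,
         *      pTransitionShortJump and pTransitionLongJump.)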
         *
         *
         * Initial probabilities P(i) = 1.0 / K
         *     which is the same for all states, so we can use 1.0 due to normalization
         *
         * P(state = i | observation at t) = ComputeSimilarity(jp_text[i], sourceText)
         *     see the implementation below for details
         *
         * A forward step in Viterbi algorithm at time t can be described as
         * for each state i = {1, 2, ..., K} do
         *     T[i, t] <- max(k)(T[k, t-1] * P(transition from state k to i) * P(state = i | observation at t))
         *
         * This requires O(K^2) work per step, but K > 30000 in our case, so exact decoding is too slow.
         * So we need to approximate:
         *     We only consider candidates for which at least one of {T[i, t-1], P(state = k | observation at t)} is large.
         *     Let possibleCursors be the set of states with large T[i, t-1], normalized so that sum(possibleCursors) == 0.8,
         *     which means I'm 80% sure the previous state is one of the states in possibleCursors.
         *     For each T[i, t-1] in possibleCursors, we consider
         *       [Case 1]
         *          T[i, t-1]*P(i to i+1)*P(i+1 | o_t)  // transition to the next line
         *       [Case 2]
         *          T[i, t-1]*P(i to some nearby u within [i+1-10, i+1+10])*P(u | o_t)  // transition to somewhere nearby
         *
         *     At this point, if we have already found a confident solution (indicated by some threshold), we can stop here.
         *     Notice that the runtime so far is O(M), where M is the maximum size of possibleCursors and is a constant!
         *
         *     If we don't find a confident solution after Cases 1 and 2, we fall back to searching over every sentence:
         *     For all k such that P(k | o_t) is large, consider:
         *        [Case 3]
         *          max(T[*, t-1])*P(long jump to k)*P(k | o_t)  // a long jump from the most likely cursor
         *
         *     The runtime is now O(MK), i.e. linear in K.
         *
         */
        #endregion
        public string Translate(string sourceText, string desLang, string srcLang)
        {
            //sourceText = addNoise(addNoise2(sourceText)); // the translator can still find the correct match in hook mode even under heavy noise
            Console.WriteLine(String.Format("Input:{0}", sourceText));
            if (jp_text.Count == 0)
            {
                return("补丁为空");
            }

            if (sourceText.Length >= R_MAX_LEN)
            {
                sourceText = sourceText.Substring(0, R_MAX_LEN - 1);
            }

            double pMostLikelyPrevCursor      = possibleCursors.Count == 0 ? 1.0 / MAX_CURSOR : possibleCursors.Max(i => i.Value);
            CursorPriorityQueue nextCursorsPQ = new CursorPriorityQueue(MAX_CURSOR);

            var maxPSeq  = 0.0;
            var maxPSkip = 0.0;

            if (possibleCursors.Count > 0)
            {
                foreach (int k in possibleCursors.Keys)
                {
                    int pivot = k + 1;
                    for (int u = pivot - shortJumpMaxDistance; u < pivot + shortJumpMaxDistance; u++)
                    {
                        if (u >= 0 && u < jp_text.Count)
                        {
                            double s = ComputeSimilarity(sourceText, jp_text[u]);
                            int    d = Math.Abs(u - pivot);
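                            // note: d (distance from the pivot) is computed but not used below; presumably a
                            // leftover from an earlier distance-weighted transition probability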
                            // [Case 1] or [Case 2]
                            double pTransition = u == pivot ? pTransitionNext : pTransitionShortJump;
                            double pSequantial = possibleCursors[k] * pTransition * s;
                            maxPSeq = Math.Max(maxPSeq, pSequantial);
                            nextCursorsPQ.Add(u, pSequantial);
                        }
                    }
                }
            }
            // do a full search only if we are not confident after case 1 and 2
            if (maxPSeq < pFullSearchThresh)
            {
                for (int i = 0; i < jp_text.Count; i++)
                {
                    double s     = ComputeSimilarity(sourceText, jp_text[i]);
                    double pSkip = pMostLikelyPrevCursor * pTransitionLongJump * s; //[Case 3]
                    maxPSkip = Math.Max(maxPSkip, pSkip);
                    nextCursorsPQ.Add(i, pSkip);
                }
            }
            Console.WriteLine("maxPSeq = {0}", maxPSeq);
            Console.WriteLine("maxPSkip = {0}", maxPSkip);
            //Softmax on next cursors
            List <int>    nextCursorsIdx = nextCursorsPQ.Indices().ToList();
            var           z         = nextCursorsPQ.Values();
            double        z_sum     = z.Sum();
            var           z_norm    = z.Select(i => i / z_sum); //take an extra normalization
            var           z_exp     = z_norm.Select(i => Math.Exp(SoftmaxCoeff * i));
            double        sum_z_exp = z_exp.Sum();
            List <double> z_softmax = z_exp.Select(i => i / sum_z_exp).ToList();

            possibleCursors.Clear();

            for (int i = 0; i < z_softmax.Count; i++)
            {
                int j = nextCursorsIdx[i];
                if (possibleCursors.ContainsKey(j))
                {
                    possibleCursors[j] = Math.Max(possibleCursors[j], z_softmax[i]);
                }
                else
                {
                    possibleCursors[j] = z_softmax[i];
                }
            }
            int    maxI = 0;
            double maxP = 0.0;

            foreach (int k in possibleCursors.Keys)
            {
                if (maxP < possibleCursors[k])
                {
                    maxP = possibleCursors[k];
                    maxI = k;
                }
                //For debug
                Console.WriteLine(String.Format("{0}:{1}", k, jp_text[k]));
                Console.WriteLine(possibleCursors[k]);
            }
            if (possibleCursors.Count == 0 || Math.Max(maxPSkip, maxPSeq) < minConfidenceThresh)
            {
                return("无匹配文本");
            }
            Console.WriteLine(String.Format("[{0}:{1}]", maxI, jp_text[maxI]));
            Console.WriteLine("------");
            return(cn_text[maxI] == "" ? jp_text[maxI] : cn_text[maxI]);
        }
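
A self-contained sketch of one approximated forward step, condensing the explanation above (Cases 1-3 plus the
softmax re-normalization) into a toy program. Everything in it is made up for illustration: the sentence list,
the character-overlap Similarity stand-in for ComputeSimilarity, and the constants are assumptions rather than
the translator's actual data, similarity function, or tuning, and the MaxCursors cap plays the role of
MAX_CURSOR without the priority queue.

using System;
using System.Collections.Generic;
using System.Linq;

static class ApproxViterbiSketch
{
    const double PNext = 0.9, PShortJump = 0.05, PLongJump = 0.001; // unnormalized transition weights
    const int    ShortJumpMaxDistance = 2;                          // "nearby" window around the next line
    const int    MaxCursors = 3;                                    // cap on surviving candidates per step
    const double SoftmaxCoeff = 10.0;

    // Crude stand-in for ComputeSimilarity: fraction of characters of a that also occur in b.
    static double Similarity(string a, string b) =>
        a.Length == 0 ? 0.0 : a.Count(c => b.IndexOf(c) >= 0) / (double)a.Length;

    // One approximate forward step: 'cursors' maps sentence index -> belief from the previous step.
    static Dictionary<int, double> Step(IReadOnlyList<string> sentences,
                                        Dictionary<int, double> cursors, string observed)
    {
        var scores = new Dictionary<int, double>();
        void Propose(int i, double p) => scores[i] = Math.Max(scores.GetValueOrDefault(i), p);

        double maxPrev = cursors.Count == 0 ? 1.0 : cursors.Values.Max();

        // Cases 1 and 2: from each likely previous cursor, move to the next line or to a nearby line.
        foreach (var (k, pPrev) in cursors)
        {
            int pivot = k + 1;
            for (int u = pivot - ShortJumpMaxDistance; u <= pivot + ShortJumpMaxDistance; u++)
            {
                if (u < 0 || u >= sentences.Count) continue;
                double pTrans = u == pivot ? PNext : PShortJump;
                Propose(u, pPrev * pTrans * Similarity(observed, sentences[u]));
            }
        }

        // Case 3: a long jump from the most likely previous cursor to any sentence at all.
        for (int i = 0; i < sentences.Count; i++)
            Propose(i, maxPrev * PLongJump * Similarity(observed, sentences[i]));

        // Keep only the strongest few candidates (the real code bounds this with a priority queue).
        var kept = scores.OrderByDescending(kv => kv.Value).Take(MaxCursors).ToList();

        // Softmax over the survivors, mirroring the normalize-then-exponentiate step above.
        double sum = kept.Sum(kv => kv.Value);
        if (sum <= 0) return new Dictionary<int, double>();
        var exps = kept.Select(kv => (Key: kv.Key, Val: Math.Exp(SoftmaxCoeff * kv.Value / sum))).ToList();
        double expSum = exps.Sum(e => e.Val);
        return exps.ToDictionary(e => e.Key, e => e.Val / expSum);
    }

    static void Main()
    {
        var sentences = new List<string> { "good morning", "how are you", "see you tomorrow", "load game" };
        var cursors   = new Dictionary<int, double>();         // no belief before the first observation

        cursors = Step(sentences, cursors, "how are yu");       // noisy observation of sentence 1
        cursors = Step(sentences, cursors, "see yu tomorow");   // noisy observation of sentence 2
        int best = cursors.OrderByDescending(kv => kv.Value).First().Key;
        Console.WriteLine($"best match: [{best}] {sentences[best]}");   // expected: [2] see you tomorrow
    }
}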