private WordGraphConfidences ComputeWordGraphConfidences(WordGraph wordGraph)
        {
            double normalizationFactor = LogSpace.Zero;
            var    backwardProbs       = new double[wordGraph.Arcs.Count];

            for (int i = wordGraph.Arcs.Count - 1; i >= 0; i--)
            {
                WordGraphArc arc = wordGraph.Arcs[i];
                double       sum = LogSpace.One;
                if (!wordGraph.FinalStates.Contains(arc.NextState))
                {
                    sum = LogSpace.Zero;
                    foreach (int nextArcIndex in wordGraph.GetNextArcIndices(arc.NextState))
                    {
                        WordGraphArc nextArc = wordGraph.Arcs[nextArcIndex];
                        sum = LogSpace.Add(sum, LogSpace.Multiple(nextArc.Score, backwardProbs[nextArcIndex]));
                    }
                }
                backwardProbs[i] = sum;
                if (arc.PrevState == WordGraph.InitialState)
                {
                    normalizationFactor = LogSpace.Add(normalizationFactor,
                                                       LogSpace.Multiple(arc.Score, backwardProbs[i]));
                }
            }

            var rawWpps      = new Dictionary <string, Dictionary <int, double> >();
            var forwardProbs = new (double Prob, int Index)[wordGraph.Arcs.Count];
Exemplo n.º 2
0
        /// <summary>
        /// Walks the arcs once, in index order, and records for every state reached from
        /// <paramref name="state"/> the highest accumulated score together with the arc that produced it.
        /// </summary>
        /// <param name="state">The state the scan starts from.</param>
        /// <param name="prevScores">
        /// Receives one score per state. States that are never reached hold <c>LogSpace.Zero</c>.
        /// Receives an empty array when the graph is empty.
        /// </param>
        /// <param name="stateBestPrevArcs">
        /// Receives, per state, the index of the arc that yielded the best score. Entries that are never
        /// improved keep the array default of 0. Receives an empty array when the graph is empty.
        /// </param>
        private void ComputePrevScores(int state, out double[] prevScores, out int[] stateBestPrevArcs)
        {
            if (IsEmpty)
            {
                prevScores        = new double[0];
                stateBestPrevArcs = new int[0];
                return;
            }

            double[] scores   = Enumerable.Repeat(LogSpace.Zero, StateCount).ToArray();
            var      bestArcs = new int[StateCount];

            // The start state is seeded with the graph's initial score when it is the initial state,
            // otherwise with 0 (presumably the log-domain value for probability 1 -- confirm against LogSpace).
            scores[state] = state == InitialState ? InitialStateScore : 0;

            var reachable = new HashSet <int>();
            reachable.Add(state);

            // NOTE(review): a single pass only works if arcs entering a state appear before arcs
            // leaving it -- confirm that Arcs is ordered that way.
            for (int arcIndex = 0; arcIndex < Arcs.Count; arcIndex++)
            {
                WordGraphArc arc    = Arcs[arcIndex];
                int          source = arc.PrevState;
                int          target = arc.NextState;

                if (!reachable.Contains(source))
                {
                    // The arc starts outside the reachable set; if its target has not been reached
                    // either, make sure the target carries no score.
                    if (!reachable.Contains(target))
                    {
                        scores[target] = LogSpace.Zero;
                    }
                    continue;
                }

                double candidate = LogSpace.Multiple(arc.Score, scores[source]);
                if (candidate > scores[target])
                {
                    scores[target]   = candidate;
                    bestArcs[target] = arcIndex;
                }
                reachable.Add(target);
            }

            prevScores        = scores;
            stateBestPrevArcs = bestArcs;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Computes one score per state by sweeping the arcs from last to first and keeping, for each
        /// arc's source state, the largest value of the arc score combined with the score already held
        /// by the arc's target state.
        /// </summary>
        /// <returns>
        /// A sequence with <c>StateCount</c> entries. Final states start at <c>InitialStateScore</c>;
        /// every other state starts at <c>LogSpace.Zero</c>.
        /// </returns>
        public IEnumerable <double> ComputeRestScores()
        {
            double[] scores = Enumerable.Repeat(LogSpace.Zero, StateCount).ToArray();

            // Seed every final state before propagating backwards.
            foreach (int finalState in _finalStates)
            {
                scores[finalState] = InitialStateScore;
            }

            // NOTE(review): the reverse sweep presumably relies on arcs being stored so that an arc's
            // target state is finished before the arc itself is visited -- confirm.
            int arcIndex = Arcs.Count;
            while (--arcIndex >= 0)
            {
                WordGraphArc arc       = Arcs[arcIndex];
                int          source    = arc.PrevState;
                double       candidate = LogSpace.Multiple(arc.Score, scores[arc.NextState]);
                if (candidate > scores[source])
                {
                    scores[source] = candidate;
                }
            }

            return scores;
        }
Exemplo n.º 4
0
        /// <summary>
        /// Prunes a lexical table file in place. All entries are read into memory, grouped by their
        /// first index, and within each group sorted by descending third field (read as <c>numer</c>
        /// and treated as a log-domain value). Entries are kept until the first one whose normalized
        /// probability <c>exp(numer - groupTotal)</c> falls below <paramref name="threshold"/>; the
        /// kept entries are written back with a fourth field recomputed from the kept entries only.
        /// </summary>
        /// <param name="fileName">Path of the table file; it is read completely and then overwritten.</param>
        /// <param name="threshold">Minimum normalized probability an entry needs in order to be kept.</param>
        private static void PruneLexTable(string fileName, double threshold)
        {
            var entries = new List <Tuple <uint, uint, float> >();

#if THOT_TEXT_FORMAT
            using (var reader = new StreamReader(fileName))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    string[] fields = line.Split(' ');
                    entries.Add(Tuple.Create(uint.Parse(fields[0], CultureInfo.InvariantCulture),
                                             uint.Parse(fields[1], CultureInfo.InvariantCulture),
                                             float.Parse(fields[2], CultureInfo.InvariantCulture)));
                }
            }
#else
            using (var reader = new BinaryReader(File.Open(fileName, FileMode.Open)))
            {
                // Stream.Length is a long. Narrowing it to int turns a length of 2 GiB or more into a
                // negative or too-small value, which would end the read loop early and cause the file
                // to be rewritten with missing data, so the position is tracked as a long as well.
                long pos    = 0;
                long length = reader.BaseStream.Length;
                while (pos < length)
                {
                    uint srcIndex = reader.ReadUInt32();
                    pos += sizeof(uint);
                    uint trgIndex = reader.ReadUInt32();
                    pos += sizeof(uint);
                    float numer = reader.ReadSingle();
                    pos += sizeof(float);
                    // The fourth field of each record is recomputed below, so its stored value is skipped.
                    reader.ReadSingle();
                    pos += sizeof(float);

                    entries.Add(Tuple.Create(srcIndex, trgIndex, numer));
                }
            }
#endif

#if THOT_TEXT_FORMAT
            using (var writer = new StreamWriter(fileName))
#else
            using (var writer = new BinaryWriter(File.Open(fileName, FileMode.Create)))
#endif
            {
                foreach (IGrouping <uint, Tuple <uint, uint, float> > g in entries.GroupBy(e => e.Item1).OrderBy(grp => grp.Key))
                {
                    // Highest value first, so pruning can stop at the first entry below the threshold.
                    Tuple <uint, uint, float>[] groupEntries = g.OrderByDescending(e => e.Item3).ToArray();

                    // Log-domain total over the whole group, used to normalize each entry.
                    double lcSrc = groupEntries.Select(e => e.Item3).Skip(1)
                                   .Aggregate((double)groupEntries[0].Item3, (a, n) => LogSpace.Add(a, n));

                    // -99999 is the starting value for the accumulated total of the kept entries
                    // (presumably a stand-in for log(0) -- confirm against the table format).
                    double newLcSrc = -99999;
                    int    count    = 0;
                    foreach (Tuple <uint, uint, float> entry in groupEntries)
                    {
                        double prob = Math.Exp(entry.Item3 - lcSrc);
                        if (prob < threshold)
                        {
                            break;
                        }
                        newLcSrc = LogSpace.Add(newLcSrc, entry.Item3);
                        count++;
                    }

                    // A group in which no entry reaches the threshold writes nothing.
                    for (int i = 0; i < count; i++)
                    {
#if THOT_TEXT_FORMAT
                        // Format with the invariant culture so the output matches the invariant-culture
                        // parsing in the reader above, whatever the current locale's decimal separator is.
                        writer.Write(string.Format(CultureInfo.InvariantCulture,
                                                   "{0} {1} {2:0.######} {3:0.######}\n", groupEntries[i].Item1,
                                                   groupEntries[i].Item2, groupEntries[i].Item3, newLcSrc));
#else
                        writer.Write(groupEntries[i].Item1);
                        writer.Write(groupEntries[i].Item2);
                        writer.Write(groupEntries[i].Item3);
                        writer.Write((float)newLcSrc);
#endif
                    }
                }
            }
        }
Exemplo n.º 5
0
        /// <summary>
        /// Removes redundant arcs from the word graph.
        /// TODO: This seems to affect the results of an interactive translation session, so don't use it yet.
        /// </summary>
        /// <remarks>
        /// Builds a new graph by processing a work queue of <c>DfaState</c> objects, each of which
        /// wraps a set of <c>NfaState</c> positions in this graph. For every word that can follow a
        /// DFA state, the candidate targets are merged and only the best-scoring path of original arcs
        /// is copied into the result. (The Nfa/Dfa naming suggests a subset-construction-style
        /// determinization -- confirm against the helper types, which are not visible here.)
        /// </remarks>
        /// <returns>The optimized word graph.</returns>
        public WordGraph Optimize()
        {
            var dfaArcs           = new List <WordGraphArc>();
            var dfaStates         = new DfaStateCollection();
            var dfaFinalStates    = new HashSet <int>();
            // Index 0 is taken by the start state enqueued below, so new states are numbered from 1.
            int nextDfaStateIndex = 1;
            // Work queue of DFA states whose outgoing words have not been expanded yet.
            var unmarkedStates    = new Queue <DfaState>();

            unmarkedStates.Enqueue(new DfaState(0, new[] { new NfaState(0) }));

            while (unmarkedStates.Count > 0)
            {
                DfaState dfaState      = unmarkedStates.Dequeue();
                // One candidate per distinct next word reachable from this DFA state.
                var      candidateArcs = new Dictionary <string, DfaArc>();
                foreach ((int arcIndex, NfaState nfaState) in GetArcIndices(dfaState))
                {
                    WordGraphArc arc           = Arcs[arcIndex];
                    int          nextWordIndex = nfaState.WordIndex + 1;
                    // GetOrCreate presumably returns the existing candidate for this word or adds a new one.
                    DfaArc       candidateArc  = candidateArcs.GetOrCreate(arc.Words[nextWordIndex]);
                    if (nextWordIndex == arc.Words.Count - 1)
                    {
                        // This word is the last one on the arc, so the candidate lands on the arc's target state.
                        candidateArc.NfaStates.Add(new NfaState(arc.NextState));

                        // Extend the path that led to this NFA state with the current arc, or start a
                        // new single-arc path at the current DFA state when there is none.
                        Path path;
                        if (dfaState.Paths.TryGetValue(nfaState.StateIndex, out Path prevPath))
                        {
                            path = new Path(prevPath.StartState, prevPath.Arcs.Concat(arcIndex),
                                            LogSpace.Multiply(prevPath.Score, arc.Score));
                        }
                        else
                        {
                            path = new Path(dfaState.Index, new[] { arcIndex }, arc.Score);
                        }

                        // Keep only the highest-scoring path per target state.
                        if (!candidateArc.Paths.TryGetValue(arc.NextState, out Path otherPath) ||
                            path.Score > otherPath.Score)
                        {
                            candidateArc.Paths[arc.NextState] = path;
                        }
                    }
                    else
                    {
                        // More words remain on this arc: record a position inside the arc and flag the
                        // candidate as leading to a sub-state.
                        candidateArc.NfaStates.Add(new NfaState(nfaState.StateIndex, arcIndex, nextWordIndex));
                        candidateArc.IsNextSubState = true;

                        // Carry any existing path forward unchanged; the arc is not finished yet.
                        if (dfaState.Paths.TryGetValue(nfaState.StateIndex, out Path prevPath))
                        {
                            candidateArc.Paths[nfaState.StateIndex] = prevPath;
                        }
                    }
                }

                foreach (DfaArc candidateArc in candidateArcs.Values)
                {
                    // Reuse a registered DFA state with the same NFA-state set, otherwise create one.
                    if (!dfaStates.TryGetValue(candidateArc.NfaStates, out DfaState nextDfaState))
                    {
                        // A sub-state keeps the index of the current DFA state; a full state gets a fresh index.
                        int stateIndex = candidateArc.IsNextSubState ? dfaState.Index : nextDfaStateIndex++;
                        nextDfaState = new DfaState(stateIndex, candidateArc.NfaStates);
                        if (candidateArc.IsNextSubState)
                        {
                            // Sub-states are not registered in dfaStates; they only inherit the paths so far.
                            foreach (KeyValuePair <int, Path> kvp in candidateArc.Paths)
                            {
                                nextDfaState.Paths.Add(kvp);
                            }
                        }
                        else
                        {
                            dfaStates.Add(nextDfaState);
                        }
                        unmarkedStates.Enqueue(nextDfaState);
                    }

                    // The target is final when any of its non-sub-state NFA states is final in this graph.
                    bool isFinal = nextDfaState.NfaStates.Where(s => !s.IsSubState)
                                   .Any(s => FinalStates.Contains(s.StateIndex));
                    if ((isFinal || !candidateArc.IsNextSubState) && candidateArc.Paths.Count > 0)
                    {
                        Path bestPath = candidateArc.Paths.Values.MaxBy(p => p.Score);

                        // Copy each original arc on the best path into the result. Intermediate states
                        // get fresh indices; the last arc targets nextDfaState unless the candidate
                        // leads to a sub-state, in which case it also gets a fresh index.
                        int curState = bestPath.StartState;
                        for (int i = 0; i < bestPath.Arcs.Count; i++)
                        {
                            WordGraphArc nfaArc    = Arcs[bestPath.Arcs[i]];
                            int          nextState = !candidateArc.IsNextSubState && i == bestPath.Arcs.Count - 1
                                                                ? nextDfaState.Index
                                                                : nextDfaStateIndex++;
                            dfaArcs.Add(new WordGraphArc(curState, nextState, nfaArc.Score, nfaArc.Words,
                                                         nfaArc.Alignment, nfaArc.SourceSegmentRange, nfaArc.IsUnknown, nfaArc.WordConfidences));
                            curState = nextState;
                        }
                        if (isFinal)
                        {
                            // curState is the state the last copied arc ends in.
                            dfaFinalStates.Add(curState);
                        }
                    }
                }
            }

            return(new WordGraph(dfaArcs, dfaFinalStates, InitialStateScore));
        }