/// <summary>Builds a word-level answer lattice from a tag lattice and prints it.</summary>
        /// <remarks>
        /// A fresh answer lattice with a single initial state is created, filled in by
        /// <c>TagLatticeToAnswerLattice</c> starting at the initial state of
        /// <paramref name="tagLattice"/>, and then written through <c>PrintAttFsmFormat</c>.
        /// </remarks>
        /// <param name="tagLattice">Search graph whose transitions are walked to build the answer lattice.</param>
        /// <param name="doc">Labels of the document; copied into an array that is indexed by position.</param>
        /// <param name="out">Writer that receives the printed lattice.</param>
        /// <exception cref="System.Exception">Wraps an <see cref="IOException"/> raised while printing.</exception>
        public virtual void PrintLattice(DFSA <string, int> tagLattice, IList <CoreLabel> doc, PrintWriter @out)
        {
            CoreLabel[] docArray = Sharpen.Collections.ToArray(doc, new CoreLabel[doc.Count]);

            // Create answer lattice:
            MutableInteger          nodeId        = new MutableInteger(0);
            DFSA <string, int>      answerLattice = new DFSA <string, int>(null);
            DFSAState <string, int> aInitState    = new DFSAState <string, int>(nodeId, answerLattice);

            answerLattice.SetInitialState(aInitState);

            // Links search-graph states to the answer-lattice states created for them.
            IDictionary <DFSAState <string, int>, DFSAState <string, int> > stateLinks = Generics.NewHashMap();

            // Convert binary lattice into word lattice:
            TagLatticeToAnswerLattice(tagLattice.InitialState(), aInitState, new StringBuilder(string.Empty), nodeId, 0, 0.0, stateLinks, answerLattice, docArray);
            try
            {
                answerLattice.PrintAttFsmFormat(@out);
            }
            catch (IOException e)
            {
                // System.Exception has no constructor that takes only an exception, so pass
                // the message explicitly and keep the I/O failure as the inner exception.
                throw new Exception(e.Message, e);
            }
        }
        /// <summary>
        /// Recursively builds an answer lattice (Chinese words) from a Viterbi search graph
        /// of binary predictions.
        /// </summary>
        /// <remarks>
        /// This function does a limited amount of post-processing:
        /// preserve white spaces of the input, and not segment between two latin characters or
        /// between two digits. Consequently, the probabilities of all paths in answerLattice
        /// may not sum to 1 (they do sum to 1 if no post processing applies).
        /// </remarks>
        /// <param name="tSource">Current node in Viterbi search graph.</param>
        /// <param name="aSource">Current node in answer lattice.</param>
        /// <param name="answer">Partial word starting at aSource.</param>
        /// <param name="nodeId">Currently unused node identifier for answer graph.</param>
        /// <param name="pos">Current position in docArray.</param>
        /// <param name="cost">Current cost of answer.</param>
        /// <param name="stateLinks">
        /// Maps nodes of the search graph to nodes in answer lattice
        /// (when paths of the search graph are recombined, paths of the answer lattice should be
        /// recombined as well, if at word boundary).
        /// </param>
        /// <param name="answerLattice">Lattice that owns the answer states created here.</param>
        /// <param name="docArray">Labels of the input, indexed by <paramref name="pos"/>.</param>
        private void TagLatticeToAnswerLattice(DFSAState <string, int> tSource, DFSAState <string, int> aSource, StringBuilder answer, MutableInteger nodeId, int pos, double cost, IDictionary <DFSAState <string, int>, DFSAState <string, int> > stateLinks,
                                               DFSA <string, int> answerLattice, CoreLabel[] docArray)
        {
            // Add "1" prediction after the end of the sentence, if applicable:
            if (tSource.IsAccepting() && tSource.ContinuingInputs().IsEmpty())
            {
                tSource.AddTransition(new DFSATransition <string, int>(string.Empty, tSource, new DFSAState <string, int>(-1, null), "1", string.Empty, 0));
            }
            // Get current label, character, and prediction:
            CoreLabel curLabel  = (pos < docArray.Length) ? docArray[pos] : null;
            string    curChr    = null;
            string    origSpace = null;

            if (curLabel != null)
            {
                curChr = curLabel.Get(typeof(CoreAnnotations.OriginalCharAnnotation));
                System.Diagnostics.Debug.Assert((curChr.Length == 1));
                origSpace = curLabel.Get(typeof(CoreAnnotations.SpaceBeforeAnnotation));
            }
            // Get set of successors in search graph:
            ICollection <string> inputs = tSource.ContinuingInputs();
            // Only keep most probable transition out of initial state:
            string answerConstraint = null;

            if (pos == 0)
            {
                double minCost = double.PositiveInfinity;
                foreach (string predictSpace in inputs)
                {
                    double transitionCost = tSource.Transition(predictSpace).Score();
                    if (transitionCost < minCost && predictSpace != null)
                    {
                        // .NET composite formatting: Java-style %s/%e/%n specifiers are not interpreted.
                        logger.Info(string.Format("mincost ({0}): {1:e} -> {2:e}{3}", predictSpace, minCost, transitionCost, Environment.NewLine));
                        minCost          = transitionCost;
                        answerConstraint = predictSpace;
                    }
                }
            }
            // Follow along each transition:
            foreach (string predictSpace_1 in inputs)
            {
                DFSATransition <string, int> transition = tSource.Transition(predictSpace_1);
                DFSAState <string, int>      tDest      = transition.Target();
                DFSAState <string, int>      newASource = aSource;
                // Work on a copy so sibling transitions all start from the same partial word.
                StringBuilder newAnswer = new StringBuilder(answer.ToString());
                int           answerLen = newAnswer.Length;
                // Last character of the partial word (StringBuilder has no Substring).
                string prevChr = (answerLen > 0) ? newAnswer.ToString(answerLen - 1, 1) : null;
                double newCost = cost;
                // Ignore paths starting with zero:
                if (answerConstraint != null && !answerConstraint.Equals(predictSpace_1))
                {
                    logger.Info(string.Format("Skipping transition {0} at pos 0.{1}", predictSpace_1, Environment.NewLine));
                    continue;
                }
                // Ignore paths not consistent with input segmentation:
                if (flags.keepAllWhitespaces && "0".Equals(predictSpace_1) && "1".Equals(origSpace))
                {
                    logger.Info(string.Format("Skipping non-boundary at pos {0}, since space in the input.{1}", pos, Environment.NewLine));
                    continue;
                }
                // Ignore paths adding segment boundaries between two latin characters, or between two digits:
                // (unless already present in original input)
                if ("1".Equals(predictSpace_1) && "0".Equals(origSpace) && prevChr != null && curChr != null)
                {
                    char p = prevChr[0];
                    char c = curChr[0];
                    if (ChineseStringUtils.IsLetterASCII(p) && ChineseStringUtils.IsLetterASCII(c))
                    {
                        logger.Info(string.Format("Not hypothesizing a boundary at pos {0}, since between two ASCII letters ({1} and {2}).{3}", pos, prevChr, curChr, Environment.NewLine));
                        continue;
                    }
                    if (ChineseUtils.IsNumber(p) && ChineseUtils.IsNumber(c))
                    {
                        logger.Info(string.Format("Not hypothesizing a boundary at pos {0}, since between two numeral characters ({1} and {2}).{3}", pos, prevChr, curChr, Environment.NewLine));
                        continue;
                    }
                }
                // If predictSpace==1 and a partial word exists, create a new transition in answer search graph:
                if ("1".Equals(predictSpace_1) && newAnswer.Length > 0)
                {
                    DFSAState <string, int> aDest;
                    // If answer destination node visited before, create a new edge and leave:
                    if (stateLinks.TryGetValue(tSource, out aDest))
                    {
                        newASource.AddTransition(new DFSATransition <string, int>(string.Empty, newASource, aDest, newAnswer.ToString(), string.Empty, newCost));
                        continue;
                    }
                    // If answer destination node not visited before, create it + new edge:
                    nodeId.IncValue(1);
                    // NOTE(review): nodeId is handed over as a MutableInteger; confirm the DFSAState
                    // constructor captures its current value rather than the shared counter object.
                    aDest = new DFSAState <string, int>(nodeId, answerLattice, 0.0);
                    stateLinks[tSource] = aDest;
                    newASource.AddTransition(new DFSATransition <string, int>(string.Empty, newASource, aDest, newAnswer.ToString(), string.Empty, newCost));
                    // Reached an accepting state:
                    if (tSource.IsAccepting())
                    {
                        aDest.SetAccepting(true);
                        continue;
                    }
                    // Start new answer edge:
                    newASource = aDest;
                    newAnswer  = new StringBuilder();
                    newCost    = 0.0;
                }
                System.Diagnostics.Debug.Assert((curChr != null));
                newAnswer.Append(curChr);
                newCost += transition.Score();
                // Recurse only while the path cost stays under the pruning threshold, or when the
                // current character is an ASCII letter.
                if (newCost < flags.searchGraphPrune || ChineseStringUtils.IsLetterASCII(curChr[0]))
                {
                    TagLatticeToAnswerLattice(tDest, newASource, newAnswer, nodeId, pos + 1, newCost, stateLinks, answerLattice, docArray);
                }
            }
        }
        /// <summary>Returns the lexicon-based segmentation following heuristic h.</summary>
        /// <remarks>
        /// Note that buildSegmentationLattice must be run first.
        /// Two heuristics are currently available -- MINWORDS and MAXWORDS --
        /// to respectively minimize and maximize the number of segment
        /// (where each segment is a lexicon word, if possible).
        /// The method runs a forward pass over the lattice states that records one
        /// back-pointer transition per state, then walks the back-pointers from the last
        /// state to state 0. Words equal to a single space are dropped from the result.
        /// </remarks>
        /// <param name="h">Heuristic to use for segmentation.</param>
        /// <returns>Segmented sentence.</returns>
        /// <exception cref="System.NotSupportedException">
        /// The lattice has not been built, or <paramref name="h"/> is not an implemented heuristic.
        /// </exception>
        /// <exception cref="System.InvalidOperationException">
        /// A state on the backward walk has no recorded incoming transition.
        /// </exception>
        /// <seealso cref="BuildSegmentationLattice(string)"/>
        public virtual List <Word> SegmentWords(MaxMatchSegmenter.MatchHeuristic h)
        {
            if (lattice == null || len < 0)
            {
                throw new NotSupportedException("buildSegmentationLattice must be run first");
            }
            List <Word> segmentedWords = new List <Word>();

            // Init dynamic programming: state 0 is free, every other state starts unreached.
            double[] costs = new double[len + 1];
            DFSATransition <Word, int>[] bptrs = new DFSATransition <Word, int>[len + 1];

            costs[0] = 0.0;
            for (int i = 1; i <= len; ++i)
            {
                costs[i] = double.MaxValue;
            }
            // DP:
            // NOTE(review): both heuristics compare with costs[start] + 1 but store
            // costs[start] +/- lcost; kept exactly as found -- confirm the asymmetry is intended.
            for (int start = 0; start < len; ++start)
            {
                DFSAState <Word, int> fromState = states[start];
                ICollection <DFSATransition <Word, int> > trs = fromState.Transitions();
                foreach (DFSATransition <Word, int> tr in trs)
                {
                    DFSAState <Word, int> toState = tr.GetTarget();
                    double lcost = tr.Score();
                    int    end   = toState.StateID();
                    if (h == MaxMatchSegmenter.MatchHeuristic.Minwords)
                    {
                        // Minimize number of words:
                        if (costs[start] + 1 < costs[end])
                        {
                            costs[end] = costs[start] + lcost;
                            bptrs[end] = tr;
                        }
                    }
                    else if (h == MaxMatchSegmenter.MatchHeuristic.Maxwords)
                    {
                        // Maximize number of words:
                        if (costs[start] + 1 < costs[end])
                        {
                            costs[end] = costs[start] - lcost;
                            bptrs[end] = tr;
                        }
                    }
                    else
                    {
                        throw new NotSupportedException("unimplemented heuristic");
                    }
                }
            }
            // Extract min-cost path, walking back-pointers from the final state:
            int cur = len;

            while (cur > 0)
            {
                DFSATransition <Word, int> tr = bptrs[cur];
                if (tr == null)
                {
                    // Fail with a clear message instead of dereferencing a missing back-pointer.
                    throw new InvalidOperationException("no segmentation path reaches lattice state " + cur);
                }
                Word word = tr.GetInput();
                if (!word.Word().Equals(" "))
                {
                    segmentedWords.Add(word);
                }
                cur = tr.GetSource().StateID();
            }
            // Words were collected last-to-first; restore sentence order once at the end.
            segmentedWords.Reverse();
            return(segmentedWords);
        }
        // Example #4
        // 0
        /// <summary>Builds a Viterbi search graph (as a DFSA) for a sequence model.</summary>
        /// <remarks>
        /// The graph consists of one start state, one state per (position, tag option) pair and
        /// one accepting end state. Each transition carries the negated window score of the
        /// product it was created for and is labelled with <c>classIndex.Get(tag)</c>.
        /// The method asserts that the model's right window is 0.
        /// </remarks>
        /// <param name="ts">
        /// Sequence model supplying the length, window sizes, possible values per position and scores.
        /// </param>
        /// <param name="classIndex">Maps a tag id to the string attached to each transition.</param>
        /// <returns>The populated search graph.</returns>
        public static DFSA <string, int> GetGraph(ISequenceModel ts, IIndex <string> classIndex)
        {
            DFSA <string, int> viterbiSearchGraph = new DFSA <string, int>(null);
            // Set up tag options
            int length      = ts.Length();
            int leftWindow  = ts.LeftWindow();
            int rightWindow = ts.RightWindow();

            System.Diagnostics.Debug.Assert((rightWindow == 0));
            int padLength = length + leftWindow + rightWindow;

            // NOTE: tags[i][j]  : i is index into pos, and j into product
            int[][] tags   = new int[padLength][];
            int[]   tagNum = new int[padLength];
            for (int pos = 0; pos < padLength; pos++)
            {
                tags[pos]   = ts.GetPossibleValues(pos);
                tagNum[pos] = tags[pos].Length;
            }
            // Set up Viterbi search graph; state ids are handed out sequentially starting at 0.
            int stateId = -1;
            DFSAState <string, int> startState = new DFSAState <string, int>(++stateId, viterbiSearchGraph, 0.0);
            viterbiSearchGraph.SetInitialState(startState);
            // Arrays must be created with the closed generic element type to match the declaration.
            DFSAState <string, int>[][] graphStates = new DFSAState <string, int>[length][];
            for (int pos_1 = 0; pos_1 < length; ++pos_1)
            {
                graphStates[pos_1] = new DFSAState <string, int>[tags[pos_1].Length];
                for (int product = 0; product < tags[pos_1].Length; ++product)
                {
                    graphStates[pos_1][product] = new DFSAState <string, int>(++stateId, viterbiSearchGraph);
                }
            }
            // Accepting state:
            DFSAState <string, int> endState = new DFSAState <string, int>(++stateId, viterbiSearchGraph, 0.0);
            endState.SetAccepting(true);

            int[] tempTags = new int[padLength];
            // Set up product space sizes
            int[] productSizes = new int[padLength];
            int   curProduct   = 1;

            for (int i = 0; i < leftWindow; i++)
            {
                curProduct *= tagNum[i];
            }
            for (int pos_2 = leftWindow; pos_2 < padLength; pos_2++)
            {
                if (pos_2 > leftWindow + rightWindow)
                {
                    // shift off
                    curProduct /= tagNum[pos_2 - leftWindow - rightWindow - 1];
                }
                // shift on
                curProduct *= tagNum[pos_2];
                productSizes[pos_2 - rightWindow] = curProduct;
            }
            double[][] windowScore = new double[padLength][];
            // Score all of each window's options
            for (int pos_3 = leftWindow; pos_3 < leftWindow + length; pos_3++)
            {
                windowScore[pos_3] = new double[productSizes[pos_3]];
                Arrays.Fill(tempTags, tags[0][0]);
                for (int product = 0; product < productSizes[pos_3]; product++)
                {
                    // Decode the product index into one tag per window position.
                    int p     = product;
                    int shift = 1;
                    for (int curPos = pos_3; curPos >= pos_3 - leftWindow; curPos--)
                    {
                        tempTags[curPos] = tags[curPos][p % tagNum[curPos]];
                        p /= tagNum[curPos];
                        if (curPos > pos_3)
                        {
                            shift *= tagNum[curPos];
                        }
                    }
                    // Score only once per window context: when the current position holds its first tag.
                    if (tempTags[pos_3] == tags[pos_3][0])
                    {
                        // get all tags at once
                        double[] scores = ts.ScoresOf(tempTags, pos_3);
                        // fill in the relevant windowScores
                        for (int t = 0; t < tagNum[pos_3]; t++)
                        {
                            windowScore[pos_3][product + t * shift] = scores[t];
                        }
                    }
                }
            }
            // loop over the classification spot
            for (int pos_4 = leftWindow; pos_4 < length + leftWindow; pos_4++)
            {
                // loop over window product types
                for (int product = 0; product < productSizes[pos_4]; product++)
                {
                    if (pos_4 == leftWindow)
                    {
                        // all nodes in the first spot link to startState:
                        int curTag = tags[pos_4][product % tagNum[pos_4]];
                        DFSATransition <string, int> tr = new DFSATransition <string, int>(string.Empty, startState, graphStates[pos_4][product], classIndex.Get(curTag), string.Empty, -windowScore[pos_4][product]);
                        startState.AddTransition(tr);
                    }
                    else
                    {
                        int sharedProduct = product / tagNum[pos_4 + rightWindow];
                        int factor        = productSizes[pos_4] / tagNum[pos_4 + rightWindow];
                        for (int newTagNum = 0; newTagNum < tagNum[pos_4 - leftWindow - 1]; newTagNum++)
                        {
                            int predProduct = newTagNum * factor + sharedProduct;
                            int predTag     = tags[pos_4 - 1][predProduct % tagNum[pos_4 - 1]];
                            int curTag      = tags[pos_4][product % tagNum[pos_4]];
                            DFSAState <string, int> sourceState = graphStates[pos_4 - leftWindow][predTag];
                            // Transitions out of the last row of states lead to the accepting end state.
                            DFSAState <string, int>      destState = (pos_4 - leftWindow + 1 == graphStates.Length) ? endState : graphStates[pos_4 - leftWindow + 1][curTag];
                            DFSATransition <string, int> tr        = new DFSATransition <string, int>(string.Empty, sourceState, destState, classIndex.Get(curTag), string.Empty, -windowScore[pos_4][product]);
                            graphStates[pos_4 - leftWindow][predTag].AddTransition(tr);
                        }
                    }
                }
            }
            return(viterbiSearchGraph);
        }