Exemplo n.º 1
0
        /**
         * Computes path costs for the lattice and returns the best path as a list.
         *
         * @param lattice lattice to search (presumably the result of a build method - confirm with caller)
         * @return the nodes obtained by backtracking from element [0][0] of the end-index array
         */
        public List <ViterbiNode> Search(ViterbiLattice lattice)
        {
            ViterbiNode[][] costedEndNodes = CalculatePathCosts(lattice);

            // Backtracking starts at slot [0][0] of the end-index array
            // (presumably the EOS node - TODO confirm against ViterbiLattice).
            ViterbiNode terminal = costedEndNodes[0][0];

            return BacktrackBestPath(terminal).ToList();
        }
        /**
         * Adds a USER node to the lattice for every user dictionary match found in the text and,
         * when nothing in the lattice connects to the new node on a side, asks for a repair on that side.
         *
         * @param text text that is searched for user dictionary matches
         * @param lattice lattice that receives the user nodes (and any repair nodes)
         */
        private void ProcessUserDictionary(string text, ViterbiLattice lattice)
        {
            foreach (UserDictionary.UserDictionaryMatch hit in userDictionary.FindUserDictionaryMatches(text))
            {
                int    wordId  = hit.GetWordId();
                int    offset  = hit.GetMatchStartIndex();
                int    span    = hit.GetMatchLength();
                string surface = text.Substring(offset, span);

                ViterbiNode userNode = new ViterbiNode(wordId, surface, userDictionary, offset, ViterbiNode.NodeType.USER);

                // Lattice positions are the text offsets shifted by one.
                int latticeStart = offset + 1;
                int latticeEnd   = latticeStart + span;

                lattice.AddNode(userNode, latticeStart, latticeEnd);

                // Nothing ends where the new node starts: patch the left side.
                if (IsLatticeBrokenBefore(latticeStart, lattice))
                {
                    RepairBrokenLatticeBefore(lattice, offset);
                }

                // Nothing starts where the new node ends: patch the right side.
                if (IsLatticeBrokenAfter(latticeEnd, lattice))
                {
                    RepairBrokenLatticeAfter(lattice, latticeEnd);
                }
            }
        }
        /**
         * Looks up every prefix of suffix, shortest first, and adds one KNOWN node to the lattice
         * for each word id the dictionary returns for a positive lookup result.
         *
         * @param lattice lattice that receives the nodes
         * @param startIndex value stored in each node; nodes are added at lattice position startIndex + 1
         * @param suffix string whose prefixes are looked up
         * @return true when at least one prefix gave a positive lookup result
         */
        private bool ProcessIndex(ViterbiLattice lattice, int startIndex, string suffix)
        {
            bool anyMatch     = false;
            int  latticeStart = startIndex + 1;

            for (int prefixLength = 1; prefixLength <= suffix.Length; prefixLength++)
            {
                string candidate = suffix.Substring(0, prefixLength);
                int    lookup    = fst.Lookup(candidate);

                if (lookup < 0)
                {
                    // A negative result stops the scan for this start position.
                    break;
                }

                if (lookup == 0)
                {
                    // Zero adds nothing for this prefix, but longer prefixes are still tried.
                    continue;
                }

                anyMatch = true; // Don't produce unknown word starting from this index
                foreach (int wordId in dictionary.LookupWordIds(lookup))
                {
                    ViterbiNode knownNode = new ViterbiNode(wordId, candidate, dictionary, startIndex, ViterbiNode.NodeType.KNOWN);
                    lattice.AddNode(knownNode, latticeStart, latticeStart + prefixLength);
                }
            }

            return anyMatch;
        }
Exemplo n.º 4
0
        /**
         * Finds the best paths whose cost is at most OPT + costSlack, where OPT is the cost of the
         * optimal solution. At most maxCount paths are returned, ordered by ascending cost.
         *
         * @param lattice  the result of a build method
         * @param maxCount  the maximum number of paths to find
         * @param costSlack  the maximum cost slack of a path
         * @return  MultiSearchResult containing the shortest paths and their costs
         */
        public MultiSearchResult SearchMultiple(ViterbiLattice lattice, int maxCount, int costSlack)
        {
            // Run the cost calculation over the lattice first; its returned array is not needed here.
            CalculatePathCosts(lattice);

            return multiSearcher.GetShortestPaths(lattice, maxCount, costSlack);
        }
Exemplo n.º 5
0
        /**
         * Renders the lattice as text: a header, the formatted nodes, then a trailer.
         *
         * @param lattice lattice whose nodes are formatted
         * @param bestPath path handed to InitBestPathMap before anything is formatted
         * @return the header, node and trailer output concatenated in that order
         */
        public string Format(ViterbiLattice lattice, List <ViterbiNode> bestPath)
        {
            InitBestPathMap(bestPath);

            // Arguments are evaluated left to right, so the parts are produced in header/nodes/trailer order.
            return new StringBuilder()
                   .Append(FormatHeader())
                   .Append(FormatNodes(lattice))
                   .Append(FormatTrailer())
                   .ToString();
        }
        /**
         * Adds UNKNOWN nodes for one character category at the given start position when the
         * category definition has INVOKE set to 1 or no known word was found there.
         *
         * @param category character category being processed
         * @param i position at which category must appear in the category array of each following
         *          character for that character to be grouped into the unknown word
         * @param lattice lattice that receives the unknown-word nodes
         * @param unknownWordEndIndex value returned unchanged when no unknown word is produced
         * @param startIndex value stored in each node; nodes are added at lattice position startIndex + 1
         * @param suffix string the unknown word is taken from (always a prefix of it)
         * @param found whether a known word was found at this start position
         * @return startIndex plus the unknown word length, or unknownWordEndIndex when nothing was added
         */
        private int ProcessUnknownWord(int category, int i, ViterbiLattice lattice, int unknownWordEndIndex, int startIndex, String suffix, bool found)
        {
            int[] definition = characterDefinitions.LookupDefinition(category);

            bool invokeAlways = definition[CharacterDefinitions.INVOKE] == 1;
            if (!invokeAlways && found)
            {
                // A known word exists and the category does not force unknown-word processing.
                return unknownWordEndIndex;
            }

            // The unknown word always covers at least the first character.
            int runLength = 1;

            if (definition[CharacterDefinitions.GROUP] != 0)
            {
                // Grouping: extend over following characters that carry the same category at position i.
                while (runLength < suffix.Length)
                {
                    int[] nextCategories = characterDefinitions.LookupCategories(suffix[runLength]);

                    if (nextCategories == null || i >= nextCategories.Length || nextCategories[i] != category)
                    {
                        break;
                    }

                    runLength++;
                }
            }

            string surface      = suffix.Substring(0, runLength);
            int    latticeStart = startIndex + 1;

            // characters in input text are supposed to be the same
            foreach (int wordId in unknownDictionary.LookupWordIds(category))
            {
                ViterbiNode unknownNode = new ViterbiNode(wordId, surface, unknownDictionary, startIndex, ViterbiNode.NodeType.UNKNOWN);
                lattice.AddNode(unknownNode, latticeStart, latticeStart + runLength);
            }

            return startIndex + runLength;
        }
        /**
         * Tries to repair the lattice on the LEFT of a newly inserted user dictionary entry.
         * Start positions are scanned from index down to 1; at the first position for which
         * FindGlueNodeCandidate returns a node, a glue node is made from the leading
         * (index + 1 - position) characters of that node's surface and added at that position.
         * Nothing is added when no candidate is found.
         *
         * @param lattice lattice to repair
         * @param index position the scan starts from (the caller passes the match start index)
         */
        private void RepairBrokenLatticeBefore(ViterbiLattice lattice, int index)
        {
            ViterbiNode[][] nodesByStart = lattice.StartIndexArr;

            for (int candidateStart = index; candidateStart > 0; candidateStart--)
            {
                if (nodesByStart[candidateStart] == null)
                {
                    continue;
                }

                ViterbiNode glueBase = FindGlueNodeCandidate(index, nodesByStart[candidateStart], candidateStart);
                if (glueBase == null)
                {
                    continue;
                }

                // Keep only the head of the base surface, then add the glue node at the candidate position.
                int         headLength = index + 1 - candidateStart;
                String      head       = glueBase.Surface.Substring(0, headLength);
                ViterbiNode glueNode   = MakeGlueNode(candidateStart, glueBase, head);

                lattice.AddNode(glueNode, candidateStart, candidateStart + glueNode.Surface.Length);
                return;
            }
        }
        /**
         * Tries to repair the lattice on the RIGHT of a newly inserted user dictionary entry.
         * End positions are scanned upwards from nodeEndIndex + 1; at the first position for which
         * FindGlueNodeCandidate returns a node, a glue node is made from the trailing
         * (position - nodeEndIndex) characters of that node's surface and added at nodeEndIndex.
         * Nothing is added when no candidate is found.
         *
         * @param lattice lattice to repair
         * @param nodeEndIndex lattice end position of the inserted entry; the glue node starts here
         */
        private void RepairBrokenLatticeAfter(ViterbiLattice lattice, int nodeEndIndex)
        {
            ViterbiNode[][] nodesByEnd = lattice.EndIndexArr;

            for (int candidateEnd = nodeEndIndex + 1; candidateEnd < nodesByEnd.Length; candidateEnd++)
            {
                if (nodesByEnd[candidateEnd] == null)
                {
                    continue;
                }

                ViterbiNode glueBase = FindGlueNodeCandidate(nodeEndIndex, nodesByEnd[candidateEnd], candidateEnd);
                if (glueBase == null)
                {
                    continue;
                }

                // Keep only the tail of the base surface that lies past nodeEndIndex.
                int         tailLength  = candidateEnd - nodeEndIndex;
                String      baseSurface = glueBase.Surface;
                String      tail        = baseSurface.Substring(baseSurface.Length - tailLength);
                ViterbiNode glueNode    = MakeGlueNode(nodeEndIndex, glueBase, tail);

                lattice.AddNode(glueNode, nodeEndIndex, nodeEndIndex + glueNode.Surface.Length);
                return;
            }
        }
        /**
         * Gets up to maxCount shortest paths with cost at most OPT + costSlack, where OPT is the cost
         * of the optimal solution. The results are ordered by ascending cost.
         *
         * @param lattice  an instance of ViterbiLattice processed by a ViterbiSearcher
         * @param maxCount  the maximum number of results
         * @param costSlack  the maximum cost slack of a path
         * @return  the shortest paths and their costs
         */
        public MultiSearchResult GetShortestPaths(ViterbiLattice lattice, int maxCount, int costSlack)
        {
            // Every call starts from fresh cost and sidetrack collections.
            pathCosts  = new List <int>();
            sidetracks = new Dictionary <ViterbiNode, MultiSearcher.SidetrackEdge>();
            MultiSearchResult collected = new MultiSearchResult();

            BuildSidetracks(lattice);

            ViterbiNode eos = lattice.EndIndexArr[0][0];
            baseCost = eos.PathCost;

            List <SidetrackEdge> paths = GetPaths(eos, maxCount, costSlack);

            // The cost of the path at a given position is read from pathCosts at the same position.
            for (int rank = 0; rank < paths.Count; rank++)
            {
                LinkedList <ViterbiNode> nodes = GeneratePath(eos, paths[rank]);
                collected.Add(nodes, pathCosts[rank]);
            }

            return collected;
        }
        /**
         * Builds a lattice from the input text: adds BOS, adds known and unknown word nodes for
         * each eligible start position, optionally applies the user dictionary, then adds EOS.
         *
         * @param text  source text for the lattice
         * @return built lattice, not null
         */
        public ViterbiLattice Build(string text)
        {
            int            length  = text.Length;
            ViterbiLattice lattice = new ViterbiLattice(length + 2);

            lattice.AddBos();

            // End index reported by the most recent unknown word; -1 until one has been produced.
            int lastUnknownEnd = -1;

            for (int position = 0; position < length; position++)
            {
                // If no token ends where current token starts, skip this index
                if (!lattice.TokenEndsWhereCurrentTokenStarts(position))
                {
                    continue;
                }

                string remainder = text.Substring(position);
                bool   known     = ProcessIndex(lattice, position, remainder);

                // In the case of normal mode, it doesn't process unknown word greedily.
                if (!searchMode && lastUnknownEnd > position)
                {
                    continue;
                }

                int[] categories = characterDefinitions.LookupCategories(remainder[0]);

                for (int slot = 0; slot < categories.Length; slot++)
                {
                    lastUnknownEnd = ProcessUnknownWord(categories[slot], slot, lattice, lastUnknownEnd, position, remainder, known);
                }
            }

            if (useUserDictionary)
            {
                ProcessUserDictionary(text, lattice);
            }

            lattice.AddEos();

            return lattice;
        }
        /**
         * Calls BuildSidetracksForNode for every node that starts at a position (from 1 upwards)
         * where the end-index array also has an entry, passing the nodes that end at that position.
         *
         * @param lattice lattice whose start and end index arrays are walked
         */
        private void BuildSidetracks(ViterbiLattice lattice)
        {
            ViterbiNode[][] starts = lattice.StartIndexArr;
            ViterbiNode[][] ends   = lattice.EndIndexArr;

            for (int position = 1; position < starts.Length; position++)
            {
                // Both sides must be present for this position to be processed.
                if (starts[position] != null && ends[position] != null)
                {
                    ViterbiNode[] starting = starts[position];

                    // The first null entry marks the end of the nodes in this array.
                    for (int k = 0; k < starting.Length && starting[k] != null; k++)
                    {
                        BuildSidetracksForNode(ends[position], starting[k]);
                    }
                }
            }
        }
Exemplo n.º 12
0
        /**
         * Formats the nodes and edges of the lattice. The node map and the BOS flag are reset first.
         * For each position from 1 where both index arrays have an entry, every non-null node ending
         * there is formatted (if new), followed by each node starting there (if new) and the edge
         * between the two.
         *
         * @param lattice lattice whose start and end index arrays are walked
         * @return the concatenated node and edge output
         */
        private string FormatNodes(ViterbiLattice lattice)
        {
            ViterbiNode[][] nodesByStart = lattice.StartIndexArr;
            ViterbiNode[][] nodesByEnd   = lattice.EndIndexArr;

            this.nodeMap.Clear();
            this.foundBOS = false;

            StringBuilder output = new StringBuilder();

            for (int position = 1; position < nodesByEnd.Length; position++)
            {
                if (nodesByEnd[position] == null || nodesByStart[position] == null)
                {
                    continue;
                }

                foreach (ViterbiNode source in nodesByEnd[position])
                {
                    // Null entries among the ending nodes are skipped, not treated as the end.
                    if (source == null)
                    {
                        continue;
                    }

                    output.Append(FormatNodeIfNew(source));

                    foreach (ViterbiNode target in nodesByStart[position])
                    {
                        // The first null among the starting nodes ends the list.
                        if (target == null)
                        {
                            break;
                        }

                        output.Append(FormatNodeIfNew(target));
                        output.Append(FormatEdge(source, target));
                    }
                }
            }

            return output.ToString();
        }
Exemplo n.º 13
0
        /**
         * Calls UpdateNode for every node that starts at a position (from 1 upwards) where the
         * end-index array also has an entry, passing the nodes that end at that position.
         * UpdateNode presumably records the best path cost on the node - TODO confirm.
         *
         * @param lattice lattice whose start and end index arrays are walked
         * @return the lattice's end-index array
         */
        private ViterbiNode[][] CalculatePathCosts(ViterbiLattice lattice)
        {
            ViterbiNode[][] starts = lattice.StartIndexArr;
            ViterbiNode[][] ends   = lattice.EndIndexArr;

            for (int position = 1; position < starts.Length; position++)
            {
                // Process the position only when nodes start here and previous nodes end here.
                if (starts[position] != null && ends[position] != null)
                {
                    ViterbiNode[] starting = starts[position];

                    // The array holds no further nodes after its first null entry.
                    for (int k = 0; k < starting.Length && starting[k] != null; k++)
                    {
                        UpdateNode(ends[position], starting[k]);
                    }
                }
            }

            return ends;
        }
        /**
         * Checks whether the lattice is missing a connection on the right side of a newly inserted
         * entry, i.e. whether no node starts where the new entry ends.
         *
         * @param endIndex lattice position at which the new entry ends
         * @param lattice lattice to inspect
         * @return true when the start-index array has no entry at endIndex (the lattice is broken),
         *         false when some node starts there
         */
        private bool IsLatticeBrokenAfter(int endIndex, ViterbiLattice lattice)
        {
            ViterbiNode[] startingHere = lattice.StartIndexArr[endIndex];

            return startingHere == null;
        }
        /**
         * Checks whether the lattice is missing a connection on the left side of a newly inserted
         * entry, i.e. whether no node ends where the new entry starts.
         *
         * @param nodeIndex lattice position at which the new entry starts
         * @param lattice lattice to inspect
         * @return true when the end-index array has no entry at nodeIndex (the lattice is broken),
         *         false when some node ends there
         */
        private bool IsLatticeBrokenBefore(int nodeIndex, ViterbiLattice lattice)
        {
            ViterbiNode[] endingHere = lattice.EndIndexArr[nodeIndex];

            return endingHere == null;
        }
Exemplo n.º 16
0
 /**
  * Formats the lattice without a best path.
  *
  * @param lattice lattice to format
  * @return the output of the two-argument Format called with a null best path
  */
 public string Format(ViterbiLattice lattice)
 {
     // No best path is supplied: delegate with null.
     return Format(lattice, null);
 }