Beispiel #1
0
        /// <summary>
        /// Adds a user node to the lattice for every user dictionary match found in the input text and, for each
        /// one, triggers a lattice repair on whichever side of the new node is reported as broken.
        /// </summary>
        /// <param name="text">input text that is searched for user dictionary entries</param>
        /// <param name="lattice">lattice that receives the user nodes and any repair nodes</param>
        void ProcessUserDictionary(string text, ViterbiLattice lattice)
        {
            foreach (var hit in UserDictionary.FindUserDictionaryMatches(text))
            {
                var wordId  = hit.WordId;
                var offset  = hit.MatchStartIndex;
                var span    = hit.MatchLength;
                var surface = text.Substring(offset, span);

                // The lattice position of a node is its text offset plus one.
                var latticeStart = offset + 1;
                var latticeEnd   = latticeStart + span;

                lattice.AddNode(
                    new ViterbiNode(wordId, surface, UserDictionary, offset, ViterbiNode.NodeType.User),
                    latticeStart,
                    latticeEnd);

                // Nothing ends where the new node starts: ask for a repair on the left side.
                if (IsLatticeBrokenBefore(latticeStart, lattice))
                {
                    RepairBrokenLatticeBefore(lattice, offset);
                }

                // Nothing starts where the new node ends: ask for a repair on the right side.
                if (IsLatticeBrokenAfter(latticeEnd, lattice))
                {
                    RepairBrokenLatticeAfter(lattice, latticeEnd);
                }
            }
        }
Beispiel #2
0
        /// <summary>
        /// Walks the lattice positions from 1 upward and calls UpdateNode for each node starting at a position,
        /// handing it the nodes that end at that same position.
        /// </summary>
        /// <param name="lattice">lattice whose nodes are visited</param>
        /// <returns>the lattice's EndIndexArr</returns>
        ViterbiNode[][] CalculatePathCosts(ViterbiLattice lattice)
        {
            var startsAt = lattice.StartIndexArr;
            var endsAt   = lattice.EndIndexArr;

            for (var position = 1; position < startsAt.Length; position++)
            {
                // Skip positions where no node starts ...
                var starting = startsAt[position];
                if (starting == null)
                {
                    continue;
                }

                // ... or where no node ends, since there is nothing to connect from.
                var ending = endsAt[position];
                if (ending == null)
                {
                    continue;
                }

                // The first null slot marks the end of the used part of the array.
                for (var slot = 0; slot < starting.Length && starting[slot] != null; slot++)
                {
                    UpdateNode(ending, starting[slot]);
                }
            }

            return endsAt;
        }
Beispiel #3
0
        /// <summary>
        /// Looks up successively longer prefixes of <paramref name="suffix"/> in the trie and adds a known-word node
        /// to the lattice for every word id returned for a prefix with a positive lookup result.
        /// </summary>
        /// <param name="lattice">lattice that receives the known-word nodes</param>
        /// <param name="startIndex">text offset at which <paramref name="suffix"/> begins</param>
        /// <param name="suffix">remaining input text starting at <paramref name="startIndex"/></param>
        /// <returns>true when at least one prefix produced a positive lookup result</returns>
        bool ProcessIndex(ViterbiLattice lattice, int startIndex, string suffix)
        {
            var anyKnownWord = false;
            var nodeStart    = startIndex + 1;

            for (var prefixLength = 1; prefixLength <= suffix.Length; prefixLength++)
            {
                var candidate = suffix.Substring(0, prefixLength);
                var lookup    = DoubleArrayTrie.Lookup(candidate);

                if (lookup < 0)
                {
                    // A negative result ends the scan for this start index.
                    break;
                }

                if (lookup == 0)
                {
                    // Zero adds nothing; keep extending the prefix.
                    continue;
                }

                // The caller uses this flag when deciding whether to produce an unknown word here.
                anyKnownWord = true;

                foreach (var wordId in Dictionary.LookupWordIds(lookup))
                {
                    var knownNode = new ViterbiNode(wordId, candidate, Dictionary, startIndex, ViterbiNode.NodeType.Known);
                    lattice.AddNode(knownNode, nodeStart, nodeStart + prefixLength);
                }
            }

            return anyKnownWord;
        }
Beispiel #4
0
        /// <summary>
        /// Formats the lattice as header, nodes and trailer, after initialising the best-path map from
        /// <paramref name="bestPath"/>.
        /// </summary>
        /// <param name="lattice">lattice to format</param>
        /// <param name="bestPath">path handed to InitBestPathMap before formatting</param>
        /// <returns>the concatenation of the header, the formatted nodes and the trailer</returns>
        public string Format(ViterbiLattice lattice, List<ViterbiNode> bestPath)
        {
            InitBestPathMap(bestPath);

            return new StringBuilder()
                   .Append(FormatHeader())
                   .Append(FormatNodes(lattice))
                   .Append(FormatTrailer())
                   .ToString();
        }
Beispiel #5
0
        /// <summary>
        /// Adds unknown-word nodes for <paramref name="category"/> at <paramref name="startIndex"/> when the
        /// category's definition has its Invoke entry set to 1 or no known word was found at this index.
        /// </summary>
        /// <param name="category">character category being processed</param>
        /// <param name="i">position of <paramref name="category"/> within the category array of the first character</param>
        /// <param name="lattice">lattice that receives the unknown-word nodes</param>
        /// <param name="unknownWordEndIndex">current end index, returned unchanged when no node is added</param>
        /// <param name="startIndex">text offset at which <paramref name="suffix"/> begins</param>
        /// <param name="suffix">remaining input text starting at <paramref name="startIndex"/></param>
        /// <param name="found">whether a known word was found at <paramref name="startIndex"/></param>
        /// <returns>
        /// <paramref name="startIndex"/> plus the unknown word length when nodes were added, otherwise
        /// <paramref name="unknownWordEndIndex"/>
        /// </returns>
        int ProcessUnknownWord(int category, int i, ViterbiLattice lattice, int unknownWordEndIndex, int startIndex, string suffix, bool found)
        {
            var definition = CharacterDefinitions.LookupDefinition(category);

            // Nothing to do when the category is not invoked and a known word already covers this index.
            if (definition[CharacterDefinitions.Invoke] != 1 && found)
            {
                return unknownWordEndIndex;
            }

            // An unknown word is at least one character long.
            var runLength = 1;

            if (definition[CharacterDefinitions.Group] != 0)
            {
                // Grouping: extend the word while the following characters carry the same category in slot i.
                while (runLength < suffix.Length)
                {
                    var followingCategories = CharacterDefinitions.LookupCategories(suffix[runLength]);

                    if (followingCategories == null
                        || i >= followingCategories.Length
                        || followingCategories[i] != category)
                    {
                        break;
                    }

                    runLength++;
                }
            }

            var surface = suffix.Substring(0, runLength);
            var wordIds = UnknownDictionary.LookupWordIds(category);
            var nodeStart = startIndex + 1;

            foreach (var wordId in wordIds)
            {
                var unknownNode = new ViterbiNode(wordId, surface, UnknownDictionary, startIndex, ViterbiNode.NodeType.Unknown);
                lattice.AddNode(unknownNode, nodeStart, nodeStart + runLength);
            }

            return startIndex + runLength;
        }
Beispiel #6
0
        /// <summary>
        /// Get up to maxCount shortest paths with cost at most OPT + costSlack, where OPT is the optimal solution. The results are ordered in ascending order by cost.
        /// </summary>
        /// <param name="lattice">an instance of ViterbiLattice processed by a ViterbiSearcher</param>
        /// <param name="maxCount">the maximum number of results</param>
        /// <param name="costSlack">the maximum cost slack of a path</param>
        /// <returns>the shortest paths and their costs</returns>
        public MultiSearchResult GetShortestPaths(ViterbiLattice lattice, int maxCount, int costSlack)
        {
            // Reset per-search state before the sidetracks are rebuilt for this lattice.
            PathCosts.Clear();
            Sidetracks.Clear();
            var multiSearchResult = new MultiSearchResult();

            BuildSidetracks(lattice);

            // The node stored at EndIndexArr[0][0] is used as the end-of-sentence node.
            var eos = lattice.EndIndexArr[0][0];

            BaseCost = eos.PathCost;
            var paths = GetPaths(eos, maxCount, costSlack);

            // PathCosts is indexed in step with paths: the i-th path pairs with PathCosts[i].
            var pathIndex = 0;
            foreach (var path in paths)
            {
                var cost  = PathCosts[pathIndex++];
                var nodes = GeneratePath(eos, path);
                multiSearchResult.Add(nodes, cost);
            }

            return multiSearchResult;
        }
Beispiel #7
0
        /// <summary>
        /// Tries to repair the lattice to the LEFT of a newly inserted user dictionary entry. Positions are scanned
        /// downward from <paramref name="index"/>; the first glue candidate reported by FindGlueNodeCandidate is cut
        /// down to the part that reaches up to the new entry and added to the lattice as a glue node.
        /// </summary>
        /// <param name="lattice">lattice to repair</param>
        /// <param name="index">text offset at which the newly inserted entry starts</param>
        void RepairBrokenLatticeBefore(ViterbiLattice lattice, int index)
        {
            var startsAt = lattice.StartIndexArr;

            for (var position = index; position > 0; position--)
            {
                var candidates = startsAt[position];
                if (candidates == null)
                {
                    continue;
                }

                var glueBase = FindGlueNodeCandidate(index, candidates, position);
                if (glueBase == null)
                {
                    continue;
                }

                // Keep only the leading part of the candidate's surface that spans position..index.
                var glueSurface = glueBase.Surface.Substring(0, index + 1 - position);
                var glueNode    = MakeGlueNode(position, glueBase, glueSurface);

                lattice.AddNode(glueNode, position, position + glueNode.Surface.Length);
                return;
            }
        }
Beispiel #8
0
        /// <summary>
        /// Formats the nodes and edges of the lattice. For every position from 1 upward that has both ending and
        /// starting nodes, each ending node is emitted (if new) followed by every starting node (if new) and the
        /// edge between the two.
        /// </summary>
        /// <param name="lattice">lattice to format</param>
        /// <returns>the concatenated node and edge output</returns>
        string FormatNodes(ViterbiLattice lattice)
        {
            var startsArray = lattice.StartIndexArr;
            var endsArray   = lattice.EndIndexArr;

            // Reset formatter state left over from a previous run.
            NodeMap.Clear();
            FoundBOS = false;

            var output = new StringBuilder();

            for (var position = 1; position < endsArray.Length; position++)
            {
                var incoming = endsArray[position];
                if (incoming == null)
                {
                    continue;
                }

                var outgoing = startsArray[position];
                if (outgoing == null)
                {
                    continue;
                }

                foreach (var source in incoming)
                {
                    // Null slots among the ending nodes are skipped, not treated as the end of the array.
                    if (source == null)
                    {
                        continue;
                    }

                    output.Append(FormatNodeIfNew(source));

                    foreach (var target in outgoing)
                    {
                        // The first null slot among the starting nodes ends the scan.
                        if (target == null)
                        {
                            break;
                        }

                        output.Append(FormatNodeIfNew(target))
                              .Append(FormatEdge(source, target));
                    }
                }
            }

            return output.ToString();
        }
Beispiel #9
0
        /// <summary>
        /// Tries to repair the lattice to the RIGHT of a newly inserted user dictionary entry. Positions are scanned
        /// upward from just past <paramref name="nodeEndIndex"/>; the first glue candidate reported by
        /// FindGlueNodeCandidate is cut down to its trailing part and added to the lattice as a glue node.
        /// </summary>
        /// <param name="lattice">lattice to repair</param>
        /// <param name="nodeEndIndex">lattice position at which the newly inserted entry ends</param>
        void RepairBrokenLatticeAfter(ViterbiLattice lattice, int nodeEndIndex)
        {
            var endsAt = lattice.EndIndexArr;

            for (var position = nodeEndIndex + 1; position < endsAt.Length; position++)
            {
                var candidates = endsAt[position];
                if (candidates == null)
                {
                    continue;
                }

                var glueBase = FindGlueNodeCandidate(nodeEndIndex, candidates, position);
                if (glueBase == null)
                {
                    continue;
                }

                // Keep only the trailing characters of the candidate that lie between nodeEndIndex and position.
                var baseSurface = glueBase.Surface;
                var tailLength  = position - nodeEndIndex;
                var glueSurface = baseSurface.Substring(baseSurface.Length - tailLength);
                var glueNode    = MakeGlueNode(nodeEndIndex, glueBase, glueSurface);

                lattice.AddNode(glueNode, nodeEndIndex, nodeEndIndex + glueNode.Surface.Length);
                return;
            }
        }
Beispiel #10
0
        /// <summary>
        /// Builds a lattice for the input text: adds BOS, adds known and unknown word nodes for each start index
        /// that the lattice reports as reachable, optionally adds user dictionary nodes, then adds EOS.
        /// </summary>
        /// <param name="text">source text for the lattice</param>
        /// <returns>the newly built lattice</returns>
        public ViterbiLattice Build(string text)
        {
            // Two extra positions on top of the text length.
            var lattice = new ViterbiLattice(text.Length + 2);

            lattice.AddBos();

            // Index of the last character of the most recent unknown word.
            var unknownWordEndIndex = -1;

            for (var startIndex = 0; startIndex < text.Length; startIndex++)
            {
                // Skip indices at which no token ends.
                if (!lattice.TokenEndsWhereCurrentTokenStarts(startIndex))
                {
                    continue;
                }

                var suffix = text.Substring(startIndex);
                var found  = ProcessIndex(lattice, startIndex, suffix);

                // Outside search mode, unknown words are not processed inside the span of a previous unknown word.
                if (!SearchMode && unknownWordEndIndex > startIndex)
                {
                    continue;
                }

                int[] categories = CharacterDefinitions.LookupCategories(suffix[0]);

                for (var i = 0; i < categories.Length; i++)
                {
                    unknownWordEndIndex = ProcessUnknownWord(categories[i], i, lattice, unknownWordEndIndex, startIndex, suffix, found);
                }
            }

            if (UseUserDictionary)
            {
                ProcessUserDictionary(text, lattice);
            }

            lattice.AddEos();

            return lattice;
        }
Beispiel #11
0
        /// <summary>
        /// Walks the lattice positions from 1 upward and calls BuildSidetracksForNode for each node starting at a
        /// position, handing it the nodes that end at that same position.
        /// </summary>
        /// <param name="lattice">lattice whose nodes are visited</param>
        void BuildSidetracks(ViterbiLattice lattice)
        {
            var startsAt = lattice.StartIndexArr;
            var endsAt   = lattice.EndIndexArr;

            for (var position = 1; position < startsAt.Length; position++)
            {
                var starting = startsAt[position];
                if (starting == null)
                {
                    continue;
                }

                var ending = endsAt[position];
                if (ending == null)
                {
                    continue;
                }

                // The first null slot marks the end of the used part of the array.
                for (var slot = 0; slot < starting.Length && starting[slot] != null; slot++)
                {
                    BuildSidetracksForNode(ending, starting[slot]);
                }
            }
        }
Beispiel #12
0
 /// <summary>
 /// Formats the lattice without a best path; forwards to the two-argument overload with a null path.
 /// </summary>
 /// <param name="lattice">lattice to format</param>
 /// <returns>the formatted lattice</returns>
 public string Format(ViterbiLattice lattice) => Format(lattice, null);
Beispiel #13
0
 /// <summary>
 /// Finds the best paths with cost at most OPT + costSlack, where OPT is the optimal solution. No more than
 /// maxCount paths are returned, ordered by ascending cost.
 /// </summary>
 /// <param name="lattice">the result of a build method</param>
 /// <param name="maxCount">the maximum number of paths to find</param>
 /// <param name="costSlack">the maximum cost slack of a path</param>
 /// <returns>MultiSearchResult containing the shortest paths and their costs</returns>
 public MultiSearchResult SearchMultiple(ViterbiLattice lattice, int maxCount, int costSlack)
 {
     // Run for its effect on the lattice only; the returned array is not needed here.
     CalculatePathCosts(lattice);

     var shortestPaths = MultiSearcher.GetShortestPaths(lattice, maxCount, costSlack);
     return shortestPaths;
 }
Beispiel #14
0
        /// <summary>
        /// Finds the best path through the given lattice.
        /// </summary>
        /// <param name="lattice">the result of build method</param>
        /// <returns>list of ViterbiNode making up the best path</returns>
        public List<ViterbiNode> Search(ViterbiLattice lattice)
        {
            // Backtracking starts from the node stored at [0][0] of the array returned by CalculatePathCosts.
            var lastNode = CalculatePathCosts(lattice)[0][0];

            return BacktrackBestPath(lastNode);
        }
Beispiel #15
0
        /// <summary>
        /// Checks whether the lattice is broken on the right side of a newly inserted entry, i.e. whether
        /// nothing in the lattice starts at the position where the new entry ends.
        /// </summary>
        /// <param name="endIndex">lattice position at which the new entry ends</param>
        /// <param name="lattice">lattice to inspect</param>
        /// <returns>true when the lattice has no node array for start position <paramref name="endIndex"/></returns>
        bool IsLatticeBrokenAfter(int endIndex, ViterbiLattice lattice)
            => lattice.StartIndexArr[endIndex] == null;
Beispiel #16
0
        /// <summary>
        /// Checks whether the lattice is broken on the left side of a newly inserted entry, i.e. whether
        /// nothing in the lattice ends at the position where the new entry starts.
        /// </summary>
        /// <param name="nodeIndex">lattice position at which the new entry starts</param>
        /// <param name="lattice">lattice to inspect</param>
        /// <returns>true when the lattice has no node array for end position <paramref name="nodeIndex"/></returns>
        bool IsLatticeBrokenBefore(int nodeIndex, ViterbiLattice lattice)
            => lattice.EndIndexArr[nodeIndex] == null;