/// <summary>
/// Adds one user-dictionary node to the lattice for every match the user dictionary reports in the input text.
/// After each insertion the lattice is checked on both sides of the new node and the matching repair
/// routine is invoked when the corresponding slot is empty.
/// </summary>
/// <param name="text">input text that is searched for user dictionary entries</param>
/// <param name="lattice">lattice that receives the user dictionary nodes</param>
void ProcessUserDictionary(string text, ViterbiLattice lattice)
{
    foreach (var hit in UserDictionary.FindUserDictionaryMatches(text))
    {
        var wordId = hit.WordId;
        var textOffset = hit.MatchStartIndex;
        var surfaceLength = hit.MatchLength;
        var surface = text.Substring(textOffset, surfaceLength);

        // Lattice positions are shifted by one relative to text offsets.
        var latticeStart = textOffset + 1;
        var latticeEnd = latticeStart + surfaceLength;

        var userNode = new ViterbiNode(wordId, surface, UserDictionary, textOffset, ViterbiNode.NodeType.User);
        lattice.AddNode(userNode, latticeStart, latticeEnd);

        if (IsLatticeBrokenBefore(latticeStart, lattice))
        {
            RepairBrokenLatticeBefore(lattice, textOffset);
        }

        if (IsLatticeBrokenAfter(latticeEnd, lattice))
        {
            RepairBrokenLatticeAfter(lattice, latticeEnd);
        }
    }
}
/// <summary>
/// Walks the lattice positions from 1 upwards and calls UpdateNode for every node that starts at a position,
/// passing it the nodes that end at that same position.
/// </summary>
/// <param name="lattice">lattice whose start and end index arrays are traversed</param>
/// <returns>the lattice's end index array</returns>
ViterbiNode[][] CalculatePathCosts(ViterbiLattice lattice)
{
    var starts = lattice.StartIndexArr;
    var ends = lattice.EndIndexArr;

    for (var position = 1; position < starts.Length; position++)
    {
        var startingHere = starts[position];
        if (startingHere == null)
        {
            // Nothing starts at this position.
            continue;
        }

        var endingHere = ends[position];
        if (endingHere == null)
        {
            // Nothing ends at this position, so there is no previous node to connect from.
            continue;
        }

        // The first null entry marks the end of the used part of the array.
        for (var slot = 0; slot < startingHere.Length && startingHere[slot] != null; slot++)
        {
            UpdateNode(endingHere, startingHere[slot]);
        }
    }

    return ends;
}
/// <summary>
/// Looks up every prefix of <paramref name="suffix"/>, shortest first, in the double array trie and adds a
/// Known node to the lattice for each word id the dictionary returns for a positive lookup result.
/// A negative lookup result stops the scan; a result of zero moves on to the next longer prefix.
/// </summary>
/// <param name="lattice">lattice that receives the nodes</param>
/// <param name="startIndex">offset passed to the created nodes; lattice positions are this value plus one</param>
/// <param name="suffix">text whose prefixes are looked up</param>
/// <returns>true when at least one prefix produced a positive lookup result</returns>
bool ProcessIndex(ViterbiLattice lattice, int startIndex, string suffix)
{
    var anyMatch = false;
    var latticeStart = startIndex + 1;

    for (var prefixLength = 1; prefixLength <= suffix.Length; prefixLength++)
    {
        var candidate = suffix.Substring(0, prefixLength);
        var lookup = DoubleArrayTrie.Lookup(candidate);

        if (lookup < 0)
        {
            // Negative result: give up on this start position.
            break;
        }

        if (lookup == 0)
        {
            continue;
        }

        // The caller uses this flag when deciding whether to produce unknown words here.
        anyMatch = true;
        foreach (var wordId in Dictionary.LookupWordIds(lookup))
        {
            var knownNode = new ViterbiNode(wordId, candidate, Dictionary, startIndex, ViterbiNode.NodeType.Known);
            lattice.AddNode(knownNode, latticeStart, latticeStart + prefixLength);
        }
    }

    return anyMatch;
}
/// <summary>
/// Formats the lattice as the concatenation of header, nodes and trailer, after initialising the best path
/// map from <paramref name="bestPath"/>.
/// </summary>
/// <param name="lattice">lattice whose nodes are formatted</param>
/// <param name="bestPath">path handed to InitBestPathMap before formatting</param>
/// <returns>the formatted text</returns>
public string Format(ViterbiLattice lattice, List<ViterbiNode> bestPath)
{
    InitBestPathMap(bestPath);

    return new StringBuilder()
        .Append(FormatHeader())
        .Append(FormatNodes(lattice))
        .Append(FormatTrailer())
        .ToString();
}
/// <summary>
/// Adds Unknown nodes for the given character category at <paramref name="startIndex"/>.
/// Nothing is added when the category's Invoke flag is not 1 and a known word was already found.
/// When the category's Group flag is non-zero the unknown word is extended over the following characters
/// for as long as their category at slot <paramref name="i"/> equals <paramref name="category"/>;
/// otherwise the unknown word is a single character.
/// </summary>
/// <param name="category">character category to produce unknown words for</param>
/// <param name="i">slot of <paramref name="category"/> in the category array of the first character</param>
/// <param name="lattice">lattice that receives the nodes</param>
/// <param name="unknownWordEndIndex">value returned unchanged when no unknown word is produced</param>
/// <param name="startIndex">offset passed to the created nodes; lattice positions are this value plus one</param>
/// <param name="suffix">text starting at the current position</param>
/// <param name="found">whether a known word was found at this position</param>
/// <returns>
/// <paramref name="startIndex"/> plus the unknown word length when an unknown word was produced,
/// otherwise <paramref name="unknownWordEndIndex"/>
/// </returns>
int ProcessUnknownWord(int category, int i, ViterbiLattice lattice, int unknownWordEndIndex, int startIndex, string suffix, bool found)
{
    var definition = CharacterDefinitions.LookupDefinition(category);

    if (definition[CharacterDefinitions.Invoke] != 1 && found)
    {
        return unknownWordEndIndex;
    }

    var wordLength = 1;
    if (definition[CharacterDefinitions.Group] != 0)
    {
        // wordLength doubles as the index of the next character to examine.
        while (wordLength < suffix.Length)
        {
            var followingCategories = CharacterDefinitions.LookupCategories(suffix[wordLength]);
            if (followingCategories == null)
            {
                break;
            }

            if (i >= followingCategories.Length || category != followingCategories[i])
            {
                break;
            }

            wordLength++;
        }
    }

    var surface = suffix.Substring(0, wordLength);
    var latticeStart = startIndex + 1;

    foreach (var wordId in UnknownDictionary.LookupWordIds(category))
    {
        var unknownNode = new ViterbiNode(wordId, surface, UnknownDictionary, startIndex, ViterbiNode.NodeType.Unknown);
        lattice.AddNode(unknownNode, latticeStart, latticeStart + wordLength);
    }

    return startIndex + wordLength;
}
/// <summary>
/// Get up to maxCount shortest paths with cost at most OPT + costSlack, where OPT is the optimal solution.
/// The results are ordered in ascending order by cost.
/// </summary>
/// <param name="lattice">an instance of ViterbiLattice processed by a ViterbiSearcher</param>
/// <param name="maxCount">the maximum number of results</param>
/// <param name="costSlack">the maximum cost slack of a path</param>
/// <returns>the shortest paths and their costs</returns>
public MultiSearchResult GetShortestPaths(ViterbiLattice lattice, int maxCount, int costSlack)
{
    // Reset the state left over from a previous search.
    PathCosts.Clear();
    Sidetracks.Clear();

    var multiSearchResult = new MultiSearchResult();
    BuildSidetracks(lattice);

    var eos = lattice.EndIndexArr[0][0];
    BaseCost = eos.PathCost;

    var paths = GetPaths(eos, maxCount, costSlack);

    // PathCosts is indexed by the position of the path in the returned sequence.
    var rank = 0;
    foreach (var path in paths)
    {
        // Read the cost before generating the path, matching the original evaluation order.
        var cost = PathCosts[rank++];
        multiSearchResult.Add(GeneratePath(eos, path), cost);
    }

    return multiSearchResult;
}
/// <summary>
/// Tries to repair the lattice to the LEFT of a newly inserted user dictionary entry.
/// Scans lattice start positions from <paramref name="index"/> down to 1; at the first position for which
/// FindGlueNodeCandidate returns a node, a glue node built from the leading part of that node's surface is
/// added to the lattice and the scan stops.
/// </summary>
/// <param name="lattice">lattice to repair</param>
/// <param name="index">text offset at which the user dictionary entry starts</param>
void RepairBrokenLatticeBefore(ViterbiLattice lattice, int index)
{
    var starts = lattice.StartIndexArr;

    for (var candidateStart = index; candidateStart > 0; candidateStart--)
    {
        var startingHere = starts[candidateStart];
        if (startingHere == null)
        {
            continue;
        }

        var glueBase = FindGlueNodeCandidate(index, startingHere, candidateStart);
        if (glueBase == null)
        {
            continue;
        }

        // Keep only the leading characters of the candidate's surface.
        var glueLength = index + 1 - candidateStart;
        var glueSurface = glueBase.Surface.Substring(0, glueLength);
        var glueNode = MakeGlueNode(candidateStart, glueBase, glueSurface);

        lattice.AddNode(glueNode, candidateStart, candidateStart + glueNode.Surface.Length);
        return;
    }
}
/// <summary>
/// Formats the nodes and edges of the lattice. For every position from 1 upwards that has both ending and
/// starting nodes, each ending node is formatted (if new) and connected by an edge to each starting node.
/// Clears NodeMap and resets FoundBOS before formatting.
/// </summary>
/// <param name="lattice">lattice to format</param>
/// <returns>the formatted nodes and edges</returns>
string FormatNodes(ViterbiLattice lattice)
{
    var startsArray = lattice.StartIndexArr;
    var endsArray = lattice.EndIndexArr;

    NodeMap.Clear();
    FoundBOS = false;

    var output = new StringBuilder();

    for (var position = 1; position < endsArray.Length; position++)
    {
        if (endsArray[position] == null || startsArray[position] == null)
        {
            continue;
        }

        foreach (var from in endsArray[position])
        {
            // Null entries among the ending nodes are skipped, not treated as a terminator.
            if (from == null)
            {
                continue;
            }

            output.Append(FormatNodeIfNew(from));

            foreach (var to in startsArray[position])
            {
                // The first null entry ends the list of starting nodes.
                if (to == null)
                {
                    break;
                }

                output.Append(FormatNodeIfNew(to))
                      .Append(FormatEdge(from, to));
            }
        }
    }

    return output.ToString();
}
/// <summary>
/// Tries to repair the lattice to the RIGHT of a newly inserted user dictionary entry.
/// Scans lattice end positions upwards from <paramref name="nodeEndIndex"/> + 1; at the first position for
/// which FindGlueNodeCandidate returns a node, a glue node built from the trailing part of that node's
/// surface is added to the lattice and the scan stops.
/// </summary>
/// <param name="lattice">lattice to repair</param>
/// <param name="nodeEndIndex">lattice position at which the user dictionary entry ends</param>
void RepairBrokenLatticeAfter(ViterbiLattice lattice, int nodeEndIndex)
{
    var ends = lattice.EndIndexArr;

    for (var candidateEnd = nodeEndIndex + 1; candidateEnd < ends.Length; candidateEnd++)
    {
        var endingHere = ends[candidateEnd];
        if (endingHere == null)
        {
            continue;
        }

        var glueBase = FindGlueNodeCandidate(nodeEndIndex, endingHere, candidateEnd);
        if (glueBase == null)
        {
            continue;
        }

        // Keep only the trailing characters that lie between nodeEndIndex and candidateEnd.
        var tailLength = candidateEnd - nodeEndIndex;
        var baseSurface = glueBase.Surface;
        var glueSurface = baseSurface.Substring(baseSurface.Length - tailLength);
        var glueNode = MakeGlueNode(nodeEndIndex, glueBase, glueSurface);

        lattice.AddNode(glueNode, nodeEndIndex, nodeEndIndex + glueNode.Surface.Length);
        return;
    }
}
/// <summary>
/// Builds a lattice from the input text: adds BOS, adds known and unknown word nodes for every text offset
/// at which a token ends, optionally adds user dictionary nodes, and finally adds EOS.
/// </summary>
/// <param name="text">source text for the lattice</param>
/// <returns>the built lattice, not null</returns>
public ViterbiLattice Build(string text)
{
    var textLength = text.Length;
    var lattice = new ViterbiLattice(textLength + 2);
    lattice.AddBos();

    // Value returned by the most recent ProcessUnknownWord call; -1 until the first call.
    var unknownWordEndIndex = -1;

    for (var startIndex = 0; startIndex < textLength; startIndex++)
    {
        // Offsets where no token ends cannot be reached, so they are skipped.
        if (!lattice.TokenEndsWhereCurrentTokenStarts(startIndex))
        {
            continue;
        }

        var suffix = text.Substring(startIndex);
        var found = ProcessIndex(lattice, startIndex, suffix);

        // Outside search mode, unknown words are not produced while startIndex is still
        // below the end index reported for the previous unknown word.
        if (!SearchMode && unknownWordEndIndex > startIndex)
        {
            continue;
        }

        var categories = CharacterDefinitions.LookupCategories(suffix[0]);
        for (var slot = 0; slot < categories.Length; slot++)
        {
            unknownWordEndIndex = ProcessUnknownWord(
                categories[slot], slot, lattice, unknownWordEndIndex, startIndex, suffix, found);
        }
    }

    if (UseUserDictionary)
    {
        ProcessUserDictionary(text, lattice);
    }

    lattice.AddEos();
    return lattice;
}
/// <summary>
/// Walks the lattice positions from 1 upwards and calls BuildSidetracksForNode for every node that starts
/// at a position, passing it the nodes that end at that same position.
/// </summary>
/// <param name="lattice">lattice whose start and end index arrays are traversed</param>
void BuildSidetracks(ViterbiLattice lattice)
{
    var starts = lattice.StartIndexArr;
    var ends = lattice.EndIndexArr;

    for (var position = 1; position < starts.Length; position++)
    {
        var startingHere = starts[position];
        if (startingHere == null)
        {
            continue;
        }

        var endingHere = ends[position];
        if (endingHere == null)
        {
            continue;
        }

        // The first null entry marks the end of the used part of the array.
        for (var slot = 0; slot < startingHere.Length && startingHere[slot] != null; slot++)
        {
            BuildSidetracksForNode(endingHere, startingHere[slot]);
        }
    }
}
/// <summary>
/// Formats the lattice without a best path; forwards to the two-argument overload with a null path.
/// </summary>
/// <param name="lattice">lattice to format</param>
/// <returns>the formatted text</returns>
public string Format(ViterbiLattice lattice) => Format(lattice, null);
/// <summary>
/// Finds the best paths with cost at most OPT + costSlack, where OPT is the optimal solution.
/// At most maxCount paths are returned, ordered by cost in ascending order.
/// </summary>
/// <param name="lattice">the result of a build method</param>
/// <param name="maxCount">the maximum number of paths to find</param>
/// <param name="costSlack">the maximum cost slack of a path</param>
/// <returns>MultiSearchResult containing the shortest paths and their costs</returns>
public MultiSearchResult SearchMultiple(ViterbiLattice lattice, int maxCount, int costSlack)
{
    // Path costs are calculated on the lattice first; the returned array is not needed here.
    _ = CalculatePathCosts(lattice);

    var shortestPaths = MultiSearcher.GetShortestPaths(lattice, maxCount, costSlack);
    return shortestPaths;
}
/// <summary>
/// Finds the best path through the input lattice by calculating path costs and backtracking from the
/// first node of the first end-index slot.
/// </summary>
/// <param name="lattice">the result of the build method</param>
/// <returns>list of ViterbiNode that make up the best path</returns>
public List<ViterbiNode> Search(ViterbiLattice lattice)
{
    var backtrackStart = CalculatePathCosts(lattice)[0][0];
    return BacktrackBestPath(backtrackStart);
}
/// <summary>
/// Checks whether the lattice lacks a connection on the right side of a newly inserted entry,
/// i.e. whether nothing starts at the position where the entry ends.
/// </summary>
/// <param name="endIndex">lattice position at which the new entry ends</param>
/// <param name="lattice">lattice to inspect</param>
/// <returns>true when the lattice's start index array has no entry at <paramref name="endIndex"/></returns>
bool IsLatticeBrokenAfter(int endIndex, ViterbiLattice lattice) =>
    lattice.StartIndexArr[endIndex] == null;
/// <summary>
/// Checks whether the lattice lacks a connection on the left side of a newly inserted entry,
/// i.e. whether nothing ends at the position where the entry starts.
/// </summary>
/// <param name="nodeIndex">lattice position at which the new entry starts</param>
/// <param name="lattice">lattice to inspect</param>
/// <returns>true when the lattice's end index array has no entry at <paramref name="nodeIndex"/></returns>
bool IsLatticeBrokenBefore(int nodeIndex, ViterbiLattice lattice) =>
    lattice.EndIndexArr[nodeIndex] == null;