/**
 * Computes path costs over the given lattice and returns the best path through it.
 *
 * @param lattice the lattice to search (as produced by a build method)
 * @return the nodes of the best path, copied into a List
 */
public List<ViterbiNode> Search(ViterbiLattice lattice)
{
    ViterbiNode[][] nodesByEndIndex = CalculatePathCosts(lattice);

    // Backtracking starts from the node stored at [0][0] of the end-index array.
    ViterbiNode backtrackStart = nodesByEndIndex[0][0];
    LinkedList<ViterbiNode> bestPath = BacktrackBestPath(backtrackStart);

    return bestPath.ToList();
}
/**
 * Adds a USER node to the lattice for every user dictionary match found in the text.
 * After each insertion the lattice is repaired on the left and/or right of the new
 * node if no existing node connects to it there.
 *
 * @param text the input text that is searched for user dictionary entries
 * @param lattice the lattice that receives the user dictionary nodes
 */
private void ProcessUserDictionary(string text, ViterbiLattice lattice)
{
    List<UserDictionary.UserDictionaryMatch> userMatches = userDictionary.FindUserDictionaryMatches(text);

    foreach (UserDictionary.UserDictionaryMatch userMatch in userMatches)
    {
        int wordId = userMatch.GetWordId();
        int textStart = userMatch.GetMatchStartIndex();
        int matchLength = userMatch.GetMatchLength();
        string surface = text.Substring(textStart, matchLength);

        ViterbiNode userNode = new ViterbiNode(wordId, surface, userDictionary, textStart, ViterbiNode.NodeType.USER);

        // Lattice positions are offset by one from text positions.
        int latticeStart = textStart + 1;
        int latticeEnd = latticeStart + matchLength;
        lattice.AddNode(userNode, latticeStart, latticeEnd);

        if (IsLatticeBrokenBefore(latticeStart, lattice))
        {
            RepairBrokenLatticeBefore(lattice, textStart);
        }

        if (IsLatticeBrokenAfter(latticeEnd, lattice))
        {
            RepairBrokenLatticeAfter(lattice, latticeEnd);
        }
    }
}
/**
 * Looks up every prefix of the suffix (length 1, 2, ...) in the FST and adds a KNOWN
 * node to the lattice for each word id the dictionary returns for a positive lookup result.
 *
 * @param lattice the lattice that receives the nodes
 * @param startIndex text index at which the suffix starts
 * @param suffix the remaining text, beginning at startIndex
 * @return true if at least one prefix produced a positive lookup result
 */
private bool ProcessIndex(ViterbiLattice lattice, int startIndex, string suffix)
{
    bool anyKnownWord = false;

    for (int prefixLength = 1; prefixLength <= suffix.Length; prefixLength++)
    {
        string candidate = suffix.Substring(0, prefixLength);
        int lookupResult = fst.Lookup(candidate);

        if (lookupResult < 0)
        {
            // A negative result ends the scan for this start position.
            break;
        }

        if (lookupResult == 0)
        {
            // Nothing to add for this prefix; try a longer one.
            continue;
        }

        // A known word starts here, so the caller need not force an unknown word.
        anyKnownWord = true;
        foreach (int wordId in dictionary.LookupWordIds(lookupResult))
        {
            ViterbiNode knownNode = new ViterbiNode(wordId, candidate, dictionary, startIndex, ViterbiNode.NodeType.KNOWN);
            lattice.AddNode(knownNode, startIndex + 1, startIndex + 1 + prefixLength);
        }
    }

    return anyKnownWord;
}
/**
 * Finds the best paths whose cost is at most OPT + costSlack, where OPT is the cost of
 * the optimal solution. At most maxCount paths are returned, ordered by ascending cost.
 *
 * @param lattice the lattice to search (as produced by a build method)
 * @param maxCount the maximum number of paths to find
 * @param costSlack the maximum cost slack of a path
 * @return MultiSearchResult containing the shortest paths and their costs
 */
public MultiSearchResult SearchMultiple(ViterbiLattice lattice, int maxCount, int costSlack)
{
    // Path costs are computed on the lattice before the multi-searcher is run on it.
    CalculatePathCosts(lattice);

    return multiSearcher.GetShortestPaths(lattice, maxCount, costSlack);
}
/**
 * Renders the lattice as text: header, then nodes, then trailer.
 *
 * @param lattice the lattice to format
 * @param bestPath the best path, handed to InitBestPathMap before formatting
 * @return the concatenated header, node and trailer output
 */
public string Format(ViterbiLattice lattice, List<ViterbiNode> bestPath)
{
    InitBestPathMap(bestPath);

    // The three parts are evaluated and appended strictly in this order.
    return new StringBuilder()
        .Append(FormatHeader())
        .Append(FormatNodes(lattice))
        .Append(FormatTrailer())
        .ToString();
}
/**
 * Adds UNKNOWN nodes for the given character category at startIndex, when the category
 * definition has INVOKE set to 1 or no known word was found at this position.
 * With GROUP equal to 0 the unknown word is one character long; otherwise it is extended
 * while the following characters have the same category at slot i of their category list.
 *
 * @param category the character category being processed
 * @param i position of the category within the first character's category list
 * @param lattice the lattice that receives the nodes
 * @param unknownWordEndIndex the current end index, returned unchanged if nothing is added
 * @param startIndex text index at which the suffix starts
 * @param suffix the remaining text, beginning at startIndex
 * @param found whether a known word was found at startIndex
 * @return startIndex plus the unknown word length if nodes were added, else unknownWordEndIndex
 */
private int ProcessUnknownWord(int category, int i, ViterbiLattice lattice, int unknownWordEndIndex, int startIndex, String suffix, bool found)
{
    int[] definition = characterDefinitions.LookupDefinition(category);

    // Unknown word processing is skipped only when INVOKE is not set and a known word exists.
    if (definition[CharacterDefinitions.INVOKE] != 1 && found)
    {
        return unknownWordEndIndex;
    }

    int unknownWordLength = 1;
    if (definition[CharacterDefinitions.GROUP] != 0)
    {
        // Extend over following characters that share this category at the same slot.
        while (unknownWordLength < suffix.Length)
        {
            int[] nextCategories = characterDefinitions.LookupCategories(suffix[unknownWordLength]);
            if (nextCategories == null || i >= nextCategories.Length || category != nextCategories[i])
            {
                break;
            }
            unknownWordLength++;
        }
    }

    string unknownSurface = suffix.Substring(0, unknownWordLength);

    // Characters in the input text are supposed to be the same category here.
    int[] unknownWordIds = unknownDictionary.LookupWordIds(category);
    foreach (int unknownWordId in unknownWordIds)
    {
        ViterbiNode unknownNode = new ViterbiNode(unknownWordId, unknownSurface, unknownDictionary, startIndex, ViterbiNode.NodeType.UNKNOWN);
        lattice.AddNode(unknownNode, startIndex + 1, startIndex + 1 + unknownWordLength);
    }

    return startIndex + unknownWordLength;
}
/**
 * Tries to repair the lattice to the LEFT of a newly inserted user dictionary entry.
 * Start positions are scanned from index down to 1; at the first position where
 * FindGlueNodeCandidate yields a node, a glue node is built from the leading part of
 * that node's surface and added to the lattice. Nothing is added if no candidate is found.
 *
 * @param lattice the lattice to repair
 * @param index the position from which the backwards scan starts
 */
private void RepairBrokenLatticeBefore(ViterbiLattice lattice, int index)
{
    ViterbiNode[][] nodesByStart = lattice.StartIndexArr;

    for (int candidateStart = index; candidateStart > 0; candidateStart--)
    {
        ViterbiNode[] candidates = nodesByStart[candidateStart];
        if (candidates == null)
        {
            continue;
        }

        ViterbiNode glueBase = FindGlueNodeCandidate(index, candidates, candidateStart);
        if (glueBase == null)
        {
            continue;
        }

        // Keep only the leading part of the base surface.
        int glueLength = index + 1 - candidateStart;
        String glueSurface = glueBase.Surface.Substring(0, glueLength);

        ViterbiNode glueNode = MakeGlueNode(candidateStart, glueBase, glueSurface);
        lattice.AddNode(glueNode, candidateStart, candidateStart + glueNode.Surface.Length);
        return;
    }
}
/**
 * Tries to repair the lattice to the RIGHT of a newly inserted user dictionary entry.
 * End positions after nodeEndIndex are scanned in ascending order; at the first position
 * where FindGlueNodeCandidate yields a node, a glue node is built from the trailing part
 * of that node's surface and added to the lattice. Nothing is added if no candidate is found.
 *
 * @param lattice the lattice to repair
 * @param nodeEndIndex the lattice end index of the newly inserted entry
 */
private void RepairBrokenLatticeAfter(ViterbiLattice lattice, int nodeEndIndex)
{
    ViterbiNode[][] nodesByEnd = lattice.EndIndexArr;

    for (int candidateEnd = nodeEndIndex + 1; candidateEnd < nodesByEnd.Length; candidateEnd++)
    {
        ViterbiNode[] candidates = nodesByEnd[candidateEnd];
        if (candidates == null)
        {
            continue;
        }

        ViterbiNode glueBase = FindGlueNodeCandidate(nodeEndIndex, candidates, candidateEnd);
        if (glueBase == null)
        {
            continue;
        }

        // Keep only the last (candidateEnd - nodeEndIndex) characters of the base surface.
        int tailLength = candidateEnd - nodeEndIndex;
        String baseSurface = glueBase.Surface;
        String glueSurface = baseSurface.Substring(baseSurface.Length - tailLength);

        ViterbiNode glueNode = MakeGlueNode(nodeEndIndex, glueBase, glueSurface);
        lattice.AddNode(glueNode, nodeEndIndex, nodeEndIndex + glueNode.Surface.Length);
        return;
    }
}
/**
 * Gets up to maxCount shortest paths with cost at most OPT + costSlack, where OPT is the
 * cost of the optimal solution. The results are ordered in ascending order by cost.
 * Resets the pathCosts, sidetracks and baseCost state of this searcher on every call.
 *
 * @param lattice an instance of ViterbiLattice processed by a ViterbiSearcher
 * @param maxCount the maximum number of results
 * @param costSlack the maximum cost slack of a path
 * @return the shortest paths and their costs
 */
public MultiSearchResult GetShortestPaths(ViterbiLattice lattice, int maxCount, int costSlack)
{
    // Start from clean per-search state.
    pathCosts = new List<int>();
    sidetracks = new Dictionary<ViterbiNode, MultiSearcher.SidetrackEdge>();
    MultiSearchResult searchResult = new MultiSearchResult();

    BuildSidetracks(lattice);

    ViterbiNode eos = lattice.EndIndexArr[0][0];
    baseCost = eos.PathCost;

    List<SidetrackEdge> sidetrackPaths = GetPaths(eos, maxCount, costSlack);

    // The n-th path is paired with the n-th entry of pathCosts.
    int rank = 0;
    foreach (SidetrackEdge sidetrackPath in sidetrackPaths)
    {
        LinkedList<ViterbiNode> pathNodes = GeneratePath(eos, sidetrackPath);
        searchResult.Add(pathNodes, pathCosts[rank]);
        rank++;
    }

    return searchResult;
}
/**
 * Builds a lattice from the input text: adds BOS, adds known and unknown word nodes for
 * each text position where a token ends, optionally adds user dictionary nodes, then adds EOS.
 *
 * @param text source text for the lattice
 * @return built lattice, not null
 */
public ViterbiLattice Build(string text)
{
    int textLength = text.Length;
    ViterbiLattice lattice = new ViterbiLattice(textLength + 2);
    lattice.AddBos();

    // Threaded through ProcessUnknownWord, which may advance it; -1 until then.
    int unknownWordEndIndex = -1;

    for (int startIndex = 0; startIndex < textLength; startIndex++)
    {
        // Skip this index if no token ends where the current token would start.
        if (!lattice.TokenEndsWhereCurrentTokenStarts(startIndex))
        {
            continue;
        }

        string suffix = text.Substring(startIndex);
        bool found = ProcessIndex(lattice, startIndex, suffix);

        // In the case of normal mode, unknown words are not processed greedily:
        // positions before the current unknown word end index are skipped.
        if (!searchMode && unknownWordEndIndex > startIndex)
        {
            continue;
        }

        int[] categories = characterDefinitions.LookupCategories(suffix[0]);
        for (int slot = 0; slot < categories.Length; slot++)
        {
            unknownWordEndIndex = ProcessUnknownWord(categories[slot], slot, lattice, unknownWordEndIndex, startIndex, suffix, found);
        }
    }

    if (useUserDictionary)
    {
        ProcessUserDictionary(text, lattice);
    }

    lattice.AddEos();
    return lattice;
}
/**
 * Calls BuildSidetracksForNode for every node that starts at a lattice position
 * (from 1 upwards) where nodes both start and end.
 *
 * @param lattice the lattice whose start and end index arrays are walked
 */
private void BuildSidetracks(ViterbiLattice lattice)
{
    ViterbiNode[][] nodesByStart = lattice.StartIndexArr;
    ViterbiNode[][] nodesByEnd = lattice.EndIndexArr;

    for (int position = 1; position < nodesByStart.Length; position++)
    {
        ViterbiNode[] starting = nodesByStart[position];
        if (starting == null)
        {
            continue;
        }

        ViterbiNode[] ending = nodesByEnd[position];
        if (ending == null)
        {
            continue;
        }

        for (int slot = 0; slot < starting.Length; slot++)
        {
            ViterbiNode node = starting[slot];
            if (node == null)
            {
                // The first null slot ends the processing of this position.
                break;
            }
            BuildSidetracksForNode(ending, node);
        }
    }
}
/**
 * Formats the nodes and edges of the lattice. For every position (from 1 upwards) that
 * has both ending and starting nodes, each ending node is emitted (if new) followed by
 * each starting node (if new) and the edge between the two.
 * Clears nodeMap and resets foundBOS before formatting.
 *
 * @param lattice the lattice to format
 * @return the formatted nodes and edges
 */
private string FormatNodes(ViterbiLattice lattice)
{
    ViterbiNode[][] nodesStartingAt = lattice.StartIndexArr;
    ViterbiNode[][] nodesEndingAt = lattice.EndIndexArr;
    this.nodeMap.Clear();
    this.foundBOS = false;

    StringBuilder output = new StringBuilder();

    for (int position = 1; position < nodesEndingAt.Length; position++)
    {
        ViterbiNode[] incoming = nodesEndingAt[position];
        if (incoming == null)
        {
            continue;
        }

        ViterbiNode[] outgoing = nodesStartingAt[position];
        if (outgoing == null)
        {
            continue;
        }

        foreach (ViterbiNode source in incoming)
        {
            // Null slots among the ending nodes are skipped, not treated as the end.
            if (source == null)
            {
                continue;
            }
            output.Append(FormatNodeIfNew(source));

            foreach (ViterbiNode target in outgoing)
            {
                // The first null slot among the starting nodes ends this inner pass.
                if (target == null)
                {
                    break;
                }
                output.Append(FormatNodeIfNew(target));
                output.Append(FormatEdge(source, target));
            }
        }
    }

    return output.ToString();
}
/**
 * Calls UpdateNode for every node that starts at a lattice position (from 1 upwards)
 * where nodes both start and end, passing the nodes ending at that position.
 *
 * @param lattice the lattice whose nodes are updated
 * @return the lattice's end index array
 */
private ViterbiNode[][] CalculatePathCosts(ViterbiLattice lattice)
{
    ViterbiNode[][] nodesByStart = lattice.StartIndexArr;
    ViterbiNode[][] nodesByEnd = lattice.EndIndexArr;

    for (int position = 1; position < nodesByStart.Length; position++)
    {
        // No node starts here, so there is nothing to update.
        ViterbiNode[] starting = nodesByStart[position];
        if (starting == null)
        {
            continue;
        }

        // No node ends here, so there is no previous node to connect from.
        ViterbiNode[] ending = nodesByEnd[position];
        if (ending == null)
        {
            continue;
        }

        for (int slot = 0; slot < starting.Length; slot++)
        {
            ViterbiNode node = starting[slot];
            if (node == null)
            {
                // The array holds no more nodes; move on to the next position.
                break;
            }
            UpdateNode(ending, node);
        }
    }

    return nodesByEnd;
}
/**
 * Reports whether the lattice lacks a connection on the right side of a newly inserted
 * entry, i.e. whether no node array is stored for the start position endIndex.
 *
 * @param endIndex the lattice position at which a following node would have to start
 * @param lattice the lattice to inspect
 * @return true if the lattice has no nodes starting at endIndex
 */
private bool IsLatticeBrokenAfter(int endIndex, ViterbiLattice lattice)
{
    return lattice.StartIndexArr[endIndex] == null;
}
/**
 * Reports whether the lattice lacks a connection on the left side of a newly inserted
 * entry, i.e. whether no node array is stored for the end position nodeIndex.
 *
 * @param nodeIndex the lattice position at which a preceding node would have to end
 * @param lattice the lattice to inspect
 * @return true if the lattice has no nodes ending at nodeIndex
 */
private bool IsLatticeBrokenBefore(int nodeIndex, ViterbiLattice lattice)
{
    return lattice.EndIndexArr[nodeIndex] == null;
}
/**
 * Formats the lattice without a best path; delegates to the two-argument overload
 * with null as the best path.
 *
 * @param lattice the lattice to format
 * @return the formatted lattice
 */
public string Format(ViterbiLattice lattice)
{
    return this.Format(lattice, null);
}