/// <summary>
/// Computes the minimum-weight path from node 0 to node <c>ToCount</c> and returns
/// the tokens along it in forward order.
/// <para/>
/// Nodes are processed in increasing order. For each node, the incoming edge (from
/// <c>GetToList</c>) that minimizes "best weight of the edge's source node + edge weight"
/// is recorded. The result is then rebuilt by following the recorded predecessors
/// back from the last node to node 0.
/// </summary>
/// <returns>The tokens of <c>segTokenList</c> indexed by the node ids on the path, first node first.</returns>
public virtual IList<SegToken> GetShortPath()
{
    int lastTarget = ToCount;
    IList<PathNode> best = new List<PathNode>();

    // Node 0 is the origin: zero accumulated weight, and it is its own predecessor.
    PathNode origin = new PathNode();
    origin.Weight = 0;
    origin.PreNode = 0;
    best.Add(origin);

    for (int node = 1; node <= lastTarget; node++)
    {
        double cheapest = double.MaxValue;
        SegTokenPair chosen = null;

        foreach (SegTokenPair candidate in GetToList(node))
        {
            double edgeWeight = candidate.Weight;
            // NOTE(review): indexing by candidate.From presumes the source node was already processed - confirm.
            double total = best[candidate.From].Weight + edgeWeight;
            if (total < cheapest)
            {
                cheapest = total;
                chosen = candidate;
            }
        }

        // If no edge was selected (e.g. the edge list was empty), 'chosen' is still null
        // here and reading chosen.From throws a NullReferenceException.
        PathNode entry = new PathNode();
        entry.Weight = cheapest;
        entry.PreNode = chosen.From;
        best.Add(entry);
    }

    // Walk the predecessor links from the last node back to node 0.
    IList<int> backwards = new List<int>();
    for (int cursor = best.Count - 1; ; cursor = best[cursor].PreNode)
    {
        backwards.Add(cursor);
        if (cursor == 0)
        {
            break;
        }
    }

    // The ids were collected end-to-start; emit the tokens start-to-end.
    IList<SegToken> shortest = new List<SegToken>();
    for (int k = backwards.Count - 1; k >= 0; k--)
    {
        shortest.Add(segTokenList[backwards[k]]);
    }
    return shortest;
}
/// <summary>
/// Determines whether <paramref name="obj"/> is a token of the same runtime type whose
/// <c>CharArray</c>, <c>EndOffset</c>, <c>Index</c>, <c>StartOffset</c>, <c>Weight</c> and
/// <c>WordType</c> all match this instance.
/// <para/>
/// <see cref="object.Equals(object)"/>
/// </summary>
/// <param name="obj">The object to compare with; may be <c>null</c>.</param>
/// <returns><c>true</c> if the objects are the same instance or equal member-by-member; otherwise <c>false</c>.</returns>
public override bool Equals(object obj)
{
    // Same instance: nothing else to compare.
    if (object.ReferenceEquals(this, obj))
    {
        return true;
    }

    // Null or a different runtime type can never be equal.
    if (object.ReferenceEquals(obj, null) || GetType() != obj.GetType())
    {
        return false;
    }

    SegToken that = (SegToken)obj;

    // Character content is compared first, then the scalar members.
    return Arrays.Equals(CharArray, that.CharArray)
        && EndOffset == that.EndOffset
        && Index == that.Index
        && StartOffset == that.StartOffset
        && Weight == that.Weight
        && WordType == that.WordType;
}
/// <summary>
/// Registers a <see cref="SegToken"/> under its start offset. If no token list exists yet
/// for that offset, a new list containing only this token is created; otherwise the token
/// is appended to the existing list. The largest start offset seen so far is tracked.
/// </summary>
/// <param name="token">The <see cref="SegToken"/> to add.</param>
public virtual void AddToken(SegToken token)
{
    int start = token.StartOffset;

    if (IsStartExist(start))
    {
        // A bucket already exists for this offset: append to it.
        tokenListTable[start].Add(token);
    }
    else
    {
        // First token starting at this offset: create its bucket.
        tokenListTable[start] = new List<SegToken> { token };
    }

    // Keep track of the highest start offset added.
    if (maxStart < start)
    {
        maxStart = start;
    }
}
/// <summary>
/// Create the <see cref="SegGraph"/> for a sentence.
/// <para/>
/// The sentence is scanned left to right. Depending on the character type at the current
/// position, one or more <see cref="SegToken"/>s are added to the graph: every dictionary
/// phrase starting at a Hanzi, one token per run of letters, one token per run of digits,
/// one token per delimiter, and one token per unrecognized character. Space-like characters
/// are skipped. Finally a sentence-begin token (offsets -1..0) and a sentence-end token
/// (offsets length..length+1) are added.
/// </summary>
/// <param name="sentence">input sentence, without start and end markers</param>
/// <returns><see cref="SegGraph"/> corresponding to the input sentence.</returns>
private SegGraph CreateSegGraph(string sentence)
{
    int i = 0, j;
    int length = sentence.Length;
    int foundIndex;
    CharType[] charTypeArray = GetCharTypes(sentence);
    StringBuilder wordBuf = new StringBuilder();
    SegToken token;
    int frequency = 0; // the number of times word appears.
    bool hasFullWidth;
    WordType wordType;
    char[] charArray;

    SegGraph segGraph = new SegGraph();
    while (i < length)
    {
        hasFullWidth = false;
        switch (charTypeArray[i])
        {
            case CharType.SPACE_LIKE:
                i++;
                break;

            case CharType.HANZI:
                j = i + 1;
                wordBuf.Remove(0, wordBuf.Length);
                // It doesn't matter if a single Chinese character (Hanzi) can form a phrase or not,
                // it will store that single Chinese character (Hanzi) in the SegGraph. Otherwise, it will
                // cause word division.
                wordBuf.Append(sentence[i]);
                charArray = new char[] { sentence[i] };
                frequency = wordDict.GetFrequency(charArray);
                token = new SegToken(charArray, i, j, WordType.CHINESE_WORD, frequency);
                segGraph.AddToken(token);

                foundIndex = wordDict.GetPrefixMatch(charArray);
                while (j <= length && foundIndex != -1)
                {
                    if (wordDict.IsEqual(charArray, foundIndex) && charArray.Length > 1)
                    {
                        // It is the phrase we are looking for; In other words, we have found a phrase SegToken
                        // from i to j. It is not a monosyllabic word (single word).
                        frequency = wordDict.GetFrequency(charArray);
                        token = new SegToken(charArray, i, j, WordType.CHINESE_WORD, frequency);
                        segGraph.AddToken(token);
                    }

                    // Space-like characters inside a candidate phrase are skipped over.
                    while (j < length && charTypeArray[j] == CharType.SPACE_LIKE)
                    {
                        j++;
                    }

                    if (j < length && charTypeArray[j] == CharType.HANZI)
                    {
                        wordBuf.Append(sentence[j]);
                        charArray = new char[wordBuf.Length];
                        wordBuf.CopyTo(0, charArray, 0, charArray.Length);
                        // charArray has been found (foundIndex != -1) as a prefix before.
                        // Therefore, charArray after it has been lengthened can only appear after foundIndex.
                        // So start searching after foundIndex.
                        foundIndex = wordDict.GetPrefixMatch(charArray, foundIndex);
                        j++;
                    }
                    else
                    {
                        break;
                    }
                }
                // Only advance by one so that phrases starting at the next character are found too.
                i++;
                break;

            case CharType.FULLWIDTH_LETTER:
                hasFullWidth = true;
                // C# has no implicit fall-through; jump explicitly to share the LETTER handling.
                goto case CharType.LETTER;

            case CharType.LETTER:
                j = i + 1;
                while (j < length
                    && (charTypeArray[j] == CharType.LETTER || charTypeArray[j] == CharType.FULLWIDTH_LETTER))
                {
                    if (charTypeArray[j] == CharType.FULLWIDTH_LETTER)
                    {
                        hasFullWidth = true;
                    }
                    j++;
                }
                // Found a Token from i to j. Type is LETTER char string.
                // The token carries the shared Utility.STRING_CHAR_ARRAY, not the matched characters.
                charArray = Utility.STRING_CHAR_ARRAY;
                frequency = wordDict.GetFrequency(charArray);
                wordType = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING;
                token = new SegToken(charArray, i, j, wordType, frequency);
                segGraph.AddToken(token);
                i = j;
                break;

            case CharType.FULLWIDTH_DIGIT:
                hasFullWidth = true;
                // C# has no implicit fall-through; jump explicitly to share the DIGIT handling.
                goto case CharType.DIGIT;

            case CharType.DIGIT:
                j = i + 1;
                while (j < length
                    && (charTypeArray[j] == CharType.DIGIT || charTypeArray[j] == CharType.FULLWIDTH_DIGIT))
                {
                    if (charTypeArray[j] == CharType.FULLWIDTH_DIGIT)
                    {
                        hasFullWidth = true;
                    }
                    j++;
                }
                // Found a Token from i to j. Type is NUMBER char string.
                // The token carries the shared Utility.NUMBER_CHAR_ARRAY, not the matched characters.
                charArray = Utility.NUMBER_CHAR_ARRAY;
                frequency = wordDict.GetFrequency(charArray);
                wordType = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER;
                token = new SegToken(charArray, i, j, wordType, frequency);
                segGraph.AddToken(token);
                i = j;
                break;

            case CharType.DELIMITER:
                j = i + 1;
                // No need to search the weight for the punctuation. Picking the highest frequency will work.
                frequency = Utility.MAX_FREQUENCE;
                charArray = new char[] { sentence[i] };
                token = new SegToken(charArray, i, j, WordType.DELIMITER, frequency);
                segGraph.AddToken(token);
                i = j;
                break;

            default:
                j = i + 1;
                // Treat the unrecognized char symbol as unknown string.
                // For example, any symbol not in GB2312 is treated as one of these.
                charArray = Utility.STRING_CHAR_ARRAY;
                frequency = wordDict.GetFrequency(charArray);
                token = new SegToken(charArray, i, j, WordType.STRING, frequency);
                segGraph.AddToken(token);
                i = j;
                break;
        }
    }

    // Add two more Tokens: "beginning xx beginning"
    charArray = Utility.START_CHAR_ARRAY;
    frequency = wordDict.GetFrequency(charArray);
    token = new SegToken(charArray, -1, 0, WordType.SENTENCE_BEGIN, frequency);
    segGraph.AddToken(token);

    // "end xx end"
    charArray = Utility.END_CHAR_ARRAY;
    frequency = wordDict.GetFrequency(charArray);
    token = new SegToken(charArray, length, length + 1, WordType.SENTENCE_END, frequency);
    segGraph.AddToken(token);

    return segGraph;
}