/// <summary>
/// Generate a <see cref="BiSegGraph"/> based upon a <see cref="SegGraph"/>.
/// <para>
/// Every token of <paramref name="segGraph"/> is paired with the tokens that start at the
/// nearest following start position, and each pair is registered through
/// <c>AddSegTokenPair</c> with a smoothed bigram weight.
/// </para>
/// </summary>
/// <param name="segGraph">The token graph whose adjacent tokens are linked into pairs.</param>
private void GenerateBiSegGraph(SegGraph segGraph)
{
    double smooth = 0.1;
    int maxStart = segGraph.MaxStart;
    double tinyDouble = 1.0 / Utility.MAX_FREQUENCE;

    // Ordered and indexed list of the graph's tokens.
    segTokenList = segGraph.MakeIndex();

    // The start token begins at position -1, so the scan has to begin at key = -1.
    for (int key = -1; key < maxStart; key++)
    {
        if (!segGraph.IsStartExist(key))
        {
            continue;
        }

        // Process every token that starts at this key.
        foreach (SegToken t1 in segGraph.GetStartList(key))
        {
            double oneWordFreq = t1.Weight;

            // Look for the closest start position at or after the end of t1 that has tokens.
            // For example, in "Sunny seashore" the token "sunny" is followed by "sea" or "seashore".
            // The end token starts at the sentence length, so the bound is inclusive.
            IList<SegToken> nextTokens = null;
            for (int next = t1.EndOffset; next <= maxStart; next++)
            {
                if (segGraph.IsStartExist(next))
                {
                    nextTokens = segGraph.GetStartList(next);
                    break;
                }
            }

            if (nextTokens == null)
            {
                // Nothing follows t1: stop processing the remaining tokens of this key.
                break;
            }

            foreach (SegToken t2 in nextTokens)
            {
                var head = t1.CharArray;
                var tail = t2.CharArray;

                // Pair id = chars of t1 + separator + chars of t2.
                char[] idBuffer = new char[head.Length + tail.Length + 1];
                head.CopyTo(idBuffer, 0);
                idBuffer[head.Length] = BigramDictionary.WORD_SEGMENT_CHAR;
                tail.CopyTo(idBuffer, head.Length + 1);

                // Frequency of the two words appearing linked together.
                int wordPairFreq = bigramDict.GetFrequency(idBuffer);

                // Smoothing: -log{a*P(Ci-1) + (1-a)*P(Ci|Ci-1)}, with 0 < a < 1.
                double weight = -Math.Log(
                    smooth * (1.0 + oneWordFreq) / (Utility.MAX_FREQUENCE + 0.0)
                    + (1.0 - smooth) * ((1.0 - tinyDouble) * wordPairFreq / (1.0 + oneWordFreq) + tinyDouble));

                this.AddSegTokenPair(new SegTokenPair(idBuffer, t1.Index, t2.Index, weight));
            }
        }
    }
}
/// <summary>
/// Creates a <see cref="BiSegGraph"/> from the given <see cref="SegGraph"/>:
/// stores the graph's indexed token list, then generates the token pairs.
/// </summary>
/// <param name="segGraph">The token graph to build the pair graph from.</param>
public BiSegGraph(SegGraph segGraph)
{
    this.segTokenList = segGraph.MakeIndex();
    this.GenerateBiSegGraph(segGraph);
}