Example #1
0
        /// <summary>
        /// Save the gram data into stream.
        /// </summary>
        /// <remarks>
        /// Layout written by this method, in order:
        /// language id (ushort), max gram order (ushort), probability amplifier (int),
        /// number of lower-order states (uint), number of final-order states (uint),
        /// offset of the grapheme dictionary (uint), offset of the lower-order states (uint),
        /// offset of the final-order states (uint), the grapheme trie data padded to an even
        /// number of bytes, the lower-order states (GraphId, Prob, Backoff, ReferenceIndex)
        /// and finally the final-order states (GraphId, Prob).
        /// The stream is flushed but left open; the caller keeps ownership of it.
        /// </remarks>
        /// <param name="stream">Binary stream.</param>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="stream"/> is null.</exception>
        /// <exception cref="InvalidDataException">
        /// Thrown when there is no nGram data, or when the last grapheme of a grammar
        /// cannot be found in the grapheme dictionary.
        /// </exception>
        public void SaveToBinary(Stream stream)
        {
            if (stream == null)
            {
                throw new ArgumentNullException("stream");
            }

            if (_grammarCount == 0)
            {
                throw new InvalidDataException("There is no nGram data");
            }

            using (TrieTree graphemeTrieTree = new TrieTree(_graphemeDictionary))
            {
                byte[] graphemeDictData = graphemeTrieTree.GetTrieData();
                int graphemeDictLength = graphemeDictData.Length;

                // Keep data alignment as 2 bytes
                if (graphemeDictLength % 2 != 0)
                {
                    graphemeDictLength++;
                }

                // One slot for the root state (index 0), one for the empty state that is
                // inserted in front of the final-order grams, plus one per grammar.
                int totalStateCount = _grammarCount + 2;

                // Save NGramData
                GrammarState[] grammarStates = new GrammarState[totalStateCount];
                grammarStates[0] = new GrammarState();
                grammarStates[0].ReferenceIndex = 1;

                Dictionary<string, int> grammarIndex = new Dictionary<string, int>();
                int stateIndex = 1;
                int finalGramStateIndex = 0;
                for (int gram = 1; gram <= _maxNgram; gram++)
                {
                    string lastReferredGrammar = string.Empty;
                    if (gram == _maxNgram)
                    {
                        // The final-order section starts with an empty state; the relative
                        // reference indexes computed below are counted from this position.
                        finalGramStateIndex = stateIndex;
                        grammarStates[stateIndex++] = new GrammarState();
                    }

                    foreach (string grammar in _nGramData[gram].Keys)
                    {
                        string[] graphemes = grammar.Split(GrammarSeparator, StringSplitOptions.RemoveEmptyEntries);

                        // last grapheme
                        string lastGrapheme = graphemes[graphemes.Length - 1];
                        int len = 0;
                        int graphemeId = graphemeTrieTree.FindLongest(lastGrapheme, out len);
                        if (graphemeId == -1)
                        {
                            // Fail loudly instead of casting -1 into GrapId and writing a
                            // corrupted model when assertions are compiled out.
                            throw new InvalidDataException(
                                "Grapheme [" + lastGrapheme + "] is not found in the grapheme dictionary");
                        }

                        Debug.Assert(!grammarIndex.ContainsKey(grammar));

                        // Save the state index for easily query
                        grammarIndex.Add(grammar, stateIndex);
                        grammarStates[stateIndex] = new GrammarState();
                        grammarStates[stateIndex].GraphId = (GrapId)graphemeId;

                        // Look the entry up once instead of once per field access.
                        var gramEntry = _nGramData[gram][grammar];

                        // Convert the probability into ProbabilityInt type with amplifier.
                        // Only the lower bound is clamped.
                        if (gramEntry.Probability * _probabilityAmplifier < ProbabilityInt.MinValue)
                        {
                            grammarStates[stateIndex].Prob = ProbabilityInt.MinValue;
                        }
                        else
                        {
                            grammarStates[stateIndex].Prob = (ProbabilityInt)(gramEntry.Probability * _probabilityAmplifier);
                        }

                        // Same conversion for the backoff, using the same bound as the value
                        // that is stored when the clamp applies.
                        if (gramEntry.Backoff * _probabilityAmplifier < ProbabilityInt.MinValue)
                        {
                            grammarStates[stateIndex].Backoff = ProbabilityInt.MinValue;
                        }
                        else
                        {
                            grammarStates[stateIndex].Backoff = (ProbabilityInt)(gramEntry.Backoff * _probabilityAmplifier);
                        }

                        // set the reference index for lower level gram data
                        if (gram != 1)
                        {
                            // The referred grammar is this grammar without its last grapheme.
                            string referredGrammar = graphemes[0];
                            for (int i = 1; i < graphemes.Length - 1; i++)
                            {
                                referredGrammar = referredGrammar + " " + graphemes[i];
                            }

                            // Only the first grammar of a run sharing the same prefix updates
                            // the prefix state.
                            // NOTE(review): this presumes grammars with a common prefix are
                            // enumerated consecutively by _nGramData[gram].Keys - confirm.
                            if (!referredGrammar.Equals(lastReferredGrammar, StringComparison.Ordinal))
                            {
                                // Update the reference index for the lower level gram
                                lastReferredGrammar = referredGrammar;
                                Debug.Assert(grammarIndex.ContainsKey(lastReferredGrammar));
                                int referredIndex = grammarIndex[lastReferredGrammar];
                                Debug.Assert(grammarStates[referredIndex] != null);
                                if (gram != _maxNgram)
                                {
                                    grammarStates[referredIndex].ReferenceIndex = (ReferenceIndex)stateIndex;
                                }
                                else
                                {
                                    // References into the final-order section are relative
                                    // to the start of that section.
                                    grammarStates[referredIndex].ReferenceIndex = (ReferenceIndex)(stateIndex - finalGramStateIndex);
                                }
                            }
                        }

                        stateIndex++;
                    }
                }

                // Save the model into binary stream.
                // The writer is intentionally not disposed: disposing a BinaryWriter created
                // with this constructor closes the underlying stream, which belongs to the
                // caller. It is flushed explicitly once everything has been written.
                BinaryWriter bw = new BinaryWriter(stream);

                // Write the language ID
                bw.Write((ushort)_language);

                // Write the Gram Count
                bw.Write((ushort)this._maxNgram);

                // Write the Probability Amplifier
                bw.Write((int)_probabilityAmplifier);

                // Write the grammar state number (states in front of the final-order section)
                bw.Write((uint)finalGramStateIndex);

                // Write the Final grammar state number
                bw.Write((uint)(totalStateCount - finalGramStateIndex));

                // Two ushort fields, one int field and five uint fields (two counts, three offsets).
                int headerSize = sizeof(ushort) + sizeof(ushort) + sizeof(int) +
                    sizeof(uint) + sizeof(uint) +
                    sizeof(uint) + sizeof(uint) + sizeof(uint);

                // Write the offset of Dictionary
                bw.Write((uint)headerSize);

                // Write the offset of Grammar State
                bw.Write((uint)(headerSize + graphemeDictLength));

                // Write the offset of Final Grammar State
                bw.Write((uint)(headerSize + graphemeDictLength + (finalGramStateIndex *
                    (sizeof(GrapId) + sizeof(ProbabilityInt) + sizeof(ProbabilityInt) + sizeof(ReferenceIndex)))));

                // Write the grapheme Trie Dictionary
                bw.Write(graphemeDictData, 0, graphemeDictData.Length);

                // Add the data alignment for grapheme Trie Dictionary
                for (int i = graphemeDictData.Length; i < graphemeDictLength; i++)
                {
                    bw.Write((byte)0);
                }

                // Write the grammar states for low level gram
                for (int i = 0; i < finalGramStateIndex; i++)
                {
                    bw.Write(grammarStates[i].GraphId);
                    bw.Write(grammarStates[i].Prob);
                    bw.Write(grammarStates[i].Backoff);
                    bw.Write(grammarStates[i].ReferenceIndex);
                }

                // Write the grammar state for final level gram (no backoff / reference index)
                for (int i = finalGramStateIndex; i < totalStateCount; i++)
                {
                    bw.Write(grammarStates[i].GraphId);
                    bw.Write(grammarStates[i].Prob);
                }

                // Push everything through to the underlying stream without closing it.
                bw.Flush();
            }
        }