Exemple #1
0
        // Can only read file of Model1 format for now
        /// <summary>
        /// Loads the model from a binary stream. The data is read in this order: format
        /// version, four bit-width fields, the accent-string lookup table, the count
        /// lookup table, the serialized hash function, and one bit-packed record per
        /// hash slot. The stream is consumed strictly forward, so it does not need to
        /// be seekable.
        /// </summary>
        /// <param name="br">Reader positioned at the first byte of the model data.</param>
        /// <exception cref="ArgumentNullException"><paramref name="br"/> is null.</exception>
        /// <exception cref="InvalidDataException">
        /// The version is not 1, or the stored width of the per-slot entry count is outside 0..31.
        /// </exception>
        /// <exception cref="EndOfStreamException">The stream ends before all expected bytes were read.</exception>
        public void ReadFromBinary(BinaryReader br)
        {
            if (br == null)
            {
                throw new ArgumentNullException("br");
            }

            int version = br.ReadInt32();

            if (version != 1)
            {
                throw new InvalidDataException(
                    "Unsupported model format version " + version + "; only version 1 can be read.");
            }

            // Read number of bits for each number type
            m_NumBitPerAccStringIndex  = br.ReadInt32();
            m_NumBitPerAccCountIndex   = br.ReadInt32();
            m_NumBitPerHashEntryCount  = br.ReadInt32();
            m_NumBitPerAccStringLength = br.ReadInt32();

            // The entry count of each slot is decoded into a non-negative int below,
            // so its bit width has to fit in 31 bits.
            if (m_NumBitPerHashEntryCount < 0 || m_NumBitPerHashEntryCount > 31)
            {
                throw new InvalidDataException(
                    "Invalid hash entry count width: " + m_NumBitPerHashEntryCount + " bit(s).");
            }

            // Read look up table of accent codes. Each string is stored as a one-byte
            // length followed by that many bytes; all strings are concatenated into
            // m_PredictionAsciiBytes and m_PredictionAsciiIndex[i] is the start offset
            // of string i (with one extra trailing entry holding the total length).
            m_NumAccString = br.ReadInt32();

            var asciiBytes = new List<byte>(m_NumAccString * 2);

            m_PredictionAsciiIndex = new int[m_NumAccString + 1];

            int index = 0;

            for (int i = 0; i < m_NumAccString; i++)
            {
                byte stringLength = br.ReadByte();

                m_PredictionAsciiIndex[i] = index;
                index += stringLength;

                asciiBytes.AddRange(ReadExactBytes(br, stringLength));
            }
            m_PredictionAsciiIndex[m_NumAccString] = index;

            m_PredictionAsciiBytes = asciiBytes.ToArray();

            // Read look up table of accent code counts
            int numAccCodeCount = br.ReadInt32();

            m_PredictionCounts = new int[numAccCodeCount];
            for (int i = 0; i < numAccCodeCount; i++)
            {
                m_PredictionCounts[i] = br.ReadInt32();
            }

            // Deserialize hash function
            int    hashFunctionBytesLength = br.ReadInt32();
            byte[] hashFunctionBytes       = ReadExactBytes(br, hashFunctionBytesLength);

            // NOTE(security): BinaryFormatter.Deserialize is unsafe on untrusted input and
            // is obsolete in current .NET. It is kept here because the payload format is
            // defined by the writer; only load model files from a trusted source.
            using (var hashFunctionStream = new MemoryStream(hashFunctionBytes))
            {
                var formatter = new BinaryFormatter();

                m_HashFunction = (MinPerfectHash)formatter.Deserialize(hashFunctionStream);
            }

            // Read bytes for hash codes
            int hashSize = (int)m_HashFunction.N;

            var hashTableByteList = new List<byte>(hashSize * 4);

            m_HashTableByteIndex = new int[hashSize + 1];
            index = 0;

            int numBitPerElement = m_NumBitPerAccStringIndex + m_NumBitPerAccCountIndex;

            // Number of whole bytes that hold the leading entry-count field of a slot record.
            int numCountBytes = (m_NumBitPerHashEntryCount + 7) / 8;

            for (int i = 0; i < hashSize; i++)
            {
                byte[] countBytes = ReadExactBytes(br, numCountBytes);

                // The count is stored most-significant-bit first, while bits are packed
                // into each byte starting from the least significant bit.
                int count = 0;
                for (int ib = 0; ib < m_NumBitPerHashEntryCount; ib++)
                {
                    int bit = (countBytes[ib / 8] >> (ib % 8)) & 1;

                    count |= bit << (m_NumBitPerHashEntryCount - ib - 1);
                }

                if (count == 0)
                {
                    // Empty slot: its header bytes are consumed but not stored.
                    // NOTE(review): m_HashTableByteIndex[i] stays at its default (0) for
                    // empty slots, exactly as before - confirm lookups never rely on it.
                    continue;
                }

                int totalNumBits = m_NumBitPerHashEntryCount + count * numBitPerElement;
                int numBytes     = (totalNumBits + 7) / 8; // integer ceiling of totalNumBits / 8

                m_HashTableByteIndex[i] = index;

                index += numBytes;

                // The stored record includes the header bytes that were already read,
                // so they are appended first instead of rewinding the stream.
                hashTableByteList.AddRange(countBytes);
                hashTableByteList.AddRange(ReadExactBytes(br, numBytes - numCountBytes));
            }
            m_HashTableByteIndex[hashSize] = index;
            m_HashTableBytes = hashTableByteList.ToArray();
        }

        // Reads exactly 'count' bytes from the reader. BinaryReader.ReadBytes returns a
        // shorter array when the stream runs out; that case is reported as an error here
        // so a truncated file cannot produce a silently inconsistent model.
        private static byte[] ReadExactBytes(BinaryReader br, int count)
        {
            byte[] buffer = br.ReadBytes(count);

            if (buffer.Length != count)
            {
                throw new EndOfStreamException(
                    "Model data ended unexpectedly: expected " + count +
                    " byte(s) but only " + buffer.Length + " were available.");
            }

            return buffer;
        }
Exemple #2
0
        /// <summary>
        /// Writes the model to a binary stream in this order: format version, four
        /// bit-width fields, the accent-string lookup table, the count lookup table,
        /// the serialized hash function, and one bit-packed, byte-aligned record per
        /// hash slot. As a side effect, m_HashFunction and m_HashTable are rebuilt
        /// from m_Map before anything is written.
        /// </summary>
        /// <param name="bw">Writer that receives the model data.</param>
        /// <exception cref="ArgumentNullException"><paramref name="bw"/> is null.</exception>
        /// <exception cref="InvalidOperationException">
        /// An accent string encodes to more than 255 bytes and therefore cannot be
        /// stored behind its one-byte length prefix.
        /// </exception>
        public void WriteToBinary(BinaryWriter bw)
        {
            if (bw == null)
            {
                throw new ArgumentNullException("bw");
            }

            m_HashFunction = PerfectHash.MPH.MinPerfectHash.Create(new RawStringKeySource(m_Map.Keys.ToList()), c: 1.0);
            m_HashTable    = Enumerable.Repeat <Dictionary <string, int> >(null, (int)m_HashFunction.N).ToList();

            // Place each raw string's accent table into the slot chosen by the hash function.
            foreach (var mapEntry in m_Map)
            {
                int hashCode = (int)m_HashFunction.Search(Encoding.UTF8.GetBytes(mapEntry.Key));

                m_HashTable[hashCode] = mapEntry.Value;
            }

            var converter = new Utility.VietConverter();

            // Collect the distinct accent strings and counts, and the largest slot size,
            // to decide how many bits each field needs.
            var uniqueAccentStrings   = new HashSet <string>();
            var uniqueAccentCodeCount = new HashSet <int>();
            int maxHashEntryCount     = 0;

            foreach (Dictionary <string, int> accCodeToCount in m_Map.Values)
            {
                foreach (var accEntry in accCodeToCount)
                {
                    uniqueAccentStrings.Add(accEntry.Key);
                    uniqueAccentCodeCount.Add(accEntry.Value);
                }

                maxHashEntryCount = Math.Max(maxHashEntryCount, accCodeToCount.Count);
            }
            int numBitPerAccStringIndex = GetRepresentativeBitCount(uniqueAccentStrings.Count);
            int numBitPerAccCountIndex  = GetRepresentativeBitCount(uniqueAccentCodeCount.Count);
            int numBitPerHashEntryCount = GetRepresentativeBitCount(maxHashEntryCount);

            bw.Write(Version);                 // Write version number
            bw.Write(numBitPerAccStringIndex); // How many bits to represent an accent code
            bw.Write(numBitPerAccCountIndex);  // How many bits to represent an accent code's count
            bw.Write(numBitPerHashEntryCount); // How many bits to represent the count of elements in each hash entry

            // Assign every accent string its position in the lookup table.
            var accStringList    = new List <string>();
            var accStringToIndex = new Dictionary <string, int>();
            int index            = 0;

            foreach (string accString in uniqueAccentStrings)
            {
                accStringList.Add(accString);
                accStringToIndex.Add(accString, index);
                index++;
            }

            // Max() throws on an empty sequence, so an empty table is treated as length 0.
            int maxAccStringLength       = accStringList.Count == 0 ? 0 : accStringList.Max(s => s.Length);
            int numBitPerAccStringLength = GetRepresentativeBitCount(maxAccStringLength);

            bw.Write(numBitPerAccStringLength);

            // Write ascii accent codes lookup table
            bw.Write(accStringList.Count);
            foreach (string accString in accStringList)
            {
                string asciiString = String.Join(String.Empty, accString.Select(c => (char)converter.AccentToAsciiMap[c]));
                byte[] asciiBytes  = System.Text.Encoding.ASCII.GetBytes(asciiString);

                // The length prefix is a single byte; refuse to write a truncated length.
                if (asciiBytes.Length > byte.MaxValue)
                {
                    throw new InvalidOperationException(
                        "Accent string is too long to serialize: " + asciiBytes.Length +
                        " bytes, but the length prefix holds at most " + byte.MaxValue + ".");
                }

                bw.Write((byte)asciiBytes.Length);
                bw.Write(asciiBytes);
            }

            // Assign every distinct count its position in the lookup table.
            var accCodeCountList    = new List <int>();
            var accCodeCountToIndex = new Dictionary <int, int>();

            index = 0;
            foreach (int accCodeCount in uniqueAccentCodeCount)
            {
                accCodeCountList.Add(accCodeCount);
                accCodeCountToIndex.Add(accCodeCount, index);
                index++;
            }
            // Write unique accent code counts lookup table
            bw.Write(accCodeCountList.Count);
            foreach (int accCodeCount in accCodeCountList)
            {
                bw.Write(accCodeCount);
            }

            // Serialize hash function
            // NOTE(security): BinaryFormatter is obsolete in current .NET and its payloads
            // are unsafe to deserialize from untrusted sources. It is kept because it
            // defines the existing on-disk format.
            byte[] hashFunctionBytes;
            using (var hashFunctionStream = new MemoryStream())
            {
                var formatter = new BinaryFormatter();

                formatter.Serialize(hashFunctionStream, m_HashFunction);
                hashFunctionBytes = hashFunctionStream.ToArray();
            }
            bw.Write(hashFunctionBytes.Length);
            bw.Write(hashFunctionBytes);

            var bitList = new List <bool>();

            foreach (var slot in m_HashTable)
            {
                // Write number of entries, using the minimal number of bits
                int count = slot != null ? slot.Count : 0;

                AppendBits(bitList, count, numBitPerHashEntryCount);

                if (slot != null)
                {
                    foreach (var accEntry in slot)
                    {
                        // Write accent code's index in the lookup table, using the minimal number of bits
                        AppendBits(bitList, accStringToIndex[accEntry.Key], numBitPerAccStringIndex);

                        // Write accent code count's index in the lookup table, using the minimal number of bits
                        AppendBits(bitList, accCodeCountToIndex[accEntry.Value], numBitPerAccCountIndex);
                    }
                }

                // Pad with zero bits so every slot record starts on a byte boundary
                while (bitList.Count % 8 != 0)
                {
                    bitList.Add(false);
                }

                // BitArray packs bit k into byte k / 8 at bit position k % 8.
                BitArray ba    = new BitArray(bitList.ToArray());
                byte[]   bytes = new byte[bitList.Count / 8];
                ba.CopyTo(bytes, 0);

                bw.Write(bytes);

                bitList.Clear();
            }
        }

        // Appends the lowest 'bitCount' bits of 'value' to 'bits', most significant bit first.
        private static void AppendBits(List <bool> bits, int value, int bitCount)
        {
            for (int ib = bitCount - 1; ib >= 0; ib--)
            {
                bits.Add(((value >> ib) & 1) == 1);
            }
        }