Beispiel #1
0
        /// <summary>
        /// Serializes the model held in <c>m_Map</c> to <paramref name="bw"/>.
        /// </summary>
        /// <remarks>
        /// Layout, in write order:
        /// 1. <c>Version</c>.
        /// 2. Four ints: the bit widths used for accent string indices, accent count indices,
        ///    per-entry element counts and accent string lengths.
        /// 3. Accent string table: an int count, then per string one length byte followed by the
        ///    string mapped through <c>AccentToAsciiMap</c> and encoded as ASCII.
        /// 4. Accent count table: an int count, then every distinct count as an int.
        /// 5. The number of map entries, followed by every map key in ascending key order.
        /// 6. One bit-packed record per map entry, in the same order as the keys: the element
        ///    count, then for each element its accent string index and its count index. Every
        ///    value is emitted most-significant bit first and each record is zero-padded to a
        ///    whole number of bytes.
        /// </remarks>
        /// <param name="bw">Writer that receives the serialized model.</param>
        /// <exception cref="ArgumentNullException"><paramref name="bw"/> is null.</exception>
        /// <exception cref="InvalidOperationException">
        /// An accent string needs more than 255 ASCII bytes and therefore cannot be stored behind
        /// the one-byte length prefix.
        /// </exception>
        public void WriteToBinary(BinaryWriter bw)
        {
            if (bw == null)
            {
                throw new ArgumentNullException("bw");
            }

            var converter = new Utility.VietConverter();

            // Gather the distinct accent strings and counts, and the largest entry size
            var uniqueAccentStrings   = new HashSet <string>();
            var uniqueAccentCodeCount = new HashSet <int>();
            int maxHashEntryCount     = 0;

            foreach (Dictionary <string, int> accCodeToCount in m_Map.Values)
            {
                foreach (var accEntry in accCodeToCount)
                {
                    uniqueAccentStrings.Add(accEntry.Key);
                    uniqueAccentCodeCount.Add(accEntry.Value);
                }

                maxHashEntryCount = Math.Max(maxHashEntryCount, accCodeToCount.Count);
            }
            int numBitPerAccStringIndex = GetRepresentativeBitCount(uniqueAccentStrings.Count);
            int numBitPerAccCountIndex  = GetRepresentativeBitCount(uniqueAccentCodeCount.Count);
            int numBitPerHashEntryCount = GetRepresentativeBitCount(maxHashEntryCount);

            bw.Write(Version);                 // Write version number
            bw.Write(numBitPerAccStringIndex); // How many bits to represent an accent code
            bw.Write(numBitPerAccCountIndex);  // How many bits to represent an accent code's count
            bw.Write(numBitPerHashEntryCount); // How many bits to represent the count of elements in each hash entry

            // Assign every distinct accent string a table index. The longest length is tracked here
            // (starting from 0) so that an empty model does not fail on Max() of an empty sequence.
            var accStringList       = new List <string>(uniqueAccentStrings.Count);
            var accStringToIndex    = new Dictionary <string, int>(uniqueAccentStrings.Count);
            int maxAccStringLength  = 0;

            foreach (string accString in uniqueAccentStrings)
            {
                accStringToIndex.Add(accString, accStringList.Count);
                accStringList.Add(accString);
                maxAccStringLength = Math.Max(maxAccStringLength, accString.Length);
            }
            int numBitPerAccStringLength = GetRepresentativeBitCount(maxAccStringLength);

            bw.Write(numBitPerAccStringLength);

            // Write ascii accent codes lookup table
            bw.Write(accStringList.Count);
            foreach (string accString in accStringList)
            {
                string asciiString = String.Join(String.Empty, accString.Select(c => (char)converter.AccentToAsciiMap[c]));
                byte[] asciiBytes  = System.Text.Encoding.ASCII.GetBytes(asciiString);

                // The length prefix is a single byte; casting a larger length would silently
                // truncate it and produce a file that cannot be read back.
                if (asciiBytes.Length > byte.MaxValue)
                {
                    throw new InvalidOperationException(
                              "Accent string of " + asciiBytes.Length + " bytes does not fit the one-byte length prefix.");
                }

                bw.Write((byte)asciiBytes.Length);
                bw.Write(asciiBytes);
            }

            // Assign every distinct accent count a table index
            var accCodeCountList    = new List <int>(uniqueAccentCodeCount.Count);
            var accCodeCountToIndex = new Dictionary <int, int>(uniqueAccentCodeCount.Count);

            foreach (int accCodeCount in uniqueAccentCodeCount)
            {
                accCodeCountToIndex.Add(accCodeCount, accCodeCountList.Count);
                accCodeCountList.Add(accCodeCount);
            }
            // Write unique accent code counts lookup table
            bw.Write(accCodeCountList.Count);
            foreach (int accCodeCount in accCodeCountList)
            {
                bw.Write(accCodeCount);
            }

            // Write map
            bw.Write(m_Map.Count);

            // Materialize once: the ordered sequence is walked twice and OrderBy is deferred,
            // so enumerating it directly would sort the map two times.
            var orderedMap = m_Map.OrderBy(kvp => kvp.Key).ToList();

            foreach (var kvp in orderedMap)
            {
                bw.Write(kvp.Key);
            }

            var bitList = new List <bool>();

            foreach (var kvp in orderedMap)
            {
                // Write number of entries, using the minimal number of bits
                int entryCount = kvp.Value.Count;
                for (int ib = numBitPerHashEntryCount - 1; ib >= 0; ib--)
                {
                    bitList.Add(((entryCount >> ib) & 1) == 1);
                }

                foreach (var accEntry in kvp.Value)
                {
                    // Resolve both table indices once per element instead of once per bit
                    int accStringIndex = accStringToIndex[accEntry.Key];
                    int accCountIndex  = accCodeCountToIndex[accEntry.Value];

                    // Write accent code's index in the lookup table, using the minimal number of bits
                    for (int ib = numBitPerAccStringIndex - 1; ib >= 0; ib--)
                    {
                        bitList.Add(((accStringIndex >> ib) & 1) == 1);
                    }

                    // Write accent code count's index in the lookup table, using the minimal number of bits
                    for (int ib = numBitPerAccCountIndex - 1; ib >= 0; ib--)
                    {
                        bitList.Add(((accCountIndex >> ib) & 1) == 1);
                    }
                }

                // Pad the record with zero bits up to the next byte boundary
                while (bitList.Count % 8 != 0)
                {
                    bitList.Add(false);
                }

                byte[] packed = new byte[bitList.Count / 8];
                new BitArray(bitList.ToArray()).CopyTo(packed, 0);

                bw.Write(packed);

                bitList.Clear();
            }
        }
Beispiel #2
0
        /// <summary>
        /// Loads the model from <paramref name="br"/> into <c>m_Map</c>. Only version 1 files can be read.
        /// </summary>
        /// <remarks>
        /// The stream is consumed strictly forward, so it does not have to be seekable.
        /// Each map entry is a byte-aligned, bit-packed record: the element count, followed for
        /// every element by an index into the accent string table and an index into the accent
        /// count table. Values are stored most-significant bit first.
        /// </remarks>
        /// <param name="br">Reader positioned at the start of a serialized model.</param>
        /// <exception cref="InvalidDataException">The file version is not 1.</exception>
        /// <exception cref="EndOfStreamException">The stream ends before the model is complete.</exception>
        public void ReadFromBinary(BinaryReader br)
        {
            int version = br.ReadInt32();

            if (version != 1)
            {
                throw new InvalidDataException("Unsupported model file version " + version + "; only version 1 can be read.");
            }

            var converter = new Utility.VietConverter();

            // Read number of bits for each number type
            int numBitPerAccStringIndex = br.ReadInt32();
            int numBitPerAccCountIndex  = br.ReadInt32();
            int numBitPerHashEntryCount = br.ReadInt32();
            br.ReadInt32(); // Bit width of accent string lengths: not needed here, read only to advance past it

            // Read look up table of accent codes
            int numAccString  = br.ReadInt32();
            var accStringList = new List <string>();

            for (int i = 0; i < numAccString; i++)
            {
                byte   stringLength = br.ReadByte();
                byte[] asciiBytes   = br.ReadBytes(stringLength);

                string asciiString = System.Text.Encoding.ASCII.GetString(asciiBytes);

                // Map every stored ASCII code back through AsciiToAccentMap
                accStringList.Add(String.Join(String.Empty,
                                              asciiString.Select(c => converter.AsciiToAccentMap[c])));
            }

            // Read look up table of accent code counts
            int numAccCodeCount  = br.ReadInt32();
            var accCodeCountList = new List <int>();

            for (int i = 0; i < numAccCodeCount; i++)
            {
                accCodeCountList.Add(br.ReadInt32());
            }

            // Read all hash codes; they precede the bit-packed records.
            // ReadInt32 is little-endian on every platform, matching BinaryWriter.
            int   numHashCodes = br.ReadInt32();
            int[] hashCodes    = new int[numHashCodes];

            for (int i = 0; i < numHashCodes; i++)
            {
                hashCodes[i] = br.ReadInt32();
            }

            int numBitPerElement = numBitPerAccStringIndex + numBitPerAccCountIndex;
            // Number of leading bytes that are guaranteed to hold the whole element count
            int numCountBytes = (numBitPerHashEntryCount + 7) / 8;

            for (int i = 0; i < numHashCodes; i++)
            {
                int hashCode       = hashCodes[i];
                var accCodeToCount = new Dictionary <string, int>();
                m_Map.Add(hashCode, accCodeToCount);

                // Read just enough bytes to decode the element count
                byte[] countBytes = br.ReadBytes(numCountBytes);
                if (countBytes.Length != numCountBytes)
                {
                    throw new EndOfStreamException("Unexpected end of stream while reading the element count of a map entry.");
                }

                BitArray baCount = new BitArray(countBytes);
                int      count   = 0;
                for (int ib = 0; ib < numBitPerHashEntryCount; ib++)
                {
                    count |= (baCount[ib] ? 1 : 0) << (numBitPerHashEntryCount - ib - 1);
                }

                // The count fixes the record size; fetch the remainder and stitch it behind the
                // bytes already consumed instead of rewinding the stream.
                int totalNumBits = numBitPerHashEntryCount + count * numBitPerElement;
                int numBytes     = (totalNumBits + 7) / 8;
                int numRestBytes = numBytes - countBytes.Length;

                byte[] restBytes = br.ReadBytes(numRestBytes);
                if (restBytes.Length != numRestBytes)
                {
                    throw new EndOfStreamException("Unexpected end of stream while reading the elements of a map entry.");
                }

                byte[] bytes = new byte[numBytes];
                Array.Copy(countBytes, 0, bytes, 0, countBytes.Length);
                Array.Copy(restBytes, 0, bytes, countBytes.Length, restBytes.Length);

                BitArray ba = new BitArray(bytes);

                // Elements start right after the count bits
                int iWork = numBitPerHashEntryCount;
                for (int iElement = 0; iElement < count; iElement++)
                {
                    int accCodeIndex = 0;
                    for (int ib = 0; ib < numBitPerAccStringIndex; ib++)
                    {
                        accCodeIndex |= (ba[iWork++] ? 1 : 0) << (numBitPerAccStringIndex - ib - 1);
                    }

                    int accCodeCountIndex = 0;
                    for (int ib = 0; ib < numBitPerAccCountIndex; ib++)
                    {
                        accCodeCountIndex |= (ba[iWork++] ? 1 : 0) << (numBitPerAccCountIndex - ib - 1);
                    }

                    string accCode      = accStringList[accCodeIndex];
                    int    accCodeCount = accCodeCountList[accCodeCountIndex];

                    accCodeToCount.Add(accCode, accCodeCount);
                }
            }
        }
Beispiel #3
0
        /// <summary>
        /// Loads the compact, perfect-hash based model from <paramref name="br"/>. Only version 1 files can be read.
        /// </summary>
        /// <remarks>
        /// Fills the lookup structures without expanding them:
        /// <c>m_PredictionAsciiBytes</c> / <c>m_PredictionAsciiIndex</c> hold the concatenated ASCII
        /// accent codes and their start offsets, <c>m_PredictionCounts</c> holds the distinct counts,
        /// and <c>m_HashTableBytes</c> / <c>m_HashTableByteIndex</c> hold the still bit-packed records
        /// of the non-empty hash slots and their start offsets.
        /// The stream is consumed strictly forward, so it does not have to be seekable.
        /// </remarks>
        /// <param name="br">Reader positioned at the start of a serialized model.</param>
        /// <exception cref="InvalidDataException">The file version is not 1.</exception>
        /// <exception cref="EndOfStreamException">The stream ends inside a hash table record.</exception>
        public void ReadFromBinary(BinaryReader br)
        {
            int version = br.ReadInt32();

            if (version != 1)
            {
                throw new InvalidDataException("Unsupported model file version " + version + "; only version 1 can be read.");
            }

            // Read number of bits for each number type
            m_NumBitPerAccStringIndex  = br.ReadInt32();
            m_NumBitPerAccCountIndex   = br.ReadInt32();
            m_NumBitPerHashEntryCount  = br.ReadInt32();
            m_NumBitPerAccStringLength = br.ReadInt32();

            // Read look up table of accent codes. The strings stay as raw ASCII bytes, packed
            // back to back; entry i spans [m_PredictionAsciiIndex[i], m_PredictionAsciiIndex[i + 1]).
            m_NumAccString = br.ReadInt32();

            var asciiBytes = new List <byte>(m_NumAccString * 2);

            m_PredictionAsciiIndex = new int[m_NumAccString + 1];

            int index = 0;

            for (int i = 0; i < m_NumAccString; i++)
            {
                byte stringLength = br.ReadByte();

                m_PredictionAsciiIndex[i] = index;
                index += stringLength;

                asciiBytes.AddRange(br.ReadBytes(stringLength));
            }
            m_PredictionAsciiIndex[m_NumAccString] = index; // Sentinel: end offset of the last string

            m_PredictionAsciiBytes = asciiBytes.ToArray();

            // Read look up table of accent code counts
            int numAccCodeCount = br.ReadInt32();

            m_PredictionCounts = new int[numAccCodeCount];
            for (int i = 0; i < numAccCodeCount; i++)
            {
                m_PredictionCounts[i] = br.ReadInt32();
            }

            // Deserialize hash function.
            // NOTE(review): BinaryFormatter.Deserialize can execute arbitrary code when fed
            // untrusted data and is removed from current .NET releases; only load model files
            // from trusted sources, and consider a dedicated format for the hash function.
            int    hashFunctionBytesLength = br.ReadInt32();
            byte[] hashFunctionBytes       = br.ReadBytes(hashFunctionBytesLength);
            var    formatter               = new BinaryFormatter();

            using (var hashFunctionStream = new MemoryStream(hashFunctionBytes))
            {
                m_HashFunction = (MinPerfectHash)formatter.Deserialize(hashFunctionStream);
            }

            // Read the bit-packed record of every hash slot
            int hashSize = (int)m_HashFunction.N;

            var hashTableByteList = new List <byte>(hashSize * 4);

            m_HashTableByteIndex = new int[hashSize + 1];
            index = 0;

            int numBitPerElement = m_NumBitPerAccStringIndex + m_NumBitPerAccCountIndex;
            // Number of leading bytes that are guaranteed to hold the whole element count
            int numCountBytes = (m_NumBitPerHashEntryCount + 7) / 8;

            for (int i = 0; i < hashSize; i++)
            {
                // Read just enough bytes to decode the element count
                byte[] countBytes = br.ReadBytes(numCountBytes);
                if (countBytes.Length != numCountBytes)
                {
                    throw new EndOfStreamException("Unexpected end of stream while reading the element count of a hash slot.");
                }

                BitArray baCount = new BitArray(countBytes);
                int      count   = 0;
                for (int ib = 0; ib < m_NumBitPerHashEntryCount; ib++)
                {
                    count |= (baCount[ib] ? 1 : 0) << (m_NumBitPerHashEntryCount - ib - 1);
                }

                if (count == 0)
                {
                    // Empty slot: its padded count has been consumed and nothing is stored for it.
                    // NOTE(review): m_HashTableByteIndex[i] keeps its default value 0 here, as
                    // before; confirm that the lookup code expects this for empty slots.
                    continue;
                }

                // The count fixes the record size; fetch the remainder instead of rewinding
                int totalNumBits = m_NumBitPerHashEntryCount + count * numBitPerElement;
                int numBytes     = (totalNumBits + 7) / 8;
                int numRestBytes = numBytes - countBytes.Length;

                byte[] restBytes = br.ReadBytes(numRestBytes);
                if (restBytes.Length != numRestBytes)
                {
                    throw new EndOfStreamException("Unexpected end of stream while reading the elements of a hash slot.");
                }

                m_HashTableByteIndex[i] = index;
                index += numBytes;

                // Store the whole record, count bits included
                hashTableByteList.AddRange(countBytes);
                hashTableByteList.AddRange(restBytes);
            }
            m_HashTableByteIndex[hashSize] = index; // Sentinel: total number of stored bytes
            m_HashTableBytes = hashTableByteList.ToArray();
        }
Beispiel #4
0
        /// <summary>
        /// Parses text files and generates a list of segments, each of which is a list of normalized words.
        /// </summary>
        /// <param name="dataFiles">
        /// Pairs of (root directory, file search pattern). Every matching file below each root,
        /// subdirectories included, is parsed.
        /// </param>
        /// <param name="writeOut">Not used by this method.</param>
        /// <returns>A list of segments, each of which is in turn a list of words.</returns>
        public static List <List <string> > ParseData(TupleList <string, string> dataFiles, bool writeOut = true)
        {
            // The parsing algorithm is as follows:
            // 1. Read 1 line at a time, trim it, and skip it if nothing but white space is left
            // 2. Skip the line if it contains an html character marker such as &#
            // 3. Split the line into groups with TextParser.ExtractParentheses, so that text inside
            //    parentheses is handled as separate sentences since it is often self-sufficient
            // 4. Break each group into different segments through characters . , ; : ! ?
            //      - An issue with the , . characters is that numbers use them as well, e.g. 7,5 or 40.000.
            //      - Another issue with the dot character is that it can be used as ellipsis during part of a sentence. For example: Là ngày... em sẽ xa ta luôn.
            //      - Characters such as ! and ? don't really affect word meanings and often signal end of phrase
            // 5. Break each segment into a list of words with white-space separators.
            //      - Note that white-space separators here include more than just the space character (ascii code 32),
            //        e.g. \t as well
            // 6. Drop the whole segment if one of its words is only a character such as * >
            // 7. Remove quote characters ' " ” (8221) “ (8220) from words since they normally only serve as emphasis functions,
            //    then lower-case the words
            // 8. Keep only segments that still contain at least two words

            string[]         ignores           = { "&#" };
            string[]         quotes            = { "'", "\"", "“", "”" };
            char[]           segmentSeparators = { ',', ';', ':', '.', '!', '?' };
            char[]           wordSeparators    = new char[0]; // An empty separator list makes Split break on any white space
            HashSet <string> removeSet         = new HashSet <string>(new string[] { "*", ">" });

            List <List <string> > globalSegmentList = new List <List <string> >();

            foreach (var tuple in dataFiles)
            {
                var textFiles = Directory.EnumerateFiles(tuple.Item1, tuple.Item2, SearchOption.AllDirectories);
                foreach (var textFile in textFiles)
                {
                    Console.WriteLine(textFile);
                    using (StreamReader sr = new StreamReader(File.OpenRead(textFile)))
                    {
                        string rawLine;
                        while ((rawLine = sr.ReadLine()) != null)
                        {
                            string line = rawLine.Trim();

                            // Ignore white-space strings
                            if (String.IsNullOrWhiteSpace(line))
                            {
                                continue;
                            }

                            // Ignore strings that contain invalid characters such as html characters
                            if (Array.Exists(ignores, ignorePattern => line.Contains(ignorePattern)))
                            {
                                continue;
                            }

                            // Extract parentheses groups from current string
                            var groups = TextParser.ExtractParentheses(line);

                            foreach (string group in groups)
                            {
                                // Make sure once again that the groups aren't white-space only
                                if (String.IsNullOrWhiteSpace(group))
                                {
                                    continue;
                                }

                                // Break each group into segments
                                string[] segmentArray = group.Split(segmentSeparators, StringSplitOptions.RemoveEmptyEntries);
                                foreach (string segment in segmentArray)
                                {
                                    // Break each segment into words; single-word segments are not kept
                                    string[] wordArray = segment.Split(wordSeparators, StringSplitOptions.RemoveEmptyEntries);
                                    if (wordArray.Length <= 1)
                                    {
                                        continue;
                                    }

                                    List <string> normSegment = new List <string>(wordArray.Length);
                                    bool          skipSegment = false;

                                    foreach (string word in wordArray)
                                    {
                                        // A stand-alone symbol word disqualifies the whole segment
                                        if (removeSet.Contains(word))
                                        {
                                            skipSegment = true;
                                            break;
                                        }

                                        // Remove quote characters
                                        string normWord = word;
                                        foreach (string quote in quotes)
                                        {
                                            normWord = normWord.Replace(quote, "");
                                        }

                                        // Invariant casing keeps the output independent of the machine's culture
                                        normWord = normWord.Trim().ToLowerInvariant();

                                        if (!String.IsNullOrWhiteSpace(normWord))
                                        {
                                            normSegment.Add(normWord);
                                        }
                                    }

                                    if (!skipSegment && normSegment.Count > 1)
                                    {
                                        globalSegmentList.Add(normSegment);
                                    }
                                }
                            }
                        }
                    }
                }
            }
            return(globalSegmentList);
        }
Beispiel #5
0
        /// <summary>
        /// Builds a minimal perfect hash over the keys of <c>m_Map</c> and serializes the model,
        /// laid out by hash slot, to <paramref name="bw"/>.
        /// </summary>
        /// <remarks>
        /// As a side effect <c>m_HashFunction</c> and <c>m_HashTable</c> are rebuilt from <c>m_Map</c>.
        /// Layout, in write order:
        /// 1. <c>Version</c>.
        /// 2. Four ints: the bit widths used for accent string indices, accent count indices,
        ///    per-slot element counts and accent string lengths.
        /// 3. Accent string table: an int count, then per string one length byte followed by the
        ///    string mapped through <c>AccentToAsciiMap</c> and encoded as ASCII.
        /// 4. Accent count table: an int count, then every distinct count as an int.
        /// 5. The serialized hash function, prefixed by its byte length.
        /// 6. One bit-packed record per hash slot, in slot order: the element count (0 for an
        ///    unused slot), then for each element its accent string index and its count index.
        ///    Every value is emitted most-significant bit first and each record is zero-padded
        ///    to a whole number of bytes.
        /// </remarks>
        /// <param name="bw">Writer that receives the serialized model.</param>
        /// <exception cref="ArgumentNullException"><paramref name="bw"/> is null.</exception>
        /// <exception cref="InvalidOperationException">
        /// An accent string needs more than 255 ASCII bytes and therefore cannot be stored behind
        /// the one-byte length prefix.
        /// </exception>
        public void WriteToBinary(BinaryWriter bw)
        {
            if (bw == null)
            {
                throw new ArgumentNullException("bw");
            }

            // Build the hash function and place every map entry into its hash slot
            m_HashFunction = PerfectHash.MPH.MinPerfectHash.Create(new RawStringKeySource(m_Map.Keys.ToList()), c: 1.0);
            m_HashTable    = Enumerable.Repeat <Dictionary <string, int> >(null, (int)m_HashFunction.N).ToList();

            foreach (var mapEntry in m_Map)
            {
                int hashCode = (int)m_HashFunction.Search(Encoding.UTF8.GetBytes(mapEntry.Key));

                m_HashTable[hashCode] = mapEntry.Value;
            }

            var converter = new Utility.VietConverter();

            // Gather the distinct accent strings and counts, and the largest entry size
            var uniqueAccentStrings   = new HashSet <string>();
            var uniqueAccentCodeCount = new HashSet <int>();
            int maxHashEntryCount     = 0;

            foreach (Dictionary <string, int> accCodeToCount in m_Map.Values)
            {
                foreach (var accEntry in accCodeToCount)
                {
                    uniqueAccentStrings.Add(accEntry.Key);
                    uniqueAccentCodeCount.Add(accEntry.Value);
                }

                maxHashEntryCount = Math.Max(maxHashEntryCount, accCodeToCount.Count);
            }
            int numBitPerAccStringIndex = GetRepresentativeBitCount(uniqueAccentStrings.Count);
            int numBitPerAccCountIndex  = GetRepresentativeBitCount(uniqueAccentCodeCount.Count);
            int numBitPerHashEntryCount = GetRepresentativeBitCount(maxHashEntryCount);

            bw.Write(Version);                 // Write version number
            bw.Write(numBitPerAccStringIndex); // How many bits to represent an accent code
            bw.Write(numBitPerAccCountIndex);  // How many bits to represent an accent code's count
            bw.Write(numBitPerHashEntryCount); // How many bits to represent the count of elements in each hash entry

            // Assign every distinct accent string a table index. The longest length is tracked here
            // (starting from 0) so that an empty model does not fail on Max() of an empty sequence.
            var accStringList      = new List <string>(uniqueAccentStrings.Count);
            var accStringToIndex   = new Dictionary <string, int>(uniqueAccentStrings.Count);
            int maxAccStringLength = 0;

            foreach (string accString in uniqueAccentStrings)
            {
                accStringToIndex.Add(accString, accStringList.Count);
                accStringList.Add(accString);
                maxAccStringLength = Math.Max(maxAccStringLength, accString.Length);
            }
            int numBitPerAccStringLength = GetRepresentativeBitCount(maxAccStringLength);

            bw.Write(numBitPerAccStringLength);

            // Write ascii accent codes lookup table
            bw.Write(accStringList.Count);
            foreach (string accString in accStringList)
            {
                string asciiString = String.Join(String.Empty, accString.Select(c => (char)converter.AccentToAsciiMap[c]));
                byte[] asciiBytes  = System.Text.Encoding.ASCII.GetBytes(asciiString);

                // The length prefix is a single byte; casting a larger length would silently
                // truncate it and produce a file that cannot be read back.
                if (asciiBytes.Length > byte.MaxValue)
                {
                    throw new InvalidOperationException(
                              "Accent string of " + asciiBytes.Length + " bytes does not fit the one-byte length prefix.");
                }

                bw.Write((byte)asciiBytes.Length);
                bw.Write(asciiBytes);
            }

            // Assign every distinct accent count a table index
            var accCodeCountList    = new List <int>(uniqueAccentCodeCount.Count);
            var accCodeCountToIndex = new Dictionary <int, int>(uniqueAccentCodeCount.Count);

            foreach (int accCodeCount in uniqueAccentCodeCount)
            {
                accCodeCountToIndex.Add(accCodeCount, accCodeCountList.Count);
                accCodeCountList.Add(accCodeCount);
            }
            // Write unique accent code counts lookup table
            bw.Write(accCodeCountList.Count);
            foreach (int accCodeCount in accCodeCountList)
            {
                bw.Write(accCodeCount);
            }

            // Serialize hash function, prefixed by its length.
            // NOTE(review): BinaryFormatter is obsolete and removed from current .NET releases;
            // consider a dedicated format for the hash function.
            byte[] hashFunctionBytes;
            var    formatter = new BinaryFormatter();

            using (var hashFunctionStream = new MemoryStream())
            {
                formatter.Serialize(hashFunctionStream, m_HashFunction);
                hashFunctionBytes = hashFunctionStream.ToArray();
            }
            bw.Write(hashFunctionBytes.Length);
            bw.Write(hashFunctionBytes);

            var bitList = new List <bool>();

            foreach (var slot in m_HashTable)
            {
                // Write number of entries, using the minimal number of bits; unused slots count as 0
                int count = slot != null ? slot.Count : 0;

                for (int ib = numBitPerHashEntryCount - 1; ib >= 0; ib--)
                {
                    bitList.Add(((count >> ib) & 1) == 1);
                }

                if (slot != null)
                {
                    foreach (var accEntry in slot)
                    {
                        // Resolve both table indices once per element instead of once per bit
                        int accStringIndex = accStringToIndex[accEntry.Key];
                        int accCountIndex  = accCodeCountToIndex[accEntry.Value];

                        // Write accent code's index in the lookup table, using the minimal number of bits
                        for (int ib = numBitPerAccStringIndex - 1; ib >= 0; ib--)
                        {
                            bitList.Add(((accStringIndex >> ib) & 1) == 1);
                        }

                        // Write accent code count's index in the lookup table, using the minimal number of bits
                        for (int ib = numBitPerAccCountIndex - 1; ib >= 0; ib--)
                        {
                            bitList.Add(((accCountIndex >> ib) & 1) == 1);
                        }
                    }
                }

                // Pad the record with zero bits up to the next byte boundary
                while (bitList.Count % 8 != 0)
                {
                    bitList.Add(false);
                }

                byte[] packed = new byte[bitList.Count / 8];
                new BitArray(bitList.ToArray()).CopyTo(packed, 0);

                bw.Write(packed);

                bitList.Clear();
            }
        }