/// <summary>
/// Serializes the accent-prediction map to a compact binary format:
/// a header of bit widths, lookup tables for the unique accent strings and
/// unique counts, the hash codes in ascending order, and finally one
/// byte-aligned bit-packed record per hash entry that references the lookup
/// tables by index.
/// </summary>
/// <param name="bw">Destination writer; left positioned after the written data.</param>
public void WriteToBinary(BinaryWriter bw)
{
    var converter = new Utility.VietConverter();

    // Collect the distinct accent strings / counts so each can be replaced by
    // a small table index, and find the largest entry so its element count can
    // be encoded with the minimal number of bits.
    var uniqueAccentStrings = new HashSet<string>();
    var uniqueAccentCodeCount = new HashSet<int>();
    int maxHashEntryCount = 0;
    foreach (Dictionary<string, int> accCodeToCount in m_Map.Values)
    {
        foreach (string accString in accCodeToCount.Keys)
        {
            uniqueAccentStrings.Add(accString);
            uniqueAccentCodeCount.Add(accCodeToCount[accString]);
        }
        maxHashEntryCount = Math.Max(maxHashEntryCount, accCodeToCount.Count);
    }

    int numBitPerAccStringIndex = GetRepresentativeBitCount(uniqueAccentStrings.Count);
    int numBitPerAccCountIndex = GetRepresentativeBitCount(uniqueAccentCodeCount.Count);
    int numBitPerHashEntryCount = GetRepresentativeBitCount(maxHashEntryCount);

    bw.Write(Version);                 // Write version number
    bw.Write(numBitPerAccStringIndex); // How many bits to represent an accent code
    bw.Write(numBitPerAccCountIndex);  // How many bits to represent an accent code's count
    bw.Write(numBitPerHashEntryCount); // How many bits to represent the count of elements in each hash entry

    // Assign each unique accent string a stable index into its lookup table.
    var accStringList = new List<string>();
    var accStringToIndex = new Dictionary<string, int>();
    int index = 0;
    foreach (string accString in uniqueAccentStrings)
    {
        accStringList.Add(accString);
        accStringToIndex.Add(accString, index);
        index++;
    }

    // NOTE(review): Max() throws on an empty map — confirm callers never
    // serialize an empty model.
    int numBitPerAccStringLength = GetRepresentativeBitCount(accStringList.Max(s => s.Length));
    bw.Write(numBitPerAccStringLength);

    // Write ascii accent codes lookup table (each string converted to its
    // ASCII representation and length-prefixed with a single byte).
    bw.Write(accStringList.Count);
    foreach (string accString in accStringList)
    {
        string asciiString = String.Join(String.Empty, accString.Select(c => (char)converter.AccentToAsciiMap[c]));
        byte[] asciiBytes = System.Text.Encoding.ASCII.GetBytes(asciiString);
        bw.Write((byte)asciiBytes.Length);
        bw.Write(asciiBytes);
    }

    // Assign each unique count a stable index into its lookup table.
    var accCodeCountList = new List<int>();
    var accCodeCountToIndex = new Dictionary<int, int>();
    index = 0;
    foreach (int accCodeCount in uniqueAccentCodeCount)
    {
        accCodeCountList.Add(accCodeCount);
        accCodeCountToIndex.Add(accCodeCount, index);
        index++;
    }

    // Write unique accent code counts lookup table.
    bw.Write(accCodeCountList.Count);
    foreach (int accCodeCount in accCodeCountList)
    {
        bw.Write(accCodeCount);
    }

    // Write the map keys (hash codes) in ascending order. Materialize the
    // ordering once: the original deferred OrderBy query was enumerated twice
    // below, re-sorting the whole dictionary on the second pass (CA1851).
    bw.Write(m_Map.Count);
    var orderedMap = m_Map.OrderBy(kvp => kvp.Key).ToList();
    foreach (var kvp in orderedMap)
    {
        bw.Write(kvp.Key);
    }

    // Bit-pack each entry MSB-first and flush it byte-aligned, so the reader
    // can locate each record on a byte boundary.
    var bitList = new List<bool>();
    foreach (var kvp in orderedMap)
    {
        // Write number of entries, using the minimal number of bits.
        for (int ib = numBitPerHashEntryCount - 1; ib >= 0; ib--)
        {
            bitList.Add(((kvp.Value.Count >> ib) & 1) == 1);
        }
        foreach (string accString in kvp.Value.Keys)
        {
            // Write accent code's index in the lookup table, using the minimal number of bits.
            for (int ib = numBitPerAccStringIndex - 1; ib >= 0; ib--)
            {
                bitList.Add(((accStringToIndex[accString] >> ib) & 1) == 1);
            }
            // Write accent code count's index in the lookup table, using the minimal number of bits.
            for (int ib = numBitPerAccCountIndex - 1; ib >= 0; ib--)
            {
                bitList.Add(((accCodeCountToIndex[kvp.Value[accString]] >> ib) & 1) == 1);
            }
        }
        // Pad to a byte boundary and flush this record to disk.
        while (bitList.Count % 8 != 0)
        {
            bitList.Add(false);
        }
        BitArray ba = new BitArray(bitList.ToArray());
        byte[] bytes = new byte[bitList.Count / 8];
        ba.CopyTo(bytes, 0);
        bw.Write(bytes);
        bitList.Clear();
    }
}
/// <summary>
/// Deserializes a Model1-format binary file (as produced by WriteToBinary)
/// into m_Map. Only version 1 of the format is supported.
/// </summary>
/// <param name="br">Source reader. The underlying stream must be seekable,
/// because each record's count byte is read once to size the record and the
/// position is then rewound so the full record can be re-read.</param>
/// <exception cref="InvalidDataException">The stream is not format version 1.</exception>
public void ReadFromBinary(BinaryReader br)
{
    int version = br.ReadInt32();
    if (version != 1)
    {
        // Was `throw new Exception()`: use a specific exception with a message
        // so callers can distinguish a format mismatch from other failures.
        throw new InvalidDataException($"Unsupported model format version {version}; only version 1 can be read.");
    }
    var converter = new Utility.VietConverter();

    // Read number of bits for each number type.
    int numBitPerAccStringIndex = br.ReadInt32();
    int numBitPerAccCountIndex = br.ReadInt32();
    int numBitPerHashEntryCount = br.ReadInt32();
    int numBitPerAccStringLength = br.ReadInt32(); // part of the format; not needed when decoding

    // Read look up table of accent codes (stored as length-prefixed ASCII,
    // mapped back to accented characters via the converter).
    var accStringList = new List<string>();
    int numAccString = br.ReadInt32();
    for (int i = 0; i < numAccString; i++)
    {
        byte stringLength = br.ReadByte();
        byte[] asciiBytes = br.ReadBytes(stringLength);
        string asciiString = System.Text.Encoding.ASCII.GetString(asciiBytes);
        accStringList.Add(String.Join(String.Empty, asciiString.Select(c => converter.AsciiToAccentMap[c])));
    }

    // Read look up table of accent code counts.
    var accCodeCountList = new List<int>();
    int numAccCodeCount = br.ReadInt32();
    for (int i = 0; i < numAccCodeCount; i++)
    {
        accCodeCountList.Add(br.ReadInt32());
    }

    // Read all hash codes in one block, then one byte-aligned bit-packed
    // record per hash code.
    int numHashCodes = br.ReadInt32();
    byte[] hashCodes = br.ReadBytes(sizeof(int) * numHashCodes);
    for (int i = 0; i < numHashCodes; i++)
    {
        int hashCode = BitConverter.ToInt32(hashCodes, i * sizeof(int));
        m_Map.Add(hashCode, new Dictionary<string, int>());

        // Peek at the record's first byte to decode the entry count (stored
        // MSB-first), then rewind so the whole record can be re-read below.
        // NOTE(review): assumes numBitPerHashEntryCount <= 8 — confirm against writer.
        byte[] countByte = br.ReadBytes(1);
        BitArray baCount = new BitArray(countByte);
        int count = 0;
        for (int ib = 0; ib < numBitPerHashEntryCount; ib++)
        {
            count += (baCount[ib] ? 1 : 0) << (numBitPerHashEntryCount - ib - 1);
        }
        br.BaseStream.Position = br.BaseStream.Position - 1;

        int numBitPerElement = numBitPerAccStringIndex + numBitPerAccCountIndex;
        int totalNumBits = numBitPerHashEntryCount + count * numBitPerElement;
        int numBytes = (int)Math.Ceiling(totalNumBits / 8.0);
        byte[] bytes = br.ReadBytes(numBytes);
        BitArray ba = new BitArray(bytes);

        // Skip over the count bits, then decode each (string index, count index)
        // pair; both indices are stored MSB-first with their minimal bit widths.
        int iWork = numBitPerHashEntryCount;
        for (int iElement = 0; iElement < count; iElement++)
        {
            int accCodeIndex = 0;
            for (int ib = 0; ib < numBitPerAccStringIndex; ib++)
            {
                accCodeIndex += (ba[iWork++] ? 1 : 0) << (numBitPerAccStringIndex - ib - 1);
            }
            int accCodeCountIndex = 0;
            for (int ib = 0; ib < numBitPerAccCountIndex; ib++)
            {
                accCodeCountIndex += (ba[iWork++] ? 1 : 0) << (numBitPerAccCountIndex - ib - 1);
            }
            string accCode = accStringList[accCodeIndex];
            int accCodeCount = accCodeCountList[accCodeCountIndex];
            // TODO: convert accCode to accent string before inserting
            m_Map[hashCode].Add(accCode, accCodeCount);
        }
    }
}
// Can only read file of Model1 format for now
/// <summary>
/// Deserializes a Model1 (version 1) binary model into flat arrays plus a
/// minimal perfect hash function. Instead of building dictionaries, the
/// lookup tables are kept as contiguous byte/int arrays with companion index
/// arrays marking each element's boundaries, and the per-slot bit-packed
/// records are concatenated into m_HashTableBytes.
/// </summary>
public void ReadFromBinary(BinaryReader br)
{
    int version = br.ReadInt32();
    if (version != 1)
    {
        // NOTE(review): prefer a specific exception type with a message here.
        throw new Exception();
    }
    // NOTE(review): converter appears unused in this method — candidate for removal.
    var converter = new Utility.VietConverter();

    // Read number of bits for each number type
    m_NumBitPerAccStringIndex = br.ReadInt32();
    m_NumBitPerAccCountIndex = br.ReadInt32();
    m_NumBitPerHashEntryCount = br.ReadInt32();
    m_NumBitPerAccStringLength = br.ReadInt32();

    // Read look up table of accent codes: the raw ASCII bytes are kept in one
    // flat array; m_PredictionAsciiIndex[i]..m_PredictionAsciiIndex[i+1]
    // delimits string i (hence the extra trailing element).
    m_NumAccString = br.ReadInt32();
    var asciiBytes = new List<byte>(m_NumAccString * 2);
    m_PredictionAsciiIndex = new int[m_NumAccString + 1];
    int index = 0;
    for (int i = 0; i < m_NumAccString; i++)
    {
        byte stringLength = br.ReadByte();
        m_PredictionAsciiIndex[i] = index;
        index += stringLength;
        asciiBytes.AddRange(br.ReadBytes(stringLength));
    }
    m_PredictionAsciiIndex[m_NumAccString] = index;
    m_PredictionAsciiBytes = asciiBytes.ToArray();

    // Read look up table of accent code counts
    int numAccCodeCount = br.ReadInt32();
    m_PredictionCounts = new int[numAccCodeCount];
    for (int i = 0; i < numAccCodeCount; i++)
    {
        m_PredictionCounts[i] = br.ReadInt32();
    }

    // Deserialize hash function
    // SECURITY NOTE(review): BinaryFormatter deserialization of file content
    // is unsafe on untrusted input and is deprecated (removed in .NET 9).
    // Replacing it requires an on-disk format change, so it is only flagged here.
    int hashFunctionBytesLength = br.ReadInt32();
    byte[] hashFunctionBytes = br.ReadBytes(hashFunctionBytesLength);
    var hashFunctionStream = new MemoryStream(hashFunctionBytes);
    var formatter = new BinaryFormatter();
    m_HashFunction = (MinPerfectHash)formatter.Deserialize(hashFunctionStream);

    // Read bytes for hash codes: one byte-aligned bit-packed record per hash
    // slot, concatenated into m_HashTableBytes; m_HashTableByteIndex records
    // each slot's starting byte offset.
    int hashSize = (int)m_HashFunction.N;
    var hashTableByteList = new List<byte>((int)m_HashFunction.N * 4);
    m_HashTableByteIndex = new int[hashSize + 1];
    index = 0;
    for (int i = 0; i < hashSize; i++)
    {
        // Peek at the record's first byte to decode its entry count (stored MSB-first).
        // NOTE(review): assumes m_NumBitPerHashEntryCount <= 8 — confirm against the writer.
        byte[] countByte = br.ReadBytes(1);
        BitArray baCount = new BitArray(countByte);
        int count = 0;
        for (int ib = 0; ib < m_NumBitPerHashEntryCount; ib++)
        {
            count += (baCount[ib] ? 1 : 0) << (m_NumBitPerHashEntryCount - ib - 1);
        }
        if (count == 0)
        {
            // Empty slot: its single (all-zero) byte has already been consumed.
            // NOTE(review): m_HashTableByteIndex[i] is left at its default 0 for
            // empty slots, so the offset array is not monotonic — verify that
            // lookup code checks the entry count before using these offsets.
            continue;
        }
        // Rewind one byte so the whole record (count bits + elements) is re-read below.
        br.BaseStream.Position = br.BaseStream.Position - 1;
        int numBitPerElement = m_NumBitPerAccStringIndex + m_NumBitPerAccCountIndex;
        int totalNumBits = m_NumBitPerHashEntryCount + count * numBitPerElement;
        int numBytes = (int)Math.Ceiling(totalNumBits / 8.0);
        m_HashTableByteIndex[i] = index;
        index += numBytes;
        hashTableByteList.AddRange(br.ReadBytes(numBytes));
    }
    m_HashTableByteIndex[hashSize] = index;
    m_HashTableBytes = hashTableByteList.ToArray();
}
/// <summary>
/// Parses text files and generates a list of list of words.
/// </summary>
/// <param name="dataFiles">Pairs of (root directory, file search pattern) enumerated recursively.</param>
/// <param name="writeOut">Unused in the current implementation; kept for interface compatibility.</param>
/// <returns>A list of segments, each of which is in turn a list of words.</returns>
public static List<List<string>> ParseData(TupleList<string, string> dataFiles, bool writeOut = true)
{
    // The parsing algorithm is as follows:
    // 1. Read 1 line at a time and ignore all whitespace or skip if it's a white space only line
    // 2. If this line contains more than a certain number of html characters such as &# then skip it
    // 3. If present, extract groups inside parentheses out as separate sentences since they often are self-sufficient
    // 4. Break the line into different segments through characters . , ; : ! ?
    //    - An issue with the , . characters is that numbers use them as well, e.g. 7,5 or 40.000.
    //    - Another issue with the dot character is that it can be used as ellipsis during part of a sentence. For example: Là ngày... em sẽ xa ta luôn.
    //    - Characters such as ! and ? don't really affect word meanings and often signal end of phrase
    // 5. Remove empty or white-space segments
    // 6. Break each segment into a list of words with white-space separators.
    //    - Note that white-space separators here include more than just the space character (ascii code 32)
    //      but can also include \t or html white-space character such as etc...
    // 7. Remove words that are only characters such as * >
    // 8. Remove quote characters ' " ” (8221) “ (8220) from words since they normally only serve as emphasis functions
    string[] ignores = { "&#" };
    char[] quotes = { '\'', '"', '“', '”' };
    char[] segmentSeparators = { ',', ';', ':', '.', '!', '?' };
    HashSet<string> removeSet = new HashSet<string>(new string[] { "*", ">" });

    List<List<string>> globalSegmentList = new List<List<string>>();
    foreach (var tuple in dataFiles)
    {
        var textFiles = Directory.EnumerateFiles(tuple.Item1, tuple.Item2, SearchOption.AllDirectories);
        foreach (var textFile in textFiles)
        {
            Console.WriteLine(textFile);
            using (StreamReader sr = new StreamReader(File.OpenRead(textFile)))
            {
                while (sr.EndOfStream == false)
                {
                    string line = sr.ReadLine().Trim();
                    // Ignore white-space strings
                    if (String.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }
                    // Ignore strings that contain invalid characters such as html characters
                    bool ignore = false;
                    foreach (string ignorePattern in ignores)
                    {
                        if (line.Contains(ignorePattern))
                        {
                            ignore = true;
                            break;
                        }
                    }
                    if (ignore)
                    {
                        continue;
                    }
                    // Extract parentheses groups from current string
                    var groups = TextParser.ExtractParentheses(line);
                    foreach (string group in groups)
                    {
                        // Make sure once again that the groups aren't white-space only
                        if (String.IsNullOrWhiteSpace(group))
                        {
                            continue;
                        }
                        // Break each group into segments
                        string[] segmentArray = group.Split(segmentSeparators, StringSplitOptions.RemoveEmptyEntries);
                        foreach (string segment in segmentArray)
                        {
                            bool skipSegment = false;
                            // Break each segment into words (new char[0] splits on all white-space characters)
                            List<string> normSegment = new List<string>();
                            string[] wordArray = segment.Split(new char[0], StringSplitOptions.RemoveEmptyEntries);
                            if (wordArray.Length > 1)
                            {
                                foreach (string word in wordArray)
                                {
                                    string normWord = word;
                                    // A marker-only word such as "*" or ">" invalidates the whole segment
                                    if (removeSet.Contains(normWord))
                                    {
                                        skipSegment = true;
                                        break;
                                    }
                                    // Remove quote characters
                                    foreach (var quote in quotes)
                                    {
                                        normWord = normWord.Replace(quote.ToString(), "");
                                    }
                                    normWord = normWord.Trim().ToLower();
                                    if (!String.IsNullOrWhiteSpace(normWord))
                                    {
                                        normSegment.Add(normWord);
                                    }
                                }
                            }
                            // Only keep multi-word segments (single words carry no tone context)
                            if (!skipSegment && normSegment.Count > 1)
                            {
                                globalSegmentList.Add(normSegment);
                            }
                        }
                    }
                }
            }
        }
    }
    return globalSegmentList;
}
/// <summary>
/// Serializes the model using a minimal perfect hash: builds m_HashFunction
/// over the raw-string keys of m_Map, arranges the entries into m_HashTable
/// by hash slot, then writes the bit-width header, the lookup tables, the
/// serialized hash function, and one byte-aligned bit-packed record per hash
/// slot (an all-zero count record for empty slots).
/// </summary>
public void WriteToBinary(BinaryWriter bw)
{
    // Build the minimal perfect hash over the map keys and place each entry
    // in its hash slot; slots without an entry stay null.
    m_HashFunction = PerfectHash.MPH.MinPerfectHash.Create(new RawStringKeySource(m_Map.Keys.ToList()), c: 1.0);
    m_HashTable = Enumerable.Repeat<Dictionary<string, int>>(null, (int)m_HashFunction.N).ToList();
    foreach (string rawString in m_Map.Keys)
    {
        int hashCode = (int)m_HashFunction.Search(Encoding.UTF8.GetBytes(rawString));
        m_HashTable[hashCode] = m_Map[rawString];
    }
    var converter = new Utility.VietConverter();

    // Collect the distinct accent strings / counts so each can be replaced by
    // a small table index, and find the largest entry so its element count can
    // be encoded with the minimal number of bits.
    var uniqueAccentStrings = new HashSet<string>();
    var uniqueAccentCodeCount = new HashSet<int>();
    int maxHashEntryCount = 0;
    foreach (Dictionary<string, int> accCodeToCount in m_Map.Values)
    {
        foreach (string accString in accCodeToCount.Keys)
        {
            uniqueAccentStrings.Add(accString);
            uniqueAccentCodeCount.Add(accCodeToCount[accString]);
        }
        maxHashEntryCount = Math.Max(maxHashEntryCount, accCodeToCount.Count);
    }
    int numBitPerAccStringIndex = GetRepresentativeBitCount(uniqueAccentStrings.Count);
    int numBitPerAccCountIndex = GetRepresentativeBitCount(uniqueAccentCodeCount.Count);
    int numBitPerHashEntryCount = GetRepresentativeBitCount(maxHashEntryCount);

    bw.Write(Version);                 // Write version number
    bw.Write(numBitPerAccStringIndex); // How many bits to represent an accent code
    bw.Write(numBitPerAccCountIndex);  // How many bits to represent an accent code's count
    bw.Write(numBitPerHashEntryCount); // How many bits to represent the count of elements in each hash entry

    // Assign each unique accent string a stable index into its lookup table.
    var accStringList = new List<string>();
    var accStringToIndex = new Dictionary<string, int>();
    int index = 0;
    foreach (string accString in uniqueAccentStrings)
    {
        accStringList.Add(accString);
        accStringToIndex.Add(accString, index);
        index++;
    }

    // NOTE(review): Max() throws on an empty map — confirm callers never
    // serialize an empty model.
    int numBitPerAccStringLength = GetRepresentativeBitCount(accStringList.Max(s => s.Length));
    bw.Write(numBitPerAccStringLength);

    // Write ascii accent codes lookup table (length-prefixed ASCII strings).
    bw.Write(accStringList.Count);
    foreach (string accString in accStringList)
    {
        string asciiString = String.Join(String.Empty, accString.Select(c => (char)converter.AccentToAsciiMap[c]));
        byte[] asciiBytes = System.Text.Encoding.ASCII.GetBytes(asciiString);
        bw.Write((byte)asciiBytes.Length);
        bw.Write(asciiBytes);
    }

    // Assign each unique count a stable index into its lookup table.
    var accCodeCountList = new List<int>();
    var accCodeCountToIndex = new Dictionary<int, int>();
    index = 0;
    foreach (int accCodeCount in uniqueAccentCodeCount)
    {
        accCodeCountList.Add(accCodeCount);
        accCodeCountToIndex.Add(accCodeCount, index);
        index++;
    }

    // Write unique accent code counts lookup table
    bw.Write(accCodeCountList.Count);
    foreach (int accCodeCount in accCodeCountList)
    {
        bw.Write(accCodeCount);
    }

    // Serialize hash function
    // SECURITY NOTE(review): BinaryFormatter is insecure and deprecated
    // (removed in .NET 9). Replacing it requires an on-disk format change,
    // so it is only flagged here.
    var hashFunctionStream = new MemoryStream();
    var formatter = new BinaryFormatter();
    formatter.Serialize(hashFunctionStream, m_HashFunction);
    byte[] hashFunctionBytes = hashFunctionStream.ToArray();
    bw.Write(hashFunctionBytes.Length);
    bw.Write(hashFunctionBytes);

    // Bit-pack each hash slot's record MSB-first and flush it byte-aligned.
    // No slot count is written: the reader derives it from m_HashFunction.N.
    var bitList = new List<bool>();
    foreach (var kvp in m_HashTable)
    {
        // Write number of entries, using the minimal number of bits
        // (a null slot is written as count 0).
        int count = kvp != null ? kvp.Count : 0;
        for (int ib = numBitPerHashEntryCount - 1; ib >= 0; ib--)
        {
            bitList.Add(((count >> ib) & 1) == 1);
        }
        if (kvp != null)
        {
            foreach (string accString in kvp.Keys)
            {
                // Write accent code's index in the lookup table, using the minimal number of bits
                for (int ib = numBitPerAccStringIndex - 1; ib >= 0; ib--)
                {
                    bitList.Add(((accStringToIndex[accString] >> ib) & 1) == 1);
                }
                // Write accent code count's index in the lookup table, using the minimal number of bits
                for (int ib = numBitPerAccCountIndex - 1; ib >= 0; ib--)
                {
                    bitList.Add(((accCodeCountToIndex[kvp[accString]] >> ib) & 1) == 1);
                }
            }
        }
        // Pad to a byte boundary and flush this record to disk.
        while (bitList.Count % 8 != 0)
        {
            bitList.Add(false);
        }
        BitArray ba = new BitArray(bitList.ToArray());
        byte[] bytes = new byte[bitList.Count / 8];
        ba.CopyTo(bytes, 0);
        bw.Write(bytes);
        bitList.Clear();
    }
}