/// <summary>
/// Inserts the string identified by <paramref name="id"/> into the bit-indexed trie stored in
/// <paramref name="trie"/>, appending one leaf and (except for the very first string) one branch.
/// </summary>
/// <param name="trie">Entry array that backs the trie; new entries are written at index <c>head->_numEntries</c>.</param>
/// <param name="head">Trie header; its <c>_numEntries</c> and <c>_rootId</c> are updated in place.</param>
/// <param name="id">String id passed to <c>table->GetStringEntry</c> and stored in the new leaf.</param>
/// <param name="table">String table used to resolve ids to strings.</param>
/// <exception cref="ArgumentException">The resolved string is null or empty, or is already in the trie.</exception>
/// <exception cref="InvalidOperationException">No insertion point was found on the search path.</exception>
private static void AddToTrie(SYMBMaskEntry *trie, SYMBMaskHeader *head, int id, SYMBHeader *table)
{
    //If list is empty add the node and quit.
    //Note that the first string is stored without being resolved or validated.
    if (head->_numEntries == 0)
    {
        trie[head->_numEntries++] = new SYMBMaskEntry(1, -1, -1, -1, id, 0);
        return;
    }

    string value = table->GetStringEntry(id);
    if (string.IsNullOrEmpty(value))
    {
        throw new ArgumentException("String is null or whitespace");
    }

    //Walk from the root down to a leaf, recording every entry visited.
    //The leaf reached is the stored string used to work out where the new string diverges.
    SYMBMaskEntry search = trie[head->_rootId];
    List<int> path = new List<int> { head->_rootId };
    while (search._flags == 0)
    {
        //Strings are treated as having an infinite number of null chars following them,
        //so a bit past the end of the string counts as 0.
        //_leftId corresponds to bit=0, _rightId corresponds to bit=1
        bool bitSet = search._bit / 8 < value.Length && CheckBit(value, search._bit);
        int next = bitSet ? search._rightId : search._leftId;
        path.Add(next);
        search = trie[next];
    }

    string searchVal = table->GetStringEntry(search._stringId);

    //Can't add duplicate strings
    if (searchVal == value)
    {
        throw new ArgumentException("Duplicate string");
    }

    bool mismatch = false;
    int minLength = Math.Min(searchVal.Length, value.Length);
    short bit = 0;

    //Locate mismatching character between the two strings
    for (short i = 0; i < minLength; i++)
    {
        if (value[i] != searchVal[i])
        {
            mismatch = true;
            bit = (short)(8 * i);
            break;
        }
    }

    //If a char was different one string does not contain the other
    if (mismatch)
    {
        //Find where the bits differed.
        //NOTE(review): clz8 is declared elsewhere; it is used here as a per-byte
        //"number of leading zero bits" table, so chars are assumed to fit in 8 bits - confirm.
        int cmpint = value[bit / 8] ^ searchVal[bit / 8];
        bit += clz8[cmpint];

        //If the differing bit is 1 in the new string, the new leaf goes on the right (bit=1) side
        bool right = CheckBit(value, bit);

        if (head->_numEntries == 1)
        {
            SplitSingleLeafTrie(trie, head, id, bit, right);
            return;
        }

        //If the mismatch bit is lower than the root's bit the new branch becomes the root of the tree
        if (bit < trie[path[0]]._bit)
        {
            InsertLeafWithBranch(trie, head, id, bit, right, -1, path[0]);
            return;
        }

        //Locate where the branch needs to be inserted: above the first entry on the path
        //that is a leaf or tests a later bit
        for (int i = 1; i < path.Count; i++)
        {
            if (trie[path[i]]._bit > bit || trie[path[i]]._flags == 1)
            {
                InsertLeafWithBranch(trie, head, id, bit, right, path[i - 1], path[i]);
                return;
            }
        }

        //This should never happen
        throw new InvalidOperationException("Error building tree, unexpected structure");
    }

    //Since mismatch is false, one string is a prefix of the other.
    //The longer string is the one that takes the right (bit=1) side of the new branch.
    bool newIsLonger = value.Length > searchVal.Length;
    bit = (short)(minLength * 8);

    if (newIsLonger)
    {
        //Find the first bit after the shared prefix that's 1. Will always occur in the first 8 bits
        //because 0x00 denotes string termination and thus isn't in value
        bit += clz8[value[bit / 8]];

        //If path.Count == 1 the only value is a leaf
        if (path.Count == 1)
        {
            SplitSingleLeafTrie(trie, head, id, bit, true);
            return;
        }

        //trie[path[path.Count - 2]] is the last branch that was a comparison.
        int trace = path.Count - 2;
        if (trie[path[trace]]._leftId == path[trace + 1])
        {
            //Edge case: branches at the bottom of the path may test bits later than the new bit.
            //Climb until a branch that tests an earlier (or equal) bit is found; running off the
            //top (trace < 0) means the new branch has to become the root.
            while (trace >= 0 && trie[path[trace]]._bit > bit)
            {
                trace--;
            }
        }

        //The helper re-points whichever child link of the parent actually leads to path[trace + 1];
        //after climbing, that link is not guaranteed to be the left one.
        InsertLeafWithBranch(trie, head, id, bit, true, trace < 0 ? -1 : path[trace], path[trace + 1]);
        return;
    }

    //The new string is a prefix of the found string.
    //Find first bit comparison on the path that happens after the new string ends
    int index = 0;
    while (trie[path[index]]._flags == 0 && trie[path[index]]._bit <= bit)
    {
        index++;
    }

    //Find the first bit of the found string that's 1 and isn't already used on the path.
    //NOTE(review): there is no bounds check on searchVal[bit / 8] when advancing a whole byte.
    int cmpVal = searchVal[bit / 8];
    byte clzVal;
    bool test = trie[path[index]]._flags == 0;
    while (true)
    {
        clzVal = clz8[cmpVal];
        if (clzVal == 8)
        {
            //No candidate bits left in this byte, move on to the next one
            bit += 8;
            cmpVal = searchVal[bit / 8];
            continue;
        }

        if (test && trie[path[index]]._bit <= bit + clzVal)
        {
            if (trie[path[index]]._bit == bit + clzVal)
            {
                //This bit is already tested by a branch on the path; clear it and look for the next 1
                cmpVal ^= (1 << 7) >> clzVal;
            }

            test = trie[path[++index]]._flags == 0;
            continue;
        }

        bit += clzVal;
        break;
    }

    //If the trie is a single leaf the new branch is the root of the trie
    if (head->_numEntries == 1)
    {
        SplitSingleLeafTrie(trie, head, id, bit, false);
        return;
    }

    //index == 0 means even the root tests a later bit, so there is no parent to re-point
    //and the new branch becomes the root.
    InsertLeafWithBranch(trie, head, id, bit, false, index == 0 ? -1 : path[index - 1], path[index]);
}

//Turns a trie holding only the leaf at index 0 into leaf(0), leaf(1), branch(2) with the branch as root.
private static void SplitSingleLeafTrie(SYMBMaskEntry *trie, SYMBMaskHeader *head, int id, short bit, bool newGoesRight)
{
    trie[1] = new SYMBMaskEntry(1, -1, -1, -1, id, 1);
    trie[2] = new SYMBMaskEntry(0, bit, newGoesRight ? 0 : 1, newGoesRight ? 1 : 0, -1, -1);
    head->_numEntries = 3;
    head->_rootId = 2;
}

//Appends a leaf for id plus a branch on bit that sits above childId, then hooks the branch in
//under parentId (or makes it the root when parentId is negative).
private static void InsertLeafWithBranch(SYMBMaskEntry *trie, SYMBMaskHeader *head, int id, short bit, bool newGoesRight, int parentId, int childId)
{
    //Add leaf. _numEntries is incremented before the last constructor argument is computed.
    int leafId = head->_numEntries++;
    trie[leafId] = new SYMBMaskEntry(1, -1, -1, -1, id, head->_numEntries / 2);

    //Remap previous branch to point to new branch; _numEntries is the index the branch is about to take
    if (parentId >= 0)
    {
        if (trie[parentId]._leftId == childId)
        {
            trie[parentId]._leftId = head->_numEntries;
        }
        else
        {
            trie[parentId]._rightId = head->_numEntries;
        }
    }

    //Create new branch
    int branchId = head->_numEntries++;
    if (newGoesRight)
    {
        trie[branchId] = new SYMBMaskEntry(0, bit, childId, leafId, -1, -1);
    }
    else
    {
        trie[branchId] = new SYMBMaskEntry(0, bit, leafId, childId, -1, -1);
    }

    if (parentId < 0)
    {
        //The branch (the last entry written), not the leaf, is the new root
        head->_rootId = head->_numEntries - 1;
    }
}
//Code written by Mawootad
/// <summary>
/// Resets the trie described by <paramref name="maskHeader"/> and rebuilds it by inserting
/// every id in <paramref name="indices"/>, in array order.
/// </summary>
/// <param name="indices">String ids to insert, one <c>AddToTrie</c> call each.</param>
/// <param name="header">String table header handed through to <c>AddToTrie</c>.</param>
/// <param name="maskHeader">Trie header; its root id and entry count are zeroed before inserting.</param>
/// <param name="entries">Entry array handed through to <c>AddToTrie</c>.</param>
public static void Build(int[] indices, SYMBHeader *header, SYMBMaskHeader *maskHeader, SYMBMaskEntry *entries)
{
    //Start from an empty trie
    maskHeader->_rootId = 0;
    maskHeader->_numEntries = 0;

    //Insert the ids one at a time. This seems to be roughly how the file is normally built,
    //as it gives the same leaf-node-leaf-node pattern in the entry array
    for (int i = 0; i < indices.Length; i++)
    {
        AddToTrie(entries, maskHeader, indices[i], header);
    }
}