// Temporary constructor that takes an already-built parsing trie, expressed as
// (byte, int) tuples, and converts it into the ParsingTrieNode array stored on this instance.
// Per the original note, this should be removed after CreateParsingTire is implemented.
public EncodingData(byte[][] symbols, TextEncoding encoding, Tuple<byte, int>[] parsingTrie)
{
    _symbols = symbols;
    _encoding = encoding;

    // One node per tuple, in the same order: Item1 feeds valueOrNumChildren, Item2 feeds IndexOrSymbol.
    // NOTE(review): the member is spelled "valueOrNumChildren" here, while
    // CreateParsingTrieNodeAndChildren writes "ValueOrNumChildren" — confirm which
    // spelling ParsingTrieNode actually declares.
    var nodes = new ParsingTrieNode[parsingTrie.Length];
    int position = 0;
    foreach (Tuple<byte, int> entry in parsingTrie)
    {
        nodes[position] = new ParsingTrieNode()
        {
            valueOrNumChildren = entry.Item1,
            IndexOrSymbol = entry.Item2
        };
        position++;
    }

    _parsingTrie = nodes;
}
// Recursively appends the trie node for sortedSuffixes, followed by its children and all of
// their descendants, to parsingTrieList.
// Returns the index in parsingTrieList at which the node created by this call (a leaf when
// there is exactly one suffix, otherwise a parent) was placed.
// NOTE(review): the grouping below only merges *adjacent* suffixes that share a first byte, and
// reads Bytes[0] of every suffix — presumably the caller guarantees the list is sorted,
// non-empty and that no suffix is empty here; confirm against the caller.
private static int CreateParsingTrieNodeAndChildren(ref List<ParsingTrieNode> parsingTrieList, List<Suffix> sortedSuffixes)
{
    // Base case: a single remaining suffix becomes a leaf that records its symbol index.
    if (sortedSuffixes.Count == 1)
    {
        ParsingTrieNode leaf = new ParsingTrieNode();
        leaf.ValueOrNumChildren = 0;
        leaf.IndexOrSymbol = sortedSuffixes[0].SymbolIndex;

        int leafPosition = parsingTrieList.Count;
        parsingTrieList.Add(leaf);
        return leafPosition;
    }

    // Partition the suffixes into clumps, one clump per distinct run of leading byte.
    List<SuffixClump> clumps = new List<SuffixClump>(sortedSuffixes.Count);
    byte activeByte = sortedSuffixes[0].Bytes[0];
    SuffixClump activeClump = new SuffixClump(activeByte);
    clumps.Add(activeClump);

    // Track runs of clumps whose leading bytes increase by exactly one from clump to clump.
    Sequence currentSequence = new Sequence(0, activeByte);
    Sequence longestSequence = currentSequence;

    foreach (Suffix suffix in sortedSuffixes)
    {
        byte leadingByte = suffix.Bytes[0];
        if (leadingByte != activeByte)
        {
            // A different leading byte starts a new clump.
            activeByte = leadingByte;

            if (activeByte == currentSequence.EndValue + 1)
            {
                // The new clump continues the current sequence.
                currentSequence.EndIndex++;
                currentSequence.EndValue++;
                if (!currentSequence.Equals(longestSequence) && currentSequence.CompareTo(longestSequence) > 0)
                {
                    // The current sequence now outranks the best one seen so far.
                    longestSequence = currentSequence;
                }
            }
            else
            {
                // The new clump begins a fresh sequence; clumps.Count is the index the
                // new clump is about to occupy.
                currentSequence = new Sequence(clumps.Count, activeByte);
            }

            activeClump = new SuffixClump(activeByte);
            clumps.Add(activeClump);
        }

        // Either way, the suffix joins the active clump with its first byte stripped off.
        activeClump.Suffixes.Add(new Suffix(suffix.SymbolIndex, suffix.Bytes.Slice(1)));
    }

    // The number of children is now known, so the parent node can be built.
    ParsingTrieNode parent = new ParsingTrieNode();
    parent.ValueOrNumChildren = (byte)clumps.Count;

    // A sequence map is only recorded when the longest sequence's Length exceeds 5.
    if (longestSequence.Length > 5)
    {
        parent.IndexOrSymbol = longestSequence.CreateSequenceMap();
    }
    else
    {
        parent.IndexOrSymbol = 0;
    }

    int parentPosition = parsingTrieList.Count;
    parsingTrieList.Add(parent);

    // Reserve one placeholder slot per child directly after the parent. The children's
    // contents are filled in later, once the recursion has determined where each one points.
    int firstChildPosition = parsingTrieList.Count;
    parsingTrieList.AddRange(new ParsingTrieNode[clumps.Count]);

    // Build each child: it carries the clump's leading byte and the index of the node
    // created (recursively) for the clump's remaining suffixes.
    ParsingTrieNode[] children = new ParsingTrieNode[clumps.Count];
    for (int i = 0; i < clumps.Count; i++)
    {
        SuffixClump clump = clumps[i];
        ParsingTrieNode child = new ParsingTrieNode();
        child.ValueOrNumChildren = clump.BeginningByte;
        child.IndexOrSymbol = CreateParsingTrieNodeAndChildren(ref parsingTrieList, clump.Suffixes);
        children[i] = child;
    }

    // Drop the finished children into the slots reserved for them.
    for (int i = 0; i < children.Length; i++)
    {
        parsingTrieList[firstChildPosition + i] = children[i];
    }

    return parentPosition;
}