Exemple #1
0
        // Temporary constructor that takes an already-built parsing trie expressed as (byte, int) tuples.
        // Per the original note, this should be removed after CreateParsingTire is implemented.
        //
        // symbols     - stored as-is in _symbols.
        // encoding    - stored as-is in _encoding.
        // parsingTrie - one tuple per trie node; each tuple is copied into a ParsingTrieNode.
        public EncodingData(byte[][] symbols, TextEncoding encoding, Tuple<byte, int>[] parsingTrie)
        {
            _symbols = symbols;
            _encoding = encoding;

            int nodeCount = parsingTrie.Length;
            ParsingTrieNode[] nodes = new ParsingTrieNode[nodeCount];

            // Item1 feeds valueOrNumChildren and Item2 feeds IndexOrSymbol, position for position.
            // NOTE(review): the member is spelled `valueOrNumChildren` here but `ValueOrNumChildren`
            // elsewhere in this file - confirm which spelling ParsingTrieNode actually declares.
            for (int position = 0; position < nodeCount; position++)
            {
                Tuple<byte, int> entry = parsingTrie[position];

                ParsingTrieNode node = new ParsingTrieNode();
                node.valueOrNumChildren = entry.Item1;
                node.IndexOrSymbol = entry.Item2;

                nodes[position] = node;
            }

            _parsingTrie = nodes;
        }
        // Recursively appends the trie node for the given suffixes, together with all of its descendants,
        // to parsingTrieList.
        //
        // parsingTrieList - flat list of nodes being built; this call only appends to it and fills slots it reserved itself.
        // sortedSuffixes  - the suffixes to place under this node; they are grouped by their first byte in the order given,
        //                   so equal first bytes are expected to be adjacent (presumably guaranteed by the caller's sort).
        //
        // Returns the index in parsingTrieList at which the node created by this call (leaf or parent) was placed.
        private static int CreateParsingTrieNodeAndChildren(ref List<ParsingTrieNode> parsingTrieList, List<Suffix> sortedSuffixes)
        {
            // A lone suffix terminates the recursion: emit a leaf (zero children) carrying the symbol index.
            if (sortedSuffixes.Count == 1)
            {
                ParsingTrieNode leaf = new ParsingTrieNode
                {
                    ValueOrNumChildren = 0,
                    IndexOrSymbol = sortedSuffixes[0].SymbolIndex
                };

                int leafPosition = parsingTrieList.Count;
                parsingTrieList.Add(leaf);
                return leafPosition;
            }

            // Partition the suffixes into clumps that share the same first byte.
            var clumps = new List<SuffixClump>(sortedSuffixes.Count);
            byte clumpByte = sortedSuffixes[0].Bytes[0];
            var activeClump = new SuffixClump(clumpByte);
            clumps.Add(activeClump);

            // Track runs of clumps whose first bytes are consecutive values, remembering the best run seen.
            var run = new Sequence(0, clumpByte);
            var bestRun = run;

            for (int s = 0; s < sortedSuffixes.Count; s++)
            {
                Suffix suffix = sortedSuffixes[s];
                byte firstByte = suffix.Bytes[0];

                if (firstByte != clumpByte)
                {
                    // The first byte changed, so a new clump starts here.
                    clumpByte = firstByte;

                    if (clumpByte == run.EndValue + 1)
                    {
                        // The new byte directly follows the run's last value: extend the current run.
                        run.EndIndex++;
                        run.EndValue++;

                        if (!run.Equals(bestRun) && run.CompareTo(bestRun) > 0)
                        {
                            // The current run now beats the best one recorded so far.
                            bestRun = run;
                        }
                    }
                    else
                    {
                        // Not consecutive: start a fresh run at the index the new clump is about to occupy.
                        run = new Sequence(clumps.Count, clumpByte);
                    }

                    activeClump = new SuffixClump(clumpByte);
                    clumps.Add(activeClump);
                }

                // Every suffix goes into the active clump with its first byte stripped off.
                activeClump.Suffixes.Add(new Suffix(suffix.SymbolIndex, suffix.Bytes.Slice(1)));
            }

            // The number of children is now known, so the parent node can be built and appended.
            var parent = new ParsingTrieNode();
            parent.ValueOrNumChildren = (byte)clumps.Count;
            // A sequence map is only recorded when the best run is longer than 5; otherwise 0 is stored.
            parent.IndexOrSymbol = bestRun.Length > 5 ? bestRun.CreateSequenceMap() : 0;

            int parentPosition = parsingTrieList.Count;
            parsingTrieList.Add(parent);

            // Reserve one slot per child immediately after the parent. The recursive calls below append
            // the grandchildren after these slots; the slots themselves are filled in at the very end.
            int firstChildSlot = parsingTrieList.Count;
            int childCount = clumps.Count;

            for (int reserved = 0; reserved < childCount; reserved++)
            {
                parsingTrieList.Add(default(ParsingTrieNode));
            }

            // Build each child: it stores the clump's first byte and the index of the subtree created for the clump.
            var stagedChildren = new ParsingTrieNode[childCount];

            for (int c = 0; c < childCount; c++)
            {
                SuffixClump clump = clumps[c];

                stagedChildren[c] = new ParsingTrieNode
                {
                    ValueOrNumChildren = clump.BeginningByte,
                    IndexOrSymbol = CreateParsingTrieNodeAndChildren(ref parsingTrieList, clump.Suffixes)
                };
            }

            // Copy the finished children into the slots reserved for them.
            for (int c = 0; c < childCount; c++)
            {
                parsingTrieList[firstChildSlot + c] = stagedChildren[c];
            }

            return parentPosition;
        }