Exemplo n.º 1
0
        /// <summary>
        /// Appends to <paramref name="sb"/> a parenthesised, '|'-separated group built from the links
        /// matched by the query (<paramref name="any"/>, <paramref name="start"/>, <paramref name="any"/>).
        /// </summary>
        /// <remarks>
        /// A matched link whose target equals <paramref name="patternMarker"/> contributes an empty alternative.
        /// For any other target that is not an external reference, the target link is read; when its source is an
        /// external reference, its target is a key of <paramref name="chars"/> and the converted source minus 10
        /// equals <paramref name="initialPosition"/>, the mapped character is appended and the method recurses
        /// from that target with the position increased by one.
        /// </remarks>
        /// <param name="sb">The builder that receives the generated text.</param>
        /// <param name="initialPosition">The position a character link must carry to be accepted at this level.</param>
        /// <param name="continue">The value returned from the visitor after every visited link.</param>
        private static void AppendPattern(ILinks <ulong> links, LinksConstants <ulong> constants, ulong start, ulong patternMarker, Dictionary <ulong, char> chars, ulong any, ulong @continue, StringBuilder sb, ulong initialPosition, RawNumberToAddressConverter <ulong> rawNumberToAddressConverter)
        {
            var branchCount = 0;

            // Writes the separator before every alternative except the first one and counts the alternative.
            void BeginAlternative()
            {
                if (branchCount > 0)
                {
                    sb.Append('|');
                }
                branchCount++;
            }

            sb.Append('(');

            var query = new UInt64Link(any, start, any);
            links.Each(parts =>
            {
                var current = new UInt64Link(parts);
                var target = current.Target;

                if (target == patternMarker)
                {
                    // The pattern may end here: an empty alternative.
                    BeginAlternative();
                    return @continue;
                }
                if (constants.IsExternalReference(target))
                {
                    return @continue;
                }

                var positionedChar = new UInt64Link(links.GetLink(target));
                if (!constants.IsExternalReference(positionedChar.Source) || !chars.TryGetValue(positionedChar.Target, out char symbol))
                {
                    return @continue;
                }

                var position = rawNumberToAddressConverter.Convert(positionedChar.Source) - 10;
                if (position != initialPosition)
                {
                    return @continue;
                }

                BeginAlternative();
                sb.Append(symbol);
                // Continue the pattern with whatever may follow this character.
                AppendPattern(links, constants, target, patternMarker, chars, any, @continue, sb, initialPosition + 1, rawNumberToAddressConverter);
                return @continue;
            }, query);

            sb.Append(')');
        }
Exemplo n.º 2
0
 /// <summary>
 /// Builds a short textual representation of <paramref name="linkAddress"/>.
 /// </summary>
 /// <returns>
 /// The mapped character in single quotes when the address is a key of <paramref name="chars"/>;
 /// the converted number when the address is an external reference;
 /// <c>[number]'char'</c> when the link's source is an external reference and its target is a key of
 /// <paramref name="chars"/>; otherwise the address itself.
 /// </returns>
 private static string LinkToString(ILinks <ulong> links, LinksConstants <ulong> constants, ulong linkAddress, Dictionary <ulong, char> chars, RawNumberToAddressConverter <ulong> rawNumberToAddressConverter)
 {
     if (chars.TryGetValue(linkAddress, out char symbol))
     {
         return $"'{symbol.ToString()}'";
     }

     if (constants.IsExternalReference(linkAddress))
     {
         var number = rawNumberToAddressConverter.Convert(linkAddress);
         return number.ToString();
     }

     var parts = new UInt64Link(links.GetLink(linkAddress));
     var sourceIsExternal = constants.IsExternalReference(parts.Source);
     if (sourceIsExternal && chars.TryGetValue(parts.Target, out char targetSymbol))
     {
         return $"[{rawNumberToAddressConverter.Convert(parts.Source)}]'{targetSymbol}'";
     }

     return linkAddress.ToString();
 }
 /// <summary>
 /// Commits a transition for the current transaction that carries only the <paramref name="before"/> state.
 /// </summary>
 private void CommitDeletion(UInt64Link before)
 {
     var transition = new Transition
     {
         TransactionId = _currentTransactionId,
         Before = before
     };
     CommitTransition(transition);
 }
 /// <summary>
 /// Commits a transition for the current transaction that carries both the <paramref name="before"/>
 /// and the <paramref name="after"/> state.
 /// </summary>
 private void CommitUpdate(UInt64Link before, UInt64Link after)
 {
     var transition = new Transition
     {
         TransactionId = _currentTransactionId,
         Before = before,
         After = after
     };
     CommitTransition(transition);
 }
 /// <summary>
 /// Commits a transition for the current transaction that carries only the <paramref name="after"/> state.
 /// </summary>
 private void CommitCreation(UInt64Link after)
 {
     var transition = new Transition
     {
         TransactionId = _currentTransactionId,
         After = after
     };
     CommitTransition(transition);
 }
            /// <summary>
            /// Compresses <paramref name="sequence"/> by repeatedly replacing occurrences of the currently
            /// most frequent adjacent pair with a link created for that pair.
            /// </summary>
            /// <remarks>
            /// Original algorithm idea: https://en.wikipedia.org/wiki/Byte_pair_encoding .
            /// Faster version (pairs' frequencies dictionary is not recreated).
            /// Replaced elements leave zero "holes" in a working copy of the sequence; holes are skipped while
            /// scanning and squeezed out when the result is built, so zero is treated as "no element".
            /// The pair statistics are kept in instance fields and are not cleared at the start of a call.
            /// </remarks>
            /// <param name="sequence">The sequence to compress. It is not modified.</param>
            /// <returns>
            /// <c>null</c> when <c>sequence.IsNullOrEmpty()</c> is true, the same array when it has one element,
            /// otherwise a new array with the compressed sequence.
            /// </returns>
            public ulong[] Precompress2(ulong[] sequence)
            {
                if (sequence.IsNullOrEmpty())
                    return null;

                if (sequence.Length == 1)
                    return sequence;

                var newLength = sequence.Length;
                var copy = new ulong[sequence.Length];
                copy[0] = sequence[0];

                // Copy the input and count every adjacent pair, remembering the most frequent one.
                for (var i = 1; i < sequence.Length; i++)
                {
                    copy[i] = sequence[i];

                    var pair = new UInt64Link(sequence[i - 1], sequence[i]);

                    ulong frequency;
                    if (_pairsFrequencies.TryGetValue(pair, out frequency))
                    {
                        var newFrequency = frequency + 1;

                        if (_maxFrequency < newFrequency)
                        {
                            _maxFrequency = newFrequency;
                            _maxPair = pair;
                        }

                        _pairsFrequencies[pair] = newFrequency;
                    }
                    else
                        _pairsFrequencies.Add(pair, 1);
                }

                while (!_maxPair.IsNull())
                {
                    var maxPair = _maxPair;

                    // The next maximum is rediscovered while this pass walks the sequence.
                    ResetMaxPair();

                    var maxPairSource = maxPair.Source;

                    var maxPairLink = _links.CreateAndUpdate(maxPairSource, maxPair.Target);

                    // Substitute all usages
                    for (var i = 1; i < copy.Length; i++)
                    {
                        var startIndex = i - 1;

                        // Move the left position (and i with it) past any holes.
                        while (startIndex < copy.Length && copy[startIndex] == 0)
                        {
                            i++;
                            startIndex++;
                        }
                        // Stop when no element can follow startIndex. ">=" also covers the case where
                        // skipping trailing holes moved startIndex past the end of the array; an equality
                        // check alone would let copy[startIndex] below read out of range.
                        if (startIndex >= copy.Length - 1) break;

                        if (copy[startIndex] == maxPairSource)
                        {
                            while (i < copy.Length && copy[i] == 0) i++;
                            if (i == copy.Length) break;

                            if (copy[i] == maxPair.Target)
                            {
                                var oldLeft = copy[startIndex];
                                var oldRight = copy[i];

                                copy[startIndex] = maxPairLink;
                                copy[i] = 0;
                                // TODO: Instead of writing zero holes, the size of the range (hole) to jump over
                                // could be stored as a negative number; that would speed the algorithm up further.

                                // Done separately because pairs can overlap,
                                // e.g. in "aaa" the pair "aa" was counted twice.
                                var frequency = _pairsFrequencies[maxPair];
                                if (frequency == 1)
                                    _pairsFrequencies.Remove(maxPair);
                                else
                                    _pairsFrequencies[maxPair] = frequency - 1;

                                newLength--;

                                // Re-account the pair formed with the nearest element on the left.
                                if (startIndex > 0)
                                {
                                    var previous = startIndex - 1;
                                    while (previous >= 0 && copy[previous] == 0) previous--;
                                    if (previous >= 0)
                                    {
                                        var previousOldPair = new UInt64Link(copy[previous], oldLeft);
                                        if (_pairsFrequencies.TryGetValue(previousOldPair, out frequency))
                                        {
                                            if (frequency == 1)
                                                _pairsFrequencies.Remove(previousOldPair);
                                            else
                                                _pairsFrequencies[previousOldPair] = frequency - 1;
                                        }

                                        var previousNewPair = new UInt64Link(copy[previous], copy[startIndex]);
                                        if (_pairsFrequencies.TryGetValue(previousNewPair, out frequency))
                                        {
                                            _pairsFrequencies[previousNewPair] = frequency + 1;

                                            UpdateMaxPair(previousNewPair, frequency + 1);
                                        }
                                        else
                                            _pairsFrequencies.Add(previousNewPair, 1);
                                    }
                                }

                                // Re-account the pair formed with the nearest element on the right.
                                if (i < copy.Length)
                                {
                                    var next = i;
                                    while (next < copy.Length && copy[next] == 0) next++;
                                    if (next < copy.Length)
                                    {
                                        var nextOldPair = new UInt64Link(oldRight, copy[next]);
                                        if (_pairsFrequencies.TryGetValue(nextOldPair, out frequency))
                                        {
                                            if (frequency == 1)
                                                _pairsFrequencies.Remove(nextOldPair);
                                            else
                                                _pairsFrequencies[nextOldPair] = frequency - 1;
                                        }

                                        var nextNewPair = new UInt64Link(copy[startIndex], copy[next]);
                                        if (_pairsFrequencies.TryGetValue(nextNewPair, out frequency))
                                        {
                                            _pairsFrequencies[nextNewPair] = frequency + 1;

                                            UpdateMaxPair(nextNewPair, frequency + 1);
                                        }
                                        else
                                            _pairsFrequencies.Add(nextNewPair, 1);
                                    }
                                }
                            }
                        }
                        else
                        {
                            while (i < copy.Length && copy[i] == 0) i++;
                            if (i == copy.Length) break;

                            // Not a replacement site: offer this pair as a candidate for the next maximum.
                            var pair = new UInt64Link(copy[startIndex], copy[i]);

                            ulong frequency;
                            if (_pairsFrequencies.TryGetValue(pair, out frequency))
                                UpdateMaxPair(pair, frequency);
                        }
                    }
                }

                // Squeeze the holes out of the working copy.
                var final = new ulong[newLength];

                var j = 0;
                for (var i = 0; i < copy.Length; i++)
                {
                    while (i < copy.Length && copy[i] == 0) i++;
                    if (i == copy.Length) break;

                    final[j++] = copy[i];
                }

                return final;
            }
            /// <summary>
            /// Compresses <paramref name="sequence"/> by repeatedly replacing occurrences of the currently
            /// most frequent adjacent pair with a link created for that pair, compacting the working copy
            /// in place on every pass.
            /// </summary>
            /// <remarks>
            /// Original algorithm idea: https://en.wikipedia.org/wiki/Byte_pair_encoding .
            /// Faster version (pairs' frequencies dictionary is not recreated).
            /// The pair statistics are kept in instance fields and are not cleared at the start of a call.
            /// </remarks>
            /// <param name="sequence">The sequence to compress. It is not modified.</param>
            /// <returns>
            /// <c>null</c> when <c>sequence.IsNullOrEmpty()</c> is true, the same array when it has one element,
            /// otherwise a new array with the compressed sequence.
            /// </returns>
            public ulong[] Precompress0(ulong[] sequence)
            {
                if (sequence.IsNullOrEmpty())
                    return null;

                if (sequence.Length == 1)
                    return sequence;

                var oldLength = sequence.Length;
                var newLength = sequence.Length;

                // Can be faster if source sequence allowed to be changed
                var copy = new ulong[sequence.Length];
                copy[0] = sequence[0];

                for (var i = 1; i < sequence.Length; i++)
                {
                    copy[i] = sequence[i];

                    var pair = new UInt64Link(sequence[i - 1], sequence[i]);
                    UpdateMaxPair(pair, IncrementFrequency(pair));
                }

                while (!_maxPair.IsNull())
                {
                    var maxPairSource = _maxPair.Source;
                    var maxPairTarget = _maxPair.Target;
                    var maxPairResult = _links.CreateAndUpdate(maxPairSource, maxPairTarget);

                    // The loop below reads copy[r + 1], so it stops one element before the live end.
                    oldLength--;
                    var oldLengthMinusTwo = oldLength - 1;

                    // Substitute all usages
                    int w = 0, r = 0; // (r == read, w == write)
                    for (; r < oldLength; r++)
                    {
                        if (copy[r] == maxPairSource && copy[r + 1] == maxPairTarget)
                        {
                            if (r > 0)
                            {
                                // The left neighbour is the element most recently written.
                                var previous = copy[w - 1];
                                DecrementFrequency(new UInt64Link(previous, maxPairSource));
                                IncrementFrequency(new UInt64Link(previous, maxPairResult));
                            }
                            if (r < oldLengthMinusTwo)
                            {
                                var next = copy[r + 2];
                                DecrementFrequency(new UInt64Link(maxPairTarget, next));
                                IncrementFrequency(new UInt64Link(maxPairResult, next));
                            }

                            copy[w++] = maxPairResult;
                            r++;
                            newLength--;
                        }
                        else
                        {
                            copy[w++] = copy[r];
                        }
                    }

                    // Carry over the last live element unless it was consumed as the target of a replacement.
                    // In that case w already equals newLength and r points one past the live data, which on the
                    // first pass is one past the end of the array, so reading copy[r] would be out of range.
                    if (w < newLength)
                        copy[w] = copy[r];

                    _pairsFrequencies.Remove(_maxPair);

                    oldLength = newLength;

                    // TODO: Find out why the algorithm loops forever if the line
                    // "_pairsFrequencies.Remove(_maxPair);" is moved here.

                    // Faster than resetting the maximum and calling UpdateMaxPair for every dictionary entry.
                    UpdateMaxPair2();
                }

                var final = new ulong[newLength];
                Array.Copy(copy, final, newLength);

                return final;
            }
 /// <summary>
 /// Clears both tracked maxima: the pairs become <c>UInt64Link.Null</c> and the frequencies become 1.
 /// </summary>
 private void ResetMaxPair()
 {
     _maxPair = _maxPair2 = UInt64Link.Null;
     _maxFrequency = _maxFrequency2 = 1;
 }
 /// <summary>
 /// Increases the stored frequency of <paramref name="pair"/> by one, registering the pair with a
 /// frequency of 1 when it is not stored yet.
 /// </summary>
 /// <returns>The frequency of <paramref name="pair"/> after the update.</returns>
 private ulong IncrementFrequency(UInt64Link pair)
 {
     if (!_pairsFrequencies.TryGetValue(pair, out ulong count))
     {
         // First occurrence of this pair.
         _pairsFrequencies.Add(pair, 1);
         return 1;
     }

     count++;
     _pairsFrequencies[pair] = count;
     return count;
 }
Exemplo n.º 10
0
 /// <summary>
 /// Creates a compressor over <paramref name="links"/> and <paramref name="sequences"/> with empty pair
 /// statistics: no maximum pairs, maximum frequencies of 1 and an empty frequency dictionary.
 /// </summary>
 public Compressor(SynchronizedLinks<ulong> links, Sequences sequences)
 {
     // Collaborators.
     _links = links;
     _sequences = sequences;

     // Pair statistics start out empty.
     _pairsFrequencies = new UnsafeDictionary<UInt64Link, ulong>();
     _maxPair = _maxPair2 = UInt64Link.Null;
     _maxFrequency = _maxFrequency2 = 1;
 }
Exemplo n.º 11
0
        /// <summary>
        /// Compresses <paramref name="sequence"/> by repeatedly replacing occurrences of the currently
        /// most frequent adjacent pair with a link created for that pair in <paramref name="links"/>.
        /// </summary>
        /// <remarks>
        /// Original algorithm idea: https://en.wikipedia.org/wiki/Byte_pair_encoding .
        /// Faster version (pairs' frequencies dictionary is not recreated).
        /// Replaced elements leave zero "holes" in a working copy of the sequence; holes are skipped while
        /// scanning and squeezed out when the result is built, so zero is treated as "no element".
        /// </remarks>
        /// <param name="links">The links storage in which a link is created for every replaced pair.</param>
        /// <param name="sequence">The sequence to compress. It is not modified.</param>
        /// <returns>
        /// <c>null</c> when <c>sequence.IsNullOrEmpty()</c> is true, the same array when it has one element,
        /// otherwise a new array with the compressed sequence.
        /// </returns>
        public static ulong[] PrecompressSequence2(this SynchronizedLinks<ulong> links, ulong[] sequence)
        {
            if (sequence.IsNullOrEmpty())
                return null;

            if (sequence.Length == 1)
                return sequence;

            var newLength = sequence.Length;
            var copy = new ulong[sequence.Length];
            copy[0] = sequence[0];

            var pairsFrequencies = new Dictionary<UInt64Link, ulong>();

            var maxPair = UInt64Link.Null;
            ulong maxFrequency = 1;

            // Copy the input and count every adjacent pair, remembering the most frequent one.
            for (var i = 1; i < sequence.Length; i++)
            {
                copy[i] = sequence[i];

                var pair = new UInt64Link(sequence[i - 1], sequence[i]);

                ulong frequency;
                if (pairsFrequencies.TryGetValue(pair, out frequency))
                {
                    var newFrequency = frequency + 1;

                    if (maxFrequency < newFrequency)
                    {
                        maxFrequency = newFrequency;
                        maxPair = pair;
                    }

                    pairsFrequencies[pair] = newFrequency;
                }
                else
                    pairsFrequencies.Add(pair, 1);
            }

            while (!maxPair.IsNull())
            {
                var maxPairSource = maxPair.Source;

                var maxPairLink = links.CreateAndUpdate(maxPairSource, maxPair.Target);

                // Substitute all usages
                for (var i = 1; i < copy.Length; i++)
                {
                    var startIndex = i - 1;

                    if (copy[startIndex] == maxPairSource)
                    {
                        while (i < copy.Length && copy[i] == 0) i++;
                        if (i == copy.Length) break;

                        if (copy[i] == maxPair.Target)
                        {
                            var oldLeft = copy[startIndex];
                            var oldRight = copy[i];

                            copy[startIndex] = maxPairLink;
                            // TODO: Instead of writing zero holes, the size of the range (hole) to jump over
                            // could be stored as a negative number; that would speed the algorithm up further.
                            copy[i] = 0;

                            // Done separately because pairs can overlap,
                            // e.g. in "aaa" the pair "aa" was counted twice.
                            pairsFrequencies[maxPair]--;

                            newLength--;

                            // Re-account the pair formed with the nearest element on the left.
                            if (startIndex > 0)
                            {
                                var previous = startIndex - 1;
                                while (previous >= 0 && copy[previous] == 0) previous--;
                                if (previous >= 0)
                                {
                                    ulong frequency;

                                    var previousOldPair = new UInt64Link(copy[previous], oldLeft);
                                    if (pairsFrequencies.TryGetValue(previousOldPair, out frequency))
                                        pairsFrequencies[previousOldPair] = frequency - 1;

                                    var previousNewPair = new UInt64Link(copy[previous], copy[startIndex]);
                                    if (pairsFrequencies.TryGetValue(previousNewPair, out frequency))
                                        pairsFrequencies[previousNewPair] = frequency + 1;
                                    else
                                        pairsFrequencies.Add(previousNewPair, 1);
                                }
                            }

                            // Re-account the pair formed with the nearest element on the right.
                            if (i < copy.Length)
                            {
                                var next = i;
                                while (next < copy.Length && copy[next] == 0) next++;
                                if (next < copy.Length)
                                {
                                    ulong frequency;

                                    var nextOldPair = new UInt64Link(oldRight, copy[next]);
                                    if (pairsFrequencies.TryGetValue(nextOldPair, out frequency))
                                        pairsFrequencies[nextOldPair] = frequency - 1;

                                    var nextNewPair = new UInt64Link(copy[startIndex], copy[next]);
                                    if (pairsFrequencies.TryGetValue(nextNewPair, out frequency))
                                        pairsFrequencies[nextNewPair] = frequency + 1;
                                    else
                                        pairsFrequencies.Add(nextNewPair, 1);
                                }
                            }
                        }
                    }
                }

                // Pick the next pair: highest frequency above 1, ties broken by the larger Source + Target sum.
                maxPair = UInt64Link.Null;
                maxFrequency = 1;

                foreach (var pairsFrequency in pairsFrequencies)
                {
                    var frequency = pairsFrequency.Value;
                    if (frequency > 1)
                    {
                        var pair = pairsFrequency.Key;

                        if (maxFrequency < frequency)
                        {
                            maxFrequency = frequency;
                            maxPair = pair;
                        }
                        if (maxFrequency == frequency &&
                            (pair.Source + pair.Target) > (maxPair.Source + maxPair.Target))
                        {
                            maxPair = pair;
                        }
                    }
                }
            }

            // Squeeze the holes out of the working copy. Every index is visited, including the last one:
            // a loop that only emits copy[i - 1] for i < copy.Length drops a non-zero final element.
            var final = new ulong[newLength];

            var j = 0;
            for (var i = 0; i < copy.Length; i++)
            {
                if (copy[i] != 0)
                    final[j++] = copy[i];
            }

            return final;
        }
Exemplo n.º 12
0
        /// <summary>
        /// Compresses <paramref name="sequence"/> by repeatedly replacing occurrences of the currently
        /// most frequent adjacent pair with a link created for that pair in <paramref name="links"/>.
        /// </summary>
        /// <remarks>
        /// Original algorithm idea: https://en.wikipedia.org/wiki/Byte_pair_encoding .
        /// Slow version (pairs' frequencies dictionary is recreated).
        /// Replaced elements leave zero "holes" in a working copy of the sequence; holes are skipped while
        /// scanning and squeezed out when the result is built, so zero is treated as "no element".
        /// </remarks>
        /// <param name="links">The links storage in which a link is created for every replaced pair.</param>
        /// <param name="sequence">The sequence to compress. It is not modified.</param>
        /// <returns>
        /// <c>null</c> when <c>sequence.IsNullOrEmpty()</c> is true, the same array when it has one element,
        /// otherwise a new array with the compressed sequence.
        /// </returns>
        public static ulong[] PrecompressSequence1(this SynchronizedLinks<ulong> links, ulong[] sequence)
        {
            if (sequence.IsNullOrEmpty())
                return null;

            if (sequence.Length == 1)
                return sequence;

            var newLength = sequence.Length;

            var copy = new ulong[sequence.Length];
            Array.Copy(sequence, copy, sequence.Length);

            UInt64Link maxPair;

            do
            {
                // Recount all adjacent pairs of the current working copy from scratch.
                var pairsFrequencies = new Dictionary<UInt64Link, ulong>();

                maxPair = UInt64Link.Null;
                ulong maxFrequency = 1;

                for (var i = 1; i < copy.Length; i++)
                {
                    var startIndex = i - 1;

                    while (i < copy.Length && copy[i] == 0) i++;
                    if (i == copy.Length) break;

                    var pair = new UInt64Link(copy[startIndex], copy[i]);

                    ulong frequency;
                    if (pairsFrequencies.TryGetValue(pair, out frequency))
                    {
                        var newFrequency = frequency + 1;

                        if (maxFrequency < newFrequency)
                        {
                            maxFrequency = newFrequency;
                            maxPair = pair;
                        }

                        pairsFrequencies[pair] = newFrequency;
                    }
                    else
                        pairsFrequencies.Add(pair, 1);
                }

                if (!maxPair.IsNull())
                {
                    var maxPairLink = links.CreateAndUpdate(maxPair.Source, maxPair.Target);

                    // Substitute all usages
                    for (var i = 1; i < copy.Length; i++)
                    {
                        if (copy[i - 1] == maxPair.Source)
                        {
                            var startIndex = i - 1;

                            while (i < copy.Length && copy[i] == 0) i++;
                            if (i == copy.Length) break;

                            if (copy[i] == maxPair.Target)
                            {
                                copy[startIndex] = maxPairLink;
                                copy[i] = 0;
                                newLength--;
                            }
                        }
                    }
                }

            } while (!maxPair.IsNull());

            // Squeeze the holes out of the working copy. Every index is visited, including the last one:
            // a loop that only emits copy[i - 1] for i < copy.Length drops a non-zero final element.
            var final = new ulong[newLength];

            var j = 0;
            for (var i = 0; i < copy.Length; i++)
            {
                if (copy[i] != 0)
                    final[j++] = copy[i];
            }

            return final;
        }
Exemplo n.º 13
0
            /// <summary>
            /// Resets the tracked maximum and rescans the entries of the frequency dictionary for the pair
            /// with the highest frequency above 1; equal frequencies are resolved in favour of the pair with
            /// the larger Source + Target sum.
            /// </summary>
            private void UpdateMaxPair2()
            {
                ResetMaxPair();

                var entries = _pairsFrequencies.entries;
                for (var index = 0; index < entries.Length; index++)
                {
                    // Only entries with a non-negative hash code are considered.
                    if (entries[index].hashCode < 0)
                    {
                        continue;
                    }

                    var frequency = entries[index].value;
                    if (frequency <= 1 || frequency < _maxFrequency)
                    {
                        continue;
                    }

                    var candidate = entries[index].key;
                    if (frequency > _maxFrequency)
                    {
                        _maxFrequency = frequency;
                        _maxPair = candidate;
                    }
                    else if ((candidate.Source + candidate.Target) > (_maxPair.Source + _maxPair.Target))
                    {
                        // Same frequency as the current maximum: prefer the larger sum.
                        _maxPair = candidate;
                    }
                }
            }
Exemplo n.º 14
0
 /// <summary>
 /// Offers <paramref name="pair"/> with its <paramref name="frequency"/> as a candidate for the secondary
 /// maximum (<c>_maxPair2</c> / <c>_maxFrequency2</c>). A pair equal to the primary maximum is ignored.
 /// </summary>
 /// <remarks>
 /// If the pair already is the secondary maximum only its frequency is refreshed. Otherwise it replaces the
 /// secondary maximum when its frequency is higher, or when the frequency is equal and its Source + Target
 /// sum is larger.
 /// </remarks>
 private void UpdateMaxPair2(UInt64Link pair, ulong frequency)
 {
     if (!_maxPair.Equals(pair))
     {
         if (_maxPair2.Equals(pair))
         {
             _maxFrequency2 = frequency;
         }
         else if (_maxFrequency2 < frequency)
         {
             _maxFrequency2 = frequency;
             _maxPair2 = pair;
         }
         else if (_maxFrequency2 == frequency &&
                  (pair.Source + pair.Target) > (_maxPair2.Source + _maxPair2.Target))
         {
             // The tie-break is made against the secondary maximum, so the secondary maximum is the one
             // to replace; assigning the primary _maxPair here would desynchronise it from _maxFrequency.
             _maxPair2 = pair;
         }
     }
 }
Exemplo n.º 15
0
 /// <summary>
 /// Offers <paramref name="pair"/> with its <paramref name="frequency"/> as a candidate for the tracked
 /// maximum. Frequencies of 1 or less are ignored; equal frequencies are resolved in favour of the pair
 /// with the larger Source + Target sum.
 /// </summary>
 private void UpdateMaxPair(UInt64Link pair, ulong frequency)
 {
     if (frequency <= 1)
     {
         return;
     }

     if (frequency > _maxFrequency)
     {
         _maxFrequency = frequency;
         _maxPair = pair;
         return;
     }

     var tiesWithMaximum = frequency == _maxFrequency;
     if (tiesWithMaximum && (pair.Source + pair.Target) > (_maxPair.Source + _maxPair.Target))
     {
         _maxPair = pair;
     }
 }
Exemplo n.º 16
0
            /// <summary>
            /// Decreases the stored frequency of <paramref name="pair"/> by one and removes the pair from the
            /// dictionary when the frequency reaches zero. Does nothing when the pair is not stored.
            /// </summary>
            private void DecrementFrequency(UInt64Link pair)
            {
                if (!_pairsFrequencies.TryGetValue(pair, out ulong remaining))
                {
                    return;
                }

                remaining--;

                if (remaining != 0)
                {
                    _pairsFrequencies[pair] = remaining;
                    return;
                }

                // No occurrences left.
                _pairsFrequencies.Remove(pair);
            }
Exemplo n.º 17
0
        /// <summary>
        /// Demo entry point: stores a fixed list of strings as indexed sequences in a links store,
        /// prints the links it enumerates, builds a regular expression from the stored pattern
        /// links via <c>AppendPattern</c>, simplifies it, and reports every input string the
        /// resulting expression does not match.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            var constants = new LinksConstants<ulong>((1, long.MaxValue), (long.MaxValue + 1UL, ulong.MaxValue));

            using var memory = new UInt64ResizableDirectMemoryLinks(new HeapResizableDirectMemory());
            var links = memory.DecorateWithAutomaticUniquenessAndUsagesResolution();

            var toRawNumber = new AddressToRawNumberConverter<ulong>();
            var toAddress = new RawNumberToAddressConverter<ulong>();

            // Marker links: a root plus two markers derived from it (raw numbers 1 and 2).
            var root = links.GetOrCreate(1UL, 1UL);
            var unicodeSymbolMarker = links.GetOrCreate(root, toRawNumber.Convert(1));
            var patternRootMarker = links.GetOrCreate(root, toRawNumber.Convert(2));

            var charToUnicodeSymbolConverter = new Platform.Data.Doublets.Unicode.CharToUnicodeSymbolConverter<ulong>(links, toRawNumber, unicodeSymbolMarker);

            var inputs = new[] { "href", "target", "rel", "media", "hreflang", "type", "sizes", "content", "name", "src", "charset", "text", "cite", "ping", "alt", "sandbox", "width", "height", "data", "value", "poster", "coords", "shape", "scope", "action", "enctype", "method", "accept", "max", "min", "pattern", "placeholder", "step", "label", "wrap", "icon", "radiogroup" };

            var patternRootMarkerArray = new[] { patternRootMarker };

            // Every sequence is wrapped in the pattern root marker on both ends.
            var sequences = inputs
                .Select((word, wordIndex) => patternRootMarkerArray
                    .Concat(BuildSequence(word, wordIndex, links, toRawNumber, charToUnicodeSymbolConverter))
                    .Concat(patternRootMarkerArray)
                    .ToArray())
                .ToArray();

            var index = new SequenceIndex<ulong>(links);

            var any = links.Constants.Any;
            var @continue = links.Constants.Continue;

            foreach (var sequence in sequences)
            {
                index.Add(sequence);
            }

            var chars = new Dictionary<ulong, char>();

            // Print the enumerated links; links targeting the unicode symbol marker are also
            // recorded in the address -> character map.
            links.Each(parts =>
            {
                var current = new UInt64Link(parts);

                if (current.Target != unicodeSymbolMarker)
                {
                    var sourceText = LinkToString(links, constants, current.Source, chars, toAddress);
                    var targetText = LinkToString(links, constants, current.Target, chars, toAddress);
                    Console.WriteLine($"({current.Index}: {sourceText}->{targetText})");
                }
                else
                {
                    var symbol = (char)toAddress.Convert(current.Source);
                    chars.Add(current.Index, symbol);
                    Console.WriteLine($"({current.Index}: '{symbol}'->{current.Target})");
                }
                return @continue;
            }, new UInt64Link(any, any, any));

            var patternBuilder = new StringBuilder();
            patternBuilder.Append('^');
            AppendPattern(links, constants, patternRootMarker, patternRootMarker, chars, any, @continue, patternBuilder, 0UL, toAddress);
            patternBuilder.Append('$');
            var pattern = patternBuilder.ToString();

            // Applies the replacement repeatedly for as long as the rule still matches.
            string ReplaceWhileMatching(Regex rule, string subject, string replacement)
            {
                while (rule.IsMatch(subject))
                {
                    subject = rule.Replace(subject, replacement);
                }
                return subject;
            }

            // Removes parentheses around groups made only of letters and '?'.
            var plainGroupRule = new Regex(@"\(([a-z\?]*)\)", RegexOptions.Compiled);
            // Rewrites "(|t)" as "t?".
            var optionalLetterRule = new Regex(@"\(\|([a-z])\)", RegexOptions.Compiled);

            pattern = ReplaceWhileMatching(plainGroupRule, pattern, "$1");
            pattern = ReplaceWhileMatching(optionalLetterRule, pattern, "$1?");
            // The second rule can expose new plain groups, so the first rule runs again.
            pattern = ReplaceWhileMatching(plainGroupRule, pattern, "$1");

            var matcher = new Regex(pattern);

            foreach (var candidate in inputs)
            {
                if (!matcher.IsMatch(candidate))
                {
                    Console.WriteLine($"Error: {candidate} does not match the pattern.");
                }
            }

            Console.WriteLine(pattern);

            Console.WriteLine(links.Count());
            Console.WriteLine("Hello World!");
        }
Exemplo n.º 18
0
            /// <summary>
            /// Compresses a sequence by repeatedly replacing a repeated adjacent pair with the
            /// link returned by <c>_links.CreateAndUpdate</c> for that pair.
            /// </summary>
            /// <remarks>
            /// Original algorithm idea: https://en.wikipedia.org/wiki/Byte_pair_encoding .
            /// If pair repeats twice it is maximum pair.
            /// The input array is not modified; all work happens on a copy.
            /// The next pair to replace is searched for during the substitution pass itself, using
            /// the not-yet-substituted neighbours of every element that is copied through unchanged.
            /// Notes left by the original author mark this variant as the fastest of the ones tried
            /// ("4 sec" versus "8 sec" for tracking pairs on the rewritten data); a variant based on
            /// a pair-frequency dictionary was described as slower.
            /// Reads and writes the <c>_maxPair</c> field; it is null again when the method returns.
            /// </remarks>
            /// <param name="sequence">Sequence of link addresses to compress.</param>
            /// <returns>
            /// <c>null</c> for a null or empty input, the same array instance for a single-element
            /// input, otherwise a new array holding the compressed sequence.
            /// </returns>
            public ulong[] Precompress4(ulong[] sequence)
            {
                if (sequence.IsNullOrEmpty())
                {
                    return null;
                }

                if (sequence.Length == 1)
                {
                    return sequence;
                }

                var oldLength = sequence.Length;
                var newLength = sequence.Length;

                // Can be faster if source sequence allowed to be changed
                var copy = new ulong[sequence.Length];
                Array.Copy(sequence, copy, copy.Length);

                var set = new HashSet<UInt64Link>();

                // Seed: the first adjacent pair that occurs a second time.
                for (var i = 1; i < sequence.Length; i++)
                {
                    var pair = new UInt64Link(sequence[i - 1], sequence[i]);

                    if (!set.Add(pair))
                    {
                        _maxPair = pair;
                        break;
                    }
                }

                while (!_maxPair.IsNull())
                {
                    var maxPairSource = _maxPair.Source;
                    var maxPairTarget = _maxPair.Target;
                    var maxPairResult = _links.CreateAndUpdate(maxPairSource, maxPairTarget);

                    // From here on oldLength is the index of the last element, so copy[r + 1]
                    // is always in range inside the loop below.
                    oldLength--;

                    _maxPair = UInt64Link.Null;
                    set.Clear();

                    // Substitute all usages
                    int w = 0, r = 0; // (r == read, w == write)
                    for (; r < oldLength; r++)
                    {
                        if (copy[r] == maxPairSource && copy[r + 1] == maxPairTarget)
                        {
                            copy[w++] = maxPairResult;
                            r++; // skip the pair's second element
                            newLength--;
                        }
                        else
                        {
                            // Look for the next pair to replace only until one has been found.
                            if (_maxPair.IsNull())
                            {
                                var pair = new UInt64Link(copy[r], copy[r + 1]);
                                if (!set.Add(pair))
                                {
                                    _maxPair = pair;
                                }
                            }
                            copy[w++] = copy[r];
                        }
                    }

                    // The last element still has to be carried over unless it was consumed as the
                    // second half of a replaced pair. In that case r is one past the end of the
                    // data and reading copy[r] would be out of range on the first pass.
                    if (w < newLength)
                    {
                        copy[w] = copy[r];
                    }

                    oldLength = newLength;
                }

                var final = new ulong[newLength];
                Array.Copy(copy, final, newLength);

                return final;
            }