C# (CSharp) Rhea.Compression.Dictionary SubstringArray.Remove 예제들

프로그래밍 언어: C# (CSharp)

네임스페이스/패키지 이름: Rhea.Compression.Dictionary

클래스/타입: SubstringArray

메소드/함수: Remove

hotexamples.com에서의 예제들: 4

C# (CSharp) Rhea.Compression.Dictionary SubstringArray.Remove - 예제 4개를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 C# (CSharp) Rhea.Compression.Dictionary.SubstringArray.Remove의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 개선하는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

Index(3)

Length(3)

Remove(2)

SetScore(2)

Add(1)

IndexOf(1)

Score(1)

Sort(1)

예제 #1

파일 보기

파일: DictionaryOptimizer.cs 프로젝트: modulexcite/Rhea.Compression

        /// <summary>
        /// Builds the dictionary bytes from the scored candidates in <c>substrings</c>.
        /// Candidates are first pruned so that no kept entry is found inside another,
        /// then written into a buffer of <paramref name="desiredLength"/> bytes via <c>prepend</c>.
        /// </summary>
        /// <param name="desiredLength">Upper bound, in bytes, on the size of the returned dictionary.</param>
        /// <returns>
        /// The packed dictionary: <paramref name="desiredLength"/> bytes, or fewer when the
        /// kept substrings did not fill the whole buffer.
        /// </returns>
        private byte[] Pack(int desiredLength)
        {
            var pruned = new SubstringArray(1024);
            int i, size = 0;

            // Candidates are visited from the last entry of substrings to the first.
            // NOTE(review): presumably substrings is sorted so the best-scoring entries come last - confirm.
            for (i = substrings.Size - 1; i >= 0; i--)
            {
                // Skip the candidate when an entry that was already kept contains it
                // (IndexOf is taken to return -1 for "not found").
                bool alreadyCovered = false;
                for (int j = 0, c = pruned.Size; j < c; j++)
                {
                    if (pruned.IndexOf(j, substrings, i, bytes, suffixArray) != -1)
                    {
                        alreadyCovered = true;
                        break;
                    }
                }

                if (alreadyCovered)
                {
                    continue;
                }

                // Drop every kept entry that the candidate itself contains. Iterating
                // backwards keeps the remaining indices valid while removing.
                for (int j = pruned.Size - 1; j >= 0; j--)
                {
                    if (substrings.IndexOf(i, pruned, j, bytes, suffixArray) != -1)
                    {
                        size -= pruned.Length(j);
                        pruned.Remove(j);
                    }
                }

                // Writing at index pruned.Size presumably appends the candidate - confirm against SubstringArray.
                pruned.SetScore(pruned.Size, substrings.Index(i), substrings.Length(i), substrings.Score(i));
                size += substrings.Length(i);

                // We calculate 2x because when we lay the strings out end to end we will merge common prefix/suffixes
                if (size >= 2 * desiredLength)
                {
                    break;
                }
            }

            byte[] packed = new byte[desiredLength];

            // pi is the number of bytes still unclaimed; it shrinks by prepend's return value.
            int pi = desiredLength;

            int count;
            for (i = 0, count = pruned.Size; i < count && pi > 0; i++)
            {
                // Clamp the entry so it never asks for more than the space that is left.
                int length = pruned.Length(i);
                if (pi - length < 0)
                {
                    length = pi;
                }
                pi -= prepend(bytes, suffixArray[pruned.Index(i)], packed, pi, length);
            }

            if (pi > 0)
            {
                // The first pi bytes were left unclaimed by the loop above; keep only packed[pi..].
                // A direct array copy replaces the former LINQ Skip/Take chain, whose Take was a no-op.
                var trimmed = new byte[packed.Length - pi];
                System.Array.Copy(packed, pi, trimmed, 0, trimmed.Length);
                packed = trimmed;
            }

            return packed;
        }

예제 #2

파일 보기

파일: DictionaryOptimizer.cs 프로젝트: jtmueller/Rhea.Compression

        /// <summary>
        /// Builds the dictionary bytes in two phases: choose which entries of <c>substrings</c>
        /// to keep, then lay the kept entries out in a buffer of
        /// <paramref name="desiredLength"/> bytes.
        /// </summary>
        /// <param name="desiredLength">Upper bound, in bytes, on the size of the returned dictionary.</param>
        /// <returns>
        /// The packed dictionary: <paramref name="desiredLength"/> bytes, or fewer when the
        /// kept substrings did not fill the whole buffer.
        /// </returns>
        private byte[] Pack(int desiredLength)
        {
            SubstringArray pruned = SelectPackCandidates(desiredLength);
            return LayOutCandidates(pruned, desiredLength);
        }

        // Picks entries of substrings (last entry first) such that no kept entry is found inside
        // another, stopping once their combined length reaches twice desiredLength.
        private SubstringArray SelectPackCandidates(int desiredLength)
        {
            var pruned = new SubstringArray(1024);
            int size = 0;

            for (int i = substrings.Size - 1; i >= 0; i--)
            {
                // Skip the candidate when an entry that was already kept contains it
                // (IndexOf is taken to return -1 for "not found").
                bool alreadyCovered = false;
                for (int j = 0, c = pruned.Size; j < c; j++)
                {
                    if (pruned.IndexOf(j, substrings, i, bytes, suffixArray) != -1)
                    {
                        alreadyCovered = true;
                        break;
                    }
                }

                if (alreadyCovered)
                {
                    continue;
                }

                // Drop kept entries that the candidate contains; walking backwards keeps
                // the not-yet-visited indices stable while removing.
                for (int j = pruned.Size - 1; j >= 0; j--)
                {
                    if (substrings.IndexOf(i, pruned, j, bytes, suffixArray) != -1)
                    {
                        size -= pruned.Length(j);
                        pruned.Remove(j);
                    }
                }

                // Writing at index pruned.Size presumably appends the candidate - confirm against SubstringArray.
                pruned.SetScore(pruned.Size, substrings.Index(i), substrings.Length(i), substrings.Score(i));
                size += substrings.Length(i);

                // We calculate 2x because when we lay the strings out end to end we will merge common prefix/suffixes
                if (size >= 2 * desiredLength)
                {
                    break;
                }
            }

            return pruned;
        }

        // Writes the kept entries into a desiredLength-byte buffer via prepend and returns the
        // buffer, trimmed of any leading bytes that were never claimed.
        private byte[] LayOutCandidates(SubstringArray pruned, int desiredLength)
        {
            byte[] packed = new byte[desiredLength];

            // pi is the number of bytes still unclaimed; it shrinks by prepend's return value.
            int pi = desiredLength;

            for (int i = 0, count = pruned.Size; i < count && pi > 0; i++)
            {
                // Clamp the entry so it never asks for more than the space that is left.
                int length = pruned.Length(i);
                if (pi - length < 0)
                {
                    length = pi;
                }
                pi -= prepend(bytes, suffixArray[pruned.Index(i)], packed, pi, length);
            }

            if (pi <= 0)
            {
                return packed;
            }

            // Keep only packed[pi..]. A direct array copy replaces the former LINQ
            // Skip/Take chain, whose Take(packed.Length) never limited anything.
            var trimmed = new byte[packed.Length - pi];
            System.Array.Copy(packed, pi, trimmed, 0, trimmed.Length);
            return trimmed;
        }

예제 #3

파일 보기

파일: DictionaryOptimizer.cs 프로젝트: modulexcite/Rhea.Compression

        // TODO Bring this up to parity with the C++ version, which is said to be optimized
        // (the original note is cut off; presumably it refers to the unique-document counting below).
        /// <summary>
        /// Scans <c>lcp</c> for runs of suffixes that share a prefix and records each such
        /// repeated string in <c>substrings</c>, scored by the number of distinct documents
        /// it occurs in. The result is sorted with <c>substrings.Sort()</c>.
        /// </summary>
        /// <remarks>
        /// Replaces <c>substrings</c> with a fresh array. When <c>lcp</c> is empty the array is
        /// left empty and the method returns without reading <c>lcp[0]</c>.
        /// </remarks>
        private void ComputeSubstrings()
        {
            var activeSubstrings = new SubstringArray(128);
            var uniqueDocIds = new HashSet<int>();

            substrings = new SubstringArray(1024);
            int n = lcp.Length;

            // Nothing to scan: an empty LCP array yields no candidates, and reading lcp[0]
            // below would throw.
            if (n == 0)
            {
                return;
            }

            int lastLCP = lcp[0];
            for (int i = 1; i <= n; i++)
            {
                // Note we need to process currently existing runs, so we do that by acting like we hit an LCP of 0 at the end.
                // That is why we loop i <= n vs i < n.  Otherwise runs that exist at the end of the suffixarray/lcp will
                // never be "cashed in" and counted in the substrings.  DictionaryOptimizerTest has a unit test for this.
                int currentLCP = i == n ? 0 : lcp[i];

                if (currentLCP > lastLCP)
                {
                    // The shared prefix grew: open one active entry per newly reached length,
                    // each recorded with index i and score 0.
                    // The order here is important so we can optimize adding redundant strings below.
                    for (int j = lastLCP + 1; j <= currentLCP; j++)
                    {
                        activeSubstrings.Add(i, j, 0);
                    }
                }
                else if (currentLCP < lastLCP)
                {
                    // The shared prefix shrank: close every active entry longer than the new LCP.
                    int lastActiveIndex = -1, lastActiveLength = -1, lastActiveCount = -1;
                    for (int j = activeSubstrings.Size - 1; j >= 0; j--)
                    {
                        int activeLength = activeSubstrings.Length(j);
                        if (activeLength > currentLCP)
                        {
                            int activeIndex = activeSubstrings.Index(j);
                            int activeCount = i - activeIndex + 1;

                            // Ok we have a string which occurs activeCount times.  The true measure of its
                            // value is how many unique documents it occurs in, because occurring 1000 times in the same
                            // document isn't valuable because once it occurs once, subsequent occurrences will reference
                            // a previous occurring instance in the document.  So for 2 documents: "garrick garrick garrick toubassi",
                            // "toubassi", the string toubassi is far more valuable in a shared dictionary.  So find out
                            // how many unique documents this string occurs in.  We do this by taking the start position of
                            // each occurrence, and then map that back to the document using the "starts" array, and uniquing.
                            //
                            // TODO Bring this up to parity with the C++ version (see the note on the method).
                            //

                            for (int k = activeIndex - 1; k < i; k++)
                            {
                                int byteIndex = suffixArray[k];

                                // Could make this a lookup table if we are willing to burn an int[bytes.length] but that's a lot
                                int docIndex = starts.BinarySearch(byteIndex);

                                if (docIndex < 0)
                                {
                                    // Assuming starts is a List<int>: a miss returns the bitwise complement of the
                                    // index of the next larger start, so -docIndex - 2 (== ~docIndex - 1) is the
                                    // index of the last start that is <= byteIndex.
                                    docIndex = -docIndex - 2;
                                }

                                // While we are at it lets make sure this is a string that actually exists in a single
                                // document, vs spanning two concatenated documents.  The idea is that for documents
                                // "http://espn.com", "http://google.com", "http://yahoo.com", we don't want to consider
                                // ".comhttp://" to be a legal string.  So make sure the length of this string doesn't
                                // cross a document boundary for this particular occurrence.
                                int nextDocStart = docIndex < starts.Count - 1 ? starts[docIndex + 1] : bytes.Length;
                                if (activeLength <= nextDocStart - byteIndex)
                                {
                                    uniqueDocIds.Add(docIndex);
                                }
                            }

                            int scoreCount = uniqueDocIds.Count;

                            // The set is shared across entries, so reset it for the next one.
                            uniqueDocIds.Clear();

                            activeSubstrings.Remove(j);

                            // No occurrence fits inside a single document: discard it without
                            // updating the lastActive* trackers.
                            if (scoreCount == 0)
                            {
                                continue;
                            }

                            // Don't add redundant strings.  If we just added ABC, don't add AB if it has the same count.  This keeps
                            // substrings from growing very large.
                            if (!(lastActiveIndex != -1 && lastActiveIndex == activeIndex && lastActiveCount == activeCount && lastActiveLength > activeLength))
                            {
                                // Only strings longer than 3 bytes are recorded.
                                if (activeLength > 3)
                                {
                                    substrings.Add(activeIndex, activeLength, scoreCount);
                                }
                            }
                            lastActiveIndex = activeIndex;
                            lastActiveLength = activeLength;
                            lastActiveCount = activeCount;
                        }
                    }
                }
                lastLCP = currentLCP;
            }
            substrings.Sort();
        }

예제 #4

파일 보기

파일: DictionaryOptimizer.cs 프로젝트: jtmueller/Rhea.Compression

        // TODO Bring this up to parity with the C++ version, which is said to be optimized
        // (the original note is cut off; presumably it refers to the unique-document counting).
        /// <summary>
        /// Scans <c>lcp</c> for runs of suffixes that share a prefix and records each such
        /// repeated string in <c>substrings</c>, scored by the number of distinct documents
        /// it occurs in. The result is sorted with <c>substrings.Sort()</c>.
        /// </summary>
        /// <remarks>
        /// Replaces <c>substrings</c> with a fresh array. When <c>lcp</c> is empty the array is
        /// left empty and the method returns without reading <c>lcp[0]</c>.
        /// </remarks>
        private void ComputeSubstrings()
        {
            var activeSubstrings = new SubstringArray(128);
            var uniqueDocIds = new HashSet<int>();

            substrings = new SubstringArray(1024);
            int n = lcp.Length;

            // Nothing to scan: an empty LCP array yields no candidates, and reading lcp[0]
            // below would throw.
            if (n == 0)
            {
                return;
            }

            int lastLCP = lcp[0];

            for (int i = 1; i <= n; i++)
            {
                // Note we need to process currently existing runs, so we do that by acting like we hit an LCP of 0 at the end.
                // That is why we loop i <= n vs i < n.  Otherwise runs that exist at the end of the suffixarray/lcp will
                // never be "cashed in" and counted in the substrings.  DictionaryOptimizerTest has a unit test for this.
                int currentLCP = i == n ? 0 : lcp[i];

                if (currentLCP > lastLCP)
                {
                    // The shared prefix grew: open one active entry per newly reached length,
                    // each recorded with index i and score 0.
                    // The order here is important so we can optimize adding redundant strings below.
                    for (int j = lastLCP + 1; j <= currentLCP; j++)
                    {
                        activeSubstrings.Add(i, j, 0);
                    }
                }
                else if (currentLCP < lastLCP)
                {
                    // The shared prefix shrank: close every active entry longer than the new LCP.
                    int lastActiveIndex = -1, lastActiveLength = -1, lastActiveCount = -1;
                    for (int j = activeSubstrings.Size - 1; j >= 0; j--)
                    {
                        int activeLength = activeSubstrings.Length(j);
                        if (activeLength <= currentLCP)
                        {
                            continue;
                        }

                        int activeIndex = activeSubstrings.Index(j);
                        int activeCount = i - activeIndex + 1;

                        // Ok we have a string which occurs activeCount times.  The true measure of its
                        // value is how many unique documents it occurs in, because occurring 1000 times in the same
                        // document isn't valuable because once it occurs once, subsequent occurrences will reference
                        // a previous occurring instance in the document.  So for 2 documents: "garrick garrick garrick toubassi",
                        // "toubassi", the string toubassi is far more valuable in a shared dictionary.
                        int scoreCount = CountUniqueDocuments(activeIndex - 1, i, activeLength, uniqueDocIds);

                        activeSubstrings.Remove(j);

                        // No occurrence fits inside a single document: discard it without
                        // updating the lastActive* trackers.
                        if (scoreCount == 0)
                        {
                            continue;
                        }

                        // Don't add redundant strings.  If we just added ABC, don't add AB if it has the same count.  This keeps
                        // substrings from growing very large.
                        bool redundant = lastActiveIndex != -1
                            && lastActiveIndex == activeIndex
                            && lastActiveCount == activeCount
                            && lastActiveLength > activeLength;

                        // Only strings longer than 3 bytes are recorded.
                        if (!redundant && activeLength > 3)
                        {
                            substrings.Add(activeIndex, activeLength, scoreCount);
                        }

                        lastActiveIndex = activeIndex;
                        lastActiveLength = activeLength;
                        lastActiveCount = activeCount;
                    }
                }
                lastLCP = currentLCP;
            }
            substrings.Sort();
        }

        // Counts the distinct documents holding an occurrence of the length-byte string that starts at
        // suffixArray[k], for k in [firstSuffix, endSuffix), skipping occurrences that would run past
        // the end of their document. uniqueDocIds is scratch space and is left empty on return.
        private int CountUniqueDocuments(int firstSuffix, int endSuffix, int length, HashSet<int> uniqueDocIds)
        {
            // Each occurrence's start position is mapped back to its document through the "starts" array.
            for (int k = firstSuffix; k < endSuffix; k++)
            {
                int byteIndex = suffixArray[k];

                // Could make this a lookup table if we are willing to burn an int[bytes.length] but that's a lot
                int docIndex = starts.BinarySearch(byteIndex);

                if (docIndex < 0)
                {
                    // Assuming starts is a List<int>: a miss returns the bitwise complement of the
                    // index of the next larger start, so -docIndex - 2 (== ~docIndex - 1) is the
                    // index of the last start that is <= byteIndex.
                    docIndex = -docIndex - 2;
                }

                // While we are at it lets make sure this is a string that actually exists in a single
                // document, vs spanning two concatenated documents.  The idea is that for documents
                // "http://espn.com", "http://google.com", "http://yahoo.com", we don't want to consider
                // ".comhttp://" to be a legal string.  So make sure the length of this string doesn't
                // cross a document boundary for this particular occurrence.
                int nextDocStart = docIndex < starts.Count - 1 ? starts[docIndex + 1] : bytes.Length;
                if (length <= nextDocStart - byteIndex)
                {
                    uniqueDocIds.Add(docIndex);
                }
            }

            int found = uniqueDocIds.Count;

            // The set is reused by the caller for every entry, so hand it back empty.
            uniqueDocIds.Clear();
            return found;
        }