// Example 1
        /// <summary>
        /// Searches for the substring identified by (<paramref name="sa"/>, <paramref name="s2"/>)
        /// inside the substring identified by (this, <paramref name="s1"/>), comparing raw bytes
        /// of <paramref name="s"/> via the <paramref name="prefixes"/> indirection.
        /// Returns the byte offset of the first match, or -1 if there is none.
        /// </summary>
        public int IndexOf(int s1, SubstringArray sa, int s2, byte[] s, int[] prefixes)
        {
            int haystackStart = prefixes[indexes[s1]];
            int haystackLen   = lengths[s1];
            int needleStart   = prefixes[sa.indexes[s2]];
            int needleLen     = sa.lengths[s2];

            // Last start position that still leaves room for the needle.
            int lastStart = haystackStart + haystackLen - needleLen;

            for (int start = haystackStart; start <= lastStart; start++)
            {
                int matched = 0;
                while (matched < needleLen && s[start + matched] == s[needleStart + matched])
                {
                    matched++;
                }
                if (matched == needleLen)
                {
                    return start;
                }
            }
            return -1;
        }
// Example 2
        /// <summary>
        /// Stable LSD radix sort of the parallel (indexes, lengths, scores) arrays,
        /// ordered by score: four counting-sort passes, one per byte of the 32-bit
        /// score, least significant byte first.
        /// </summary>
        public void Sort()
        {
            var histogram = new int[256];
            var working   = new SubstringArray(size);

            for (int bitOffset = 0; bitOffset <= 24; bitOffset += 8)
            {
                // Clear counts from the previous pass (the first pass starts zeroed).
                if (bitOffset > 0)
                {
                    for (int j = 0; j < histogram.Length; j++)
                    {
                        histogram[j] = 0;
                    }
                }
                int i, count, rollingSum;
                // Histogram: count entries per value of the current score byte.
                for (i = 0, count = size; i < count; i++)
                {
                    int sortValue = scores[i];
                    int sortByte  = (sortValue >> bitOffset) & 0xff;
                    histogram[sortByte]++;
                }

                // Exclusive prefix sum: convert counts into bucket start offsets.
                for (i = 0, count = histogram.Length, rollingSum = 0; i < count; i++)
                {
                    int tmp = histogram[i];
                    histogram[i] = rollingSum;
                    rollingSum  += tmp;
                }

                // Scatter entries into the working array.  In-order processing keeps
                // each pass stable, which is what makes the multi-pass sort correct.
                for (i = 0, count = size; i < count; i++)
                {
                    int sortValue = scores[i];
                    int sortByte  = (sortValue >> bitOffset) & 0xff;
                    int newOffset = histogram[sortByte]++;
                    working.SetScore(newOffset, indexes[i], lengths[i], scores[i]);
                }

                // swap (brain transplant) innards: exchange the backing arrays with
                // the working copy rather than copying elements back.
                int[] t = working.indexes;
                working.indexes = indexes;
                indexes         = t;

                t = working.lengths;
                working.lengths = lengths;
                lengths         = t;

                t = working.scores;
                working.scores = scores;
                scores         = t;

                size         = working.size;
                working.size = 0;

                i = working.capacity;
                working.capacity = capacity;
                capacity         = i;
            }
        }
        /// <summary>
        /// Stable LSD radix sort of the parallel (indexes, lengths, scores) arrays,
        /// ordered by score: four counting-sort passes, one per byte of the 32-bit
        /// score, least significant byte first.
        /// </summary>
        // NOTE(review): duplicate of the Sort method above — the class cannot
        // compile with both copies present; one should be removed.
        public void Sort()
        {
            var histogram = new int[256];
            var working = new SubstringArray(size);

            for (int bitOffset = 0; bitOffset <= 24; bitOffset += 8)
            {
                // Clear counts from the previous pass (the first pass starts zeroed).
                if (bitOffset > 0)
                {
                    for (int j = 0; j < histogram.Length; j++)
                    {
                        histogram[j] = 0;
                    }
                }
                int i, count, rollingSum;
                // Histogram: count entries per value of the current score byte.
                for (i = 0, count = size; i < count; i++)
                {
                    int sortValue = scores[i];
                    int sortByte = (sortValue >> bitOffset) & 0xff;
                    histogram[sortByte]++;
                }

                // Exclusive prefix sum: convert counts into bucket start offsets.
                for (i = 0, count = histogram.Length, rollingSum = 0; i < count; i++)
                {
                    int tmp = histogram[i];
                    histogram[i] = rollingSum;
                    rollingSum += tmp;
                }

                // Scatter entries into the working array.  In-order processing keeps
                // each pass stable, which is what makes the multi-pass sort correct.
                for (i = 0, count = size; i < count; i++)
                {
                    int sortValue = scores[i];
                    int sortByte = (sortValue >> bitOffset) & 0xff;
                    int newOffset = histogram[sortByte]++;
                    working.SetScore(newOffset, indexes[i], lengths[i], scores[i]);
                }

                // swap (brain transplant) innards: exchange the backing arrays with
                // the working copy rather than copying elements back.
                int[] t = working.indexes;
                working.indexes = indexes;
                indexes = t;

                t = working.lengths;
                working.lengths = lengths;
                lengths = t;

                t = working.scores;
                working.scores = scores;
                scores = t;

                size = working.size;
                working.size = 0;

                i = working.capacity;
                working.capacity = capacity;
                capacity = i;
            }
        }
        /// <summary>
        /// Builds the shared dictionary: greedily keeps the highest-scoring
        /// substrings that are not already covered by a kept string, then lays
        /// them out back-to-front into a buffer of <paramref name="desiredLength"/>
        /// bytes (trimming any unused prefix).
        /// </summary>
        private byte[] Pack(int desiredLength)
        {
            var pruned = new SubstringArray(1024);
            int total = 0;  // running byte total of kept strings (renamed to avoid shadowing the `size` field)
            int i;

            // Walk substrings from highest score to lowest.
            for (i = substrings.Size - 1; i >= 0; i--)
            {
                // Skip strings already contained in something we kept.
                bool covered = false;
                for (int j = 0, c = pruned.Size; j < c; j++)
                {
                    if (pruned.IndexOf(j, substrings, i, bytes, suffixArray) != -1)
                    {
                        covered = true;
                        break;
                    }
                }
                if (covered)
                {
                    continue;
                }

                // Drop any previously kept strings that this one subsumes.
                for (int j = pruned.Size - 1; j >= 0; j--)
                {
                    if (substrings.IndexOf(i, pruned, j, bytes, suffixArray) != -1)
                    {
                        total -= pruned.Length(j);
                        pruned.Remove(j);
                    }
                }

                pruned.SetScore(pruned.Size, substrings.Index(i), substrings.Length(i), substrings.Score(i));
                total += substrings.Length(i);

                // We calculate 2x because when we lay the strings out end to end we will merge common prefix/suffixes
                if (total >= 2 * desiredLength)
                {
                    break;
                }
            }

            var packed = new byte[desiredLength];
            int pi = desiredLength;

            // Fill the buffer from the end toward the front.
            for (i = 0; i < pruned.Size && pi > 0; i++)
            {
                int length = pruned.Length(i);
                if (length > pi)
                {
                    length = pi;  // clamp the final string to the remaining room
                }
                pi -= prepend(bytes, suffixArray[pruned.Index(i)], packed, pi, length);
            }

            // Trim the unused leading bytes, if the buffer wasn't filled.
            if (pi > 0)
            {
                packed = packed.Skip(pi).ToArray();
            }

            return packed;
        }
        // TODO Bring this up to parity with C++ version, which has optimized
        /// <summary>
        /// Walks the LCP (longest common prefix) array that parallels the suffix
        /// array, tracking "active" runs of suffixes sharing a prefix.  When a run
        /// ends, the shared substring is scored by the number of distinct documents
        /// it occurs in and recorded in <c>substrings</c>, which is finally sorted
        /// by score.
        /// </summary>
        private void ComputeSubstrings()
        {
            var activeSubstrings = new SubstringArray(128);
            var uniqueDocIds = new HashSet<int>();

            substrings = new SubstringArray(1024);
            int n = lcp.Length;

            int lastLCP = lcp[0];
            for (int i = 1; i <= n; i++)
            {
                // Note we need to process currently existing runs, so we do that by acting like we hit an LCP of 0 at the end.
                // That is why we loop i <= n vs i < n.  Otherwise runs that exist at the end of the suffixarray/lcp will
                // never be "cashed in" and counted in the substrings.  DictionaryOptimizerTest has a unit test for this.
                int currentLCP = i == n ? 0 : lcp[i];

                if (currentLCP > lastLCP)
                {
                    // LCP rose: open one new active run per newly shared prefix length.
                    // The order here is important so we can optimize adding redundant strings below.
                    for (int j = lastLCP + 1; j <= currentLCP; j++)
                    {
                        activeSubstrings.Add(i, j, 0);
                    }
                }
                else if (currentLCP < lastLCP)
                {
                    // LCP fell: every active run longer than the new LCP has ended; score and retire it.
                    int lastActiveIndex = -1, lastActiveLength = -1, lastActiveCount = -1;
                    for (int j = activeSubstrings.Size - 1; j >= 0; j--)
                    {
                        if (activeSubstrings.Length(j) > currentLCP)
                        {
                            // The run spans suffixes Index(j)-1 .. i-1 (an LCP entry
                            // relates a pair of adjacent suffixes), hence the +1.
                            int activeCount = i - activeSubstrings.Index(j) + 1;
                            int activeLength = activeSubstrings.Length(j);
                            int activeIndex = activeSubstrings.Index(j);

                            // Ok we have a string which occurs activeCount times.  The true measure of its
                            // value is how many unique documents it occurs in, because occurring 1000 times in the same
                            // document isn't valuable because once it occurs once, subsequent occurrences will reference
                            // a previous occurring instance in the document.  So for 2 documents: "garrick garrick garrick toubassi",
                            // "toubassi", the string toubassi is far more valuable in a shared dictionary.  So find out
                            // how many unique documents this string occurs in.  We do this by taking the start position of
                            // each occurrence, and then map that back to the document using the "starts" array, and uniquing.
                            //
                            // TODO Bring this up to parity with C++ version, which has optimized
                            //

                            // Visits exactly activeCount occurrences of the run.
                            for (int k = activeSubstrings.Index(j) - 1; k < i; k++)
                            {
                                int byteIndex = suffixArray[k];

                                // Could make this a lookup table if we are willing to burn an int[bytes.length] but thats a lot
                                int docIndex = starts.BinarySearch(byteIndex);

                                if (docIndex < 0)
                                {
                                    // BinarySearch returns the bitwise complement of the
                                    // insertion point on a miss; -docIndex - 2 converts it
                                    // to the document whose start precedes byteIndex.
                                    docIndex = -docIndex -2;
                                }

                                // While we are at it lets make sure this is a string that actually exists in a single
                                // document, vs spanning two concatenated documents.  The idea is that for documents
                                // "http://espn.com", "http://google.com", "http://yahoo.com", we don't want to consider
                                // ".comhttp://" to be a legal string.  So make sure the length of this string doesn't
                                // cross a document boundary for this particular occurrence.
                                int nextDocStart = docIndex < starts.Count - 1 ? starts[docIndex + 1] : bytes.Length;
                                if (activeLength <= nextDocStart - byteIndex)
                                {
                                    uniqueDocIds.Add(docIndex);
                                }
                            }

                            int scoreCount = uniqueDocIds.Count;

                            uniqueDocIds.Clear();

                            activeSubstrings.Remove(j);

                            // Zero means every occurrence straddled a document boundary.
                            if (scoreCount == 0)
                            {
                                continue;
                            }

                            // Don't add redundant strings.  If we just added ABC, don't add AB if it has the same count.  This cuts down the size of substrings
                            // from growing very large.
                            if (!(lastActiveIndex != -1 && lastActiveIndex == activeIndex && lastActiveCount == activeCount && lastActiveLength > activeLength))
                            {
                                // Only keep strings longer than 3 bytes (shorter ones
                                // presumably don't pay for their dictionary overhead).
                                if (activeLength > 3)
                                {
                                    substrings.Add(activeIndex, activeLength, scoreCount);
                                }
                            }
                            lastActiveIndex = activeIndex;
                            lastActiveLength = activeLength;
                            lastActiveCount = activeCount;
                        }
                    }
                }
                lastLCP = currentLCP;
            }
            substrings.Sort();
        }
        /// <summary>
        /// Searches for the substring identified by (<paramref name="sa"/>, <paramref name="s2"/>)
        /// inside the substring identified by (this, <paramref name="s1"/>), comparing raw bytes
        /// of <paramref name="s"/> via the <paramref name="prefixes"/> indirection.
        /// Returns the byte offset of the first match, or -1 if there is none.
        /// </summary>
        // NOTE(review): duplicate of the IndexOf method above — the class cannot
        // compile with both copies present; one should be removed.
        public int IndexOf(int s1, SubstringArray sa, int s2, byte[] s, int[] prefixes)
        {
            int haystackStart = prefixes[indexes[s1]];
            int haystackLen = lengths[s1];
            int needleStart = prefixes[sa.indexes[s2]];
            int needleLen = sa.lengths[s2];

            // Last start position that still leaves room for the needle.
            int lastStart = haystackStart + haystackLen - needleLen;

            for (int start = haystackStart; start <= lastStart; start++)
            {
                int matched = 0;
                while (matched < needleLen && s[start + matched] == s[needleStart + matched])
                {
                    matched++;
                }
                if (matched == needleLen)
                {
                    return start;
                }
            }
            return -1;
        }
        /// <summary>
        /// Builds the shared dictionary: greedily keeps the highest-scoring
        /// substrings that are not already covered by a kept string, then lays
        /// them out back-to-front into a buffer of <paramref name="desiredLength"/>
        /// bytes (trimming any unused prefix).
        /// </summary>
        // NOTE(review): duplicate of the Pack method above — the class cannot
        // compile with both copies present; one should be removed.
        private byte[] Pack(int desiredLength)
        {
            var pruned = new SubstringArray(1024);
            int total = 0;  // running byte total of kept strings (renamed to avoid shadowing the `size` field)
            int i;

            // Walk substrings from highest score to lowest.
            for (i = substrings.Size - 1; i >= 0; i--)
            {
                // Skip strings already contained in something we kept.
                bool covered = false;
                for (int j = 0, c = pruned.Size; j < c; j++)
                {
                    if (pruned.IndexOf(j, substrings, i, bytes, suffixArray) != -1)
                    {
                        covered = true;
                        break;
                    }
                }
                if (covered)
                {
                    continue;
                }

                // Drop any previously kept strings that this one subsumes.
                for (int j = pruned.Size - 1; j >= 0; j--)
                {
                    if (substrings.IndexOf(i, pruned, j, bytes, suffixArray) != -1)
                    {
                        total -= pruned.Length(j);
                        pruned.Remove(j);
                    }
                }

                pruned.SetScore(pruned.Size, substrings.Index(i), substrings.Length(i), substrings.Score(i));
                total += substrings.Length(i);

                // We calculate 2x because when we lay the strings out end to end we will merge common prefix/suffixes
                if (total >= 2 * desiredLength)
                {
                    break;
                }
            }

            var packed = new byte[desiredLength];
            int pi = desiredLength;

            // Fill the buffer from the end toward the front.
            for (i = 0; i < pruned.Size && pi > 0; i++)
            {
                int length = pruned.Length(i);
                if (length > pi)
                {
                    length = pi;  // clamp the final string to the remaining room
                }
                pi -= prepend(bytes, suffixArray[pruned.Index(i)], packed, pi, length);
            }

            // Trim the unused leading bytes, if the buffer wasn't filled.
            if (pi > 0)
            {
                packed = packed.Skip(pi).ToArray();
            }

            return packed;
        }
        // TODO Bring this up to parity with C++ version, which has optimized
        /// <summary>
        /// Walks the LCP (longest common prefix) array that parallels the suffix
        /// array, tracking "active" runs of suffixes sharing a prefix.  When a run
        /// ends, the shared substring is scored by the number of distinct documents
        /// it occurs in and recorded in <c>substrings</c>, which is finally sorted
        /// by score.
        /// </summary>
        // NOTE(review): duplicate of the ComputeSubstrings method above — the
        // class cannot compile with both copies present; one should be removed.
        private void ComputeSubstrings()
        {
            var activeSubstrings = new SubstringArray(128);
            var uniqueDocIds     = new HashSet <int>();

            substrings = new SubstringArray(1024);
            int n = lcp.Length;

            int lastLCP = lcp[0];

            for (int i = 1; i <= n; i++)
            {
                // Note we need to process currently existing runs, so we do that by acting like we hit an LCP of 0 at the end.
                // That is why we loop i <= n vs i < n.  Otherwise runs that exist at the end of the suffixarray/lcp will
                // never be "cashed in" and counted in the substrings.  DictionaryOptimizerTest has a unit test for this.
                int currentLCP = i == n ? 0 : lcp[i];

                if (currentLCP > lastLCP)
                {
                    // LCP rose: open one new active run per newly shared prefix length.
                    // The order here is important so we can optimize adding redundant strings below.
                    for (int j = lastLCP + 1; j <= currentLCP; j++)
                    {
                        activeSubstrings.Add(i, j, 0);
                    }
                }
                else if (currentLCP < lastLCP)
                {
                    // LCP fell: every active run longer than the new LCP has ended; score and retire it.
                    int lastActiveIndex = -1, lastActiveLength = -1, lastActiveCount = -1;
                    for (int j = activeSubstrings.Size - 1; j >= 0; j--)
                    {
                        if (activeSubstrings.Length(j) > currentLCP)
                        {
                            // The run spans suffixes Index(j)-1 .. i-1 (an LCP entry
                            // relates a pair of adjacent suffixes), hence the +1.
                            int activeCount  = i - activeSubstrings.Index(j) + 1;
                            int activeLength = activeSubstrings.Length(j);
                            int activeIndex  = activeSubstrings.Index(j);

                            // Ok we have a string which occurs activeCount times.  The true measure of its
                            // value is how many unique documents it occurs in, because occurring 1000 times in the same
                            // document isn't valuable because once it occurs once, subsequent occurrences will reference
                            // a previous occurring instance in the document.  So for 2 documents: "garrick garrick garrick toubassi",
                            // "toubassi", the string toubassi is far more valuable in a shared dictionary.  So find out
                            // how many unique documents this string occurs in.  We do this by taking the start position of
                            // each occurrence, and then map that back to the document using the "starts" array, and uniquing.
                            //
                            // TODO Bring this up to parity with C++ version, which has optimized
                            //

                            // Visits exactly activeCount occurrences of the run.
                            for (int k = activeSubstrings.Index(j) - 1; k < i; k++)
                            {
                                int byteIndex = suffixArray[k];

                                // Could make this a lookup table if we are willing to burn an int[bytes.length] but thats a lot
                                int docIndex = starts.BinarySearch(byteIndex);

                                if (docIndex < 0)
                                {
                                    // BinarySearch returns the bitwise complement of the
                                    // insertion point on a miss; -docIndex - 2 converts it
                                    // to the document whose start precedes byteIndex.
                                    docIndex = -docIndex - 2;
                                }

                                // While we are at it lets make sure this is a string that actually exists in a single
                                // document, vs spanning two concatenated documents.  The idea is that for documents
                                // "http://espn.com", "http://google.com", "http://yahoo.com", we don't want to consider
                                // ".comhttp://" to be a legal string.  So make sure the length of this string doesn't
                                // cross a document boundary for this particular occurrence.
                                int nextDocStart = docIndex < starts.Count - 1 ? starts[docIndex + 1] : bytes.Length;
                                if (activeLength <= nextDocStart - byteIndex)
                                {
                                    uniqueDocIds.Add(docIndex);
                                }
                            }

                            int scoreCount = uniqueDocIds.Count;

                            uniqueDocIds.Clear();

                            activeSubstrings.Remove(j);

                            // Zero means every occurrence straddled a document boundary.
                            if (scoreCount == 0)
                            {
                                continue;
                            }

                            // Don't add redundant strings.  If we just added ABC, don't add AB if it has the same count.  This cuts down the size of substrings
                            // from growing very large.
                            if (!(lastActiveIndex != -1 && lastActiveIndex == activeIndex && lastActiveCount == activeCount && lastActiveLength > activeLength))
                            {
                                // Only keep strings longer than 3 bytes (shorter ones
                                // presumably don't pay for their dictionary overhead).
                                if (activeLength > 3)
                                {
                                    substrings.Add(activeIndex, activeLength, scoreCount);
                                }
                            }
                            lastActiveIndex  = activeIndex;
                            lastActiveLength = activeLength;
                            lastActiveCount  = activeCount;
                        }
                    }
                }
                lastLCP = currentLCP;
            }
            substrings.Sort();
        }