// Determines whether substring s2 of sa occurs within substring s1 of this array.
// Returns the absolute offset of the match in s, or -1 if there is no match.
// prefixes maps a substring's index to its starting byte offset in s.
public int IndexOf(int s1, SubstringArray sa, int s2, byte[] s, int[] prefixes)
{
    int index1 = indexes[s1];
    int length1 = lengths[s1];
    int index2 = sa.indexes[s2];
    int length2 = sa.lengths[s2];

    for (int i = prefixes[index1], n = prefixes[index1] + length1 - length2 + 1; i < n; i++)
    {
        bool found = true;
        for (int j = prefixes[index2], nj = prefixes[index2] + length2, i1 = i; j < nj; j++, i1++)
        {
            if (s[i1] != s[j])
            {
                found = false;
                break;
            }
        }
        if (found)
        {
            return i;
        }
    }
    return -1;
}
// Least significant digit (LSD) radix sort keyed on the 32-bit score, one byte per pass
// (4 passes), leaving entries in ascending score order.
public void Sort()
{
    var histogram = new int[256];
    var working = new SubstringArray(size);

    for (int bitOffset = 0; bitOffset <= 24; bitOffset += 8)
    {
        // Reset bucket counts for this pass (the array is already zeroed on the first pass).
        if (bitOffset > 0)
        {
            for (int j = 0; j < histogram.Length; j++)
            {
                histogram[j] = 0;
            }
        }

        // Count how many entries fall into each bucket for the current byte.
        int i, count, rollingSum;
        for (i = 0, count = size; i < count; i++)
        {
            int sortValue = scores[i];
            int sortByte = (sortValue >> bitOffset) & 0xff;
            histogram[sortByte]++;
        }

        // Turn the counts into starting offsets (prefix sums).
        for (i = 0, count = histogram.Length, rollingSum = 0; i < count; i++)
        {
            int tmp = histogram[i];
            histogram[i] = rollingSum;
            rollingSum += tmp;
        }

        // Scatter the entries into the working array in bucket order.
        for (i = 0, count = size; i < count; i++)
        {
            int sortValue = scores[i];
            int sortByte = (sortValue >> bitOffset) & 0xff;
            int newOffset = histogram[sortByte]++;
            working.SetScore(newOffset, indexes[i], lengths[i], scores[i]);
        }

        // swap (brain transplant) innards
        int[] t = working.indexes;
        working.indexes = indexes;
        indexes = t;
        t = working.lengths;
        working.lengths = lengths;
        lengths = t;
        t = working.scores;
        working.scores = scores;
        scores = t;
        size = working.size;
        working.size = 0;
        i = working.capacity;
        working.capacity = capacity;
        capacity = i;
    }
}
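// The Sort above follows the standard LSD radix sort pattern: count byte values, turn the
// counts into bucket offsets with a prefix sum, then scatter. A minimal standalone sketch of
// that same pattern on a plain int[] of non-negative values is shown below; RadixSortByScore
// is an illustrative name, not part of SubstringArray, and it assumes `using System;`.
static int[] RadixSortByScore(int[] values)
{
    var src = (int[])values.Clone();
    var dst = new int[src.Length];
    var histogram = new int[256];

    for (int bitOffset = 0; bitOffset <= 24; bitOffset += 8)
    {
        Array.Clear(histogram, 0, histogram.Length);

        // 1. Count occurrences of each byte value at this position.
        foreach (int v in src)
        {
            histogram[(v >> bitOffset) & 0xff]++;
        }

        // 2. Convert counts to starting offsets for each bucket.
        for (int b = 0, rollingSum = 0; b < 256; b++)
        {
            int tmp = histogram[b];
            histogram[b] = rollingSum;
            rollingSum += tmp;
        }

        // 3. Scatter values into their buckets; stable within a bucket.
        foreach (int v in src)
        {
            dst[histogram[(v >> bitOffset) & 0xff]++] = v;
        }

        // Swap source and destination for the next pass; after the 4th (even) pass
        // the sorted result is back in src.
        var t = src;
        src = dst;
        dst = t;
    }
    return src;
}
// For example, RadixSortByScore(new[] { 300, 7, 70000 }) returns { 7, 300, 70000 }.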
private byte[] Pack(int desiredLength)
{
    var pruned = new SubstringArray(1024);
    int i, size = 0;

    // Walk the substrings from highest to lowest score, skipping strings already covered by a
    // higher scoring string and evicting lower value strings that a newly added string covers.
    for (i = substrings.Size - 1; i >= 0; i--)
    {
        bool alreadyCovered = false;
        for (int j = 0, c = pruned.Size; j < c; j++)
        {
            if (pruned.IndexOf(j, substrings, i, bytes, suffixArray) != -1)
            {
                alreadyCovered = true;
                break;
            }
        }
        if (alreadyCovered)
        {
            continue;
        }
        for (int j = pruned.Size - 1; j >= 0; j--)
        {
            if (substrings.IndexOf(i, pruned, j, bytes, suffixArray) != -1)
            {
                size -= pruned.Length(j);
                pruned.Remove(j);
            }
        }
        pruned.SetScore(pruned.Size, substrings.Index(i), substrings.Length(i), substrings.Score(i));
        size += substrings.Length(i);

        // We calculate 2x because when we lay the strings out end to end we will merge common
        // prefixes/suffixes.
        if (size >= 2 * desiredLength)
        {
            break;
        }
    }

    // Lay the selected strings out back to front into the packed buffer.
    byte[] packed = new byte[desiredLength];
    int pi = desiredLength;
    int count;
    for (i = 0, count = pruned.Size; i < count && pi > 0; i++)
    {
        int length = pruned.Length(i);
        if (pi - length < 0)
        {
            length = pi;
        }
        pi -= prepend(bytes, suffixArray[pruned.Index(i)], packed, pi, length);
    }

    // If the buffer wasn't completely filled, trim the unused leading bytes.
    if (pi > 0)
    {
        packed = packed.Skip(pi).ToArray();
    }
    return packed;
}
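// The real prepend is not shown in this section. The sketch below is a hypothetical
// illustration (PrependWithOverlap is an invented name; assumes `using System;`) of how
// laying strings out back to front can merge common prefixes/suffixes, which is why Pack
// over-selects by 2x above. It copies the new string so that it ends at toIndex, skipping
// any suffix of the new string that already matches the bytes sitting at toIndex, and
// returns how many bytes of the buffer it actually consumed.
static int PrependWithOverlap(byte[] from, int fromIndex, byte[] to, int toIndex, int length)
{
    // Find the longest suffix of from[fromIndex .. fromIndex + length) that equals the
    // prefix of the already-written content starting at to[toIndex].
    int overlap = 0;
    for (int o = Math.Min(length, to.Length - toIndex); o > 0; o--)
    {
        bool matches = true;
        for (int k = 0; k < o; k++)
        {
            if (from[fromIndex + length - o + k] != to[toIndex + k])
            {
                matches = false;
                break;
            }
        }
        if (matches)
        {
            overlap = o;
            break;
        }
    }

    // Only the non-overlapping head needs to be written; the overlapping tail is already
    // present at toIndex, so the full string remains readable contiguously in the buffer.
    int toCopy = length - overlap;
    Array.Copy(from, fromIndex, to, toIndex - toCopy, toCopy);
    return toCopy;
}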
// TODO Bring this up to parity with C++ version, which has optimized
private void ComputeSubstrings()
{
    var activeSubstrings = new SubstringArray(128);
    var uniqueDocIds = new HashSet<int>();

    substrings = new SubstringArray(1024);
    int n = lcp.Length;

    int lastLCP = lcp[0];
    for (int i = 1; i <= n; i++)
    {
        // Note we need to process currently existing runs, so we do that by acting like we hit
        // an LCP of 0 at the end. That is why we loop i <= n vs i < n. Otherwise runs that exist
        // at the end of the suffix array / lcp will never be "cashed in" and counted in the
        // substrings. DictionaryOptimizerTest has a unit test for this.
        int currentLCP = i == n ? 0 : lcp[i];

        if (currentLCP > lastLCP)
        {
            // The order here is important so we can optimize adding redundant strings below.
            for (int j = lastLCP + 1; j <= currentLCP; j++)
            {
                activeSubstrings.Add(i, j, 0);
            }
        }
        else if (currentLCP < lastLCP)
        {
            int lastActiveIndex = -1, lastActiveLength = -1, lastActiveCount = -1;

            for (int j = activeSubstrings.Size - 1; j >= 0; j--)
            {
                if (activeSubstrings.Length(j) > currentLCP)
                {
                    int activeCount = i - activeSubstrings.Index(j) + 1;
                    int activeLength = activeSubstrings.Length(j);
                    int activeIndex = activeSubstrings.Index(j);

                    // OK, we have a string which occurs activeCount times. The true measure of its
                    // value is how many unique documents it occurs in, because occurring 1000 times
                    // in the same document isn't valuable: once it occurs once, subsequent
                    // occurrences will reference a previous instance in the document. So for the 2
                    // documents "garrick garrick garrick toubassi" and "toubassi", the string
                    // "toubassi" is far more valuable in a shared dictionary. So find out how many
                    // unique documents this string occurs in. We do this by taking the start
                    // position of each occurrence, mapping it back to the document using the
                    // "starts" array, and uniquing.
                    //
                    // TODO Bring this up to parity with C++ version, which has optimized
                    //
                    for (int k = activeSubstrings.Index(j) - 1; k < i; k++)
                    {
                        int byteIndex = suffixArray[k];

                        // Could make this a lookup table if we are willing to burn an
                        // int[bytes.Length], but that's a lot.
                        int docIndex = starts.BinarySearch(byteIndex);
                        if (docIndex < 0)
                        {
                            docIndex = -docIndex - 2;
                        }

                        // While we are at it, make sure this is a string that actually exists in a
                        // single document, vs spanning two concatenated documents. The idea is that
                        // for documents "http://espn.com", "http://google.com", "http://yahoo.com",
                        // we don't want to consider ".comhttp://" to be a legal string. So make sure
                        // the length of this string doesn't cross a document boundary for this
                        // particular occurrence.
                        int nextDocStart = docIndex < starts.Count - 1 ? starts[docIndex + 1] : bytes.Length;

                        if (activeLength <= nextDocStart - byteIndex)
                        {
                            uniqueDocIds.Add(docIndex);
                        }
                    }

                    int scoreCount = uniqueDocIds.Count;
                    uniqueDocIds.Clear();

                    activeSubstrings.Remove(j);

                    if (scoreCount == 0)
                    {
                        continue;
                    }

                    // Don't add redundant strings. If we just added ABC, don't add AB if it has the
                    // same count. This keeps substrings from growing very large.
                    if (!(lastActiveIndex != -1 && lastActiveIndex == activeIndex &&
                          lastActiveCount == activeCount && lastActiveLength > activeLength))
                    {
                        if (activeLength > 3)
                        {
                            substrings.Add(activeIndex, activeLength, scoreCount);
                        }
                    }
                    lastActiveIndex = activeIndex;
                    lastActiveLength = activeLength;
                    lastActiveCount = activeCount;
                }
            }
        }
        lastLCP = currentLCP;
    }
    substrings.Sort();
}
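// The docIndex computation above leans on List<int>.BinarySearch returning the bitwise
// complement of the insertion point (~insertionPoint == -insertionPoint - 1) when the value is
// not present, so -result - 2 is the index of the document whose start precedes byteIndex.
// A minimal sketch of that mapping in isolation; DocumentIndexFor and docStarts are
// illustrative names, and it assumes `using System.Collections.Generic;`.
static int DocumentIndexFor(List<int> docStarts, int byteIndex)
{
    // docStarts is sorted ascending and holds the start offset of every document in the
    // concatenated byte buffer, with docStarts[0] == 0 for the first document.
    int docIndex = docStarts.BinarySearch(byteIndex);
    if (docIndex < 0)
    {
        // Miss: convert ~insertionPoint into the index of the preceding document start.
        docIndex = -docIndex - 2;
    }
    return docIndex;
}
// e.g. DocumentIndexFor(new List<int> { 0, 15, 33 }, 20) == 1 (offset 20 falls in the second document).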