/// <summary>
/// Builds the suffix tuple array for the items <c>objs[index .. index + length)</c>.
/// Each item's string (obtained through <paramref name="stringValueGetter"/>) is expanded
/// into suffixes by <c>GetSuffixes_v2</c> (de-duplicated per item with <c>Distinct</c>),
/// all suffixes are sorted with <c>suffixComparison</c>, and consecutive sorted suffixes are
/// folded into one tuple whenever the tuple's suffix text starts with the next suffix.
/// The resulting tuple array is reversed before it is returned.
/// </summary>
/// <param name="objs">Source items.</param>
/// <param name="index">Index of the first item in <paramref name="objs"/> to process.</param>
/// <param name="length">Number of items to process, starting at <paramref name="index"/>.</param>
/// <param name="stringValueGetter">Extracts the string to index from an item.</param>
/// <returns>
/// The merged tuples; an empty array when no suffixes were produced
/// (the previous implementation threw <see cref="IndexOutOfRangeException"/> in that case).
/// </returns>
public static SuffixArray<T>.tuple_t[] Build(IList<T> objs, int index, int length, IStringValueGetter<T> stringValueGetter)
{
    CreateIsLetterOrDigitArray();
    try
    {
        // Upper bound used to size the flat suffix buffer.
        var totalSuffixCount = objs.Skip(index)
                                   .Take(length)
                                   .Select(value => GetSuffixCount(stringValueGetter.GetStringValue(value)))
                                   .Sum();

        var suffixes = new suffix_t[totalSuffixCount];
        var suffixCount = 0;
        for (int i = index, end = index + length; i < end; i++)
        {
            var str = stringValueGetter.GetStringValue(objs[i]);
            foreach (var itemSuffix in GetSuffixes_v2(i, str).Distinct())
            {
                suffixes[suffixCount++] = itemSuffix;
            }
        }

        // Distinct() may have dropped entries, so trim the buffer to what was actually written.
        Array.Resize<suffix_t>(ref suffixes, suffixCount);
        Array.Sort<suffix_t>(suffixes, suffixComparison);

        var tuples = new SuffixArray<T>.tuple_t[suffixes.Length];
        var tupleCount = 0;

        // 'data' stays null until the first group is opened; this lets the same loop
        // handle the first element and keeps an empty suffix list from being indexed.
        string suffixText = null;
        SimplyLinkedList<SuffixArray<T>.data_t> data = null;
        for (int i = 0, len = suffixes.Length; i < len; i++)
        {
            var suffix = suffixes[i];

            // NOTE(review): StartsWith(string) is culture-sensitive; confirm this agrees
            // with the ordering produced by suffixComparison.
            if (data == null || !suffixText.StartsWith(suffix.Suffix))
            {
                suffixText = suffix.Suffix;
                data = new SimplyLinkedList<SuffixArray<T>.data_t>();
                tuples[tupleCount++] = new SuffixArray<T>.tuple_t() { Suffix = suffixText, Data = data };
            }
            data.Add(new SuffixArray<T>.data_t(suffix.SuffixIndex, suffix.WordIndex));
        }

        Array.Resize<SuffixArray<T>.tuple_t>(ref tuples, tupleCount);
        Array.Reverse(tuples);
        return tuples;
    }
    finally
    {
        // Always undo CreateIsLetterOrDigitArray(), even when suffix generation or sorting throws.
        DestroyIsLetterOrDigitArray();
    }
}
Find(string suffix, FindModeEnum findMode = FindModeEnum.IgnoreCase)
{
    // Looks up every indexed occurrence of 'suffix' through the B+-tree (_BPT).
    // The query is first normalized by CorrectFindSuffix according to 'findMode';
    // matching tuples are selected with StartsWithStringComparer.Inst, and an occurrence
    // is reported only when the query fits inside the word starting at the stored offset.
    suffix = CorrectFindSuffix(suffix, findMode);

    var probe = new tuple_t() { Suffix = suffix };
    var queryLength = suffix.Length;

    foreach (var match in _BPT.GetValues(probe, StartsWithStringComparer.Inst))
    {
        foreach (var occurrence in match.Data)
        {
            var word = _StringValueGetter.GetStringValue(_Objects[occurrence.ObjIndex]);

            // Skip occurrences where the query would run past the end of the word.
            if (word.Length < occurrence.SuffixIndex + queryLength)
            {
                continue;
            }

            yield return find_result_t.Create(occurrence.ObjIndex, word, occurrence.SuffixIndex, queryLength);
        }
    }
}
Find(string suffix, FindModeEnum findMode = FindModeEnum.IgnoreCase)
{
    // Looks up every indexed occurrence of 'suffix' in the sorted tuple array (_Array).
    // The query is normalized by CorrectFindSuffix, located with InternalBinarySearch
    // (a negative index produces no results), and then neighbouring tuples are scanned
    // in both directions for as long as their suffix text starts with the query.
    suffix = CorrectFindSuffix(suffix, findMode);

    var hit = InternalBinarySearch(suffix);
    if (hit < 0)
    {
        yield break;
    }

    // Backward scan: the hit itself, then towards the start of the array.
    var pos = hit;
    while (pos >= 0)
    {
        var entry = _Array[pos];
        if (!entry.Suffix.StartsWith(suffix))
        {
            break;
        }

        foreach (var occurrence in entry.Data)
        {
            var word = _StringValueGetter.GetStringValue(_Objects[occurrence.ObjIndex]);

            // Report only occurrences where the query fits inside the word.
            if (occurrence.SuffixIndex + suffix.Length <= word.Length)
            {
                yield return find_result_t.Create(occurrence.ObjIndex, word, occurrence.SuffixIndex, suffix.Length);
            }
        }

        pos--;
    }

    // Forward scan: everything after the hit, towards the end of the array.
    var arrayLength = _Array.Length;
    pos = hit + 1;
    while (pos < arrayLength)
    {
        var entry = _Array[pos];
        if (!entry.Suffix.StartsWith(suffix))
        {
            break;
        }

        foreach (var occurrence in entry.Data)
        {
            var word = _StringValueGetter.GetStringValue(_Objects[occurrence.ObjIndex]);

            if (occurrence.SuffixIndex + suffix.Length <= word.Length)
            {
                yield return find_result_t.Create(occurrence.ObjIndex, word, occurrence.SuffixIndex, suffix.Length);
            }
        }

        pos++;
    }
}
/// <summary>
/// Builds the B+-tree suffix index for the items <c>objs[index .. index + length)</c>.
/// Phase 1 inserts every distinct suffix of every item's string into a working tree,
/// appending the occurrence data to an already-present tuple when the tree reports one.
/// Phase 2 walks the working tree in enumeration order and copies it into the result tree,
/// folding a tuple into the current root tuple whenever the root's suffix text starts with
/// that tuple's suffix.
/// </summary>
/// <param name="objs">Source items.</param>
/// <param name="index">Index of the first item in <paramref name="objs"/> to process.</param>
/// <param name="length">Number of items to process, starting at <paramref name="index"/>.</param>
/// <param name="stringValueGetter">Extracts the string to index from an item.</param>
/// <returns>The compacted tree of suffix tuples.</returns>
public static BPlusTreeList<tuple_t> Build(IList<T> objs, int index, int length, IStringValueGetter<T> stringValueGetter)
{
    const int BLOCK_CAPACITY_4_LST = 512;

    BPlusTreeList<tuple_t> bpt;

    CreateMapArrays();
    try
    {
        var totalSuffixCount = (from value in objs.Skip(index).Take(length)
                                select GetSuffixCount(stringValueGetter.GetStringValue(value))
                               ).Sum();

        // NOTE(review): if totalSuffixCount is an integer type, the division below truncates
        // before "* 1.0 + 0.5" is applied, so the rounding has no effect. Left unchanged
        // because the value is passed straight to the BPlusTreeList constructor — confirm intent.
        bpt = new BPlusTreeList<tuple_t>(default(tuple_t), ((int)(totalSuffixCount / BLOCK_CAPACITY_4_LST * 1.0 + 0.5) + 25), BLOCK_CAPACITY_4_LST);

        // Per-item set used to skip suffixes that repeat within the same string.
        var set = new Set<suffix_t>(new suffix_t_IEqualityComparer());
        for (int i = index, end = index + length; i < end; i++)
        {
            var str = stringValueGetter.GetStringValue(objs[i]);

            // Candidate tuple offered to the tree; replaced with a fresh one once the tree keeps it.
            var tuple = new tuple_t() { Data = new SimplyLinkedList<data_t>() };
            var tupleExists = default(tuple_t);

            foreach (var suff_t in GetSuffixes(i, str))
            {
                if (!set.Add(suff_t))
                {
                    continue;
                }

                var data = new data_t(suff_t.SuffixIndex, suff_t.WordIndex);
                tuple.Suffix = suff_t.Suffix;
                if (bpt.AddOrGetExistsValue(tuple, out tupleExists))
                {
                    // The candidate was stored: attach the occurrence and prepare a new candidate.
                    tuple.Data.Add(data);
                    tuple = new tuple_t() { Data = new SimplyLinkedList<data_t>() };
                }
                else
                {
                    // An equal tuple was already present: attach the occurrence to it instead.
                    tupleExists.Data.Add(data);
                }
            }
            set.Clear();
        }
    }
    finally
    {
        // Always undo CreateMapArrays(), even when suffix generation or insertion throws.
        DestroyMapArrays();
    }

    // NOTE(review): bpt.BlockCount is used as a divisor; confirm it cannot be 0 for an empty tree.
    var bpt_out = new BPlusTreeList<tuple_t>(default(tuple_t), bpt.Count / bpt.BlockCount, bpt.BlockCount);
    using (var e = bpt.GetEnumerator())
    {
        if (e.MoveNext())
        {
            var root_tuple = e.Current;
            bpt_out.Add(root_tuple);

            while (e.MoveNext())
            {
                var tuple = e.Current;
                if (root_tuple.Suffix.StartsWith(tuple.Suffix))
                {
                    // The current root's suffix text begins with this suffix: merge its occurrences.
                    foreach (var data in tuple.Data)
                    {
                        root_tuple.Data.Add(data);
                    }
                }
                else
                {
                    root_tuple = tuple;
                    bpt_out.Add(root_tuple);
                }
            }
        }
    }

    return bpt_out;
}