/// <summary>
/// Builds the array of suffix tuples for the strings of <paramref name="objs"/> in the
/// window [<paramref name="index"/>, <paramref name="index"/> + <paramref name="length"/>).
/// </summary>
/// <remarks>
/// The distinct suffixes yielded by <c>GetSuffixes_v2</c> for every word are collected,
/// sorted with <c>suffixComparison</c>, and then folded into groups: a suffix joins the
/// current group when the group's text starts with it, otherwise it opens a new group.
/// The resulting group array is reversed before it is returned.
/// </remarks>
/// <param name="objs">Source objects; indexed directly, so the window must lie inside the list.</param>
/// <param name="index">Index of the first object to process.</param>
/// <param name="length">Number of objects to process.</param>
/// <param name="stringValueGetter">Extracts the string to be indexed from each object.</param>
/// <returns>
/// One tuple per suffix group, each carrying the (suffix index, word index) pairs of its members;
/// an empty array when the window produces no suffixes.
/// </returns>
public static SuffixArray<T>.tuple_t[] Build(IList<T> objs, int index, int length, IStringValueGetter<T> stringValueGetter)
{
    CreateIsLetterOrDigitArray();
    try
    {
        // Upper bound used to size the scratch array in one allocation.
        var totalSuffixCount = (from value in objs.Skip(index).Take(length)
                                select GetSuffixCount(stringValueGetter.GetStringValue(value))
                               ).Sum();

        var suffixIndex = 0;
        var suffixes = new suffix_t[totalSuffixCount];
        for (int i = index, end = index + length; i < end; i++)
        {
            var str = stringValueGetter.GetStringValue(objs[i]);
            foreach (var wordSuffix in GetSuffixes_v2(i, str).Distinct())
            {
                suffixes[suffixIndex++] = wordSuffix;
            }
        }

        // Distinct() may have dropped entries, so trim to what was actually written.
        Array.Resize<suffix_t>(ref suffixes, suffixIndex);

        // Nothing to group: the seeding step below reads suffixes[0] and would throw.
        if (suffixes.Length == 0)
        {
            return new SuffixArray<T>.tuple_t[0];
        }

        Array.Sort<suffix_t>(suffixes, suffixComparison);

        var tuples = new SuffixArray<T>.tuple_t[suffixes.Length];
        suffixIndex = 0;

        // Seed the first group with the first sorted suffix.
        var suffix = suffixes[0];
        var suffixText = suffix.Suffix;
        var data = new SimplyLinkedList<SuffixArray<T>.data_t>();
        tuples[suffixIndex++] = new SuffixArray<T>.tuple_t() { Suffix = suffixText, Data = data };
        data.Add(new SuffixArray<T>.data_t(suffix.SuffixIndex, suffix.WordIndex));

        for (int i = 1, len = suffixes.Length; i < len; i++)
        {
            suffix = suffixes[i];

            // NOTE(review): StartsWith without a StringComparison is culture-sensitive;
            // confirm this agrees with the ordering produced by suffixComparison.
            if (!suffixText.StartsWith(suffix.Suffix))
            {
                // Current group's text does not start with this suffix: open a new group.
                suffixText = suffix.Suffix;
                data = new SimplyLinkedList<SuffixArray<T>.data_t>();
                tuples[suffixIndex++] = new SuffixArray<T>.tuple_t() { Suffix = suffixText, Data = data };
            }
            data.Add(new SuffixArray<T>.data_t(suffix.SuffixIndex, suffix.WordIndex));
        }

        // Trim to the number of groups actually created, then flip the order.
        Array.Resize<SuffixArray<T>.tuple_t>(ref tuples, suffixIndex);
        Array.Reverse(tuples);
        return (tuples);
    }
    finally
    {
        // Always undo CreateIsLetterOrDigitArray(), including when an exception escapes.
        DestroyIsLetterOrDigitArray();
    }
}
/// <summary>
/// Initializes a new instance that indexes the strings of <paramref name="objs"/> in the
/// window [<paramref name="index"/>, <paramref name="index"/> + <paramref name="length"/>).
/// </summary>
/// <param name="objs">Source objects; the reference is stored, not copied.</param>
/// <param name="index">Zero-based index of the first object to index.</param>
/// <param name="length">Number of objects to index; must be positive.</param>
/// <param name="stringValueGetter">Extracts the string to be indexed from each object.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="objs"/> or <paramref name="stringValueGetter"/> is null.
/// </exception>
/// <exception cref="ArgumentException">
/// <paramref name="index"/> is negative, <paramref name="length"/> is not positive,
/// or the window extends past the end of <paramref name="objs"/>.
/// </exception>
public SuffixArray_v2(IList<T> objs, int index, int length, IStringValueGetter<T> stringValueGetter)
{
    if (objs == null)
    {
        throw (new ArgumentNullException(nameof(objs)));
    }

    // The window must be non-empty and lie entirely inside objs, because the builder
    // reads objs[i] for every i in [index, index + length).
    // Written as a subtraction so that index + length cannot overflow.
    if ((index < 0) || (length <= 0) || (objs.Count - index < length))
    {
        throw (new ArgumentException("index-or-length"));
    }

    if (stringValueGetter == null)
    {
        throw (new ArgumentNullException(nameof(stringValueGetter)));
    }

    _Objects = objs;
    _StringValueGetter = stringValueGetter;
    _BPT = SuffixArrayBuilder.Build(objs, index, length, stringValueGetter);
}
/// <summary>
/// Initializes a new instance that indexes every element of <paramref name="objs"/>.
/// Equivalent to calling the four-argument constructor with index 0 and a length of
/// <c>objs.Count</c>.
/// </summary>
/// <param name="objs">Source objects to index.</param>
/// <param name="stringValueGetter">Extracts the string to be indexed from each object.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="objs"/> or <paramref name="stringValueGetter"/> is null
/// (reported by the four-argument constructor).
/// </exception>
/// <exception cref="ArgumentException">
/// <paramref name="objs"/> is empty (reported by the four-argument constructor).
/// </exception>
public SuffixArray_v2(IList<T> objs, IStringValueGetter<T> stringValueGetter)
    // Null-safe count: a null list must reach the delegated constructor's own null check
    // instead of failing here with a NullReferenceException.
    : this(objs, 0, objs?.Count ?? 0, stringValueGetter)
{
}
/// <summary>
/// Builds the B+ tree of suffix tuples for the strings of <paramref name="objs"/> in the
/// window [<paramref name="index"/>, <paramref name="index"/> + <paramref name="length"/>).
/// </summary>
/// <remarks>
/// Phase 1 inserts one tuple per distinct suffix text into a working tree, appending the
/// (suffix index, word index) pair of every occurrence to that tuple's data list.
/// Phase 2 enumerates the working tree and copies it into the result, folding each tuple
/// whose suffix is a prefix of the current group's suffix into that group.
/// </remarks>
/// <param name="objs">Source objects; indexed directly, so the window must lie inside the list.</param>
/// <param name="index">Index of the first object to process.</param>
/// <param name="length">Number of objects to process.</param>
/// <param name="stringValueGetter">Extracts the string to be indexed from each object.</param>
/// <returns>The tree holding one tuple per suffix group.</returns>
public static BPlusTreeList<tuple_t> Build(IList<T> objs, int index, int length, IStringValueGetter<T> stringValueGetter)
{
    const int BlockCapacity = 512;

    BPlusTreeList<tuple_t> bpt;

    CreateMapArrays();
    try
    {
        var totalSuffixCount = (from value in objs.Skip(index).Take(length)
                                select GetSuffixCount(stringValueGetter.GetStringValue(value))
                               ).Sum();

        // NOTE(review): the integer division happens before "* 1.0 + 0.5", so the rounding
        // has no effect; the expression is kept as-is so the capacity passed is unchanged.
        bpt = new BPlusTreeList<tuple_t>(
            default(tuple_t),
            ((int)(totalSuffixCount / BlockCapacity * 1.0 + 0.5) + 25),
            BlockCapacity);

        // Per-word filter that skips suffix entries already seen for the current word.
        var seenInWord = new Set<suffix_t>(new suffix_t_IEqualityComparer());

        for (int i = index, end = index + length; i < end; i++)
        {
            var str = stringValueGetter.GetStringValue(objs[i]);

            // Candidate tuple, reused across suffixes until the tree actually stores it.
            var candidate = new tuple_t() { Data = new SimplyLinkedList<data_t>() };
            var existing = default(tuple_t);

            foreach (var suff in GetSuffixes(i, str))
            {
                if (!seenInWord.Add(suff))
                {
                    continue;
                }

                var occurrence = new data_t(suff.SuffixIndex, suff.WordIndex);
                candidate.Suffix = suff.Suffix;

                if (bpt.AddOrGetExistsValue(candidate, out existing))
                {
                    // The candidate was taken by the tree: record the occurrence on it
                    // and prepare a fresh candidate for the next suffix.
                    candidate.Data.Add(occurrence);
                    candidate = new tuple_t() { Data = new SimplyLinkedList<data_t>() };
                }
                else
                {
                    // A tuple for this suffix text is already stored: append to it.
                    existing.Data.Add(occurrence);
                }
            }

            seenInWord.Clear();
        }
    }
    finally
    {
        // Always undo CreateMapArrays(), including when an exception escapes phase 1.
        DestroyMapArrays();
    }

    // NOTE(review): divides by bpt.BlockCount; confirm it cannot be zero for an empty tree.
    var bpt_out = new BPlusTreeList<tuple_t>(default(tuple_t), bpt.Count / bpt.BlockCount, bpt.BlockCount);

    using (var e = bpt.GetEnumerator())
    {
        if (e.MoveNext())
        {
            var groupHead = e.Current;
            bpt_out.Add(groupHead);

            while (e.MoveNext())
            {
                var current = e.Current;

                // NOTE(review): StartsWith without a StringComparison is culture-sensitive;
                // confirm this agrees with the tree's ordering.
                if (groupHead.Suffix.StartsWith(current.Suffix))
                {
                    // current's suffix is a prefix of the group head: merge its occurrences.
                    foreach (var occurrence in current.Data)
                    {
                        groupHead.Data.Add(occurrence);
                    }
                }
                else
                {
                    groupHead = current;
                    bpt_out.Add(groupHead);
                }
            }
        }
    }

    return (bpt_out);
}