コード例 #1
0
ファイル: SuffixArray.cs プロジェクト: zamgi/SuffixArray
        /// <summary>
        /// Builds the tuple array for the strings of the objects in the range
        /// [<paramref name="index"/>, <paramref name="index"/> + <paramref name="length"/>).
        /// </summary>
        /// <remarks>
        /// The suffixes of every string are collected (duplicates within one string are dropped),
        /// sorted with <c>suffixComparison</c>, and consecutive suffixes are merged into one tuple
        /// for as long as the tuple's text starts with the next suffix. The merged tuples are
        /// returned in reversed order.
        /// </remarks>
        /// <param name="objs">Source objects.</param>
        /// <param name="index">Index of the first object to process.</param>
        /// <param name="length">Number of objects to process.</param>
        /// <param name="stringValueGetter">Extracts the string to index from an object.</param>
        /// <returns>
        /// The merged tuples; an empty array when the range yields no suffixes at all.
        /// </returns>
        public static SuffixArray <T> .tuple_t[] Build(
            IList <T> objs, int index, int length, IStringValueGetter <T> stringValueGetter)
        {
            CreateIsLetterOrDigitArray();
            try
            {
                // Used to size the suffix buffer; the buffer is trimmed below to the number
                // of suffixes that were actually stored.
                var totalSuffixCount = (from value in objs.Skip(index).Take(length)
                                        select GetSuffixCount(stringValueGetter.GetStringValue(value))
                                        ).Sum();
                var suffixIndex = 0;
                var suffixes    = new suffix_t[totalSuffixCount];

                for (int i = index, end = index + length; i < end; i++)
                {
                    var str = stringValueGetter.GetStringValue(objs[i]);
                    foreach (var _suffix in GetSuffixes_v2(i, str).Distinct())
                    {
                        suffixes[suffixIndex++] = _suffix;
                    }
                }
                Array.Resize<suffix_t>(ref suffixes, suffixIndex);

                // Nothing to index (empty range, or no string produced a suffix):
                // reading suffixes[0] below would throw, so return an empty result instead.
                if (suffixes.Length == 0)
                {
                    return new SuffixArray<T>.tuple_t[0];
                }

                Array.Sort<suffix_t>(suffixes, suffixComparison);

                var tuples = new SuffixArray<T>.tuple_t[suffixes.Length];

                // The first suffix always opens the first tuple.
                suffixIndex = 0;
                var suffix     = suffixes[suffixIndex];
                var suffixText = suffix.Suffix;
                var data       = new SimplyLinkedList<SuffixArray<T>.data_t>();

                tuples[suffixIndex++] = new SuffixArray<T>.tuple_t()
                {
                    Suffix = suffixText, Data = data
                };
                data.Add(new SuffixArray<T>.data_t(suffix.SuffixIndex, suffix.WordIndex));

                for (int i = 1, len = suffixes.Length; i < len; i++)
                {
                    suffix = suffixes[i];

                    // A suffix that is not a prefix of the current tuple's text opens a new tuple;
                    // otherwise its data is appended to the current tuple.
                    if (!suffixText.StartsWith(suffix.Suffix))
                    {
                        suffixText            = suffix.Suffix;
                        data                  = new SimplyLinkedList<SuffixArray<T>.data_t>();
                        tuples[suffixIndex++] = new SuffixArray<T>.tuple_t()
                        {
                            Suffix = suffixText, Data = data
                        };
                    }
                    data.Add(new SuffixArray<T>.data_t(suffix.SuffixIndex, suffix.WordIndex));
                }

                // Trim to the number of tuples actually created, then flip the order.
                Array.Resize<SuffixArray<T>.tuple_t>(ref tuples, suffixIndex);
                Array.Reverse(tuples);

                return tuples;
            }
            finally
            {
                // Always release the helper array, even when building fails part-way.
                DestroyIsLetterOrDigitArray();
            }
        }
コード例 #2
0
ファイル: SuffixArray_v2.cs プロジェクト: zamgi/SuffixArray
        Find(string suffix, FindModeEnum findMode = FindModeEnum.IgnoreCase)
        {
            // Lazily yields one result per stored occurrence of the (corrected) search text.
            // The tree is queried with the starts-with comparer, and an occurrence is reported
            // only when the search text fits inside the word at the recorded offset.
            suffix = CorrectFindSuffix(suffix, findMode);

            var probe   = new tuple_t() { Suffix = suffix };
            var matches = _BPT.GetValues(probe, StartsWithStringComparer.Inst);

            foreach (var match in matches)
            {
                foreach (var entry in match.Data)
                {
                    var text = _StringValueGetter.GetStringValue(_Objects[entry.ObjIndex]);
                    var stop = entry.SuffixIndex + suffix.Length;

                    // Skip occurrences where the search text would run past the end of the word.
                    if (text.Length < stop)
                    {
                        continue;
                    }

                    yield return find_result_t.Create(entry.ObjIndex, text, entry.SuffixIndex, suffix.Length);
                }
            }
        }
コード例 #3
0
ファイル: SuffixArray.cs プロジェクト: zamgi/SuffixArray
        Find(string suffix, FindModeEnum findMode = FindModeEnum.IgnoreCase)
        {
            // Lazily yields one result per stored occurrence of the (corrected) search text.
            // Starting from the position returned by the binary search, the array is scanned
            // first towards index 0 and then towards the end; each scan stops at the first
            // tuple whose text does not start with the search text.
            suffix = CorrectFindSuffix(suffix, findMode);

            var hit = InternalBinarySearch(suffix);
            if (hit < 0)
            {
                // A negative position means there is nothing to report.
                yield break;
            }

            // Backward scan: hit, hit - 1, ..., 0.
            var pos = hit;
            while (pos >= 0)
            {
                var current = _Array[pos];
                if (!current.Suffix.StartsWith(suffix))
                {
                    break;
                }

                foreach (var entry in current.Data)
                {
                    var text = _StringValueGetter.GetStringValue(_Objects[entry.ObjIndex]);
                    var stop = entry.SuffixIndex + suffix.Length;

                    // Skip occurrences where the search text would run past the end of the word.
                    if (text.Length < stop)
                    {
                        continue;
                    }

                    yield return find_result_t.Create(entry.ObjIndex, text, entry.SuffixIndex, suffix.Length);
                }

                pos--;
            }

            // Forward scan: hit + 1, ..., last element.
            pos = hit + 1;
            var total = _Array.Length;
            while (pos < total)
            {
                var current = _Array[pos];
                if (!current.Suffix.StartsWith(suffix))
                {
                    break;
                }

                foreach (var entry in current.Data)
                {
                    var text = _StringValueGetter.GetStringValue(_Objects[entry.ObjIndex]);
                    var stop = entry.SuffixIndex + suffix.Length;

                    if (text.Length < stop)
                    {
                        continue;
                    }

                    yield return find_result_t.Create(entry.ObjIndex, text, entry.SuffixIndex, suffix.Length);
                }

                pos++;
            }
        }
コード例 #4
0
ファイル: SuffixArray_v2.cs プロジェクト: zamgi/SuffixArray
            /// <summary>
            /// Builds the B+ tree of tuples for the strings of the objects in the range
            /// [<paramref name="index"/>, <paramref name="index"/> + <paramref name="length"/>).
            /// </summary>
            /// <remarks>
            /// Works in two passes. The first pass inserts one tuple per distinct suffix text into a
            /// working tree and attaches a data entry for every (object, suffix) occurrence, skipping
            /// suffixes repeated within one string. The second pass enumerates the working tree and
            /// copies tuples into the result tree, folding a tuple's data into the last copied tuple
            /// for as long as that tuple's text starts with the enumerated tuple's text.
            /// </remarks>
            /// <param name="objs">Source objects.</param>
            /// <param name="index">Index of the first object to process.</param>
            /// <param name="length">Number of objects to process.</param>
            /// <param name="stringValueGetter">Extracts the string to index from an object.</param>
            /// <returns>The result tree produced by the second pass.</returns>
            public static BPlusTreeList <tuple_t> Build(
                IList <T> objs, int index, int length, IStringValueGetter <T> stringValueGetter)
            {
                const int BlockCapacity = 512;

                BPlusTreeList<tuple_t> bpt;

                CreateMapArrays();
                try
                {
                    var totalSuffixCount = (from value in objs.Skip(index).Take(length)
                                            select GetSuffixCount(stringValueGetter.GetStringValue(value))
                                            ).Sum();

                    // Divide in floating point so that "+ 0.5" really rounds to the nearest integer;
                    // an integer division here would truncate before the rounding term is added.
                    var roundedBlockCount = (int)(totalSuffixCount / (double)BlockCapacity + 0.5);
                    bpt = new BPlusTreeList<tuple_t>(default(tuple_t), roundedBlockCount + 25, BlockCapacity);

                    // Tracks the suffixes already seen for the current string; cleared per object.
                    var set = new Set<suffix_t>(new suffix_t_IEqualityComparer());

                    for (int i = index, end = index + length; i < end; i++)
                    {
                        var str = stringValueGetter.GetStringValue(objs[i]);

                        // Pending tuple: it is reused for successive suffixes until the tree takes it.
                        var tuple = new tuple_t()
                        {
                            Data = new SimplyLinkedList<data_t>()
                        };
                        var tupleExists = default(tuple_t);

                        foreach (var suff_t in GetSuffixes(i, str))
                        {
                            // Same suffix already handled for this string.
                            if (!set.Add(suff_t))
                            {
                                continue;
                            }

                            var data = new data_t(suff_t.SuffixIndex, suff_t.WordIndex);
                            tuple.Suffix = suff_t.Suffix;
                            if (bpt.AddOrGetExistsValue(tuple, out tupleExists))
                            {
                                // The pending tuple went into the tree (presumably "true" means
                                // "added"): record the occurrence on it and start a fresh one.
                                tuple.Data.Add(data);
                                tuple = new tuple_t()
                                {
                                    Data = new SimplyLinkedList<data_t>()
                                };
                            }
                            else
                            {
                                // A tuple for this suffix text is already stored: extend it.
                                tupleExists.Data.Add(data);
                            }
                        }
                        set.Clear();
                    }
                }
                finally
                {
                    // Always release the map arrays, even when the first pass fails part-way.
                    DestroyMapArrays();
                }

                // NOTE(review): bpt.BlockCount is used as a divisor; confirm it can never be 0
                // (e.g. for an empty working tree), otherwise this line throws.
                var bpt_out = new BPlusTreeList<tuple_t>(default(tuple_t), bpt.Count / bpt.BlockCount, bpt.BlockCount);
                using (var e = bpt.GetEnumerator())
                {
                    if (e.MoveNext())
                    {
                        var root_tuple = e.Current;
                        bpt_out.Add(root_tuple);
                        while (e.MoveNext())
                        {
                            var tuple = e.Current;
                            if (root_tuple.Suffix.StartsWith(tuple.Suffix))
                            {
                                // The enumerated text is a prefix of the current root's text:
                                // fold its occurrences into the root instead of copying the tuple.
                                foreach (var data in tuple.Data)
                                {
                                    root_tuple.Data.Add(data);
                                }
                            }
                            else
                            {
                                root_tuple = tuple;
                                bpt_out.Add(root_tuple);
                            }
                        }
                    }
                }

                return bpt_out;
            }