Beispiel #1
0
        /// <summary>Constructs the generalized suffix array from separate 'words'. Each list in the parameter <paramref name="words"/> is treated as a 'word',
        /// and the elements of each list are treated as its characters. The elements are converted to an integer alphabet by means of the <see cref="IntegerText"/> class.</summary>
        /// <typeparam name="T">The type of characters of the text. It must implement IComparable if a sorted mapping without custom comparer is used.</typeparam>
        /// <param name="words">The list of 'words'.</param>
        /// <param name="withSeparators">If set to <c>true</c>, the words are treated as individual words and will be separated by special separator elements. If set to <c>false</c>, the words are concatenated without separator elements to form one single word.</param>
        /// <param name="useSortedMapping">If this parameter is <c>true</c>, a sorted mapping of the elements T to integers will be used. The type T then has to implement IComparable. If this parameter is <c>false</c>, an unsorted <see cref="System.Collections.Generic.HashSet&lt;T&gt;"/> will be used to make a unique mapping of the elements to integers.</param>
        /// <param name="customSortingComparer">If <paramref name="useSortedMapping"/> is <c>true</c>, you can provide a custom comparer for the elements of type T here. Otherwise, if you want to use the default comparer, leave this parameter <c>null</c>.</param>
        /// <returns>The generalized suffix array. Since each list in <paramref name="words"/> is treated as a separate word, the generalized suffix array is prepared to search for the longest common substring in these words.</returns>
        public static GeneralizedSuffixArray FromWords<T>(IEnumerable<IEnumerable<T>> words, bool withSeparators, bool useSortedMapping, IComparer<T> customSortingComparer)
        {
            // Forward the caller's choice instead of always forcing separators.
            // Three padding elements are requested from the integer text
            // (NOTE(review): presumably required by the suffix sorting algorithm - confirm).
            var integerText = IntegerText.FromWords<T>(words, withSeparators, 3, useSortedMapping, customSortingComparer);

            return new GeneralizedSuffixArray(
                integerText.Text,
                integerText.TextLength,
                integerText.NumberOfWords,
                integerText.WordStartPositions,
                integerText.AlphabetSize);
        }
Beispiel #2
0
        /// <summary>Generates an integer text from words (= a collection of strings). All distinct characters of all words are sorted
        /// (using <paramref name="customComparer"/> if provided, otherwise the default character order) and then each character is mapped
        /// to an integer value, with increasing values in the sort order of the characters.</summary>
        /// <param name="words">The list of individual words. The sequence is enumerated exactly once.</param>
        /// <param name="withSeparators">If set to <c>true</c>, the converted text will contain the concatenated 'words', separated by special separator elements. If set to <c>false</c>, the converted text will contain the concatenated 'words' without separator elements.</param>
        /// <param name="padding">Number of additional elements reserved in the allocated <see cref="Text"/> array. This is necessary for some algorithms. The additional elements will contain zero values.</param>
        /// <param name="customComparer">Provides a custom comparer. If you don't want to provide your own comparer, leave this argument <c>null</c>. Characters that the comparer considers equal are mapped to the same integer value.</param>
        /// <returns>The integer text data, which holds the text converted to an integer alphabet.</returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="words"/> is <c>null</c>.</exception>
        public static IntegerText FromWords(IEnumerable<string> words, bool withSeparators, int padding, IComparer<char> customComparer)
        {
            if (words == null)
            {
                throw new System.ArgumentNullException("words");
            }

            // Materialize the sequence once: it is needed for both the counting pass and the
            // conversion pass, and a deferred sequence could yield different data when enumerated twice.
            var wordList = new List<string>(words);
            var comparer = customComparer ?? Comparer<char>.Default;

            // first pass: collect the distinct characters and count the total number of characters
            int totalNumberOfElements = 0;
            var distinctChars = new HashSet<char>();
            foreach (var currentWord in wordList)
            {
                foreach (var ele in currentWord)
                {
                    distinctChars.Add(ele);
                }
                totalNumberOfElements += currentWord.Length;
            }

            int listCount          = wordList.Count;
            int numberOfSeparators = withSeparators ? listCount : 0;

            var orderedChars = new List<char>(distinctChars);
            orderedChars.Sort(comparer);

            // Give each character a code. Code 0 is left for the zero elements (padding), the codes
            // 1..numberOfSeparators are used by the separators, so the character codes start at
            // numberOfSeparators + 1. Characters that compare equal share one code.
            var dict        = new Dictionary<char, int>(orderedChars.Count);
            int currentCode = numberOfSeparators;
            for (int k = 0; k < orderedChars.Count; ++k)
            {
                if (k == 0 || comparer.Compare(orderedChars[k - 1], orderedChars[k]) != 0)
                {
                    ++currentCode;
                }
                dict[orderedChars[k]] = currentCode;
            }
            int numberOfCharacterCodes = currentCode - numberOfSeparators;

            // second pass: convert the words to the integer alphabet
            int[] text       = new int[totalNumberOfElements + numberOfSeparators + padding];
            int[] wordStarts = new int[listCount + 1]; // wordStarts[0] stays 0, the last entry is the end of the last word
            int   word       = 0;
            int   i          = 0;
            int   separator  = 1;

            foreach (var currentWord in wordList)
            {
                foreach (var ele in currentWord)
                {
                    text[i++] = dict[ele];
                }
                if (withSeparators)
                {
                    text[i++] = separator++; // every word is terminated by its own unique separator
                }
                wordStarts[++word] = i;
            }

            var result = new IntegerText();
            result._alphabetSize       = numberOfCharacterCodes + numberOfSeparators;
            result._paddingLength      = padding;
            result._textLength         = totalNumberOfElements + numberOfSeparators;
            result._text               = text;
            result._numberOfWords      = listCount;
            result._wordStartPositions = wordStarts;

            return result;
        }
Beispiel #3
0
 /// <summary>Initializes a new <see cref="GeneralizedSuffixArray"/> from already converted <see cref="IntegerText"/> data.
 /// The text, text length, number of words, word start positions and alphabet size of <paramref name="integerText"/>
 /// are forwarded to the constructor overload that takes these five values.</summary>
 /// <param name="integerText">The integer text data to build the suffix array from.</param>
 public GeneralizedSuffixArray(IntegerText integerText)
     : this(
         integerText.Text,
         integerText.TextLength,
         integerText.NumberOfWords,
         integerText.WordStartPositions,
         integerText.AlphabetSize)
 {
 }