/// <summary>
/// Constructs the generalized suffix array from separate 'words'. Each list in the parameter
/// <paramref name="words"/> is treated as a 'word', and its elements are treated as characters.
/// The elements are converted to an integer alphabet by means of the <see cref="IntegerText"/> class.
/// </summary>
/// <typeparam name="T">The type of characters of the text. It must implement IComparable.</typeparam>
/// <param name="words">The list of 'words'.</param>
/// <param name="withSeparators">If set to <c>true</c>, the words are treated as individual words and are
/// separated by special separator elements in the converted text. If set to <c>false</c>, the converted text
/// contains the concatenated 'words' without separator elements, i.e. they form one single word.</param>
/// <param name="useSortedMapping">If this parameter is <c>true</c>, a sorted mapping of the elements T to integers
/// will be used. The type T then has to implement IComparable. If this parameter is <c>false</c>, an unsorted
/// <see cref="System.Collections.Generic.HashSet{T}"/> will be used to make a unique mapping of the elements to integers.</param>
/// <param name="customSortingComparer">If <paramref name="useSortedMapping"/> is <c>true</c>, you can provide a custom
/// comparer for the elements of type T here. Otherwise, or if you want to use the default comparer, leave this
/// parameter <c>null</c>.</param>
/// <returns>The generalized suffix array. Since each list in <paramref name="words"/> is treated as a separate word,
/// the generalized suffix array is prepared to search for the longest common substring in these words.</returns>
public static GeneralizedSuffixArray FromWords<T>(IEnumerable<IEnumerable<T>> words, bool withSeparators, bool useSortedMapping, IComparer<T> customSortingComparer) {
    // Forward the caller's separator choice instead of always forcing separators on.
    // NOTE(review): the literal 3 is presumably the padding argument (extra elements reserved
    // after the text) - confirm against IntegerText.FromWords<T>.
    var integerText = IntegerText.FromWords<T>(words, withSeparators, 3, useSortedMapping, customSortingComparer);

    return new GeneralizedSuffixArray(
        integerText.Text,
        integerText.TextLength,
        integerText.NumberOfWords,
        integerText.WordStartPositions,
        integerText.AlphabetSize);
}
/// <summary>
/// Generates an integer text from words (= a collection of strings). The algorithm determines the
/// lexicographical order of all characters in all words and then maps each unique character to an
/// integer value, with increasing values in the lexicographical order of the characters.
/// </summary>
/// <remarks>
/// Integer layout of the generated text: the value 0 is never assigned to a character or separator (the
/// padding elements keep this value); if <paramref name="withSeparators"/> is <c>true</c>, the values
/// 1..(number of words) are the separators, one distinct separator after each word; the character codes
/// start directly after the separator values. Characters that <paramref name="customComparer"/> reports as
/// equal receive the same integer value.
/// </remarks>
/// <param name="words">The list of individual words.</param>
/// <param name="withSeparators">If set to <c>true</c>, the converted text will contain the concatenated 'words',
/// separated by special separator elements. If set to <c>false</c>, the converted text will contain the
/// concatenated 'words' without separator elements.</param>
/// <param name="padding">Number of additional elements reserved in the allocated <see cref="Text"/> array.
/// This is necessary for some algorithms. The additional elements will contain zero values.</param>
/// <param name="customComparer">Provides a custom comparer that defines the order of the characters. If you
/// don't want to provide your own comparer, leave this argument <c>null</c>; the default character order is then used.</param>
/// <returns>The integer text data, which holds the text converted to an integer alphabet.</returns>
/// <exception cref="ArgumentNullException"><paramref name="words"/> is <c>null</c>.</exception>
public static IntegerText FromWords(IEnumerable<string> words, bool withSeparators, int padding, IComparer<char> customComparer) {
    if (words == null) {
        throw new ArgumentNullException(nameof(words));
    }

    // The input is walked twice (alphabet collection, then encoding). Snapshot it once so that a
    // lazily evaluated sequence cannot yield different contents between the two passes.
    var wordList = new List<string>(words);
    int listCount = wordList.Count;

    // Pass 1: collect the distinct characters and the total number of characters.
    var distinctChars = new HashSet<char>();
    int totalNumberOfElements = 0;
    foreach (var list in wordList) {
        foreach (var ele in list) {
            distinctChars.Add(ele);
        }
        totalNumberOfElements += list.Length;
    }

    int numberOfSeparators = withSeparators ? listCount : 0;

    // Order the alphabet with the caller's comparer (or the default character order).
    IComparer<char> comparer = customComparer ?? Comparer<char>.Default;
    var orderedChars = new List<char>(distinctChars);
    orderedChars.Sort(comparer);

    // Give every character its integer code. Codes start at 1 + numberOfSeparators: the values
    // 1..numberOfSeparators are the separators, and zero stays free for the element appended by the
    // suffix sort algorithm. Neighbours that compare equal share one code, so every character that
    // occurs in the input is guaranteed to have a dictionary entry.
    var dict = new Dictionary<char, int>(orderedChars.Count);
    int lastCode = numberOfSeparators;
    for (int k = 0; k < orderedChars.Count; ++k) {
        if (k == 0 || comparer.Compare(orderedChars[k - 1], orderedChars[k]) != 0) {
            ++lastCode;
        }
        dict[orderedChars[k]] = lastCode;
    }
    int numberOfCharacterCodes = lastCode - numberOfSeparators;

    // Pass 2: encode the words. wordStarts[k] is the start index of word k; the final entry is the
    // index just past the last word (including its separator, if any).
    int[] text = new int[totalNumberOfElements + numberOfSeparators + padding];
    int[] wordStarts = new int[listCount + 1];
    int word = 0;
    int i = 0;
    int separator = 1;
    foreach (var list in wordList) {
        foreach (var ele in list) {
            text[i++] = dict[ele];
        }
        if (withSeparators) {
            text[i++] = separator++; // each word is terminated by its own unique separator
        }
        wordStarts[++word] = i;
    }

    var result = new IntegerText();
    result._alphabetSize = numberOfCharacterCodes + numberOfSeparators;
    result._paddingLength = padding;
    result._textLength = totalNumberOfElements + numberOfSeparators;
    result._text = text;
    result._numberOfWords = listCount;
    result._wordStartPositions = wordStarts;
    return result;
}
/// <summary>
/// Initializes a new <see cref="GeneralizedSuffixArray"/> from an already prepared <see cref="IntegerText"/>.
/// The text, text length, number of words, word start positions and alphabet size of
/// <paramref name="integerText"/> are forwarded unchanged to the constructor overload that takes these
/// values individually.
/// </summary>
/// <param name="integerText">The integer text data to build the suffix array from.</param>
public GeneralizedSuffixArray(IntegerText integerText)
    : this(
        integerText.Text,
        integerText.TextLength,
        integerText.NumberOfWords,
        integerText.WordStartPositions,
        integerText.AlphabetSize)
{
}