/// <summary>
/// Initialises the instance from an existing lookup of weighted-entry lists, optionally sanity-checking the lookup's values first.
/// </summary>
/// <param name="data">Lookup whose values are the WeightedEntry lists to store; must not be null.</param>
/// <param name="dataKeyComparer">Comparer that is exposed through KeyComparer; must not be null.</param>
/// <param name="validate">When true, every value in <paramref name="data"/> is checked to be non-null and non-empty.</param>
/// <exception cref="ArgumentNullException"><paramref name="data"/> or <paramref name="dataKeyComparer"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="validate"/> is true and a value in <paramref name="data"/> is null or has a Count of zero.</exception>
private IndexData(TernarySearchTreeDictionary<NonNullImmutableList<WeightedEntry<TKey>>> data, IEqualityComparer<TKey> dataKeyComparer, bool validate)
{
    // Reject null arguments up front so that no validation work is done (and no field is assigned) for a call that is going to fail anyway
    if (data == null)
    {
        throw new ArgumentNullException(nameof(data));
    }
    if (dataKeyComparer == null)
    {
        throw new ArgumentNullException(nameof(dataKeyComparer));
    }

    // If the constructor is called from a method within this class then the data should be known to be valid but if the constructor call was from
    // other code then perform some sanity checking on it
    if (validate)
    {
        var allValues = data.GetAllValues();
        if (allValues.Any(v => v == null))
        {
            throw new ArgumentException("data may not contain any null WeightedEntry list values");
        }
        if (allValues.Any(v => v.Count == 0))
        {
            throw new ArgumentException("data may not contain any empty WeightedEntry list values");
        }
    }

    _data = data;
    KeyComparer = dataKeyComparer;

    // Evaluated lazily: source locations are only reported as available if no entry in any value list has a null SourceLocationsIfRecorded
    _sourceLocationsAvailable = new Lazy<bool>(() =>
    {
        var atLeastOneEntryIsMissingSourceLocations = _data.GetAllValues().Any(entries => entries.Any(entry => entry.SourceLocationsIfRecorded == null));
        return !atLeastOneEntryIsMissingSourceLocations;
    });
}
/// <summary>
/// Serialises a TernarySearchTreeDictionary by wrapping its normalised keys (zipped with its values) and its key normaliser in a
/// SerialisableData instance and handing that to the serializer.
/// </summary>
/// <param name="writer">Writer that is passed through to the serializer.</param>
/// <param name="value">Dictionary whose normalised keys, values and KeyNormaliser are written.</param>
/// <param name="serializer">Serializer that performs the actual write.</param>
private void Write<TValue>(JsonWriter writer, TernarySearchTreeDictionary<TValue> value, JsonSerializer serializer)
{
    // Note: Not using the ToDictionary method since that uses the KeyNormaliser as the IEqualityComparer on the dictionary, which is just one more
    // complication - it's simpler to just take lists of the keys and values and then to recombine them when de-serialising
    var normalisedKeys = value.GetAllNormalisedKeys();
    var storedValues = value.GetAllValues();
    var keysPairedWithValues = normalisedKeys.Zip(
        storedValues,
        (key, item) => new KeyValuePair<string, TValue>(key, item)
    );

    var payload = new SerialisableData<TValue>
    {
        NormalisedKeysWithValues = keysPairedWithValues,
        KeyNormaliser = value.KeyNormaliser
    };
    serializer.Serialize(writer, payload);
}
// ========================================================================================================================================================
// MAIN
// ========================================================================================================================================================
// Console benchmark entry point. It loads a token list from disk into a standard Dictionary, builds eight alternative lookup structures from the same
// data, repeatedly times key retrievals against all nine structures and finally prints, for each alternative, the ratio of the standard Dictionary's
// accumulated time to its own accumulated time. The args parameter is not used.
static void Main(string[] args)
{
    // The file "SampleData.dat" is derived from data obtained by querying the New York Times Article Search API in my FullTextIndexer project (and then
    // manipulated to remove dependencies to any classes in that work). From what I understand of the API's Terms of Use it's fine to distribute this
    // data here (it's essentially just a set of words that appear in some of their articles). I used that so that there was a real set of data to
    // operate on (there's 86,000-ish keys so it's not a small set but not enormous by any stretch).
    // NOTE(review): the paragraph above names "SampleData.dat" but the code below reads "TokenList.txt" - confirm which file name is current.
    var keyNormaliser = new DefaultStringNormaliser();
    var data = new Dictionary <string, float>(keyNormaliser);

    // One token per '\n'-separated line (trimmed). Tokens whose normalised form is empty or already present are skipped; the value stored for each
    // key is the dictionary's Count at the time of insertion (ie. the key's insertion index).
    foreach (var token in File.ReadAllText("TokenList.txt").Split('\n').Select(t => t.Trim()))
    {
        var normalisedToken = keyNormaliser.GetNormalisedString(token);
        if ((normalisedToken != "") && !data.ContainsKey(normalisedToken))
        {
            data.Add(normalisedToken, data.Count);
        }
    }

    // Build each alternative structure from the same source data, writing a timestamp before each one so that construction cost can be read off the
    // console output. The balance factor is printed for the TernarySearchTreeDictionary variants, which differ only in the order of the data passed in.
    // NOTE(review): data4 is labelled "Unsorted" here but "InsertedInOrder" in the results output further down - both refer to the same structure.
    Console.WriteLine(DateTime.Now.ToString("HH:mm:ss.fff") + " Generating data1..");
    var data1 = new BitShiftingSearchDictionary <string, float>(data);
    Console.WriteLine(DateTime.Now.ToString("HH:mm:ss.fff") + " Generating data2..");
    var data2 = new BitShiftingStructSearchDictionaryWithKeyNotFoundNode <string, float>(data);
    Console.WriteLine(DateTime.Now.ToString("HH:mm:ss.fff") + " Generating data3..");
    var data3 = new BitShiftingStructSearchDictionary <string, float>(data);
    Console.WriteLine(DateTime.Now.ToString("HH:mm:ss.fff") + " Generating data4 [TernarySearchTree-Unsorted]..");
    var data4 = new TernarySearchTreeDictionary <float>(data, new DefaultStringNormaliser());
    Console.WriteLine(" > BalanceFactor: " + data4.GetBalanceFactor());
    Console.WriteLine(DateTime.Now.ToString("HH:mm:ss.fff") + " Generating data5 [TernarySearchTree-Alphabetical]..");
    var data5 = new TernarySearchTreeDictionary <float>(GetAlphabeticalSortedData(data), new DefaultStringNormaliser());
    Console.WriteLine(" > BalanceFactor: " + data5.GetBalanceFactor());
    Console.WriteLine(DateTime.Now.ToString("HH:mm:ss.fff") + " Generating data6 [TernarySearchTree-RandomSorted]..");
    // presumably the second argument (0) is a random seed that keeps the ordering repeatable between runs - TODO confirm against GetRandomSortedData
    var data6 = new TernarySearchTreeDictionary <float>(GetRandomSortedData(data, 0), new DefaultStringNormaliser());
    Console.WriteLine(" > BalanceFactor: " + data6.GetBalanceFactor());
    Console.WriteLine(DateTime.Now.ToString("HH:mm:ss.fff") + " Generating data7 [TernarySearchTree-SearchTreeSortedData]..");
    var data7 = new TernarySearchTreeDictionary <float>(GetSearchTreeSortedData <float>(data), new DefaultStringNormaliser());
    Console.WriteLine(" > BalanceFactor: " + data7.GetBalanceFactor());
    Console.WriteLine(DateTime.Now.ToString("HH:mm:ss.fff") + " Generating data8..");
    var data8 = new TernarySearchTreeStructDictionary <float>(GetSearchTreeSortedData <float>(data), new DefaultStringNormaliser());
    Console.WriteLine(DateTime.Now.ToString("HH:mm:ss.fff") + " - Done");

    // Every structure is timed outerLoopRepeatCount times; innerLoopsCount is passed through to GetTimeForRetrievals for each timing
    var outerLoopRepeatCount = 50;
    var innerLoopsCount = 1000;

    // The reverseKeyChance value specifies how many of the keys that are taken as a random subset of the input data are reversed such that some of the requested
    // keys will exist in the data and some won't (a value of 0 means that no keys are reversed, 1 means that they all are). Having run these tests a few times
    // I've found the following results (in Release builds, which show better results for the search tree than Debug).
    // - reverseKeyChance 1 (none of the searched-for keys exist in the data): TernarySearchTree is slightly slower, 0.97x the speed of the standard dictionary
    // - reverseKeyChance 0.99: TernarySearchTree is slightly slower, 0.99x
    // - reverseKeyChance 0.98: TernarySearchTree is slightly faster, almost 1.05x
    // - reverseKeyChance 0.97: TernarySearchTree is slightly faster, almost 1.1x
    // - reverseKeyChance 0.9: TernarySearchTree is faster, over 1.2x
    // - reverseKeyChance 0.75: TernarySearchTree is faster, over 1.42x
    // - reverseKeyChance 0.5: TernarySearchTree is faster, over 1.9x
    // - reverseKeyChance 0 (all of the searched-for keys exist in the data): TernarySearchTree is faster, over 2.8x the speed of the standard dictionary
    var reverseKeyChance = 0.5f;
    // presumably 100 is the number of keys to select and 0 is a random seed - TODO confirm against GetKeysToRetrieve
    var keysToRetrieve = GetKeysToRetrieve(data.Keys, 100, 0, reverseKeyChance);

    // Accumulated retrieval time per structure: TotalTime0 is the standard Dictionary, TotalTime1 to TotalTime8 correspond to data1 to data8
    var TotalTime0 = new TimeSpan(0);
    var TotalTime1 = new TimeSpan(0);
    var TotalTime2 = new TimeSpan(0);
    var TotalTime3 = new TimeSpan(0);
    var TotalTime4 = new TimeSpan(0);
    var TotalTime5 = new TimeSpan(0);
    var TotalTime6 = new TimeSpan(0);
    var TotalTime7 = new TimeSpan(0);
    var TotalTime8 = new TimeSpan(0);
    for (var index = 0; index < outerLoopRepeatCount; index++)
    {
        // The same keysToRetrieve set is used for every structure; within a pass the structures are timed from data8 down to the standard Dictionary
        var time8 = GetTimeForRetrievals <string, float>(data8, keysToRetrieve, innerLoopsCount);
        TotalTime8 = TotalTime8.Add(time8.TimeTaken);
        var time7 = GetTimeForRetrievals <string, float>(data7, keysToRetrieve, innerLoopsCount);
        TotalTime7 = TotalTime7.Add(time7.TimeTaken);
        var time6 = GetTimeForRetrievals <string, float>(data6, keysToRetrieve, innerLoopsCount);
        TotalTime6 = TotalTime6.Add(time6.TimeTaken);
        var time5 = GetTimeForRetrievals <string, float>(data5, keysToRetrieve, innerLoopsCount);
        TotalTime5 = TotalTime5.Add(time5.TimeTaken);
        var time4 = GetTimeForRetrievals <string, float>(data4, keysToRetrieve, innerLoopsCount);
        TotalTime4 = TotalTime4.Add(time4.TimeTaken);
        var time3 = GetTimeForRetrievals <string, float>(data3, keysToRetrieve, innerLoopsCount);
        TotalTime3 = TotalTime3.Add(time3.TimeTaken);
        var time2 = GetTimeForRetrievals <string, float>(data2, keysToRetrieve, innerLoopsCount);
        TotalTime2 = TotalTime2.Add(time2.TimeTaken);
        var time1 = GetTimeForRetrievals <string, float>(data1, keysToRetrieve, innerLoopsCount);
        TotalTime1 = TotalTime1.Add(time1.TimeTaken);
        var time0 = GetTimeForRetrievals <string, float>(data, keysToRetrieve, innerLoopsCount);
        TotalTime0 = TotalTime0.Add(time0.TimeTaken);
        // Progress is reported as the percentage of outer passes completed so far
        Console.WriteLine(((float)((index + 1) * 100) / (float)outerLoopRepeatCount).ToString("0.000") + "% complete..");
    }

    // Each "improvement" is the standard Dictionary's accumulated time divided by the alternative's accumulated time, so a value above 1 means that
    // the alternative took less time in total than the standard Dictionary
    Console.WriteLine("reverseKeyChance: " + reverseKeyChance);
    var improvement1 = TotalTime0.TotalMilliseconds / TotalTime1.TotalMilliseconds;
    Console.WriteLine("improvement1 [BitShiftDictionary]: " + improvement1);
    var improvement2 = TotalTime0.TotalMilliseconds / TotalTime2.TotalMilliseconds;
    Console.WriteLine("improvement2 [BitShiftStructDictionaryWithKeyNotFound]: " + improvement2);
    var improvement3 = TotalTime0.TotalMilliseconds / TotalTime3.TotalMilliseconds;
    Console.WriteLine("improvement3 [BitShiftStructDictionary]: " + improvement3);
    var improvement4 = TotalTime0.TotalMilliseconds / TotalTime4.TotalMilliseconds;
    Console.WriteLine("improvement4 [TernarySearchTree-InsertedInOrder]: " + improvement4);
    var improvement5 = TotalTime0.TotalMilliseconds / TotalTime5.TotalMilliseconds;
    Console.WriteLine("improvement5 [TernarySearchTree-Alphabetical]: " + improvement5);
    var improvement6 = TotalTime0.TotalMilliseconds / TotalTime6.TotalMilliseconds;
    Console.WriteLine("improvement6 [TernarySearchTree-RandomSorted]: " + improvement6);
    var improvement7 = TotalTime0.TotalMilliseconds / TotalTime7.TotalMilliseconds;
    Console.WriteLine("improvement7 [TernarySearchTree-SearchTreeSortedData]: " + improvement7);
    var improvement8 = TotalTime0.TotalMilliseconds / TotalTime8.TotalMilliseconds;
    Console.WriteLine("improvement8 [TernarySearchTree-SearchTreeSortedData, struct store]: " + improvement8);

    // Keep the console window open until the user has had a chance to read the results
    Console.WriteLine("Press [Enter] to continue..");
    Console.ReadLine();
}
/// <summary>
/// Creates an instance from caller-supplied data by delegating to the three-argument constructor with the validate flag set to true, so the
/// supplied data is sanity-checked before it is stored.
/// </summary>
/// <param name="data">Lookup of WeightedEntry lists to store.</param>
/// <param name="dataKeyComparer">Key comparer to associate with the data.</param>
public IndexData(
    TernarySearchTreeDictionary<NonNullImmutableList<WeightedEntry<TKey>>> data,
    IEqualityComparer<TKey> dataKeyComparer)
    : this(data, dataKeyComparer, validate: true)
{
}