/// <summary>
/// Maps a partition key to an index into <c>this.partitions</c>.
/// </summary>
/// <param name="partitionKey">Key to map; it is cast to <see cref="string"/> before hashing.</param>
/// <returns>The absolute value of the signed hash modulo <c>this.partitions.Count</c>.</returns>
private int GetPartitionIndex(object partitionKey)
{
    ASCIIEncoding asciiEncoding = new ASCIIEncoding();
    MurmurHash2Simple murmur = new MurmurHash2Simple();

    // Hash the ASCII bytes of the key and reinterpret the result as a signed int.
    byte[] keyBytes = asciiEncoding.GetBytes((string)partitionKey);
    int signedHash = (int)murmur.Hash(keyBytes);

    // The remainder takes the sign of the hash; Math.Abs folds it back into the non-negative range.
    int remainder = signedHash % this.partitions.Count;
    return Math.Abs(remainder);
}
/// <summary>
/// Hashes the UTF-8 bytes of <paramref name="value"/> with <c>MurmurHash2Simple</c> and returns the
/// result as a signed 32-bit integer.
/// </summary>
/// <param name="value">Key to hash; must be non-empty and at most <c>_maxSize</c> characters long.</param>
/// <returns>The hash bits reinterpreted as an <see cref="int"/>; the value may be negative.</returns>
/// <exception cref="ArgumentNullException"><paramref name="value"/> is null or empty.</exception>
/// <exception cref="ArgumentException"><paramref name="value"/> is longer than <c>_maxSize</c> characters.</exception>
private int GetHash(string value)
{
    // Validate first so that no hasher is allocated for rejected input.
    if (string.IsNullOrEmpty(value))
    {
        throw new ArgumentNullException(nameof(value));
    }

    if (value.Length > _maxSize)
    {
        throw new ArgumentException($"Максимальная длинна ключа составляет {_maxSize} символов.", nameof(value));
    }

    MurmurHash2Simple hasher = new MurmurHash2Simple();
    byte[] utf8Bytes = System.Text.Encoding.UTF8.GetBytes(value);

    // Convert.ToInt32 on an unsigned value throws OverflowException for anything above int.MaxValue,
    // which would reject about half of all possible 32-bit hashes. An unchecked cast keeps the bit
    // pattern instead and returns the same result as before for every hash that used to succeed.
    return unchecked((int)hasher.Hash(utf8Bytes));
}
/// <summary>
/// Estimates the amount of time that is likely saved by the apriori algorithm on a specific dataset when
/// it compares uint hashes instead of strings. A comparison count is derived from the scenario, then that
/// many string equality checks and that many uint equality checks are timed.
/// </summary>
/// <param name="attributes">number of attributes in the dataset header; @attribute a1 {..}</param>
/// <param name="attribute_values">median of attribute value count in the attributes; @attribute .. { val1, val2, val3, .. }</param>
/// <param name="dataset_columns">@data row (instances) count of the dataset</param>
/// <param name="percentage">percentage (5%-85%) of item-pair checks that might be avoided by the algorithm (speeds it up)</param>
/// <returns>[0] = calculated comparisons, [1] = uint comparison time, [2] = string comparison time (times are milliseconds)</returns>
public static long[] run_int_vs_string_comparison(int attributes = 10, int attribute_values = 5, int dataset_columns = 50000, int percentage = 30)
{
    // Calculate the number of comparisons from the scenario. The product is built in 64-bit
    // arithmetic: in plain int math it silently wraps around as soon as it exceeds int.MaxValue,
    // which yields a bogus (possibly negative) comparison count for larger datasets.
    long item_count = (long)attributes * attribute_values;
    long pair_checks = item_count * item_count / 2 * dataset_columns * attributes;
    long compare_num = (long)((double)pair_checks * ((double)percentage / 100));

    string _s1 = "association rules mining";
    string _s2 = "apriori data mining algorithm";
    string _s3 = "algorithm mining data apriori";

    MurmurHash2Simple hash = new MurmurHash2Simple();
    uint _i1 = get_representative_int_hash(hash, _s1);
    uint _i2 = get_representative_int_hash(hash, _s2);
    uint _i3 = get_representative_int_hash(hash, _s3);

    long uints = -1;
    long strings = -1;

    Stopwatch sw = new Stopwatch();
    sw.Start();

    // Counters exist only so that the comparisons have an observable effect inside the loops.
    long do_something_1 = 0;
    long do_something_2 = 0;

    // comparing strings
    for (long i = 0; i < compare_num; i++)
    {
        if (i % 2 == 0)
        {
            if (_s1 == _s2) // doing the same thing will be optimized by the compiler..
            {
                do_something_1++;
            }
        }
        else
        {
            if (_s1 == _s3) // therefore another one here
            {
                do_something_2++;
            }
        }
    }

    strings = sw.ElapsedMilliseconds;

    do_something_1 = 0;
    do_something_2 = 0;
    sw.Restart();

    // comparing uint hashes
    for (long i = 0; i < compare_num; i++)
    {
        if (i % 2 == 0)
        {
            if (_i1 == _i2) // doing the same thing will be optimized by the compiler..
            {
                do_something_1++;
            }
        }
        else
        {
            if (_i1 == _i3) // therefore another one here
            {
                do_something_2++;
            }
        }
    }

    uints = sw.ElapsedMilliseconds;

    Helpers.Utils.Debug(string.Format("Compared {0} times; uints took {1} ms, strings took {2} ms.", compare_num, uints, strings));

    return new long[] { compare_num, uints, strings };
}
/// <summary>
/// Produces a representative uint hash for a string by hashing its UTF-8 bytes with the supplied hasher.
/// </summary>
/// <param name="hash">Hasher instance used to compute the value.</param>
/// <param name="dataset_column">Text to hash; it is encoded as UTF-8 before hashing.</param>
/// <returns>The value returned by <c>hash.Hash</c> for the encoded bytes.</returns>
public static uint get_representative_int_hash(MurmurHash2Simple hash, string dataset_column)
{
    byte[] utf8Bytes = Encoding.UTF8.GetBytes(dataset_column);
    uint hashed = hash.Hash(utf8Bytes);
    return hashed;
}
/// <summary>
/// Selects the partition index for a key by hashing the key's ASCII bytes.
/// </summary>
/// <param name="partitionKey">Key to look up; it is cast to <see cref="string"/> before encoding.</param>
/// <returns>The absolute value of the signed hash modulo <c>this.partitions.Count</c>.</returns>
private int GetPartitionIndex(object partitionKey)
{
    ASCIIEncoding ascii = new ASCIIEncoding();
    MurmurHash2Simple hashFunction = new MurmurHash2Simple();

    byte[] encodedKey = ascii.GetBytes((string)partitionKey);

    // The hash is reinterpreted as a signed int, so the remainder below can be negative.
    int hashValue = (int)hashFunction.Hash(encodedKey);
    int signedSlot = hashValue % this.partitions.Count;

    // Math.Abs maps a negative remainder back to a non-negative index.
    return Math.Abs(signedSlot);
}