// Checks addLong/testLong with byte-sized values: four fixed probes are
// inserted one at a time, then COUNT random values, and finally a value
// that was never inserted is probed.
public void testBloomFilterByte()
{
    BloomFilter bf = new BloomFilter(10000);

    // Signed bytes are used deliberately. C# `byte` is unsigned, so
    // unchecked((byte)-120) evaluates to 136, which lies inside the range
    // the random loop below would draw from; the "absent" probe could then
    // be inserted by chance. With sbyte the random values are 0..126 and the
    // probe -120 can never be one of them.
    sbyte[] probes = { SByte.MinValue, 1, 2, SByte.MaxValue };

    // Round 0 checks the empty filter; round k inserts probes[k - 1] first.
    // After each round exactly the first `inserted` probes must be reported.
    for (int inserted = 0; inserted <= probes.Length; inserted++)
    {
        if (inserted > 0)
        {
            bf.addLong(probes[inserted - 1]);
        }
        for (int j = 0; j < probes.Length; j++)
        {
            Assert.Equal(j < inserted, bf.testLong(probes[j]));
        }
    }

    sbyte randVal = 0;
    for (int i = 0; i < COUNT; i++)
    {
        // Next(max) is exclusive of max, so this yields 0..126.
        randVal = (sbyte)rand.Next(SByte.MaxValue);
        bf.addLong(randVal);
    }

    // last value should be present
    Assert.True(bf.testLong(randVal));

    // this value was never inserted (negative, outside the random range)
    const sbyte neverInserted = -120;
    Assert.False(bf.testLong(neverInserted));

    Assert.Equal(7800, bf.sizeInBytes());
}
// Checks addString/testString: four fixed strings are inserted one at a
// time, followed by COUNT random longs rendered as invariant-culture text.
public void testBloomFilterString()
{
    BloomFilter filter = new BloomFilter(100000);
    string[] samples = { "bloo", "bloom fil", "bloom filter", "cuckoo filter" };

    // Round 0 checks the empty filter; round k inserts samples[k - 1] first.
    // After each round exactly the first `inserted` samples must be reported.
    for (int inserted = 0; inserted <= samples.Length; inserted++)
    {
        if (inserted > 0)
        {
            filter.addString(samples[inserted - 1]);
        }
        for (int j = 0; j < samples.Length; j++)
        {
            Assert.Equal(j < inserted, filter.testString(samples[j]));
        }
    }

    long lastRandom = 0;
    for (int round = 0; round < COUNT; round++)
    {
        lastRandom = rand.NextLong();
        filter.addString(lastRandom.ToString(CultureInfo.InvariantCulture));
    }

    // last value should be present
    Assert.True(filter.testString(lastRandom.ToString(CultureInfo.InvariantCulture)));

    // most likely this value should not exist
    Assert.False(filter.testString((-120L).ToString(CultureInfo.InvariantCulture)));

    Assert.Equal(77944, filter.sizeInBytes());
}
// Checks addDouble/testDouble when fed single-precision values: four fixed
// floats are inserted one at a time, followed by COUNT random floats.
public void testBloomFilterFloat()
{
    BloomFilter filter = new BloomFilter(10000);

    // Kept as float so each call widens float -> double at the call site.
    float[] samples = { Single.MinValue, 1.1f, 2.2f, Single.MaxValue };

    // Round 0 checks the empty filter; round k inserts samples[k - 1] first.
    // After each round exactly the first `inserted` samples must be reported.
    for (int inserted = 0; inserted <= samples.Length; inserted++)
    {
        if (inserted > 0)
        {
            filter.addDouble(samples[inserted - 1]);
        }
        for (int j = 0; j < samples.Length; j++)
        {
            Assert.Equal(j < inserted, filter.testDouble(samples[j]));
        }
    }

    float lastRandom = 0;
    for (int round = 0; round < COUNT; round++)
    {
        lastRandom = rand.NextFloat();
        filter.addDouble(lastRandom);
    }

    // last value should be present
    Assert.True(filter.testDouble(lastRandom));

    // most likely this value should not exist
    Assert.False(filter.testDouble(-120.2f));

    Assert.Equal(7800, filter.sizeInBytes());
}
// Checks addLong/testLong over the full 64-bit range: four fixed values are
// inserted one at a time, followed by COUNT random longs.
public void testBloomFilterLong()
{
    BloomFilter filter = new BloomFilter(10000);
    long[] samples = { Int64.MinValue, 1, 2, Int64.MaxValue };

    // Round 0 checks the empty filter; round k inserts samples[k - 1] first.
    // After each round exactly the first `inserted` samples must be reported.
    for (int inserted = 0; inserted <= samples.Length; inserted++)
    {
        if (inserted > 0)
        {
            filter.addLong(samples[inserted - 1]);
        }
        for (int j = 0; j < samples.Length; j++)
        {
            Assert.Equal(j < inserted, filter.testLong(samples[j]));
        }
    }

    long lastRandom = 0;
    for (int round = 0; round < COUNT; round++)
    {
        lastRandom = rand.NextLong();
        filter.addLong(lastRandom);
    }

    // last value should be present
    Assert.True(filter.testLong(lastRandom));

    // most likely this value should not exist
    Assert.False(filter.testLong(-120));

    Assert.Equal(7800, filter.sizeInBytes());
}
/// <summary>
/// Merge the specified bloom filter with current bloom filter.
/// </summary>
/// <remarks>
/// The merge is performed only when <paramref name="that"/> is a different
/// instance with the same <c>numBits</c> and <c>numHashFunctions</c>; in that
/// case its bit set is folded into this filter's bit set via <c>putAll</c>.
/// Passing this same instance is rejected like any other incompatible filter.
/// </remarks>
/// <param name="that">bloom filter to merge</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="that"/> is null.
/// </exception>
/// <exception cref="ArgumentException">
/// <paramref name="that"/> is this instance, or differs in bit count or in
/// number of hash functions.
/// </exception>
public void merge(BloomFilter that)
{
    // Fail with a descriptive argument error instead of a
    // NullReferenceException from the field accesses below.
    // ArgumentNullException derives from ArgumentException, so callers that
    // already catch ArgumentException keep working.
    if (that == null)
    {
        throw new ArgumentNullException("that");
    }

    bool compatible = this != that
        && this.numBits == that.numBits
        && this.numHashFunctions == that.numHashFunctions;
    if (!compatible)
    {
        throw new ArgumentException("BloomFilters are not compatible for merging." +
            " this - " + this.ToString() + " that - " + that.ToString());
    }

    this.bitSet.putAll(that.bitSet);
}
/**
 * Create a tree writer.
 *
 * Stores the constructor arguments, creates the three statistics holders
 * (index, stripe and file level) from the schema, and sets up the row-index
 * builders. The PRESENT, ROW_INDEX and BLOOM_FILTER streams are requested from
 * the stream factory only when they are needed; the corresponding fields are
 * set to null otherwise.
 *
 * @param columnId the column id of the column to write
 * @param inspector the object inspector to use
 * @param schema the row schema
 * @param streamFactory limited access to the Writer's data.
 * @param nullable can the value be null?
 */
protected TreeWriter(
    int columnId,
    ObjectInspector inspector,
    TypeDescription schema,
    StreamFactory streamFactory,
    bool nullable)
{
    this.streamFactory = streamFactory;
    this.isCompressed = streamFactory.isCompressed();
    this.id = columnId;
    this.inspector = inspector;
    // The PRESENT stream and its bit writer exist only for nullable columns.
    if (nullable)
    {
        isPresentOutStream = streamFactory.createStream(id,
            OrcProto.Stream.Types.Kind.PRESENT);
        isPresent = new BitFieldWriter(isPresentOutStream, 1);
    }
    else
    {
        isPresent = null;
    }
    this.foundNulls = false;
    // Per-column flag, looked up by column id in the factory's bloom filter column array.
    createBloomFilter = streamFactory.getBloomFilterColumns()[columnId];
    // Three independent statistics objects, all created from the same schema.
    indexStatistics = ColumnStatisticsImpl.create(schema);
    stripeColStatistics = ColumnStatisticsImpl.create(schema);
    fileStatistics = ColumnStatisticsImpl.create(schema);
    // Starts with no children.
    childrenWriters = new TreeWriter[0];
    rowIndex = OrcProto.RowIndex.CreateBuilder();
    rowIndexEntry = OrcProto.RowIndexEntry.CreateBuilder();
    // The position recorder is constructed around the current row index entry builder.
    rowIndexPosition = new RowIndexPositionRecorder(rowIndexEntry);
    stripeStatsBuilders = new List<OrcProto.StripeStatistics.Builder>();
    // The ROW_INDEX stream is created only when the factory reports that an index is built.
    if (streamFactory.buildIndex())
    {
        rowIndexStream = streamFactory.createStream(id, OrcProto.Stream.Types.Kind.ROW_INDEX);
    }
    else
    {
        rowIndexStream = null;
    }
    // Bloom filter builders, stream and filter are all-or-nothing for this column.
    if (createBloomFilter)
    {
        bloomFilterEntry = OrcProto.BloomFilter.CreateBuilder();
        bloomFilterIndex = OrcProto.BloomFilterIndex.CreateBuilder();
        bloomFilterStream = streamFactory.createStream(id,
            OrcProto.Stream.Types.Kind.BLOOM_FILTER);
        // Constructed from the factory's row index stride and bloom filter FPP setting.
        bloomFilter = new BloomFilter(streamFactory.getRowIndexStride(),
            streamFactory.getBloomFilterFPP());
    }
    else
    {
        bloomFilterEntry = null;
        bloomFilterIndex = null;
        bloomFilterStream = null;
        bloomFilter = null;
    }
}
// NULL_SAFE_EQUALS on a LONG column: the literal 15 lies inside the column
// statistics range [10, 100], so the result depends on whether the bloom
// filter reports it.
public void testIntNullSafeEqualsBloomFilter()
{
    PredicateLeaf leaf = TestSearchArgumentImpl.createPredicateLeaf(
        PredicateLeaf.Operator.NULL_SAFE_EQUALS,
        PredicateLeaf.Type.LONG,
        "x",
        15L,
        null);

    // Populate with 20..999, which leaves 15 out.
    BloomFilter filter = new BloomFilter(10000);
    for (int value = 20; value <= 999; value++)
    {
        filter.addLong(value);
    }

    ColumnStatistics stats = ColumnStatisticsImpl.deserialize(createIntStats(10, 100));

    // 15 is not in the filter yet.
    TruthValue beforeInsert = RecordReaderImpl.evaluatePredicate(stats, leaf, filter);
    Assert.Equal(TruthValue.NO, beforeInsert);

    // Once 15 is inserted the filter can no longer rule it out.
    filter.addLong(15);
    TruthValue afterInsert = RecordReaderImpl.evaluatePredicate(stats, leaf, filter);
    Assert.Equal(TruthValue.YES_NO, afterInsert);
}
// IN predicate on a STRING column: the row group can only be ruled out
// while none of the IN-list values is reported by the bloom filter.
public void testStringInBloomFilter()
{
    List<object> inList = new List<object> { "str_15", "str_19" };
    PredicateLeaf leaf = TestSearchArgumentImpl.createPredicateLeaf(
        PredicateLeaf.Operator.IN,
        PredicateLeaf.Type.STRING,
        "x",
        null,
        inList);

    // Populate with str_20..str_999, which leaves both IN-list values out.
    BloomFilter filter = new BloomFilter(10000);
    for (int suffix = 20; suffix <= 999; suffix++)
    {
        filter.addString("str_" + suffix);
    }

    ColumnStatistics stats = ColumnStatisticsImpl.deserialize(createStringStats("str_10", "str_200"));

    // Neither candidate is present.
    Assert.Equal(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));

    // One candidate present is enough to prevent elimination.
    filter.addString("str_19");
    Assert.Equal(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));

    // Both candidates present: same outcome.
    filter.addString("str_15");
    Assert.Equal(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));
}
// EQUALS on a FLOAT column: the literal 15.0 lies inside the column
// statistics range [10.0, 100.0], so the result depends on whether the bloom
// filter reports it.
public void testDoubleEqualsBloomFilter()
{
    PredicateLeaf leaf = TestSearchArgumentImpl.createPredicateLeaf(
        PredicateLeaf.Operator.EQUALS,
        PredicateLeaf.Type.FLOAT,
        "x",
        15.0,
        null);

    // Populate with 20..999 (each int is widened to double), which leaves 15.0 out.
    BloomFilter filter = new BloomFilter(10000);
    for (int value = 20; value <= 999; value++)
    {
        filter.addDouble(value);
    }

    ColumnStatistics stats = ColumnStatisticsImpl.deserialize(createDoubleStats(10.0, 100.0));

    // 15.0 is not in the filter yet.
    TruthValue beforeInsert = RecordReaderImpl.evaluatePredicate(stats, leaf, filter);
    Assert.Equal(TruthValue.NO_NULL, beforeInsert);

    // Once 15.0 is inserted the filter can no longer rule it out.
    filter.addDouble(15.0);
    TruthValue afterInsert = RecordReaderImpl.evaluatePredicate(stats, leaf, filter);
    Assert.Equal(TruthValue.YES_NO_NULL, afterInsert);
}
// IN predicate on a LONG column: the row group can only be ruled out while
// none of the IN-list values is reported by the bloom filter.
public void testIntInBloomFilter()
{
    List<object> inList = new List<object> { 15L, 19L };
    PredicateLeaf leaf = TestSearchArgumentImpl.createPredicateLeaf(
        PredicateLeaf.Operator.IN,
        PredicateLeaf.Type.LONG,
        "x",
        null,
        inList);

    // Populate with 20..999, which leaves both IN-list values out.
    BloomFilter filter = new BloomFilter(10000);
    for (int value = 20; value <= 999; value++)
    {
        filter.addLong(value);
    }

    ColumnStatistics stats = ColumnStatisticsImpl.deserialize(createIntStats(10, 100));

    // Neither candidate is present.
    Assert.Equal(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));

    // One candidate present is enough to prevent elimination.
    filter.addLong(19);
    Assert.Equal(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));

    // Both candidates present: same outcome.
    filter.addLong(15);
    Assert.Equal(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));
}
// NULL_SAFE_EQUALS on a DECIMAL column. The filter is populated with the
// string form of each decimal, and the literal 15 lies inside the column
// statistics range ["10", "200"].
public void testDecimalNullSafeEqualsBloomFilter()
{
    PredicateLeaf leaf = TestSearchArgumentImpl.createPredicateLeaf(
        PredicateLeaf.Operator.NULL_SAFE_EQUALS,
        PredicateLeaf.Type.DECIMAL,
        "x",
        HiveDecimal.Parse("15"),
        null);

    // Populate with 20..999, which leaves 15 out.
    BloomFilter filter = new BloomFilter(10000);
    for (int value = 20; value <= 999; value++)
    {
        filter.addString(HiveDecimal.create(value).ToString());
    }

    ColumnStatistics stats = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200"));

    // 15 is not in the filter yet.
    TruthValue beforeInsert = RecordReaderImpl.evaluatePredicate(stats, leaf, filter);
    Assert.Equal(TruthValue.NO, beforeInsert);

    // Once 15 is inserted the filter can no longer rule it out.
    filter.addString(HiveDecimal.create(15).ToString());
    TruthValue afterInsert = RecordReaderImpl.evaluatePredicate(stats, leaf, filter);
    Assert.Equal(TruthValue.YES_NO, afterInsert);
}
// NULL_SAFE_EQUALS on a TIMESTAMP column. The filter is populated with each
// timestamp's Milliseconds value, and the literal Timestamp(15) lies inside
// the column statistics range (10, 100).
public void testTimestampNullSafeEqualsBloomFilter()
{
    PredicateLeaf leaf = TestSearchArgumentImpl.createPredicateLeaf(
        PredicateLeaf.Operator.NULL_SAFE_EQUALS,
        PredicateLeaf.Type.TIMESTAMP,
        "x",
        new Timestamp(15),
        null);

    // Populate with Timestamp(20)..Timestamp(999), which leaves Timestamp(15) out.
    BloomFilter filter = new BloomFilter(10000);
    for (int value = 20; value <= 999; value++)
    {
        filter.addLong((new Timestamp(value)).Milliseconds);
    }

    ColumnStatistics stats = ColumnStatisticsImpl.deserialize(createTimestampStats(10, 100));

    // Timestamp(15) is not in the filter yet.
    TruthValue beforeInsert = RecordReaderImpl.evaluatePredicate(stats, leaf, filter);
    Assert.Equal(TruthValue.NO, beforeInsert);

    // Once it is inserted the filter can no longer rule it out.
    filter.addLong((new Timestamp(15)).Milliseconds);
    TruthValue afterInsert = RecordReaderImpl.evaluatePredicate(stats, leaf, filter);
    Assert.Equal(TruthValue.YES_NO, afterInsert);
}
// NULL_SAFE_EQUALS on a STRING column: the literal "str_15" lies inside the
// column statistics range ["str_10", "str_200"], so the result depends on
// whether the bloom filter reports it.
public void testStringNullSafeEqualsBloomFilter()
{
    PredicateLeaf leaf = TestSearchArgumentImpl.createPredicateLeaf(
        PredicateLeaf.Operator.NULL_SAFE_EQUALS,
        PredicateLeaf.Type.STRING,
        "x",
        "str_15",
        null);

    // Populate with str_20..str_999, which leaves str_15 out.
    BloomFilter filter = new BloomFilter(10000);
    for (int suffix = 20; suffix <= 999; suffix++)
    {
        filter.addString("str_" + suffix);
    }

    ColumnStatistics stats = ColumnStatisticsImpl.deserialize(createStringStats("str_10", "str_200"));

    // str_15 is not in the filter yet.
    TruthValue beforeInsert = RecordReaderImpl.evaluatePredicate(stats, leaf, filter);
    Assert.Equal(TruthValue.NO, beforeInsert);

    // Once it is inserted the filter can no longer rule it out.
    filter.addString("str_15");
    TruthValue afterInsert = RecordReaderImpl.evaluatePredicate(stats, leaf, filter);
    Assert.Equal(TruthValue.YES_NO, afterInsert);
}
// Checks that merging a second, equally sized filter into the first makes
// the first report the second filter's entries as well as its own.
public void testMerge()
{
    string[] ownEntries = { "bloo", "bloom fil", "bloom filter", "cuckoo filter" };
    string[] otherEntries = { "2_bloo", "2_bloom fil", "2_bloom filter", "2_cuckoo filter" };

    BloomFilter bf = new BloomFilter(10000);
    foreach (string entry in ownEntries)
    {
        bf.addString(entry);
    }

    BloomFilter bf2 = new BloomFilter(10000);
    foreach (string entry in otherEntries)
    {
        bf2.addString(entry);
    }

    // Before the merge only the filter's own entries are reported.
    foreach (string entry in ownEntries)
    {
        Assert.True(bf.testString(entry));
    }
    foreach (string entry in otherEntries)
    {
        Assert.False(bf.testString(entry));
    }

    bf.merge(bf2);

    // After the merge both sets of entries are reported.
    foreach (string entry in ownEntries)
    {
        Assert.True(bf.testString(entry));
    }
    foreach (string entry in otherEntries)
    {
        Assert.True(bf.testString(entry));
    }
}
// IN predicate on a DECIMAL column whose IN-list contains a null entry.
// The expected result is checked against statistics with and without nulls.
public void testNullsInBloomFilter()
{
    List<object> inList = new List<object>
    {
        HiveDecimal.Parse("15"),
        null,
        HiveDecimal.Parse("19")
    };
    PredicateLeaf leaf = TestSearchArgumentImpl.createPredicateLeaf(
        PredicateLeaf.Operator.IN,
        PredicateLeaf.Type.DECIMAL,
        "x",
        null,
        inList);

    // Populate with 20..999, which leaves 15 and 19 out.
    BloomFilter filter = new BloomFilter(10000);
    for (int value = 20; value <= 999; value++)
    {
        filter.addString(HiveDecimal.create(value).ToString());
    }

    // hasNull is false, so bloom filter should return NO
    ColumnStatistics stats = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200", false));
    Assert.Equal(TruthValue.NO, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));

    // hasNull is true, so bloom filter should return YES_NO_NULL
    stats = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200", true));
    Assert.Equal(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));

    // Inserting the non-null candidates keeps the outcome at YES_NO_NULL.
    filter.addString(HiveDecimal.create(19).ToString());
    Assert.Equal(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));

    filter.addString(HiveDecimal.create(15).ToString());
    Assert.Equal(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));
}
// Checks add/test on raw byte arrays: four fixed arrays of growing length
// are inserted one at a time, followed by COUNT randomly filled buffers.
public void testBloomFilterBytes()
{
    BloomFilter filter = new BloomFilter(10000);
    byte[][] samples =
    {
        new byte[] { 1, 2, 3 },
        new byte[] { 1, 2, 3, 4 },
        new byte[] { 1, 2, 3, 4, 5 },
        new byte[] { 1, 2, 3, 4, 5, 6 }
    };

    // Round 0 checks the empty filter; round k inserts samples[k - 1] first.
    // After each round exactly the first `inserted` samples must be reported.
    for (int inserted = 0; inserted <= samples.Length; inserted++)
    {
        if (inserted > 0)
        {
            filter.add(samples[inserted - 1]);
        }
        for (int j = 0; j < samples.Length; j++)
        {
            Assert.Equal(j < inserted, filter.test(samples[j]));
        }
    }

    // One buffer of COUNT bytes is refilled and re-inserted on every round.
    byte[] buffer = new byte[COUNT];
    for (int round = 0; round < COUNT; round++)
    {
        rand.NextBytes(buffer);
        filter.add(buffer);
    }

    // last value should be present
    Assert.True(filter.test(buffer));

    // most likely this value should not exist: zero the first five bytes
    for (int index = 0; index < 5; index++)
    {
        buffer[index] = 0;
    }
    Assert.False(filter.test(buffer));

    Assert.Equal(7800, filter.sizeInBytes());
}
/// <summary>
/// Writes summary statistics of a bloom filter as key/value pairs.
/// </summary>
/// <remarks>
/// Emits, in this order: <c>numHashFunctions</c>, <c>bitCount</c>,
/// <c>popCount</c> (number of set bits across the filter's bit set),
/// <c>loadFactor</c> (popCount / bitCount) and <c>expectedFpp</c>
/// (loadFactor raised to the number of hash functions).
/// </remarks>
/// <param name="writer">JSON writer that receives the key/value pairs</param>
/// <param name="bf">bloom filter to describe</param>
private static void writeBloomFilterStats(JsonWriter writer, BloomFilter bf)
{
    int totalBits = bf.getBitSize();

    // Sum the set bits of every word in the bit set.
    int setBits = 0;
    foreach (long word in bf.getBitSet())
    {
        setBits += Long.NumberOfOnes(word);
    }

    int hashCount = bf.getNumHashFunctions();

    // Single-precision division; the int divisor is widened to float.
    float fillRatio = (float)setBits / totalBits;
    float estimatedFpp = (float)Math.Pow(fillRatio, hashCount);

    writer.key("numHashFunctions").value(hashCount);
    writer.key("bitCount").value(totalBits);
    writer.key("popCount").value(setBits);
    writer.key("loadFactor").value(fillRatio);
    writer.key("expectedFpp").value(estimatedFpp);
}