// Checks that byte-sized keys (widened to long) are reported by the filter
// only after they have been added, and that the filter size is as expected.
public void testBloomFilterByte()
        {
            BloomFilter bf = new BloomFilter(10000);
            byte[] probes = { Byte.MinValue, 1, 2, Byte.MaxValue };

            // Nothing has been added yet, so no probe may be reported.
            foreach (byte probe in probes)
            {
                Assert.Equal(false, bf.testLong(probe));
            }

            // Add the probes one at a time; after each insertion exactly the
            // probes inserted so far must be reported as present.
            for (int added = 0; added < probes.Length; added++)
            {
                bf.addLong(probes[added]);
                for (int j = 0; j < probes.Length; j++)
                {
                    Assert.Equal(j <= added, bf.testLong(probes[j]));
                }
            }

            byte randVal = 0;
            for (int i = 0; i < COUNT; i++)
            {
                // byte is unsigned in C#: drawing from [0, Byte.MaxValue) could
                // produce 136, which is exactly unchecked((byte)-120) probed
                // below. Capping at SByte.MaxValue keeps the draws in [0, 127)
                // so the "absent" assertion cannot depend on the random seed.
                randVal = (byte)rand.Next(SByte.MaxValue);
                bf.addLong(randVal);
            }
            // last value should be present
            Assert.Equal(true, bf.testLong(randVal));
            // this value (136 as an unsigned byte) is outside the range of added
            // values, so barring a false positive it should not be reported
            Assert.Equal(false, bf.testLong(unchecked((byte)-120)));

            Assert.Equal(7800, bf.sizeInBytes());
        }
        // Checks that string keys are reported by the filter only after they
        // have been added, and pins the size of a 100000-entry filter.
        public void testBloomFilterString()
        {
            BloomFilter filter = new BloomFilter(100000);
            string[] probes = { "bloo", "bloom fil", "bloom filter", "cuckoo filter" };

            // Empty filter: none of the probes may be reported.
            foreach (string probe in probes)
            {
                Assert.Equal(false, filter.testString(probe));
            }

            // Insert one probe per round; only the probes inserted so far
            // may be reported as present.
            for (int added = 0; added < probes.Length; added++)
            {
                filter.addString(probes[added]);
                for (int j = 0; j < probes.Length; j++)
                {
                    Assert.Equal(j <= added, filter.testString(probes[j]));
                }
            }

            long lastRandom = 0;
            for (int round = 0; round < COUNT; round++)
            {
                lastRandom = rand.NextLong();
                string key = lastRandom.ToString(CultureInfo.InvariantCulture);
                filter.addString(key);
            }
            // the most recently added key must be found
            string lastKey = lastRandom.ToString(CultureInfo.InvariantCulture);
            Assert.Equal(true, filter.testString(lastKey));
            // a key that was most likely never added should be rejected
            string unlikelyKey = (-120L).ToString(CultureInfo.InvariantCulture);
            Assert.Equal(false, filter.testString(unlikelyKey));

            Assert.Equal(77944, filter.sizeInBytes());
        }
        // Checks that float keys (widened to double) are reported by the
        // filter only after they have been added.
        public void testBloomFilterFloat()
        {
            BloomFilter filter = new BloomFilter(10000);
            float[] probes = { Single.MinValue, 1.1f, 2.2f, Single.MaxValue };

            // Empty filter: none of the probes may be reported.
            foreach (float probe in probes)
            {
                Assert.Equal(false, filter.testDouble(probe));
            }

            // Insert one probe per round; only the probes inserted so far
            // may be reported as present.
            for (int added = 0; added < probes.Length; added++)
            {
                filter.addDouble(probes[added]);
                for (int j = 0; j < probes.Length; j++)
                {
                    Assert.Equal(j <= added, filter.testDouble(probes[j]));
                }
            }

            float lastRandom = 0;
            for (int round = 0; round < COUNT; round++)
            {
                lastRandom = rand.NextFloat();
                filter.addDouble(lastRandom);
            }
            // the most recently added value must be found
            Assert.Equal(true, filter.testDouble(lastRandom));
            // a value that was most likely never added should be rejected
            Assert.Equal(false, filter.testDouble(-120.2f));

            Assert.Equal(7800, filter.sizeInBytes());
        }
        // Checks that long keys, including the extreme Int64 values, are
        // reported by the filter only after they have been added.
        public void testBloomFilterLong()
        {
            BloomFilter filter = new BloomFilter(10000);
            long[] probes = { Int64.MinValue, 1, 2, Int64.MaxValue };

            // Empty filter: none of the probes may be reported.
            foreach (long probe in probes)
            {
                Assert.Equal(false, filter.testLong(probe));
            }

            // Insert one probe per round; only the probes inserted so far
            // may be reported as present.
            for (int added = 0; added < probes.Length; added++)
            {
                filter.addLong(probes[added]);
                for (int j = 0; j < probes.Length; j++)
                {
                    Assert.Equal(j <= added, filter.testLong(probes[j]));
                }
            }

            long lastRandom = 0;
            for (int round = 0; round < COUNT; round++)
            {
                lastRandom = rand.NextLong();
                filter.addLong(lastRandom);
            }
            // the most recently added value must be found
            Assert.Equal(true, filter.testLong(lastRandom));
            // a value that was most likely never added should be rejected
            Assert.Equal(false, filter.testLong(-120));

            Assert.Equal(7800, filter.sizeInBytes());
        }
Example #5
0
 /**
  * Folds the contents of another bloom filter into this one by handing its
  * bit set to this filter's bitSet.putAll.
  *
  * The other filter must be a different instance with the same number of
  * bits and the same number of hash functions; otherwise an
  * ArgumentException describing both filters is thrown.
  *
  * @param that - bloom filter to merge into this one
  */
 public void merge(BloomFilter that)
 {
     bool compatible = this != that
         && this.numBits == that.numBits
         && this.numHashFunctions == that.numHashFunctions;

     // Reject self-merges and filters with a different shape up front.
     if (!compatible)
     {
         throw new ArgumentException("BloomFilters are not compatible for merging." +
             " this - " + this.ToString() + " that - " + that.ToString());
     }

     this.bitSet.putAll(that.bitSet);
 }
Example #6
0
 /**
  * Create a tree writer for a single column.
  *
  * Sets up the optional PRESENT stream, the statistics holders, the row
  * index builders and, when requested for this column, the bloom filter
  * builders and stream.
  *
  * @param columnId the column id of the column to write
  * @param inspector the object inspector to use
  * @param schema the row schema
  * @param streamFactory limited access to the Writer's data.
  * @param nullable can the value be null?
  */
 protected TreeWriter(
     int columnId,
     ObjectInspector inspector,
     TypeDescription schema,
     StreamFactory streamFactory,
     bool nullable)
 {
     this.streamFactory = streamFactory;
     this.isCompressed = streamFactory.isCompressed();
     this.id = columnId;
     this.inspector = inspector;

     // A PRESENT stream (and its bit writer) is only created for nullable columns.
     if (!nullable)
     {
         isPresent = null;
     }
     else
     {
         isPresentOutStream = streamFactory.createStream(id,
             OrcProto.Stream.Types.Kind.PRESENT);
         isPresent = new BitFieldWriter(isPresentOutStream, 1);
     }
     this.foundNulls = false;

     // Whether this particular column was selected for bloom filters.
     createBloomFilter = streamFactory.getBloomFilterColumns()[columnId];

     // Separate statistics holders for index, stripe and file level.
     indexStatistics = ColumnStatisticsImpl.create(schema);
     stripeColStatistics = ColumnStatisticsImpl.create(schema);
     fileStatistics = ColumnStatisticsImpl.create(schema);

     childrenWriters = new TreeWriter[0];

     rowIndex = OrcProto.RowIndex.CreateBuilder();
     rowIndexEntry = OrcProto.RowIndexEntry.CreateBuilder();
     rowIndexPosition = new RowIndexPositionRecorder(rowIndexEntry);
     stripeStatsBuilders = new List<OrcProto.StripeStatistics.Builder>();

     // The ROW_INDEX stream exists only when the factory builds an index.
     if (!streamFactory.buildIndex())
     {
         rowIndexStream = null;
     }
     else
     {
         rowIndexStream = streamFactory.createStream(id, OrcProto.Stream.Types.Kind.ROW_INDEX);
     }

     // Bloom filter state is either fully initialised or fully null.
     if (!createBloomFilter)
     {
         bloomFilterEntry = null;
         bloomFilterIndex = null;
         bloomFilterStream = null;
         bloomFilter = null;
     }
     else
     {
         bloomFilterEntry = OrcProto.BloomFilter.CreateBuilder();
         bloomFilterIndex = OrcProto.BloomFilterIndex.CreateBuilder();
         bloomFilterStream = streamFactory.createStream(id, OrcProto.Stream.Types.Kind.BLOOM_FILTER);
         bloomFilter = new BloomFilter(streamFactory.getRowIndexStride(), streamFactory.getBloomFilterFPP());
     }
 }
        // NULL_SAFE_EQUALS on a LONG literal: the predicate evaluates to NO
        // while the literal is missing from the bloom filter and to YES_NO
        // once it has been added.
        public void testIntNullSafeEqualsBloomFilter()
        {
            PredicateLeaf leaf = TestSearchArgumentImpl.createPredicateLeaf(
                PredicateLeaf.Operator.NULL_SAFE_EQUALS,
                PredicateLeaf.Type.LONG,
                "x",
                15L,
                null);

            // Populate with 20..999, leaving the literal 15 out.
            BloomFilter filter = new BloomFilter(10000);
            for (int n = 20; n < 1000; n++)
            {
                filter.addLong(n);
            }

            ColumnStatistics stats = ColumnStatisticsImpl.deserialize(createIntStats(10, 100));
            Assert.Equal(TruthValue.NO, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));

            // With the literal present the filter can no longer rule it out.
            filter.addLong(15);
            Assert.Equal(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));
        }
        // IN on STRING literals: NO_NULL while none of the literals is in the
        // bloom filter, YES_NO_NULL as soon as at least one of them is.
        public void testStringInBloomFilter()
        {
            List<object> literals = new List<object>();
            literals.Add("str_15");
            literals.Add("str_19");
            PredicateLeaf leaf = TestSearchArgumentImpl.createPredicateLeaf(
                PredicateLeaf.Operator.IN,
                PredicateLeaf.Type.STRING,
                "x",
                null,
                literals);

            // Populate with str_20..str_999, leaving both literals out.
            BloomFilter filter = new BloomFilter(10000);
            for (int n = 20; n < 1000; n++)
            {
                filter.addString("str_" + n);
            }

            ColumnStatistics stats = ColumnStatisticsImpl.deserialize(createStringStats("str_10", "str_200"));
            Assert.Equal(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));

            // One literal present is enough to change the answer.
            filter.addString("str_19");
            Assert.Equal(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));

            // Adding the second literal leaves the answer unchanged.
            filter.addString("str_15");
            Assert.Equal(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));
        }
        // EQUALS on a FLOAT-typed literal: NO_NULL while the literal is
        // missing from the bloom filter, YES_NO_NULL once it has been added.
        public void testDoubleEqualsBloomFilter()
        {
            PredicateLeaf leaf = TestSearchArgumentImpl.createPredicateLeaf(
                PredicateLeaf.Operator.EQUALS,
                PredicateLeaf.Type.FLOAT,
                "x",
                15.0,
                null);

            // Populate with 20..999 (as doubles), leaving 15.0 out.
            BloomFilter filter = new BloomFilter(10000);
            for (int n = 20; n < 1000; n++)
            {
                filter.addDouble(n);
            }

            ColumnStatistics stats = ColumnStatisticsImpl.deserialize(createDoubleStats(10.0, 100.0));
            Assert.Equal(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));

            // With the literal present the filter can no longer rule it out.
            filter.addDouble(15.0);
            Assert.Equal(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));
        }
        // IN on LONG literals: NO_NULL while none of the literals is in the
        // bloom filter, YES_NO_NULL as soon as at least one of them is.
        public void testIntInBloomFilter()
        {
            List<object> literals = new List<object>();
            literals.Add(15L);
            literals.Add(19L);
            PredicateLeaf leaf = TestSearchArgumentImpl.createPredicateLeaf(
                PredicateLeaf.Operator.IN,
                PredicateLeaf.Type.LONG,
                "x",
                null,
                literals);

            // Populate with 20..999, leaving both literals out.
            BloomFilter filter = new BloomFilter(10000);
            for (int n = 20; n < 1000; n++)
            {
                filter.addLong(n);
            }

            ColumnStatistics stats = ColumnStatisticsImpl.deserialize(createIntStats(10, 100));
            Assert.Equal(TruthValue.NO_NULL, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));

            // One literal present is enough to change the answer.
            filter.addLong(19);
            Assert.Equal(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));

            // Adding the second literal leaves the answer unchanged.
            filter.addLong(15);
            Assert.Equal(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));
        }
        // NULL_SAFE_EQUALS on a DECIMAL literal, with decimals stored in the
        // bloom filter by their string form: NO while the literal is missing,
        // YES_NO once it has been added.
        public void testDecimalNullSafeEqualsBloomFilter()
        {
            PredicateLeaf leaf = TestSearchArgumentImpl.createPredicateLeaf(
                PredicateLeaf.Operator.NULL_SAFE_EQUALS,
                PredicateLeaf.Type.DECIMAL,
                "x",
                HiveDecimal.Parse("15"),
                null);

            // Populate with the string forms of 20..999, leaving 15 out.
            BloomFilter filter = new BloomFilter(10000);
            for (int n = 20; n < 1000; n++)
            {
                filter.addString(HiveDecimal.create(n).ToString());
            }

            ColumnStatistics stats = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200"));
            Assert.Equal(TruthValue.NO, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));

            // With the literal present the filter can no longer rule it out.
            filter.addString(HiveDecimal.create(15).ToString());
            Assert.Equal(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));
        }
        // NULL_SAFE_EQUALS on a TIMESTAMP literal, with timestamps stored in
        // the bloom filter through their Milliseconds value: NO while the
        // literal is missing, YES_NO once it has been added.
        public void testTimestampNullSafeEqualsBloomFilter()
        {
            PredicateLeaf leaf = TestSearchArgumentImpl.createPredicateLeaf(
                PredicateLeaf.Operator.NULL_SAFE_EQUALS,
                PredicateLeaf.Type.TIMESTAMP,
                "x",
                new Timestamp(15),
                null);

            // Populate with timestamps built from 20..999, leaving 15 out.
            BloomFilter filter = new BloomFilter(10000);
            for (int n = 20; n < 1000; n++)
            {
                filter.addLong((new Timestamp(n)).Milliseconds);
            }

            ColumnStatistics stats = ColumnStatisticsImpl.deserialize(createTimestampStats(10, 100));
            Assert.Equal(TruthValue.NO, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));

            // With the literal present the filter can no longer rule it out.
            filter.addLong((new Timestamp(15)).Milliseconds);
            Assert.Equal(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));
        }
        // NULL_SAFE_EQUALS on a STRING literal: NO while the literal is
        // missing from the bloom filter, YES_NO once it has been added.
        public void testStringNullSafeEqualsBloomFilter()
        {
            PredicateLeaf leaf = TestSearchArgumentImpl.createPredicateLeaf(
                PredicateLeaf.Operator.NULL_SAFE_EQUALS,
                PredicateLeaf.Type.STRING,
                "x",
                "str_15",
                null);

            // Populate with str_20..str_999, leaving str_15 out.
            BloomFilter filter = new BloomFilter(10000);
            for (int n = 20; n < 1000; n++)
            {
                filter.addString("str_" + n);
            }

            ColumnStatistics stats = ColumnStatisticsImpl.deserialize(createStringStats("str_10", "str_200"));
            Assert.Equal(TruthValue.NO, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));

            // With the literal present the filter can no longer rule it out.
            filter.addString("str_15");
            Assert.Equal(TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));
        }
        // Checks that merging a second filter into the first makes the first
        // report the second filter's keys while still reporting its own.
        public void testMerge()
        {
            string[] ownKeys = { "bloo", "bloom fil", "bloom filter", "cuckoo filter" };
            string[] otherKeys = { "2_bloo", "2_bloom fil", "2_bloom filter", "2_cuckoo filter" };

            BloomFilter target = new BloomFilter(10000);
            foreach (string key in ownKeys)
            {
                target.addString(key);
            }

            BloomFilter source = new BloomFilter(10000);
            foreach (string key in otherKeys)
            {
                source.addString(key);
            }

            // Before the merge the target only knows its own keys.
            foreach (string key in ownKeys)
            {
                Assert.Equal(true, target.testString(key));
            }
            foreach (string key in otherKeys)
            {
                Assert.Equal(false, target.testString(key));
            }

            target.merge(source);

            // After the merge both key sets are reported by the target.
            foreach (string key in ownKeys)
            {
                Assert.Equal(true, target.testString(key));
            }
            foreach (string key in otherKeys)
            {
                Assert.Equal(true, target.testString(key));
            }
        }
        // IN on DECIMAL literals that include a null entry: the result
        // depends on the hasNull flag passed to createDecimalStats as well as
        // on the bloom filter contents.
        public void testNullsInBloomFilter()
        {
            List<object> literals = new List<object>();
            literals.Add(HiveDecimal.Parse("15"));
            literals.Add(null);
            literals.Add(HiveDecimal.Parse("19"));
            PredicateLeaf leaf = TestSearchArgumentImpl.createPredicateLeaf(
                PredicateLeaf.Operator.IN,
                PredicateLeaf.Type.DECIMAL,
                "x",
                null,
                literals);

            // Populate with the string forms of 20..999, leaving 15 and 19 out.
            BloomFilter filter = new BloomFilter(10000);
            for (int n = 20; n < 1000; n++)
            {
                filter.addString(HiveDecimal.create(n).ToString());
            }

            // Stats created with hasNull == false: evaluation yields NO.
            ColumnStatistics stats = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200", false));
            Assert.Equal(TruthValue.NO, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));

            // Stats created with hasNull == true: evaluation yields YES_NO_NULL.
            stats = ColumnStatisticsImpl.deserialize(createDecimalStats("10", "200", true));
            Assert.Equal(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));

            // Adding the non-null literals keeps the answer at YES_NO_NULL.
            filter.addString(HiveDecimal.create(19).ToString());
            Assert.Equal(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));

            filter.addString(HiveDecimal.create(15).ToString());
            Assert.Equal(TruthValue.YES_NO_NULL, RecordReaderImpl.evaluatePredicate(stats, leaf, filter));
        }
        // Checks that raw byte-array keys are reported by the filter only
        // after they have been added.
        public void testBloomFilterBytes()
        {
            BloomFilter filter = new BloomFilter(10000);
            byte[][] probes =
            {
                new byte[] { 1, 2, 3 },
                new byte[] { 1, 2, 3, 4 },
                new byte[] { 1, 2, 3, 4, 5 },
                new byte[] { 1, 2, 3, 4, 5, 6 }
            };

            // Empty filter: none of the probes may be reported.
            foreach (byte[] probe in probes)
            {
                Assert.Equal(false, filter.test(probe));
            }

            // Insert one probe per round; only the probes inserted so far
            // may be reported as present.
            for (int added = 0; added < probes.Length; added++)
            {
                filter.add(probes[added]);
                for (int j = 0; j < probes.Length; j++)
                {
                    Assert.Equal(j <= added, filter.test(probes[j]));
                }
            }

            // One buffer of COUNT bytes is refilled with random data and
            // added on every round.
            byte[] buffer = new byte[COUNT];
            for (int round = 0; round < COUNT; round++)
            {
                rand.NextBytes(buffer);
                filter.add(buffer);
            }
            // the most recently added content must be found
            Assert.Equal(true, filter.test(buffer));
            // zeroing the first five bytes gives a key that was most likely never added
            for (int k = 0; k < 5; k++)
            {
                buffer[k] = 0;
            }
            Assert.Equal(false, filter.test(buffer));

            Assert.Equal(7800, filter.sizeInBytes());
        }
Example #17
0
 /**
  * Writes summary figures for a bloom filter to the JSON writer: the number
  * of hash functions, the bit size, the number of set bits, the load factor
  * (set bits / bit size) and the expected false positive probability
  * (load factor raised to the number of hash functions).
  *
  * @param writer the JSON writer receiving the key/value pairs
  * @param bf the bloom filter to summarise
  */
 private static void writeBloomFilterStats(JsonWriter writer, BloomFilter bf)
 {
     int totalBits = bf.getBitSize();

     // Count the set bits across every word of the filter's bit set.
     int setBits = 0;
     foreach (long word in bf.getBitSet())
     {
         setBits += Long.NumberOfOnes(word);
     }

     int hashFunctions = bf.getNumHashFunctions();
     float fillRatio = (float)setBits / totalBits;
     float estimatedFpp = (float)Math.Pow(fillRatio, hashFunctions);

     writer.key("numHashFunctions").value(hashFunctions);
     writer.key("bitCount").value(totalBits);
     writer.key("popCount").value(setBits);
     writer.key("loadFactor").value(fillRatio);
     writer.key("expectedFpp").value(estimatedFpp);
 }