/**
 * Recursively translate a Hive ExprNodeDesc tree into our ExpressionTree,
 * feeding the pieces to the builder as we walk it.
 * @param expression the Hive ExprNodeDesc to translate
 */
private void parse(ExprNodeDesc expression) {
    // Anything that is not exactly a generic-function node is a special case:
    // a bare boolean column reference becomes a truth test ("col = true"),
    // and everything else is recorded as an unknown ("maybe") value.
    if (expression.GetType() != typeof(ExprNodeGenericFuncDesc)) {
        ExprNodeColumnDesc columnDesc = expression as ExprNodeColumnDesc;
        if (columnDesc != null && columnDesc.getTypeString().Equals("boolean")) {
            builder.equals(columnDesc.getColumn(), PredicateLeaf.Type.BOOLEAN, true);
        } else {
            builder.literal(TruthValue.YES_NO_NULL);
        }
        return;
    }

    ExprNodeGenericFuncDesc funcDesc = (ExprNodeGenericFuncDesc)expression;
    Type udfClass = funcDesc.getGenericUDF().GetType();

    // Logical connectives recurse into their children.
    if (udfClass == typeof(GenericUDFOPOr)) {
        builder.startOr();
        addChildren(funcDesc);
        builder.end();
        return;
    }
    if (udfClass == typeof(GenericUDFOPAnd)) {
        builder.startAnd();
        addChildren(funcDesc);
        builder.end();
        return;
    }
    if (udfClass == typeof(GenericUDFOPNot)) {
        builder.startNot();
        addChildren(funcDesc);
        builder.end();
        return;
    }

    // Comparison operators map onto predicate leaves; operators without a
    // direct leaf form are expressed as the negation of their complement.
    if (udfClass == typeof(GenericUDFOPEqual)) {
        createLeaf(PredicateLeaf.Operator.EQUALS, funcDesc);
        return;
    }
    if (udfClass == typeof(GenericUDFOPNotEqual)) {
        // a != b  ==>  NOT (a = b)
        builder.startNot();
        createLeaf(PredicateLeaf.Operator.EQUALS, funcDesc);
        builder.end();
        return;
    }
    if (udfClass == typeof(GenericUDFOPEqualNS)) {
        createLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, funcDesc);
        return;
    }
    if (udfClass == typeof(GenericUDFOPGreaterThan)) {
        // a > b  ==>  NOT (a <= b)
        builder.startNot();
        createLeaf(PredicateLeaf.Operator.LESS_THAN_EQUALS, funcDesc);
        builder.end();
        return;
    }
    if (udfClass == typeof(GenericUDFOPEqualOrGreaterThan)) {
        // a >= b  ==>  NOT (a < b)
        builder.startNot();
        createLeaf(PredicateLeaf.Operator.LESS_THAN, funcDesc);
        builder.end();
        return;
    }
    if (udfClass == typeof(GenericUDFOPLessThan)) {
        createLeaf(PredicateLeaf.Operator.LESS_THAN, funcDesc);
        return;
    }
    if (udfClass == typeof(GenericUDFOPEqualOrLessThan)) {
        createLeaf(PredicateLeaf.Operator.LESS_THAN_EQUALS, funcDesc);
        return;
    }
    if (udfClass == typeof(GenericUDFIn)) {
        createLeaf(PredicateLeaf.Operator.IN, funcDesc, 0);
        return;
    }
    if (udfClass == typeof(GenericUDFBetween)) {
        createLeaf(PredicateLeaf.Operator.BETWEEN, funcDesc, 1);
        return;
    }
    if (udfClass == typeof(GenericUDFOPNull)) {
        createLeaf(PredicateLeaf.Operator.IS_NULL, funcDesc, 0);
        return;
    }
    if (udfClass == typeof(GenericUDFOPNotNull)) {
        // a IS NOT NULL  ==>  NOT (a IS NULL)
        builder.startNot();
        createLeaf(PredicateLeaf.Operator.IS_NULL, funcDesc, 0);
        builder.end();
        return;
    }

    // We didn't understand the UDF, so mark its truth value as unknown.
    builder.literal(TruthValue.YES_NO_NULL);
}
/**
 * Verifies ORC split elimination with a compound (AND) search argument over
 * two columns (userid, subtype): getSplits is re-run with progressively
 * different predicates and the surviving split count is asserted each time.
 *
 * NOTE(review): the expected split counts depend on the stripe layout
 * produced by writeData() with the 10000 stripe-size arguments below —
 * confirm against writeData when changing either.
 */
public void testSplitEliminationComplexExpr() {
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(AllTypesRow));
    using (Stream file = File.OpenWrite(testFilePath))
    using (Writer writer = OrcFile.createWriter(testFilePath, file, conf, inspector,
        100000, CompressionKind.NONE, 10000, 10000)) {
        writeData(writer);
    }
    // Small min / large max split size so that the split count is driven by
    // predicate elimination rather than by the size limits.
    conf.set(ShimLoader.getHadoopShims().getHadoopConfNames().get("MAPREDMINSPLITSIZE"), "1000");
    conf.set(ShimLoader.getHadoopShims().getHadoopConfNames().get("MAPREDMAXSPLITSIZE"), "150000");
    InputFormat @in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.ToString());

    // predicate expression: userid <= 100 and subtype <= 1000.0
    GenericUDF udf = new GenericUDFOPEqualOrLessThan();
    List<ExprNodeDesc> childExpr = new List<ExprNodeDesc>();
    ExprNodeColumnDesc col = new ExprNodeColumnDesc(typeof(long), "userid", "T", false);
    ExprNodeConstantDesc con = new ExprNodeConstantDesc(100);
    childExpr.Add(col);
    childExpr.Add(con);
    ExprNodeGenericFuncDesc en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);

    GenericUDF udf1 = new GenericUDFOPEqualOrLessThan();
    List<ExprNodeDesc> childExpr1 = new List<ExprNodeDesc>();
    ExprNodeColumnDesc col1 = new ExprNodeColumnDesc(typeof(double), "subtype", "T", false);
    ExprNodeConstantDesc con1 = new ExprNodeConstantDesc(1000.0);
    childExpr1.Add(col1);
    childExpr1.Add(con1);
    ExprNodeGenericFuncDesc en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);

    // AND node combining the two comparisons.
    GenericUDF udf2 = new GenericUDFOPAnd();
    List<ExprNodeDesc> childExpr2 = new List<ExprNodeDesc>();
    childExpr2.Add(en);
    childExpr2.Add(en1);
    ExprNodeGenericFuncDesc en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
    string sargStr = Utilities.serializeExpression(en2);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    InputSplit[] splits = @in.getSplits(conf, 1);
    Assert.Equal(2, splits.Length);

    // predicate: userid <= 2 and subtype <= 0.0
    con = new ExprNodeConstantDesc(2);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    con1 = new ExprNodeConstantDesc(0.0);
    childExpr1[1] = con1;
    en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
    childExpr2[0] = en;
    childExpr2[1] = en1;
    en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
    sargStr = Utilities.serializeExpression(en2);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    // no stripe will satisfy the predicate
    Assert.Equal(0, splits.Length);

    // predicate: userid <= 2 and subtype <= 1.0
    con = new ExprNodeConstantDesc(2);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    con1 = new ExprNodeConstantDesc(1.0);
    childExpr1[1] = con1;
    en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
    childExpr2[0] = en;
    childExpr2[1] = en1;
    en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
    sargStr = Utilities.serializeExpression(en2);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    // only first stripe will satisfy condition and hence single split
    Assert.Equal(1, splits.Length);

    // predicate: userid = 13 and subtype <= 80.0
    udf = new GenericUDFOPEqual();
    con = new ExprNodeConstantDesc(13);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    con1 = new ExprNodeConstantDesc(80.0);
    childExpr1[1] = con1;
    en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
    childExpr2[0] = en;
    childExpr2[1] = en1;
    en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
    sargStr = Utilities.serializeExpression(en2);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    // first two stripes will satisfy condition and hence two splits
    Assert.Equal(2, splits.Length);

    // predicate: userid = 13 and subtype = 80.0
    udf = new GenericUDFOPEqual();
    con = new ExprNodeConstantDesc(13);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    udf1 = new GenericUDFOPEqual();
    con1 = new ExprNodeConstantDesc(80.0);
    childExpr1[1] = con1;
    en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
    childExpr2[0] = en;
    childExpr2[1] = en1;
    en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
    sargStr = Utilities.serializeExpression(en2);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    // only the second stripe will satisfy the condition and hence a single split
    Assert.Equal(1, splits.Length);
}
/**
 * Verifies ORC split elimination when MAPREDMAXSPLITSIZE is small (5000):
 * a single "userid <= N" predicate is tightened/loosened step by step and
 * the surviving split count asserted after each getSplits call.
 *
 * NOTE(review): the expected counts (5/0/1/2/3/4/5) depend on the userid
 * values and stripe layout produced by writeData() — confirm against it.
 */
public void testSplitEliminationSmallMaxSplit() {
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(AllTypesRow));
    using (Stream file = File.OpenWrite(testFilePath))
    using (Writer writer = OrcFile.createWriter(testFilePath, file, conf, inspector,
        100000, CompressionKind.NONE, 10000, 10000)) {
        writeData(writer);
    }
    // Small max split size so each surviving region yields its own split.
    conf.set(ShimLoader.getHadoopShims().getHadoopConfNames().get("MAPREDMINSPLITSIZE"), "1000");
    conf.set(ShimLoader.getHadoopShims().getHadoopConfNames().get("MAPREDMAXSPLITSIZE"), "5000");
    InputFormat @in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.ToString());

    // predicate: userid <= 100 — nothing eliminated, all 5 splits survive
    GenericUDF udf = new GenericUDFOPEqualOrLessThan();
    List<ExprNodeDesc> childExpr = new List<ExprNodeDesc>();
    ExprNodeColumnDesc col = new ExprNodeColumnDesc(typeof(long), "userid", "T", false);
    ExprNodeConstantDesc con = new ExprNodeConstantDesc(100);
    childExpr.Add(col);
    childExpr.Add(con);
    ExprNodeGenericFuncDesc en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    string sargStr = Utilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    InputSplit[] splits = @in.getSplits(conf, 1);
    Assert.Equal(5, splits.Length);

    // predicate: userid <= 1 — everything eliminated
    con = new ExprNodeConstantDesc(1);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = Utilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    Assert.Equal(0, splits.Length);

    // predicate: userid <= 2
    con = new ExprNodeConstantDesc(2);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = Utilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    Assert.Equal(1, splits.Length);

    // predicate: userid <= 5
    con = new ExprNodeConstantDesc(5);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = Utilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    Assert.Equal(2, splits.Length);

    // predicate: userid <= 13
    con = new ExprNodeConstantDesc(13);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = Utilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    Assert.Equal(3, splits.Length);

    // predicate: userid <= 29
    con = new ExprNodeConstantDesc(29);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = Utilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    Assert.Equal(4, splits.Length);

    // predicate: userid <= 70 — back to all 5 splits
    con = new ExprNodeConstantDesc(70);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = Utilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    Assert.Equal(5, splits.Length);
}
/**
 * Verifies ORC split elimination when MAPREDMAXSPLITSIZE is small (5000):
 * a single "userid <= N" predicate is tightened/loosened step by step and
 * the surviving split count asserted after each getSplits call.
 *
 * NOTE(review): this method is byte-for-byte identical to the
 * testSplitEliminationSmallMaxSplit defined earlier in this chunk. If both
 * live in the same class this is a duplicate-member compile error; if they
 * belong to different classes, consider sharing the body. Confirm intent.
 */
public void testSplitEliminationSmallMaxSplit() {
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(AllTypesRow));
    using (Stream file = File.OpenWrite(testFilePath))
    using (Writer writer = OrcFile.createWriter(testFilePath, file, conf, inspector,
        100000, CompressionKind.NONE, 10000, 10000)) {
        writeData(writer);
    }
    // Small max split size so each surviving region yields its own split.
    conf.set(ShimLoader.getHadoopShims().getHadoopConfNames().get("MAPREDMINSPLITSIZE"), "1000");
    conf.set(ShimLoader.getHadoopShims().getHadoopConfNames().get("MAPREDMAXSPLITSIZE"), "5000");
    InputFormat @in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.ToString());

    // predicate: userid <= 100 — nothing eliminated, all 5 splits survive
    GenericUDF udf = new GenericUDFOPEqualOrLessThan();
    List<ExprNodeDesc> childExpr = new List<ExprNodeDesc>();
    ExprNodeColumnDesc col = new ExprNodeColumnDesc(typeof(long), "userid", "T", false);
    ExprNodeConstantDesc con = new ExprNodeConstantDesc(100);
    childExpr.Add(col);
    childExpr.Add(con);
    ExprNodeGenericFuncDesc en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    string sargStr = Utilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    InputSplit[] splits = @in.getSplits(conf, 1);
    Assert.Equal(5, splits.Length);

    // predicate: userid <= 1 — everything eliminated
    con = new ExprNodeConstantDesc(1);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = Utilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    Assert.Equal(0, splits.Length);

    // predicate: userid <= 2
    con = new ExprNodeConstantDesc(2);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = Utilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    Assert.Equal(1, splits.Length);

    // predicate: userid <= 5
    con = new ExprNodeConstantDesc(5);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = Utilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    Assert.Equal(2, splits.Length);

    // predicate: userid <= 13
    con = new ExprNodeConstantDesc(13);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = Utilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    Assert.Equal(3, splits.Length);

    // predicate: userid <= 29
    con = new ExprNodeConstantDesc(29);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = Utilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    Assert.Equal(4, splits.Length);

    // predicate: userid <= 70 — back to all 5 splits
    con = new ExprNodeConstantDesc(70);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = Utilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    Assert.Equal(5, splits.Length);
}
/**
 * Verifies ORC split elimination with a compound (AND) search argument over
 * two columns (userid, subtype): getSplits is re-run with progressively
 * different predicates and the surviving split count is asserted each time.
 *
 * NOTE(review): this method is a near-verbatim duplicate of the
 * testSplitEliminationComplexExpr defined earlier in this chunk. If both
 * live in the same class this is a duplicate-member compile error; confirm
 * whether one copy should be removed.
 */
public void testSplitEliminationComplexExpr() {
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(typeof(AllTypesRow));
    using (Stream file = File.OpenWrite(testFilePath))
    using (Writer writer = OrcFile.createWriter(testFilePath, file, conf, inspector,
        100000, CompressionKind.NONE, 10000, 10000)) {
        writeData(writer);
    }
    // Small min / large max split size so that the split count is driven by
    // predicate elimination rather than by the size limits.
    conf.set(ShimLoader.getHadoopShims().getHadoopConfNames().get("MAPREDMINSPLITSIZE"), "1000");
    conf.set(ShimLoader.getHadoopShims().getHadoopConfNames().get("MAPREDMAXSPLITSIZE"), "150000");
    InputFormat @in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.ToString());

    // predicate expression: userid <= 100 and subtype <= 1000.0
    GenericUDF udf = new GenericUDFOPEqualOrLessThan();
    List<ExprNodeDesc> childExpr = new List<ExprNodeDesc>();
    ExprNodeColumnDesc col = new ExprNodeColumnDesc(typeof(long), "userid", "T", false);
    ExprNodeConstantDesc con = new ExprNodeConstantDesc(100);
    childExpr.Add(col);
    childExpr.Add(con);
    ExprNodeGenericFuncDesc en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);

    GenericUDF udf1 = new GenericUDFOPEqualOrLessThan();
    List<ExprNodeDesc> childExpr1 = new List<ExprNodeDesc>();
    ExprNodeColumnDesc col1 = new ExprNodeColumnDesc(typeof(double), "subtype", "T", false);
    ExprNodeConstantDesc con1 = new ExprNodeConstantDesc(1000.0);
    childExpr1.Add(col1);
    childExpr1.Add(con1);
    ExprNodeGenericFuncDesc en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);

    // AND node combining the two comparisons.
    GenericUDF udf2 = new GenericUDFOPAnd();
    List<ExprNodeDesc> childExpr2 = new List<ExprNodeDesc>();
    childExpr2.Add(en);
    childExpr2.Add(en1);
    ExprNodeGenericFuncDesc en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
    string sargStr = Utilities.serializeExpression(en2);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    InputSplit[] splits = @in.getSplits(conf, 1);
    Assert.Equal(2, splits.Length);

    // predicate: userid <= 2 and subtype <= 0.0
    con = new ExprNodeConstantDesc(2);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    con1 = new ExprNodeConstantDesc(0.0);
    childExpr1[1] = con1;
    en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
    childExpr2[0] = en;
    childExpr2[1] = en1;
    en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
    sargStr = Utilities.serializeExpression(en2);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    // no stripe will satisfy the predicate
    Assert.Equal(0, splits.Length);

    // predicate: userid <= 2 and subtype <= 1.0
    con = new ExprNodeConstantDesc(2);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    con1 = new ExprNodeConstantDesc(1.0);
    childExpr1[1] = con1;
    en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
    childExpr2[0] = en;
    childExpr2[1] = en1;
    en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
    sargStr = Utilities.serializeExpression(en2);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    // only first stripe will satisfy condition and hence single split
    Assert.Equal(1, splits.Length);

    // predicate: userid = 13 and subtype <= 80.0
    udf = new GenericUDFOPEqual();
    con = new ExprNodeConstantDesc(13);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    con1 = new ExprNodeConstantDesc(80.0);
    childExpr1[1] = con1;
    en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
    childExpr2[0] = en;
    childExpr2[1] = en1;
    en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
    sargStr = Utilities.serializeExpression(en2);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    // first two stripes will satisfy condition and hence two splits
    Assert.Equal(2, splits.Length);

    // predicate: userid = 13 and subtype = 80.0
    udf = new GenericUDFOPEqual();
    con = new ExprNodeConstantDesc(13);
    childExpr[1] = con;
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    udf1 = new GenericUDFOPEqual();
    con1 = new ExprNodeConstantDesc(80.0);
    childExpr1[1] = con1;
    en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
    childExpr2[0] = en;
    childExpr2[1] = en1;
    en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
    sargStr = Utilities.serializeExpression(en2);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = @in.getSplits(conf, 1);
    // only the second stripe will satisfy the condition and hence a single split
    Assert.Equal(1, splits.Length);
}