/// <summary>Reads the "name" column, reverses it via <c>Reverse</c>, and writes the
/// result to the "reversed" output column.</summary>
public override IRow Process(IRow input, IUpdatableRow output)
{
    string name = input.Get<string>("name");
    output.Set<string>("reversed", Reverse(name));
    return output.AsReadOnly();
}
/// https://docs.microsoft.com/en-us/azure/data-lake-analytics/data-lake-analytics-u-sql-programmability-guide#use-user-defined-extractors
/// <summary>Extract is called at least once per vertex</summary>
/// <param name="input">Wrapper for a Stream</param>
/// <param name="output">IUpdatableRow uses a mutable builder pattern --
/// set individual fields with IUpdatableRow.Set,
/// then build an immutable IRow by calling IUpdatableRow.AsReadOnly.</param>
/// <returns>A sequence of IRows, one per XML element matching <c>this.elementName</c>.</returns>
public override IEnumerable <IRow> Extract(IUnstructuredReader input, IUpdatableRow output)
{
    // Use XmlReader to stream the XML and keep memory usage to a minimum
    // (no full DOM is ever built).
    using (XmlReader reader = XmlReader.Create(input.BaseStream))
    {
        reader.MoveToContent();

        // Forward the reader to the next element with the configured name.
        while (reader.ReadToFollowing(this.elementName))
        {
            // Decouple from the outer reader position with a new subtree reader.
            // This prevents reader.ReadToFollowing() from skipping rows, since the
            // outer reader is not forwarded by ReadOuterXml() on the subtree.
            using (XmlReader subtreeReader = reader.ReadSubtree())
            {
                subtreeReader.MoveToContent();

                // Replace CRLF, CR and LF characters by a space within the XML so the
                // serialized element fits in one output row.
                output.Set <string>(0, XElement.Parse(subtreeReader.ReadOuterXml()).
                                    ToString(SaveOptions.DisableFormatting).
                                    Replace("\r\n", " ").Replace('\n', ' ').Replace('\r', ' '));

                // Build an immutable IRow from the updatable builder.
                yield return(output.AsReadOnly());
            }
        }
    }
}
/// <summary>
/// Reads the next JSON token and, when the column is marked required (projected),
/// stores it as a nullable double in the column described by <paramref name="columnInfo"/>.
/// The token is always consumed, even when the column is not projected.
/// </summary>
/// <param name="jsonReader">Reader positioned just before the property value.</param>
/// <param name="output">Row being built.</param>
/// <param name="columnInfo">Target column index and projection flag.</param>
/// <exception cref="Exception">Thrown when the token is neither numeric nor null.</exception>
public static void ExtractPropertyDoubleOpt(JsonTextReader jsonReader, IUpdatableRow output, ColumnInfo columnInfo)
{
    jsonReader.Read();

    if (!columnInfo.IsRequired)
    {
        return; // token consumed, column not projected
    }

    switch (jsonReader.TokenType)
    {
        case JsonToken.Integer:
            // Fix: store as double?, not float -- the Null branch below sets double?,
            // so the column type is nullable double; a float value would mismatch.
            output.Set(columnInfo.Idx, (double?)(long)jsonReader.Value);
            break;

        case JsonToken.Float:
            output.Set(columnInfo.Idx, (double?)(double)jsonReader.Value);
            break;

        case JsonToken.Null:
            output.Set(columnInfo.Idx, (double?)null);
            break;

        default:
            throw new Exception("wrong data type");
    }
}
/// <summary>Parses the input stream as a single JSON object and yields one row per
/// object selected by <c>rowpath</c>; empty or non-object input yields nothing.</summary>
protected virtual IEnumerable <IRow> Extract(Stream inputStream, IUpdatableRow output)
{
    // Json.Net streaming reader over the raw input.
    using (var jsonReader = new JsonTextReader(new StreamReader(inputStream)))
    {
        // Bail out on an empty stream, or one that does not start with an object.
        if (!jsonReader.Read() || jsonReader.TokenType != JsonToken.StartObject)
        {
            yield break;
        }

        var root = JToken.Load(jsonReader);

        // Every object selected by the row path becomes one output row;
        // its fields become the row's columns.
        foreach (JObject candidate in SelectChildren(root, this.rowpath))
        {
            this.JObjectToRow(candidate, output);
            yield return output.AsReadOnly();
        }
    }
}
/// <summary>Reads a tab-separated partition descriptor (id, from, to) from the first
/// input line, then streams rows from the external provider for that partition.</summary>
/// <param name="input">Wrapper around the partition descriptor stream.</param>
/// <param name="output">Row builder reused for every emitted row.</param>
/// <returns>One row per provider row, tagged with extractor and partition ids.</returns>
public override IEnumerable <IRow> Extract(IUnstructuredReader input, IUpdatableRow output)
{
    string id;
    string from;
    string to;

    // 1. Collect partition information from the first line of the input.
    using (var reader = new StreamReader(input.BaseStream))
    {
        string line = reader.ReadLine();

        // Fix: guard against an empty or malformed descriptor instead of failing
        // later with NullReferenceException / IndexOutOfRangeException.
        if (line == null)
        {
            throw new InvalidDataException("Partition descriptor stream is empty.");
        }

        var parts = line.Split('\t');
        if (parts.Length < 3)
        {
            throw new InvalidDataException("Partition descriptor must contain id, from and to separated by tabs.");
        }

        id = parts[0];
        from = parts[1];
        to = parts[2];
    }

    // 2. Read the data source using the partition information.
    using (var reader = ProviderFactory.CreateInstance(_cnxString, from, to))
    {
        foreach (var row in reader.Rows)
        {
            output.Set("extractor_id", _id);
            output.Set("partition_id", id);
            output.Set("partition", row[0]);
            output.Set("value1", row[1]);
            output.Set("value2", row[2]);
            yield return output.AsReadOnly();
        }
    }

    // Add some latency to the data read (deliberate, for testing).
    Thread.Sleep(10000);
}
/// <summary>Generated SQLIP processor body: upper-cases COLOR_NAME, maps IS_TRANSPARENT
/// ("T" -> "Y", anything else -> "N") and copies COLOR_ID / COLOR_RGB through.</summary>
public override IRow Process(IRow inRow, IUpdatableRow outRow)
{
    var row = (ScopeEngineManaged.SqlIpRow)inRow;
    var output = (ScopeEngineManaged.SqlIpUpdatableRow)outRow;

    // Tracks which output expression is currently being evaluated so a failure can be
    // attributed to the correct expression in the catch block below.
    int exceptionIndex = 0;
    try
    {
        System.Int32 col_COLOR_ID = row.GetInternal <System.Int32>(0);
        System.String col_COLOR_NAME = row.GetInternal <System.String>(1);
        System.String col_COLOR_RGB = row.GetInternal <System.String>(2);
        System.String col_IS_TRANSPARENT = row.GetInternal <System.String>(3);

        output.SetInternal(0, col_COLOR_NAME.ToUpper());
        exceptionIndex++;
        output.SetInternal(1, col_IS_TRANSPARENT.ToUpper() == "T" ? "Y" : "N");
        exceptionIndex++;
        output.SetInternal(2, col_COLOR_ID);
        exceptionIndex++;
        output.SetInternal(3, col_COLOR_RGB);
        exceptionIndex++;
    }
    catch (Exception exception)
    {
        // Wrap the failure with the expression info at the current index plus a dump
        // of the offending input row.
        ScopeEngineManaged.UserExceptionHelper.WrapUserExpressionException(exceptionsInfo[exceptionIndex], ScopeEngineManaged.SqlHelper.Dump(row), exception);
    }

    return(output.AsReadOnly());
}
/// <summary>Dispatches to the generic <c>locf</c> (last observation carried forward)
/// implementation based on the runtime type of the value column; unsupported column
/// types return null.</summary>
public override IEnumerable <IRow> Apply(IRow input, IUpdatableRow output)
{
    DateTime startTime = input.Get <DateTime>(startColumn);
    DateTime endTime = input.Get <DateTime>(endColumn);

    // Resolve the declared type of the value column, then dispatch to the
    // matching generic locf instantiation.
    var valueColumn = input.Schema.First(x => x.Name == startValueColumn);
    var valueType = valueColumn.Type;

    if (valueType == typeof(bool))
    {
        return locf <bool>(startTime, endTime, input.Get <bool>(startValueColumn), output);
    }
    if (valueType == typeof(int))
    {
        return locf <int>(startTime, endTime, input.Get <int>(startValueColumn), output);
    }
    if (valueType == typeof(double))
    {
        return locf <double>(startTime, endTime, input.Get <double>(startValueColumn), output);
    }
    if (valueType == typeof(string))
    {
        return locf <string>(startTime, endTime, input.Get <string>(startValueColumn), output);
    }

    // Unsupported value-column type.
    return null;
}
/// <summary>Reads an Avro container stream and yields one row per record, mapping
/// record fields to output columns by name. Empty input splits yield nothing.</summary>
public override IEnumerable <IRow> Extract(IUnstructuredReader input, IUpdatableRow output)
{
    // Nothing to do for an empty input split.
    if (input.Length == 0)
    {
        yield break;
    }

    // Fix: removed an AvroSerializer.CreateGeneric(avroSchema) local that was never
    // used; the generic container reader reads the schema from the stream itself.
    using (var genericReader = AvroContainer.CreateGenericReader(input.BaseStream))
    using (var reader = new SequentialReader <dynamic>(genericReader))
    {
        foreach (var record in reader.Objects)
        {
            // Copy every projected column out of the Avro record by name.
            foreach (var column in output.Schema)
            {
                output.Set(column.Name, record[column.Name]);
            }

            yield return output.AsReadOnly();
        }
    }
}
/// <summary>Copies DepID and DepName through unchanged and fills the HelloWorld
/// column from the <c>hw</c> field.</summary>
public override IRow Process(IRow input, IUpdatableRow output)
{
    int departmentId = input.Get <int>("DepID");
    string departmentName = input.Get <string>("DepName");

    output.Set <int>("DepID", departmentId);
    output.Set <string>("DepName", departmentName);
    output.Set <string>("HelloWorld", hw);

    return output.AsReadOnly();
}
/// <summary>Splits the input on the configured row delimiter and parses each record as
/// a web-server log line, emitting one row with the twelve parsed fields.</summary>
public override IEnumerable <IRow> Extract(IUnstructuredReader input, IUpdatableRow output)
{
    foreach (Stream recordStream in input.Split(_row_delim))
    {
        using (var recordReader = new StreamReader(recordStream, this._encoding))
        {
            string record = recordReader.ReadToEnd().Trim();

            // Delegate field extraction to the log parser.
            var parser = new LogRowParser();
            LogRowElements fields = parser.ParseElements(record);

            output.Set <string>(0, fields.IP);
            output.Set <string>(1, fields.Identity);
            output.Set <string>(2, fields.UserId);
            output.Set <string>(3, fields.Timestamp);
            output.Set <string>(4, fields.Offset);
            output.Set <string>(5, fields.RequestMessage);
            output.Set <string>(6, fields.StatusCode);
            output.Set <string>(7, fields.Size);
            output.Set <string>(8, fields.Referer);
            output.Set <string>(9, fields.URL);
            output.Set <string>(10, fields.UserAgent);
            output.Set <string>(11, fields.Forwarded);

            yield return output.AsReadOnly();
        }
    }
}
/// <summary>Copies user address fields through by position, translating the country
/// name via the <c>CountryTranslation</c> lookup when a mapping exists.</summary>
public override IRow Process(IRow input, IUpdatableRow output)
{
    string userId = input.Get <string>("UserID");
    string name = input.Get <string>("Name");
    string address = input.Get <string>("Address");
    string city = input.Get <string>("City");
    string state = input.Get <string>("State");
    string postalCode = input.Get <string>("PostalCode");
    string country = input.Get <string>("Country");
    string phone = input.Get <string>("Phone");

    // Substitute the translated country name when one is configured.
    if (CountryTranslation.Keys.Contains(country))
    {
        country = CountryTranslation[country];
    }

    output.Set <string>(0, userId);
    output.Set <string>(1, name);
    output.Set <string>(2, address);
    output.Set <string>(3, city);
    output.Set <string>(4, state);
    output.Set <string>(5, postalCode);
    output.Set <string>(6, country);
    output.Set <string>(7, phone);

    return output.AsReadOnly();
}
/// <summary>Maps fixed-width segments of <paramref name="line"/> onto row columns by
/// position, converting non-string columns through their TypeConverter.</summary>
/// <param name="line">The raw fixed-width input line.</param>
/// <param name="row">The row builder whose columns are filled.</param>
protected virtual void LineToRow(string line, IUpdatableRow row)
{
    int columnIndex = 0;

    foreach (var segment in _fieldMap)
    {
        int start = segment.Key;
        int length = segment.Value;

        // Skip segments that run past the end of this line; the column is left unset.
        if (line.Length < start + length)
        {
            columnIndex++;
            continue;
        }

        bool needsConversion = columnIndex < row.Schema.Count
                               && row.Schema[columnIndex].Type != typeof(string);
        if (needsConversion)
        {
            // Convert through the column type's TypeConverter when possible;
            // otherwise the column is left unset.
            var converter = TypeDescriptor.GetConverter(row.Schema[columnIndex].Type);
            if (converter != null && converter.CanConvertFrom(typeof(string)))
            {
                row.Set(columnIndex, converter.ConvertFromString(line.Substring(start, length)));
            }
        }
        else
        {
            row.Set(columnIndex, line.Substring(start, length));
        }

        columnIndex++;
    }
}
/// <summary>Generated SQLIP processor body: expands three JSON string columns into
/// SqlMap tuples -- full-document tuples for location and device, plus the first
/// three entries of the custom column's "dimensions" array.</summary>
public override IRow Process(IRow inRow, IUpdatableRow outRow)
{
    var row = (ScopeEngineManaged.SqlIpRow)inRow;
    var output = (ScopeEngineManaged.SqlIpUpdatableRow)outRow;

    // Tracks which output expression is currently being evaluated so a failure can be
    // attributed to the correct expression in the catch block below.
    int exceptionIndex = 0;
    try
    {
        System.String col_location = row.GetInternal <System.String>(0);
        System.String col_device = row.GetInternal <System.String>(1);
        System.String col_custom = row.GetInternal <System.String>(2);

        output.SetInternal(0, Microsoft.Analytics.Samples.Formats.Json.JsonFunctions.JsonTuple(col_location));
        exceptionIndex++;
        output.SetInternal(1, Microsoft.Analytics.Samples.Formats.Json.JsonFunctions.JsonTuple(col_device));
        exceptionIndex++;
        output.SetInternal(2, Microsoft.Analytics.Samples.Formats.Json.JsonFunctions.JsonTuple(col_custom, "dimensions[0]"));
        exceptionIndex++;
        output.SetInternal(3, Microsoft.Analytics.Samples.Formats.Json.JsonFunctions.JsonTuple(col_custom, "dimensions[1]"));
        exceptionIndex++;
        output.SetInternal(4, Microsoft.Analytics.Samples.Formats.Json.JsonFunctions.JsonTuple(col_custom, "dimensions[2]"));
        exceptionIndex++;
    }
    catch (Exception exception)
    {
        // Wrap the failure with per-expression diagnostics and a dump of the input row.
        ScopeEngineManaged.UserExceptionHelper.WrapUserExpressionException(exceptionsInfo[exceptionIndex], ScopeEngineManaged.SqlHelper.Dump(row), exception);
    }

    return(output.AsReadOnly());
}
/// <summary>Reads newline-delimited JSON, mapping each object's properties onto
/// string and DateTime output columns by name.</summary>
/// <param name="input">Wrapper around the UTF-8 input stream.</param>
/// <param name="output">Row builder reused across lines.</param>
/// <returns>One row per input line.</returns>
public override IEnumerable <IRow> Extract(IUnstructuredReader input, IUpdatableRow output)
{
    string line;
    using (StreamReader streamReader = new StreamReader(input.BaseStream, Encoding.UTF8))
    {
        while ((line = streamReader.ReadLine()) != null)
        {
            var jObject = JsonConvert.DeserializeObject <JObject>(line);

            foreach (var column in output.Schema)
            {
                var token = jObject[column.Name];

                // Fix: skip properties missing from this JSON object instead of
                // dereferencing a null token (NullReferenceException).
                // NOTE(review): a skipped column keeps its value from the previous
                // row (builder is reused) -- confirm this is the desired fallback.
                if (token == null)
                {
                    continue;
                }

                if (column.Type == typeof(string))
                {
                    output.Set(column.Name, token.ToString());
                }
                else if (column.Type == typeof(DateTime))
                {
                    output.Set(column.Name, DateTime.Parse(token.ToString()));
                }
            }

            yield return output.AsReadOnly();
        }
    }
}
/// <summary>Diagnostic extractor: repeatedly allocates byte buffers of size
/// <c>increment</c> (up to <c>no_buff</c> of them) to probe the UDO memory limit,
/// then emits a single row with GC totals, a failure flag, and bytes allocated.</summary>
public override IEnumerable <IRow> Extract(IUnstructuredReader input, IUpdatableRow outputrow)
{
    outputrow.Set <long>("GC_TotalMem_Start", GC.GetTotalMemory(true));
    outputrow.Set <long>("MaxUDOMemory", MyLimits.MaxUdoMemory);

    var buff_idx = 0;
    var failed = false;
    var gc_mem = GC.GetTotalMemory(true);
    try
    {
        while (buff_idx < no_buff)
        {
            alloc_mem[buff_idx] = new byte[increment];
            alloc_mem[buff_idx][0] = 1; // touch the buffer so it is not optimized away
            buff_idx++;
            gc_mem = GC.GetTotalMemory(true);   // snapshot after each successful allocation
        }
    }
    catch (Exception e)
    {
        // Allocation failed (memory cap reached); record the message and stop.
        failed = true;
        outputrow.Set <string>("error", e.Message);
    }

    outputrow.Set <long>("GC_TotalMem_End", gc_mem);
    outputrow.Set <bool>("failed", failed);
    outputrow.Set <long>("alloc_sz", buff_idx * increment);
    yield return(outputrow.AsReadOnly());
}
/// <summary>Extension helper: writes <paramref name="value"/> to the column described
/// by <paramref name="columnInfo"/>, but only when that column is required (projected).</summary>
public static void Set <T>(this IUpdatableRow row, ColumnInfo columnInfo, T value)
{
    if (!columnInfo.IsRequired)
    {
        return;
    }

    row.Set(columnInfo.Idx, value);
}
/// <summary>Computes the maximum number of concurrently open sessions for the group:
/// "start" rows increment the counter, any other op decrements it (floored at zero).</summary>
public override IEnumerable <IRow> Reduce(IRowset input, IUpdatableRow output)
{
    int open = 0;
    int peak = 0;

    foreach (var row in input.Rows)
    {
        var timestamp = row.Get <DateTime>("timestamp");
        var op = row.Get <string>("op");

        if (op == "start")
        {
            open++;
        }
        else
        {
            open--;
            if (open < 0)
            {
                open = 0;   // never let the counter go negative
            }
        }

        peak = System.Math.Max(peak, open);
    }

    output.Set <string>("cohort", "FOO");
    output.Set <int>("max", peak);
    yield return output.AsReadOnly();
}
/// <summary>Reverse-geocodes the latitude/longitude columns into "country" and
/// "USstates" output columns; unmatched locations produce empty strings.</summary>
public override IRow Process(IRow input, IUpdatableRow output)
{
    var location = new GeoLocation
    {
        Latitude = input.Get <double>(latColumn),
        Longitude = input.Get <double>(lonColumn)
    };

    var country = _service.FindCountry(location);
    var usState = _service.FindUsaState(location);

    // Fall back to "" when no region (or no name) was resolved.
    string countryName = (country != null && country.Name != null) ? country.Name : "";
    string stateName = (usState != null && usState.Name != null) ? usState.Name : "";

    output.Set <string>("country", countryName);
    output.Set <string>("USstates", stateName);

    return output.AsReadOnly();
}
/// <summary>Verifies that MyProcessor(floor: 4) keeps column "a" and floors
/// column "b" up to 4.</summary>
public void TestMyProcessor()
{
    // Arrange: a two-column int schema ("a", "b") with a single row { 2, 3 }.
    var columns = new List <IColumn>
    {
        new USqlColumn <int>("a"),
        new USqlColumn <int>("b")
    };
    var schema = new USqlSchema(columns);
    IRow input = new USqlRow(schema, new object[] { 2, 3 });
    IUpdatableRow output = input.AsUpdatable();

    // Act: run the processor under test.
    var processor = new MyProcessor(floor: 4);
    IRow result = processor.Process(input, output);

    // Assert: schema width preserved, "a" passes through, "b" floored to 4.
    Assert.IsTrue(result.Schema.Count == 2);
    Assert.IsTrue(result.Get <int>(0) == 2);
    Assert.IsTrue(result.Get <int>(1) == 4);
}
/// <summary>Assigns the "Category" output column from the first configured category
/// whose prefix list matches the row's Tag; falls back to "other".</summary>
public override IRow Process(IRow input, IUpdatableRow output)
{
    var tag = input.Get<string>("Tag");
    var category = input.Get<string>("Category");

    // Scan the mapper in order; the first category owning a matching prefix wins.
    string matchedCategory = null;
    foreach (var mapping in categoryMapper)
    {
        foreach (var prefix in mapping.Value)
        {
            if (tag.StartsWith(prefix))
            {
                matchedCategory = mapping.Key;
                break;
            }
        }

        if (matchedCategory != null)
        {
            break;
        }
    }

    output.Set("Category", matchedCategory ?? "other");
    return output.AsReadOnly();
}
/// <summary>Extract is called at least once per vertex</summary>
/// <param name="input">Wrapper for a Stream</param>
/// <param name="output">IUpdatableRow uses a mutable builder pattern --
/// set individual fields with IUpdatableRow.Set, then build an immutable IRow by
/// calling IUpdatableRow.AsReadOnly.</param>
/// <returns>A sequence of IRows, one per XML node matched by rowPath.</returns>
public override IEnumerable <IRow> Extract(IUnstructuredReader input, IUpdatableRow output)
{
    // Make sure that all requested columns are of type string
    IColumn column = output.Schema.FirstOrDefault(col => col.Type != typeof(string));
    if (column != null)
    {
        throw new ArgumentException(string.Format("Column '{0}' must be of type 'string', not '{1}'", column.Name, column.Type.Name));
    }

    // Load the full document into a DOM; row nodes are selected by XPath below.
    XmlDocument xmlDocument = new XmlDocument();
    xmlDocument.Load(input.BaseStream);

    foreach (XmlNode xmlNode in xmlDocument.DocumentElement.SelectNodes(this.rowPath))
    {
        // IUpdatableRow implements a builder pattern to save memory allocations,
        // so call output.Set in a loop
        foreach (IColumn col in output.Schema)
        {
            // columnPaths maps an XPath (Key) to a column name (Value). When no
            // explicit mapping exists, FirstOrDefault yields the default
            // KeyValuePair whose Key and Value are both null, so the column name
            // itself is used as the XPath via the ?? fallbacks below.
            var explicitColumnMapping = this.columnPaths.FirstOrDefault(columnPath => columnPath.Value == col.Name);
            XmlNode xml = xmlNode.SelectSingleNode(explicitColumnMapping.Key ?? col.Name);
            output.Set(explicitColumnMapping.Value ?? col.Name, xml == null ? null : xml.InnerXml);
        }

        // then call output.AsReadOnly to build an immutable IRow.
        yield return(output.AsReadOnly());
    }
}
/// <summary>Buffers the input into a seekable stream, reads it as an Avro data file,
/// and yields ONE row per Avro record with fields mapped to columns by name.</summary>
/// <param name="input">Wrapper around the (non-seekable) Avro input stream.</param>
/// <param name="output">Row builder reused across records.</param>
/// <returns>One row per Avro record.</returns>
public override IEnumerable <IRow> Extract(IUnstructuredReader input, IUpdatableRow output)
{
    var avschema = Avro.Schema.Parse(avroSchema);
    var reader = new GenericDatumReader <GenericRecord>(avschema, avschema);

    using (var ms = new MemoryStream())
    {
        // DataFileReader requires a seekable stream, so buffer the input first.
        CreateSeekableStream(input, ms);
        ms.Position = 0;

        var fileReader = DataFileReader <GenericRecord> .OpenReader(ms, avschema);
        while (fileReader.HasNext())
        {
            var avroRecord = fileReader.Next();

            foreach (var column in output.Schema)
            {
                if (avroRecord[column.Name] != null)
                {
                    output.Set(column.Name, avroRecord[column.Name]);
                }
                else
                {
                    output.Set <object>(column.Name, null);
                }
            }

            // Fix: yield once per record, AFTER all columns are populated.
            // The original yielded inside the column loop, emitting one
            // partially-updated row per column of every record.
            yield return output.AsReadOnly();
        }
    }
}
/// <summary>Streams JSON tokens from the input; every top-level object start is loaded
/// and expanded into rows via the configured row path.</summary>
public override IEnumerable <IRow> Extract(IUnstructuredReader input, IUpdatableRow output)
{
    // Json.Net streaming reader: advance one token at a time.
    using (var jsonReader = new JsonTextReader(new StreamReader(input.BaseStream)))
    {
        while (jsonReader.Read())
        {
            // Only object starts mark candidate row containers.
            if (jsonReader.TokenType != JsonToken.StartObject)
            {
                continue;
            }

            var container = JToken.Load(jsonReader);

            // Each object selected by rowpath becomes one output row;
            // its fields become the row's columns.
            foreach (JObject rowObject in SelectChildren(container, this.rowpath))
            {
                this.JObjectToRow(rowObject, output);
                yield return output.AsReadOnly();
            }
        }
    }
}
/// <summary>Nested-loop join: buffers the right rowset (employee id/name pairs) in
/// memory, then joins each left row on employee_id, emitting one row per match.</summary>
public override IEnumerable <IRow> Combine(IRowset left, IRowset right, IUpdatableRow output)
{
    // Materialize the right side once; an IRowset can only be enumerated a single time.
    var employees = new List <Tuple <int, string> >();
    foreach (var employeeRow in right.Rows)
    {
        employees.Add(Tuple.Create(
                          employeeRow.Get <int>("employee_id"),
                          employeeRow.Get <string>("employee_name")));
    }

    foreach (var departmentRow in left.Rows)
    {
        int departmentEmployeeId = departmentRow.Get <int>("employee_id");

        foreach (var employee in employees)
        {
            if (employee.Item1 != departmentEmployeeId)
            {
                continue;
            }

            output.Set("employee_id", employee.Item1);
            output.Set("employee_name", employee.Item2);
            output.Set("department_name", departmentRow.Get <string>("department_name"));
            yield return output.AsReadOnly();
        }
    }
}
/// <summary>Generated SQLIP processor body: splits SET_NUMBER on its first '-' into a
/// base number and an optional suffix, then copies INVENTORY_ID / INVENTORY_VERSION.</summary>
public override IRow Process(IRow inRow, IUpdatableRow outRow)
{
    var row = (ScopeEngineManaged.SqlIpRow)inRow;
    var output = (ScopeEngineManaged.SqlIpUpdatableRow)outRow;

    // Tracks which output expression is currently being evaluated so a failure can be
    // attributed to the correct expression in the catch block below.
    int exceptionIndex = 0;
    try
    {
        System.Int32 col_INVENTORY_ID = row.GetInternal <System.Int32>(0);
        System.Int32 col_INVENTORY_VERSION = row.GetInternal <System.Int32>(1);
        System.String col_SET_NUMBER = row.GetInternal <System.String>(2);

        // Part before the first '-' (or the whole value when there is no dash).
        output.SetInternal(0, col_SET_NUMBER.IndexOf("-") >= 0 ? col_SET_NUMBER.Substring(0, col_SET_NUMBER.IndexOf("-")) : col_SET_NUMBER);
        exceptionIndex++;
        // Part after the first '-' (null when there is no dash).
        output.SetInternal(1, col_SET_NUMBER.IndexOf("-") >= 0 ? col_SET_NUMBER.Substring(col_SET_NUMBER.IndexOf("-") + 1) : null);
        exceptionIndex++;
        output.SetInternal(2, col_INVENTORY_ID);
        exceptionIndex++;
        output.SetInternal(3, col_INVENTORY_VERSION);
        exceptionIndex++;
    }
    catch (Exception exception)
    {
        // Wrap the failure with per-expression diagnostics and a dump of the input row.
        ScopeEngineManaged.UserExceptionHelper.WrapUserExpressionException(exceptionsInfo[exceptionIndex], ScopeEngineManaged.SqlHelper.Dump(row), exception);
    }

    return(output.AsReadOnly());
}
/// <summary>Emits the vehicle's most recent theft record when it has not yet been
/// recovered; recovered or never-stolen vehicles produce no output.</summary>
public override IEnumerable <IRow> Reduce(IRowset input, IUpdatableRow output)
{
    // Cache all records for this vehicle registration (the reduce key).
    var records = input.Rows
                  .Select(row => new StolenVehicleRecord(
                              row.Get <string>("VehicleRegistration"),
                              row.Get <string>("DateStolen"),
                              row.Get <string>("DateRecovered")))
                  .ToList();

    // No records at all means this vehicle was never reported stolen.
    if (records.Count == 0)
    {
        yield break;
    }

    // Sort (StolenVehicleRecord orders by DateStolen descending) so the most
    // recent theft record comes first.
    records.Sort();
    var latest = records[0];

    // A missing recovery date means the vehicle is still stolen -- report it.
    if (latest.DateRecovered == null)
    {
        output.Set <string>("VehicleRegistration", latest.VehicleRegistration);
        output.Set <DateTime>("DateStolen", latest.DateStolen);
        yield return output.AsReadOnly();
    }
}
/// <summary>Serializes the JObject and projects the "$.data.author" JSON tuple's
/// "data.genre" entry into the "contexts.data.genre" output column.</summary>
/// <remarks>NOTE(review): despite the local name <c>genre</c>, the JSON path queried
/// is "$.data.author" -- confirm whether "genre" or "author" is intended.</remarks>
private void mapToColumns(JObject obj, IUpdatableRow output)
{
    var json = JsonConvert.SerializeObject(obj);
    var genre = JsonFunctions.JsonTuple(json, "$.data.author");

    // Diagnostic output of every extracted key/value pair.
    // (Removed a block of dead commented-out code from the original.)
    var keys = genre.Keys;
    foreach (var key in keys)
    {
        Console.WriteLine($"{key}: {genre[key]}");
    }

    // TODO(review): guard against a missing "data.genre" key before indexing.
    output.Set("contexts.data.genre", genre["data.genre"]);
}
/// <summary>
/// Collapses all rows for one reduce key into a single wide row: the id / loc / fs /
/// tr / st columns are parsed once from the first row, then every row one-hot encodes
/// its event type ("et") and resource type ("rt"), and records the log-feature ("lf")
/// volume ("vol") at the column index resolved by the lookup tables.
/// </summary>
/// <param name="input">Rows for a single reduce key.</param>
/// <param name="output">Row builder for the single output row.</param>
/// <returns>Exactly one output row per reduce key.</returns>
public override IEnumerable<IRow> Reduce(IRowset input, IUpdatableRow output)
{
    int count = 0;
    int[] colValues = new int[colNames.Length];

    foreach (IRow row in input.Rows)
    {
        // Key columns are identical across the group, so copy them from the
        // first row only.
        if (count == 0)
        {
            colValues[(int)ColNames.id] = int.Parse(row.Get<string>("id").ToString());
            colValues[(int)ColNames.loc] = location.GetValue(row.Get<string>("loc").ToString());
            colValues[(int)ColNames.fs] = int.Parse(row.Get<string>("fs").ToString());
            colValues[(int)ColNames.tr] = int.Parse(row.Get<string>("tr").ToString());
            colValues[(int)ColNames.st] = sevType.GetValue(row.Get<string>("st").ToString());
        }

        // Mark this row's event type and resource type (flag = 1), and store the
        // log-feature volume at the column index resolved by each lookup table.
        colValues[eventType.GetValue(row.Get<string>("et").ToString())] = 1;
        int vol = int.Parse(row.Get<string>("vol").ToString());
        colValues[logFeature.GetValue(row.Get<string>("lf").ToString())] = vol;
        colValues[resType.GetValue(row.Get<string>("rt").ToString())] = 1;
        count++;
    }

    // Write output: only columns from lf_1 onward are emitted here; the key columns
    // are presumably produced elsewhere (e.g. as reduce-key columns) -- TODO confirm.
    for (int n = (int)ColNames.lf_1; n < colValues.Length; n++)
    {
        string colName = colNames[n];
        output.Set(colName, colValues[n].ToString());
    }

    yield return output.AsReadOnly();
}
/// <summary>Extracts delimited rows, splitting the second input column (a full name)
/// into separate first-name and last-name output columns.</summary>
/// <param name="input">Wrapper around the raw input stream.</param>
/// <param name="output">Row builder reused across records.</param>
/// <returns>One row per CRLF-delimited input record.</returns>
public override IEnumerable <IRow> Extract(IUnstructuredReader input, IUpdatableRow output)
{
    foreach (Stream current in input.Split(_encoding.GetBytes("\r\n")))
    {
        using (StreamReader streamReader = new StreamReader(current, this._encoding))
        {
            string line = streamReader.ReadToEnd().Trim();

            // Split the record by the column delimiter.
            string[] parts = line.Split(this._col_delim);

            int count = 0;
            foreach (string part in parts)
            {
                if (count == 1)
                {
                    // Second input column holds the full name: emit it as two
                    // output columns (first name, last name).
                    string[] name = part.Trim().Split(' ');
                    output.Set <string>(count, name[0]);
                    count += 1;

                    // Fix: a single-token name previously threw
                    // IndexOutOfRangeException; emit an empty last name instead.
                    output.Set <string>(count, name.Length > 1 ? name[1] : "");
                }
                else
                {
                    output.Set <string>(count, part);
                }
                count += 1;
            }
        }

        yield return output.AsReadOnly();
    }
}
/// <summary>Verifies that MyProcessor maps the input row (0, 0) to (1, 5).</summary>
public void TestMyProcessor()
{
    // Arrange: a two-column int schema ("col1", "col2") with a single all-zero row.
    var columns = new List <IColumn>
    {
        new USqlColumn <int>("col1"),
        new USqlColumn <int>("col2")
    };
    var schema = new USqlSchema(columns);
    IRow input = new USqlRow(schema, new object[] { 0, 0 });
    IUpdatableRow output = input.AsUpdatable();

    // Act: run the processor under test against the fake input rowset.
    var processor = new MyProcessor();
    IRow result = processor.Process(input, output);

    // Assert: schema width preserved and the expected column values produced.
    Assert.IsTrue(result.Schema.Count == 2);
    Assert.IsTrue(result.Get <int>(0) == 1);
    Assert.IsTrue(result.Get <int>(1) == 5);
}
/// <summary>Apply is called at least once per instance</summary>
/// <param name="input">A SQLIP row</param>
/// <param name="output">A SQLIP updatable row.</param>
/// <returns>IEnumerable of IRow, one IRow per SQLIP row.</returns>
/// <remarks>Because applier constructor arguments cannot depend on
/// column references, the name of the column to parse is given as a string. Then
/// the actual column value is obtained by calling IRow.Get. The rest of the code
/// is the same as XmlDomExtractor.</remarks>
public override IEnumerable<IRow> Apply(IRow input, IUpdatableRow output)
{
    // Make sure that all requested columns are of type string
    IColumn column = output.Schema.FirstOrDefault(col => col.Type != typeof(string));
    if (column != null)
    {
        throw new ArgumentException(string.Format("Column '{0}' must be of type 'string', not '{1}'", column.Name, column.Type.Name));
    }

    // Parse the XML held in the named input column into a DOM.
    XmlDocument xmlDocument = new XmlDocument();
    xmlDocument.LoadXml(input.Get<string>(this.xmlColumnName));

    foreach (XmlNode xmlNode in xmlDocument.DocumentElement.SelectNodes(this.rowPath))
    {
        // IUpdatableRow implements a builder pattern to save memory allocations,
        // so call output.Set in a loop
        foreach(IColumn col in output.Schema)
        {
            // columnPaths maps an XPath (Key) to a column name (Value). When no
            // explicit mapping exists, FirstOrDefault yields the default
            // KeyValuePair whose Key and Value are both null, so the column name
            // itself is used as the XPath via the ?? fallbacks below.
            var explicitColumnMapping = this.columnPaths.FirstOrDefault(columnPath => columnPath.Value == col.Name);
            XmlNode xml = xmlNode.SelectSingleNode(explicitColumnMapping.Key ?? col.Name);
            output.Set(explicitColumnMapping.Value ?? col.Name, xml == null ? null : xml.InnerXml);
        }

        // then call output.AsReadOnly to build an immutable IRow.
        yield return output.AsReadOnly();
    }
}
/// <summary>Streams rows out of an ORC file: requires a seekable stream, reads the
/// file-tail metadata, then for each stripe materializes only the columns present in
/// both the ORC file and the U-SQL output schema and emits one row per stripe row.</summary>
/// <param name="output">Row builder reused for every emitted row.</param>
/// <param name="input">Seekable stream positioned over the ORC file.</param>
private static IEnumerable <IRow> ExtractInternal(IUpdatableRow output, Stream input)
{
    if (!input.CanSeek)
    {
        throw new ArgumentOutOfRangeException(nameof(input), "Input stream must be seekable for ORC reader. Enable the hack to copy to a Memory Stream or to a non-Persisted Memory Mapped file. The hack is the default setting.");
    }

    using (var fileTail = new FileTail(input))
    {
        var stripes = fileTail.GetStripeCollection();

        // Only read columns that exist in both the ORC metadata and the output schema.
        var columnsToRead = GetIntersectedColumnMetadata(output.Schema, fileTail).ToArray();

        foreach (var stripe in stripes)
        {
            // Item1: column metadata; Item2: the extracted column data (may be null).
            var extractedColumns = ReadStripe(stripe, columnsToRead).ToArray();

            for (int i = 0; i < (int)stripe.NumRows; i++)
            {
                foreach (var col in extractedColumns)
                {
                    var outputColumn = col.Item1.USqlProjectionColumnIndex;
                    // Fall back to the projection column's default when no data
                    // was extracted for this column.
                    var value = col.Item2?.GetValue(i) ?? col.Item1.USqlProjectionColumn.DefaultValue;
                    output.Set(outputColumn, value);
                }

                yield return(output.AsReadOnly());
            }
        }
    }
}
// void OutputValueAtCol_I(string c, int i, IUpdatableRow outputrow)
//
// Helper function that takes the raw string value c and puts it into the column at
// position i in the output row, converting it to the column's declared schema type.
// Empty strings map to null for the nullable numeric/date types; SqlMap and SqlArray
// columns are parsed using the configured item / key-value delimiter fields.
private void OutputValueAtCol_I(string c, int i, IUpdatableRow outputrow)
{
    ISchema schema = outputrow.Schema;

    if (schema[i].Type == typeof(SqlMap<string, string>))
    {
        // Map column: strip optional quotes, then parse with the configured
        // item and key/value delimiters; empty input becomes a null map.
        c = DriverFunctions.RemoveOptionalQuotes(c);
        SqlMap<string, string> scopeMap = String.IsNullOrEmpty(c) ? null : DriverFunctions.ReadStringMap(c, this._map_item_delim, this._map_kv_delim);
        outputrow.Set<SqlMap<string, string>>(i, scopeMap);
    }
    else if (schema[i].Type == typeof(SqlArray<int>))
    {
        // Int-array column: same quote stripping; empty input becomes a null array.
        c = DriverFunctions.RemoveOptionalQuotes(c);
        SqlArray<int> scopeArray = String.IsNullOrEmpty(c) ? null : DriverFunctions.ReadIntArray(c, this._array_item_delim);
        outputrow.Set<SqlArray<int>>(i, scopeArray);
    }
    else if (schema[i].Type == typeof(int))
    {
        int num = Convert.ToInt32(c);
        outputrow.Set<int>(i, num);
    }
    else if (schema[i].Type == typeof(int?))
    {
        // Empty string means null for nullable columns.
        int? num2 = (c == "") ? null : new int?(Convert.ToInt32(c));
        outputrow.Set<int?>(i, num2);
    }
    else if (schema[i].Type == typeof(long))
    {
        long num3 = Convert.ToInt64(c);
        outputrow.Set<long>(i, num3);
    }
    else if (schema[i].Type == typeof(long?))
    {
        long? num4 = (c == "") ? null : new long?(Convert.ToInt64(c));
        outputrow.Set<long?>(i, num4);
    }
    else if (schema[i].Type == typeof(DateTime))
    {
        DateTime dateTime = Convert.ToDateTime(c);
        outputrow.Set<DateTime>(i, dateTime);
    }
    else if (schema[i].Type == typeof(DateTime?))
    {
        DateTime? dateTime2 = (c == "") ? null : new DateTime?(Convert.ToDateTime(c));
        outputrow.Set<DateTime?>(i, dateTime2);
    }
    else if (schema[i].Type == typeof(string))
    {
        string text = DriverFunctions.RemoveOptionalQuotes(c);
        outputrow.Set<string>(i, text);
    }
    else
    {
        // Fallback for any other column type: store the raw string as-is.
        // NOTE(review): Set<string> on a non-string column may fail at runtime -- confirm.
        outputrow.Set<string>(i, c);
    }
}
/// <summary>Reads the input line by line, delegating field parsing for each line to
/// <c>LineToRow</c> and emitting one row per line.</summary>
public override IEnumerable<IRow> Extract(IUnstructuredReader input, IUpdatableRow output)
{
    using (var lineReader = new StreamReader(input.BaseStream))
    {
        for (string line = lineReader.ReadLine(); line != null; line = lineReader.ReadLine())
        {
            LineToRow(line, output);
            yield return output.AsReadOnly();
        }
    }
}
/// <summary>Reads a stream of concatenated JSON documents and emits one row per
/// document, holding a single SqlMap&lt;string, byte[]&gt; column of flattened
/// key/value pairs. Rows over the data-lake size limit are gzip-compressed into a
/// "!CompressedRow" entry; rows still too large become a "!RowExtractorError" entry.</summary>
public override IEnumerable<IRow> Extract(IUnstructuredReader input, IUpdatableRow output)
{
    using (var reader = new JsonTextReader(new StreamReader(input.BaseStream, Encoding.UTF8)))
    {
        // Allow several root-level JSON documents in one stream.
        reader.SupportMultipleContent = true;
        while (reader.Read())
        {
            var row = JToken.ReadFrom(reader);
            var size = 0;
            // FlattenJson also reports the flattened size back through ref size.
            var flattendData = GHInsights.USql.Utility.FlattenJson(row, ref size);
            if (size < (_dataLakeMaxRowSize))
            {
                output.Set(_outputColumnName, new SqlMap<string, byte[]>(flattendData));
            }
            else
            {
                // Row too large: try gzip-compressing the raw JSON instead.
                var compressedData = GHInsights.USql.Utility.GzipByteArray(Encoding.UTF8.GetBytes(row.ToString(Formatting.None)));
                if (compressedData.Length < (_dataLakeMaxRowSize))
                {
                    var compressedRow = new Dictionary<string, byte[]> { { "!CompressedRow", compressedData } };
                    output.Set(_outputColumnName, new SqlMap<string, byte[]>(compressedRow));
                }
                else
                {
                    // Even compressed it does not fit: emit an error-marker entry
                    // rather than failing the whole vertex.
                    var error = new Dictionary<string, byte[]> { { "!RowExtractorError", Encoding.UTF8.GetBytes($"Resulting SqlMap is too large: OriginalSize:{size} CompressedSize: {compressedData.Length} - {row.ToString(Formatting.None).Substring(0, 100)}") } };
                    output.Set(_outputColumnName, new SqlMap<string, byte[]>(error));
                }
            }
            yield return output.AsReadOnly();
        }
    }
}
/// <summary>Extracts image metadata properties and a 150x150 thumbnail from the row's
/// "image_data" bytes, writing each to its output column when that column exists.</summary>
public override IRow Process(IRow input, IUpdatableRow output)
{
    byte[] imageBytes = input.Get<byte[]>("image_data");

    // Decode the image once and reuse it for every property lookup.
    using (var image = new StreamImage(imageBytes))
    {
        output.SetColumnIfExists("equipment_make", image.getStreamImageProperty(ImageProperties.equipment_make));
        output.SetColumnIfExists("equipment_model", image.getStreamImageProperty(ImageProperties.equipment_model));
        output.SetColumnIfExists("description", image.getStreamImageProperty(ImageProperties.description));
        output.SetColumnIfExists("copyright", image.getStreamImageProperty(ImageProperties.copyright));
        output.SetColumnIfExists("thumbnail", image.scaleStreamImageTo(150, 150));
    }

    return output.AsReadOnly();
}
/// <summary>Merges overlapping [begin, end] intervals for one reduce key into maximal
/// disjoint intervals, emitting one row per merged interval. Requires the reducer to
/// be PRESORTED on begin and READONLY on the reduce key.</summary>
/// <remarks>NOTE(review): with an empty input group this still emits one row holding
/// the MaxValue/MinValue sentinel interval -- confirm that is intended.</remarks>
public override IEnumerable<IRow> Reduce(IRowset input, IUpdatableRow output)
{
    // Init aggregation values
    bool first_row_processed = false;
    var begin = DateTime.MaxValue; // Dummy value to make compiler happy
    var end = DateTime.MinValue; // Dummy value to make compiler happy

    // requires that the reducer is PRESORTED on begin and READONLY on the reduce key.
    foreach (var row in input.Rows)
    {
        // Initialize the first interval with the first row
        if (!first_row_processed)
        {
            first_row_processed = true; // mark that we handled the first row
            begin = row.Get<DateTime>("begin");
            end = row.Get<DateTime>("end");

            // If the end is just a time and not a date, it can be earlier than the
            // begin, indicating it is on the next day. Fix up the end to the next
            // day in that case.
            if (end < begin)
            {
                end = end.AddDays(1);
            }
        }
        else
        {
            var b = row.Get<DateTime>("begin");
            var e = row.Get<DateTime>("end");

            // Same next-day fix-up as above: end earlier than begin means next day.
            if (e < b)
            {
                e = e.AddDays(1);
            }

            // If this row begins inside the current interval, extend the interval
            // when this row ends later.
            if (b <= end)
            {
                if (e > end)
                {
                    end = e;
                }
            }
            else // gap found: output the previous interval and start a new one
            {
                output.Set<DateTime>("begin", begin);
                output.Set<DateTime>("end", end);
                yield return output.AsReadOnly();
                begin = b;
                end = e;
            } // if
        } // if
    } // foreach

    // now output the last interval
    output.Set<DateTime>("begin", begin);
    output.Set<DateTime>("end", end);
    yield return output.AsReadOnly();
}
/// <summary>Streams the XML input, emitting one row per &lt;row&gt; element. Attributes
/// map to columns by name: missing attributes use the column default, string values
/// are simplified and capped at the row string-size limit, and all other types are
/// converted through their TypeConverter.</summary>
public override IEnumerable<IRow> Extract(IUnstructuredReader input, IUpdatableRow output)
{
    using (XmlReader reader = XmlReader.Create(input.BaseStream))
    {
        while (reader.Read())
        {
            bool isRowElement = reader.NodeType == XmlNodeType.Element
                                && reader.LocalName == "row";
            if (!isRowElement)
            {
                continue;
            }

            foreach (IColumn column in output.Schema)
            {
                string rawValue = reader.GetAttribute(column.Name);

                if (rawValue == null)
                {
                    // Attribute absent: fall back to the schema default.
                    output.Set(column.Name, column.DefaultValue);
                }
                else if (column.Type == typeof(string))
                {
                    string simplifiedValue = Simplify(rawValue);

                    // Keep the value under the 128kB string limit.
                    if (Encoding.UTF8.GetByteCount(simplifiedValue) > Constants.Limits.StringSizeInBytes)
                    {
                        simplifiedValue = ShortenWithinBoundries(simplifiedValue);
                    }

                    output.Set(column.Name, simplifiedValue);
                }
                else
                {
                    var typeConverter = TypeDescriptor.GetConverter(column.Type);
                    output.Set(column.Name, typeConverter.ConvertFromString(rawValue));
                }
            }

            yield return output.AsReadOnly();
        }
    }
}
// IEnumerable<IRow> Extract(IUnstructuredReader input, IUpdatableRow outputrow)
//
// Actual implementation of DriverExtractor that overwrites the Extract method of
// IExtractor: splits the input into rows on the row delimiter, each row into
// delimited cells, and converts every cell via OutputValueAtCol_I.
public override IEnumerable<IRow> Extract(IUnstructuredReader input, IUpdatableRow outputrow)
{
    foreach (Stream rowStream in input.Split(this._row_delim))
    {
        using (var rowReader = new StreamReader(rowStream, this._encoding))
        {
            string[] cells = rowReader.ReadToEnd()
                .Split(new string[] { this._col_delim }, StringSplitOptions.None);

            // Place each raw cell into the output column at the same position.
            for (int col = 0; col < cells.Length; col++)
            {
                this.OutputValueAtCol_I(cells[col], col, outputrow);
            }
        }

        yield return outputrow.AsReadOnly();
    }
}
/// <summary>
/// Reads Avro records sequentially from the input container and copies each
/// record's fields into the output row by column name.
/// </summary>
/// <param name="input">Wrapper for the raw Avro container stream.</param>
/// <param name="output">Mutable row builder; AsReadOnly() snapshots each row.</param>
/// <returns>One IRow per Avro record.</returns>
public override IEnumerable<IRow> Extract(IUnstructuredReader input, IUpdatableRow output)
{
    var serializer = AvroSerializer.CreateGeneric(avroSchema);
    using (var containerReader = AvroContainer.CreateGenericReader(input.BaseStream))
    using (var sequentialReader = new SequentialReader<dynamic>(containerReader))
    {
        foreach (dynamic record in sequentialReader.Objects)
        {
            foreach (var column in output.Schema)
            {
                output.Set(column.Name, record[column.Name]);
            }
            yield return output.AsReadOnly();
        }
    }
}
/// <summary>Parses the whole input as JSON and yields one row per object selected by rowpath.</summary>
public override IEnumerable<IRow> Extract(IUnstructuredReader input, IUpdatableRow output)
{
    // Json.Net
    using (var reader = new JsonTextReader(new StreamReader(input.BaseStream)))
    {
        // Parse the document in one go.
        // TODO: Json.Net fails with empty input files
        JToken root = JToken.ReadFrom(reader);

        // Each selected JSON object becomes one output row;
        // its fields are mapped onto columns by JObjectToRow.
        foreach (JObject obj in SelectChildren(root, this.rowpath))
        {
            this.JObjectToRow(obj, output);
            yield return output.AsReadOnly();
        }
    }
}
/// <summary>
/// Loads the whole input into an XmlDocument, selects nodes with the configured
/// XPath expression, and maps each matched node's child elements onto output
/// columns of the same name.
/// </summary>
/// <param name="input">Wrapper for the raw input stream.</param>
/// <param name="output">Mutable row builder; AsReadOnly() snapshots each row.</param>
/// <returns>One IRow per node matched by the XPath expression.</returns>
public override IEnumerable<IRow> Extract(IUnstructuredReader input, IUpdatableRow output)
{
    // Fix: removed the unused local `List<IRow> rows` from the original.
    // NOTE: XmlDocument is a DOM parser, so the entire input is held in memory.
    XmlDocument xmlDocument = new XmlDocument();
    xmlDocument.Load(input.BaseStream);

    foreach (XmlNode xmlNode in xmlDocument.DocumentElement.SelectNodes(this.m_XPath))
    {
        foreach (IColumn col in output.Schema)
        {
            XmlNode xml = xmlNode.SelectSingleNode(col.Name);
            if (xml != null)
            {
                object val = Convert.ChangeType(xml.InnerXml, col.Type);
                output.Set(col.Name, val);
            }
            // NOTE(review): columns with no matching child element are never
            // reset, so they carry over the value from the previous row —
            // TODO confirm this is intended.
        }
        yield return output.AsReadOnly();
    }
}
/// <summary>
/// Maps one fixed-width line onto the row: each entry in _fieldMap is a
/// (start offset, length) pair cut from the line and written to the output
/// column at the same ordinal position.
/// </summary>
/// <param name="line">The raw input line.</param>
/// <param name="row">Row builder receiving one value per field mapping.</param>
protected virtual void LineToRow(string line, IUpdatableRow row)
{
    int index = 0;
    foreach (var map in _fieldMap)
    {
        // Field falls (partly) past the end of the line: skip it;
        // the column keeps whatever value it already holds.
        if (line.Length < map.Key + map.Value)
        {
            index++;
            continue;
        }

        // Fix: the original fell through to row.Set for ordinals beyond the
        // output schema, which would fail at runtime; skip such mappings instead.
        if (index >= row.Schema.Count)
        {
            index++;
            continue;
        }

        string slice = line.Substring(map.Key, map.Value);
        if (row.Schema[index].Type != typeof(string))
        {
            // Non-string columns go through the type's TypeConverter; fields
            // without a usable converter are silently skipped (original behavior).
            var typeConverter = TypeDescriptor.GetConverter(row.Schema[index].Type);
            if (typeConverter != null && typeConverter.CanConvertFrom(typeof(string)))
            {
                row.Set(index, typeConverter.ConvertFromString(slice));
            }
        }
        else
        {
            row.Set(index, slice);
        }

        index++;
    }
}
// IRow Process(IRow input, IUpdatableRow output)
//
// Actual implementation of the user-defined processor. Overwrites the Process method of IProcessor.
//
// Flattens SqlMap<string, string> input columns: any output column whose name matches
// a key in one of the input maps receives that key's value, and the consumed keys are
// removed from the map before the (reduced) map is copied to the output.
public override IRow Process(IRow input, IUpdatableRow output)
{
    // Collect the names of all input columns typed SqlMap<string, ...> with string keys;
    // these are the maps whose entries may be lifted into dedicated output columns.
    List<string> list = new List<string>();
    foreach (var current in input.Schema)
    {
        if (current.Type.IsGenericType && current.Type.GetGenericTypeDefinition() == typeof(SqlMap) && current.Type.GetGenericArguments()[0] == typeof(string))
        {
            list.Add(current.Name);
        }
    }

    // Tracks, per map column, which keys were promoted to dedicated output columns
    // (so they can be removed from that map in the final pass below).
    Dictionary<string, ArrayList> maps_to_be_changed = new Dictionary<string, ArrayList>();

    foreach (var current2 in output.Schema)
    {
        bool flag = list.Contains(current2.Name);
        if (-1 < input.Schema.IndexOf(current2.Name) && !flag)
        {
            // Plain pass-through: output column exists in the input and is not a map.
            output.Set<object>(current2.Name, input.Get<object>(current2.Name));
        }
        else if (!flag)
        {
            // Output column absent from the input: search each map for a key with
            // the column's name and promote that key's value into the column.
            foreach (string current3 in list)
            {
                SqlMap<string, string> sqlMap = input.Get<SqlMap<string, string>>(current3);
                SqlArray<string> sqlArray = null;
                List<string> list2 = null;
                if (sqlMap != null)
                {
                    sqlArray = sqlMap.Keys;
                    if (sqlMap.Values != null)
                    {
                        list2 = sqlMap.Values.ToList<string>();
                    }
                }

                // Position of the column's name within this map's keys (-1 if absent).
                int num = (sqlArray == null) ? -1 : sqlArray.ToList<string>().IndexOf(current2.Name);
                if (num != -1)
                {
                    output.Set<string>(current2.Name, list2[num]);
                    // Remember that this key was consumed from this map.
                    if (maps_to_be_changed.Keys.Contains(current3))
                    {
                        maps_to_be_changed[current3].Add(current2.Name);
                    }
                    else
                    {
                        maps_to_be_changed.Add(current3, new ArrayList { current2.Name });
                    }
                    break;
                }

                // Key not found in this map: default-initialize the column
                // (may be overwritten if a later map contains the key).
                output.Set<object>(current2.Name, current2.Type.IsValueType ?
                    Activator.CreateInstance(current2.Type) : null);
            }
        }
    }

    // Final pass: copy each map column to the output, first filtering out any
    // keys that were promoted to dedicated columns above.
    using (IEnumerator<IColumn> enumerator = output.Schema.GetEnumerator())
    {
        while (enumerator.MoveNext())
        {
            IColumn out_col = enumerator.Current;
            bool flag = list.Contains(out_col.Name);
            if (flag)
            {
                SqlMap<string, string> sqlMap = input.Get<SqlMap<string, string>>(out_col.Name);
                if (maps_to_be_changed != null && maps_to_be_changed.Keys.Contains(out_col.Name))
                {
                    sqlMap = new SqlMap<string, string>(
                        from kvp in sqlMap
                        where !maps_to_be_changed[out_col.Name].Contains(kvp.Key)
                        select kvp);
                }
                output.Set<SqlMap<string, string>>(out_col.Name, sqlMap);
            }
        }
    }
    return output.AsReadOnly();
}
/// <summary>Extract is called at least once per instance</summary>
/// <param name="input">Wrapper for a Stream</param>
/// <param name="output">IUpdatableRow uses a mutable builder pattern --
/// set individual fields with IUpdatableRow.Set, then build an immutable IRow by
/// calling IUpdatableRow.AsReadOnly.</param>
/// <returns>IEnumerable of IRow, one IRow per SQLIP row.</returns>
public override IEnumerable<IRow> Extract(IUnstructuredReader input, IUpdatableRow output)
{
    // Make sure that all requested columns are of type string
    IColumn column = output.Schema.FirstOrDefault(col => col.Type != typeof(string));
    if (column != null)
    {
        throw new ArgumentException(string.Format("Column '{0}' must be of type 'string', not '{1}'", column.Name, column.Type.Name));
    }

    // Three-state parser: looking for a row element, looking for a column
    // element within a row, or accumulating a column's inner XML as its value.
    var state = new ParseState();
    state.ClearAndJump(ParseLocation.Row);
    using (var reader = XmlReader.Create(input.BaseStream))
    {
        while (reader.Read())
        {
            switch (state.Location)
            {
                case ParseLocation.Row:
                    // when looking for a new row, we are only interested in elements
                    // whose name matches the requested row element
                    if (reader.NodeType == XmlNodeType.Element && reader.Name == this.rowPath)
                    {
                        // when found, clear the IUpdatableRow's memory
                        // (there is no provided Clear method)
                        for (int i = 0; i < output.Schema.Count; i++)
                        {
                            output.Set<string>(i, null);
                        }
                        state.ClearAndJump(ParseLocation.Column);
                    }
                    break;
                case ParseLocation.Column:
                    // When looking for a new column, we are interested in elements
                    // whose name is a key in the columnPaths map or
                    // whose name is in the requested output schema.
                    // This indicates a column whose value needs to be read,
                    // so prepare for reading it by clearing elementValue.
                    if (reader.NodeType == XmlNodeType.Element && (this.columnPaths.ContainsKey(reader.Name) || output.Schema.Select(c => c.Name).Contains(reader.Name)))
                    {
                        if (reader.IsEmptyElement)
                        {
                            // For an empty element, set an empty string
                            // and immediately jump to looking for the next column.
                            // NOTE(review): if reader.Name matched only via the output
                            // schema (not columnPaths), this indexer access will throw
                            // unless columnPaths is a map whose indexer returns null
                            // for missing keys — TODO confirm columnPaths' type covers
                            // every schema column or tolerates missing keys.
                            output.Set(this.columnPaths[reader.Name] ?? reader.Name, state.ReadElementValue());
                            state.ClearAndJump(ParseLocation.Column);
                        }
                        else
                        {
                            state.Location = ParseLocation.Data;
                            state.ElementName = reader.Name;
                            state.ClearElementValue();
                        }
                    }
                    else if (reader.NodeType == XmlNodeType.EndElement && reader.Name == this.rowPath)
                    {
                        // The other interesting case is an end element whose name matches
                        // the current row element. This indicates the end of a row,
                        // so yield the now-complete row and jump to looking for
                        // another row.
                        yield return output.AsReadOnly();
                        state.ClearAndJump(ParseLocation.Row);
                    }
                    break;
                case ParseLocation.Data:
                    // Most of the code for reading the value of a column
                    // deals with re-creating the inner XML from discrete elements.
                    // The only jump occurs when the reader hits an end element
                    // whose name matches the current column. In this case, we
                    // need to write the accumulated value to the appropriate
                    // column in the output row.
                    switch (reader.NodeType)
                    {
                        case XmlNodeType.EndElement:
                            if (reader.Name == state.ElementName)
                            {
                                // End of the current column: flush the accumulated value.
                                // NOTE(review): same columnPaths-indexer concern as in
                                // the empty-element branch above.
                                output.Set(this.columnPaths[state.ElementName] ??
                                    state.ElementName, state.ReadElementValue());
                                state.ClearAndJump(ParseLocation.Column);
                            }
                            else
                            {
                                state.ElementWriter.WriteEndElement();
                            }
                            break;
                        case XmlNodeType.Element:
                            // Nested element: reproduce it (with attributes) in the buffer.
                            state.ElementWriter.WriteStartElement(reader.Name);
                            state.ElementWriter.WriteAttributes(reader, false);
                            if (reader.IsEmptyElement)
                            {
                                state.ElementWriter.WriteEndElement();
                            }
                            break;
                        case XmlNodeType.CDATA:
                            state.ElementWriter.WriteCData(reader.Value);
                            break;
                        case XmlNodeType.Comment:
                            state.ElementWriter.WriteComment(reader.Value);
                            break;
                        case XmlNodeType.ProcessingInstruction:
                            state.ElementWriter.WriteProcessingInstruction(reader.Name, reader.Value);
                            break;
                        default:
                            // Text and any other node types are written verbatim.
                            state.ElementWriter.WriteString(reader.Value);
                            break;
                    }
                    break;
                default:
                    throw new NotImplementedException("StreamFromXml has not implemented a new member of the ParseLocation enum");
            }
        }

        // Ending anywhere other than the row-scanning state means the document
        // was truncated mid-row or mid-column.
        if (state.Location != ParseLocation.Row)
        {
            throw new ArgumentException("XML document ended without proper closing tags");
        }
    }
}
/// <summary>Copies matching JSON fields from the object into the row, column by column.</summary>
protected virtual void JObjectToRow(JObject o, IUpdatableRow row)
{
    foreach (var column in row.Schema)
    {
        // Each JSON payload can contain more or fewer fields than the row schema:
        // columns absent from the payload (or explicitly null) fall back to the
        // column's default value, and extra payload fields are simply ignored.
        object value = column.DefaultValue;

        JToken token;
        if (o.TryGetValue(column.Name, out token) && token != null)
        {
            // Data conversion is delegated to Json.Net. For conversions beyond
            // what Json.Net supports, project explicitly in the script instead:
            //   SELECT DateTime.Parse(datetime) AS datetime, ...
            // Json.Net can incorrectly return null even for some non-nullable
            // types (e.g. sbyte), so fall back to the column default then too.
            value = JsonFunctions.ConvertToken(token, column.Type) ?? column.DefaultValue;
        }

        row.Set<object>(column.Name, value);
    }
}
/// <summary>Reads the whole input stream as one image and emits it as a single byte[] row.</summary>
/// <param name="input">Wrapper for the raw image stream.</param>
/// <param name="output">Mutable row builder; column 0 receives the image bytes.</param>
/// <returns>Exactly one IRow containing the image as a byte array.</returns>
public override IEnumerable<IRow> Extract(IUnstructuredReader input, IUpdatableRow output)
{
    byte[] imageBytes = ImageOps.GetByteArrayforImage(input.BaseStream);
    output.Set<byte[]>(0, imageBytes);
    yield return output.AsReadOnly();
}
// IRow Process(IRow input, IUpdatableRow output)
//
// Translates the "country" column to English via the CountryTranslation map;
// country names without a translation pass through unchanged.
public override IRow Process(IRow input, IUpdatableRow output)
{
    string country = input.Get<string>("country");

    // Fix: single dictionary lookup via TryGetValue — the original did
    // Keys.Contains followed by the indexer, i.e. two lookups per row.
    string translated;
    if (EnglishCountryNames.CountryTranslation.TryGetValue(country, out translated))
    {
        country = translated;
    }

    output.Set<string>("country", country);
    return output.AsReadOnly();
}
/// <summary/>
// Streaming JSON extractor: walks the token stream with JsonTextReader and fills
// output columns whose names match object property names. Property values that
// are themselves objects or arrays are re-serialized into a JSON string for
// string-typed columns. A row is yielded each time a top-level object closes.
public override IEnumerable<IRow> Extract(IUnstructuredReader input, IUpdatableRow output)
{
    // Empty input produces no rows.
    if (input.Length == 0) yield break;
    using (var reader = new JsonTextReader(new StreamReader(input.BaseStream)))
    {
        // Column currently being populated (null between properties).
        IColumn currentColumn = null;
        // Accumulates re-serialized JSON for object/array-valued string columns.
        StringBuilder valueBuilder = null;
        JsonTextWriter writer = null;
        // "Global" counters track nesting depth in the source document;
        // "local" counters track depth inside the value being re-serialized.
        var startedGlobalObjects = 0;
        var startedLocalObjects = 0;
        var startedGlobalArrays = 0;
        var startedLocalArrays = 0;
        while (reader.Read())
        {
            switch (reader.TokenType)
            {
                case JsonToken.StartArray:
                    startedGlobalArrays++;
                    // Only string columns capture nested arrays (as JSON text).
                    if (currentColumn != null && currentColumn.Type == typeof(string))
                    {
                        if (writer == null)
                        {
                            valueBuilder = new StringBuilder();
                            writer = new JsonTextWriter(new StringWriter(valueBuilder));
                        }
                        startedLocalArrays++;
                        writer.WriteStartArray();
                    }
                    break;
                case JsonToken.EndArray:
                    startedGlobalArrays--;
                    if (writer != null)
                    {
                        startedLocalArrays--;
                        writer.WriteEndArray();
                    }
                    // Nested value fully closed: flush the captured JSON text.
                    if (currentColumn != null && valueBuilder != null && startedLocalArrays == 0 && startedLocalObjects == 0)
                    {
                        output.Set(currentColumn.Name, valueBuilder.ToString());
                        writer = null;
                        valueBuilder = null;
                        currentColumn = null;
                    }
                    // Closing the outermost array ends the whole extraction.
                    if (startedGlobalArrays == 0)
                    {
                        yield break;
                    }
                    break;
                case JsonToken.StartObject:
                    startedGlobalObjects++;
                    // Only string columns capture nested objects (as JSON text).
                    if (currentColumn != null && currentColumn.Type == typeof(string))
                    {
                        if (writer == null)
                        {
                            valueBuilder = new StringBuilder();
                            writer = new JsonTextWriter(new StringWriter(valueBuilder));
                        }
                        startedLocalObjects++;
                        writer.WriteStartObject();
                    }
                    break;
                case JsonToken.EndObject:
                    startedGlobalObjects--;
                    if (writer != null)
                    {
                        startedLocalObjects--;
                        writer.WriteEndObject();
                    }
                    // Nested value fully closed: flush the captured JSON text.
                    if (currentColumn != null && valueBuilder != null && startedLocalArrays == 0 && startedLocalObjects == 0)
                    {
                        output.Set(currentColumn.Name, valueBuilder.ToString());
                        writer = null;
                        valueBuilder = null;
                        currentColumn = null;
                    }
                    // A top-level object closing means one complete row.
                    if (startedGlobalObjects == 0) yield return output.AsReadOnly();
                    break;
                case JsonToken.PropertyName:
                    if (writer != null)
                    {
                        // Inside a captured nested value: reproduce the property.
                        writer.WritePropertyName(reader.Value.ToString());
                    }
                    else
                    {
                        // Top-level property: bind it to a schema column, or skip
                        // its entire value if no column matches.
                        var currentPropertyName = reader.Value.ToString();
                        currentColumn = output.Schema
                            .FirstOrDefault(s => s.Name == currentPropertyName);
                        if (currentColumn == null) reader.Skip();
                    }
                    break;
                case JsonToken.String:
                case JsonToken.Boolean:
                case JsonToken.Bytes:
                case JsonToken.Date:
                case JsonToken.Integer:
                case JsonToken.Float:
                    if (writer != null)
                    {
                        // Inside a captured nested value: reproduce the scalar.
                        writer.WriteValue(reader.Value);
                    }
                    else if (currentColumn != null)
                    {
                        // Scalar column value: convert via TypeConverter when
                        // possible, otherwise store the raw token value.
                        var typeConverter = TypeDescriptor.GetConverter(currentColumn.Type);
                        if (typeConverter != null && typeConverter.CanConvertFrom(reader.ValueType))
                        {
                            output.Set(currentColumn.Name, typeConverter.ConvertFrom(reader.Value));
                        }
                        else output.Set(currentColumn.Name, reader.Value);
                        currentColumn = null;
                    }
                    break;
                case JsonToken.Null:
                    if (writer != null)
                    {
                        writer.WriteNull();
                    }
                    else if (currentColumn != null)
                    {
                        // JSON null maps to the column's default value.
                        output.Set(currentColumn.Name, currentColumn.DefaultValue);
                        currentColumn = null;
                    }
                    break;
                case JsonToken.StartConstructor:
                    writer?.WriteStartConstructor(reader.Value.ToString());
                    break;
                case JsonToken.EndConstructor:
                    writer?.WriteEndConstructor();
                    break;
                case JsonToken.Comment:
                    writer?.WriteComment(reader.Value.ToString());
                    break;
                case JsonToken.Raw:
                    writer?.WriteRaw(reader.Value.ToString());
                    break;
                case JsonToken.None:
                case JsonToken.Undefined:
                    // ignore
                    break;
                default:
                    throw new NotImplementedException();
            }
        }
        // NOTE(review): this trailing empty-bodied while appears to be dead code —
        // after reader.Read() returns false the TokenType is None, so the loop
        // exits immediately; it may be a leftover from a former do/while.
        // TODO confirm and remove.
        while (reader.TokenType != JsonToken.None);
    }
}