/// <summary>
/// Streams rows out of an XML document read from <paramref name="input"/>.
/// Every element named <c>rowPath</c> produces one row; inside a row, each element whose
/// name is a key of <c>columnPaths</c> or the name of a column in the output schema
/// supplies the (string) value of one column. Extract is called at least once per instance.
/// </summary>
/// <param name="input">Wrapper for a Stream; the XML is read from <c>input.BaseStream</c>.</param>
/// <param name="output">IUpdatableRow uses a mutable builder pattern --
/// set individual fields with IUpdatableRow.Set, then build an immutable IRow by
/// calling IUpdatableRow.AsReadOnly.</param>
/// <returns>IEnumerable of IRow, one IRow per row element found in the document.
/// Rows are produced lazily while the document is being read.</returns>
/// <exception cref="ArgumentException">
/// A column of the output schema is not of type string, or the document ended
/// while a row or a column value was still open.
/// </exception>
public override IEnumerable<IRow> Extract(IUnstructuredReader input, IUpdatableRow output)
{
    // Make sure that all requested columns are of type string
    IColumn column = output.Schema.FirstOrDefault(col => col.Type != typeof(string));
    if (column != null)
    {
        throw new ArgumentException(string.Format("Column '{0}' must be of type 'string', not '{1}'", column.Name, column.Type.Name));
    }

    // The schema is fixed for the duration of the extraction, so collect the column names
    // once instead of scanning the schema again for every XML node.
    var schemaColumnNames = new HashSet<string>(output.Schema.Select(c => c.Name));

    // Depth of the column element whose value is currently being accumulated.
    // Only meaningful while state.Location == ParseLocation.Data.
    int columnDepth = -1;

    var state = new ParseState();
    state.ClearAndJump(ParseLocation.Row);

    using (var reader = XmlReader.Create(input.BaseStream))
    {
        while (reader.Read())
        {
            switch (state.Location)
            {
                case ParseLocation.Row:
                    // When looking for a new row, we are only interested in elements
                    // whose name matches the requested row element.
                    if (reader.NodeType == XmlNodeType.Element && reader.Name == this.rowPath)
                    {
                        // When found, clear the IUpdatableRow's memory
                        // (there is no provided Clear method).
                        for (int i = 0; i < output.Schema.Count; i++)
                        {
                            output.Set<string>(i, null);
                        }

                        if (reader.IsEmptyElement)
                        {
                            // A self-closing row (<row/>) never produces an EndElement node,
                            // so waiting for one would swallow the next row. Emit the cleared
                            // row right away -- the same result as <row></row> -- and keep
                            // looking for the next row.
                            yield return output.AsReadOnly();
                        }
                        else
                        {
                            state.ClearAndJump(ParseLocation.Column);
                        }
                    }

                    break;

                case ParseLocation.Column:
                    // When looking for a new column, we are interested in elements
                    // whose name is a key in the columnPaths map or
                    // whose name is in the requested output schema.
                    // This indicates a column whose value needs to be read,
                    // so prepare for reading it by clearing elementValue.
                    if (reader.NodeType == XmlNodeType.Element
                        && (this.columnPaths.ContainsKey(reader.Name) || schemaColumnNames.Contains(reader.Name)))
                    {
                        if (reader.IsEmptyElement)
                        {
                            // For an empty element, set an empty string
                            // and immediately jump to looking for the next column.
                            // NOTE(review): assumes the columnPaths indexer yields null (rather than
                            // throwing) for an element name that is only present in the schema -- confirm.
                            output.Set(this.columnPaths[reader.Name] ?? reader.Name, state.ReadElementValue());
                            state.ClearAndJump(ParseLocation.Column);
                        }
                        else
                        {
                            state.Location = ParseLocation.Data;
                            state.ElementName = reader.Name;
                            state.ClearElementValue();

                            // Remember where this column starts so that only its own
                            // closing tag ends it (see ParseLocation.Data below).
                            columnDepth = reader.Depth;
                        }
                    }
                    else if (reader.NodeType == XmlNodeType.EndElement && reader.Name == this.rowPath)
                    {
                        // The other interesting case is an end element whose name matches
                        // the current row element. This indicates the end of a row,
                        // so yield the now-complete row and jump to looking for
                        // another row.
                        yield return output.AsReadOnly();
                        state.ClearAndJump(ParseLocation.Row);
                    }

                    break;

                case ParseLocation.Data:
                    // Most of the code for reading the value of a column
                    // deals with re-creating the inner XML from discrete elements.
                    // The only jump occurs when the reader hits the end element
                    // that closes the current column. In this case, we
                    // need to write the accumulated value to the appropriate
                    // column in the output row.
                    switch (reader.NodeType)
                    {
                        case XmlNodeType.EndElement:
                            // An end element is reported at the same depth as its start element,
                            // so comparing depths keeps a nested descendant that happens to share
                            // the column's name from terminating the column prematurely.
                            if (reader.Name == state.ElementName && reader.Depth == columnDepth)
                            {
                                output.Set(this.columnPaths[state.ElementName] ?? state.ElementName, state.ReadElementValue());
                                state.ClearAndJump(ParseLocation.Column);
                            }
                            else
                            {
                                state.ElementWriter.WriteEndElement();
                            }

                            break;

                        case XmlNodeType.Element:
                            state.ElementWriter.WriteStartElement(reader.Name);
                            state.ElementWriter.WriteAttributes(reader, false);
                            if (reader.IsEmptyElement)
                            {
                                // No EndElement node follows an empty element, so close it here.
                                state.ElementWriter.WriteEndElement();
                            }

                            break;

                        case XmlNodeType.CDATA:
                            state.ElementWriter.WriteCData(reader.Value);
                            break;

                        case XmlNodeType.Comment:
                            state.ElementWriter.WriteComment(reader.Value);
                            break;

                        case XmlNodeType.ProcessingInstruction:
                            state.ElementWriter.WriteProcessingInstruction(reader.Name, reader.Value);
                            break;

                        default:
                            // Every remaining node type is copied as plain string content.
                            state.ElementWriter.WriteString(reader.Value);
                            break;
                    }

                    break;

                default:
                    throw new NotImplementedException("StreamFromXml has not implemented a new member of the ParseLocation enum");
            }
        }

        // Running out of input anywhere but between rows means a row or column was left open.
        if (state.Location != ParseLocation.Row)
        {
            throw new ArgumentException("XML document ended without proper closing tags");
        }
    }
}
/// <summary>
/// Streams rows out of an XML document read from <paramref name="input"/>.
/// Every element named <c>rowPath</c> produces one row; inside a row, each element whose
/// name is a key of <c>columnPaths</c> or the name of a column in the output schema
/// supplies the (string) value of one column. Extract is called at least once per instance.
/// </summary>
/// <param name="input">Wrapper for a Stream; the XML is read from <c>input.BaseStream</c>.</param>
/// <param name="output">IUpdatableRow uses a mutable builder pattern --
/// set individual fields with IUpdatableRow.Set, then build an immutable IRow by
/// calling IUpdatableRow.AsReadOnly.</param>
/// <returns>IEnumerable of IRow, one IRow per row element found in the document.
/// Rows are produced lazily while the document is being read.</returns>
/// <exception cref="ArgumentException">
/// A column of the output schema is not of type string, or the document ended
/// while a row or a column value was still open.
/// </exception>
public override IEnumerable<IRow> Extract(IUnstructuredReader input, IUpdatableRow output)
{
    // Make sure that all requested columns are of type string
    IColumn column = output.Schema.FirstOrDefault(col => col.Type != typeof(string));
    if (column != null)
    {
        throw new ArgumentException(string.Format("Column '{0}' must be of type 'string', not '{1}'", column.Name, column.Type.Name));
    }

    // The schema is fixed for the duration of the extraction, so collect the column names
    // once instead of scanning the schema again for every XML node.
    var schemaColumnNames = new HashSet<string>(output.Schema.Select(c => c.Name));

    // Depth of the column element whose value is currently being accumulated.
    // Only meaningful while state.Location == ParseLocation.Data.
    int columnDepth = -1;

    var state = new ParseState();
    state.ClearAndJump(ParseLocation.Row);

    using (var reader = XmlReader.Create(input.BaseStream))
    {
        while (reader.Read())
        {
            switch (state.Location)
            {
                case ParseLocation.Row:
                    // When looking for a new row, we are only interested in elements
                    // whose name matches the requested row element.
                    if (reader.NodeType == XmlNodeType.Element && reader.Name == this.rowPath)
                    {
                        // When found, clear the IUpdatableRow's memory
                        // (there is no provided Clear method).
                        for (int i = 0; i < output.Schema.Count; i++)
                        {
                            output.Set<string>(i, null);
                        }

                        if (reader.IsEmptyElement)
                        {
                            // A self-closing row (<row/>) never produces an EndElement node,
                            // so waiting for one would swallow the next row. Emit the cleared
                            // row right away -- the same result as <row></row> -- and keep
                            // looking for the next row.
                            yield return output.AsReadOnly();
                        }
                        else
                        {
                            state.ClearAndJump(ParseLocation.Column);
                        }
                    }

                    break;

                case ParseLocation.Column:
                    // When looking for a new column, we are interested in elements
                    // whose name is a key in the columnPaths map or
                    // whose name is in the requested output schema.
                    // This indicates a column whose value needs to be read,
                    // so prepare for reading it by clearing elementValue.
                    if (reader.NodeType == XmlNodeType.Element
                        && (this.columnPaths.ContainsKey(reader.Name) || schemaColumnNames.Contains(reader.Name)))
                    {
                        if (reader.IsEmptyElement)
                        {
                            // For an empty element, set an empty string
                            // and immediately jump to looking for the next column.
                            // NOTE(review): assumes the columnPaths indexer yields null (rather than
                            // throwing) for an element name that is only present in the schema -- confirm.
                            output.Set(this.columnPaths[reader.Name] ?? reader.Name, state.ReadElementValue());
                            state.ClearAndJump(ParseLocation.Column);
                        }
                        else
                        {
                            state.Location = ParseLocation.Data;
                            state.ElementName = reader.Name;
                            state.ClearElementValue();

                            // Remember where this column starts so that only its own
                            // closing tag ends it (see ParseLocation.Data below).
                            columnDepth = reader.Depth;
                        }
                    }
                    else if (reader.NodeType == XmlNodeType.EndElement && reader.Name == this.rowPath)
                    {
                        // The other interesting case is an end element whose name matches
                        // the current row element. This indicates the end of a row,
                        // so yield the now-complete row and jump to looking for
                        // another row.
                        yield return output.AsReadOnly();
                        state.ClearAndJump(ParseLocation.Row);
                    }

                    break;

                case ParseLocation.Data:
                    // Most of the code for reading the value of a column
                    // deals with re-creating the inner XML from discrete elements.
                    // The only jump occurs when the reader hits the end element
                    // that closes the current column. In this case, we
                    // need to write the accumulated value to the appropriate
                    // column in the output row.
                    switch (reader.NodeType)
                    {
                        case XmlNodeType.EndElement:
                            // An end element is reported at the same depth as its start element,
                            // so comparing depths keeps a nested descendant that happens to share
                            // the column's name from terminating the column prematurely.
                            if (reader.Name == state.ElementName && reader.Depth == columnDepth)
                            {
                                output.Set(this.columnPaths[state.ElementName] ?? state.ElementName, state.ReadElementValue());
                                state.ClearAndJump(ParseLocation.Column);
                            }
                            else
                            {
                                state.ElementWriter.WriteEndElement();
                            }

                            break;

                        case XmlNodeType.Element:
                            state.ElementWriter.WriteStartElement(reader.Name);
                            state.ElementWriter.WriteAttributes(reader, false);
                            if (reader.IsEmptyElement)
                            {
                                // No EndElement node follows an empty element, so close it here.
                                state.ElementWriter.WriteEndElement();
                            }

                            break;

                        case XmlNodeType.CDATA:
                            state.ElementWriter.WriteCData(reader.Value);
                            break;

                        case XmlNodeType.Comment:
                            state.ElementWriter.WriteComment(reader.Value);
                            break;

                        case XmlNodeType.ProcessingInstruction:
                            state.ElementWriter.WriteProcessingInstruction(reader.Name, reader.Value);
                            break;

                        default:
                            // Every remaining node type is copied as plain string content.
                            state.ElementWriter.WriteString(reader.Value);
                            break;
                    }

                    break;

                default:
                    throw new NotImplementedException("StreamFromXml has not implemented a new member of the ParseLocation enum");
            }
        }

        // Running out of input anywhere but between rows means a row or column was left open.
        if (state.Location != ParseLocation.Row)
        {
            throw new ArgumentException("XML document ended without proper closing tags");
        }
    }
}