/// <summary>
/// Adds two JSON documents to one data set through a single extractor and checks
/// the string form of each resulting row, including the <c>&lt;null&gt;</c>
/// placeholders expected in the first row.
/// </summary>
public void Read_multiple_with_missing_data()
{
    // Nested "population" structure with three nullable integer members.
    var population = new StructField("population",
        new DataField<int?>("year"),
        new DataField<int?>("amount"),
        new DataField<int?>("diff"));

    var schema = new Schema(
        new DataField<int?>("id"),
        new DataField<string>("country"),
        population,
        new DataField<string>("comment"));

    var extractor = new JsonDataExtractor(schema);

    // Both documents are parsed up front, before any row is added.
    JObject firstDocument = JObject.Parse(ReadJson("infer00.json"));
    JObject secondDocument = JObject.Parse(ReadJson("infer01.json"));

    var dataSet = new DataSet(schema);
    extractor.AddRow(dataSet, firstDocument);
    extractor.AddRow(dataSet, secondDocument);

    Assert.Equal(2, dataSet.RowCount);

    // First row is expected to render <null> for "diff" and "comment".
    Assert.Equal("{123;UK;{2016;111;<null>};<null>}", dataSet[0].ToString());
    Assert.Equal("{123;UK;{2017;222;111};no comments}", dataSet[1].ToString());
}
/// <summary>
/// Converts a JSON object into a <see cref="DataSet"/> built on the supplied schema.
/// </summary>
/// <param name="jObject">JSON object that is added to the new data set as a row.</param>
/// <param name="schema">Schema used to create both the data set and the data extractor.</param>
/// <returns>A new data set created from <paramref name="schema"/> to which <paramref name="jObject"/> has been added.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="schema"/> or <paramref name="jObject"/> is <c>null</c>.
/// </exception>
public static DataSet ToParquetDataSet(this JObject jObject, PSchema schema)
{
    // Schema is checked first so the exception raised when both arguments are
    // null stays the same as before the jObject guard was introduced.
    if (schema == null)
    {
        throw new ArgumentNullException(nameof(schema));
    }

    // An extension method can be invoked on a null receiver; fail at the API
    // boundary instead of somewhere inside the extractor.
    if (jObject == null)
    {
        throw new ArgumentNullException(nameof(jObject));
    }

    // Convert data.
    var dataExtractor = new JsonDataExtractor(schema);
    var ds = new DataSet(schema);
    dataExtractor.AddRow(ds, jObject);

    return ds;
}
/// <summary>
/// Converts a JSON object into a <see cref="DataSet"/>, deriving the schema from
/// the object itself via <c>JsonSchemaExtractor</c>.
/// </summary>
/// <param name="jObject">JSON object that is analyzed for its schema and then added to the new data set as a row.</param>
/// <returns>A new data set created from the extracted schema to which <paramref name="jObject"/> has been added.</returns>
/// <exception cref="System.ArgumentNullException"><paramref name="jObject"/> is <c>null</c>.</exception>
public static DataSet ToParquetDataSet(this JObject jObject)
{
    // An extension method can be invoked on a null receiver; fail at the API
    // boundary instead of somewhere inside schema analysis.
    if (jObject == null)
    {
        throw new System.ArgumentNullException(nameof(jObject));
    }

    // Extract schema.
    var schemaExtractor = new JsonSchemaExtractor();
    schemaExtractor.Analyze(jObject);
    PSchema schema = schemaExtractor.GetSchema();

    // Convert data.
    var dataExtractor = new JsonDataExtractor(schema);
    var ds = new DataSet(schema);
    dataExtractor.AddRow(ds, jObject);

    return ds;
}
//[Fact]
/// <summary>
/// Scratch test (attribute left commented out): parses up to 1000 JSON files from
/// a hard-coded local folder, infers a schema across them, loads every document
/// into one data set and writes that data set to a hard-coded parquet file path.
/// </summary>
public void TempTest()
{
    var sourceFolder = new DirectoryInfo(@"C:\Users\ivang\Downloads\Fullfeed-20170330004044");
    FileInfo[] sourceFiles = sourceFolder.GetFiles();

    // Take(1000) caps the number of documents that are read and parsed.
    JObject[] documents = sourceFiles
        .Select(file => JObject.Parse(System.IO.File.ReadAllText(file.FullName)))
        .Take(1000)
        .ToArray();

    var inferrer = new JsonSchemaInferring();
    Schema schema = inferrer.InferSchema(documents);

    var extractor = new JsonDataExtractor(schema);
    var dataSet = new DataSet(schema);

    foreach (JObject document in documents)
    {
        extractor.AddRow(dataSet, document);
    }

    ParquetWriter.WriteFile(dataSet, "c:\\tmp\\com.parquet");
}