/// <summary>
/// Constructor. Reads the NOMIS 2011 Census catalogue, configures the data file filter and the geometry
/// weighting hints, declares the catalogue schema and loads the per-variable metadata table.
/// </summary>
public NOMIS2011Census()
{
    //Catalogue of tables, read from CatalogueFile under DataRootDir.
    CSVCatalogue catalogueReader = new CSVCatalogue();
    string cataloguePath = Path.Combine(DataRootDir, CatalogueFile);
    this.Catalogue = catalogueReader.ReadCatalogue(cataloguePath);

    //A pattern filter is used rather than FileFilterEnum.Top, which was returning the CODE0.CSV file
    //instead of the data file.
    FileFilterOptions = new FileFilter(FileFilterEnum.Pattern, "DATA.CSV");

    //Weight the geometry so that the 2011 boundaries are favoured over the older ones. For each level the
    //hints are registered in the order <level>_2011 then <level>, i.e.
    //OA_2011, OA, LSOA_2011, LSOA, MSOA_2011, MSOA.
    const float favouredWeight = 2.0f;
    const float olderWeight = 0.1f;
    string[] geometryLevels = { "OA", "LSOA", "MSOA" };
    foreach (string level in geometryLevels)
    {
        SetGeometryHint(level + "_2011", favouredWeight);
        SetGeometryHint(level, olderWeight);
    }

    //Schema describing what the catalogue columns mean. The catalogue carries two links to data:
    //"oaurl" (oa/lsoa/msoa) which is used here, and "wardurl" (wards) which is not added to the schema.
    Schema = new DatastoreSchema();
    Schema.AddField("TableName", SemanticFieldType.UniqueKey);
    Schema.AddField("Description", SemanticFieldType.Title);
    Schema.AddField("oaurl", SemanticFieldType.Link);

    //Full table of metadata about every individual variable, loaded from the variables lookup file:
    //ColumnVariableCode,ColumnVariableMeasurementUnit,ColumnVariableStatisticalUnit,ColumnVariableDescription
    //KS101EW0001,Count,Person,All categories: Sex
    //KS101EW0002,Count,Person,Males
    //Used for the short and long description text. Keyed on the variable code so that rows can be
    //looked up by code.
    CSVCatalogue variablesReader = new CSVCatalogue();
    string variablesPath = Path.Combine(DataRootDir, VariablesFile);
    VariableMetaData = variablesReader.ReadCatalogue(variablesPath);
    DataColumn variableCodeColumn = VariableMetaData.Columns["ColumnVariableCode"];
    VariableMetaData.PrimaryKey = new DataColumn[] { variableCodeColumn };
}
/// <summary>
/// Builds a mapping between the dataset index and a plain text name containing the column code, the dataset
/// table title and the variable description. The index is the zero-based position of the data row in
/// mapindex.csv (the header row is not counted). The loading steps mirror those in AnalyseCorrelationData.
/// </summary>
/// <returns>Dictionary of row index to "column-code table-title variable-description" text</returns>
public Dictionary<int, string> GetDescriptionForIndex()
{
    //Step 1: hash of unique dataset field code => plain text description, from the variables file.
    //ColumnVariableCode,ColumnVariableMeasurementUnit,ColumnVariableStatisticalUnit,ColumnVariableDescription
    //KS101EW0001,Count,Person,All categories: Sex
    //KS101EW0002,Count,Person,Males
    Dictionary<string, string> descriptionByCode = new Dictionary<string, string>();
    string variablesPath = Path.Combine(DataRootDir, "NOMIS2011Variables.txt");
    using (TextReader variablesReader = File.OpenText(variablesPath))
    {
        variablesReader.ReadLine(); //discard the header row
        for (string row = variablesReader.ReadLine(); row != null; row = variablesReader.ReadLine())
        {
            //a proper CSV parse is needed because the final column is quoted
            string[] cells = CSVCatalogue.ParseCSVLine(row);
            descriptionByCode.Add(cells[0], cells[3]);
        }
    }

    //Step 2: walk the map index file, numbering the data rows from zero.
    //major_index,minor_index,data_uri,uniquekey,title,column
    //0,0,"file:///c:/richard/wxtemp/Datastores/ks101ew_2011_oa/ks101ew_2011oa/KS101EWDATA_LSOA.csv","KS101EW","Usual Resident Population","KS101EW0001"
    //The two "major-minor" keyed dictionaries are filled in but never read by this method; they are kept
    //deliberately (note that Add on them throws if a major-minor pair is repeated in the file).
    Dictionary<string, string> columnByMapKey = new Dictionary<string, string>();
    Dictionary<string, string> titleByMapKey = new Dictionary<string, string>();
    Dictionary<int, string> descriptions = new Dictionary<int, string>();
    string mapIndexPath = Path.Combine(ImageDirectory, "mapindex.csv");
    using (TextReader mapReader = File.OpenText(mapIndexPath))
    {
        mapReader.ReadLine(); //discard the header row
        for (string row = mapReader.ReadLine(); row != null; row = mapReader.ReadLine())
        {
            string[] cells = CSVCatalogue.ParseCSVLine(row);
            string mapKey = cells[0] + "-" + cells[1];
            string columnCode = cells[5];
            string tableTitle = cells[4];
            columnByMapKey.Add(mapKey, columnCode);
            titleByMapKey.Add(mapKey, tableTitle);

            //exactly one entry is added per data row, so the current count is the next row index
            string text = columnCode + " " + tableTitle + " " + descriptionByCode[columnCode];
            descriptions.Add(descriptions.Count, text);
        }
    }

    return descriptions;
}
/// <summary>
/// Constructor. Reads the London Datastore catalogue from CatalogueFile under DataRootDir and declares
/// which catalogue columns carry the title, the long description and the data link.
/// </summary>
public LondonDatastore()
{
    CSVCatalogue catalogueReader = new CSVCatalogue();
    string cataloguePath = Path.Combine(DataRootDir, CatalogueFile);
    this.Catalogue = catalogueReader.ReadCatalogue(cataloguePath);

    //Schema: the field names in the London Datastore data that are required for processing.
    //There is no tags column in this schema.
    Schema = new DatastoreSchema();
    Schema.AddField("TITLE", SemanticFieldType.Title);
    Schema.AddField("LONGDESC", SemanticFieldType.Description);
    Schema.AddField("CSV_URL", SemanticFieldType.Link);
}
/// <summary>
/// Load the NOMIS variables file, mapindex.csv file and imatch-sorted.csv file and write out plain text
/// descriptions of everything that we think matches. One line per match is written to the debug output in
/// the form: value,column-code-I,column-code-J,"(table I) text I AND (table J) text J".
/// Numeric values are parsed and written using the invariant culture, as the files are machine generated
/// with "." as the decimal separator.
/// TODO: need some sort of datastore neutral way of doing this for everything, not just NOMIS
/// </summary>
/// <exception cref="System.FormatException">The value column of a match row is not a valid number.</exception>
/// <exception cref="System.Collections.Generic.KeyNotFoundException">
/// A match row refers to a major/minor index missing from mapindex.csv, or to a column code missing from
/// the variables file.
/// </exception>
public void AnalyseCorrelationData()
{
    //the sorted match list is only read up to this value - 20 is just about on the first knee of the curve
    const float MatchThreshold = 20.0f;
    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

    //load mapping between unique dataset field code and plain text description into hash
    //ColumnVariableCode,ColumnVariableMeasurementUnit,ColumnVariableStatisticalUnit,ColumnVariableDescription
    //KS101EW0001,Count,Person,All categories: Sex
    //KS101EW0002,Count,Person,Males
    Dictionary<string, string> variables = new Dictionary<string, string>();
    using (TextReader varsFile = File.OpenText(Path.Combine(DataRootDir, "NOMIS2011Variables.txt")))
    {
        string line = varsFile.ReadLine(); //skip header
        while ((line = varsFile.ReadLine()) != null)
        {
            string[] fields = CSVCatalogue.ParseCSVLine(line); //need to do this for the quoted final column
            variables.Add(fields[0], fields[3]);
        }
    }

    //load mapping between major/minor index and the unique column code (and the table title)
    //major_index,minor_index,data_uri,uniquekey,title,column
    //0,0,"file:///c:/richard/wxtemp/Datastores/ks101ew_2011_oa/ks101ew_2011oa/KS101EWDATA_LSOA.csv","KS101EW","Usual Resident Population","KS101EW0001"
    Dictionary<string, string> indexToFieldName = new Dictionary<string, string>();
    Dictionary<string, string> indexToTableName = new Dictionary<string, string>();
    using (TextReader mapIndexFile = File.OpenText(Path.Combine(ImageDirectory, "mapindex.csv")))
    {
        string line = mapIndexFile.ReadLine(); //skip header
        while ((line = mapIndexFile.ReadLine()) != null)
        {
            string[] fields = CSVCatalogue.ParseCSVLine(line);
            string key = fields[0] + "-" + fields[1];
            indexToFieldName.Add(key, fields[5]);
            indexToTableName.Add(key, fields[4]);
        }
    }

    //now read the data and write out plain text descriptions of the matches that are found
    //imajor, iminor, jmajor, jminor, value (would have i,j and two filenames, but had to remove them as the csvfix sort required too much memory)
    //0,1,10,1,5.90647686550526
    //The path is combined one segment at a time so that the platform's directory separator is used.
    string matchPath = Path.Combine(Path.Combine(ImageDirectory, "GreenMatch"), "imatch-sorted.csv");
    using (TextReader matchFile = File.OpenText(matchPath))
    {
        string line;
        while ((line = matchFile.ReadLine()) != null)
        {
            string[] fields = CSVCatalogue.ParseCSVLine(line);

            //parse with the invariant culture: the current culture may use "," as the decimal separator
            float value = Convert.ToSingle(fields[4], invariant);
            if (value > MatchThreshold)
            {
                break; //it's a sorted list, so nothing of interest follows
            }

            string I = fields[0] + "-" + fields[1];
            string J = fields[2] + "-" + fields[3];
            if (I == J)
            {
                continue; //filter out everything matching itself
            }

            string ITable = indexToTableName[I]; //names of the tables where the data comes from, via the major/minor indexes
            string JTable = indexToTableName[J];
            string IColumn = indexToFieldName[I]; //unique column codes from major/minor map numbers
            string JColumn = indexToFieldName[J];
            string IText = variables[IColumn]; //use the two unique column codes to lookup the text descriptions
            string JText = variables[JColumn];

            //value is formatted invariantly so that a decimal comma cannot corrupt the comma separated output
            System.Diagnostics.Debug.WriteLine(value.ToString(invariant) + "," + IColumn + "," + JColumn
                + ",\"(" + ITable + ") " + IText + " AND (" + JTable + ") " + JText + "\"");
        }
    }
}
/// <summary>
/// Constructor. Reads the GovDatastore datasets catalogue and its resources table from DataRootDir and
/// declares which columns carry the title, description, link and tags (2016 layout).
/// </summary>
public GovDatastore()
{
    //The catalogue file only uses a CR as its line ending, so the reader's line endings are overridden.
    //The same reader (and therefore the same line ending setting) is used for the resources file.
    CSVCatalogue csvReader = new CSVCatalogue { LineEndings = "\r" };

    string datasetsPath = Path.Combine(DataRootDir, CatalogueFile);
    this.Catalogue = csvReader.ReadCatalogue(datasetsPath);

    string resourcesPath = Path.Combine(DataRootDir, CatResourcesFile);
    this.ResourcesDT = csvReader.ReadCatalogue(resourcesPath);

    //As of 4 April 2016 the datasets file has these columns:
    //Name,Title,URL,Organization,Top level organisation,License,Published,NII,Location,Import source,Author,Geographic Coverage,Isopen,License,License Id,Maintainer,Mandate,Metadata Created,Metadata Modified,Notes,Odi Certificate,ODI Certificate URL,Tags,Temporal Coverage From,Temporal Coverage To,Primary Theme,Secondary Themes,Update Frequency,Version
    //and the resources file has:
    //Dataset Name,URL,Format,Description,Resource ID,Position,Date,Organization,Top level organization
    //
    //TODO: the two tables are meant to be joined on Name = Dataset Name, copying the resource columns
    //(URL as "URL2", Format, Description, Resource ID, Position, Date) onto each catalogue row. None of that
    //works yet: "Dataset Name" is not unique in the resources file (one row per resource attached to a
    //dataset), so it cannot be used as a primary key. Both tables will have to be handled and the
    //descriptions merged together somehow.
    //
    //Older (2012) layout, for reference: title / notes_rendered / resource-0-url / tags, where
    //resource-0-format is CSV (also look at RDF etc). That layout also had bbox-east-long, bbox-north-lat,
    //bbox-south-lat, bbox-west-long and spatial-reference-system, and "spatial" contains a polygon box.

    //2016 schema.
    //NOTE(review): "Description" and "URL2" are not in the datasets header listed above - they are the
    //columns the unfinished join would add ("Notes" and "URL" are the existing alternatives). Confirm how
    //the schema behaves while those columns are absent from the catalogue.
    Schema = new DatastoreSchema();
    Schema.AddField("Title", SemanticFieldType.Title);
    Schema.AddField("Description", SemanticFieldType.Description);
    Schema.AddField("URL2", SemanticFieldType.Link);
    Schema.AddField("Tags", SemanticFieldType.Tags);
}