public NOMIS2011Census()
{
    //define field names on NOMIS website catalogue page that we require for processing
    //TitleField = "Description"; //LinkField = "oaurl"; //TagsField = ""; //DescriptionField = ""; //doesn't exist
    CSVCatalogue reader = new CSVCatalogue();
    this.Catalogue = reader.ReadCatalogue(Path.Combine(DataRootDir, CatalogueFile));
    //FileFilterOptions = new FileFilter(FileFilterEnum.Top, "");
    FileFilterOptions = new FileFilter(FileFilterEnum.Pattern, "DATA.CSV"); //had to change this to prevent returning CODE0.CSV file instead

    //add weights for geometry to favour 2011 datasets over the older ones
    SetGeometryHint("OA_2011", 2.0f);
    SetGeometryHint("OA", 0.1f);
    SetGeometryHint("LSOA_2011", 2.0f);
    SetGeometryHint("LSOA", 0.1f);
    SetGeometryHint("MSOA_2011", 2.0f);
    SetGeometryHint("MSOA", 0.1f);

    //then create a schema to describe what the columns are
    Schema = new DatastoreSchema();
    Schema.AddField("TableName", SemanticFieldType.UniqueKey);
    Schema.AddField("Description", SemanticFieldType.Title);
    Schema.AddField("oaurl", SemanticFieldType.Link); //there are two links to data here - oa/lsoa/msoa or wards (below)
    //Schema.AddField("wardurl", SemanticFieldType.Link);

    //Now build a table of description text for every variable using the variables file.
    //This is a quick lookup between variable code and plain text which is used for writing out the data file.
    //This is duplicated in the data table loading below.
    //VariableNameDescriptionText = new Dictionary<string, string>();
    //using (TextReader varsFile = File.OpenText(Path.Combine(DataRootDir, VariablesFile)))
    //{
    //    string Line = varsFile.ReadLine(); //skip header
    //    while ((Line = varsFile.ReadLine()) != null)
    //    {
    //        string[] Fields = CSVCatalogue.ParseCSVLine(Line); //need to do this for the quoted final column
    //        //ColumnVariableCode,ColumnVariableMeasurementUnit,ColumnVariableStatisticalUnit,ColumnVariableDescription
    //        //KS101EW0001,Count,Person,All categories: Sex
    //        //KS101EW0002,Count,Person,Males
    //        VariableNameDescriptionText.Add(Fields[0], Fields[3]);
    //    }
    //    varsFile.Close();
    //}

    //This is a full DataTable containing all the data about each individual variable from the variable lookup:
    //ColumnVariableCode,ColumnVariableMeasurementUnit,ColumnVariableStatisticalUnit,ColumnVariableDescription
    //KS101EW0001,Count,Person,All categories: Sex
    //KS101EW0002,Count,Person,Males
    //Used for the short and long description text.
    CSVCatalogue VarCatalogue = new CSVCatalogue();
    VariableMetaData = VarCatalogue.ReadCatalogue(Path.Combine(DataRootDir, VariablesFile));
    VariableMetaData.PrimaryKey = new DataColumn[] { VariableMetaData.Columns["ColumnVariableCode"] };
}
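//Illustrative sketch (not part of the original class; the method name is hypothetical): with the
//primary key set on ColumnVariableCode above, a variable code resolves directly to its plain-text
//description via DataTable.Rows.Find. "KS101EW0002" is taken from the sample rows in the comments.
public string ExampleVariableDescription(string variableCode)
{
    DataRow Row = VariableMetaData.Rows.Find(variableCode); //e.g. "KS101EW0002"
    return (Row != null) ? Row["ColumnVariableDescription"] as string : null; //"Males" for the sample code
}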
//constructor
public LondonDatastore()
{
    //define field names in the LondonDatastore data that we require for processing
    //TitleField = "TITLE"; //LinkField = "CSV_URL"; //TagsField = ""; //DescriptionField = "LONGDESC";
    CSVCatalogue reader = new CSVCatalogue();
    this.Catalogue = reader.ReadCatalogue(Path.Combine(DataRootDir, CatalogueFile));

    //then create a schema to describe what the columns are
    Schema = new DatastoreSchema();
    Schema.AddField("TITLE", SemanticFieldType.Title);
    Schema.AddField("LONGDESC", SemanticFieldType.Description);
    Schema.AddField("CSV_URL", SemanticFieldType.Link);
}
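//Illustrative sketch (hypothetical method, not part of the original class): the point of the
//semantic schema is that generic code can find the right catalogue columns without knowing the
//store-specific names - Build() in the builder below does exactly this with schema.TitleField.
private string ExampleFirstTitle()
{
    int TitleColIdx = Catalogue.Columns.IndexOf(Schema.TitleField); //resolves to "TITLE" for this store
    return (Catalogue.Rows.Count > 0) ? Catalogue.Rows[0][TitleColIdx] as string : null;
}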
/// <summary>
/// Build the data needed to create maps on MapTube.
/// Output is a set of directories in outputDirectory, one for each file source.
/// Also included is a SQL insert file which can be used to add the maps to the database.
/// </summary>
/// <param name="outputDirectory">Root directory to write the per-source output directories and the SQL file to</param>
public void Build(string outputDirectory)
{
    this.outputDirectory = outputDirectory;
    Directory.CreateDirectory(outputDirectory);

    DataTable cat = datastore.DatastoreCatalogue;
    DatastoreSchema schema = datastore.DSSchema;
    int TitleColIdx = cat.Columns.IndexOf(schema.TitleField);
    int LinkColIdx = cat.Columns.IndexOf(schema.LinkField);
    int UniqueKeyColIdx = cat.Columns.IndexOf(schema.UniqueKeyField);

    //delete any existing sql file
    File.Delete(Path.Combine(outputDirectory, MapTubeSQLFilename));

    //todo: check whether you need to create the data staging directory here
    for (int i = 0; i < cat.Rows.Count; i++)
    {
        DataRow Row = cat.Rows[i];
        string Title = Row[TitleColIdx] as string;
        string DataLink = Row[LinkColIdx] as string;
        string UniqueKey = Row[UniqueKeyColIdx] as string; //this is only unique for the table's name and file, so we're going to need to add a column number to this
        if (string.IsNullOrEmpty(DataLink))
        {
            continue; //no data so skip
        }

        //Data staging - download to the local file system and unzip if necessary
        Uri StagedDataUri = datastore.StageData(new Uri(DataLink)); //this is either the root of the extracted zip hierarchy, or an actual file
        Uri[] StagedDataFiles = datastore.FilterDataFiles(StagedDataUri); //get a list of files under the staging area that might contain data

        //now fetch each file and analyse it
        foreach (Uri FileUri in StagedDataFiles)
        {
            //we should have a true file (not a directory) at this point, and it should be a valid type as it has been filtered (*.csv)
            if (FileUri.LocalPath.ToLower().EndsWith(".csv"))
            {
                Console.WriteLine("Staged File: " + FileUri.LocalPath);
                ProcessFile(FileUri, UniqueKey, Title);
            }
        }
    }
}
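//Usage sketch (hypothetical driver code; the builder class name and output path are assumptions -
//"datastore" is whichever store implementation the builder was constructed with, e.g. NOMIS2011Census above):
//var builder = new MapTubeBuilder(new NOMIS2011Census());
//builder.Build(@"C:\richard\wxtemp\maptube_out");
//Every staged *.csv is passed to ProcessFile() with the catalogue row's UniqueKey and Title, and the
//SQL insert statements accumulate in MapTubeSQLFilename under the output directory.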
/// <summary>
/// Build the correlation matrix for every combination of documents in the catalogue.
/// Requires a list of pdf files matching the data files.
/// </summary>
/// <param name="Catalogue">The DataTable used by the main Datastore class for its catalogue</param>
/// <param name="Schema">The Datastore.Schema property</param>
public void GenerateTextCorrelation(DataTable Catalogue, DatastoreSchema Schema)
{
    //TODO: get the hardcoded directories out!
    //KeywordProcessor kp = new KeywordProcessor(@"..\..\..\Data\glasgow_stop_words_mod.txt");
    //string PdfText = kp.GetPdfFileText(@"C:\richard\wxtemp\Datastores\CensusMetaData\ks101ew.pdf");
    //Dictionary<string, int> WordTable = kp.TextToHistogram(PdfText);
    //Dictionary<string, int> StemWordTable = kp.StemHistogram(WordTable);
    //KeywordProcessor.DebugPrintWords(StemWordTable);

    int TitleColIdx = Catalogue.Columns.IndexOf(Schema.TitleField);
    int LinkColIdx = Catalogue.Columns.IndexOf(Schema.LinkField);
    int UniqueKeyColIdx = Catalogue.Columns.IndexOf(Schema.UniqueKeyField);

    //We're going to create a vector space model (VSM) of the histograms for every document in
    //the set. This will enable the number of documents that each word appears in to be calculated
    //for the Term Frequency Inverse Document Frequency (TFIDF) method (Principles of Data Mining,
    //Max Bramer, p. 244).
    int N = Catalogue.Rows.Count;
    Dictionary<string, float>[] VSM = new Dictionary<string, float>[N];

    //load all the documents and create histograms
    for (int i = 0; i < N; i++)
    {
        DataRow Row_i = Catalogue.Rows[i];
        string Title_i = Row_i[TitleColIdx] as string;
        //string DataLink_i = Row_i[LinkColIdx] as string;
        string UniqueKey_i = Row_i[UniqueKeyColIdx] as string;
        //extract the text from the pdf for table i
        string PdfText_i = GetPdfFileText(@"C:\richard\wxtemp\Datastores\CensusMetaData\" + UniqueKey_i + ".pdf");
        Dictionary<string, float> WordTable_i = TextToHistogram(PdfText_i);
        Dictionary<string, float> StemWordTable_i = StemHistogram(WordTable_i);
        //Dictionary<string, float> NormStemWordTable_i = NormaliseBagOfWords(StemWordTable_i);
        VSM[i] = StemWordTable_i;
    }

    //OK, that's the vector space model for all the tables, now do the TFIDF weighting
    Dictionary<string, float>[] TFIDF = TermFrequencyInverseDocumentFrequency(VSM);
    //great, now I've got two copies of all the documents in memory! Lucky they're really quite small.

    //Now normalise all the weights
    for (int i = 0; i < TFIDF.Length; i++)
    {
        TFIDF[i] = NormaliseBagOfWords(TFIDF[i]);
    }

    //and finally, we have all the vectors we need to do the correlation...
    //for every i and every j, compute the products of matching TFIDF stem words
    for (int i = 0; i < N; i++)
    {
        DataRow Row_i = Catalogue.Rows[i];
        string Title_i = Row_i[TitleColIdx] as string;
        //string DataLink_i = Row_i[LinkColIdx] as string;
        string UniqueKey_i = Row_i[UniqueKeyColIdx] as string;
        for (int j = 0; j < N; j++) //NOTE: could do j=i..N, but we want a double check that the results are symmetric
        {
            DataRow Row_j = Catalogue.Rows[j];
            string Title_j = Row_j[TitleColIdx] as string;
            //string DataLink_j = Row_j[LinkColIdx] as string;
            string UniqueKey_j = Row_j[UniqueKeyColIdx] as string;
            //and finally work out the (inverse) distance i.e. the dot product
            float L = 0;
            foreach (KeyValuePair<string, float> KVP in TFIDF[i])
            {
                if (TFIDF[j].ContainsKey(KVP.Key))
                {
                    L += KVP.Value * TFIDF[j][KVP.Key]; //i*j from matching keywords
                }
            }
            //NOTE: Distance would be 1-L at this point, but we want L
            //write out the data
            System.Diagnostics.Debug.WriteLine(L + "," + i + "," + j + "," + UniqueKey_i + "," + UniqueKey_j);
        }
    }
}
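//The two helpers called above (TermFrequencyInverseDocumentFrequency and NormaliseBagOfWords) are
//defined elsewhere in this class and may differ from this. As a minimal sketch of the standard
//technique they name: weight each raw term count by log(N/df) where df is the number of documents
//containing the term (Bramer, p. 244), then divide each vector by its Euclidean length so the i*j
//products above sum to a cosine similarity in [0,1]. The method name is illustrative only, and this
//sketch combines both steps into one pass.
private static Dictionary<string, float>[] ExampleTFIDFCosineWeights(Dictionary<string, float>[] Docs)
{
    int N = Docs.Length;
    //document frequency: number of documents each stem word appears in
    Dictionary<string, int> DF = new Dictionary<string, int>();
    foreach (Dictionary<string, float> Doc in Docs)
        foreach (string Term in Doc.Keys)
            DF[Term] = DF.ContainsKey(Term) ? DF[Term] + 1 : 1;
    Dictionary<string, float>[] Weights = new Dictionary<string, float>[N];
    for (int i = 0; i < N; i++)
    {
        Weights[i] = new Dictionary<string, float>();
        double SumSq = 0;
        foreach (KeyValuePair<string, float> KVP in Docs[i])
        {
            //tf * log(N/df); a term in every document gets weight 0
            float W = KVP.Value * (float)Math.Log((double)N / DF[KVP.Key]);
            Weights[i][KVP.Key] = W;
            SumSq += (double)W * W;
        }
        //L2-normalise so a dot product of two vectors is their cosine similarity
        double Length = Math.Sqrt(SumSq);
        if (Length > 0)
            foreach (string Term in new List<string>(Weights[i].Keys))
                Weights[i][Term] = (float)(Weights[i][Term] / Length);
    }
    return Weights;
}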
public GovDatastore()
{
    //define field names in the GovDatastore data that we require for processing
    //TitleField = "title"; //LinkField = "resource-0-url"; //TagsField = "tags"; //DescriptionField = "notes_rendered";
    CSVCatalogue reader = new CSVCatalogue();
    reader.LineEndings = "\r"; //override line endings as this catalogue file only uses a CR
    this.Catalogue = reader.ReadCatalogue(Path.Combine(DataRootDir, CatalogueFile));
    this.ResourcesDT = reader.ReadCatalogue(Path.Combine(DataRootDir, CatResourcesFile));

    //datasets has: Name,Title,URL,Organization,Top level organisation,License,Published,NII,Location,Import source,Author,Geographic Coverage,Isopen,License,License Id,Maintainer,Mandate,Metadata Created,Metadata Modified,Notes,Odi Certificate,ODI Certificate URL,Tags,Temporal Coverage From,Temporal Coverage To,Primary Theme,Secondary Themes,Update Frequency,Version
    //resources has: Dataset Name,URL,Format,Description,Resource ID,Position,Date,Organization,Top level organization
    //so join on Name and Dataset Name
    //todo: this doesn't work as the dataset name in the resources file isn't unique - it contains multiple entries for all the resources attached to a dataset.
    //This means that two tables have to be handled and the descriptions merged together somehow.
    //TODO: none of this works yet
    /*
    DataColumn DatasetNameCol = resource.Columns["Dataset Name"];
    resource.PrimaryKey = new DataColumn[] { DatasetNameCol };
    //create the new columns in catalogue
    //foreach (DataColumn col in resource.Columns)
    //{
    //    if (col.ColumnName == "URL") this.Catalogue.Columns.Add("URL2"); //there's already one in the catalogue csv file
    //    if (col.ColumnName != "Dataset Name") this.Catalogue.Columns.Add(col.ColumnName, typeof(string));
    //}
    //Manually add the columns because of the duplicates
    this.Catalogue.Columns.Add("URL2", typeof(string));
    this.Catalogue.Columns.Add("Format", typeof(string));
    this.Catalogue.Columns.Add("Description", typeof(string));
    this.Catalogue.Columns.Add("Resource ID", typeof(string));
    this.Catalogue.Columns.Add("Position", typeof(string));
    this.Catalogue.Columns.Add("Date", typeof(string));
    //now fill in each row, joining on Name and Dataset Name
    foreach (DataRow row in this.Catalogue.Rows)
    {
        string DatasetName = row["Name"] as string;
        DataRow ResRow = resource.Rows.Find(DatasetName);
        if (ResRow == null)
        {
            System.Diagnostics.Debug.WriteLine("Error: resource " + DatasetName + " not found in catalogue");
        }
        else
        {
            row["URL2"] = ResRow["URL"];
            row["Format"] = ResRow["Format"];
            row["Description"] = ResRow["Description"];
            row["Resource ID"] = ResRow["Resource ID"];
            row["Position"] = ResRow["Position"];
            row["Date"] = ResRow["Date"];
        }
    }
    */

    //resource-0-format is CSV (also look at RDF etc)
    //also note bbox-east-long, bbox-north-lat, bbox-south-lat, bbox-west-long, spatial-reference-system, and that spatial contains a polygon box

    //then create a schema to describe what the columns are
    //2012 schema
    //Schema = new DatastoreSchema();
    //Schema.AddField("title", SemanticFieldType.Title);
    //Schema.AddField("notes_rendered", SemanticFieldType.Description);
    //Schema.AddField("resource-0-url", SemanticFieldType.Link);
    //Schema.AddField("tags", SemanticFieldType.Tags);

    //2016 schema
    //as of 4 April 2016, the data now looks like this:
    //Name,Title,URL,Organization,Top level organisation,License,Published,NII,Location,Import source,Author,Geographic Coverage,Isopen,License,License Id,Maintainer,Mandate,Metadata Created,Metadata Modified,Notes,Odi Certificate,ODI Certificate URL,Tags,Temporal Coverage From,Temporal Coverage To,Primary Theme,Secondary Themes,Update Frequency,Version
    Schema = new DatastoreSchema();
    Schema.AddField("Title", SemanticFieldType.Title);
    //Schema.AddField("Notes", SemanticFieldType.Description);
    Schema.AddField("Description", SemanticFieldType.Description);
    //Schema.AddField("URL", SemanticFieldType.Link);
    Schema.AddField("URL2", SemanticFieldType.Link); //URL2 and Description are the columns the (unfinished) resources merge above is intended to add
    Schema.AddField("Tags", SemanticFieldType.Tags);
}
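//The commented-out join above fails because "Dataset Name" is not unique in the resources table:
//there is one row per resource, so a dataset with several resources has several rows. A minimal
//sketch of one way round it: group the resources by dataset and take the first CSV resource for
//the URL2/Description columns the schema expects. The method name is illustrative and the
//"first CSV, else first resource" policy is an assumption, not the resolved design.
private void ExampleMergeResources()
{
    //requires "using System.Linq;" and a reference to System.Data.DataSetExtensions for AsEnumerable()
    this.Catalogue.PrimaryKey = new DataColumn[] { this.Catalogue.Columns["Name"] };
    this.Catalogue.Columns.Add("URL2", typeof(string));
    this.Catalogue.Columns.Add("Description", typeof(string));
    foreach (IGrouping<string, DataRow> Group in this.ResourcesDT.AsEnumerable().GroupBy(r => r.Field<string>("Dataset Name")))
    {
        DataRow CatRow = this.Catalogue.Rows.Find(Group.Key);
        if (CatRow == null) continue; //resource rows with no matching dataset row
        //prefer the first CSV resource, otherwise fall back to the first resource of any format
        DataRow Res = Group.FirstOrDefault(r => r.Field<string>("Format") == "CSV") ?? Group.First();
        CatRow["URL2"] = Res["URL"];
        CatRow["Description"] = Res["Description"];
    }
}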