Example #1
        /// <summary>
        /// Sets up the NOMIS 2011 Census datastore: loads the catalogue CSV,
        /// configures data-file filtering and geometry weighting, builds the
        /// column schema and loads the per-variable metadata table.
        /// </summary>
        public NOMIS2011Census()
        {
            //load the catalogue of available datasets (from the NOMIS website catalogue page)
            CSVCatalogue catalogueReader = new CSVCatalogue();
            this.Catalogue = catalogueReader.ReadCatalogue(Path.Combine(DataRootDir, CatalogueFile));

            //match only the DATA.CSV files - a plain "top" filter would return the CODE0.CSV file instead
            FileFilterOptions = new FileFilter(FileFilterEnum.Pattern, "DATA.CSV");

            //weight the 2011 geometries above the older ones so 2011 datasets are favoured
            SetGeometryHint("OA_2011", 2.0f);
            SetGeometryHint("OA", 0.1f);
            SetGeometryHint("LSOA_2011", 2.0f);
            SetGeometryHint("LSOA", 0.1f);
            SetGeometryHint("MSOA_2011", 2.0f);
            SetGeometryHint("MSOA", 0.1f);

            //describe the semantic role of each catalogue column
            //NOTE: each row carries two data links (oa/lsoa/msoa and wards); only the oa one is mapped here
            Schema = new DatastoreSchema();
            Schema.AddField("TableName", SemanticFieldType.UniqueKey);
            Schema.AddField("Description", SemanticFieldType.Title);
            Schema.AddField("oaurl", SemanticFieldType.Link);

            //Load the full per-variable metadata table, keyed on the variable code.
            //File format:
            //ColumnVariableCode,ColumnVariableMeasurementUnit,ColumnVariableStatisticalUnit,ColumnVariableDescription
            //KS101EW0001,Count,Person,All categories: Sex
            //KS101EW0002,Count,Person,Males
            //Used for the short and long description text.
            CSVCatalogue variableReader = new CSVCatalogue();
            VariableMetaData            = variableReader.ReadCatalogue(Path.Combine(DataRootDir, VariablesFile));
            VariableMetaData.PrimaryKey = new DataColumn[] { VariableMetaData.Columns["ColumnVariableCode"] };
        }
Example #2
        //constructor?

        /// <summary>
        /// Sets up the London Datastore: loads the catalogue CSV and builds the
        /// schema describing which columns hold the title, description and data link.
        /// </summary>
        public LondonDatastore()
        {
            //load the catalogue of available datasets
            CSVCatalogue catalogueReader = new CSVCatalogue();
            this.Catalogue = catalogueReader.ReadCatalogue(Path.Combine(DataRootDir, CatalogueFile));

            //map the London Datastore column names onto their semantic roles
            Schema = new DatastoreSchema();
            Schema.AddField("TITLE", SemanticFieldType.Title);
            Schema.AddField("LONGDESC", SemanticFieldType.Description);
            Schema.AddField("CSV_URL", SemanticFieldType.Link);
        }
Example #3
        /// <summary>
        /// Build data to allow creation on MapTube.
        /// Output is a set of directories in the outputDirectory, with one for each file source.
        /// Also included is a SQL insert file which can be used to add the maps to the database.
        /// </summary>
        /// <param name="outputDirectory">Directory receiving the per-source output folders and the SQL file; created if it does not exist.</param>
        public void Build(string outputDirectory)
        {
            this.outputDirectory = outputDirectory;
            Directory.CreateDirectory(outputDirectory);

            //resolve the semantic columns once, up front
            DataTable       cat             = datastore.DatastoreCatalogue;
            DatastoreSchema schema          = datastore.DSSchema;
            int             TitleColIdx     = cat.Columns.IndexOf(schema.TitleField);
            int             LinkColIdx      = cat.Columns.IndexOf(schema.LinkField);
            int             UniqueKeyColIdx = cat.Columns.IndexOf(schema.UniqueKeyField);

            //delete any existing sql file so this run starts from a clean slate
            File.Delete(Path.Combine(outputDirectory, MapTubeSQLFilename));

            //todo: check whether you need to create the data staging directory here

            for (int i = 0; i < cat.Rows.Count; i++)
            {
                DataRow Row       = cat.Rows[i];
                string  Title     = Row[TitleColIdx] as string;
                string  DataLink  = Row[LinkColIdx] as string;
                string  UniqueKey = Row[UniqueKeyColIdx] as string; //this is only unique for the table's name and file, so we're going to need to add a column number to this

                if (string.IsNullOrEmpty(DataLink))
                {
                    continue;                                 //no data so skip
                }
                //Data staging - download to the local file system and unzip if necessary
                Uri   StagedDataUri   = datastore.StageData(new Uri(DataLink));   //this is either the root of the extracted zip hierarchy, or an actual file
                Uri[] StagedDataFiles = datastore.FilterDataFiles(StagedDataUri); //get a list of files under the staging area that might contain data

                //now get the files and analyse them
                foreach (Uri FileUri in StagedDataFiles)
                {
                    //we should have a true file (not dir) at this point and it should be a valid type as it's been filtered (*.csv)
                    //use an ordinal case-insensitive suffix check rather than ToLower(), which is
                    //culture-sensitive (e.g. the Turkish 'I') and allocates a new string
                    if (FileUri.LocalPath.EndsWith(".csv", StringComparison.OrdinalIgnoreCase))
                    {
                        Console.WriteLine("Staged File: " + FileUri.LocalPath);
                        ProcessFile(FileUri, UniqueKey, Title);
                    }
                }
            }
        }
Example #4
        /// <summary>
        /// Build the correlation matrix for every combination in the catalogue.
        /// Requires a list of pdf files matching the data files (one "{UniqueKey}.pdf" per catalogue row).
        /// Each document is turned into a stemmed word histogram, weighted by TFIDF, normalised, and
        /// then every pair of documents is scored by the dot product of their word vectors.
        /// Results are written to the debug output as "score,i,j,key_i,key_j".
        /// </summary>
        /// <param name="Catalogue">The DataTable used by the main Datastore class for its catalogue</param>
        /// <param name="Schema">And the Schema is the Datastore.Schema property</param>
        /// <param name="PdfDirectory">Directory containing the per-table pdf files; defaults to the previously hardcoded location for backward compatibility.</param>
        public void GenerateTextCorrelation(DataTable Catalogue, DatastoreSchema Schema,
            string PdfDirectory = @"C:\richard\wxtemp\Datastores\CensusMetaData\")
        {
            int TitleColIdx     = Catalogue.Columns.IndexOf(Schema.TitleField);
            int LinkColIdx      = Catalogue.Columns.IndexOf(Schema.LinkField);
            int UniqueKeyColIdx = Catalogue.Columns.IndexOf(Schema.UniqueKeyField);

            //We're going to create a vector space model (VSM) of the histograms for every document in
            //the set. This will enable the number of documents that each word appears in to be calculated
            //for the Term Frequency Inverse Document Frequency (TFIDF) method (Principles of Data Mining
            //by Max Bramer, pp244).
            int N = Catalogue.Rows.Count;

            Dictionary <string, float> [] VSM = new Dictionary <string, float> [N];

            //load all the documents and create stemmed word histograms
            for (int i = 0; i < N; i++)
            {
                DataRow Row_i       = Catalogue.Rows[i];
                string  UniqueKey_i = Row_i[UniqueKeyColIdx] as string;

                //extract the text from the pdf for the i table and histogram it
                string PdfText_i = GetPdfFileText(PdfDirectory + UniqueKey_i + ".pdf");
                Dictionary <string, float> WordTable_i     = TextToHistogram(PdfText_i);
                Dictionary <string, float> StemWordTable_i = StemHistogram(WordTable_i);
                VSM[i] = StemWordTable_i;
            }
            //OK, that's the vector space model for all the tables, now do the TFIDF
            Dictionary <string, float>[] TFIDF = TermFrequencyInverseDocumentFrequency(VSM);
            //great, now I've got two copies of all the documents in memory! Lucky they're really quite small.
            //Now normalise all the weights so the dot product below is a cosine similarity
            for (int i = 0; i < TFIDF.Length; i++)
            {
                TFIDF[i] = NormaliseBagOfWords(TFIDF[i]);
            }

            //and finally, we have all the vectors we need to do the correlation...

            //do for every i, for every j and compute products of matching TFIDF stem words
            for (int i = 0; i < N; i++)
            {
                DataRow Row_i       = Catalogue.Rows[i];
                string  UniqueKey_i = Row_i[UniqueKeyColIdx] as string;

                for (int j = 0; j < N; j++) //NOTE: could do j=i..N, but want a double check on results being symmetric
                {
                    DataRow Row_j       = Catalogue.Rows[j];
                    string  UniqueKey_j = Row_j[UniqueKeyColIdx] as string;

                    //and finally work out the (inverse) distance i.e. dot product
                    float L = 0;
                    foreach (KeyValuePair <string, float> KVP in TFIDF[i])
                    {
                        //TryGetValue avoids the double lookup of ContainsKey + indexer
                        float Weight_j;
                        if (TFIDF[j].TryGetValue(KVP.Key, out Weight_j))
                        {
                            L += KVP.Value * Weight_j; //i*j from matching keywords
                        }
                    }
                    //NOTE: Distance would be 1-L at this point, but we want L
                    //write out data
                    System.Diagnostics.Debug.WriteLine(L + "," + i + "," + j + "," + UniqueKey_i + "," + UniqueKey_j);
                }
            }
        }
Example #5
        /// <summary>
        /// Sets up the gov.uk datastore: loads the dataset catalogue and the resources
        /// table and builds the schema describing the catalogue columns.
        /// </summary>
        public GovDatastore()
        {
            CSVCatalogue catalogueReader = new CSVCatalogue();
            //this catalogue file only uses a CR as the line ending, so override the reader default
            catalogueReader.LineEndings = "\r";
            this.Catalogue   = catalogueReader.ReadCatalogue(Path.Combine(DataRootDir, CatalogueFile));
            this.ResourcesDT = catalogueReader.ReadCatalogue(Path.Combine(DataRootDir, CatResourcesFile));

            //The datasets file has: Name,Title,URL,Organization,Top level organisation,License,Published,
            //NII,Location,Import source,Author,Geographic Coverage,Isopen,License,License Id,Maintainer,
            //Mandate,Metadata Created,Metadata Modified,Notes,Odi Certificate,ODI Certificate URL,Tags,
            //Temporal Coverage From,Temporal Coverage To,Primary Theme,Secondary Themes,Update Frequency,Version
            //The resources file has: Dataset Name,URL,Format,Description,Resource ID,Position,Date,
            //Organization,Top level organization - so the two tables join on Name == Dataset Name.
            //
            //TODO: the join of resources into the catalogue is not implemented yet. "Dataset Name" is not
            //unique in the resources file (one row per resource attached to a dataset), so merging the
            //resource columns (URL2/Format/Description/Resource ID/Position/Date) into a catalogue row
            //needs the multiple resource entries combining somehow first.
            //Also of interest in the data: resource-0-format is CSV (also look at RDF etc), plus
            //bbox-east-long, bbox-north-lat, bbox-south-lat, bbox-west-long, spatial-reference-system,
            //and spatial, which contains a polygon box.

            //2016 schema (the 2012 one used title/notes_rendered/resource-0-url/tags):
            //as of 4 April 2016 the catalogue columns are as listed above
            Schema = new DatastoreSchema();
            Schema.AddField("Title", SemanticFieldType.Title);
            Schema.AddField("Description", SemanticFieldType.Description);
            Schema.AddField("URL2", SemanticFieldType.Link);
            Schema.AddField("Tags", SemanticFieldType.Tags);
        }