Example #1
0
        /// <summary>
        /// Creates an iterator over the rows delivered by <paramref name="reader"/> and works
        /// out, for every column configured on <paramref name="datasource"/>, which position
        /// in a row holds its value.
        /// </summary>
        /// <param name="datasource">Provides the column definitions, the number of leading
        /// lines to skip and whether the data starts with a header line.</param>
        /// <param name="reader">Source of rows; <c>Next()</c> is called once per skipped line
        /// and once more for the header line when there is one.</param>
        public CsvRecordIterator(CsvDataSource datasource, CsvReader reader)
        {
            _reader  = reader;
            _builder = new RecordBuilder(datasource);

            // Two parallel tables: slot k of _index holds the position of a value inside a
            // row, slot k of _column holds the column definition that value belongs to.
            int capacity = datasource.GetColumns().Count;

            _index  = new int[capacity];
            _column = new Column[capacity];

            // Throw away the leading lines that precede the actual data.
            int skipped = 0;
            while (skipped < datasource.SkipLines)
            {
                _reader.Next();
                skipped++;
            }

            // Obtain the header: synthesised from the column numbers when the file has no
            // header line, otherwise read from the file.
            String[] header;
            if (!datasource.HasHeader)
            {
                // Column names are 1-based numbers here; find the largest one (at least 0).
                int highest = 0;
                foreach (var numbered in datasource.GetColumns())
                {
                    highest = Math.Max(highest, Int32.Parse(numbered.GetName()));
                }

                // Synthetic header "1", "2", ..., so that name N sits at position N - 1.
                header = new string[highest];
                int slot = 0;
                while (slot < highest)
                {
                    header[slot] = (slot + 1).ToString();
                    slot++;
                }
            }
            else
            {
                header = _reader.Next();
            }

            // Fill the lookup tables. A column whose name does not occur in the header is
            // skipped, which leaves trailing slots of both tables at their default values.
            int filled = 0;

            foreach (var column in datasource.GetColumns())
            {
                // Advance to the first header cell carrying this column's name.
                int position = 0;
                while (position < header.Length && !header[position].Equals(column.GetName()))
                {
                    position++;
                }

                if (position < header.Length)
                {
                    _index[filled]  = position;
                    _column[filled] = column;
                    filled++;
                }
            }

            FindNextRecord();
        }
Example #2
0
        /// <summary>
        /// Sets up iteration over <paramref name="reader"/>: skips the configured number of
        /// leading lines, determines the header, and records where each column configured on
        /// <paramref name="datasource"/> is located within a row.
        /// </summary>
        /// <param name="datasource">Column definitions plus the skip-lines and header settings.</param>
        /// <param name="reader">Row source that is advanced via <c>Next()</c>.</param>
        public CsvRecordIterator(CsvDataSource datasource, CsvReader reader)
        {
            _reader = reader;
            _builder = new RecordBuilder(datasource);

            // _index[k] is a position within a row and _column[k] is the column definition
            // whose value is found at that position; k itself carries no meaning.
            int slots = datasource.GetColumns().Count;
            _index = new int[slots];
            _column = new Column[slots];

            // Lines in front of the data are read and discarded.
            for (int discarded = 0; discarded < datasource.SkipLines; discarded++)
            {
                _reader.Next();
            }

            // Either take the header from the file or derive one from the column numbers.
            String[] header;
            if (datasource.HasHeader)
            {
                header = _reader.Next();
            }
            else
            {
                // Without a header line the column names are numbers; take the largest,
                // never going below zero.
                int high = datasource.GetColumns()
                                     .Select(c => Int32.Parse(c.GetName()))
                                     .Aggregate(0, (best, number) => Math.Max(best, number));

                // Header cells are simply "1" .. "high".
                header = Enumerable.Range(1, high).Select(number => number.ToString()).ToArray();
            }

            // Pair every configured column with the first header cell of the same name;
            // columns without a matching cell do not take up a slot.
            int next = 0;
            foreach (var column in datasource.GetColumns())
            {
                for (int pos = 0; pos < header.Length; pos++)
                {
                    if (!header[pos].Equals(column.GetName()))
                    {
                        continue;
                    }

                    _index[next] = pos;
                    _column[next] = column;
                    next++;
                    break;
                }
            }

            FindNextRecord();
        }
Example #3
0
        /// <summary>
        /// Builds a <see cref="Configuration"/> from the XML document at <paramref name="file"/>.
        /// </summary>
        /// <remarks>
        /// The threshold and the property definitions are read from the <c>schema</c> child
        /// elements of the root. Every <c>csv</c> child element of the root is registered as
        /// a data source in group 0; other kinds of child elements are ignored.
        /// </remarks>
        /// <param name="file">Location of the XML document, handed to <c>XElement.Load</c>.</param>
        /// <returns>The configuration populated from the document.</returns>
        public static Configuration Load(string file)
        {
            // Numbers in the configuration are machine-readable ("0.8"), so they must not be
            // parsed with the current culture, which may treat '.' as a group separator.
            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            var cfg = new Configuration();
            var properties = new List<Property>();

            XElement xml = XElement.Load(file);

            // Threshold: first <threshold> below a <schema>, or 0 when there is none.
            cfg.Threshold = xml.Elements("schema")
                               .Descendants("threshold")
                               .Select(x => double.Parse(x.Value, invariant))
                               .FirstOrDefault();

            // Properties: every <property> below a <schema>.
            foreach (XElement xElement in xml.Elements("schema").Descendants("property"))
            {
                string propName = xElement.Descendants("name").First().Value;
                var property = new Property(propName);

                // Only type="id" marks an id property; any other (or no) type attribute
                // means an ordinary property with comparator and probabilities.
                XAttribute typeAttribute = xElement.Attribute("type");
                bool isIdProperty = typeAttribute != null && typeAttribute.Value == "id";

                if (isIdProperty)
                {
                    property.IsIdProperty = true;
                }
                else
                {
                    // A missing <comparator> leaves the property's comparator untouched
                    // instead of failing with a NullReferenceException.
                    XElement comparator = xElement.Descendants("comparator").FirstOrDefault();
                    if (comparator != null)
                    {
                        property.Comparator = GetComparatorFromString(comparator.Value);
                    }

                    property.LowProbability = xElement.Descendants("low")
                                                      .Select(x => double.Parse(x.Value, invariant))
                                                      .FirstOrDefault();
                    property.HighProbability = xElement.Descendants("high")
                                                       .Select(x => double.Parse(x.Value, invariant))
                                                       .FirstOrDefault();
                }

                // Id properties are part of the schema as well; previously they were
                // configured and then discarded.
                properties.Add(property);
            }

            cfg.SetProperties(properties);

            // Data sources: only <csv> children of the root are supported.
            foreach (XElement dataSource in xml.Elements("csv"))
            {
                var csvDs = new CsvDataSource();
                Hashtable csvParams = GetParametersTable(dataSource);

                // "input-file" is mandatory; the remaining parameters are optional.
                csvDs.File = csvParams["input-file"].ToString();

                if (csvParams.Contains("header-line"))
                {
                    csvDs.HasHeader = string.Equals(csvParams["header-line"].ToString(), "true",
                                                    StringComparison.OrdinalIgnoreCase);
                }

                if (csvParams.Contains("skip-lines"))
                {
                    // An unparsable value falls back to skipping no lines.
                    int skipLines;
                    csvDs.SkipLines = Int32.TryParse(csvParams["skip-lines"].ToString(), out skipLines)
                                          ? skipLines
                                          : 0;
                }

                csvDs.FileEncoding = csvParams.Contains("encoding")
                                         ? GetTextEncodingFromString(csvParams["encoding"].ToString())
                                         : Encoding.Default;

                foreach (Column column in GetDataSourceColumns(dataSource))
                {
                    csvDs.AddColumn(column);
                }

                cfg.AddDataSource(0, csvDs);
            }

            return cfg;
        }