/// <summary>
/// Creates a new DataSet and populates it from the *.names and *.data files.
/// </summary>
/// <returns>The populated DataSet.</returns>
public DataSet BuildDataSet() {
    DataSet result = new DataSet();

    // Features must be loaded first: the data entries are checked against them.
    BuildNamesEntries(ref result);
    BuildDataEntries(ref result);

    return result;
}
/// <summary>
/// Shuffles this data set and splits the shuffled entries into a training set and a test set.
/// </summary>
/// <param name="trainingSize">
/// Number of data instances to place in the training set. The value is truncated to an
/// integer; the remaining shuffled entries form the test set.
/// </param>
/// <returns>A two-element list: the training set at index 0 and the test set at index 1.</returns>
public List<DataSet> RandomInstance(double trainingSize) {
    DataSet shuffled = Shuffle();
    int trainingCount = (int)trainingSize;

    var training = new DataSet { Features = new List<Feature>(this.Features) };
    training.DataEntries.AddRange(shuffled.DataEntries.Take(trainingCount));

    // Skip is the positional complement of Take. The previous Except call was a set
    // difference, which collapses duplicate entries and depends on how DataInstance
    // defines equality, so the test set could silently lose rows.
    var test = new DataSet(shuffled.DataEntries.Skip(trainingCount).ToList());
    test.Features = new List<Feature>(this.Features);

    // Both partitions share the output column of the source data set.
    training.OutputIndex = test.OutputIndex = this.OutputIndex;

    return new List<DataSet> { training, test };
}
/// <summary>
/// Returns a new DataSet holding the same entries as this one in random order.
/// This data set's own entry order is left untouched.
/// </summary>
/// <returns>A DataSet with a copy of this set's features and its entries shuffled.</returns>
private DataSet Shuffle() {
    var shuffled = new DataSet { Features = new List<Feature>(this.Features) };
    shuffled.DataEntries.AddRange(DataEntries);

    // Fisher-Yates shuffle on the copy. The previous implementation drew random indices
    // until it hit an entry not yet present in the result; that never terminates when
    // the data contains duplicate (equal) instances and is quadratic otherwise.
    for (int i = shuffled.DataEntries.Count - 1; i > 0; i--) {
        // Rng.Next(i + 1) picks a position in [0, i]; the upper bound is exclusive.
        int j = Rng.Next(i + 1);
        var swapped = shuffled.DataEntries[i];
        shuffled.DataEntries[i] = shuffled.DataEntries[j];
        shuffled.DataEntries[j] = swapped;
    }

    return shuffled;
}
/// <summary>
/// Reads every entry supplied by the data reader, splits it on commas and, when each
/// value is accepted for its corresponding feature, stores it in the DataSet as a
/// DataInstance.
/// </summary>
/// <param name="dataSet">
/// Data set to append entries to. Its Features must already be populated, because each
/// value is validated against the feature at the same position.
/// </param>
private void BuildDataEntries(ref DataSet dataSet) {
    foreach (string entry in m_DataReader.ValidEntries()) {
        string[] data = entry.Split(',');

        if (data.Length != dataSet.Features.Count) {
            // Log the raw entry text. Passing a LINQ projection as the format argument
            // printed the iterator's type name instead of the offending data.
            Console.WriteLine("[Error]: Invalid # of data elements in {0}.", entry);
            continue;
        }

        var instance = new DataInstance();
        for (int i = 0; i < data.Length; i++) {
            Feature feature = dataSet.Features[i];
            if (IsValidValue(feature.Type, data[i], feature.PossibleValues.ToArray())) {
                instance.Add(data[i]);
            }
        }

        // A shorter instance means at least one value was rejected; drop the whole entry.
        if (instance.Count == dataSet.Features.Count) {
            dataSet.DataEntries.Add(instance);
        }
    }
}
/// <summary>
/// Reads every entry supplied by the names reader and converts it into a Feature
/// that is appended to the DataSet. Each entry is split on ':' into a name, a Types
/// value and a comma-separated list of possible values.
/// </summary>
/// <param name="dataSet">Data set whose Features (and OutputIndex) are filled in.</param>
private void BuildNamesEntries(ref DataSet dataSet) {
    foreach (string entry in m_NamesReader.ValidEntries()) {
        string[] parts = entry.Split(':');

        var feature = new Feature(parts[0], (Types)Enum.Parse(typeof(Types), parts[1]));

        // Possible values are stored without surrounding whitespace.
        string[] values = parts[2].Split(',');
        for (int i = 0; i < values.Length; i++) {
            feature.PossibleValues.Add(values[i].Trim());
        }

        dataSet.Features.Add(feature);

        // Record the position of the output feature, which was just appended.
        if (Types.Output == feature.Type) {
            dataSet.OutputIndex = dataSet.Features.Count - 1;
        }
    }
}
/// <summary>
/// Creates a KNearest instance that operates on the given data set.
/// </summary>
/// <param name="data">Data set stored by this instance.</param>
public KNearest(DataSet data)
{
    this.m_DataSet = data;
}