コード例 #1
        /// <summary>
        /// Extracts feature vectors from points in a time range. One vector is created per qualifying point of the
        /// prediction, and values are then added for the spatial distance, spatial density, geometry attribute, and
        /// incident density features found in <c>Features</c>.
        /// </summary>
        /// <param name="prediction">Prediction to extract vectors for.</param>
        /// <param name="training">Whether or not this is the training phase. Selects the training area and the training resource IDs of each feature instead of the prediction ones.</param>
        /// <param name="start">Start time (points without a time are always included).</param>
        /// <param name="end">End time (points without a time are always included).</param>
        /// <returns>Feature vector lists:  the list built by this method when no external feature extractor is initialized; otherwise the lists that the external extractor produces from it.</returns>
        protected virtual IEnumerable<FeatureVectorList> ExtractFeatureVectors(Prediction prediction, bool training, DateTime start, DateTime end)
            // NOTE(review): this excerpt appears to have lost its brace-only lines and several single statements (e.g., nothing visible here adds vectors to featureVectors, starts/joins the worker threads, or follows a number of the if/foreach/lock headers below) -- confirm against the full file before relying on the flow as shown.
            // this can be called concurrently (e.g., via the time slice model with one thread per slice), so lock on prediction to get the point objects and their vectors
            FeatureVectorList featureVectors;
            Dictionary<int, FeatureVector> pointIdFeatureVector;
            int numFeatures;
            lock (prediction)
                prediction.ReleasePoints(); // so that we get new point objects each time -- their times might be modified by a sub-class (e.g., TimeSliceDCM).
                featureVectors = new FeatureVectorList(prediction.Points.Count);
                pointIdFeatureVector = new Dictionary<int, FeatureVector>(prediction.Points.Count);
                numFeatures = GetNumFeaturesExtractedFor(prediction);
                foreach (Point point in prediction.Points)
                    // keep points that have no time (DateTime.MinValue) as well as points whose time lies within [start, end]
                    if (point.Time == DateTime.MinValue || (point.Time >= start && point.Time <= end))
                        // the point's incident type serves as its true class
                        point.TrueClass = point.IncidentType;
                        FeatureVector vector = new FeatureVector(point, numFeatures);
                        // index the vector by point ID so that the database rows read below can be matched back to it
                        pointIdFeatureVector.Add(point.Id, vector);

            // area used by the density sections below (its shapefile SRID and its incidents)
            Area area = training ? prediction.Model.TrainingArea : prediction.PredictionArea;
            Set<Thread> threads = new Set<Thread>();

            #region spatial distance features
            // minimum distance from each point to the geometries of a shapefile, capped at distanceWhenBeyondThreshold
            List<Feature> spatialDistanceFeatures = Features.Where(f => f.EnumValue.Equals(FeatureType.MinimumDistanceToGeometry)).ToList();
            if (spatialDistanceFeatures.Count > 0)
                Console.Out.WriteLine("Extracting spatial distance feature values");
                float distanceWhenBeyondThreshold = (float)Math.Sqrt(2.0 * Math.Pow(FeatureDistanceThreshold, 2)); // with a bounding box of FeatureDistanceThreshold around each point, the maximum distance between a point and some feature shapefile geometry would be sqrt(2*FeatureDistanceThreshold^2). That is, the feature shapefile geometry would be positioned in one of the corners of the bounding box.
                // one worker per processor; each worker queries only the points whose ID modulo the processor count equals its core index (see the WHERE clause below)
                for (int i = 0; i < Configuration.ProcessorCount; ++i)
                    Thread t = new Thread(new ParameterizedThreadStart(o =>
                            int core = (int)o;
                            NpgsqlConnection threadConnection = DB.Connection.OpenConnection;
                            string pointTableName = Point.GetTableName(prediction);
                            foreach (Feature spatialDistanceFeature in spatialDistanceFeatures)
                                Shapefile shapefile = new Shapefile(int.Parse(training ? spatialDistanceFeature.TrainingResourceId : spatialDistanceFeature.PredictionResourceId));

                                // for each point in the time range, take the minimum distance to any geometry that overlaps the point's expanded bounding box and satisfies the geometry time condition; points joined to no geometry receive distanceWhenBeyondThreshold
                                NpgsqlCommand cmd = DB.Connection.NewCommand("SELECT points." + Point.Columns.Id + " as points_" + Point.Columns.Id + "," +
                                                                             "CASE WHEN COUNT(" + shapefile.GeometryTable + "." + ShapefileGeometry.Columns.Geometry + ")=0 THEN " + distanceWhenBeyondThreshold + " " +
                                                                             "ELSE min(st_distance(st_closestpoint(" + shapefile.GeometryTable + "." + ShapefileGeometry.Columns.Geometry + ",points." + Point.Columns.Location + "),points." + Point.Columns.Location + ")) " +
                                                                             "END as feature_value " +

                                                                             "FROM (SELECT *,st_expand(" + pointTableName + "." + Point.Columns.Location + "," + FeatureDistanceThreshold + ") as bounding_box " +
                                                                                   "FROM " + pointTableName + " " +
                                                                                   "WHERE " + pointTableName + "." + Point.Columns.Id + " % " + Configuration.ProcessorCount + " = " + core + " AND " +
                                                                                              "(" +
                                                                                                  pointTableName + "." + Point.Columns.Time + "='-infinity'::timestamp OR " +
                                                                                                  "(" +
                                                                                                      pointTableName + "." + Point.Columns.Time + ">=@point_start AND " +
                                                                                                      pointTableName + "." + Point.Columns.Time + "<=@point_end" +
                                                                                                  ")" +
                                                                                              ")" +
                                                                                   ") points " +

                                                                             "LEFT JOIN " + shapefile.GeometryTable + " " +

                                                                             "ON points.bounding_box && " + shapefile.GeometryTable + "." + ShapefileGeometry.Columns.Geometry + " AND " +
                                                                                 "(" +
                                                                                    shapefile.GeometryTable + "." + ShapefileGeometry.Columns.Time + "='-infinity'::timestamp OR " +
                                                                                    "(" +
                                                                                        shapefile.GeometryTable + "." + ShapefileGeometry.Columns.Time + ">=@geometry_start AND " +
                                                                                        shapefile.GeometryTable + "." + ShapefileGeometry.Columns.Time + "<=@geometry_end" +
                                                                                    ")" +
                                                                                 ")" +

                                                                             "GROUP BY points." + Point.Columns.Id, null, threadConnection);

                                // geometry sample window:  begins LagOffset before the extraction start and lasts LagDuration
                                DateTime spatialDistanceFeatureStart = start - spatialDistanceFeature.Parameters.GetTimeSpanValue(SpatialDistanceParameter.LagOffset);
                                DateTime spatialDistanceFeatureEnd = spatialDistanceFeatureStart + spatialDistanceFeature.Parameters.GetTimeSpanValue(SpatialDistanceParameter.LagDuration);

                                // questionable windows are only reported on the console; extraction continues
                                if (spatialDistanceFeatureEnd >= start)
                                    Console.Out.WriteLine("WARNING:  Spatial distance sample overlaps extraction period.");

                                if (spatialDistanceFeatureEnd < spatialDistanceFeatureStart)
                                    Console.Out.WriteLine("WARNING:  Spatial distance sample end precedes sample start.");

                                ConnectionPool.AddParameters(cmd, new Parameter("point_start", NpgsqlDbType.Timestamp, start),
                                                                  new Parameter("point_end", NpgsqlDbType.Timestamp, end),
                                                                  new Parameter("geometry_start", NpgsqlDbType.Timestamp, spatialDistanceFeatureStart),
                                                                  new Parameter("geometry_end", NpgsqlDbType.Timestamp, spatialDistanceFeatureEnd));

                                NpgsqlDataReader reader = cmd.ExecuteReader();
                                NumericFeature distanceFeature = _idNumericFeature[spatialDistanceFeature.Id];
                                while (reader.Read())
                                    FeatureVector vector;
                                    if (!pointIdFeatureVector.TryGetValue(Convert.ToInt32(reader["points_" + Point.Columns.Id]), out vector))  // above, we select all points that fall between point_start and point_end. the latter can be one tick short of the next minute, and npgsql rounds up causing points to appear in the reader that we didn't add to the pointIdFeatureVector collection.

                                    double value = Convert.ToDouble(reader["feature_value"]);

                                    // value > threshold shouldn't happen here, since we excluded such objects from consideration above; however, the calculations aren't perfect in postgis, so we check again and reset appropriately
                                    if (value > distanceWhenBeyondThreshold)
                                        value = distanceWhenBeyondThreshold;

                                    vector.Add(distanceFeature, value, false); // don't update range due to concurrent access to the feature



                foreach (Thread t in threads)

            #region spatial density features
            // kernel density estimate of shapefile geometry points, evaluated at the location of each feature vector's point
            List<Feature> spatialDensityFeatures = Features.Where(f => f.EnumValue.Equals(FeatureType.GeometryDensity)).ToList();
            if (spatialDensityFeatures.Count > 0)
                List<PostGIS.Point> densityEvalPoints = featureVectors.Select(v => (v.DerivedFrom as Point).Location).ToList();
                Dictionary<string, List<float>> featureIdDensityEstimates = new Dictionary<string, List<float>>(spatialDensityFeatures.Count);
                for (int i = 0; i < Configuration.ProcessorCount; ++i)
                    Thread t = new Thread(new ParameterizedThreadStart(core =>
                            NpgsqlCommand command = DB.Connection.NewCommand(null);
                            // features (not points) are divided among workers here:  worker "core" handles features core, core + ProcessorCount, ...
                            for (int j = (int)core; j < spatialDensityFeatures.Count; j += Configuration.ProcessorCount)
                                Feature spatialDensityFeature = spatialDensityFeatures[j];

                                // geometry sample window:  begins LagOffset before the extraction start and lasts LagDuration
                                DateTime spatialDensityFeatureStart = start - spatialDensityFeature.Parameters.GetTimeSpanValue(SpatialDensityParameter.LagOffset);
                                DateTime spatialDensityFeatureEnd = spatialDensityFeatureStart + spatialDensityFeature.Parameters.GetTimeSpanValue(SpatialDensityParameter.LagDuration);

                                if (spatialDensityFeatureEnd >= start)
                                    Console.Out.WriteLine("WARNING:  Spatial density sample overlaps extraction period.");

                                if (spatialDensityFeatureEnd < spatialDensityFeatureStart)
                                    Console.Out.WriteLine("WARNING:  Spatial density sample end precedes sample start.");

                                Shapefile shapefile = new Shapefile(int.Parse(training ? spatialDensityFeature.TrainingResourceId : spatialDensityFeature.PredictionResourceId));
                                string geometryRecordWhereClause = "WHERE " + ShapefileGeometry.Columns.Time + "='-infinity'::timestamp OR (" + ShapefileGeometry.Columns.Time + ">=@geometry_start AND " + ShapefileGeometry.Columns.Time + "<=@geometry_end)";
                                Parameter geometryStart = new Parameter("geometry_start", NpgsqlDbType.Timestamp, spatialDensityFeatureStart);
                                Parameter geometryEnd = new Parameter("geometry_end", NpgsqlDbType.Timestamp, spatialDensityFeatureEnd);
                                // flatten the per-geometry point lists into a single list, re-creating each point with the SRID of the area's shapefile
                                List<PostGIS.Point> kdeInputPoints = Geometry.GetPoints(command, shapefile.GeometryTable, ShapefileGeometry.Columns.Geometry, ShapefileGeometry.Columns.Id, geometryRecordWhereClause, -1, geometryStart.NpgsqlParameter, geometryEnd.NpgsqlParameter).SelectMany(pointList => pointList).Select(p => new PostGIS.Point(p.X, p.Y, area.Shapefile.SRID)).ToList();

                                Console.Out.WriteLine("Computing spatial density of \"" + shapefile.Name + "\".");
                                int sampleSize = spatialDensityFeature.Parameters.GetIntegerValue(SpatialDensityParameter.SampleSize);
                                List<float> densityEstimates = KernelDensityDCM.GetDensityEstimate(kdeInputPoints, sampleSize, false, -1, -1, densityEvalPoints, false);

                                // the density might not be computable if too few points are provided -- use default value for all evaluation points in such cases
                                if (densityEstimates.Count != densityEvalPoints.Count)
                                    float defaultValue = spatialDensityFeature.Parameters.GetFloatValue(SpatialDensityParameter.DefaultValue);
                                    Console.Out.WriteLine("WARNING:  Using default value \"" + defaultValue + "\" for feature " + spatialDensityFeature);
                                    densityEstimates = Enumerable.Repeat(defaultValue, densityEvalPoints.Count).ToList();

                                // the dictionary is shared by the worker delegates, so guard the insert
                                lock (featureIdDensityEstimates) { featureIdDensityEstimates.Add(spatialDensityFeature.Id, densityEstimates); }



                foreach (Thread t in threads)

                // estimates are matched to vectors by position:  densityEvalPoints was built from featureVectors in order, so estimate i is added to vector i
                foreach (string featureId in featureIdDensityEstimates.Keys)
                    List<float> densityEstimates = featureIdDensityEstimates[featureId];
                    NumericFeature densityFeature = _idNumericFeature[featureId];
                    for (int i = 0; i < densityEstimates.Count; ++i)
                        featureVectors[i].Add(densityFeature, densityEstimates[i], false);  // don't update range due to concurrent access to the feature

            #region geometry attribute features
            // value of an attribute column taken from the shapefile geometries that intersect each point
            List<Feature> geometryAttributeFeatures = Features.Where(f => f.EnumValue.Equals(FeatureType.GeometryAttribute)).ToList();
            if (geometryAttributeFeatures.Count > 0)
                Console.Out.WriteLine("Extracting geometry attribute features.");
                // one worker per processor; each worker queries only the points whose ID modulo the processor count equals its core index (see the WHERE clause below)
                for (int i = 0; i < Configuration.ProcessorCount; ++i)
                    Thread t = new Thread(new ParameterizedThreadStart(o =>
                            int core = (int)o;
                            NpgsqlConnection threadConnection = DB.Connection.OpenConnection;
                            string pointTableName = Point.GetTableName(prediction);
                            foreach (Feature geometryAttributeFeature in geometryAttributeFeatures)
                                Shapefile shapefile = new Shapefile(int.Parse(training ? geometryAttributeFeature.TrainingResourceId : geometryAttributeFeature.PredictionResourceId));
                                string attributeColumn = geometryAttributeFeature.Parameters.GetStringValue(GeometryAttributeParameter.AttributeColumn);
                                // rows come back ordered by point ID, so all rows for a given point are contiguous; the reader loop below depends on that to detect when a new point begins
                                NpgsqlCommand cmd = DB.Connection.NewCommand("SELECT " + pointTableName + "." + Point.Columns.Id + " as point_id," + shapefile.GeometryTable + "." + attributeColumn + " as geometry_attribute " +
                                                                             "FROM " + pointTableName + " " +
                                                                             "LEFT JOIN " + shapefile.GeometryTable + " " + // the geometry might not overlap the point, in which case we'll use the default feature value below
                                                                             "ON st_intersects(" + pointTableName + "." + Point.Columns.Location + "," + shapefile.GeometryTable + "." + ShapefileGeometry.Columns.Geometry + ") " +
                                                                             "WHERE " + pointTableName + "." + Point.Columns.Id + " % " + Configuration.ProcessorCount + " = " + core + " AND " +
                                                                                        "(" +
                                                                                          pointTableName + "." + Point.Columns.Time + "='-infinity'::timestamp OR " +
                                                                                          "(" +
                                                                                            pointTableName + "." + Point.Columns.Time + ">=@point_start AND " +
                                                                                            pointTableName + "." + Point.Columns.Time + "<=@point_end" +
                                                                                          ")" +
                                                                                        ") " +
                                                                             "ORDER BY " + pointTableName + "." + Point.Columns.Id, null, threadConnection);

                                ConnectionPool.AddParameters(cmd, new Parameter("point_start", NpgsqlDbType.Timestamp, start),
                                                                  new Parameter("point_end", NpgsqlDbType.Timestamp, end));

                                // look up the feature object according to the configured attribute type ("Numeric" or "Nominal"); the exception below reports any other type
                                LAIR.MachineLearning.Feature attributeFeature;
                                string attributeType = geometryAttributeFeature.Parameters.GetStringValue(GeometryAttributeParameter.AttributeType);
                                if (attributeType == "Numeric")
                                    attributeFeature = _idNumericFeature[geometryAttributeFeature.Id] as LAIR.MachineLearning.Feature;
                                else if (attributeType == "Nominal")
                                    attributeFeature = _idNominalFeature[geometryAttributeFeature.Id] as LAIR.MachineLearning.Feature;
                                    throw new NotImplementedException("Unrecognized geometry attribute feature type:  " + attributeType);

                                // values collected for the point currently being read (currPointId); pointId is the ID of the row just read
                                List<object> values = new List<object>();
                                int currPointId = -1;
                                int pointId = -1;

                                // writes the values collected for currPointId into that point's vector:  numeric values are averaged, while a nominal value is added when exactly one was collected (the exception below reports the multiple-value case). afterwards the current point becomes the one just read.
                                Action addFeatureToVector = new Action(() =>
                                        if (values.Count > 0)
                                            FeatureVector vector = pointIdFeatureVector[currPointId];
                                            if (attributeFeature is NumericFeature)
                                                vector.Add(attributeFeature, values.Select(v => Convert.ToSingle(v)).Average(), false);  // don't update range due to concurrent access to the feature
                                            else if (values.Count == 1)
                                                vector.Add(attributeFeature, Convert.ToString(values[0]), false);  // don't update range due to concurrent access to the feature
                                                throw new Exception("Nominal geometry attribute \"" + attributeColumn + "\" of shapefile \"" + shapefile.GeometryTable + "\" has multiple non-numeric values at point \"" + (vector.DerivedFrom as Point).Location + "\".");

                                        currPointId = pointId;

                                NpgsqlDataReader reader = cmd.ExecuteReader();
                                string defaultValue = geometryAttributeFeature.Parameters.GetStringValue(GeometryAttributeParameter.DefaultValue);
                                while (reader.Read())
                                    pointId = Convert.ToInt32(reader["point_id"]);
                                    // a change of ID marks the first row of a new point
                                    if (pointId != currPointId)

                                    object value = reader["geometry_attribute"];
                                    if (value is DBNull)  // we did a left join above, so the value might be null meaning the geometry did not overlap the point
                                        value = defaultValue;





                foreach (Thread t in threads)

            #region incident density features
            // kernel density estimate of incident locations from earlier sample windows, evaluated at the location of each feature vector's point
            List<Feature> kdeFeatures = Features.Where(f => f.EnumValue.Equals(FeatureType.IncidentDensity)).ToList();
            if (kdeFeatures.Count > 0)
                List<PostGIS.Point> densityEvalPoints = featureVectors.Select(v => (v.DerivedFrom as Point).Location).ToList();
                Dictionary<string, List<float>> featureIdDensityEstimates = new Dictionary<string, List<float>>(kdeFeatures.Count);
                for (int i = 0; i < Configuration.ProcessorCount; ++i)
                    Thread t = new Thread(new ParameterizedThreadStart(core =>
                            // features (not points) are divided among workers here:  worker "core" handles features core, core + ProcessorCount, ...
                            for (int j = (int)core; j < kdeFeatures.Count; j += Configuration.ProcessorCount)
                                Feature kdeFeature = kdeFeatures[j];

                                List<PostGIS.Point> kdeInputPoints = new List<PostGIS.Point>();
                                string incident = training ? kdeFeature.TrainingResourceId : kdeFeature.PredictionResourceId;
                                int lagCount = kdeFeature.Parameters.GetIntegerValue(IncidentDensityParameter.LagCount);
                                TimeSpan lagOffset = kdeFeature.Parameters.GetTimeSpanValue(IncidentDensityParameter.LagOffset);
                                TimeSpan lagDuration = kdeFeature.Parameters.GetTimeSpanValue(IncidentDensityParameter.LagDuration);
                                // pool incident locations from lagCount sample windows; window k begins k * lagOffset before the extraction start and lasts lagDuration
                                for (int k = 1; k <= lagCount; ++k)
                                    DateTime incidentSampleStart = start - new TimeSpan(k * lagOffset.Ticks);
                                    DateTime incidentSampleEnd = incidentSampleStart + lagDuration;

                                    if (incidentSampleEnd >= start)
                                        Console.Out.WriteLine("WARNING:  Incident density sample overlaps extraction period.");

                                    if (incidentSampleEnd < incidentSampleStart)
                                        Console.Out.WriteLine("WARNING:  Incident density sample end precedes sample start.");

                                    kdeInputPoints.AddRange(Incident.Get(incidentSampleStart, incidentSampleEnd, area, incident).Select(inc => inc.Location));

                                Console.Out.WriteLine("Computing spatial density of \"" + incident + "\" with " + lagCount + " lag(s) at offset " + lagOffset + ", each with duration " + lagDuration);
                                int sampleSize = kdeFeature.Parameters.GetIntegerValue(IncidentDensityParameter.SampleSize);
                                List<float> densityEstimates = KernelDensityDCM.GetDensityEstimate(kdeInputPoints, sampleSize, false, 0, 0, densityEvalPoints, false);

                                // the density might not be computable if too few points are provided -- use default density for all evaluation points in such cases
                                if (densityEstimates.Count != densityEvalPoints.Count)
                                    float defaultValue = kdeFeature.Parameters.GetFloatValue(IncidentDensityParameter.DefaultValue);
                                    Console.Out.WriteLine("WARNING:  Using default value \"" + defaultValue + "\" for feature " + kdeFeature);
                                    densityEstimates = Enumerable.Repeat(defaultValue, densityEvalPoints.Count).ToList();

                                // the dictionary is shared by the worker delegates, so guard the insert
                                lock (featureIdDensityEstimates) { featureIdDensityEstimates.Add(kdeFeature.Id, densityEstimates); }


                foreach (Thread t in threads)

                // estimates are matched to vectors by position:  densityEvalPoints was built from featureVectors in order, so estimate i is added to vector i
                foreach (string featureId in featureIdDensityEstimates.Keys)
                    List<float> densityEstimates = featureIdDensityEstimates[featureId];
                    NumericFeature densityFeature = _idNumericFeature[featureId];
                    for (int i = 0; i < densityEstimates.Count; ++i)
                        featureVectors[i].Add(densityFeature, densityEstimates[i], false);  // don't update range due to concurrent access to the feature (e.g., via time slice model calling into this method)

            // update all feature ranges. this wasn't done above due to potential concurrent access, either within this method or from calls into this method. each feature needs to be locked here due to potential concurrent calls into this method (e.g., time slice model)
            foreach (FeatureVector vector in featureVectors)
                foreach (LAIR.MachineLearning.Feature f in vector)
                    lock (f)

            // hand the vectors to an external feature extractor when one can be initialized; with none available the list built above is yielded as is
            IFeatureExtractor externalFeatureExtractor = InitializeExternalFeatureExtractor(typeof(FeatureBasedDCM));
            if (externalFeatureExtractor == null)
                yield return featureVectors;
                foreach (FeatureVectorList externalFeatureVectors in externalFeatureExtractor.ExtractFeatures(prediction, featureVectors, training, start, end, true))
                    yield return externalFeatureVectors;