// Casts a ray from the camera through the given pixel coordinate and returns the
// world-space hit point on the spatial-awareness mesh, or a fallback point 5 m
// along the ray when nothing is hit.
private Vector3 raycast(Point cameraPoint, Matrix4x4 cameraToWorldMatrix, CameraIntrinsics camIntrinsics)
{
    // Wrap the OpenCV point in a WinRT Point for the camera-intrinsics API.
    Windows.Foundation.Point pixel = new Windows.Foundation.Point(cameraPoint.x, cameraPoint.y);

    // Unproject the pixel into camera space: X/Y coordinates on a plane one meter
    // from the camera origin.
    System.Numerics.Vector2 unitPlanePoint = camIntrinsics.UnprojectAtUnitDepth(pixel);

    // Manual calibration offset applied to improve unprojection accuracy.
    UnityEngine.Vector3 cameraSpaceDirection = new UnityEngine.Vector3(
        unitPlanePoint.X + unprojectionOffset.x,
        unitPlanePoint.Y + unprojectionOffset.y,
        1.0f);

    // Convert the right-handed camera orientation to Unity's left-handed world space
    // (negating the forward column flips handedness).
    Quaternion cameraRotation = Quaternion.LookRotation(-cameraToWorldMatrix.GetColumn(2), cameraToWorldMatrix.GetColumn(1));
    Vector3 rayDirection = Vector3.Normalize(cameraRotation * cameraSpaceDirection);
    Vector3 rayOrigin = cameraToWorldMatrix.GetColumn(3);

    // Ray-cast against the spatial mesh; if nothing is hit, return a point 5 m out.
    RaycastHit hit = new RaycastHit();
    if (Physics.Raycast(rayOrigin, rayDirection, out hit, Mathf.Infinity, this.SpatialAwarnessLayerMask))
    {
        return hit.point;
    }
    return rayOrigin + rayDirection * 5.0f;
}
// Analyzes the most recently captured camera frame for people and places a text
// label above the detected person whose face lies closest to the user's gaze.
void ProcessFrame(SpatialCoordinateSystem worldCoordinateSystem)
{
    if (!IsInValidateStateToProcessFrame())
    {
        return;
    }

    // Grab the last captured frame; bail out if nothing is available yet.
    FrameGrabber.Frame frame = frameGrabber.LastFrame;
    if (frame.mediaFrameReference == null)
    {
        return;
    }

    MediaFrameReference mediaFrameReference = frame.mediaFrameReference;
    SpatialCoordinateSystem cameraCoordinateSystem = mediaFrameReference.CoordinateSystem;
    CameraIntrinsics cameraIntrinsics = mediaFrameReference.VideoMediaFrame.CameraIntrinsics;

    // If the camera cannot be located in the world, this transform is null — skip the frame.
    Matrix4x4? cameraToWorld = cameraCoordinateSystem.TryGetTransformTo(worldCoordinateSystem);
    if (!cameraToWorld.HasValue)
    {
        return;
    }

    // Depth heuristic: an average face is ~15 cm wide, so its expected on-screen
    // width in pixels at 1 m distance is focalLength.X * 0.15.
    float averageFaceWidthInMeters = 0.15f;
    float pixelsPerMeterAlongX = cameraIntrinsics.FocalLength.X;
    float averagePixelsForFaceAt1Meter = pixelsPerMeterAlongX * averageFaceWidthInMeters;

    // Place the label 25cm above the center of the face.
    Vector3 labelOffsetInWorldSpace = new Vector3(0.0f, 0.25f, 0.0f);

    frameAnalyzer.AnalyzeFrame(frame.mediaFrameReference, (status, detectedPersons) =>
    {
        if (status > 0 && detectedPersons.Count > 0)
        {
            FrameAnalyzer.Bounds? bestRect = null;
            Vector3 bestRectPositionInCameraSpace = Vector3.Zero;
            float bestDotProduct = -1.0f;
            FrameAnalyzer.DetectedPerson bestPerson = null;

            foreach (var dp in detectedPersons)
            {
                Debug.WriteLine($"Detected person: {dp.ToString()}");

                Point faceRectCenterPoint = new Point(
                    dp.bounds.left + dp.bounds.width / 2,
                    dp.bounds.top + dp.bounds.height / 2);

                // Unproject the face center to unit depth, then normalize into a
                // camera-space direction towards the face (-Z is the view direction).
                Vector2 centerOfFace = cameraIntrinsics.UnprojectAtUnitDepth(faceRectCenterPoint);
                Vector3 vectorTowardsFace = Vector3.Normalize(new Vector3(centerOfFace.X, centerOfFace.Y, -1.0f));

                // Dot product against the gaze vector: the closer to 1.0, the closer
                // this face is to the middle of the video image.
                float dotFaceWithGaze = Vector3.Dot(vectorTowardsFace, -Vector3.UnitZ);
                if (dotFaceWithGaze > bestDotProduct)
                {
                    // Estimate depth from the ratio of this face's pixel width to the
                    // expected width of a face at 1 m, then scale the direction by it.
                    float estimatedFaceDepth = averagePixelsForFaceAt1Meter / (float)dp.bounds.width;
                    bestDotProduct = dotFaceWithGaze;
                    bestRect = dp.bounds;
                    bestRectPositionInCameraSpace = vectorTowardsFace * estimatedFaceDepth;
                    bestPerson = dp;
                }
            }

            if (bestRect.HasValue)
            {
                // Camera space -> world space, then raise the label above the face.
                Vector3 bestRectPositionInWorldspace = Vector3.Transform(bestRectPositionInCameraSpace, cameraToWorld.Value);
                quadRenderer.TargetPosition = bestRectPositionInWorldspace + labelOffsetInWorldSpace;
                textRenderer.RenderTextOffscreen($"{bestPerson.name}, {bestPerson.gender}, Age: {bestPerson.age}");
                lastFaceDetectedTimestamp = Utils.GetCurrentUnixTimestampMillis();
            }
        }
    });
}
// Runs the ONNX object-detection model on a camera frame and reports each
// confident detection — world-space position and approximate size — to the Unity app.
//
// frame:                  the RGB video frame fed to the model (may be null; ignored if so)
// VideoFrame:             the media frame carrying the camera intrinsics
// worldCoordinateSystem:  the app's world coordinate system
// cameraCoordinateSystem: the camera's coordinate system for this frame
public async Task EvaluateVideoFrameAsync(VideoFrame frame, VideoMediaFrame VideoFrame, SpatialCoordinateSystem worldCoordinateSystem, SpatialCoordinateSystem cameraCoordinateSystem)
{
    if (frame == null)
    {
        return;
    }

    try
    {
        TimeRecorder.Restart();

        // Camera -> world transform. TryGetTransformTo returns null when the camera
        // cannot be located in the world; skip the frame instead of throwing.
        // (The original forced cast threw InvalidOperationException here.)
        Matrix4x4? cameraToWorldNullable = cameraCoordinateSystem.TryGetTransformTo(worldCoordinateSystem);
        if (!cameraToWorldNullable.HasValue)
        {
            return;
        }
        Matrix4x4 cameraToWorld = cameraToWorldNullable.Value;

        // Internal orientation of the camera.
        CameraIntrinsics cameraIntrinsics = VideoFrame.CameraIntrinsics;

        // NOTE(review): the depth frame (VideoFrame.DepthMediaFrame) is not used;
        // TryCreateCoordinateMapper previously caused an error and was disabled.

        ONNXModelInput inputData = new ONNXModelInput();
        inputData.Data = frame;
        var output = await Model.EvaluateAsync(inputData).ConfigureAwait(false);
        TimeRecorder.Stop();

        string timeStamp = $"({DateTime.Now})";

        foreach (var prediction in output)
        {
            var product = prediction.TagName;
            var loss = prediction.Probability;

            // Ignore low-confidence detections.
            if (loss <= 0.5f)
            {
                continue;
            }

            // Bounding box in normalized [0,1] image coordinates.
            float left = prediction.BoundingBox.Left;
            float top = prediction.BoundingBox.Top;
            float right = prediction.BoundingBox.Left + prediction.BoundingBox.Width;
            float x = prediction.BoundingBox.Left + prediction.BoundingBox.Width / 2;
            float y = prediction.BoundingBox.Top + prediction.BoundingBox.Height / 2;

            Direct3DSurfaceDescription pixelData = frame.Direct3DSurface.Description;
            int height = pixelData.Height;
            int width = pixelData.Width;

            // Maps a normalized image coordinate to a world-space position,
            // estimating depth from the detection's apparent width.
            Vector3 ImageToWorld(float X, float Y)
            {
                // Screen space -> camera space: unproject the pixel onto the plane
                // one meter from the camera.
                Vector2 objectCenter = cameraIntrinsics.UnprojectAtUnitDepth(new Point(X * width, Y * height));

                // Construct a ray towards the object (-Z is the view direction).
                Vector3 vectorTowardsObject = Vector3.Normalize(new Vector3(objectCenter.X, objectCenter.Y, -1.0f));

                // Depth heuristic: an object 0.3 m wide spans the full normalized
                // width at 1 m. Less accurate than using the depth frame.
                float estimatedVendingMachineDepth = (0.3f / prediction.BoundingBox.Width) * 1;

                // Scale the ray by the estimated distance, then camera -> world space.
                Vector3 vectorToObject = vectorTowardsObject * estimatedVendingMachineDepth;
                return Vector3.Transform(vectorToObject, cameraToWorld);
            }

            Vector3 objectCenterInWorld = ImageToWorld(x, y);
            Vector3 objectTopLeft = ImageToWorld(left, top);
            Vector3 objectTopRight = ImageToWorld(right, top);

            // World-space width from the top edge; height derived from the bounding
            // box's pixel aspect ratio.
            float widthInWorld = Vector3.Distance(objectTopLeft, objectTopRight);
            float heightInWorld = widthInWorld / (width * prediction.BoundingBox.Width) * (height * prediction.BoundingBox.Height);

            var lossStr = (loss * 100.0f).ToString("#0.00") + "%";
            UnityApp.StoreNetworkResult(timeStamp, product, lossStr,
                objectCenterInWorld.X, objectCenterInWorld.Y, objectCenterInWorld.Z,
                widthInWorld, heightInWorld);
        }
    }
    catch (Exception ex)
    {
        ModifyText(ex.Message);
    }
}
// Picks the detected face closest to the user's gaze, estimates its depth from its
// apparent width, and moves the cube 25 cm above that face in world space.
//
// faces:            face rectangles detected in the frame (pixel coordinates)
// frame:            the media frame the faces were detected in
// worldCoordSystem: the app's world coordinate system
private void ProcessFaces(List<BitmapBounds> faces, MediaFrameReference frame, SpatialCoordinateSystem worldCoordSystem)
{
    // Nothing to do for an empty detection list. Without this guard the loop is
    // skipped but the trailing code still runs, snapping the cube to the camera
    // origin plus the offset (bestRect stays default-initialized).
    if (faces.Count == 0)
    {
        return;
    }

    VideoMediaFrameFormat videoFormat = frame.VideoMediaFrame.VideoFormat;
    SpatialCoordinateSystem cameraCoordinateSystem = frame.CoordinateSystem;
    CameraIntrinsics cameraIntrinsics = frame.VideoMediaFrame.CameraIntrinsics;

    System.Numerics.Matrix4x4? cameraToWorld = cameraCoordinateSystem.TryGetTransformTo(worldCoordSystem);

    // If we can't locate the world, this transform will be null.
    if (!cameraToWorld.HasValue)
    {
        return;
    }

    float textureWidthInv = 1.0f / videoFormat.Width;
    float textureHeightInv = 1.0f / videoFormat.Height;

    // The face analysis returns very "tight fitting" rectangles.
    // We add some padding to make the visuals more appealing.
    int paddingForFaceRect = 24;

    // Depth heuristic: an average face is ~15 cm wide, so its expected on-screen
    // width in pixels at 1 m distance is focalLength.X * 0.15.
    float averageFaceWidthInMeters = 0.15f;
    float pixelsPerMeterAlongX = cameraIntrinsics.FocalLength.X;
    float averagePixelsForFaceAt1Meter = pixelsPerMeterAlongX * averageFaceWidthInMeters;

    // Place the cube 25cm above the center of the face.
    System.Numerics.Vector3 cubeOffsetInWorldSpace = new System.Numerics.Vector3(0.0f, 0.25f, 0.0f);

    BitmapBounds bestRect = new BitmapBounds();
    System.Numerics.Vector3 bestRectPositionInCameraSpace = System.Numerics.Vector3.Zero;
    float bestDotProduct = -1.0f;

    foreach (BitmapBounds faceRect in faces)
    {
        Point faceRectCenterPoint = new Point(faceRect.X + faceRect.Width / 2u, faceRect.Y + faceRect.Height / 2u);

        // Calculate the vector towards the face at 1 meter.
        System.Numerics.Vector2 centerOfFace = cameraIntrinsics.UnprojectAtUnitDepth(faceRectCenterPoint);

        // Add the Z component and normalize (-Z is the view direction).
        System.Numerics.Vector3 vectorTowardsFace = System.Numerics.Vector3.Normalize(new System.Numerics.Vector3(centerOfFace.X, centerOfFace.Y, -1.0f));

        // Estimate depth using the ratio of the current faceRect width with the
        // average faceRect width at 1 meter.
        float estimatedFaceDepth = averagePixelsForFaceAt1Meter / faceRect.Width;

        // Get the dot product between the vector towards the face and the gaze
        // vector: the closer to 1.0, the closer the face is to the middle of the image.
        float dotFaceWithGaze = System.Numerics.Vector3.Dot(vectorTowardsFace, -System.Numerics.Vector3.UnitZ);

        // Scale the vector towards the face by the depth, and add an offset for the cube.
        System.Numerics.Vector3 targetPositionInCameraSpace = vectorTowardsFace * estimatedFaceDepth;

        // Pick the faceRect that best matches the users gaze.
        if (dotFaceWithGaze > bestDotProduct)
        {
            bestDotProduct = dotFaceWithGaze;
            bestRect = faceRect;
            bestRectPositionInCameraSpace = targetPositionInCameraSpace;
        }
    }

    // Transform the cube from Camera space to World space.
    System.Numerics.Vector3 bestRectPositionInWorldspace = System.Numerics.Vector3.Transform(bestRectPositionInCameraSpace, cameraToWorld.Value);
    cubeRenderer.SetTargetPosition(bestRectPositionInWorldspace + cubeOffsetInWorldSpace);

    // Texture coordinates are [0,1], but our FaceRect is [0,Width] and [0,Height],
    // so normalize, adding padding to make the rect more visually appealing.
    // NOTE(review): these four values are computed but unused in the visible code —
    // confirm whether downstream rendering code was removed.
    float normalizedWidth = (bestRect.Width + paddingForFaceRect * 2u) * textureWidthInv;
    float normalizedHeight = (bestRect.Height + paddingForFaceRect * 2u) * textureHeightInv;
    float normalizedX = (bestRect.X - paddingForFaceRect) * textureWidthInv;
    float normalizedY = (bestRect.Y - paddingForFaceRect) * textureHeightInv;
}