I think I’m close on this one but have been stumped for days on how to calculate a real-world distance between 2 Vision points.
Using the front TrueDepth camera, I gathered two Vision points on a hand, used those points to look up each point's distance from the camera in the depthMap, and then leveraged the camera intrinsics (the focal length) to compute my X and Y coordinates. Together with the depth from the map, which I use as Z, I call simd_precise_distance to get the distance between the two 3D points. I also ran the same calculation with a hand-rolled Euclidean distance function, and the results were identical.
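For what it's worth, here is the sanity check (with made-up coordinates) I used to convince myself the distance math itself is fine — `euclideanDistance` is my own helper, not an API:

```swift
import simd

// Hand-rolled Euclidean distance: d = √[(x2-x1)² + (y2-y1)² + (z2-z1)²]
func euclideanDistance(_ a: simd_float3, _ b: simd_float3) -> Float {
    let d = a - b
    return (d.x * d.x + d.y * d.y + d.z * d.z).squareRoot()
}

// Hypothetical camera-space points: X, Y in meters, Z = depth in meters
let p1 = simd_float3(0.01, 0.02, 0.45)
let p2 = simd_float3(-0.03, 0.05, 0.48)

// Both calls print the same value, so the distance formula isn't the problem
print(simd_precise_distance(p1, p2))
print(euclideanDistance(p1, p2))
```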
However, the resulting distance is not close to the real-world measurement. It is roughly half, but I cannot say it is exactly half. In this post, Andy Jazz does mention that the iPhone viewport might be 1/2 or 1/3, but I'm not sure that means I can simply double my number.
In my research I found numerous ways to attack this problem, and as you can see below, I also attempted to find the points this way:
/*
let xrwPoint1 = (Float(convertedPoints[0].x) - cameraIntrinsics[2][0]) * distanceValue1 / cameraIntrinsics[0][0]
let yrwPoint1 = (Float(convertedPoints[0].y) - cameraIntrinsics[2][1]) * distanceValue1 / cameraIntrinsics[1][1]
let xrwPoint2 = (Float(convertedPoints[1].x) - cameraIntrinsics[2][0]) * distanceValue2 / cameraIntrinsics[0][0]
let yrwPoint2 = (Float(convertedPoints[1].y) - cameraIntrinsics[2][1]) * distanceValue2 / cameraIntrinsics[1][1]
print("xrw = ", xrwPoint1)
print("yrw = ", yrwPoint1)
*/
However, that was not accurate either. What I am wondering now, but have not been able to find in my research, is whether I need to convert the actual depth in meters to a different Z value before calculating the distance. I'm basically lost there.
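To make the question concrete, here is a minimal sketch of the pinhole back-projection I believe I should be doing. All the numbers are made up for illustration and the `unproject` helper is my own; the open questions are whether the depth-map value can be used directly as Z here, and whether the intrinsics need rescaling to the depth map's resolution:

```swift
import simd

// Back-project a pixel (u, v) with depth Z (meters) into camera space,
// using fx, fy and the principal point (cx, cy) from the intrinsics matrix.
func unproject(_ pixel: simd_float2, depth: Float, intrinsics: simd_float3x3) -> simd_float3 {
    let fx = intrinsics.columns.0.x
    let fy = intrinsics.columns.1.y
    let cx = intrinsics.columns.2.x
    let cy = intrinsics.columns.2.y
    let x = (pixel.x - cx) * depth / fx
    let y = (pixel.y - cy) * depth / fy
    return simd_float3(x, y, depth)   // depth used directly as Z — is that right?
}

// Hypothetical intrinsics for illustration only (column-major)
let intrinsics = simd_float3x3(columns: (
    simd_float3(2740, 0, 0),      // fx
    simd_float3(0, 2740, 0),      // fy
    simd_float3(1920, 1080, 1)    // cx, cy
))

// Two hypothetical hand points in pixel coordinates with their sampled depths
let p1 = unproject(simd_float2(1000, 900), depth: 0.45, intrinsics: intrinsics)
let p2 = unproject(simd_float2(1400, 1200), depth: 0.48, intrinsics: intrinsics)
print(simd_precise_distance(p1, p2))
```

A pixel at the principal point should come back as (0, 0, Z), which is an easy way to check the helper is wired up correctly.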
Here is the full function. I left the commented-out code in to show what I've tried.
func processPoints(_ handPoints: [CGPoint], _ depthPixelBuffer: CVImageBuffer, _ videoPixelBuffer: CVImageBuffer, _ cameraIntrinsics: simd_float3x3) {
    let convertedPoints = handPoints.map {
        cameraView.previewLayer.layerPointConverted(fromCaptureDevicePoint: $0)
    }

    if handPoints.count == 2 {
        let handVisionPoint1 = handPoints[0]
        let handVisionPoint2 = handPoints[1]

        // The depth buffer must be 32-bit float; bail out (before locking) if not
        guard CVPixelBufferGetPixelFormatType(depthPixelBuffer) == kCVPixelFormatType_DepthFloat32 else { return }

        CVPixelBufferLockBaseAddress(depthPixelBuffer, .readOnly)
        defer { CVPixelBufferUnlockBaseAddress(depthPixelBuffer, .readOnly) }

        let width = CVPixelBufferGetWidth(depthPixelBuffer)
        let height = CVPixelBufferGetHeight(depthPixelBuffer)

        // Vision points are normalized (0...1), so scale them up to depth-map pixels
        let colPosition1 = Int(handVisionPoint1.x * CGFloat(width))
        let rowPosition1 = Int(handVisionPoint1.y * CGFloat(height))
        let colPosition2 = Int(handVisionPoint2.x * CGFloat(width))
        let rowPosition2 = Int(handVisionPoint2.y * CGFloat(height))

        /*
        // Earlier attempt: scale the points by the depth/video resolution ratio
        let scaleFactor = CGFloat(CVPixelBufferGetWidth(depthPixelBuffer)) / CGFloat(CVPixelBufferGetWidth(videoPixelBuffer))
        let handVisionPixelX = Int((handVisionPoint1.x * scaleFactor).rounded())
        let handVisionPixelY = Int((handVisionPoint1.y * scaleFactor).rounded())
        let handVisionPixel2X = Int((handVisionPoint2.x * scaleFactor).rounded())
        let handVisionPixel2Y = Int((handVisionPoint2.y * scaleFactor).rounded())
        */

        guard let baseAddress = CVPixelBufferGetBaseAddress(depthPixelBuffer) else { return }

        // Use bytesPerRow rather than width * stride, since rows can be padded
        let bytesPerRow = CVPixelBufferGetBytesPerRow(depthPixelBuffer)
        let offset1 = rowPosition1 * bytesPerRow + colPosition1 * MemoryLayout<Float32>.stride
        let offset2 = rowPosition2 * bytesPerRow + colPosition2 * MemoryLayout<Float32>.stride
        let distanceValue1 = baseAddress.load(fromByteOffset: offset1, as: Float32.self)
        let distanceValue2 = baseAddress.load(fromByteOffset: offset2, as: Float32.self)
        print("DISTANCE POINT 1 IS ", distanceValue1)
        print("DISTANCE POINT 2 IS ", distanceValue2)

        /*
        // Earlier attempt using the principal point from the intrinsics
        let xrwPoint1 = (Float(convertedPoints[0].x) - cameraIntrinsics[2][0]) * distanceValue1 / cameraIntrinsics[0][0]
        let yrwPoint1 = (Float(convertedPoints[0].y) - cameraIntrinsics[2][1]) * distanceValue1 / cameraIntrinsics[1][1]
        let xrwPoint2 = (Float(convertedPoints[1].x) - cameraIntrinsics[2][0]) * distanceValue2 / cameraIntrinsics[0][0]
        let yrwPoint2 = (Float(convertedPoints[1].y) - cameraIntrinsics[2][1]) * distanceValue2 / cameraIntrinsics[1][1]
        print("xrw = ", xrwPoint1)
        print("yrw = ", yrwPoint1)
        */

        // u = column - width/2, v = height/2 - row (offsets from the image center)
        let uPoint1 = Float(convertedPoints[0].x - CGFloat(width) / 2)
        let vPoint1 = Float(CGFloat(height) / 2 - convertedPoints[0].y)
        let uPoint2 = Float(convertedPoints[1].x - CGFloat(width) / 2)
        let vPoint2 = Float(CGFloat(height) / 2 - convertedPoints[1].y)

        let focalLengthPx = cameraIntrinsics.columns.0.x

        // X = u * Z / f, Y = v * Z / f
        let xPoint1 = uPoint1 * distanceValue1 / focalLengthPx
        let yPoint1 = vPoint1 * distanceValue1 / focalLengthPx
        let xPoint2 = uPoint2 * distanceValue2 / focalLengthPx
        let yPoint2 = vPoint2 * distanceValue2 / focalLengthPx

        let visionPoint1In3D = simd_float3(xPoint1, -yPoint1, distanceValue1)
        let visionPoint2In3D = simd_float3(xPoint2, -yPoint2, distanceValue2)

        // Gives the same result as my hand-rolled Euclidean distance function
        let dist = simd_precise_distance(visionPoint1In3D, visionPoint2In3D)
        print("Distance In Meters Is ", dist)

        /*
        X = u * Z / f
        Y = v * Z / f
        where f is the camera focal length in pixels,
        Z is the depth in meters,
        u = column - width/2,
        v = height/2 - row,
        and the 3D distance is the Euclidean formula:
        d = √[(x2 - x1)² + (y2 - y1)² + (z2 - z1)²]
        */
    }
}
Thank you for any thoughts you have!