> ## Documentation Index
> Fetch the complete documentation index at: https://docs.phospho.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# Computer vision

> How to leverage vision models for robotics.

The dev kit comes with a stereoscopic camera, ideal for 3D vision and AI workflows.
All code examples can be found in our open source repo [here](https://github.com/phospho-app/phosphobot).

## Wave back

Using OpenCV we can detect a wave gesture and make the robot wave back in just a couple lines of code.

<iframe className="w-full aspect-video" src="https://www.youtube.com/embed/rOkV3PCJgEU?si=CqU6hRiALBmrti0i" title="YouTube video player" alt="The robot waves back to the user" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerPolicy="strict-origin-when-cross-origin" allowFullScreen />

#

<AccordionGroup>
  <Accordion title="Show code - python implementation">
    ```python wave.py theme={null}
    import cv2
    import sys
    import time
    import signal
    import requests
    import mediapipe as mp  # type: ignore

    # Configurations
    PI_IP: str = "127.0.0.1"
    PI_PORT: int = 8080

    # Initialize MediaPipe Hand tracking
    mp_hands = mp.solutions.hands
    hands = mp_hands.Hands(
        static_image_mode=False, max_num_hands=1, min_detection_confidence=0.7
    )


    # Handle Ctrl+C to exit the program gracefully
    def signal_handler(sig, frame):
        print("\nExiting gracefully...")
        cap.release()
        cv2.destroyAllWindows()
        hands.close()
        sys.exit(0)


    signal.signal(signal.SIGINT, signal_handler)


    # Function to call the API
    def call_to_api(endpoint: str, data: dict = {}):
        response = requests.post(f"http://{PI_IP}:{PI_PORT}/move/{endpoint}", json=data)
        return response.json()


    def wave_motion():
        points = 5
        for _ in range(2):
            for i in range(points):
                call_to_api(
                    "absolute",
                    {
                        "x": 0,
                        "y": 2 * (-1) ** i,
                        "z": 0,
                        "rx": -90,
                        "ry": 0,
                        "rz": 0,
                        "open": i % 2 == 0,
                    },
                )
                time.sleep(0.2)


    call_to_api("init")
    cap = cv2.VideoCapture(0)
    last_wave_time: float = 0
    WAVE_COOLDOWN: float = 3

    try:
        while True:
            success, image = cap.read()
            if not success:
                continue

            results = hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

            current_time = time.time()
            if (
                results.multi_hand_landmarks
                and current_time - last_wave_time > WAVE_COOLDOWN
            ):
                wave_motion()
                last_wave_time = current_time

            cv2.imshow("Hand Detection", image)
            cv2.waitKey(1)

    except KeyboardInterrupt:
        print("\nExiting gracefully...")
    finally:
        cap.release()
        cv2.destroyAllWindows()
        hands.close()
    ```
  </Accordion>
</AccordionGroup>

## Hand tracking

This is a simple implementation of hand tracking using the MediaPipe library. The robot moves based on the hand position and closure.

<iframe className="w-full aspect-video" src="https://www.youtube.com/embed/xNXf1orc0uo?si=CKTHaSVHpLewXRIT" title="YouTube video player" frameBorder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerPolicy="strict-origin-when-cross-origin" allowFullScreen />

#

<AccordionGroup>
  <Accordion title="Show code - python implementation">
    ```python hand_tracking.py theme={null}
    import sys
    import cv2
    import time
    import signal
    import requests
    import numpy as np
    import mediapipe as mp  # type: ignore

    # Configurations
    PI_IP: str = "127.0.0.1"
    PI_PORT: int = 8080

    # Initialize MediaPipe Hand tracking
    mp_hands = mp.solutions.hands
    hands = mp_hands.Hands(
        static_image_mode=False,
        max_num_hands=1,
        min_detection_confidence=0.7,
        min_tracking_confidence=0.7,
    )
    mp_draw = mp.solutions.drawing_utils


    # Handle Ctrl+C to exit the program gracefully
    def signal_handler(sig, frame):
        print("\nExiting gracefully...")
        cap.release()
        cv2.destroyAllWindows()
        hands.close()
        sys.exit(0)


    signal.signal(signal.SIGINT, signal_handler)


    # Function to call the API
    def call_to_api(endpoint: str, data: dict = {}):
        response = requests.post(f"http://{PI_IP}:{PI_PORT}/move/{endpoint}", json=data)
        return response.json()


    def calculate_hand_closure(hand_landmarks):
        """
        Calculate if the hand is closed based on thumb and index finger distance
        Returns a value between 0 (open) and 1 (closed)
        """
        thumb_tip = hand_landmarks.landmark[4]
        index_tip = hand_landmarks.landmark[8]

        distance = np.sqrt(
            (thumb_tip.x - index_tip.x) ** 2
            + (thumb_tip.y - index_tip.y) ** 2
            + (thumb_tip.z - index_tip.z) ** 2
        )

        # Normalize distance (these values might need adjustment based on the hand size)
        normalized = np.clip(1.0 - (distance * 5), 0, 1)
        return normalized


    # 1 - Initialize the robot
    call_to_api("init")
    print("Initializing robot")
    time.sleep(2)

    # Initialize webcam
    cap = cv2.VideoCapture(0)

    # Get camera frame dimensions
    frame_width: float = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height: float = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Define the workspace boundaries (in meters)
    WORKSPACE_Y = 0.4
    WORKSPACE_Z = 0.2


    def map_to_robot_coordinates(hand_x, hand_y):
        """
        Map normalized hand coordinates to robot workspace coordinates
        We match the hand x coordinate to the robot y coordinate
        And the hand y coordinate to the robot z coordinate
        """
        robot_y = ((0.5 - hand_x) * 2) * (WORKSPACE_Y / 2) * 100
        robot_z = ((0.5 - hand_y) * 2) * (WORKSPACE_Z / 2) * 100
        return robot_y, robot_z


    # Previous position for smoothing, this helps make the robot movements less jerky
    prev_pos = {"y": 0, "z": 0}
    smoothing_factor = 0.5

    try:
        while cap.isOpened():
            success, image = cap.read()
            if not success:
                print("Failed to capture frame")
                continue

            image = cv2.flip(image, 1)  # The front camera is inverted
            rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert to RGB image
            results = hands.process(rgb_image)

            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
                    mp_draw.draw_landmarks(image, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                    palm = hand_landmarks.landmark[0]
                    hand_closed = calculate_hand_closure(hand_landmarks)

                    robot_y, robot_z = map_to_robot_coordinates(palm.x, palm.y)

                    robot_y = prev_pos["y"] * smoothing_factor + robot_y * (
                        1 - smoothing_factor
                    )
                    robot_z = prev_pos["z"] * smoothing_factor + robot_z * (
                        1 - smoothing_factor
                    )

                    prev_pos = {"y": robot_y, "z": robot_z}

                    call_to_api(
                        "absolute",
                        {
                            "x": -5,
                            "y": robot_y,
                            "z": robot_z,
                            "rx": 0,
                            "ry": 0,
                            "rz": 0,
                            "open": 1 - hand_closed,
                        },
                    )

                    cv2.putText(
                        image,
                        f"Position: (y:{robot_y:.3f}, z:{robot_z:.3f})",
                        (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        1,
                        (0, 255, 0),
                        2,
                    )
                    cv2.putText(
                        image,
                        f"Grip: {'Closed' if hand_closed > 0.5 else 'Open'}",
                        (10, 70),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        1,
                        (0, 255, 0),
                        2,
                    )

            cv2.imshow("Hand Tracking", image)
            cv2.waitKey(1)

    except KeyboardInterrupt:
        print("\nExiting gracefully...")
    finally:
        cap.release()
        cv2.destroyAllWindows()
        hands.close()
    ```
  </Accordion>
</AccordionGroup>

## Rock paper scissors

Create fun interactions with your robot with this simple implementation of a Rock Paper Scissors game.

<iframe className="w-full aspect-video" src="https://www.youtube.com/embed/lpCpK0SSS1Y?si=NX5CA-2vsfQlPc4p" title="YouTube video player" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerPolicy="strict-origin-when-cross-origin" allowFullScreen />

#

<AccordionGroup>
  <Accordion title="Show code - python implementation">
    ```python rock_paper_scissors.py theme={null}
    import cv2
    import time
    import random
    import requests
    import numpy as np
    import mediapipe as mp  # type: ignore

    # Robot API Configuration
    PI_IP = "127.0.0.1"
    PI_PORT = 8080


    class RockPaperScissorsGame:
        def __init__(self):
            self.mp_hands = mp.solutions.hands
            self.hands = self.mp_hands.Hands(
                static_image_mode=False,
                max_num_hands=1,
                min_detection_confidence=0.7,
                min_tracking_confidence=0.7,
            )
            self.cap = cv2.VideoCapture(0)
            self.gestures = {
                "rock": self.make_rock_gesture,
                "paper": self.make_paper_gesture,
                "scissors": self.make_scissors_gesture,
            }

        def call_to_api(self, endpoint: str, data: dict = {}):
            response = requests.post(f"http://{PI_IP}:{PI_PORT}/move/{endpoint}", json=data)
            return response.json()

        def detect_gesture(self, hand_landmarks):
            # Get relevant finger landmarks
            thumb_tip = hand_landmarks.landmark[4]
            index_tip = hand_landmarks.landmark[8]
            middle_tip = hand_landmarks.landmark[12]
            ring_tip = hand_landmarks.landmark[16]
            pinky_tip = hand_landmarks.landmark[20]

            # Get wrist position for reference
            wrist = hand_landmarks.landmark[0]

            # Calculate distances from wrist
            fingers_extended = []
            for tip in [thumb_tip, index_tip, middle_tip, ring_tip, pinky_tip]:
                distance = np.sqrt((tip.x - wrist.x) ** 2 + (tip.y - wrist.y) ** 2)
                fingers_extended.append(distance > 0.2)  # Threshold for extended fingers

            # Determine gesture
            if not any(fingers_extended[1:]):  # All fingers closed
                return "rock"
            elif all(fingers_extended):  # All fingers open
                return "paper"
            elif (
                fingers_extended[1]
                and fingers_extended[2]
                and not fingers_extended[3]
                and not fingers_extended[4]
            ):  # Only index and middle extended
                return "scissors"
            return None

        def make_rock_gesture(self):
            # Move to closed fist position
            self.call_to_api(
                "absolute",
                {"x": 0, "y": 0, "z": 5, "rx": 0, "ry": 0, "rz": 0, "open": 0},
            )

        def make_paper_gesture(self):
            # Move to open hand position
            self.call_to_api(
                "absolute",
                {"x": 0, "y": 0, "z": 5, "rx": 0, "ry": 0, "rz": 0, "open": 1},
            )

        def make_scissors_gesture(self):
            # Move to scissors position
            self.call_to_api(
                "absolute",
                {"x": 0, "y": 0, "z": 5, "rx": 0, "ry": -45, "rz": 0, "open": 0.5},
            )

        def move_up_down(self, times=3):
            for step in range(times + 1):
                self.call_to_api(
                    "absolute",
                    {"x": 0, "y": 0, "z": 4, "rx": 0, "ry": 0, "rz": 0, "open": 0},
                )
                time.sleep(0.25)
                self.call_to_api(
                    "absolute",
                    {"x": 0, "y": 0, "z": -4, "rx": 0, "ry": 0, "rz": 0, "open": 0},
                )
                time.sleep(0.25)
                print(times - step)

        def determine_winner(self, player_gesture, robot_gesture):
            if player_gesture == robot_gesture:
                return "Tie!"
            winners = {"rock": "scissors", "paper": "rock", "scissors": "paper"}
            return (
                "Player wins!"
                if winners[player_gesture] == robot_gesture
                else "Robot wins!"
            )

        def play_game(self):
            print("Initializing robot...")
            self.call_to_api("init")
            time.sleep(1)

            print("Robot performing countdown...")
            self.move_up_down(times=3)

            ret, frame = self.cap.read()
            if not ret:
                print("Failed to capture image.")
                return

            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = self.hands.process(rgb_frame)

            if results.multi_hand_landmarks:
                player_gesture = self.detect_gesture(results.multi_hand_landmarks[0])

                if player_gesture:
                    robot_gesture = random.choice(["rock", "paper", "scissors"])
                    print(f"\nPlayer chose: {player_gesture}")
                    print(f"Robot chose: {robot_gesture}")

                    self.gestures[robot_gesture]()  # Robot makes its gesture
                    result = self.determine_winner(player_gesture, robot_gesture)
                    print(result)
                    time.sleep(2)
                else:
                    print("Gesture not detected. Please try again.")
            else:
                print("No hand detected. Please try again.")

            self.cap.release()
            cv2.destroyAllWindows()


    if __name__ == "__main__":
        game = RockPaperScissorsGame()
        game.play_game()
    ```
  </Accordion>
</AccordionGroup>