EyePy/demo.py

import cv2
import numpy as np
import tkinter as tk
import time
from gaze_estimator import GazeEstimator


def run_calibration(gaze_estimator, camera_index=0):
    root = tk.Tk()
    screen_width = root.winfo_screenwidth()
    screen_height = root.winfo_screenheight()
    root.destroy()

    points = [
        (screen_width / 2, screen_height / 2),  # Middle
        (50, 50),  # Top left
        (screen_width - 50, 50),  # Top right
        (50, screen_height - 50),  # Bottom left
        (screen_width - 50, screen_height - 50),  # Bottom right
        (50, 50),  # Top left
        (50, screen_height - 50),  # Bottom left
        (screen_width - 50, 50),  # Top right
        (screen_width - 50, screen_height - 50),  # Bottom right
        (screen_width / 2, screen_height / 2),  # Middle
    ]

    cv2.namedWindow("Calibration", cv2.WND_PROP_FULLSCREEN)
    cv2.setWindowProperty("Calibration", cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN)

    cap = cv2.VideoCapture(camera_index)

    features_list = []
    targets_list = []

    N = 30  # Frames per movement

    def ease_in_out_quad(t):
        return t * t * (3 - 2 * t)

    face_detected = False
    countdown_active = False
    face_detection_start_time = None
    countdown_duration = 2

    while True:
        ret, frame = cap.read()
        if not ret:
            continue

        features, blink_detected = gaze_estimator.extract_features(frame)
        if features is not None and not blink_detected:
            face_detected = True
        else:
            face_detected = False

        canvas = np.zeros((screen_height, screen_width, 3), dtype=np.uint8)

        current_time = time.time()

        if face_detected:
            if not countdown_active:
                face_detection_start_time = current_time
                countdown_active = True
            elapsed_time = current_time - face_detection_start_time
            if elapsed_time >= countdown_duration:
                countdown_active = False
                break
            else:
                t = elapsed_time / countdown_duration
                eased_t = t * t * (3 - 2 * t)
                angle = 360 * (1 - eased_t)
                center = (screen_width // 2, screen_height // 2)
                radius = 50
                axes = (radius, radius)
                start_angle = -90
                end_angle = start_angle + angle
                color = (0, 255, 0)
                thickness = -1
                cv2.ellipse(
                    canvas, center, axes, 0, start_angle, end_angle, color, thickness
                )
        else:
            countdown_active = False
            face_detection_start_time = None
            text = "Face not detected"
            font = cv2.FONT_HERSHEY_SIMPLEX
            font_scale = 2
            color = (0, 0, 255)
            thickness = 3
            text_size, _ = cv2.getTextSize(text, font, font_scale, thickness)
            text_x = (screen_width - text_size[0]) // 2
            text_y = (screen_height + text_size[1]) // 2
            cv2.putText(
                canvas, text, (text_x, text_y), font, font_scale, color, thickness
            )

        cv2.imshow("Calibration", canvas)
        if cv2.waitKey(1) == 27:
            cap.release()
            cv2.destroyWindow("Calibration")
            return

    for i in range(len(points) - 1):
        p0 = points[i]
        p1 = points[i + 1]

        for frame_idx in range(N):
            ret, frame = cap.read()
            if not ret:
                continue

            t = frame_idx / (N - 1)
            eased_t = ease_in_out_quad(t)

            x = int(p0[0] + (p1[0] - p0[0]) * eased_t)
            y = int(p0[1] + (p1[1] - p0[1]) * eased_t)

            canvas = np.zeros((screen_height, screen_width, 3), dtype=np.uint8)
            cv2.circle(canvas, (x, y), 20, (0, 255, 0), -1)

            cv2.imshow("Calibration", canvas)
            cv2.waitKey(1)

            features, blink_detected = gaze_estimator.extract_features(frame)
            if features is not None and not blink_detected:
                features_list.append(features)
                targets_list.append([x, y])

    cap.release()
    cv2.destroyWindow("Calibration")

    X = np.array(features_list)
    y = np.array(targets_list)

    gaze_estimator.train(X, y)


def fine_tune_kalman_filter(gaze_estimator, kalman, camera_index=0):
    root = tk.Tk()
    screen_width = root.winfo_screenwidth()
    screen_height = root.winfo_screenheight()
    root.destroy()

    initial_points = [
        {
            "position": (screen_width // 2, screen_height // 4),
            "start_time": None,
            "collected_gaze": [],
        },
        {
            "position": (screen_width // 4, 3 * screen_height // 4),
            "start_time": None,
            "collected_gaze": [],
        },
        {
            "position": (3 * screen_width // 4, 3 * screen_height // 4),
            "start_time": None,
            "collected_gaze": [],
        },
    ]

    points = initial_points.copy()

    proximity_threshold = screen_width / 5  # pixels
    dot_duration = 3  # seconds

    cv2.namedWindow("Fine Tuning", cv2.WND_PROP_FULLSCREEN)
    cv2.setWindowProperty("Fine Tuning", cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN)

    cap = cv2.VideoCapture(camera_index)

    gaze_positions = []

    while len(points) > 0:
        ret, frame = cap.read()
        if not ret:
            continue

        features, blink_detected = gaze_estimator.extract_features(frame)
        canvas = np.zeros((screen_height, screen_width, 3), dtype=np.uint8)

        for point in points:
            cv2.circle(canvas, point["position"], 20, (0, 255, 0), -1)

        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 1.5
        color = (255, 255, 255)
        thickness = 2
        text = "Look at the points until they disappear"
        text_size, _ = cv2.getTextSize(text, font, font_scale, thickness)
        text_x = (screen_width - text_size[0]) // 2
        text_y = screen_height - 50
        cv2.putText(canvas, text, (text_x, text_y), font, font_scale, color, thickness)

        if features is not None and not blink_detected:
            X = np.array([features])
            gaze_point = gaze_estimator.predict(X)[0]
            gaze_x, gaze_y = int(gaze_point[0]), int(gaze_point[1])

            cv2.circle(canvas, (gaze_x, gaze_y), 10, (255, 0, 0), -1)

            for point in points[:]:
                dx = gaze_x - point["position"][0]
                dy = gaze_y - point["position"][1]
                distance = np.sqrt(dx * dx + dy * dy)
                if distance <= proximity_threshold:
                    if point["start_time"] is None:
                        point["start_time"] = time.time()
                        point["collected_gaze"] = []
                    elapsed_time = time.time() - point["start_time"]
                    point["collected_gaze"].append([gaze_x, gaze_y])

                    shake_amplitude = int(5 + (elapsed_time / dot_duration) * 20)
                    shake_x = int(np.random.uniform(-shake_amplitude, shake_amplitude))
                    shake_y = int(np.random.uniform(-shake_amplitude, shake_amplitude))
                    shaken_position = (
                        int(point["position"][0] + shake_x),
                        int(point["position"][1] + shake_y),
                    )
                    cv2.circle(canvas, shaken_position, 20, (0, 255, 0), -1)

                    if elapsed_time >= dot_duration:
                        gaze_positions.extend(point["collected_gaze"])
                        points.remove(point)
                else:
                    point["start_time"] = None
                    point["collected_gaze"] = []
        else:
            for point in points:
                point["start_time"] = None
                point["collected_gaze"] = []

        cv2.imshow("Fine Tuning", canvas)
        if cv2.waitKey(1) == 27:
            cap.release()
            cv2.destroyWindow("Fine Tuning")
            return

    cap.release()
    cv2.destroyWindow("Fine Tuning")

    gaze_positions = np.array(gaze_positions)
    if gaze_positions.shape[0] < 2:
        return

    gaze_variance = np.var(gaze_positions, axis=0)
    gaze_variance[gaze_variance == 0] = 1e-4

    kalman.measurementNoiseCov = np.array(
        [[gaze_variance[0], 0], [0, gaze_variance[1]]], dtype=np.float32
    )


def main():
    camera_index = 0

    gaze_estimator = GazeEstimator()

    run_calibration(gaze_estimator, camera_index=camera_index)

    kalman = cv2.KalmanFilter(4, 2)
    kalman.measurementMatrix = np.array([[1, 0, 0, 0], [0, 1, 0, 0]], np.float32)
    kalman.transitionMatrix = np.array(
        [[1, 0, 1, 0], [0, 1, 0, 1], [0, 0, 1, 0], [0, 0, 0, 1]], np.float32
    )
    kalman.processNoiseCov = np.eye(4, dtype=np.float32) * 1
    kalman.measurementNoiseCov = np.eye(2, dtype=np.float32) * 1
    kalman.statePre = np.zeros((4, 1), np.float32)
    kalman.statePost = np.zeros((4, 1), np.float32)

    fine_tune_kalman_filter(gaze_estimator, kalman, camera_index=camera_index)

    root = tk.Tk()
    screen_width = root.winfo_screenwidth()
    screen_height = root.winfo_screenheight()
    root.destroy()

    cam_width, cam_height = 480, 360

    cv2.namedWindow("Gaze Estimation", cv2.WND_PROP_FULLSCREEN)
    cv2.setWindowProperty(
        "Gaze Estimation", cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN
    )

    cap = cv2.VideoCapture(camera_index)
    prev_time = time.time()

    while True:
        ret, frame = cap.read()
        if not ret:
            continue

        features, blink_detected = gaze_estimator.extract_features(frame)
        if features is not None and not blink_detected:
            X = np.array([features])
            gaze_point = gaze_estimator.predict(X)[0]
            x, y = int(gaze_point[0]), int(gaze_point[1])

            prediction = kalman.predict()
            x_pred, y_pred = int(prediction[0]), int(prediction[1])

            measurement = np.array([[np.float32(x)], [np.float32(y)]])
            if np.count_nonzero(kalman.statePre) == 0:
                kalman.statePre[:2] = measurement
                kalman.statePost[:2] = measurement
            kalman.correct(measurement)
        else:
            x_pred, y_pred = None, None
            blink_detected = True

        small_frame = cv2.resize(frame, (cam_width, cam_height))

        canvas = np.zeros((screen_height, screen_width, 3), dtype=np.uint8)

        canvas[:cam_height, :cam_width] = small_frame

        if x_pred is not None and y_pred is not None:
            cv2.circle(canvas, (x_pred, y_pred), 20, (0, 0, 255), -1)

        current_time = time.time()
        fps = 1 / (current_time - prev_time)
        prev_time = current_time

        cv2.putText(
            canvas,
            f"FPS: {int(fps)}",
            (50, 50),
            cv2.FONT_HERSHEY_SIMPLEX,
            1,
            (255, 255, 255),
            2,
        )

        blink_text = "Blinking" if blink_detected else "Not Blinking"
        cv2.putText(
            canvas,
            blink_text,
            (50, 100),
            cv2.FONT_HERSHEY_SIMPLEX,
            1,
            (0, 255, 0) if not blink_detected else (0, 0, 255),
            2,
        )

        cv2.imshow("Gaze Estimation", canvas)
        if cv2.waitKey(1) == 27:
            break

    cap.release()
    cv2.destroyAllWindows()


if __name__ == "__main__":
    main()