# Image_To_Speech_Transcript.py — from the omwanere/Image-to-Speech-Transcript repository (GitHub, main branch).
import os
import re
import subprocess
import time

import cv2
import pytesseract
import RPi.GPIO as GPIO
from PIL import Image

# Pin number of the push button, interpreted in BCM numbering (see setmode below).
BUTTON_PIN = 17
GPIO.setmode(GPIO.BCM)
# Input with the internal pull-up enabled; the main loop treats a GPIO.LOW
# reading on this pin as "button pressed".
GPIO.setup(BUTTON_PIN, GPIO.IN, pull_up_down=GPIO.PUD_UP)

# True while the live preview is running; updated by the main loop on each press.
preview_mode = False
# cv2.VideoCapture handle for the stream; assigned by start_preview().
cam = None

# Address of the HTTP video stream that start_preview() opens.
ip = "192.168.213.20"
port = "4747"
stream_url = f"http://{ip}:{port}/video"

def start_preview():
    """Open the video stream at ``stream_url`` and store the handle in ``cam``.

    Any capture still held in ``cam`` from an earlier session is released
    first so it is not leaked when the global is rebound.

    Returns:
        bool: True if the stream opened and the preview can start, False if
        the stream could not be accessed (``cam`` is then reset to None).
    """
    global cam
    if cam is not None:
        # A previous capture may still be open (e.g. after a failed grab).
        cam.release()
        cam = None

    capture = cv2.VideoCapture(stream_url)
    if not capture.isOpened():
        print("Error: Could not access mobile stream.")
        # Free the failed handle instead of leaving it dangling in ``cam``.
        capture.release()
        return False

    cam = capture
    print("Preview started. Press button again to capture.")
    return True


def capture_image():
    """Grab one frame from the open stream, save it as a JPEG, and end the preview.

    The stream is released and all OpenCV windows are closed whether or not
    the grab succeeds, so a failed capture does not leave the camera open or
    the preview window on screen.

    Returns:
        str | None: Path of the saved image ("captured_image.jpg"), or None
        if no stream is open, no frame could be read, or the file could not
        be written.
    """
    global cam
    if cam is None:
        print("Error: No active camera stream.")
        return None

    img_path = "captured_image.jpg"
    try:
        ret, frame = cam.read()
        if not ret:
            print("Failed to grab frame.")
            return None
        # cv2.imwrite reports failure through its boolean return value.
        if not cv2.imwrite(img_path, frame):
            print(f"Failed to save image as {img_path}")
            return None
        print(f"Image captured and saved as {img_path}")
        return img_path
    finally:
        # Always tear the preview down, on success and on every failure path.
        cam.release()
        cam = None
        cv2.destroyAllWindows()


def image_to_text(image_path):
    """Run OCR on an image file and return the cleaned-up text.

    The image is upscaled 2x, converted to grayscale, smoothed with a
    bilateral filter and binarised with an adaptive Gaussian threshold before
    being passed to Tesseract (English, ``--oem 3 --psm 6``). The thresholded
    image is also written to ``preprocessed_image.jpg``.

    Args:
        image_path: Path of the image file to read.

    Returns:
        str | None: The recognised text reduced to letters, digits, spaces
        and ``.,?!`` with surrounding whitespace stripped (possibly empty),
        or None if the image could not be read or any step raised.
    """
    try:
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread returns None rather than raising for an unreadable file.
            print("Error in OCR:", f"could not read image '{image_path}'")
            return None

        img = cv2.resize(img, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # NOTE(review): with a 1x1 kernel the dilate/erode pair looks like a
        # no-op; kept so the pipeline is unchanged — confirm before removing.
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
        gray = cv2.dilate(gray, kernel, iterations=1)
        gray = cv2.erode(gray, kernel, iterations=1)
        filtered = cv2.bilateralFilter(gray, 11, 17, 17)
        thresh = cv2.adaptiveThreshold(
            filtered, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, 11, 2
        )

        preprocessed_path = "preprocessed_image.jpg"
        cv2.imwrite(preprocessed_path, thresh)

        config = "--oem 3 --psm 6 -l eng"
        text = pytesseract.image_to_string(thresh, config=config)

        # Turn newlines/tabs into single spaces first; otherwise the filter
        # below deletes them and glues words from adjacent lines together.
        text = re.sub(r'\s+', ' ', text)
        cleaned_text = re.sub(r'[^A-Za-z0-9 .,?!]', '', text)
        print("\nExtracted & Cleaned Text:\n", cleaned_text)
        return cleaned_text.strip()

    except Exception as e:
        # Best-effort: report the failure and let the caller handle None.
        print("Error in OCR:", str(e))
        return None


def text_to_speech(text, speed=180, voice="en+f3"):
    """Speak *text* aloud by invoking the ``espeak`` command-line program.

    Args:
        text: The text to speak. Nothing is spoken when it is empty or None.
        speed: Value passed to espeak's ``-s`` option.
        voice: Value passed to espeak's ``-v`` option.

    Returns:
        None. Failures (espeak missing, non-zero exit status, bad arguments)
        are reported on stdout instead of being raised.
    """
    if not text:
        print("No text to speak.")
        return

    try:
        print("\nSpeaking...")
        # check=True turns a non-zero espeak exit into an exception so the
        # failure is reported below instead of being silently ignored.
        subprocess.run(
            ["espeak", "-s", str(speed), "-v", voice, text],
            check=True,
        )
    except Exception as e:
        # Best-effort: speech failure must not crash the button loop.
        print("TTS Error:", str(e))


if __name__ == "__main__":
    # Button-driven loop: the first press starts the live preview, the next
    # press captures a frame, runs OCR on it and speaks the result.
    print("Waiting for button presses...")

    try:
        while True:
            if GPIO.input(BUTTON_PIN) == GPIO.LOW:
                # Short pause to debounce the press before acting on it.
                time.sleep(0.3)
                if not preview_mode:
                    preview_mode = start_preview()
                else:
                    print("Capturing image and processing...")
                    img_path = capture_image()
                    preview_mode = False
                    if img_path and os.path.exists(img_path):
                        text = image_to_text(img_path)
                        if text:
                            text_to_speech(text, speed=150)
                        else:
                            print("No text detected.")
                    else:
                        print("Failed to capture image.")

                # Wait for the button to be released so one long press is
                # not handled as several presses.
                while GPIO.input(BUTTON_PIN) == GPIO.LOW:
                    time.sleep(0.1)

            if preview_mode and cam:
                ret, frame = cam.read()
                if ret:
                    cv2.imshow("Live Preview", frame)
                    # Mask to the low byte so the ESC check also works where
                    # waitKey returns extra modifier bits.
                    if cv2.waitKey(1) & 0xFF == 27:  # ESC to exit
                        break
                else:
                    # No frame available: back off instead of spinning the CPU.
                    time.sleep(0.1)
            else:
                time.sleep(0.1)

    except KeyboardInterrupt:
        print("\nExiting...")
    finally:
        # Release the camera, close preview windows and reset the GPIO pins.
        if cam:
            cam.release()
        cv2.destroyAllWindows()
        GPIO.cleanup()