社区首页 >专栏 >检测物体+估计距离和方向 | 附源码

检测物体+估计距离和方向 | 附源码

小白学视觉

发布于 2025-03-27 07:52:35

10900

代码可运行

文章被收录于专栏：深度学习和计算机视觉深度学习和计算机视觉

运行总次数：0

代码可运行

在这篇文章中，演示了如何使用计算机视觉创建一个应用程序，用于从语音命令中检测物体，估算物体的大致距离，并利用位置信息改善盲人的生活。这个项目的主要目标是处理实时数据，类似于Meta和Envision等可穿戴技术，以增强用户的生活和改善他们的日常体验。

本教程涵盖的步骤如下：

导入库并定义类参数。
语音识别和处理函数定义。
从返回的参数中检测物体，找到其位置，计算平均距离，发送通知。

导入库并定义类参数

导入 "speech_recognition" 库以从麦克风捕获音频并将语音转换为文本
导入 cv2 库以捕获网络摄像头的视频并对其应用各种操作
导入 Numpy 用于数学运算
导入 Ultralytics 库以使用预训练的 YOLOv8 模型
导入 pyttsx3 以进行文本到语音转换
导入 math 库用于三角计算和数学运算

import speech_recognition as sr
import cv2
import numpy as np
from ultralytics import YOLO
import pyttsx3
import math


class_names = ["person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat",
              "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
              "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
              "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
              "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
              "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli",
              "carrot", "hot dog", "pizza", "donut", "cake", "chair", "sofa", "pottedplant", "bed",
              "diningtable", "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard", "telephone",
              "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
              "teddy bear", "hair drier", "toothbrush"]

object_dimensions = {                         
    "bird" : "0.10",
    "cat" : "0.45",
    "backpack" : "0.55",
    "umbrella" : "0.50",
    "bottle" : "0.20",
    "wine glass" : "0.25",
    "cup" : "0.15",
    "fork" : "0.15",
    "knife" : "0.25",
    "spoon" : "0.15",
    "banana" : "0.20",
    "apple" : "0.07",
    "sandwich" : "0.20",
    "orange" : "0.08",
    "chair" : "0.50",
    "laptop" : "0.40",
    "mouse" : "0.10",
    "remote" : "0.20",
    "keyboard" : "0.30",
    "phone" : "0.15",
    "book" : "0.18",
    "toothbrush" : "0.16"
}

我将我的 YOLOv8 模型训练的 COCO 数据集中的类别存储在 'class_names' 变量中，它们的平均维度存储在 'object_dimensions' 变量中。考虑到这个应用将用于家庭环境，我选择了特定的物体。如果您想使用自己的数据集，您需要执行自定义的物体检测并相应修改这些变量。

语音识别和处理函数定义

为了创建一个通用函数，假设物体位于句子的末尾，从短语中捕获搜索的物体（如“我的书在哪里？”，“找书！”，“书。”），我定义了一个名为 'get_last_word' 的函数。这个函数将返回句子中的最后一个词，也就是物体。

def get_last_word(sentence):
    words = sentence.split()
    return words[-1]

定义了一个名为 'voice_command' 的函数，以语音命令返回要搜索的物体以及这个物体的平均实际尺寸。

def voice_command():
    recognizer = sr.Recognizer()

    with sr.Microphone() as source:
        print("Waiting for voice command...")
        recognizer.adjust_for_ambient_noise(source)
        audio = recognizer.listen(source)

    target_object = ""  
    real_width = 0.15  

    try:
        command = recognizer.recognize_google(audio, language="en-US")
        print("Recognized command:", command)
        last_word = get_last_word(command.lower())  
        if last_word:
            print("Last word:", last_word)

        target_object = last_word.lower()
        
        if target_object in object_dimensions:
            real_width = float(object_dimensions[target_object])
            print(real_width)
        else:
            print(f"No length information found for {target_object}, using the default value of 0.15.")
    except sr.UnknownValueError:
        print("Voice cannot be understood.")
    except sr.RequestError as e:
        print("Voice recognition error; {0}".format(e))

    return target_object, real_width

创建了一个名为 'voice_notification' 的函数，用于用语音提醒用户。

def voice_notification(obj_name, direction, distance):
    engine = pyttsx3.init()
    text = "{} is at {}. It is {:.2f} meters away.".format(obj_name, direction, distance)
    engine.say(text)
    engine.runAndWait()

加载了 YOLOv8 模型，可以从 Ultralytics 网站下载和使用：https://docs.ultralytics.com/models/yolov8/#overview。

计算从语音命令接收的物体到摄像头的距离，并用语音通知最终用户物体在时钟上的位置方向。

def main():
    # Load the YOLO model
    model = YOLO("yolov8n.pt")
    
    # Get video frame dimensions for calculating 
    cap = cv2.VideoCapture(0)
    frame_width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)  
    frame_height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)  
    center_x = int(frame_width // 2)
    center_y = int(frame_height // 2)
    radius = min(center_x, center_y) - 30  # Radius of the circle where clock hands are drawn
    
    #The target object the user wants to search for via voice command and its real-world average size
    target_object, real_width = voice_command()

    while True:
        success, img = cap.read()
        
        # Predict objects using the YOLO model
        results = model.predict(img, stream=True)
        
        # Draw clock
        for i in range(1, 13):
            angle = math.radians(360 / 12 * i - 90)
            x = int(center_x + radius * math.cos(angle))
            y = int(center_y + radius * math.sin(angle))

            if i % 3 == 0:
                thickness = 3
                length = 20
            else:
                thickness = 1
                length = 10

            font = cv2.FONT_HERSHEY_SIMPLEX
            cv2.putText(img, str(i), (x - 10, y + 10), font, 0.5, (0, 255, 0), thickness)
        
        # detect and process objects recognized by model
        for r in results:
            boxes = r.boxes

            for box in boxes:
                x1, y1, x2, y2 = box.xyxy[0]
                x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)  

                cls = int(box.cls)

                if class_names[cls].lower() == target_object:
                    camera_width = x2 - x1
                    distance = (real_width * frame_width) / camera_width
                    #voice_notification(target_object)

                    obj_center_x = (x1 + x2) // 2
                    obj_center_y = (y1 + y2) // 2

                    camera_middle_x = frame_width // 2
                    camera_middle_y = frame_height // 2

                    vector_x = obj_center_x - camera_middle_x
                    vector_y = obj_center_y - camera_middle_y

                    angle_deg = math.degrees(math.atan2(vector_y, vector_x))
                    #direction = ''
                    if angle_deg < 0:
                        angle_deg += 360

                    if 0 <= angle_deg < 30:
                        direction = "3 o'clock"
                    elif 30 <= angle_deg < 60:
                        direction = "4 o'clock"
                    elif 60 <= angle_deg < 90:
                        direction = "5 o'clock"
                    elif 90 <= angle_deg < 120:
                        direction = "6 o'clock"
                    elif 120 <= angle_deg < 150:
                        direction = "7 o'clock"
                    elif 150 <= angle_deg < 180:
                        direction = "8 o'clock"
                    elif 180 <= angle_deg < 210:
                        direction = "9 o'clock"
                    elif 210 <= angle_deg < 240:
                        direction = "10 o'clock"
                    elif 240 <= angle_deg < 270:
                        direction = "11 o'clock"
                    elif 270 <= angle_deg < 300:
                        direction = "12 o'clock"
                    elif 300 <= angle_deg < 330:
                        direction = "1 o'clock"
                    elif 330 <= angle_deg < 360:
                        direction = "2 o'clock"
                    else:
                        direction = "Unknown Clock Position"

                    cv2.putText(img, direction, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.putText(img, "Distance: {:.2f} meters".format(distance), (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
                    cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 255), 3)
                    
                    if boxes is not None:
                        
                        voice_notification(target_object, direction, distance)

        
        cv2.imshow("Webcam", img)

        k = cv2.waitKey(1)
        if k == ord("q"):
            break

    cap.release()
    cv2.destroyAllWindows()

if __name__ == "__main__":
    main()

结论和建议

希望这篇文章演示了计算机视觉如何增强人类视力并使我们的生活更轻松，希望这个项目可以带给大家对新事物探索的更多启发。你也可以使用自己的数据集，进一步开发和定制这个项目以满足您的具体需求，并添加适合您目标的功能。例如，您可以使用光学字符识别（OCR）技术将任何文本转换为文本到语音，这可以非常有用。这可以帮助某人在商店中找到产品或听书，还有许多其他应用。

本文参与腾讯云自媒体同步曝光计划，分享自微信公众号。

原始发表：2025-03-26，如有侵权请联系 cloudcommunity@tencent.com 删除

变量