[AI Talent Creation Camp Phase II] PaddlePaddle + OpenVINO Implementation

PaddlePaddle + OpenVINO for human-computer interaction

Reprinted from AI Studio
Project link https://aistudio.baidu.com/aistudio/projectdetail/3525813

Project display

Project description

  This project is based on HandPose_x. It uses PaddlePaddle and OpenVINO together to cover the one-stop workflow from training to rapid deployment. Where the project needs to be extended or developed further, you are welcome to join in and contribute.
   The project detects and tracks hand key points to work out the hand's relative coordinates in the real world, crops out the corresponding region of the image, and passes that region to the computer for analysis, realizing human-computer interaction. The functions implemented so far include image classification, text extraction from a selected region, and drawing.

Training

   From the project requirements, the basic modules we need are: ① hand object detection and ② hand key point detection; the extended functions additionally need ③ classification and ④ OCR.

Object detection

Dataset

   The dataset is a combination of TV-Hand and COCO-Hand (the COCO-Hand-Big part). The official download page for the TV-Hand and COCO-Hand datasets is: website. Because the labels are in YOLO (txt) format, they need to be converted to VOC (XML) format.
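   For reference, each line of a YOLO txt label stores the class label followed by the normalized box center x, center y, width and height, while VOC needs absolute xmin/ymin/xmax/ymax pixel coordinates. A minimal sketch of the conversion for a single box (the numbers are made-up example values):

# Hypothetical example: convert one YOLO-format box to VOC pixel coordinates.
# A YOLO label line looks like "<label> cx cy bw bh", all values normalized to [0, 1].
img_w, img_h = 640, 480                  # image size in pixels (example values)
cx, cy, bw, bh = 0.50, 0.40, 0.20, 0.30  # normalized center and size (example values)

xmin = int((cx - bw / 2) * img_w)        # 256
ymin = int((cy - bh / 2) * img_h)        # 120
xmax = int((cx + bw / 2) * img_w)        # 384
ymax = int((cy + bh / 2) * img_h)        # 264
print(xmin, ymin, xmax, ymax)

   The conversion script below applies exactly this formula to every label file.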

!mkdir -p train/label
!mv datasets_TVCOCO_hand_train/anno/images train/
from xml.dom.minidom import Document
import os
import cv2
from tqdm import tqdm

def writeXml(tmp, imgname, w, h, objbud):
    doc = Document()
    # owner
    annotation = doc.createElement('annotation')
    doc.appendChild(annotation)
    # owner
    folder = doc.createElement('folder')
    annotation.appendChild(folder)
    folder_txt = doc.createTextNode("train")
    folder.appendChild(folder_txt)

    filename = doc.createElement('filename')
    annotation.appendChild(filename)
    filename_txt = doc.createTextNode(imgname)
    filename.appendChild(filename_txt)
    # ones#
    source = doc.createElement('source')
    annotation.appendChild(source)

    database = doc.createElement('database')
    source.appendChild(database)
    database_txt = doc.createTextNode("Unknown")
    database.appendChild(database_txt)

    # onee#
    # twos#
    size = doc.createElement('size')
    annotation.appendChild(size)

    width = doc.createElement('width')
    size.appendChild(width)
    width_txt = doc.createTextNode(str(w))
    width.appendChild(width_txt)

    height = doc.createElement('height')
    size.appendChild(height)
    height_txt = doc.createTextNode(str(h))
    height.appendChild(height_txt)

    depth = doc.createElement('depth')
    size.appendChild(depth)
    depth_txt = doc.createTextNode("3")
    depth.appendChild(depth_txt)
    # twoe#
    segmented = doc.createElement('segmented')
    annotation.appendChild(segmented)
    segmented_txt = doc.createTextNode("0")
    segmented.appendChild(segmented_txt)

    for i in range(0, int(len(objbud) / 5)):
        # threes#
        object_new = doc.createElement("object")
        annotation.appendChild(object_new)

        name = doc.createElement('name')
        object_new.appendChild(name)
        name_txt = doc.createTextNode(objbud[i * 5])
        name.appendChild(name_txt)

        pose = doc.createElement('pose')
        object_new.appendChild(pose)
        pose_txt = doc.createTextNode("Unspecified")
        pose.appendChild(pose_txt)

        truncated = doc.createElement('truncated')
        object_new.appendChild(truncated)
        truncated_txt = doc.createTextNode("0")
        truncated.appendChild(truncated_txt)

        difficult = doc.createElement('difficult')
        object_new.appendChild(difficult)
        difficult_txt = doc.createTextNode("0")
        difficult.appendChild(difficult_txt)
        # threes-1#
        bndbox = doc.createElement('bndbox')
        object_new.appendChild(bndbox)

        xmin = doc.createElement('xmin')
        bndbox.appendChild(xmin)
        xmin_txt = doc.createTextNode(str(objbud[i * 5 + 1]))
        xmin.appendChild(xmin_txt)

        ymin = doc.createElement('ymin')
        bndbox.appendChild(ymin)
        ymin_txt = doc.createTextNode(str(objbud[i * 5 + 2]))
        ymin.appendChild(ymin_txt)

        xmax = doc.createElement('xmax')
        bndbox.appendChild(xmax)
        xmax_txt = doc.createTextNode(str(objbud[i * 5 + 3]))
        xmax.appendChild(xmax_txt)

        ymax = doc.createElement('ymax')
        bndbox.appendChild(ymax)
        ymax_txt = doc.createTextNode(str(objbud[i * 5 + 4]))
        ymax.appendChild(ymax_txt)
        # threee-1#
        # threee#

    tempfile = tmp + imgname.split(".")[0] + ".xml"
    with open(tempfile, "w") as f:
        doc.writexml(f, indent='', addindent='\t', newl='\n', encoding='utf-8')

    return

image_path = "./train/images/"
txt_label_path = "./datasets_TVCOCO_hand_train/anno/labels/"
xml_label_path = "./train/label/"
image_name = os.listdir(image_path)

for name in tqdm(image_name):
    if ".jpg" in name:
        image = cv2.imread(os.path.join(image_path, name))
        height, width, _ = image.shape
        txt_path = os.path.join(txt_label_path, name.split(".")[0]+".txt")
        obj1 = []
        obj2 = []
        with open(txt_path, "r") as f:
            data = f.readlines()
            for line in data:
                for line_data in line.split("\n")[0].split(" "):
                    obj1.append(line_data)
        for i in range(int(len(obj1)/5)):
            # Skip boxes whose converted corners would fall outside the image (negative coordinates).
            if (float(obj1[i * 5 + 1]) - 0.5*float(obj1[i * 5 + 3]) < 0 or
                float(obj1[i * 5 + 2]) - 0.5*float(obj1[i * 5 + 4]) < 0 or
                float(obj1[i * 5 + 1]) + 0.5*float(obj1[i * 5 + 3]) < 0 or
                float(obj1[i * 5 + 2]) + 0.5*float(obj1[i * 5 + 4]) < 0):
                pass
            else:
                obj2.append(obj1[i * 5])
                obj2.append(int((float(obj1[i * 5 + 1]) - 0.5*float(obj1[i * 5 + 3]))*width))
                obj2.append(int((float(obj1[i * 5 + 2]) - 0.5*float(obj1[i * 5 + 4]))*height))
                obj2.append(int((float(obj1[i * 5 + 1]) + 0.5*float(obj1[i * 5 + 3]))*width))
                obj2.append(int((float(obj1[i * 5 + 2]) + 0.5*float(obj1[i * 5 + 4]))*height))
        if len(obj2) != 0:
            writeXml(xml_label_path, name, width, height, obj2)
    else:
        continue

  Making the dataset file lists (train/test split).

import os

image_dir = "train/images/"
xml_dir = "train/label/"
xml_path = os.listdir(xml_dir)
f_train = open("train/train.txt", "w")
f_test = open("train/test.txt", "w")

for i in range(len(xml_path)):
    if (i % 100) != 0:
        f_train.write(image_dir + xml_path[i].split(".")[0] + ".jpg " + xml_dir + xml_path[i] + "\n")
    else:
        f_test.write(image_dir + xml_path[i].split(".")[0] + ".jpg " + xml_dir + xml_path[i] + "\n")

f_train.close()
f_test.close()
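   Note that the PaddleX dataset loader used below also expects a train/labels.txt file listing the class names, which the split above does not create. A minimal sketch that writes it for this single-class task (assuming the only class is named "hand", matching the names written into the XML files):

# Write the label list expected by pdx.datasets.VOCDetection.
# Assumption: the dataset contains a single class named "hand".
with open("train/labels.txt", "w") as f:
    f.write("hand\n")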

Training the object detection model with PaddleDetection

PaddleDetection provides a variety of mainstream object detection, instance segmentation, tracking and key point detection algorithms, along with configurable network modules, data augmentation strategies, loss functions and so on. It includes many server-side and mobile industry-grade SOTA models, and integrates PaddlePaddle's model compression and cross-platform high-performance deployment capabilities, helping developers finish the whole end-to-end development process faster and better.
At first I trained PicoDet in PaddleDetection (the PaddleDetection documentation does state that PicoDet supports OpenVINO). When it came to the actual OpenVINO deployment, the conversion failed because an operator was missing, so that path had to stop there. Searching the official OpenVINO resources, I found sample code that supports PaddlePaddle YOLOv3/PP-YOLO models, so YOLOv3 was chosen for this first version and was successfully deployed through OpenVINO. If you want to try PicoDet training and deployment, you can fall back to Paddle's native inference (the Paddle Inference code is given later). Still, having OpenVINO deployment supported as soon as possible would be very welcome: the acceleration from plugging in an Intel inference stick would let a lightweight model like PicoDet show its advantages on edge devices.

!git clone https://gitee.com/paddlepaddle/PaddleDetection.git
!pip install -r PaddleDetection/requirements.txt
!export CUDA_VISIBLE_DEVICES=0
!python PaddleDetection/tools/train.py -c config/yolo.yml --eval
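   After training, the model can be evaluated against the validation split with the same config file (a sketch; the weights path assumes PaddleDetection's default save directory for this config):

!python PaddleDetection/tools/eval.py -c config/yolo.yml -o weights=output/yolo/model_final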

Training the object detection model with PaddleX

   Similarly, training can also be done through PaddleX. PaddleX wraps some of PaddleDetection's detection models and makes model training simpler and more convenient; for a beginner it is a good choice.

!pip install paddlex==2.1.0
import paddlex as pdx
from paddlex import transforms as T
train_transforms = T.Compose([
    T.MixupImage(mixup_epoch=250), T.RandomDistort(),
    T.RandomExpand(im_padding_value=[123.675, 116.28, 103.53]), T.RandomCrop(),
    T.RandomHorizontalFlip(), T.BatchRandomResize(
        target_sizes=[320, 352, 384, 416, 448, 480, 512, 544, 576, 608],
        interp='RANDOM'), T.Normalize(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

eval_transforms = T.Compose([
    T.Resize(
        608, interp='CUBIC'), T.Normalize(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
train_dataset = pdx.datasets.VOCDetection(
    data_dir='./',
    file_list='train/train.txt',
    label_list='train/labels.txt',
    transforms=train_transforms,
    shuffle=True)

eval_dataset = pdx.datasets.VOCDetection(
    data_dir='./',
    file_list='train/test.txt',
    label_list='train/labels.txt',
    transforms=eval_transforms,
    shuffle=False)
num_classes = len(train_dataset.labels)
model = pdx.det.YOLOv3(num_classes=num_classes, backbone='MobileNetV3', nms_topk=500, nms_keep_topk=50)
model.train(
    num_epochs=270,
    train_dataset=train_dataset,
    train_batch_size=32,
    eval_dataset=eval_dataset,
    pretrain_weights='COCO',
    learning_rate=.001,
    warmup_steps=1000,
    warmup_start_lr=0.0,
    save_interval_epochs=5,
    lr_decay_epochs=[70, 140, 210],
    use_ema=True,
    save_dir='output/yolo')
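   Once training finishes, the best checkpoint can be reloaded and sanity-checked on a single image before export (a sketch using the PaddleX 2.x API; the test image path is hypothetical):

import paddlex as pdx

# Load the best checkpoint saved under save_dir during training.
model = pdx.load_model('output/yolo/best_model')
# Predict one image (hypothetical path) and keep detections above a 0.5 score.
result = model.predict('test/hand.jpg')
print([r for r in result if r['score'] > 0.5])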

Hand key points

HandPose_x

This can be implemented by referring to the PaddlePaddle HandPose_x project. After downloading the model from that project, it is exported as an inference model in the export step below.


PaddleHub

PaddleHub offers a large number of pre-trained models from the PaddlePaddle ecosystem, handling model management and one-click prediction. With the Fine-tune API, transfer learning can be completed quickly on top of large-scale pre-trained models, so that they better serve users' specific scenarios. There is a ready-made hand key point detection model, hand_pose_localization; you can download the model directly (it can be used for deployment as-is).

  You can download the model through hub install. After the download, the install log prints where the files were placed; go there to take out the model.

   Of course, the model was provided by Xiao Lao; you can also go to his project and take the model from there. Project address: project

!hub install hand_pose_localization==1.0.1
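   After hub install finishes, the module files live under the PaddleHub home directory (by default ~/.paddlehub/modules; the install log prints the actual location). A small sketch to list the downloaded files (the default path is an assumption):

import os

# Default PaddleHub module directory; adjust if your hub home is configured differently.
module_dir = os.path.expanduser("~/.paddlehub/modules/hand_pose_localization")
for root, _, files in os.walk(module_dir):
    for name in files:
        print(os.path.join(root, name))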

OpenVINO and Paddle Inference model export

Object detection

PaddleDetection

  For the export steps, refer to the PaddleDetection deployment model export tutorial.

   Here we export both the inference model and the ONNX model (for deployment with OpenVINO).

!pip install paddle2onnx
!pip install onnx
!python PaddleDetection/tools/export_model.py -c config/yolo.yml \
                                               -o TestReader.inputs_def.image_shape=[1,3,608,608] \
                                               --output_dir inference_model
!paddle2onnx \
    --model_dir inference_model/inference_model \
    --model_filename model.pdmodel \
    --params_filename model.pdiparams \
    --save_file yolo.onnx \
    --opset_version 11 \
    --enable_onnx_checker True
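   The exported ONNX file can be sanity-checked before handing it to OpenVINO (a minimal sketch):

import onnx

# Load and structurally validate the exported detection model.
onnx_model = onnx.load("yolo.onnx")
onnx.checker.check_model(onnx_model)
# Print the graph inputs, e.g. image, im_shape and scale_factor for this detector.
print([i.name for i in onnx_model.graph.input])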

PaddleX

  For the export steps, refer to the PaddleX deployment model export tutorial.

!paddlex --export_inference --model_dir=./model/ --save_dir=./inference_model --fixed_input_shape=[1,3,608,608]
!paddle2onnx \
    --model_dir inference_model/inference_model \
    --model_filename model.pdmodel \
    --params_filename model.pdiparams \
    --save_file yolo.onnx \
    --opset_version 11 \
    --enable_onnx_checker True

Hand key point model

HandPose_x

  You can use the paddle.onnx.export API to export the ONNX model.

import paddle
from resnet50 import resnet50

model_ = resnet50(num_classes=42, img_size=256)
model_.eval()  # Set to forward inference mode
model_path = '848resnet_50-model_epoch-9.pdparams'  # Trained model weights
ckpd = paddle.load(model_path)
model_.set_state_dict(ckpd)
x_spec = paddle.static.InputSpec(shape=[1, 3, 256, 256], dtype='float32')
paddle.onnx.export(model_, 'posehand', input_spec=[x_spec])
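   To confirm that the exported posehand.onnx can be consumed by OpenVINO, it can be read directly with the Inference Engine (a sketch; reading ONNX without a prior Model Optimizer conversion requires a reasonably recent OpenVINO release):

from openvino.inference_engine import IECore

# Read the exported ONNX keypoint model and inspect its inputs and outputs.
ie = IECore()
net = ie.read_network("posehand.onnx")
print({name: info.input_data.shape for name, info in net.input_info.items()})
print(list(net.outputs.keys()))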

Inference deployment

Object detection (OpenVINO)

from openvino.inference_engine import IENetwork, IECore
import cv2
import numpy as np

class OpenvinoHandDetectModel(object):
    def __init__(self, crop_size=[608, 608], k_top=2):
        self.model = OpenvinoHandDetectInference(crop_size=crop_size, k_top=k_top)
        self.crop_size = crop_size
    def predict(self, img_cv2, threshold):
        h, w, _ = img_cv2.shape
        output = self.model.forward(img_cv2)
        hands_list = []
        if len(output) > 0:
            if output[0][1] > threshold:
                for i in range(len(output)):
                    if output[i][1] > threshold:
                        x1 = int(output[i][2] / self.crop_size[0] * w)
                        y1 = int(output[i][3] / self.crop_size[0] * h)
                        x2 = int(output[i][4] / self.crop_size[0] * w)
                        y2 = int(output[i][5] / self.crop_size[0] * h)
                        hands_list.append([img_cv2[y1:y2, x1:x2], x1, y1, x2, y2])
                        cv2.rectangle(img_cv2, (x1, y1), (x2, y2), [0, 0, 255], thickness=2)
        return hands_list

class OpenvinoHandDetectInference(object):
    def __init__(self, model_path="./onnx/model.onnx", crop_size=[608, 608], k_top=2, device="CPU"):
        ie = IECore()
        net = ie.read_network(model_path)
        net.reshape({'image': [1, 3, crop_size[0], crop_size[1]], 'im_shape': [1, 2], 'scale_factor': [1, 2]})
        self.exec_net = ie.load_network(net, device)
        self.crop_size = crop_size
        self.k_top = k_top

    def forward(self, src_img):
        test_image = handle(src_img, self.crop_size)
        test_im_shape = np.array([[608, 608]]).astype('float32')
        test_scale_factor = np.array([[1, 1]]).astype('float32')
        inputs_dict = {'image': test_image, "im_shape": test_im_shape,
                       "scale_factor": test_scale_factor}
        output = self.exec_net.infer(inputs_dict)
        output_data = list(output.values())

        return output_data[:self.k_top]

def normalize(src_img, mean, std):
    src_img = src_img.astype(np.float32, copy=False)
    mean = np.array(mean)[np.newaxis, np.newaxis, :]
    std = np.array(std)[np.newaxis, np.newaxis, :]
    src_img = src_img / 255.0
    src_img -= mean
    src_img /= std

    return src_img

def handle(src_img, crop_size):
    src_img = cv2.resize(src_img, (crop_size[0], crop_size[1]))
    src_img = normalize(src_img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    src_img = src_img.transpose([2, 0, 1])
    tensor_img = src_img[None, :].astype("float32")

    return tensor_img
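   A minimal usage sketch for the OpenVINO detector defined above (the image path and threshold are hypothetical; the ONNX path defaults to ./onnx/model.onnx as in the constructor):

# Hypothetical single-image test of OpenvinoHandDetectModel.
detector = OpenvinoHandDetectModel(crop_size=[608, 608], k_top=2)
img = cv2.imread("test/hands.jpg")
hands = detector.predict(img, threshold=0.5)
print("detected hands:", len(hands))
cv2.imwrite("test/hands_out.jpg", img)  # boxes are drawn onto img in place by predict()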

Object detection (Paddle Inference)

import cv2
import numpy as np
import paddle.inference as inference

class HandDetectModel(object):
    def __init__(self, crop_size=[608, 608], k_top=2):
        self.model = HandDetectInference(crop_size=crop_size, k_top=k_top)
        self.crop_size = crop_size
    def predict(self, img_cv2, threshold):
        h, w, _ = img_cv2.shape
        output = self.model.forward(img_cv2)
        hands_list = []
        if len(output) > 0:
            if output[0][1] > threshold:
                for i in range(len(output)):
                    if output[i][1] > threshold:
                        x1 = int(output[i][2] / self.crop_size[0] * w)
                        y1 = int(output[i][3] / self.crop_size[0] * h)
                        x2 = int(output[i][4] / self.crop_size[0] * w)
                        y2 = int(output[i][5] / self.crop_size[0] * h)
                        hands_list.append([img_cv2[y1:y2, x1:x2], x1, y1, x2, y2])
                        cv2.rectangle(img_cv2, (x1, y1), (x2, y2), [0, 0, 255], thickness=2)
        return hands_list

class HandDetectInference(object):
    def __init__(self, model_path="./inference_model/model.pdmodel", param_path="./inference_model/model.pdiparams", crop_size=[512, 512], k_top=2):
        self.config = inference.Config(model_path, param_path)
        self.predictor = inference.create_predictor(self.config)
        self.crop_size = crop_size
        self.k_top = k_top

    def forward(self, src_img):
        input_names = self.predictor.get_input_names()
        input_handle = self.predictor.get_input_handle(input_names[0])
        input_handle.copy_from_cpu(np.array([self.crop_size, ]))
        input_handle = self.predictor.get_input_handle(input_names[1])
        input_handle.copy_from_cpu(handle(src_img, self.crop_size))
        input_handle = self.predictor.get_input_handle(input_names[2])
        input_handle.copy_from_cpu(np.array([[1, 1], ]))
        output_names = self.predictor.get_output_names()
        output_handle = self.predictor.get_output_handle(output_names[0])

        self.predictor.run()
        output_data = output_handle.copy_to_cpu()

        return output_data[:self.k_top]

def normalize(src_img, mean, std):
    src_img = src_img.astype(np.float32, copy=False)
    mean = np.array(mean)[np.newaxis, np.newaxis, :]
    std = np.array(std)[np.newaxis, np.newaxis, :]
    src_img = src_img / 255.0
    src_img -= mean
    src_img /= std

    return src_img

def handle(src_img, crop_size):
    src_img = cv2.resize(src_img, (crop_size[0], crop_size[1]))
    src_img = normalize(src_img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    src_img = src_img.transpose([2, 0, 1])
    tensor_img = src_img[None, :].astype("float32")

    return tensor_img
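   The forward pass above assumes the exported model returns its inputs in the order im_shape, image, scale_factor; it is worth printing the names once to confirm this for your own export (a small sketch):

# One-off check that the input tensor order matches the assumptions in forward().
infer = HandDetectInference(model_path="./inference_model/model.pdmodel",
                            param_path="./inference_model/model.pdiparams")
print(infer.predictor.get_input_names())   # e.g. ['im_shape', 'image', 'scale_factor']
print(infer.predictor.get_output_names())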

Hand key point model

import cv2
import numpy as np
from paddle.inference import Config
from paddle.inference import create_predictor

class KeypointInferenceModel(object):
    def __init__(self):
        self.config = Config("model/__model__", "model/__params__")
        self.predictor = create_predictor(self.config)

    def forward(self, inpBlob):
        input_names = self.predictor.get_input_names()
        input_handle = self.predictor.get_input_handle(input_names[0])
        output_names = self.predictor.get_output_names()
        output_handle = self.predictor.get_output_handle(output_names[0])
        input_handle.copy_from_cpu(inpBlob)
        self.predictor.run()
        output_data = output_handle.copy_to_cpu()

        return output_data

class KeypointModel(object):
    # initialization
    def __init__(self):
        self.num_points = 21
        self.inHeight = 368
        self.threshold = 0.1
        self.point_pairs = [[0, 1], [1, 2], [2, 3], [3, 4],
                            [0, 5], [5, 6], [6, 7], [7, 8],
                            [0, 9], [9, 10], [10, 11], [11, 12],
                            [0, 13], [13, 14], [14, 15], [15, 16],
                            [0, 17], [17, 18], [18, 19], [19, 20]]
        self.model = KeypointInferenceModel()

    # Model inference and prediction
    def predict(self, img_cv2):
        # Image preprocessing
        img_height, img_width, _ = img_cv2.shape
        aspect_ratio = img_width / img_height
        inWidth = int(((aspect_ratio * self.inHeight) * 8) // 8)
        inpBlob = cv2.dnn.blobFromImage(img_cv2, 1.0 / 255, (inWidth, self.inHeight), (0, 0, 0), swapRB=False,
                                        crop=False)

        # Model inference
        output = self.model.forward(inpBlob)

        # Key point calculation
        points = []
        for idx in range(self.num_points):
            # confidence map
            probMap = output[0, idx, :, :]
            probMap = cv2.resize(probMap, (img_width, img_height))

            # Find global maxima of the probMap.
            minVal, prob, minLoc, point = cv2.minMaxLoc(probMap)

            if prob > self.threshold:
                points.append((int(point[0]), int(point[1])))
            else:
                points.append(None)

        return points

    # Hand posture visualization function
    def vis_pose(self, img_cv2, points, clas_hand):
        img_cv2_copy = np.copy(img_cv2)
        for idx in range(len(points)):
            if points[idx]:
                cv2.circle(img_cv2_copy, points[idx], 3, (0, 255, 255), thickness=-1,
                           lineType=cv2.FILLED)
                cv2.putText(img_cv2_copy, "{}".format(idx), points[idx], cv2.FONT_HERSHEY_SIMPLEX,
                            1, (0, 0, 255), 2, lineType=cv2.LINE_AA)

        # Draw Skeleton
        for pair in self.point_pairs:
            partA = pair[0]
            partB = pair[1]

            if points[partA] and points[partB]:
                cv2.line(img_cv2, points[partA], points[partB], (0, 255, 255), 2)
                cv2.circle(img_cv2, points[partA], 3, (0, 0, 255), thickness=-1, lineType=cv2.FILLED)
                cv2.circle(img_cv2, points[partB], 3, (0, 0, 255), thickness=-1, lineType=cv2.FILLED)

            if clas_hand == "left" and points[20]:
                return points[20]
            elif clas_hand == "right" and points[8]:
                return points[8]
            else:
                return None

if __name__ == '__main__':
    pose_model = KeypointModel()
    frame = cv2.imread('test/left.jpg')
    res_points = pose_model.predict(frame)
    pose_model.vis_pose(frame, res_points, "left")
    cv2.imshow("video", frame)
    cv2.waitKey(0)

Interaction strategy

   ① Detect hands with the object detection model and crop out the region containing the hand;

   ② Feed the cropped image from ① into the key point detection model for inference;

   ③ Because we already have object detection, simple IoU tracking of the hands is possible; if a hand stays in roughly the same area long enough, it is treated as an intent to interact;

   ④ Set the interaction flag to True and use the key points to extract the region enclosed by the index fingers of both hands for classification or OCR (or start drawing at the index fingertip of one hand; when the fingertip moves too fast, drawing is considered finished).

  The extended functions mentioned in ④ can be added where clas_flag is handled (the place marked with comments). A sketch of the compute_iou helper and the config values imported by the scripts below is given after this paragraph.
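   compute_iou and the thresholds/state variables used in the scripts below come from util.py and config.py, which are not shown in this post. A minimal sketch of what they might contain (the function matches the import; the concrete values are placeholders to tune):

# util.py (sketch): IoU of two boxes given as [x1, y1, x2, y2].
def compute_iou(box_a, box_b):
    ix1, iy1 = max(box_a[0], box_b[0]), max(box_a[1], box_b[1])
    ix2, iy2 = min(box_a[2], box_b[2]), min(box_a[3], box_b[3])
    inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    return inter / (area_a + area_b - inter + 1e-6)

# config.py (sketch): placeholder values, adjust to your camera and frame rate.
crop_size = [608, 608]       # detector input size
detect_threshold = 0.5       # detection confidence threshold
iou_threshold = 0.5          # IoU above which a hand is considered "staying still"
iou_times_threshold = 60     # still-frame count that switches on iou_flag
clas_times_threshold = 30    # still-frame count related to clas_flag
iou_times = 0                # consecutive still frames seen so far
iou_flag = False             # True while the countdown animation is drawn
iou_flag_times = 0           # frames the countdown has been running
clas_flag = False            # True once classification/OCR/drawing should fire
old_box_l = [0, 0, 0, 0]     # previous left-hand box for IoU tracking
old_box_r = [0, 0, 0, 0]     # previous right-hand box for IoU tracking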

OpenVINO version

from keypoint import KeypointModel
from detect import OpenvinoHandDetectModel
from util import compute_iou
from config import *
import cv2

handdetectmodel = OpenvinoHandDetectModel(crop_size=crop_size)
keypointmodel = KeypointModel()

video = cv2.VideoCapture(0)
ret, frame = video.read()
h, w, _ = frame.shape
while ret:
    hands_list = handdetectmodel.predict(frame, detect_threshold)
    if len(hands_list) == 1:
        # The hand is in the right half of the frame if its box center x exceeds w/2, i.e. x1 + x2 > w.
        if (hands_list[0][1] + hands_list[0][3]) > w:
            clas_hand = "right"
        else:
            clas_hand = "left"
        single_hand = hands_list[0][0]
        res_points = keypointmodel.predict(single_hand)
        point = keypointmodel.vis_pose(single_hand, res_points, clas_hand)
        frame[hands_list[0][2]:hands_list[0][4], hands_list[0][1]:hands_list[0][3]] = single_hand
    elif len(hands_list) == 2:
        if (hands_list[0][1] + hands_list[0][3]) > (hands_list[1][1] + hands_list[1][3]):
            clas_hand_1 = "right"
            clas_hand_2 = "left"
            new_box_l = [hands_list[0][1], hands_list[0][2], hands_list[0][3], hands_list[0][4]]
            new_box_r = [hands_list[1][1], hands_list[1][2], hands_list[1][3], hands_list[1][4]]
        else:
            clas_hand_1 = "left"
            clas_hand_2 = "right"
            new_box_r = [hands_list[0][1], hands_list[0][2], hands_list[0][3], hands_list[0][4]]
            new_box_l = [hands_list[1][1], hands_list[1][2], hands_list[1][3], hands_list[1][4]]
        if compute_iou(new_box_l, old_box_l) > iou_threshold and \
                compute_iou(new_box_r, old_box_r) > iou_threshold:
            iou_times += 1
            if iou_times > iou_times_threshold:
                iou_flag = True
            elif iou_times > clas_times_threshold:
                clas_flag = True
        else:
            iou_times = 0
            iou_flag = False
            iou_flag_times = 0
            clas_flag = False
        old_box_l = new_box_l
        old_box_r = new_box_r
        single_hand_1 = hands_list[0][0]
        res_points_1 = keypointmodel.predict(single_hand_1)
        point1 = keypointmodel.vis_pose(single_hand_1, res_points_1, clas_hand_1)
        frame[hands_list[0][2]:hands_list[0][4], hands_list[0][1]:hands_list[0][3]] = single_hand_1
        single_hand_2 = hands_list[1][0]
        res_points_2 = keypointmodel.predict(single_hand_2)
        point2 = keypointmodel.vis_pose(single_hand_2, res_points_2, clas_hand_2)
        frame[hands_list[1][2]:hands_list[1][4], hands_list[1][1]:hands_list[1][3]] = single_hand_2
        if iou_flag and not clas_flag:
            if point1 != None and point2 != None:
                if clas_hand_1 == "left":
                    cv2.ellipse(frame, (point1[0]+hands_list[0][1], point1[1]+hands_list[0][2]), (12, 12), 0, 0, int(min(8 * iou_flag_times, 360)), (255, 255, 0), thickness=2)
                    cv2.ellipse(frame, (point2[0]+hands_list[1][1], point2[1]+hands_list[1][2]), (12, 12), 0, 0, int(min(8 * iou_flag_times, 360)), (255, 255, 0), thickness=2)
                iou_flag_times += 1
        elif clas_flag:
            # Extended part
            pass
    else:
        pass
    # Show the annotated frame and read the next one.
    cv2.imshow("frame", frame)
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break
    ret, frame = video.read()

Paddle Inference version

from keypoint import KeypointModel
from detect import HandDetectModel
from util import compute_iou
from config import *
import cv2

handdetectmodel = HandDetectModel(crop_size=crop_size)
keypointmodel = KeypointModel()

video = cv2.VideoCapture(0)
ret, frame = video.read()
h, w, _ = frame.shape
while ret:
    hands_list = handdetectmodel.predict(frame, detect_threshold)
    if len(hands_list) == 1:
        # The hand is in the right half of the frame if its box center x exceeds w/2, i.e. x1 + x2 > w.
        if (hands_list[0][1] + hands_list[0][3]) > w:
            clas_hand = "right"
        else:
            clas_hand = "left"
        single_hand = hands_list[0][0]
        res_points = keypointmodel.predict(single_hand)
        point = keypointmodel.vis_pose(single_hand, res_points, clas_hand)
        frame[hands_list[0][2]:hands_list[0][4], hands_list[0][1]:hands_list[0][3]] = single_hand
    elif len(hands_list) == 2:
        if (hands_list[0][1] + hands_list[0][3]) > (hands_list[1][1] + hands_list[1][3]):
            clas_hand_1 = "right"
            clas_hand_2 = "left"
            new_box_l = [hands_list[0][1], hands_list[0][2], hands_list[0][3], hands_list[0][4]]
            new_box_r = [hands_list[1][1], hands_list[1][2], hands_list[1][3], hands_list[1][4]]
        else:
            clas_hand_1 = "left"
            clas_hand_2 = "right"
            new_box_r = [hands_list[0][1], hands_list[0][2], hands_list[0][3], hands_list[0][4]]
            new_box_l = [hands_list[1][1], hands_list[1][2], hands_list[1][3], hands_list[1][4]]
        if compute_iou(new_box_l, old_box_l) > iou_threshold and \
                compute_iou(new_box_r, old_box_r) > iou_threshold:
            iou_times += 1
            if iou_times > iou_times_threshold:
                iou_flag = True
            elif iou_times > clas_times_threshold:
                clas_flag = True
        else:
            iou_times = 0
            iou_flag = False
            iou_flag_times = 0
            clas_flag = False
        old_box_l = new_box_l
        old_box_r = new_box_r
        single_hand_1 = hands_list[0][0]
        res_points_1 = keypointmodel.predict(single_hand_1)
        point1 = keypointmodel.vis_pose(single_hand_1, res_points_1, clas_hand_1)
        frame[hands_list[0][2]:hands_list[0][4], hands_list[0][1]:hands_list[0][3]] = single_hand_1
        single_hand_2 = hands_list[1][0]
        res_points_2 = keypointmodel.predict(single_hand_2)
        point2 = keypointmodel.vis_pose(single_hand_2, res_points_2, clas_hand_2)
        frame[hands_list[1][2]:hands_list[1][4], hands_list[1][1]:hands_list[1][3]] = single_hand_2
        if iou_flag and not clas_flag:
            if point1 != None and point2 != None:
                if clas_hand_1 == "left":
                    cv2.ellipse(frame, (point1[0]+hands_list[0][1], point1[1]+hands_list[0][2]), (12, 12), 0, 0, int(min(8 * iou_flag_times, 360)), (255, 255, 0), thickness=2)
                    cv2.ellipse(frame, (point2[0]+hands_list[1][1], point2[1]+hands_list[1][2]), (12, 12), 0, 0, int(min(8 * iou_flag_times, 360)), (255, 255, 0), thickness=2)
                iou_flag_times += 1
        elif clas_flag:
            # Extended part
            pass
    else:
        pass
    # Show the annotated frame and read the next one.
    cv2.imshow("frame", frame)
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break
    ret, frame = video.read()

Personal profile

My official account

    


The official account posts deep learning content: interesting deep learning applications, paper reading and reproduction, notes on deep learning books, and so on. I will also share the stories and thinking behind my AI Studio public projects there. Welcome to follow~

About the author

School: Harbin Institute of Technology (Shenzhen), junior
Interests (main account): image and video, reinforcement learning, point clouds
Interests (alt account): text and speech processing
Personal interests: I prefer interesting things and will open-source some fun projects; they are simple and suitable for beginners. Welcome to fork them.
Homepage: main homepage / alt homepage
Email: firewhitefox@qq.com
Official account: Hello Neural Networks

