# VGG_based_multi_input_ASL_recognition / data preprocessing.py
# Author: CantonMonkey
# Commit: add app files (4db3489)
import cv2
import mediapipe as mp
import numpy as np
import os
import random
from tqdm import tqdm
import shutil
class HandProcessor:
    """Crop hand regions out of still images and collect their landmarks.

    Each image is run through MediaPipe Hands; the first detected hand is
    used to build a padded bounding-box crop and a flat list of normalized
    ``x, y`` landmark coordinates.

    The instance can be used as a context manager so the underlying
    MediaPipe graph is released when processing is done.
    """

    def __init__(
        self,
        min_detection_confidence=0.8,
        min_tracking_confidence=0.5,
        static_image_mode=True,
        offset=10,
        max_images_per_class=1400,
    ):
        """Create the MediaPipe Hands detector and store cropping settings.

        Args:
            min_detection_confidence: Forwarded to ``mp.solutions.hands.Hands``.
            min_tracking_confidence: Forwarded to ``mp.solutions.hands.Hands``.
            static_image_mode: Forwarded to ``mp.solutions.hands.Hands``.
                Defaults to ``True`` because the dataset consists of
                unrelated still images; MediaPipe's video mode tracks the
                hand across consecutive frames and may skip detection,
                which is wrong for independent pictures.
            offset: Padding in pixels added on every side of the hand
                bounding box before cropping.
            max_images_per_class: Maximum number of successfully processed
                images kept per class directory.
        """
        self.mpHands = mp.solutions.hands
        self.hands = self.mpHands.Hands(
            static_image_mode=static_image_mode,
            min_detection_confidence=min_detection_confidence,
            min_tracking_confidence=min_tracking_confidence,
        )
        self.offset = offset
        self.max_images_per_class = max_images_per_class

    def close(self):
        """Release the resources held by the MediaPipe Hands detector."""
        self.hands.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    @staticmethod
    def _clamp(value, upper):
        """Clamp ``value`` into the closed range ``[0, upper]``."""
        return max(0, min(upper, value))

    def get_bbox_coordinates(self, results, image_shape):
        """Return the pixel bounding box of the first detected hand.

        Args:
            results: Detection result exposing ``multi_hand_landmarks``.
            image_shape: Image shape; index 0 is the height and index 1 the
                width used to scale the normalized landmark coordinates.

        Returns:
            ``(min_x, min_y, max_x, max_y)`` in pixels, or ``None`` when no
            hand (or no landmark) is present. Values are not clamped to the
            image, so they can fall outside it.
        """
        if not results.multi_hand_landmarks:
            return None

        height, width = image_shape[0], image_shape[1]
        points = list(results.multi_hand_landmarks[0].landmark)
        if not points:
            return None

        xs = [int(point.x * width) for point in points]
        ys = [int(point.y * height) for point in points]
        return min(xs), min(ys), max(xs), max(ys)

    def extract_landmarks(self, results):
        """Return the first hand's landmarks as a flat ``[x0, y0, x1, y1, ...]`` list.

        The coordinates are returned exactly as reported in ``results``
        (normalized, not scaled to pixels). Returns ``None`` when no hand
        was detected.
        """
        if not results.multi_hand_landmarks:
            return None

        coords = []
        for point in results.multi_hand_landmarks[0].landmark:
            coords.extend((point.x, point.y))
        return coords

    def process_image(self, img_path):
        """Detect a hand in one image file and crop around it.

        Args:
            img_path: Path of the image to read with ``cv2.imread``.

        Returns:
            ``(cropped_image, landmarks)`` on success, or ``(None, None)``
            when the file cannot be read, no hand is found, or the padded
            bounding box does not overlap the image.
        """
        img = cv2.imread(img_path)
        if img is None:
            return None, None

        # MediaPipe expects RGB input while OpenCV loads images as BGR.
        results = self.hands.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

        landmarks = self.extract_landmarks(results)
        if landmarks is None:
            return None, None

        bbox = self.get_bbox_coordinates(results, img.shape)
        if bbox is None:
            return None, None

        min_x, min_y, max_x, max_y = bbox
        height, width = img.shape[:2]

        # Clamp every edge to the image. Without the lower clamp on the
        # right/bottom edges a hand predicted entirely off the top/left of
        # the frame yields a negative slice end, which numpy interprets as
        # "count from the end" and silently returns a wrong crop.
        left = self._clamp(min_x - self.offset, width)
        top = self._clamp(min_y - self.offset, height)
        right = self._clamp(max_x + self.offset, width)
        bottom = self._clamp(max_y + self.offset, height)
        if right <= left or bottom <= top:
            return None, None

        img_crop = img[top:bottom, left:right]
        if img_crop.size == 0:
            return None, None
        return img_crop, landmarks

    def process_dataset(self, input_path, output_img_path, output_landmarks_path):
        """Process a directory-per-class dataset.

        For every class sub-directory of ``input_path`` up to
        ``self.max_images_per_class`` images are cropped and written to the
        matching sub-directory of ``output_img_path``. All landmarks are then
        saved to ``hand_landmarks.npz`` inside ``output_landmarks_path``, keyed
        by ``os.path.join(class_name, img_name)``.

        Args:
            input_path: Root directory containing one folder per class.
            output_img_path: Root directory for the cropped images.
            output_landmarks_path: Directory receiving ``hand_landmarks.npz``.
        """
        os.makedirs(output_img_path, exist_ok=True)
        os.makedirs(output_landmarks_path, exist_ok=True)
        landmarks_dict = {}

        # Sorted listings make the selection of the capped subset
        # reproducible; os.listdir returns entries in arbitrary order.
        for class_name in tqdm(sorted(os.listdir(input_path)), desc="Processing classes"):
            input_class_path = os.path.join(input_path, class_name)
            if not os.path.isdir(input_class_path):
                continue

            output_class_path = os.path.join(output_img_path, class_name)
            os.makedirs(output_class_path, exist_ok=True)

            processed_count = 0
            for img_name in sorted(os.listdir(input_class_path)):
                if processed_count >= self.max_images_per_class:
                    break

                cropped_img, landmarks = self.process_image(
                    os.path.join(input_class_path, img_name)
                )
                if cropped_img is None or landmarks is None:
                    continue

                # Only record landmarks for crops that actually reached the
                # disk, so the image folder and the .npz stay in sync.
                output_img_path_full = os.path.join(output_class_path, img_name)
                if not cv2.imwrite(output_img_path_full, cropped_img):
                    continue

                landmarks_dict[os.path.join(class_name, img_name)] = landmarks
                processed_count += 1

        np_path = os.path.join(output_landmarks_path, 'hand_landmarks.npz')
        np.savez_compressed(np_path, **landmarks_dict)
        print(f"Processed {len(landmarks_dict)} images")
        print(f"Cropped images saved to: {output_img_path}")
        print(f"Landmarks data saved to: {output_landmarks_path}")
if __name__ == "__main__":
    # argparse is only needed when the file is executed as a script.
    import argparse

    # The defaults reproduce the previously hard-coded locations, so running
    # the script without arguments behaves exactly as before.
    parser = argparse.ArgumentParser(
        description="Crop hands out of a class-per-folder image dataset and export their landmarks."
    )
    parser.add_argument(
        "--input-path",
        default="E:/ML_ASL_try/asl_alphabet_train/asl_alphabet_train/",
        help="Root directory containing one sub-directory per class.",
    )
    parser.add_argument(
        "--output-img-path",
        default="E:/ML_ASL_try/new_cropped_images/",
        help="Directory that receives the cropped images.",
    )
    parser.add_argument(
        "--output-landmarks-path",
        default="E:/ML_ASL_try/new_landmarks_data/",
        help="Directory that receives hand_landmarks.npz.",
    )
    args = parser.parse_args()

    processor = HandProcessor()
    processor.process_dataset(
        args.input_path, args.output_img_path, args.output_landmarks_path
    )