# Download and prepare the face (FDDB) and non-face (CIFAR-10) datasets.
import os
import requests
import tqdm
import tarfile
import pickle
import numpy as np
from PIL import Image
|
|
def get_human_readable_size(size_in_bytes):
    """Format a byte count as a human-readable string, e.g. 1536 -> '1.50 KB'."""
    value = size_in_bytes
    # Walk up the unit ladder, dividing by 1024 until the value fits the unit.
    for unit in ('B', 'KB', 'MB', 'GB', 'TB', 'PB'):
        if value < 1024:
            return f"{value:.2f} {unit}"
        value /= 1024
    # Anything beyond PB is reported in EB (matches the original unit cap).
    return f"{value:.2f} EB"
|
|
|
|
# Download a URL into a directory with a tqdm progress bar.
def download(url, output_path):
    """Download *url* into *output_path*, showing a progress bar.

    The target filename is the last path segment of the URL. The output
    directory is created on demand, and the transfer is skipped when the
    file already exists.

    Raises:
        requests.HTTPError: if the server responds with a 4xx/5xx status
            (prevents silently saving an error page as the archive).
    """
    filename = url.split('/')[-1]
    download_path = os.path.join(output_path, filename)
    os.makedirs(output_path, exist_ok=True)

    # Stream the body so large archives are never held in memory; the
    # with-block guarantees the connection is released on every exit path
    # (the original leaked the open response on the already-exists return).
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        total_size = int(response.headers.get('content-length', 0))

        print("下载 ", filename, ' 到 ', output_path, ' |大小= ', get_human_readable_size(total_size))

        # Skip if already downloaded.
        if os.path.exists(download_path):
            print(filename, ' 已存在,跳过')
            return

        with open(download_path, 'wb') as file, tqdm.tqdm(
            desc='下载进度',
            total=total_size,
            unit='B',
            unit_scale=True,
            unit_divisor=1024
        ) as bar:
            for chunk in response.iter_content(chunk_size=1024):
                file.write(chunk)
                bar.update(len(chunk))
|
|
|
|
|
|
def decompress(file_path, output_dir):
    """Unpack a tar archive into *output_dir*, skipping files that already exist."""
    os.makedirs(output_dir, exist_ok=True)
    print('解压 '+file_path.split('/')[-1], ' 到 ', output_dir)

    with tarfile.open(file_path) as tar:
        entries = tar.getmembers()
        with tqdm.tqdm(total=len(entries), desc='解压进度', unit='file') as bar:
            for entry in entries:
                target = os.path.join(output_dir, entry.name)
                # FDDB ships duplicate entries for the same path, so an
                # already-present file is left alone instead of re-extracted.
                if not os.path.exists(target):
                    tar.extract(entry, path=output_dir)
                bar.update(1)
|
|
|
# Dataset locations: FDDB faces + annotations, CIFAR-10 as the non-face source.
face_dataset_url = 'http://vis-www.cs.umass.edu/fddb/originalPics.tar.gz'
face_dataset_path = 'cache/dataset/face/'
face_label_url = 'http://vis-www.cs.umass.edu/fddb/FDDB-folds.tgz'
face_label_path = 'cache/dataset/face/label/'
non_face_url = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
non_face_path = 'cache/dataset/non_face/'

_archives = [
    (face_dataset_url, face_dataset_path),   # face images
    (face_label_url, face_label_path),       # face labels
    (non_face_url, non_face_path),           # non-face images
]

# Fetch every archive first, then unpack them in the same order.
for _url, _path in _archives:
    download(_url, _path)

for _url, _path in _archives:
    decompress(os.path.join(_path, _url.split('/')[-1]), _path)
|
|
|
|
# Decode a pickled CIFAR-10 batch file.
def unpickle(file_path):
    """Load one CIFAR-10 batch; keys come back as bytes (encoding='bytes')."""
    # NOTE: pickle is only acceptable here because the data comes from the
    # official CIFAR-10 distribution — never unpickle untrusted input.
    with open(file_path, 'rb') as fh:
        batch = pickle.load(fh, encoding='bytes')
    return batch
|
|
|
|
def save_images(data, labels, directory):
    """Write CIFAR-10 rows as PNG files named '<label>_<index>.png'.

    Each row of *data* is a flat CIFAR-10 image stored channel-first
    (3, 32, 32); it is transposed to height/width/channel before saving.
    *labels* is the parallel list of class ids.
    """
    os.makedirs(directory, exist_ok=True)
    # enumerate(zip(...)) instead of indexing two parallel lists by range(len(...)).
    for i, (row, label) in enumerate(tqdm.tqdm(zip(data, labels), total=len(data))):
        img = row.reshape((3, 32, 32)).transpose((1, 2, 0)).astype(np.uint8)
        img_filename = os.path.join(directory, f"{label}_{i}.png")
        Image.fromarray(img).save(img_filename)
|
|
|
|
# --- Script tail: decode one CIFAR-10 batch into PNG files ---

print('解码非人脸数据集:')

# The extracted archive lays out as <non_face_path>/cifar-10-batches-py/data_batch_*.
cifar10_dir = os.path.join(non_face_path, 'cifar-10-batches-py')
output_dir = non_face_path
# NOTE(review): only the first of the training batches is decoded here —
# confirm whether the remaining data_batch_2..5 files are needed.
batch_file = 'data_batch_1'
file_path = os.path.join(cifar10_dir, batch_file)
batch_data = unpickle(file_path)

# unpickle() loads with encoding='bytes', so the dict keys are bytes literals.
images = batch_data[b'data']
labels = batch_data[b'labels']

save_images(images, labels, output_dir)