获取人脸数据集

2024-11-28 14:47:32 +08:00
parent 58baaf68e1
commit 1ef65ff1a2
7 changed files with 113 additions and 7 deletions
@@ -143,4 +143,13 @@ cython_debug/
 # jetbrains
 .idea/
-*/.idea/
+*/.idea/
 # mnist dataset
 实验六/DataImages*
 实验六/data/
 实验六/cache/
 实验六/models/
 # dataset
 实验七/cache/
@@ -6,4 +6,8 @@ matplotlib
 pillow
 scikit-learn==1.3
 jupyterlab
-jupyterlab-language-pack-zh-CN
+jupyterlab-language-pack-zh-CN
 icecream
 torch
 torchvision
 rich
@@ -0,0 +1,77 @@
 import os
 import requests
 import tqdm
 import tarfile
 # 计算大小
 def get_human_readable_size(size_in_bytes):
    """Render a byte count as a string with two decimals and a binary unit.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit ('EB') is reached, e.g. ``1536`` becomes ``'1.50 KB'``.
    """
    suffixes = ('B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB')
    remaining = size_in_bytes
    for suffix in suffixes[:-1]:
        # Negated ">=" (rather than "<") so a NaN stops at the first unit.
        if not remaining >= 1024:
            return f"{remaining:.2f} {suffix}"
        remaining /= 1024
    # Anything still this large is reported in the last unit.
    return f"{remaining:.2f} {suffixes[-1]}"
 # 下载与进度条
 def download(url, output_path, timeout=30):
    """Download ``url`` into the directory ``output_path`` with a progress bar.

    The file name is the last ``/``-separated component of ``url``. If a file
    with that name already exists in ``output_path`` the function returns
    without making any network request.

    Args:
        url: Address of the file to fetch.
        output_path: Directory to store the file in; created if missing.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Raises:
        requests.HTTPError: via ``raise_for_status()`` when the server
            answers with an error status, so an error page is never saved
            as if it were the archive.
    """
    filename = url.split('/')[-1]
    download_path = os.path.join(output_path, filename)
    os.makedirs(output_path, exist_ok=True)
    # Skip before touching the network: no wasted request, no unclosed stream.
    if os.path.exists(download_path):
        print(filename, ' 已存在，跳过')
        return
    # Write to a temporary name first so an interrupted download can never be
    # mistaken for a finished file by the existence check above.
    partial_path = download_path + '.part'
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        total_size = int(response.headers.get('content-length', 0))
        print("下载 ", filename, ' 到 ', output_path, ' |大小= ', get_human_readable_size(total_size))
        with open(partial_path, 'wb') as file, tqdm.tqdm(
            desc='下载进度',
            # Unknown size (no content-length header): let tqdm just count.
            total=total_size or None,
            unit='B',
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                if chunk:
                    file.write(chunk)
                    bar.update(len(chunk))
    # Only a completely written file gets the final name.
    os.replace(partial_path, download_path)
 def decompress(file_path, output_dir):
    """Extract the tar archive ``file_path`` into ``output_dir`` with a progress bar.

    Members whose target path already exists under ``output_dir`` are skipped
    rather than overwritten.

    Args:
        file_path: Path of the tar archive (any compression ``tarfile.open``
            detects).
        output_dir: Directory to extract into; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)
    print('解压 ' + os.path.basename(file_path), ' 到 ', output_dir)
    # The archives are fetched from the network, so reject members that would
    # escape output_dir (absolute paths, "..", outside links) by using the
    # stdlib "data" extraction filter. Older interpreters without extraction
    # filters fall back to the previous unfiltered behaviour.
    extract_options = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}
    with tarfile.open(file_path) as tar:
        members = tar.getmembers()
        with tqdm.tqdm(total=len(members), desc='解压进度', unit='file') as bar:
            for member in members:
                # Full destination path of this member
                full_path = os.path.join(output_dir, member.name)
                # FDDB contains duplicate entries for the same path, so
                # anything that already exists on disk is left untouched.
                if not os.path.exists(full_path):
                    tar.extract(member, path=output_dir, **extract_options)
                bar.update(1)
 # Face image archive (FDDB original pictures) and its local cache directory
 face_dataset_url = 'http://vis-www.cs.umass.edu/fddb/originalPics.tar.gz'
 face_dataset_path = 'cache/dataset/face/'
 # Face label archive (FDDB folds) and its local cache directory
 face_label_path = 'cache/dataset/face/label/'
 face_label_url = 'http://vis-www.cs.umass.edu/fddb/FDDB-folds.tgz'
 # (url, directory) pairs, processed in this order
 _archives = (
    (face_dataset_url, face_dataset_path),
    (face_label_url, face_label_path),
 )
 # Fetch every archive first ...
 for _url, _directory in _archives:
    download(_url, _directory)
 # ... then unpack each one into the directory it was saved in
 for _url, _directory in _archives:
    decompress(os.path.join(_directory, _url.split('/')[-1]), _directory)
@@ -0,0 +1,16 @@
 # 这是一个示例 Python 脚本。
 # 按 Shift+F10 执行或将其替换为您的代码。
 # 按 双击 Shift 在所有地方搜索类、文件、工具窗口、操作和设置。
 def print_hi(name):
    """Print the greeting ``Hi, <name>`` to standard output."""
    greeting = f'Hi, {name}'
    print(greeting)
 # Press the green button in the gutter to run the script.
 # The greeting below runs only when this file is executed directly.
 if __name__ == '__main__':
    print_hi('PyCharm')
 # 访问 https://www.jetbrains.com/help/pycharm/ 获取 PyCharm 帮助
@@ -16,5 +16,5 @@ def save_img_subset(data, save_path, num_samples):
        img.save(os.path.join(save_path, f"{i}-label-{label}.png"))
 # 保存前 60000 张训练集图片和前 10000 张测试集图片
-save_img_subset(train_data, './DataImages-Train', 6000)
+save_img_subset(train_data, './DataImages-Train', 60000)
-save_img_subset(test_data, './DataImages-Test', 1000)
+save_img_subset(test_data, './DataImages-Test', 10000)
@@ -64,8 +64,8 @@ def ensure_dir_exists(directory):
        os.makedirs(directory)
 # 加载训练数据
-trains_paths, trains_labels = load_data("cache/pretrains/train")
+trains_paths, trains_labels = load_data("DataImages-Train")
-test_paths, test_labels = load_data("cache/pretrains/test")
+test_paths, test_labels = load_data("DataImages-Test")
 # 提取特征和标签
 X_train = np.array([extract_features(train_path) for train_path in tqdm.tqdm(trains_paths, desc="训练集特征提取中：")])
@@ -87,7 +87,7 @@ for test_sample in tqdm.tqdm(X_test, desc="测试集中预测进度"):
    Y_pred.append(classifier.predict(test_sample.reshape(1, -1)))
 accuracy = accuracy_score(Y_test, Y_pred)
-print(f"性能: {accuracy * 100:.2f}%")
+print(f"准确率: {accuracy * 100:.2f}%")
 # 保存模型
 ensure_dir_exists("models")