(一)构建自己的图像分类数据集

一、图像采集

1.1、导包

import os
import time
import requests
import urllib3
urllib3.disable_warnings()
from tqdm import tqdm
View Code

 

1.2、参数请求

# Cookies sent with the image-search request in craw_single_class.
# NOTE(review): these look like values copied from one browser session on
# baidu.com (the URL-encoded search word is embedded in 'BDqhfp' and
# 'indexPageSugList'); presumably they expire — confirm and refresh if needed.
cookies = {
'BDqhfp': '%E7%8B%97%E7%8B%97%26%26NaN-1undefined%26%2618880%26%2621',
'BIDUPSID': '06338E0BE23C6ADB52165ACEB972355B',
'PSTM': '1646905430',
'BAIDUID': '104BD58A7C408DABABCAC9E0A1B184B4:FG=1',
'BDORZ': 'B490B5EBF6F3CD402E515D22BCDA1598',
'H_PS_PSSID': '35836_35105_31254_36024_36005_34584_36142_36120_36032_35993_35984_35319_26350_35723_22160_36061',
'BDSFRCVID': '8--OJexroG0xMovDbuOS5T78igKKHJQTDYLtOwXPsp3LGJLVgaSTEG0PtjcEHMA-2ZlgogKK02OTH6KF_2uxOjjg8UtVJeC6EG0Ptf8g0M5',
'H_BDCLCKID_SF': 'tJPqoKtbtDI3fP36qR3KhPt8Kpby2D62aKDs2nopBhcqEIL4QTQM5p5yQ2c7LUvtynT2KJnz3Po8MUbSj4QoDjFjXJ7RJRJbK6vwKJ5s5h5nhMJSb67JDMP0-4F8exry523ioIovQpn0MhQ3DRoWXPIqbN7P-p5Z5mAqKl0MLPbtbb0xXj_0D6bBjHujtT_s2TTKLPK8fCnBDP59MDTjhPrMypomWMT-0bFH_-5L-l5js56SbU5hW5LSQxQ3QhLDQNn7_JjOX-0bVIj6Wl_-etP3yarQhxQxtNRdXInjtpvhHR38MpbobUPUDa59LUvEJgcdot5yBbc8eIna5hjkbfJBQttjQn3hfIkj0DKLtD8bMC-RDjt35n-Wqxobbtof-KOhLTrJaDkWsx7Oy4oTj6DD5lrG0P6RHmb8ht59JROPSU7mhqb_3MvB-fnEbf7r-2TP_R6GBPQtqMbIQft20-DIeMtjBMJaJRCqWR7jWhk2hl72ybCMQlRX5q79atTMfNTJ-qcH0KQpsIJM5-DWbT8EjHCet5DJJn4j_Dv5b-0aKRcY-tT5M-Lf5eT22-usy6Qd2hcH0KLKDh6gb4PhQKuZ5qutLTb4QTbqWKJcKfb1MRjvMPnF-tKZDb-JXtr92nuDal5TtUthSDnTDMRhXfIL04nyKMnitnr9-pnLJpQrh459XP68bTkA5bjZKxtq3mkjbPbDfn02eCKuj6tWj6j0DNRabK6aKC5bL6rJabC3b5CzXU6q2bDeQN3OW4Rq3Irt2M8aQI0WjJ3oyU7k0q0vWtvJWbbvLT7johRTWqR4enjb3MonDh83Mxb4BUrCHRrzWn3O5hvvhKoO3MA-yUKmDloOW-TB5bbPLUQF5l8-sq0x0bOte-bQXH_E5bj2qRCqVIKa3f',
'BDSFRCVID_BFESS': '8--OJexroG0xMovDbuOS5T78igKKHJQTDYLtOwXPsp3LGJLVgaSTEG0PtjcEHMA-2ZlgogKK02OTH6KF_2uxOjjg8UtVJeC6EG0Ptf8g0M5',
'H_BDCLCKID_SF_BFESS': 'tJPqoKtbtDI3fP36qR3KhPt8Kpby2D62aKDs2nopBhcqEIL4QTQM5p5yQ2c7LUvtynT2KJnz3Po8MUbSj4QoDjFjXJ7RJRJbK6vwKJ5s5h5nhMJSb67JDMP0-4F8exry523ioIovQpn0MhQ3DRoWXPIqbN7P-p5Z5mAqKl0MLPbtbb0xXj_0D6bBjHujtT_s2TTKLPK8fCnBDP59MDTjhPrMypomWMT-0bFH_-5L-l5js56SbU5hW5LSQxQ3QhLDQNn7_JjOX-0bVIj6Wl_-etP3yarQhxQxtNRdXInjtpvhHR38MpbobUPUDa59LUvEJgcdot5yBbc8eIna5hjkbfJBQttjQn3hfIkj0DKLtD8bMC-RDjt35n-Wqxobbtof-KOhLTrJaDkWsx7Oy4oTj6DD5lrG0P6RHmb8ht59JROPSU7mhqb_3MvB-fnEbf7r-2TP_R6GBPQtqMbIQft20-DIeMtjBMJaJRCqWR7jWhk2hl72ybCMQlRX5q79atTMfNTJ-qcH0KQpsIJM5-DWbT8EjHCet5DJJn4j_Dv5b-0aKRcY-tT5M-Lf5eT22-usy6Qd2hcH0KLKDh6gb4PhQKuZ5qutLTb4QTbqWKJcKfb1MRjvMPnF-tKZDb-JXtr92nuDal5TtUthSDnTDMRhXfIL04nyKMnitnr9-pnLJpQrh459XP68bTkA5bjZKxtq3mkjbPbDfn02eCKuj6tWj6j0DNRabK6aKC5bL6rJabC3b5CzXU6q2bDeQN3OW4Rq3Irt2M8aQI0WjJ3oyU7k0q0vWtvJWbbvLT7johRTWqR4enjb3MonDh83Mxb4BUrCHRrzWn3O5hvvhKoO3MA-yUKmDloOW-TB5bbPLUQF5l8-sq0x0bOte-bQXH_E5bj2qRCqVIKa3f',
'indexPageSugList': '%5B%22%E7%8B%97%E7%8B%97%22%5D',
'cleanHistoryStatus': '0',
'BAIDUID_BFESS': '104BD58A7C408DABABCAC9E0A1B184B4:FG=1',
'BDRCVFR[dG2JNJb_ajR]': 'mk3SLVN4HKm',
'BDRCVFR[-pGxjrCMryR]': 'mk3SLVN4HKm',
'ab_sr': '1.0.1_Y2YxZDkwMWZkMmY2MzA4MGU0OTNhMzVlNTcwMmM2MWE4YWU4OTc1ZjZmZDM2N2RjYmVkMzFiY2NjNWM4Nzk4NzBlZTliYWU0ZTAyODkzNDA3YzNiMTVjMTllMzQ0MGJlZjAwYzk5MDdjNWM0MzJmMDdhOWNhYTZhMjIwODc5MDMxN2QyMmE1YTFmN2QyY2M1M2VmZDkzMjMyOThiYmNhZA==',
'delPer': '0',
'PSINO': '2',
'BA_HECTOR': '8h24a024042g05alup1h3g0aq0q',
}

# HTTP headers sent with the image-search request in craw_single_class.
# They describe an XMLHttpRequest issued by desktop Chrome 97 on macOS, with
# the Baidu image-search result page as Referer.
headers = {
'Connection': 'keep-alive',
'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
'Accept': 'text/plain, */*; q=0.01',
'X-Requested-With': 'XMLHttpRequest',
'sec-ch-ua-mobile': '?0',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36',
'sec-ch-ua-platform': '"macOS"',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Dest': 'empty',
'Referer': 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1647837998851_R&pv=&ic=&nc=1&z=&hd=&latest=&copyright=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&dyTabStr=MCwzLDIsNiwxLDUsNCw4LDcsOQ%3D%3D&ie=utf-8&sid=&word=%E7%8B%97%E7%8B%97',
'Accept-Language': 'zh-CN,zh;q=0.9',
}
View Code

 

1.3、爬取函数封装

def _build_search_params(keyword, offset, page_size=30):
    """Return the query parameters for one page of the Baidu image-search JSON API.

    ``offset`` is sent as ``pn`` and ``page_size`` as ``rn``; every other
    parameter is a fixed value.
    """
    return (
        ('tn', 'resultjson_com'),
        ('logid', '12508239107856075440'),
        ('ipn', 'rj'),
        ('ct', '201326592'),
        ('is', ''),
        ('fp', 'result'),
        ('fr', ''),
        ('word', keyword),
        ('queryWord', keyword),
        ('cl', '2'),
        ('lm', '-1'),
        ('ie', 'utf-8'),
        ('oe', 'utf-8'),
        ('adpicid', ''),
        ('st', '-1'),
        ('z', ''),
        ('ic', ''),
        ('hd', ''),
        ('latest', ''),
        ('copyright', ''),
        ('s', ''),
        ('se', ''),
        ('tab', ''),
        ('width', ''),
        ('height', ''),
        ('face', '0'),
        ('istype', '2'),
        ('qc', ''),
        ('nc', '1'),
        ('expermode', ''),
        ('nojc', ''),
        ('isAsync', ''),
        ('pn', str(offset)),
        ('rn', str(page_size)),
        ('gsm', '1e'),
        ('1647838001666', ''),
    )


def _save_image(url, file_path, timeout):
    """Download ``url`` into ``file_path``; return True on success, False on any network or file error."""
    try:
        # verify=False mirrors the original request (certificate checks are skipped)
        resp = requests.get(url=url, verify=False, timeout=timeout)
        resp.raise_for_status()  # do not store an HTTP error page as an image
        with open(file_path, 'wb') as f:
            f.write(resp.content)
    except (requests.RequestException, OSError) as exc:
        # tqdm.write prints without corrupting the progress bar
        tqdm.write('图像下载失败 {}: {}'.format(url, exc))
        return False
    return True


def craw_single_class(keyword, DOWNLOAD_NUM=200):
    """Download up to ``DOWNLOAD_NUM`` search-result thumbnails for ``keyword``.

    Images are saved as ``dataset/<keyword>/<index>.<type>`` with the index
    starting at 0; the folder is created when missing and existing files with
    the same name are overwritten. GIF results are skipped.

    Crawling stops when the requested number of images has been saved, when
    the search request fails or returns a non-200 status, or when several
    consecutive result pages yield no image at all (otherwise an exhausted
    search would loop forever).

    Args:
        keyword: Search term; also used as the sub-folder name.
        DOWNLOAD_NUM: Maximum number of images to save.
    """
    page_size = 30         # results requested per page ('rn')
    request_timeout = 10   # seconds, for every HTTP request
    max_empty_pages = 3    # consecutive pages without a saved image before giving up

    save_dir = 'dataset/{}'.format(keyword)
    if os.path.exists(save_dir):
        print('文件夹 dataset/{} 已存在,之后直接将爬取到的图片保存至该文件夹中'.format(keyword))
    else:
        os.makedirs(save_dir)
        print('新建文件夹:dataset/{}'.format(keyword))

    num = 0            # images saved so far; also the next file index
    page_number = 1    # NOTE(review): starts at 1, so the first request uses pn=30 — confirm pn=0 is skipped on purpose
    empty_pages = 0

    with tqdm(total=DOWNLOAD_NUM, position=0, leave=True) as pbar:
        while num < DOWNLOAD_NUM and empty_pages < max_empty_pages:
            params = _build_search_params(keyword, page_size * page_number, page_size)
            page_number += 1

            try:
                response = requests.get('https://image.baidu.com/search/acjson', headers=headers,
                                        params=params, cookies=cookies, timeout=request_timeout)
            except requests.RequestException as exc:
                tqdm.write('搜索请求失败,停止爬取 {}: {}'.format(keyword, exc))
                break
            if response.status_code != 200:
                break

            try:
                payload = response.json()
            except ValueError:
                # Body was not valid JSON: treat it as a page without results
                payload = None
            items = payload.get("data") if isinstance(payload, dict) else None

            saved_before_page = num
            for item in items or []:
                if num >= DOWNLOAD_NUM:
                    break
                if not isinstance(item, dict):
                    continue
                img_url = item.get("thumbURL")
                img_type = item.get("type")
                if not img_url or img_type == "gif":
                    continue
                # assumes a thumbnail without a declared type is a JPEG — TODO confirm
                file_save_path = '{}/{}.{}'.format(save_dir, num, img_type or 'jpg')

                saved = _save_image(img_url, file_save_path, request_timeout)
                time.sleep(1)  # throttle between image downloads
                if saved:
                    num += 1
                    pbar.update(1)

            empty_pages = empty_pages + 1 if num == saved_before_page else 0

    if num >= DOWNLOAD_NUM:
        print('{} 张图像爬取完毕'.format(num))
View Code

 

1.4、调用爬取函数

# Crawl one class
craw_single_class('柚子', DOWNLOAD_NUM=200)

# Crawl several classes, one after another
class_list = [
    '黄瓜', '南瓜', '冬瓜', '木瓜', '苦瓜', '丝瓜', '窝瓜', '甜瓜',
    '香瓜', '白兰瓜', '黄金瓜', '西葫芦', '人参果', '羊角蜜', '佛手瓜', '伊丽莎白瓜',
]
for class_name in class_list:
    craw_single_class(class_name, DOWNLOAD_NUM=200)
View Code

 

二、统计图像尺寸,比例分布

2.1、导包

import os
import numpy as np
import pandas as pd
import cv2
from tqdm import tqdm
from scipy.stats import gaussian_kde
import matplotlib.pyplot as plt
View Code

2.2、指定根目录路径,读取路径下的所有文件至DataFrame中

# Root folder of the dataset: one sub-folder per class, images inside
dataset_path = 'fruit81_full'

# One record per readable image. The records are gathered in a list and the
# DataFrame is built once at the end: DataFrame.append was removed in
# pandas 2.0 and copies the whole frame on every call.
records = []
for fruit in tqdm(sorted(os.listdir(dataset_path))):  # every class
    class_dir = os.path.join(dataset_path, fruit)
    if not os.path.isdir(class_dir):
        continue  # stray files in the root folder are not classes

    for file in os.listdir(class_dir):  # every image of the class
        img_path = os.path.join(class_dir, file)
        try:
            # np.fromfile + cv2.imdecode is the idiom this file already uses
            # for paths containing Chinese characters
            img = cv2.imdecode(np.fromfile(img_path, dtype=np.uint8), cv2.IMREAD_COLOR)
        except (OSError, cv2.error):
            img = None
        if img is None:
            print(os.path.join(fruit, file), '读取错误')
            continue

        height, width = img.shape[:2]
        records.append({'类别': fruit, '文件名': file, '图像宽': width, '图像高': height})

df = pd.DataFrame(records, columns=['类别', '文件名', '图像宽', '图像高'])

print(df)
df.to_csv('image.csv')
View Code
dataset_path = 'fruit81_full'
os.chdir(dataset_path) # 切换到当前path的路径下
os.chdir(path) 表示切换到指定的path路径下
 

2.3、可视化图像尺寸分布

# Image widths and heights, one entry per image
x = df['图像宽']
y = df['图像高']

# Gaussian kernel density estimate of the (width, height) points, evaluated
# at those same points; used below as the colour of each dot
xy = np.vstack((x, y))
z = gaussian_kde(xy)(xy)

# Reorder by density so that the densest points are plotted last
idx = np.argsort(z)
x, y, z = x[idx], y[idx], z[idx]

plt.figure(figsize=(10, 10))
plt.scatter(x, y, s=5, c=z, cmap='Spectral_r')
plt.tick_params(labelsize=15)

# Both axes run from 0 to the largest width or height in the dataset
xy_max = max(df['图像宽'].max(), df['图像高'].max())
plt.xlim(0, xy_max)
plt.ylim(0, xy_max)

plt.xlabel('width', fontsize=25)
plt.ylabel('height', fontsize=25)

plt.savefig('图像尺寸分布.pdf', dpi=120, bbox_inches='tight')

plt.show()
View Code

 

 

 

 

三、划分训练集、测试集

3.1、导包:

import os
import shutil
import random
import pandas as pd
View Code

3.2、获取所有类别名称:

# Root folder of the dataset
dataset_path = 'fruit81_full'

# Dataset name: the part of the folder name before the first underscore
dataset_name = dataset_path.split('_')[0]
print('数据集', dataset_name)

# Every sub-folder is one class. Plain files in the root folder are ignored
# (they would break the per-class mkdir/listdir steps), and the names are
# sorted because os.listdir returns them in arbitrary order.
classes = sorted(
    entry for entry in os.listdir(dataset_path)
    if os.path.isdir(os.path.join(dataset_path, entry))
)
print(len(classes))
print(classes)
View Code

3.3、创建训练集文件夹和测试集文件夹

# Create the two top-level folders: 'train' for the training set and 'val'
# for the held-out (test) set
for split_name in ('train', 'val'):
    os.mkdir(os.path.join(dataset_path, split_name))

# Inside both of them, create one sub-folder per class
for fruit in classes:
    for split_name in ('train', 'val'):
        os.mkdir(os.path.join(dataset_path, split_name, fruit))
View Code

3.4、划分训练集、测试集,移动文件

# Split every class into a training set and a test set, and move the files
test_frac = 0.2   # fraction of each class that goes to the test set
random.seed(123)  # fixed seed so the split can be reproduced

# Per-class counts; the DataFrame is built once after the loop because
# DataFrame.append was removed in pandas 2.0
rows = []

print('{:^18} {:^18} {:^18}'.format('类别', '训练集数据个数', '测试集数据个数'))

for fruit in classes:  # every class

    # File names of all images of this class. os.listdir returns them in
    # arbitrary order, so sort first: otherwise the seeded shuffle would not
    # give the same split on every machine.
    old_dir = os.path.join(dataset_path, fruit)
    images_filename = sorted(os.listdir(old_dir))
    random.shuffle(images_filename)

    # The first test_frac of the shuffled names form the test set
    testset_number = int(len(images_filename) * test_frac)
    testset_images = images_filename[:testset_number]
    trainset_images = images_filename[testset_number:]

    # Move the test images to 'val/<class>' and the rest to 'train/<class>'
    for split_name, split_images in (('val', testset_images), ('train', trainset_images)):
        for image in split_images:
            old_img_path = os.path.join(old_dir, image)
            new_img_path = os.path.join(dataset_path, split_name, fruit, image)
            shutil.move(old_img_path, new_img_path)

    # Remove the old class folder, which must be empty by now. An explicit
    # check is used because assert statements are skipped under `python -O`;
    # os.rmdir additionally refuses to delete a non-empty folder.
    leftover = os.listdir(old_dir)
    if leftover:
        raise RuntimeError('{} still contains {} entries after the split'.format(old_dir, len(leftover)))
    os.rmdir(old_dir)

    # Aligned per-class summary line
    print('{:^18} {:^18} {:^18}'.format(fruit, len(trainset_images), len(testset_images)))

    rows.append({'class': fruit, 'trainset': len(trainset_images), 'testset': len(testset_images)})

# Rename the dataset folder
shutil.move(dataset_path, dataset_name + '_split')

# Per-class image counts, exported as a csv file
df = pd.DataFrame(rows, columns=['class', 'trainset', 'testset'])
df['total'] = df['trainset'] + df['testset']
df.to_csv('数据量统计.csv', index=False)
View Code
# 随机数的使用
random.seed(123) # 随机数种子,便于复现
random.shuffle(images_filename) # 随机打乱

random.seed(int): 设置随机数种子,int值固定,每次执行的时候,结果是一样的,保证相同结果可以重现

 

import shutil
shutil.move(old_img_path, new_test_path)
shutil.rmtree(old_dir)  # 删除文件夹
shutil.move(dataset_path, dataset_name + '_split')

shutil.move(old_path, new_path)  : 路径移动

shutil.rmtree(path)  :删除文件夹

shutil.move(old_name, new_name)  : 重命名

 

# 断言的使用,如果为False,则程序不继续执行
assert len(os.listdir(old_dir)) == 0 # 确保旧文件夹中的所有图像都被移动走

assert的使用,可以保证判别式成立,程序才继续执行。

 

四、可视化文件夹中的图像

4.1、可视化文件夹的图像

4.1.1、导包:

import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import ImageGrid
import numpy as np
import math
import os
import cv2
View Code

4.1.2、读取文件夹中的所有图像:

folder_path = 'fruit81_split/train/西瓜'
# Number of images to visualize
N = 36
# The grid has n rows and n columns
n = math.floor(np.sqrt(N))
print(n)

images = []

for each_img in os.listdir(folder_path):
    if len(images) >= N:
        break
    img_path = os.path.join(folder_path, each_img)
    try:
        # np.fromfile + cv2.imdecode copes with Chinese characters in the path.
        # IMREAD_COLOR always yields a 3-channel BGR array; with flag -1
        # (unchanged) a grayscale file gives a 2-D array and the BGR->RGB
        # conversion below fails.
        img_bgr = cv2.imdecode(np.fromfile(img_path, dtype=np.uint8), cv2.IMREAD_COLOR)
    except (OSError, cv2.error):
        img_bgr = None
    if img_bgr is None:
        # Not a decodable image: report it and carry on with the next file
        print(img_path, '读取错误')
        continue
    img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
    images.append(img_rgb)

fig = plt.figure(figsize=(50, 50))
grid = ImageGrid(fig, 111,  # placed like subplot(111)
                 nrows_ncols=(n, n),  # grid of n rows by n columns of axes
                 axes_pad=0.02,  # padding between the axes
                 share_all=True
                 )
View Code
ImageGrid画了一个空的画布

 

 

# Put each image into its own grid cell and hide that cell's axes
for cell, picture in zip(grid, images):
    cell.imshow(picture)
    cell.axis('off')

plt.tight_layout()
plt.show()
View Code
ax.imshow(im) 往每个网格中填充图片
           

 

   

 

 

 

ax.axis('off') 关闭坐标轴(隐藏刻度、刻度标签和边框),最终得到图片如下:

  

images = []
for each_img in os.listdir(folder_path):
    if len(images) >= N:
        break
    img_path = os.path.join(folder_path, each_img)
    try:
        # np.fromfile + cv2.imdecode copes with Chinese characters in the path.
        # IMREAD_COLOR always yields a 3-channel BGR array; with flag -1
        # (unchanged) a grayscale file gives a 2-D array and the BGR->RGB
        # conversion below fails.
        img_bgr = cv2.imdecode(np.fromfile(img_path, dtype=np.uint8), cv2.IMREAD_COLOR)
    except (OSError, cv2.error):
        img_bgr = None
    if img_bgr is None:
        # Not a decodable image: report it and carry on with the next file
        print(img_path, '读取错误')
        continue
    img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
    images.append(img_rgb)

cv2.imdecode: 将 np.fromfile 读入的字节数据解码成图像数组(通道顺序为 BGR),再由 cv2.cvtColor 转换成 RGB 的格式:

[[[251 255 252],  [251 255 252],  [250 255 252],  ...,  [251 255 255],  [251 255 255],  [251 255 255]],, [[253 255 254],  [253 255 254],  [251 255 254],  ...,  [251 255 255],  [251 255 255],  [251 255 255]],, [[255 254 255],  [255 254 255],  [255 254 255],  ...,  [251 255 255],  [251 255 255],  [251 255 255]],, ...,, [[255 252 255],  [250 250 255],  [242 254 254],  ...,  [234 255 240],  [242 255 246],  [250 255 252]],, [[255 253 255],  [251 252 255],  [241 254 252],  ...,  [236 255 241],  [242 255 246],  [250 255 252]],, [[255 253 255],  [251 252 255],  [244 255 253],  ...,  [236 255 241],  [242 255 246],  [250 255 252]]]

shape[0]: 表示图像高

shape[1]: 表示图像宽

 

4.2、统计各类别图像数量

4.2.1、导包:

import pandas as pd
import matplotlib.pyplot as plt
View Code

4.2.2、设置中文字体

import matplotlib

# Windows: use SimHei so that Chinese labels render, and keep the minus sign
# displayed correctly
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Linux: select the Chinese font through matplotlib.rc instead
# (alternative: plt.rcParams['font.sans-serif'] = ['SimHei'])
matplotlib.rc("font", family='SimHei')
plt.rcParams['axes.unicode_minus'] = False
View Code

4.2.3、图像数量柱状图可视化

# Load the per-class counts and take a first look at them
df = pd.read_csv('数据量统计.csv')
print(df.shape)
print(df.head())

# Column to visualize; classes are ordered by it, largest first
feature = 'total'
df = df.sort_values(feature, ascending=False)
print(df.head())

plt.figure(figsize=(22, 7))

x = df['class']
y = df[feature]

plt.bar(x, y, edgecolor='k', facecolor='#1f77b4')

plt.xticks(rotation=90)
plt.tick_params(labelsize=15)
plt.ylabel('图像数量', fontsize=20)
plt.xlabel('类别', fontsize=20)

# Optional export:
# plt.savefig('各类别图片数量.pdf', dpi=120, bbox_inches='tight')

plt.show()

plt.figure(figsize=(22, 7))

# One bar per class: test-set count at the bottom, training-set count on top
x = df['class']
y1 = df['testset']
y2 = df['trainset']

width = 0.55  # bar width

plt.xticks(rotation=90)  # rotate the class names on the horizontal axis

plt.bar(x, y1, width=width, label='测试集')
plt.bar(x, y2, width=width, bottom=y1, label='训练集')

plt.ylabel('图像数量', fontsize=20)
plt.xlabel('类别', fontsize=20)
plt.tick_params(labelsize=13)  # tick label size

plt.legend(fontsize=16)

# Export as a pdf file
plt.savefig('各类别图像数量.pdf', dpi=120, bbox_inches='tight')

plt.show()
View Code
plt.bar(x, y1, width, label='测试集')
plt.bar(x, y2, width, label='训练集', bottom=y1)

bar中bottom的使用: bottom 指定柱子底边的 y 坐标,默认为0,表示柱子从 x 轴开始画;

bottom=y1 表示第二组柱子的底边位于高度 y1 处,即叠在第一组柱子之上,此时可以画出两组 bar 堆叠的柱状图

 

 

如果不设置 bottom,则第二个bar会把第一个bar覆盖,变成:

 

posted @ 2023-01-17 17:13  wangpengcufe  阅读(449)  评论(0)    收藏  举报