# tjy/BloodPressure/dataloader.py
import os

import h5py
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset


def custom_collate_fn(batch):
    """Stack per-sample arrays and scalar labels into batched float tensors."""
    X, y_SBP, y_DBP = zip(*batch)
    X = torch.tensor(np.array(X), dtype=torch.float32)
    y_SBP = torch.tensor(y_SBP, dtype=torch.float32)
    y_DBP = torch.tensor(y_DBP, dtype=torch.float32)
    return X, y_SBP, y_DBP
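

# Example of what custom_collate_fn returns for a batch of two samples
# (a sketch with illustrative shapes; the real segment length depends on
# which loader produced the data):
# >>> batch = [(np.zeros((250, 1)), 120.0, 80.0), (np.ones((250, 1)), 118.0, 76.0)]
# >>> X, sbp, dbp = custom_collate_fn(batch)
# >>> X.shape, sbp.shape, dbp.shape
# (torch.Size([2, 250, 1]), torch.Size([2]), torch.Size([2]))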


class BPDataset(Dataset):
    """Wraps pre-windowed signal samples and their SBP/DBP targets."""

    def __init__(self, X_data, y_SBP, y_DBP):
        self.X_data = X_data
        self.y_SBP = y_SBP
        self.y_DBP = y_DBP

    def __len__(self):
        return len(self.y_SBP)

    def __getitem__(self, idx):
        # X_sample = self.X_data[idx * 250:(idx + 1) * 250]
        X_sample = self.X_data[idx]
        y_SBP_sample = self.y_SBP[idx]
        y_DBP_sample = self.y_DBP[idx]
        return X_sample, y_SBP_sample, y_DBP_sample


class BPDataLoader:
    def __init__(self, data_dir, val_split=0.2, batch_size=32, shuffle=True, data_type='npy'):
        self.data_dir = data_dir
        self.val_split = val_split
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.train_dataloader = None
        self.val_dataloader = None
        self.data_type = data_type

    def load_data(self):
        X_BP_path = os.path.join(self.data_dir, 'X_BP.npy')
        y_DBP_path = os.path.join(self.data_dir, 'Y_DBP.npy')
        y_SBP_path = os.path.join(self.data_dir, 'Y_SBP.npy')
        X_BP = np.load(X_BP_path)
        # Reshape the data to (num_samples, 250, 1).
        X_BP = X_BP.reshape(-1, 250, 1)
        y_DBP = np.load(y_DBP_path)
        y_SBP = np.load(y_SBP_path)
        return X_BP, y_DBP, y_SBP

    def load_data_UKL_h5(self):
        X_BP_path = os.path.join(self.data_dir, 'rPPG-BP-UKL_rppg_7s.h5')
        with h5py.File(X_BP_path, 'r') as f:
            rppg = np.array(f.get('rppg'))
            BP = np.array(f.get('label'))
        # Reshape the signals from (875, 7851) to (7851, 875, 1).
        rppg = rppg.transpose(1, 0)
        rppg = rppg.reshape(-1, 875, 1)
        X_BP = rppg
        y_DBP = BP[1]
        y_SBP = BP[0]
        return X_BP, y_DBP, y_SBP
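
    # Shape sketch for load_data_UKL_h5 (sizes taken from the comment above;
    # the code assumes `label` row 0 is SBP and row 1 is DBP):
    #   rppg: (875, 7851) -> transpose -> (7851, 875) -> reshape -> (7851, 875, 1)
    #   BP:   (2, 7851)   -> y_SBP = BP[0], y_DBP = BP[1], each of shape (7851,)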

    def load_data_MIMIC_h5(self):
        X_BP_path = os.path.join(self.data_dir, 'MIMIC-III_ppg_dataset.h5')
        # Reuse preprocessed arrays if they already exist in data_dir.
        files = os.listdir(self.data_dir)
        if 'X_MIMIC_BP.npy' in files and 'Y_MIMIC_DBP.npy' in files and 'Y_MIMIC_SBP.npy' in files:
            print('loading preprocessed data.....')
            X_BP = np.load(os.path.join(self.data_dir, 'X_MIMIC_BP.npy'))
            y_DBP = np.load(os.path.join(self.data_dir, 'Y_MIMIC_DBP.npy'))
            y_SBP = np.load(os.path.join(self.data_dir, 'Y_MIMIC_SBP.npy'))
            return X_BP, y_DBP, y_SBP
        with h5py.File(X_BP_path, 'r') as f:
            ppg = np.array(f.get('ppg'))
            BP = np.array(f.get('label'))
        # Round the SBP range outwards to multiples of 10, then split it into
        # 10-mmHg bins.
        max_sbp = np.max(BP[:, 0])
        min_sbp = np.min(BP[:, 0])
        max_sbp = 10 - max_sbp % 10 + max_sbp
        min_sbp = min_sbp - min_sbp % 10
        # Extend the upper edge so the top bin [max_sbp - 10, max_sbp) is kept;
        # stopping np.arange at max_sbp would silently drop those samples.
        bins = np.arange(min_sbp, max_sbp + 10, 10)
        print(bins)
        sampled_ppg_data = []
        sampled_bp_data = []
        for i in range(len(bins) - 1):
            # Select the rows whose SBP falls inside the current bin.
            mask = (BP[:, 0] >= bins[i]) & (BP[:, 0] < bins[i + 1])
            bin_data_sbp_dbp = BP[mask]
            bin_data_ppg = ppg[mask]
            if len(bin_data_sbp_dbp) > 0:
                # Randomly sample 10% of the rows in this bin.
                num_samples = int(len(bin_data_sbp_dbp) * 0.1)
                indices = np.random.choice(len(bin_data_sbp_dbp), num_samples, replace=False)
                sampled_bp_data.append(bin_data_sbp_dbp[indices])
                sampled_ppg_data.append(bin_data_ppg[indices])
        # Merge the per-bin samples back into single arrays.
        ppg = np.concatenate(sampled_ppg_data, axis=0)
        BP = np.concatenate(sampled_bp_data, axis=0)
        print(ppg.shape, BP.shape)
        # Reshape the signals to (num_samples, 875, 1).
        ppg = ppg.reshape(-1, 875, 1)
        X_BP = ppg
        # Column 0 of the labels is SBP, column 1 is DBP.
        y_DBP = BP[:, 1]
        y_SBP = BP[:, 0]
        # Cache the sampled arrays in data_dir so the branch above can find them.
        np.save(os.path.join(self.data_dir, 'X_MIMIC_BP.npy'), X_BP)
        np.save(os.path.join(self.data_dir, 'Y_MIMIC_DBP.npy'), y_DBP)
        np.save(os.path.join(self.data_dir, 'Y_MIMIC_SBP.npy'), y_SBP)
        return X_BP, y_DBP, y_SBP
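
    # A worked toy example of the bin-edge arithmetic above (illustrative
    # values only; the real range depends on the dataset):
    #   max_sbp = 187.0 -> 10 - 187.0 % 10 + 187.0 = 190.0 (next multiple of 10)
    #   min_sbp = 83.0  -> 83.0 - 83.0 % 10 = 80.0 (previous multiple of 10)
    #   bins = [80, 90, ..., 190] -> intervals [80, 90), ..., [180, 190)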

    def load_data_MIMIC_h5_full(self):
        X_BP_path = os.path.join(self.data_dir, 'MIMIC-III_ppg_dataset.h5')
        # Reuse preprocessed arrays if they already exist in data_dir.
        files = os.listdir(self.data_dir)
        if ('X_MIMIC_BP_full.npy' in files and 'Y_MIMIC_DBP_full.npy' in files
                and 'Y_MIMIC_SBP_full.npy' in files):
            print('loading preprocessed data.....')
            X_BP = np.load(os.path.join(self.data_dir, 'X_MIMIC_BP_full.npy'))
            y_DBP = np.load(os.path.join(self.data_dir, 'Y_MIMIC_DBP_full.npy'))
            y_SBP = np.load(os.path.join(self.data_dir, 'Y_MIMIC_SBP_full.npy'))
            return X_BP, y_DBP, y_SBP
        with h5py.File(X_BP_path, 'r') as f:
            ppg = np.array(f.get('ppg'))
            BP = np.array(f.get('label'))
        # Reshape the signals from (9054000, 875) to (9054000, 875, 1).
        ppg = ppg.reshape(-1, 875, 1)
        X_BP = ppg
        # Column 0 of the labels is SBP, column 1 is DBP.
        y_DBP = BP[:, 1]
        y_SBP = BP[:, 0]
        print("data shape:", X_BP.shape, y_DBP.shape, y_SBP.shape)
        print("saving data.....")
        # Cache the full arrays in data_dir so the branch above can find them.
        np.save(os.path.join(self.data_dir, 'X_MIMIC_BP_full.npy'), X_BP)
        np.save(os.path.join(self.data_dir, 'Y_MIMIC_DBP_full.npy'), y_DBP)
        np.save(os.path.join(self.data_dir, 'Y_MIMIC_SBP_full.npy'), y_SBP)
        print("data saved.....")
        return X_BP, y_DBP, y_SBP

    def create_dataset(self, X_data, y_SBP, y_DBP):
        return BPDataset(X_data, y_SBP, y_DBP)

    def split_data(self, X_data, y_SBP, y_DBP):
        X_train, X_val, y_train_SBP, y_val_SBP, y_train_DBP, y_val_DBP = train_test_split(
            X_data, y_SBP, y_DBP, test_size=self.val_split, random_state=42
        )
        # print(X_train.shape, X_val.shape, y_train_SBP.shape, y_val_SBP.shape, y_train_DBP.shape, y_val_DBP.shape)
        train_dataset = self.create_dataset(X_train, y_train_SBP, y_train_DBP)
        val_dataset = self.create_dataset(X_val, y_val_SBP, y_val_DBP)
        return train_dataset, val_dataset

    def _load_by_type(self):
        # Dispatch to the loader matching self.data_type.
        if self.data_type == 'UKL':
            return self.load_data_UKL_h5()
        elif self.data_type == 'MIMIC':
            return self.load_data_MIMIC_h5()
        elif self.data_type == 'MIMIC_full':
            return self.load_data_MIMIC_h5_full()
        return self.load_data()

    def create_dataloaders(self):
        X_data, y_DBP, y_SBP = self._load_by_type()
        train_dataset, val_dataset = self.split_data(X_data, y_SBP, y_DBP)
        self.train_dataloader = DataLoader(
            train_dataset, batch_size=self.batch_size, shuffle=self.shuffle, collate_fn=custom_collate_fn
        )
        self.val_dataloader = DataLoader(
            val_dataset, batch_size=self.batch_size, shuffle=False, collate_fn=custom_collate_fn
        )

    def get_dataloaders(self):
        if self.train_dataloader is None or self.val_dataloader is None:
            self.create_dataloaders()
        return self.train_dataloader, self.val_dataloader

    def get_distributed_dataloaders(self, world_size, rank):
        X_data, y_DBP, y_SBP = self._load_by_type()
        train_dataset, val_dataset = self.split_data(X_data, y_SBP, y_DBP)
        # Each rank sees a disjoint shard of the data via DistributedSampler.
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset, num_replicas=world_size, rank=rank, shuffle=True
        )
        val_sampler = torch.utils.data.distributed.DistributedSampler(
            val_dataset, num_replicas=world_size, rank=rank, shuffle=False
        )
        train_dataloader = DataLoader(
            train_dataset,
            batch_size=self.batch_size,
            sampler=train_sampler,
            collate_fn=custom_collate_fn,
        )
        val_dataloader = DataLoader(
            val_dataset,
            batch_size=self.batch_size,
            sampler=val_sampler,
            collate_fn=custom_collate_fn,
        )
        return train_dataloader, val_dataloader, train_sampler, val_sampler


# Usage example:
#
# data_loader = BPDataLoader(data_dir='data', val_split=0.2, batch_size=32, data_type='MIMIC')
# train_dataloader, val_dataloader = data_loader.get_dataloaders()
#
# for i, (X, y_SBP, y_DBP) in enumerate(train_dataloader):
#     print(f"Batch {i+1}: X.shape={X.shape}, y_SBP.shape={y_SBP.shape}, y_DBP.shape={y_DBP.shape}")
#     if i == 2:
#         break
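
# Distributed usage sketch (not in the original file; it assumes the process
# group is initialized externally, e.g. by launching with `torchrun`, and that
# a CUDA backend is available):
#
# import torch.distributed as dist
#
# dist.init_process_group(backend='nccl')
# rank = dist.get_rank()
# world_size = dist.get_world_size()
# data_loader = BPDataLoader(data_dir='data', batch_size=32, data_type='MIMIC')
# train_dl, val_dl, train_sampler, val_sampler = data_loader.get_distributed_dataloaders(
#     world_size, rank
# )
# for epoch in range(10):
#     train_sampler.set_epoch(epoch)  # reshuffle the shards each epoch
#     for X, y_SBP, y_DBP in train_dl:
#         ...  # move tensors to this rank's device and run the training step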