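"""Data loading utilities for PPG/rPPG-based blood pressure estimation.

Defines a collate function, a BPDataset wrapper, and a BPDataLoader class
that reads .npy arrays or the rPPG-BP-UKL / MIMIC-III HDF5 datasets and
builds (optionally distributed) training/validation DataLoaders.
"""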
import os

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import h5py


def custom_collate_fn(batch):
    """Stack the samples of a batch into tensors, one per BP target."""
    X, y_SBP, y_DBP = zip(*batch)

    X = torch.tensor(np.array(X), dtype=torch.float32)
    y_SBP = torch.tensor(y_SBP, dtype=torch.float32)
    y_DBP = torch.tensor(y_DBP, dtype=torch.float32)

    return X, y_SBP, y_DBP
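
# Batch shapes produced by the loaders below (following the reshapes in
# load_data and the HDF5 loaders):
#   X     -> (batch_size, 250, 1) for the .npy data, or (batch_size, 875, 1)
#            for the rPPG-BP-UKL / MIMIC-III HDF5 datasets
#   y_SBP -> (batch_size,)
#   y_DBP -> (batch_size,)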


class BPDataset(Dataset):
    """Dataset of signal windows paired with SBP/DBP targets."""

    def __init__(self, X_data, y_SBP, y_DBP):
        self.X_data = X_data
        self.y_SBP = y_SBP
        self.y_DBP = y_DBP

    def __len__(self):
        return len(self.y_SBP)

    def __getitem__(self, idx):
        # X_sample = self.X_data[idx * 250:(idx + 1) * 250]
        X_sample = self.X_data[idx]
        y_SBP_sample = self.y_SBP[idx]
        y_DBP_sample = self.y_DBP[idx]

        return X_sample, y_SBP_sample, y_DBP_sample


class BPDataLoader:
    """Builds training/validation DataLoaders from .npy or HDF5 BP datasets."""

    def __init__(self, data_dir, val_split=0.2, batch_size=32, shuffle=True, data_type='npy'):
        self.data_dir = data_dir
        self.val_split = val_split
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.train_dataloader = None
        self.val_dataloader = None
        self.data_type = data_type

    def load_data(self):
        X_BP_path = os.path.join(self.data_dir, 'X_BP.npy')
        y_DBP_path = os.path.join(self.data_dir, 'Y_DBP.npy')
        y_SBP_path = os.path.join(self.data_dir, 'Y_SBP.npy')

        X_BP = np.load(X_BP_path)
        # Reshape the data to (num_samples, 250, 1)
        X_BP = X_BP.reshape(-1, 250, 1)

        y_DBP = np.load(y_DBP_path)
        y_SBP = np.load(y_SBP_path)

        return X_BP, y_DBP, y_SBP

    def load_data_UKL_h5(self):
        X_BP_path = os.path.join(self.data_dir, 'rPPG-BP-UKL_rppg_7s.h5')
        with h5py.File(X_BP_path, 'r') as f:
            rppg = f.get('rppg')
            BP = f.get('label')
            rppg = np.array(rppg)
            BP = np.array(BP)

        # Reshape the data from (875, 7851) to (7851, 875, 1)
        rppg = rppg.transpose(1, 0)
        rppg = rppg.reshape(-1, 875, 1)

        X_BP = rppg
        # Label row 0 holds SBP, row 1 holds DBP
        y_DBP = BP[1]
        y_SBP = BP[0]

        return X_BP, y_DBP, y_SBP

    def load_data_MIMIC_h5(self):
        X_BP_path = os.path.join(self.data_dir, 'MIMIC-III_ppg_dataset.h5')

        # List the files under data_dir
        files = os.listdir(self.data_dir)

        # Check whether preprocessed data already exists
        if 'X_MIMIC_BP.npy' in files and 'Y_MIMIC_DBP.npy' in files and 'Y_MIMIC_SBP.npy' in files:
            print('loading preprocessed data.....')

            X_BP = np.load(os.path.join(self.data_dir, 'X_MIMIC_BP.npy'))
            y_DBP = np.load(os.path.join(self.data_dir, 'Y_MIMIC_DBP.npy'))
            y_SBP = np.load(os.path.join(self.data_dir, 'Y_MIMIC_SBP.npy'))

            return X_BP, y_DBP, y_SBP

        with h5py.File(X_BP_path, 'r') as f:
            ppg = f.get('ppg')
            BP = f.get('label')
            ppg = np.array(ppg)
            BP = np.array(BP)

        # Find the minimum and maximum SBP values in BP
        max_sbp = np.max(BP[:, 0])
        min_sbp = np.min(BP[:, 0])

        # Round the max up and the min down to the nearest multiple of 10
        max_sbp = 10 - max_sbp % 10 + max_sbp
        min_sbp = min_sbp - min_sbp % 10

        # Build 10-mmHg bin edges (include the upper edge so the top bin is kept)
        bins = np.arange(min_sbp, max_sbp + 10, 10)

        print(bins)

        sampled_ppg_data = []
        sampled_bp_data = []

        for i in range(len(bins) - 1):
            # Select the samples whose SBP falls in the current bin
            bin_data_sbp_dbp = BP[(BP[:, 0] >= bins[i]) & (BP[:, 0] < bins[i + 1])]
            bin_data_ppg = ppg[(BP[:, 0] >= bins[i]) & (BP[:, 0] < bins[i + 1])]

            # If the current bin contains data
            if len(bin_data_sbp_dbp) > 0:
                # Randomly sample 10% of the data in this bin
                # (no RNG seed is set, so the subsample differs between runs)
                num_samples = int(len(bin_data_sbp_dbp) * 0.1)
                indices = np.random.choice(len(bin_data_sbp_dbp), num_samples, replace=False)
                sampled_bin_data_sbp_dbp = bin_data_sbp_dbp[indices]
                sampled_bin_data_ppg = bin_data_ppg[indices]

                # Append the sampled data to the output lists
                sampled_bp_data.append(sampled_bin_data_sbp_dbp)
                sampled_ppg_data.append(sampled_bin_data_ppg)

        # Concatenate the per-bin lists into NumPy arrays
        ppg = np.concatenate(sampled_ppg_data, axis=0)
        BP = np.concatenate(sampled_bp_data, axis=0)

        print(ppg.shape, BP.shape)

        # Reshape the sampled data from (num_samples, 875) to (num_samples, 875, 1)
        ppg = ppg.reshape(-1, 875, 1)

        X_BP = ppg

        # Column 1 holds DBP, column 0 holds SBP
        y_DBP = BP[:, 1]
        y_SBP = BP[:, 0]

        # Cache the arrays in data_dir so the existence check above finds them
        np.save(os.path.join(self.data_dir, 'X_MIMIC_BP.npy'), X_BP)
        np.save(os.path.join(self.data_dir, 'Y_MIMIC_DBP.npy'), y_DBP)
        np.save(os.path.join(self.data_dir, 'Y_MIMIC_SBP.npy'), y_SBP)

        return X_BP, y_DBP, y_SBP

    def load_data_MIMIC_h5_full(self):
        X_BP_path = os.path.join(self.data_dir, 'MIMIC-III_ppg_dataset.h5')

        # List the files under data_dir
        files = os.listdir(self.data_dir)

        # Check whether preprocessed data already exists
        if 'X_MIMIC_BP_full.npy' in files and 'Y_MIMIC_DBP_full.npy' in files and 'Y_MIMIC_SBP_full.npy' in files:
            print('loading preprocessed data.....')

            X_BP = np.load(os.path.join(self.data_dir, 'X_MIMIC_BP_full.npy'))
            y_DBP = np.load(os.path.join(self.data_dir, 'Y_MIMIC_DBP_full.npy'))
            y_SBP = np.load(os.path.join(self.data_dir, 'Y_MIMIC_SBP_full.npy'))

            return X_BP, y_DBP, y_SBP

        with h5py.File(X_BP_path, 'r') as f:
            ppg = f.get('ppg')
            BP = f.get('label')
            ppg = np.array(ppg)
            BP = np.array(BP)

        # Reshape the data from (9054000, 875) to (9054000, 875, 1)
        ppg = ppg.reshape(-1, 875, 1)

        X_BP = ppg

        # Column 1 holds DBP, column 0 holds SBP
        y_DBP = BP[:, 1]
        y_SBP = BP[:, 0]

        print("data shape:", X_BP.shape, y_DBP.shape, y_SBP.shape)

        print("saving data.....")

        # Cache the arrays in data_dir so the existence check above finds them
        np.save(os.path.join(self.data_dir, 'X_MIMIC_BP_full.npy'), X_BP)
        np.save(os.path.join(self.data_dir, 'Y_MIMIC_DBP_full.npy'), y_DBP)
        np.save(os.path.join(self.data_dir, 'Y_MIMIC_SBP_full.npy'), y_SBP)

        print("data saved.....")

        return X_BP, y_DBP, y_SBP

    def create_dataset(self, X_data, y_SBP, y_DBP):
        return BPDataset(X_data, y_SBP, y_DBP)

    def split_data(self, X_data, y_SBP, y_DBP):
        X_train, X_val, y_train_SBP, y_val_SBP, y_train_DBP, y_val_DBP = train_test_split(
            X_data, y_SBP, y_DBP, test_size=self.val_split, random_state=42
        )

        # print(X_train.shape, X_val.shape, y_train_SBP.shape, y_val_SBP.shape, y_train_DBP.shape, y_val_DBP.shape)

        train_dataset = self.create_dataset(X_train, y_train_SBP, y_train_DBP)
        val_dataset = self.create_dataset(X_val, y_val_SBP, y_val_DBP)

        return train_dataset, val_dataset

    def create_dataloaders(self):
        if self.data_type == 'UKL':
            X_data, y_DBP, y_SBP = self.load_data_UKL_h5()
        elif self.data_type == 'MIMIC':
            X_data, y_DBP, y_SBP = self.load_data_MIMIC_h5()
        elif self.data_type == 'MIMIC_full':
            X_data, y_DBP, y_SBP = self.load_data_MIMIC_h5_full()
        else:
            X_data, y_DBP, y_SBP = self.load_data()
        train_dataset, val_dataset = self.split_data(X_data, y_SBP, y_DBP)

        self.train_dataloader = DataLoader(
            train_dataset, batch_size=self.batch_size, shuffle=self.shuffle, collate_fn=custom_collate_fn
        )
        self.val_dataloader = DataLoader(
            val_dataset, batch_size=self.batch_size, shuffle=False, collate_fn=custom_collate_fn
        )

    def get_dataloaders(self):
        if self.train_dataloader is None or self.val_dataloader is None:
            self.create_dataloaders()

        return self.train_dataloader, self.val_dataloader

    def get_distributed_dataloaders(self, world_size, rank):
        if self.data_type == 'UKL':
            X_data, y_DBP, y_SBP = self.load_data_UKL_h5()
        elif self.data_type == 'MIMIC':
            X_data, y_DBP, y_SBP = self.load_data_MIMIC_h5()
        elif self.data_type == 'MIMIC_full':
            X_data, y_DBP, y_SBP = self.load_data_MIMIC_h5_full()
        else:
            X_data, y_DBP, y_SBP = self.load_data()
        train_dataset, val_dataset = self.split_data(X_data, y_SBP, y_DBP)

        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset, num_replicas=world_size, rank=rank, shuffle=True
        )
        val_sampler = torch.utils.data.distributed.DistributedSampler(
            val_dataset, num_replicas=world_size, rank=rank, shuffle=False
        )

        train_dataloader = DataLoader(
            train_dataset,
            batch_size=self.batch_size,
            sampler=train_sampler,
            collate_fn=custom_collate_fn,
        )
        val_dataloader = DataLoader(
            val_dataset,
            batch_size=self.batch_size,
            sampler=val_sampler,
            collate_fn=custom_collate_fn,
        )

        # The samplers are returned so callers can call sampler.set_epoch(epoch)
        # each epoch, which DistributedSampler needs for proper reshuffling
        return train_dataloader, val_dataloader, train_sampler, val_sampler

# Usage example
#
# data_loader = BPDataLoader(data_dir='data', val_split=0.2, batch_size=32, data_type='MIMIC')
# train_dataloader, val_dataloader = data_loader.get_dataloaders()
#
# for i, (X, y_SBP, y_DBP) in enumerate(train_dataloader):
#     print(f"Batch {i+1}: X.shape={X.shape}, y_SBP.shape={y_SBP.shape}, y_DBP.shape={y_DBP.shape}")
#     if i == 2:
#         break
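
# Distributed usage sketch (an illustrative assumption, not from the original:
# it presumes torch.distributed is initialized by a launcher such as torchrun,
# and `num_epochs` is a placeholder):
#
# import torch.distributed as dist
#
# dist.init_process_group(backend='nccl')
# rank = dist.get_rank()
# world_size = dist.get_world_size()
#
# data_loader = BPDataLoader(data_dir='data', batch_size=32, data_type='MIMIC')
# train_dl, val_dl, train_sampler, val_sampler = \
#     data_loader.get_distributed_dataloaders(world_size, rank)
#
# for epoch in range(num_epochs):
#     train_sampler.set_epoch(epoch)  # required for per-epoch reshuffling
#     for X, y_SBP, y_DBP in train_dl:
#         ...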