import os

import h5py
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split


def custom_collate_fn(batch):
    # Stack per-sample arrays into batched float32 tensors.
    X, y_SBP, y_DBP = zip(*batch)
    X = torch.tensor(np.array(X), dtype=torch.float32)
    y_SBP = torch.tensor(y_SBP, dtype=torch.float32)
    y_DBP = torch.tensor(y_DBP, dtype=torch.float32)
    return X, y_SBP, y_DBP


class BPDataset(Dataset):
    def __init__(self, X_data, y_SBP, y_DBP):
        self.X_data = X_data
        self.y_SBP = y_SBP
        self.y_DBP = y_DBP

    def __len__(self):
        return len(self.y_SBP)

    def __getitem__(self, idx):
        return self.X_data[idx], self.y_SBP[idx], self.y_DBP[idx]


class BPDataLoader:
    def __init__(self, data_dir, val_split=0.2, batch_size=32, shuffle=True, data_type='npy'):
        self.data_dir = data_dir
        self.val_split = val_split
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.train_dataloader = None
        self.val_dataloader = None
        self.data_type = data_type

    def load_data(self):
        X_BP_path = os.path.join(self.data_dir, 'X_BP.npy')
        y_DBP_path = os.path.join(self.data_dir, 'Y_DBP.npy')
        y_SBP_path = os.path.join(self.data_dir, 'Y_SBP.npy')

        X_BP = np.load(X_BP_path)
        # Reshape the signal into (num_samples, 250, 1).
        X_BP = X_BP.reshape(-1, 250, 1)
        y_DBP = np.load(y_DBP_path)
        y_SBP = np.load(y_SBP_path)

        return X_BP, y_DBP, y_SBP

    def load_data_UKL_h5(self):
        X_BP_path = os.path.join(self.data_dir, 'rPPG-BP-UKL_rppg_7s.h5')
        with h5py.File(X_BP_path, 'r') as f:
            rppg = np.array(f.get('rppg'))
            BP = np.array(f.get('label'))

        # Reshape the signal from (875, 7851) to (7851, 875, 1).
        rppg = rppg.transpose(1, 0)
        rppg = rppg.reshape(-1, 875, 1)
        X_BP = rppg
        # Row 0 holds the SBP labels, row 1 the DBP labels.
        y_DBP = BP[1]
        y_SBP = BP[0]

        return X_BP, y_DBP, y_SBP

    def load_data_MIMIC_h5(self):
        X_BP_path = os.path.join(self.data_dir, 'MIMIC-III_ppg_dataset.h5')
        # Reuse preprocessed arrays if they already exist in data_dir.
        files = os.listdir(self.data_dir)
        if 'X_MIMIC_BP.npy' in files and 'Y_MIMIC_DBP.npy' in files and 'Y_MIMIC_SBP.npy' in files:
            print('loading preprocessed data.....')
            X_BP = np.load(os.path.join(self.data_dir, 'X_MIMIC_BP.npy'))
            y_DBP = np.load(os.path.join(self.data_dir, 'Y_MIMIC_DBP.npy'))
            y_SBP = np.load(os.path.join(self.data_dir, 'Y_MIMIC_SBP.npy'))
            return X_BP, y_DBP, y_SBP

        with h5py.File(X_BP_path, 'r') as f:
            ppg = np.array(f.get('ppg'))
            BP = np.array(f.get('label'))

        # Round the observed SBP range outward to multiples of 10 mmHg.
        max_sbp = np.max(BP[:, 0])
        min_sbp = np.min(BP[:, 0])
        max_sbp = max_sbp + (10 - max_sbp % 10)
        min_sbp = min_sbp - min_sbp % 10
        # Build 10 mmHg bin edges; the extra 10 ensures the top bin's upper
        # edge is included, so samples near the maximum SBP are not dropped.
        bins = np.arange(min_sbp, max_sbp + 10, 10)
        print(bins)

        sampled_ppg_data = []
        sampled_bp_data = []
        for i in range(len(bins) - 1):
            # Select the rows whose SBP falls into the current bin.
            in_bin = (BP[:, 0] >= bins[i]) & (BP[:, 0] < bins[i + 1])
            bin_data_sbp_dbp = BP[in_bin]
            bin_data_ppg = ppg[in_bin]
            if len(bin_data_sbp_dbp) > 0:
                # Randomly keep 10% of the rows in this bin.
                num_samples = int(len(bin_data_sbp_dbp) * 0.1)
                indices = np.random.choice(len(bin_data_sbp_dbp), num_samples, replace=False)
                sampled_bp_data.append(bin_data_sbp_dbp[indices])
                sampled_ppg_data.append(bin_data_ppg[indices])

        # Merge the per-bin samples back into single arrays.
        ppg = np.concatenate(sampled_ppg_data, axis=0)
        BP = np.concatenate(sampled_bp_data, axis=0)
        print(ppg.shape, BP.shape)

        # Reshape the signal from (num_samples, 875) to (num_samples, 875, 1).
        ppg = ppg.reshape(-1, 875, 1)
        X_BP = ppg
        # Column 0 holds SBP, column 1 holds DBP.
        y_DBP = BP[:, 1]
        y_SBP = BP[:, 0]

        # Cache the subsampled arrays alongside the raw dataset so the next
        # run takes the preprocessed-data fast path above.
        np.save(os.path.join(self.data_dir, 'X_MIMIC_BP.npy'), X_BP)
        np.save(os.path.join(self.data_dir, 'Y_MIMIC_DBP.npy'), y_DBP)
        np.save(os.path.join(self.data_dir, 'Y_MIMIC_SBP.npy'), y_SBP)

        return X_BP, y_DBP, y_SBP
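    # Illustrative sketch (not used by the loader): the per-bin subsampling
    # from load_data_MIMIC_h5, isolated on a plain label array so the balancing
    # step can be sanity-checked on its own. The 10 mmHg bin width and 10%
    # keep-rate mirror the values hardcoded above; `_demo_binned_subsample`
    # itself is a hypothetical helper introduced here for illustration only.
    #
    # def _demo_binned_subsample(bp, keep=0.1, width=10, seed=0):
    #     rng = np.random.default_rng(seed)
    #     lo = bp[:, 0].min() - bp[:, 0].min() % width
    #     hi = bp[:, 0].max() + (width - bp[:, 0].max() % width)
    #     edges = np.arange(lo, hi + width, width)
    #     kept = []
    #     for a, b in zip(edges[:-1], edges[1:]):
    #         idx = np.flatnonzero((bp[:, 0] >= a) & (bp[:, 0] < b))
    #         if len(idx):
    #             kept.append(rng.choice(idx, int(len(idx) * keep), replace=False))
    #     return np.concatenate(kept)  # row indices of the retained samples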
    def load_data_MIMIC_h5_full(self):
        X_BP_path = os.path.join(self.data_dir, 'MIMIC-III_ppg_dataset.h5')
        # Reuse preprocessed arrays if they already exist in data_dir.
        files = os.listdir(self.data_dir)
        if 'X_MIMIC_BP_full.npy' in files and 'Y_MIMIC_DBP_full.npy' in files and 'Y_MIMIC_SBP_full.npy' in files:
            print('loading preprocessed data.....')
            X_BP = np.load(os.path.join(self.data_dir, 'X_MIMIC_BP_full.npy'))
            y_DBP = np.load(os.path.join(self.data_dir, 'Y_MIMIC_DBP_full.npy'))
            y_SBP = np.load(os.path.join(self.data_dir, 'Y_MIMIC_SBP_full.npy'))
            return X_BP, y_DBP, y_SBP

        with h5py.File(X_BP_path, 'r') as f:
            ppg = np.array(f.get('ppg'))
            BP = np.array(f.get('label'))

        # Reshape the signal from (9054000, 875) to (9054000, 875, 1).
        ppg = ppg.reshape(-1, 875, 1)
        X_BP = ppg
        # Column 0 holds SBP, column 1 holds DBP.
        y_DBP = BP[:, 1]
        y_SBP = BP[:, 0]

        print("data shape:", X_BP.shape, y_DBP.shape, y_SBP.shape)
        print("saving data.....")
        # Cache the arrays alongside the raw dataset.
        np.save(os.path.join(self.data_dir, 'X_MIMIC_BP_full.npy'), X_BP)
        np.save(os.path.join(self.data_dir, 'Y_MIMIC_DBP_full.npy'), y_DBP)
        np.save(os.path.join(self.data_dir, 'Y_MIMIC_SBP_full.npy'), y_SBP)
        print("data saved.....")

        return X_BP, y_DBP, y_SBP

    def create_dataset(self, X_data, y_SBP, y_DBP):
        return BPDataset(X_data, y_SBP, y_DBP)

    def split_data(self, X_data, y_SBP, y_DBP):
        X_train, X_val, y_train_SBP, y_val_SBP, y_train_DBP, y_val_DBP = train_test_split(
            X_data, y_SBP, y_DBP, test_size=self.val_split, random_state=42
        )
        train_dataset = self.create_dataset(X_train, y_train_SBP, y_train_DBP)
        val_dataset = self.create_dataset(X_val, y_val_SBP, y_val_DBP)
        return train_dataset, val_dataset

    def create_dataloaders(self):
        if self.data_type == 'UKL':
            X_data, y_DBP, y_SBP = self.load_data_UKL_h5()
        elif self.data_type == 'MIMIC':
            X_data, y_DBP, y_SBP = self.load_data_MIMIC_h5()
        elif self.data_type == 'MIMIC_full':
            X_data, y_DBP, y_SBP = self.load_data_MIMIC_h5_full()
        else:
            X_data, y_DBP, y_SBP = self.load_data()

        train_dataset, val_dataset = self.split_data(X_data, y_SBP, y_DBP)

        self.train_dataloader = DataLoader(
            train_dataset, batch_size=self.batch_size, shuffle=self.shuffle,
            collate_fn=custom_collate_fn
        )
        self.val_dataloader = DataLoader(
            val_dataset, batch_size=self.batch_size, shuffle=False,
            collate_fn=custom_collate_fn
        )

    def get_dataloaders(self):
        if self.train_dataloader is None or self.val_dataloader is None:
            self.create_dataloaders()
        return self.train_dataloader, self.val_dataloader

    def get_distributed_dataloaders(self, world_size, rank):
        if self.data_type == 'UKL':
            X_data, y_DBP, y_SBP = self.load_data_UKL_h5()
        elif self.data_type == 'MIMIC':
            X_data, y_DBP, y_SBP = self.load_data_MIMIC_h5()
        elif self.data_type == 'MIMIC_full':
            X_data, y_DBP, y_SBP = self.load_data_MIMIC_h5_full()
        else:
            X_data, y_DBP, y_SBP = self.load_data()

        train_dataset, val_dataset = self.split_data(X_data, y_SBP, y_DBP)

        # Each rank sees a disjoint shard; only the training shards are shuffled.
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset, num_replicas=world_size, rank=rank, shuffle=True
        )
        val_sampler = torch.utils.data.distributed.DistributedSampler(
            val_dataset, num_replicas=world_size, rank=rank, shuffle=False
        )

        train_dataloader = DataLoader(
            train_dataset, batch_size=self.batch_size, sampler=train_sampler,
            collate_fn=custom_collate_fn,
        )
        val_dataloader = DataLoader(
            val_dataset, batch_size=self.batch_size, sampler=val_sampler,
            collate_fn=custom_collate_fn,
        )

        return train_dataloader, val_dataloader, train_sampler, val_sampler
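# Hedged sketch of wiring get_distributed_dataloaders into a DDP launch. The
# env:// initialization and the per-epoch set_epoch() call follow standard
# torch.distributed usage; the surrounding training loop and `num_epochs` are
# assumed, not part of this module.
#
# import torch.distributed as dist
# dist.init_process_group(backend='nccl', init_method='env://')
# rank, world_size = dist.get_rank(), dist.get_world_size()
# loader = BPDataLoader(data_dir='data', batch_size=32, data_type='MIMIC')
# train_dl, val_dl, train_sampler, val_sampler = loader.get_distributed_dataloaders(world_size, rank)
# for epoch in range(num_epochs):
#     train_sampler.set_epoch(epoch)  # reshuffle the shards each epoch
#     for X, y_SBP, y_DBP in train_dl:
#         ...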
# Usage example:
# data_loader = BPDataLoader(data_dir='data', val_split=0.2, batch_size=32, data_type='MIMIC')
# train_dataloader, val_dataloader = data_loader.get_dataloaders()
#
# for i, (X, y_SBP, y_DBP) in enumerate(train_dataloader):
#     print(f"Batch {i + 1}: X.shape={X.shape}, y_SBP.shape={y_SBP.shape}, y_DBP.shape={y_DBP.shape}")
#     if i == 2:
#         break
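# Minimal sanity check of custom_collate_fn on dummy data (illustrative only;
# the shapes follow the .npy loader above: windows of 250 samples, 1 channel,
# and made-up SBP/DBP values):
#
# batch = [(np.zeros((250, 1)), 120.0, 80.0), (np.ones((250, 1)), 130.0, 85.0)]
# X, y_SBP, y_DBP = custom_collate_fn(batch)
# print(X.shape, y_SBP.shape)  # torch.Size([2, 250, 1]) torch.Size([2])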