From f65b091fac68e05d3e0a7d0c6bf9f18e1d04a19a Mon Sep 17 00:00:00 2001
From: rzzn <2386089024@qq.com>
Date: Tue, 30 Jul 2024 17:53:00 +0800
Subject: [PATCH] Update Mamba/mamba-main/train.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Mamba/mamba-main/train.py | 198 +++++++++++++++++++-------------------
 1 file changed, 99 insertions(+), 99 deletions(-)

diff --git a/Mamba/mamba-main/train.py b/Mamba/mamba-main/train.py
index 8f79fbe..f8c95cb 100644
--- a/Mamba/mamba-main/train.py
+++ b/Mamba/mamba-main/train.py
@@ -1,99 +1,99 @@
-import os
-import pandas as pd
-from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, MambaConfig
-from trl import SFTTrainer
-from peft import LoraConfig
-from datasets import Dataset
-
-# Set an environment variable to avoid memory fragmentation
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
-
-# Path to the data folder
-data_folder = r'/mnt/Mamba/mamba-main/data/dataset'
-
-# Check that the path exists
-if not os.path.exists(data_folder):
-    raise ValueError(f"Path does not exist: {data_folder}")
-
-# Load the tokenizer and model
-path = "/mnt/Mamba/mamba-130m-hf"  # Model path
-tokenizer = AutoTokenizer.from_pretrained(path, local_files_only=True)
-model = AutoModelForCausalLM.from_pretrained(path, local_files_only=True, num_labels=8, use_mambapy=True)
-
-print("Model loaded successfully")
-
-# Configure the training arguments
-training_args = TrainingArguments(
-    output_dir="./results",
-    num_train_epochs=3,
-    per_device_train_batch_size=12,  # Reduced batch size
-    logging_dir='./logs',
-    logging_steps=10,
-    learning_rate=2e-3,
-    gradient_accumulation_steps=2,  # Use gradient accumulation to reduce GPU memory usage
-    fp16=True,  # Enable mixed-precision training
-)
-
-# LoRA configuration
-lora_config = LoraConfig(
-    r=8,  # Rank of the low-rank decomposition
-    target_modules=["x_proj", "embeddings", "in_proj", "out_proj"],
-    task_type="SEQ_CLS",  # Sequence classification task type
-    bias="none"
-)
-
-# Initialize the Trainer
-trainer = SFTTrainer(
-    model=model,
-    tokenizer=tokenizer,
-    args=training_args,
-    peft_config=lora_config,
-    max_seq_length=512,  # Set the max_seq_length parameter
-)
-
-# Load and process the data in chunks
-chunksize = 40000  # Number of rows to read per chunk
-
-
-def preprocess_data(chunk):
-    chunk = chunk.dropna()  # Drop missing values
-    texts = chunk[["acc_x", "acc_y", "acc_z", "gyr_x", "gyr_y", "gyr_z", "mag_x", "mag_y", "mag_z"]].astype(str).apply(
-        ' '.join, axis=1).tolist()
-    labels = chunk["Person_id"].astype(int).tolist()  # Make sure the labels are integers
-    encodings = tokenizer(texts, truncation=True, padding=True, max_length=1024)
-    return {"input_ids": encodings["input_ids"], "attention_mask": encodings["attention_mask"], "labels": labels}
-
-
-# Read the training data and train
-train_file_path = os.path.join(data_folder, 'train_data.csv')
-chunk_iter = pd.read_csv(train_file_path, chunksize=chunksize, header=0)
-
-for chunk in chunk_iter:
-    # Preprocess the data
-    processed_data = preprocess_data(chunk)
-    dataset = Dataset.from_dict(processed_data)
-
-    # Train the model
-    trainer.train_dataset = dataset
-    trainer.train()
-
-    # Clear the CUDA cache
-    torch.cuda.empty_cache()
-
-# Save the trained model
-model.save_pretrained("./trained_model")
-tokenizer.save_pretrained("./trained_model")
-
-print("Model saved successfully")
-
-# Read the test data and run prediction
-test_file_path = os.path.join(data_folder, 'test_data.csv')
-test_data = pd.read_csv(test_file_path, header=0)
-processed_test_data = preprocess_data(test_data)
-test_dataset = Dataset.from_dict(processed_test_data)
-
-# Predict Person_id
-predictions = trainer.predict(test_dataset)
-
-# Print the prediction results
-print(predictions)
+import os
+import pandas as pd
+from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, MambaConfig
+from trl import SFTTrainer
+from peft import LoraConfig
+from datasets import Dataset
+import torch
+# Set an environment variable to avoid memory fragmentation
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
+
+# Path to the data folder
+data_folder = r'/mnt/Mamba/mamba-main/data/dataset'
+
+# Check that the path exists
+if not os.path.exists(data_folder):
+    raise ValueError(f"Path does not exist: {data_folder}")
+
+# Load the tokenizer and model
+path = "/mnt/Mamba/mamba-130m-hf"  # Model path
+tokenizer = AutoTokenizer.from_pretrained(path, local_files_only=True)
+model = AutoModelForCausalLM.from_pretrained(path, local_files_only=True, num_labels=8, use_mambapy=True)
+
+print("Model loaded successfully")
+
+# Configure the training arguments
+training_args = TrainingArguments(
+    output_dir="./results",
+    num_train_epochs=3,
+    per_device_train_batch_size=12,  # Reduced batch size
+    logging_dir='./logs',
+    logging_steps=10,
+    learning_rate=2e-3,
+    gradient_accumulation_steps=2,  # Use gradient accumulation to reduce GPU memory usage
+    fp16=True,  # Enable mixed-precision training
+)
+
+# LoRA configuration
+lora_config = LoraConfig(
+    r=8,  # Rank of the low-rank decomposition
+    target_modules=["x_proj", "embeddings", "in_proj", "out_proj"],
+    task_type="SEQ_CLS",  # Sequence classification task type
+    bias="none"
+)
+
+# Initialize the Trainer
+trainer = SFTTrainer(
+    model=model,
+    tokenizer=tokenizer,
+    args=training_args,
+    peft_config=lora_config,
+    max_seq_length=512,  # Set the max_seq_length parameter
+)
+
+# Load and process the data in chunks
+chunksize = 40000  # Number of rows to read per chunk
+
+
+def preprocess_data(chunk):
+    chunk = chunk.dropna()  # Drop missing values
+    texts = chunk[["acc_x", "acc_y", "acc_z", "gyr_x", "gyr_y", "gyr_z", "mag_x", "mag_y", "mag_z"]].astype(str).apply(
+        ' '.join, axis=1).tolist()
+    labels = chunk["Person_id"].astype(int).tolist()  # Make sure the labels are integers
+    encodings = tokenizer(texts, truncation=True, padding=True, max_length=1024)
+    return {"input_ids": encodings["input_ids"], "attention_mask": encodings["attention_mask"], "labels": labels}
+
+
+# Read the training data and train
+train_file_path = os.path.join(data_folder, 'train_data.csv')
+chunk_iter = pd.read_csv(train_file_path, chunksize=chunksize, header=0)
+
+for chunk in chunk_iter:
+    # Preprocess the data
+    processed_data = preprocess_data(chunk)
+    dataset = Dataset.from_dict(processed_data)
+
+    # Train the model
+    trainer.train_dataset = dataset
+    trainer.train()
+
+    # Clear the CUDA cache
+    torch.cuda.empty_cache()
+
+# Save the trained model
+model.save_pretrained("./trained_model")
+tokenizer.save_pretrained("./trained_model")
+
+print("Model saved successfully")
+
+# Read the test data and run prediction
+test_file_path = os.path.join(data_folder, 'test_data.csv')
+test_data = pd.read_csv(test_file_path, header=0)
+processed_test_data = preprocess_data(test_data)
+test_dataset = Dataset.from_dict(processed_test_data)
+
+# Predict Person_id
+predictions = trainer.predict(test_dataset)
+
+# Print the prediction results
+print(predictions)