Commit 95fefdd

docs: qwen-vl-2b latexocr

1 parent 29d325b

File tree

20 files changed: 1132 additions, 0 deletions

models/Qwen2-VL/06-Qwen2-VL-2B-Instruct Lora 微调案例 - LaTexOCR.md

Lines changed: 712 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
import pandas as pd
import json

csv_path = './latex_ocr_train.csv'
train_json_path = './latex_ocr_train.json'
val_json_path = './latex_ocr_val.json'

df = pd.read_csv(csv_path)

# Create conversation format
conversations = []

# Add image conversations
for i in range(len(df)):
    conversations.append({
        "id": f"identity_{i+1}",
        "conversations": [
            {
                "role": "user",
                "value": f"{df.iloc[i]['image_path']}"
            },
            {
                "role": "assistant",
                "value": str(df.iloc[i]['text'])
            }
        ]
    })

# print(conversations)

# Split into train and validation sets: hold out the last 4 samples for validation
train_conversations = conversations[:-4]
val_conversations = conversations[-4:]

# Save train set
with open(train_json_path, 'w', encoding='utf-8') as f:
    json.dump(train_conversations, f, ensure_ascii=False, indent=2)

# Save validation set
with open(val_json_path, 'w', encoding='utf-8') as f:
    json.dump(val_conversations, f, ensure_ascii=False, indent=2)
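
As a quick sanity check (a sketch, not part of the commit), the generated validation file can be loaded back to confirm the conversation structure; this assumes the script above has already been run in the same directory:

import json

# Load the validation split written by the conversion script
with open('./latex_ocr_val.json', 'r', encoding='utf-8') as f:
    val = json.load(f)

print(len(val))                             # expected: 4 held-out samples
print(val[0]['conversations'][0]['value'])  # user turn: image path
print(val[0]['conversations'][1]['value'])  # assistant turn: LaTeX ground truth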
Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
# Import required libraries
from modelscope.msdatasets import MsDataset
import os
import pandas as pd

MAX_DATA_NUMBER = 1000
dataset_id = 'AI-ModelScope/LaTeX_OCR'
subset_name = 'default'
split = 'train'

dataset_dir = 'LaTeX_OCR'
csv_path = './latex_ocr_train.csv'


# Check whether the dataset directory already exists
if not os.path.exists(dataset_dir):
    # Download the LaTeX_OCR dataset from ModelScope
    ds = MsDataset.load(dataset_id, subset_name=subset_name, split=split)
    print(len(ds))
    # Cap the number of images to process
    total = min(MAX_DATA_NUMBER, len(ds))

    # Create the directory for saved images
    os.makedirs(dataset_dir, exist_ok=True)

    # Initialize lists for image paths and LaTeX texts
    image_paths = []
    texts = []

    for i in range(total):
        # Fetch one sample
        item = ds[i]
        text = item['text']
        image = item['image']

        # Save the image and record its path
        image_path = os.path.abspath(f'{dataset_dir}/{i}.jpg')
        image.save(image_path)

        # Append the path and text to the lists
        image_paths.append(image_path)
        texts.append(text)

        # Print progress every 50 images
        if (i + 1) % 50 == 0:
            print(f'Processing {i+1}/{total} images ({(i+1)/total*100:.1f}%)')

    # Collect image paths and texts into a DataFrame
    df = pd.DataFrame({
        'image_path': image_paths,
        'text': texts,
    })

    # Save the data as a CSV file
    df.to_csv(csv_path, index=False)

    print(f'Data processing complete: {total} images processed')
else:
    print(f'{dataset_dir} directory already exists; skipping data processing')
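
A minimal check of the CSV the script produces (a sketch, assuming the download above completed with the default MAX_DATA_NUMBER of 1000):

import pandas as pd

df = pd.read_csv('./latex_ocr_train.csv')
print(df.shape)                  # expected: (1000, 2)
print(df.columns.tolist())       # ['image_path', 'text']
print(df.iloc[0]['image_path'])  # absolute path to LaTeX_OCR/0.jpg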
Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from peft import PeftModel, LoraConfig, TaskType

prompt = "You are a LaTeX OCR assistant; read the image the user provides and convert it into a LaTeX formula."
local_model_path = "./Qwen/Qwen2-VL-2B-Instruct"
lora_model_path = "./output/Qwen2-VL-2B-LatexOCR/checkpoint-124"
test_image_path = "./LaTeX_OCR/997.jpg"

config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=True,
    r=64,  # LoRA rank
    lora_alpha=16,  # LoRA alpha; see the LoRA paper for its effect
    lora_dropout=0.05,  # dropout rate
    bias="none",
)

# Load the base model on the available device(s)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    local_model_path, torch_dtype="auto", device_map="auto"
)

# Attach the fine-tuned LoRA adapter
model = PeftModel.from_pretrained(model, model_id=f"{lora_model_path}", config=config)
processor = AutoProcessor.from_pretrained(local_model_path)

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": test_image_path,
                "resized_height": 100,
                "resized_width": 500,
            },
            {"type": "text", "text": f"{prompt}"},
        ],
    }
]

# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Inference: generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=8192)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)

print(output_text[0])
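
To spot-check the adapter on more than one image, the same pipeline can be wrapped in a helper and run over the four held-out validation samples. This is a sketch, not part of the commit: it reuses model, processor, and prompt from the script above and assumes the latex_ocr_val.json file written by the conversion script exists:

import json

def latex_ocr(image_path: str) -> str:
    # Build the same single-image chat message used above
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image_path, "resized_height": 100, "resized_width": 500},
            {"type": "text", "text": prompt},
        ],
    }]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(text=[text], images=image_inputs, videos=video_inputs,
                       padding=True, return_tensors="pt").to("cuda")
    generated_ids = model.generate(**inputs, max_new_tokens=8192)
    trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(trimmed, skip_special_tokens=True,
                                  clean_up_tokenization_spaces=False)[0]

# Compare predictions against the held-out references
with open('./latex_ocr_val.json', 'r', encoding='utf-8') as f:
    for sample in json.load(f):
        image_path = sample['conversations'][0]['value']  # user turn holds the image path
        reference = sample['conversations'][1]['value']   # assistant turn holds the LaTeX
        print('pred:', latex_ocr(image_path))
        print('ref :', reference)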
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
torch==2.3.0
torchvision==0.18.0
swanlab==0.3.27
transformers==4.46.2
accelerate==1.1.1
pandas==2.2.2
modelscope==1.15.0
qwen-vl-utils==0.0.8
datasets==2.18.0
peft==0.13.2
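
These pins capture the environment the walkthrough presumably targets; they can be installed into a fresh virtual environment with pip install -r requirements.txt.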
