[NLP] Bert

father6019 2025. 6. 24. 23:56
Let's head over to the model page:

https://huggingface.co/klue/bert-base

 

Click the "Files and versions" tab at the top,

 

and the model files are listed.

 

First, download the four files needed to load the model from a local folder (or fetch them programmatically, as sketched below).
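
If clicking through the web UI feels tedious, the same files can be pulled down in one call. This is a minimal sketch, assuming the huggingface_hub package is installed and that the target folder is the ./klue-bert-base path the training script below loads from:

# download_klue_bert.py -- fetch the klue/bert-base files into a local folder
from huggingface_hub import snapshot_download

# Grab only the files a PyTorch BERT checkpoint needs.
# (Drop allow_patterns to mirror the whole repository instead.)
snapshot_download(
    repo_id="klue/bert-base",
    local_dir="./klue-bert-base",
    allow_patterns=["config.json", "vocab.txt", "tokenizer_config.json", "pytorch_model.bin"],
)
print("Saved to ./klue-bert-base")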

 

 

 

Let's check which conda environments I have:

 

# Method 1: the most common way to list environments
conda env list

# Method 2: an alternative command that shows the same information
conda info --envs

 

(base) ➜  ~ conda env list
# conda environments:
#
base                  *  /Users/doo/opt/anaconda3
900gle                   /Users/doo/opt/anaconda3/envs/900gle
aqqle                    /Users/doo/opt/anaconda3/envs/aqqle
doo                      /Users/doo/opt/anaconda3/envs/doo
image                    /Users/doo/opt/anaconda3/envs/image
mecab                    /Users/doo/opt/anaconda3/envs/mecab
nlp                      /Users/doo/opt/anaconda3/envs/nlp
py38                     /Users/doo/opt/anaconda3/envs/py38
rnn                      /Users/doo/opt/anaconda3/envs/rnn
tesla                    /Users/doo/opt/anaconda3/envs/tesla
text                     /Users/doo/opt/anaconda3/envs/text
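
Whichever environment I end up activating, a quick import check saves headaches later. A minimal sketch; the packages below are the ones the training and inference scripts rely on, and the exact versions are not important:

# env_check.py -- confirm the active conda env has the packages used below
import torch
import transformers
import pandas
import sklearn
import tqdm

print("torch        :", torch.__version__)
print("transformers :", transformers.__version__)
print("pandas       :", pandas.__version__)
print("scikit-learn :", sklearn.__version__)
print("tqdm         :", tqdm.__version__)
# On this Mac setup CUDA will be unavailable, so the scripts fall back to CPU.
print("CUDA available:", torch.cuda.is_available())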

 

The training script, train.py:

import re
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader            # Dataset added
from torch.optim import AdamW
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    DataCollatorWithPadding,
    get_linear_schedule_with_warmup,
    Trainer,                # added
    TrainingArguments       # added
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# ---------------------------------------------------
# 0) Define a PyTorch Dataset wrapper for the text data
# ---------------------------------------------------
class TextDataset(Dataset):
    def __init__(self, df: pd.DataFrame, tokenizer: BertTokenizer, max_len: int = 128):
        self.texts  = df["text"].tolist()
        self.labels = df["label"].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokenized = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        item = {k: v.squeeze(0) for k, v in tokenized.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# ---------------------------------------------------
# 1) Text preprocessing function
# ---------------------------------------------------
def clean_text(s: str) -> str:
    s = re.sub(r"\s+", " ", s)
    s = re.sub(r"[^가-힣a-zA-Z0-9\s]", "", s)
    return s.strip()

# ---------------------------------------------------
# 2) Load, preprocess, and split the data
# ---------------------------------------------------
def load_data(path: str, test_size: float = 0.2, seed: int = 42):
    df = pd.read_csv(path)
    df["label"] = df["label"].astype(str).str.strip().astype(int)
    df["text"]  = df["text"].astype(str).apply(clean_text)
    train_df, test_df = train_test_split(
        df, test_size=test_size, random_state=seed, stratify=df["label"]
    )
    return train_df.reset_index(drop=True), test_df.reset_index(drop=True)

# ---------------------------------------------------
# 3) compute_metrics function for the Trainer
# ---------------------------------------------------
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# ---------------------------------------------------
# 4) Put it all together in main()
# ---------------------------------------------------
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load the pretrained model and tokenizer from the local folder
    tokenizer = BertTokenizer.from_pretrained("./klue-bert-base", do_lower_case=False)
    model     = BertForSequenceClassification.from_pretrained(
        "./klue-bert-base", num_labels=2
    )
    model.to(device)

    # Prepare the data
    train_df, test_df = load_data("food_nonfood_data.csv", test_size=0.2)
    train_dataset = TextDataset(train_df, tokenizer)
    test_dataset  = TextDataset(test_df,  tokenizer)

    # DataLoader, optimizer, and scheduler
    # NOTE: the Trainer below builds its own DataLoader and optimizer internally,
    # so these objects are not actually used for training; they are kept for reference.
    data_collator = DataCollatorWithPadding(tokenizer)
    train_loader  = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=data_collator)
    test_loader   = DataLoader(test_dataset,  batch_size=32, shuffle=False, collate_fn=data_collator)

    # Optimizer and linear warmup scheduler
    optimizer = AdamW(model.parameters(), lr=2e-5)
    total_steps = len(train_loader) * 3  # e.g. 3 epochs
    scheduler   = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps
    )

    # 1) Declare TrainingArguments
    training_args = TrainingArguments(
        output_dir="./best_food_classifier",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        learning_rate=2e-5,
        logging_dir="./logs",
        logging_steps=100
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,       # used by trainer.evaluate() below
#         tokenizer=tokenizer,           # older argument name, deprecated in recent transformers versions
        processing_class=tokenizer,      # newer replacement for the tokenizer argument
        compute_metrics=compute_metrics
    )

    # Train
    trainer.train()

    # Evaluate and save manually
    metrics = trainer.evaluate()
    print(metrics)
    trainer.save_model("./best_food_classifier")
    tokenizer.save_pretrained("./best_food_classifier")

if __name__ == "__main__":
    main()
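
One thing train.py assumes but never shows is the shape of food_nonfood_data.csv: load_data() expects a text column and a label column, and the label convention that the inference script below relies on is 1 = food, 0 = non-food. A tiny, purely hypothetical sample just to illustrate the format:

# make_sample_csv.py -- illustrative only; the real training data is not shown in this post
import pandas as pd

sample = pd.DataFrame({
    "text":  ["서울우유 저지방 1L", "삼다수 2L 6병", "차량용 핸드폰 거치대", "무선 충전 보조배터리"],  # hypothetical keywords
    "label": [1, 1, 0, 0],  # 1 = food, 0 = non-food
})
sample.to_csv("food_nonfood_data.csv", index=False, encoding="utf-8-sig")
print(sample)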

 

Now for the inference script:

 

import torch
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
from tqdm import tqdm  # 1) tqdm import

# Configuration
MODEL_DIR      = "./best_food_classifier"
RESULT_FILE    = "../season_keyword/result/log_result.txt"
OUTPUT_CSV     = "classification_results.csv"
OUTPUT_FOOD    = "food_list.csv"      # output file for food keywords only
OUTPUT_NONFOOD = "nonfood_list.csv"   # output file for non-food keywords only

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the fine-tuned model and tokenizer
tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)
model     = BertForSequenceClassification.from_pretrained(MODEL_DIR)
model.to(device)
model.eval()

# tqdm setup: attach progress_apply to pandas
tqdm.pandas(desc="분류중")  # desc = label shown in front of the progress bar

# Inference function
def predict_text(text: str) -> str:
    inputs = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
    pred = logits.argmax(dim=-1).item()
    return "식품" if pred == 1 else "비식품"

def main():
    # 1) Read the keyword log file
    df = pd.read_csv(
        RESULT_FILE,
        header=None,
        names=["keyword", "count"],
        skipinitialspace=True
    )

    # 2) Predict and add the result column (with a progress bar)
    df["category"] = df["keyword"].progress_apply(predict_text)

    # 3) Save the full results
    df.to_csv(
        OUTPUT_CSV,
        columns=["keyword", "count", "category"],
        index=False,
        encoding="utf-8-sig"
    )
    print(f"Saved all classifications to {OUTPUT_CSV}")

    # 4) Split into food / non-food
    food_df    = df[df["category"] == "식품"]
    nonfood_df = df[df["category"] == "비식품"]

    # 5) Save each list separately
    food_df.to_csv(
        OUTPUT_FOOD,
        columns=["keyword", "count"],
        index=False,
        encoding="utf-8-sig"
    )
    print(f"Saved food list to {OUTPUT_FOOD}")

    nonfood_df.to_csv(
        OUTPUT_NONFOOD,
        columns=["keyword", "count"],
        index=False,
        encoding="utf-8-sig"
    )
    print(f"Saved non-food list to {OUTPUT_NONFOOD}")

if __name__ == "__main__":
    main()
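
Before running the whole file, predict_text() can also be spot-checked on a few keywords in an interactive session. A small hypothetical example, assuming training has finished and ./best_food_classifier exists:

# Quick interactive spot check (the keywords here are made up)
for kw in ["복숭아", "에어프라이어", "아이스 아메리카노"]:
    print(kw, "->", predict_text(kw))
# Each line prints either "식품" (food) or "비식품" (non-food),
# depending on what the fine-tuned model learned.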
 
 