PHP前端开发

使用 BERT 在 Kaggle 上入门 NLP

百变鹏仔 3周前 (02-05) #Python
文章标签 入门

1. 导入与 EDA(探索性数据分析)

# Locate the competition data directory.
import os
from pathlib import Path

# Kaggle kernels export KAGGLE_KERNEL_RUN_TYPE; environment variable lookups
# are case-sensitive, so the name must be upper-case. An empty string means
# the script is not running inside a Kaggle kernel.
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')

if iskaggle:
    path = Path('/kaggle/input/us-patent-phrase-to-phrase-matching')
else:
    # Fallback so `path` is always defined: expects the competition files
    # in a local directory of the same name (assumption -- adjust as needed).
    path = Path('us-patent-phrase-to-phrase-matching')
# Load the training data and build the single text field fed to the model.
import pandas as pd

df = pd.read_csv(path / 'train.csv')

# Concatenate the context, target and anchor columns into one prompt-like
# string per row.
df['input'] = (
    'text1: ' + df.context
    + '; text2: ' + df.target
    + '; anc1: ' + df.anchor
)

# Quick look at the first few constructed inputs (displayed in a notebook).
df.input.head()

2. 分词(Tokenization)

# Wrap the DataFrame as a Hugging Face dataset and load the pretrained
# model together with its tokenizer.
import logging
import warnings

import torch
from datasets import Dataset, DatasetDict
from transformers import AutoModelForSequenceClassification, AutoTokenizer

ds = Dataset.from_pandas(df)

# Silence Python warnings and all log records at WARNING level and below.
warnings.simplefilter('ignore')
logging.disable(logging.WARNING)

model_nm = 'anferico/bert-for-patents'

# Load the model directly; num_labels=1 gives a single output value per
# example.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
# Reuse model_nm so the tokenizer always matches the model checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_nm)
def tok_func(x):
    """Tokenize the 'input' field of ``x`` with the module-level tokenizer.

    ``x`` is whatever ``Dataset.map`` passes in; with ``batched=True`` that
    is a batch of rows, so ``x['input']`` is a list of strings.
    """
    return tokenizer(x['input'])


# Tokenize all the sentences using the tokenizer.
tok_ds = ds.map(tok_func, batched=True)
# Rename the target column from 'score' to 'labels'
# (presumably the column name the trainer expects -- confirm).
tok_ds = tok_ds.rename_columns({'score': 'labels'})

3,测试和验证集

# Build the train/validation split and the tokenized test dataset.
import pandas as pd
from datasets import Dataset

eval_df = pd.read_csv(path / 'test.csv')

# Hold out 25% of the tokenized training data; the fixed seed keeps the
# split reproducible.
dds = tok_ds.train_test_split(0.25, seed=42)

# Same input construction as for the training data.
eval_df['input'] = (
    'text1: ' + eval_df.context
    + '; text2: ' + eval_df.target
    + '; anc1: ' + eval_df.anchor
)
eval_ds = Dataset.from_pandas(eval_df).map(tok_func, batched=True)

4,指标和相关性

import numpy as np


def corr(x, y):
    """Return the Pearson correlation coefficient between ``x`` and ``y``.

    Both arguments may be any array-like; each is flattened to 1-D first
    (e.g. predictions of shape (n, 1) become shape (n,)), so they must
    contain the same number of elements.
    """
    # Change the 2-d array into a 1-d array before correlating.
    flat_x = np.asarray(x).ravel()
    flat_y = np.asarray(y).ravel()
    # corrcoef returns the 2x2 correlation matrix; [0, 1] is corr(x, y).
    return np.corrcoef(flat_x, flat_y)[0, 1]


def corr_d(eval_pred):
    """Return ``{'pearson': r}`` for an unpackable ``(predictions, labels)`` pair."""
    return {'pearson': corr(*eval_pred)}

5,训练我们的模型

14625233945

6. 在测试集上获取预测结果

# Predict on the test set and write the submission file.
# NOTE(review): `trainer` is not defined in the visible source; it must be
# created by the training step before this code runs.
import datasets
import numpy as np

preds = trainer.predict(eval_ds).predictions.astype(float)
# Clamp predictions into the [0, 1] range.
preds = np.clip(preds, 0, 1)
# Flatten so each row gets one scalar score rather than a one-element list
# (assumes one prediction per example -- confirm against the model head).
preds = preds.ravel()

submission = datasets.Dataset.from_dict({
    'id': eval_ds['id'],
    'score': preds,
})
submission.to_csv('submission.csv', index=False)