Commutty logo
Commutty IT
2 min read

Sentence order prediction

https://picsum.photos/seed/a2720940b4d84d15876b132cca75bef3/600/800
python
from transformers import AlbertTokenizer, AlbertModel, AlbertForMaskedLM,AlbertConfig, AlbertForPreTraining
import torch

# SentencePiece-backed tokenizer built directly from the local vocabulary file.
tokenizer = AlbertTokenizer("model/sentencepiece.model")

# ALBERT pre-training model (MLM head + sentence-order head) restored from the
# local checkpoint using the architecture described in config.json.
# The load warning reports that the pooler and sop_classifier weights are not
# in this checkpoint and are freshly initialized, so the sentence-order head
# has to be trained before its predictions are meaningful.
config = AlbertConfig.from_json_file('model/config.json')
model = AlbertForPreTraining.from_pretrained(
    'model/pytorch_model.bin',
    config=config,
)
Some weights of AlbertForPreTraining were not initialized from the model checkpoint at model/pytorch_model.bin and are newly initialized: ['albert.pooler.weight', 'albert.pooler.bias', 'sop_classifier.classifier.weight', 'sop_classifier.classifier.bias'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
python
import pandas as pd
# Read the two-column sentence-pair file with explicit column names, discard
# the first row (presumably the file's own header line - confirm against
# data/pair.csv), and label every remaining pair 0: sentences in their
# original order.
df = (
    pd.read_csv("data/pair.csv", names=["sentence1", "sentence2"], header=None)
    .iloc[1:]
    .assign(sop_label=0)
)
python
# Bare expression: shows the labelled sentence pairs as the cell's output.
df
python
# Negative examples for sentence-order prediction: the same pairs with the two
# sentences swapped, labelled 1. Row index is inherited from df.
df1 = pd.DataFrame(
    {
        "sentence1": df["sentence2"],
        "sentence2": df["sentence1"],
        "sop_label": 1,
    }
)
df1.head()
python
# Stack the original-order and swapped pairs, then shuffle all rows
# (frac=1 samples every row; no random_state is set, so the order differs
# between runs).
df2 = pd.concat([df, df1]).sample(frac=1)
df2.head()
python
# Remove every row that has a missing value in any column (dropna defaults).
df2 = df2.dropna()
python
# Write the pair dataset to disk in the working directory.
# NOTE(review): index=False is not passed, so the row index is written as the
# first column - confirm that downstream readers expect it.
df2.to_csv("sop_sen_pair.csv")
python
from transformers import DataCollatorForLanguageModeling
# Batch collator with masked-language-model masking enabled at a 15% masking
# probability, using the tokenizer loaded above.
# NOTE(review): this configures MLM only; nothing in this cell handles the
# sop_label column - confirm how sentence-order labels reach the model.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)
python
# Encode two (sentence1, sentence2) pairs as a single batch, padding each
# encoding up to max_length=100 (no truncation is requested).
sentence_pairs = [
    ("<目的>XXの満開50日後の果実肥大調査。", "平年並み-やや大きい。"),
    ("営農センターに在庫があるので、出荷の際とりにくるとのこと。", "水稲肥料の一発かんたくんを6袋受注する。"),
]
tokenizer(sentence_pairs, padding="max_length", max_length=100)
python
# Encode one sentence pair, then map the ids back to token strings so the
# resulting pieces and special tokens can be inspected.
encoded_pair = tokenizer("<目的>ももの満開50日後の果実肥大調査。", "平年並み-やや大きい。")
tokenizer.convert_ids_to_tokens(encoded_pair["input_ids"])
python
from transformers import DataCollatorForLanguageModeling
# Rebuilds the MLM collator (15% masking probability) with the same arguments
# as the earlier collator cell; the earlier object is simply replaced.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

Discussion

コメントにはログインが必要です。