我得到了一个 federally incorporated companies 的列表。从这个列表中,我提取了公司名称、他们的公司 ID 和他们当前的地址。然后我的任务是找到这些公司的网页。
我尝试的是对公司名称进行 Google 搜索,然后从前 5 个返回 URL 中确定哪个是公司的网页。我做了一个方法来计算一个分数来判断公司名称和 URL 之间的相似性,例如编辑距离或者公司名称的缩写是否是 URL 域名的一部分。然后我为 5 个返回的 URL 中的每一个计算这个分数。然后我检查两个得分最高的 URL 的联系页面是否包含公司地址。如果是这样,那么我会相信 URL 是公司的正确 URL,并且该公司名称和 URL 将被视为正样本。如果地址不在任一 URL 的网页中,我将采用该公司得分最高的 URL。我还将访问 the federal corporation database 以查看该公司是否活跃。如果该公司不活跃且 URL 的最佳得分较低,那么我认为该样本为负样本。
然后我得到了一个包含大约 1000 个数据的列表,具有以下特征:
["name", "url", "active","location","corp_id","title","description","main_page_body","about_page","product_page","keywords","contact_page","score","images","has_address"]
目标是查看 URL 是公司的天气。所以我想到了用BERT看两句相似度任务。每个样本的两个句子将是:
但我不知道 BERT 因为我遵循了我找到的一些模板并训练了模型并获得了 1 个验证准确度和 0 个测试准确度,我想知道接下来要做什么?
我将所有从 jupyter notebook 转换为 .py 文件的代码放在下面:
"""Untitled5.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1-MHsbaAl9Nm7al1DjzRpBmwQX18jxQTR
"""
import pandas as pd
df_test = pd.read_csv('/content/drive/MyDrive/data_scrapping/testing3.csv',encoding = 'latin-1')
df_test = df_test.fillna(value={'location': "", 'title': "", 'description':"", 'about_page':"", 'score':0})
df_test["name_address"] = dff["name"]+", "+dff["location"]
df_test["description_about"] = dff["title"] +" "+ dff["description"]+" "+dff["about_page"]
df_test["label"] = 1
print(df_test.head(15))
df_test.dropna(inplace=True)
df_test = df_test[["name_address","description_about","label"]]
print(df_test.columns)
dff=pd.read_csv('/content/drive/MyDrive/data_scrapping/fed_data.csv',encoding = 'latin-1')
print(dff.head(15))
print(dff.columns)
dff = dff[["name","location","title","description","about_page",'score',"has_address","active"]]
dff = dff.fillna(value={'location': "", 'title': "", 'description':"", 'about_page':"", 'score':0})
# df.dropna(inplace=True)
dff["name_address"] = dff["name"]+", "+dff["location"]
dff["description_about"] = dff["title"] +" "+ dff["description"]+" "+dff["about_page"]
negative_df = dff.loc[ (dff['score'] <4)]
negative_df = negative_df.loc[negative_df["has_address"] == 0]
negative_df = negative_df.loc[negative_df["active"] == "no_active"]
negative_df["label"] = 0
negative_df = negative_df[["name_address", "description_about","label"]]
print(negative_df.head(10))
print(len(negative_df))
print(negative_df.columns)
positive_df = dff.loc[dff["has_address"] == 1]
positive_df["label"] = 0
positive_df = positive_df[["name_address", "description_about","label"]]
print(positive_df.columns)
print(positive_df.head(15))
print(len(positive_df))
naics_df = pd.read_csv("/content/drive/MyDrive/data_scrapping/naics_data.csv")
print(naics_df.columns)
naics_df.dropna(inplace =True)
print(len(naics_df))
print(naics_df[pd.notnull(naics_df.url)])
print(naics_df["url"])
naics_df["name_address"] = naics_df["name"] +", "+ naics_df["location"]
naics_df["description_about"] = naics_df["description"]+ " "+naics_df["about_page"]
naics_df["label"] = 1
naics_df = naics_df[["name_address", "description_about","label"]]
print(naics_df.columns)
vertical_concat = pd.concat([naics_df, positive_df,negative_df], axis=0)
print(vertical_concat)
print(len(vertical_concat))
import torch
if torch.cuda.is_available():
device = torch.device("cuda")
print('There are %d GPU(s) available.' % torch.cuda.device_count())
print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
print('No GPU available, using the CPU instead.')
device = torch.device("cpu")
!pip install transformers
from sklearn.utils import shuffle
# shuffle the dataframe
vertical_concat = shuffle(vertical_concat, random_state=0)
vertical_concat.dropna(inplace=True)
print(vertical_concat)
df_n=vertical_concat[1500:]
df=vertical_concat[0:1500]
sent1=df.name_address.values
sent2=df.description_about.values
# sent3= df.url.values
labels=df.label.values
from transformers import BertTokenizer, RobertaTokenizerFast
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', do_lower_case=True)
input_ids = []
tokenize_text=[]
for i in range(len(sent1)):
encoded_sent = tokenizer.encode(
sent1[i],
sent2[i],
add_special_tokens = True,
max_length=512
)
input_ids.append(encoded_sent)
print('Original: ', sent1[0],sent2[0])
print('Tokenized:', tokenizer.tokenize(sent1[0]+" "+sent2[0]))
print('Token IDs:', input_ids[0])
from keras.preprocessing.sequence import pad_sequences
MAX_LEN = 512
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long",
value=0, truncating="post", padding="post")
attention_masks = []
for sent in input_ids:
att_mask = [int(token_id > 0) for token_id in sent]
attention_masks.append(att_mask)
print('Token IDs:', input_ids[0])
print('Attention mask:', attention_masks[0])
from sklearn.model_selection import train_test_split
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels,
random_state=2018, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels,
random_state=2018, test_size=0.1)
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
batch_size = 8
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification,Trainer, TrainingArguments
from transformers import BertForSequenceClassification, AdamW, BertConfig
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
model.cuda()
optimizer = AdamW(model.parameters(),
lr = 2e-5,
eps = 1e-8
)
from transformers import get_linear_schedule_with_warmup
epochs = 5
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
num_warmup_steps = 0,
num_training_steps = total_steps)
import numpy as np
def flat_accuracy(preds, labels):
pred_flat = np.argmax(preds, axis=1).flatten()
labels_flat = labels.flatten()
return np.sum(pred_flat == labels_flat) / len(labels_flat)
import time
import datetime
def format_time(elapsed):
'''
Takes a time in seconds and returns a string hh:mm:ss
'''
elapsed_rounded = int(round((elapsed)))
return str(datetime.timedelta(seconds=elapsed_rounded))
import random
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
loss_values = []
model.zero_grad()
for epoch_i in range(0, epochs):
# ========================================
# Training
# ========================================
print("")
print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
print('Training...')
t0 = time.time()
total_loss = 0
model.train()
for step, batch in enumerate(train_dataloader):
if step % 40 == 0 and not step == 0:
elapsed = format_time(time.time() - t0)
print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
model.train()
b_input_ids = batch[0].to(device)
b_input_mask = batch[1].to(device)
b_labels = batch[2].to(device)
outputs = model(b_input_ids,
token_type_ids=None,
attention_mask=b_input_mask,
labels=b_labels)
loss = outputs[0]
total_loss += loss.item()
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
optimizer.step()
scheduler.step()
model.zero_grad()
avg_train_loss = total_loss / len(train_dataloader)
loss_values.append(avg_train_loss)
print("")
print(" Average training loss: {0:.2f}".format(avg_train_loss))
print(" Training epcoh took: {:}".format(format_time(time.time() - t0)))
# ========================================
# Validation
# ========================================
print("")
print("Running Validation...")
t0 = time.time()
model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for batch in validation_dataloader:
batch = tuple(t.to(device) for t in batch)
b_input_ids, b_input_mask, b_labels = batch
with torch.no_grad():
outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
logits = outputs[0]
logits = logits.detach().cpu().numpy()
label_ids = b_labels.to('cpu').numpy()
tmp_eval_accuracy = flat_accuracy(logits, label_ids)
eval_accuracy += tmp_eval_accuracy
nb_eval_steps += 1
print(" Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
print(" Validation took: {:}".format(format_time(time.time() - t0)))
torch.save(model.state_dict(), f"models/sent_bert{epoch_i}.pth")
print("")
print("Training complete!")
# Commented out IPython magic to ensure Python compatibility.
import matplotlib.pyplot as plt
# % matplotlib inline
import seaborn as sns
sns.set(style='darkgrid')
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)
plt.plot(loss_values, 'b-o')
plt.title("Training loss")
plt.xlabel("Batch")
plt.ylabel("Loss")
plt.show()
# model_t = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model_t = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
model_t.cuda()
model2=torch.load("models/sent_bert1.pth")
print(df_n.head())
# print(df_test.columns)
#df_n.drop(['Unnamed: 0','index'],axis=1,inplace=True)
sent1=df_test.name_address.values
sent2= df_test["description_about"].values
labels=df_test.label.values
input_ids = []
tokenize_text=[]
for i in range(len(sent1)):
encoded_sent = tokenizer.encode(
sent1[i],
sent2[i],
add_special_tokens = True,
max_length=64
)
input_ids.append(encoded_sent)
print('Original: ', sent1[0],sent2[0])
print('Tokenized:', tokenizer.tokenize(sent1[0]+" "+sent2[0]))
print('Token IDs:', input_ids[0])
from keras.preprocessing.sequence import pad_sequences
MAX_LEN = 512
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long",
value=0, truncating="post", padding="post")
attention_masks = []
for sent in input_ids:
att_mask = [int(token_id > 0) for token_id in sent]
attention_masks.append(att_mask)
print('Token IDs:', input_ids[0])
print('Attention mask:', attention_masks[0])
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
test_inputs = torch.tensor(input_ids)
test_labels = torch.tensor(labels)
test_masks = torch.tensor(attention_masks)
batch_size = 8
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
model_t.load_state_dict(model2)
model_t.eval()
predictions , true_labels = [], []
for batch in test_dataloader:
batch = tuple(t.to(device) for t in batch)
b_input_ids, b_input_mask, b_labels = batch
with torch.no_grad():
outputs = model_t(b_input_ids, token_type_ids=None,
attention_mask=b_input_mask)
logits = outputs[0]
logits = logits.detach().cpu().numpy()
label_ids = b_labels.to('cpu').numpy()
predictions.append(logits)
true_labels.append(label_ids)
print(' DONE.')
pred1=predictions
print(pred1)
for batch in test_dataloader:
print(batch[0])
pred2=[]
for i in range(len(true_labels)):
pred= np.argmax(pred1[i], axis=1).flatten()
pred2=pred2+pred.tolist()
pred2=np.asarray(pred2)
true_labels[1]
print(pred2)
print(true_labels)
t3=[]
for i in range(len(true_labels)):
t2= true_labels[i]
t3=t3+t2.tolist()
print(t3)
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score
acc=accuracy_score(pred2,t3)
f1=f1_score(pred2,t3)
precision=precision_score(pred2,t3)
recall=recall_score(pred2,t3)
print(" accuracy={} \n f1={} \n precision={} \n recall={}".format(acc,f1,precision,recall))