NNTI Project #1

Open · wants to merge 6 commits into base: main
1,254 changes: 1,254 additions & 0 deletions CNN-task-3.ipynb

Large diffs are not rendered by default.

367 changes: 367 additions & 0 deletions LSTM-classifier-Task-2.ipynb
@@ -0,0 +1,367 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import time\n",
"import torch\n",
"import pickle\n",
"import datetime\n",
"import numpy as np\n",
"import warnings\n",
"\n",
"import pandas as pd\n",
"from pandas.core.common import SettingWithCopyWarning\n",
"\n",
"warnings.simplefilter(action=\"ignore\", category=SettingWithCopyWarning)\n",
"\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"import torch.nn.functional as F\n",
"\n",
"from matplotlib import pyplot as plt\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
"\n",
"if device == 'cuda':\n",
" print(\"=================================\")\n",
" print(\"GPU found\")\n",
" print(\"Using GPU at cuda:\",torch.cuda.current_device())\n",
" print(\"=================================\")\n",
" print(\" \")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model1 = torch.load(\"~/NNTI-WS2021-NLP-Project/Project_files/Hindi/epoch_300.pt\")\n",
"\n",
"w1 = model1[\"w1.weight\"].T\n",
"w2 = model1[\"w2.weight\"]\n",
"\n",
"cleandata = pd.read_pickle(\"~/NNTI-WS2021-NLP-Project/Project_files/Hindi/hindi_corpus_cleaned.pkl\")\n",
"word_index = pd.read_pickle(\"~/NNTI-WS2021-NLP-Project/Project_files/Hindi/word_index.pkl\")\n",
"index_word = pd.read_pickle(\"~/NNTI-WS2021-NLP-Project/Project_files/Hindi/index_word.pkl\")\n",
"V = pd.read_pickle(\"~/NNTI-WS2021-NLP-Project/Project_files/Hindi/vocab.pkl\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv('https://raw.githubusercontent.com/SouravDutta91/NNTI-WS2021-NLP-Project/main/data/hindi_hatespeech.tsv',sep='\\t')\n",
"text = data[['text','task_1']]\n",
"text['text'] = cleandata['text'].apply(lambda x: x.split())\n",
"text['label'] = text['task_1'].apply(lambda x: 1 if x == 'HOF' else 0)\n",
"max_len = text.text.str.len().max()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"def tag_count(input):\n",
" hcount,ncount = 0,0\n",
" for tag in input:\n",
" if tag == 1:\n",
" hcount+=1\n",
" else:\n",
" ncount+=1\n",
" return hcount,ncount\n",
"\n",
"word_index['<pad>'] = len(V)\n",
"index_word[len(V)] = '<pad>'\n",
"\n",
"\n",
"def get_word_embedding(input):\n",
" index = word_index[input]\n",
" return w1[index]\n",
"\n",
"def encode(corpus):\n",
" sent_idx = []\n",
" i = 0\n",
" for sentence in corpus:\n",
" sentence += ['<pad>'] * (max_len - len(sentence))\n",
" idx = [word_index[word] for word in sentence]\n",
" sent_idx.append(idx)\n",
" i+= 1\n",
" return np.array(sent_idx)\n",
"\n",
"\n",
"from itertools import islice\n",
"def take(n, iterable):\n",
" \"Return first n items of the iterable as a list\"\n",
" return list(islice(iterable, n))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"_ , emb_size = w1.shape\n",
"\n",
"def matrix_embeddings():\n",
" embedding_matrix = np.random.uniform(-0.25, 0.25, (len(word_index), emb_size))\n",
" embedding_matrix[word_index['<pad>']] = np.zeros((emb_size,))\n",
"\n",
" for word,i in take(len(V),word_index.items()):\n",
" temp = get_word_embedding(word)\n",
" if temp is not None:\n",
" embedding_matrix[i] = temp.cpu()\n",
"\n",
" return embedding_matrix\n",
"\n",
"encoded_text = encode(text.text)\n",
"labels = np.array(text['label'])\n",
"embeds = torch.tensor(matrix_embeddings())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame({'encoded':list(encoded_text), 'label':labels})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"xtrain,xtest,ytrain,ytest = train_test_split(encoded_text,labels,shuffle=True,test_size=0.1,random_state=45)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"batch_size = 10\n",
"\n",
"from torch.utils.data import (TensorDataset, DataLoader, RandomSampler,SequentialSampler)\n",
"\n",
"def get_dataloader(traindata,trainlabels,testdata,testlabels,batchsize):\n",
" \n",
" traindata = torch.tensor(traindata).float()\n",
" trainlabels = torch.tensor(trainlabels)\n",
" testdata = torch.tensor(testdata).float()\n",
" testlabels = torch.tensor(testlabels)\n",
"\n",
" test = TensorDataset(testdata,testlabels)\n",
" test_dataload = DataLoader(test,sampler=RandomSampler(test),batch_size=batchsize,drop_last=True)\n",
" train = TensorDataset(traindata,trainlabels)\n",
" train_dataload = DataLoader(train,sampler=RandomSampler(train),batch_size=batchsize,drop_last=True)\n",
"\n",
" return train_dataload,test_dataload\n",
"\n",
"train_dataload,test_dataload = get_dataloader(xtrain, ytrain,xtest,ytest,batch_size)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class lstmmodel(nn.Module):\n",
" def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):\n",
" super().__init__()\n",
" \n",
" self.output_size = output_size\n",
" self.n_layers = n_layers\n",
" self.hidden_dim = hidden_dim\n",
" \n",
" #Pretrained Embeddings\n",
" self.embedding = nn.Embedding.from_pretrained(embeds.type(torch.float32),freeze=True)\n",
" #LSM Layer\n",
" self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)\n",
" \n",
" #Dropout Layer\n",
" self.dropout = nn.Dropout(0.3)\n",
" \n",
" #Fully connected Layer\n",
" self.fc = nn.Linear(hidden_dim, output_size)\n",
" \n",
" #sigmoid Layer\n",
" self.sig = nn.Sigmoid()\n",
"\n",
" def forward(self, x, hidden):\n",
" batch_size = x.size(0)\n",
" x = x.long()\n",
" embed = self.embedding(x)\n",
" lstm_out, hidden = self.lstm(embed.type(torch.float32), hidden)\n",
" lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)\n",
" out = self.dropout(lstm_out)\n",
" out = self.fc(out)\n",
" sig_out = self.sig(out)\n",
"\n",
" sig_out = sig_out.view(batch_size, -1)\n",
" sig_out = sig_out[:, -1]\n",
"\n",
" return sig_out, hidden\n",
" \n",
" def init_hidden(self, batch_size):\n",
" weight = next(self.parameters()).data\n",
"\n",
" hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(), \n",
" weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())\n",
"\n",
" \n",
" return hidden"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"vocab_size = embeds.shape[0]\n",
"output_size = 2\n",
"embedding_dim = embeds.shape[1]\n",
"hidden_dim = 256\n",
"n_layers = 1\n",
"\n",
"model = lstmmodel(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)\n",
"\n",
"print(model)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# loss and optimization functions\n",
"lr=0.05\n",
"criterion = nn.BCELoss()\n",
"optimizer = torch.optim.Adam(model.parameters(), lr=lr)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = model.to(device)\n",
"criterion = criterion.to(device)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"epochs = 10\n",
"counter = 0 \n",
"print_every = 50\n",
"clip = 5\n",
"\n",
"model.cuda()\n",
"\n",
"model.train()\n",
"for e in range(epochs):\n",
" tot = []\n",
" h = model.init_hidden(batch_size)\n",
" for inputs, labels in train_dataload:\n",
"\n",
" counter += 1\n",
" \n",
" inputs, labels = inputs.type(torch.float64).cuda(), labels.cuda()\n",
" h = tuple([each.data for each in h])\n",
" \n",
" model.zero_grad()\n",
" \n",
" output, h = model(inputs, h)\n",
" loss = criterion(output.squeeze(), labels.float())\n",
" loss.backward()\n",
" nn.utils.clip_grad_norm(model.parameters(), clip)\n",
" optimizer.step()\n",
" tot.append(loss.item())\n",
" if counter % print_every == 0:\n",
" print(\"Epoch: {}/{}...\".format(e+1, epochs),\n",
" \"Step: {}...\".format(counter),\n",
" \"Loss: {:.6f}...\".format(loss.item()))\n",
"plt.plot(tot)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"test_losses = []\n",
"num_correct = 0\n",
"\n",
"\n",
"h = model.init_hidden(10)\n",
"model.eval()\n",
"\n",
"for inputs, labels in test_dataload:\n",
" h = tuple([each.data for each in h])\n",
" \n",
" inputs, labels = inputs.cuda(), labels.cuda()\n",
" \n",
" output, h = model(inputs, h)\n",
" test_loss = criterion(output.squeeze(), labels.float())\n",
" test_losses.append(test_loss.item())\n",
" pred = torch.round(output.squeeze())\n",
" correct_tensor = pred.eq(labels.float().view_as(pred))\n",
" correct = np.squeeze(correct_tensor.cpu().numpy())\n",
" num_correct += np.sum(correct)\n",
"\n",
"print(\"Test loss: {:.3f}\".format(np.mean(test_losses)))\n",
"\n",
"test_acc = (num_correct)/len(test_dataload.dataset) * 100\n",
"print(\"Test accuracy: {:.3f}\".format(test_acc))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}