NNTI Project #1

Open · wants to merge 6 commits into base: main
1,254 changes: 1,254 additions & 0 deletions CNN-task-3.ipynb

Large diffs are not rendered by default.

367 changes: 367 additions & 0 deletions LSTM-classifier-Task-2.ipynb
@@ -0,0 +1,367 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import time\n",
"import torch\n",
"import pickle\n",
"import datetime\n",
"import numpy as np\n",
"import warnings\n",
"\n",
"import pandas as pd\n",
"from pandas.core.common import SettingWithCopyWarning\n",
"\n",
"warnings.simplefilter(action=\"ignore\", category=SettingWithCopyWarning)\n",
"\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"import torch.nn.functional as F\n",
"\n",
"from matplotlib import pyplot as plt\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
"\n",
"if device == 'cuda':\n",
" print(\"=================================\")\n",
" print(\"GPU found\")\n",
" print(\"Using GPU at cuda:\",torch.cuda.current_device())\n",
" print(\"=================================\")\n",
" print(\" \")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model1 = torch.load(\"~/NNTI-WS2021-NLP-Project/Project_files/Hindi/epoch_300.pt\")\n",
"\n",
"w1 = model1[\"w1.weight\"].T\n",
"w2 = model1[\"w2.weight\"]\n",
"\n",
"cleandata = pd.read_pickle(\"~/NNTI-WS2021-NLP-Project/Project_files/Hindi/hindi_corpus_cleaned.pkl\")\n",
"word_index = pd.read_pickle(\"~/NNTI-WS2021-NLP-Project/Project_files/Hindi/word_index.pkl\")\n",
"index_word = pd.read_pickle(\"~/NNTI-WS2021-NLP-Project/Project_files/Hindi/index_word.pkl\")\n",
"V = pd.read_pickle(\"~/NNTI-WS2021-NLP-Project/Project_files/Hindi/vocab.pkl\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv('https://raw.githubusercontent.com/SouravDutta91/NNTI-WS2021-NLP-Project/main/data/hindi_hatespeech.tsv',sep='\\t')\n",
"text = data[['text','task_1']]\n",
"text['text'] = cleandata['text'].apply(lambda x: x.split())\n",
"text['label'] = text['task_1'].apply(lambda x: 1 if x == 'HOF' else 0)\n",
"max_len = text.text.str.len().max()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"def tag_count(input):\n",
" hcount,ncount = 0,0\n",
" for tag in input:\n",
" if tag == 1:\n",
" hcount+=1\n",
" else:\n",
" ncount+=1\n",
" return hcount,ncount\n",
"\n",
"word_index['<pad>'] = len(V)\n",
"index_word[len(V)] = '<pad>'\n",
"\n",
"\n",
"def get_word_embedding(input):\n",
" index = word_index[input]\n",
" return w1[index]\n",
"\n",
"def encode(corpus):\n",
" sent_idx = []\n",
" i = 0\n",
" for sentence in corpus:\n",
" sentence += ['<pad>'] * (max_len - len(sentence))\n",
" idx = [word_index[word] for word in sentence]\n",
" sent_idx.append(idx)\n",
" i+= 1\n",
" return np.array(sent_idx)\n",
"\n",
"\n",
"from itertools import islice\n",
"def take(n, iterable):\n",
" \"Return first n items of the iterable as a list\"\n",
" return list(islice(iterable, n))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"_ , emb_size = w1.shape\n",
"\n",
"def matrix_embeddings():\n",
" embedding_matrix = np.random.uniform(-0.25, 0.25, (len(word_index), emb_size))\n",
" embedding_matrix[word_index['<pad>']] = np.zeros((emb_size,))\n",
"\n",
" for word,i in take(len(V),word_index.items()):\n",
" temp = get_word_embedding(word)\n",
" if temp is not None:\n",
" embedding_matrix[i] = temp.cpu()\n",
"\n",
" return embedding_matrix\n",
"\n",
"encoded_text = encode(text.text)\n",
"labels = np.array(text['label'])\n",
"embeds = torch.tensor(matrix_embeddings())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame({'encoded':list(encoded_text), 'label':labels})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"xtrain,xtest,ytrain,ytest = train_test_split(encoded_text,labels,shuffle=True,test_size=0.1,random_state=45)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"batch_size = 10\n",
"\n",
"from torch.utils.data import (TensorDataset, DataLoader, RandomSampler,SequentialSampler)\n",
"\n",
"def get_dataloader(traindata,trainlabels,testdata,testlabels,batchsize):\n",
" \n",
" traindata = torch.tensor(traindata).float()\n",
" trainlabels = torch.tensor(trainlabels)\n",
" testdata = torch.tensor(testdata).float()\n",
" testlabels = torch.tensor(testlabels)\n",
"\n",
" test = TensorDataset(testdata,testlabels)\n",
" test_dataload = DataLoader(test,sampler=RandomSampler(test),batch_size=batchsize,drop_last=True)\n",
" train = TensorDataset(traindata,trainlabels)\n",
" train_dataload = DataLoader(train,sampler=RandomSampler(train),batch_size=batchsize,drop_last=True)\n",
"\n",
" return train_dataload,test_dataload\n",
"\n",
"train_dataload,test_dataload = get_dataloader(xtrain, ytrain,xtest,ytest,batch_size)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class lstmmodel(nn.Module):\n",
" def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):\n",
" super().__init__()\n",
" \n",
" self.output_size = output_size\n",
" self.n_layers = n_layers\n",
" self.hidden_dim = hidden_dim\n",
" \n",
" #Pretrained Embeddings\n",
" self.embedding = nn.Embedding.from_pretrained(embeds.type(torch.float32),freeze=True)\n",
" #LSM Layer\n",
" self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)\n",
" \n",
" #Dropout Layer\n",
" self.dropout = nn.Dropout(0.3)\n",
" \n",
" #Fully connected Layer\n",
" self.fc = nn.Linear(hidden_dim, output_size)\n",
" \n",
" #sigmoid Layer\n",
" self.sig = nn.Sigmoid()\n",
"\n",
" def forward(self, x, hidden):\n",
" batch_size = x.size(0)\n",
" x = x.long()\n",
" embed = self.embedding(x)\n",
" lstm_out, hidden = self.lstm(embed.type(torch.float32), hidden)\n",
" lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)\n",
" out = self.dropout(lstm_out)\n",
" out = self.fc(out)\n",
" sig_out = self.sig(out)\n",
"\n",
" sig_out = sig_out.view(batch_size, -1)\n",
" sig_out = sig_out[:, -1]\n",
"\n",
" return sig_out, hidden\n",
" \n",
" def init_hidden(self, batch_size):\n",
" weight = next(self.parameters()).data\n",
"\n",
" hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(), \n",
" weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())\n",
"\n",
" \n",
" return hidden"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"vocab_size = embeds.shape[0]\n",
"output_size = 2\n",
"embedding_dim = embeds.shape[1]\n",
"hidden_dim = 256\n",
"n_layers = 1\n",
"\n",
"model = lstmmodel(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)\n",
"\n",
"print(model)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# loss and optimization functions\n",
"lr=0.05\n",
"criterion = nn.BCELoss()\n",
"optimizer = torch.optim.Adam(model.parameters(), lr=lr)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = model.to(device)\n",
"criterion = criterion.to(device)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"epochs = 10\n",
"counter = 0 \n",
"print_every = 50\n",
"clip = 5\n",
"\n",
"model.cuda()\n",
"\n",
"model.train()\n",
"for e in range(epochs):\n",
" tot = []\n",
" h = model.init_hidden(batch_size)\n",
" for inputs, labels in train_dataload:\n",
"\n",
" counter += 1\n",
" \n",
" inputs, labels = inputs.type(torch.float64).cuda(), labels.cuda()\n",
" h = tuple([each.data for each in h])\n",
" \n",
" model.zero_grad()\n",
" \n",
" output, h = model(inputs, h)\n",
" loss = criterion(output.squeeze(), labels.float())\n",
" loss.backward()\n",
" nn.utils.clip_grad_norm(model.parameters(), clip)\n",
" optimizer.step()\n",
" tot.append(loss.item())\n",
" if counter % print_every == 0:\n",
" print(\"Epoch: {}/{}...\".format(e+1, epochs),\n",
" \"Step: {}...\".format(counter),\n",
" \"Loss: {:.6f}...\".format(loss.item()))\n",
"plt.plot(tot)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"test_losses = []\n",
"num_correct = 0\n",
"\n",
"\n",
"h = model.init_hidden(10)\n",
"model.eval()\n",
"\n",
"for inputs, labels in test_dataload:\n",
" h = tuple([each.data for each in h])\n",
" \n",
" inputs, labels = inputs.cuda(), labels.cuda()\n",
" \n",
" output, h = model(inputs, h)\n",
" test_loss = criterion(output.squeeze(), labels.float())\n",
" test_losses.append(test_loss.item())\n",
" pred = torch.round(output.squeeze())\n",
" correct_tensor = pred.eq(labels.float().view_as(pred))\n",
" correct = np.squeeze(correct_tensor.cpu().numpy())\n",
" num_correct += np.sum(correct)\n",
"\n",
"print(\"Test loss: {:.3f}\".format(np.mean(test_losses)))\n",
"\n",
"test_acc = (num_correct)/len(test_dataload.dataset) * 100\n",
"print(\"Test accuracy: {:.3f}\".format(test_acc))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}