import random

import nltk
from nltk.corpus import movie_reviews, stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
# Stop words are filtered out in clean_text below
sw = set(stopwords.words("english"))
def clean_text(t):
    # Tokenize, keep alphabetic tokens, drop stop words, and lemmatize
    t = nltk.word_tokenize(t.lower(), language='english')
    c_t = []
    # Drop tokens containing special characters; for now this only targets English text
    for i in t:
        if i.isalpha():
            c_t.append(i)
    t = []
    # The lemmatizer maps different inflections (and some spelling variants)
    # of a flavor word onto a single base form
    lemmatizer = WordNetLemmatizer()
    for i in c_t:
        if i not in sw:  # stop word filtering (a TODO in the original)
            t.append(lemmatizer.lemmatize(i))
    return " ".join(t)
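# Example (illustrative): clean_text("The movie's plot was GREAT!!") drops the
# punctuation and stop words and returns roughly "movie plot great"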
# Adapted from the sentdex tutorial to save a few lines of code
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
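# Each entry pairs a review's token list with its label, e.g.
# (['plot', ':', 'two', 'teen', 'couples', ...], 'neg')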
print("loaded training documents")
# Shuffle the data before it is converted to features
random.shuffle(documents)
x_train = []
y_train = []
# Convert the training labels to 1 (positive) and 0 (negative)
for words, category in documents:
    y_train.append(0 if category == "neg" else 1)
    x_train.append(clean_text(" ".join(words)))
print("prepped docs")
# Vectorize the training data and train the classifier
vectorizer = CountVectorizer(analyzer="word", max_features=5000)
x_train = vectorizer.fit_transform(x_train)
# toarray() densifies the matrix; MultinomialNB would also accept it sparse
x_train = x_train.toarray()
classy = MultinomialNB()
classy.fit(x_train, y_train)
print("trained")
reddit = []
# clean2.txt holds a portion of the scraped reddit comments; they are cleaned
# again here as a precaution
with open("clean2.txt") as f:
    for line in f:
        reddit.append(clean_text(line.lower()))
print("loaded reddit docs")
# t_reddit keeps the cleaned comment text for the flavor analysis below
t_reddit = reddit
# Use transform, not fit_transform: reusing the vocabulary learned from the
# training reviews keeps the features aligned with the trained classifier
reddit = vectorizer.transform(reddit)
reddit = reddit.toarray()
result = classy.predict(reddit)
print("Predicted reddit docs")
# The flavor list file; its path was left blank in the source
flavors = open("")
flavor_list = []
for i in flavors:
    i = clean_text(i)
    flavor_list.append(i)
# flavor_results maps each flavor to [positive_count, negative_count]
flavor_results = {}
# Compare the scraped flavors against the classified reddit comments
for i in range(len(t_reddit)):
    for j in flavor_list:
        if j in t_reddit[i]:
            if j not in flavor_results:
                flavor_results[j] = [0, 0]
            if result[i] == 0:
                flavor_results[j][1] += 1
            else:
                flavor_results[j][0] += 1
print(flavor_results)
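# Illustrative summary (an addition, not in the original): rank flavors by
# their share of positive mentions
ranked = sorted(flavor_results.items(),
                key=lambda kv: kv[1][0] / (kv[1][0] + kv[1][1]), reverse=True)
for flavor, (pos, neg) in ranked:
    print(f"{flavor}: {pos} positive / {neg} negative")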