import re
import nltk
import spacy
import unicodedata
import requests
from spacy_syllables import SpacySyllables
from bs4 import BeautifulSoup
from nltk import TweetTokenizer
from spacy.lang.es import Spanish
from spacy.lang.en import English
from nltk.util import ngrams


class TextProcessing(object):
    name = 'Text Processing'
    lang = 'es'

    def __init__(self, lang: str = 'es'):
        # The static methods below read the class attribute, so set it here
        # instead of an instance attribute they would never see.
        TextProcessing.lang = lang
    @staticmethod
    def nlp(text: str) -> list:
        try:
            list_tagger = []
            tp_nlp = TextProcessing.load_spacy(TextProcessing.lang)
            doc = tp_nlp(text.lower())
            print('original_text: {0}'.format(text))
            for token in doc:
                item = {'text': token.text, 'lemma': token.lemma_, 'pos': token.pos_, 'tag': token.tag_,
                        'dep': token.dep_, 'shape': token.shape_, 'is_alpha': token.is_alpha,
                        'is_stop': token.is_stop, 'is_digit': token.is_digit, 'is_punct': token.is_punct,
                        'syllables': token._.syllables}
                list_tagger.append(item)
            return list_tagger
        except Exception as e:
            print('Error nlp: {0}'.format(e))
            return []
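
    # Illustrative output shape (exact tags depend on the installed spaCy model):
    # TextProcessing.nlp('hola mundo') -> [{'text': 'hola', 'lemma': 'hola', 'pos': ..., 'syllables': ['ho', 'la'], ...}, ...]
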
    @staticmethod
    def load_spacy(lang: str) -> object:
        try:
            spacy_model = {'es': 'es_core_news_sm', 'en': 'en_core_web_sm'}
            if not spacy.util.is_package(spacy_model[lang]):
                spacy.cli.download(spacy_model[lang])
            component = spacy.load(spacy_model[lang])
            # Importing SpacySyllables registers the 'syllables' factory;
            # add_pipe is all that is needed to attach it to the pipeline.
            component.add_pipe('syllables', last=True)
            print('- Text Processing: {0}'.format(component.pipe_names))
            return component
        except Exception as e:
            print('Error load spacy: {0}'.format(e))
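
    # Usage sketch: the first call downloads the model, later calls reuse it.
    # nlp_es = TextProcessing.load_spacy('es')
    # print(nlp_es.pipe_names)  # ends with 'syllables'
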
    @staticmethod
    def proper_encoding(text: str) -> str:
        try:
            # Decompose accented characters, then drop the non-ASCII marks.
            text = unicodedata.normalize('NFD', text)
            text = text.encode('ascii', 'ignore')
            return text.decode('utf-8')
        except Exception as e:
            print('Error proper_encoding: {0}'.format(e))
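
    # e.g. TextProcessing.proper_encoding('canción') -> 'cancion'
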
    @staticmethod
    def stopwords(text: str) -> str:
        try:
            nlp = Spanish() if TextProcessing.lang == 'es' else English()
            doc = nlp(text)
            token_list = [token.text for token in doc]
            sentence = []
            for word in token_list:
                lexeme = nlp.vocab[word]
                if not lexeme.is_stop:
                    sentence.append(word)
            return ' '.join(sentence)
        except Exception as e:
            print('Error stopwords: {0}'.format(e))
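
    # e.g. with lang='es': TextProcessing.stopwords('el perro y el gato') -> 'perro gato'
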
    @staticmethod
    def remove_patterns(text: str) -> str:
        try:
            text = re.sub(r'\©|\×|\⇔|\_|\»|\«|\~|\#|\$|\€|\Â|\�|\¬', '', text)
            text = re.sub(r'\,|\;|\:|\!|\¡|\’|\‘|\”|\“|\"|\'|\`', '', text)
            text = re.sub(r'\}|\{|\[|\]|\(|\)|\<|\>|\?|\¿|\°|\|', '', text)
            text = re.sub(r'\/|\-|\+|\*|\=|\^|\%|\&|\$', '', text)
            text = re.sub(r'\b\d+(?:\.\d+)?\s+', '', text)
            return text.lower()
        except Exception as e:
            print('Error remove_patterns: {0}'.format(e))
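
    # e.g. TextProcessing.remove_patterns('Hola, ¿qué tal? 123 adios') -> 'hola qué tal adios'
    # (accents are untouched here; proper_encoding handles those)
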
    @staticmethod
    def transformer(text: str, stopwords: bool = False) -> str:
        try:
            text_out = text.lower()
            # Replace emojis before proper_encoding, whose ASCII fold would drop them.
            text_out = re.sub("[\U0001f000-\U000e007f]", '[EMOJI]', text_out)
            text_out = TextProcessing.proper_encoding(text_out)
            text_out = re.sub(
                r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+'
                r'|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))',
                '[URL]', text_out)
            text_out = re.sub("@([A-Za-z0-9_]{1,40})", '[MENTION]', text_out)
            text_out = re.sub("#([A-Za-z0-9_]{1,40})", '[HASHTAG]', text_out)
            text_out = TextProcessing.remove_patterns(text_out)
            # text_out = TextAnalysis.lemmatization(text_out) if lemmatizer else text_out
            text_out = TextProcessing.stopwords(text_out) if stopwords else text_out
            text_out = re.sub(r'\s+', ' ', text_out).strip()
            return text_out if text_out else None
        except Exception as e:
            print('Error transformer: {0}'.format(e))
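
    # e.g. TextProcessing.transformer('Mira https://example.com @user 😀') -> 'mira url mention emoji'
    # (remove_patterns strips the brackets from the placeholder tokens)
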
    @staticmethod
    def tokenizer(text: str) -> list:
        try:
            text_tokenizer = TweetTokenizer()
            return text_tokenizer.tokenize(text)
        except Exception as e:
            print('Error tokenizer: {0}'.format(e))
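
    # e.g. TextProcessing.tokenizer('@user that was #great :)') -> ['@user', 'that', 'was', '#great', ':)']
    # TweetTokenizer keeps mentions, hashtags and emoticons as single tokens.
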
    @staticmethod
    def make_ngrams(text: str, num: int) -> list:
        try:
            # nltk.word_tokenize needs the 'punkt' tokenizer data
            # (run nltk.download('punkt') once before first use).
            n_grams = ngrams(nltk.word_tokenize(text), num)
            return [' '.join(grams) for grams in n_grams]
        except Exception as e:
            print('Error make_ngrams: {0}'.format(e))
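
    # e.g. TextProcessing.make_ngrams('the cat sat', 2) -> ['the cat', 'cat sat']
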
    @staticmethod
    def get_URL_title(text: str) -> list:
        # The original read a pre-extracted `text.urls` attribute, which a plain
        # string does not have; here the URLs are pulled from the raw text instead.
        result = []
        title_noise = r'[\-\?\:\;\$\%\^\&\*\(\)\|\!\`\'\"\,\<\.\>]'
        url_content = ''
        try:
            urls = re.findall(r'https?://[^\s<>"\']+', text)
            if urls:
                # Fetch each page and keep the normalized <title> text.
                for link in urls:
                    reqs = requests.get(link, timeout=10)
                    soup = BeautifulSoup(reqs.text, 'html.parser')
                    for title in soup.find_all('title'):
                        if title.getText() == 'Google':
                            url_content += 'Null'  # skip the default page
                        elif title.getText() != 'Página no encontrada':
                            # Strip punctuation noise and lowercase the title.
                            url_content += re.sub(title_noise, '', title.getText()).lower()
                        else:
                            url_content += 'Null'  # page not found
                    url_content += '~'
            else:
                url_content += 'Null' + '~'  # no URLs in the text
            result = url_content.split('~')
        except Exception as e:
            print('Error get_URL_title: {0}'.format(e))
        return result
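
    # Network-dependent sketch (actual titles vary by site and over time):
    # TextProcessing.get_URL_title('see https://www.python.org') -> ['welcome to pythonorg', '']
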

if __name__ == '__main__':
    tp_es = TextProcessing(lang='es')
    result_es = tp_es.nlp(
        'Ahora a la gente todo le parece tóxico, más si dices lo que sientes o te molesta…y NO, tóxico es quedarse '
        'callado por miedo a arruinar algo. Hay que aprender a quererse primero.')
    for i in result_es:
        print(i)
    tp_en = TextProcessing(lang='en')
    result_en = tp_en.nlp("The data doesn’t lie: here's what one of our teams learned when they tried a 4-day workweek.")
    for i in result_en:
        print(i)
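
    # A minimal sketch of the remaining helpers; outputs shown in the comments
    # are illustrative and depend on the installed spaCy/NLTK versions.
    sample = 'Mira esto https://example.com @usuario #tema 😀'
    print(TextProcessing.transformer(sample))  # e.g. 'mira esto url mention hashtag emoji'
    print(TextProcessing.tokenizer(sample))  # keeps '@usuario' and '#tema' as single tokens
    print(TextProcessing.make_ngrams('el gato come pescado', 2))  # ['el gato', 'gato come', 'come pescado']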