import re
import nltk
import spacy
import unicodedata
import requests
from spacy_syllables import SpacySyllables
from bs4 import BeautifulSoup
from nltk import TweetTokenizer
from spacy.lang.es import Spanish
from spacy.lang.en import English
from nltk.util import ngrams


class TextProcessing(object):
    name = 'Text Processing'
    lang = 'es'

    def __init__(self, lang: str = 'es'):
        # The static methods below read the class attribute, so set it here
        # instead of an instance attribute they would never see.
        TextProcessing.lang = lang
    @staticmethod
    def nlp(text: str) -> list:
        try:
            list_tagger = []
            tp_nlp = TextProcessing.load_spacy(TextProcessing.lang)
            doc = tp_nlp(text.lower())
            print('original_text: {0}'.format(text))
            for token in doc:
                item = {'text': token.text, 'lemma': token.lemma_, 'pos': token.pos_, 'tag': token.tag_,
                        'dep': token.dep_, 'shape': token.shape_, 'is_alpha': token.is_alpha,
                        'is_stop': token.is_stop, 'is_digit': token.is_digit, 'is_punct': token.is_punct,
                        'syllables': token._.syllables}
                list_tagger.append(item)
            return list_tagger
        except Exception as e:
            print('Error nlp: {0}'.format(e))
            return []
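
    # Illustrative output shape (exact tags depend on the installed spaCy model):
    # TextProcessing.nlp('hola mundo') -> [{'text': 'hola', 'lemma': 'hola', 'pos': ..., 'syllables': ['ho', 'la'], ...}, ...]
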
    @staticmethod
    def load_spacy(lang: str) -> object:
        try:
            spacy_model = {'es': 'es_core_news_sm', 'en': 'en_core_web_sm'}
            if not spacy.util.is_package(spacy_model[lang]):
                spacy.cli.download(spacy_model[lang])
            component = spacy.load(spacy_model[lang])
            # Importing SpacySyllables registers the 'syllables' factory;
            # add_pipe is all that is needed to attach it to the pipeline.
            component.add_pipe('syllables', last=True)
            print('- Text Processing: {0}'.format(component.pipe_names))
            return component
        except Exception as e:
            print('Error load spacy: {0}'.format(e))
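
    # Usage sketch: the first call downloads the model, later calls reuse it.
    # nlp_es = TextProcessing.load_spacy('es')
    # print(nlp_es.pipe_names)  # ends with 'syllables'
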
    @staticmethod
    def proper_encoding(text: str) -> str:
        try:
            # Decompose accented characters, then drop the non-ASCII marks.
            text = unicodedata.normalize('NFD', text)
            text = text.encode('ascii', 'ignore')
            return text.decode('utf-8')
        except Exception as e:
            print('Error proper_encoding: {0}'.format(e))
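
    # e.g. TextProcessing.proper_encoding('canción') -> 'cancion'
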
    @staticmethod
    def stopwords(text: str) -> str:
        try:
            nlp = Spanish() if TextProcessing.lang == 'es' else English()
            doc = nlp(text)
            token_list = [token.text for token in doc]
            sentence = []
            for word in token_list:
                lexeme = nlp.vocab[word]
                if not lexeme.is_stop:
                    sentence.append(word)
            return ' '.join(sentence)
        except Exception as e:
            print('Error stopwords: {0}'.format(e))
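
    # e.g. with lang='es': TextProcessing.stopwords('el perro y el gato') -> 'perro gato'
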
    @staticmethod
    def remove_patterns(text: str) -> str:
        try:
            text = re.sub(r'\©|\×|\⇔|\_|\»|\«|\~|\#|\$|\€|\Â|\�|\¬', '', text)
            text = re.sub(r'\,|\;|\:|\!|\¡|\’|\‘|\”|\“|\"|\'|\`', '', text)
            text = re.sub(r'\}|\{|\[|\]|\(|\)|\<|\>|\?|\¿|\°|\|', '', text)
            text = re.sub(r'\/|\-|\+|\*|\=|\^|\%|\&|\$', '', text)
            text = re.sub(r'\b\d+(?:\.\d+)?\s+', '', text)
            return text.lower()
        except Exception as e:
            print('Error remove_patterns: {0}'.format(e))
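
    # e.g. TextProcessing.remove_patterns('Hola, ¿qué tal? 123 adios') -> 'hola qué tal adios'
    # (accents are untouched here; proper_encoding handles those)
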
    @staticmethod
    def transformer(text: str, stopwords: bool = False) -> str:
        try:
            text_out = text.lower()
            # Replace emojis before proper_encoding, whose ASCII fold would drop them.
            text_out = re.sub("[\U0001f000-\U000e007f]", '[EMOJI]', text_out)
            text_out = TextProcessing.proper_encoding(text_out)
            text_out = re.sub(
                r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+'
                r'|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))',
                '[URL]', text_out)
            text_out = re.sub("@([A-Za-z0-9_]{1,40})", '[MENTION]', text_out)
            text_out = re.sub("#([A-Za-z0-9_]{1,40})", '[HASHTAG]', text_out)
            text_out = TextProcessing.remove_patterns(text_out)
            # text_out = TextAnalysis.lemmatization(text_out) if lemmatizer else text_out
            text_out = TextProcessing.stopwords(text_out) if stopwords else text_out
            text_out = re.sub(r'\s+', ' ', text_out).strip()
            return text_out if text_out else None
        except Exception as e:
            print('Error transformer: {0}'.format(e))
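
    # e.g. TextProcessing.transformer('Mira https://example.com @user 😀') -> 'mira url mention emoji'
    # (remove_patterns strips the brackets from the placeholder tokens)
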
    @staticmethod
    def tokenizer(text: str) -> list:
        try:
            text_tokenizer = TweetTokenizer()
            return text_tokenizer.tokenize(text)
        except Exception as e:
            print('Error tokenizer: {0}'.format(e))
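
    # e.g. TextProcessing.tokenizer('@user that was #great :)') -> ['@user', 'that', 'was', '#great', ':)']
    # TweetTokenizer keeps mentions, hashtags and emoticons as single tokens.
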
    @staticmethod
    def make_ngrams(text: str, num: int) -> list:
        try:
            # nltk.word_tokenize needs the 'punkt' tokenizer data
            # (run nltk.download('punkt') once before first use).
            n_grams = ngrams(nltk.word_tokenize(text), num)
            return [' '.join(grams) for grams in n_grams]
        except Exception as e:
            print('Error make_ngrams: {0}'.format(e))
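
    # e.g. TextProcessing.make_ngrams('the cat sat', 2) -> ['the cat', 'cat sat']
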
    @staticmethod
    def get_URL_title(text: str) -> list:
        # The original read a pre-extracted `text.urls` attribute, which a plain
        # string does not have; here the URLs are pulled from the raw text instead.
        result = []
        title_noise = r'[\-\?\:\;\$\%\^\&\*\(\)\|\!\`\'\"\,\<\.\>]'
        url_content = ''
        try:
            urls = re.findall(r'https?://[^\s<>"\']+', text)
            if urls:
                # Fetch each page and keep the normalized <title> text.
                for link in urls:
                    reqs = requests.get(link, timeout=10)
                    soup = BeautifulSoup(reqs.text, 'html.parser')
                    for title in soup.find_all('title'):
                        if title.getText() == 'Google':
                            url_content += 'Null'  # skip the default page
                        elif title.getText() != 'Página no encontrada':
                            # Strip punctuation noise and lowercase the title.
                            url_content += re.sub(title_noise, '', title.getText()).lower()
                        else:
                            url_content += 'Null'  # page not found
                    url_content += '~'
            else:
                url_content += 'Null' + '~'  # no URLs in the text
            result = url_content.split('~')
        except Exception as e:
            print('Error get_URL_title: {0}'.format(e))
        return result
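
    # Network-dependent sketch (actual titles vary by site and over time):
    # TextProcessing.get_URL_title('see https://www.python.org') -> ['welcome to pythonorg', '']
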

if __name__ == '__main__':
    tp_es = TextProcessing(lang='es')
    result_es = tp_es.nlp(
        'Ahora a la gente todo le parece tóxico, más si dices lo que sientes o te molesta…y NO, tóxico es quedarse '
        'callado por miedo a arruinar algo. Hay que aprender a quererse primero.')
    for i in result_es:
        print(i)
    tp_en = TextProcessing(lang='en')
    result_en = tp_en.nlp("The data doesn’t lie: here's what one of our teams learned when they tried a 4-day workweek.")
    for i in result_en:
        print(i)
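
    # A minimal sketch of the remaining helpers; outputs shown in the comments
    # are illustrative and depend on the installed spaCy/NLTK versions.
    sample = 'Mira esto https://example.com @usuario #tema 😀'
    print(TextProcessing.transformer(sample))  # e.g. 'mira esto url mention hashtag emoji'
    print(TextProcessing.tokenizer(sample))  # keeps '@usuario' and '#tema' as single tokens
    print(TextProcessing.make_ngrams('el gato come pescado', 2))  # ['el gato', 'gato come', 'come pescado']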