-
Notifications
You must be signed in to change notification settings - Fork 0
/
writingcsv.py
51 lines (43 loc) · 1.53 KB
/
writingcsv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import json
import string
from urllib.request import urlopen
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk
import mysql.connector
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from collections import Counter
# Input sheet mapping IT content ids to the API URLs that serve their JSON.
CSV_PATH = "IT contentIDs and URLs - ITIdsUrls.csv"
df = pd.read_csv(CSV_PATH)
def jsonload(url):
    """Fetch an article's JSON document from *url* and return its cleaned
    description text.

    The payload is expected to carry HTML under
    ``data['data']['detail']['description']``.  ``<blockquote>`` elements
    (embedded tweets/quotes) are removed, then the text of every ``<p>``
    whose text is longer than 20 characters is joined into one
    whitespace-normalised string.

    Parameters
    ----------
    url : str
        URL returning the article JSON.

    Returns
    -------
    str
        Cleaned description; empty string when no paragraph passes the
        length filter.
    """
    with urlopen(url) as response:
        payload = json.loads(response.read())

    soup = BeautifulSoup(payload['data']['detail']['description'],
                         "html.parser")

    # Embedded quotes/tweets are boilerplate, not article body text.
    for quote in soup(['blockquote']):
        quote.decompose()

    # Very short <p> tags are usually captions, bylines or share links;
    # keep only substantive paragraphs.
    parts = [p.text.strip() for p in soup.find_all('p') if len(p.text) > 20]

    # Join paragraphs with a space (the original concatenated them
    # directly, gluing the last word of one paragraph to the first word
    # of the next), then collapse any run of whitespace to one space.
    desc = re.sub(r'\s+', ' ', " ".join(parts))
    return desc.strip()
# Batch pipeline (currently disabled): fetch every description and dump
# them alongside their content ids to preprocessed_desc.csv.
# docs = [jsonload(u) for u in df["API URL"]]
# out = pd.DataFrame(list(zip(list(df['content_id']), docs)),
#                    columns=['content_id', 'description'])
# out.to_csv(r'preprocessed_desc.csv', index=False)

# Smoke test: show the cleaned description for the first article only.
first_url = df["API URL"][0]
print(jsonload(first_url))