-
Notifications
You must be signed in to change notification settings - Fork 3
/
DoctoraliaWebCrawler.py
146 lines (106 loc) · 4.89 KB
/
DoctoraliaWebCrawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Author: Fabio Rodrigues Jorge
Email: [email protected]
Description: Web Crawler to extract information from Doctoralia site
"""
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import datetime
import csv
from Doctor import Doctor
# Root URL of the site; prefixed to the specialization index path and to the
# per-specialization listing paths when the crawl URLs are built.
BASE_DOMAIN = "https://www.doctoralia.com.br"
# Limits that control how much of the site is crawled.
# DOCTOR_LIST_THRESHOLD: slice bound applied to the doctor links found on each
# listing page.
DOCTOR_LIST_THRESHOLD = 999
# SPECIALIZATION_LIST_THRESHOLD: slice bound applied to the list of
# specialization links found on the index page.
SPECIALIZATION_LIST_THRESHOLD = 999
# MAX_PAGINATION: passed to has_next_pagination() as the last listing page to
# visit for a specialization.
MAX_PAGINATION = -1 # -1 means unlimited
def init_webdriver_config(impl_delay=30):
    """Create and configure the Selenium Chrome web driver.

    Args:
        impl_delay: Value handed to ``implicitly_wait`` on the new driver
            (Selenium documents this as a number of seconds).

    Returns:
        The configured ``webdriver.Chrome`` instance.
    """
    chrome_options = webdriver.ChromeOptions()
    # NOTE(review): value 2 for this content setting presumably means "block",
    # i.e. notification prompts are suppressed -- confirm against Chrome prefs.
    chrome_options.add_experimental_option(
        "prefs", {"profile.default_content_setting_values.notifications": 2}
    )
    try:
        # ``options`` is the supported keyword; ``chrome_options`` was
        # deprecated in Selenium 3.8 and dropped in later 4.x releases.
        chrome_driver = webdriver.Chrome(options=chrome_options)
    except TypeError:
        # Older Selenium releases only accept the legacy keyword.
        chrome_driver = webdriver.Chrome(chrome_options=chrome_options)
    chrome_driver.implicitly_wait(impl_delay)
    return chrome_driver
def get_soup_page(_driver, url):
    """Navigate to *url* and return the loaded page as a BeautifulSoup tree.

    Args:
        _driver: Selenium web driver used to load the page.
        url: Address to navigate to.

    Returns:
        A ``BeautifulSoup`` object built from the driver's page source with
        the ``html.parser`` backend.
    """
    _driver.get(url)
    # Fixed two-second pause between navigation and reading the page source.
    time.sleep(2)
    # Read from the driver that was passed in rather than from a module-level
    # ``driver`` global, so the helper works with any driver instance.
    return BeautifulSoup(_driver.page_source, 'html.parser')
def has_next_pagination(_soup, current_pagination, max_pagination=-1):
    """Tell whether another listing page should be visited.

    Args:
        _soup: Parsed page exposing ``select``; queried for the "next" item of
            the pagination list.
        current_pagination: Number of the page that was just loaded.
        max_pagination: Last page allowed; ``-1`` disables the limit.

    Returns:
        ``False`` once the page limit is reached; otherwise ``True`` only when
        exactly one ``ul.pagination li.next`` element is present.
    """
    limit_reached = max_pagination != -1 and current_pagination >= max_pagination
    if limit_reached:
        return False
    next_items = _soup.select("ul.pagination li.next")
    return len(next_items) == 1
def create_doctor(_soup):
    """Extract a doctor's details from a profile page and build a ``Doctor``.

    Args:
        _soup: Parsed doctor profile page (BeautifulSoup tree).

    Returns:
        The ``Doctor`` built from the extracted fields. Optional fields that
        are not found on the page (image link, experiences, city, state,
        address, telephone) are set to an empty string.

    Raises:
        AttributeError: If the header name element is missing from the page;
            the name lookup is not guarded.
    """
    header_name = _soup.select_one(
        "div.unified-doctor-header-info div.unified-doctor-header-info__name"
    )
    name = header_name.find(itemprop="name").text

    image_select = _soup.select("div.unified-doctor-header-info a.avatar")
    image_link = image_select[0]["href"] if image_select else ""

    specialization = ", ".join(
        link.text for link in _soup.select("div.unified-doctor-header-info h2 a")
    )

    experiences_label = _soup.find("span", text='Experiência em:')
    if experiences_label is not None:
        # The <li> entries are looked up four ancestors above the label span.
        experience_items = experiences_label.parent.parent.parent.parent.find_all("li")
        experiences = ", ".join(item.text for item in experience_items)
    else:
        experiences = ""

    city = ""
    state = ""
    address = ""
    address_select = _soup.select_one("div.calendar-address")
    if address_select is not None:
        city_select = address_select.select_one("span.city")
        if city_select is not None:
            city = city_select['content']
        state_select = address_select.select_one("span.region")
        if state_select is not None:
            state = state_select['content']
        street_select = address_select.select_one("span.street")
        if street_select is not None:
            address = street_select.text

    # Query the page that was passed in, not a module-level ``soup`` global,
    # so the telephone always belongs to the same profile as the other fields.
    telephone_select = _soup.select_one("div.calendar-address div.modal i.svg-icon__phone")
    if telephone_select is not None:
        telephone = telephone_select.parent.find("b").text.strip()
    else:
        telephone = ""

    _doctor = Doctor(name=name, image_link=image_link, specialization=specialization,
                     experiences=experiences, city=city, state=state, address=address,
                     telephone=telephone)
    print(_doctor)
    return _doctor
# NOTE: the ``__main__`` guard is disabled, so the crawl below runs at module
# level (importing this module starts it).
#
# Flow: load the specialization index, walk every listing page of each
# specialization, visit each doctor profile found there, then dump all
# collected doctors to a timestamped CSV file.
driver = init_webdriver_config(30)
doctors = []
try:
    init_url = "{0}/{1}".format(BASE_DOMAIN, 'especializacoes-medicas')
    soup = get_soup_page(driver, init_url)
    specialization_list = soup.select("div section div h3 div a.text-muted")
    for spe in specialization_list[:SPECIALIZATION_LIST_THRESHOLD]:
        page = 0
        while True:
            page += 1
            next_url = "{0}{1}/{2}".format(BASE_DOMAIN, spe["href"], page)
            soup = get_soup_page(driver, next_url)
            # Both values must be read from the listing page now: ``soup`` is
            # rebound to each doctor profile inside the loop below.
            check_next_pagination = has_next_pagination(soup, page, max_pagination=MAX_PAGINATION)
            doctor_list_per_pagination = soup.select("a.rank-element-name__link")
            # get doctor list per page
            for doctor_page in doctor_list_per_pagination[:DOCTOR_LIST_THRESHOLD]:
                url_doctor = doctor_page['href']
                soup = get_soup_page(driver, url_doctor)
                try:
                    doctor = create_doctor(soup)
                    doctors.append(doctor)
                except Exception as err:
                    # Best effort: a profile that cannot be parsed is reported
                    # and skipped so the crawl can continue.
                    print("Failed to extract doctor due to {0}".format(err))
            # has_next_pagination
            if not check_next_pagination:
                break
    file_name = "{0}_{1}.csv".format('doctoralia_extract', datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"))
    with open(file_name, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(Doctor.CSV_HEADER)
        for doc in doctors:
            writer.writerow(doc.to_csv())
finally:
    # Always shut the browser down, even when the crawl or the CSV export
    # fails part-way, so no Chrome/chromedriver process is left behind.
    driver.quit()
print(">> WebCrawler Finished <<")