-
Notifications
You must be signed in to change notification settings - Fork 0
/
drake.py
153 lines (130 loc) · 7.08 KB
/
drake.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import requests
from bs4 import BeautifulSoup
import time
import json
import os
# Function to scrape product details from a given URL
def scrapeNewBalanceProducts(url, product_list):
    """Scrape every product on a drake.vn category page, following pagination.

    For each product tile the listing-page fields (image, name, URL, SKU,
    prices, discount) are collected, then the product's own detail page is
    fetched for sizes and attribute data. Results are appended in place to
    ``product_list`` as dicts.

    Args:
        url: Starting category-page URL; updated internally as pagination
            links are followed.
        product_list: Mutable list that scraped product dicts are appended to.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15'
    }
    while True:
        # Fetch the listing page. A timeout prevents a stalled connection
        # from hanging the scraper indefinitely (the original had none).
        response = requests.get(url, headers=headers, timeout=30)
        if response.status_code != 200:
            print(f"Failed to retrieve page with status code: {response.status_code}")
            break
        soup = BeautifulSoup(response.content, 'html.parser')
        products = soup.find_all('div', class_='product-layout')
        # Stop if the page contains no product tiles.
        if not products:
            break
        for product in products:
            product_info = {}
            image_tag = product.find('img')
            # .get('src') instead of ['src']: an <img> without src must not
            # crash the whole scrape with a KeyError.
            if image_tag and image_tag.get('src'):
                product_info['image_url'] = image_tag['src']
            name_tag = product.find('h4')
            if name_tag:
                product_info['name'] = name_tag.text.strip()
            product_link = product.find('a', href=True)
            if product_link:
                product_info['product_url'] = product_link['href']
            # SKU and pricing
            sku_tag = product.find('div', class_='sku')
            if sku_tag:
                product_info['sku'] = sku_tag.text.strip()
            price_new_tag = product.find('span', class_='price-new')
            price_old_tag = product.find('span', class_='price-old')
            if price_new_tag:
                product_info['sale_price'] = price_new_tag.text.strip()
            if price_old_tag:
                product_info['drake_price'] = price_old_tag.text.strip()
            discount_tag = product.find('span', class_='label-discount')
            if discount_tag:
                product_info['discount'] = discount_tag.text.strip()
            # BUG FIX: the original read product_info['product_url']
            # unconditionally, raising KeyError for tiles without a link.
            if 'product_url' in product_info:
                _scrape_product_detail(product_info, headers)
            # Append the product information to the list
            product_list.append(product_info)
            # Pause to avoid overwhelming the server
            time.sleep(2)
        # Find and follow the pagination 'next' link (>)
        pagination = soup.find('ul', class_='pagination')
        if not pagination:
            break  # No pagination found, exit the loop
        next_link = pagination.find('a', string='>')
        if next_link and next_link['href']:
            url = next_link['href']  # Update the URL to the next page
        else:
            break  # No more pages, exit the loop


def _scrape_product_detail(product_info, headers):
    """Fetch a product's detail page and add sizes/attributes to ``product_info``.

    Reads ``product_info['product_url']`` (must be present) and, on a 200
    response, fills in 'sizes' plus any attributes found under the
    'Đặc tính sản phẩm' group. Failures to fetch are silently skipped,
    matching the listing scraper's best-effort behavior.
    """
    product_detail_response = requests.get(product_info['product_url'], headers=headers, timeout=30)
    if product_detail_response.status_code != 200:
        return
    product_detail_soup = BeautifulSoup(product_detail_response.content, 'html.parser')
    # Scrape the sizes available
    size_select = product_detail_soup.find('select', id=lambda x: x and 'option' in x)
    if size_select:
        sizes = []
        for option in size_select.find_all('option'):
            size_text = option.text.strip()
            # Skip the placeholder "please choose a size" entry.
            if size_text and 'VUI LÒNG CHỌN SIZE' not in size_text:
                # Replace non-breaking spaces and other special characters with standard spaces
                size_text = size_text.replace('\u00A0', ' ')  # Replace non-breaking space
                size_text = size_text.replace('\u200B', '')   # Remove zero-width space
                sizes.append(size_text)
        product_info['sizes'] = sizes
    # Map Vietnamese attribute labels to the JSON keys we emit.
    label_to_key = {
        "Giới tính": 'gender',
        "Màu sắc": 'color',
        "Phần thân": 'upper_material',
        "Lớp lót": 'lining',
        "Đế giày": 'sole',
        "Tính năng sản phẩm": 'product_features',
    }
    # Scrape additional attributes
    attribute_groups = product_detail_soup.find_all('div', class_='attribute-group')
    for group in attribute_groups:
        h3_tag = group.find('h3')
        # Only the "Đặc tính sản phẩm" (product characteristics) group is wanted.
        if not (h3_tag and 'Đặc tính sản phẩm' in h3_tag.text):
            continue
        attributes = group.find('ul', class_='attribute-list')
        if not attributes:
            continue
        for attribute in attributes.find_all('li'):
            label = attribute.find('label', class_='label')
            data = attribute.find('span', class_='data')
            if label and data:
                label_text = label.get_text(strip=True).replace(':', '')
                data_text = data.get_text(separator="\n", strip=True)
                if label_text and data_text and label_text in label_to_key:
                    product_info[label_to_key[label_text]] = data_text
# Function to scrape all routes and save to a single JSON file
def scrapAllRoutes():
    """Scrape every configured drake.vn category and save results to drake.json.

    Previously saved data (if any) is loaded first so a re-run appends to the
    existing dataset rather than replacing it. All categories are scraped into
    one list, then written out as UTF-8 JSON.
    """
    url_list = [
        "https://drake.vn/new-balance",
        "https://drake.vn/converse",
        "https://drake.vn/palladium",
        "https://drake.vn/sneaker-buzz",
        "https://drake.vn/ncaa",
        "https://drake.vn/k-swiss",
        "https://drake.vn/supra",
        "https://drake.vn/accessoriesapparel",
        "https://drake.vn/vans",
    ]
    # Initialize an empty list to store all products
    product_list = []
    # Load existing data if the file exists. ROBUSTNESS FIX: an empty or
    # corrupt drake.json used to raise and abort before any scraping; now we
    # fall back to a fresh list and continue.
    if os.path.exists('drake.json'):
        try:
            with open('drake.json', 'r', encoding='utf-8') as json_file:
                product_list = json.load(json_file)
        except (json.JSONDecodeError, OSError):
            product_list = []
    # Loop through each URL and scrape products
    for url in url_list:
        scrapeNewBalanceProducts(url, product_list)
    # Save the combined data to a JSON file with utf-8 encoding
    with open('drake.json', 'w', encoding='utf-8') as json_file:
        json.dump(product_list, json_file, indent=4, ensure_ascii=False)
    print("Scraping completed. Data saved to 'drake.json'.")
# Run the scraping function only when executed as a script, so importing
# this module does not trigger a full network scrape as a side effect.
if __name__ == "__main__":
    scrapAllRoutes()