Spaces:
Runtime error
Runtime error
import scrapy | |
from scrapy_splash import SplashRequest | |
# The spider below crawls product data from Wine Nara.
class WinenaraSpider(scrapy.Spider):
    """Scrape wine products from the Wine Nara shop listing.

    ``parse`` walks the paginated listing in ``start_urls`` and follows every
    product link to ``parse_winenara``, which yields one dict per product.
    ``parse_vivino_page`` is a callback for a Vivino detail page; it is not
    scheduled by this class and expects the product dict to be passed in
    ``response.meta['data_dict']``.
    """

    name = "winenara"
    start_urls = ["https://www.winenara.com/shop/product/product_lists?sh_category1_cd=10000&sh_category2_cd=10100&sh_category3_cd=&sh_order_by=all&sh_sort_order_by=&sh_filter_code=&sh_rcd="]

    @staticmethod
    def _first_text(selector, query):
        """Return the first match of CSS ``query`` under ``selector``, stripped.

        Returns an empty string when nothing matches, so callers never have to
        deal with ``None``.
        """
        return selector.css(query).get(default="").strip()

    @staticmethod
    def _left_fraction(style):
        """Turn the ``left`` declaration of an inline style into a fraction string.

        ``"width: 15%; left: 42.5%"`` becomes ``"0.425"``. Returns an empty
        string when there is no ``left`` declaration or its value is not a
        number (optionally followed by ``%``).
        """
        for declaration in style.split(";"):
            prop, _, raw_value = declaration.partition(":")
            # Match the property name exactly so "padding-left" is not mistaken for "left".
            if prop.strip().lower() != "left":
                continue
            try:
                # The value carries a trailing "%", which float() cannot parse.
                return str(float(raw_value.strip().rstrip("%")) / 100)
            except ValueError:
                return ""
        return ""

    def parse(self, response):
        """Follow every product link on a listing page, then the next page.

        Yields requests for product detail pages (handled by
        ``parse_winenara``) and, when an ``a[rel="next"]`` link exists, a
        request for the following listing page handled by this method again.
        """
        detail_page_links = response.css(".table_box::attr(href)")
        yield from response.follow_all(detail_page_links, self.parse_winenara)

        next_page = response.css('a[rel="next"]::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)

    def parse_vivino_page(self, response):
        """Add Vivino data to the product dict carried in the request meta.

        Reads ``response.meta['data_dict']``, fills its ``'vivino'`` sub-dict
        (created if missing) with rating, rating count, price, taste features,
        food pairings and breadcrumb info, and yields the enriched dict.

        Raises:
            KeyError: if the request meta has no ``'data_dict'`` entry.
        """

        def extract_features():
            """Map each taste characteristic name to its bar position (0-1, as str)."""
            details = {}
            # Iterate over selectors, not .getall() strings, so nested queries work.
            for section in response.css('.tasteStructure__tasteCharacteristic--jLtsE'):
                key = self._first_text(
                    section, '.tasteStructure__property--CLNl_::text'
                ).lower()
                # Query the bar inside this section; querying the whole response
                # would return the first bar on the page for every characteristic.
                style = section.css(
                    'span.indicatorBar__progress--3aXLX::attr(style)'
                ).get(default="")
                value = self._left_fraction(style)
                if key and value:
                    details[key] = value
            return details

        def extract_info():
            """Map each breadcrumb's ``data-cy`` attribute to its lower-cased text."""
            details = {}
            for section in response.css('.breadCrumbs__link--1TY6b span'):
                key = self._first_text(section, 'a::attr(data-cy)').lower()
                value = self._first_text(section, 'a::text').lower()
                if key and value:
                    details[key] = value
            return details

        data_dict = response.meta['data_dict']
        # parse_winenara does not create this key, so make sure it exists.
        vivino = data_dict.setdefault('vivino', {})
        vivino['rating'] = self._first_text(response, '.vivinoRating_averageValue__uDdPM::text')
        vivino['rating_num'] = self._first_text(response, '.vivinoRating_caption__xL84P::text')
        vivino['price'] = self._first_text(response, '.purchaseAvailability__currentPrice--3mO4u::text')
        vivino['features'] = extract_features()
        vivino['food_pairing'] = response.css('.foodPairing__foodImage--2OYHg::attr(aria-label)').getall()
        vivino['info'] = extract_info()
        yield data_dict

    def parse_winenara(self, response):
        """Scrape one Wine Nara product page into a dict.

        The yielded dict has the keys ``url``, ``price``, ``name``,
        ``en_name``, ``img_url`` (a one-element list), ``features``, ``tag``,
        ``rating`` and ``vivino_link``. Products whose page has no Vivino link
        are skipped: nothing is yielded for them.
        """

        def extract_features():
            """Map each ``dl.details`` title to its highlighted label, lower-cased."""
            details = {}
            for section in response.css('dl.details'):
                key = self._first_text(section, 'dt::text').lower()
                value = section.css('dd span.label[style*="background"]::text').get()
                if key and value:
                    details[key] = value.lower()
            return details

        def extract_img_url():
            """Return the absolute product image URL, or ``None`` if there is none."""
            img_src = response.css(".lozad::attr(data-src)").get(default="")
            # urljoin resolves a relative data-src against the page URL.
            return response.urljoin(img_src) if img_src else None

        data_dict = {
            'url': response.url,
            'price': self._first_text(response, "p ins::text"),
            'name': self._first_text(response, ".prd_name::text"),
            'en_name': self._first_text(response, ".prd_en_name::text"),
            'img_url': [extract_img_url()],
            'features': extract_features(),
            'tag': response.css(".cate_label .label::text").getall(),
            'rating': self._first_text(response, ".info strong::text"),
            'vivino_link': self._first_text(response, "div.box > a::attr(href)"),
        }

        # Only products that link to a Vivino page are emitted.
        if data_dict['vivino_link']:
            yield data_dict