SungBeom's picture
Upload folder using huggingface_hub
4a51346
raw
history blame
4.46 kB
import scrapy
from scrapy_splash import SplashRequest
# The spider below crawls product data from Wine Nara (winenara.com).
class WinenaraSpider(scrapy.Spider):
    """Spider that walks a winenara.com product listing and scrapes each product page.

    ``parse`` follows every product link found on a listing page (plus the
    "next" pagination link) and hands product pages to ``parse_winenara``.
    ``parse_vivino_page`` is a callback for enriching an item with data from
    a Vivino page; no request targeting it is issued from this class.
    """

    name = "winenara"
    start_urls = ["https://www.winenara.com/shop/product/product_lists?sh_category1_cd=10000&sh_category2_cd=10100&sh_category3_cd=&sh_order_by=all&sh_sort_order_by=&sh_filter_code=&sh_rcd="]

    def parse(self, response):
        """Follow all product detail links on a listing page, then the next page.

        Yields:
            Requests for each ``.table_box`` link (handled by
            ``parse_winenara``) and, if present, one request for the
            ``a[rel="next"]`` link (handled by this method again).
        """
        detail_page_links = response.css(".table_box::attr(href)")
        yield from response.follow_all(detail_page_links, self.parse_winenara)

        next_page = response.css('a[rel="next"]::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)

    def parse_vivino_page(self, response):
        """Add Vivino details to the item carried in ``response.meta['data_dict']``.

        The item is expected under the ``'data_dict'`` meta key (a missing key
        raises ``KeyError``). Results are stored under ``item['vivino']``,
        which is created when the item does not have it yet.

        Yields:
            The enriched item dict.
        """
        # NOTE(review): the CSS class names below carry hash-like suffixes and
        # are presumably generated by the site's build - confirm they still match.

        def extract_with_css(query):
            """Return the first match for ``query`` stripped, or "" if none."""
            return response.css(query).get(default="").strip()

        def left_fraction(style):
            """Return the ``left`` value of an inline style divided by 100.

            Returns ``None`` when no declaration mentioning ``left`` exists or
            its value is not numeric. A trailing ``%`` is tolerated.
            """
            for declaration in style.split(';'):
                prop, sep, value = declaration.partition(':')
                if not sep or 'left' not in prop:
                    continue
                try:
                    return float(value.strip().rstrip('%')) / 100
                except ValueError:
                    return None
            return None

        def extract_features():
            """Map each taste characteristic label to its indicator position.

            Values are stringified fractions (``left`` offset / 100).
            """
            details = {}
            # Iterate over selectors (not ``.getall()`` strings) so that nested
            # ``.css()`` queries work and stay scoped to the current row.
            for section in response.css('.tasteStructure__tasteCharacteristic--jLtsE'):
                current_key = (
                    section.css('.tasteStructure__property--CLNl_::text')
                    .get(default="")
                    .lower()
                    .strip()
                )
                # Read the bar of *this* row; querying ``response`` would give
                # every characteristic the first bar on the page.
                style = section.css(
                    'span.indicatorBar__progress--3aXLX::attr(style)'
                ).get(default="")
                fraction = left_fraction(style)
                if current_key and fraction is not None:
                    details[current_key] = str(fraction)
            return details

        def extract_info():
            """Map each breadcrumb link's ``data-cy`` attribute to its lowercased text."""
            details = {}
            for section in response.css('.breadCrumbs__link--1TY6b span'):
                current_key = section.css('a::attr(data-cy)').get(default="").lower().strip()
                current_value = section.css('a::text').get(default="").lower().strip()
                if current_key and current_value:
                    details[current_key] = current_value
            return details

        data_dict = response.meta['data_dict']
        # ``parse_winenara`` does not create the 'vivino' sub-dict, so make
        # sure it exists before filling it in.
        vivino = data_dict.setdefault('vivino', {})
        vivino['rating'] = extract_with_css('.vivinoRating_averageValue__uDdPM::text')
        vivino['rating_num'] = extract_with_css('.vivinoRating_caption__xL84P::text')
        vivino['price'] = extract_with_css('.purchaseAvailability__currentPrice--3mO4u::text')
        vivino['features'] = extract_features()
        vivino['food_pairing'] = response.css(
            '.foodPairing__foodImage--2OYHg::attr(aria-label)'
        ).getall()
        vivino['info'] = extract_info()
        yield data_dict

    def parse_winenara(self, response):
        """Scrape one winenara.com product page into an item dict.

        Yields:
            A dict with keys ``url``, ``price``, ``name``, ``en_name``,
            ``img_url``, ``features``, ``tag``, ``rating`` and ``vivino_link``.
            Nothing is yielded when the page has no Vivino link.
        """

        def extract_with_css(query):
            """Return the first match for ``query`` stripped, or "" if none."""
            return response.css(query).get(default="").strip()

        def extract_features():
            """Map each ``dl.details`` title to its highlighted label, lowercased."""
            details = {}
            for section in response.css('dl.details'):
                # default="" keeps a <dl> without <dt> text from raising.
                current_key = section.css('dt::text').get(default="").lower().strip()
                current_value = section.css(
                    'dd span.label[style*="background"]::text'
                ).get()
                if current_key and current_value:
                    details[current_key] = current_value.lower()
            return details

        def extract_tag():
            """Return the text of every category label on the page."""
            return response.css(".cate_label .label::text").getall()

        def extract_img_url():
            """Return the absolute image URL, or None when no image is found."""
            img_src = response.css(".lozad::attr(data-src)").get(default="")
            # urljoin resolves a relative src against the page URL.
            return response.urljoin(img_src) if img_src else None

        data_dict = {
            'url': response.url,
            'price': extract_with_css("p ins::text"),
            'name': extract_with_css(".prd_name::text"),
            'en_name': extract_with_css(".prd_en_name::text"),
            'img_url': [extract_img_url()],
            'features': extract_features(),
            'tag': extract_tag(),
            'rating': extract_with_css(".info strong::text"),
            'vivino_link': extract_with_css("div.box > a::attr(href)"),
        }

        # Products without a Vivino link are dropped: yield nothing for them.
        if data_dict['vivino_link']:
            yield data_dict