File size: 4,459 Bytes
4a51346
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import scrapy
from scrapy_splash import SplashRequest

# The spider below crawls product data from Wine Nara (winenara.com).
class WinenaraSpider(scrapy.Spider):
    """Crawl winenara.com product listings and their detail pages.

    ``parse`` walks the paginated listing from ``start_urls`` and sends every
    product detail link to ``parse_winenara``, which yields one dict per
    product that has a Vivino link.

    ``parse_vivino_page`` is a callback for Vivino pages. It expects the
    partially built item in ``response.meta['data_dict']``. Nothing in this
    class currently schedules a request with that callback.
    """

    name = "winenara"

    start_urls = ["https://www.winenara.com/shop/product/product_lists?sh_category1_cd=10000&sh_category2_cd=10100&sh_category3_cd=&sh_order_by=all&sh_sort_order_by=&sh_filter_code=&sh_rcd="]

    @staticmethod
    def _css_first(response, query):
        """Return the first match of *query*, stripped, or ``""`` if none."""
        return response.css(query).get(default="").strip()

    @staticmethod
    def _left_fraction(style):
        """Convert the CSS ``left`` value in *style* to a fraction string.

        The value is read as a number with an optional trailing ``%`` and
        divided by 100 (``"width: 15%; left: 42.5%"`` -> ``"0.425"``).

        Returns:
            The fraction as a string, or ``None`` when *style* has no
            ``left`` declaration or its value is not numeric.
        """
        for declaration in style.split(';'):
            prop, _, value = declaration.partition(':')
            # Compare the property name exactly so that e.g. ``margin-left``
            # is not mistaken for ``left``.
            if prop.strip().lower() != 'left':
                continue
            try:
                return str(float(value.strip().rstrip('%')) / 100)
            except ValueError:
                return None
        return None

    def parse(self, response):
        """Follow every product link on a listing page, then the next page."""
        detail_page_links = response.css(".table_box::attr(href)")
        yield from response.follow_all(detail_page_links, self.parse_winenara)

        next_page = response.css('a[rel="next"]::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)

    def parse_vivino_page(self, response):
        """Add Vivino page details to the item carried in ``response.meta``.

        Reads the item from ``response.meta['data_dict']``, fills its
        ``'vivino'`` sub-dict (created when absent) and yields the item.

        Raises:
            KeyError: if ``response.meta`` has no ``'data_dict'`` entry.
        """
        def extract_features():
            """Map each taste characteristic label to its bar position."""
            details = {}
            # Iterate the SelectorList itself: ``.getall()`` would return
            # plain strings, which have no ``.css`` method.
            for section in response.css('.tasteStructure__tasteCharacteristic--jLtsE'):
                current_key = section.css(
                    '.tasteStructure__property--CLNl_::text'
                ).get(default="").lower().strip()
                # Query within the section so each characteristic reads its
                # own indicator bar rather than the first one on the page.
                style = section.css(
                    'span.indicatorBar__progress--3aXLX::attr(style)'
                ).get(default="")
                current_value = self._left_fraction(style)
                if current_key and current_value:
                    details[current_key] = current_value
            return details

        def extract_info():
            """Map each breadcrumb link's ``data-cy`` attribute to its text."""
            details = {}
            for section in response.css('.breadCrumbs__link--1TY6b span'):
                current_key = section.css('a::attr(data-cy)').get(default="").lower().strip()
                current_value = section.css('a::text').get(default="").lower().strip()
                if current_key and current_value:
                    details[current_key] = current_value
            return details

        data_dict = response.meta['data_dict']
        # ``parse_winenara`` does not create the 'vivino' sub-dict, so make
        # sure it exists before writing into it.
        vivino = data_dict.setdefault('vivino', {})

        vivino['rating'] = self._css_first(response, '.vivinoRating_averageValue__uDdPM::text')
        vivino['rating_num'] = self._css_first(response, '.vivinoRating_caption__xL84P::text')
        vivino['price'] = self._css_first(response, '.purchaseAvailability__currentPrice--3mO4u::text')
        vivino['features'] = extract_features()
        vivino['food_pairing'] = response.css('.foodPairing__foodImage--2OYHg::attr(aria-label)').getall()
        vivino['info'] = extract_info()

        yield data_dict

    def parse_winenara(self, response):
        """Build one item dict from a Wine Nara product detail page.

        Yields the item only when the page has a Vivino link
        (``div.box > a``); pages without one produce nothing.
        """
        def extract_features():
            """Map each ``dl.details`` title to its highlighted label text."""
            details = {}
            for section in response.css('dl.details'):
                # ``default=""`` keeps a section without a <dt> text from
                # raising on ``.lower()``; such sections are skipped below.
                current_key = section.css('dt::text').get(default="").lower().strip()
                current_value = section.css('dd span.label[style*="background"]::text').get()
                if current_key and current_value:
                    details[current_key] = current_value.lower()
            return details

        def extract_img_url():
            """Return the absolute image URL, or ``None`` if there is none."""
            img_src = response.css(".lozad::attr(data-src)").get(default="")
            # ``urljoin`` resolves a relative ``data-src`` against the page URL.
            return response.urljoin(img_src) if img_src else None

        data_dict = {
            'url': response.url,
            'price': self._css_first(response, "p ins::text"),
            'name': self._css_first(response, ".prd_name::text"),
            'en_name': self._css_first(response, ".prd_en_name::text"),
            'img_url': [extract_img_url()],
            'features': extract_features(),
            'tag': response.css(".cate_label .label::text").getall(),
            'rating': self._css_first(response, ".info strong::text"),
            'vivino_link': self._css_first(response, "div.box > a::attr(href)"),
        }

        # Skip products without a Vivino link instead of yielding ``None``.
        if not data_dict['vivino_link']:
            return
        yield data_dict