import puppeteer from 'puppeteer';
import minimist from 'minimist';
import fs from 'fs-extra';

// NOTE(review): BASE_URL (and the fetch URL in setShipTo) are empty strings in
// this file — presumably the real endpoints were stripped; confirm before use.
const BASE_URL = '';
const SEARCH_PATH = '/search/wines?q=';

// Values written to `result.status` in the output file.
const STATUS_FULL = 'FULL_DATA';
const STATUS_ERROR_RESPONSE = 'RESPONSE_ERROR';
const STATUS_ERROR_SHIP_TO = 'SHIP_TO_ERROR';
const STATUS_ERROR_SHIP_TO_CONFIRM = 'SHIP_TO_CONFIRM_ERROR';
const STATUS_ERROR_EXCEPTION = 'SOME_EXCEPTION';

// Seconds added to the back-off for every consecutive HTTP 429 response.
const PAUSE_MULTIPLIER = 15;

// A regular desktop user agent: with the default headless one the site
// returns an empty result (it seems to detect headless Chrome).
const USER_AGENT =
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36';

// Resource types that are allowed through request interception.
const ALLOWED_RESOURCE_TYPES = ['document', 'xhr', 'fetch', 'script'];

/**
 * Resolves after the given number of milliseconds.
 * Used instead of `page.waitForTimeout`, which is not available in every
 * Puppeteer version.
 *
 * @param {number} ms - delay in milliseconds
 * @returns {Promise<void>}
 */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Sends a PUT request from inside the page to change the "Ship To"
 * country and state, and checks that the response echoes the same values.
 *
 * @param {object} page - Puppeteer page to evaluate in
 * @param {string} countryCode - country code to set
 * @param {string} stateCode - state code to set
 * @returns {Promise<boolean>} true when the response status is 200 and the
 *   returned `ship_to` codes match the requested ones (case-insensitive)
 */
const setShipTo = (page, countryCode, stateCode) =>
  page.evaluate(
    // Runs in the browser context: must not reference anything from this module.
    async (country, state) => {
      const response = await fetch('', {
        headers: {
          'content-type': 'application/json',
          'x-csrf-token': document.querySelector('[name="csrf-token"]').content,
        },
        body: JSON.stringify({
          country_code: country,
          state_code: state,
        }),
        method: 'PUT',
      });
      if (response.status !== 200) {
        return false;
      }
      const payload = await response.json();
      return (
        payload.ship_to.country_code.toLowerCase() === country.toLowerCase() &&
        payload.ship_to.state_code.toLowerCase() === state.toLowerCase()
      );
    },
    countryCode,
    stateCode,
  );

/**
 * Checks whether the loaded page already uses the given country and state by
 * comparing them (case-insensitive) with `window.__PRELOADED_COUNTRY_CODE__`
 * and `window.__PRELOADED_STATE_CODE__`.
 *
 * @param {object} page - Puppeteer page to evaluate in
 * @param {string} countryCode - expected country code
 * @param {string} stateCode - expected state code
 * @returns {Promise<boolean>} true when both codes match
 */
const isShipTo = (page, countryCode, stateCode) =>
  page.evaluate(
    (country, state) =>
      country.toLowerCase() === window.__PRELOADED_COUNTRY_CODE__.toLowerCase() &&
      state.toLowerCase() === window.__PRELOADED_STATE_CODE__.toLowerCase(),
    countryCode,
    stateCode,
  );

/**
 * Collects the wine cards from the current page.
 *
 * This function is passed to `page.evaluate`, so it runs in the browser
 * context and has to be fully self-contained (no references to module scope).
 *
 * @returns {Array<object>} one object per card with the keys `name`, `link`,
 *   `thumb`, `country`, `region`, `average_rating`, `ratings` and `price`;
 *   `thumb`, `average_rating`, `ratings` and `price` are undefined when the
 *   corresponding element is missing from the card
 */
const collectItems = () => {
  const CARDS_SELECTOR = '.card.card-lg';
  const NAME_SELECTOR = '.wine-card__name';
  const COUNTRY_SELECTOR = '.wine-card__region [data-item-type="country"]';
  const REGION_SELECTOR = '.wine-card__region .link-color-alt-grey';
  const AVERAGE_RATING_SELECTOR = '.average__number';
  const RATINGS_SELECTOR = '.average__stars .text-micro';
  const RATING_REPLACEMENT = 'ratings';
  const LINK_SELECTOR = 'a';
  const THUMB_SELECTOR = 'figure';
  const THUMB_REGEX = /"(.*)"/;
  const PRICE_SELECTOR = '.wine-price-value';

  // Strips everything except digits, commas and dots, then turns the first
  // comma into a decimal point before parsing.
  // NOTE(review): only the first comma is replaced, so a value with a
  // thousands separator such as "1,234.56" parses as 1.234 — confirm the
  // price/rating formats the site actually renders.
  const numerize = (stringNumber) => {
    const cleaned = stringNumber.replace(/[^0-9,.]+/g, '').replace(',', '.');
    return parseFloat(cleaned);
  };

  // Text content of an optional child element, or undefined when it is absent.
  const optionalText = (card, selector) => {
    const node = card.querySelector(selector);
    return node ? node.textContent : undefined;
  };

  // The thumbnail URL is taken from the quoted part of the figure's
  // background-image; undefined when there is no figure or no quoted URL
  // (previously a missing match threw and aborted the whole page).
  const thumbOf = (card) => {
    const figure = card.querySelector(THUMB_SELECTOR);
    if (!figure) {
      return undefined;
    }
    const match = figure.style.backgroundImage.match(THUMB_REGEX);
    return match ? 'https:' + match[1] : undefined;
  };

  return [...document.querySelectorAll(CARDS_SELECTOR)].map((card) => {
    const averageText = optionalText(card, AVERAGE_RATING_SELECTOR);
    const ratingsText = optionalText(card, RATINGS_SELECTOR);
    const priceText = optionalText(card, PRICE_SELECTOR);

    return {
      // name, link, country and region are accessed without a null check, as
      // in the original: a card that lacks one of these elements makes the
      // whole collection throw.
      name: card.querySelector(NAME_SELECTOR).textContent.trim(),
      link: card.querySelector(LINK_SELECTOR).href,
      thumb: thumbOf(card),
      country: card.querySelector(COUNTRY_SELECTOR).textContent.trim(),
      region: card.querySelector(REGION_SELECTOR).textContent.trim(),
      average_rating: averageText === undefined ? undefined : numerize(averageText.trim()),
      ratings:
        ratingsText === undefined
          ? undefined
          : Number(ratingsText.replace(RATING_REPLACEMENT, '').trim()),
      price: priceText === undefined ? undefined : numerize(priceText.trim()),
    };
  });
};

/**
 * Decides whether a collected item passes the user-supplied filters.
 * A filter whose bound is falsy (undefined, 0, '') is not applied.
 *
 * @param {object} item - item produced by `collectItems`
 * @param {object} filters - `minPrice`, `maxPrice`, `noPriceIncluded`,
 *   `minRatings`, `maxRatings`, `minAverage`, `maxAverage`
 * @returns {boolean} true when the item should be kept
 */
const passesFilters = (
  item,
  { minPrice, maxPrice, noPriceIncluded, minRatings, maxRatings, minAverage, maxAverage },
) => {
  if (minPrice) {
    if (item.price) {
      if (item.price < minPrice) return false;
    } else if (!noPriceIncluded) {
      // Items without a (truthy) price are dropped by a minimum-price filter
      // unless noPriceIncluded is set. The original condition relied on
      // `undefined < minPrice`, which is always false, so the flag never had
      // any effect and unpriced items were always kept.
      return false;
    }
  }
  // As in the original, the remaining comparisons are false for missing
  // values, so items lacking a price/ratings/average pass these filters.
  if (maxPrice && item.price > maxPrice) return false;
  if (minRatings && item.ratings < minRatings) return false;
  if (maxRatings && item.ratings > maxRatings) return false;
  if (minAverage && item.average_rating < minAverage) return false;
  if (maxAverage && item.average_rating > maxAverage) return false;
  return true;
};

/**
 * Searches wines by name, page by page, filters the collected items and
 * writes the result as JSON to `./output/<name>.json`.
 *
 * The output object always has a `vinos` array; `status` is set to one of the
 * STATUS_* values, and on a non-OK HTTP response `http_status` and
 * `page_index` are added. On an exception `message` holds the error message.
 *
 * @param {string} name - search term; also used as the output file name
 * @param {string} [countryCode='US'] - "Ship To" country code
 * @param {string} [stateCode=''] - "Ship To" state code; for the US an empty
 *   value falls back to 'CA'
 * @param {number} [minPrice] - minimum price filter
 * @param {number} [maxPrice] - maximum price filter
 * @param {boolean} [noPriceIncluded] - keep items without a price when
 *   `minPrice` is set
 * @param {number} [minRatings] - minimum number of ratings
 * @param {number} [maxRatings] - maximum number of ratings
 * @param {number} [minAverage] - minimum average rating
 * @param {number} [maxAverage] - maximum average rating
 * @returns {Promise<void>} resolves after the file is written and the browser
 *   is closed; rejects if launching the browser, writing the file or closing
 *   the browser fails
 */
const run = async (
  name,
  countryCode = 'US',
  stateCode = '',
  minPrice,
  maxPrice,
  noPriceIncluded,
  minRatings,
  maxRatings,
  minAverage,
  maxAverage,
) => {
  // Set default state for the US
  const shipToState = countryCode.toLowerCase() === 'us' && stateCode === '' ? 'CA' : stateCode;

  const result = { vinos: [] };

  const browser = await puppeteer.launch({
    headless: true,
    defaultViewport: { width: 1920, height: 1040 },
    devtools: false,
    args: ['--start-maximized'],
  });

  // Everything after the launch is inside try/finally so that the browser is
  // closed and the output file is written even when page setup fails.
  try {
    const page = await browser.newPage();

    await page.setUserAgent(USER_AGENT);

    // To save bandwidth block all types of requests except
    // "document", "xhr", "fetch" and "script"
    await page.setRequestInterception(true);
    page.on('request', (request) => {
      if (ALLOWED_RESOURCE_TYPES.includes(request.resourceType())) {
        request.continue();
      } else {
        request.abort();
      }
    });

    // 0 disables the navigation timeout
    page.setDefaultNavigationTimeout(0);

    // load home page
    await page.goto(BASE_URL, { waitUntil: 'networkidle2' });

    // check the country and state, and change them if they differ
    if (!(await isShipTo(page, countryCode, shipToState))) {
      const wasChanged = await setShipTo(page, countryCode, shipToState);
      if (!wasChanged) {
        console.log('"Ship To" was not changed!');
        result.status = STATUS_ERROR_SHIP_TO;
        return;
      }

      // reload and verify that the change is reflected by the page
      await page.goto(BASE_URL, { waitUntil: 'networkidle2' });
      if (!(await isShipTo(page, countryCode, shipToState))) {
        console.log('"Ship To" changing can not be confirmed!');
        result.status = STATUS_ERROR_SHIP_TO_CONFIRM;
        return;
      }
    }

    // The search term is URL-encoded so that characters such as '&', '#' or
    // '+' cannot break the query string.
    const searchUrl = `${BASE_URL}${SEARCH_PATH}${encodeURIComponent(name)}`;

    let index = 1;
    let pause = 0;
    let hasNext = true;
    while (hasNext) {
      hasNext = false;
      const response = await page.goto(`${searchUrl}&start=${index}`, {
        waitUntil: 'networkidle2',
      });

      if (response.ok()) {
        pause = 0;
        const pageItems = await page.evaluate(collectItems);
        if (pageItems.length) {
          console.log('Results were collected from the page:', index);
          result.vinos.push(...pageItems);
          index++;
          hasNext = true;
        } else {
          // no more data
          result.status = STATUS_FULL;
        }
      } else if (response.status() === 429) {
        // Too many requests: wait longer on every consecutive 429 and retry
        // the same page.
        pause++;
        await sleep(pause * PAUSE_MULTIPLIER * 1000);
        console.log(`Waited for ${pause * PAUSE_MULTIPLIER} seconds on the page ${index}`);
        hasNext = true;
      } else {
        // return some error info
        result.http_status = response.status(); // http status
        result.page_index = index; // index of the problem page
        result.status = STATUS_ERROR_RESPONSE;
      }
    }

    // Filter data
    const filters = {
      minPrice,
      maxPrice,
      noPriceIncluded,
      minRatings,
      maxRatings,
      minAverage,
      maxAverage,
    };
    result.vinos = result.vinos.filter((item) => passesFilters(item, filters));
  } catch (error) {
    result.status = STATUS_ERROR_EXCEPTION;
    // Store the message text: an Error object itself serializes to "{}".
    result.message = error instanceof Error ? error.message : String(error);
    console.log('Exception:', error);
  } finally {
    console.log('Finish!');
    try {
      // output results to the file; outputFile creates ./output when missing
      await fs.outputFile(`./output/${name}.json`, JSON.stringify(result, null, 2));
    } finally {
      await browser.close();
    }
  }
};

const args = minimist(process.argv.slice(2));
console.log(args);

const {
  name,
  country,
  state,
  minPrice,
  maxPrice,
  noPriceIncluded,
  minRatings,
  maxRatings,
  minAverage,
  maxAverage,
} = args;

// minimist yields `true` for a bare "--name" and undefined when it is absent.
if (name === undefined || name === true || name === '') {
  console.error('Missing required argument: --name <search term>');
  process.exitCode = 1;
} else {
  run(
    name,
    country,
    state,
    minPrice,
    maxPrice,
    noPriceIncluded,
    minRatings,
    maxRatings,
    minAverage,
    maxAverage,
  ).catch((error) => {
    // Failures outside run's own try/catch (browser launch, writing the
    // output file, closing the browser) end up here.
    console.error('Fatal error:', error);
    process.exitCode = 1;
  });
}