from bs4 import BeautifulSoup
import requests
import csv
import time
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse, urljoin

# Site root used to turn relative links found on the listing pages into absolute URLs.
main_url = 'http://loveread.ec/'
# First listing page to scrape; the ``p`` query parameter is the page number.
base_url = 'http://loveread.ec/index_book.php?id_genre=1&p=85'


def increment_page(url):
    """Return ``url`` with its ``p`` query parameter increased by one.

    Args:
        url: Listing URL whose query string may contain a numeric ``p`` parameter.

    Returns:
        The URL pointing at the next page, or ``url`` unchanged when it has no
        (non-empty) ``p`` parameter.

    Raises:
        ValueError: If the ``p`` parameter is present but is not an integer.
    """
    parsed_url = urlparse(url)
    # keep_blank_values: otherwise parameters such as ``q=`` would be silently
    # dropped when the query string is rebuilt.
    query_params = parse_qs(parsed_url.query, keep_blank_values=True)
    page_values = [value for value in query_params.get('p', []) if value]
    if not page_values:
        return url  # no page parameter: nothing to increment
    next_page = int(page_values[0]) + 1
    query_params['p'] = [str(next_page)]
    new_query_string = urlencode(query_params, doseq=True)
    return urlunparse(parsed_url._replace(query=new_query_string))


def _parse_book(info_block, annotation_block):
    """Build one CSV row from a book's info row and its annotation row.

    Returns:
        ``[page_url, image_url, author, title, annotation]``, or ``None`` when
        the info row lacks the title, author, cover image or book link.
    """
    title_tag = info_block.find('a', title=True)
    author_tag = info_block.find('a', href=lambda x: x and 'biography-author' in x)
    image_tag = info_block.find('img', class_='margin-right_8')
    link_tag = info_block.find('a', href=lambda x: x and 'view_global.php?' in x)
    if any(tag is None for tag in (title_tag, author_tag, image_tag, link_tag)):
        return None
    image_src = image_tag.get('src')
    if not image_src:
        return None

    # A missing annotation paragraph is not a reason to drop the book.
    annotation_tag = annotation_block.find('p')
    annotation = annotation_tag.text.strip() if annotation_tag is not None else ''

    # urljoin copes with relative, root-relative and absolute links alike.
    page_url = urljoin(main_url, link_tag['href'])
    image_url = urljoin(main_url, image_src)
    return [page_url, image_url, author_tag.text.strip(), title_tag['title'], annotation]


def parcing(num_books, output_csv='books.csv', start_url=None,
            pause_every=10, pause_seconds=10, timeout=30):
    """Scrape book listings page by page and write them to a CSV file.

    Starting at ``start_url`` (``base_url`` by default), each listing page is
    downloaded, its ``tr.td_center_color`` rows are read in (info, annotation)
    pairs, and one CSV row per book is written. Scraping stops once
    ``num_books`` rows are written, when a page contains no books, or when
    there is no next page to move to.

    Args:
        num_books: Maximum number of books to write.
        output_csv: Path of the CSV file to create (overwritten if it exists).
        start_url: First listing page; defaults to the module-level ``base_url``.
        pause_every: Pause and print progress after every this many books;
            a falsy value disables both.
        pause_seconds: Length of each pause, in seconds.
        timeout: Timeout in seconds passed to ``requests.get``.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status. Rows written before the failure stay in the file.
    """
    count = 0
    current_url = base_url if start_url is None else start_url
    headers = {
        "Accept": "image/avif,image/webp,image/png,image/svg+xml,image/*;q=0.8,*/*;q=0.5",
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0"
    }
    with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['page_url', 'image_url', 'author', 'title', 'annotation'])
        while count < num_books:
            response = requests.get(current_url, headers=headers, timeout=timeout)
            # Fail loudly on 4xx/5xx instead of parsing an error page as an empty listing.
            response.raise_for_status()
            # NOTE(review): response.text relies on the charset requests derives
            # from the HTTP headers - confirm the site declares it correctly.
            soup = BeautifulSoup(response.text, 'lxml')

            book_blocks = soup.find_all('tr', class_='td_center_color')
            if len(book_blocks) < 2:
                # No complete (info, annotation) pair: the listing has run out.
                print(f'Книги закончились, спарсил {count} книг')
                break

            # Rows are consumed in pairs; stopping at len - 1 ignores a trailing
            # unpaired row instead of raising IndexError.
            for i in range(0, len(book_blocks) - 1, 2):
                if count >= num_books:
                    break
                row = _parse_book(book_blocks[i], book_blocks[i + 1])
                if row is None:
                    continue  # malformed entry: skip it rather than abort the run
                writer.writerow(row)
                count += 1
                if pause_every and count % pause_every == 0:
                    # Throttle requests, but do not wait after the final book.
                    if count < num_books:
                        time.sleep(pause_seconds)
                    print(f'Парсинг в процессе, спарсил {count} книг')

            next_url = increment_page(current_url)
            if next_url == current_url:
                # No page parameter to advance: looping would re-scrape the same page.
                break
            current_url = next_url
    print('Парсинг окончен')


parcing(1000, 'books_1000.csv')