明白了!以下是一个适用于爬取 图集谷(tujigu) 妹子图集的 Python 爬虫示例(只爬取某一页),并预留了接口,方便你后续二次开发全站抓取或分类筛选。


🐍 一、基础环境准备

pip install requests beautifulsoup4 fake-useragent tqdm

📷 二、按页面抓取图集(示例:第 1 页)

import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from tqdm import tqdm

# Site root; relative gallery hrefs from listing pages are joined onto this.
BASE_URL = "https://www.tujigu.com"
START_PAGE = 1  # Listing page number to crawl; change this to fetch a different page.
SAVE_DIR = "图集谷"  # Download root; one subdirectory per gallery is created inside it.

# A random desktop User-Agent per run, to look less like an automated client.
headers = {
    "User-Agent": UserAgent().random
}

def fetch_html(url):
    """Fetch *url* and return its decoded HTML text.

    Raises requests.HTTPError on a non-2xx response so callers can tell a
    bad page apart from an empty one; a 10-second timeout prevents the
    crawler from hanging forever on a dead connection.
    """
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()
    # The site serves a legacy Chinese encoding. 'gbk' is a superset of
    # gb2312, so it decodes everything gb2312 can plus the characters
    # gb2312 would reject or garble.
    resp.encoding = 'gbk'
    return resp.text

def parse_gallery_urls(html):
    """Extract absolute gallery-page URLs from a listing page's HTML.

    Returns a list of URLs, one per gallery tile found under the
    ``.hezi`` container; anchors without an ``href`` are skipped.
    """
    soup = BeautifulSoup(html, "html.parser")
    urls = []
    for link in soup.select(".hezi > ul > li > a"):
        href = link.get("href")
        if href:
            # urljoin handles relative ("/a/1/"), protocol-relative and
            # already-absolute hrefs alike, unlike naive concatenation
            # which would produce "https://...https://..." for the latter.
            urls.append(urljoin(BASE_URL, href))
    return urls

def parse_image_urls(gallery_url):
    """Fetch one gallery page and return ``(title, image_urls)``.

    Raises ValueError when the page has no <h1> element, which usually
    means the URL is not a gallery page (or the site's markup changed) —
    clearer than the AttributeError ``soup.h1.text`` would raise.
    """
    html = fetch_html(gallery_url)
    soup = BeautifulSoup(html, "html.parser")
    if soup.h1 is None:
        raise ValueError(f"no <h1> title found at {gallery_url}")
    # Replace every character that is illegal in file/directory names
    # (Windows is the strictest) so the title is safe as a folder name;
    # the original only handled '/'.
    title = re.sub(r'[\\/:*?"<>|]', '-', soup.h1.text.strip())
    images = soup.select(".content > a > img")
    img_urls = [img['src'] for img in images if 'src' in img.attrs]
    return title, img_urls

def save_images(title, img_urls):
    """Download every URL in *img_urls* into ``SAVE_DIR/<title>/``.

    Already-existing files are skipped so an interrupted run can be
    resumed. Each failure is reported per-URL and never aborts the batch.
    """
    folder = os.path.join(SAVE_DIR, title)
    os.makedirs(folder, exist_ok=True)
    for url in tqdm(img_urls, desc=title):
        filename = os.path.join(folder, url.split('/')[-1])
        if os.path.exists(filename):
            # Resume support: don't re-download finished files.
            continue
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()
            # Write only after a fully successful request: the original
            # opened the file first, so a failed download left behind a
            # truncated/empty file that masked the error on later runs.
            with open(filename, 'wb') as f:
                f.write(resp.content)
        except Exception as e:
            print("下载失败:", url, e)

def main():
    """Crawl one listing page and download every gallery it links to.

    Individual gallery failures are reported and skipped so one broken
    page cannot stop the whole run.
    """
    page_url = f"{BASE_URL}/Home/Article/show/typeid/2/page/{START_PAGE}.html"
    print("正在抓取页面:", page_url)
    listing_html = fetch_html(page_url)

    for link in parse_gallery_urls(listing_html):
        try:
            gallery_title, image_urls = parse_image_urls(link)
            save_images(gallery_title, image_urls)
        except Exception as e:
            print("出错:", e)

if __name__ == "__main__":
    main()

🧩 三、支持扩展(你可以实现):

  • ✅ 加入关键词或妹子名字筛选;
  • ✅ 多线程/协程加速下载;
  • ✅ 抓取图集页数与大图分页(多页图集);
  • ✅ 下载封面 + 全文元数据(如发布日期、分类等);
  • ✅ 自动跳页抓取全站 page/1~N
  • ✅ 自动生成 HTML 本地图库浏览页面。