Python 爬虫入门【十五】：煎蛋网（Jiandan）图片抓取教程

下面给出一套合规、稳健、可直接运行的入门级爬虫教程（以抓取网页中的图片为例）。内容包含：注意事项（合规与礼节）、工作流程、示例代码（同步与并发两个版本）、常见问题及如何保存文件。示例代码尽量通用（不依赖对某一站点的特定 DOM 结构），在实际使用前请先检查目标站点的 robots.txt 与使用条款并尊重版权与隐私。

一、重要注意事项（必须读）

合法与伦理：先确认目标页面允许抓取。不要绕过登录、验证码或付费墙。不要抓取未成年人相关或违法/侵犯隐私的内容。
尊重 robots.txt：/robots.txt 可能限制抓取路径。
识别内容与年龄限制：若目标包含成人内容，请确保你在法律允许的地区且用于合法用途。
礼貌抓取：设置合理的 User-Agent、限速（例如每秒一两个请求或更慢），使用重试与回退策略，避免短时间内刷爆服务器。
异常处理：避免因一张图片失败导致程序崩溃；做好重试、超时、断点续传（可选）。
不要公开/传播侵权内容：抓取仅用于学习或已获授权的用途。

二、抓取工作流程（通用）

请求目标页面（带合适 header，如 User-Agent）。
解析 HTML（使用 BeautifulSoup / lxml / parsel）。
从 HTML 中提取图片 URL（常见位置：<img src="">、data-original、CSS 背景图）。
规范化 URL（补全相对路径、处理 // 协议相对地址）。
下载图片并保存到本地（使用合适文件名，避免重名）。
加入限速、重试、日志与异常处理。

三、同步示例（requests + BeautifulSoup）

保存为 jiandan_images_sync.py，在命令行运行：python jiandan_images_sync.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
同步示例：适合小量图片抓取
依赖：requests, beautifulsoup4, lxml（代码中 BeautifulSoup 使用 "lxml" 解析器）
pip install requests beautifulsoup4 lxml
"""

import os
import time
import logging
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup

# Tunables shared by every request made by this script.
SLEEP_BETWEEN_REQUESTS = 1.0  # seconds to pause after each image download (polite crawling)
RETRY = 3  # number of attempts fetch() makes per URL
TIMEOUT = 15  # value handed to requests.get(timeout=...)
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; ImageCrawler/1.0; +https://example.com/bot)"}

# Timestamped, level-tagged log lines at INFO and above.
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=logging.INFO)

def fetch(url):
    """GET ``url`` with the shared headers/timeout, retrying on failure.

    Makes up to ``RETRY`` attempts. A non-2xx/3xx status counts as a failure
    because ``raise_for_status()`` is called on every response.

    Returns:
        The ``requests.Response`` of the first successful attempt, or ``None``
        when every attempt failed. Never raises for request errors.
    """
    for attempt in range(RETRY):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
            resp.raise_for_status()
            return resp
        except Exception as e:
            logging.warning("fetch failed %s (attempt %d/%d): %s", url, attempt+1, RETRY, e)
            # Linear back-off (1s, 2s, ...) between attempts only: sleeping
            # after the final attempt would just delay returning None.
            if attempt < RETRY - 1:
                time.sleep(1 + attempt)
    return None

def parse_image_urls(page_url, html):
    """Collect image URLs from every <img> tag in ``html``.

    For each tag the attributes ``data-original``, ``data-src``, ``data-lazy``
    and ``src`` are tried in that order (lazy-loading attributes first); the
    first non-empty value that is not a ``data:`` URI wins. Each value is
    resolved against ``page_url``.

    Returns:
        A sorted list of de-duplicated absolute URLs.
    """
    lookup_order = ("data-original", "data-src", "data-lazy", "src")
    document = BeautifulSoup(html, "lxml")
    found = set()
    for tag in document.find_all("img"):
        values = (tag.get(attr_name) for attr_name in lookup_order)
        # Inline data URIs are skipped so a later attribute can still match.
        chosen = next((v for v in values if v and not v.startswith("data:")), None)
        if chosen is not None:
            found.add(urljoin(page_url, chosen))
    # Only <img> tags are inspected; CSS background-image is not covered.
    return sorted(found)

def make_filename_from_url(img_url, out_dir):
    """Return a path inside ``out_dir`` that does not exist yet for ``img_url``.

    The file name is the last segment of the URL path; when that is empty the
    host name (dots replaced by underscores) is used instead. A name without
    any dot gets ``.jpg`` appended. If the resulting path already exists,
    ``_1``, ``_2``, ... is inserted before the extension until it is free.
    """
    url_parts = urlparse(img_url)
    filename = os.path.basename(url_parts.path) or url_parts.netloc.replace('.', '_')
    if '.' not in filename:
        filename = filename + '.jpg'

    candidate = os.path.join(out_dir, filename)
    stem, suffix = os.path.splitext(candidate)
    serial = 0
    while os.path.exists(candidate):
        serial += 1
        candidate = f"{stem}_{serial}{suffix}"
    return candidate

def download_image(img_url, out_dir):
    """Fetch ``img_url`` and write its body to a fresh file in ``out_dir``.

    Returns:
        ``True`` when the file was written, ``False`` when the download or
        the write failed (both cases are logged, nothing is raised).
    """
    response = fetch(img_url)
    if not response:
        logging.error("Failed to download image: %s", img_url)
        return False

    target = make_filename_from_url(img_url, out_dir)
    try:
        with open(target, "wb") as fh:
            fh.write(response.content)
    except Exception as err:
        logging.error("Write failed %s: %s", target, err)
        return False
    else:
        logging.info("Saved: %s", target)
        return True

def crawl_page(page_url, out_dir):
    """Download every image found on ``page_url`` into ``out_dir``.

    The page is fetched once; if that fails an error is logged and nothing
    else happens. Otherwise ``out_dir`` is created if needed and the images
    are downloaded one at a time, pausing ``SLEEP_BETWEEN_REQUESTS`` seconds
    after each one.
    """
    page = fetch(page_url)
    if page:
        links = parse_image_urls(page_url, page.text)
        logging.info("Found %d image urls on %s", len(links), page_url)
        os.makedirs(out_dir, exist_ok=True)
        for link in links:
            download_image(link, out_dir)
            # Throttle so the target server is not hammered.
            time.sleep(SLEEP_BETWEEN_REQUESTS)
    else:
        logging.error("Failed to fetch page: %s", page_url)

if __name__ == "__main__":
    # Script entry point: list the pages to crawl here.
    out_dir = "images"
    pages = ["https://jandan.net/ooxx"]  # sample page; adjust to the real target
    for p in pages:
        crawl_page(p, out_dir)

说明：

parse_image_urls 尝试读取 data-original、data-src、src 等属性，适应延迟加载场景。
SLEEP_BETWEEN_REQUESTS 控制速度。
对文件命名做了唯一性保护。

四、并发示例（aiohttp + asyncio）——适合大量图片，但对目标站点压力更大，需更谨慎

保存为 jiandan_images_async.py，依赖：aiohttp、aiofiles、beautifulsoup4、lxml、async-timeout（导入名为 async_timeout）。
安装：pip install aiohttp aiofiles beautifulsoup4 lxml async-timeout

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import asyncio
import aiohttp
import aiofiles
import async_timeout
import os
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import logging
import time

# Tunables shared by every request made by this script.
RETRY = 3  # attempts per URL
TIMEOUT = 15  # value handed to async_timeout.timeout(...)
CONCURRENCY = 8  # size of the semaphore that caps parallel image downloads
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; AsyncImageCrawler/1.0)",
}

# Timestamped, level-tagged log lines at INFO and above.
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=logging.INFO)

async def fetch_text(session, url):
    """GET ``url`` through ``session`` and return the decoded body.

    Makes up to ``RETRY`` attempts, each bounded by ``TIMEOUT`` via
    ``async_timeout``. A non-success HTTP status counts as a failure because
    ``raise_for_status()`` is called on every response.

    Returns:
        The response text of the first successful attempt, or ``None`` when
        every attempt failed.
    """
    for attempt in range(RETRY):
        try:
            # Must be `async with`: the synchronous `with` form was deprecated
            # in async-timeout 4.0 and is not supported by current releases.
            async with async_timeout.timeout(TIMEOUT):
                async with session.get(url, headers=HEADERS) as resp:
                    resp.raise_for_status()
                    return await resp.text()
        except Exception as e:
            logging.warning("fetch_text failed %s attempt %d: %s", url, attempt+1, e)
            # Back off between attempts only; sleeping after the last one
            # would just delay returning None.
            if attempt < RETRY - 1:
                await asyncio.sleep(1 + attempt)
    return None

def parse_image_urls(page_url, html):
    """Collect image URLs from every <img> tag in ``html``.

    For each tag the attributes ``data-original``, ``data-src``, ``data-lazy``
    and ``src`` are tried in that order; the first non-empty value that is
    not a ``data:`` URI is resolved against ``page_url`` and kept.

    Returns:
        A sorted list of de-duplicated absolute URLs.
    """
    document = BeautifulSoup(html, "lxml")
    collected = set()
    for tag in document.find_all("img"):
        for attr_name in ("data-original", "data-src", "data-lazy", "src"):
            value = tag.get(attr_name)
            if not value:
                continue
            if value.startswith("data:"):
                # Inline data URI: fall through to the next attribute.
                continue
            collected.add(urljoin(page_url, value))
            break
    return sorted(collected)

def make_filename_from_url(img_url, out_dir):
    """Return a path inside ``out_dir`` that does not exist yet for ``img_url``.

    The file name is the last segment of the URL path, falling back to the
    host name (dots replaced by underscores) when that segment is empty. A
    name without any dot gets ``.jpg`` appended. If the path already exists,
    ``_1``, ``_2``, ... is inserted before the extension until it is free.
    """
    pieces = urlparse(img_url)
    filename = os.path.basename(pieces.path)
    if not filename:
        filename = pieces.netloc.replace('.', '_')
    if '.' not in filename:
        filename = filename + '.jpg'

    candidate = os.path.join(out_dir, filename)
    stem, suffix = os.path.splitext(candidate)
    serial = 0
    while os.path.exists(candidate):
        serial += 1
        candidate = f"{stem}_{serial}{suffix}"
    return candidate

async def download_image(session, img_url, out_dir, sem):
    """Download ``img_url`` and save it under ``out_dir``, limited by ``sem``.

    The semaphore is held for the whole call, including retries, so at most
    as many downloads run at once as the semaphore allows. Up to ``RETRY``
    attempts are made; the HTTP transfer of each is bounded by ``TIMEOUT``.

    Returns:
        ``True`` once the file has been written, ``False`` after all attempts
        failed. Failures are logged, not raised.
    """
    async with sem:
        for attempt in range(RETRY):
            reserved = None  # path of the placeholder file this attempt created
            try:
                # Must be `async with`: the synchronous `with` form was
                # deprecated in async-timeout 4.0 and is not supported by
                # current releases.
                async with async_timeout.timeout(TIMEOUT):
                    async with session.get(img_url, headers=HEADERS) as resp:
                        resp.raise_for_status()
                        content = await resp.read()
                path = make_filename_from_url(img_url, out_dir)
                # Claim the name before the next await. Otherwise another
                # task could pick the same not-yet-existing path while this
                # one is waiting on the file open, and one image would
                # overwrite the other. "x" mode fails if the file exists.
                with open(path, "xb"):
                    reserved = path
                async with aiofiles.open(path, 'wb') as f:
                    await f.write(content)
                logging.info("Saved: %s", path)
                return True
            except Exception as e:
                if reserved is not None:
                    # Drop the empty/partial file so a retry does not leave
                    # junk behind under a suffixed name.
                    try:
                        os.remove(reserved)
                    except OSError:
                        pass
                logging.warning("download failed %s attempt %d: %s", img_url, attempt+1, e)
                # Back off between attempts only, not after the last one.
                if attempt < RETRY - 1:
                    await asyncio.sleep(1 + attempt)
        logging.error("Give up downloading %s", img_url)
        return False

async def main(pages, out_dir):
    """Crawl each URL in ``pages`` in turn and save its images to ``out_dir``.

    Pages are processed sequentially; the images of one page are downloaded
    concurrently, capped by a semaphore of size ``CONCURRENCY``. A page that
    cannot be fetched is logged and skipped.
    """
    os.makedirs(out_dir, exist_ok=True)
    limiter = asyncio.Semaphore(CONCURRENCY)
    async with aiohttp.ClientSession() as session:
        for page_url in pages:
            page_html = await fetch_text(session, page_url)
            if page_html:
                links = parse_image_urls(page_url, page_html)
                # Fan out all downloads for this page and wait for them.
                await asyncio.gather(
                    *[download_image(session, link, out_dir, limiter) for link in links]
                )
                # Polite pause before moving on to the next page.
                await asyncio.sleep(1)
            else:
                logging.error("Cannot fetch %s", page_url)

if __name__ == "__main__":
    # Script entry point: pages to crawl and the folder that receives the images.
    out_dir = "images_async"
    pages = [
        "https://jandan.net/ooxx",
    ]
    asyncio.run(main(pages, out_dir))

说明：

CONCURRENCY 控制并发数量，切勿过大。
并发爬取更高效，但对目标服务器压力更大，请谨慎使用并保持礼貌速率限制。

五、扩展功能（实用建议）

分页抓取：煎蛋这种站点通常分页，解析页面中的“下一页”链接并循环抓取（并注意翻页速率）。
断点续传与记录：使用数据库或本地文件记录已下载 URL，避免重复下载。
限速与代理：如需要，可在请求中添加延时或使用代理池（注意合法性）。
图片去重：可对图片内容做哈希（MD5/SHA1）以去重。
异常告警：失败时写日志并告警（邮件/钉钉/Slack）。
遵守爬取政策：在爬虫中加入 robots.txt 检查函数（见第七节），并遵守其规则。

六、常见报错与排查

connection refused / timeout：目标站点可能封 IP、网络不通或超时设置过短。
403 Forbidden：可能因 User-Agent、Referer、登录限制或反爬策略。可适当模拟浏览器 header（但不要伪造身份用于绕过限制）。
图片 URL 为相对地址或以 // 开头：使用 urljoin 规范化。
HTML 结构变化：检查页面实际 DOM，更新解析逻辑（CSS 选择器或属性名）。

七、示例：如何检查 robots.txt（简单函数）

import requests
from urllib.parse import urljoin

def allowed_by_robots(base_url, path="/", user_agent="*"):
    """Report whether the site's robots.txt permits crawling ``path``.

    The file at ``/robots.txt`` on ``base_url``'s host is downloaded and
    evaluated with the standard-library ``urllib.robotparser``. This honours
    User-agent groups and Allow rules, and treats an empty ``Disallow:`` as
    "everything allowed" instead of as a prefix that matches every path.

    Args:
        base_url: Any URL on the target site; only its scheme and host matter.
        path: Path (or URL) to check, resolved against ``base_url``.
        user_agent: Crawler name to match against User-agent groups. The
            default ``"*"`` applies only the rules of the wildcard group.

    Returns:
        ``True`` if crawling is permitted or robots.txt did not come back
        with status 200; ``False`` if a rule forbids the path or the check
        itself failed (fail closed).
    """
    # Function-scope import keeps this snippet self-contained.
    from urllib.robotparser import RobotFileParser

    robots_url = urljoin(base_url, "/robots.txt")
    try:
        r = requests.get(robots_url, timeout=5)
        if r.status_code != 200:
            return True  # no usable robots.txt: allowed by default (still be careful)
        parser = RobotFileParser()
        parser.parse(r.text.splitlines())
        return parser.can_fetch(user_agent, urljoin(base_url, path))
    except Exception:
        # Network or parsing trouble: refuse rather than risk ignoring a rule.
        return False

Python爬虫入门【15】:煎蛋网XXOO图片抓取

Python 爬虫入门【十五】：煎蛋网（Jiandan）图片抓取教程

一、重要注意事项（必须读）

二、抓取工作流程（通用）

三、同步示例（requests + BeautifulSoup）

四、并发示例（aiohttp + asyncio）——适合大量图片但对目标站点更友好需更谨慎

五、扩展功能（实用建议）

六、常见报错与排查

七、示例：如何检查 robots.txt（简单函数）

lichongyang

发表回复取消回复

近期文章

近期评论

归档

分类

Python爬虫入门【15】:煎蛋网XXOO图片抓取

Python 爬虫入门 【十五】：煎蛋网（Jiandan）图片抓取 教程

一、重要注意事项（必须读）

二、抓取工作流程（通用）

三、同步示例（requests + BeautifulSoup）

四、并发示例（aiohttp + asyncio）——适合大量图片但对目标站点更友好需更谨慎

五、扩展功能（实用建议）

六、常见报错与排查

七、示例：如何检查 robots.txt（简单函数）

lichongyang

发表回复 取消回复

近期文章

近期评论

归档

分类

Python 爬虫入门【十五】：煎蛋网（Jiandan）图片抓取教程

发表回复取消回复