{"id":755,"date":"2025-12-12T10:27:53","date_gmt":"2025-12-12T02:27:53","guid":{"rendered":"https:\/\/www.52runoob.com\/?p=755"},"modified":"2025-12-12T10:27:53","modified_gmt":"2025-12-12T02:27:53","slug":"python%e7%88%ac%e8%99%ab%e5%85%a5%e9%97%a8%e3%80%9015%e3%80%91%e7%85%8e%e8%9b%8b%e7%bd%91xxoo%e5%9b%be%e7%89%87%e6%8a%93%e5%8f%96","status":"publish","type":"post","link":"https:\/\/www.52runoob.com\/index.php\/2025\/12\/12\/python%e7%88%ac%e8%99%ab%e5%85%a5%e9%97%a8%e3%80%9015%e3%80%91%e7%85%8e%e8%9b%8b%e7%bd%91xxoo%e5%9b%be%e7%89%87%e6%8a%93%e5%8f%96\/","title":{"rendered":"Python\u722c\u866b\u5165\u95e8\u301015\u3011:\u714e\u86cb\u7f51XXOO\u56fe\u7247\u6293\u53d6"},"content":{"rendered":"\n<h1 class=\"wp-block-heading\">Python \u722c\u866b\u5165\u95e8 \u3010\u5341\u4e94\u3011\uff1a\u714e\u86cb\u7f51\uff08Jiandan\uff09\u56fe\u7247\u6293\u53d6 \u6559\u7a0b<\/h1>\n\n\n\n<p>\u4e0b\u9762\u7ed9\u51fa\u4e00\u5957<strong>\u5408\u89c4\u3001\u7a33\u5065\u3001\u53ef\u76f4\u63a5\u8fd0\u884c<\/strong>\u7684\u5165\u95e8\u7ea7\u722c\u866b\u6559\u7a0b\uff08\u4ee5\u6293\u53d6\u7f51\u9875\u4e2d\u7684\u56fe\u7247\u4e3a\u4f8b\uff09\u3002\u5185\u5bb9\u5305\u542b\uff1a\u6ce8\u610f\u4e8b\u9879\uff08\u5408\u89c4\u4e0e\u793c\u8282\uff09\u3001\u5de5\u4f5c\u6d41\u7a0b\u3001\u793a\u4f8b\u4ee3\u7801\uff08\u540c\u6b65\u4e0e\u5e76\u53d1\u4e24\u4e2a\u7248\u672c\uff09\u3001\u5e38\u89c1\u95ee\u9898\u53ca\u5982\u4f55\u4fdd\u5b58\u6587\u4ef6\u3002\u793a\u4f8b\u4ee3\u7801\u5c3d\u91cf\u901a\u7528\uff08\u4e0d\u4f9d\u8d56\u5bf9\u67d0\u4e00\u7ad9\u70b9\u7684\u7279\u5b9a DOM \u7ed3\u6784\uff09\uff0c\u5728\u5b9e\u9645\u4f7f\u7528\u524d\u8bf7<strong>\u5148\u68c0\u67e5\u76ee\u6807\u7ad9\u70b9\u7684 robots.txt \u4e0e\u4f7f\u7528\u6761\u6b3e\u5e76\u5c0a\u91cd\u7248\u6743\u4e0e\u9690\u79c1<\/strong>\u3002<\/p>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\"\/>\n\n\n\n<h1 
class=\"wp-block-heading\">\u4e00\u3001\u91cd\u8981\u6ce8\u610f\u4e8b\u9879\uff08\u5fc5\u987b\u8bfb\uff09<\/h1>\n\n\n\n<ol class=\"wp-block-list\">\n<li><strong>\u5408\u6cd5\u4e0e\u4f26\u7406<\/strong>\uff1a\u5148\u786e\u8ba4\u76ee\u6807\u9875\u9762\u5141\u8bb8\u6293\u53d6\u3002\u4e0d\u8981\u7ed5\u8fc7\u767b\u5f55\u3001\u9a8c\u8bc1\u7801\u6216\u4ed8\u8d39\u5899\u3002\u4e0d\u8981\u6293\u53d6\u672a\u6210\u5e74\u4eba\u76f8\u5173\u6216\u8fdd\u6cd5\/\u4fb5\u72af\u9690\u79c1\u7684\u5185\u5bb9\u3002<\/li>\n\n\n\n<li><strong>\u5c0a\u91cd robots.txt<\/strong>\uff1a<code>\/robots.txt<\/code>\u00a0\u53ef\u80fd\u9650\u5236\u6293\u53d6\u8def\u5f84\u3002<\/li>\n\n\n\n<li><strong>\u8bc6\u522b\u5185\u5bb9\u4e0e\u5e74\u9f84\u9650\u5236<\/strong>\uff1a\u82e5\u76ee\u6807\u5305\u542b\u6210\u4eba\u5185\u5bb9\uff0c\u8bf7\u786e\u4fdd\u4f60\u5728\u6cd5\u5f8b\u5141\u8bb8\u7684\u5730\u533a\u4e14\u7528\u4e8e\u5408\u6cd5\u7528\u9014\u3002<\/li>\n\n\n\n<li><strong>\u793c\u8c8c\u6293\u53d6<\/strong>\uff1a\u8bbe\u7f6e\u5408\u7406\u7684 User-Agent\u3001\u9650\u901f\uff08\u4f8b\u5982\u6bcf\u79d2\u4e00\u4e24\u4e2a\u8bf7\u6c42\u6216\u66f4\u6162\uff09\uff0c\u4f7f\u7528\u91cd\u8bd5\u4e0e\u56de\u9000\u7b56\u7565\uff0c\u907f\u514d\u77ed\u65f6\u95f4\u5185\u5237\u7206\u670d\u52a1\u5668\u3002<\/li>\n\n\n\n<li><strong>\u5f02\u5e38\u5904\u7406<\/strong>\uff1a\u907f\u514d\u56e0\u4e00\u5f20\u56fe\u7247\u5931\u8d25\u5bfc\u81f4\u7a0b\u5e8f\u5d29\u6e83\uff1b\u505a\u597d\u91cd\u8bd5\u3001\u8d85\u65f6\u3001\u65ad\u70b9\u7eed\u4f20\uff08\u53ef\u9009\uff09\u3002<\/li>\n\n\n\n<li><strong>\u4e0d\u8981\u516c\u5f00\/\u4f20\u64ad\u4fb5\u6743\u5185\u5bb9<\/strong>\uff1a\u6293\u53d6\u4ec5\u7528\u4e8e\u5b66\u4e60\u6216\u5df2\u83b7\u6388\u6743\u7684\u7528\u9014\u3002<\/li>\n<\/ol>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\"\/>\n\n\n\n<h1 class=\"wp-block-heading\">\u4e8c\u3001\u6293\u53d6\u5de5\u4f5c\u6d41\u7a0b\uff08\u901a\u7528\uff09<\/h1>\n\n\n\n<ol 
class=\"wp-block-list\">\n<li>\u8bf7\u6c42\u76ee\u6807\u9875\u9762\uff08\u5e26\u5408\u9002 header\uff0c\u5982 User-Agent\uff09\u3002<\/li>\n\n\n\n<li>\u89e3\u6790 HTML\uff08\u4f7f\u7528 BeautifulSoup \/ lxml \/ parsel\uff09\u3002<\/li>\n\n\n\n<li>\u4ece HTML \u4e2d\u63d0\u53d6\u56fe\u7247 URL\uff08\u5e38\u89c1\u4f4d\u7f6e\uff1a<code>&lt;img src=\"\"><\/code>\u3001<code>data-original<\/code>\u3001CSS \u80cc\u666f\u56fe\uff09\u3002<\/li>\n\n\n\n<li>\u89c4\u8303\u5316 URL\uff08\u8865\u5168\u76f8\u5bf9\u8def\u5f84\u3001\u5904\u7406\u00a0<code>\/\/<\/code>\u00a0\u534f\u8bae\u76f8\u5bf9\u5730\u5740\uff09\u3002<\/li>\n\n\n\n<li>\u4e0b\u8f7d\u56fe\u7247\u5e76\u4fdd\u5b58\u5230\u672c\u5730\uff08\u4f7f\u7528\u5408\u9002\u6587\u4ef6\u540d\uff0c\u907f\u514d\u91cd\u540d\uff09\u3002<\/li>\n\n\n\n<li>\u52a0\u5165\u9650\u901f\u3001\u91cd\u8bd5\u3001\u65e5\u5fd7\u4e0e\u5f02\u5e38\u5904\u7406\u3002<\/li>\n<\/ol>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\"\/>\n\n\n\n<h1 class=\"wp-block-heading\">\u4e09\u3001\u540c\u6b65\u793a\u4f8b\uff08requests + BeautifulSoup\uff09<\/h1>\n\n\n\n<p>\u4fdd\u5b58\u4e3a&nbsp;<code>jiandan_images_sync.py<\/code>\uff0c\u5728\u547d\u4ee4\u884c\u8fd0\u884c\uff1a<code>python jiandan_images_sync.py<\/code><\/p>\n\n\n<div class=\"wp-block-syntaxhighlighter-code \"><pre class=\"brush: plain; title: ; notranslate\" title=\"\">\n#!\/usr\/bin\/env python3\n# -*- coding: utf-8 -*-\n\n&quot;&quot;&quot;\n\u540c\u6b65\u793a\u4f8b\uff1a\u9002\u5408\u5c0f\u91cf\u56fe\u7247\u6293\u53d6\n\u4f9d\u8d56\uff1arequests, beautifulsoup4\npip install requests beautifulsoup4\n&quot;&quot;&quot;\n\nimport os\nimport time\nimport logging\nfrom urllib.parse import urljoin, urlparse\nimport requests\nfrom bs4 import BeautifulSoup\n\nlogging.basicConfig(level=logging.INFO, format=&#039;%(asctime)s %(levelname)s: %(message)s&#039;)\n\nHEADERS = {\n    &quot;User-Agent&quot;: &quot;Mozilla\/5.0 (compatible; ImageCrawler\/1.0; 
+https:\/\/example.com\/bot)&quot;\n}\nTIMEOUT = 15\nRETRY = 3\nSLEEP_BETWEEN_REQUESTS = 1.0  # \u79d2\uff0c\u793c\u8c8c\u722c\u53d6\n\ndef fetch(url):\n    for attempt in range(RETRY):\n        try:\n            resp = requests.get(url, headers=HEADERS, timeout=TIMEOUT)\n            resp.raise_for_status()\n            return resp\n        except Exception as e:\n            logging.warning(&quot;fetch failed %s (attempt %d\/%d): %s&quot;, url, attempt+1, RETRY, e)\n            time.sleep(1 + attempt*1)\n    return None\n\ndef parse_image_urls(page_url, html):\n    &quot;&quot;&quot;\n    \u901a\u7528\u89e3\u6790\uff1a\u67e5\u627e &lt;img&gt; \u6807\u7b7e\uff0c\u4ee5\u53ca\u5e38\u89c1\u7684\u5ef6\u8fdf\u52a0\u8f7d\u5c5e\u6027 data-original \/ data-src \u7b49\n    \u8fd4\u56de\uff1a\u53bb\u91cd\u7684\u7edd\u5bf9 URL \u5217\u8868\n    &quot;&quot;&quot;\n    soup = BeautifulSoup(html, &quot;lxml&quot;)\n    imgs = set()\n    for img in soup.find_all(&quot;img&quot;):\n        # \u5e38\u89c1\u5c5e\u6027\n        for attr in (&quot;data-original&quot;, &quot;data-src&quot;, &quot;data-lazy&quot;, &quot;src&quot;):\n            src = img.get(attr)\n            if src:\n                # \u8fc7\u6ee4 tiny icons \/ data URIs\n                if src.startswith(&quot;data:&quot;):\n                    continue\n                abs_url = urljoin(page_url, src)\n                imgs.add(abs_url)\n                break\n    # \u4e5f\u53ef\u4ee5\u6269\u5c55\u5230 background-image \u7b49\n    return sorted(imgs)\n\ndef make_filename_from_url(img_url, out_dir):\n    parsed = urlparse(img_url)\n    name = os.path.basename(parsed.path)\n    if not name:\n        # fallback\n        name = parsed.netloc.replace(&#039;.&#039;, &#039;_&#039;)\n    # \u9632\u6b62\u6ca1\u6709\u6269\u5c55\u540d\n    if &#039;.&#039; not in name:\n        name += &#039;.jpg&#039;\n    # \u4fdd\u8bc1\u552f\u4e00\n    file_path = os.path.join(out_dir, name)\n    base, ext = 
os.path.splitext(file_path)\n    counter = 1\n    while os.path.exists(file_path):\n        file_path = f&quot;{base}_{counter}{ext}&quot;\n        counter += 1\n    return file_path\n\ndef download_image(img_url, out_dir):\n    resp = fetch(img_url)\n    if not resp:\n        logging.error(&quot;Failed to download image: %s&quot;, img_url)\n        return False\n    path = make_filename_from_url(img_url, out_dir)\n    try:\n        with open(path, &quot;wb&quot;) as f:\n            f.write(resp.content)\n        logging.info(&quot;Saved: %s&quot;, path)\n        return True\n    except Exception as e:\n        logging.error(&quot;Write failed %s: %s&quot;, path, e)\n        return False\n\ndef crawl_page(page_url, out_dir):\n    resp = fetch(page_url)\n    if not resp:\n        logging.error(&quot;Failed to fetch page: %s&quot;, page_url)\n        return\n    img_urls = parse_image_urls(page_url, resp.text)\n    logging.info(&quot;Found %d image urls on %s&quot;, len(img_urls), page_url)\n    os.makedirs(out_dir, exist_ok=True)\n    for img_url in img_urls:\n        download_image(img_url, out_dir)\n        time.sleep(SLEEP_BETWEEN_REQUESTS)\n\nif __name__ == &quot;__main__&quot;:\n    # \u793a\u4f8b\u5165\u53e3\uff1a\u4f60\u53ef\u4ee5\u628a\u8981\u6293\u53d6\u7684\u9875\u9762 URL \u5217\u8868\u586b\u5728\u8fd9\u91cc\n    pages = &#x5B;\n        &quot;https:\/\/jandan.net\/ooxx&quot;,  # \u793a\u4f8b\u9875\u9762\uff08\u89c6\u5b9e\u9645\u9875\u9762\u800c\u5b9a\uff09\n    ]\n    out_dir = &quot;images&quot;\n    for p in pages:\n        crawl_page(p, out_dir)\n\n<\/pre><\/div>\n\n\n<p>\u8bf4\u660e\uff1a<\/p>\n\n\n\n<ul 
class=\"wp-block-list\">\n<li><code>parse_image_urls<\/code>\u00a0\u5c1d\u8bd5\u8bfb\u53d6\u00a0<code>data-original<\/code>\u3001<code>data-src<\/code>\u3001<code>src<\/code>\u00a0\u7b49\u5c5e\u6027\uff0c\u9002\u5e94\u5ef6\u8fdf\u52a0\u8f7d\u573a\u666f\u3002<\/li>\n\n\n\n<li><code>SLEEP_BETWEEN_REQUESTS<\/code>\u00a0\u63a7\u5236\u901f\u5ea6\u3002<\/li>\n\n\n\n<li>\u5bf9\u6587\u4ef6\u547d\u540d\u505a\u4e86\u552f\u4e00\u6027\u4fdd\u62a4\u3002<\/li>\n<\/ul>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\"\/>\n\n\n\n<h1 class=\"wp-block-heading\">\u56db\u3001\u5e76\u53d1\u793a\u4f8b\uff08aiohttp + asyncio\uff09\u2014\u2014\u9002\u5408\u5927\u91cf\u56fe\u7247\uff0c\u4f46\u9700\u66f4\u8c28\u614e\u4ee5\u4fdd\u6301\u5bf9\u76ee\u6807\u7ad9\u70b9\u53cb\u597d<\/h1>\n\n\n\n<p>\u4fdd\u5b58\u4e3a&nbsp;<code>jiandan_images_async.py<\/code>\uff0c\u4f9d\u8d56\uff1a<code>aiohttp<\/code>\u3001<code>aiofiles<\/code>\u3001<code>beautifulsoup4<\/code>\u3001<code>async_timeout<\/code>\u3002<br>\u5b89\u88c5\uff1a<code>pip install aiohttp aiofiles beautifulsoup4 async-timeout<\/code><\/p>\n\n\n<div class=\"wp-block-syntaxhighlighter-code \"><pre class=\"brush: plain; title: ; notranslate\" title=\"\">\n#!\/usr\/bin\/env python3\n# -*- coding: utf-8 -*-\n\nimport asyncio\nimport aiohttp\nimport aiofiles\nimport async_timeout\nimport os\nfrom urllib.parse import urljoin, urlparse\nfrom bs4 import BeautifulSoup\nimport logging\nimport time\n\nlogging.basicConfig(level=logging.INFO, format=&#039;%(asctime)s %(levelname)s: %(message)s&#039;)\n\nHEADERS = {&quot;User-Agent&quot;: &quot;Mozilla\/5.0 (compatible; AsyncImageCrawler\/1.0)&quot;}\nCONCURRENCY = 8\nTIMEOUT = 15\nRETRY = 3\n\nasync def fetch_text(session, url):\n    for attempt in range(RETRY):\n        try:\n            with async_timeout.timeout(TIMEOUT):\n                async with session.get(url, headers=HEADERS) as resp:\n                    resp.raise_for_status()\n                    return await resp.text()\n        except 
Exception as e:\n            logging.warning(&quot;fetch_text failed %s attempt %d: %s&quot;, url, attempt+1, e)\n            await asyncio.sleep(1 + attempt)\n    return None\n\ndef parse_image_urls(page_url, html):\n    soup = BeautifulSoup(html, &quot;lxml&quot;)\n    imgs = set()\n    for img in soup.find_all(&quot;img&quot;):\n        for attr in (&quot;data-original&quot;, &quot;data-src&quot;, &quot;data-lazy&quot;, &quot;src&quot;):\n            src = img.get(attr)\n            if src and not src.startswith(&quot;data:&quot;):\n                imgs.add(urljoin(page_url, src))\n                break\n    return sorted(imgs)\n\ndef make_filename_from_url(img_url, out_dir):\n    parsed = urlparse(img_url)\n    name = os.path.basename(parsed.path) or parsed.netloc.replace(&#039;.&#039;, &#039;_&#039;)\n    if &#039;.&#039; not in name:\n        name += &#039;.jpg&#039;\n    file_path = os.path.join(out_dir, name)\n    base, ext = os.path.splitext(file_path)\n    counter = 1\n    while os.path.exists(file_path):\n        file_path = f&quot;{base}_{counter}{ext}&quot;\n        counter += 1\n    return file_path\n\nasync def download_image(session, img_url, out_dir, sem):\n    async with sem:\n        for attempt in range(RETRY):\n            try:\n                with async_timeout.timeout(TIMEOUT):\n                    async with session.get(img_url, headers=HEADERS) as resp:\n                        resp.raise_for_status()\n                        content = await resp.read()\n                        path = make_filename_from_url(img_url, out_dir)\n                        async with aiofiles.open(path, &#039;wb&#039;) as f:\n                            await f.write(content)\n                        logging.info(&quot;Saved: %s&quot;, path)\n                        return True\n            except Exception as e:\n                logging.warning(&quot;download failed %s attempt %d: %s&quot;, img_url, attempt+1, e)\n                await asyncio.sleep(1 + 
attempt)\n        logging.error(&quot;Give up downloading %s&quot;, img_url)\n        return False\n\nasync def main(pages, out_dir):\n    os.makedirs(out_dir, exist_ok=True)\n    sem = asyncio.Semaphore(CONCURRENCY)\n    async with aiohttp.ClientSession() as session:\n        for page in pages:\n            html = await fetch_text(session, page)\n            if not html:\n                logging.error(&quot;Cannot fetch %s&quot;, page)\n                continue\n            img_urls = parse_image_urls(page, html)\n            tasks = &#x5B;download_image(session, u, out_dir, sem) for u in img_urls]\n            # \u5e76\u53d1\u4e0b\u8f7d\u5f53\u524d\u9875\u9762\u56fe\u7247\n            await asyncio.gather(*tasks)\n            # \u793c\u8c8c\u7b49\u5f85\n            await asyncio.sleep(1)\n\nif __name__ == &quot;__main__&quot;:\n    pages = &#x5B;&quot;https:\/\/jandan.net\/ooxx&quot;]\n    out_dir = &quot;images_async&quot;\n    asyncio.run(main(pages, out_dir))\n\n<\/pre><\/div>\n\n\n<p>\u8bf4\u660e\uff1a<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><code>CONCURRENCY<\/code>\u00a0\u63a7\u5236\u5e76\u53d1\u6570\u91cf\uff0c\u5207\u52ff\u8fc7\u5927\u3002<\/li>\n\n\n\n<li>\u5e76\u53d1\u722c\u53d6\u66f4\u9ad8\u6548\uff0c\u4f46\u5bf9\u76ee\u6807\u670d\u52a1\u5668\u538b\u529b\u66f4\u5927\uff0c\u8bf7\u8c28\u614e\u4f7f\u7528\u5e76\u4fdd\u6301\u793c\u8c8c\u901f\u7387\u9650\u5236\u3002<\/li>\n<\/ul>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\"\/>\n\n\n\n<h1 class=\"wp-block-heading\">\u4e94\u3001\u6269\u5c55\u529f\u80fd\uff08\u5b9e\u7528\u5efa\u8bae\uff09<\/h1>\n\n\n\n<ol 
class=\"wp-block-list\">\n<li><strong>\u5206\u9875\u6293\u53d6<\/strong>\uff1a\u714e\u86cb\u8fd9\u79cd\u7ad9\u70b9\u901a\u5e38\u5206\u9875\uff0c\u89e3\u6790\u9875\u9762\u4e2d\u7684\u201c\u4e0b\u4e00\u9875\u201d\u94fe\u63a5\u5e76\u5faa\u73af\u6293\u53d6\uff08\u5e76\u6ce8\u610f\u7ffb\u9875\u901f\u7387\uff09\u3002<\/li>\n\n\n\n<li><strong>\u65ad\u70b9\u7eed\u4f20\u4e0e\u8bb0\u5f55<\/strong>\uff1a\u4f7f\u7528\u6570\u636e\u5e93\u6216\u672c\u5730\u6587\u4ef6\u8bb0\u5f55\u5df2\u4e0b\u8f7d URL\uff0c\u907f\u514d\u91cd\u590d\u4e0b\u8f7d\u3002<\/li>\n\n\n\n<li><strong>\u9650\u901f\u4e0e\u4ee3\u7406<\/strong>\uff1a\u5982\u9700\u8981\uff0c\u53ef\u5728\u8bf7\u6c42\u4e2d\u6dfb\u52a0\u5ef6\u65f6\u6216\u4f7f\u7528\u4ee3\u7406\u6c60\uff08\u6ce8\u610f\u5408\u6cd5\u6027\uff09\u3002<\/li>\n\n\n\n<li><strong>\u56fe\u7247\u53bb\u91cd<\/strong>\uff1a\u53ef\u5bf9\u56fe\u7247\u5185\u5bb9\u505a\u54c8\u5e0c\uff08MD5\/SHA1\uff09\u4ee5\u53bb\u91cd\u3002<\/li>\n\n\n\n<li><strong>\u5f02\u5e38\u544a\u8b66<\/strong>\uff1a\u5931\u8d25\u65f6\u5199\u65e5\u5fd7\u5e76\u544a\u8b66\uff08\u90ae\u4ef6\/\u9489\u9489\/Slack\uff09\u3002<\/li>\n\n\n\n<li><strong>\u9075\u5b88\u722c\u53d6\u653f\u7b56<\/strong>\uff1a\u4e3a\u7ad9\u70b9\u63d0\u4f9b robots.txt \u68c0\u67e5\u51fd\u6570\u5e76\u9075\u5b88\u4e4b\u3002<\/li>\n<\/ol>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\"\/>\n\n\n\n<h1 class=\"wp-block-heading\">\u516d\u3001\u5e38\u89c1\u62a5\u9519\u4e0e\u6392\u67e5<\/h1>\n\n\n\n<ul class=\"wp-block-list\">\n<li><code>connection refused<\/code>\u00a0\/\u00a0<code>timeout<\/code>\uff1a\u76ee\u6807\u7ad9\u70b9\u53ef\u80fd\u5c01 IP\u3001\u7f51\u7edc\u4e0d\u901a\u6216\u8d85\u65f6\u8bbe\u7f6e\u8fc7\u77ed\u3002<\/li>\n\n\n\n<li><code>403 Forbidden<\/code>\uff1a\u53ef\u80fd\u56e0 User-Agent\u3001Referer\u3001\u767b\u5f55\u9650\u5236\u6216\u53cd\u722c\u7b56\u7565\u3002\u53ef\u9002\u5f53\u6a21\u62df\u6d4f\u89c8\u5668 
header\uff08\u4f46\u4e0d\u8981\u4f2a\u9020\u8eab\u4efd\u7528\u4e8e\u7ed5\u8fc7\u9650\u5236\uff09\u3002<\/li>\n\n\n\n<li>\u56fe\u7247 URL \u4e3a\u76f8\u5bf9\u5730\u5740\u6216\u4ee5\u00a0<code>\/\/<\/code>\u00a0\u5f00\u5934\uff1a\u4f7f\u7528\u00a0<code>urljoin<\/code>\u00a0\u89c4\u8303\u5316\u3002<\/li>\n\n\n\n<li>HTML \u7ed3\u6784\u53d8\u5316\uff1a\u68c0\u67e5\u9875\u9762\u5b9e\u9645 DOM\uff0c\u66f4\u65b0\u89e3\u6790\u903b\u8f91\uff08CSS \u9009\u62e9\u5668\u6216\u5c5e\u6027\u540d\uff09\u3002<\/li>\n<\/ul>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\"\/>\n\n\n\n<h1 class=\"wp-block-heading\">\u4e03\u3001\u793a\u4f8b\uff1a\u5982\u4f55\u68c0\u67e5 robots.txt\uff08\u7b80\u5355\u51fd\u6570\uff09<\/h1>\n\n\n<div class=\"wp-block-syntaxhighlighter-code \"><pre class=\"brush: plain; title: ; notranslate\" title=\"\">\nimport requests\nfrom urllib.parse import urljoin\n\ndef allowed_by_robots(base_url, path=&quot;\/&quot;):\n    robots_url = urljoin(base_url, &quot;\/robots.txt&quot;)\n    try:\n        r = requests.get(robots_url, timeout=5)\n        if r.status_code != 200:\n            return True  # robots.txt \u4e0d\u5b58\u5728\u5219\u9ed8\u8ba4\u5141\u8bb8\uff08\u4f46\u4ecd\u9700\u8c28\u614e\uff09\n        txt = r.text.lower()\n        # \u975e\u5b8c\u6574\u89e3\u6790\uff0c\u4ec5\u7b80\u6613\u68c0\u67e5 Disallow\n        for line in txt.splitlines():\n            line = line.strip()\n            if line.startswith(&quot;disallow:&quot;):\n                dis = line.split(&quot;:&quot;,1)&#x5B;1].strip()\n                if path.startswith(dis):\n                    return False\n        return True\n    except Exception:\n        return False\n\n<\/pre><\/div>","protected":false},"excerpt":{"rendered":"<p>Python \u722c\u866b\u5165\u95e8 \u3010\u5341\u4e94\u3011\uff1a\u714e\u86cb\u7f51\uff08Jiandan\uff09\u56fe\u7247\u6293\u53d6 \u6559\u7a0b \u4e0b\u9762\u7ed9&#8230; <a class=\"more-link\" 
href=\"https:\/\/www.52runoob.com\/index.php\/2025\/12\/12\/python%e7%88%ac%e8%99%ab%e5%85%a5%e9%97%a8%e3%80%9015%e3%80%91%e7%85%8e%e8%9b%8b%e7%bd%91xxoo%e5%9b%be%e7%89%87%e6%8a%93%e5%8f%96\/\">Continue Reading &rarr;<\/a><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[2],"tags":[],"class_list":["post-755","post","type-post","status-publish","format-standard","hentry","category-2"],"amp_enabled":true,"_links":{"self":[{"href":"https:\/\/www.52runoob.com\/index.php\/wp-json\/wp\/v2\/posts\/755","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.52runoob.com\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.52runoob.com\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.52runoob.com\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.52runoob.com\/index.php\/wp-json\/wp\/v2\/comments?post=755"}],"version-history":[{"count":1,"href":"https:\/\/www.52runoob.com\/index.php\/wp-json\/wp\/v2\/posts\/755\/revisions"}],"predecessor-version":[{"id":756,"href":"https:\/\/www.52runoob.com\/index.php\/wp-json\/wp\/v2\/posts\/755\/revisions\/756"}],"wp:attachment":[{"href":"https:\/\/www.52runoob.com\/index.php\/wp-json\/wp\/v2\/media?parent=755"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.52runoob.com\/index.php\/wp-json\/wp\/v2\/categories?post=755"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.52runoob.com\/index.php\/wp-json\/wp\/v2\/tags?post=755"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}