# Python 正则获取站长之家风景图,保存到本地
# -*- coding: utf-8 -*-
# !/usr/bin/env python
# 获取站长之家风景图:https://sc.chinaz.com/tupian/fengjingtupian.html,长时间爬取会出现图片响应超时的问题。
# 首先从第一页中获取第一页所有图片详情页链接和下一页的链接
# 对详情页进行解析下载,下载完毕请求下一页,并重复上一步操作,直到最后一页为止。
# 在下载图片前,先获取所有已下载的图片名字,如果存在则不下载
import os, re, time
import requests
# 导入自定义随机请求头的包
from utils.header import get_ua
class Chinaz():
    """Download the landscape pictures listed on sc.chinaz.com.

    Starting from a listing page, every picture's detail page is fetched, the
    full-size image URL and title are extracted with regular expressions, and
    the image is saved into ``img_file`` unless a file with the same name is
    already there. Listing pages are followed through their "next page" link
    until no such link is found.

    Attributes:
        url: Listing page the crawl starts from.
        base_url: Prefix prepended to the relative "next page" link.
        img_file: Directory the images are written to.
        files: Names of the files already present in ``img_file``; updated
            after every successful download.
    """

    # Listing page: link to each picture's detail page.
    _IMG_URL_LIST_PATT = re.compile(r'<p><a target="_blank" href="(.*?)" alt=".*?">')
    # Detail page: (full-size image URL, picture title).
    _IMG_URL_PATT = re.compile(r'<a href="(.*?)" title="(.*?)" class="image_gall">')
    # Listing page: relative link to the next listing page.
    _NEXT_PATT = re.compile(r'(fengjing.*?)"\s+class="nextpage">下一页</a>')

    def __init__(self, url="https://sc.chinaz.com/tupian/fengjingtupian.html",
                 base_url="https://sc.chinaz.com/tupian/", img_file="imgs"):
        """Create the download directory if needed and index its files.

        Args:
            url: First listing page to crawl.
            base_url: Prefix used to build absolute "next page" URLs.
            img_file: Directory where images are stored.
        """
        self.url = url
        self.base_url = base_url
        self.img_file = img_file
        os.makedirs(self.img_file, exist_ok=True)
        # Always define ``files`` (also for a freshly created, empty folder)
        # and only look at the top level, where download_img writes to.
        self.files = [
            name for name in os.listdir(self.img_file)
            if os.path.isfile(os.path.join(self.img_file, name))
        ]

    def get_html(self, url, timeout=10):
        """Send a GET request and return the response object.

        Args:
            url: Address to request.
            timeout: Value passed to ``requests.get`` as ``timeout`` so that a
                stalled server cannot block the crawl forever.

        Returns:
            The response when the status code is 200, otherwise ``None``
            (also when the request itself fails).
        """
        try:
            resp = requests.get(url, headers={"User-Agent": get_ua()}, timeout=timeout)
        except requests.RequestException as e:
            print("请求失败:%s,原因:%s" % (url, e))
            return None
        resp.encoding = 'utf-8'
        if resp.status_code == 200:
            return resp
        return None

    def get_all(self, html):
        """Download every picture of a listing page and of all following pages.

        Args:
            html: HTML text of the listing page to start with.
        """
        # Iterate instead of recursing so the depth does not grow with the
        # number of listing pages.
        while html is not None:
            self._download_page(html)
            html = self._next_page_html(html)

    def _download_page(self, html):
        """Download the not-yet-stored pictures referenced by one listing page."""
        detail_urls = ["https:" + link for link in self._IMG_URL_LIST_PATT.findall(html)]
        for detail_url in detail_urls:
            detail = self.get_html(detail_url)
            if not detail:
                continue
            found = self._IMG_URL_PATT.findall(detail.text)
            if not found:
                # A detail page without the expected markup must not abort the crawl.
                print("未找到图片地址,跳过:%s" % detail_url)
                continue
            href, title = found[0]
            res_img_url = "https:" + href
            # File name is title + last URL segment, e.g. 江面风景唯美意境图片zzpic9603.jpg
            res_img_title = self._safe_name(title + res_img_url.split("/")[-1])
            if self.img_exist(res_img_title):
                try:
                    self.download_img(res_img_url, res_img_title)
                except Exception as e:
                    # Best effort: one failing image should not stop the others.
                    print("%s,该图片下载失败,跳过,出错原因:%s" % (res_img_title, e))
            else:
                print("该图片已存在,无需下载:%s" % res_img_title)

    def _next_page_html(self, html):
        """Return the HTML text of the next listing page, or ``None`` to stop."""
        found = self._NEXT_PATT.findall(html)
        if not found:
            print("没有下一页了!")
            return None
        # The lazy match may start at an earlier "fengjing"; the wanted href is
        # whatever follows the last double quote of the match.
        next_page = self.base_url + found[0].split('"')[-1]
        print("即将处理链接:", next_page)
        try:
            resp = self.get_html(next_page)
            return resp.text if resp else None
        except Exception as e:
            print("出错了:", e)
            return None

    @staticmethod
    def _safe_name(name):
        """Replace path separators so a scraped title cannot leave the image folder."""
        return re.sub(r'[\\/]', '_', name)

    def download_img(self, img_url, res_img_title):
        """Fetch one image and write it into the image directory.

        Args:
            img_url: Absolute URL of the image file.
            res_img_title: File name to store the image under.
        """
        # Pause between downloads to go easy on the server.
        time.sleep(1)
        print("下载图片:", res_img_title)
        resp = self.get_html(img_url)
        if resp:
            with open(os.path.join(self.img_file, res_img_title), 'wb') as f:
                f.write(resp.content)
            # Remember the new file so it is not downloaded again in this run.
            self.files.append(res_img_title)
        else:
            print("%s下载图片失败,忽略~" % res_img_title)

    def img_exist(self, res_img_title):
        """Tell whether an image still has to be downloaded.

        Note the inverted sense kept from the original interface.

        Args:
            res_img_title: File name to look up.

        Returns:
            ``True`` when no file of that name is known yet (download needed),
            ``False`` when it already exists.
        """
        return res_img_title not in self.files
if __name__ == '__main__':
    # Script entry point: fetch the first listing page and crawl from there.
    cz = Chinaz()
    html = cz.get_html(cz.url)
    # get_html returns None when the reply status is not 200, so check before
    # reading .text instead of failing with an AttributeError.
    if html:
        cz.get_all(html.text)
    else:
        print("首页请求失败,无法开始爬取:%s" % cz.url)
# get_ua 请求头可以自己随机设置一个,或者参考:https://www.cdsy.xyz/computer/programme/Python/241210/cd64979.html