Using a Python regex to scrape landscape images from 站长之家 (sc.chinaz.com) and save them locally
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Scrape landscape images from 站长之家 (Chinaz): https://sc.chinaz.com/tupian/fengjingtupian.html
# Note: long crawls can run into image response timeouts.
# First, collect all image detail-page links and the next-page link from page one.
# Parse and download each detail page; when done, request the next page and repeat until the last page.
# Before downloading an image, read the names of images already downloaded and skip any that already exist.

import os, re, time
import requests
# helper that returns a random request header (User-Agent)
from utils.header import get_ua


class Chinaz():
    def __init__(self):
        self.url = "https://sc.chinaz.com/tupian/fengjingtupian.html"
        self.base_url = "https://sc.chinaz.com/tupian/"
        self.img_file = "imgs"
        # names of images already downloaded (empty when nothing has been saved yet)
        self.files = []
        if not os.path.exists(self.img_file):
            os.makedirs(self.img_file)
        else:
            # the folder already exists: record the names of all files in it (it may be empty)
            for root, dirs, files in os.walk(self.img_file):
                self.files = files

    # Send a request and return the response object
    def get_html(self, url):
        try:
            resp = requests.get(url, headers={"User-Agent": get_ua()}, timeout=10)
        except requests.RequestException as e:
            print("Request failed for %s: %s" % (url, e))
            return None
        resp.encoding = 'utf-8'
        if resp.status_code == 200:
            return resp

    # Collect all image detail-page URLs (and, further down, the next-page link)
    def get_all(self, html):
        # pattern for the image list page
        img_url_list_patt = re.compile(r'<p><a target="_blank" href="(.*?)" alt=".*?">')
        all_img_urls = img_url_list_patt.findall(html)
        all_img_urls = ["https:" + i for i in all_img_urls]
        # pattern for the image detail page
        img_url_patt = re.compile(r'<a href="(.*?)" title="(.*?)" class="image_gall">')
        for img_url in all_img_urls:
            img_html = self.get_html(img_url)
            if img_html:
                res_img_urls = img_url_patt.findall(img_html.text)
                res_img_url = "https:" + res_img_urls[0][0]
                res_img_title = res_img_urls[0][1]
                # image name, e.g. 江面风景唯美意境图片zzpic9603.jpg
                res_img_title += res_img_url.split("/")[-1]
                # download only if the folder is empty or this image has not been saved yet
                if not self.files or self.img_exist(res_img_title):
                    try:
                        self.download_img(res_img_url, res_img_title)
                    except Exception as e:
                        print("Failed to download %s, skipping. Reason: %s" % (res_img_title, e))
                        continue
                else:
                    print("Image already exists, no need to download: %s" % res_img_title)

        # pattern for the next-page link on the list page
        next_patt = re.compile(r'(fengjing.*?)"\s+class="nextpage">下一页</a>')
        next_page = next_patt.findall(html)
        try:
            next_page = self.base_url + next_page[0].split('"')[-1]
            print("About to process:", next_page)
            resp = self.get_html(next_page)
            if resp:
                self.get_all(resp.text)
        except IndexError:
            print("No next page - finished!")
        except Exception as e:
            print("Something went wrong:", e)

    # Fetch the actual image and save it to disk
    def download_img(self, img_url, res_img_title):
        time.sleep(1)
        print("Downloading image:", res_img_title)
        resp = self.get_html(img_url)
        if resp:
            with open(self.img_file + "/" + res_img_title, 'wb') as f:
                f.write(resp.content)
        else:
            print("Failed to download %s, ignoring~" % res_img_title)

    # Before downloading, check whether the image name is already in the folder;
    # returns True when it has NOT been downloaded yet (i.e. it should be fetched)
    def img_exist(self, res_img_title):
        if res_img_title not in self.files:
            return True


if __name__ == '__main__':
    cz = Chinaz()
    html = cz.get_html(cz.url)
    if html:
        cz.get_all(html.text)
You can write your own get_ua helper that returns a random User-Agent header, or refer to: https://www.cdsy.xyz/computer/programme/Python/241210/cd64979.html
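The utils.header module imported above is not shown in this post; a minimal sketch of such a helper (the UA_POOL list below is an assumption for illustration, not the author's actual module) could be:

# utils/header.py -- hypothetical minimal helper, only meant to illustrate get_ua()
import random

# a small pool of common desktop User-Agent strings; extend as needed
UA_POOL = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
]

def get_ua():
    # pick a different User-Agent for each request so the crawler looks less like a single client
    return random.choice(UA_POOL)

Alternatively, a library such as fake-useragent can generate random User-Agent strings if you prefer not to maintain the list yourself.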