我们先上url,链接:https://www.xiurenwang.vip/bang?f=2
图片内容我就不展示了,自己可以打开看下
我们要拿到每个列表的详情页的图片,我们对这一页数据循环提取,我们发现链接就在响应中,我们拼接网址进行请求
这一页数据里包含好几张图片,如果注册一个账号,用cookie保持登录,爬取的会更多,图片我尽量少展示,太夸张,然后我们上代码:
import requests
from lxml import etree
import os
class XiuPeople():
    """Scraper for xiurenwang.vip.

    Fetches the ranking list page, follows each detail-page link,
    collects the image URLs found there and saves every image under
    ./picture/.
    """

    def __init__(self):
        # Default URL requested by get_data(); kept as an attribute so the
        # original no-argument get_data() call style keeps working.
        self.url = 'https://www.xiurenwang.vip/bang?f=2'
        # Browser-like User-Agent so the site serves the normal page.
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
        }

    def get_data(self, url=None):
        """Send a GET request and return the raw response.

        url: optional explicit URL; falls back to self.url so existing
        no-argument callers are unaffected (backward compatible).
        """
        response = requests.get(url=url or self.url, headers=self.headers)
        return response

    def parse_list(self, response):
        """Parse the list page.

        Returns (detail_page_urls, gallery_titles). The two lists come
        from independent XPaths, so their pairing is positional at best
        -- TODO confirm they always align on this site.
        """
        html = etree.HTML(response.content)
        # Relative href of every entry on the list page.
        node_list = html.xpath('//div[@class="list"]/li/a/@href')
        # Gallery titles, used later as file-name prefixes.
        name_list = html.xpath('//div[@class="tit"]/a/text()')
        # Turn the relative hrefs into absolute URLs.
        li_list = ['https://www.xiurenwang.vip/' + node for node in node_list]
        return li_list, name_list

    def parse_detail(self, li_list):
        """Visit each detail page and collect all image URLs."""
        img_list = list()
        for li in li_list:
            response = self.get_data(li)
            html = etree.HTML(response.content)
            # Full-size image links on the detail page.
            img_node = html.xpath('//div[@id="image"]/a/@href')
            for img_link in img_node:
                img_list.append(img_link)
                print(img_link)
        return img_list

    def save_data(self, img_list, name_list):
        """Download the images and write them to ./picture/.

        The original implementation re-downloaded every image once per
        title (a full cross product of HTTP requests); here each image
        is fetched exactly once and its bytes are written for every
        title, producing the same set of files with far fewer requests.
        """
        for link in img_list:
            # File stem of the URL, e.g. ".../abc.jpg" -> "abc".
            title = str(link).split("/")[-1].split(".")[0]
            data = self.get_data(link).content
            for name in name_list:
                # Strip '/' which would be treated as a path separator.
                stitle = str(name).replace('/', '')
                with open("./picture/" + stitle + title + '.jpg', "wb") as f:
                    f.write(data)

    def run(self):
        """Entry point: list page -> detail pages -> image downloads."""
        response = self.get_data()
        li_list, name_list = self.parse_list(response)
        img_list = self.parse_detail(li_list)
        self.save_data(img_list, name_list)
if __name__ == '__main__':
    # Ensure the output directory exists; exist_ok=True replaces the
    # original bare try/except, which silently swallowed every kind of
    # error (permission denied, bad path, ...), not just "already exists".
    os.makedirs("./picture", exist_ok=True)
    xiu = XiuPeople()
    xiu.run()
实现翻页
第240页:https://www.xiurenwang.vip/bang/page/240?f=2
第2页:https://www.xiurenwang.vip/bang/page/2?f=2
对比两个url找变化,变化的部分是page/后面的数字;再确定终止条件:如果请求的页码数大于240(即超过尾页),我们就把循环break掉
# Pagination
def next_page(self):
    """Crawl list pages 2 through 240 in order.

    Fixes two defects of the original version: it recursively called
    self.next_page() inside the loop (restarting the whole crawl from
    page 2 on every page, with unbounded recursion), and its
    `if i > 241: break` guard was unreachable because range(2, 241)
    never exceeds 240 -- the loop bound alone is the stop condition.
    """
    for page in range(2, 241):
        next_url = 'https://www.xiurenwang.vip/bang/page/{}?f=2'.format(page)
        self.url = next_url
        print(self.url)
        response = self.get_data()
        li_list, name_list = self.parse_list(response)
        img_list = self.parse_detail(li_list)
        self.save_data(img_list, name_list)
所有代码如下:
import requests
from lxml import etree
import os
class XiuPeople():
    """Scraper for xiurenwang.vip with pagination.

    Fetches ranking list pages 1..240, follows each detail-page link,
    collects the image URLs found there and saves every image under
    ./picture/.
    """

    def __init__(self):
        # Default URL requested by get_data(); kept as an attribute so the
        # original no-argument get_data() call style keeps working.
        self.url = 'https://www.xiurenwang.vip/bang?f=2'
        # Browser-like User-Agent so the site serves the normal page.
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
        }

    def get_data(self, url=None):
        """Send a GET request and return the raw response.

        url: optional explicit URL; falls back to self.url so existing
        no-argument callers are unaffected (backward compatible).
        """
        response = requests.get(url=url or self.url, headers=self.headers)
        return response

    def parse_list(self, response):
        """Parse a list page.

        Returns (detail_page_urls, gallery_titles). The two lists come
        from independent XPaths, so their pairing is positional at best
        -- TODO confirm they always align on this site.
        """
        html = etree.HTML(response.content)
        # Relative href of every entry on the list page.
        node_list = html.xpath('//div[@class="list"]/li/a/@href')
        # Gallery titles, used later as file-name prefixes.
        name_list = html.xpath('//div[@class="tit"]/a/text()')
        # Turn the relative hrefs into absolute URLs.
        li_list = ['https://www.xiurenwang.vip/' + node for node in node_list]
        return li_list, name_list

    def parse_detail(self, li_list):
        """Visit each detail page and collect all image URLs."""
        img_list = list()
        for li in li_list:
            response = self.get_data(li)
            html = etree.HTML(response.content)
            # Full-size image links on the detail page.
            img_node = html.xpath('//div[@id="image"]/a/@href')
            for img_link in img_node:
                img_list.append(img_link)
                print(img_link)
        return img_list

    def save_data(self, img_list, name_list):
        """Download the images and write them to ./picture/.

        The original implementation re-downloaded every image once per
        title (a full cross product of HTTP requests); here each image
        is fetched exactly once and its bytes are written for every
        title, producing the same set of files with far fewer requests.
        """
        for link in img_list:
            # File stem of the URL, e.g. ".../abc.jpg" -> "abc".
            title = str(link).split("/")[-1].split(".")[0]
            data = self.get_data(link).content
            for name in name_list:
                # Strip '/' which would be treated as a path separator.
                stitle = str(name).replace('/', '')
                with open("./picture/" + stitle + title + '.jpg', "wb") as f:
                    f.write(data)

    # Pagination
    def next_page(self):
        """Crawl list pages 2 through 240 in order.

        Fixes the original version, which recursively called
        self.next_page() inside the loop (restarting the crawl from
        page 2 on every page, with unbounded recursion) and carried an
        unreachable `if i > 241: break` guard -- range(2, 241) never
        exceeds 240, so the loop bound alone is the stop condition.
        """
        for page in range(2, 241):
            next_url = 'https://www.xiurenwang.vip/bang/page/{}?f=2'.format(page)
            print(next_url)
            response = self.get_data(next_url)
            li_list, name_list = self.parse_list(response)
            img_list = self.parse_detail(li_list)
            self.save_data(img_list, name_list)

    def run(self):
        """Entry point: crawl page 1, then pages 2..240."""
        response = self.get_data()
        li_list, name_list = self.parse_list(response)
        img_list = self.parse_detail(li_list)
        self.save_data(img_list, name_list)
        self.next_page()
if __name__ == '__main__':
    # Ensure the output directory exists; exist_ok=True replaces the
    # original bare try/except, which silently swallowed every kind of
    # error (permission denied, bad path, ...), not just "already exists".
    os.makedirs("./picture", exist_ok=True)
    xiu = XiuPeople()
    xiu.run()
效果结果截图:
效果自己想,都被提示了!!!!!
喜欢的话多多点赞!!!!!!!