爬取美女图片实现翻页

时间：03-29来源：作者：点击数：33

我们先上url，链接：https://www.xiurenwang.vip/bang?f=2

图片内容我就不展示了，自己可以打开看下

我们要拿到列表页中每个条目的详情页图片：先对这一页的数据循环提取，详情页的链接就在响应中，把它拼接成完整网址后再逐个请求。

这一页数据里包含好几张图片。如果注册一个账号并用cookie保持登录，能爬取到的图片会更多。图片太夸张，我就尽量少展示了，下面直接上代码：

import requests

from lxml import etree
import os


class XiuPeople():
    """Scraper that downloads the images linked from a list page.

    Workflow (see ``run``): fetch the list page, collect the detail-page
    links and titles, visit every detail page to collect image links, then
    download the images into ``./picture/``.
    """

    # Seconds to wait for the server before giving up on a request; without
    # a timeout a stalled connection would block the crawl indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Set the start URL and the request headers."""
        # List page that ``get_data()`` fetches when no URL is passed.
        self.url = 'https://www.xiurenwang.vip/bang?f=2'
        # Browser-like user agent sent with every request.
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
        }

    def get_data(self, url=None):
        """Send a GET request and return the response.

        Args:
            url: Address to fetch. Defaults to ``self.url``, so existing
                callers that set ``self.url`` first keep working.

        Returns:
            The ``requests`` response object.
        """
        target = self.url if url is None else url
        return requests.get(
            url=target, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )

    def parse_list(self, response):
        """Extract detail-page URLs and titles from a list-page response.

        Args:
            response: Response whose ``content`` is the list page HTML.

        Returns:
            Tuple ``(li_list, name_list)``: the detail-page URLs (site
            prefix + extracted href) and the title strings.
        """
        html = etree.HTML(response.content)
        # href of every entry link in the list.
        node_list = html.xpath('//div[@class="list"]/li/a/@href')
        # Titles, later used as file-name prefixes.
        name_list = html.xpath('//div[@class="tit"]/a/text()')
        # Prefix each extracted href with the site address.
        li_list = ['https://www.xiurenwang.vip/' + node for node in node_list]
        return li_list, name_list

    def parse_detail(self, li_list):
        """Visit every detail page and collect the image links found there.

        Args:
            li_list: Detail-page URLs.

        Returns:
            Flat list of image links from all detail pages, in visit order.
        """
        img_list = []
        for detail_url in li_list:
            # Pass the URL explicitly instead of overwriting self.url, so
            # the start URL survives and run() can be called again.
            response = self.get_data(detail_url)
            html = etree.HTML(response.content)
            for img_link in html.xpath('//div[@id="image"]/a/@href'):
                img_list.append(img_link)
                print(img_link)
        return img_list

    def save_data(self, img_list, name_list):
        """Download the images and write them into ``./picture/``.

        One file is written per (title, image) pair, named
        ``<title without '/'><image base name>.jpg``.

        NOTE(review): ``img_list`` is a flat list, so titles and images are
        not paired and every image is stored under every title. That looks
        unintended - confirm before changing the file naming.

        Args:
            img_list: Image URLs to download.
            name_list: Titles used as file-name prefixes.
        """
        # '/' would be read as a path separator, so drop it from titles.
        prefixes = [str(name).replace('/', '') for name in name_list]
        if not prefixes:
            # No titles means no files to write; skip the downloads too.
            return
        # Make sure the target folder exists even when the caller did not
        # create it beforehand.
        os.makedirs("./picture", exist_ok=True)
        for link in img_list:
            title = str(link).split("/")[-1].split(".")[0]
            # Download each image once and reuse the bytes for every title
            # instead of re-requesting it once per title.
            content = self.get_data(link).content
            for prefix in prefixes:
                with open("./picture/" + prefix + title + '.jpg', "wb") as f:
                    f.write(content)

    def run(self):
        """Crawl the start list page and save every image found."""
        response = self.get_data()
        li_list, name_list = self.parse_list(response)
        img_list = self.parse_detail(li_list)
        self.save_data(img_list, name_list)


if __name__ == '__main__':
    # Create the output folder that save_data() writes the images into.
    try:
        os.mkdir("./picture")
    except FileExistsError:
        # Only "already exists" is expected here. Other errors (e.g. missing
        # permissions) are no longer misreported as an existing folder.
        print("文件夹已新建")
    xiu = XiuPeople()
    xiu.run()

实现翻页

第240页：https://www.xiurenwang.vip/bang/page/240?f=2

第2页：https://www.xiurenwang.vip/bang/page/2?f=2

对比两个url找变化，变化的部分是page/后面的数字。终止条件是页码超过尾页（第240页）：页码大于240时就不再请求，结束循环。

    # 翻页
    def next_page(self, start=2, end=240):
        """Crawl the paginated list pages ``start`` .. ``end`` (inclusive).

        Each page is fetched, parsed and saved with the same steps as the
        first page. The loop ends by itself after the last page. The
        earlier version called ``self.next_page()`` inside the loop, which
        restarted at page 2 on every iteration and recursed without bound;
        its ``i > 241`` check could never be true inside ``range(2, 241)``.

        Args:
            start: First page number to request (default 2).
            end: Last page number to request (default 240).
        """
        for page in range(start, end + 1):
            # Only the number after "page/" changes between list pages.
            self.url = 'https://www.xiurenwang.vip/bang/page/{}?f=2'.format(page)
            print(self.url)
            response = self.get_data()
            li_list, name_list = self.parse_list(response)
            img_list = self.parse_detail(li_list)
            self.save_data(img_list, name_list)

所有代码如下：

import requests

from lxml import etree
import os


class XiuPeople():
    """Scraper that downloads the images linked from the list pages.

    Workflow (see ``run``): crawl the start list page, then the paginated
    list pages. For each list page the detail-page links and titles are
    collected, every detail page is visited to collect image links, and
    the images are downloaded into ``./picture/``.
    """

    # Seconds to wait for the server before giving up on a request; without
    # a timeout a stalled connection would block the crawl indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Set the start URL and the request headers."""
        # List page that ``get_data()`` fetches when no URL is passed.
        self.url = 'https://www.xiurenwang.vip/bang?f=2'
        # Browser-like user agent sent with every request.
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
        }

    def get_data(self, url=None):
        """Send a GET request and return the response.

        Args:
            url: Address to fetch. Defaults to ``self.url``, so existing
                callers that set ``self.url`` first keep working.

        Returns:
            The ``requests`` response object.
        """
        target = self.url if url is None else url
        return requests.get(
            url=target, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )

    def parse_list(self, response):
        """Extract detail-page URLs and titles from a list-page response.

        Args:
            response: Response whose ``content`` is the list page HTML.

        Returns:
            Tuple ``(li_list, name_list)``: the detail-page URLs (site
            prefix + extracted href) and the title strings.
        """
        html = etree.HTML(response.content)
        # href of every entry link in the list.
        node_list = html.xpath('//div[@class="list"]/li/a/@href')
        # Titles, later used as file-name prefixes.
        name_list = html.xpath('//div[@class="tit"]/a/text()')
        # Prefix each extracted href with the site address.
        li_list = ['https://www.xiurenwang.vip/' + node for node in node_list]
        return li_list, name_list

    def parse_detail(self, li_list):
        """Visit every detail page and collect the image links found there.

        Args:
            li_list: Detail-page URLs.

        Returns:
            Flat list of image links from all detail pages, in visit order.
        """
        img_list = []
        for detail_url in li_list:
            # Pass the URL explicitly instead of overwriting self.url, so
            # the start URL survives and run() can be called again.
            response = self.get_data(detail_url)
            html = etree.HTML(response.content)
            for img_link in html.xpath('//div[@id="image"]/a/@href'):
                img_list.append(img_link)
                print(img_link)
        return img_list

    def save_data(self, img_list, name_list):
        """Download the images and write them into ``./picture/``.

        One file is written per (title, image) pair, named
        ``<title without '/'><image base name>.jpg``.

        NOTE(review): ``img_list`` is a flat list, so titles and images are
        not paired and every image is stored under every title. That looks
        unintended - confirm before changing the file naming.

        Args:
            img_list: Image URLs to download.
            name_list: Titles used as file-name prefixes.
        """
        # '/' would be read as a path separator, so drop it from titles.
        prefixes = [str(name).replace('/', '') for name in name_list]
        if not prefixes:
            # No titles means no files to write; skip the downloads too.
            return
        # Make sure the target folder exists even when the caller did not
        # create it beforehand.
        os.makedirs("./picture", exist_ok=True)
        for link in img_list:
            title = str(link).split("/")[-1].split(".")[0]
            # Download each image once and reuse the bytes for every title
            # instead of re-requesting it once per title.
            content = self.get_data(link).content
            for prefix in prefixes:
                with open("./picture/" + prefix + title + '.jpg', "wb") as f:
                    f.write(content)

    def _crawl_page(self, url):
        """Fetch one list page and save every image reachable from it."""
        response = self.get_data(url)
        li_list, name_list = self.parse_list(response)
        img_list = self.parse_detail(li_list)
        self.save_data(img_list, name_list)

    def next_page(self, start=2, end=240):
        """Crawl the paginated list pages ``start`` .. ``end`` (inclusive).

        The loop ends by itself after the last page. The earlier version
        called ``self.next_page()`` inside the loop, which restarted at
        page 2 on every iteration and recursed without bound; its
        ``i > 241`` check could never be true inside ``range(2, 241)``.

        Args:
            start: First page number to request (default 2).
            end: Last page number to request (default 240).
        """
        for page in range(start, end + 1):
            # Only the number after "page/" changes between list pages.
            page_url = 'https://www.xiurenwang.vip/bang/page/{}?f=2'.format(page)
            print(page_url)
            self._crawl_page(page_url)

    def run(self):
        """Crawl the start list page, then the paginated list pages."""
        self._crawl_page(self.url)
        self.next_page()


if __name__ == '__main__':
    # Create the output folder that save_data() writes the images into.
    try:
        os.mkdir("./picture")
    except FileExistsError:
        # Only "already exists" is expected here. Other errors (e.g. missing
        # permissions) are no longer misreported as an existing folder.
        print("文件夹已新建")
    xiu = XiuPeople()
    xiu.run()