您当前的位置:首页 > 计算机 > 编程开发 > Python

Python 爬虫:爬取图片

时间:11-05来源:作者:点击数:

爬取单页图片

#网址:https://pic.netbian.com/   我们爬取:https://pic.netbian.com/4kmeinv/
from lxml import etree
import requests
import os

if __name__ == "__main__":
    # Scrape the first listing page of the gallery and save every thumbnail
    # into ./zhaopian, one file per <li> entry, named after its caption.
    save_dir = 'zhaopian'
    # exist_ok avoids the check-then-create race of exists() followed by mkdir().
    os.makedirs(save_dir, exist_ok=True)

    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
    }
    url = 'https://pic.netbian.com/4kmeinv/'
    # Seconds. Without a timeout a stalled connection blocks the script forever.
    timeout = 10

    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()
    # Decode the page as GBK once, up front. Re-encoding each caption from
    # ISO-8859-1 afterwards only works when requests happened to fall back to
    # ISO-8859-1, and it raises on any byte sequence that is not valid GBK.
    response.encoding = 'gbk'

    # Parse the HTML so the entries can be selected with XPath.
    tree = etree.HTML(response.text)
    li_list = tree.xpath('//div[@class="slist"]/ul/li')

    for li in li_list:
        src = li.xpath('./a/img/@src')
        title = li.xpath('./a/b/text()')
        # Skip entries that lack an image or a caption instead of crashing
        # with IndexError on the [0] lookup.
        if not src or not title:
            continue

        tupian_src = 'https://pic.netbian.com' + src[0]
        name = title[0] + '.jpg'

        image = requests.get(url=tupian_src, headers=headers, timeout=timeout)
        # Do not store an HTML error body under a .jpg name.
        image.raise_for_status()

        # Destination file inside the output directory.
        path = save_dir + '/' + name
        with open(path, 'wb') as fp:
            fp.write(image.content)
        print("下载成功!!!")
import os
import requests
from lxml import etree
# Single-page variant: collect every thumbnail URL and alt text from the first
# listing page, then download the images into ./you_knew_about_picture.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 SLBrowser/7.0.0.5211 SLBChan/25"
}

url = "https://pic.netbian.com/4kmeinv/"
# Site root used to turn the scraped @src values into absolute URLs.
base_url = "https://pic.netbian.com"
# Seconds. Without a timeout a stalled connection blocks the script forever.
timeout = 10

response = requests.get(url=url, headers=header, timeout=timeout)
# Fail loudly on 4xx/5xx instead of parsing an error page.
response.raise_for_status()
# Decode the page as GBK directly. The ISO-8859-1 -> GBK round trip on the
# decoded text depends on requests having guessed ISO-8859-1 and raises on any
# byte sequence that is not valid GBK; the GBK step is required, not optional.
response.encoding = "gbk"
girl_data = response.text

# Build the lxml tree used for the XPath queries below.
girl_etree = etree.HTML(girl_data)

# Image URLs and names come from the same <img> elements (src / alt).
picture_loc = girl_etree.xpath("//ul[@class='clearfix']/li/a/img/@src")
picture_name_list = girl_etree.xpath("//ul[@class='clearfix']/li/a/img/@alt")

# Create the output folder; exist_ok avoids the exists()/mkdir() race.
os.makedirs("you_knew_about_picture", exist_ok=True)

# zip pairs each URL with its name and cannot raise IndexError when the two
# lists differ in length.
for each_loc, each_name in zip(picture_loc, picture_name_list):
    # Absolute image URL; tolerates @src with or without a leading "/"
    # so the result never contains a doubled slash.
    new_loc = base_url + "/" + each_loc.lstrip("/")

    picture_response = requests.get(new_loc, headers=header, timeout=timeout)
    # Do not store an HTML error body under a .jpg name.
    picture_response.raise_for_status()
    each_picture_data = picture_response.content

    # Output path = folder + image name + extension.
    each_picture_name = "you_knew_about_picture/" + each_name + ".jpg"

    # The context manager closes the file even if the write fails.
    with open(each_picture_name, mode="wb") as fp:
        fp.write(each_picture_data)

    # Progress message with the bare file name.
    print(each_picture_name.split("/")[-1] + " have been over")

爬取多页图片

#爬取多页:第一页:https://pic.netbian.com/4kmeinv/
#第二页:https://pic.netbian.com/4kmeinv/index_2.html
#第三页:https://pic.netbian.com/4kmeinv/index_3.html
import requests
from lxml import etree
import os

# Scrape listing pages 2-4. Page 1 uses a different URL (no index_N suffix),
# so it is not covered by this loop.
if __name__ == "__main__":
    save_dir = 'zhao'
    # exist_ok avoids the check-then-create race of exists() followed by mkdir().
    os.makedirs(save_dir, exist_ok=True)

    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
    }
    # Seconds. Without a timeout a stalled connection blocks the script forever.
    timeout = 10

    for i in range(2, 5):
        url = f'https://pic.netbian.com/4kmeinv/index_{i}.html'

        response = requests.get(url=url, headers=headers, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of parsing an error page.
        response.raise_for_status()
        # Decode the page as GBK once. Re-encoding each caption from
        # ISO-8859-1 afterwards only works when requests happened to fall back
        # to ISO-8859-1, and it raises on any byte sequence that is not valid GBK.
        response.encoding = 'gbk'

        tree = etree.HTML(response.text)
        li_list = tree.xpath('//*[@id="main"]/div[3]/ul/li')

        for li in li_list:
            src = li.xpath('./a/img/@src')
            title = li.xpath('./a/b/text()')
            # Skip entries that lack an image or a caption instead of
            # crashing with IndexError on the [0] lookup.
            if not src or not title:
                continue

            src_url = 'https://pic.netbian.com' + src[0]
            name = title[0] + '.jpg'

            image = requests.get(url=src_url, headers=headers, timeout=timeout)
            # Do not store an HTML error body under a .jpg name.
            image.raise_for_status()

            path = save_dir + '/' + name
            with open(path, 'wb') as fp:
                fp.write(image.content)
            print('下载成功!!!')


别人的代码

import os
import requests
from lxml import etree

# Multi-page variant: gather image URLs and captions from several listing
# pages first, then download everything into ./you_knew_about_picture with a
# running number in front of each file name.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 SLBrowser/7.0.0.5211 SLBChan/25"
}

# Site root used to turn the scraped @src values into absolute URLs.
base_url = "https://pic.netbian.com"
# Kept separate from `url` so formatting one page never overwrites the template.
page_url_template = "https://pic.netbian.com/4kmeinv/index_{0}.html"
# Seconds. Without a timeout a stalled connection blocks the script forever.
timeout = 10

picture_loc = []  # image URLs (as found in @src)
picture_name_list = []  # image captions
# Pages 2 and 3; adjust the range as needed.
# Page 1 must not be included: its URL has no index_N suffix, so it would need
# a separate branch.
for i in range(2, 4):
    url = page_url_template.format(i)
    response = requests.get(url=url, headers=header, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()
    # Decode the page as GBK directly. The ISO-8859-1 -> GBK round trip on the
    # decoded text depends on requests having guessed ISO-8859-1 and raises on
    # any byte sequence that is not valid GBK.
    response.encoding = "gbk"
    girl_data = response.text
    girl_etree = etree.HTML(girl_data)
    # Collect the image URLs of this page.
    picture_loc.extend(girl_etree.xpath("//ul[@class='clearfix']/li/a/img/@src"))
    # Collect the captions of this page.
    picture_name_list.extend(girl_etree.xpath("//ul[@class='clearfix']/li/a/b/text()"))

# exist_ok avoids the check-then-create race of exists() followed by mkdir().
os.makedirs("you_knew_about_picture", exist_ok=True)

a = 0  # number of images saved so far; also the prefix of the next file name
# URLs and captions come from different elements, so the two lists may differ
# in length; zip stops at the shorter one instead of raising IndexError.
for each_loc, each_name in zip(picture_loc, picture_name_list):
    # Tolerates @src with or without a leading "/" so the URL never contains
    # a doubled slash.
    new_loc = base_url + "/" + each_loc.lstrip("/")

    picture_response = requests.get(new_loc, headers=header, timeout=timeout)
    # Do not store an HTML error body under a .jpg name.
    picture_response.raise_for_status()
    each_picture_data = picture_response.content

    each_picture_name = "you_knew_about_picture/" + str(a) + " . " + each_name + ".jpg"

    # The context manager closes the file even if the write fails.
    with open(each_picture_name, mode="wb") as fp:
        fp.write(each_picture_data)

    print(each_picture_name.split("/")[-1] + " have been over")
    a += 1
print(a)
为方便获取更多学习、工作、生活信息,请关注本站微信公众号:城东书院(微信服务号)、城东书院(微信订阅号)。
推荐内容
相关内容
栏目更新
栏目热门